Example #1
0
    def sniff(self, filename):
        """
        Determines whether the file is in mothur oligos format
        http://www.mothur.org/wiki/Oligos_File

        Lines whose first field starts with '@' or '#' are ignored. Every
        other line must be either a two-field ``forward``/``reverse`` entry
        or a three-field ``barcode`` entry, and at least one such line must
        be present.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
        >>> Oligos().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
        >>> Oligos().sniff( fname )
        False
        """
        primer_keywords = ('forward', 'reverse')
        matched = 0
        for fields in iter_headers(filename, sep='\t'):
            keyword = fields[0]
            if keyword.startswith('@') or keyword.startswith('#'):
                continue
            is_primer = len(fields) == 2 and keyword in primer_keywords
            is_barcode = len(fields) == 3 and keyword == 'barcode'
            if not (is_primer or is_barcode):
                return False
            matched += 1
        return matched > 0
Example #2
0
    def sniff(self, filename):
        """
        Determines whether the file is otu (operational taxonomic unit) format

        Lines whose first field starts with '@' are ignored. Every remaining
        line needs at least two fields; from the second such line onwards the
        second field must be an integer equal to the number of fields that
        follow it. More than two such lines are required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> Otu().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
        >>> Otu().sniff( fname )
        False
        """
        rows = 0
        for fields in iter_headers(filename, sep='\t'):
            if fields[0].startswith('@'):
                continue
            width = len(fields)
            if width < 2:
                return False
            # The first counted line is exempt from the numeric check.
            if rows > 0:
                try:
                    declared = int(fields[1])
                except ValueError:
                    return False
                if declared + 2 != width:
                    return False
            rows += 1
        return rows > 2
Example #3
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in sabund format
        label<TAB>count[<TAB>value(1..n)]

        Lines whose first field starts with '@' are ignored. On every other
        line all fields after the label must be integers and the count must
        equal the number of values that follow it. At least one such line is
        required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
        >>> Sabund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
        >>> Sabund().sniff( fname )
        False
        """
        rows = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 2:
                return False
            try:
                numbers = [int(cell) for cell in fields[1:]]
            except ValueError:
                return False
            # numbers[0] is the declared count of the values after it
            if numbers[0] + 2 != len(fields):
                return False
            rows += 1
        return rows > 0
Example #4
0
    def sniff_prefix(self, file_prefix):
        """
        Try to guess if the file is a PDBQT file.

        Returns True only when each of the REMARK, ROOT, ENDROOT, BRANCH and
        TORSDOF keywords appears as the first space-separated field of some
        line among the lines yielded by ``iter_headers`` (``count=300``).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('NuBBE_1_obabel_3D.pdbqt')
        >>> PDBQT().sniff(fname)
        True
        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> PDBQT().sniff(fname)
        False
        """
        required_sections = {'REMARK', 'ROOT', 'ENDROOT', 'BRANCH', 'TORSDOF'}
        seen_sections = set()
        for fields in iter_headers(file_prefix, sep=' ', count=300):
            seen_sections.add(fields[0].strip())
        return required_sections.issubset(seen_sections)
Example #5
0
    def sniff(self, filename):
        """
        Try to guess if the file is a PDBQT file.

        Returns True only when each of the REMARK, ROOT, ENDROOT, BRANCH and
        TORSDOF keywords appears as the first space-separated field of some
        line among the lines yielded by ``iter_headers`` (``count=300``).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('NuBBE_1_obabel_3D.pdbqt')
        >>> PDBQT().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> PDBQT().sniff(fname)
        False
        """
        required_sections = ('REMARK', 'ROOT', 'ENDROOT', 'BRANCH', 'TORSDOF')
        seen_sections = set(
            fields[0].strip()
            for fields in iter_headers(filename, sep=' ', count=300)
        )
        return all(name in seen_sections for name in required_sections)
Example #6
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set line/column counts, labels and OTU labels on the dataset metadata.

        The OTU labels are taken from the third field onwards of the first
        line of the file. Every line with at least two fields whose first
        field does not start with '@' is counted as a data line; its first
        field is recorded as a label and the widest such line gives the
        column count.

        :param dataset: dataset whose ``file_name`` is read and whose
            ``metadata`` is updated
        :param overwrite: forwarded to the parent ``set_meta``
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if dataset.has_data():
            label_names = set()
            otulabel_names = set()
            ncols = 0
            data_lines = 0

            # get_headers returns a list of split lines, so the single line
            # requested has to be unwrapped before its fields can be sliced.
            first_rows = get_headers(dataset.file_name, sep='\t', count=1)
            first_line = first_rows[0] if first_rows else []
            # set otulabels
            if len(first_line) > 2:
                otulabel_names = first_line[2:]
            # set label names and number of lines
            for line in iter_headers(dataset.file_name, sep='\t', count=-1):
                if len(line) >= 2 and not line[0].startswith('@'):
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)
            dataset.metadata.otulabels = sorted(otulabel_names)
Example #7
0
    def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
        """
        Set line/column counts, labels, groups and header skip on the dataset.

        A line whose first two fields are ``label`` and ``Group`` is treated
        as the column header; every other line with at least two fields is
        counted as data, contributing its first field as a label and its
        second as a group. ``metadata.skip`` is stored as 1 when a header
        line was seen and 0 when only data lines were seen; if no usable
        line is read, the ``skip`` argument is stored unchanged.

        :param dataset: dataset whose ``file_name`` is read and whose
            ``metadata`` is updated
        :param overwrite: forwarded to the parent ``set_meta``
        :param skip: fallback value for ``metadata.skip``
        """
        super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)

        # See if file starts with header line
        if dataset.has_data():
            label_names = set()
            group_names = set()
            data_lines = 0
            ncols = 0
            header_found = False

            for line in iter_headers(dataset.file_name, sep='\t', count=-1):
                # Lines with fewer than two fields (e.g. blank lines) carry no
                # label/group pair; indexing line[1] on them would raise.
                if len(line) < 2:
                    continue
                if line[0] == 'label' and line[1] == 'Group':
                    header_found = True
                    continue
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])
                group_names.add(line[1])

            # Decide once, after the scan: data lines that follow the header
            # must not cancel the header that was found.
            if header_found:
                skip = 1
            elif data_lines:
                skip = 0

            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)
            dataset.metadata.groups = sorted(group_names)
            dataset.metadata.skip = skip
Example #8
0
    def sniff(self, filename):
        """
        Determines whether the file is a secondary structure map format
        A single column with an integer value which indicates the row that this
        row maps to. Check to make sure if structMap[10] = 380 then
        structMap[380] = 10 and vice versa.

        Rows are numbered from 1. Fewer than three rows is never accepted.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        False
        """
        # partners[r] is the earlier row that declared row r as its partner
        partners = {}
        row = 0
        for row, fields in enumerate(iter_headers(filename, sep='\t'), 1):
            if len(fields) > 1:
                return False
            try:
                target = int(fields[0])
            except ValueError:
                return False
            if target > row:
                # Forward reference: remember it until the target row is read.
                partners[target] = row
            elif target > 0 or row in partners:
                # Backward reference, or a row someone pointed at: the two
                # rows must name each other.
                if row not in partners or partners[row] != target:
                    return False
        return row >= 3
Example #9
0
    def sniff(self, filename):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format
        The first and second columns have the sequence names and the third column is the distance between those sequences.

        Lines whose first field starts with '@' are ignored. Every other line
        must have exactly three fields with a numeric third field. More than
        two such lines are required, and at least one distance must not be an
        integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        False
        """
        # Must start out True: it is only ever cleared inside the loop, and
        # is read after the loop even when every distance was an integer.
        all_ints = True
        count = 0
        for line in iter_headers(filename, sep='\t'):
            if line[0].startswith('@'):
                continue
            if len(line) != 3:
                return False
            distance = line[2]
            try:
                float(distance)
            except ValueError:
                return False
            try:
                # See if it's also an integer
                int(distance)
            except ValueError:
                # At least one value is not an integer
                all_ints = False
            count += 1

        return count > 2 and not all_ints
Example #10
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a secondary structure map format
        A single column with an integer value which indicates the row that this
        row maps to. Check to make sure if structMap[10] = 380 then
        structMap[380] = 10 and vice versa.

        Rows are numbered from 1. Fewer than three rows is never accepted.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.map' )
        >>> SecondaryStructureMap().sniff( fname )
        False
        """
        # expected_partner[r] is the earlier row that pointed forward at row r
        expected_partner = {}
        rows_read = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            rows_read += 1
            if len(fields) > 1:
                return False
            try:
                target = int(fields[0])
            except ValueError:
                return False
            if target > rows_read:
                expected_partner[target] = rows_read
                continue
            must_pair = target > 0 or rows_read in expected_partner
            if must_pair:
                if rows_read not in expected_partner:
                    return False
                if expected_partner[rows_read] != target:
                    return False
        return rows_read >= 3
Example #11
0
    def sniff(self, filename):
        """
        Determines whether the file is in sabund format
        label<TAB>count[<TAB>value(1..n)]

        Lines whose first field starts with '@' are ignored. On every other
        line all fields after the label must be integers and the count must
        equal the number of values that follow it. At least one such line is
        required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.sabund' )
        >>> Sabund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.sabund' )
        >>> Sabund().sniff( fname )
        False
        """
        accepted = 0
        for fields in iter_headers(filename, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 2:
                return False
            try:
                declared = int(fields[1])
                for cell in fields[2:]:
                    int(cell)
            except ValueError:
                return False
            # label + count + <declared> values
            if len(fields) != declared + 2:
                return False
            accepted += 1
        return accepted > 0
Example #12
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in mothur oligos format
        http://www.mothur.org/wiki/Oligos_File

        Lines whose first field starts with '@' or '#' are ignored. Every
        other line must be either a two-field ``forward``/``reverse`` entry
        or a three-field ``barcode`` entry, and at least one such line must
        be present.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.oligos' )
        >>> Oligos().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.oligos' )
        >>> Oligos().sniff( fname )
        False
        """
        # keyword -> exact number of fields a line with that keyword must have
        expected_width = {'forward': 2, 'reverse': 2, 'barcode': 3}
        recognised = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            keyword = fields[0]
            if keyword.startswith('@') or keyword.startswith('#'):
                continue
            if expected_width.get(keyword) != len(fields):
                return False
            recognised += 1
        return recognised > 0
Example #13
0
 def sniff_prefix(self, file_prefix):
     """
     Guess whether the file is GFA 2.0.

     Lines whose first field starts with '#' are skipped. An ``H`` line
     decides immediately: the result is whether its second field is
     ``VN:Z:2.0``. Any other line must carry a known record type with at
     least the minimum number of tab-separated fields for that type.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('sample.gfa2')
     >>> Gfa2().sniff(fname)
     True
     >>> Gfa1().sniff(fname)
     False
     """
     # record type -> minimum number of fields required on the line
     min_fields = {'S': 3, 'F': 8, 'E': 9, 'G': 6, 'O': 3, 'U': 3}
     seen_record = False
     for fields in iter_headers(file_prefix, "\t"):
         record_type = fields[0]
         if record_type.startswith('#'):
             continue
         if record_type == 'H':
             return len(fields) >= 2 and fields[1] == 'VN:Z:2.0'
         if record_type not in min_fields:
             return False
         if len(fields) < min_fields[record_type]:
             return False
         seen_record = True
     return seen_record
Example #14
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set line/column counts, labels and OTU labels on the dataset metadata.

        The OTU labels are taken from the third field onwards of the first
        line of the file. Every line with at least two fields whose first
        field does not start with '@' is counted as a data line; its first
        field is recorded as a label and the widest such line gives the
        column count.

        :param dataset: dataset whose ``file_name`` is read and whose
            ``metadata`` is updated
        :param overwrite: forwarded to the parent ``set_meta``
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if dataset.has_data():
            label_names = set()
            otulabel_names = set()
            ncols = 0
            data_lines = 0

            # get_headers hands back a list of split lines; take the one line
            # that was asked for so that slicing works on its fields.
            leading_rows = get_headers(dataset.file_name, sep='\t', count=1)
            first_line = leading_rows[0] if leading_rows else []
            # set otulabels
            if len(first_line) > 2:
                otulabel_names = first_line[2:]
            # set label names and number of lines
            for line in iter_headers(dataset.file_name, sep='\t', count=-1):
                if len(line) >= 2 and not line[0].startswith('@'):
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)
            dataset.metadata.otulabels = sorted(otulabel_names)
Example #15
0
    def sniff_prefix(self, file_prefix):
        """
        Try to guess if the file is a PDB file.

        Returns True only when each of the HEADER, TITLE, COMPND, SOURCE,
        KEYWDS and EXPDTA keywords appears as the first space-separated field
        of some line among the lines yielded by ``iter_headers``
        (``count=300``).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('5e5z.pdb')
        >>> PDB().sniff(fname)
        True
        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> PDB().sniff(fname)
        False
        """
        required_sections = ('HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA')
        seen_sections = set(
            fields[0].strip()
            for fields in iter_headers(file_prefix, sep=' ', count=300)
        )
        return all(name in seen_sections for name in required_sections)
Example #16
0
    def set_meta(self, dataset, overwrite=True, skip=1, **kwd):
        """
        Set line/column counts, labels, groups and header skip on the dataset.

        A line whose first two fields are ``label`` and ``Group`` is treated
        as the column header; every other line with at least two fields is
        counted as data, contributing its first field as a label and its
        second as a group. ``metadata.skip`` is stored as 1 when a header
        line was seen and 0 when only data lines were seen; if no usable
        line is read, the ``skip`` argument is stored unchanged.

        :param dataset: dataset whose ``file_name`` is read and whose
            ``metadata`` is updated
        :param overwrite: forwarded to the parent ``set_meta``
        :param skip: fallback value for ``metadata.skip``
        """
        super(GroupAbund, self).set_meta(dataset, overwrite=overwrite, **kwd)

        # See if file starts with header line
        if dataset.has_data():
            label_names = set()
            group_names = set()
            data_lines = 0
            ncols = 0
            saw_header = False

            for line in iter_headers(dataset.file_name, sep='\t', count=-1):
                # A line without a second field has no group to record and
                # cannot be the header; line[1] would raise IndexError on it.
                if len(line) < 2:
                    continue
                if line[0] == 'label' and line[1] == 'Group':
                    saw_header = True
                else:
                    data_lines += 1
                    ncols = max(ncols, len(line))
                    label_names.add(line[0])
                    group_names.add(line[1])

            # Resolve skip after the whole scan so that data lines following
            # the header do not overwrite the fact that a header exists.
            if saw_header:
                skip = 1
            elif data_lines:
                skip = 0

            # Set the discovered metadata values for the dataset
            dataset.metadata.data_lines = data_lines
            dataset.metadata.columns = ncols
            dataset.metadata.labels = sorted(label_names)
            dataset.metadata.groups = sorted(group_names)
            dataset.metadata.skip = skip
Example #17
0
 def sniff_prefix(self, file_prefix):
     """
     Try to guess if the file is a PQR file.

     Returns True when a ``REMARK`` line has been seen and an ``ATOM`` or
     ``HETATM`` line matches the pattern from ``self.get_matcher()``.
     Scanning stops at the first matching atom line.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('5e5z.pqr')
     >>> PQR().sniff(fname)
     True
     >>> fname = get_test_fname('drugbank_drugs.cml')
     >>> PQR().sniff(fname)
     False
     """
     prog = self.get_matcher()
     headers = iter_headers(file_prefix,
                            sep=None,
                            comment_designator='REMARK   5',
                            count=3000)
     has_remark = has_atom = False
     for line in headers:
         # NOTE(review): assuming iter_headers splits with str.split(sep),
         # sep=None yields an empty list for a blank line; skip it rather
         # than fail on line[0].
         if not line:
             continue
         section_name = line[0].strip()
         if section_name == 'REMARK':
             has_remark = True
         elif section_name == 'ATOM' or section_name == 'HETATM':
             # The matcher is applied to the fields re-joined by single spaces.
             if prog.match(' '.join(line)):
                 has_atom = True
                 break
     return has_remark and has_atom
Example #18
0
 def sniff_prefix(self, file_prefix):
     """
     Try to guess if the file is a PQR file.

     Returns True when a ``REMARK`` line has been seen and an ``ATOM`` or
     ``HETATM`` line matches the pattern from ``self.get_matcher()``.
     Scanning stops at the first matching atom line.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('5e5z.pqr')
     >>> PQR().sniff(fname)
     True
     >>> fname = get_test_fname('drugbank_drugs.cml')
     >>> PQR().sniff(fname)
     False
     """
     prog = self.get_matcher()
     headers = iter_headers(file_prefix, sep=None, comment_designator='REMARK   5', count=3000)
     remark_seen = atom_matched = False
     for line in headers:
         # NOTE(review): assuming iter_headers splits with str.split(sep),
         # a blank line gives no fields when sep is None; ignore such lines
         # instead of raising IndexError on line[0].
         if not line:
             continue
         section_name = line[0].strip()
         if section_name == 'REMARK':
             remark_seen = True
         elif section_name in ('ATOM', 'HETATM'):
             # The matcher is applied to the fields re-joined by single spaces.
             if prog.match(' '.join(line)):
                 atom_matched = True
                 break
     return remark_seen and atom_matched
Example #19
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is otu (operational taxonomic unit) format

        Lines whose first field starts with '@' are ignored. Every remaining
        line needs at least two fields; from the second such line onwards the
        second field must be an integer equal to the number of fields that
        follow it. More than two such lines are required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> Otu().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.otu' )
        >>> Otu().sniff( fname )
        False
        """
        accepted = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 2:
                return False
            is_first_row = accepted == 0
            if not is_first_row:
                # Rows after the first declare how many values follow.
                try:
                    declared = int(fields[1])
                except ValueError:
                    return False
                if len(fields) != declared + 2:
                    return False
            accepted += 1
        return accepted > 2
Example #20
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format
        The first and second columns have the sequence names and the third column is the distance between those sequences.

        Lines whose first field starts with '@' are ignored. Every other line
        must have exactly three fields with a numeric third field. More than
        two such lines are required, and at least one distance must not be an
        integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        False
        """
        # Initialised up front: the loop only ever clears it, and it is read
        # after the loop even when no non-integer distance was encountered.
        all_ints = True
        count = 0
        for line in iter_headers(file_prefix, sep='\t'):
            if line[0].startswith('@'):
                continue
            if len(line) != 3:
                return False
            distance = line[2]
            try:
                float(distance)
            except ValueError:
                return False
            try:
                # See if it's also an integer
                int(distance)
            except ValueError:
                # At least one value is not an integer
                all_ints = False
            count += 1

        return count > 2 and not all_ints
Example #21
0
    def sniff(self, filename):
        """
        Try to guess if the file is a PDB file.

        Returns True only when each of the HEADER, TITLE, COMPND, SOURCE,
        KEYWDS and EXPDTA keywords appears as the first space-separated field
        of some line among the lines yielded by ``iter_headers``
        (``count=300``).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('5e5z.pdb')
        >>> PDB().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> PDB().sniff(fname)
        False
        """
        required_sections = {'HEADER', 'TITLE', 'COMPND', 'SOURCE', 'KEYWDS', 'EXPDTA'}
        seen_sections = set()
        for fields in iter_headers(filename, sep=' ', count=300):
            seen_sections.add(fields[0].strip())
        return required_sections.issubset(seen_sections)
Example #22
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is an axes format
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.

        .. code-block::

            group   axis1   axis2
            forest  0.000000        0.145743
            pasture 0.145743        0.000000

        .. code-block::

                    axis1   axis2
            U68589  0.262608        -0.077498
            U68590  0.027118        0.195197
            U68591  0.329854        0.014395

        The first line is never inspected. The second line fixes the column
        count (at least two) and is checked for width only. Each later line
        must have that many fields, with every field after the first a float
        of absolute value at most 1.0. The file is accepted only if at least
        one of those checked values is not an integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
        >>> Axes().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
        >>> Axes().sniff( fname )
        False
        """
        expected_width = None
        saw_non_integer = False
        for index, fields in enumerate(iter_headers(file_prefix, sep='\t')):
            if index == 0:
                # possible heading line
                continue
            if expected_width is None:
                expected_width = len(fields)
                if expected_width < 2:
                    return False
                continue
            if len(fields) != expected_width:
                return False
            for cell in fields[1:]:
                try:
                    value = float(cell)
                except ValueError:
                    return False
                # Check abs value is <= 1.0
                if abs(value) > 1.0:
                    return False
                # Also test for whether value is an integer
                try:
                    int(cell)
                except ValueError:
                    saw_non_integer = True
        return saw_non_integer
Example #23
0
    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        """
        Run the parent ``set_meta`` and then record the distinct group names.

        The group name is the second tab-separated field of each line that
        has more than one field; the distinct values are stored, unsorted,
        in ``dataset.metadata.groups``.
        """
        super(Group, self).set_meta(dataset, overwrite, skip, max_data_lines)

        rows = iter_headers(dataset.file_name, sep='\t', count=-1)
        distinct_groups = {fields[1] for fields in rows if len(fields) > 1}
        dataset.metadata.groups = list(distinct_groups)
Example #24
0
    def set_meta(self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd):
        """
        Run the parent ``set_meta`` and then record the distinct group names.

        The group name is the second tab-separated field of each line that
        has more than one field; the distinct values are stored, unsorted,
        in ``dataset.metadata.groups``.
        """
        super(Group, self).set_meta(dataset, overwrite, skip, max_data_lines)

        seen_groups = set()
        for fields in iter_headers(dataset.file_name, sep='\t', count=-1):
            if len(fields) <= 1:
                continue
            seen_groups.add(fields[1])
        dataset.metadata.groups = list(seen_groups)
Example #25
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is an axes format
        The first line may have column headings.
        The following lines have the name in the first column plus float columns for each axis.
        ==> 98_sq_phylip_amazon.fn.unique.pca.axes <==
           group   axis1   axis2
           forest  0.000000        0.145743
           pasture 0.145743        0.000000

        ==> 98_sq_phylip_amazon.nmds.axes <==
                   axis1   axis2
           U68589  0.262608        -0.077498
           U68590  0.027118        0.195197
           U68591  0.329854        0.014395

        The first line is never inspected. The second line fixes the column
        count (at least two) and is checked for width only. Each later line
        must have that many fields, with every field after the first a float
        of absolute value at most 1.0. The file is accepted only if at least
        one of those checked values is not an integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.axes' )
        >>> Axes().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.axes' )
        >>> Axes().sniff( fname )
        False
        """
        rows = iter_headers(file_prefix, sep='\t')
        width = None
        found_fraction = False
        heading_skipped = False
        for fields in rows:
            if not heading_skipped:
                # possible heading line
                heading_skipped = True
                continue
            if width is None:
                # This row only establishes how wide the table is.
                width = len(fields)
                if width < 2:
                    return False
                continue
            if len(fields) != width:
                return False
            try:
                axis_values = [float(cell) for cell in fields[1:]]
            except ValueError:
                return False
            for cell, value in zip(fields[1:], axis_values):
                # Check abs value is <= 1.0
                if abs(value) > 1.0:
                    return False
                # Also test for whether value is an integer
                try:
                    int(cell)
                except ValueError:
                    found_fraction = True
        return found_fraction
Example #26
0
    def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
        """
        Run the parent ``set_meta`` and then record the sequence count.

        The first line (ignoring lines whose first field starts with '@')
        whose fields, joined together, parse as an integer provides
        ``dataset.metadata.sequence_count``. Each line tried before that
        logs a warning, except on ``PairwiseDistanceMatrix`` instances.
        """
        super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

        for fields in iter_headers(dataset.file_name, sep='\t'):
            if fields[0].startswith('@'):
                continue
            try:
                # seq count sometimes preceded by tab
                dataset.metadata.sequence_count = int(''.join(fields))
            except Exception as e:
                if not isinstance(self, PairwiseDistanceMatrix):
                    log.warning("DistanceMatrix set_meta %s" % e)
            else:
                break
Example #27
0
    def sniff_prefix(self, file_prefix):
        """
        Checks for and does cursory validation on data that looks like AGP

        Lines starting with '#' are skipped via ``comment_designator``. Every
        other non-empty line must have nine tab-separated fields, a known
        component type in the fifth field, positive integers in the
        coordinate/length fields, and type-specific values in the remaining
        fields. Validation uses explicit checks rather than ``assert`` so
        that it still runs when Python is started with ``-O``.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('eg1.agp')
        >>> GoldenPath().sniff(fname)
        True
        >>> fname = get_test_fname('eg2.agp')
        >>> GoldenPath().sniff(fname)
        True
        >>> fname = get_test_fname('1.bed')
        >>> GoldenPath().sniff(fname)
        False
        >>> fname = get_test_fname('2.tabular')
        >>> GoldenPath().sniff(fname)
        False
        """
        component_types = ('A', 'D', 'F', 'G', 'O', 'P', 'W', 'N', 'U')
        gap_component_types = ('U', 'N')
        gap_types = (
            'scaffold', 'contig', 'centromere', 'short_arm',
            'heterochromatin', 'telomere', 'repeat'
        )
        linkage_flags = ('yes', 'no')
        linkage_evidence = (
            'na', 'paired-ends', 'align_genus', 'align_xgenus',
            'align_trnscript', 'within_clone', 'clone_contig',
            'map', 'strobe', 'unspecified'
        )
        orientations = ('+', '-', '?', '0', 'na')

        found_non_comment_lines = False
        try:
            for line in iter_headers(file_prefix, '\t',
                                     comment_designator='#'):
                if not line:
                    continue
                if len(line) != 9:
                    return False
                component_type = line[4]
                if component_type not in component_types:
                    return False
                ostensible_numbers = line[1:3]
                if component_type in gap_component_types:
                    ostensible_numbers.append(line[5])
                    if line[6] not in gap_types:
                        return False
                    if line[7] not in linkage_flags:
                        return False
                    if line[8] not in linkage_evidence:
                        return False
                else:
                    ostensible_numbers.extend([line[6], line[7]])
                    if line[8] not in orientations:
                        return False
                # A gap of type U must have the fixed length 100.
                if component_type == 'U' and int(line[5]) != 100:
                    return False
                for value in ostensible_numbers:
                    if not (str(value).isnumeric() and int(value) > 0):
                        return False
                found_non_comment_lines = True
        except ValueError:
            # int() rejects some text that passes isnumeric() (for example
            # superscript digits) as well as a non-integer gap length.
            return False
        return found_non_comment_lines
Example #28
0
    def set_meta(self, dataset, overwrite=True, skip=0, **kwd):
        """
        Run the parent ``set_meta`` and then record the sequence count.

        The first line (ignoring lines whose first field starts with '@')
        whose fields, joined together, parse as an integer provides
        ``dataset.metadata.sequence_count``. Each line tried before that
        logs a warning, except on ``PairwiseDistanceMatrix`` instances.
        """
        super(DistanceMatrix, self).set_meta(dataset, overwrite=overwrite, skip=skip, **kwd)

        candidate_rows = iter_headers(dataset.file_name, sep='\t')
        for fields in candidate_rows:
            is_comment = fields[0].startswith('@')
            if not is_comment:
                try:
                    # seq count sometimes preceded by tab
                    joined = ''.join(fields)
                    dataset.metadata.sequence_count = int(joined)
                    break
                except Exception as e:
                    if not isinstance(self, PairwiseDistanceMatrix):
                        log.warning("DistanceMatrix set_meta %s" % e)
Example #29
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceding sequences

                5  # possibly but not always preceded by a tab :/
                U68589
                U68590  0.3371
                U68591  0.3609  0.3782
                U68592  0.4155  0.3197  0.4148
                U68593  0.2872  0.1690  0.3361  0.2842

        Lines whose first field starts with '@' are ignored.  Returns False
        (instead of raising) when there are no usable lines or when the
        sequence count is not a positive integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        headers = iter_headers(file_prefix, sep='\t', count=numlines)
        line_num = 0
        # Stays None until a count line has been parsed (empty / comment-only input)
        sequence_count = None
        for line in headers:
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    # the count is sometimes preceded by a tab, so join the fields
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
                if sequence_count <= 0:
                    return False
            else:
                # number of fields should equal the line number
                if len(line) != line_num:
                    return False
                try:
                    # every field after the sequence name should be a float
                    for column in line[1:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        # NOTE(review): the `numlines + 1` arm presumably covers files longer than
        # the sampled prefix - confirm against iter_headers' `count` semantics
        return line_num == sequence_count + 1 or line_num == numlines + 1
Example #30
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a lower-triangle distance matrix (phylip) format
        The first line has the number of sequences in the matrix.
        The remaining lines have the sequence name followed by a list of distances from all preceding sequences
                5  # possibly but not always preceded by a tab :/
                U68589
                U68590  0.3371
                U68591  0.3609  0.3782
                U68592  0.4155  0.3197  0.4148
                U68593  0.2872  0.1690  0.3361  0.2842

        Lines whose first field starts with '@' are ignored.  Returns False
        (instead of raising) when there are no usable lines or when the
        sequence count is not a positive integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.lower.dist' )
        >>> LowerTriangleDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        line_num = 0
        # Stays None until a count line has been parsed (empty / comment-only input)
        sequence_count = None
        for line in iter_headers(file_prefix, sep='\t', count=numlines):
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    # the count is sometimes preceded by a tab, so join the fields
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
                if sequence_count <= 0:
                    return False
            else:
                # number of fields should equal the line number
                if len(line) != line_num:
                    return False
                try:
                    # every field after the sequence name should be a float
                    for column in line[1:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        # NOTE(review): the `numlines + 1` arm presumably covers files longer than
        # the sampled prefix - confirm against iter_headers' `count` semantics
        return line_num == sequence_count + 1 or line_num == numlines + 1
Example #31
0
    def sniff(self, filename):
        """
        Determines whether the file is a Reference Taxonomy

        http://www.mothur.org/wiki/Taxonomy_outline
        A table with 2 or 3 columns:
        - SequenceName
        - Taxonomy (semicolon-separated taxonomy in descending order)
        - integer ?
        Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)
          X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
          X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
          AF052717.1      Eukaryota;Parabasalidea;
        Example: 3-column (http://vamps.mbl.edu/resources/databases.php)
          v3_AA008  Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus  5
          v3_AA016  Bacteria  120
          v3_AA019  Archaea;Crenarchaeota;Marine_Group_I  1

        Lines whose first field starts with '@' or '#' are skipped.  At least
        one remaining line must contain a ';' in its second column.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        False
        """
        taxonomy_re = re.compile(
            '^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$'
        )
        n_records = 0
        saw_semicolon = False
        for fields in iter_headers(filename, sep='\t', count=300):
            leading = fields[0]
            if leading.startswith('@') or leading.startswith('#'):
                continue
            if len(fields) < 2 or len(fields) > 3:
                return False
            taxonomy = fields[1]
            if taxonomy_re.match(taxonomy) is None:
                return False
            if ';' in taxonomy:
                saw_semicolon = True
            if len(fields) == 3:
                # the optional third column has to be an integer
                try:
                    int(fields[2])
                except Exception:
                    return False
            n_records += 1

        # Require that at least one entry has semicolons in the 2nd column
        return n_records > 0 and saw_semicolon
Example #32
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a frequency tabular format for chimera analysis

        .. code-block::

            #1.14.0
            0   0.000
            1   0.000
            ...
            155 0.975

        The first non-'@' line must be a single field starting with '#'; every
        later line must be an integer followed by a float written with a '.'.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
        >>> Frequency().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        >>> # Expression count matrix (EdgeR wrapper)
        >>> fname = get_test_fname( 'mothur_datatypetest_false_2.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        """
        n_lines = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if n_lines == 0:
                # first line should be #<version string>
                is_version_line = len(fields) == 1 and fields[0].startswith('#')
                if not is_version_line:
                    return False
            else:
                # all other lines should be <int> <float>
                if len(fields) != 2:
                    return False
                position, freq = fields
                try:
                    int(position)
                    float(freq)
                except Exception:
                    return False
                # a value without a decimal point is not accepted as a frequency
                if '.' not in freq:
                    return False
            n_lines += 1

        # need the version line plus at least one data line
        return n_lines > 1
Example #33
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.

               3
               U68589  0.0000  0.3371  0.3610
               U68590  0.3371  0.0000  0.3783
               U68590  0.3371  0.0000  0.3783

        Lines whose first field starts with '@' are ignored.  Returns False
        (instead of raising) when there are no usable lines or when the
        sequence count is not a positive integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        headers = iter_headers(file_prefix, sep='\t', count=numlines)
        line_num = 0
        # Stays None until a count line has been parsed (empty / comment-only input)
        sequence_count = None
        for line in headers:
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    # joining tolerates a count that is preceded by a tab
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
                if sequence_count <= 0:
                    return False
            else:
                # number of fields should equal the number of sequences (plus the name)
                if len(line) != sequence_count + 1:
                    return False
                try:
                    # every field after the sequence name should be a float
                    for column in line[1:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        # NOTE(review): the `numlines + 1` arm presumably covers files longer than
        # the sampled prefix - confirm against iter_headers' `count` semantics
        return line_num == sequence_count + 1 or line_num == numlines + 1
Example #34
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        After the parent class has set its metadata, and only when the dataset
        has data, the file is scanned to fill in:

        - ``data_lines``: number of lines with at least two fields whose first
          field does not start with '@'
        - ``columns``: the largest field count among those lines
        - ``labels``: sorted unique first-column values of those lines
        - ``otulabels``: sorted fields of the first line after its first two

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch
        >>> otu = Otu()
        >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if not dataset.has_data():
            return

        # set otulabels: everything after the first two fields of the first line
        first_line = get_headers(dataset.file_name, sep='\t', count=1)
        otulabel_names = first_line[0][2:] if first_line else []

        # set label names and number of lines
        label_names = set()
        ncols = 0
        data_lines = 0
        for line in iter_headers(dataset.file_name, sep='\t', count=-1):
            if len(line) >= 2 and not line[0].startswith('@'):
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])

        # Set the discovered metadata values for the dataset.  The lists are
        # sorted before assignment so the result does not depend on the
        # metadata attribute handing back the very same list object.
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.otulabels = sorted(otulabel_names)
Example #35
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a Reference Taxonomy

        http://www.mothur.org/wiki/Taxonomy_outline
        A table with 2 or 3 columns:
        - SequenceName
        - Taxonomy (semicolon-separated taxonomy in descending order)
        - integer ?
        Example: 2-column (http://www.mothur.org/wiki/Taxonomy_outline)
          X56533.1        Eukaryota;Alveolata;Ciliophora;Intramacronucleata;Oligohymenophorea;Hymenostomatida;Tetrahymenina;Glaucomidae;Glaucoma;
          X97975.1        Eukaryota;Parabasalidea;Trichomonada;Trichomonadida;unclassified_Trichomonadida;
          AF052717.1      Eukaryota;Parabasalidea;
        Example: 3-column (http://vamps.mbl.edu/resources/databases.php)
          v3_AA008  Bacteria;Firmicutes;Bacilli;Lactobacillales;Streptococcaceae;Streptococcus  5
          v3_AA016  Bacteria  120
          v3_AA019  Archaea;Crenarchaeota;Marine_Group_I  1

        Lines whose first field starts with '@' or '#' are skipped.  At least
        one remaining line must contain a ';' in its second column.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.ref.taxonomy' )
        >>> RefTaxonomy().sniff( fname )
        False
        """
        taxonomy_re = re.compile('^([^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?(;[^ \t\n\r\x0c\x0b;]+([(]\\d+[)])?)*(;)?)$')
        n_records = 0
        saw_semicolon = False
        for fields in iter_headers(file_prefix, sep='\t', count=300):
            leading = fields[0]
            if leading.startswith('@') or leading.startswith('#'):
                continue
            if len(fields) < 2 or len(fields) > 3:
                return False
            taxonomy = fields[1]
            if taxonomy_re.match(taxonomy) is None:
                return False
            if ';' in taxonomy:
                saw_semicolon = True
            if len(fields) == 3:
                # the optional third column has to be an integer
                try:
                    int(fields[2])
                except Exception:
                    return False
            n_records += 1

        # Require that at least one entry has semicolons in the 2nd column
        return n_records > 0 and saw_semicolon
Example #36
0
    def set_meta(self, dataset, overwrite=True, **kwd):
        """
        Set metadata for Otu files.

        After the parent class has set its metadata, and only when the dataset
        has data, the file is scanned to fill in:

        - ``data_lines``: number of lines with at least two fields whose first
          field does not start with '@'
        - ``columns``: the largest field count among those lines
        - ``labels``: sorted unique first-column values of those lines
        - ``otulabels``: sorted fields of the first line after its first two

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> from galaxy.util.bunch import Bunch
        >>> dataset = Bunch()
        >>> dataset.metadata = Bunch
        >>> otu = Otu()
        >>> dataset.file_name = get_test_fname( 'mothur_datatypetest_true.mothur.otu' )
        >>> dataset.has_data = lambda: True
        >>> otu.set_meta(dataset)
        >>> dataset.metadata.columns
        100
        >>> len(dataset.metadata.labels) == 37
        True
        >>> len(dataset.metadata.otulabels) == 98
        True
        """
        super(Otu, self).set_meta(dataset, overwrite=overwrite, **kwd)

        if not dataset.has_data():
            return

        # set otulabels: everything after the first two fields of the first line
        first_line = get_headers(dataset.file_name, sep='\t', count=1)
        otulabel_names = first_line[0][2:] if first_line else []

        # set label names and number of lines
        label_names = set()
        ncols = 0
        data_lines = 0
        for line in iter_headers(dataset.file_name, sep='\t', count=-1):
            if len(line) >= 2 and not line[0].startswith('@'):
                data_lines += 1
                ncols = max(ncols, len(line))
                label_names.add(line[0])

        # Set the discovered metadata values for the dataset.  The lists are
        # sorted before assignment so the result does not depend on the
        # metadata attribute handing back the very same list object.
        dataset.metadata.data_lines = data_lines
        dataset.metadata.columns = ncols
        dataset.metadata.labels = sorted(label_names)
        dataset.metadata.otulabels = sorted(otulabel_names)
Example #37
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a square distance matrix (Column-formatted distance matrix) format
        The first line has the number of sequences in the matrix.
        The following lines have the sequence name in the first column plus a column for the distance to each sequence
        in the row order in which they appear in the matrix.
               3
               U68589  0.0000  0.3371  0.3610
               U68590  0.3371  0.0000  0.3783
               U68590  0.3371  0.0000  0.3783

        Lines whose first field starts with '@' are ignored.  Returns False
        (instead of raising) when there are no usable lines or when the
        sequence count is not a positive integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.square.dist' )
        >>> SquareDistanceMatrix().sniff( fname )
        False
        """
        numlines = 300
        line_num = 0
        # Stays None until a count line has been parsed (empty / comment-only input)
        sequence_count = None
        for line in iter_headers(file_prefix, sep='\t', count=numlines):
            if line[0].startswith('@'):
                continue
            if line_num == 0:
                # first line should contain the number of sequences in the file
                if len(line) > 2:
                    return False
                try:
                    # joining tolerates a count that is preceded by a tab
                    sequence_count = int(''.join(line))
                except ValueError:
                    return False
                if sequence_count <= 0:
                    return False
            else:
                # number of fields should equal the number of sequences (plus the name)
                if len(line) != sequence_count + 1:
                    return False
                try:
                    # every field after the sequence name should be a float
                    for column in line[1:]:
                        float(column)
                except ValueError:
                    return False
            line_num += 1

        if sequence_count is None:
            return False

        # check if the number of lines in the file was as expected
        # NOTE(review): the `numlines + 1` arm presumably covers files longer than
        # the sampled prefix - confirm against iter_headers' `count` semantics
        return line_num == sequence_count + 1 or line_num == numlines + 1
Example #38
0
    def sniff(self, filename):
        """
        Determines whether the file is a frequency tabular format for chimera analysis
        #1.14.0
        0   0.000
        1   0.000
        ...
        155 0.975

        The first non-'@' line must be a single field starting with '#'; every
        later line must be an integer followed by a float written with a '.'.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
        >>> Frequency().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
        >>> Frequency().sniff( fname )
        False

        # Expression count matrix (EdgeR wrapper)
        >>> fname = get_test_fname( 'mothur_datatypetest_false_2.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        """
        seen = 0
        for row in iter_headers(filename, sep='\t'):
            if row[0].startswith('@'):
                continue
            if seen == 0:
                # first line should be #<version string>
                if len(row) != 1 or not row[0].startswith('#'):
                    return False
            else:
                # all other lines should be <int> <float>
                if len(row) != 2:
                    return False
                index_text, value_text = row
                try:
                    int(index_text)
                    float(value_text)
                except Exception:
                    return False
                # a value without a decimal point is not accepted as a frequency
                if '.' not in value_text:
                    return False
            seen += 1

        # need the version line plus at least one data line
        return seen > 1
Example #39
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a pairwise distance matrix (Column-formatted distance matrix) format
        The first and second columns have the sequence names and the third column is the distance between those sequences.

        Lines whose first field starts with '@' are ignored.  The file is
        accepted when more than two lines were checked, each of the first two
        columns contains at least one non-numeric value, and at least one
        distance is not an integer.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.pair.dist' )
        >>> PairwiseDistanceMatrix().sniff( fname )
        False
        """
        headers = iter_headers(file_prefix, sep='\t')
        count = 0
        names = [False, False]
        # Must start out True: it is only ever cleared below, and leaving it
        # unset raised UnboundLocalError for files whose distances are all integers.
        all_ints = True
        for line in headers:
            if line[0].startswith('@'):
                continue
            if len(line) != 3:
                return False
            # check if col3 contains distances (floats)
            try:
                float(line[2])
            except ValueError:
                return False
            try:
                # See if it's also an integer
                int(line[2])
            except ValueError:
                # At least one value is not an integer
                all_ints = False
            count += 1
            # check if col1 and col2 likely contain names
            for c in (0, 1):
                try:
                    float(line[c])
                except ValueError:
                    names[c] = True

        if not all(names):
            return False

        if count > 2:
            return not all_ints

        return False
Example #40
0
    def sniff(self, filename):
        """
        Determines whether the file is in html format

        True as soon as the first field of any inspected header line
        contains '<html>' (case-insensitively).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Html().sniff( fname )
        False
        >>> fname = get_test_fname( 'file.html' )
        >>> Html().sniff( fname )
        True
        """
        for fields in iter_headers(filename, None):
            if not fields:
                # nothing to inspect on this line
                continue
            if '<html>' in fields[0].lower():
                return True
        return False
Example #41
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is in html format

        True as soon as the first field of any inspected header line
        contains '<html>' (case-insensitively).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Html().sniff( fname )
        False
        >>> fname = get_test_fname( 'file.html' )
        >>> Html().sniff( fname )
        True
        """
        for fields in iter_headers(file_prefix, None):
            if not fields:
                # nothing to inspect on this line
                continue
            if '<html>' in fields[0].lower():
                return True
        return False
Example #42
0
    def sniff(self, filename):
        """
        Try to guess if the file is a InChI file.

        Up to 10 header lines are inspected; each must start with 'InChI='.
        A file that yields no lines at all is not considered InChI.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('drugbank_drugs.inchi')
        >>> InChI().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> InChI().sniff(fname)
        False
        """
        # Track that at least one line was checked so empty input is rejected
        found_lines = False
        for inchi in iter_headers(filename, sep=' ', count=10):
            if not inchi[0].startswith('InChI='):
                return False
            found_lines = True
        return found_lines
Example #43
0
    def sniff(self, filename):
        """
        Try to guess if the file is a InChI file.

        Up to 10 header lines are inspected; each must start with 'InChI='.
        A file that yields no lines at all is not considered InChI.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('drugbank_drugs.inchi')
        >>> InChI().sniff(fname)
        True

        >>> fname = get_test_fname('drugbank_drugs.cml')
        >>> InChI().sniff(fname)
        False
        """
        # Track that at least one line was checked so empty input is rejected
        found_lines = False
        for inchi in iter_headers(filename, sep=' ', count=10):
            if not inchi[0].startswith('InChI='):
                return False
            found_lines = True
        return found_lines
Example #44
0
    def sniff_prefix(self, file_prefix):
        """
        Determines whether the file is a quantiles tabular format for chimera analysis

        .. code-block::

            1   0   0   0   0   0   0
            2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
            3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
            ...

        Lines whose first field starts with '@' or '#' are skipped; every other
        line must have exactly 7 fields: an integer followed by six floats.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
        >>> Quantile().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
        >>> Quantile().sniff( fname )
        False
        """
        n_rows = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            if fields[0].startswith(('@', '#')):
                continue
            if len(fields) != 7:
                return False
            try:
                int(fields[0])
                # the remaining six columns are the quantile values
                for value in fields[1:]:
                    float(value)
            except Exception:
                return False
            n_rows += 1

        # at least one data row is required
        return n_rows > 0
Example #45
0
    def sniff(self, filename):
        """
        Determines whether the file is a frequency tabular format for chimera analysis
        #1.14.0
        0   0.000
        1   0.000
        ...
        155 0.975

        Lines whose first field starts with '@' are ignored.  The first
        remaining line must be a single field starting with '#'; each later
        line must be exactly an integer and a float.  At least one data line
        is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.freq' )
        >>> Frequency().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.freq' )
        >>> Frequency().sniff( fname )
        False
        """
        headers = iter_headers(filename, sep='\t')
        count = 0
        for line in headers:
            if line[0].startswith('@'):
                continue
            if count == 0:
                # first line should be #<version string>: reject it when it
                # lacks the '#' prefix OR has more than one field
                if not line[0].startswith('#') or len(line) != 1:
                    return False
            else:
                # all other lines should be <int> <float>
                if len(line) != 2:
                    return False
                try:
                    int(line[0])
                    float(line[1])
                except ValueError:
                    return False
            count += 1

        return count > 1
Example #46
0
 def sniff_prefix(self, file_prefix):
     """
     Determine whether the file looks like GFA 1.0.

     Lines whose first field starts with '#' are skipped.  An 'H' line
     decides the result immediately: it must consist of exactly 'H' and
     'VN:Z:1.0'.  'S', 'L', 'C' and 'P' lines are checked for a minimum
     field count, '+'/'-' in the 3rd and 5th fields ('L' and 'C'), and an
     integer 6th field ('C').  Any other line type, or a non-integer value
     where an integer is required, makes the sniff return False.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('big.gfa1')
     >>> Gfa1().sniff(fname)
     True
     >>> Gfa2().sniff(fname)
     False
     """
     found_valid_lines = False
     for line in iter_headers(file_prefix, "\t"):
         if line[0].startswith('#'):
             continue
         if line[0] == 'H':
             return len(line) == 2 and line[1] == 'VN:Z:1.0'
         elif line[0] == 'S':
             if len(line) < 3:
                 return False
         elif line[0] == 'L':
             if len(line) < 6:
                 return False
             for i in (2, 4):
                 if line[i] not in ('+', '-'):
                     return False
         elif line[0] == 'C':
             if len(line) < 7:
                 return False
             for i in (2, 4):
                 if line[i] not in ('+', '-'):
                     return False
             try:
                 int(line[5])
             except ValueError:
                 # a malformed integer means "not GFA1", not a crash
                 return False
         elif line[0] == 'P':
             if len(line) < 4:
                 return False
         else:
             return False
         found_valid_lines = True
     return found_valid_lines
Example #47
0
 def sniff_prefix(self, file_prefix):
     """
     Determine whether the file looks like PAF.

     Every inspected line must have at least 12 tab-separated fields, with
     integers in fields 2-4 and 7-12, '+' or '-' in field 5, and a 12th
     field in the range 0..255.  Any further fields must split into exactly
     three ':'-separated parts.  A non-integer value where an integer is
     required makes the sniff return False.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('A-3105.paf')
     >>> Paf().sniff(fname)
     True
     """
     found_valid_lines = False
     for line in iter_headers(file_prefix, "\t"):
         if len(line) < 12:
             return False
         try:
             for i in (1, 2, 3, 6, 7, 8, 9, 10):
                 int(line[i])
             last_required = int(line[11])
         except ValueError:
             # a malformed integer means "not PAF", not a crash
             return False
         if line[4] not in ('+', '-'):
             return False
         if not (0 <= last_required <= 255):
             return False
         # Check that the optional columns after the 12th contain SAM-like typed key-value pairs
         for optional in line[12:]:
             if len(optional.split(':')) != 3:
                 return False
         found_valid_lines = True
     return found_valid_lines
Example #48
0
    def sniff(self, filename, vals_are_int=False):
        """
        Determines whether the file is a otu (operational taxonomic unit)
        Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.2

        Lines whose first field starts with '@' are skipped.  Apart from an
        optional leading 'label' heading line, the third field must be an
        integer equal to the number of value columns, and each value must
        parse as int (when ``vals_are_int``) or float.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        False
        """
        parse_value = int if vals_are_int else float
        n_lines = 0
        for fields in iter_headers(filename, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 3:
                return False
            # only the very first line may be the column-heading line
            is_heading = n_lines == 0 and fields[0] == 'label'
            if not is_heading:
                try:
                    if int(fields[2]) + 3 != len(fields):
                        return False
                    for value in fields[3:]:
                        parse_value(value)
                except ValueError:
                    return False
            n_lines += 1
        return n_lines > 1
Example #49
0
    def sniff_prefix(self, file_prefix, vals_are_int=False):
        """
        Determines whether the file is a otu (operational taxonomic unit)
        Shared format
        label<TAB>group<TAB>count[<TAB>value(1..n)]
        The first line is column headings as of Mothur v 1.2

        Lines whose first field starts with '@' are skipped.  Apart from an
        optional leading 'label' heading line, the third field must be an
        integer equal to the number of value columns, and each value must
        parse as int (when ``vals_are_int``) or float.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.shared' )
        >>> GroupAbund().sniff( fname )
        False
        """
        parse_value = int if vals_are_int else float
        n_lines = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            if fields[0].startswith('@'):
                continue
            if len(fields) < 3:
                return False
            # only the very first line may be the column-heading line
            is_heading = n_lines == 0 and fields[0] == 'label'
            if not is_heading:
                try:
                    if int(fields[2]) + 3 != len(fields):
                        return False
                    for value in fields[3:]:
                        parse_value(value)
                except ValueError:
                    return False
            n_lines += 1
        return n_lines > 1
Example #50
0
    def sniff_prefix(self, file_prefix):
        """
        Decide whether the content looks like a quantiles tabular format for
        chimera analysis, e.g.
        1	0	0	0	0	0	0
        2       0.309198        0.309198        0.37161 0.37161 0.37161 0.37161
        3       0.510982        0.563213        0.693529        0.858939        1.07442 1.20608
        ...

        Rows whose first field starts with '@' or '#' are skipped.  Every
        other row must have exactly seven fields: an integer followed by six
        values that parse as float.  At least one such row is required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'mothur_datatypetest_true.mothur.quan' )
        >>> Quantile().sniff( fname )
        True
        >>> fname = get_test_fname( 'mothur_datatypetest_false.mothur.quan' )
        >>> Quantile().sniff( fname )
        False
        """
        rows_seen = 0
        for fields in iter_headers(file_prefix, sep='\t'):
            if fields[0].startswith(('@', '#')):
                continue
            if len(fields) != 7:
                return False
            try:
                # Conversions are done purely to validate the column types.
                int(fields[0])
                for raw in fields[1:]:
                    float(raw)
            except Exception:
                return False
            rows_seen += 1
        return rows_seen > 0