Exemple #1
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """Return True when the prefix matches one of the Turtle directive patterns.

        The prefix is searched with TURTLE_PREFIX_PATTERN first and then with
        TURTLE_BASE_PATTERN; the first hit wins.
        """
        # e.g. @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
        if file_prefix.search(TURTLE_PREFIX_PATTERN) or file_prefix.search(TURTLE_BASE_PATTERN):
            return True
        return False
Exemple #2
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """Quick test on file headings.

     The content must begin with the four fixed column names.  It is then
     accepted when the last tab-separated column of the header line is
     ``fraction``, or when the prefix is truncated and contains nothing
     but that header line.
     """
     if not file_prefix.startswith("fcs_files\tcluster_id\tlabel\tfcs_names"):
         return False
     header_line = file_prefix.string_io().readline()
     last_column = header_line.strip().split("\t")[-1]
     if last_column == 'fraction':
         return True
     # The header itself may have been cut off by the prefix truncation.
     if file_prefix.truncated and file_prefix.string_io().read() == header_line:
         return True
     return False
Exemple #3
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Accept the prefix when its first non-empty, non-comment line is a
     ``>`` header followed directly by a line of whitespace-separated integers.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname( 'sequence.fasta' )
     >>> QualityScore454().sniff( fname )
     False
     >>> fname = get_test_fname( 'sequence.qual454' )
     >>> QualityScore454().sniff( fname )
     True
     """
     stream = file_prefix.string_io()
     for raw in stream:
         header = raw.strip()
         if not header or header.startswith('#'):
             continue  # blank lines and comments are skipped
         if not header.startswith('>'):
             # first meaningful line is not a header
             return False
         scores = stream.readline().strip()
         if scores == '' or scores.startswith('>'):
             return False
         try:
             for token in scores.split():
                 int(token)
         except Exception:
             return False
         return True
     return False
Exemple #4
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Accept the prefix when it consists of ``>`` header lines, each followed
     by a line of whitespace-separated integers, with every score line
     holding the same number of values.

     Blank lines and ``#`` comment lines are skipped.  True is returned as
     soon as more than 10 valid blocks were seen, otherwise once the prefix
     is exhausted with at least one valid block.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname( 'sequence.fasta' )
     >>> QualityScoreSOLiD().sniff( fname )
     False
     >>> fname = get_test_fname( 'sequence.qualsolid' )
     >>> QualityScoreSOLiD().sniff( fname )
     True
     """
     fh = file_prefix.string_io()
     readlen = None
     goodblock = 0
     for line in fh:
         line = line.strip()
         if not line or line.startswith('#'):
             continue  # only non-empty, non-comment lines are inspected
         if not line.startswith('>'):
             return False
         line = fh.readline().strip()
         if line == '' or line.startswith('>'):
             return False
         scores = line.split()
         try:
             for score in scores:
                 int(score)
         except ValueError:
             return False
         # SOLiD reads should be of the same length; checked explicitly
         # because an ``assert`` would be stripped when running with -O.
         if readlen is None:
             readlen = len(scores)
         elif len(scores) != readlen:
             return False
         goodblock += 1
         if goodblock > 10:
             return True
     return goodblock > 0
Exemple #5
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Determines whether the file is a velveth produced fasta format.
        The id line has 3 fields separated by tabs: sequence_name  sequence_index category::

          >SEQUENCE_0_length_35   1       1
          GGATATAGGGCCAACCCAACTCAACGGCCTGTCTT
          >SEQUENCE_1_length_35   2       1
          CGACGAATGACAGGTCACGAATTTGGCGGGGATTA

        Only the first non-empty line and the line right after it are examined.
        """
        stream = file_prefix.string_io()
        for raw in stream:
            header = raw.strip()
            if not header:
                continue  # leading blank lines are ignored
            if not header.startswith('>'):
                return False
            if re.match(r'>[^\t]+\t\d+\t\d+$', header) is None:
                return False
            # The line after the header must hold data: neither empty nor another header.
            following = stream.readline().strip()
            return not (following == '' or following.startswith('>'))
        return False
Exemple #6
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """Determines whether the file is blastxml

        The first three lines (ignoring surrounding whitespace) must be the
        XML 1.0 declaration, one of two known BlastOutput DOCTYPE lines, and
        the ``<BlastOutput>`` root tag.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
        >>> BlastXml().sniff(fname)
        True
        >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.blastxml')
        >>> BlastXml().sniff(fname)
        True
        >>> fname = get_test_fname('interval.interval')
        >>> BlastXml().sniff(fname)
        False
        """
        known_doctypes = (
            '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">',
            '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">',
        )
        handle = file_prefix.string_io()
        if handle.readline().strip() != '<?xml version="1.0"?>':
            return False
        if handle.readline().strip() not in known_doctypes:
            return False
        return handle.readline().strip() == '<BlastOutput>'
Exemple #7
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """ Determines whether the file is a SpectraST generated file.

     Two consecutive ``Msp.next_line_starts_with`` checks are made on the
     same stream: first for "Name:", then for "LibID:".
     """
     contents = file_prefix.string_io()
     name_check = Msp.next_line_starts_with(contents, "Name:")
     if not name_check:
         # short-circuit: the second check is not attempted
         return name_check
     return Msp.next_line_starts_with(contents, "LibID:")
Exemple #8
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
            Try to guess the Arff filetype.
            The prefix is accepted when an "@DATA" line is reached after both an
            "@RELATION" line and an "@ATTRIBUTE" line were seen (keywords are
            matched case-insensitively at the start of the stripped line).
        """
        seen_relation = False
        seen_attribute = False
        for index, raw in enumerate(file_prefix.string_io()):
            if index > 1000:
                # give up once the line index exceeds 1000
                return False
            # Blank lines yield an empty keyword and simply match nothing.
            keyword = raw.strip()[:20].upper()
            if keyword.startswith("@RELATION"):
                seen_relation = True
            elif keyword.startswith("@ATTRIBUTE"):
                seen_attribute = True
            elif keyword.startswith("@DATA") and seen_relation and seen_attribute:
                # @DATA should be the last data block
                return True
        return False
Exemple #9
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Determines whether the file is an amos assembly file format.
        The prefix is accepted as soon as any stripped line is exactly
        ``{RED``, ``{CTG`` or ``{TLE``.
        Example::

          {CTG
          iid:1
          eid:1
          seq:
          CCTCTCCTGTAGAGTTCAACCGA-GCCGGTAGAGTTTTATCA
          .
          qlt:
          DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD
          .
          {TLE
          src:1027
          off:0
          clr:618,0
          gap:
          250 612
          .
          }
          }
        """
        for raw in file_prefix.line_iterator():
            if not raw:
                return False  # EOF
            # A match implies the stripped line is non-empty and starts with '{'.
            if re.match(r'{(RED|CTG|TLE)$', raw.strip()):
                return True
        return False
Exemple #10
0
 def _has_root_element_in_prefix(self, file_prefix: FilePrefix, root):
     """Check whether the first line not starting with ``<?`` opens ``root``.

     :param file_prefix: object whose ``line_iterator()`` yields the lines to inspect
     :param root: element name, interpolated as-is into the regular expression
     :returns: True when that line starts with ``<root`` or ``<ns:root``;
               False otherwise, including when the prefix yields no lines
     """
     # Default for an empty prefix; otherwise ``line`` would be unbound below.
     line = ''
     for line in file_prefix.line_iterator():
         if not line.startswith('<?'):
             break
     # pattern match <root or <ns:root for any ns string
     pattern = r'^<(\w*:)?%s' % root
     return re.match(pattern, line) is not None
Exemple #11
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """Each file must have one or more data blocks.
        The start of a data block is defined by the keyword
        "data_" followed by an optional string for
        identification (e.g., "data_images").  Everything
        before the first "data_" keyword is ignored here.

        The decision is made on the first non-blank line after a
        "data_" line that is not itself a "data_" line: it must
        start with "loop_" or "_".

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('1.star')
        >>> Star().sniff(fname)
        True
        >>> fname = get_test_fname('interval.interval')
        >>> Star().sniff(fname)
        False
        """
        seen_data_keyword = False
        for raw in file_prefix.line_iterator():
            text = raw.strip()
            if not text:
                continue  # blank lines never decide anything
            if text.startswith("data_"):
                seen_data_keyword = True
            elif seen_data_keyword:
                # Lines within data blocks must start with loop_ or _.
                return text.startswith(("loop_", "_"))
        return False
Exemple #12
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """Determines whether the file is a Gifti file

        The first line must begin with an XML 1.0 declaration.  The file is
        then accepted when the second line is the GIFTI DOCTYPE, or when the
        third line begins with ``<GIFTI``.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('Human.colin.R.activations.label.gii')
        >>> Gifti().sniff(fname)
        True
        >>> fname = get_test_fname('interval.interval')
        >>> Gifti().sniff(fname)
        False
        >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
        >>> Gifti().sniff(fname)
        False
        >>> fname = get_test_fname('tblastn_four_human_vs_rhodopsin.blastxml')
        >>> Gifti().sniff(fname)
        False
        """
        handle = file_prefix.string_io()
        declaration = handle.readline().strip()
        if not declaration.startswith('<?xml version="1.0"'):
            return False
        second_line = handle.readline().strip()
        if second_line == '<!DOCTYPE GIFTI SYSTEM "http://www.nitrc.org/frs/download.php/1594/gifti.dtd">':
            return True
        # No DOCTYPE: fall back to looking for the root tag on the third line.
        third_line = handle.readline().strip()
        return third_line.startswith('<GIFTI')
Exemple #13
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Validate the first three lines and count "3 2" lines after them.

        * line 1: all tokens are integers; the first is the marker count
        * line 2: exactly four tokens, all parseable as floats
        * line 3: all tokens are integers, and both the token count and the
          marker count equal the last token
        * later lines: counted when their first two tokens are "3" and "2"

        Scanning stops after ``self.max_lines``; the result is True when at
        least one "3 2" line was counted.

        >>> classname = DataIn
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> extn_true = classname().file_ext
        >>> file_true = get_test_fname("linkstudies." + extn_true)
        >>> classname().sniff(file_true)
        True
        >>> false_files = list(LinkageStudies.test_files)
        >>> false_files.remove("linkstudies." + extn_true)
        >>> result_true = []
        >>> for fname in false_files:
        ...     file_false = get_test_fname(fname)
        ...     res = classname().sniff(file_false)
        ...     if res:
        ...         result_true.append(fname)
        >>>
        >>> result_true
        []
        """
        intermarkers = 0
        num_markers = None

        for lcount, line in enumerate(file_prefix.string_io()):
            if lcount > self.max_lines:
                break

            tokens = line.split()
            try:
                # The conversions are performed eagerly: a bare ``map(int, ...)``
                # is lazy in Python 3 and would never raise ValueError.
                if lcount == 0:
                    num_markers = int(tokens[0])
                    for token in tokens[1:]:
                        int(token)
                elif lcount == 1:
                    for token in tokens:
                        float(token)
                    if len(tokens) != 4:
                        return False
                elif lcount == 2:
                    for token in tokens:
                        int(token)
                    last_token = int(tokens[-1])
                    if len(tokens) != last_token:
                        return False
                    if num_markers != last_token:
                        return False
                elif tokens[0] == "3" and tokens[1] == "2":
                    intermarkers += 1
            except (ValueError, IndexError):
                # non-numeric token, or a line with too few tokens
                return False

        return intermarkers > 0
Exemple #14
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """ Determines whether the file is the correct XML type.

     The first stripped line that does not start with ``<?`` is searched
     for ``<root`` or ``<ns:root``, where ``root`` is ``self.root``.  A
     prefix that yields no lines is rejected.
     """
     # Default for an empty prefix; otherwise ``line`` would be unbound below.
     line = ''
     for line in file_prefix.line_iterator():
         line = line.strip()
         if not line.startswith('<?'):
             break
     # pattern match <root or <ns:root for any ns string
     pattern = r'<(\w*:)?%s' % self.root
     return re.search(pattern, line) is not None
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     The structure of a typical PLY file:
     Header, Vertex List, Face List, (lists of other elements)

     The decision is delegated to ``self._is_ply_header``, which receives
     the prefix as a text stream (decoding errors ignored) and ``self.subtype``.
     """
     stream = file_prefix.text_io(errors='ignore')
     if self._is_ply_header(stream, self.subtype):
         return True
     return False
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     VTK files can be either ASCII or binary, with two different
     styles of file formats: legacy or XML.  We'll assume if the
     file contains a valid VTK header, then it is a valid VTK file.

     The header check itself is done by ``self._is_vtk_header`` on a text
     stream of the prefix (decoding errors ignored) for ``self.subtype``.
     """
     stream = file_prefix.text_io(errors='ignore')
     header_ok = self._is_vtk_header(stream, self.subtype)
     return True if header_ok else False
Exemple #17
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Accept the prefix when it begins with the literal text "INFERNAL".

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname( 'infernal_model.cm' )
     >>> InfernalCM().sniff( fname )
     True
     >>> fname = get_test_fname( '2.txt' )
     >>> InfernalCM().sniff( fname )
     False
     """
     magic = "INFERNAL"
     return file_prefix.startswith(magic)
Exemple #18
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """Check the tab-separated header and the numeric columns of the first row.

     The stripped header cells must equal ``self.column_names``.  On the
     following line, cells 1-4 must parse as integers and cells 5-12 as
     floats; cells that are absent are not checked.
     """
     stream = file_prefix.string_io()
     header_cells = [cell.strip() for cell in stream.readline().split("\t")]
     if header_cells != self.column_names:
         return False
     first_row = stream.readline().split("\t")
     try:
         for cell in first_row[1:5]:
             int(cell)
         for cell in first_row[5:13]:
             float(cell)
     except ValueError:
         return False
     return True
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Neper tesr format startswith:***tesr

     Only the first (at most) 10 characters of the first line are read,
     with decoding errors ignored.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('test.neper.tesr')
     >>> NeperTesr().sniff(fname)
     True
     >>> fname = get_test_fname('test.neper.tess')
     >>> NeperTesr().sniff(fname)
     False
     """
     stream = file_prefix.text_io(errors='ignore')
     leading_chars = stream.readline(10)
     return leading_chars.startswith('***tesr')
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Gmsh msh format startswith:$MeshFormat

     The first line of the prefix is read as text with decoding errors ignored.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('test.gmsh.msh')
     >>> GmshMsh().sniff(fname)
     True
     >>> fname = get_test_fname('test.neper.tesr')
     >>> GmshMsh().sniff(fname)
     False
     """
     stream = file_prefix.text_io(errors='ignore')
     first_line = stream.readline()
     return first_line.startswith('$MeshFormat')
Exemple #21
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Determines whether the file is XML or not, by checking that the
        prefix begins with ``<?xml `` (including the trailing space).

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname( 'megablast_xml_parser_test1.blastxml' )
        >>> GenericXml().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval.interval' )
        >>> GenericXml().sniff( fname )
        False
        """
        xml_declaration_start = '<?xml '
        return file_prefix.startswith(xml_declaration_start)
Exemple #22
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Accept the prefix when its first line starts with ``# PEFF`` followed
     by a ``major.minor`` version number.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname( 'test.peff' )
     >>> PEFF().sniff( fname )
     True
     >>> fname = get_test_fname( 'sequence.fasta' )
     >>> PEFF().sniff( fname )
     False
     """
     first_line = file_prefix.string_io().readline()
     # The dot is escaped so that only a literal "." separates the two numbers.
     return re.match(r"# PEFF \d+\.\d+", first_line) is not None
Exemple #23
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """
     Checking for an opening 'Collection' or 'Image' tag (case-insensitive)
     in any line yielded by the prefix.

     >>> from galaxy.datatypes.sniff import get_test_fname
     >>> fname = get_test_fname('1.dzi')
     >>> Dzi().sniff(fname)
     True
     >>> fname = get_test_fname('megablast_xml_parser_test1.blastxml')
     >>> Dzi().sniff(fname)
     False
     """
     for raw in file_prefix.line_iterator():
         lowered = raw.lower()
         if '<collection' in lowered or '<image' in lowered:
             return True
     return False
Exemple #24
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
            Try to guess the Obo filetype.
            It usually starts with a "format-version:" string and has several stanzas which starts with "id:".

            The first line must start with "format-version:".  The prefix is
            accepted when some ``[...]`` stanza header is immediately followed
            by a line starting with "id:".
        """
        stanza = re.compile(r'^\[.*\]$')
        handle = file_prefix.string_io()
        first_line = handle.readline()
        if not first_line.startswith('format-version:'):
            return False

        for line in handle:
            if stanza.match(line.strip()):
                # a stanza needs to begin with an ID tag; the default keeps a
                # stanza header on the very last line from raising StopIteration
                if next(handle, '').startswith('id:'):
                    return True
        return False
Exemple #25
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        The use of ESTScan implies the creation of scores matrices which
        reflect the codons preferences in the studied organisms.  The
        ESTScan package includes scripts for generating these files.  The
        output of these scripts consists of the matrices, one for each
        isochor, and which look like this:

        FORMAT: hse_4is.conf CODING REGION 6 3 1 s C+G: 0 44
        -1 0 2 -2
        2 1 -8 0

        Every line must either start with FORMAT or consist of exactly four
        integers without any tab character; the first line must be a FORMAT
        line and at least three lines are required.

        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> fname = get_test_fname('test_space.txt')
        >>> Smat().sniff(fname)
        False
        >>> fname = get_test_fname('test_tab.bed')
        >>> Smat().sniff(fname)
        False
        >>> fname = get_test_fname('1.smat')
        >>> Smat().sniff(fname)
        True
        """
        integer = re.compile(r"[-+]?\d+$")
        line_no = 0
        for line_no, line in enumerate(file_prefix.string_io(), start=1):
            if line_no > 10000:
                return True
            if line.startswith('FORMAT'):
                continue
            if line_no == 1:
                # The first line is always the start of a format section.
                return False
            if '\t' in line:
                # Smat files are not tabular.
                return False
            items = line.split()
            if len(items) != 4:
                return False
            # Make sure each item is an integer.
            if not all(integer.match(item) for item in items):
                return False
        # Ensure at least a few matching lines are found.
        return line_no > 2
Exemple #26
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        After ``self.header_check`` accepts the stream, every following line
        (up to ``self.max_lines``) must have exactly five whitespace-separated
        fields: the second a float, the fourth an integer, and the first
        either an integer or a token starting with x, y or m (any case).

        >>> classname = MarkerMap
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> extn_true = classname().file_ext
        >>> file_true = get_test_fname("linkstudies." + extn_true)
        >>> classname().sniff(file_true)
        True
        >>> false_files = list(LinkageStudies.test_files)
        >>> false_files.remove("linkstudies." + extn_true)
        >>> result_true = []
        >>> for fname in false_files:
        ...     file_false = get_test_fname(fname)
        ...     res = classname().sniff(file_false)
        ...     if res:
        ...         result_true.append(fname)
        >>>
        >>> result_true
        []
        """
        stream = file_prefix.string_io()
        if not self.header_check(stream):
            return False

        for index, row_text in enumerate(stream):
            if index > self.max_lines:
                return True

            fields = row_text.split()
            if len(fields) != 5:
                return False
            chromosome, genetic_pos, _name, physical_pos, _row = fields

            try:
                float(genetic_pos)
                int(physical_pos)
            except ValueError:
                return False

            try:
                int(chromosome)
            except ValueError:
                # non-numeric chromosomes must start with x, y or m
                if chromosome.lower()[0] not in ('x', 'y', 'm'):
                    return False

        return True
Exemple #27
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        After ``self.header_check`` accepts the stream, every following line
        (up to ``self.max_lines``) must provide at least three tokens: an
        integer, a float, and a third value that is either "-inf" or a float.

        >>> classname = AllegroLOD
        >>> from galaxy.datatypes.sniff import get_test_fname
        >>> extn_true = classname().file_ext
        >>> file_true = get_test_fname("linkstudies." + extn_true)
        >>> classname().sniff(file_true)
        True
        >>> false_files = list(LinkageStudies.test_files)
        >>> false_files.remove("linkstudies." + extn_true)
        >>> result_true = []
        >>> for fname in false_files:
        ...     file_false = get_test_fname(fname)
        ...     res = classname().sniff(file_false)
        ...     if res:
        ...         result_true.append(fname)
        >>>
        >>> result_true
        []
        """
        stream = file_prefix.string_io()
        if not self.header_check(stream):
            return False

        for index, row_text in enumerate(stream):
            if index > self.max_lines:
                return True

            fields = row_text.split()
            if len(fields) < 3:
                return False
            first, second, third = fields[:3]

            try:
                int(first)
                float(second)
                if third != "-inf":
                    float(third)
            except ValueError:
                return False

        return True
Exemple #28
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """Accept a three-column delimited prefix whose first line is a header.

     The delimiter is obtained from ``self._parse_delimiter`` on the first
     stripped line.  Every line must split into exactly three fields.  The
     first line must be recognised by ``self._parse_header``; all other
     lines must satisfy ``self._parse_dataline``.
     """
     delimiter = None
     header = None
     for index, raw in enumerate(file_prefix.line_iterator()):
         text = raw.strip()
         if delimiter is None:
             delimiter = self._parse_delimiter(text)
             if delimiter is None:
                 return False
         fields = text.split(delimiter)
         if len(fields) != 3:
             return False
         if index == 0:
             header = self._parse_header(fields)
             if header is not None:
                 continue
         # Data rows, and a first line that was not recognised as a header.
         if not self._parse_dataline(fields):
             return False
     # An empty prefix or a missing header is rejected.
     return delimiter is not None and header is not None
Exemple #29
0
 def sniff_prefix(self, file_prefix: FilePrefix):
     """ Determines whether the file is the correct type.

     Blank lines are skipped.  Every other line must start with a prefix
     listed in ``self._sections`` or with "MTD".  Among the "MTD" lines a
     "mzTab-version" entry matching ``self._version_re`` is required, as is
     every key of ``self._man_mtd`` (with one of its allowed values when
     the mapping lists any).  An "MTD" line that lacks a column needed for
     these checks makes the sniff fail instead of raising IndexError.
     """
     has_version = False
     found_man_mtd = set()
     contents = file_prefix.string_io()
     for line in contents:
         if re.match(r"^\s*$", line):
             continue
         columns = line.strip("\r\n").split("\t")
         if columns[0] == "MTD":
             if len(columns) < 2:
                 return False  # no key column
             key = columns[1]
             # None marks a missing value column
             value = columns[2] if len(columns) > 2 else None
             if key == "mzTab-version":
                 if value is None:
                     return False
                 if re.match(self._version_re, value) is not None:
                     has_version = True
                     continue
             if key in self._man_mtd:
                 mandatory_field = self._man_mtd[key]
                 if mandatory_field is None:
                     # any value (even a missing one) satisfies this key
                     found_man_mtd.add(key)
                 elif value is None:
                     return False
                 elif value.lower() in mandatory_field:
                     found_man_mtd.add(key)
         elif not columns[0] in self._sections:
             return False
     return has_version and found_man_mtd == set(self._man_mtd.keys())
Exemple #30
0
    def sniff_prefix(self, file_prefix: FilePrefix):
        """
        Determines whether the file is a velveth produced RoadMap::
          142858  21      1
          ROADMAP 1
          ROADMAP 2
          ...

        The first non-empty line must consist of three tab-separated
        integers and the line right after it must read ``ROADMAP <number>``.
        """

        fh = file_prefix.string_io()
        for line in fh:
            line = line.strip()
            if not line:
                continue  # skip leading blank lines to reach the first non-empty line
            if not re.match(r'\d+\t\d+\t\d+$', line):
                return False
            # The next line.strip() should be 'ROADMAP 1'
            line = fh.readline().strip()
            return bool(re.match(r'ROADMAP \d+$', line))
        return False