Exemple #1
0
 def test_topology_genbank(self):
     """Verify the annotations extracted from a GenBank LOCUS line."""
     # Deliberately low level: only the LOCUS line is fed to the scanner.
     # Each case is (LOCUS line, topology, molecule_type, data_file_division).
     cases = (
         ("LOCUS       U00096", None, None, None),
         # Fungal example, accession U49845 from Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN"),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT"),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT"),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV"),
     )
     for locus_line, want_topology, want_mol_type, want_division in cases:
         scanner = Scanner.GenBankScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, locus_line)
         # (annotation key, expected value, failure message template),
         # checked in this order so the first mismatch reported is stable.
         checks = (
             ("topology", want_topology,
              "Wrong topology %r not %r from %r"),
             ("molecule_type", want_mol_type,
              "Wrong molecule_type %r not %r from %r"),
             ("data_file_division", want_division,
              "Wrong division %r not %r from %r"),
         )
         for key, wanted, template in checks:
             found = consumer.data.annotations.get(key, None)
             self.assertEqual(
                 found, wanted, template % (found, wanted, locus_line))
Exemple #2
0
    def test_embl_cds_interaction(self):
        """Parse the CDS features of an EMBL file and check the first one."""
        scanner = Scanner.EmblScanner()

        # list() consumes the parser's result while the handle is still open.
        with open("EMBL/AE017046.embl") as handle:
            cds_records = list(scanner.parse_cds_features(handle))

        # Ten CDS features are expected in this file.
        self.assertEqual(len(cds_records), 10)

        # Identifier and description of the first CDS.
        first_cds = cds_records[0]
        self.assertEqual(first_cds.id, 'AAS58758.1')
        self.assertEqual(first_cds.description, 'putative transposase')
Exemple #3
0
    def test_embl_record_interaction(self):
        """Parse whole records from an EMBL file with features enabled."""
        scanner = Scanner.EmblScanner()

        # list() consumes the parser's result while the handle is still open.
        with open("EMBL/AE017046.embl") as handle:
            records = list(scanner.parse_records(handle, do_features=True))

        # Exactly one record is expected in this file.
        self.assertEqual(len(records), 1)

        record = records[0]
        expected_description = ('Yersinia pestis biovar Microtus '
                                'str. 91001 plasmid pPCP1, complete '
                                'sequence.')
        self.assertEqual(record.id, 'AE017046.1')
        self.assertEqual(record.description, expected_description)
        self.assertEqual(len(record.features), 29)
Exemple #4
0
 def test_topology_genbank(self):
     """Verify annotations and warnings produced by GenBank LOCUS lines."""
     # Deliberately low level: only the LOCUS line is fed to the scanner.
     # Each case is (LOCUS line, topology, molecule_type,
     # data_file_division, expected warning classes or None for "none").
     cases = (
         ("LOCUS       U00096",
          None, None, None, None),
         # Fungal example, accession U49845 from Saccharomyces cerevisiae:
         ("LOCUS       SCU49845     5028 bp    DNA             PLN       21-JUN-1999",
          None, "DNA", "PLN", None),
         ("LOCUS       AB070938                6497 bp    DNA     linear   BCT 11-OCT-2001",
          "linear", "DNA", "BCT", None),
         ("LOCUS       NC_005816               9609 bp    DNA     circular BCT 21-JUL-2008",
          "circular", "DNA", "BCT", None),
         ("LOCUS       SCX3_BUTOC                64 aa            linear   INV 16-OCT-2001",
          "linear", None, "INV", None),
         ("LOCUS       pEH010                  5743 bp    DNA     circular",
          "circular", "DNA", None, [BiopythonParserWarning]),
         # LOCUS line in the format that is more than 80 characters long
         ("LOCUS       AZZZAA02123456789 1000000000 bp    DNA     linear   PRI 15-OCT-2018",
          "linear", "DNA", "PRI", None),
     )
     for locus_line, want_topology, want_mol_type, want_division, \
             want_warnings in cases:
         # Record every warning raised while this one line is processed.
         with warnings.catch_warnings(record=True) as caught:
             warnings.simplefilter("always")
             scanner = Scanner.GenBankScanner()
             consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
             scanner._feed_first_line(consumer, locus_line)
             # (annotation key, expected value, failure message template)
             checks = (
                 ("topology", want_topology,
                  "Wrong topology %r not %r from %r"),
                 ("molecule_type", want_mol_type,
                  "Wrong molecule_type %r not %r from %r"),
                 ("data_file_division", want_division,
                  "Wrong division %r not %r from %r"),
             )
             for key, wanted, template in checks:
                 found = consumer.data.annotations.get(key, None)
                 self.assertEqual(found, wanted,
                                  template % (found, wanted, locus_line))
             # None stands for "no warnings at all".
             expected_categories = [] if want_warnings is None else want_warnings
             self.assertEqual(len(caught), len(expected_categories))
             for recorded, category in zip(caught, expected_categories):
                 self.assertEqual(recorded.category, category)
Exemple #5
0
 def test_topology_embl(self):
     """Verify the annotations extracted from an EMBL ID line."""
     # Deliberately low level: only the ID line is fed to the scanner.
     # Each case is (ID line, topology, molecule_type, data_file_division).
     cases = (
         # Modern examples with sequence version
         ("ID   X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.",
          "linear", "mRNA", "PLN"),
         ("ID   CD789012; SV 4; linear; genomic DNA; HTG; MAM; 500 BP.",
          "linear", "genomic DNA", "MAM"),
         # Counterpart of the U49845 GenBank example:
         ("ID   U49845; SV 1; linear; genomic DNA; STD; FUN; 5028 BP.",
          "linear", "genomic DNA", "FUN"),
         # Old examples:
         ("ID   BSUB9999   standard; circular DNA; PRO; 4214630 BP.",
          "circular", "DNA", "PRO"),
         ("ID   SC10H5 standard; DNA; PRO; 4870 BP.",
          None, "DNA", "PRO"),
         # Patent example from 2016-06-10
         # ftp://ftp.ebi.ac.uk/pub/databases/embl/patent/
         ("ID   A01679; SV 1; linear; unassigned DNA; PAT; MUS; 12 BP.",
          "linear", "unassigned DNA", "MUS"),
         # Old patent examples
         ("ID   NRP_AX000635; PRT; NR1; 15 SQ",
          None, None, "NR1"),
         ("ID   NRP0000016E; PRT; NR2; 5 SQ",
          None, None, "NR2"),
         # KIPO patent examples
         ("ID   DI500001       STANDARD;      PRT;   111 AA.",
          None, None, None),
         ("ID   DI644510   standard; PRT;  1852 AA.",
          None, None, None),
     )
     for id_line, want_topology, want_mol_type, want_division in cases:
         scanner = Scanner.EmblScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, id_line)
         # (annotation key, expected value, failure message template)
         checks = (
             ("topology", want_topology,
              "Wrong topology %r not %r from %r"),
             ("molecule_type", want_mol_type,
              "Wrong molecule_type %r not %r from %r"),
             ("data_file_division", want_division,
              "Wrong division %r not %r from %r"),
         )
         for key, wanted, template in checks:
             found = consumer.data.annotations.get(key, None)
             self.assertEqual(
                 found, wanted, template % (found, wanted, id_line))
Exemple #6
0
 def test_first_line_imgt(self):
     """Verify the annotations extracted from an IMGT ID line."""
     # Deliberately low level: only the ID line is fed to the scanner.
     # Each case is (ID line, topology, molecule_type, data_file_division).
     cases = (
         ("ID   HLA00001   standard; DNA; HUM; 3503 BP.",
          None, "DNA", "HUM"),
         ("ID   HLA00001; SV 1; standard; DNA; HUM; 3503 BP.",
          None, "DNA", "HUM"),
     )
     for id_line, want_topology, want_mol_type, want_division in cases:
         scanner = Scanner._ImgtScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, id_line)
         # (annotation key, expected value, failure message template)
         checks = (
             ("topology", want_topology,
              "Wrong topology %r not %r from %r"),
             ("molecule_type", want_mol_type,
              "Wrong molecule_type %r not %r from %r"),
             ("data_file_division", want_division,
              "Wrong division %r not %r from %r"),
         )
         for key, wanted, template in checks:
             found = consumer.data.annotations.get(key, None)
             self.assertEqual(
                 found, wanted, template % (found, wanted, id_line))
 def test_first_line_imgt(self):
     """Verify the annotations extracted from an IMGT ID line."""
     # Deliberately low level: only the ID line is fed to the scanner.
     # Each case is (ID line, topology, molecule_type, data_file_division).
     cases = (
         ("ID   HLA00001   standard; DNA; HUM; 3503 BP.",
          None, "DNA", "HUM"),
         ("ID   HLA00001; SV 1; standard; DNA; HUM; 3503 BP.",
          None, "DNA", "HUM"),
     )
     for id_line, want_topology, want_mol_type, want_division in cases:
         scanner = Scanner._ImgtScanner()
         consumer = GenBank._FeatureConsumer(1, GenBank.FeatureValueCleaner)
         scanner._feed_first_line(consumer, id_line)
         # (annotation key, expected value, failure message template)
         checks = (
             ("topology", want_topology,
              "Wrong topology %r not %r from %r"),
             ("molecule_type", want_mol_type,
              "Wrong molecule_type %r not %r from %r"),
             ("data_file_division", want_division,
              "Wrong division %r not %r from %r"),
         )
         for key, wanted, template in checks:
             found = consumer.data.annotations.get(key, None)
             self.assertEqual(found, wanted,
                              template % (found, wanted, id_line))
Exemple #8
0
class GenBankScannerTests(unittest.TestCase):
    """Exercise the GenBank and EMBL scanners against files on disk."""

    # A single scanner instance is shared by all GenBank tests below.
    gb_s = Scanner.GenBankScanner()

    def gb_to_l_cds_f(self, filename, tags2id=None):
        """Return the CDS features parsed from a GenBank file as a list.

        ``tags2id`` is forwarded to the scanner only when it is truthy;
        otherwise the scanner is called without it.
        """
        optional = {"tags2id": tags2id} if tags2id else {}
        with open(filename) as handle:
            return list(self.gb_s.parse_cds_features(handle, **optional))

    def gb_to_l_r(self, filename, do_features=False):
        """Return the records parsed from a GenBank file as a list."""
        with open(filename) as handle:
            return list(
                self.gb_s.parse_records(handle, do_features=do_features))

    def test_genbank_cds_interaction(self):
        """Check CDS feature parsing on GenBank files."""
        tags = ('gene', 'locus_tag', 'product')

        # NC_000932.gb without tags2id: 85 CDS features.
        cds = self.gb_to_l_cds_f("GenBank/NC_000932.gb")
        self.assertEqual(len(cds), 85)
        self.assertEqual(cds[0].id, 'NP_051037.1')
        self.assertEqual(cds[-1].id, 'NP_051123.1')

        # NC_005816.gb with the tag-to-ID mapping: 10 CDS features.
        cds = self.gb_to_l_cds_f("GenBank/NC_005816.gb", tags2id=tags)
        self.assertEqual(len(cds), 10)
        first = cds[0]
        self.assertEqual(first.id, '<unknown id>')
        self.assertEqual(first.name, 'YP_pPCP01')

        # Both files with the tag-to-ID mapping, concatenated: 95 features.
        combined = (
            self.gb_to_l_cds_f("GenBank/NC_000932.gb", tags2id=tags)
            + self.gb_to_l_cds_f("GenBank/NC_005816.gb", tags2id=tags)
        )
        self.assertEqual(len(combined), 95)
        first, last = combined[0], combined[-1]
        self.assertEqual(first.id, 'rps12')
        self.assertEqual(first.description, 'ribosomal protein S12')
        self.assertEqual(last.id, '<unknown id>')
        self.assertEqual(last.description, 'hypothetical protein')

    def test_genbank_interaction(self):
        """Check whole-record parsing on GenBank files."""
        nc_005816_description = ('Yersinia pestis biovar '
                                 'Microtus str. 91001 plasmid '
                                 'pPCP1, complete sequence')
        nc_000932_description = ('Arabidopsis thaliana chloroplast, '
                                 'complete genome')
        # (file, do_features, id, name, description, number of features)
        scenarios = (
            ("GenBank/NC_005816.gb", False,
             'NC_005816.1', 'NC_005816', nc_005816_description, 0),
            ("GenBank/NC_005816.gb", True,
             'NC_005816.1', 'NC_005816', nc_005816_description, 41),
            ("GenBank/NC_000932.gb", False,
             'NC_000932.1', 'NC_000932', nc_000932_description, 0),
            ("GenBank/NC_000932.gb", True,
             'NC_000932.1', 'NC_000932', nc_000932_description, 259),
        )
        for (filename, with_features, record_id, record_name,
             description, feature_count) in scenarios:
            records = self.gb_to_l_r(filename, do_features=with_features)
            # Each file holds exactly one record.
            self.assertEqual(len(records), 1)
            record = records[0]
            self.assertEqual(record.id, record_id)
            self.assertEqual(record.name, record_name)
            self.assertEqual(record.description, description)
            self.assertEqual(len(record.features), feature_count)

    def test_embl_cds_interaction(self):
        """Parse the CDS features of an EMBL file and check the first one."""
        scanner = Scanner.EmblScanner()

        # list() consumes the parser's result while the handle is still open.
        with open("EMBL/AE017046.embl") as handle:
            cds_records = list(scanner.parse_cds_features(handle))

        # Ten CDS features are expected in this file.
        self.assertEqual(len(cds_records), 10)
        first_cds = cds_records[0]
        self.assertEqual(first_cds.id, 'AAS58758.1')
        self.assertEqual(first_cds.description, 'putative transposase')

    def test_embl_record_interaction(self):
        """Parse whole records from an EMBL file with features enabled."""
        scanner = Scanner.EmblScanner()

        # list() consumes the parser's result while the handle is still open.
        with open("EMBL/AE017046.embl") as handle:
            records = list(scanner.parse_records(handle, do_features=True))

        # Exactly one record is expected in this file.
        self.assertEqual(len(records), 1)
        record = records[0]
        expected_description = ('Yersinia pestis biovar Microtus '
                                'str. 91001 plasmid pPCP1, complete '
                                'sequence.')
        self.assertEqual(record.id, 'AE017046.1')
        self.assertEqual(record.description, expected_description)
        self.assertEqual(len(record.features), 29)
def tab_parser(handle, quiet=False):
    """Parse a stand-alone feature table and return the consumer's record.

    Only feature lines are read: no record start is searched for, and no
    header or footer is parsed.  The feature key is taken from columns
    2-20 of a line, the location/qualifier text from column 21 onwards,
    and continuation lines must start with "FT" padded with spaces to 21
    characters.  Lines whose first token is "ID" or "source" are skipped.

    No sequence is available in such a file, so a placeholder made of "N"
    characters is stored, as long as the largest feature end position.

    Arguments:
     - handle - open handle positioned at the first line to parse.
     - quiet - when true, the scanner's debug messages are not printed.

    Returns ``consumer.data`` once the features have been fed to a
    ``_FeatureConsumer`` (presumably a SeqRecord - confirm in Bio.GenBank).

    Raises ValueError if a "//" marker is met inside the table or if a
    line carries no feature key.
    """
    import sys

    from Bio.GenBank import _FeatureConsumer
    from Bio.GenBank.utils import FeatureValueCleaner

    def Si_parse_tab_features(scanner, skip=False):
        """Return the list of features read from the scanner's handle.

        Each entry is the value returned by ``scanner.parse_feature`` for
        a feature key and its lines (per the scanner's own documentation
        a (key, location, qualifiers) tuple - confirm).  With ``skip``
        set, features are read past without being parsed and the returned
        list is empty.  Reading starts at ``scanner.line``; on return
        ``scanner.line`` holds the first line that was not consumed.
        """
        indent = scanner.FEATURE_QUALIFIER_INDENT
        spacer = scanner.FEATURE_QUALIFIER_SPACER
        features = []
        line = scanner.line
        # An empty string from readline() means end of file, which simply
        # ends the table.
        while line:
            if line[:scanner.HEADER_WIDTH].rstrip() in scanner.SEQUENCE_HEADERS:
                if scanner.debug:
                    print("Found start of sequence")
                break
            line = line.rstrip()
            if line == "//":
                raise ValueError(
                    "Premature end of features table, marker '//' found")
            if line in scanner.FEATURE_END_MARKERS:
                if scanner.debug:
                    print("Found end of features")
                line = scanner.handle.readline()
                break
            if line[2:indent].strip() == "":
                raise ValueError("Expected a feature qualifier in line '%s'" %
                                 line)
            if line.split()[0] in ["ID", "source"]:
                line = scanner.handle.readline()
                continue
            if skip:
                # Discard this feature together with its continuation lines.
                line = scanner.handle.readline()
                while line[:indent] == spacer:
                    line = scanner.handle.readline()
            else:
                # Build up a list of the lines making up this feature.
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
                line = scanner.handle.readline()
                # Blank lines in the midst of a feature are tolerated; the
                # "line and" guard stops at end of file.
                while line and (line[:indent] == spacer
                                or line.rstrip() == ""):
                    feature_lines.append(line[indent:].rstrip())
                    line = scanner.handle.readline()

                # Every feature gets a dummy /seq qualifier appended.
                feature_lines.append('/seq="N"')
                sys.stdout.flush()
                features.append(
                    scanner.parse_feature(feature_key, feature_lines))
        scanner.line = line

        return features

    def Si_feed(scanner, handle, consumer, do_features=True):
        """Feed the feature table and a dummy sequence into the consumer.

        Arguments:
         - scanner - scanner whose handle and current line are already set.
         - handle - unused here; reading goes through ``scanner.handle``.
         - consumer - the consumer that should be informed of events.
         - do_features - Boolean, should the features be parsed?

        Always returns True.
        """
        # Features only: the first line, header lines and footer of a full
        # record are deliberately not read.
        if do_features:
            scanner._feed_feature_table(
                consumer, Si_parse_tab_features(scanner, skip=False))
        else:
            Si_parse_tab_features(scanner, skip=True)  # ignore the data

        # There is no real sequence; feed a one-letter placeholder so the
        # record can be closed.
        sequence_string = "N"
        consumer.sequence(sequence_string)
        consumer.record_end("//")

        # Replace the placeholder with enough "N" to span all features.
        length = 0
        for feature in consumer.data.features:
            if feature.location.nofuzzy_end > length:
                length = feature.location.nofuzzy_end

        consumer.data.seq = "N" * length

        return True

    myscanner = Scanner.InsdcScanner()
    myscanner.set_handle(handle)

    myscanner.line = myscanner.handle.readline()
    myscanner.FEATURE_QUALIFIER_INDENT = 21
    myscanner.FEATURE_QUALIFIER_SPACER = "FT" + " " * (
        myscanner.FEATURE_QUALIFIER_INDENT - 2)

    # Debug output stays on by default and is switched off by quiet=True.
    myscanner.debug = not quiet

    consumer = _FeatureConsumer(use_fuzziness=1,
                                feature_cleaner=FeatureValueCleaner())

    Si_feed(myscanner, handle, consumer)

    return consumer.data
def tab_parser(handle, quiet=False):
    """Parse a stand-alone feature table and return the consumer's record.

    Only feature lines are read: no record start is searched for, and no
    header or footer is parsed.  The feature key is taken from columns
    2-20 of a line, the location/qualifier text from column 21 onwards,
    and continuation lines must start with "FT" padded with spaces to 21
    characters.

    No sequence is available in such a file, so a placeholder made of "N"
    characters is stored, as long as the largest feature end position.

    Arguments:
     - handle - open handle positioned at the first line to parse.
     - quiet - when true, the scanner's debug messages are not printed.

    Returns ``consumer.data`` once the features have been fed to a
    ``_FeatureConsumer`` (presumably a SeqRecord - confirm in Bio.GenBank).

    Raises ValueError if a "//" marker is met inside the table or if a
    line carries no feature key.
    """
    import sys

    def Drawer_parse_tab_features(scanner, skip=False):
        """Return the list of features read from the scanner's handle.

        Each entry is the value returned by ``scanner.parse_feature`` for
        a feature key and its lines.  With ``skip`` set, features are read
        past without being parsed and the returned list is empty.  Reading
        starts at ``scanner.line``; on return ``scanner.line`` holds the
        first line that was not consumed.
        """
        indent = scanner.FEATURE_QUALIFIER_INDENT
        spacer = scanner.FEATURE_QUALIFIER_SPACER
        features = []
        line = scanner.line
        # An empty string from readline() means end of file, which simply
        # ends the table.
        while line:
            if line[:scanner.HEADER_WIDTH].rstrip() in scanner.SEQUENCE_HEADERS:
                if scanner.debug:
                    print("Found start of sequence")
                break
            line = line.rstrip()
            if line == "//":
                raise ValueError(
                    "Premature end of features table, marker '//' found")
            if line in scanner.FEATURE_END_MARKERS:
                if scanner.debug:
                    print("Found end of features")
                line = scanner.handle.readline()
                break
            if line[2:indent].strip() == "":
                raise ValueError("Expected a feature qualifier in line '%s'" %
                                 line)

            if skip:
                # Discard this feature together with its continuation lines.
                line = scanner.handle.readline()
                while line[:indent] == spacer:
                    line = scanner.handle.readline()
            else:
                # Build up a list of the lines making up this feature.
                feature_key = line[2:indent].strip()
                feature_lines = [line[indent:]]
                line = scanner.handle.readline()
                # Blank lines in the midst of a feature are tolerated; the
                # "line and" guard keeps end of file (an empty string) from
                # being mistaken for such a blank line.
                while line and (line[:indent] == spacer
                                or line.rstrip() == ""):
                    feature_lines.append(line[indent:].rstrip())
                    line = scanner.handle.readline()

                # Every feature gets a dummy /seq qualifier appended.
                feature_lines.append('/seq="N"')
                sys.stdout.flush()
                features.append(
                    scanner.parse_feature(feature_key, feature_lines))
        scanner.line = line

        return features

    def Drawer_feed(scanner, handle, consumer, do_features=True):
        """Feed the feature table and a dummy sequence into the consumer.

        Arguments:
         - scanner - scanner whose handle and current line are already set.
         - handle - unused here; reading goes through ``scanner.handle``.
         - consumer - the consumer that should be informed of events.
         - do_features - Boolean, should the features be parsed?

        Always returns True.
        """
        if do_features:
            scanner._feed_feature_table(
                consumer, Drawer_parse_tab_features(scanner, skip=False))
        else:
            Drawer_parse_tab_features(scanner, skip=True)  # ignore the data

        # There is no real sequence; feed a one-letter placeholder so the
        # record can be closed.
        sequence_string = "N"
        consumer.sequence(sequence_string)
        consumer.record_end("//")

        # Replace the placeholder with enough "N" to span all features.
        length = 0
        for feature in consumer.data.features:
            if feature.location.nofuzzy_end > length:
                length = feature.location.nofuzzy_end

        consumer.data.seq = "N" * length

        return True

    myscanner = Scanner.InsdcScanner()
    myscanner.set_handle(handle)

    myscanner.line = myscanner.handle.readline()
    myscanner.FEATURE_QUALIFIER_INDENT = 21
    myscanner.FEATURE_QUALIFIER_SPACER = "FT" + " " * (
        myscanner.FEATURE_QUALIFIER_INDENT - 2)

    # Debug output stays on by default and is switched off by quiet=True.
    myscanner.debug = not quiet

    consumer = _FeatureConsumer(use_fuzziness=1,
                                feature_cleaner=FeatureValueCleaner())

    Drawer_feed(myscanner, handle, consumer)

    return consumer.data