def test_parse_fasta_no_header(self):
    """Test parsing a fasta record without header"""
    # plain 'r' instead of 'rU': universal newlines are already the default
    # for text mode in Python 3, and the 'U' flag is rejected from 3.11 on
    with open(get_file_path('no_header.fasta'), 'r') as h:
        # plain BioPython parsing should fail
        records = list(seqio.parse(h))
        self.assertEqual(0, len(records))
        # rewind so the second parse sees the file from the start again
        h.seek(0)
        # robust parsing should work
        records = list(seqio.parse(h, robust=True))
        self.assertEqual(1, len(records))
def test_prepeptide_adjustment(self):
    """Check that prepeptide leader/tail location qualifiers are shifted
    along with the feature when a region is written to genbank.
    """
    record = Record(Seq("A"*400, generic_dna))
    sub = DummySubRegion(start=100, end=300)
    record.add_subregion(sub)
    region = Region(subregions=[sub])
    record.add_region(region)

    # one simple and one compound part location, so that both location
    # types go through the adjustment
    leader_text = str(FeatureLocation(200, 210, 1))
    tail_text = str(CompoundLocation([FeatureLocation(220, 223, -1),
                                      FeatureLocation(227, 230, -1)]))
    prepeptide = DummyFeature(200, 230, 1, "CDS_motif")
    prepeptide._qualifiers["leader_location"] = [leader_text]
    prepeptide._qualifiers["tail_location"] = [tail_text]
    record.add_feature(prepeptide)
    # a second CDS_motif carrying neither qualifier (e.g. NRPS/PKS motif)
    # must not break the conversion
    record.add_feature(DummyFeature(250, 280, 1, "CDS_motif"))

    with NamedTemporaryFile(suffix=".gbk") as output:
        region.write_to_genbank(output.name)
        parsed = list(seqio.parse(output.name))
        bio = parsed[0]
    assert len(bio.features) == 4

    found = False
    for feature in bio.features:
        qualifiers = feature.qualifiers
        tail = qualifiers.get("tail_location")
        leader = qualifiers.get("leader_location")
        if not (tail and leader):
            continue
        # both parts are expected 100 bases earlier than they were defined
        assert leader == ["[100:110](+)"]
        assert tail == ["join{[120:123](-), [127:130](-)}"]
        found = True
    assert found, "prepeptide feature missing in conversion"
def _strict_parse(filename: str) -> List[SeqRecord]:
    """ Parses the input record with extra wrappers to catch biopython warnings
        as errors.

        Arguments:
            filename: the name of the file to parse

        Returns:
            a list of SeqRecords parsed

        Raises:
            AntismashInputError: if parsing raises any exception, including
                                 a warning matching one of the filters below
    """
    filter_messages = [
        r".*invalid location.*",
        r".*Expected sequence length.*",
        r".*Couldn't parse feature location.*",
    ]
    try:
        # catch_warnings snapshots the current filter list and restores it on
        # exit, so the temporary "error" filters are always removed and no
        # unrelated filter can be dropped by mistake
        with warnings.catch_warnings():
            # warnings matching these messages are raised as exceptions
            for pattern in filter_messages:
                warnings.filterwarnings("error", message=pattern)
            records = list(seqio.parse(filename))
    except Exception as err:
        message = str(err)
        # strip the "Ignoring" part, since it's not being ignored
        if message.startswith("Ignoring invalid location"):
            message = message[9:]
        logging.error('Parsing %r failed: %s', filename, message)
        raise AntismashInputError(message) from err
    return records
def test_genbank(self):
    """Write a region to a genbank file and check that the region rebuilt
    from that file matches the original, relative to the region start.
    """
    source = Record(Seq("A" * 100, generic_dna))

    clusters = [create_cluster(3, 20, "prodA"),
                create_cluster(25, 41, "prodB")]
    for item in clusters:
        source.add_cluster(item)

    sub = SubRegion(FeatureLocation(35, 71), "test", 0.7)
    source.add_subregion(sub)

    supercluster = SuperCluster(SuperCluster.kinds.NEIGHBOURING, clusters)
    source.add_supercluster(supercluster)

    region = Region(superclusters=[supercluster], subregions=[sub])
    source.add_region(region)

    with NamedTemporaryFile(suffix=".gbk") as output:
        region.write_to_genbank(output.name)
        bio = list(seqio.parse(output.name))
    assert len(bio) == 1

    rebuilt = Record.from_biopython(bio[0], taxon="bacteria")
    assert len(rebuilt.get_regions()) == 1

    new = rebuilt.get_region(0)
    # the expected coordinates are the original ones shifted by the region start
    assert new.location.start == 3 - region.location.start
    assert new.location.end == 71 - region.location.start
    assert new.products == region.products
    assert new.probabilities == region.probabilities
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1) -> List[Record]:
    """ Read the records of an input file and convert them to secmet Records

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all

        Returns:
            A list of secmet.Record instances, one for each record in the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    try:
        parsed = list(seqio.parse(filename))
    except Exception as err:
        logging.error('Parsing %r failed: %s', filename, err)
        raise AntismashInputError(str(err)) from err

    # records carrying any of these annotations are kept regardless of length
    always_kept = ('contig', 'wgs_scafld', 'wgs')
    records = []  # type: List[SeqRecord]
    records.extend(
        candidate for candidate in parsed
        if minimum_length < 1
        or len(candidate.seq) >= minimum_length
        or any(key in candidate.annotations for key in always_kept)
    )

    # if no records are left, that's a problem
    if not records:
        raise AntismashInputError("no valid records found in file %r" % filename)

    if any(isinstance(candidate.seq.alphabet, Bio.Alphabet.ProteinAlphabet)
           for candidate in records):
        raise AntismashInputError("protein records are not supported")

    # trimming has to happen on the biopython record, before conversion
    wants_trim = start > -1 or end > -1
    if wants_trim:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        only = records[0]
        records[0] = trim_sequence(only, max(start, 0), min(len(only), end))

    try:
        converted = [Record.from_biopython(candidate, taxon) for candidate in records]
    except SecmetInvalidInputError as err:
        raise AntismashInputError(str(err)) from err
    return converted
def parse_input_sequence(filename: str, taxon: str = "bacteria", minimum_length: int = -1,
                         start: int = -1, end: int = -1) -> List[Record]:
    """ Parse input records contained in a file

        Arguments:
            filename: the path of the file to read
            taxon: the taxon of the input, e.g. 'bacteria', 'fungi'
            minimum_length: records with length less than this will be ignored
                            if not positive, all records are included
            start: a start location for trimming the sequence, or -1 to use all
            end: an end location for trimming the sequence, or -1 to use all

        Returns:
            A list of secmet.Record instances, one for each record kept
            (empty if every record was shorter than minimum_length)

        Raises:
            TypeError: if minimum_length is not an int
            ValueError: if the file does not exist, or if start/end are given
                        while more than one record remains
            RuntimeError: if no records could be read from the file
    """
    logging.info('Parsing input sequence %r', filename)
    if not isinstance(minimum_length, int):
        raise TypeError("minimum_length must be an int")

    records = []
    if not os.path.exists(filename):
        msg = "Sequence file not found: %r" % filename
        logging.error(msg)
        raise ValueError(msg)

    try:
        record_list = list(seqio.parse(filename))
        if not record_list:
            raise RuntimeError('No records could be read from file %r' % filename)
        for record in record_list:
            # short records are dropped unless the annotations contain one of
            # the 'contig'/'wgs_scafld'/'wgs' keys
            if minimum_length < 1 \
                    or len(record.seq) >= minimum_length \
                    or 'contig' in record.annotations \
                    or 'wgs_scafld' in record.annotations \
                    or 'wgs' in record.annotations:
                records.append(record)
    except (ValueError, AssertionError) as err:
        logging.error('Parsing %r failed: %s', filename, err)
        raise
    except Exception as err:
        logging.error('Parsing %r failed with unhandled exception: %s', filename, err)
        raise

    # before conversion to secmet records, trim if required
    if start > -1 or end > -1:
        if len(records) > 1:
            raise ValueError("--start and --end options cannot be used with multiple records")
        # the length filter may have removed every record, in which case
        # there is nothing to trim and records[0] would raise an IndexError
        if records:
            records[0] = trim_sequence(records[0], max(start, 0), min(len(records[0]), end))

    return [Record.from_biopython(record, taxon) for record in records]
def test_parse_genbank(self):
    """Test parsing a gzipped GenBank file"""
    gz_path = get_file_path('melanin.gbk.gz')
    # the handle is opened in binary mode and handed to the parser as-is
    with open(gz_path, 'rb') as handle:
        parsed = list(seqio.parse(handle))
    self.assertEqual(1, len(parsed))
def test_parse_genbank_path(self):
    """Test parsing a gzipped GenBank file specified by path"""
    # the parser receives the path itself here, not an open handle
    parsed = list(seqio.parse(get_file_path('melanin.gbk.gz')))
    record_count = len(parsed)
    self.assertEqual(1, record_count)
def test_parse_fasta_valid(self):
    """Test parsing a valid fasta record"""
    # plain 'r' instead of 'rU': universal newlines are already the default
    # for text mode in Python 3, and the 'U' flag is rejected from 3.11 on
    with open(get_file_path('melanin.fasta'), 'r') as h:
        records = list(seqio.parse(h))
    self.assertEqual(1, len(records))
def test_parse_calls_biopython(self):
    """Test running the Bio.SeqIO parser"""
    expected_trace = " Called Bio.SeqIO.parse(DummyHandle('test.gbk'), 'genbank')"
    # swap in a mock for Bio.SeqIO.parse that reports its calls to self.tt
    mock("Bio.SeqIO.parse", tracker=self.tt, returns=[])
    seqio.parse(self.handle)
    assert_same_trace(self.tt, expected_trace)
def test_parse_genbank_valid(self):
    """Test parsing a valid genbank record"""
    # plain 'r' instead of 'rU': universal newlines are already the default
    # for text mode in Python 3, and the 'U' flag is rejected from 3.11 on
    with open(get_file_path('melanin.gbk'), 'r') as h:
        records = list(seqio.parse(h))
    self.assertEqual(1, len(records))
def test_parse_seqtype(self):
    """Test running the Bio.SeqIO parser with specified seqtype parameter"""
    expected_trace = " Called Bio.SeqIO.parse(DummyHandle('test.gbk'), 'embl')"
    # swap in a mock for Bio.SeqIO.parse that reports its calls to self.tt
    mock("Bio.SeqIO.parse", tracker=self.tt, returns=[])
    seqio.parse(self.handle, 'embl')
    assert_same_trace(self.tt, expected_trace)