def _set_after(self, location):
    """
    Mark the 3' end of a location as fuzzy.

    On the forward strand (strand >= 0) the end coordinate is wrapped in an
    AfterPosition; on the reverse strand the start coordinate is wrapped in
    a BeforePosition.  This is used to indicate that the mRNA does not
    include the stop codon.

    A multi-part location is modified in place (only its last part on the
    forward strand, or its first part on the reverse strand, is replaced)
    and the same object is returned.  A single-part location is left
    untouched and a new FeatureLocation is returned instead.
    """
    is_forward = location.strand >= 0
    is_compound = len(location.parts) > 1

    if is_compound:
        # Only the outermost part at the 3' end needs to become fuzzy.
        index = -1 if is_forward else 0
        part = location.parts[index]
        if is_forward:
            fuzzy_part = FeatureLocation(part.start,
                                         AfterPosition(part.end),
                                         strand=part.strand)
        else:
            fuzzy_part = FeatureLocation(BeforePosition(part.start),
                                         part.end,
                                         strand=part.strand)
        location.parts[index] = fuzzy_part
        return location

    if is_forward:
        return FeatureLocation(location.start,
                               AfterPosition(location.end),
                               strand=location.strand)
    return FeatureLocation(BeforePosition(location.start),
                           location.end,
                           strand=location.strand)
def setUp(self):
    """Create the 26-letter fake protein record shared by the tests.

    The record carries one "source" feature spanning the whole sequence
    plus three untyped features that mix exact and fuzzy positions.
    """
    whole_sequence = SeqFeature(
        FeatureLocation(0, 26),
        type="source",
        qualifiers={"mol_type": ["fake protein"]},
    )
    exact_end = SeqFeature(FeatureLocation(0, ExactPosition(10)))
    within_to_before = SeqFeature(
        FeatureLocation(
            WithinPosition(12, left=12, right=15),
            BeforePosition(22),
        )
    )
    after_to_one_of = SeqFeature(
        FeatureLocation(
            AfterPosition(16),
            OneOfPosition(26, [ExactPosition(25), AfterPosition(26)]),
        )
    )

    sequence = Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX", generic_protein)
    self.record = SeqRecord(
        sequence,
        id="TestID",
        name="TestName",
        description="TestDescr",
        dbxrefs=["TestXRef"],
        annotations={"k": "v"},
        letter_annotations={"fake": "X" * 26},
        features=[whole_sequence, exact_end, within_to_before, after_to_one_of],
    )
def get_locations(CDSs, start, end, strand):
    """Return the CDS location and the mRNA location, in that order.

    The CDS location has exact boundaries, while the mRNA location is open
    ended: its first start is a BeforePosition and its last end is an
    AfterPosition.

    Arguments:
        CDSs: sequence of (start, end) pairs in 1-based inclusive
              coordinates, one per coding segment
        start: 1-based start, used only when there is at most one segment
        end: 1-based end, used only when there is at most one segment
        strand: strand value passed through to every FeatureLocation

    Returns:
        a tuple (cdsloc, mrnaloc); both are CompoundLocations when more
        than one segment is given, plain FeatureLocations otherwise
    """
    # gff is 1-based, gb also, but sf is 0-based
    if len(CDSs) > 1:
        last_index = len(CDSs) - 1
        parts, mrnaparts = [], []
        for cdsi, (s, e) in enumerate(CDSs):
            parts.append(FeatureLocation(s - 1, e, strand=strand))
            # Only the outermost mRNA boundaries are fuzzy; inner segment
            # boundaries stay exact.
            mrna_start = BeforePosition(s - 1) if cdsi == 0 else s - 1
            mrna_end = AfterPosition(e) if cdsi == last_index else e
            mrnaparts.append(
                FeatureLocation(mrna_start, mrna_end, strand=strand))
        cdsloc = CompoundLocation(parts)
        # The mRNA must be built from its own (fuzzy) parts, not from the
        # exact CDS parts.
        mrnaloc = CompoundLocation(mrnaparts)
    else:
        cdsloc = FeatureLocation(start - 1, end, strand=strand)
        mrnaloc = FeatureLocation(BeforePosition(start - 1),
                                  AfterPosition(end),
                                  strand=strand)
    return cdsloc, mrnaloc
def test_after(self):
    """Features: write/read simple after locations."""
    # (start, end, strand, expected INSDC location string)
    cases = (
        (AfterPosition(5), 10, +1, ">6..10"),
        (AfterPosition(15), AfterPosition(20), +1, ">16..>20"),
        (25, AfterPosition(30), +1, "26..>30"),
        (AfterPosition(35), 40, -1, "complement(>36..40)"),
        (AfterPosition(45), AfterPosition(50), -1, "complement(>46..>50)"),
        (55, AfterPosition(60), -1, "complement(56..>60)"),
    )
    for start, end, strand, expected in cases:
        feature = SeqFeature(FeatureLocation(start, end),
                             strand=strand, type="CDS")
        self.assertEqual(_insdc_feature_location_string(feature), expected)
        self.record.features.append(feature)
    # Round-trip the record with all six features attached.
    self.write_read_check()
def construct_location(raw_start: str, raw_end: str, raw_strand: str,
                       attributes: Dict[str, str], strict: bool = False) -> FeatureLocation:
    """ Converts the raw sections of a GFF line into a FeatureLocation.
        Some attribute keys can modify the location's values.

        Arguments:
            raw_start: the start column of the GFF line (1-based)
            raw_end: the end column of the GFF line
            raw_strand: the strand column of the GFF line
            attributes: the attributes of the GFF line; when "partial" is
                        "true" and a "start_range" or "end_range" key is
                        present, those keys are consumed (removed from the
                        dictionary) and used to mark ambiguous ends
            strict: passed through to the strand interpretation

        Returns:
            a FeatureLocation with a 0-based start

        Raises:
            GFFParseError: if the start or end is not an integer, or if
                           either converted coordinate is negative
    """
    try:
        start = ExactPosition(int(raw_start) - 1)  # 0-indexed as FeatureLocation expects
        end = ExactPosition(int(raw_end))
    except ValueError as err:
        # chain the cause so the offending value stays visible in tracebacks
        raise GFFParseError("Invalid location values: %s" % str(err)) from err

    if start < 0 or end < 0:
        raise GFFParseError("Invalid location values: %s, %s" % (raw_start, raw_end))

    strand = interpret_strand(raw_strand, strict=strict)

    # handle ambiguous positions as noted in attributes
    has_range = "start_range" in attributes or "end_range" in attributes
    if attributes.get("partial") == "true" and has_range:
        attributes.pop("partial")
        # a missing range falls back to the exact coordinates, which never
        # start or end with ".", so that side stays exact
        exact_range = "%s,%s" % (start, end)
        start_range = attributes.pop("start_range", exact_range)
        end_range = attributes.pop("end_range", exact_range)
        if start_range.startswith("."):
            start = BeforePosition(int(start))
        if end_range.endswith("."):
            end = AfterPosition(int(end))

    return FeatureLocation(start, end, strand)
def _get_feature(line):
    """Build a CRISPR ExtendedFeature from one whitespace-separated line.

    Columns 3 and 5 of the line are read as the start and end coordinates.
    The coordinates are matched against exactly one entry of
    ``self.seq_info`` (with a tolerance of ``INTERVAL`` on both sides) and
    then made relative to that entry.  Ends that reach past the entry are
    clamped and marked as fuzzy.
    """
    # modified at 2017.8.15. Suppress error when a crispr is detected at
    # the edge of a contig.
    fields = line.strip().split()
    start, end = int(fields[3]), int(fields[5])

    margin = self.__class__.INTERVAL
    candidates = [
        entry for entry in self.seq_info
        if entry.start - margin <= start and end <= entry.end + margin
    ]
    # Without the margin the filter was:
    #   x.start <= start and end <= x.end
    assert len(candidates) == 1
    crt_seq = candidates[0]

    # Convert to coordinates relative to the matched sequence.
    rel_start = start - crt_seq.start
    rel_end = end - crt_seq.start + 1

    if rel_start <= 0:
        start_position = BeforePosition(0)
    else:
        start_position = rel_start
    if rel_end >= crt_seq.length:
        end_position = AfterPosition(crt_seq.length)
    else:
        end_position = rel_end

    return ExtendedFeature(
        location=FeatureLocation(start_position, end_position, strand=1),
        type="CRISPR",
        seq_id=crt_seq.id,
    )
def get_feature_location(self):
    """Return this object's coordinates as a FeatureLocation.

    ``self.left`` is shifted down by one because the coordinate is 0-based
    in the Biopython object; ``self.right`` is used unchanged.  An end
    flagged as partial becomes a BeforePosition (left) or AfterPosition
    (right) instead of an ExactPosition.
    """
    zero_based_left = self.left - 1
    if self.left_partial:
        start = BeforePosition(zero_based_left)
    else:
        start = ExactPosition(zero_based_left)

    if self.right_partial:
        end = AfterPosition(self.right)
    else:
        end = ExactPosition(self.right)

    return FeatureLocation(start, end, strand=self.strand)
def parse_position(string: str):
    """ Converts a position from a string into a Position subclass

        Arguments:
            string: "<N" for a BeforePosition, ">N" for an AfterPosition,
                    the literal "UnknownPosition()" for an UnknownPosition,
                    or a plain integer for an ExactPosition

        Returns:
            the matching Position instance

        Raises:
            ValueError: if the numeric part is not a valid integer; this
                        includes the empty string
    """
    # startswith() instead of string[0] so that an empty string is reported
    # as a ValueError like every other malformed input
    if string.startswith('<'):
        return BeforePosition(int(string[1:]))
    if string.startswith('>'):
        return AfterPosition(int(string[1:]))
    if string == "UnknownPosition()":
        return UnknownPosition()
    return ExactPosition(int(string))
def test_multiple(self):
    """Check sequences that contain more than one ORF."""
    spacer = "N" * 60

    # start, stop, start, stop
    expected = [
        FeatureLocation(ExactPosition(0), ExactPosition(66), strand=1),
        FeatureLocation(ExactPosition(66), ExactPosition(132), strand=1),
    ]
    two_complete = "ATG" + spacer + "TAGGTG" + spacer + "TGA"
    self.run_both_dirs(expected, two_complete)

    # start, stop, start
    expected[1] = FeatureLocation(ExactPosition(66), AfterPosition(69),
                                  strand=1)
    second_unfinished = "ATG" + spacer + "TAGGTG"
    self.run_both_dirs(expected, second_unfinished)

    # stop, start
    expected = [
        FeatureLocation(BeforePosition(0), ExactPosition(3), strand=1),
        FeatureLocation(ExactPosition(3), AfterPosition(9), strand=1),
    ]
    self.run_both_dirs(expected, "TAGGTGNNN")
def getLocation(self, left, right, strand, partial_flag="00"):
    """Build a FeatureLocation from 1-based coordinates.

    partial_flag is a two-character string; the first character describes
    the left end and the second the right end, "1" meaning "missing":
    {00: both ends existing, 10: left-end missing, 01: right-end missing,
    11: both ends missing}.

    strand is mapped to 1 when it equals "+", "1" or 1, and to -1 for any
    other value.  The left coordinate is shifted down by one, the right
    coordinate is used as given.
    """
    if strand == "+" or strand == "1" or strand == 1:
        strand = 1
    else:
        strand = -1

    left_missing = partial_flag[0] == "1"
    if left_missing:
        leftPosition = BeforePosition(int(left) - 1)
    else:
        leftPosition = ExactPosition(int(left) - 1)

    right_missing = partial_flag[1] == "1"
    if right_missing:
        rightPosition = AfterPosition(int(right))
    else:
        rightPosition = ExactPosition(int(right))

    return FeatureLocation(leftPosition, rightPosition, strand=strand)
def test_after_position(self):
    """A location with a fuzzy end keeps both position types and values."""
    original = FeatureLocation(ExactPosition(1), AfterPosition(6), strand=1)
    converted = self.convert(original)

    converted_start = converted.start
    assert isinstance(converted_start, ExactPosition)
    assert converted_start == 1

    converted_end = converted.end
    assert isinstance(converted_end, AfterPosition)
    assert converted_end == 6
def test_location_tostr_from_featloc():
    '''Input is an iterable of Biopython FeatureLocation (can be Compound).'''
    reverse_exact = FeatureLocation(5, 10, strand=-1)
    unstranded_fuzzy = FeatureLocation(20, AfterPosition(30), strand=0)
    compound = reverse_exact + unstranded_fuzzy

    results = location_tostr([reverse_exact, unstranded_fuzzy, compound])

    # One item is produced per input location, in order; the compound
    # location yields a list with one string per part.
    expected_items = (
        '[5:10](-)',
        '[20:>30](?)',
        ['[5:10](-)', '[20:>30](?)'],
    )
    for expected in expected_items:
        assert next(results) == expected

    # Nothing is left after the three inputs.
    with pytest.raises(StopIteration):
        next(results)
def test_fuzzy_join(self):
    """Features: write/read fuzzy join locations."""
    # Forward strand, open at both outer ends.
    feature = self.make_join_feature([
        SeqFeature(FeatureLocation(BeforePosition(10), 20), strand=+1),
        SeqFeature(FeatureLocation(25, AfterPosition(40)), strand=+1),
    ])
    self.record.features.append(feature)
    self.assertEqual(_insdc_feature_location_string(feature),
                     "join(<11..20,26..>40)")

    # Forward strand CDS with a one-of start and a within end.
    one_of_start = OneOfPosition([ExactPosition(107), ExactPosition(110)])
    feature = self.make_join_feature(
        [
            SeqFeature(FeatureLocation(one_of_start, 120), strand=+1),
            SeqFeature(FeatureLocation(125, 140), strand=+1),
            SeqFeature(FeatureLocation(145, WithinPosition(150, 10)),
                       strand=+1),
        ],
        "CDS",
    )
    self.assertEqual(_insdc_feature_location_string(feature),
                     "join(one-of(108,111)..120,126..140,146..(150.160))")
    self.record.features.append(feature)

    # Reverse strand gene with a before start and a within end.
    feature = self.make_join_feature(
        [
            SeqFeature(FeatureLocation(BeforePosition(210), 220), strand=-1),
            SeqFeature(FeatureLocation(225, WithinPosition(240, 4)),
                       strand=-1),
        ],
        "gene",
    )
    self.assertEqual(_insdc_feature_location_string(feature),
                     "complement(join(<211..220,226..(240.244)))")
    self.record.features.append(feature)

    # Reverse strand CDS mixing after, one-of and within positions.
    one_of_end = OneOfPosition([ExactPosition(340), ExactPosition(337)])
    feature = self.make_join_feature(
        [
            SeqFeature(FeatureLocation(AfterPosition(310), 320), strand=-1),
            SeqFeature(FeatureLocation(325, one_of_end), strand=-1),
            SeqFeature(FeatureLocation(345, WithinPosition(350, 5)),
                       strand=-1),
        ],
        "CDS",
    )
    self.assertEqual(
        _insdc_feature_location_string(feature),
        "complement(join(>311..320,326..one-of(340,337),346..(350.355)))")
    self.record.features.append(feature)

    self.write_read_check()
def create_1_part_seqfeature(start=0, stop=0, strand=1, type="",
                             fuzzy="neither", qualifiers=None):
    """Constructs simple BioPython SeqFeature.

    Start = int
    Stop = int
    Strand = int (-1, 1)
    Type = 'CDS', 'Source', 'tRNA', etc.
    Fuzzy = 'start', 'stop', 'both', or 'neither'; any other value is
        handled like 'neither'
    Qualifiers = dictionary of feature descriptions."""
    # A fuzzy start is a BeforePosition and a fuzzy stop an AfterPosition;
    # every other boundary is exact.
    if fuzzy == "start" or fuzzy == "both":
        start_position = BeforePosition(start)
    else:
        start_position = ExactPosition(start)

    if fuzzy == "stop" or fuzzy == "both":
        stop_position = AfterPosition(stop)
    else:
        stop_position = ExactPosition(stop)

    location = FeatureLocation(start_position, stop_position, strand=strand)
    return SeqFeature(location, type=type, qualifiers=qualifiers)
def test_start_before_end(self):
    """A location whose end lies before its start is rejected."""
    expected = "must be greater than or equal to start location"
    invalid_arguments = (
        (42, 23, 1),
        (42, 0, 1),
        (BeforePosition(42), AfterPosition(23), -1),
        (42, AfterPosition(0), 1),
    )
    for arguments in invalid_arguments:
        with self.assertRaises(ValueError) as err:
            FeatureLocation(*arguments)
        self.assertIn(expected, str(err.exception))

    # Features with UnknownPositions should pass check
    FeatureLocation(42, UnknownPosition())
    FeatureLocation(UnknownPosition(), 42)

    # Same start and end should pass check
    FeatureLocation(42, 42)
def test_eq_identical(self):
    """Test two identical locations are equal."""
    # Each entry holds the constructor arguments of the two locations that
    # are expected to compare equal.
    argument_pairs = (
        ((23, 42, 1), (23, 42, 1)),
        ((23, 42, -1), (23, 42, -1)),
        # fuzzy and exact positions with the same values
        ((BeforePosition(23), AfterPosition(42), 1), (23, 42, 1)),
        ((23, 42, 1, "foo", "bar"), (23, 42, 1, "foo", "bar")),
    )
    for first_arguments, second_arguments in argument_pairs:
        loc1 = FeatureLocation(*first_arguments)
        loc2 = FeatureLocation(*second_arguments)
        self.assertEqual(loc1, loc2)
def getgenefromgbk(gbkfile, location):  # change to work with locations
    """parses a genesequence from a gbk file using the gene location

    parameters
    ----------
    gbkfile
        string, path to gbk file + file
    location
        string of four comma-separated fields:
        "scaffold_number,start,end,strand".  start may carry a "<" and end
        a ">" to mark a fuzzy boundary; strand must be parseable as an int.

    returns
    ----------
    ret = DNA sequence of housekeepinggene from featurelocation coordinates,
          or "" when no record matches the scaffold number
    abs_loc = validation, contains the location of HG on specific scaffold.
          [scaffold, start, end], or None when no record matches the
          scaffold number
    """
    scaff_number, start, end, strand = location.split(",")
    scaff_number = int(scaff_number)

    # Making the FeatureLocation
    if "<" in start:
        f_start = BeforePosition(int(start.strip("<")))
    else:
        f_start = ExactPosition(int(start))
    if ">" in end:
        f_end = AfterPosition(int(end.strip(">")))
    else:
        f_end = ExactPosition(int(end))
    f = FeatureLocation(f_start, f_end, int(strand))

    ret = ""
    # Stays None when no scaffold matches, so the return below never hits
    # an unbound name.
    abs_loc = None
    for record in SeqIO.parse(gbkfile, "genbank"):
        record_no = record.name.split(".")[0]
        scaff_check = int(record_no[-3:])  # = scaffold number
        if scaff_check != scaff_number:
            continue
        # The DNA sequence of the housekeepinggene
        ret = f.extract(record.seq)
        # VALIDATION: plain integer coordinates without fuzzy markers.
        # start/end are kept as strings so they are never rebound.
        plain_start = int(start.replace(">", "").replace("<", ""))
        plain_end = int(end.replace(">", "").replace("<", ""))
        abs_loc = [scaff_number, plain_start, plain_end]
        # The first matching scaffold is the one reported.
        break
    return (ret, abs_loc)
def test_fuzzy(self):
    """Test fuzzy representations."""
    # check the positions alone
    exact_pos = ExactPosition(5)
    within_pos_s = WithinPosition(10, left=10, right=13)
    within_pos_e = WithinPosition(13, left=10, right=13)
    between_pos_e = BetweenPosition(24, left=20, right=24)
    before_pos = BeforePosition(15)
    after_pos = AfterPosition(40)

    ranged_positions = (
        (within_pos_s, 10, "(10.13)"),
        (within_pos_e, 13, "(10.13)"),
        (between_pos_e, 24, "(20^24)"),
    )
    for position, as_int, as_text in ranged_positions:
        self.assertEqual(int(position), as_int)
        self.assertEqual(str(position), as_text)
    self.assertEqual(str(before_pos), "<15")
    self.assertEqual(str(after_pos), ">40")

    # put these into Locations
    location1 = FeatureLocation(exact_pos, within_pos_e)
    location2 = FeatureLocation(before_pos, between_pos_e)
    location3 = FeatureLocation(within_pos_s, after_pos)

    # (location, str(location), str(start), str(end))
    text_forms = (
        (location1, "[5:(10.13)]", "5", "(10.13)"),
        (location2, "[<15:(20^24)]", "<15", "(20^24)"),
        (location3, "[(10.13):>40]", "(10.13)", ">40"),
    )
    for location, whole, start_text, end_text in text_forms:
        self.assertEqual(str(location), whole)
        self.assertEqual(str(location.start), start_text)
        self.assertEqual(str(location.end), end_text)

    # --- test non-fuzzy representations
    plain_bounds = (
        (location1, 5, 13),
        (location2, 15, 24),
        (location3, 10, 40),
    )
    for location, plain_start, plain_end in plain_bounds:
        self.assertEqual(location.nofuzzy_start, plain_start)
        self.assertEqual(location.nofuzzy_end, plain_end)
def _strip_var_seq_spaces(description):
    """Remove spaces that line carryover put inside "X -> Y" sequences.

    During line carryover, the sequences in VARSPLIC/VAR_SEQ can get
    mangled with unwanted spaces like:
    'DISSTKLQALPSHGLESIQT -> PCRATGWSPFRRSSPC LPTH'
    A description that does not split into exactly two parts around
    " -> " is returned unchanged.
    """
    try:
        first_seq, second_seq = description.split(" -> ")
    except ValueError:
        return description
    extra_info = ""
    # we might have more information at the end of the
    # second sequence, which should be in parenthesis
    extra_info_pos = second_seq.find(" (")
    if extra_info_pos != -1:
        extra_info = second_seq[extra_info_pos:]
        second_seq = second_seq[:extra_info_pos]
    # now clean spaces out of the first and second string
    first_seq = first_seq.replace(" ", "")
    second_seq = second_seq.replace(" ", "")
    # reassemble the description
    return first_seq + " -> " + second_seq + extra_info


def _read_ft(record, line):
    """Parse one FT (feature table) line and update record.features.

    A line with a feature key in columns 5-13 starts a new feature, which
    is appended to record.features.  Any other line continues the last
    feature in record.features: it either sets the feature id, starts a
    qualifier, or extends the text of the most recent qualifier.

    Both the old fixed-column layout and the new "from..to" layout with
    /qualifier="value" continuation lines are handled.
    """
    name = line[5:13].rstrip()
    if name:
        # Columns 13-21 are entirely blank only in the new layout; the old
        # layout has the right-justified start coordinate there.
        if line[13:21] == " " * 8:  # new-style FT line
            location = line[21:80].rstrip()
            try:
                isoform_id, location = location.split(":")
            except ValueError:
                isoform_id = None
            try:
                from_res, to_res = location.split("..")
            except ValueError:
                # single-residue feature
                from_res = location
                to_res = ""
            qualifiers = {}
        else:  # old-style FT line
            from_res = line[14:20].lstrip()
            to_res = line[21:27].lstrip()
            isoform_id = None
            description = line[34:75].rstrip()
            qualifiers = {"description": description}
        if from_res == "?":
            from_res = UnknownPosition()
        elif from_res.startswith("?"):
            position = int(from_res[1:]) - 1  # Python zero-based counting
            from_res = UncertainPosition(position)
        elif from_res.startswith("<"):
            position = int(from_res[1:]) - 1  # Python zero-based counting
            from_res = BeforePosition(position)
        else:
            position = int(from_res) - 1  # Python zero-based counting
            from_res = ExactPosition(position)
        if to_res == "":
            position = from_res + 1
            to_res = ExactPosition(position)
        elif to_res == "?":
            to_res = UnknownPosition()
        elif to_res.startswith("?"):
            position = int(to_res[1:])
            to_res = UncertainPosition(position)
        elif to_res.startswith(">"):
            position = int(to_res[1:])
            to_res = AfterPosition(position)
        else:
            position = int(to_res)
            to_res = ExactPosition(position)
        location = FeatureLocation(from_res, to_res, ref=isoform_id)
        feature = FeatureTable(
            location=location, type=name, id=None, qualifiers=qualifiers
        )
        record.features.append(feature)
        return
    # this line is a continuation of the previous feature
    feature = record.features[-1]
    # In the old layout the text starts at column 34, so columns 5-34 are
    # entirely blank; in the new layout the text starts at column 21.
    if line[5:34] == " " * 29:  # old-style FT line
        description = line[34:75].rstrip()
        if description.startswith("/FTId="):
            # store the FTId as the feature ID
            feature.id = description[6:].rstrip(".")
            return
        # this line is a continuation of the description of the previous feature
        old_description = feature.qualifiers["description"]
        if old_description.endswith("-"):
            description = "%s%s" % (old_description, description)
        else:
            description = "%s %s" % (old_description, description)
        if feature.type in ("VARSPLIC", "VAR_SEQ"):  # special case
            description = _strip_var_seq_spaces(description)
        feature.qualifiers["description"] = description
    else:  # new-style FT line
        value = line[21:].rstrip()
        if value.startswith("/id="):
            value = value[4:]
            assert value.startswith('"')
            assert value.endswith('"')
            feature.id = value[1:-1]
            return
        elif value.startswith("/evidence="):
            value = value[10:]
            assert value.startswith('"')
            if value.endswith('"'):
                value = value[1:-1]
            else:  # continues on the next line
                value = value[1:]
            assert "evidence" not in feature.qualifiers
            feature.qualifiers["evidence"] = value
            return
        elif value.startswith("/note="):
            value = value[6:]
            assert value.startswith('"')
            if value.endswith('"'):
                value = value[1:-1]
            else:  # continues on the next line
                value = value[1:]
            assert "note" not in feature.qualifiers
            feature.qualifiers["note"] = value
            return
        # this line is a continuation of the description of the previous feature
        # (the most recently added qualifier is the one being continued)
        keys = list(feature.qualifiers.keys())
        key = keys[-1]
        description = value.rstrip('"')
        old_description = feature.qualifiers[key]
        if key == "evidence" or old_description.endswith("-"):
            description = "%s%s" % (old_description, description)
        else:
            description = "%s %s" % (old_description, description)
        if feature.type == "VAR_SEQ":  # see VARSPLIC above
            description = _strip_var_seq_spaces(description)
        feature.qualifiers[key] = description
def write_insdc(genome, features, genbank_output_path, embl_output_path):
    """Write the annotated genome as GenBank and EMBL flat files.

    For every contig in genome['contigs'] a SeqRecord is built that holds a
    'source' feature followed by the contig's features.  A feature with a
    non-empty 'locus' additionally gets a preceding 'gene' feature sharing
    its location.

    Arguments:
        genome: dictionary with at least 'contigs', 'taxon' and 'strain'
        features: list of feature dictionaries; each one is assigned to the
                  contig whose 'id' equals the feature's 'contig'
        genbank_output_path: path object (must provide .open()) of the
                             GenBank file to write
        embl_output_path: path object (must provide .open()) of the EMBL
                          file to write
    """
    log.debug('prepare: genbank=%s, embl=%s', genbank_output_path, embl_output_path)

    def count_features(candidates, *feature_types):
        """Count the features whose type is one of the given types."""
        return sum(1 for feat in candidates if feat['type'] in feature_types)

    def add_rfam_inference(feature, qualifiers):
        """Set the inference qualifier from the feature's last RFAM xref.

        Nothing is added when the feature has no RFAM xref, so an id from a
        previously handled feature can never leak into this one.
        """
        rfam_id = None
        for dbxref in feature['db_xrefs']:
            if dbxref.split(':')[0] == 'RFAM':
                rfam_id = dbxref.split(':')[1]
        if rfam_id is not None:
            qualifiers['inference'] = f'profile:Rfam:{rfam_id}'

    contig_list = []
    for contig in genome['contigs']:
        contig_features = [feat for feat in features if feat['contig'] == contig['id']]
        # NOTE(review): only some of the entries below end with a newline;
        # kept as-is because the intended layout is not visible from here.
        comment = (
            f"Annotated with Bakta (v{bakta.__version__}): https://github.com/oschwengers/bakta\n",
            f"Database (v{cfg.db_info['major']}.{cfg.db_info['minor']}): https://doi.org/10.5281/zenodo.4247252\n",
            '\n',
            "##Genome Annotation Summary:##\n",
            f"{'Annotation Date':<30} :: {datetime.now().strftime('%m/%d/%Y, %H:%M:%S')}\n",
            f"{'Annotation Pipeline':<30} :: Bakta\n",
            f"{'Annotation Software version':<30} :: v{bakta.__version__}\n",
            f"{'Annotation Database version':<30} :: v{cfg.db_info['major']}.{cfg.db_info['minor']}\n",
            f"{'CDSs':<30} :: {count_features(contig_features, bc.FEATURE_CDS, bc.FEATURE_SORF):5,}\n",
            f"{'tRNAs':<30} :: {count_features(contig_features, bc.FEATURE_T_RNA):5,}\n",
            f"{'tmRNAs':<30} :: {count_features(contig_features, bc.FEATURE_TM_RNA):5,}\n",
            f"{'rRNAs':<30} :: {count_features(contig_features, bc.FEATURE_R_RNA):5,}\n",
            f"{'ncRNAs':<30} :: {count_features(contig_features, bc.FEATURE_NC_RNA):5,}\n",
            f"{'regulatory ncRNAs':<30} :: {count_features(contig_features, bc.FEATURE_NC_RNA_REGION):5,}\n",
            f"{'CRISPR Arrays':<30} :: {count_features(contig_features, bc.FEATURE_CRISPR):5,}",
            f"{'oriCs/oriVs':<30} :: {count_features(contig_features, bc.FEATURE_ORIC, bc.FEATURE_ORIV):5,}",
            f"{'oriTs':<30} :: {count_features(contig_features, bc.FEATURE_ORIT):5,}",
            f"{'gaps':<30} :: {count_features(contig_features, bc.FEATURE_GAP):5,}",
        )
        contig_annotations = {
            'molecule_type': 'DNA',
            'source': genome['taxon'],
            'date': date.today().strftime('%d-%b-%Y').upper(),
            'topology': contig['topology'],
            'data_file_division': 'HGT' if contig['type'] == bc.REPLICON_CONTIG else 'BCT',
            'comment': comment
            # TODO: taxonomy
        }
        source_qualifiers = {
            'mol_type': 'genomic DNA'
            # 'molecule_type': 'DNA'  # might be necessary in BioPython > 1.78 along with removal of Seq(..., generic_dna)
        }

        description = ''
        if genome['taxon']:
            contig_annotations['organism'] = genome['taxon']
            source_qualifiers['organism'] = genome['taxon']
            description = genome['taxon']
        if genome['strain']:
            source_qualifiers['strain'] = genome['strain']

        if contig['type'] == bc.REPLICON_PLASMID:
            # one fallback for both the qualifier and the description, so a
            # missing or empty name never shows up as "plasmid None"
            plasmid_name = contig.get('name', None) or 'unnamed'
            source_qualifiers['plasmid'] = plasmid_name
            description = f"{description} plasmid {plasmid_name}"
            description += ', complete sequence' if contig['complete'] else ', whole genome shotgun sequence'
        elif contig['type'] == bc.REPLICON_CHROMOSOME:
            source_qualifiers['chromosome'] = contig.get('name', None) or contig['id']
            if contig['complete']:
                description = f'{description} chromosome, complete genome'
            else:
                description = f"{description} chromosome {contig['id']}, whole genome shotgun sequence"
        else:
            description += f" {contig['id']}, whole genome shotgun sequence"

        if description.startswith(' '):  # discard potential leading whitespace
            description = description[1:]

        contig_rec = SeqIO.SeqRecord(id=contig['id'], name=contig['id'], description=description, annotations=contig_annotations, seq=Seq(contig['sequence']))

        source = SeqFeature(FeatureLocation(0, contig['length'], strand=+1), type='source', qualifiers=source_qualifiers)
        seq_feature_list = [source]

        for feature in contig_features:
            insdc_feature_type = None
            qualifiers = {}
            if 'db_xrefs' in feature:
                qualifiers['db_xref'] = feature['db_xrefs']
            if 'product' in feature:
                qualifiers['product'] = feature['product']
            if 'locus' in feature:
                qualifiers['locus_tag'] = feature['locus']

            if feature['type'] == bc.FEATURE_GAP:
                insdc_feature_type = bc.INSDC_FEATURE_GAP
                qualifiers['estimated_length'] = feature['length']
            elif feature['type'] == bc.FEATURE_ORIC or feature['type'] == bc.FEATURE_ORIV:
                # TODO: Add fuzzy positions for oriC/oriV
                insdc_feature_type = bc.INSDC_FEATURE_ORIGIN_REPLICATION
                qualifiers['inference'] = 'similar to DNA sequence'
            elif feature['type'] == bc.FEATURE_ORIT:
                # TODO: Add fuzzy positions for oriT
                insdc_feature_type = bc.INSDC_FEATURE_ORIGIN_TRANSFER
                qualifiers['inference'] = 'similar to DNA sequence'
            elif feature['type'] == bc.FEATURE_CDS or feature['type'] == bc.FEATURE_SORF:
                qualifiers['translation'] = feature['sequence']
                qualifiers['codon_start'] = 1
                qualifiers['transl_table'] = cfg.translation_table
                insdc_feature_type = bc.INSDC_FEATURE_CDS
                inference = []
                inference.append('ab initio prediction:Prodigal:2.6' if feature['type'] == bc.FEATURE_CDS else 'ab initio prediction:Bakta')
                if 'ups' in feature:
                    if 'ncbi_nrp_id' in feature['ups']:
                        qualifiers['protein_id'] = feature['ups']['ncbi_nrp_id']
                if 'ips' in feature:
                    if 'uniref100_id' in feature['ips']:
                        ips_subject_id = feature['ips']['uniref100_id']
                        inference.append(f'similar to AA sequence:UniProtKB:{ips_subject_id}')
                if 'psc' in feature:
                    if 'uniref90_id' in feature['psc']:
                        psc_subject_id = feature['psc']['uniref90_id']
                        inference.append(f'similar to AA sequence:UniProtKB:{psc_subject_id}')
                qualifiers['inference'] = inference
            elif feature['type'] == bc.FEATURE_T_RNA:
                # TODO: Position anticodon
                if 'amino_acid' in feature and 'anti_codon' in feature:
                    if 'anti_codon_pos' in feature:
                        anti_codon_pos = feature['anti_codon_pos']
                        qualifiers['anticodon'] = f"(pos:{anti_codon_pos[0]}..{anti_codon_pos[1]},aa:{feature['amino_acid']},seq:{feature['anti_codon']})"
                    else:
                        qualifiers['note'] = f"tRNA-{feature['amino_acid']} ({feature['anti_codon']})"
                qualifiers['inference'] = 'profile:tRNAscan:2.0'
                insdc_feature_type = bc.INSDC_FEATURE_T_RNA
                if 'pseudo' in feature:
                    qualifiers['pseudo'] = None
            elif feature['type'] == bc.FEATURE_TM_RNA:
                qualifiers['inference'] = 'profile:aragorn:1.2'
                insdc_feature_type = bc.INSDC_FEATURE_TM_RNA
            elif feature['type'] == bc.FEATURE_R_RNA:
                add_rfam_inference(feature, qualifiers)
                insdc_feature_type = bc.INSDC_FEATURE_R_RNA
            elif feature['type'] == bc.FEATURE_NC_RNA:
                # TODO: ncRNA_class
                add_rfam_inference(feature, qualifiers)
                qualifiers[bc.INSDC_FEATURE_NC_RNA_CLASS] = select_ncrna_class(feature)
                insdc_feature_type = bc.INSDC_FEATURE_NC_RNA
            elif feature['type'] == bc.FEATURE_NC_RNA_REGION:
                add_rfam_inference(feature, qualifiers)
                qualifiers[bc.INSDC_FEATURE_REGULATORY_CLASS] = select_regulatory_class(feature)
                insdc_feature_type = bc.INSDC_FEATURE_REGULATORY
                # the product is reported as a note for this feature type
                qualifiers['note'] = feature['product']
                qualifiers.pop('product', None)
            elif feature['type'] == bc.FEATURE_CRISPR:
                qualifiers[bc.INSDC_FEATURE_REPEAT_FAMILY] = 'CRISPR'
                qualifiers[bc.INSDC_FEATURE_REPEAT_TYPE] = 'direct'
                qualifiers[bc.INSDC_FEATURE_REPEAT_UNIT_SEQ] = feature['repeat_consensus']
                qualifiers['inference'] = 'COORDINATES:alignment:pilercr:1.02'
                insdc_feature_type = bc.INSDC_FEATURE_REPEAT_REGION
                # the product is reported as a note for this feature type
                qualifiers['note'] = feature['product']
                qualifiers.pop('product', None)

            strand = None
            if feature['strand'] == bc.STRAND_FORWARD:
                strand = 1
            elif feature['strand'] == bc.STRAND_REVERSE:
                strand = -1
            elif feature['strand'] == bc.STRAND_UNKNOWN:
                strand = 0

            # feature coordinates are 1-based, FeatureLocation is 0-based
            start = feature['start'] - 1
            stop = feature['stop']
            if 'edge' in feature:
                # feature spans the sequence edge: one part up to the end
                # of the contig, one part from its beginning
                fl_1 = FeatureLocation(start, contig['length'], strand=strand)
                fl_2 = FeatureLocation(0, stop, strand=strand)
                feature_location = CompoundLocation([fl_1, fl_2])
            else:
                if 'truncated' in feature:
                    if feature['truncated'] == bc.FEATURE_END_5_PRIME:
                        if feature['strand'] == bc.STRAND_FORWARD:
                            start = BeforePosition(start)
                        else:
                            stop = AfterPosition(stop)
                    elif feature['truncated'] == bc.FEATURE_END_3_PRIME:
                        if feature['strand'] == bc.STRAND_FORWARD:
                            stop = AfterPosition(stop)
                        else:
                            start = BeforePosition(start)
                    else:
                        # any other value: both ends are marked as fuzzy
                        start = BeforePosition(start)
                        stop = AfterPosition(stop)
                feature_location = FeatureLocation(start, stop, strand=strand)

            if feature.get('locus', None):
                gene_qualifier = {'locus_tag': feature['locus']}
                if feature.get('gene', None):
                    qualifiers['gene'] = feature['gene']
                    gene_qualifier['gene'] = feature['gene']
                gen_seqfeat = SeqFeature(feature_location, type='gene', qualifiers=gene_qualifier)
                seq_feature_list.append(gen_seqfeat)
            feat_seqfeat = SeqFeature(feature_location, type=insdc_feature_type, qualifiers=qualifiers)
            seq_feature_list.append(feat_seqfeat)
        contig_rec.features = seq_feature_list
        contig_list.append(contig_rec)

    with genbank_output_path.open('wt', encoding='utf-8') as fh:
        log.info('write GenBank: path=%s', genbank_output_path)
        SeqIO.write(contig_list, fh, format='genbank')

    with embl_output_path.open('wt', encoding='utf-8') as fh:
        log.info('write EMBL: path=%s', embl_output_path)
        SeqIO.write(contig_list, fh, format='embl')
class SeqFeatureTests(unittest.TestCase):
    # Fake protein record shared by every test; the features below are
    # attached to it once, at class creation time.
    sprot: SeqRecord = SeqRecord(
        SeqEM2.protein('MYNAMEISFREDHEREIAMWHEREARETHEYALLTHISISEXCELLENT'),
        id='X',
        name='DummyProt',
    )
    # The trailing comments name the residues each feature is meant to mark.
    sprot.features = [
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(0, 11),
                      type='domain', id='d1'),  # MYNAMEISFRED
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(8, 18),
                      type='domain', id='d2'),  # FREDHEREIAM
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(19, 30),
                      type='domain', id='d3'),  # WHEREARETHEY
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(6, 23),
                      type='domain', id='d4'),  # ISFREDHEREIAMWHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(34, AfterPosition(39)),
                      id='d5'),  # THISIS
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(2), 5),
                      type='domain', id='d6'),  # MYNAME
        SeqFeatureEM2(parent=sprot, location=FeatureLocation(19, 23),
                      type='domain', id='d7'),  # WHERE
        SeqFeatureEM2(parent=sprot,
                      location=FeatureLocation(BeforePosition(30), 37),
                      type='domain', id='d8'),  # YALLTHI
    ]

    @classmethod
    def test_parent(cls):
        record = cls.sprot
        feature_ids = [feature.id for feature in record.features]
        assert feature_ids == ['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']
        # every feature points back at the record it was attached to
        parent = record.features[1].parent
        assert parent.id == record.id
        assert record.features[1].parent.name == record.name
        assert record.features[1].parent.seq._data == record.seq._data

    @classmethod
    def test_lies_within(cls):
        d2 = cls.sprot.features[1]
        assert d2.lies_within(5, 25)
        assert not d2.lies_within(10, 25)
        assert not d2.lies_within(19, 25)

    @classmethod
    def test_lies_within_fuzzy(cls):
        # features with fuzzy boundaries are expected to emit a warning
        features = cls.sprot.features
        with pytest.warns(UserWarning):
            features[4].lies_within(30, 42)
            features[5].lies_within(0, 10)

    @classmethod
    def test_overlaps(cls):
        d3 = cls.sprot.features[2]
        assert d3.overlaps(20, 25)
        assert d3.overlaps(20, 40)
        assert d3.overlaps(20)
        assert not d3.overlaps(35)
        assert not d3.overlaps(2, 5)

    @classmethod
    def test_overlaps_fuzzy(cls):
        features = cls.sprot.features
        with pytest.warns(UserWarning):
            features[4].overlaps(35)
            features[5].overlaps(3)

    @classmethod
    def test_covers(cls):
        d4 = cls.sprot.features[3]
        assert d4.covers(15, 20)
        assert not d4.covers(4, 20)

    @classmethod
    def test_covers_fuzzy(cls):
        features = cls.sprot.features
        with pytest.warns(UserWarning):
            features[4].covers(35, 38)
            features[5].covers(3, 4)

    @classmethod
    def test_intersect(cls):
        features = cls.sprot.features
        fuzzy_overlap = features[4].intersect(features[7])
        assert fuzzy_overlap.location == FeatureLocation(34, 37)
        nested_overlap = features[2].intersect(features[3])
        assert nested_overlap.location == features[6].location
        contained = features[1].intersect(features[3])
        assert contained.location == FeatureLocation(8, 18)

    @classmethod
    def test_intersect_errors(cls):
        # the other feature has no parent record here
        orphan = None
        with pytest.raises(ValueError, match=r'Undetermined .*'):
            orphan = SeqFeatureEM2(location=FeatureLocation(30, 37))
            cls.sprot.features[0].intersect(orphan)

    @classmethod
    def test_intersect_fuzzy(cls):
        features = cls.sprot.features
        with pytest.warns(UserWarning):
            features[5].intersect(features[0])

    @classmethod
    def test_move(cls):
        moved = cls.sprot.features[0].move(5)
        assert moved.location == FeatureLocation(5, 16)
def scan_orfs(seq: str, direction: int, offset: int = 0) -> List[FeatureLocation]:
    """ Scan for open reading frames on a given sequence.

        Complete ORFs (start codon followed by an in-frame stop codon) are
        skipped when the start codon and the stop codon are 60 bases or fewer
        apart. That limit is not applied to the open-ended ORFs, i.e. the
        first stop codon of a frame that has no preceding start, and a start
        codon with no stop codon before the end of the sequence.

        The sequence is scanned as given; it is never reverse-complemented
        here. For any direction other than 1, coordinates of complete ORFs are
        mirrored with ``seq_len + offset - position`` so they refer to the
        forward strand.

        Arguments:
            seq: the sequence to examine
            direction: the search direction to use (all ORFs will use this as the strand)
            offset: an offset to add to any location discovered

        Returns:
            a list of FeatureLocations for each ORF, ordered by ascending position
    """
    seq = seq.upper()
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAA', 'TAG', 'TGA')
    matches = []
    # cache the sequence length
    seq_len = len(seq)
    for frame in (0, 1, 2):
        i = frame
        # -1 means "no stop codon seen yet in this frame". Index 0 is a valid
        # stop position in frame 0, so it cannot double as the sentinel.
        last_stop = -1
        while i < seq_len - 2:
            codon = seq[i:i+3]
            if codon in stop_codons and last_stop < 0:
                # special case for unstarted stops: the ORF is open at its 5' end
                last_stop = i
                end = i + 2 + 1
                if direction == -1:
                    # Mirror the un-offset coordinates and add the offset once,
                    # the same way complete reverse ORFs are converted below.
                    new_orf = FeatureLocation(seq_len + offset - end,
                                              AfterPosition(seq_len + offset),
                                              strand=direction)
                else:
                    new_orf = FeatureLocation(BeforePosition(offset),
                                              offset + end,
                                              strand=direction)
                matches.append(new_orf)
            if codon not in start_codons:
                i += 3
                continue
            # Look for the next stop codon in this frame
            for j in range(i, seq_len - 2, 3):
                if seq[j:j+3] in stop_codons:
                    last_stop = j
                    # Skip ORFs whose start and stop codons are at most 60 bases apart
                    if j - i <= 60:
                        break  # since no ORFs will be bigger before the stop
                    start = i
                    end = j + 2 + 1
                    if direction == 1:
                        new_orf = FeatureLocation(offset + start,
                                                  offset + end,
                                                  strand=direction)
                    else:
                        # reversed, so convert back to the forward positions
                        new_orf = FeatureLocation(seq_len + offset - end,
                                                  seq_len + offset - start,
                                                  strand=direction)
                    matches.append(new_orf)
                    # This was a good hit, the last_stop cache is already updated.
                    break

            # if we found a matching stop, carry on looking for starts after this stop
            if last_stop > i:
                i = last_stop
                continue

            # Save orfs ending at the end of the sequence without stop codon
            if direction == 1:
                new_orf = FeatureLocation(i + offset,
                                          AfterPosition(seq_len + offset),
                                          strand=direction)
            else:
                # reversed, so convert back to the forward positions
                new_orf = FeatureLocation(BeforePosition(offset),
                                          offset + seq_len - i,
                                          strand=direction)
            matches.append(new_orf)
            # since there are no stop codons, just stop here
            break
    return sorted(matches, key=lambda x: min(x.start, x.end))
def gene2features(r, gene, gene2position, gene2product, start, end, gcode,
                  partialyes, verbose):
    """Append gene, mRNA and CDS features for ``gene`` to record ``r``.

    The mRNA feature (and, for ids ending in ``.t1``, a gene feature) is
    always appended. The CDS feature is appended only if its translation has
    no internal stop codon and, when the CDS is partial (translation does not
    start with ``M`` and/or does not end with ``*``), only if ``partialyes``
    is true.

    Arguments:
        r: record providing ``seq`` and a ``features`` list; modified in place
        gene: transcript id; key into ``gene2position`` and id of the features
        gene2position: mapping of gene to
            ``(contig, CDSs, gffstrand, function, frames)``; ``CDSs`` holds
            ``(start, end)`` pairs and is not modified
        gene2product: mapping of gene id to product name; genes missing from
            it get ``"hypothetical protein"``
        start: 1-based start of the gene (converted to 0-based here)
        end: end coordinate of the gene
        gcode: translation table handed to ``translate``
        partialyes: keep partial CDS features when true
        verbose: write warnings about skipped/partial genes to stderr

    Returns:
        the record ``r``

    Raises:
        IndexError: if the CDS translates to an empty string
    """
    contig, CDSs, gffstrand, function, frames = gene2position[gene]
    if gffstrand in ('1', '+'):
        strand = +1
    else:
        strand = -1
        # Reverse a copy: reversing in place would flip the list stored in
        # gene2position and give a different order on a repeated call.
        CDSs = list(reversed(CDSs))
    # NOTE: extending the terminal CDS by 3 bases to include the stop codon
    # was disabled in the original code and is intentionally not done here.
    cdsloc, mrnaloc = get_locations(CDSs, start, end, strand)
    #add gene
    geneid = gene
    #get product
    product = gene2product.get(geneid, "hypothetical protein") \
        if hasattr(gene2product, "get") else (
            gene2product[geneid] if geneid in gene2product
            else "hypothetical protein")
    if gene.endswith('.t1'):
        # The strand belongs to the location; SeqFeature's own ``strand``
        # argument is not accepted by recent Biopython releases.
        sf = SeqFeature(FeatureLocation(BeforePosition(start - 1),
                                        AfterPosition(end),
                                        strand=strand),
                        type='gene',
                        id=geneid)
        sf.qualifiers = {
            "locus_tag": geneid,
            "gene": geneid,
            "product": product
        }
        r.features.append(sf)
    #get mRNA sf
    sf = SeqFeature(mrnaloc, type='mRNA', id=gene)
    sf.qualifiers = {
        "locus_tag": geneid,
        "gene": geneid,
        "product": product
    }
    r.features.append(sf)
    #get CDS sf
    sf = SeqFeature(cdsloc, type='CDS', id=gene)
    #get translation
    seq = sf.extract(r.seq)
    aa = str(seq.translate(table=gcode))
    #solve non-triplets issue: trim the incomplete codon from the 3' end
    remainder = len(seq) % 3
    if remainder:
        if strand == 1:
            end -= remainder
        else:
            start += remainder
    ##check for partial sequence - no M as first or no * as last aa
    lacks_start = aa[0] != "M"
    lacks_stop = aa[-1] != "*"
    partial = False
    # NOTE(review): a partial CDS is replaced by one simple location, so a
    # multi-exon CDS loses its parts here - confirm this is intended.
    #both ends partial
    if lacks_start and lacks_stop:
        partial = True
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      AfterPosition(end),
                                      strand=strand)
    #left end partial
    elif (lacks_start and strand == 1) or (lacks_stop and strand == -1):
        partial = True
        sf.location = FeatureLocation(BeforePosition(start - 1),
                                      end,
                                      strand=strand)
    #right end partial
    elif (lacks_stop and strand == 1) or (lacks_start and strand == -1):
        partial = True
        sf.location = FeatureLocation(start - 1,
                                      AfterPosition(end),
                                      strand=strand)
    #strip stop codon
    aa = aa.strip("*")
    #skip genes with internal stop codons
    if "*" in aa:
        if verbose:
            sys.stderr.write("[Warning] Stop codon(s) in: %s. Skipped!\n" %
                             gene)
        return r
    sf.qualifiers = {
        'transl_table': gcode,
        "locus_tag": geneid,
        "gene": geneid,
        "product": product,
        "translation": aa
    }
    if function:
        sf.qualifiers['note'] = function
    #inform about partial entries
    if partial:
        #skip if partial entries are not allowed
        if not partialyes:
            return r
        if aa[0] != "M":
            sf.qualifiers['codon_start'] = 1
        sf.qualifiers['product'] += ", partial cds"
        if verbose:
            sys.stderr.write("[Warning] Partial sequence: %s\n" % (gene, ))
    #add to features
    r.features.append(sf)
    return r
def test_start_without_end(self):
    # A start codon at index 3 with no following stop codon should give a
    # single ORF running to the end of the 9-base sequence, open at its end.
    self.run_both_dirs(
        [FeatureLocation(ExactPosition(3), AfterPosition(9), strand=1)],
        "NNNATGNNN",
    )
pep_seq = str(prot_seq_dict[pep_id].seq)

fuzzy_start = fuzzy_end = False

# A protein that doesn't begin with methionine is probably partial:
# mark the 5' boundary of the CDS as fuzzy.
if pep_seq[0] != 'M':
    fuzzy_start = True
    if sf.strand == 1:
        first_part = cds_locs[0]
        cds_locs[0] = FeatureLocation(BeforePosition(first_part.start),
                                      first_part.end, sf.strand)
    else:
        # on any other strand the 5' end is the end of the last part
        last_part = cds_locs[-1]
        cds_locs[-1] = FeatureLocation(last_part.start,
                                       AfterPosition(last_part.end),
                                       sf.strand)
    cds_quals['codon_start'] = 1

# A protein that doesn't end with a stop symbol is probably partial too,
# unless the caller declared that stop codons are absent:
# mark the 3' boundary of the CDS as fuzzy.
if not args.no_stop_codon and pep_seq[-1] not in ('.', '*'):
    fuzzy_end = True
    if sf.strand == 1:
        last_part = cds_locs[-1]
        cds_locs[-1] = FeatureLocation(last_part.start,
                                       AfterPosition(last_part.end),
                                       sf.strand)
    else:
        first_part = cds_locs[0]
        cds_locs[0] = FeatureLocation(BeforePosition(first_part.start),
                                      first_part.end, sf.strand)
    cds_quals['codon_start'] = 1