def construct_location(raw_start: str, raw_end: str, raw_strand: str,
                       attributes: Dict[str, str], strict: bool = False) -> FeatureLocation:
    """ Converts the raw sections of a GFF line into a FeatureLocation.
        Some attribute keys can modify the location's values.

        Arguments:
            raw_start: the start coordinate as text (1-based, converted to 0-based here)
            raw_end: the end coordinate as text
            raw_strand: the strand field, handed to interpret_strand()
            attributes: the attributes of the GFF line; when "partial" is "true"
                        and a "start_range" or "end_range" key is present, the
                        keys "partial", "start_range" and "end_range" are
                        removed from this dictionary
            strict: forwarded to interpret_strand()

        Returns:
            a FeatureLocation built from the (possibly ambiguous) positions
            and the interpreted strand

        Raises:
            GFFParseError: if either coordinate is not an integer, or if either
                           resulting position is negative
    """
    try:
        start = ExactPosition(int(raw_start) - 1)  # 0-indexed as FeatureLocation expects
        end = ExactPosition(int(raw_end))
    except ValueError as err:
        # keep the original ValueError attached as the explicit cause
        raise GFFParseError("Invalid location values: %s" % str(err)) from err

    if start < 0 or end < 0:
        raise GFFParseError("Invalid location values: %s, %s" % (raw_start, raw_end))

    strand = interpret_strand(raw_strand, strict=strict)

    # handle ambiguous positions as noted in attributes
    has_range = "start_range" in attributes or "end_range" in attributes
    if attributes.get("partial") == "true" and has_range:
        attributes.pop("partial")
        # a missing range falls back to "start,end", which never marks an open end
        start_range = attributes.pop("start_range", "%s,%s" % (start, end))
        end_range = attributes.pop("end_range", "%s,%s" % (start, end))
        if start_range.startswith("."):
            start = BeforePosition(int(start))
        if end_range.endswith("."):
            end = AfterPosition(int(end))

    return FeatureLocation(start, end, strand)
def new_compound_location(
    indices: List[Union[Tuple[int, int], Tuple[int, int, int]]], strand: int
) -> CompoundLocation:
    """Build a CompoundLocation from a list of index tuples.

    :param indices: tuples of ``(start, end)`` or ``(start, end, strand)``;
        two-element tuples use the ``strand`` argument as their strand
    :param strand: default strand applied to two-element tuples
    :return: a CompoundLocation with one exact FeatureLocation per tuple
    :raises ValueError: if an element is not a tuple, does not have 2 or 3
        members, or contains a non-integer member
    """
    locations = []
    for index in indices:
        if not isinstance(index, tuple):
            # report the offending element's type, not the container's
            raise ValueError(
                "Expects a tuple of integers size 2 or 3, not a {}".format(
                    index.__class__
                )
            )
        if len(index) not in (2, 3):
            raise ValueError("Expects a tuple of integers of size 2 or 3")
        if len(index) == 2:
            i, j = index
            s = strand
        else:
            i, j, s = index
        if not all(isinstance(value, int) for value in (i, j, s)):
            raise ValueError(
                "Expects a tuple of integers of size 2 or 3. Found {}".format(index)
            )
        locations.append(FeatureLocation(ExactPosition(i), ExactPosition(j), strand=s))
    return CompoundLocation(locations)
def _get_translation(feature, seq):
    """Translate the nucleotide sequence of ``feature`` and return it as a str.

    The first residue is replaced by "M" when the first codon is one of the
    alternative start codons listed below. When a strict CDS translation fails
    and the fallback translation is shorter than the sequence,
    ``feature.location`` is rewritten in place and warnings are logged.

    NOTE(review): ``tool`` and ``self`` are not parameters of this function;
    they are presumably provided by an enclosing scope that is not visible
    here -- confirm.
    NOTE(review): the nesting below was reconstructed from a flattened source
    line; verify it against the original file.
    """
    nucseq = feature.location.extract(seq)
    # codon_start is 1-based; offset is the number of leading bases to skip
    offset = feature.qualifiers.get("codon_start", [1])[0] - 1
    # zero, or minus the number of trailing bases that do not fill a codon
    right_offset = -1 * ((len(nucseq) - offset) % 3)
    # a table set on the tool takes precedence over the feature's qualifier
    if hasattr(tool, "transl_table"):
        transl_table = tool.transl_table
    else:
        transl_table = feature.qualifiers.get("transl_table", [11])[0]
    if transl_table == 4:
        start_codons = [
            "TTA", "TTG", "CTG", "ATT", "ATC", "ATA", "GTG"
        ]  # and ATG for transl_table 4
    else:
        start_codons = ["TTG", "CTG", "ATT", "ATC", "ATA",
                        "GTG"]  # and ATG for transl_table 11
    if right_offset == 0:
        # length after the offset is a multiple of three
        if isinstance(feature.location.start, ExactPosition) and isinstance(
                feature.location.end, ExactPosition):
            try:
                # strict translation as a complete CDS
                translation = nucseq[offset:].translate(
                    table=transl_table, cds=True)
            except TranslationError:
                # fall back to translating up to the first stop codon
                translation = nucseq[offset:].translate(
                    table=transl_table, to_stop=True)
                if len(translation) * 3 != len(nucseq[offset:]):
                    self.logger.warning(
                        "Translation error in {}. In-frame stop codon exists. Translation was terminated at the first in-frame stop codon."
                        .format(feature.id))
                    before = str(feature.location)
                    # rebuild the location from the translated length
                    if feature.location.strand == 1:
                        start = feature.location.start
                        end = ExactPosition(
                            start + offset + len(translation) * 3 +
                            3)  # Trailing +3 for stop-codon
                        feature.location = FeatureLocation(
                            start, end, 1)
                    else:
                        end = feature.location.end
                        start = ExactPosition(
                            end - offset - len(translation) * 3 -
                            3)  # Trailing -3 for stop-codon
                        feature.location = FeatureLocation(
                            start, end, -1)
                    after = str(feature.location)
                    self.logger.warning(
                        "CDS[{}] was fixed from {} to {}.".format(
                            feature.id, before, after))
        else:
            # at least one end is not exact: translate up to the first stop
            translation = nucseq[offset:].translate(table=transl_table,
                                                    to_stop=True)
    else:
        # drop the trailing incomplete codon before translating
        translation = nucseq[offset:right_offset].translate(
            table=transl_table)  # , stop_symbol="")
    translation = str(translation)
    first_codon = str(nucseq[offset:offset + 3]).upper()
    if first_codon in start_codons:
        translation = "M" + translation[1:]
    return translation
def setUp(self):
    """Create ``self.record``: a 26-letter record carrying four features."""
    source_feature = SeqFeature(
        FeatureLocation(0, 26),
        type="source",
        qualifiers={"mol_type": ["fake protein"]},
    )
    exact_feature = SeqFeature(FeatureLocation(0, ExactPosition(10)))

    # start lies somewhere within 12..15, end is before 22
    within_start = WithinPosition(12, left=12, right=15)
    within_feature = SeqFeature(FeatureLocation(within_start, BeforePosition(22)))

    # start is after 16, end is one of two alternatives
    after_start = AfterPosition(16)
    one_of_end = OneOfPosition(26, [ExactPosition(25), AfterPosition(26)])
    one_of_feature = SeqFeature(FeatureLocation(after_start, one_of_end))

    letters = Seq("ABCDEFGHIJKLMNOPQRSTUVWZYX", generic_protein)
    self.record = SeqRecord(
        letters,
        id="TestID",
        name="TestName",
        description="TestDescr",
        dbxrefs=["TestXRef"],
        annotations={"k": "v"},
        letter_annotations={"fake": "X" * 26},
        features=[source_feature, exact_feature, within_feature, one_of_feature],
    )
def add_point_feature(self, resnum, feat_type=None, feat_id=None, qualifiers=None):
    """Append a single-residue feature to ``self.features``.

    Args:
        resnum (int): Protein sequence residue number (1-based; stored as the
            0-based span ``resnum - 1`` to ``resnum``)
        feat_type (str, optional): Description of the feature type
            (ie. 'catalytic residue'); a generic description is used when empty
        feat_id (str, optional): ID of the feature (ie. 'TM1')
        qualifiers (optional): Passed through unchanged to the new feature

    Raises:
        ValueError: If a feature file is associated with this sequence

    """
    if self.feature_file:
        raise ValueError('Feature file associated with sequence, please remove file association to append '
                         'additional features.')

    description = feat_type if feat_type else 'Manually added protein sequence single residue feature'
    residue_span = FeatureLocation(ExactPosition(resnum - 1), ExactPosition(resnum))
    self.features.append(
        SeqFeature(location=residue_span, type=description, id=feat_id, qualifiers=qualifiers)
    )
def create_genbank_file(self):
    """
    Create a GenBank file containing the record augmented with putative operons.

    Every operon in ``self.operons`` is appended to ``self.genbank_record`` as
    an ``mRNA`` feature, the features are sorted by start position, and the
    record is written in GenBank format to ``self.analysis.xgenbankfile_path``.

    For more documentation on how to create new features, visit

      - http://biopython.org/DIST/docs/api/Bio.SeqRecord.SeqRecord-class.html#__getitem__
      - http://biopython.org/DIST/docs/api/Bio.SeqFeature.SeqFeature-class.html
      - http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
    """
    log.info("augmenting genbank file %s with putative operons" %
             self.analysis.genbankfile_name)
    for i, o in enumerate(self.operons):
        location = FeatureLocation(ExactPosition(o.begin),
                                   ExactPosition(o.end))
        self.genbank_record.features.append(
            SeqFeature(location,
                       type='mRNA',
                       strand=o.strand,
                       qualifiers=dict(note='putative, confidence %d%%' % o.confidence,
                                       operon='rnas-%d' % i)))
    self.genbank_record.features.sort(
        key=lambda f: f.location.start.position)
    # the context manager guarantees the handle is flushed and closed
    with open(self.analysis.xgenbankfile_path, "w") as xgb_file:
        SeqIO.write(self.genbank_record, xgb_file, "genbank")
def test_pickle(self):
    """Test pickle behaviour of position instances."""
    import pickle

    def round_trip(position):
        # serialise and immediately restore a position
        return pickle.loads(pickle.dumps(position))

    # setup
    within_pos = WithinPosition(10, left=10, right=13)
    between_pos = BetweenPosition(24, left=20, right=24)
    oneof_pos = OneOfPosition(1888, [ExactPosition(1888), ExactPosition(1901)])

    # test __getnewargs__
    self.assertEqual(within_pos.__getnewargs__(), (10, 10, 13))
    self.assertEqual(between_pos.__getnewargs__(), (24, 20, 24))
    self.assertEqual(
        oneof_pos.__getnewargs__(),
        (1888, [ExactPosition(1888), ExactPosition(1901)]),
    )

    # test pickle behaviour
    within_copy = round_trip(within_pos)
    between_copy = round_trip(between_pos)
    oneof_copy = round_trip(oneof_pos)
    restored_pairs = (
        (within_pos, within_copy),
        (between_pos, between_copy),
        (oneof_pos, oneof_copy),
    )
    for original, restored in restored_pairs:
        self.assertEqual(original, restored)
    # the private bounds must survive the round trip as well
    for original, restored in restored_pairs[:2]:
        self.assertEqual(original._left, restored._left)
        self.assertEqual(original._right, restored._right)
    self.assertEqual(oneof_pos.position_choices, oneof_copy.position_choices)
def test_get_mite_gene_location_intron_reverse_lots_of_introns(self):
    """A MITE between two reverse-strand exons is reported as 'E2' to 'I2'."""
    # Setup
    mite_span = FeatureLocation(ExactPosition(511777 - 1), ExactPosition(512242), strand=1)
    mite = SeqFeature(mite_span, type='mRNA', id='AT1G02470')
    # (name, 1-based start, end) of every sub feature on the '-' strand
    layout = (
        ('T1', 510853, 511011),
        ('E7', 510853, 511086),
        ('E6', 511170, 511217),
        ('E5', 511310, 511358),
        ('E4', 511474, 511526),
        ('E3', 511621, 511716),
        ('E2', 512243, 512342),
        ('E1', 512428, 512707),
        ('F1', 512670, 512707),
    )
    sub_features = [
        SubFeature(strand='-', start=first - 1, end=last, name=label)
        for label, first, last in layout
    ]
    expected_start, expected_end = 'E2', 'I2'

    # Exercise
    start, end = get_mite_gene_location(mite, sub_features)

    # Verify
    self.assertEqual(start, expected_start)
    self.assertEqual(end, expected_end)
def test_annotate_sub_feature_reverse_correct_annotation_and_counts(self):
    """Reverse-strand sub features get the expected exon count and names."""
    # Setup
    # (type, 1-based start, end) of every sub feature on the -1 strand
    layout = (
        ('three_prime_UTR', 9761599, 9761802),
        ('exon', 9761599, 9762165),
        ('exon', 9763450, 9764167),
        ('five_prime_UTR', 9764158, 9764167),
    )
    sub_features = [
        SeqFeature(
            FeatureLocation(ExactPosition(first - 1), ExactPosition(last), strand=-1),
            type=kind,
        )
        for kind, first, last in layout
    ]
    parent_span = FeatureLocation(ExactPosition(9762301 - 1), ExactPosition(9762350), strand=-1)
    feature = SeqFeature(parent_span, type='mRNA', id='AT1G28230',
                         sub_features=sub_features)
    expected_exon_counts = 2
    names = iter(['F1', 'E1', 'T1', 'E2'])  # possible sort diff if tie?

    # Exercise
    annotate_sub_feature_counts(feature)

    # Verify
    self.assertEqual(feature.exon_count, expected_exon_counts)
    for sub in feature.sub_features:
        wanted = next(names)
        self.assertEqual(sub.name, wanted)
def test_get_mite_gene_location_exon_forward(self):
    """A MITE spanning two forward-strand exons is reported as 'E3' to 'E4'."""
    # Setup
    mite_span = FeatureLocation(ExactPosition(14301135 - 1), ExactPosition(14301495), strand=1)
    mite = SeqFeature(mite_span, type='mRNA', id='AT1G38630')
    # (name, 1-based start, end) of every sub feature on the '+' strand
    layout = (
        ('F1', 14298853, 14299101),
        ('E1', 14298853, 14299175),
        ('E2', 14299460, 14299528),
        ('E3', 14301089, 14301157),
        ('E4', 14301443, 14301511),
        ('E5', 14301621, 14301689),
        ('E6', 14302679, 14302747),
        ('E7', 14302843, 14302939),
        ('T1', 14302892, 14302939),
    )
    sub_features = [
        SubFeature(strand='+', start=first - 1, end=last, name=label)
        for label, first, last in layout
    ]
    expected_start, expected_end = 'E3', 'E4'

    # Exercise
    start, end = get_mite_gene_location(mite, sub_features)

    # Verify
    self.assertEqual(start, expected_start)
    self.assertEqual(end, expected_end)
def _annotate_feature(
    length: int,
    name: str,
    i: int = None,
    j: int = None,
    cyclic: bool = False,
    feature_type: str = None,
):
    """Create a forward-strand feature covering ``i`` to ``j``.

    ``i`` defaults to 0 and ``j`` to ``length``. For a cyclic sequence whose
    span runs past ``length`` or whose start lies after its end, a compound
    feature made of ``(i, length)`` and ``(0, j)`` is returned; otherwise a
    simple feature with an exact location is returned.
    """
    i = 0 if i is None else i
    j = length if j is None else j

    crosses_origin = cyclic and (j > length or i > j)
    if not crosses_origin:
        span = FeatureLocation(ExactPosition(i), ExactPosition(j), strand=1)
        return new_feature(name=name, location=span, feature_type=feature_type)

    # fold an end beyond the sequence back onto its beginning
    if j > length:
        j = j - length
    return new_compound_feature(
        name=name,
        indices=[(i, length), (0, j)],
        strand=1,
        feature_type=feature_type,
    )
def test_lcs():
    """``SeqRecord.lcs`` yields the same feature for every supported query type."""
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord as BSeqRecord
    from pydna.dseq import Dseq
    from pydna.dseqrecord import Dseqrecord
    from pydna.seqrecord import SeqRecord
    from pydna.seqfeature import SeqFeature
    from Bio.SeqFeature import FeatureLocation, ExactPosition

    s = SeqRecord(Seq("GGATCC"))

    expected = SeqFeature()
    expected.__dict__ = {
        "location": FeatureLocation(ExactPosition(0), ExactPosition(6), strand=1),
        "type": "read",
        "id": "<unknown id>",
        "qualifiers": {
            "label": ["sequence"],
            "ApEinfo_fwdcolor": ["#DAFFCF"],
            "ApEinfo_revcolor": ["#DFFDFF"],
        },
    }

    def found(query):
        # attributes of the feature reported for ``query``
        return s.lcs(query, limit=4).__dict__

    assert found("GGATCC") == expected.__dict__
    assert found(Seq("GGATCC")) == expected.__dict__
    assert found(BSeqRecord(Seq("GGATCC"), name="sequence")) == expected.__dict__
    assert found(Dseq("GGATCC")) == expected.__dict__
    assert found(Dseqrecord(Dseq("GGATCC"), name="sequence")) == expected.__dict__
    assert found(Dseqrecord("GGATCC", name="sequence")) == expected.__dict__
def get_feature_location(self):
    """Return this object's span as a FeatureLocation on ``self.strand``.

    ``self.left`` is shifted by one because the coordinate is 0-based in the
    Biopython object. A partial left end becomes a BeforePosition and a
    partial right end an AfterPosition; other ends are exact.
    """
    start_type = BeforePosition if self.left_partial else ExactPosition
    start = start_type(self.left - 1)
    end_type = AfterPosition if self.right_partial else ExactPosition
    end = end_type(self.right)
    return FeatureLocation(start, end, strand=self.strand)
def test_all_combos(self):
    """Every start/stop codon pairing around 60 N's gives one 0..66 location."""
    expected = [FeatureLocation(ExactPosition(0), ExactPosition(66), strand=1)]
    filler = "N" * 60
    sequences = [
        start + filler + stop
        for start in ('ATG', 'GTG', 'TTG')
        for stop in ('TAA', 'TAG', 'TGA')
    ]
    for seq in sequences:
        self.run_both_dirs(expected, seq)
def _exact(csrange):
    '''
    Build an exact FeatureLocation from the first element of ``csrange`` up
    to one past its last element.
    '''
    from Bio.SeqFeature import ExactPosition, FeatureLocation

    first, last = csrange[0], csrange[-1]
    return FeatureLocation(ExactPosition(first), ExactPosition(last + 1))
def partitionLines(self, split_factor=1.05):
    """Compute start/end positions for each row the genome is split into.

    Returns a tuple ``(rowData, avgRowLength, _internal_maxrowlength)``:
    ``rowData`` maps a row number (starting at 1) to a dict with ``"start"``
    and ``"end"`` entries, ``avgRowLength`` is the target row length, and
    ``_internal_maxrowlength`` is the largest ``end - start`` over all rows.

    NOTE(review): the nesting below was reconstructed from a flattened source
    line; verify it against the original file.
    """
    # target length of a row: genome length spread over rows * split_factor
    avgRowLength = int(
        float(self.genome_length) / float(self.rows * split_factor))

    # seed the items with 100 evenly spaced zero-length locations
    # (presumably so rows can break in feature-free regions -- confirm)
    fake_count = 100
    items = []
    for i in range(fake_count):
        key = int(float(self.genome_length * i) / fake_count)
        items.append(FeatureLocation(key, key, strand=1))

    # add the locations of all objects from the included classes
    for x in self.classes:
        if self.classes[x].included:
            items += [y.location for y in self.classes[x].objects]

    # NOTE(review): longest_last_object is never updated after this point
    longest_last_object = 1
    thisRowEnd = 1 + avgRowLength
    currentRow = 1
    _internal_maxrowlength = 0
    rowData = {1: {"start": ExactPosition(1)}}

    for item in sorted(items, key=lambda x: x.start):
        # the item starts at/after, or extends past, the planned row end:
        # close the current row and open the next one
        if item.start >= thisRowEnd or item.end > thisRowEnd:
            # NOTE(review): rowData[currentRow] has no "end" key yet when the
            # right-hand operand is evaluated, so this looks like it raises
            # KeyError whenever self.justified is falsy -- confirm.
            if self.justified or item.start >= rowData[currentRow]["end"]:
                rowData[currentRow]["end"] = thisRowEnd
            else:
                rowData[currentRow]["end"] = max(longest_last_object,
                                                 item.start)

            _internal_maxrowlength = max(
                _internal_maxrowlength,
                rowData[currentRow]["end"] - rowData[currentRow]["start"],
            )

            currentRow += 1
            rowData[currentRow] = {}
            # the new row starts at the item, or just after the previous row
            if item.start <= rowData[currentRow - 1]["end"]:
                rowData[currentRow]["start"] = item.start
            else:
                rowData[currentRow]["start"] = rowData[currentRow - 1]["end"] + 1
            thisRowEnd = avgRowLength + rowData[currentRow]["start"]

    # the last row always ends one past the genome length
    thisRowEnd = rowData[currentRow]["end"] = ExactPosition(
        self.genome_length + 1)
    _internal_maxrowlength = max(
        _internal_maxrowlength,
        rowData[currentRow]["end"] - rowData[currentRow]["start"],
    )
    return rowData, avgRowLength, _internal_maxrowlength
def getLocation(self, left, right, strand, partial_flag="00"):
    """Build a FeatureLocation from 1-based ``left``/``right`` coordinates.

    ``strand`` is forward (1) when it equals "+", "1" or 1, otherwise reverse (-1).
    ``partial_flag`` is a two-character string: the first character refers to
    the left end and the second to the right end, with "1" marking a missing
    end (00: both ends existing, 10: left end missing, 01: right end missing,
    11: both ends missing).
    """
    if strand == "+" or strand == "1" or strand == 1:
        strand = 1
    else:
        strand = -1

    left_missing = partial_flag[0] == "1"
    left_type = BeforePosition if left_missing else ExactPosition
    leftPosition = left_type(int(left) - 1)

    right_missing = partial_flag[1] == "1"
    right_type = AfterPosition if right_missing else ExactPosition
    rightPosition = right_type(int(right))

    return FeatureLocation(leftPosition, rightPosition, strand=strand)
def test_single_contained(self):
    """One ORF is found with or without flanking N's on either side."""
    orf = "ATG" + "N" * 60 + "TAG"
    # (leading flank, expected start, expected end)
    for lead, begin, finish in (("", 0, 66), ("NNN", 3, 69)):
        expected = [
            FeatureLocation(ExactPosition(begin), ExactPosition(finish), strand=1)
        ]
        for tail in ("", "NNN"):
            self.run_both_dirs(expected, lead + orf + tail)
def gb(self):
    """Return this object rendered as GenBank-formatted text.

    The record id and name are the first eight characters of ``self.name``;
    each entry of ``self.features()`` becomes a SeqFeature whose start is
    shifted by one and whose qualifiers map ``q.name`` to ``q.data``.
    """
    short_name = self.name[0:8]
    g = SeqRecord(
        Seq(self.sequence(), IUPAC.IUPACUnambiguousDNA()),
        id=short_name,
        name=short_name,
        description=self.description,
    )

    converted = []
    for f in self.features():
        span = FeatureLocation(ExactPosition(f.start - 1), ExactPosition(f.end))
        qualifier_map = {q.name: q.data for q in f.qualifiers.all()}
        converted.append(SeqFeature(span, f.type, qualifiers=qualifier_map))
    g.features = converted

    return g.format('genbank')
def test_multi_start_single_stop(self):
    """Two in-frame starts sharing one stop give a single, longest ORF."""
    seq = "ATGNNNATG" + "N" * 60 + "TAG"
    expected = [FeatureLocation(ExactPosition(0), ExactPosition(72), strand=1)]

    forward_locations = [
        feat.location for feat in find_all_orfs(DummyRecord(seq=seq))
    ]
    assert expected == forward_locations

    # same check on the reverse complement, now expected on the other strand
    seq = str(DummyRecord(seq=seq).seq.reverse_complement())
    expected[0].strand = -1
    reverse_locations = [
        feat.location for feat in find_all_orfs(DummyRecord(seq=seq))
    ]
    assert expected == reverse_locations
def split_gbk(seq_records, outname, format = False):
    """Merge several records into one, separated by 200-N assembly gaps.

    For every ``fasta_record`` feature (or ``source`` feature, as long as no
    ``fasta_record`` feature has been seen yet) the covered slice of the
    record is appended to the merged record, followed by 200 ``N`` annotated
    as an ``assembly_gap`` feature. ``source``/``fasta_record`` features are
    then removed from the merged record, except one at index 0. Identifier,
    name, description and annotations are taken from the first record.

    :param seq_records: sequence of records; must support indexing
    :param outname: path that is created/truncated; nothing is written to it
    :param format: unused
    :return: the merged record without the trailing 200-N gap

    NOTE(review): if no feature qualifies, the merged record is still the
    initial empty string and the ``.features`` access below fails.
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition

    # The original opened this file and never used or closed the handle.
    # Keep the create/truncate side effect but release the handle at once.
    open(outname, "w").close()

    merged_record = ''
    fasta_record = False
    for i, record in enumerate(seq_records):
        print(i)  # progress output; single-argument form works on Python 2 and 3
        for feature in record.features:
            if feature.type == "fasta_record":
                fasta_record = True
            elif feature.type != 'source' or fasta_record:
                # 'source' features only count until a fasta_record is seen
                continue
            merged_record += record[feature.location.start:feature.location.end]
            merged_record += "N" * 200
            gap_end = len(merged_record)
            my_feature_location = FeatureLocation(ExactPosition(gap_end - 200),
                                                  ExactPosition(gap_end))
            my_feature = SeqFeature(my_feature_location, type="assembly_gap")
            merged_record.features.append(my_feature)

    to_remove = [n for n, feature in enumerate(merged_record.features)
                 if feature.type in ('source', 'fasta_record')]
    # delete from the back so the remaining indices stay valid
    for index in reversed(to_remove):
        if index != 0:
            del merged_record.features[index]

    merged_record.id = seq_records[0].annotations["accessions"][-1]
    try:
        merged_record.description = "%s" % seq_records[0].annotations["organism"]
    except KeyError:
        merged_record.description = 'Unkown bacteria'
    merged_record.annotations = seq_records[0].annotations
    merged_record.name = seq_records[0].annotations["accessions"][-1]
    # drop the gap that was appended after the last segment
    return merged_record[0:-200]
def test_translate_one1() -> None:
    """A G->A change at position 3 of a two-part CDS turns codon GGC into AGC."""
    seq = Seq('ACTGGCG')  # ref @ 4 is G
    first_part = FeatureLocation(ExactPosition(0), ExactPosition(6), 1)
    second_part = FeatureLocation(ExactPosition(8), ExactPosition(11), strand=1)
    cds = SeqFeature(location=CompoundLocation([first_part, second_part], 'join'))

    expected = TResult(
        position=3,
        alt='A',
        codon_position=3,
        ref_codon='GGC',
        alt_codon='AGC',
        in_coding_region=True,
        ref_aa='G',
        alt_aa='S',
        synonymous=False,
        alt_is_invalid_stop=False,
    )
    actual_result = translate_one(seq, [cds], 3, 'A')

    failure_text = f"\n\nGiven {seq._data}:\n\nexpected: {expected} \n actual: {actual_result}"
    eq_(expected, actual_result, failure_text)
def test_exact(self):
    """Features: write/read simple exact locations."""

    def check_and_store(location, strand, expected_text):
        # build a CDS feature, verify its location string, add it to the record
        feature = SeqFeature(location, strand=strand, type="CDS")
        self.assertEqual(_insdc_feature_location_string(feature), expected_text)
        self.record.features.append(feature)

    # Note we don't have to explicitly give an ExactPosition object,
    # an integer will also work:
    check_and_store(FeatureLocation(10, 20), +1, "11..20")
    check_and_store(FeatureLocation(30, 40), -1, "complement(31..40)")
    check_and_store(FeatureLocation(ExactPosition(50), ExactPosition(60)), +1, "51..60")
    self.write_read_check()
def add_gaps(gbk_record, start_end_list):
    """Annotate each ``(start, end)`` pair as an ``assembly_gap`` feature.

    :param gbk_record: record whose ``features`` list is extended in place
    :param start_end_list: iterable of ``(start, end)`` pairs, used as given
        for the exact start and end positions of each gap
    :return: the same ``gbk_record`` object
    """
    from Bio.SeqFeature import SeqFeature, FeatureLocation, ExactPosition

    for start, end in start_end_list:
        gap_location = FeatureLocation(ExactPosition(start), ExactPosition(end))
        gbk_record.features.append(SeqFeature(gap_location, type="assembly_gap"))
    return gbk_record
def parse_position(string: str):
    """ Converts a position from a string into a Position subclass

        Arguments:
            string: the text to convert: "<N" gives a BeforePosition, ">N" an
                    AfterPosition, "UnknownPosition()" an UnknownPosition and
                    a plain integer an ExactPosition

        Returns:
            an instance of the matching position class

        Raises:
            ValueError: if the numeric part is not an integer; this includes
                        the empty string, which previously raised IndexError
    """
    # startswith() is safe on an empty string, unlike indexing with [0]
    if string.startswith('<'):
        value = int(string[1:])
        return BeforePosition(value)
    if string.startswith('>'):
        value = int(string[1:])
        return AfterPosition(value)
    if string == "UnknownPosition()":
        return UnknownPosition()
    value = int(string)
    return ExactPosition(value)
def test_unknown_position(self):
    """An unknown end position survives conversion; the exact start stays 1."""
    original = FeatureLocation(ExactPosition(1), UnknownPosition(), strand=1)
    converted = self.convert(original)

    converted_start = converted.start
    assert isinstance(converted_start, ExactPosition)
    assert converted_start == 1
    assert isinstance(converted.end, UnknownPosition)
def getgenefromgbk(gbkfile, location):  # change to work with locations
    """parses a genesequence from a gbk file using the gene location

    parameters
    ----------
    gbkfile
        string, path to gbk file + file
    location
        string of comma separated values "scaffold,start,end,strand";
        start may carry a "<" and end a ">" to mark an open end,
        example: "3,<10,250>,1"
    returns
    ----------
    ret = DNA sequence of housekeepinggene from featurelocation coordinates;
          stays "" when no record matches the scaffold number
    abs_loc = validation, contains the location of HG on specific scaffold.
              [scaffold, start, end]
    """
    ret = ""
    scaff_number, start, end, strand = location.split(",")
    scaff_number = int(scaff_number)

    # Making the FeatureLocation
    if "<" in start:
        f_start = BeforePosition(int(start.strip("<")))
    else:
        f_start = ExactPosition(int(start))
    if ">" in end:
        f_end = AfterPosition(int(end.strip(">")))
    else:
        f_end = ExactPosition(int(end))
    f = FeatureLocation(f_start, f_end, int(strand))

    # VALIDATION
    # Computed once, up front: it does not depend on any record, so it is
    # defined even when no scaffold matches, and start/end are never rebound
    # inside the loop.
    abs_loc = [
        scaff_number,
        int(start.replace(">", "").replace("<", "")),
        int(end.replace(">", "").replace("<", "")),
    ]

    for record in SeqIO.parse(gbkfile, "genbank"):
        record_no = record.name.split(".")[0]
        scaff_check = int(record_no[-3:])  # = scaffold number
        if scaff_check == scaff_number:
            # The DNA sequence of the housekeepinggene
            ret = f.extract(record.seq)
    return (ret, abs_loc)
def test_after_position(self):
    """An AfterPosition end keeps its type and value through conversion."""
    original = FeatureLocation(ExactPosition(1), AfterPosition(6), strand=1)
    converted = self.convert(original)

    converted_start = converted.start
    assert isinstance(converted_start, ExactPosition)
    assert converted_start == 1

    converted_end = converted.end
    assert isinstance(converted_end, AfterPosition)
    assert converted_end == 6
def test_before_position(self):
    """A BeforePosition start keeps its type and value through conversion."""
    original = FeatureLocation(BeforePosition(1), ExactPosition(6), strand=-1)
    converted = self.convert(original)

    converted_start = converted.start
    assert isinstance(converted_start, BeforePosition)
    assert converted_start == 1

    converted_end = converted.end
    assert isinstance(converted_end, ExactPosition)
    assert converted_end == 6
def addTLAFeatures(genbank, fragment_list):
    """
    Append one 'tla' feature per fragment to the genbank record.

    Each feature spans ``frag.start`` to ``frag.end`` and carries a single
    qualifier, 'label', numbered from zero in list order
    ('TLA_Region_0', 'TLA_Region_1', ...). The record is modified in place
    and also returned.

    Note carried over from the original author: SnapGene reportedly widens
    '1bp long' features to 2 base pairs; the front of the feature is its
    location.
    """
    for region_number, frag in enumerate(fragment_list):
        span = FeatureLocation(ExactPosition(frag.start), ExactPosition(frag.end))
        genbank.features.append(
            SeqFeature(span, type='tla', id='tla',
                       qualifiers={'label': 'TLA_Region_%i' % region_number})
        )
    return genbank