def make_genbank_recs(rec):
    """Annotate a scaffold record with one CDS feature per matching protein.

    Proteins are taken from the module-level ``protein_recs`` and matched to
    the scaffold by id prefix (``<scaffold id>_``). Each protein description
    is split on ``' # '``; fields 1, 2 and 3 are read as start, end and
    strand.

    Args:
        rec: Scaffold record. It is modified in place (features appended).

    Returns:
        The same ``rec`` object, with the new CDS features appended.

    Raises:
        IndexError: If a description has fewer than four ``' # '`` fields.
        ValueError: If start, end or strand is not an integer.
    """
    new_rec = rec
    prefix = new_rec.id + '_'
    scaffold_recs = [p for p in protein_recs if p.id.startswith(prefix)]

    for protein_rec in scaffold_recs:
        # Split the header once instead of once per field.
        fields = protein_rec.description.split(' # ')
        start = int(fields[1])
        end = int(fields[2])
        strand = int(fields[3])

        # NOTE(review): start is used unshifted; if these coordinates are
        # 1-based inclusive the feature starts one base late - confirm.
        # Strand lives on the location: recent Biopython releases no longer
        # accept a ``strand`` argument on the SeqFeature constructor.
        rec_location = FeatureLocation(SeqFeature.ExactPosition(start),
                                       end,
                                       strand=strand)
        rec_feature = SeqFeature.SeqFeature(rec_location, type="CDS")

        # ORF identifiers and sequence
        rec_feature.qualifiers['protein_id'] = protein_rec.id
        rec_feature.qualifiers['translation'] = protein_rec.seq
        rec_feature.qualifiers['locus_tag'] = protein_rec.description
        new_rec.features.append(rec_feature)

    return new_rec
def create_feature_annot(loc_range, featuretype, s):
    """Create a new feature annotation at loc_range with featuretype on strand s.

    Args:
        loc_range: Indexable pair; item 0 is the start, item 1 the end.
        featuretype: Value stored as the feature's ``type``.
        s: Strand value for the feature.

    Returns:
        The new ``SeqFeature.SeqFeature``.
    """
    # Strand is set on the location rather than through SeqFeature's
    # ``strand`` argument, which recent Biopython releases no longer accept.
    location = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(loc_range[0]),
        SeqFeature.ExactPosition(loc_range[1]),
        strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
# modify_genbank(gb_file, fasta_file) -> out_file
#
# Rewrites the features of a GenBank file using the dictionary returned by
# get_final_annotations(genome), where genome is the sequence read from
# fasta_file, and writes the result in "genbank" format.
#
# - The output path is built from the two regex groups matched against
#   gb_file, joined with '_modified.'.
# - "gene" and "CDS" features are looked up by their first locus_tag
#   qualifier. For a tag present in final_annotations the location is rebuilt
#   with start = annotated "start" - 1, keeping the existing end and strand.
# - For "CDS" features the first "product" and "translation" qualifier values
#   are overwritten from the "function" and "translation" annotation entries.
# - Returns the output path.
#
# NOTE(review): the '.' between the two regex groups is unescaped, so it
#   matches any character - confirm whether r'\.' was intended.
# NOTE(review): re.search() returning None (path not matching the pattern)
#   is not handled; .group(1) would then raise AttributeError.
# NOTE(review): the handle from open(gb_file, "r") is never explicitly closed.
# NOTE(review): relies on feature.location.end.position being available in
#   the installed Biopython - confirm.
def modify_genbank(gb_file, fasta_file): gb_filename = re.search(r'(.*/users/.*/uploads/.*).(\w*)', gb_file) out_file = str(gb_filename.group(1)) + '_modified.' + str( gb_filename.group(2)) genome = SeqIO.read(fasta_file, "fasta").seq final_annotations = get_final_annotations(genome) final_features = [] for record in SeqIO.parse(open(gb_file, "r"), "genbank"): for feature in record.features: if feature.type == "gene" or feature.type == "CDS": locus_tag = feature.qualifiers["locus_tag"][0] if locus_tag in final_annotations.keys(): new_start = final_annotations[locus_tag]["start"] feature.location = SeqFeature.FeatureLocation( SeqFeature.ExactPosition(new_start - 1), SeqFeature.ExactPosition( feature.location.end.position), feature.location.strand) if feature.type == "CDS": feature.qualifiers["product"][0] = final_annotations[ locus_tag]["function"] feature.qualifiers["translation"][ 0] = final_annotations[locus_tag]["translation"] else: continue final_features.append(feature) # Append final features record.features = final_features with open(out_file, "w") as new_gb: SeqIO.write(record, new_gb, "genbank") return out_file
def createFEATUREannot(loc_range, featuretype, s):
    """Create a new SeqFeature with ExactPositions based on a range.

    Args:
        loc_range: Indexable pair; item 0 is the start, item 1 the end.
        featuretype: Value stored as the feature's ``type``.
        s: Strand value for the feature.

    Returns:
        The new ``SeqFeature.SeqFeature``.
    """
    start = SeqFeature.ExactPosition(loc_range[0])
    end = SeqFeature.ExactPosition(loc_range[1])
    # Strand goes on the location: recent Biopython releases no longer accept
    # a ``strand`` argument on the SeqFeature constructor.
    location = SeqFeature.FeatureLocation(start, end, strand=s)
    return SeqFeature.SeqFeature(location, type=featuretype)
def _extract_regions(gff_iterator): """Function added by KC Jan 2020. This Extracts regions from the first annotated position to the last annotated position, and updates the locations to correspond to the location in the sequence. """ for rec in gff_iterator: pos = [] loc = min([i.location.start for i in rec.features]) endloc = max([i.location.end for i in rec.features]) for i in range(len(rec.features)): pos += range(int(rec.features[i].location.start), int(rec.features[i].location.end)) rec.features[i].location = SeqFeature.FeatureLocation( SeqFeature.ExactPosition(rec.features[i].location.start - loc), SeqFeature.ExactPosition(rec.features[i].location.end - loc), strand=rec.features[i].strand) for j in range(len(rec.features[i].sub_features)): rec.features[i].sub_features[ j].location = SeqFeature.FeatureLocation( SeqFeature.ExactPosition( rec.features[i].sub_features[j].location.start - loc), SeqFeature.ExactPosition( rec.features[i].sub_features[j].location.end - loc), strand=rec.features[i].sub_features[j].strand) rec.seq = rec.seq[loc:endloc] yield rec
def test_GenerateFeatLoc__make_start_fuzzy__1(self):
    ''' Check `GenerateFeatLoc.make_start_fuzzy` on an exact location.

    An exact 5..9 FeatureLocation goes in; the result must still be a
    FeatureLocation and its start must have become a BeforePosition. '''
    from Bio import SeqFeature

    location_object = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(5),
        SeqFeature.ExactPosition(9))
    generator = GnOps.GenerateFeatLoc()
    out = generator.make_start_fuzzy(location_object)

    # Still a FeatureLocation
    self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
    # Start is now fuzzy
    self.assertIsInstance(out.start, Bio.SeqFeature.BeforePosition)
def test_GenerateFeatLoc__make_start_fuzzy__3(self):
    ''' Check `GenerateFeatLoc.make_end_fuzzy` on an exact location.

    An exact 5..9 FeatureLocation goes in; the result must still be a
    FeatureLocation and its end must have become an AfterPosition. '''
    from Bio import SeqFeature

    location_object = SeqFeature.FeatureLocation(
        SeqFeature.ExactPosition(5),
        SeqFeature.ExactPosition(9))
    generator = GnOps.GenerateFeatLoc()
    out = generator.make_end_fuzzy(location_object)

    # Still a FeatureLocation
    self.assertIsInstance(out, Bio.SeqFeature.FeatureLocation)
    # End is now fuzzy
    self.assertIsInstance(out.end, Bio.SeqFeature.AfterPosition)
def _make_position(location_string, offset=0):
    """Convert a Swiss location token into a SeqFeature position (PRIVATE).

    Recognised forms are ``"?"`` (unknown), a bare integer (exact), and an
    integer prefixed by ``<`` (before), ``>`` (after) or ``?`` (uncertain).
    ``offset`` is added to the parsed integer and the result is clamped at
    zero; pass -1 for a start location to make it pythonic.

    Raises NotImplementedError for anything else.
    """
    if location_string == "?":
        return SeqFeature.UnknownPosition()

    # Clamping at zero keeps a 0..0 feature at 0..0 instead of -1..0.
    try:
        return SeqFeature.ExactPosition(max(0, offset + int(location_string)))
    except ValueError:
        pass

    prefixed_kinds = (
        ("<", "BeforePosition"),
        (">", "AfterPosition"),  # e.g. ">13"
        ("?", "UncertainPosition"),  # e.g. "?22"
    )
    for marker, kind in prefixed_kinds:
        if not location_string.startswith(marker):
            continue
        try:
            return getattr(SeqFeature, kind)(
                max(0, offset + int(location_string[1:])))
        except ValueError:
            # Only one prefix can match; fall through to the error below.
            break

    raise NotImplementedError("Cannot parse location '%s'" % location_string)
def write_gb(main_record_file, add="", destination=""):
    """Build the annotated "ePet-cre" record and write it as GenBank and FASTA.

    The main FASTA record is merged with the ``add`` FASTA record through
    ``concatenate_overlapping_sequences`` and annotated with two features: a
    "CDS" spanning the original main sequence (gene "Cre") and a "PolyA"
    covering the following 118 bases ("SV40-PolyA").

    Args:
        main_record_file: Path to a single-record FASTA file; ``~`` is
            expanded.
        add: Path to a single-record FASTA file whose sequence is merged
            onto the main one; ``~`` is expanded.
        destination: Output path stem; ``.gb`` and ``.fastas`` are appended.
    """
    destination = expanduser(destination)
    main_record_file = expanduser(main_record_file)
    main_record = SeqIO.read(main_record_file, "fasta")
    add = expanduser(add)
    add = SeqIO.read(add, "fasta")

    main_record.seq.alphabet = IUPACAmbiguousDNA()
    # Remember where the original sequence ends before it is extended.
    cre_end = len(main_record.seq)
    # str(seq) replaces Seq.tostring(), which Biopython has removed.
    main_record.seq = Seq(
        concatenate_overlapping_sequences(str(main_record.seq),
                                          str(add.seq)),
        IUPACAmbiguousDNA())
    main_record.name = "ePet-cre"
    main_record.id = "ePet-cre"
    main_record.description = "ePet-cre construct from doi:10.1038/nn.2623"

    # (start, end, feature type, qualifier key, qualifier value)
    annotations = (
        (0, cre_end, "CDS", "gene", "Cre"),
        (cre_end, cre_end + 118, "PolyA", "PolyA", "SV40-PolyA"),
    )
    for start, end, feature_type, qualifier, value in annotations:
        feature_location = SeqFeature.FeatureLocation(
            SeqFeature.ExactPosition(start),
            SeqFeature.ExactPosition(end),
            strand=1)
        feature = SeqFeature.SeqFeature(feature_location, type=feature_type)
        feature.qualifiers[qualifier] = value
        main_record.features.append(feature)

    SeqIO.write(main_record, destination + ".gb", "genbank")
    # NOTE(review): ".fastas" may be a typo for ".fasta"; kept as-is because
    # it determines the output file name.
    SeqIO.write(main_record, destination + ".fastas", "fasta")
def get_newends(location, length, strand, end, distance, code, subject_start,
                feat_start, feat_finish, truncated):
    """Recompute one end of a feature and return both ends.

    ``location`` is mirrored around ``distance`` when ``strand`` is not 1,
    extended by ``strand * length`` when ``end`` is "finish", tripled when
    ``code`` is 'A', and finally added to ``subject_start``. The resulting
    coordinate replaces ``feat_start`` (for a "start" end on strand 1 or a
    "finish" end on strand -1) or ``feat_finish`` (all other cases). The new
    position is a Before/After position when ``truncated`` is true and an
    ExactPosition otherwise.

    Returns:
        Tuple ``(feat_start, feat_finish)``.
    """
    # Convert location if on the reverse strand
    if strand != 1:
        location = abs(location - (2 * distance + 1))

    # Correct by the length of the match if at the finish end
    change = location
    if end == "finish":
        change = location + strand * length

    # Multiply by 3 if AA
    if code == 'A':
        change = change * 3

    newend = subject_start + change

    replaces_start = ((end == "start" and strand == 1)
                      or (end == "finish" and strand == -1))
    if replaces_start:
        if truncated:
            feat_start = SeqFeature.BeforePosition(newend)
        else:
            feat_start = SeqFeature.ExactPosition(newend)
    else:
        if truncated:
            feat_finish = SeqFeature.AfterPosition(newend)
        else:
            feat_finish = SeqFeature.ExactPosition(newend)

    return (feat_start, feat_finish)
def _parse_position(element, offset=0): try: position = int(element.attrib["position"]) + offset except KeyError as err: position = None status = element.attrib.get("status", "") if status == "unknown": assert position is None return SeqFeature.UnknownPosition() elif not status: return SeqFeature.ExactPosition(position) elif status == "greater than": return SeqFeature.AfterPosition(position) elif status == "less than": return SeqFeature.BeforePosition(position) elif status == "uncertain": return SeqFeature.UncertainPosition(position) else: raise NotImplementedError("Position status %r" % status)
#!/usr/bin/env python
"""Test the Location code located in SeqFeature.py

This checks to be sure fuzzy and non-fuzzy representations of locations
are working properly.
"""
from Bio import SeqFeature

# --- test fuzzy representations
print("Testing fuzzy representations...")

# check the positions alone
exact_pos = SeqFeature.ExactPosition(5)
within_pos_s = SeqFeature.WithinPosition(10, left=10, right=13)
within_pos_e = SeqFeature.WithinPosition(13, left=10, right=13)
between_pos_e = SeqFeature.BetweenPosition(24, left=20, right=24)
before_pos = SeqFeature.BeforePosition(15)
after_pos = SeqFeature.AfterPosition(40)

# Every print is a single-argument call so the script runs unchanged on
# Python 2 and Python 3 and produces the same text on both.
print("Exact: %s" % (exact_pos,))
print("Within (as start, %i): %s" % (int(within_pos_s), within_pos_s))
print("Within (as end, %i): %s" % (int(within_pos_e), within_pos_e))
print("Between (as end, %i): %s" % (int(between_pos_e), between_pos_e))
print("Before: %s" % (before_pos,))
print("After: %s" % (after_pos,))

# put these into Locations
location1 = SeqFeature.FeatureLocation(exact_pos, within_pos_e)
location2 = SeqFeature.FeatureLocation(before_pos, between_pos_e)
location3 = SeqFeature.FeatureLocation(within_pos_s, after_pos)
# genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput,
#               buildCloroplast=False, dLoopSize=800, nWalk=20) -> SeqRecord
#
# Overview (see also the function's own docstring):
#   1. Reads the FASTA record in resultFile, upper-cases its sequence, derives
#      name/id from the first 10 characters of the name plus '_draft', and
#      writes it to resultGbFile in GenBank format without features.
#   2. Counts in dico_intron, per gene name (the part of seq2 before the first
#      "_"), the alignments whose seq2 does not contain 'trn', 'rrn',
#      'ribosomal' or 'rnr' (case-insensitive).
#   3. Re-reads resultGbFile and, for every alignment, builds a feature from
#      startBase/endBase: type "tRNA" when seq2 contains 'trn', "rRNA" for the
#      other RNA markers, "gene" otherwise. Strand is -1 when frame < 0,
#      otherwise 1. For genes the end is nudged by one base so the span length
#      is a multiple of 3.
#   4. For genes, walks at most nWalk codons looking for a start codon and
#      then for a stop codon from the alignment's translationTable, updating
#      startBase/endBase on the alignment object and the feature location,
#      then stores the translation in a 'translation' qualifier
#      ('ERROR' if anything in that step raises).
#   5. Appends a gene feature plus a CDS feature for genes, or a gene feature
#      plus the tRNA/rRNA feature otherwise, and returns the SeqRecord.
#
# NOTE(review): dict.has_key() exists only on Python 2; the function as
#   written cannot run on Python 3.
# NOTE(review): buildCloroplast and dLoopSize are referenced only inside the
#   triple-quoted (disabled) D-loop block; `count` and `dLoopFound` are
#   assigned but never read by live code.
# NOTE(review): several bare `except: pass` handlers swallow every error
#   raised during the codon walk.
def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloroplast=False, dLoopSize=800, nWalk=20): ''' Creates a genbank file based on a fasta file given (resultfile) and a list of features that the genbank file should present (listoffeaturestooutput) ''' #creating the genbank file, not annotated, to be opened afterwards and have the features inserted with open(resultGbFile, "w") as outputResult: finalResults = SeqIO.read(open(resultFile, 'rU'), "fasta", generic_dna) finalResults.seq = finalResults.seq.upper() finalResults.name = finalResults.name[0:10] + '_draft' finalResults.id = finalResults.name[0:10] + '_draft' if len( finalResults.name ) > 16: #has to 16 characters long at max, or else genbank file throws error finalResults.name = finalResults.name[0:16] finalResults.id = finalResults.id[0:16] count = SeqIO.write(finalResults, outputResult, "genbank") dico_intron = {} for thisFeatureAlignment in listOfFeaturesToOutput: if not ('trn' in thisFeatureAlignment.seq2.lower() or 'rrn' in thisFeatureAlignment.seq2.lower() \ or 'ribosomal' in thisFeatureAlignment.seq2.lower() or 'rnr' in thisFeatureAlignment.seq2.lower()): if dico_intron.has_key(thisFeatureAlignment.seq2.split("_")[0]): dico_intron[thisFeatureAlignment.seq2.split("_")[0]] += 1 else: dico_intron[thisFeatureAlignment.seq2.split("_")[0]] = 1 dico_gene = {} with open( resultGbFile, "rU" ) as outputResult: #opening the output file, this time to insert the features finalResults = SeqIO.read(outputResult, "genbank", generic_dna) #lastFeatureAlignment = None dLoopFound = False for thisFeatureAlignment in listOfFeaturesToOutput: # 1. 
Define a feature type as a text string main_feature_qualifiers = { } #create qualifiers dict where the name will be stored if 'trn' in thisFeatureAlignment.seq2.lower() or 'rrn' in thisFeatureAlignment.seq2.lower() \ or 'ribosomal' in thisFeatureAlignment.seq2.lower() or 'rnr' in thisFeatureAlignment.seq2.lower(): main_feature_qualifiers['product'] = thisFeatureAlignment.seq2 if 'trn' in thisFeatureAlignment.seq2.lower(): main_feature_type = "tRNA" else: main_feature_type = "rRNA" else: main_feature_qualifiers['gene'] = thisFeatureAlignment.seq2 main_feature_type = "gene" gene = thisFeatureAlignment.seq2.split("_")[0] if dico_gene.has_key(gene): dico_gene[gene] += 1 else: dico_gene[gene] = 1 main_start_pos = SeqFeature.ExactPosition( thisFeatureAlignment.startBase) main_end_pos = SeqFeature.ExactPosition( thisFeatureAlignment.endBase) if main_feature_type == "gene": codonDiff = ((main_end_pos - main_start_pos + 1) % 3) if codonDiff == 2: main_end_pos += 1 elif codonDiff == 1: main_end_pos -= 1 #print main_start_pos #print main_end_pos #print thisFeatureAlignment.frame # 2. Use the locations do define a FeatureLocation if thisFeatureAlignment.frame < 0: strandToOutput = -1 else: strandToOutput = 1 main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) # 3. 
Create a SeqFeature main_feature = SeqFeature.SeqFeature( main_feature_location, type=main_feature_type, qualifiers=main_feature_qualifiers) ''' #find d-loop part #basically just look for a big gap between last feature and this current feature, if there is a gap that #is about the size of a d-loop, it most likely is a dloop, since nothing aligned with it and it has that size #ignore this check if a cloroplast was built if lastFeatureAlignment != None and dLoopFound == False and buildCloroplast == False and dLoopSize > 0: if thisFeatureAlignment.startBase > lastFeatureAlignment.endBase + dLoopSize \ and thisFeatureAlignment.startBase < lastFeatureAlignment.endBase + 3200: dLoopFound = True dLoopStartPos = SeqFeature.ExactPosition(lastFeatureAlignment.endBase) dLoopEndPos = SeqFeature.ExactPosition(thisFeatureAlignment.startBase) dLoopLocation = SeqFeature.FeatureLocation(dLoopStartPos,dLoopEndPos,strand=-1) dLoopType = "D-loop" dLoopFeature = SeqFeature.SeqFeature(dLoopLocation,type=dLoopType) finalResults.features.append(dLoopFeature) lastFeatureAlignment = thisFeatureAlignment ''' # 4. 
Append your newly created SeqFeature to your SeqRecord if main_feature_type == "gene": cds_qualifiers = dict(main_feature_qualifiers) coding_dna = Seq( str(finalResults.seq[thisFeatureAlignment.startBase - 1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna = coding_dna.reverse_complement() translationTable = thisFeatureAlignment.translationTable tableToUse = CodonTable.unambiguous_dna_by_id[translationTable] listOfStartCodons = [] listOfStopCodons = [] for startCodon in tableToUse.start_codons: """startCodonSeq = Seq(startCodon, IUPAC.unambiguous_dna) startCodonTranslation = str(startCodonSeq.translate(table=translationTable)) if startCodonTranslation not in listOfStartCodons: listOfStartCodons.append(startCodonTranslation)""" if startCodon not in listOfStartCodons: listOfStartCodons.append(startCodon) startCodons = tuple( listOfStartCodons ) #need to make it a tuple so that startswith works with it for stopCodon in tableToUse.stop_codons: if stopCodon not in listOfStopCodons: listOfStopCodons.append(stopCodon) stopCodons = tuple(listOfStopCodons) nWalkStart = int(nWalk) nWalkStop = int(nWalk) ''' For genes in the -1 strand, we look for the stop codons at the start and the start codons at the end! 
''' """ if strandToOutput == -1: tempStartCodons = startCodons tempStopCodons = stopCodons startCodons = tempStopCodons stopCodons = tempStartCodons nWalkStart = nWalk nWalkStop = nWalk """ try: ''' Making sure it starts with startCodons ''' try: coding_dna_Forward = coding_dna coding_dna_Backward = coding_dna startBase = int(thisFeatureAlignment.startBase) endBase = int(thisFeatureAlignment.endBase) n = 0 if strandToOutput == 1: while not coding_dna_Forward.startswith(startCodons) \ and not coding_dna_Backward.startswith(startCodons) \ and not coding_dna_Backward.startswith(stopCodons) \ and dico_gene.get(gene) == 1 and n < nWalkStart and startBase - (3*(n+1)) >= 0: try: n += 1 coding_dna_Backward = Seq( str(finalResults.seq[startBase - 1 - (3 * n):endBase]), IUPAC.unambiguous_dna) coding_dna_Forward = Seq( str(finalResults.seq[startBase - 1 - (3 * n):endBase]), IUPAC.unambiguous_dna) '''print str(strandToOutput) print "looking for start ="+str(startCodons) print "coding_dna_Forward" print coding_dna_Forward print coding_dna_Forward.startswith(startCodons) print "coding_dna_Backward" print coding_dna_Backward print coding_dna_Backward.startswith(startCodons)''' except: pass else: if coding_dna_Forward.startswith(startCodons): main_start_pos = SeqFeature.ExactPosition( startBase - (3 * n)) thisFeatureAlignment.startBase = main_start_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) elif coding_dna_Backward.startswith( startCodons): main_start_pos = SeqFeature.ExactPosition( startBase - (3 * n)) thisFeatureAlignment.startBase = main_start_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) elif coding_dna_Backward.startswith( stopCodons ): # we look for a start inside the hit n = 0 while not coding_dna_Forward.startswith( startCodons ) and n < nWalkStart and startBase + ( 3 * (n + 1)) <= endBase: try: n += 1 coding_dna_Forward = Seq( 
str(finalResults. seq[startBase - 1 + (3 * n):endBase]), IUPAC.unambiguous_dna) except: pass else: if coding_dna_Forward.startswith( startCodons): main_start_pos = SeqFeature.ExactPosition( startBase + (3 * n)) thisFeatureAlignment.startBase = main_start_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) if strandToOutput == -1: while not coding_dna_Forward.endswith(stopCodons) \ and not coding_dna_Backward.endswith(stopCodons) \ and dico_gene.get(gene) == 1 and n < nWalkStart and startBase - (3*(n+1)) >= 0 and startBase + (3*(n+1)) <= endBase: try: n += 1 coding_dna_Backward = Seq( str(finalResults.seq[startBase - 1 - (3 * n):endBase]), IUPAC.unambiguous_dna) coding_dna_Backward = coding_dna_Backward.reverse_complement( ) coding_dna_Forward = Seq( str(finalResults.seq[startBase - 1 + (3 * n):endBase]), IUPAC.unambiguous_dna) coding_dna_Forward = coding_dna_Forward.reverse_complement( ) '''print str(strandToOutput) print "looking for stop ="+str(stopCodons) print "coding_dna_Forward" print coding_dna_Forward print coding_dna_Forward.endswith(stopCodons) print "coding_dna_Backward" print coding_dna_Backward print coding_dna_Backward.endswith(stopCodons)''' except: pass else: if coding_dna_Forward.endswith(stopCodons): main_start_pos = SeqFeature.ExactPosition( startBase + (3 * n)) thisFeatureAlignment.startBase = main_start_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) elif coding_dna_Backward.endswith(stopCodons): main_start_pos = SeqFeature.ExactPosition( startBase - (3 * n)) thisFeatureAlignment.startBase = main_start_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) except: pass ''' Updating coding_dna with (new) coordinates ''' coding_dna = Seq( str(finalResults.seq[thisFeatureAlignment.startBase - 1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) if strandToOutput 
== -1: coding_dna = coding_dna.reverse_complement() ''' Making sure it ends with * (stop codon) ''' try: coding_dna_Forward = coding_dna coding_dna_Backward = coding_dna startBase = int(thisFeatureAlignment.startBase) endBase = int(thisFeatureAlignment.endBase) n = 0 if strandToOutput == 1: while not coding_dna_Forward.endswith(stopCodons) \ and not coding_dna_Backward.endswith(stopCodons) \ and dico_gene.get(gene) == dico_intron.get(gene) and n < nWalkStop and endBase + (3*(n+1)) <= len(finalResults): try: n += 1 coding_dna_Backward = Seq( str(finalResults.seq[startBase - 1:endBase - (3 * n)]), IUPAC.unambiguous_dna) coding_dna_Forward = Seq( str(finalResults.seq[startBase - 1:endBase + (3 * n)]), IUPAC.unambiguous_dna) '''print str(strandToOutput) print "looking for stop ="+str(stopCodons) print "coding_dna_Forward" print coding_dna_Forward print coding_dna_Forward.endswith(stopCodons) print "coding_dna_Backward" print coding_dna_Backward print coding_dna_Backward.endswith(stopCodons)''' except: pass else: if coding_dna_Backward.endswith(stopCodons): main_end_pos = SeqFeature.ExactPosition( endBase - (3 * n)) thisFeatureAlignment.endBase = main_end_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) elif coding_dna_Forward.endswith(stopCodons): main_end_pos = SeqFeature.ExactPosition( endBase + (3 * n)) thisFeatureAlignment.endBase = main_end_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) if strandToOutput == -1: while not coding_dna_Forward.startswith(startCodons) \ and not coding_dna_Backward.startswith(startCodons) \ and not coding_dna_Forward.startswith(stopCodons) \ and dico_gene.get(gene) == dico_intron.get(gene) and n < nWalkStop and endBase + (3*(n+1)) <= len(finalResults): try: n += 1 coding_dna_Backward = Seq( str(finalResults.seq[startBase - 1:endBase + (3 * n)]), IUPAC.unambiguous_dna) coding_dna_Backward = 
coding_dna_Backward.reverse_complement( ) coding_dna_Forward = Seq( str(finalResults.seq[startBase - 1:endBase + (3 * n)]), IUPAC.unambiguous_dna) coding_dna_Forward = coding_dna_Forward.reverse_complement( ) '''print str(strandToOutput) print "looking for start ="+str(startCodons) print "coding_dna_Forward" print coding_dna_Forward print coding_dna_Forward.startswith(startCodons) print "coding_dna_Backward" print coding_dna_Backward print coding_dna_Backward.startswith(startCodons)''' except: pass else: if coding_dna_Backward.startswith(startCodons): main_end_pos = SeqFeature.ExactPosition( endBase + (3 * n)) thisFeatureAlignment.endBase = main_end_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) elif coding_dna_Forward.startswith( startCodons): main_end_pos = SeqFeature.ExactPosition( endBase + (3 * n)) thisFeatureAlignment.endBase = main_end_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) elif coding_dna_Forward.startswith( stopCodons ): # we look for a start inside the hit n = 0 while not coding_dna_Forward.startswith( startCodons ) and n < nWalkStop and endBase - ( 3 * (n + 1)) >= startBase: try: n += 1 coding_dna_Forward = Seq( str(finalResults. 
seq[startBase - 1:endBase - (3 * n)]), IUPAC.unambiguous_dna) coding_dna_Forward = coding_dna_Forward.reverse_complement( ) except: pass else: if coding_dna_Forward.startswith( startCodons): main_end_pos = SeqFeature.ExactPosition( endBase - (3 * n)) thisFeatureAlignment.endBase = main_end_pos main_feature_location = SeqFeature.FeatureLocation( main_start_pos - 1, main_end_pos, strand=strandToOutput) except: pass coding_dna = Seq( str(finalResults.seq[thisFeatureAlignment.startBase - 1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) '''print "\n\nFINAL SEQUENCE IS:" if strandToOutput == 1: print coding_dna+"\n" else: print coding_dna.reverse_complement()+"\n"''' if strandToOutput == 1: coding_dna_Translation = coding_dna.translate( table=translationTable) else: coding_dna_Translation = coding_dna.reverse_complement( ).translate(table=translationTable) cds_qualifiers['translation'] = coding_dna_Translation except: cds_qualifiers['translation'] = 'ERROR' cds_feature = SeqFeature.SeqFeature(main_feature_location, type='CDS', qualifiers=cds_qualifiers) main_feature = SeqFeature.SeqFeature( main_feature_location, type=main_feature_type, qualifiers=main_feature_qualifiers) finalResults.features.append(main_feature) finalResults.features.append(cds_feature) else: #if it's a tRNA or rRNA gene_feature = SeqFeature.SeqFeature( main_feature_location, type='gene', qualifiers=main_feature_qualifiers) finalResults.features.append(gene_feature) finalResults.features.append(main_feature) #returns the final SeqRecord object, with all features, so that the script that called genbankOutput can output this result whatever way #it wants return finalResults
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Convert a Prodigal ``.sco`` gene listing into .ffn, .faa and .gbk files.

    Args:
        seq_file: FASTA file with the nucleotide sequences.
        sco_file: Prodigal coordinate file. ``# Sequence Data`` lines start a
            new sequence, ``# Model Data`` lines carry its translation
            table, and every other line is read as one gene whose
            ``_``-separated fields after the first are start, end and strand.
        prefix: Basename of the output files and prefix of each locus tag.
        output_folder: Folder the three output files are written to.

    Raises:
        KeyError: If a sequence in ``seq_file`` has no entry in ``sco_file``.
        ValueError: If a gene line does not hold integer coordinates.
    """
    pwd_bin_ffn_file = '%s/%s' % (output_folder, '%s.ffn' % prefix)
    pwd_bin_faa_file = '%s/%s' % (output_folder, '%s.faa' % prefix)
    pwd_bin_gbk_file = '%s/%s' % (output_folder, '%s.gbk' % prefix)

    # get sequence id list (file order is kept for the output)
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    current_seq_id = ''
    current_transl_table = ''
    current_seq_csd_list = []
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    with open(sco_file) as sco_handle:
        for each_cds in sco_handle:
            if each_cds.startswith('# Sequence Data'):
                # store the sequence finished so far, then start a new one
                if current_seq_id != '':
                    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
                    seq_to_transl_table_dict[
                        current_seq_id] = current_transl_table
                current_seq_id = each_cds.strip().split(
                    '=')[-1][1:-1].split(' ')[0]
                current_transl_table = ''
                current_seq_csd_list = []
            elif each_cds.startswith('# Model Data'):
                current_transl_table = each_cds.strip().split(
                    ';')[-2].split('=')[-1]
            else:
                current_seq_csd_list.append('_'.join(
                    each_cds.strip().split('_')[1:]))
    # the last sequence has no following header to trigger its storage
    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
    seq_to_transl_table_dict[current_seq_id] = current_transl_table

    strand_by_symbol = {'+': 1, '-': -1}
    gene_index = 1
    # Context managers close all three outputs even if a record fails.
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:
        for seq_id in sequence_id_list:
            sequence_str = id_to_sequence_dict[seq_id]

            # create SeqRecord
            current_SeqRecord = SeqRecord(Seq(sequence_str), id=seq_id)
            current_SeqRecord.seq.alphabet = generic_dna
            transl_table = seq_to_transl_table_dict[seq_id]

            # add SeqFeature to SeqRecord
            for cds in seq_to_cds_dict[seq_id]:
                locus_tag_id = '%s_%s' % (prefix,
                                          "{:0>5}".format(gene_index))

                # define FeatureLocation
                cds_split = cds.split('_')
                cds_start = SF.ExactPosition(int(cds_split[0]))
                cds_end = SF.ExactPosition(int(cds_split[1]))
                cds_strand = cds_split[2]
                # NOTE(review): the sequence slice below uses cds_start - 1
                # but the feature location uses cds_start unshifted; confirm
                # which start convention the GenBank feature should follow.
                current_feature_location = FeatureLocation(
                    cds_start,
                    cds_end,
                    strand=strand_by_symbol.get(cds_strand))

                # get nc sequence (empty for an unrecognised strand symbol)
                sequence_nc = ''
                if cds_strand == '+':
                    sequence_nc = sequence_str[cds_start - 1:cds_end]
                if cds_strand == '-':
                    sequence_nc = str(
                        Seq(sequence_str[cds_start - 1:cds_end],
                            generic_dna).reverse_complement())

                # translate to aa sequence and drop the final character
                # (the stop-codon '*')
                sequence_aa = str(
                    Seq(sequence_nc).translate(table=transl_table))[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '',
                                  bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '',
                                 bin_faa_file_handle)

                current_qualifiers_dict = {
                    'locus_tag': locus_tag_id,
                    'transl_table': transl_table,
                    'translation': sequence_aa,
                }
                current_feature = SeqFeature(
                    current_feature_location,
                    type='CDS',
                    qualifiers=current_qualifiers_dict)
                current_SeqRecord.features.append(current_feature)
                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
def main():
    """Concatenate every sequence of a multi-FASTA file into one record.

    Command-line entry point. Sequences are joined with ``--spacer`` "N"
    characters between them. The result is written as FASTA to ``--outfile``
    and as GenBank to ``<outfile>.gb``, where each input sequence is marked
    by a ``misc_feature`` labelled with its id.

    Exits with a message when run on Python 2.
    """
    if sys.version_info[0] < 3:
        sys.exit('Must be using Python 3. Try calling "python3 concat_seq_multi_fasta.py"')

    parser = argparse.ArgumentParser(
        prog='concat_seq_multi_fasta.py',
        description='Concatenate all sequences found in a fasta file into a single'
                    ' sequence',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information',
                        action='version',
                        version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument(
        '-f', '--fasta', nargs=1, type=argparse.FileType('r'), required=True,
        metavar='/path/to/multi/fasta/file.fasta',
        help='Path to the input multi fasta file')

    parser_optional_general = parser.add_argument_group(
        'General facultative options')
    parser_optional_general.add_argument(
        '-o', '--outfile', type=str,
        metavar='/path/to/output/concatenated/file.fasta',
        help='Path to the directory where the sequences will be stored',
        required=False, default='concatenated.fasta')
    parser_optional_general.add_argument(
        '-s', '--spacer', type=int, metavar='10',
        help='Number of "N"s to be added between sequences',
        required=False, default=100)

    args = parser.parse_args()

    print('\n'
          '===> RUNNING concat_seq_multi_fasta.py <===')

    # argparse has already opened the input; keep the handle so it can be
    # read and closed instead of being left open.
    fasta_handle = args.fasta[0]
    args.fasta = os.path.abspath(fasta_handle.name)
    args.outfile = os.path.abspath(args.outfile)
    os.makedirs(os.path.dirname(args.outfile), exist_ok=True)

    # A non-positive --spacer yields an empty spacer string.
    spacer = 'N' * args.spacer
    chunks = []
    features = {}
    offset = 0
    with fasta_handle:
        for seq in SeqIO.parse(fasta_handle, 'fasta'):
            seq_length = len(seq)
            # Records sharing an id overwrite each other here.
            features[seq.id] = (offset, offset + seq_length)
            chunks.append(str(seq.seq))
            offset += seq_length + len(spacer)

    # Joining puts the spacer only between sequences, never after the last.
    concatenated = SeqRecord(
        Seq(spacer.join(chunks), generic_dna),
        id=os.path.splitext(os.path.basename(args.outfile))[0],
        description='')

    with open(args.outfile, 'wt', newline='\n') as writer:
        SeqIO.write(concatenated, writer, "fasta")

    for id_seq, (start, end) in features.items():
        # Strand is set on the location: recent Biopython releases no longer
        # accept a ``strand`` argument on the SeqFeature constructor.
        my_feature_location = SeqFeature.FeatureLocation(
            SeqFeature.ExactPosition(start),
            SeqFeature.ExactPosition(end),
            strand=0)
        qualifiers = {'label': id_seq,
                      'note': 'Geneious type: Concatenated sequence'}
        my_feature = SeqFeature.SeqFeature(location=my_feature_location,
                                           type="misc_feature",
                                           qualifiers=qualifiers)
        concatenated.features.append(my_feature)

    gb_path = '{base}.{new_type}'.format(base=args.outfile, new_type='gb')
    with open(gb_path, 'wt', newline='\n') as writer:
        SeqIO.write(concatenated, writer, 'genbank')
from Bio import SeqFeature

# Exact (unambiguous) positions
start_pos = SeqFeature.ExactPosition(15)
end_pos = SeqFeature.ExactPosition(30)
location = SeqFeature.FeatureLocation(start_pos, end_pos)
print(start_pos, end_pos, location)

# Fuzzy (not exactly known) positions
start_pos2 = SeqFeature.AfterPosition(1)
end_pos2 = SeqFeature.BeforePosition(8)
# Alternative fuzzy end:
# end_pos2 = SeqFeature.BetweenPosition(9, left=8, right=9)
my_location = SeqFeature.FeatureLocation(start_pos2, end_pos2)
print(start_pos2, end_pos2, my_location)
def construct_scaffold_genbank(protein_recs, protein_file, scaffold_id, outdir=outdir):
    """Write a GenBank file for one scaffold with its proteins as CDS features.

    The scaffold nucleotide record is looked up by id in the ``.fna`` file
    under the module-level ``contigs_dir``, trimmed to the span covered by
    ``protein_recs``, and each protein record is added as a ``CDS`` feature.
    Optional annotations are attached when the corresponding module-level
    settings are configured (``pfam_dir``, ``kofam_dir``, ``goososfile``,
    ``annotation_dict`` / ``unlabeled_annotation``).

    Args:
        protein_recs: Protein records belonging to the scaffold, in order.
            Each ``description`` must be ``' # '``-separated with start, end
            and strand in fields 1, 2 and 3.
        protein_file: Name of the ``.faa`` file the proteins came from; a
            ``genome|...`` form is also accepted.
        scaffold_id: Id of the scaffold record in the contigs FASTA file.
        outdir: Directory that receives ``<genome_id>_<scaffold_id>.gbk``.

    Returns:
        None.  The GenBank file is written as a side effect.

    Raises:
        IndexError: If ``scaffold_id`` is not found in the contigs file or
            ``protein_recs`` is empty.
    """
    genome_id = protein_file.split('.faa')[0]

    # Get the scaffold record.
    if '|' not in protein_file:
        contig_path = os.path.join(contigs_dir,
                                   protein_file.replace('.faa', '.fna'))
    else:
        genome_id = protein_file.split('|')[0]
        contig_path = os.path.join(contigs_dir, genome_id + '.fna')
    contigs = [rec for rec in SeqIO.parse(contig_path, 'fasta')
               if rec.id == scaffold_id]
    genbank_rec = contigs[0]

    # Pfamscan annotations (best effort: a missing or unreadable output file
    # simply disables Pfam labelling for this scaffold).
    pfam_df = None
    if pfam_dir is not None:
        try:
            pfam_file = [name for name in os.listdir(pfam_dir)
                         if name == protein_file + '.out'][0]
            pfam_df = parse_pfam_outfile(os.path.join(pfam_dir, pfam_file))
        except Exception:
            pfam_df = None

    # KOFAM annotations restricted to this scaffold.
    kofam_scafdf = None
    if kofam_dir is not None:
        try:
            kofam_file = [name for name in os.listdir(kofam_dir)
                          if name == protein_file + '.out.parsed.good'][0]
        except (IndexError, OSError):
            print(genome_id + ' doesnt have KOFAM')
            kofam_file = None
        if kofam_file is not None:
            kofam_df = pd.read_csv(os.path.join(kofam_dir, kofam_file),
                                   sep='\t', names=kofam_header)
            # ORF ids are "<scaffold>_<n>"; drop the ORF counter and any
            # "genome|" prefix to recover the scaffold id.
            kofam_df['scaffold_id'] = kofam_df.orf_id.apply(
                lambda x: '_'.join(x.split('_')[:-1]))
            kofam_df['scaffold_id'] = kofam_df.scaffold_id.apply(
                lambda x: x.split('|')[1] if '|' in x else x)
            kofam_scafdf = kofam_df[kofam_df.scaffold_id == scaffold_id]

    # Start the nucleotide sequence of the genbank file at the start position
    # of the first CDS.
    total_start = int(protein_recs[0].description.split(' # ')[1])
    total_end = int(protein_recs[-1].description.split(' # ')[2])
    genbank_rec.seq = genbank_rec.seq[total_start:total_end + 1]
    genbank_rec.seq.alphabet = generic_dna

    goosos_df = None
    if goososfile is not None:
        goosos_df = pd.read_csv(goososfile, sep='\t')

    for protein_rec in protein_recs:
        # Prepare location info (relative to the trimmed sequence) and
        # construct the SeqFeature object.
        fields = protein_rec.description.split(' # ')
        start = int(fields[1]) - total_start
        end = int(fields[2]) - total_start
        strand = int(fields[3])
        rec_location = FeatureLocation(SeqFeature.ExactPosition(start), end)
        rec_feature = SeqFeature.SeqFeature(rec_location, type="CDS",
                                            strand=strand)

        # ORF name without genome ID.
        if '|' in protein_rec.id:
            orf_name = protein_rec.id.split('|')[1]
        else:
            orf_name = protein_rec.id
        rec_feature.qualifiers['protein_id'] = orf_name
        rec_feature.qualifiers['translation'] = protein_rec.seq

        # Pfam domains.
        if pfam_df is not None:
            red_pfam_df = pfam_df[pfam_df.orf_id == orf_name]
            rec_feature.qualifiers['name'] = '+'.join(
                red_pfam_df.pfam_name.tolist())

        # KOFAM descriptions (skipped when no KOFAM table was found).
        if kofam_scafdf is not None:
            red_kofam_df = kofam_scafdf[kofam_scafdf.orf_id == protein_rec.id]
            kofam_annotations = '+'.join(red_kofam_df.description.tolist())
            if kofam_annotations != '':
                rec_feature.qualifiers['locus_tag'] = kofam_annotations

        # Goosos families: ``is not None`` because comparing a DataFrame with
        # ``!= None`` yields a DataFrame whose truth value is ambiguous; the
        # label is built from the rows of this ORF only.
        if goosos_df is not None:
            red_goosos_df = goosos_df[goosos_df.orf_id == protein_rec.id]
            rec_feature.qualifiers['label'] = '+'.join(
                red_goosos_df.family_hmm.tolist())

        # User-supplied annotations, keyed with or without the genome prefix.
        if annotation_dict is not None:
            if protein_rec.id in annotation_dict:
                rec_feature.qualifiers['id'] = annotation_dict[protein_rec.id]
            elif '|' in protein_rec.id and unlabeled_annotation:
                if orf_name in annotation_dict:
                    rec_feature.qualifiers['id'] = annotation_dict[orf_name]
            elif '|' not in protein_rec.id and not unlabeled_annotation:
                labeled_id = genome_id + '|' + protein_rec.id
                if labeled_id in annotation_dict:
                    rec_feature.qualifiers['id'] = annotation_dict[labeled_id]

        genbank_rec.features.append(rec_feature)

    SeqIO.write(genbank_rec,
                os.path.join(outdir, genome_id + '_' + scaffold_id + '.gbk'),
                'genbank')
    return
def parse(self):
    """Parse the input.

    Checks that ``self.entry`` is an ``entry`` element and defines the
    per-element helper parsers, which store their results on
    ``self.ParsedSeqRecord`` (name, description, dbxrefs, annotations).

    NOTE(review): the code that dispatches entry children to these helpers
    is not part of this portion of the file; as shown here the method only
    defines the helpers.
    """
    assert self.entry.tag == NS + 'entry'

    def append_to_annotations(key, value):
        """Append value to the list annotation ``key``, skipping duplicates."""
        annotations = self.ParsedSeqRecord.annotations
        if key not in annotations:
            annotations[key] = []
        if value not in annotations[key]:
            annotations[key].append(value)

    def _parse_name(element):
        self.ParsedSeqRecord.name = element.text
        self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)

    def _parse_accession(element):
        # Stored as a list to cope with the SwissProt plain text parser.
        append_to_annotations('accessions', element.text)
        self.ParsedSeqRecord.dbxrefs.append(self.dbname + ':' + element.text)

    def _parse_protein(element):
        """Parse protein names (PRIVATE)."""
        descr_set = False
        for protein_element in element:
            if protein_element.tag in [
                    NS + 'recommendedName', NS + 'alternativeName'
            ]:
                # recommendedName tags are parsed before alternativeName, so
                # the first fullName seen becomes the record description.
                for rec_name in protein_element:
                    ann_key = '%s_%s' % (protein_element.tag.replace(NS, ''),
                                         rec_name.tag.replace(NS, ''))
                    append_to_annotations(ann_key, rec_name.text)
                    if (rec_name.tag == NS + 'fullName') and not descr_set:
                        self.ParsedSeqRecord.description = rec_name.text
                        descr_set = True
            elif protein_element.tag == NS + 'component':
                pass  # not parsed
            elif protein_element.tag == NS + 'domain':
                pass  # not parsed

    def _parse_gene(element):
        for genename_element in element:
            if 'type' in genename_element.attrib:
                ann_key = 'gene_%s_%s' % (genename_element.tag.replace(NS, ''),
                                          genename_element.attrib['type'])
                if genename_element.attrib['type'] == 'primary':
                    # The primary gene name is stored as a plain string.
                    self.ParsedSeqRecord.annotations[
                        ann_key] = genename_element.text
                else:
                    append_to_annotations(ann_key, genename_element.text)

    def _parse_geneLocation(element):
        append_to_annotations('geneLocation', element.attrib['type'])

    def _parse_organism(element):
        organism_name = com_name = sci_name = ''
        for organism_element in element:
            if organism_element.tag == NS + 'name':
                if organism_element.text:
                    if organism_element.attrib['type'] == 'scientific':
                        sci_name = organism_element.text
                    elif organism_element.attrib['type'] == 'common':
                        com_name = organism_element.text
                    else:
                        # e.g. synonym
                        append_to_annotations("organism_name",
                                              organism_element.text)
            elif organism_element.tag == NS + 'dbReference':
                self.ParsedSeqRecord.dbxrefs.append(
                    organism_element.attrib['type'] + ':' +
                    organism_element.attrib['id'])
            elif organism_element.tag == NS + 'lineage':
                for taxon_element in organism_element:
                    if taxon_element.tag == NS + 'taxon':
                        append_to_annotations('taxonomy', taxon_element.text)
        if sci_name and com_name:
            organism_name = '%s (%s)' % (sci_name, com_name)
        elif sci_name:
            organism_name = sci_name
        elif com_name:
            organism_name = com_name
        self.ParsedSeqRecord.annotations['organism'] = organism_name

    def _parse_organismHost(element):
        for organism_element in element:
            if organism_element.tag == NS + 'name':
                append_to_annotations("organism_host", organism_element.text)

    def _parse_keyword(element):
        append_to_annotations('keywords', element.text)

    def _parse_comment(element):
        """Parse comments (PRIVATE).

        Comment fields are very heterogeneous; each type has its own
        (frequently changing) schema.  Storing all the contained data would
        need more complex data structures, such as nested dictionaries.
        This is left to the end user: with return_raw_comments=True the
        original XML is returned in the annotation fields.

        Available comment types as of December 2009:
            "allergen"
            "alternative products"
            "biotechnology"
            "biophysicochemical properties"
            "catalytic activity"
            "caution"
            "cofactor"
            "developmental stage"
            "disease"
            "domain"
            "disruption phenotype"
            "enzyme regulation"
            "function"
            "induction"
            "miscellaneous"
            "pathway"
            "pharmaceutical"
            "polymorphism"
            "PTM"
            "RNA editing"
            "similarity"
            "subcellular location"
            "sequence caution"
            "subunit"
            "tissue specificity"
            "toxic dose"
            "online information"
            "mass spectrometry"
            "interaction"
        """
        simple_comments = [
            "allergen",
            "biotechnology",
            "biophysicochemical properties",
            "catalytic activity",
            "caution",
            "cofactor",
            "developmental stage",
            "disease",
            "domain",
            "disruption phenotype",
            "enzyme regulation",
            "function",
            "induction",
            "miscellaneous",
            "pathway",
            "pharmaceutical",
            "polymorphism",
            "PTM",
            "RNA editing",  # positions not parsed
            "similarity",
            "subunit",
            "tissue specificity",
            "toxic dose",
        ]

        comment_type = element.attrib['type']
        if comment_type in simple_comments:
            ann_key = 'comment_%s' % comment_type.replace(' ', '')
            for text_element in element.iter(NS + 'text'):
                if text_element.text:
                    append_to_annotations(ann_key, text_element.text)
        elif comment_type == 'subcellular location':
            for subloc_element in element.iter(NS + 'subcellularLocation'):
                for el in subloc_element:
                    if el.text:
                        ann_key = 'comment_%s_%s' % (
                            comment_type.replace(' ', ''),
                            el.tag.replace(NS, ''))
                        append_to_annotations(ann_key, el.text)
        elif comment_type == 'interaction':
            for interact_element in element.iter(NS + 'interactant'):
                ann_key = 'comment_%s_intactId' % comment_type
                append_to_annotations(ann_key,
                                      interact_element.attrib['intactId'])
        elif comment_type == 'alternative products':
            for alt_element in element.iter(NS + 'isoform'):
                ann_key = 'comment_%s_isoform' % comment_type.replace(' ', '')
                for id_element in alt_element.iter(NS + 'id'):
                    append_to_annotations(ann_key, id_element.text)
        elif comment_type == 'mass spectrometry':
            ann_key = 'comment_%s' % comment_type.replace(' ', '')
            start = end = 0
            for loc_element in element.iter(NS + 'location'):
                pos_els = list(loc_element.iter(NS + 'position'))
                # Positions may be undefined or erroneously mapped; in that
                # case keep whatever was parsed so far instead of failing.
                try:
                    if pos_els:
                        end = int(pos_els[0].attrib['position'])
                        start = end - 1
                    else:
                        start = int(
                            list(loc_element.iter(NS + 'begin'))
                            [0].attrib['position']) - 1
                        end = int(
                            list(loc_element.iter(NS + 'end'))
                            [0].attrib['position'])
                except (IndexError, KeyError, ValueError):
                    pass
            mass = element.attrib['mass']
            # The measurement method has its own attribute; it used to be
            # read from 'mass' by mistake.
            method = element.attrib.get('method', '')
            if start == end == 0:
                append_to_annotations(ann_key,
                                      'undefined:%s|%s' % (mass, method))
            else:
                append_to_annotations(
                    ann_key, '%s..%s:%s|%s' % (start, end, mass, method))
        elif comment_type == 'sequence caution':
            pass  # not parsed: few information, complex structure
        elif comment_type == 'online information':
            for link_element in element.iter(NS + 'link'):
                ann_key = 'comment_%s' % comment_type.replace(' ', '')
                for id_element in link_element.iter(NS + 'link'):
                    append_to_annotations(
                        ann_key, '%s@%s' % (element.attrib['name'],
                                            link_element.attrib['uri']))

        # Return raw XML comments if needed.
        if self.return_raw_comments:
            ann_key = 'comment_%s_xml' % comment_type.replace(' ', '')
            append_to_annotations(ann_key, ElementTree.tostring(element))

    def _parse_dbReference(element):
        self.ParsedSeqRecord.dbxrefs.append(element.attrib['type'] + ':' +
                                            element.attrib['id'])
        # e.g.
        # <dbReference type="PDB" key="11" id="2GEZ">
        #   <property value="X-ray" type="method"/>
        #   <property value="2.60 A" type="resolution"/>
        #   <property value="A/C/E/G=1-192, B/D/F/H=193-325" type="chains"/>
        # </dbReference>
        if 'type' in element.attrib:
            if element.attrib['type'] == 'PDB':
                method = ""
                resolution = ""
                for ref_element in element:
                    if ref_element.tag == NS + 'property':
                        dat_type = ref_element.attrib['type']
                        if dat_type == 'method':
                            method = ref_element.attrib['value']
                        if dat_type == 'resolution':
                            resolution = ref_element.attrib['value']
                        if dat_type == 'chains':
                            pairs = ref_element.attrib['value'].split(',')
                            for elem in pairs:
                                pair = elem.strip().split('=')
                                if pair[1] != '-':
                                    # TODO - How best to store these, do
                                    # SeqFeatures make sense?  The feature is
                                    # built but deliberately not appended.
                                    feature = SeqFeature.SeqFeature()
                                    feature.type = element.attrib['type']
                                    feature.qualifiers[
                                        'name'] = element.attrib['id']
                                    feature.qualifiers['method'] = method
                                    feature.qualifiers[
                                        'resolution'] = resolution
                                    feature.qualifiers['chains'] = pair[
                                        0].split('/')
                                    start = int(pair[1].split('-')[0]) - 1
                                    end = int(pair[1].split('-')[1])
                                    feature.location = SeqFeature.FeatureLocation(
                                        start, end)
                                    #self.ParsedSeqRecord.features.append(feature)

        for ref_element in element:
            if ref_element.tag == NS + 'property':
                # This data cannot be fitted in a SeqRecord object with a
                # simple list.  However, at least Ensembl and EMBL parsing
                # can be improved to add entries in dbxrefs.
                pass

    def _parse_reference(element):
        reference = SeqFeature.Reference()
        authors = []
        scopes = []
        tissues = []
        journal_name = ''
        pub_type = ''
        pub_date = ''
        j_volume = j_first = j_last = ''
        for ref_element in element:
            if ref_element.tag == NS + 'citation':
                pub_type = ref_element.attrib['type']
                if pub_type == 'submission':
                    pub_type += ' to the ' + ref_element.attrib['db']
                if 'name' in ref_element.attrib:
                    journal_name = ref_element.attrib['name']
                pub_date = ref_element.attrib.get('date', '')
                j_volume = ref_element.attrib.get('volume', '')
                j_first = ref_element.attrib.get('first', '')
                j_last = ref_element.attrib.get('last', '')
                for cit_element in ref_element:
                    if cit_element.tag == NS + 'title':
                        reference.title = cit_element.text
                    elif cit_element.tag == NS + 'authorList':
                        for person_element in cit_element:
                            authors.append(person_element.attrib['name'])
                    elif cit_element.tag == NS + 'dbReference':
                        self.ParsedSeqRecord.dbxrefs.append(
                            cit_element.attrib['type'] + ':' +
                            cit_element.attrib['id'])
                        # The database name lives on the dbReference element
                        # itself, not on the enclosing citation.
                        if cit_element.attrib['type'] == 'PubMed':
                            reference.pubmed_id = cit_element.attrib['id']
                        elif cit_element.attrib['type'] == 'MEDLINE':
                            reference.medline_id = cit_element.attrib['id']
            elif ref_element.tag == NS + 'scope':
                scopes.append(ref_element.text)
            elif ref_element.tag == NS + 'source':
                for source_element in ref_element:
                    if source_element.tag == NS + 'tissue':
                        tissues.append(source_element.text)
        if scopes:
            scopes_str = 'Scope: ' + ', '.join(scopes)
        else:
            scopes_str = ''
        if tissues:
            tissues_str = 'Tissue: ' + ', '.join(tissues)
        else:
            tissues_str = ''

        # Locations cannot be parsed since they are actually written in free
        # text inside scopes, so all the references are put in the annotation.
        reference.location = []
        reference.authors = ', '.join(authors)
        if journal_name:
            if pub_date and j_volume and j_first and j_last:
                reference.journal = REFERENCE_JOURNAL % dict(
                    name=journal_name,
                    volume=j_volume,
                    first=j_first,
                    last=j_last,
                    pub_date=pub_date)
            else:
                reference.journal = journal_name
        reference.comment = ' | '.join(
            (pub_type, pub_date, scopes_str, tissues_str))
        append_to_annotations('references', reference)

    def _parse_position(element, offset=0):
        """Return the SeqFeature position object described by ``element``.

        ``offset`` is added to the numeric ``position`` attribute; the
        optional ``status`` attribute selects the kind of position object.
        """
        try:
            position = int(element.attrib['position']) + offset
        except KeyError:
            position = None
        status = element.attrib.get('status', '')
        if status == 'unknown':
            assert position is None
            return SeqFeature.UnknownPosition()
        elif not status:
            return SeqFeature.ExactPosition(position)
        elif status == 'greater than':
            return SeqFeature.AfterPosition(position)
        elif status == 'less than':
            return SeqFeature.BeforePosition(position)
        elif status == 'uncertain':
            return SeqFeature.UncertainPosition(position)
        else:
            raise NotImplementedError("Position status %r" % status)
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr,
                        cutoff_spacing, referenceGenomeForDAS, spacerLength,
                        distanceToCutSiteFromPAM_bp):
    """Find sgRNA spacers next to PAM matches within a genomic interval.

    The interval sequence is fetched with ``coords2fa`` and searched for the
    PAM ``seqStr`` on the plus strand and, unless the PAM equals its own
    reverse complement, for its reverse complement (minus strand).  A spacer
    is kept only if its start lies more than ``cutoff_spacing`` bases after
    the previously kept spacer on the same strand.  The module-level
    ``PAMside`` selects the geometry: 3 means the PAM follows the spacer
    (Cas9-like), anything else means the PAM precedes it (Cpf1-like).

    Args:
        chromPos: Chromosome identifier, passed to ``coords2fa`` and copied
            into every result row.
        chromStartRG: Interval start in the reference genome; added to the
            in-sequence coordinates to obtain genome coordinates.
        chromEndRG: Interval end in the reference genome.
        seqStr: PAM sequence (IUPAC ambiguity codes allowed).
        cutoff_spacing: Minimum spacing between consecutive kept spacers.
        referenceGenomeForDAS: Reference genome name passed to ``coords2fa``.
        spacerLength: Spacer length in bases.
        distanceToCutSiteFromPAM_bp: Distance from the PAM to the cut site.

    Returns:
        Tuple ``(listSpacer, spacerNumTotal, meanDist)``.  Each row of
        ``listSpacer`` is ``[spacerNum, strand, startLocInRefSeq,
        endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome,
        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM,
        GCcontent]``.  ``meanDist`` is the mean of the recorded distances
        (``nan`` when no spacer was kept).
    """
    from Bio import SeqFeature

    if PAMside == 3:
        # For SpCas9 (as an example): spacerLength = 20bp;
        # distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
    else:
        # For AsCpf1 (as an example): spacerLength = 20bp;
        # distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp - 1

    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)
    s = s.upper()
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr)

    # A PAM equal to its own reverse complement would be found twice, so the
    # minus-strand search is skipped for it.
    DoRevComp = 0 if seqStr == str(PAM.reverse_complement()) else 1

    listSpacer = []
    listDistBetweenSpacers = []

    # ---- Plus strand -----------------------------------------------------
    spacerNum = 0
    prevStartLocInRefSeq = -9999
    if PAMside == 3:
        gbStringForSearch = s[spacerLength:]  # Cas9
    else:
        # Cpf1, get all but last ~20 bases of sequence
        gbStringForSearch = s[:-spacerLength]

    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1:  # matches found
        del spacerInds[0]  # first result from nt_search is regexp expansion
        print("Plus strand sgRNAs found: {num}".format(num=len(spacerInds)))
        for item in spacerInds:
            # start and end pos of PAM
            startPos = SeqFeature.ExactPosition(item)
            endPos = SeqFeature.ExactPosition(item + PAM_length)
            if PAMside == 3:  # Cas9-like
                startLocInRefSeq = startPos + 1
                endLocInRefSeq = startLocInRefSeq + spacerLength - 1
            else:  # Cpf1-like
                startLocInRefSeq = endPos  # Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq + spacerLength

            startLocInRefGenome = chromStartRG + startLocInRefSeq
            endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
            cutSiteInRefGenome = startLocInRefGenome + distanceToCutSiteFrom5pEnd

            # Only add the spacer if it is a certain distance from the
            # previous spacer
            if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand = "+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0

                # Python slices: second index is first char you *DON'T* want
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq - 1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq + PAM_length]
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    exactPAM = s[startLocInRefSeq - PAM_length:startLocInRefSeq]

                GCcontent = SeqUtils.GC(spacerAsStr)
                listSpacer.append([
                    spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                    chromPos, startLocInRefGenome, endLocInRefGenome,
                    cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                    exactPAM, GCcontent
                ])
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                prevStartLocInRefSeq = startLocInRefSeq
        print("Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
            limit=cutoff_spacing, num=spacerNum))

    spacerNumTotal = spacerNum

    # ---- Minus strand: search the reverse complement of the PAM -----------
    prevStartLocInRefSeq = -9999
    spacerNum = 0
    if DoRevComp:
        if PAMside == 3:
            # get all but last ~20 bases of sequence
            gbStringForSearch = s[:-spacerLength]
        else:
            gbStringForSearch = s[spacerLength:]

        spacerInds = SeqUtils.nt_search(gbStringForSearch,
                                        str(PAM.reverse_complement()))
        if len(spacerInds) > 1:  # matches found
            del spacerInds[0]  # first result from nt_search is regexp expansion
            print("Minus strand sgRNAs found: {num}".format(
                num=len(spacerInds)))
            for item in spacerInds:
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item + PAM_length)

                # Start and end locations are flipped here due to reverse
                # strand
                if PAMside == 3:
                    endLocInRefSeq = endPos + 1  # flipped for reverse strand
                    startLocInRefSeq = endLocInRefSeq + spacerLength - 1  # flipped for reverse strand
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing
                    # strand; endLocInRefSeq is 3' end of spacer on
                    # PAM-containing strand.  Hence endLocInRefSeq <
                    # startLocInRefSeq since this is reverse strand.
                    # spacerLength is the offset between gbStringForSearch
                    # and RefSeq.
                    startLocInRefSeq = startPos + spacerLength
                    endLocInRefSeq = startLocInRefSeq - spacerLength + 1

                startLocInRefGenome = chromStartRG + startLocInRefSeq - 1
                endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
                cutSiteInRefGenome = startLocInRefGenome - distanceToCutSiteFrom5pEnd

                # Only add the spacer if it is a certain distance from the
                # previous spacer
                if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand = "-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0

                    # The spacer is read off the plus strand and reverse
                    # complemented (same slice for both PAM geometries).
                    spacerRC = Seq(
                        str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                        IUPAC.ambiguous_dna)
                    spacerAsStr = str(spacerRC.reverse_complement())
                    if PAMside == 3:  # Cas9-like
                        exactPAM = str(
                            Seq(
                                str(s[endLocInRefSeq - (PAM_length + 1):
                                      endLocInRefSeq - 1]),
                                IUPAC.ambiguous_dna).reverse_complement())
                    else:  # Cpf1-like
                        exactPAM = str(
                            Seq(
                                str(s[startLocInRefSeq:
                                      startLocInRefSeq + PAM_length]),
                                IUPAC.ambiguous_dna).reverse_complement())

                    GCcontent = SeqUtils.GC(spacerAsStr)
                    listSpacer.append([
                        spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                        chromPos, startLocInRefGenome, endLocInRefGenome,
                        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                        exactPAM, GCcontent
                    ])
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    prevStartLocInRefSeq = startLocInRefSeq
            print("Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
                limit=cutoff_spacing, num=spacerNum))

    spacerNumTotal = spacerNumTotal + spacerNum

    # np.mean of an empty array is nan (with a RuntimeWarning); that is what
    # callers receive when no spacer was kept.
    arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
    meanDist = np.mean(arrDistBetweenSpacers)

    return (listSpacer, spacerNumTotal, meanDist)