def run_pam_finder(target_fa, seq, PAM, abs_start_pos, chr):
    """Tabulate every occurrence of ``seq + PAM`` on both strands of ``target_fa``.

    The forward strand is ``target_fa`` itself; the reverse strand is
    ``revcomp(target_fa)``.  Each hit becomes one row:
    ``[chr, start, end, protospacer, protospacer + PAM, strand]`` where
    ``start``/``end`` are offset by ``abs_start_pos``.  Reverse-strand hits are
    reported with their coordinates mapped back onto ``target_fa``.

    Returns:
        A ``pandas.DataFrame`` built from the rows (empty if nothing matched).
    """
    query = seq + PAM
    seq_len = len(seq)
    site_len = seq_len + len(PAM)
    target_len = len(target_fa)
    rev_seq = revcomp(target_fa)

    # nt_search returns [pattern, pos, pos, ...]; slicing off the first item
    # leaves only the hit positions (possibly none).
    fwd_hits = SeqUtils.nt_search(target_fa, query)[1:]
    rev_hits = SeqUtils.nt_search(rev_seq, query)[1:]

    rows = [
        [
            chr,
            pos + abs_start_pos,
            pos + abs_start_pos + seq_len,
            target_fa[pos:(pos + seq_len)],
            target_fa[pos:(pos + site_len)],
            "+",
        ]
        for pos in fwd_hits
    ]
    for pos in rev_hits:
        # A hit at ``pos`` in the reverse complement ends at
        # ``target_len - pos`` on the forward strand.
        fwd_end = (target_len - pos) + abs_start_pos
        rows.append([
            chr,
            fwd_end - seq_len,
            fwd_end,
            rev_seq[pos:(pos + seq_len)],
            rev_seq[pos:(pos + site_len)],
            "-",
        ])
    return pd.DataFrame(rows)
def count_amplicons(in_name, fprimer, rc):
    """Print primer-position statistics for the reads of a FASTQ file.

    For every read in ``in_name`` the position of the first ``fprimer`` match
    is tallied.  When ``rc`` is truthy, the reverse complement of ``fprimer``
    is searched as well: the number of bases following its last match is
    tallied, and reads containing both are counted.  Results are printed;
    nothing is returned.
    """
    forward = str(Seq(fprimer, IUPAC.ambiguous_dna))
    pre_length = Counter()
    if rc:
        post_length = Counter()
        bothfound = 0
        rc_primer = Seq(fprimer, IUPAC.ambiguous_dna).reverse_complement()
        rc_len = len(rc_primer)
        reverse = str(rc_primer)

    with open(in_name, 'r') as fastq:
        for read in SeqIO.parse(fastq, "fastq"):
            # nt_search returns the pattern followed by the match positions.
            fwd_hits = SeqUtils.nt_search(str(read.seq), forward)
            fwd_found = len(fwd_hits) > 1
            if fwd_found:
                pre_length[fwd_hits[1]] += 1
            if not rc:
                continue
            rev_hits = SeqUtils.nt_search(str(read.seq), reverse)
            if len(rev_hits) > 1:
                # Bases remaining after the last reverse-complement match.
                post_length[len(read) - rev_hits[-1] - rc_len] += 1
                if fwd_found:
                    bothfound += 1

    print("Primers found:", sum(pre_length.values()))
    print("Counts of pre_length:", pre_length)
    if rc:
        print("Reverse primers found:", sum(post_length.values()))
        print("Counts of post_length", post_length)
        print("Both primer and reverse_complement found:", bothfound)
def filtByPrimer(self, fwd_primer, rvs_primer):
    """Keep read pairs whose forward and reverse reads both start with their primer.

    Reads ``self.input_forward`` and ``self.input_reverse`` in lockstep.  A
    pair is appended to ``self.output_forward`` / ``self.output_reverse`` when
    the first ``nt_search`` hit of ``fwd_primer`` in the forward read and of
    ``rvs_primer`` in the reverse read is at position 0; otherwise the pair is
    discarded.  The kept/discarded counts are printed.

    The output files are opened once, in append mode, for the whole run
    (rather than once per kept pair), so they are created even when no pair
    is kept.
    """

    def starts_with_primer(read_seq, primer):
        # nt_search returns [pattern, pos, ...]; positions are ascending, so
        # the read starts with the primer iff the first hit is at index 0.
        hits = SeqUtils.nt_search(read_seq, primer)
        return len(hits) > 1 and hits[1] == 0

    count_keep = 0
    count_discard = 0
    with open(self.input_forward) as fh, \
            open(self.input_reverse) as rh, \
            open(self.output_forward, 'a') as ofh, \
            open(self.output_reverse, 'a') as orh:
        pairs = zip(FastqGeneralIterator(fh), FastqGeneralIterator(rh))
        for (title_f, seq_f, qual_f), (title_r, seq_r, qual_r) in pairs:
            if (starts_with_primer(seq_f, fwd_primer)
                    and starts_with_primer(seq_r, rvs_primer)):
                ofh.write(
                    '@' + '\n'.join([title_f, seq_f, '+', qual_f]) + '\n')
                orh.write(
                    '@' + '\n'.join([title_r, seq_r, '+', qual_r]) + '\n')
                count_keep += 1
            else:
                count_discard += 1
    print(' Number of reads saved: ' + str(count_keep))
    print(' Number of reads discard :' + str(count_discard))
def search_motif(sequence):
    """Find protospacer coordinates next to every PAM hit on both strands.

    The PAM (``args.pam``) is searched in ``sequence.seq`` and in its reverse
    complement.  For each hit, the ``args.length_protospacer`` bases
    immediately before the PAM form the protospacer.  Coordinates are 0-based,
    end-exclusive ``[start, end]`` pairs on the forward strand.

    Protospacers that would extend past either end of the sequence are
    dropped; one that begins exactly at the first base, or ends exactly at
    the last base, fits and is kept.

    Args:
        sequence: object exposing ``.seq`` with ``reverse_complement()``.

    Returns:
        ``(coordinates_fw, coordinates_rv)``: two lists of ``[start, end]``.
    """
    motif = str(args.pam)
    len_protospacer = int(args.length_protospacer)
    len_dna = len(sequence.seq)

    # nt_search returns [pattern, pos, pos, ...] with 0-based hit positions;
    # on the forward strand the PAM start is the protospacer end.
    coordinates_fw = []
    for pam_start in SeqUtils.nt_search(str(sequence.seq), motif)[1:]:
        start = pam_start - len_protospacer
        # start == 0 is a protospacer beginning at the very first base.
        if start >= 0:
            coordinates_fw.append([start, pam_start])

    # Hits on the reverse complement are mirrored back to forward coordinates.
    reverse_seq = str(sequence.seq.reverse_complement())
    coordinates_rv = []
    for pam_start in SeqUtils.nt_search(reverse_seq, motif)[1:]:
        protospacer_start_rv = pam_start - len_protospacer
        start = len_dna - pam_start
        end = len_dna - protospacer_start_rv
        # end == len_dna is a protospacer ending at the very last base.
        if start > 0 and end <= len_dna:
            coordinates_rv.append([start, end])

    # Return a tuple of lists for fw and rv matches
    return coordinates_fw, coordinates_rv
def writePBS():
    """Annotate ``newRecord`` with primer-binding-site features.

    For every entry of ``featureStatistic_container[feature]`` the last 15
    bases of its sequence are searched in ``record.seq`` and in its reverse
    complement.  Each hit appends a ``SeqFeature`` (strand 1 for the forward
    search, -1 for the reverse-complement search) to ``newRecord.features``,
    carrying the entry's ``note`` as qualifier.  When the full primer also
    matches around the hit the feature gets an exact end, otherwise the end
    is an ``AfterPosition``.

    Reads the module-level names ``featureStatistic_container``, ``feature``,
    ``record`` and ``newRecord``; rebinds the globals declared below.
    """
    global variation, seqRecordToCheck, seqRecordToCheckComplement, difference, newFeature
    for variation in featureStatistic_container[feature]:
        primerSeq = str(variation.seq)
        primerName = variation.note
        # Only the 3' tail (last 15 characters) is used for the initial search.
        partialPrimerSeq = primerSeq[len(primerSeq) - 15::]
        seqRecordToCheck = str(record.seq)
        seqRecordToCheckComplement = str(reverse_complement(record.seq))

        # --- forward strand ---
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheck, partialPrimerSeq)
        # Element 0 of the nt_search result is the pattern; hits start at 1.
        if (len(matchingPrimerPositions) > 1):
            # Number of primer bases that precede the searched tail.
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                # Full primer matches when the bases before the tail agree too.
                # NOTE(review): the location below starts at the tail's hit
                # position, not at ``hit - difference`` — confirm intended.
                # NOTE(review): ``hit - difference`` can be negative for hits
                # near the start, which makes the slice wrap — confirm.
                if primerSeq == seqRecordToCheck[matchingPrimerPositions[j] - difference:
                                                 matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=1),
                                            type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    # Partial match only: mark the end position as fuzzy.
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            AfterPosition(
                                                                matchingPrimerPositions[j] + len(primerSeq)),
                                                            strand=1),
                                            type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)

        # --- reverse strand (searched on the reverse-complemented string) ---
        # NOTE(review): hit positions are indices into the reverse-complemented
        # string and are used as-is in the location — confirm whether they
        # should be converted back to forward coordinates.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheckComplement, partialPrimerSeq)
        if (len(matchingPrimerPositions) > 1):
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                if primerSeq == seqRecordToCheckComplement[matchingPrimerPositions[j] - difference:
                                                           matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=-1),
                                            type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            AfterPosition(
                                                                matchingPrimerPositions[j] + len(primerSeq)),
                                                            strand=-1),
                                            type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
def _find_iseq(self, seq: Seq, iseq_str: str, iseq_id: str = "integrated sequence") -> int:
    """Locate the single occurrence of ``iseq_str`` inside ``seq``.

    Args:
        seq: Sequence to search.
        iseq_str: Subsequence to look for.
        iseq_id (optional): Human-readable name of the subsequence, used in
            error messages. Defaults to "integrated sequence".

    Returns:
        int: Index of ``iseq_str`` within ``seq``.

    Raises:
        PartException: If ``iseq_str`` is absent from ``seq`` or occurs more
            than once.
    """
    # The first item of the nt_search result is the pattern; the rest are hits.
    hits = SeqUtils.nt_search(str(seq), iseq_str)[1:]
    if not hits:
        raise PartException(f"{self.id} lacks {iseq_id}")
    if len(hits) > 1:
        raise PartException(f"{self.id} contains multiple {iseq_id}")
    return hits[0]
def main():
    """Print, for every gene, the motifs found in its 3'UTR."""
    # Genome sequence and annotations
    genome = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/fasta/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like_Genome.fasta')
    annotations = load_file('http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/gff/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like.gff')

    # 3'UTR motifs from supplementary table 2 in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)

    # 3'UTR coordinates derived from the annotations
    genes = get_utr_coords(annotations, utr_length=500)

    for gene in genes:
        utr_seq = get_3utr_seq(chromosomes, gene)
        # nt_search returns [pattern, hit, ...]; more than one item means the
        # motif occurs at least once in this UTR.
        utr3_motifs = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)) > 1
        ]
        print("%s: %s" % (gene['id'], ", ".join(utr3_motifs)))
def parseSeqRecordForOligo(record,oligo):
    '''Return True when oligo occurs in the SeqRecord's sequence, else False.'''
    hits = SeqUtils.nt_search(str(record.seq), oligo)
    # The first item is the search pattern; any further item is a match position.
    return len(hits) > 1
def find_PAM(seq, PAM):
    """Return the index of ``PAM`` within ``seq``.

    An exact substring match is tried first.  If there is none, the first and
    then the last ``len(PAM)`` characters of ``seq`` are compared against
    ``PAM`` with ``SeqUtils.nt_search``.  When nothing matches, a message is
    printed and the fallback index 20 is returned.

    Args:
        seq: sequence to search.
        PAM: PAM sequence to look for.

    Returns:
        int: index of the PAM in ``seq`` (20 when not found).
    """
    try:
        return seq.index(PAM)
    except ValueError:
        # No exact occurrence; fall through to the nt_search comparison.
        # Only ValueError is caught so unrelated errors are not hidden.
        pass

    # PAM on the left
    left_search = SeqUtils.nt_search(seq[:len(PAM)], PAM)
    if len(left_search) > 1:
        return left_search[1]

    # PAM on the right
    right_search = SeqUtils.nt_search(seq[-len(PAM):], PAM)
    if len(right_search) > 1:
        return len(seq) - len(PAM)

    print("PAM: %s not found in %s. Set PAM index to 20" % (PAM, seq))
    return 20
def is_dPAM(PAM_seq, RTT, cut_offset=-3):
    """Return 0 when ``PAM_seq`` matches ``revcomp(RTT)`` at ``abs(cut_offset)``, else 1.

    The RTT is always reverse-complemented before searching (the original
    author notes RTT is always of the form xxxxxxxPAM and is assumed to
    contain no N).
    """
    hits = SeqUtils.nt_search(revcomp(RTT), PAM_seq)
    # ``hits`` is [pattern, pos, ...]; the membership test runs over the whole
    # list, exactly as before.
    if len(hits) > 1 and abs(cut_offset) in hits:
        return 0
    return 1
def main():
    """Write a CSV listing, per gene, the motifs present in its 3'UTR."""
    # Parse command-line arguments
    args = parse_args()

    # Genome sequence and annotations
    genome = load_file(args.input_genome)
    annotations = load_file(args.input_annotations)

    # 3'UTR motifs from supplementary table 2 in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)

    # 3'UTR coordinates derived from the annotations
    genes = get_utr_coords(annotations, utr_length=args.utr_length)

    # One output row per gene: [gene id, motif, motif, ...]
    output = []
    num_genes = len(genes)
    for gene_number, gene in enumerate(genes, start=1):
        utr_seq = get_3utr_seq(chromosomes, gene)
        print('Processing gene %d/%d' % (gene_number, num_genes))

        # nt_search returns [pattern, hit, ...]; more than one item means the
        # motif occurs at least once in this UTR.
        utr3_motifs = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)) > 1
        ]
        output.append([gene['id']] + utr3_motifs)

    with open(args.output, 'w') as output_file:
        csv.writer(output_file).writerows(output)
def annotate_primer(primer_name, primer_seq, primer_direction, genome):
    """Append a ``misc_feature`` marking where a primer binds on ``genome``.

    Args:
        primer_name: name appended to ``PRIMER_ANNOTATION_PREFIX`` to form
            the feature label.
        primer_seq: primer sequence; a ``SeqRecord`` (or subclass) is
            unwrapped to its ``.seq``.
        primer_direction: strand of the feature; when ``-1`` the primer is
            reverse-complemented before searching.
        genome: record whose ``.seq`` is searched and whose ``.features``
            list receives the new feature.

    Raises:
        IndexError: if the primer does not occur in ``genome.seq``.
    """
    if isinstance(primer_seq, SeqRecord):
        primer_seq = primer_seq.seq
    if primer_direction == -1:
        primer_seq = primer_seq.reverse_complement()

    primer_label = PRIMER_ANNOTATION_PREFIX + primer_name

    # nt_search returns [pattern, pos, ...]; only the first hit is annotated.
    hits = SeqUtils.nt_search(str(genome.seq), str(primer_seq))
    if len(hits) < 2:
        # Same exception type as the bare ``[1]`` lookup raised before, but
        # with a message that names the primer.
        raise IndexError(
            "primer {!r} not found in genome sequence".format(primer_name))
    primer_genome_loc_start = hits[1]

    primer_genome_loc = FeatureLocation(
        primer_genome_loc_start, primer_genome_loc_start + len(primer_seq))
    primer_feature = SeqFeature(
        location=primer_genome_loc,
        type='misc_feature',
        strand=primer_direction,
        qualifiers={'label': [primer_label]})
    genome.features.append(primer_feature)
def digest(enzyme, sequence, outfile, count):
    """Write the cut positions of one enzyme on ``sequence`` to ``outfile``.

    Args:
        enzyme: indexable as ``(name, recognition site, cut offset)``.
        sequence: record with ``.id`` and ``.seq``; searched upper-cased.
        outfile: writable handle receiving tab-separated lines.
        count: running cut number used in the ``cut-<n>`` name column.

    Returns:
        ``count`` incremented once per recognition-site match.

    Each match produces two lines, one for the query strand (``+``) and one
    for the opposite strand (``-``), each with identical start and end.
    """
    # search input sequence using enzyme sequence and return results to 'matches'
    matches = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])
    # first item of 'matches' is the match pattern, the rest are positions
    for match in matches[1:]:
        # str() replaces the Python 2-only backtick repr syntax, which is a
        # syntax error on Python 3; for ints the text produced is the same.
        sense_pos = str(int(match) + int(enzyme[2]))
        antisense_pos = str(int(match) + int(len(enzyme[1]) - int(enzyme[2])))
        cut_name = "\tcut-" + str(count)
        # line for the match on the query strand
        line1 = sequence.id + "\t" + sense_pos + "\t" + sense_pos + "\t" + enzyme[0] + cut_name + "\t+\n"
        # line for the reverse complement
        line2 = sequence.id + "\t" + antisense_pos + "\t" + antisense_pos + "\t" + enzyme[0] + cut_name + "\t-\n"
        # if cut site is past halfway point in enzyme, output the antisense
        # cut first to keep output BED sorted
        if len(enzyme[1])/2 < int(enzyme[2]):
            outfile.write(line2+line1)
        else:
            outfile.write(line1+line2)
        count += 1
    return count
def digest(enzyme, sequence, outfile, count):
    """Emit one ``+`` and one ``-`` cut line per recognition-site match.

    ``enzyme`` is indexed as ``(name, recognition site, cut offset)``.  The
    upper-cased ``sequence.seq`` is searched for the site; for each match two
    tab-separated lines (start == end) are written to ``outfile``, ordered so
    that the smaller coordinate comes first.  Returns ``count`` advanced by
    the number of matches.
    """

    def bed_line(position, strand):
        # Columns: id, start, end, enzyme name, cut-<count>, strand.
        return "\t".join([
            sequence.id, str(position), str(position),
            enzyme[0], "cut-" + str(count), strand,
        ]) + "\n"

    hits = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])
    # hits[0] is the search pattern; the remaining items are match positions.
    for hit in hits[1:]:
        sense = bed_line(int(hit) + int(enzyme[2]), "+")
        antisense = bed_line(
            int(hit) + int(len(enzyme[1]) - int(enzyme[2])), "-")
        # A cut past the midpoint of the site puts the antisense coordinate
        # first, which keeps the BED output sorted.
        antisense_first = len(enzyme[1])/2 < int(enzyme[2])
        outfile.write(antisense + sense if antisense_first else sense + antisense)
        count += 1
    return count
def split_relabel(infile, gene, m, sample_IDs, output_handle):
    """Trim primers from reads that start with the forward primer and relabel them.

    Reads ``infile`` (FASTQ), keeps records whose first forward-primer match
    is at position 0, removes the forward primer (and, for merged reads, the
    trailing ``len(reverse primer)`` bases), appends gene and sample ID to the
    record id and writes the record to ``output_handle``.  A summary line is
    appended to ``Primer_split_log.txt``.

    Args:
        infile: path of the FASTQ file; its name is parsed for the label.
        gene: key into the module-level ``primerlist`` mapping.
        m: ``"merged"`` or ``"unmerged"``.
        sample_IDs: mapping from well name to sample ID.
        output_handle: open handle receiving the trimmed records (left open).

    Raises:
        ValueError: if ``m`` is neither ``"merged"`` nor ``"unmerged"``.
    """
    # Both files are managed by ``with`` so they are closed even on error.
    with open("Primer_split_log.txt", "a") as log:
        log.write("Reads file\tPrimer\tRead count\n")

        if m == "merged":
            label = re.split('Wx80_|_pear', infile)[1]
        elif m == "unmerged":
            label = re.split('Wx80_|_L001', infile)[1]
        else:
            # Previously this surfaced later as an unbound-local error.
            raise ValueError(
                "m must be 'merged' or 'unmerged', got {0!r}".format(m))

        well = re.split('_S', label)[0]  # NZGL well number
        sample_ID = sample_IDs.get(well)  # Corresponding sample ID
        count = 0
        primer_f = primerlist.get(gene)[0]
        primer_r = primerlist.get(gene)[1]

        with open(infile) as reads_handle:
            for record in SeqIO.parse(reads_handle, "fastq"):
                primer_search = SeqUtils.nt_search(str(record.seq), primer_f)
                # Primer found (len > 1) at the start of the read ([1] == 0)
                if len(primer_search) > 1 and primer_search[1] == 0:
                    if m == "merged":
                        trimmed = record[len(primer_f):-len(primer_r)]
                    else:
                        trimmed = record[len(primer_f):]
                    # Adds gene and sample ID
                    trimmed.id = ("{0}|gene_{1}|sample_{2}").format(
                        trimmed.id, gene, sample_ID)
                    print(trimmed.id)
                    print(len(trimmed.seq))
                    SeqIO.write(trimmed, output_handle, "fastq")
                    count += 1

        print("{0} {1} {2} {3}".format(gene, label, sample_ID, count))
        print("{0} {1}: Saved {2} reads".format(label, gene, count))
        # Terminate the entry so the next call's header starts on its own line.
        log.write("{0} {1} {2}: {3}\n".format(gene, label, sample_ID, count))
help='Motif fasta file') parser.add_argument('--search', '-s', required=True, help='Sequence fasta file to search.') parser.add_argument('--outfile', '-o', type=argparse.FileType('w', encoding='UTF-8'), required=True, help='Outfile (will be in bed format).') args = parser.parse_args() d = [] for seq_motif in SeqIO.parse(args.motif, "fasta"): for seq in SeqIO.parse(args.search, "fasta"): results = SeqUtils.nt_search(str(seq.seq), seq_motif.seq) results_rc = SeqUtils.nt_search(str(seq.seq), seq_motif.seq.reverse_complement()) for i in results[1:]: d.append({ 'search': seq.id, 'motif': results[0], 'first': i + 1, 'last': (i) + len(results[0]) }) for i in results_rc[1:]: d.append({ 'search': seq.id, 'motif': results_rc[0], 'first': i + 1, 'last': (i) + len(results_rc[0])
def map_locator_Spark(x, subsequence):
    """Return True when ``subsequence`` occurs in ``x[1]``, else False."""
    hits = SeqUtils.nt_search(x[1], subsequence)
    # hits[0] is the search pattern; anything after it is a match position.
    return bool(hits[1:])
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs
from Bio import SeqUtils

# Scan every lncRNA transcript for the degenerate consensus of two motifs
# and write, per transcript, a FASTA-style header line followed by the raw
# nt_search result (search pattern, then 0-based match positions).

# --- p53 (MA0106.1) ---
with open("sites/MA0106.1.sites") as handle:
    p53 = motifs.read(handle, "sites")
motif = p53.degenerate_consensus
with open("output/motif_result_p53.txt", "w") as f:
    for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa', 'fasta'):
        f.write(">" + str(seq_record.id) + "\n")
        # Search the nucleotide sequence itself: str(seq_record) is the
        # record's printable summary, not its sequence.  The pattern is
        # ``motif`` (the name ``m`` used here before was never defined).
        result = SeqUtils.nt_search(str(seq_record.seq), motif)
        f.write(str(result) + "\n")

# --- AGL3 (MA0001.1) ---
with open("sites/MA0001.1.sites") as handle:
    AGL3 = motifs.read(handle, "sites")
motif = AGL3.degenerate_consensus
with open("output/motif_result_AGL3.txt", "w") as f:
    for seq_record in SeqIO.parse('input/gencode.v26.lncRNA_transcripts.fa', 'fasta'):
        f.write(">" + str(seq_record.id) + "\n")
        result = SeqUtils.nt_search(str(seq_record.seq), motif)
        f.write(str(result) + "\n")
def find_enzyme(input_seq):
    """Run ``nt_search`` of every known recognition site against ``input_seq``.

    NOTE(review): the search results are discarded and the function always
    returns None — this looks like an unfinished stub; confirm.
    """
    query = Seq(input_seq, alphabet=Bio.Alphabet.IUPAC.unambiguous_dna)
    recognition_sites = Restriction.data["IUPAC sequence"]
    for recognition_site in recognition_sites:
        SeqUtils.nt_search(query, recognition_site)
    return None
from Bio import SeqUtils

# Pattern to search for, written with IUPAC ambiguity codes.
consensus = "RGWYV"
# Sequence that is searched.
sequence = "CGTAGCTAGCTCAGAGCAGGGACACGTGCTAGCAACAGCGCT"
# NOTE(review): the return value is not stored or printed, so this only has
# a visible effect in an interactive session — confirm intended.
SeqUtils.nt_search(sequence, consensus)
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr,
                        cutoff_spacing, referenceGenomeForDAS, spacerLength,
                        distanceToCutSiteFromPAM_bp):
    """Find sgRNA spacers adjacent to a PAM within a genomic interval.

    Fetches the interval sequence with ``coords2fa``, searches it for the PAM
    ``seqStr`` on the plus strand and, unless the PAM equals its own reverse
    complement, for the reverse-complemented PAM as well.  A hit is kept only
    when its start lies more than ``cutoff_spacing`` after the previously kept
    spacer on the same strand.

    Reads the module-level ``PAMside`` (3 selects the Cas9-like layout, any
    other value the Cpf1-like layout).  Python 2 code (print statements).

    Returns:
        tuple ``(listSpacer, spacerNumTotal, meanDist)`` where each
        ``listSpacer`` item is ``[spacerNum, strand, startLocInRefSeq,
        endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome,
        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM,
        GCcontent]``, ``spacerNumTotal`` is the number of kept spacers on both
        strands and ``meanDist`` is ``np.mean`` of the recorded distances.
    """
    from Bio import SeqFeature
    if PAMside == 3:
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
        # For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
    else:
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp - 1
        # For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp
    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)
    s = s.upper()
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr)
    # A PAM equal to its own reverse complement needs only one search.
    # NOTE(review): forwardNameString is assigned but not used in this function.
    if seqStr == str(PAM.reverse_complement()):
        DoRevComp = 0
        forwardNameString = "{name}_{num:0{width}}"
    else:
        DoRevComp = 1
        forwardNameString = "{name}_F{num:0{width}}"
    listSpacer = []
    listDistBetweenSpacers = []
    spacerNum = 0
    # Sentinel far below zero so the first hit always passes the spacing test.
    prevStartLocInRefSeq = -9999
    # Trim the searched string so every PAM hit has room for a full spacer.
    if PAMside == 3:
        gbStringForSearch = s[spacerLength:]  # Cas9
    else:
        gbStringForSearch = s[:-spacerLength]  # Cpf1, get all but last ~20 bases of sequence
    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1:  # matches found
        del spacerInds[0]  # first result from nt_search is regexp expansion
        # NOTE(review): formatDigitsN is computed but not used in this function.
        formatDigitsN = int(math.ceil(math.log(len(spacerInds), 10)))
        print "Plus strand sgRNAs found: {num}".format(num=len(spacerInds))
        for idx, item in enumerate(spacerInds):
            startPos = SeqFeature.ExactPosition(
                item)  # start and end pos of PAM
            endPos = SeqFeature.ExactPosition(item + PAM_length)
            if PAMside == 3:  # Cas9-like
                startLocInRefSeq = startPos + 1
                endLocInRefSeq = startLocInRefSeq + spacerLength - 1
            else:  # Cpf1-like
                startLocInRefSeq = endPos  # Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq + spacerLength
            # NOTE(review): the start has no "- 1" here while the end does,
            # and the minus-strand branch subtracts 1 from both — confirm.
            startLocInRefGenome = chromStartRG + startLocInRefSeq
            endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
            cutSiteInRefGenome = startLocInRefGenome + distanceToCutSiteFrom5pEnd
            # Only add the spacer if it is a certain distance from the previous spacer
            if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand = "+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq - 1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq + PAM_length]
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    exactPAM = s[startLocInRefSeq - PAM_length:startLocInRefSeq]
                # Python slices: second index is first char you *DON'T* want
                GCcontent = SeqUtils.GC(spacerAsStr)
                listItem = [
                    spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                    chromPos, startLocInRefGenome, endLocInRefGenome,
                    cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                    exactPAM, GCcontent
                ]
                listSpacer.append(listItem)
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                prevStartLocInRefSeq = startLocInRefSeq
        print "Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
            limit=cutoff_spacing, num=spacerNum)
    spacerNumTotal = spacerNum
    # Search rev complement of PAM
    # Spacing and numbering restart for the minus strand.
    prevStartLocInRefSeq = -9999
    spacerNum = 0
    if DoRevComp:
        if PAMside == 3:
            gbStringForSearch = s[:-spacerLength]  # get all but last ~20 bases of sequence
        else:
            gbStringForSearch = s[spacerLength:]
        spacerInds = SeqUtils.nt_search(gbStringForSearch,
                                        str(PAM.reverse_complement()))
        if len(spacerInds) > 1:  # matches found
            del spacerInds[
                0]  # first result from nt_search is regexp expansion
            formatDigitsN = int(math.ceil(math.log(len(spacerInds), 10)))
            print "Minus strand sgRNAs found: {num}".format(
                num=len(spacerInds))
            for idx, item in enumerate(spacerInds):
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item + PAM_length)
                # Start and end locations are flipped here due to reverse strand
                if PAMside == 3:
                    endLocInRefSeq = endPos + 1  # flipped for reverse strand
                    startLocInRefSeq = endLocInRefSeq + spacerLength - 1  # flipped for reverse strand
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing strand
                    # endLocInRefSeq is 3' end of spacer on PAM-containing strand
                    # Hence endLocInRefSeq < startLocInRefSeq since this is reverse strand
                    startLocInRefSeq = startPos + spacerLength  # b/c spacer length is the offset between gbStringForSearch to RefSeq
                    endLocInRefSeq = startLocInRefSeq - spacerLength + 1
                startLocInRefGenome = chromStartRG + startLocInRefSeq - 1
                endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
                cutSiteInRefGenome = startLocInRefGenome - distanceToCutSiteFrom5pEnd
                # Only add the spacer if it is a certain distance from the previous spacer
                if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand = "-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0
                    # Spacer and PAM are reported as read on the minus strand,
                    # hence the reverse complements below.
                    if PAMside == 3:  # Cas9-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[endLocInRefSeq -
                                      (PAM_length + 1):endLocInRefSeq - 1]),
                                IUPAC.ambiguous_dna).reverse_complement())
                    else:  # Cpf1-like
                        spacerRC = Seq(
                            str(s[endLocInRefSeq - 1:startLocInRefSeq]),
                            IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(
                            Seq(
                                str(s[startLocInRefSeq:startLocInRefSeq +
                                      PAM_length]),
                                IUPAC.ambiguous_dna).reverse_complement())
                    GCcontent = SeqUtils.GC(spacerAsStr)
                    listItem = [
                        spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                        chromPos, startLocInRefGenome, endLocInRefGenome,
                        cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                        exactPAM, GCcontent
                    ]
                    listSpacer.append(listItem)
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    prevStartLocInRefSeq = startLocInRefSeq
            print "Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(
                limit=cutoff_spacing, num=spacerNum)
    spacerNumTotal = spacerNumTotal + spacerNum
    # NOTE(review): with no kept spacers the list is empty when np.mean is
    # taken — confirm callers handle that result.
    arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
    meanDist = np.mean(arrDistBetweenSpacers)
    return (listSpacer, spacerNumTotal, meanDist)
def clean(self):
    '''
    Clean data, adding an unsaved InchwormAssembly model ('assembly') and a
    list of stages ('stages') to self.cleaned_data

    Reads the uploaded GenBank file, checks that every stage has an RBS and
    a CDS feature and that those features are contiguous, then creates and
    saves one annealable-sequence Gene per stage, a genome-context Gene and
    the Stage objects.

    Raises forms.ValidationError when a required feature is missing, the
    features are not contiguous, or the flanking genome context is shorter
    than the homology-arm lengths.

    NOTE(review): Gene objects are saved before the later genome-context
    checks can raise, so a failed validation may leave saved rows behind —
    confirm whether the caller wraps this in a transaction.
    '''
    cleaned_data = super(PathwayForm, self).clean()

    # Don't do anything if some fields are missing
    if not all(x in cleaned_data.keys() for x in
               ['file', 'rbs_annotation_type', 'cds_annotation_type']):
        return cleaned_data

    def validate_contiguity(features):
        # Each feature must end exactly where the next one starts.
        for i in range(len(features) - 1):
            if features[i].location.end != features[i+1].location.start:
                raise forms.ValidationError(
                    'Features {} (of type {}) and {} (of type {}) must be contiguous.'.format(
                        features[i].qualifiers['label'][0],
                        features[i].type,
                        features[i+1].qualifiers['label'][0],
                        features[i+1].type,
                    ))

    record = SeqIO.read(cleaned_data['file'], 'genbank')
    # Index features by (label, type) for direct lookup per stage.
    feature_dict = {
        (feature.qualifiers['label'][0], feature.type): feature
        for feature in record.features
    }

    # Make sure all the required features are present
    # pathway_features is laid out as [rbs_0, cds_0, rbs_1, cds_1, ...].
    pathway_features = []
    for stage_name in self.stage_names:
        rbs_key = (stage_name, cleaned_data['rbs_annotation_type'])
        cds_key = (stage_name, cleaned_data['cds_annotation_type'])
        try:
            pathway_features.append(feature_dict[rbs_key])
            pathway_features.append(feature_dict[cds_key])
        except KeyError as e:
            # e.args[0] is the missing (stage name, annotation type) key.
            raise forms.ValidationError(
                'Stage {} has no feature of type {}.'.format(*e.args[0]))

    # Make sure all the features are contiguous
    validate_contiguity(pathway_features)

    # Save all the annealable sequences
    annealable_seqs = []
    for i, stage_name in enumerate(self.stage_names):
        cds_feature = pathway_features[2*i + 1]
        annealable_seq = None
        # Use the first sequence context that contains the CDS on either strand.
        for sequence_context in self.sequence_contexts:
            annealable_seq_name = '{} from {}'.format(
                stage_name, sequence_context['name'])
            # Rewind: the same file object is re-read for every stage.
            sequence_context['file'].seek(0)
            context_record = SeqIO.read(sequence_context['file'], 'genbank')
            search_result = SeqUtils.nt_search(
                str(context_record.seq),
                str(cds_feature.extract(record).seq),
            )
            # search_result[0] is the pattern; [1] is the first match position.
            if len(search_result) > 1:
                annealable_seq = Gene(
                    file=sequence_context['file'],
                    start=search_result[1] + 1,
                    end=search_result[1] + len(cds_feature),
                    strand=1,
                    name=annealable_seq_name,
                    type=Gene.ANNEALABLE_SEQ,
                )
                annealable_seq.save()
                break

            # No forward match found, so search the reverse strand
            rev_search_result = SeqUtils.nt_search(
                str(context_record.seq),
                str(cds_feature.extract(record).seq.reverse_complement()),
            )
            if len(rev_search_result) > 1:
                annealable_seq = Gene(
                    file=sequence_context['file'],
                    start=rev_search_result[1] + 1,
                    end=rev_search_result[1] + len(cds_feature),
                    strand=-1,
                    name=annealable_seq_name,
                    type=Gene.ANNEALABLE_SEQ,
                )
                annealable_seq.save()
                break

        if annealable_seq is None:
            # No sequence context matched, so do non-nested PCR directly off
            # the coding sequence
            seq_file = ContentFile('')
            annealable_seq = Gene(
                file=seq_file,
                start=1,
                end=len(cds_feature),
                strand=1,
                name=stage_name,
                type=Gene.ANNEALABLE_SEQ,
            )
            annealable_seq.save()
            seq_record = cds_feature.extract(record)
            seq_record.id = ''
            seq_record.name = ''
            SeqIO.write(seq_record, seq_file, 'genbank')
            annealable_seq.file.save(stage_name, seq_file)
        annealable_seqs.append(annealable_seq)

    # Save the genome
    # The sequence on each side of the pathway must be long enough to serve
    # as a homology arm.
    if len(record[:pathway_features[0].location.start]) < self.fwd_ha_len:
        raise forms.ValidationError(
            '5’ genome context must be at least {} bp long.'.format(
                self.fwd_ha_len))
    if len(record[pathway_features[-1].location.end:]) < self.rev_ha_len:
        raise forms.ValidationError(
            '3’ genome context must be at least {} bp long.'.format(
                self.rev_ha_len))
    # Genome context = the record with the pathway features cut out.
    genome_record = record[:pathway_features[0].location.start] + \
        record[pathway_features[-1].location.end:]
    genome_record.name = 'genome'
    genome_file = ContentFile('')
    # NOTE(review): end is one less than start here — presumably marks a
    # zero-length insertion point; confirm against the Gene model.
    genome = Gene(
        file=genome_file,
        start=pathway_features[0].location.start + 1,
        end=pathway_features[0].location.start,
        strand=1,
        name='Genome context',
    )
    genome.save()
    SeqIO.write(genome_record, genome_file, 'genbank')
    genome.file.save('genome', genome_file)

    # Save the stages
    cleaned_data['stages'] = []
    for i, stage_name in enumerate(self.stage_names):
        rbs_feature = pathway_features[2*i]
        stage = Stage(
            degeneracy=str(rbs_feature.extract(record).seq),
            annealable_seq = annealable_seqs[i],
            selection_cassette=self.selection_cassettes[i],
            name=stage_name,
        )
        cleaned_data['stages'].append(stage)
        stage.save()

    # Save the InchwormAssembly object
    # (constructed only; .save() is not called on it here)
    cleaned_data['assembly'] = InchwormAssembly(
        genome=genome,
        enzyme=self.enzyme,
        library_size=self.library_size,
        dna_required=self.dna_required,
        fwd_ha_len=self.fwd_ha_len,
        rev_ha_len=self.rev_ha_len,
    )
    return cleaned_data
def get_context_data(self, **kwargs):
    """Build the template context describing PCRs, assemblies and transformations.

    Adds primer names, phenotype and (when library sizes are available) the
    DNA requirement to every entry of ``self.object.output``, then compiles:

    * ``gg_pcr_details``: one entry per unique Golden Gate PCR,
    * ``round2_pcr_details``: one second-round PCR per stage,
    * ``gg_details``: which PCR products go into each Golden Gate reaction,
    * ``transformation_details``: insert name and phenotype per stage.

    Returns the parent context extended with these keys plus ``output`` and
    ``primers``.
    """
    output = self.object.output
    primers = self.object.primers
    library_sizes = self.get_library_sizes()

    # Reverse lookup: primer sequence -> primer name.
    primer_names_by_sequence = dict()
    for name, sequence in primers:
        primer_names_by_sequence[sequence] = name

    def primer_name(primer):
        return primer_names_by_sequence[str(primer.full_seq().seq)]

    for i, stage_output in enumerate(output):
        stage_output['gg_primer_names'] = [
            (primer_name(primer1), primer_name(primer2))
            for primer1, primer2 in stage_output['gg'].primers
        ]
        stage_output['integration_primer_names'] = [
            primer_name(primer)
            for primer in stage_output['insert'].generate_primers()
        ]
        stage_output['phenotype'] = \
            self.object.stages.order_by('pk')[i].selection_cassette.phenotype
        if library_sizes:
            stage_output['dna_required'] = \
                library_sizes[i] * self.object.dna_required

    # Compile unique Golden Gate PCR reactions for the tabular view
    gg_pcrs_by_primers_and_template = dict()
    gg_pcr_details = []
    for i, stage_output in enumerate(output):
        # Three fragments per stage; j selects the template below.
        for j in range(3):
            # NOTE(review): ``primer_names + [...]`` needs map() to return a
            # list, which is the Python 2 behaviour — confirm target version.
            primer_names = map(primer_name, stage_output['gg'].primers[j])
            # A PCR is identified by its two primer names and template sequence.
            primer_names_and_template = tuple(
                primer_names + [str(
                    stage_output['gg'].genes[j].subrecord().seq.upper())])
            if primer_names_and_template in gg_pcrs_by_primers_and_template.keys():
                continue
            else:
                # Get length of PCR product
                primer1 = stage_output['gg'].primers[j][0]
                primer2 = stage_output['gg'].primers[j][1]
                search_template = str(
                    stage_output['gg'].genes[j].subrecord().seq.upper())
                forward_search_result = SeqUtils.nt_search(
                    search_template,
                    primer1.anneal_seq().upper(),
                )
                reverse_search_result = SeqUtils.nt_search(
                    search_template,
                    primer2.anneal_seq().reverse_complement().upper(),
                )
                # Both primers must anneal to the template (index 0 of each
                # result is the pattern, so > 1 means at least one hit).
                assert len(forward_search_result) > 1 and \
                    len(reverse_search_result) > 1

                # Get name of template
                stage = self.get_object().stages.order_by('pk')[i]
                if j == 0:
                    template_name = stage.annealable_seq.name
                elif j == 1:
                    template_name = stage.selection_cassette.name
                else:
                    template_name = 'Genome'

                # Get primer Tm
                forward_tm = recombineering.utils.Tm(
                    str(primer1.anneal_seq().seq))
                reverse_tm = recombineering.utils.Tm(
                    str(primer2.anneal_seq().seq))

                # NOTE(review): 'primer_names' stores the tuple that also
                # contains the template sequence — confirm intended.
                details = {
                    'product': 'gg{}-{}'.format(i+1, j+1),
                    # forward overhang + distance between the two first
                    # annealing hits + full length of the reverse primer
                    'size': len(primer1.overhang) +
                            (reverse_search_result[1] - forward_search_result[1]) +
                            len(primer2.full_seq()),
                    'primer_names': primer_names_and_template,
                    'template': template_name,
                    'forward_tm': forward_tm,
                    'reverse_tm': reverse_tm,
                }
                gg_pcrs_by_primers_and_template[
                    primer_names_and_template] = details
                gg_pcr_details.append(details)

    # Compile information about second-round PCRs
    round2_pcr_details = []
    for i, stage_output in enumerate(output):
        insert = stage_output['insert']
        insert_len = sum([
            insert.fwd_ha_len,
            len(insert.degeneracy),
            len(insert.sequence),
            insert.rev_ha_len,
        ])
        details = {
            'product': 'stage{}'.format(i+1),
            'size': insert_len,
            'primer_names': map(primer_name, insert.generate_primers()),
            'template': 'gg{}'.format(i+1),
            'forward_tm': recombineering.utils.Tm(
                str(insert.generate_primers()[0].anneal_seq().seq)),
            'reverse_tm': recombineering.utils.Tm(
                str(insert.generate_primers()[1].anneal_seq().seq)),
        }
        if library_sizes:
            details['dna_required'] = \
                library_sizes[i] * self.object.dna_required
        round2_pcr_details.append(details)

    # Determine what goes into which Golden Gate reaction
    gg_details = []
    for i, stage_output in enumerate(output):
        fragments = []
        for j, (primer1, primer2) in enumerate(stage_output['gg'].primers):
            template = str(stage_output['gg'].genes[j].subrecord().seq.upper())
            # Same key layout as used when the PCRs were registered above.
            primer_names_and_template = (
                primer_name(primer1),
                primer_name(primer2),
                template,
            )
            fragments.append(
                gg_pcrs_by_primers_and_template[primer_names_and_template]['product'])
        gg_details.append({
            'product': 'gg{}'.format(i+1),
            'size': len(stage_output['gg'].product),
            'fragments': fragments,
        })

    # Transformation details
    transformation_details = []
    for i in range(len(output)):
        stage = self.get_object().stages.order_by('pk')[i]
        transformation_details.append({
            'insert_name': round2_pcr_details[i]['product'],
            'phenotype': stage.selection_cassette.phenotype,
        })

    context = super(OutputView, self).get_context_data(**kwargs)
    context['output'] = output
    context['primers'] = primers
    context['gg_pcr_details'] = gg_pcr_details
    context['gg_details'] = gg_details
    context['round2_pcr_details'] = round2_pcr_details
    context['transformation_details'] = transformation_details
    return context
if fastafile == "test3prime.fasta": output_fh_name = "seqs_w_for_removed.fasta" output_fh = open(output_fh_name, mode='w+') output_text_name = "info_w_for_removed.txt" if fastafile == "test3prime.fasta": output_text_name = "info_w_for_removed.txt" output_text_fh = open(output_text_name, mode='w+') for record in parsed: try: sequence = str(record.seq) search = SeqUtils.nt_search(sequence, adapter) #This will search the index = int( search[1] ) #If it finds the adapter, is the starting index from which it was found. adapter_start = index adapter_end = index + len_adapter count_adapter_found += 1 total_seq_count += 1 if removeadapters == "True": #if the value is true, it removes the adapters from the sequences. if end_defn == "5": record = record[ adapter_end:] #If a 5' adapter, you remove adapter from beginning elif end_defn == "3": record = record[: adapter_start] #If it is a 3' adapter, you remove the adapter at the end elif removeadapters == "False": #if the value is false, it does not remove the adapters from the sequences.
from Bio import SeqIO
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio import motifs
from Bio import SeqUtils

# FASTA file whose records are scanned for every motif below.
TRANSCRIPTS_FASTA = 'gencode.v26.lncRNA_transcripts.fa'


def write_motif_hits(sites_path, output_path, fasta_path=TRANSCRIPTS_FASTA):
    """Search every record of *fasta_path* for a motif's degenerate consensus.

    The motif is read from *sites_path* (a "sites" format file) and its
    degenerate consensus is used as the search pattern.  For each FASTA
    record two lines are written to *output_path*: ``>`` followed by the
    record id, then the ``str()`` of the list returned by
    ``SeqUtils.nt_search`` (the expanded pattern followed by the start
    position of every match).
    """
    with open(sites_path) as handle:
        # nt_search walks the pattern character by character, so hand it a
        # plain string rather than a Seq object.
        consensus = str(motifs.read(handle, "sites").degenerate_consensus)

    with open(output_path, "w") as out:
        for seq_record in SeqIO.parse(fasta_path, 'fasta'):
            out.write(">" + str(seq_record.id) + "\n")
            # Search the nucleotide sequence itself.  str(seq_record) is the
            # record's human-readable summary (ID, name, description and a
            # truncated Seq repr), which is what was being searched before.
            result = SeqUtils.nt_search(str(seq_record.seq), consensus)
            out.write(str(result) + "\n")


# p53 binding motif
write_motif_hits("sites/MA0106.1.sites", "motif_result_p53.txt")

# AGL3 binding motif
write_motif_hits("sites/MA0001.1.sites", "motif_result_AGL3.txt")
# Adapter search/trim loop.  Every name read here (fastafile, parsed, adapter,
# len_adapter, removeadapters, end_defn and the counters) is defined outside
# this fragment.
# NOTE(review): output_fh_name is only assigned here for "test3prime.fasta";
# presumably a default name is set just before this fragment - confirm.
if fastafile=="test3prime.fasta":
    output_fh_name="output2.fasta"
output_fh = open(output_fh_name, mode='w+')
output_text_name = "output.txt"
if fastafile=="test3prime.fasta":
    output_text_name="output2.txt"
output_text_fh = open(output_text_name, mode='w+')
for record in parsed:
    try:
        sequence = str(record.seq)
        # nt_search returns the expanded pattern followed by the start
        # position of every match.
        search = SeqUtils.nt_search(sequence, adapter)
        # First match position; search[1] raises IndexError when the adapter
        # was not found, which is handled by the except clause below.
        index = int(search[1])
        adapter_start = index
        adapter_end = index+len_adapter
        count_adapter_found +=1
        # NOTE(review): total_seq_count is only incremented on this "found"
        # path; records without the adapter are not counted here - confirm
        # this is intended.
        total_seq_count+=1
        # removeadapters is compared as the strings "True"/"False", not as a bool.
        if removeadapters == "True":
            if end_defn=="5":
                # 5' adapter: keep everything after the adapter.
                record = record[adapter_end:]
            elif end_defn=="3":
                # 3' adapter: keep everything before the adapter.
                record = record[:adapter_start]
        elif removeadapters == "False":
            # Leave the record untouched (explicit no-op).
            record = record
        # Write the (possibly trimmed) record whenever the adapter was found.
        SeqIO.write(record, output_fh, format="fasta")
    except IndexError:
        # Adapter not present in this record; nothing is written for it.
        count_adapter_not_found+=1
def createdb():
    """Fetch three NCBI nucleotide records and store them in SQLite.

    For each record fetched through Entrez the function keeps a ">GI<gi>"
    label, a short description (accession plus the first two words of the
    definition), the first 20 bases of the sequence (upper-cased), the
    reverse complement of those bases, and the stringified
    ``SeqUtils.nt_search`` result for the pattern 'GG' on both of them.

    The table is created from the module-level settings (``sqlite_file``,
    ``table_name2``, the ``*_column`` names and ``column_type*`` values,
    all defined elsewhere in this file), then one row per fetched record
    is inserted and committed.

    NOTE(review): the INSERT statement hard-codes the table name
    ``Gather_data`` and its column names; presumably these match the
    module-level settings - confirm.
    """
    gis = [100753385, 100689306, 100751648]
    accession = []
    description = []
    sequence = []

    # Post the ids to the Entrez history server, then fetch them back as XML.
    request = Entrez.epost("nucleotide", id=",".join(map(str, gis)))
    result = Entrez.read(request)
    handle = Entrez.efetch(db="nucleotide", retmode="xml",
                           webenv=result["WebEnv"],
                           query_key=result["QueryKey"])
    try:
        for r in Entrez.parse(handle):
            # Grab the GI number from the "gi|<number>" entry, if there is one.
            try:
                gi = int([x for x in r['GBSeq_other-seqids']
                          if "gi" in x][0].split("|")[1])
            except (ValueError, IndexError):
                # IndexError: no "gi" entry at all (previously uncaught).
                gi = None
            tokens = (" " + r["GBSeq_primary-accession"] + " "
                      + r["GBSeq_definition"] + "\n"
                      + r["GBSeq_sequence"][0:20]).split()
            accession.append(">GI" + str(gi))
            description.append(' '.join(tokens[0:3]))
            sequence.append(tokens[-1].upper())
    finally:
        handle.close()

    alt_map = {'ins': '0'}
    complement = {'A': 'T', 'G': 'C', 'T': 'A', 'C': 'G'}

    def reverse_complement(seq):
        """Return the reverse complement of *seq*, keeping 'ins' markers intact."""
        # .items() instead of .iteritems() so this runs on Python 2 and 3.
        for k, v in alt_map.items():
            seq = seq.replace(k, v)
        bases = ''.join(reversed([complement.get(base, base) for base in seq]))
        for k, v in alt_map.items():
            bases = bases.replace(v, k)
        return bases

    complementary_sequence = [reverse_complement(seq) for seq in sequence]

    # Positions of 'GG' on each sequence and on its reverse complement,
    # stored as the str() of the nt_search result list.
    pattern = 'GG'
    exon = [str(SeqUtils.nt_search(seq, pattern)) for seq in sequence]
    comp_exon = [str(SeqUtils.nt_search(seq, pattern))
                 for seq in complementary_sequence]

    extra_columns = [
        (id_column, column_type2),
        (description_column, column_type3),
        (seq_column, column_type4),
        (comp_seq_column, column_type5),
        (PAM_column1, column_type6),
        (PAM_column2, column_type7),
    ]
    insert_sql = '''INSERT INTO Gather_data(No, gi_accession, description, sequence,Complementary_sequence,PAMsites_exons,PAMsites_complementary) VALUES(?,?,?,?,?,?,?)'''

    conn = sqlite3.connect(sqlite_file)
    try:
        c = conn.cursor()
        c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'
                  .format(tn=table_name2, nf=new_field, ft=field_type))
        for cn, ct in extra_columns:
            c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"
                      .format(tn=table_name2, cn=cn, ct=ct))
        # One row per record, each built from that record's own values.  The
        # hand-written inserts reused complementary_sequence[0] for rows 2
        # and 3 and description[1] for row 3.
        rows = zip(accession, description, sequence,
                   complementary_sequence, exon, comp_exon)
        for number, row in enumerate(rows, 1):
            c.execute(insert_sql, (number,) + row)
        conn.commit()
    finally:
        # Close the connection even when a statement fails.
        conn.close()
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr, cutoff_spacing, referenceGenomeForDAS,spacerLength, distanceToCutSiteFromPAM_bp):
    """Find spacers (sgRNA target sites) adjacent to PAM matches in an interval.

    The interval sequence is obtained from coords2fa() and upper-cased.  The
    PAM pattern is searched with SeqUtils.nt_search on the plus strand; unless
    the PAM equals its own reverse complement, the reverse-complemented PAM is
    searched as well to find minus-strand sites.  On each strand a site is
    kept only if its start is more than cutoff_spacing away from the start of
    the previously kept site on that strand.

    The module-level ``PAMside`` (not a parameter) selects the geometry:
    ``PAMside == 3`` is treated as Cas9-like, anything else as Cpf1-like.

    Args:
        chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS: passed
            unchanged to coords2fa() to fetch the sequence (presumably
            chromosome, start, end and genome build - confirm against
            coords2fa).  chromPos is also copied into every result row and
            chromStartRG is added to in-sequence positions to obtain genome
            coordinates.
        seqStr: PAM pattern; wrapped in a Seq with IUPAC.ambiguous_dna.
        cutoff_spacing: minimum distance (exclusive) between the starts of
            consecutive kept spacers on one strand.
        spacerLength: spacer length in bases.
        distanceToCutSiteFromPAM_bp: distance of the cut site from the PAM,
            used to derive the cut-site offset from the spacer's 5' end.

    Returns:
        Tuple ``(listSpacer, spacerNumTotal, meanDist)``.  Each entry of
        listSpacer is ``[spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
        chromPos, startLocInRefGenome, endLocInRefGenome, cutSiteInRefGenome,
        distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]``;
        spacerNumTotal is the number of kept spacers over both strands and
        meanDist is np.mean of all recorded distFromPrevSpacer values
        (the first kept spacer of each strand contributes 0).
    """
    from Bio import SeqFeature
    if PAMside == 3:
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
        # For SpCas9 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
    else:
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp-1
        # For AsCpf1 (as an example): spacerLength = 20bp ; distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp
    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS);
    s=s.upper();
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr);
    # A PAM that equals its own reverse complement would match the same sites
    # on both strands, so the minus-strand search is skipped for it.
    # forwardNameString is assigned but not used anywhere in this function.
    if seqStr == str(PAM.reverse_complement()):
        DoRevComp=0
        forwardNameString = "{name}_{num:0{width}}"
    else:
        DoRevComp=1
        forwardNameString = "{name}_F{num:0{width}}"
    listSpacer=[]
    listDistBetweenSpacers=[]
    spacerNum=0
    # Sentinel far below any real position so the first hit always passes the
    # spacing test.
    prevStartLocInRefSeq=-9999
    # Trim the search string by one spacer length on the side where the spacer
    # lies, so every PAM hit has a full-length spacer inside s.
    if PAMside == 3:
        gbStringForSearch = s[spacerLength:]; # Cas9
    else:
        gbStringForSearch = s[:-spacerLength]; # Cpf1, get all but last ~20 bases of sequence
    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1: # matches found
        del spacerInds[0] # first result from nt_search is regexp expansion
        #print "len line below {fname}".format(fname=len(spacerInds))
        # formatDigitsN is computed but not used anywhere in this function.
        formatDigitsN = int(math.ceil(math.log(len(spacerInds),10)));
        print "Plus strand sgRNAs found: {num}".format(num=len(spacerInds))
        for idx, item in enumerate(spacerInds):
            startPos = SeqFeature.ExactPosition(item) # start and end pos of PAM
            endPos = SeqFeature.ExactPosition(item+PAM_length)
            if PAMside == 3: # Cas9-like
                # item indexes gbStringForSearch == s[spacerLength:], so the
                # spacer is s[item:item+spacerLength]; start/end are stored
                # 1-based inclusive here.
                startLocInRefSeq = startPos+1
                endLocInRefSeq = startLocInRefSeq+spacerLength-1
            else: # Cpf1-like
                startLocInRefSeq = endPos #Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq+spacerLength
            startLocInRefGenome = chromStartRG+startLocInRefSeq
            endLocInRefGenome = chromStartRG+endLocInRefSeq-1
            cutSiteInRefGenome = startLocInRefGenome+distanceToCutSiteFrom5pEnd
            # Only add the spacer if it is a certain distance from the previous spacer
            if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand="+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq-1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq+PAM_length];
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    exactPAM = s[startLocInRefSeq-PAM_length:startLocInRefSeq];
                # Python slices: second index is first char you *DON'T* want
                GCcontent = SeqUtils.GC(spacerAsStr);
                listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome, cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]
                listSpacer.append(listItem)
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                # Spacing is measured from the last *kept* spacer.
                prevStartLocInRefSeq=startLocInRefSeq
    print "Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum)
    spacerNumTotal=spacerNum
    # Search rev complement of PAM
    # print PAM
    # print PAM.reverse_complement()
    # Reset the per-strand spacing state and counter for the minus strand.
    prevStartLocInRefSeq=-9999
    spacerNum=0
    if DoRevComp:
        # Same trimming as above, mirrored: on the minus strand the spacer
        # lies on the opposite side of the PAM match.
        if PAMside == 3:
            gbStringForSearch = s[:-spacerLength]; # get all but last ~20 bases of sequence
        else:
            gbStringForSearch = s[spacerLength:];
        spacerInds = SeqUtils.nt_search(gbStringForSearch,str(PAM.reverse_complement()))
        if len(spacerInds) > 1: # matches found
            del spacerInds[0] # first result from nt_search is regexp expansion
            #print "len line below {fname}".format(fname=len(spacerInds))
            # formatDigitsN is computed but not used anywhere in this function.
            formatDigitsN = int(math.ceil(math.log(len(spacerInds),10)));
            print "Minus strand sgRNAs found: {num}".format(num=len(spacerInds))
            for idx, item in enumerate(spacerInds):
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item+PAM_length)
                #print "Start pos: {num} End pos: {num2}".format(num=startPos,num2=endPos)
                # Start and end locations are flipped here due to reverse strand
                if PAMside == 3:
                    endLocInRefSeq = endPos+1 #flipped for reverse strand
                    startLocInRefSeq = endLocInRefSeq+spacerLength-1 #flipped for reverse strand
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing strand
                    # endLocInRefSeq is 3' end of spacer on PAM-containing strand
                    # Hence endLocInRefSeq < startLocInRefSeq since this is reverse strand
                    startLocInRefSeq = startPos + spacerLength # b/c spacer length is the offset between gbStringForSearch to RefSeq
                    endLocInRefSeq = startLocInRefSeq - spacerLength +1
                startLocInRefGenome = chromStartRG+startLocInRefSeq-1
                endLocInRefGenome = chromStartRG+endLocInRefSeq-1
                # The offset from the 5' end is subtracted on this strand.
                cutSiteInRefGenome = startLocInRefGenome-distanceToCutSiteFrom5pEnd
                # Only add the spacer if it is a certain distance from the previous spacer
                if (startLocInRefSeq-prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand="-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq-prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0
                    # Both branches slice the plus-strand text out of s and
                    # reverse-complement it, so spacer and PAM are reported
                    # as read on the minus strand.
                    if PAMside == 3:# Cas9-like
                        spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(Seq(str(s[endLocInRefSeq-(PAM_length+1):endLocInRefSeq-1]), IUPAC.ambiguous_dna).reverse_complement())
                    else: # Cpf1-like
                        spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
                        spacerAsStr = str(spacerRC.reverse_complement())
                        exactPAM = str(Seq(str(s[startLocInRefSeq:startLocInRefSeq+PAM_length]), IUPAC.ambiguous_dna).reverse_complement())
                    GCcontent = SeqUtils.GC(spacerAsStr);
                    listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq, chromPos, startLocInRefGenome, endLocInRefGenome, cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr, exactPAM, GCcontent]
                    listSpacer.append(listItem)
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    # Spacing is measured from the last *kept* spacer.
                    prevStartLocInRefSeq=startLocInRefSeq
        print "Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing,num=spacerNum)
        spacerNumTotal=spacerNumTotal+spacerNum;
    # NOTE(review): when no spacer was kept on either strand this is the mean
    # of an empty array - confirm callers cope with that result.
    arrDistBetweenSpacers = np.asarray(listDistBetweenSpacers)
    meanDist = np.mean(arrDistBetweenSpacers)
    return (listSpacer, spacerNumTotal, meanDist)
# Pick one record at random, annotate it and write it out as GenBank.
# records, featureStatistic_container, output_handle and the writeFeature /
# writeSTF / writePBS helpers are defined outside this fragment.
for randomRec in range(1,2):
    # randrange(len(records)) draws a valid index 0 .. len(records)-1.
    # random.randint(1, len(records)) is inclusive at both ends, so it could
    # never select records[0] and raised IndexError when it drew len(records).
    record = records[random.randrange(len(records))]
    newRecord = SeqRecord(record.seq)

    # writing Header
    newRecord.seq.alphabet = generic_dna
    newRecord.id = record.id
    newRecord.name = record.name
    newRecord.description = record.description

    recordSeq = str(record.seq)
    for feature in featureStatistic_container:
        if feature not in ["PBS", "STF"]:
            for variation in featureStatistic_container[feature]:
                # writeFeature is called with only the strand, so it
                # presumably reads occurrence / featureSeq / newRecord from
                # the enclosing scope - these names must stay as they are.
                featureSeq = str(variation.seq)
                occurrence = SeqUtils.nt_search(recordSeq, featureSeq)
                writeFeature(strand=1)

                # NOTE(review): this searches the complement, not the reverse
                # complement, for the -1 strand - confirm that is intended.
                featureSeqComplement = str(variation.seq.complement())
                occurrence = SeqUtils.nt_search(recordSeq, featureSeqComplement)
                writeFeature(strand=-1)
        else:
            # "STF" and "PBS" are handled by their dedicated writers.
            if(feature == "STF"):
                writeSTF()
            if(feature == "PBS"):
                writePBS()
    SeqIO.write(newRecord, output_handle, "genbank")