def main():
    """Run the central-dogma demo on two hard-coded DNA sequences.

    Prints the Biopython version and the two sequences, then for each of
    them the complement, reverse complement, transcript and translation,
    and finally a position-by-position comparison with a mismatch tally.
    """
    print("\t Welcome to the Central Dogma of Biology Demo")
    print("\t BioPython version", Bio.__version__)

    # The two sequences under study.
    seq_a = Seq("AGTACAGTA")
    seq_b = Seq("AGTAGAGAA")
    print("\t The sequences:")
    print("\t myDNASeqA :", seq_a)
    print("\t myDNASeqB :", seq_b)

    # (label used in the output, Seq method producing the derived form).
    # Every form is printed for A (preceded by a blank line) and then for B.
    derived_forms = (
        ("comp", "complement"),
        ("REV comp", "reverse_complement"),
        ("RNA", "transcribe"),
        ("protein", "translate"),
    )
    for label, method_name in derived_forms:
        print(f"\n\t The {label} of seq A is:", getattr(seq_a, method_name)())
        print(f"\t The {label} of seq B is:", getattr(seq_b, method_name)())

    # Walk both sequences in lockstep, flagging positions that differ.
    print("\n\t Compare the sequences char by char")
    print("\n\t seqA seqB")
    mismatches = 0
    for position, base_a in enumerate(seq_a):
        base_b = seq_b[position]
        if base_a != base_b:
            mismatches += 1
            marker = "!="
        else:
            marker = " "
        print(f"\t {base_a} {marker} {base_b}")
    print(f"\n\t Total mismatches: {mismatches}")
def main():
    """Print a short DNA sequence followed by its complement.

    ``Seq.complement()`` returns a new object and leaves the original
    untouched, so its result has to be captured to be displayed.
    """
    my_seq = Seq("AGTACACTGGT")
    print(my_seq)
    try:
        complemented = my_seq.complement()
    except ValueError as exc:
        # Report the problem instead of silently swallowing every exception.
        print("Could not complement sequence:", exc)
        return
    print(complemented)
def complementSeq():
    """Print a fixed DNA sequence and three forms derived from it.

    The forms are the complement, the reverse complement and the reverse
    complement applied twice.
    """
    dna = Seq("GGATCGAAATCGC", IUPAC.unambiguous_dna)
    rev_comp = dna.reverse_complement()
    rows = (
        ('My_Seq = ', dna),
        ('My_Seq complement = ', dna.complement()),
        ('My_Seq reverse complement = ', rev_comp),
        ('My_Seq reverse reverse complement = ', rev_comp.reverse_complement()),
    )
    for label, value in rows:
        print(label, value)
def extract_single(seq, strand_info, seq_start, seq_end, bases):
    """Return an exon and its flanks as ``(upstream, exon, downstream)``.

    Args:
        seq: Sequence string the coordinates refer to.
        strand_info: ``"-"`` selects minus-strand handling; any other value
            is handled as the plus strand.
        seq_start: 1-based, inclusive start of the exon on ``seq``.
        seq_end: 1-based, inclusive end of the exon on ``seq``.
        bases: Number of flanking bases to return on each side.

    Returns:
        Tuple of three strings. The exon is upper-cased, the flanks keep
        their case. On the minus strand all three are complemented and
        reversed, and the flank lying after ``seq_end`` is the upstream
        one. A flank is shorter than ``bases`` when the exon lies close to
        either end of ``seq``.
    """
    exon_lo = seq_start - 1
    # Clamp at 0: a negative slice start counts from the END of the sequence
    # and would silently return the wrong (usually empty) flank.
    left_lo = max(exon_lo - bases, 0)
    right_hi = seq_end + bases

    if strand_info == "-":
        seq_comp = Seq(seq, NucleotideAlphabet()).complement()
        exon_sequence = str(seq_comp[exon_lo:seq_end])[::-1].upper()
        upstream_seq = str(seq_comp[seq_end:right_hi])[::-1]
        downstream_seq = str(seq_comp[left_lo:exon_lo])[::-1]
    else:
        exon_sequence = seq[exon_lo:seq_end].upper()
        upstream_seq = seq[left_lo:exon_lo]
        downstream_seq = seq[seq_end:right_hi]
    return upstream_seq, exon_sequence, downstream_seq
def find_palindromes_variable(seq):
    """Find reverse-complement palindromes of every even length in ``seq``.

    Window sizes 2, 4, 6, ... are tried in turn (for plain A/C/G/T only even
    lengths can be palindromic). The search stops at the first size that
    yields nothing: a palindrome of size ``n + 2`` always contains one of
    size ``n``, so no longer ones can exist.

    Args:
        seq: DNA sequence as a plain string.

    Returns:
        ``{size: {start_index: substring}}``. The first size without any
        palindrome, if reached, is present with an empty dict.
    """
    # Complementing works base by base, so do it once for the whole sequence
    # and slice the result instead of building a Seq for every window.
    # str() is used because Seq.tostring() is gone from current Biopython.
    complement = str(Seq(seq, generic_dna).complement())
    seq_len = len(seq)
    palindromeDict = {}
    for size in range(2, seq_len + 1, 2):
        hits = {}
        for i in range(seq_len - size + 1):
            sub_seq = seq[i:i + size]
            if sub_seq[::-1] == complement[i:i + size]:
                hits[i] = sub_seq
        palindromeDict[size] = hits
        if not hits:
            # Nothing at this size means nothing at any larger size.
            return palindromeDict
    return palindromeDict
def AA_sequence(refDNA_dic, cds_df, gene, seq_type='AA'):
    """Collect the coding sequence of every protein accession of ``gene``.

    Args:
        refDNA_dic: Mapping from chromosome name to a record exposing ``.seq``.
        cds_df: Table with at least ``geneid`` and ``access`` columns.
        gene: Gene id used to filter ``cds_df``.
        seq_type: ``'AA'`` returns translated proteins; any other value
            returns the nucleotide strings.

    Returns:
        ``(sequences, accessions)`` where ``accessions`` is the sorted list
        of unique ``access`` values of the gene.
    """
    gene_rows = cds_df[cds_df['geneid'].values == gene]
    accessions = sorted(set(gene_rows['access'].tolist()))
    lookup = trpr(gene_rows)

    proteins = []
    transcripts = []
    for accession in accessions:
        chrom = lookup.get_chrom(accession, id_type='access')
        positions = lookup.get_trpr_pos(accession)
        reference = refDNA_dic[chrom].seq
        # Positions are used as 1-based offsets, taken in the listed order.
        nt_seq = Seq(''.join([reference[p - 1] for p in positions]), generic_dna)
        if positions[0] > positions[1]:
            # Descending coordinates (presumably the minus strand): the bases
            # are already in reverse order, so only complementing is needed.
            nt_seq = nt_seq.complement()
        protein = str(nt_seq.translate())
        transcripts.append(str(nt_seq))
        proteins.append(protein)

    chosen = proteins if seq_type == 'AA' else transcripts
    return chosen, accessions
def fasta_iter(fasta_name):
    """Yield an extracted sub-sequence for every record of a FASTA stream.

    The slice bounds and the orientation are read from the module-level
    names ``start``, ``end``, ``reverse`` and ``complement``, which are
    defined outside this function.

    Args:
        fasta_name: Iterable of FASTA lines (for example an open file).

    Yields:
        ``(header, extract, contig_no)``: the record title without ``>``,
        the requested slice after the optional reversal/complementing, and
        the 1-based record counter. Every extract is also printed.

    Note:
        Calls ``sys.exit()`` when ``reverse`` or ``complement`` is neither
        "yes" nor "no" (compared case-insensitively).
    """
    contig_no = 0
    # groupby alternates between a run of header lines and a run of
    # sequence lines; only the per-run line iterators are kept.
    faiter = (group for _, group in groupby(fasta_name, lambda line: line[0] == ">"))
    for header_lines in faiter:
        contig_no += 1
        header = next(header_lines)[1:].strip()
        # A trailing header without sequence lines gives an empty sequence
        # instead of a StopIteration escaping from inside the generator.
        seq = "".join(part.strip() for part in next(faiter, ()))
        extract = seq[int(start):int(end)]

        reverse_choice = reverse.lower()
        if reverse_choice == 'yes':
            extract = extract[::-1]
        elif reverse_choice != 'no':
            print(' Please use only "Yes" or "No" when specifying if the sequence is in reverse orientation.')
            sys.exit()

        complement_choice = complement.lower()
        if complement_choice == 'yes':
            extract = str(Seq(extract, generic_dna).complement())
        elif complement_choice != 'no':
            print('Please use only "Yes" or "No" when specifying if the sequence is on the complement strand.')
            sys.exit()

        print(extract)
        yield header, extract, contig_no
def get_seq_meta(g, request):
    """Return a JSON response with the metadata of sequence ``g``.

    The payload holds the sequence length (``len``), one entry per feature
    including its qualifiers (``feats``) and a letter -> complement lookup
    for the IUPAC ambiguous DNA alphabet in both cases (``alpha``).
    ``request`` is not used.
    """
    # Feature direction 'f'/'r' maps to strand 1/-1; anything else to None.
    strand_of = {'f': 1, 'r': -1}
    feats = [
        {
            'start': feat.start,
            'end': feat.end,
            'strand': strand_of.get(feat.direction),
            'type': feat.type,
            'qualifiers': [
                {'name': qual.name, 'data': qual.data}
                for qual in feat.qualifiers.all()
            ],
        }
        for feat in g.features.all()
    ]

    # Ambiguous DNA is assumed for the complement table.
    letters = Seq(IUPAC.IUPACAmbiguousDNA.letters, IUPAC.IUPACAmbiguousDNA())
    alpha = {}
    for base, partner in zip(letters, letters.complement()):
        alpha[base.lower()] = partner.lower()
        alpha[base.upper()] = partner.upper()

    return JsonResponse({
        'len': len(g.sequence),
        'feats': feats,
        'alpha': alpha,
    })
def make_complementary_strand(strand_in):
    """Return the complement of ``strand_in`` as a plain string."""
    return str(Seq(strand_in).complement())
def extract_single(seq, strand_info, seq_start, seq_end, bases):
    """Return an exon and its flanks as ``(upstream, exon, downstream)``.

    Args:
        seq: Sequence string the coordinates refer to.
        strand_info: ``'-'`` selects minus-strand handling; any other value
            is handled as the plus strand.
        seq_start: 1-based, inclusive start of the exon on ``seq``.
        seq_end: 1-based, inclusive end of the exon on ``seq``.
        bases: Number of flanking bases to return on each side.

    Returns:
        Tuple of three strings. The exon is upper-cased, the flanks keep
        their case. On the minus strand all three are complemented and
        reversed, and the flank lying after ``seq_end`` is the upstream
        one. A flank is shorter than ``bases`` when the exon lies close to
        either end of ``seq``.
    """
    exon_lo = seq_start - 1
    # Clamp at 0: a negative slice start counts from the END of the sequence
    # and would silently return the wrong (usually empty) flank.
    left_lo = max(exon_lo - bases, 0)
    right_hi = seq_end + bases

    if strand_info == '-':
        seq_comp = Seq(seq, NucleotideAlphabet()).complement()
        exon_sequence = str(seq_comp[exon_lo:seq_end])[::-1].upper()
        upstream_seq = str(seq_comp[seq_end:right_hi])[::-1]
        downstream_seq = str(seq_comp[left_lo:exon_lo])[::-1]
    else:
        exon_sequence = seq[exon_lo:seq_end].upper()
        upstream_seq = seq[left_lo:exon_lo]
        downstream_seq = seq[seq_end:right_hi]
    return upstream_seq, exon_sequence, downstream_seq
def find_palindromes_variable(seq):
    """Find reverse-complement palindromes of every even length in ``seq``.

    Window sizes 2, 4, 6, ... are tried in turn (for plain A/C/G/T only even
    lengths can be palindromic). The search stops at the first size that
    yields nothing: a palindrome of size ``n + 2`` always contains one of
    size ``n``, so no longer ones can exist.

    Args:
        seq: DNA sequence as a plain string.

    Returns:
        ``{size: {start_index: substring}}``. The first size without any
        palindrome, if reached, is present with an empty dict.
    """
    # Complementing works base by base, so do it once for the whole sequence
    # and slice the result instead of building a Seq for every window.
    # str() is used because Seq.tostring() is gone from current Biopython.
    complement = str(Seq(seq, generic_dna).complement())
    seq_len = len(seq)
    palindromeDict = {}
    for size in range(2, seq_len + 1, 2):
        hits = {}
        for i in range(seq_len - size + 1):
            sub_seq = seq[i:i + size]
            if sub_seq[::-1] == complement[i:i + size]:
                hits[i] = sub_seq
        palindromeDict[size] = hits
        if not hits:
            # Nothing at this size means nothing at any larger size.
            return palindromeDict
    return palindromeDict
def get_top_genes(f_name):
    """Print the genes whose sequence looks like a 5'TOP or 5'TOP-like mRNA.

    Every non-blank line of ``f_name`` must hold a name and a sequence
    separated by whitespace. The gene name is the record name without its
    last two characters; a record name ending in ``-`` means the sequence
    is complemented before matching. A gene is reported once per category:

    * 5'TOP: the sequence starts with 4-14 pyrimidines (T/C).
    * 5'TOP-like: up to three arbitrary bases ending in a purine (A/G),
      followed by 5-14 pyrimidines.

    Args:
        f_name: Path of the input file.

    Raises:
        ValueError: If a non-blank line does not split into two fields.
    """
    splitter = re.compile(r'\s+')
    p_5TOP = re.compile('^[TC]{4,14}')
    p_5TOP_like = re.compile('^[AGTC]{0,3}[AG]')
    p_5TOP_2 = re.compile('^[TC]{5,14}')
    # Sets give O(1) "already reported" checks.
    reported_top = set()
    reported_top_like = set()
    with open(f_name) as f:
        for raw_line in f:
            record = raw_line.strip()
            if not record:
                # Blank lines carry no record; unpacking them would fail.
                continue
            name, seq = splitter.split(record)
            seq = seq.upper()
            if name.endswith('-'):
                seq = str(Seq(seq).complement())
            gene = name[:-2]
            if p_5TOP.match(seq):
                if gene not in reported_top:
                    reported_top.add(gene)
                    print('{}\t5\'TOP mRNA\t{}'.format(gene, seq))
                continue
            prefix = p_5TOP_like.match(seq)
            if prefix and p_5TOP_2.match(seq[prefix.end():]):
                if gene not in reported_top_like:
                    reported_top_like.add(gene)
                    print('{}\t5\'TOP-like mRNA\t{}'.format(gene, seq))
def AA_sequence(refDNA_dic, cds_df, gene, seq_type='AA'):
    """Collect the coding sequence of every protein accession of ``gene``.

    Args:
        refDNA_dic: Mapping from chromosome name to a record exposing ``.seq``.
        cds_df: Table with at least ``geneid`` and ``access`` columns.
        gene: Gene id used to filter ``cds_df``.
        seq_type: ``'AA'`` returns translated proteins; any other value
            returns the nucleotide strings.

    Returns:
        ``(sequences, accessions)`` where ``accessions`` is the sorted list
        of unique ``access`` values of the gene.
    """
    gene_rows = cds_df[cds_df['geneid'].values == gene]
    accessions = sorted(set(gene_rows['access'].tolist()))
    lookup = trpr(gene_rows)

    proteins = []
    transcripts = []
    for accession in accessions:
        chrom = lookup.get_chrom(accession, id_type='access')
        positions = lookup.get_trpr_pos(accession)
        reference = refDNA_dic[chrom].seq
        # Positions are used as 1-based offsets, taken in the listed order.
        nt_seq = Seq(''.join([reference[p - 1] for p in positions]), generic_dna)
        if positions[0] > positions[1]:
            # Descending coordinates (presumably the minus strand): the bases
            # are already in reverse order, so only complementing is needed.
            nt_seq = nt_seq.complement()
        protein = str(nt_seq.translate())
        transcripts.append(str(nt_seq))
        proteins.append(protein)

    chosen = proteins if seq_type == 'AA' else transcripts
    return chosen, accessions
def mapGenoToPheno(self, genotype):
    """Describe, per known trait, what the given genotype means.

    For every trait in ``self.possibleTraitsList`` the genotype call for
    the trait's rsid is stored in ``self.traits`` and looked up in the
    trait's allele table, trying in order: the call as given, the call
    with its alleles flipped, the complemented call, and the complemented
    call flipped.

    Args:
        genotype: Mapping from rsid to genotype string.

    Returns:
        List with one human-readable message per trait.
    """
    results = []
    for trait in self.possibleTraitsList:
        rsid = trait.rsid
        # ``in`` replaces dict.has_key(), which Python 3 no longer provides.
        if rsid not in genotype:
            results.append("genotype not found for " + rsid)
            continue

        call = genotype[rsid]
        self.traits[rsid] = call
        alleles = trait.alleles
        if call in alleles:
            results.append(rsid + " - " + call + " - " + alleles[call])
            continue
        flipped = call[::-1]
        if flipped in alleles:
            results.append(rsid + " - " + flipped + " (flipped) - " + alleles[flipped])
            continue

        # Neither order matched: retry on the opposite strand.
        rev = str(Seq(call, generic_dna).complement())
        if rev in alleles:
            results.append(rsid + " - " + rev + " (rev comp) -" + alleles[rev])
        elif rev[::-1] in alleles:
            results.append(rsid + " - " + rev[::-1] + " (flipped) - " + alleles[rev[::-1]])
        else:
            results.append("genotype " + call + " and rev comp " + rev
                           + " not found in traits for " + rsid)
    return results
def clipboard_content_manager(inst):
    """Replace the clipboard sequence with the result of a button's action.

    The clipboard text is stripped of digits and whitespace, wrapped in a
    Seq and transformed according to ``inst.text``; the result is copied
    back to the clipboard. Unknown button texts leave the clipboard alone.

    Args:
        inst: Button instance; only its ``text`` attribute is read.
    """
    pasted = clip.paste()
    seq = Seq(re.sub(r'[\d\s]*', '', pasted))

    # Button caption -> transformation applied to the sequence.
    actions = {
        'Reverse': lambda s: s[::-1],
        'Complement': lambda s: s.complement(),
        'Reversed\ncomplement': lambda s: s.reverse_complement(),
        'Translate': lambda s: s.translate(),
    }
    try:
        action = actions.get(inst.text)
        if action is not None:
            clip.copy(str(action(seq)))
    except Exception as exc:
        # Various errors are possible; they are reported, not raised.
        print(exc)
def genSplintSeq(filename, *, splint_len=20):
    """Build the splint for the oligos described in a JSON file.

    Half of ``splint_len`` bases are taken from the end of every
    ``3_prime`` oligo (prepended) and from the start of every ``5_prime``
    oligo (appended). The joined sequence, its complement and its reverse
    complement are printed.

    Args:
        filename: Path of a JSON file whose values are dicts with the keys
            ``extension`` and ``sequence``.
        splint_len: Total splint length; each oligo contributes half of it.

    Returns:
        The reverse complement of the joined sequence, as a Seq.
    """
    with open(filename, 'r') as handle:
        oligos = json.load(handle)

    half = splint_len // 2
    joined = ''
    for entry in oligos.values():
        extension = entry['extension']
        if extension == '3_prime':
            joined = entry['sequence'][-half:] + joined
        elif extension == '5_prime':
            joined += entry['sequence'][:half]

    splint = Seq(joined)
    reverse_complement = splint.reverse_complement()
    print("Sequence:", splint,
          "\nCompliment:", splint.complement(),
          "\nReverse Compliment:", reverse_complement,
          end='\n\n')
    return reverse_complement
def motif_finder(reference, ref_ids, query_seq, output):
    """Report every position where a motif or its complement occurs.

    Each hit is written to ``output`` and printed as a tab-separated line:
    sequence/contig id, 1-based position, and ``+`` (motif) or ``-``
    (complement of the motif).

    Args:
        reference: Contig sequence to scan (sliceable, ``str()``-able).
        ref_ids: Identifier written in the first column.
        query_seq: Motif to look for, as a string.
        output: Writable file-like object.
    """
    query = Seq(query_seq, IUPAC.unambiguous_dna)
    query_reg = str(query)
    # NOTE(review): the minus strand is searched with the plain complement,
    # not the reverse complement - confirm this is the intended orientation.
    query_comp = str(query.complement())
    width = len(query)

    # "+ 1" so that the window ending on the final base is tested as well.
    for i in range(len(reference) - width + 1):
        testseq = str(reference[i:i + width])
        if testseq == query_reg:
            strand = "+"
        elif testseq == query_comp:
            strand = "-"
        else:
            continue
        line = str(ref_ids) + "\t" + str(i + 1) + "\t" + strand + "\n"
        output.write(line)
        print(line)
def complement_sequences(self):
    """Store the complement of every loaded sequence via ``add_result``.

    Sequence names and contents are refreshed first by calling
    ``self.get_seq_names_and_contents()``.
    """
    self.get_seq_names_and_contents()
    for name, content in zip(self.seq_names, self.seq_contents):
        complemented = Seq(content, IUPAC.unambiguous_dna).complement()
        add_result(self, name, str(complemented))
def translate(seq):
    """Translate a nucleotide string, falling back to its complement.

    Args:
        seq: Nucleotide sequence as a string.

    Returns:
        The translated Seq of ``seq``; if that raises ``ValueError``, the
        translation of its complement; ``['None']`` if both fail.
    """
    seq = Seq(seq)
    try:
        return seq.translate()
    except ValueError:
        pass
    try:
        # The complement has to be the object being translated: handing it
        # to translate() as an argument makes it the codon-table parameter.
        return seq.complement().translate()
    except ValueError:
        return ['None']
def selfDimerizeTest(primer):
    """Check a primer against itself for self-dimerization.

    A plain string is first wrapped in a generic-DNA Seq. The reversed
    primer and its complement are handed to ``PLA_Seq.calcPhaseMatch``,
    whose result is returned unchanged.
    """
    if not isinstance(primer, Seq):
        primer = Seq(primer, generic_dna)
    return PLA_Seq.calcPhaseMatch(primer[::-1], primer.complement())
def manage_dna(data):
    """Derive all DNA/RNA/protein forms of ``data.sequence``.

    Builds a ``Processed_dna_rna`` holding the coding DNA, its complement
    and reverse complement, the mRNA of the strand and of its complement,
    and the protein translated with ``data.translation_table`` (complete
    and cut at the first stop). The result of
    ``Sequencer.extract_sequence_data`` on that object is returned.
    """
    sequence = Seq(data.sequence, IUPAC.unambiguous_dna)
    table = data.translation_table
    complement = sequence.complement()

    fields = {
        'creation_date': data.creation_date.strftime("%d/%m/%Y, %H:%M:%S"),
        'translation_table': table,
        'coding_dna': str(sequence),
        'dna_c': str(complement),
        'dna_rc': str(sequence.reverse_complement()),
        'rna_m': str(sequence.transcribe()),
        'rna_m_c': str(complement.transcribe()),
        'protein': str(sequence.translate(table=table)),
        'protein_to_stop': str(sequence.translate(table=table, to_stop=True)),
    }
    return Sequencer.extract_sequence_data(Processed_dna_rna(**fields))
def get_alpha(g, request):
    """Return a JSON letter -> complement table for ambiguous DNA.

    Every letter of the IUPAC ambiguous DNA alphabet is mapped to its
    complement, once in lower and once in upper case. Neither ``g`` nor
    ``request`` is used.
    """
    # Ambiguous DNA is assumed.
    letters = Seq(IUPAC.IUPACAmbiguousDNA.letters, IUPAC.IUPACAmbiguousDNA())
    data = {}
    for base, partner in zip(letters, letters.complement()):
        data[base.lower()] = partner.lower()
        data[base.upper()] = partner.upper()
    return JsonResponse(data)
def in_silico_pcr(Primers_Tm_GC, fasta_seq):
    """Simulate the PCR product of each primer pair on a template.

    The left primer is searched as given; the right primer is searched as
    its reverse complement. The product spans from the first left-primer
    hit to the end of the first right-primer hit.

    Args:
        Primers_Tm_GC: Iterable of entries whose first element is a
            ``(left_primer, right_primer)`` pair. The whole entry is echoed
            back in the result.
        fasta_seq: Template sequence as a string.

    Returns:
        List of ``[entry, "<n> bp", amplified]`` where ``amplified`` is the
        product, a row of ``|`` and the product's complement joined by
        newlines. Entries with a primer that is absent from the template
        are skipped.
    """
    product_list = []
    for data in Primers_Tm_GC:
        left, right = data[0][0], data[0][1]
        start = fasta_seq.find(left)
        right_site = fasta_seq.find(str(Seq(right).reverse_complement()))
        if start == -1 or right_site == -1:
            # str.find() reports "absent" as -1; using that as a coordinate
            # would fabricate a product from unrelated template positions.
            continue

        end = right_site + len(right)
        distance = end - start
        product = fasta_seq[start:end]
        complement_product = str(Seq(product).complement())
        # One '|' per product base, drawn between the two strands.
        amplified = product + '\n' + '|' * distance + '\n' + complement_product
        product_list.append([data, str(distance) + ' bp', amplified])
    return product_list
def comp(self):
    '''Return a new Primer holding the complement of this primer.

    The sequence is complemented through a Biopython Seq object; strand
    and location are carried over unchanged.'''
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC

    complemented = Seq(self.seq, IUPAC.unambiguous_dna).complement()
    return Primer(str(complemented), self.strand, self.location)
def find_palindromes(seq, size):
    """Find the reverse-complement palindromes of one length in ``seq``.

    Args:
        seq: DNA sequence as a plain string.
        size: Window length to test.

    Returns:
        ``{start_index: substring}`` for every window whose reverse equals
        its complement.
    """
    # Complementing works base by base, so do it once for the whole sequence
    # and slice the result instead of building a Seq for every window.
    # str() is used because Seq.tostring() is gone from current Biopython.
    complement = str(Seq(seq, generic_dna).complement())
    palindromeDict = {}
    for i in range(len(seq) - size + 1):
        sub_seq = seq[i:i + size]
        if sub_seq[::-1] == complement[i:i + size]:
            palindromeDict[i] = sub_seq
    return palindromeDict
def find_palindromes(seq, size):
    """Find the reverse-complement palindromes of one length in ``seq``.

    Args:
        seq: DNA sequence as a plain string.
        size: Window length to test.

    Returns:
        ``{start_index: substring}`` for every window whose reverse equals
        its complement.
    """
    # Complementing works base by base, so do it once for the whole sequence
    # and slice the result instead of building a Seq for every window.
    # str() is used because Seq.tostring() is gone from current Biopython.
    complement = str(Seq(seq, generic_dna).complement())
    palindromeDict = {}
    for i in range(len(seq) - size + 1):
        sub_seq = seq[i:i + size]
        if sub_seq[::-1] == complement[i:i + size]:
            palindromeDict[i] = sub_seq
    return palindromeDict
def _self_match(primer):
    """Return (reversed primer, complement, longest match, matching blocks)."""
    reversed_primer = primer[::-1]
    complement = primer.complement()
    # One matcher serves both queries.
    matcher = SequenceMatcher(a=reversed_primer, b=complement)
    longest = matcher.find_longest_match(0, len(reversed_primer), 0, len(complement))
    return reversed_primer, complement, longest, matcher.get_matching_blocks()


def chkSelfDimerization(all_seq):
    """Keep the entries whose primers show no long self-complementary run.

    For the forward and the reverse primer of every entry, the reversed
    primer is compared with the primer's complement. An entry is dropped
    when the longest common run of either primer exceeds 3 bases. Detailed
    diagnostics are printed along the way.

    Args:
        all_seq: Sequence of entries shaped
            ``(anything, forward_primer, reverse_primer, *rest)`` with the
            primers given as strings.

    Returns:
        List with the entries of ``all_seq`` that passed.
    """
    filtered_seq = []
    for index, (_, forwP, revP, *_rest) in enumerate(all_seq):
        forwP = Seq(forwP, generic_dna)
        revP = Seq(revP, generic_dna)

        print(index)
        # str() is used because Seq.tostring() is gone from current Biopython.
        print(f"Forward Primer: {str(forwP)}")
        print(f"Reverse Primer: {str(revP)}")

        forwP_Rev, forwP_Com, match_ForwP, match_forwP_block = _self_match(forwP)
        print(match_ForwP)
        print(match_forwP_block)
        print(forwP_Rev[match_ForwP.a:match_ForwP.a + match_ForwP.size],
              forwP_Com[match_ForwP.b:match_ForwP.b + match_ForwP.size], sep='\n')
        print("Forw_Rev: ", forwP_Rev, "Complement: ", forwP_Com, sep='\n', end='\n\n')

        revP_Rev, revP_Com, match_RevP, match_RevP_block = _self_match(revP)
        print("Reverse Primer")
        print(match_RevP)
        print(match_RevP_block)
        print(revP_Rev[match_RevP.a:match_RevP.a + match_RevP.size],
              revP_Com[match_RevP.b:match_RevP.b + match_RevP.size], sep='\n')
        print("RevP_Rev: ", revP_Rev, "Complement: ", revP_Com, sep='\n', end='\n\n')

        if match_ForwP.size > 3 or match_RevP.size > 3:
            continue
        print(f' Adding index: {index}', end='\n\n')
        filtered_seq.append(all_seq[index])

    print(filtered_seq)
    print(len(filtered_seq), end='\n\n')
    return filtered_seq
def mapGenoToPheno(self, genotype):
    """Describe, per known trait, what the given genotype means.

    For every trait in ``self.possibleTraitsList`` the genotype call for
    the trait's rsid is stored in ``self.traits`` and looked up in the
    trait's allele table, trying in order: the call as given, the call
    with its alleles flipped, the complemented call, and the complemented
    call flipped.

    Args:
        genotype: Mapping from rsid to genotype string.

    Returns:
        List with one dict per trait. A hit has the keys ``rsid``,
        ``genotype``, ``description``, ``flipped`` and ``revComp``; a miss
        only has ``rsid`` and ``description``.
    """
    results = []
    for trait in self.possibleTraitsList:
        rsid = trait.rsid
        # ``in`` replaces dict.has_key(), which Python 3 no longer provides.
        if rsid not in genotype:
            results.append(dict(rsid=rsid, description="NOT FOUND"))
            continue

        call = genotype[rsid]
        self.traits[rsid] = call
        alleles = trait.alleles
        if call in alleles:
            results.append(dict(rsid=rsid, genotype=call,
                                description=alleles[call],
                                flipped=False, revComp=False))
            continue
        flipped = call[::-1]
        if flipped in alleles:
            results.append(dict(rsid=rsid, genotype=flipped,
                                description=alleles[flipped],
                                flipped=True, revComp=False))
            continue

        # Neither order matched: retry on the opposite strand.
        rev = str(Seq(call, generic_dna).complement())
        if rev in alleles:
            results.append(dict(rsid=rsid, genotype=rev,
                                description=alleles[rev],
                                flipped=False, revComp=True))
        elif rev[::-1] in alleles:
            results.append(dict(rsid=rsid, genotype=rev[::-1],
                                description=alleles[rev[::-1]],
                                flipped=True, revComp=True))
        else:
            results.append(dict(rsid=rsid,
                                description="NOT FOUND w/ genotype " + call
                                + " and rev comp " + rev))
    return results
def excelWithGenomicPositions(inputFile, outputFile, columnWithcDNAPos, parentDict, chromosome, strand=1):
    """Rewrite a comma-separated export as tab-separated genomic positions.

    A line with a field containing ``columnWithcDNAPos`` is handled as the
    header: that field becomes ``start``, an ``end`` field is inserted
    after it and the field before it becomes ``chr``. On the other lines
    (once the column is known) the cDNA position found in that column is
    mapped with ``get_key_from_value`` and ``cDNA_to_genomic``; the result
    is written as start and end, and ``chromosome`` goes into the field
    before them. With a negative ``strand`` the two fields following
    ``end`` (presumably the reference and variant alleles) are
    complemented. Each output line holds the fields ``startCol-1`` to
    ``startCol+3`` followed by ``startCol+6`` to ``startCol+8``.

    Args:
        inputFile: Path of the comma-separated input.
        outputFile: Path of the file to write.
        columnWithcDNAPos: Text identifying the header field of the cDNA
            position column.
        parentDict: Lookup handed to ``get_key_from_value``.
        chromosome: Value written into the ``chr`` column.
        strand: Strand sign; negative values trigger complementing.
    """
    # Plain 'r': Python 3 text mode already normalises newlines and the old
    # 'U' flag is rejected by Python 3.11+. ``with`` closes both files even
    # when a line fails to convert.
    with open(inputFile, 'r') as fin, open(outputFile, 'w') as fout:
        startCol = 0
        for line in fin:
            text_tokens = line.split(',')
            # The range is fixed before the loop; the insert below only
            # shifts fields that have no reason to match again.
            for i in range(len(text_tokens)):
                if columnWithcDNAPos in text_tokens[i]:
                    startCol = i
                    text_tokens[startCol] = "start"
                    text_tokens.insert(startCol + 1, "end")
                    text_tokens[startCol - 1] = "chr"

            if startCol != 0 and text_tokens[startCol] != "start":
                cDNApos = int(text_tokens[startCol])
                results = get_key_from_value(parentDict, cDNApos)
                newValue = cDNA_to_genomic(results, strand)
                text_tokens[startCol] = newValue
                text_tokens.insert(startCol + 1, newValue)
                text_tokens[startCol - 1] = chromosome
                if strand < 0:
                    for column in (startCol + 2, startCol + 3):
                        text_tokens[column] = str(Seq(text_tokens[column]).complement())

            # The list reprs are flattened into one tab-separated line.
            selected = (str(text_tokens[startCol - 1:startCol + 4]) + ","
                        + str(text_tokens[startCol + 6:startCol + 9]))
            newLine = selected.replace('[', '').replace(']', '').replace("'", "").replace(',', '\t')
            fout.write(newLine + ' \n')


# Example calls:
#excelWithGenomicPositions("LOVD_BRCA1_12.2.13.csv", "LOVD_BRCA1_12.2.13B.vcf", "BIC DNA change", brca1Dict, 17, -1)
#excelWithGenomicPositions("LOVD_BRCA2_12.10.13.csv", "LOVD_BRCA2_12.10.13B.vcf", "BIC DNA change", brca2_dictIARC, 13)
def extract_fragment(args):
    """Print one record, or a slice of it, from a FASTA file.

    The header is printed first (with the requested range and orientation
    appended). Without ``reverse``/``complement`` the sequence is printed
    line by line as it is read; otherwise the pieces are collected,
    transformed as a whole and printed once at the end.

    Args:
        args: Mapping with the keys
            ``fastafile`` - path of the FASTA file;
            ``seqid`` - first word of the header of the wanted record;
            ``startpos`` / ``endpos`` - optional 0-based, end-exclusive
            bounds (a missing or zero ``endpos`` means "to the end");
            ``reverse`` / ``complement`` - orientation switches;
            ``countGaps`` - when false, ``-`` characters are removed
            before positions are counted.
    """
    seqid = args['seqid']
    startpos = args.get('startpos', 0)
    endpos = args.get('endpos', False)
    # Without an upper bound no position may ever count as "past the end".
    bounded = bool(endpos)
    transform = args['reverse'] or args['complement']

    pos = 0
    found = False
    pieces = []
    with open(args['fastafile']) as fo:
        for line in fo:
            line = line.rstrip()
            if line.startswith('>'):
                words = line[1:].split()
                fid = words[0] if words else ''
                if fid == seqid:
                    found = True
                    out = '>' + fid
                    if 'startpos' in args or 'endpos' in args:
                        out += " %s:%s" % (startpos, endpos)
                    if args['reverse']:
                        out += " reverse"
                    if args['complement']:
                        out += " complement"
                    print(out)
                elif found:
                    # Next record reached: the wanted one is complete.
                    break
                continue
            if not found:
                continue

            if not args['countGaps']:
                line = line.replace('-', '')
            if bounded and pos >= endpos:
                break
            line_end = pos + len(line)
            if line_end <= startpos:
                # The whole line lies before the requested range.
                pos = line_end
                continue
            # Both cuts are relative to the start of THIS line, so they are
            # applied together; cutting the end from an already shortened
            # string would shift it by the start offset.
            lo = max(startpos - pos, 0)
            hi = endpos - pos if bounded and endpos < line_end else len(line)
            out = line[lo:hi]
            if transform:
                pieces.append(out)
            else:
                print(out)
            pos = line_end

    if transform:
        seq = Seq("".join(pieces), IUPAC.unambiguous_dna)
        if args['reverse'] and args['complement']:
            seq = seq.reverse_complement()
        elif args['complement']:
            seq = seq.complement()
        else:
            seq = seq[::-1]
        print(str(seq))
def match(line1, line2, width):
    """Pair the entries of two ``slyce`` tables whose keys correspond.

    Both lines are cut with ``slyce(line, width)``. For every key of the
    first table, the second table is probed with the key itself, its
    complement, its reverse and its reverse complement (in that order).
    For each probe that exists, every entry stored under the first key is
    paired with every entry stored under the probed key.

    Returns:
        ``(l1, l2)``: two parallel lists holding the paired entries of the
        first and the second table.
    """
    from Bio.Seq import Seq

    l1 = []
    l2 = []
    d1 = slyce(line1, width)
    d2 = slyce(line2, width)
    for key in d1:
        seq = Seq(key)
        probes = (
            key,                          # identical
            str(seq.complement()),        # complementary
            key[::-1],                    # reversed
            str(seq[::-1].complement()),  # reverse complementary
        )
        for probe in probes:
            if probe not in d2:
                continue
            for j in range(len(d1[key])):
                for k in range(len(d2[probe])):
                    l1.append(d1[key][j])
                    l2.append(d2[probe][k])
    return l1, l2
def transcriptionSeq():
    """Print a coding strand, its complement, its mRNA and the DNA
    obtained by transcribing the mRNA back."""
    coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", IUPAC.unambiguous_dna)
    complement_dna = coding_dna.complement()
    print('coding_dna = ', coding_dna)
    print('complement_dna = ', complement_dna)

    messenger_rna = coding_dna.transcribe()
    print('messenger_rna = ', messenger_rna)

    back = messenger_rna.back_transcribe()
    print('back = ', back)
def OrthogonalityTest(seq_1, seq_2, limit=3):
    """Check two sequences against each other via ``PLA_Seq.calcPhaseMatch``.

    Two plain strings are wrapped in generic-DNA Seq objects; two Seq
    objects are used as they are. The reversed first sequence and the
    complement of the second one are passed on together with ``limit``.

    Raises:
        AssertionError: If the arguments are not both str or both Seq.
    """
    both_str = isinstance(seq_1, str) and isinstance(seq_2, str)
    if both_str:
        seq_1, seq_2 = Seq(seq_1, generic_dna), Seq(seq_2, generic_dna)
    else:
        both_seq = isinstance(seq_1, Seq) and isinstance(seq_2, Seq)
        if not both_seq:
            raise AssertionError(
                'Seq 1 and Seq 2 must both be either a str or Seq object.')
    return PLA_Seq.calcPhaseMatch(seq_1[::-1], seq_2.complement(), limit)
def comp(self):
    '''Return a new Oligo holding the complement of this oligo.

    The sequence is complemented through a Biopython Seq object. The
    strand label is switched: 'coding'/'Coding' becomes 'Template', any
    other label becomes 'Coding'. Location and enzyme are carried over.'''
    from Bio.Seq import Seq
    from Bio.Alphabet import IUPAC

    complemented = str(Seq(self.seq, IUPAC.unambiguous_dna).complement())
    opposite = 'Template' if self.strand in ('coding', 'Coding') else 'Coding'
    return Oligo(complemented, opposite, self.loc, self.enz)
def make_reads(df_source, max_length, update_product_id, coverage,
               pos_coverage, neg_coverage, class_column):
    """Sample fixed-length reads in four orientations from every sequence.

    For a source row with a sequence of length ``L``,
    ``(L // max_length) * 4 * cov`` random start positions are drawn. The
    reads cycle through forward, reverse, complement and reverse
    complement. Every read is a copy of its source row with ``sequence``
    replaced (and ``product_id`` suffixed with the read number when
    ``update_product_id`` is set).

    Args:
        df_source: DataFrame with a ``sequence`` column (and ``product_id``
            when ``update_product_id`` is true).
        max_length: Read length.
        update_product_id: Whether to make ``product_id`` unique per read.
        coverage: General coverage multiplier; 0 means 1.
        pos_coverage: Multiplier for rows whose class is 1; 0 disables it.
        neg_coverage: Multiplier for rows whose class is 0; 0 disables it.
        class_column: Name of the column holding the class label.

    Returns:
        DataFrame of all reads with the index reset.
    """
    # Empty frame with the columns of the source, used as the concat base.
    df_reads = pd.DataFrame().reindex_like(df_source)
    df_reads.drop(df_reads.index, inplace=True)

    reads = []
    count = 0
    # iterrows() instead of apply: apply did not reduce properly in pandas 0.24.
    for _, row in df_source.iterrows():
        seq = row['sequence']
        if update_product_id:
            product_id = str(row['product_id'])
        n = int(len(seq) / max_length)

        # A non-zero general coverage applies unless a class-specific one does.
        cov = coverage if coverage != 0 else 1
        if pos_coverage != 0 and row[class_column] == 1:
            cov = pos_coverage
        elif neg_coverage != 0 and row[class_column] == 0:
            cov = neg_coverage

        # 4 = forward, reverse, complement, reverse complement.
        # "+ 1": the last full window starts at len(seq) - max_length.
        starts = random.choices(range(len(seq) - max_length + 1), k=n * 4 * cov)
        for i, start in enumerate(starts):
            # Copy the row: mutating the shared Series would make every read
            # of this source row end up with the values of the last one.
            read_row = row.copy()
            if update_product_id:
                read_row['product_id'] = product_id + '_' + str(i)
            read = str(seq[start:start + max_length])
            orientation = i % 4
            if orientation == 0:
                read_row['sequence'] = read
            elif orientation == 1:
                read_row['sequence'] = read[::-1]
            elif orientation == 2:
                read_row['sequence'] = str(Seq(read).complement())
            else:
                read_row['sequence'] = str(Seq(read).reverse_complement())
            reads.append(read_row)
            count += 1
            if count % 1000000 == 0:
                print(f"{count} reads generated")

    print(f"assembling data frame from {count} reads")
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    new_reads = pd.DataFrame(reads, columns=df_source.columns)
    return pd.concat([df_reads, new_reads]).reset_index()
def fetch_seq(self, chromosome, start, end, reverse=True, complement=True):
    """Return a region of a chromosome in the requested orientation.

    Args:
        chromosome: 1-based chromosome number, used to index ``self.hg``.
        start: 1-based, inclusive start coordinate.
        end: 1-based, inclusive end coordinate.
        reverse: ``True`` to reverse the region.
        complement: ``True`` to complement the region.

    Returns:
        The region reversed, complemented, reverse-complemented or
        unchanged. The flags are compared by identity, so any value other
        than the literal ``True``/``False`` combinations returns the
        region unchanged.

    Raises:
        ValueError: If ``start`` or ``end`` is not an int.
    """
    if not (isinstance(start, int) and isinstance(end, int)):
        raise ValueError("Start and End coordinates must be integers.")

    region = self.hg[chromosome - 1][start - 1:end].seq
    if complement is True:
        if reverse is True:
            return Seq(region).reverse_complement()
        if reverse is False:
            return Seq(region).complement()
    elif complement is False and reverse is True:
        return region[::-1]
    return region
def translate(sequence):
    """Describe every reading frame that starts at an AUG of the mRNA.

    The DNA is transcribed; for each ``AUG`` in the transcript the dict
    returned by ``make_codon(mRNA, position)`` is extended with the DNA,
    its complement (``cDNA``) and the mRNA. Without any ``AUG`` a single
    placeholder entry with zeroed/empty fields is returned.

    Returns:
        List of dicts, one per start codon (or the single placeholder).
    """
    DNA_seq = Seq(sequence, IUPAC.ambiguous_dna)
    shared = {
        'DNA': str(DNA_seq),
        'cDNA': str(DNA_seq.complement()),
        'mRNA': str(DNA_seq.transcribe()),
    }
    read_seq = shared['mRNA']

    start_positions = [hit.start() for hit in re.finditer(r"AUG", read_seq)]
    if not start_positions:
        placeholder = {'STT': 0, 'STP': 0, 'LEN': 0, 'codon': '', 'protain': ''}
        placeholder.update(shared)
        return [placeholder]

    entries = []
    for position in start_positions:
        entry = make_codon(read_seq, position)
        entry.update(shared)
        entries.append(entry)
    return entries
def orfs(data):
    """Print every ORF as a ``>name`` line followed by its translation.

    Args:
        data: Pair-like object. ``data[0][2]`` is the sequence the ORFs
            refer to; ``data[1]`` is an iterable of rows
            ``(name, start, stop)`` with coordinates that are shifted down
            by one before slicing. A stop before the start marks an ORF on
            the opposite strand.
    """
    for row in data[1]:
        start = row[1] - 1
        stop = row[2] - 1
        print('>' + row[0])
        if int(stop) < int(start):
            # Slicing backwards and complementing gives the reverse complement.
            backwards = data[0][2][start:stop:-1]
            orfseq = str(Seq(backwards, generic_dna).complement())
        else:
            orfseq = data[0][2][start:stop]
        print(translatedna(orfseq))
def extract_fragment(args):
    """Print one record, or a slice of it, from a FASTA file.

    The header is printed first (with the requested range and orientation
    appended). Without ``reverse``/``complement`` the sequence is printed
    line by line as it is read; otherwise the pieces are collected,
    transformed as a whole and printed once at the end.

    Args:
        args: Mapping with the keys
            ``fastafile`` - path of the FASTA file;
            ``seqid`` - first word of the header of the wanted record;
            ``startpos`` / ``endpos`` - optional 0-based, end-exclusive
            bounds (a missing or zero ``endpos`` means "to the end");
            ``reverse`` / ``complement`` - orientation switches;
            ``countGaps`` - when false, ``-`` characters are removed
            before positions are counted.
    """
    seqid = args['seqid']
    startpos = args.get('startpos', 0)
    endpos = args.get('endpos', False)
    # Without an upper bound no position may ever count as "past the end".
    bounded = bool(endpos)
    transform = args['reverse'] or args['complement']

    pos = 0
    found = False
    pieces = []
    with open(args['fastafile']) as fo:
        for line in fo:
            line = line.rstrip()
            if line.startswith('>'):
                words = line[1:].split()
                fid = words[0] if words else ''
                if fid == seqid:
                    found = True
                    out = '>' + fid
                    if 'startpos' in args or 'endpos' in args:
                        out += " %s:%s" % (startpos, endpos)
                    if args['reverse']:
                        out += " reverse"
                    if args['complement']:
                        out += " complement"
                    print(out)
                elif found:
                    # Next record reached: the wanted one is complete.
                    break
                continue
            if not found:
                continue

            if not args['countGaps']:
                line = line.replace('-', '')
            if bounded and pos >= endpos:
                break
            line_end = pos + len(line)
            if line_end <= startpos:
                # The whole line lies before the requested range.
                pos = line_end
                continue
            # Both cuts are relative to the start of THIS line, so they are
            # applied together; cutting the end from an already shortened
            # string would shift it by the start offset.
            lo = max(startpos - pos, 0)
            hi = endpos - pos if bounded and endpos < line_end else len(line)
            out = line[lo:hi]
            if transform:
                pieces.append(out)
            else:
                print(out)
            pos = line_end

    if transform:
        seq = Seq("".join(pieces), IUPAC.unambiguous_dna)
        if args['reverse'] and args['complement']:
            seq = seq.reverse_complement()
        elif args['complement']:
            seq = seq.complement()
        else:
            seq = seq[::-1]
        print(str(seq))
def get_seq(g, cutof=0.95, rare_filter=0.05):
    """Summarise the within-sample variation of gene ``g``.

    The gene's positions are read from the module-level ``pos_dict`` and
    ``nucl_compo`` tables. Positions whose most frequent base stays below
    ``cutof`` (and that have coverage) are variant positions; every
    alternative base more frequent than ``rare_filter`` is classified as
    synonymous or non-synonymous by substituting it into its codon.

    Args:
        g: Gene key in ``pos_dict``.
        cutof: Major-allele frequency below which a position is variant.
        rare_filter: Minimum frequency for an alternative base to count.

    Returns:
        Dict with ``rate`` (mean major-allele frequency), ``syn``, ``non``
        (substitution counts), ``stop`` (a substitution creates a stop
        codon), ``snp_freq`` (substitutions per base) and ``len``.
    """
    vec = ['fA', 'fT', 'fG', 'fC']
    comp = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}

    # Work on a copy: adding the stop codons to Biopython's own table would
    # change it for every other user of the standard table.
    codon_table = dict(standard_dna_table.forward_table)
    for stop_codon in standard_dna_table.stop_codons:
        codon_table[stop_codon] = "*"

    start = pos_dict[g]['start'] - 1
    end = pos_dict[g]['end']
    contig = pos_dict[g]['contig']
    sense = pos_dict[g]['sense']

    dat = nucl_compo[contig][start:end]
    if sense == "-":
        dat = dat[::-1]
    str_seq = "".join([v['base'] for v in dat])
    max_freq = [max([v['fA'], v['fT'], v['fG'], v['fC']]) for v in dat if v['base'] != 'X']
    rate = mean(max_freq)

    seq = Seq(str_seq)
    if sense == "-":
        # dat was reversed above, so complementing completes the reverse complement.
        seq = seq.complement()

    # NOTE(review): max_freq skips 'X' positions, so its indices line up
    # with dat only when the gene contains no 'X' - confirm with the data.
    variants = []
    for i, freq in enumerate(max_freq):
        if freq < cutof and dat[i]['coverage'] != 0:
            base = dat[i]['base']
            alternatives = [name.split("f")[1] for name in vec
                            if base != 'X' and dat[i][name] > rare_filter and base not in name]
            variants.append((i, base, alternatives))
    if sense == "-":
        variants = [(i, comp[base], [comp[alt] for alt in alts])
                    for i, base, alts in variants]

    codons = re.findall('...', str(seq))
    syn = 0
    non = 0
    stop = False
    for i, _base, alternatives in variants:
        # Floor division and modulo keep codon index and offset integral.
        codon = codons[i // 3]
        offset = i % 3
        if codon not in codon_table:
            continue
        aa = codon_table[codon]
        for alt in alternatives:
            mutated = list(codon)
            mutated[offset] = alt
            mutated_aa = codon_table["".join(mutated)]
            if mutated_aa == aa:
                syn += 1
            else:
                non += 1
                # NOTE(review): only non-synonymous changes are checked for
                # a new stop codon - confirm stop-to-stop changes should not
                # set the flag.
                if mutated_aa == "*":
                    stop = True

    return {"rate": rate, "syn": syn, "non": non, "stop": stop,
            "snp_freq": float(syn + non) / float(end - start),
            "len": end - start}
def Orthogonality_Test(seq_1, seq_2, limit=4):
    """Tell whether two sequences are orthogonal (unlikely to hybridise).

    The reversed first sequence is aligned against the complement of the
    second one. Starting from their longest common run, all matching
    blocks on the same diagonal (same offset between the two sequences)
    are summed up; the pair passes when that sum does not exceed
    ``limit``. Diagnostics are printed along the way.

    Args:
        seq_1: First sequence, str or Seq.
        seq_2: Second sequence, str or Seq.
        limit: Largest accepted in-phase match size.

    Returns:
        ``True`` if the in-phase match size is at most ``limit``.
    """
    if not isinstance(seq_1, Seq):
        seq_1 = Seq(seq_1, generic_dna)
    if not isinstance(seq_2, Seq):
        seq_2 = Seq(seq_2, generic_dna)

    print(f"Checking orthogonality between:\n{seq_1} and \n{seq_2}")
    seq_1_Rev = seq_1[::-1]
    seq_2_Com = seq_2.complement()

    matcher = SequenceMatcher(a=seq_1_Rev, b=seq_2_Com)
    # Each sequence is searched over its own full length; a shared bound
    # either truncates the longer one or runs past the shorter one.
    longest = matcher.find_longest_match(0, len(seq_1_Rev), 0, len(seq_2_Com))
    all_match = matcher.get_matching_blocks()
    print(longest)

    step = longest.b - longest.a
    print("step: ", step)
    print(all_match)

    # Blocks on the same diagonal as the longest match are "in phase".
    phase_match_size = sum(block.size for block in all_match
                           if block.b - block.a == step)
    print("phase match: ", phase_match_size)
    return phase_match_size <= limit
def Oligo(target_dna):
    '''Return basic properties of an oligo as a dict.

    Keys: GC_contents (formatted percentage), Tm_value (Wallace rule),
    Complement_seq, Reverse_complement_seq and Length_of_oligo (as str).
    '''
    dna = Seq(target_dna)  # Biopython sequence type
    return {
        'GC_contents': '{:.2f} %'.format(GC(dna)),
        'Tm_value': MeltingTemp.Tm_Wallace(dna),
        'Complement_seq': str(dna.complement()),
        'Reverse_complement_seq': str(dna.reverse_complement()),
        'Length_of_oligo': str(len(dna)),
    }
def mapGenoToPheno(self, genotype):
    """Print, per known trait, what the given genotype means.

    For every trait in ``self.possibleTraitsList`` the genotype call for
    the trait's rsid is stored in ``self.traits`` and looked up in the
    trait's allele table, first as given and then complemented.

    Args:
        genotype: Mapping from rsid to genotype string.
    """
    for trait in self.possibleTraitsList:
        rsid = trait.rsid
        # ``in`` and print() replace dict.has_key() and the print
        # statement, neither of which exists in Python 3.
        if rsid not in genotype:
            print(rsid, " - unavailable")
            continue

        call = genotype[rsid]
        self.traits[rsid] = call
        if call in trait.alleles:
            print(rsid, " - ", call, " - ", trait.alleles[call])
            continue

        # No direct hit: retry on the opposite strand.
        rev = str(Seq(call, generic_dna).complement())
        if rev in trait.alleles:
            print(rsid, " - ", rev, " (rev comp) -", trait.alleles[rev])
        else:
            print("genotype ", call, "and rev comp ", rev,
                  " not found in traits for ", rsid)
def simple():
    """Print the complement, then the reverse complement, of a fixed sequence."""
    dna = Seq("AGTACACTGGT")
    # Call and print one strand at a time, in the original order.
    for make_strand in (dna.complement, dna.reverse_complement):
        print(make_strand())
def dna2complement( seq ):
    """Return the complement of a DNA sequence as a plain string.

    Args:
        seq: The DNA sequence; it is converted with ``str()`` before being
            wrapped in a ``Seq``, so strings and ``Seq`` objects both work.

    Returns:
        The complemented (not reversed) sequence as a string.
    """
    # str() replaces the deprecated Seq.tostring() method and yields the
    # same text.
    return str(Seq(str(seq)).complement())
"""
Create a DNA sequence and print its complement and its reverse complement.
"""
from Bio.Seq import Seq

seq = Seq("ACCCCTATGTGACCACTG")

# Complement strand (same orientation as the input).
complement = seq.complement()
print("Imprimindo sequência complementar: ", complement)

# Reverse complement strand.
reverse_complement = seq.reverse_complement()
print("Imprimindo o reverso complementar:", reverse_complement)
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import re

sequence = Seq("ATGtcccacta",IUPAC.unambiguous_dna)
print(sequence)

# 4: complementary strand
complement = sequence.complement()
print(complement)
# [::-1] steps through the whole sequence backwards (start and end left
# at their defaults, step -1), i.e. it reverses the complement ...
print(complement[::-1])
# ... which gives the same result as reverse_complement().
print(sequence.reverse_complement())

# Wrap a sequence in a SeqRecord carrying an id, a name and a description.
simple_seq = Seq("TCTGTGCTAAAGTGTAACTCGTAGGCACTATCTAC")
simple_seq_r = SeqRecord(
    simple_seq,
    id="AC834343",
    name="seqX",
    description="H**o erectus, chr25",
)
print(simple_seq_r)

# Read a single FASTA record from disk and show its metadata.
record = SeqIO.read("/Users/Kozel/Documents/UJ/Biotechnologia molekuarna/3 semestr/Bioinformatyka/Kody/hemoglobin.txt","fasta")
for field in (record.id, record.name, record.description, record.annotations):
    print(field)
#print(record.seq)

## Regular expressions
# re.match(pattern, string)  - looks only at the beginning of the string
# re.search(pattern, string) - looks through the whole string
print(re.search("C.T","ATCATGGC"))
class SeqLine:
    """A numbered line of unambiguous DNA together with its complement."""

    def __init__(self,_number,_seq):
        """Store the line number, the sequence and its complement.

        Args:
            _number: Identifier of the line; stored unchanged as ``number``.
            _seq: Sequence text; wrapped in a ``Seq`` with the
                ``IUPACUnambiguousDNA`` alphabet and stored as ``seq``.
        """
        forward = Seq(_seq, IUPAC.IUPACUnambiguousDNA())
        self.number = _number
        self.seq = forward
        # Complement of ``seq`` (complement(), i.e. not reversed).
        self.rseq = forward.complement()
        # Starts empty; nothing is added to it here.
        self.features = []
#-1 my_dna.count("GG") #note that count is non-overlapping "AAAAAAA".count("AA") """ BioPython has several built-in functions for biological applications: complement, reverse complement, translation, back translation """ #from Bio.Seq import Seq #from Bio.Alphabet import generic_dna #my_dna = Seq("AGTACACTGGT", generic_dna) print my_dna my_dna.complement() #Seq('TCATGTGACCA', DNAAlphabet()) my_dna.reverse_complement() #Seq('ACCAGTGTACT', DNAAlphabet()) my_dna.transcribe()
def double_digest(sequence, id, r_renz, r_cut_pos, c_renz, c_cut_pos, low, up, output):
    """Report pairs of rare/common restriction-enzyme cut positions in a contig.

    The contig is scanned for the rare-cutter motif and for its complement.
    For every hit, a window next to the rare cut is scanned for the
    common-cutter motif (or its complement).  Every pair found is written to
    ``output`` as one tab-separated line and also printed:

    * motif hit:      ``id <TAB> rare_digest <TAB> common_digest <TAB> +``
    * complement hit: ``id <TAB> common_digest <TAB> rare_digest <TAB> -``

    Positions are computed from ``i+1`` / ``j+1`` / ``k+1``, i.e. 1-based.

    Args:
        sequence: The contig; it is sliced and each slice is passed to ``str()``.
        id: Text written as the first column of every output line.
        r_renz: Recognition motif of the rare-cutting enzyme.
        r_cut_pos: Cut offset inside the rare motif (converted with ``int()``).
        c_renz: Recognition motif of the common-cutting enzyme.
        c_cut_pos: Cut offset inside the common motif (converted with ``int()``).
        low: Lower window limit relative to the rare cut (converted with ``int()``).
        up: Upper window limit relative to the rare cut (converted with ``int()``).
        output: Object with a ``write()`` method that receives the result lines.

    NOTE(review): the "opposite strand" motifs are built with complement(),
    not reverse_complement() -- confirm this is the intended search pattern.
    """
    # Read contig
    DNA = sequence

    # Handles for various restriction enzymes, cut placements, and their complements for the rare-cutting enzyme
    rare_renz = Seq(r_renz, IUPAC.unambiguous_dna)
    rare_compmotif = str(rare_renz.complement()) # must also search for complement
    rare_motif = str(rare_renz)
    rare_cut_motif = int(r_cut_pos)
    # Cut offset measured from the other end of the motif.
    rare_cut_compmotif = int(len(rare_motif))-int(r_cut_pos)

    # Handles for various restriction enzymes, cut placements, and their complements for the common-cutting enzyme
    common_renz = Seq(c_renz, IUPAC.unambiguous_dna)
    common_compmotif = str(common_renz.complement()) # must also search for complement
    common_motif = str(common_renz)
    common_cut_motif = int(c_cut_pos)
    # Cut offset measured from the other end of the motif.
    common_cut_compmotif = int(len(common_motif))-int(c_cut_pos)

    # For bp in range from 0 to length of sequence - RE motif length, iterating by 1bp
    # NOTE(review): this range ends one short of the last full-length window
    # (i == len(DNA) - len(rare_motif) is never visited) -- confirm intended.
    for i in range(0, len(DNA)-len(rare_motif), 1):
        # Rare test sequence is i + length of rare RE motif
        rare_testseq = str(DNA[i:i+len(rare_motif)])
        rare_pos = i+1
        if rare_testseq == rare_motif:
            # if rare enzyme test sequence equals the rare restriction enzyme motif, report position as position in loop + cut location in enzyme
            rare_digest = rare_pos + rare_cut_motif
            # whenever there is a rare enzyme cut, scan a window of basepairs upstream (based on lower/upper limits designed) for a common enzyme cut
            # NOTE(review): the window end subtracts len(rare_motif) although
            # the common motif is the one being slid -- confirm intended.
            for j in range(rare_digest+int(low), (rare_digest+int(up))-len(rare_motif),1):
                common_testseq = str(DNA[j:j+len(common_motif)])
                common_pos = j+1
                # if common enzyme test sequence equals the common restriction enzyme motif, report position
                if common_testseq == common_motif:
                    common_digest = common_pos + common_cut_motif
                    # Only report cuts that fall inside the contig.
                    if common_digest < len(DNA):
                        rare_j_line = id+"\t"+str(rare_digest)+"\t"+str(common_digest)+"\t+\n"
                        output.write(rare_j_line)
                        print rare_j_line
            # whenever there is a rare enzyme cut, scan a window of basepairs downstream (based on lower/upper limits designed) for a common enzyme cut
            # (disabled)
            # for k in range(rare_digest-int(up), (rare_digest-int(low))-len(rare_motif),1):
            #     common_testseq = str(DNA[k:k+len(common_motif)])
            #     common_pos = k+1
            #     if common_testseq == common_motif:
            #         common_digest = common_pos + common_cut_motif
            #         if common_digest > 0:
            #             rare_k_line = id+"\t"+str(rare_digest)+"\t"+str(common_digest)+"\n"
            #             output.write(rare_k_line)
            #             print rare_k_line
        elif rare_testseq == rare_compmotif:
            # must do the same as above but with complement sequences (for opposite strand)
            rare_digest = rare_pos + rare_cut_compmotif
            # whenever there is a rare enzyme cut, scan a window of basepairs upstream (based on lower/upper limits designed) for a common enzyme cut. This actually ends up being downstream on the strand we care about.
            # (disabled)
            # for j in range(rare_digest+int(low), (rare_digest+int(up))-len(rare_motif),1):
            #     common_testseq = str(DNA[j:j+len(common_compmotif)])
            #     common_pos = j+1
            #     if common_testseq == common_compmotif:
            #         common_digest = common_pos + common_cut_compmotif
            #         if common_digest < len(DNA):
            #             rare_j_line = id+"\t"+str(rare_digest)+"\t"+str(common_digest)+"\n"
            #             output.write(rare_j_line)
            #             print rare_j_line
            # whenever there is a rare enzyme cut, scan a window of basepairs downstream (based on lower/upper limits designed) for a common enzyme cut. This actually ends up being upstream on the strand we care about.
            # k can be negative near the start of the contig; the
            # `common_digest > 0` test below discards such positions.
            for k in range(rare_digest-int(up), (rare_digest-int(low))-len(rare_compmotif),1):
                common_testseq = str(DNA[k:k+len(common_compmotif)])
                common_pos = k+1
                if common_testseq == common_compmotif:
                    common_digest = common_pos + common_cut_compmotif
                    if common_digest > 0:
                        # Column order is swapped relative to the "+" lines:
                        # the common cut is written first.
                        rare_k_line = id+"\t"+str(common_digest)+"\t"+str(rare_digest)+"\t-\n"
                        output.write(rare_k_line)
                        print rare_k_line
from Bio.Seq import Seq # for Seq from Bio.Alphabet import IUPAC # for alphabet ## sequence with generic alphabet ## my_seq = Seq("AGTACACTGGT") print "sequence = ", my_seq print "alphabet = ", my_seq.alphabet print ## DNA sequence ## dna = Seq("ATGACACTGTAGGAA", IUPAC.unambiguous_dna) print "sequence = ", dna print "alphabet type = ", dna.alphabet print "DNA nucleotides = ", dna.alphabet.letters print # print DNA complement print "complement = ", dna.complement() print # trascribe from DNA (sense strand) to RNA rna =dna.transcribe() print "rna = ", rna print # translate from RNA to protein protein1 = rna.translate() # dna.translate() also works print "protein = ", protein1 protein = rna.translate(to_stop = True) print "protein = ", protein
#!/usr/bin/env python3
# --*-- utf-8 --*--
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Alphabet import generic_dna

# A sequence with the unambiguous-DNA alphabet, its alphabet and both
# complementary strands.
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
print(my_seq)
print(my_seq.alphabet)
print(my_seq.complement())
print(my_seq.reverse_complement())

# The same letters under two other DNA alphabets.
my_seq2 = Seq("AGTACACTGGT", IUPAC.ambiguous_dna)
my_seq3 = Seq("AGTACACTGGT", IUPAC.extended_dna)
for variant in (my_seq2, my_seq3):
    print(variant.alphabet)

# ... and declared as a protein sequence.
my_prot = Seq("AGTACACTGGT", IUPAC.protein)
print(my_prot)
print(my_prot.alphabet)

## Seq objects behave like strings: iteration, len() and indexing all work.
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))
print(len(my_seq))
first_letter, last_letter = my_seq[0], my_seq[-1]
print(first_letter)  # first letter
print(last_letter)   # last letter
from Bio.Seq import Seq

# Working with sequences: print a sequence and a few values derived from it.
my_seq = Seq("AGTACACTGGT")
print(my_seq)
print("complement: " + my_seq.complement())
print("reverse complement: " + my_seq.reverse_complement())

# Transcribe once and reuse the result for the printout.
my_rna = my_seq.transcribe()
print("transcribe: " + my_rna)
print("my_seq[2:4]: " + my_seq[2:4])

# Back-transcribe the RNA to DNA.
my_dna = my_rna.back_transcribe()
# convert to string my_seq.tostring() # concatenate sequences seq1 + seq2 # ONLY if alphabets are compatible # otherwise, convers both seq to generic alphabets from Bio.Alphabet import generic_alphabet seq1.alphabet = generic_alphabet seq2.alphabet = generic_alphabet seq1 + seq2 # sequence complement (only if alphabet allows complement) my_seq.complement() # reverse complement (only if alphabet allows complement) my_seq.reverse_complement() # transcribe RNA (DNA -> mRNA) #The actual biological transcription process works from the template strand, doing a reverse complement #(TCAG → CUGA) to give the mRNA. However, in Biopython and bioinformatics in general, we typically #work directly with the coding strand because this means we can get the mRNA sequence just by switching #T → U. from Bio.Seq import transcribe # just changes T with U from the coding strand (5' -> 3') messenger_rna = transcribe(coding_dna)
from Bio.Seq import Seq

# Build the sequence object.
my_seq = Seq('CATGTAGACTAG')

# Report its length, then each derived sequence in turn.
seq_length = len(my_seq)
print('seq %s is %i bases long' % (my_seq, seq_length))

reverse_complement = my_seq.reverse_complement()
print('reverse complement is %s' % reverse_complement)

protein = my_seq.translate()
print('protein translation is %s' % protein)

complement = my_seq.complement()
print('complement is %s' % complement)
"""
Worked examples from the official Biopython documentation, kept for reference.
"""

# Creating and printing sequences
print("\n###############\n1. 简单序列处理\n---------------")
from Bio.Seq import Seq

my_seq = Seq("AGTACACTGGT")  # build a Seq
print("my_seq:", my_seq)  # plain output
print(repr(my_seq))  # raw (repr) output
seq_alphabet = my_seq.alphabet  # sequence type
print("alphabet of my_seq:", seq_alphabet)

# Complement strands
forward_complement = my_seq.complement()
print("正向互补:", forward_complement)
reverse_complement = my_seq.reverse_complement()
print("反向互补:", reverse_complement)

# Loading sequences from external files
print("\n###############\n2. FASTA 解析示例\n---------------")
from Bio import SeqIO

fasta_records = SeqIO.parse("ls_orchid.fasta", "fasta")  # parse the FASTA file
for seq_record in fasta_records:
    print("序列名称:", seq_record.id)
    print("序列原始输出:", repr(seq_record.seq))
    print("序列长度:", len(seq_record))
    break  # only the first record is shown

# FASTA files do not specify an alphabet, so the fairly generic
# SingleLetterAlphabet() is used by default.
print("\n###############\n3. GenBank 解析示例\n---------------")
genbank_records = SeqIO.parse("ls_orchid.gbk", "genbank")  # parse the GenBank file
for seq_record in genbank_records:
    print("序列名称:", seq_record.id)
import sys
from collections import Counter

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_rna

# Read sequences from standard input, one per line, and print a short
# report for each.  Blank lines and lines starting with "#" are skipped.
for line in sys.stdin:
    line = line.rstrip()
    if not line or line.startswith("#"):
        continue

    line = line.upper()
    counts = Counter(line)

    # The line is upper-cased above, so only 'U' and 'T' can occur.
    # A missing letter counts as 0.
    n_u = counts['U']
    n_t = counts['T']

    # More U than T: treat the line as RNA, otherwise as DNA.
    if n_u > n_t:
        seq = Seq(line, generic_rna)
        seqtype = "RNA"
    else:
        seq = Seq(line, generic_dna)
        seqtype = "DNA"

    print("Parsed input as: %s" % seq)
    print("Seq. type: %s" % seqtype)
    print("Length: %d" % len(seq))
    # Scale the fraction to a percentage so the value matches the "%" sign.
    print("Composition: %s" % (' '.join(
        ["%c:%.2f%% " % (nuc, 100.0 * cnt / len(seq))
         for (nuc, cnt) in counts.items()])))
    print("Complement: %s" % seq.complement())
    print("Reverse complement: %s" % seq.reverse_complement())
    print("Translated: %s" % seq.translate())

# See http://biopython.org/wiki/Seq
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

# A generic-DNA sequence whose text contains commas.
my_dna = Seq("GTAG,GCTG,ATAC", generic_dna)
print(my_dna)

# Complement, then reverse complement, each printed on its own line.
complement = my_dna.complement()
print(complement)
reverse_complement = my_dna.reverse_complement()
print(reverse_complement)
from Bio.Alphabet import generic_rna
from Bio.Alphabet import generic_protein

# Seq and generic_dna are expected to be imported before this point.
my_dna = Seq("ATGGGGAGAAGGCCGTAG", generic_dna)
#print my_dna
#a = my_dna + 'aaa'
#print a

# find() result for each motif, then the sequence, its 'A' count and length.
for motif in ('AGG', 'AGA'):
    print(my_dna.find(motif))
print(my_dna)
print(my_dna.count('A'))
print(len(my_dna))

# Complement strand.
your_dna = my_dna.complement()
print(your_dna)

# DNA -> RNA.
my_rna = my_dna.transcribe()
print(my_rna)

# RNA -> protein.
#table = 1 is default std genetic code, http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG1
#to_stop=True tells it to stop at stops
my_protr = my_rna.translate(table=1, to_stop=True)
print(my_protr)

# Translating the DNA directly.
my_protd = my_dna.translate(to_stop=True)
print(my_protd)

#playing with complete CDS'
#yaaX = Seq("GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA" + \
# "GCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACAGATAGGCGATCGTGAT" + \
# "AATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACAT" + \