def get_sgrna(self):
    # Return a DataFrame of all candidate sgRNA sites in the genome.
    # Forward-strand sites are a 20-mer followed by an NGG PAM; the
    # reverse strand is found through its CCN complement, and the
    # 20-mer / PAM are reverse-complemented back into protospacer
    # orientation.  The result is cached on self.sgrna.
    # NOTE(review): finditer yields non-overlapping matches, so
    # adjacent/overlapping PAM sites may be skipped — confirm intended.
    if not hasattr(self, 'sgrna'):
        # 20 nt protospacer + NGG PAM, case-insensitive.
        ngg = re.compile('([atgcATGC]{20})([atgcATGC](GG|gg|Gg|gG))')
        # CCN + 20 nt: the reverse-strand mirror of NGG.
        ccn = re.compile('((CC|cc|Cc|cC)[atgcATGC])([atgcATGC]{20})')
        columns = ['seqname', 'start', 'cut', 'end', 'sgrna', 'pam']
        sgrna = list()
        for chromosome in self.genome:
            # Forward-strand hits: cut site 3 bp upstream of the PAM.
            sglist = [{
                'seqname': chromosome.id,
                'start': x.start(),
                'cut': x.end() - 6,
                'end': x.end() - 3,
                'sgrna': x.group(1),
                'pam': x.group(2)
            } for x in ngg.finditer(str(chromosome.seq))]
            # Reverse-strand hits: coordinates stay in forward-strand
            # space; sequence and PAM are reverse-complemented.
            sglist.extend({
                'seqname': chromosome.id,
                'start': x.start() + 3,
                'cut': x.start() + 6,
                'end': x.end(),
                'sgrna': Seq.reverse_complement(x.group(3)),
                'pam': Seq.reverse_complement(x.group(1))
            } for x in ccn.finditer(str(chromosome.seq)))
            sgrna.append(pd.DataFrame(sglist, columns=columns))
        self.sgrna = pd.concat(sgrna, axis=0, ignore_index=True)
    return self.sgrna
def itercodon(seq, frame, offset, table, reverse=False):
    """Yield (position, amino_acid) for successive codons of a record.

    :param seq: sequence record; the code reads ``str(seq.seq)`` —
        presumably a Biopython SeqRecord (TODO confirm).
    :param frame: 0-based offset at which to start translating.
    :param offset: number of trailing bases left over after whole codons.
    :param table: translation table passed to ``Seq.translate``.
    :param reverse: if True, translate codons of the reverse complement.
    """
    stop = 0  # NOTE(review): assigned but never used in this excerpt.
    if not reverse:
        for i in xrange(frame, len(seq) - offset, 3):
            subseq = str(seq.seq)[i:i + 3]
            assert (len(subseq) % 3 == 0), (str(seq))
            aa = Seq.translate(subseq, table)
            yield i, aa
        if i + 3 != len(seq):
            # Pad the trailing partial codon with N to a full codon.
            # NOTE(review): this slices `seq` itself rather than
            # str(seq.seq) as above, and re-yields the previous `i` —
            # looks inconsistent; confirm against callers.
            subseq = seq[i + 3:] + "N" * (3 - offset)
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
    else:
        for i in xrange(len(seq), offset, -3):
            # the reverse complement: codon ending at position i.
            subseq = Seq.reverse_complement(str(seq.seq)[i - 3:i])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
        if offset:
            # Pad the leading partial codon with N, then reverse-complement.
            subseq = Seq.reverse_complement("N" * (3 - offset) + str(seq.seq)[:offset])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
def test_reverse_complement_on_proteins(self):
    """Reverse complement must raise ValueError for protein sequences."""
    for prot in protein_seqs:
        # Both the module-level function and the Seq method must reject
        # protein input.
        self.assertRaises(ValueError, Seq.reverse_complement, prot)
        self.assertRaises(ValueError, prot.reverse_complement)
def rc_kmers(self, kmers):
    """Collapse k-mers with their reverse complements.

    Each k-mer is mapped to an earlier-seen representative if that
    representative is its reverse complement; otherwise the k-mer
    becomes its own representative.

    :param kmers: iterable of k-mer strings.
    :returns: ``(keys, res)`` where ``keys`` lists representative
        k-mers in first-seen order and ``res`` maps every input k-mer
        to its representative.
    """
    res = {}
    keys = []
    # Bug-adjacent fixes: the original computed the reverse complement
    # twice per k-mer and used O(n) list membership; use one call and a
    # set for O(1) lookups.  Return values are unchanged.
    seen = set()
    for kmer in kmers:
        rc = Seq.reverse_complement(kmer)
        if rc in seen:
            res[kmer] = rc
        else:
            keys.append(kmer)
            seen.add(kmer)
            res[kmer] = kmer
    return keys, res
def test_reverse_complement(self):
    """Cross-check module-level reverse_complement against Seq methods."""
    remaining = copy.copy(test_seqs)
    remaining.pop(21)
    for seq in remaining:
        # Keep the original evaluation order: .alphabet is read first.
        if isinstance(seq.alphabet, Alphabet.ProteinAlphabet):
            continue
        if not isinstance(seq, Seq.Seq):
            continue
        expected = Seq.reverse_complement(seq)
        self.assertEqual(repr(expected), repr(seq.reverse_complement()))
        self.assertEqual(repr(expected[::-1]), repr(seq.complement()))
        self.assertEqual(str(seq.complement()),
                         str(Seq.reverse_complement(seq))[::-1])
        self.assertEqual(str(seq.reverse_complement()),
                         str(Seq.reverse_complement(seq)))
def barcode_stats(outpfile, SAMPLE_BARCODE_DICT, delim, filetype, run_cutadapt, logfile, this_dir='./', trimmedbarcodestatus = False):
    """Count barcode/adapter/primer occurrences per sequence file.

    For every file in this_dir that matches a sample key and ends with
    `filetype`, greps each pattern (barcodes, adapters, primers and
    their reverse complements) at the start, end and anywhere in the
    reads, writing one delimited row per file to outpfile.  Afterwards
    either runs cutadapt or exits.
    """
    # you do not need to mention global to read values of global variables, only if you want to re/assign value
    outfile = open(outpfile, 'w')
    headers_list = ['file' , 'starts_fbarcode', 'ends_fbarcode', 'contains_fbarcode',
                    'starts_FADAPTER', 'ends_FADAPTER','contains_FADAPTER','starts_FWDPRIMER', 'ends_FWDPRIMER', 'contains_FWDPRIMER',
                    'starts_FAD_FP', 'ends_FAD_FP', 'contains_FAD_FP',
                    'starts_REV_COMPLEM_FAD_FP', 'ends_REV_COMPLEM_FAD_FP', 'contains_REV_COMPLEM_FAD_FP',
                    'starts_REVPRIMER', 'ends_REVPRIMER', 'contains_REVPRIMER',
                    'starts_RADAPTER', 'ends_RADAPTER', 'contains_RADAPTER','starts_RAD_RP', 'ends_RAD_RP', 'contains_RAD_RP',
                    'starts_REV_COMPLEM_RAD_RP', 'ends_REV_COMPLEM_RAD_RP', 'contains_REV_COMPLEM_RAD_RP',
                    'starts_rbarcode', 'ends_rbarcode' , 'contains_rbarcode',
                    'starts_REV_COMPLEM_fbarcode', 'ends_REV_COMPLEM_fbarcode' , 'contains_REV_COMPLEM_fbarcode',
                    'starts_REV_COMPLEM_rbarcode', 'ends_REV_COMPLEM_rbarcode' , 'contains_REV_COMPLEM_rbarcode',
                    'starts_REV_COMPLEM_FWDPRIMER', 'ends_REV_COMPLEM_FWDPRIMER', 'contains_REV_COMPLEM_FWDPRIMER',
                    'starts_REV_COMPLEM_REVPRIMER', 'ends_REV_COMPLEM_REVPRIMER', 'contains_REV_COMPLEM_REVPRIMER']
    outfile.write ("%s\n" % delim.join(headers_list))
    for f in sorted(os.listdir(this_dir)):
        new_f = this_dir + f
        if os.path.isfile(new_f) and new_f.endswith(filetype):
            for k in SAMPLE_BARCODE_DICT:  # for each element in dict
                trimk = k
                if (trimmedbarcodestatus):
                    trimk = 'trim_' + k
                # uncomment these next two lines if you directly want to run the barcode status on trimmed reads
                #else:
                #    continue
                if new_f.startswith(this_dir+trimk):
                    #if this_dir+trimk in new_f: # if the key is prefix in filename.
                    barcodes = SAMPLE_BARCODE_DICT[k].split("_")
                    #print 'ok', trimk, new_f
                    fbarcode = barcodes[0]
                    rbarcode = barcodes[1]
                    counts_list = [f]
                    rev_complem_fbarcode = BioSeq.reverse_complement(fbarcode)  # search at end in read 2
                    rev_complem_rbarcode = BioSeq.reverse_complement(rbarcode)  # search at end in read 1
                    for pattern in [fbarcode, FADAPTER, FWDPRIMER, FAD_FP, REV_COMPLEM_FAD_FP,
                                    REVPRIMER, RADAPTER, RAD_RP, REV_COMPLEM_RAD_RP, rbarcode,
                                    rev_complem_fbarcode, rev_complem_rbarcode, REV_COMPLEM_FWDPRIMER, REV_COMPLEM_REVPRIMER]:
                        # starts/ends arrive as a pair, then the contains count.
                        counts_list.extend(grep_string(pattern, new_f))  # separately checks starts with pattern, ends with pattern
                        counts_list.append(grep_string_contains(pattern, new_f))
                    str_counts_list = [str(i.strip()) for i in counts_list]
                    outfile.write ("%s\n" % delim.join(str_counts_list))
    outfile.close()
    if (run_cutadapt):
        print "Done with initial barcode stats\nNow running cutadapt\n"
        run_cutadapt_method(outpfile, logfile, this_dir, filetype, SAMPLE_BARCODE_DICT, delim)
    else:
        sys.exit('\nRunning cutadapt was not requested.\n')
def extract_input_counts(args):
    """Count (lag-kmer -> next letter) transitions per group over all inputs.

    Each sequence is wrapped as ``'['*lag + seq + ']'`` so sequence
    starts and the end marker are counted too.  With ``args.r`` the
    reverse complement of every sequence is counted as well.

    :param args: parsed arguments; uses ``args.l`` (max lag) and
        ``args.r`` (also count reverse complements).
    :returns: list (one entry per lag 1..args.l) of dicts mapping each
        lag-kmer to a per-group list of per-letter counts.
    """
    input_info = get_input_info(args)
    groups = [elem[1] for elem in input_info]
    n_groups = max(groups) + 1
    alphabet = summarize.alphabet
    kmer_counts = [
        defaultdict(lambda: [[0 for j in range(len(alphabet))]
                             for i in range(n_groups)])
        for li in range(args.l)
    ]
    print(input_info)  # NOTE(review): debug output — consider removing.

    def _count(seq, lag, counts, group):
        # Tally next-letter counts for every lag-length context in seq.
        full_seq = '[' * lag + seq + ']'
        for j in range(lag, len(full_seq)):
            lag_kmer = full_seq[(j - lag):j]
            counts[lag_kmer][group][alphabet[full_seq[j]]] += 1

    for li in range(args.l):
        lag = li + 1
        for indiv_file, group, file_type in input_info:
            # Fix: the original leaked the open() handle per file.
            with open(indiv_file, 'r') as handle:
                for elem in summarize.load_input(handle, file_type):
                    seq = elem[1]
                    _count(seq, lag, kmer_counts[li], group)
                    if args.r:
                        _count(Seq.reverse_complement(seq), lag,
                               kmer_counts[li], group)
    return kmer_counts
def parse_library(library_handle, threshold, mismatches=0):
    """Parse a library file into a nested per-marker dictionary.

    Each marker entry holds the two flanking sequences (the second one
    reverse-complemented), zeroed counters, per-flank mismatch
    thresholds and a compiled regular expression pattern.

    :arg stream library_handle: Open readable handle to a library file.
    :arg float threshold: Number of allowed mismatches per nucleotide.
    :arg int mismatches: If set, overrides the dynamic threshold
        calculation.

    :returns dict: Nested dictionary containing library data.
    """
    library = {}
    for fields in (line.strip().split('\t')
                   for line in library_handle.readlines()):
        pattern = '(?!x)x'  # Matches nothing, by construction.
        if len(fields) == 4:
            # Column 4 is triplets of (unit, min, max) repeats.
            pat = fields[3].split()
            parts = ['({}){{{},{}}}'.format(pat[x], pat[x + 1], pat[x + 2])
                     for x in range(0, len(pat), 3)]
            pattern = '^{}$'.format(''.join(parts))
        library[fields[0]] = {
            'flanks': [fields[1], Seq.reverse_complement(fields[2])],
            'counts': [0, 0, 0, 0],
            'pair_match': [0, 0],
            'thresholds': [
                mismatches or int(ceil(len(fields[1]) * threshold)),
                mismatches or int(ceil(len(fields[2]) * threshold))],
            'reg_exp': re_compile(pattern),
            'new': defaultdict(lambda: [0, 0]),
            'known': defaultdict(lambda: [0, 0])}
    return library
def __getGenomeRefSequence(self, chrom, start, length, strand):
    """Fetch a genomic reference sequence via twoBitToFa.

    :param chrom: chromosome name; a 'chr' prefix is added if missing.
    :param start: 1-based start position (converted to 0-based below).
    :param length: number of bases to fetch.
    :param strand: '+' returns the sequence as-is; anything else
        returns the reverse complement.
    """
    if type(chrom) != str:
        chrom = str(chrom)
    chrom = chrom.strip()
    if not chrom.startswith('chr'):
        chrom = 'chr' + chrom
    #if strand == '+':
    start = start - 1  # convert to 0-based coordinates for twoBitToFa
    end = start + length
    #else:
    #    end = start - 1
    #    start = end + length
    # Region spec understood by twoBitToFa: file:chrom:start-end
    db_file = "{0}:{1}:{2}-{3}".format(self.__profiles[self.__profile], chrom, start, end)
    with tempfile.NamedTemporaryFile() as tmp:
        #print TWOBITTOFA_BIN, db_file
        # NOTE(review): the subprocess return code is captured but never
        # checked — a failed extraction yields an empty sequence.
        ret = call([TWOBITTOFA_BIN, db_file, tmp.name])
        tmp.readline()  # read fasta header line
        seq = ''
        for line in tmp:
            seq += line.strip()
        if strand == '+':
            return seq
        else:
            from Bio import Seq
            return Seq.reverse_complement(seq)
def convert_from_subtype_to_hxb2(working_dir, position, orientation, subtype):
    """
    Convert a position in a subtype reference to the equivalent HXB2
    position, by walking the columns of a pairwise MAFFT alignment.

    (The original docstring described the opposite direction; the code
    below matches `position` against the subtype row and returns the
    HXB2 coordinate.)

    Args:
        working_dir: working folder in which to place temporary files
        position: subtype coordinate position to convert
        orientation: "reverse" to align the reverse complements instead
        subtype: subtype whose coordinate is being converted

    Returns:
        The HXB2 position, or None if `position` is never reached
        before the alignment ends.
    """
    sequences = [subtype_sequence(subtype), HXB2()]
    if orientation == "reverse":
        sequences = [SeqRecord.SeqRecord(Seq.reverse_complement(s.seq),
                                         id = s.id,
                                         name = s.name)
                     for s in sequences]
    alignment = wrappers.mafft(working_dir, sequences)
    hxb2_pos = 0
    subtype_pos = 0
    # alignment[0] is the subtype row, alignment[1] the HXB2 row; count
    # ungapped positions in each until the target position is reached.
    for i in range(len(alignment[0])):
        if subtype_pos == position:
            return hxb2_pos
        if alignment[0][i] != "-":
            subtype_pos += 1
        if alignment[1][i] != "-":
            hxb2_pos += 1
def main():
    """Print position and length of every reverse palindrome (length 4-12).

    A subsequence is a reverse palindrome when it equals its own reverse
    complement.  Positions are reported 1-based.
    """
    # SeqIO parses the FASTA input; Seq supplies reverse_complement.
    from Bio import SeqIO as SeqIO
    from Bio import Seq

    records = SeqIO.parse(open(f"{input('file name here: ')}.txt"), 'fasta')
    for record in records:
        dna = record.seq
        total = len(dna)
        # Candidate lengths 12, 10, ..., 4 (reverse palindromes have
        # even length).
        for size in range(12, 3, -2):
            for pos in range(total):
                # Skip windows that would run off the end of the sequence.
                if (pos + size) >= total + 1:
                    continue
                window = dna[pos:pos + size]
                if window == Seq.reverse_complement(window):
                    print(str(pos + 1) + f" {size}")
def create_primer_output(config: SSMConfig, mutagenic_primer, mutation,
                         degenerate_codon: str, new_sequence_start,
                         parameters_in_range: bool):
    """Build a PrimerOutput row for the results table.

    The actual degenerate outputs for the export are created on the
    frontend; the table can only show one primer at a time, so only the
    first degenerate codon is applied here.

    Raises:
        NotImplementedError: if the primer direction is neither
            FORWARD nor REVERSE.
    """
    first_degenerate_codon = degenerate_codon.split(",")[0]
    mutated = mutagenic_primer.primer.get_mutated_sequence(
        mutation.position, first_degenerate_codon)
    direction = mutagenic_primer.primer.direction
    primer = mutagenic_primer.primer
    if direction == Primer.FORWARD:
        sequence = mutated
    elif direction == Primer.REVERSE:
        # Reverse primers are reported 5'->3' on the opposite strand.
        sequence = Seq.reverse_complement(mutated)
    else:
        # Bug fix: `raise NotImplemented()` raised a TypeError because
        # NotImplemented is a constant, not an exception class.
        raise NotImplementedError(
            "Unknown primer direction: {!r}".format(direction))
    return PrimerOutput(
        direction=direction,
        sequence=sequence,
        normal_order_sequence=mutated,
        normal_order_start=primer.get_normal_start() - new_sequence_start,
        start=primer.start - new_sequence_start,
        length=primer.length,
        three_end_temperature=mutagenic_primer.three_end_temperature,
        gc_content=mutagenic_primer.primer.get_gc_content(),
        parameters_in_range=parameters_in_range)
def get_complement_elems(self):
    """Yield the bases of the REVERSE complement of self.seq.

    Note: despite the name, this iterates over
    ``Seq.reverse_complement(self.seq)`` — the complement read
    back-to-front — not the plain complement strand.

    :return: iterator of single-base elements.
    """
    for base in Seq.reverse_complement(self.seq):
        yield base
def test_reverse_complement(self):
    """reverse_complement/complement agree between function and methods."""
    candidates = copy.copy(test_seqs)
    candidates.pop(21)
    for candidate in candidates:
        # .alphabet is read before the Seq.Seq check, as in the original.
        is_protein = isinstance(candidate.alphabet, Alphabet.ProteinAlphabet)
        if is_protein or not isinstance(candidate, Seq.Seq):
            continue
        rc = Seq.reverse_complement(candidate)
        self.assertEqual(repr(rc), repr(candidate.reverse_complement()))
        self.assertEqual(repr(rc[::-1]), repr(candidate.complement()))
        self.assertEqual(str(candidate.complement()),
                         str(Seq.reverse_complement(candidate))[::-1])
        self.assertEqual(str(candidate.reverse_complement()),
                         str(Seq.reverse_complement(candidate)))
def reverse_complement(sequence):
    """
    Reverse complement of a sequence represented as unicode string.

    BioPython's reverse_complement does not accept unicode strings, and
    this codebase works almost exclusively with unicode — so convert to
    a byte string, complement, and convert back.
    """
    narrow = str(sequence)
    return unicode(Seq.reverse_complement(narrow))
def reverse_complement(seq):
    """Return the reverse complement of a DNA string.

    Given: A DNA string s of length at most 1000 bp.
    Return: The reverse complement sc of s.

    Because of the underlying complement map, characters such as
    newlines are illegal: the input must be a pure sequence with no
    whitespace.
    """
    return Seq.reverse_complement(seq)
def rev_complement(cls, string):
    """
    Quick method to perform the reverse complement of a given string.

    NOTE(review): the original docstring mentioned "the class
    translation table", but this simply delegates to
    Bio.Seq.reverse_complement — no class-level table is consulted.

    :param string: the sequence to be rev-complemented
    :type string: str
    """
    return Seq.reverse_complement(string)
def translate(seq):
    """Translate seq in all six reading frames.

    Returns a dict mapping frame labels ('First Frame', ...,
    'Complement Third Frame') to the translated strings; the
    'Complement' frames are read from the reverse complement.
    """
    frames = {}
    for prefix, strand in (('', seq),
                           ('Complement ', Seq.reverse_complement(seq))):
        frames[prefix + 'First Frame'] = Seq.translate(strand)
        frames[prefix + 'Second Frame'] = Seq.translate(strand[1:])
        frames[prefix + 'Third Frame'] = Seq.translate(strand[2:])
    return frames
def oligos_1():
    # Build and print the forward/reverse oligo pair for construct 1:
    # overhang + loxP + barcode + Kozak + ATG + lox71 (loxp and lox71
    # are module-level sequences defined elsewhere in the file).
    overhangs = ['CATG', 'ACAA']  # ligation overhangs for each end
    bc = 'GATGATTGA'              # barcode
    kozak = 'gccacc'              # Kozak consensus
    start = 'atg'                 # start codon
    fwd = overhangs[0] + loxp + bc + kozak + start + lox71
    # Reverse oligo is the reverse complement of the insert plus the
    # other overhang.
    rev = seq.reverse_complement(loxp+bc+kozak+start+lox71+overhangs[1])
    print fwd.upper()
    print rev.upper()
def make_fasta(gene_dict, args, gene_fa_out):
    """Write flanked gene sequences extracted from args.fasta to gene_fa_out."""
    with open(gene_fa_out, "w") as handle:
        for record in SeqIO.parse(args.fasta, "fasta"):
            for gene, start, end, strand in gene_dict[record.id]:
                lo = start - (args.flank + 1)
                hi = end + args.flank
                fragment = str(record.seq)[lo:hi]
                if strand == "-":
                    # Minus-strand genes are emitted reverse-complemented.
                    fragment = Seq.reverse_complement(fragment)
                handle.write(
                    f">{gene} | location={record.id}:{start}-{end}"
                    f" | strand={strand}\n{fragment}\n")
def test_reverse_complement(self):
    """reverse_complement agrees between module function and Seq methods."""
    remaining = copy.copy(test_seqs)
    remaining.pop(13)
    for nt in (s for s in remaining if isinstance(s, Seq.Seq)):
        rc = Seq.reverse_complement(nt)
        self.assertEqual(repr(rc), repr(nt.reverse_complement()))
        self.assertEqual(repr(rc[::-1]), repr(nt.complement()))
        self.assertEqual(
            str(nt.complement()),
            str(Seq.reverse_complement(nt))[::-1],
        )
        self.assertEqual(
            str(nt.reverse_complement()),
            str(Seq.reverse_complement(nt)),
        )
def delGene(geneName, cutsite):
    """Design oligos to delete a chromosomal region with CRISPR/Cas9.

    Looks up the locus and deletion region for `geneName`, warns if the
    guide 20-mer is absent from both strands of the deletion region,
    and prints oligos for cloning a pL308 Cas9-gRNA vector plus the
    donor-DNA primers.  Primers Lup+Rdown produce a 1kb band if the
    deletion was successful.  Part of yCRISPRv3 by [email protected].

    :param geneName: key into genomicData (locus, deletion region).
    :param cutsite: 20-mer CRISPR cut sequence (upper case).
    :returns: (Ldown, Rup) donor oligos.
    """
    locus = genomicData[geneName][0]
    deletion = genomicData[geneName][1]
    deletion = Seq(deletion)
    # Warn when the guide is found on neither strand of the deletion.
    if deletion.find(cutsite) == -1:
        if deletion.reverse_complement().find(cutsite) == -1:
            print("WARNING: Guide 20-mer sequence not found in deletion region.")
    locus = Seq(locus)
    index = locus.find(deletion)  # start of the deletion within locus
    # Remove the deletion region to form the post-deletion locus; index
    # then points at the first nt after the deletion in newlocus.
    newlocus = locus[0:index] + locus[index + len(deletion):]
    Lup = newlocus[index - 500:index - 470]
    Rdown = newlocus[index + 469:index + 499].reverse_complement()
    Rtemp1 = newlocus[:index].reverse_complement()
    Rtemp2 = newlocus[index:].reverse_complement()
    rPrimer, rLength = getPrimer(Rtemp1)
    lPrimer, lLength = getPrimer(newlocus[index:])
    Rup = getOverhang(Rtemp2, rLength) + rPrimer
    Ldown = getOverhang(newlocus[:index], lLength) + lPrimer
    cutSequence = Seq("cgggtggcgaatgggacttt") + cutsite + Seq("gttttagagctagaaatagc")
    seqprimer = Seq("gacttt") + cutsite
    # Bug fix: the prints below referenced undefined `GeneName`
    # (NameError on every call); the parameter is `geneName`.
    print("cut" + geneName + " " + cutSequence)
    print("seq" + geneName + " " + seqprimer)
    print("Lup" + geneName + "del" + " " + Lup)
    print("Rup" + geneName + "del" + " " + Rup)
    print("Ldown" + geneName + "del" + " " + Ldown)
    print("Rdown" + geneName + "del" + " " + Rdown)
    return Ldown, Rup
def get_sgrna(self):
    # Return a DataFrame of all candidate sgRNA sites in the genome,
    # caching the result on self.sgrna.  Forward-strand sites match a
    # 20-mer + NGG PAM; reverse-strand sites are located via the CCN
    # mirror pattern and reverse-complemented back into protospacer
    # orientation, with coordinates kept in forward-strand space.
    if not hasattr(self, 'sgrna'):
        # 20 nt protospacer followed by an NGG PAM (case-insensitive).
        ngg = re.compile(
            '([atgcATGC]{20})([atgcATGC](GG|gg|Gg|gG))'
        )
        # CCN followed by 20 nt: the reverse-strand equivalent of NGG.
        ccn = re.compile(
            '((CC|cc|Cc|cC)[atgcATGC])([atgcATGC]{20})'
        )
        columns = ['seqname', 'start', 'cut', 'end', 'sgrna', 'pam']
        sgrna = list()
        for chromosome in self.genome:
            # Forward-strand hits: cut site is 3 bp upstream of the PAM.
            sglist = [
                {
                    'seqname': chromosome.id,
                    'start': x.start(),
                    'cut': x.end() - 6,
                    'end': x.end() - 3,
                    'sgrna': x.group(1),
                    'pam': x.group(2)
                }
                for x in ngg.finditer(str(chromosome.seq))
            ]
            # Reverse-strand hits: sequence and PAM reverse-complemented.
            sglist.extend(
                {
                    'seqname': chromosome.id,
                    'start': x.start() + 3,
                    'cut': x.start() + 6,
                    'end': x.end(),
                    'sgrna': Seq.reverse_complement(x.group(3)),
                    'pam': Seq.reverse_complement(x.group(1))
                }
                for x in ccn.finditer(str(chromosome.seq))
            )
            sgrna.append(
                pd.DataFrame(
                    sglist, columns = columns
                )
            )
        self.sgrna = pd.concat(sgrna, axis = 0, ignore_index = True)
    return self.sgrna
def oligos_1():
    # Print the forward/reverse oligo pair for construct 1:
    # overhang + loxP + barcode + Kozak + ATG + lox71 (loxp and lox71
    # are sequences defined at module level).
    overhangs = ['CATG', 'ACAA']  # ligation overhangs for the two ends
    bc = 'GATGATTGA'              # barcode
    kozak = 'gccacc'              # Kozak consensus
    start = 'atg'                 # start codon
    fwd = overhangs[0] + loxp + bc + kozak + start + lox71
    # Reverse oligo: reverse complement of insert + trailing overhang.
    rev = seq.reverse_complement(loxp + bc + kozak + start + lox71 + overhangs[1])
    print fwd.upper()
    print rev.upper()
def find_orf_all(seq):
    """Given a string representing a coding sequence, find all ORFs for
    all frames on the + and - strand."""
    collected = []
    revc = Seq.reverse_complement(seq)
    # Three reading frames on each strand, forward strand first.
    for strand in (seq, revc):
        for frame in range(3):
            collected.extend(find_orf(strand[frame:]))
    return list(set(collected))
def make_fasta(gene_dict, args):
    """Write flanked gene sequences extracted from args.fasta to args.out_file."""
    with open(args.out_file, "w") as handle:
        for record in SeqIO.parse(args.fasta, "fasta"):
            for gene, start, end, strand in gene_dict[record.id]:
                lo = start - args.f - 1
                hi = end + args.f
                fragment = str(record.seq)[lo:hi]
                if strand == "-":
                    # Minus-strand genes are emitted reverse-complemented.
                    fragment = Seq.reverse_complement(fragment)
                handle.write(
                    f">{gene} | location={record.id}:{start-args.f-1}-{end+args.f}"
                    f" including {args.f}n flank | strand={strand}\n{fragment}\n"
                )
def counter(kmers):
    """Look up kp1-mer counts for each k-mer, one column per alphabet
    letter (plus one unused spare column), returning an array shaped
    like the input with an extra trailing axis."""
    final_shape = np.r_[np.shape(kmers), [alphabet_size + 1]]
    counts = np.zeros([np.size(kmers), alphabet_size + 1])
    for row, kmer in enumerate(kmers.flatten()):
        for col, letter in enumerate(alphabet):
            kp1 = kmer + letter
            # Count of the extended (k+1)-mer.
            total = get_kmc_count(kp1, file, kmer_token, c)
            if reverse:
                # Assemblies only record one strand, so fold in the
                # reverse-complement count as well.
                total += get_kmc_count(Seq.reverse_complement(kp1),
                                       file, kmer_token, c)
            counts[row, col] = total
    return counts.reshape(final_shape)
def orf_reader(infile):
    """Parse a tab-separated ORF table into a dictionary.

    Lines starting with '#' are skipped.  Each remaining line maps
    column 0 (ORF id) to a two-element list: column 4's sequence,
    reverse-complemented when column 1 is negative, followed by the
    original column-4 value (which keeps its trailing newline, as in
    the raw line).

    :param infile: path to the tab-separated input file.
    :returns: dict of id -> [oriented_seq, original_seq].
    """
    orfs = {}
    # Bug fix: the original never closed the file handle; a context
    # manager guarantees it is released.
    with open(infile, "r") as handle:
        for line in handle:
            if line[0] == "#":
                continue
            fields = line.split("\t")
            if int(fields[1]) < 0:
                orfs[fields[0]] = [Seq.reverse_complement(fields[4]), fields[4]]
            else:
                orfs[fields[0]] = [fields[4], fields[4]]
    return orfs
def counter(kmers):
    # Look up kp1-mer counts for each (possibly left-padded) k-mer.
    # Columns 0..alphabet_size-1 hold counts for each k+letter
    # extension; the final column holds the count of the bare
    # full-length context k-mer.  '[' marks left padding and is
    # stripped before lookup.
    # NOTE(review): assumes `files`/`files_suf` are keyed by k-mer
    # length — confirm against the enclosing scope.
    final_shape = np.r_[np.shape(kmers), [alphabet_size+1]]
    counts = np.zeros([np.size(kmers), alphabet_size+1])
    for i, k in enumerate(kmers.flatten()):
        k = k.replace('[', '')  # drop left-padding markers
        for j, b in enumerate(alphabet):
            # Get kp1mer count
            counts[i, j] = get_kmc_count(k + b, files[len(k)], kmer_token, c)
            # Get reverse count (assemblies only look at one strand).
            if reverse:
                if len(k) == lag:
                    counts[i, j] += get_kmc_count(Seq.reverse_complement(k + b), files[len(k)], kmer_token, c)
                if len(k) < lag:
                    # shorter (padded) contexts consult the suffix databases
                    counts[i, j] += get_kmc_count(Seq.reverse_complement(k + b), files_suf[len(k)], kmer_token, c)
        if len(k) == lag:
            # Final column: count of the context k-mer itself.
            counts[i, -1] = get_kmc_count(k, files_suf[len(k)-1], kmer_token, c)
            if reverse:
                counts[i, -1] += get_kmc_count(Seq.reverse_complement(k), files[len(k)-1], kmer_token, c)
    return counts.reshape(final_shape)
def main() -> None:
    """ Make a jazz noise here """
    # Collect every distinct ORF-derived protein from the first FASTA
    # record, scanning three frames on both strands.
    args = get_args()
    if seqs := [str(rec.seq) for rec in SeqIO.parse(args.file, 'fasta')]:
        rna = seqs[0].replace('T', 'U')  # transcribe DNA -> RNA
        orfs = set()
        # Forward strand, then its reverse complement.
        for seq in [rna, Seq.reverse_complement(rna)]:
            for i in range(3):  # the three reading frames
                # Truncate so the translated span is a codon multiple.
                if prot := Seq.translate(truncate(seq[i:], 3), to_stop=False):
                    for orf in find_orfs(prot):
                        orfs.add(orf)
        # NOTE(review): `orfs` is collected but nothing visible here
        # emits it — output may occur beyond this excerpt; confirm.
def translateSeq(cds):
    # Translate a CDS, falling back to the reverse complement when the
    # forward strand is not a valid coding sequence; tracks which
    # orientation succeeded in senseOrAnti ('sense' or 'anti').
    senseOrAnti = 'sense'
    finalCDS = cds
    try:
        translated = Seq.translate(cds,cds=True)
        # finalCDS = cds
    except TranslationError,e:
        try:
            # Forward strand failed: try the reverse complement.
            reverseCDS = Seq.reverse_complement(cds)
            translated = Seq.translate(reverseCDS,cds=True)
            finalCDS = reverseCDS
            senseOrAnti = 'anti'
        except TranslationError,e:
            # Both orientations failed.
            print 'Translation failed in %s'%cds
    # NOTE(review): no return statement is visible in this excerpt; the
    # function likely continues beyond it — confirm.
def calculate(sequence, cut_length, conditions, cg_clamp, self_comp) -> pd.DataFrame:
    """Build and filter a probe-candidate table for `sequence`.

    Every `cut_length` fragment is annotated with its reverse
    complement, Breslauer and SantaLucia melting temperatures, GC
    content and a marked-position string.  Fragments failing the
    optional GC-clamp or self-complementarity checks are dropped up
    front.  The table is then narrowed by `conditions` (and optionally
    by homology) and returned sorted by Breslauer Tm, descending.
    """
    sequence = remove_unnecessary(sequence).upper()
    frag_list, position_list = get_fragments(seq=sequence, cut_length=cut_length)
    used_frag_list = []
    rev_comp_list = []
    tm_list_breslauer = []
    tm_list_santalucia = []
    cg_list = []
    marked_list = []
    for frag, pos in zip(frag_list, position_list):
        if cg_clamp:
            # Skip fragments whose 3' end fails the GC-clamp rule.
            if check_gc_clamp(frag, last=5):
                continue
        if self_comp:
            # Skip self-complementary fragments (hairpin/dimer risk).
            if check_selfcomp(frag, threshold=4):
                continue
        used_frag_list.append(frag)
        rev_comp_list.append(str(Seq.reverse_complement(frag)))
        nn = NearestNeighbor(frag)
        melting_temp_breslauer = nn.breslauer()
        melting_temp_santalucia = nn.santalucia()
        cg_content = round(GC(frag), 1)
        marked_result = view_position(full_seq=sequence, pos_list=pos)
        tm_list_breslauer.append(melting_temp_breslauer)
        tm_list_santalucia.append(melting_temp_santalucia)
        cg_list.append(cg_content)
        marked_list.append(marked_result)
    df = pd.DataFrame({
        'fragment': used_frag_list,
        'rev_comp': rev_comp_list,
        'breslauer': tm_list_breslauer,
        'santalucia': tm_list_santalucia,
        'cg_content': cg_list,
        'position': marked_list
    })
    # narrow down except homology
    filtered_df, homology_condition = narrow_down(df, conditions=conditions)
    homology_list = get_homology_count(filtered_df['fragment'])
    filtered_df['homology'] = homology_list
    # narrowed down by homology
    if homology_condition:
        filtered_df, _ = narrow_down(filtered_df, homology_condition, pending=False)
    # sorted by breslauer
    filtered_df_s = filtered_df.sort_values('breslauer', ascending=False)
    return filtered_df_s
def __call__(self):
    # Stream every input sequence into per-lag suffix files, prefix
    # file(s), and (when required) a full-sequence file.  With
    # self.reverse set, the reverse complement of each sequence is
    # emitted as an extra record named '<name>_rev'.
    # Initialize output files.
    file_out = {
        'suf': [
            open(self.file_out_names['suf'][li], 'w')
            for li in range(self.lag)
        ]
    }
    if self.pr:
        # One prefix file per lag value.
        file_out['pre'] = [
            open(self.file_out_names['pre'][li], 'w')
            for li in range(self.lag)
        ]
    else:
        file_out['pre'] = open(self.file_out_names['pre'], 'w')
    if self.file_type != 'fq' or self.reverse:
        file_out['full'] = open(self.file_out_names['full'], 'w')
    # Open file.
    in_file = open(self.file, 'r')
    # Iterate through sequences in files.
    for j, elem in enumerate(load_input(in_file, self.file_type)):
        not_init = j > 0  # False only for the very first record
        name, seq = elem[:2]
        self.__write_out(file_out, not_init, name, seq)
        if self.reverse:
            # Emit the reverse complement as an additional record.
            not_init = True
            seq = Seq.reverse_complement(seq)
            name = name + '_rev'
            self.__write_out(file_out, not_init, name, seq)
    # Close files.
    if self.pr:
        for li in range(self.lag):
            file_out['pre'][li].close()
    else:
        file_out['pre'].close()
    if self.file_type != 'fq' or self.reverse:
        file_out['full'].close()
    for li in range(self.lag):
        file_out['suf'][li].close()
def dna_aa():
    # web2py controller: present the sequence-manipulation form and
    # dispatch the chosen action, storing results in the session before
    # redirecting to the output page.  Requires a logged-in user.
    if session.username == None:
        redirect(URL(r=request, c='account', f='log_in'))
    form = FORM(
        TABLE(
            TR(
                'Sequence (raw format): ',
                TEXTAREA(_type='text', _name='sequence',
                         requires=IS_NOT_EMPTY())),
            #TR("Sequence Type: ",
            #   SELECT("Raw Format", "FASTA",
            #          _name="seq_type")),
            TR(
                'Action: ',
                SELECT('Complementation', 'Transcribe', 'Translate',
                       'Back Transcribe', 'Back Translate',
                       _name='action'),
                INPUT(_type='submit', _value='SUBMIT'))))
    if form.accepts(request.vars, session):
        #if form.vars.seq_type == "FASTA":
        #    session['sequence'] = \
        #        seqClean(fasta_to_raw(form.vars.sequence.upper()))
        #else:
        session['sequence'] = seqClean(form.vars.sequence.upper())
        if form.vars.action == "Complementation":
            session['action'] = "Complementation"
            # NOTE(review): stores the REVERSE complement under
            # 'Complement' — confirm that is the intended output.
            session['Complement'] = Seq.reverse_complement(session['sequence'])
        if form.vars.action == "Transcribe":
            session['action'] = 'Transcribe'
            session['Transcribed RNA'] = Seq.transcribe(session['sequence'])
        if form.vars.action == "Back Transcribe":
            session['action'] = 'Back Transcribe'
            session['DNA'] = Seq.back_transcribe(session['sequence'])
        if form.vars.action == "Translate":
            session['action'] = 'Translate'
            session.update(translate(session['sequence']))
        if form.vars.action == "Back Translate":
            session['action'] = 'Back Translate'
            session.update(back_translate(session['sequence']))
        redirect(URL(r=request, f='dna_aa_output'))
    return dict(form=form)
def get_three_end(self, gene_of_interest, reverse_primer):
    """Return the plasmid region between the end of the gene of
    interest and the end of the reverse primer's binding site, wrapping
    around the (circular) plasmid when the primer ends before the gene."""
    rc_primer = Seq.reverse_complement(reverse_primer)
    self.check_sequence_occurrences(rc_primer, "Reverse primer")
    if self.gene_end_in_plasmid is not None:
        gene_end = self.gene_end_in_plasmid
    else:
        self.check_sequence_occurrences(gene_of_interest, "Gene of interest")
        gene_end = (self.plasmid_sequence.find(gene_of_interest)
                    + len(gene_of_interest))
    primer_end = (self.plasmid_sequence.find(rc_primer)
                  + len(reverse_primer))
    if primer_end > gene_end:
        return self.plasmid_sequence[gene_end:primer_end]
    # Primer ends "before" the gene: the region wraps past the origin.
    return (self.plasmid_sequence[gene_end:]
            + self.plasmid_sequence[:primer_end])
def bam_to_fastq(bam_file, is_paired):
    """Convert a BAM file to fastq files.

    Reads mapped to the reverse strand are reverse-complemented (and
    their qualities reversed) so output reads are in their original
    orientation.

    :param bam_file: path to the input BAM file.
    :param is_paired: whether to split reads into paired fastq files.
    :returns: list of output fastq file names.
    """
    out_files, out_handles = _get_fastq_handles(bam_file, is_paired)
    if len(out_handles) > 0:
        in_bam = pysam.Samfile(bam_file, mode='rb')
        try:
            for read in in_bam:
                # read 1 unless this is the second read of a pair
                num = 1 if (not read.is_paired or read.is_read1) else 2
                if read.is_reverse:
                    # restore original orientation for reverse-mapped reads
                    seq = str(Seq.reverse_complement(Seq.Seq(read.seq)))
                    qual = "".join(reversed(read.qual))
                else:
                    seq = read.seq
                    qual = read.qual
                out_handles[num].write("@%s\n%s\n+\n%s\n" %
                                       (read.qname, seq, qual))
        finally:
            # Fix: the BAM handle was never closed, and handle cleanup
            # used a side-effect comprehension; close everything even
            # if iteration fails.
            in_bam.close()
            for handle in out_handles.values():
                handle.close()
    return out_files
def dna_aa():
    # web2py controller for the DNA/amino-acid tool: renders a form,
    # applies the selected action to the cleaned sequence, stores the
    # result in the session and redirects to the output page.
    if session.username == None:
        redirect(URL(r=request, c='account', f='log_in'))
    form = FORM(TABLE(TR('Sequence (raw format): ',
                         TEXTAREA(_type='text', _name='sequence',
                                  requires=IS_NOT_EMPTY())),
                      #TR("Sequence Type: ",
                      #   SELECT("Raw Format", "FASTA",
                      #          _name="seq_type")),
                      TR('Action: ',
                         SELECT('Complementation', 'Transcribe',
                                'Translate', 'Back Transcribe',
                                'Back Translate', _name='action'),
                         INPUT(_type='submit', _value='SUBMIT'))))
    if form.accepts(request.vars,session):
        #if form.vars.seq_type == "FASTA":
        #    session['sequence'] = \
        #        seqClean(fasta_to_raw(form.vars.sequence.upper()))
        #else:
        session['sequence'] = seqClean(form.vars.sequence.upper())
        if form.vars.action == "Complementation":
            session['action'] = "Complementation"
            # NOTE(review): stores the REVERSE complement under
            # 'Complement' — confirm that is the intended output.
            session['Complement'] = Seq.reverse_complement(session['sequence'])
        if form.vars.action == "Transcribe":
            session['action'] = 'Transcribe'
            session['Transcribed RNA'] = Seq.transcribe(session['sequence'])
        if form.vars.action == "Back Transcribe":
            session['action'] = 'Back Transcribe'
            session['DNA'] = Seq.back_transcribe(session['sequence'])
        if form.vars.action == "Translate":
            session['action'] = 'Translate'
            session.update(translate(session['sequence']))
        if form.vars.action == "Back Translate":
            session['action'] = 'Back Translate'
            session.update(back_translate(session['sequence']))
        redirect(URL(r=request, f='dna_aa_output'))
    return dict(form=form)
compl_values = complement(values).replace("T", "U") # need to help as no alphabet print "%s={%s} --> {%s}=%s" % (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char]) assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]]) print print "Reverse complements:" for sequence in [ Seq.Seq("".join(sorted(ambiguous_rna_values))), Seq.Seq("".join(sorted(ambiguous_dna_values))), Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna), Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna), Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X", ""), IUPAC.IUPACAmbiguousRNA()), Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X", ""), IUPAC.IUPACAmbiguousDNA()), Seq.Seq("AWGAARCKG"), ]: # Note no U or T print "%s -> %s" % (repr(sequence), repr(Seq.reverse_complement(sequence))) assert str(sequence) == str( Seq.reverse_complement(Seq.reverse_complement(sequence)) ), "Dobule reverse complement didn't preserve the sequence!" print ########################################################################### test_seqs = [ s, t, u, Seq.Seq("ATGAAACTG"), "ATGAAACtg", # TODO - Fix ambiguous translation # Seq.Seq("ATGAARCTG"),
def test_reverse_complement_of_rna(self):
    """reverse_complement of an RNA string yields RNA (U, not T)."""
    rna = "AUGAAACUG"
    expected = "CAGUUUCAU"
    self.assertEqual(expected, Seq.reverse_complement(rna))
def complement(sequence):
    """Return the (non-reversed) complement of `sequence`.

    Implemented as reverse_complement followed by reversal, since
    Bio.Seq lacks a module-level complement function (the Seq and
    MutableSeq objects do have a complement method).
    """
    #TODO - Add a complement function to Bio/Seq.py?
    #There is already a complement method on the Seq and MutableSeq objects.
    return Seq.reverse_complement(sequence)[::-1]
compl_values = complement(values).replace("T","U") #need to help as no alphabet print "%s={%s} --> {%s}=%s" % \ (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char]) assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]]) print print "Reverse complements:" for sequence in [Seq.Seq("".join(sorted(ambiguous_rna_values))), Seq.Seq("".join(sorted(ambiguous_dna_values))), Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna), Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna), Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X",""), IUPAC.IUPACAmbiguousRNA()), Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X",""), IUPAC.IUPACAmbiguousDNA()), Seq.Seq("AWGAARCKG")]: # Note no U or T print "%s -> %s" \ % (repr(sequence), repr(Seq.reverse_complement(sequence))) assert sequence.tostring() \ == Seq.reverse_complement(Seq.reverse_complement(sequence)).tostring(), \ "Dobule reverse complement didn't preserve the sequence!" print ########################################################################### test_seqs = [s,t,u, Seq.Seq("ATGAAACTG"), "ATGAAACtg", #TODO - Fix ambiguous translation #Seq.Seq("ATGAARCTG"), #Seq.Seq("AWGAARCKG"), # Note no U or T #Seq.Seq("".join(ambiguous_rna_values)), #Seq.Seq("".join(ambiguous_dna_values)),
def twin(km):
    '''Return the reverse complement ("twin") of the k-mer km.

    (Original docstring, in Spanish, said "returns the inverted
    sequence"; the call below actually produces the reverse
    complement.)
    '''
    return Seq.reverse_complement(km)
compl_values = complement(values).replace("T", "U") # need to help as no alphabet print("%s={%s} --> {%s}=%s" % (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char])) assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]]) print("") print("Reverse complements:") for sequence in [Seq.Seq("".join(sorted(ambiguous_rna_values))), Seq.Seq("".join(sorted(ambiguous_dna_values))), Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna), Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna), Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X", ""), IUPAC.IUPACAmbiguousRNA()), Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X", ""), IUPAC.IUPACAmbiguousDNA()), Seq.Seq("AWGAARCKG")]: # Note no U or T print("%s -> %s" % (repr(sequence), repr(Seq.reverse_complement(sequence)))) assert str(sequence) \ == str(Seq.reverse_complement(Seq.reverse_complement(sequence))), \ "Dobule reverse complement didn't preserve the sequence!" print("") ########################################################################### test_seqs = [s, t, u, Seq.Seq("ATGAAACTG"), "ATGAAACtg", # TODO - Fix ambiguous translation # Seq.Seq("ATGAARCTG"), # Seq.Seq("AWGAARCKG"), # Note no U or T # Seq.Seq("".join(ambiguous_rna_values)), # Seq.Seq("".join(ambiguous_dna_values)),
def twin(km):
    """Return the reverse complement of the k-mer ``km``."""
    return Seq.reverse_complement(km)
# Rosalind "ORF" problem script fragment: reads a raw DNA string from
# rosalind_orf.txt, strips newlines, builds the reverse complement via
# Biopython's Seq, then scans U-converted frames for AUG start codons.
# NOTE(review): this is Python 2 code (`xrange`) and is truncated — the body
# of ReadingFrameFinder breaks off right after `storage = item`, so the stop
# codon scan (UAA/UAG/UGA per the header comment) is not visible here.
#ORF: AUG #CLOSE : UAA, UAG, UGA #>Rosalind_99 #MLLGSFRLIPKETLIQVAGSSPCNLS #M #MGMTPRLGLESLLE #MTPRLGLESLLE with open("rosalind_orf.txt", 'r') as Z: DNA1 = Z.read() DNA = DNA1.replace("\n", '') ReverseDNA = Seq(DNA, generic_dna) FinalReverse = str(ReverseDNA.reverse_complement()) FinalizedProt = [] def ReadingFrameFinder(DNASTRING): CleanDNA = DNASTRING.rstrip("\n") OpenLocations = [] CloseLocations = [] stringlen = len(CleanDNA) TtoU = CleanDNA.replace("T", 'U') readingframeRange = xrange(0, stringlen) PossibleGenes = [] for item in readingframeRange: if TtoU[item:item+3] == "AUG": Newthing = xrange(item, stringlen, 3) storage = item
# Fragment of a FASTA sub-sequence extractor: for each parsed record it looks
# up tab-separated coordinate rows keyed by the record id, slices
# accession.seq[start:stop], reverse-complements the slice when column 3
# contains '-' (minus strand), and prints a FASTA entry headed by column 4.
# NOTE(review): this span begins mid-scope (the append to coordinates_dict
# belongs to an enclosing loop not visible here) and mixes Python 2 `print`
# statements with parenthesised ones — confirm the target interpreter.
coordinates_dict[header].append("\t".join(column_list)) #----------------------------------------------------- # Step 3 # Iterate through the fasta accessions, extracting # sequence data #----------------------------------------------------- seq_records = list(SeqIO.parse(conf.fasta_file, "fasta")) # print(len(seq_records)) for accession in seq_records: header = accession.id # print header for coordinates in coordinates_dict[header]: # print ("hello") # print coordinates coordinate_list = coordinates.split("\t") # print coordinate_list start = int(coordinate_list[1]) stop = int(coordinate_list[2]) extracted_seq = accession.seq[start:stop] if '-' in coordinate_list[3]: # print ("reversed") extracted_seq = Seq.reverse_complement(extracted_seq) print (">" + coordinate_list[4]) print extracted_seq # print accession.id # print accession.seq[5:10]
# Unique n-gram search over RefSeq RNA (Python 2: `print` statements,
# `xrange`, `iteritems`). Pipeline, as the code shows:
#   1. parse CLI options (gene2refseq mapping, RefSeq FASTA, three n-gram
#      output files, n-gram size, coverage report);
#   2. build gene<->refseq maps from the tab-separated gene2refseq file,
#      skipping rows whose column 3 is '-';
#   3. index every n-gram of every sequence AND of its reverse complement
#      (via Seq.reverse_complement) to the (refseq, gene) pairs containing it;
#   4. classify each n-gram: unique to one gene and present in ALL of that
#      gene's isoforms -> unique output; unique to one gene but only some
#      isoforms -> fragment-specific output; otherwise -> not-unique output;
#   5. paint per-base coverage strings ('X'/'*' for gene-unique hits,
#      '_'/',' for isoform-specific hits) and write a per-refseq coverage
#      report with start counts and a percent = starts/(len-ngram+1).
# NOTE(review): FASTA ids are split on '|' taking field 3 (old NCBI
# "gi|...|ref|ACC|" headers) — this breaks on modern plain-accession headers;
# the stray `print gene2refseq` / `print refseqset, ...` lines look like
# leftover debugging. Code kept byte-identical below.
def _main(): parser = argparse.ArgumentParser(description="Search Unique Sequence") parser.add_argument('--gene2refseq', type=commonlib.FileType('r'), help="default: %(default)s", nargs='?', default=os.path.join(BASEDIR, '../gene2refseq_test.txt.bz2')) parser.add_argument('--refseq_rna', type=commonlib.FileType('r'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../refseq.rna.bz2')) parser.add_argument('--unique-ngram', type=commonlib.FileType('w'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../unique-10gram.txt.bz2')) parser.add_argument('--not-unique-ngram', type=commonlib.FileType('w'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../unique-10gram-not.txt.bz2')) parser.add_argument('--fragment-specific-unique-ngram', type=commonlib.FileType('w'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../unique-10gram-fragment-specific.txt.bz2')) parser.add_argument('--ngram', type=int, default=10) parser.add_argument('--coverage-output', type=commonlib.FileType('w'), help="default: %(default)s", default=os.path.join(BASEDIR, '../coverage.txt')) options = parser.parse_args() # load gene2refseq gene2refseq = collections.defaultdict(set) refseq2gene = dict() for row in csv.reader(options.gene2refseq, delimiter='\t', quotechar=None): if row[3] == '-': continue gene2refseq[row[1]].add(row[3]) refseq2gene[row[3]] = row[1] print gene2refseq refseq2sequence = dict() ngram2refseq = collections.defaultdict(set) available_gene2refseq = collections.defaultdict(list) # load sequences and calculate uniqueness for record in SeqIO.parse(options.refseq_rna, 'fasta'): refseq_id = record.id.split('|')[3] refseq2sequence[refseq_id] = {'id': refseq_id, 'seq': str(record.seq), 'coverage': list(['.']*len(record.seq)), 'gene': 0, 'fragment': 0} available_gene2refseq[refseq2gene[refseq_id]].append(refseq_id) for i in xrange(len(record.seq)-options.ngram+1): fragment = 
record.seq[i:i+options.ngram] ngram2refseq[str(fragment)].add((refseq_id, refseq2gene[refseq_id])) ngram2refseq[str(Seq.reverse_complement(fragment))].add((refseq_id, refseq2gene[refseq_id])) # write out gene unique and non-unique, isoform specific n-grams unique_ngrams = [] unique_no_common_ngrams = [] output = csv.writer(options.unique_ngram, delimiter='\t', quotechar=None) output_notunique = csv.writer(options.not_unique_ngram, delimiter='\t', quotechar=None) output_specific = csv.writer(options.fragment_specific_unique_ngram, delimiter='\t', quotechar=None) for k, v in ngram2refseq.iteritems(): geneset = set([x[1] for x in v]) refseqset = set([x[0] for x in v]) if len(geneset) == 1: geneid = geneset.pop() print refseqset, available_gene2refseq[geneid] if refseqset == set(available_gene2refseq[geneid]): unique_ngrams.append((k, geneid, ', '.join(refseqset))) output.writerow([k, geneid, ', '.join(refseqset)]) else: unique_no_common_ngrams.append((k, geneid, ', '.join(refseqset))) output_specific.writerow([k, geneid, ', '.join(refseqset)]) else: output_notunique.writerow([k, ', '.join(geneset), ', '.join(refseqset)]) # isoform specific reads for oneunique in unique_no_common_ngrams: #print oneunique ngram = len(oneunique[0]) for refseq in oneunique[2].split(', '): record = refseq2sequence[refseq] refseq2sequence[refseq]['gene'] = oneunique[1] pos = 0 while True: pos = record['seq'].find(oneunique[0], pos) if pos < 0: break record['coverage'][pos] = '_' for i in xrange(ngram-1): if record['coverage'][pos+1+i] == '.': record['coverage'][pos+1+i] = ',' pos += 1 # calculate coverage for oneunique in unique_ngrams: #print oneunique ngram = len(oneunique[0]) for refseq in oneunique[2].split(', '): record = refseq2sequence[refseq] refseq2sequence[refseq]['gene'] = oneunique[1] refseq2sequence[refseq]['fragment'] += 1 pos = 0 while True: pos = record['seq'].find(oneunique[0], pos) if pos < 0: break record['coverage'][pos] = 'X' for i in xrange(ngram-1): if 
record['coverage'][pos+1+i] != 'X': record['coverage'][pos+1+i] = '*' pos += 1 # write out coverage coverage_result = csv.writer(options.coverage_output, delimiter='\t', quotechar=None) coverage_result.writerow(['GeneID', 'RefSeqID', 'Length', '# of covered base', '# of unique fragment', '# of found fragmnet', 'coverage', 'cover']) for k, v in refseq2sequence.iteritems(): coverage = v['coverage'] covered = len([x for x in v['coverage'] if x == 'X' or x == '*']) starts = len([x for x in v['coverage'] if x == 'X']) assert len(coverage) == len(v['seq']) percent = starts/float(len(coverage)-ngram+1) coverage_result.writerow([v['gene'], k, len(coverage), covered, v['fragment'], starts, str(percent), ''.join(coverage)])
def test_reverse_complement_of_dna(self):
    """A plain DNA string is reverse-complemented correctly."""
    forward = "ATGAAACTG"
    result = Seq.reverse_complement(forward)
    self.assertEqual("CAGTTTCAT", result)