class TestEventMapping():
    """Checks that EventMapper recovers the correct orientation: mapping a
    set of events against itself must yield the identity path and the lowest
    cost/distance among the four possible strand/model combinations."""

    def __init__(self):
        # Event models for the template and complement strands.
        self.template_model = EventModel("models/template_model_5.model")
        self.complement_model = EventModel("models/complement_model_5.model")
        self.event_mapper = EventMapper()
        # Short test sequence and its reverse complement.
        self.my_seq = Seq("CAAAACGTGT")
        rev_comp = self.my_seq.reverse_complement()
        # Simulated event streams for every strand/model combination.
        self.forward_template_events = Seq2Events(self.my_seq, self.template_model).events
        self.reverse_template_events = Seq2Events(rev_comp, self.template_model).events
        self.reverse_complement_events = Seq2Events(rev_comp, self.complement_model).events
        self.forward_complement_events = Seq2Events(self.my_seq, self.complement_model).events
        self.complement_seq = rev_comp

    def _map_all(self, reference):
        """Map all four event sets against *reference*; returns them in the
        fixed order (fwd-template, rev-template, fwd-complement, rev-complement)."""
        return (self.event_mapper.map(self.forward_template_events, reference),
                self.event_mapper.map(self.reverse_template_events, reference),
                self.event_mapper.map(self.forward_complement_events, reference),
                self.event_mapper.map(self.reverse_complement_events, reference))

    def test_make_simple_reference(self):
        # Map everything against the forward-template events.
        ft, rt, fc, rc = self._map_all(self.forward_template_events)
        # Self-mapping must be the identity alignment over all six events.
        assert len(ft.path[0]) == 6
        assert ft.path[0].tolist() == [0, 1, 2, 3, 4, 5]
        assert ft.path[1].tolist() == [0, 1, 2, 3, 4, 5]
        # ... and must be the cheapest and closest of the four mappings.
        assert min(sum(sum(r.cost)) for r in (ft, rt, fc, rc)) == sum(sum(ft.cost))
        assert min(r.dist for r in (ft, rt, fc, rc)) == ft.dist

    def test_complement_mapping(self):
        # Map everything against the forward-complement events.
        ft, rt, fc, rc = self._map_all(self.forward_complement_events)
        # Self-mapping must be the identity alignment over all six events.
        assert len(fc.path[0]) == 6
        assert fc.path[0].tolist() == [0, 1, 2, 3, 4, 5]
        assert fc.path[1].tolist() == [0, 1, 2, 3, 4, 5]
        # ... and must be the cheapest and closest of the four mappings.
        assert min(sum(sum(r.cost)) for r in (ft, rt, fc, rc)) == sum(sum(fc.cost))
        assert min(r.dist for r in (ft, rt, fc, rc)) == fc.dist
def intron_sequence_single(juncid, f):
    """Return the donor..acceptor dinucleotide string for a single junction.

    *juncid* is parsed by Junctionid; *f* maps chromosome name to sequence.
    The flank sequences are computed for parity with intron_sequence() but
    only the dinucleotide string is returned.
    """
    jn = Junctionid(juncid)
    dna = IUPAC.unambiguous_dna
    if jn.strand == '+':
        fiveprimeflank = Seq(f[jn.chr][jn.start - 10:jn.start], dna)
        threeprimeflank = Seq(f[jn.chr][jn.end:jn.end + 10], dna)
        donormotif = Seq(f[jn.chr][jn.start:jn.start + 2], dna).upper()
        acceptormotif = Seq(f[jn.chr][jn.end - 2:jn.end], dna).upper()
        dastring = donormotif + '..' + acceptormotif
    else:
        # Minus strand: flanks swap roles and must be reverse-complemented.
        fiveprimeflank = Seq(f[jn.chr][jn.end:jn.end + 10], dna).reverse_complement()
        threeprimeflank = Seq(f[jn.chr][jn.start - 10:jn.start], dna).reverse_complement()
        acceptormotif = Seq(f[jn.chr][jn.start:jn.start + 2], dna).upper()
        donormotif = Seq(f[jn.chr][jn.end - 2:jn.end], dna).upper()
        dastring = donormotif.reverse_complement() + '..' + acceptormotif.reverse_complement()
    return dastring
def searchDnaParts(request, sequence_text, displayIdDnaComponent):
    """AJAX view: search DNA parts matching a sequence.

    *sequence_text* is "<sequence>__<vector>"; the vector part is stripped
    from the sequence before the insert lookup. Responds with a JSON object
    holding insert annotations, all part types and all chassis options.
    NOTE(review): `coo`, `f`, `r`, `s` and several Seq objects below are
    computed but never used -- presumably leftovers; verify before removing.
    """
    coo = sequence_text
    rs = sequence_text.split('__')
    sequence_text = rs[0]
    sequence_vector = rs[1]
    f = len(sequence_vector)
    r = len(sequence_text)
    # Remove the vector portion before searching for inserts.
    seq_exceptVector = sequence_text.replace(sequence_vector, '')
    s = len(seq_exceptVector)
    message = {"list_dnas": "", "extra_values": "","parttypes_values": "","optimizedfor_values": "", \
        "reverse_list_dnas": "","reverse_extra_values": ""}
    if request.is_ajax():
        # calculate the reverse complement sequence and duplicate sequence for better Vector matching
        sequence_textDuplicate = sequence_text + sequence_text
        my_seqDuplicate = Seq(sequence_textDuplicate, IUPAC.unambiguous_dna)
        revseqDuplicate = my_seqDuplicate.reverse_complement()
        my_seq = Seq(sequence_text, IUPAC.unambiguous_dna)
        revseq = my_seq.reverse_complement()
        my_seq_exceptVect = Seq(seq_exceptVector, IUPAC.unambiguous_dna)
        revseq_exceptVect = my_seq_exceptVect.reverse_complement()
        # part for retrieving potential Inserts, on both strands
        message['extra_values'] = getInsertDBAnnotationBySequence(seq_exceptVector,'+',displayIdDnaComponent)
        message['reverse_extra_values'] = getInsertDBAnnotationBySequence(str(revseq_exceptVect),'-',displayIdDnaComponent)
        # part retrieving all partTypes, serialised by hand as a JSON array
        dnapartstypesAll = M.DnaComponentType.objects.all()
        json_dnaparttype = ''
        for dnaparttype in dnapartstypesAll:
            id = dnaparttype.id
            name = dnaparttype.name
            if json_dnaparttype == '':
                json_dnaparttype = '{ "id":"'+str(id)+'","name":"'+name+'"}'
            else:
                json_dnaparttype = json_dnaparttype+',{ "id":"'+str(id)+'","name":"'+name+'"}'
        json_dnaparttype = '['+json_dnaparttype+']'
        message['parttypes_values'] = json_dnaparttype
        # part retrieving all "optimized for" chassis options
        chassisOptimizedAll = M.Chassis.objects.all()
        json_chassisOptimizedAll = ''
        # Seed with an empty entry so the client gets a blank first option;
        # this also means the == '' branch below can never fire.
        json_chassisOptimizedAll = '{ "id":"","name":""}'
        for chas in chassisOptimizedAll:
            id = chas.id
            name = chas.name
            displayId = chas.displayId
            if json_chassisOptimizedAll == '':
                json_chassisOptimizedAll = '{ "id":"'+str(id)+'","name":"'+name+'","displayId":"'+displayId+'"}'
            else:
                json_chassisOptimizedAll = json_chassisOptimizedAll+',{ "id":"'+str(id)+'","name":"'+name+ \
                    '","displayId":"'+displayId+'"}'
        json_chassisOptimizedAll = '['+json_chassisOptimizedAll+']'
        message['optimizedfor_values'] = json_chassisOptimizedAll
    else:
        message = "None"
    json = simplejson.dumps(message)
    return HttpResponse(json, mimetype='application/json')
def intron_sequence(myjuncs, f):
    """Return a nested dict mapping each junction id to its donor..acceptor
    dinucleotide string and its 10-bp 5'/3' flanking sequences.

    *myjuncs* is an iterable of junction ids; *f* maps chromosome name to
    its sequence.
    """
    INTSEQ = collections.defaultdict(lambda: collections.defaultdict(dict))
    dna = IUPAC.unambiguous_dna
    for juncid in myjuncs:
        jn = Junctionid(juncid)
        if jn.strand == '+':
            fiveprimeflank = Seq(f[jn.chr][jn.start - 10:jn.start], dna)
            threeprimeflank = Seq(f[jn.chr][jn.end:jn.end + 10], dna)
            donormotif = Seq(f[jn.chr][jn.start:jn.start + 2], dna).upper()
            acceptormotif = Seq(f[jn.chr][jn.end - 2:jn.end], dna).upper()
            dastring = donormotif + '..' + acceptormotif
        else:
            # Minus strand: flanks swap roles and must be reverse-complemented.
            fiveprimeflank = Seq(f[jn.chr][jn.end:jn.end + 10], dna).reverse_complement()
            threeprimeflank = Seq(f[jn.chr][jn.start - 10:jn.start], dna).reverse_complement()
            acceptormotif = Seq(f[jn.chr][jn.start:jn.start + 2], dna).upper()
            donormotif = Seq(f[jn.chr][jn.end - 2:jn.end], dna).upper()
            dastring = donormotif.reverse_complement() + '..' + acceptormotif.reverse_complement()
        INTSEQ[juncid]['dinucleotide'] = dastring
        INTSEQ[juncid]['flank5'] = fiveprimeflank
        INTSEQ[juncid]['flank3'] = threeprimeflank
    return INTSEQ
def get_reads_seqs(bamfile, rnames):
    """
    Return the sequences of all the reads from the bam file

    Arguments:
    - `bamfile`: The pysam file
    - `rnames`: reads names
    """
    r1_seqs = {}
    r2_seqs = {}
    # Index every alignment in the BAM by query name.
    reads_by_name = defaultdict(list)
    for aln in bamfile.fetch(until_eof=True):
        reads_by_name[aln.qname].append(aln)
    # Only keep the requested names that actually occur in the BAM.
    for name in set(rnames) & set(reads_by_name):
        for aln in reads_by_name[name]:
            out = Seq(aln.seq)
            if aln.is_read1:
                if not aln.is_reverse:
                    out = out.reverse_complement()
                r1_seqs[aln.qname] = str(out)
            else:
                if aln.is_reverse:
                    out = out.reverse_complement()
                r2_seqs[aln.qname] = str(out)
    # r1_seqs is the 3' end of the second fused RNA, r2_seqs is the 5' of the
    # first fused RNA
    return r1_seqs, r2_seqs
class TwoPrimers(object):
    """A container for the two primers of a sample"""

    def __len__(self):
        return 2

    def __init__(self, fwd_str, rev_str):
        # Raw primer strings and their lengths #
        self.fwd_str, self.rev_str = fwd_str, rev_str
        self.fwd_len, self.rev_len = len(self.fwd_str), len(self.rev_str)
        # Biopython sequence objects (IUPAC ambiguity codes allowed) #
        self.fwd_seq = Seq(self.fwd_str, IUPAC.ambiguous_dna)
        self.rev_seq = Seq(self.rev_str, IUPAC.ambiguous_dna)
        # Regex search patterns # Don't add reverse complement here, use option instead #
        self.fwd_pattern = iupac_pattern(self.fwd_seq)
        self.rev_pattern = iupac_pattern(self.rev_seq)
        # Search patterns for the reverse-complemented primers #
        self.fwd_pattern_revcompl = iupac_pattern(self.fwd_seq.reverse_complement())
        self.rev_pattern_revcompl = iupac_pattern(self.rev_seq.reverse_complement())
        # Compiled expressions without mismatches #
        self.fwd_regex = re.compile(self.fwd_pattern)
        self.rev_regex = re.compile(self.rev_pattern)
        # Same, for RNA input (uracil instead of thymine) #
        self.fwd_regex_uracil = re.compile(self.fwd_pattern.replace('T', 'U'))
        self.rev_regex_uracil = re.compile(self.rev_pattern.replace('T', 'U'))
def extractRegion(bamfile):
    """Extract NS5B-region reads from *bamfile* into <bamfile>.extracted.fastq.

    Reads fully inside the region are written as-is (reverse reads are
    re-reverse-complemented); reads overhanging either boundary are trimmed
    to the boundary before writing.
    """
    pysam.index(bamfile) # must create a .bai index for any bam file to be read or fetch won't work
    bam = pysam.Samfile(bamfile,'rb') # and must be done before bamfile is opened
    ref = bam.references[0] # Get name of reference reads aligned to in bam
    outFASTQfile = open(bamfile+".extracted.fastq",'w')
    # Need to keep this dictionary up-to-date with references you expect to see
    # (unknown reference names raise KeyError below).
    gene_pos = {'1b_Con1_full_reference_seq':{'ns5b':{'nterm':7599,'cterm':9371}},
                '1a_H77_full_reference_seq':{'ns5b':{'nterm':7602,'cterm':9374}},
                'H77_genome':{'ns5b':{'nterm':7602,'cterm':9374}},
                'JFH-1_genome':{'ns5b':{'nterm':7666,'cterm':9443}}}
    # Get the reads in region of interest
    read_pool = bam.fetch(bam.references[0], gene_pos[ref]['ns5b']['nterm'],gene_pos[ref]['ns5b']['cterm'])
    # Process reads
    for read in read_pool:
        seqlen = len(read.seq)  # NOTE(review): assigned but never used
        # If start and end of read is completely within region of interest, just write it out
        if read.pos >= gene_pos[ref]['ns5b']['nterm'] and read.aend <= gene_pos[ref]['ns5b']['cterm']:
            if read.is_reverse == True: # all reverse reads in a bam file have been reverse
                seq = Seq(read.query) # complemented already so they need to be reverse
                rc = seq.reverse_complement().tostring() # complemented again, along with the quality scores
                rq = reverseString(read.qqual) # to write correctly to the fastq
                outFASTQfile.write("@"+read.qname+"\n"+rc+"\n+\n"+rq+"\n")
            else:
                outFASTQfile.write("@"+read.qname+"\n"+read.query+"\n+\n"+read.qqual+"\n")
        # If read is longer than region on N-term
        elif read.pos < gene_pos[ref]['ns5b']['nterm']:
            # NOTE(review): the "- 1" differs from the C-term trim below --
            # confirm whether this off-by-one is intentional.
            q = gene_pos[ref]['ns5b']['nterm'] - read.pos - 1
            if read.is_reverse == True:
                seq = Seq(read.query[q:])
                rc = seq.reverse_complement().tostring()
                rq = reverseString(read.qqual[q:])
                outFASTQfile.write("@"+read.qname+"\n"+rc+"\n+\n"+rq+"\n")
            else:
                outFASTQfile.write("@"+read.qname+"\n"+read.query[q:]+"\n+\n"+read.qqual[q:]+"\n")
        # If read is longer than region on C-term
        elif ((read.pos-read.qstart) + len(read.seq)) > gene_pos[ref]['ns5b']['cterm']:
            s = gene_pos[ref]['ns5b']['cterm']
            if read.pos <= s:
                q = s - read.pos
                if read.is_reverse == True:
                    seq = Seq(read.query[:q])
                    rc = seq.reverse_complement().tostring()
                    rq = reverseString(read.qqual[:q])
                    outFASTQfile.write("@"+read.qname+"\n"+rc+"\n+\n"+rq+"\n")
                else:
                    outFASTQfile.write("@"+read.qname+"\n"+read.query[:q]+"\n+\n"+read.qqual[:q]+"\n")
    outFASTQfile.close()
    return
def readFamilySequences(file, temparr, family):
    """Parse an .align2-style file and locate each sequence of *family* in
    the consensus string *temparr*.

    Returns a list of [original_seq, organism_seq, match_start, match_end,
    family_coord_start, family_coord_end] entries.

    Fixes vs. the previous version: the match END position (r1.end()) was
    computed but the result row stored match START twice; the file handle
    is now closed via a with-block.
    """
    arrforall = []      # one entry per family sequence found
    coor_arr = []       # coordinates from the current family header line
    correctFamily2 = 0  # 0 = outside family, 1 = expect organism seq, 2 = expect original seq
    with open(file) as fileptr2:
        for line in fileptr2:
            if line[0] == '#':  # KARRO: Allow us to comment out lines in the file (for testing)
                continue
            line = line.rstrip()
            arr = re.split("\s+", line)
            if len(arr) > 1:
                # Header line: "<id> <family> <start> <end> ..."
                if arr[1] == family:
                    coor_arr.append(int(arr[2]))  # KARRO: ints for consistency
                    coor_arr.append(int(arr[3]))
                    correctFamily2 = 1
                    line = "\t".join(arr)
                else:
                    correctFamily2 = 0
            else:
                if correctFamily2 == 1:
                    # organism sequence line
                    correctFamily2 = 2
                    organism_sequence = line
                elif correctFamily2 == 2:
                    # original sequence line
                    origi_sequence = line
                    # Reduce dashes so the sequence can be searched in temparr.
                    searchable = "".join(re.split("-+", line.lower()))
                    r0 = re.search(searchable, temparr)
                    if r0 is None:
                        # Not found on this strand: reverse-complement everything.
                        searchable = str(Seq(searchable).reverse_complement())
                        organism_sequence = str(Seq(organism_sequence).reverse_complement())
                        origi_sequence = str(Seq(origi_sequence).reverse_complement())
                    # Splice the original (dashed) sequence into the consensus,
                    # then locate it to get its coordinates.
                    temparr2 = temparr.replace(searchable, origi_sequence)
                    r1 = re.search(origi_sequence, temparr2)
                    start = r1.start()
                    end = r1.end()
                    # BUGFIX: previously stored [.., start, start, ..] and never used `end`.
                    arrforall.append([origi_sequence, organism_sequence, start, end,
                                      coor_arr[0], coor_arr[1]])
                    coor_arr = []
    return arrforall
def find_priming_sites(oligo, seq):
    """For supplied priming sequence, find positions of all matches in a
    given sequence, on both strands.

    Returns a list of match end positions (forward-strand hits first, then
    hits of the reverse-complemented oligo).
    """
    array = []
    for m in re.finditer(oligo, str(seq)):
        array.append(m.end())
    # BUGFIX: Seq.reverse_complement() returns a NEW Seq (Seq objects are
    # immutable); the previous code discarded the result, so the reverse
    # strand was searched with the un-complemented oligo.
    rc_oligo = Seq(oligo).reverse_complement()
    for m in re.finditer(str(rc_oligo), str(seq)):
        array.append(m.end())
    return array
def get_prom(f, gene):
    """Return the 3-kb promoter of *gene* as (forward, reverse) strings.

    *f* is a FASTA accessor exposing .sequence(); *gene* is a record with
    'seqid', 'strand', 'start' and 'end' fields. One strand is fetched from
    the reference and the other obtained by reverse complement.
    """
    seqid = str(gene["seqid"])
    strand = gene["strand"]
    if strand == "+":
        # Promoter lies upstream of the gene start (clamped at 0).
        upstream = max(0, int(gene["start"]) - 3000)
        promf = f.sequence({'chr': seqid, 'start': upstream, 'stop': int(gene["start"])})
        promr = Seq(promf, generic_dna).reverse_complement()
    elif strand == "-":
        # Promoter lies downstream of the gene end on the plus strand.
        downstream = int(gene["end"]) + 3000
        promr = f.sequence({'chr': seqid, 'start': int(gene["end"]), 'stop': downstream, 'strand': '+'})
        promf = Seq(promr, generic_dna).reverse_complement()
    return str(promf), str(promr)
def translate(sequence, min_protein_length=1, orient=None, frame=None, full=False, all=False):
    """Translates cdna sequence into protein.

    Scans the requested strand(s)/frame(s) for ORFs of at least
    *min_protein_length* amino acids. With all=True returns every ORF as
    (start, end, strand, protein) tuples; otherwise only the longest ORF
    (the full tuple when full=True, just the protein string when not).
    Returns None when no ORF is found.
    NOTE(review): the cmp-style orfs.sort(lambda ...) is Python-2-only, and
    the parameter name `all` shadows the builtin.
    """
    seq = Seq(sequence)
    orfs = []
    seq_len = len(seq)
    # Choose which strands to scan; -1 uses the reverse complement.
    if orient == "+":
        strand_and_base = [(+1, seq)]
    elif orient == "-":
        strand_and_base = [(-1, seq.reverse_complement())]
    else:
        strand_and_base = [(+1, seq), (-1, seq.reverse_complement())]
    for strand, nuc in strand_and_base:
        for fm in range(3):
            if frame != None and fm != frame:
                continue
            trans = str(nuc[fm:].translate())
            trans_len = len(trans)
            aa_start = 0
            aa_end = 0
            # Walk stop-to-stop segments of the translation.
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    aa_end = trans_len - 1
                if aa_end - aa_start >= min_protein_length:
                    # Map amino-acid coordinates back onto the input sequence.
                    if strand == 1:
                        start = fm + aa_start * 3
                        end = min(seq_len - 1, fm + aa_end * 3 + 3 - 1)
                    else:
                        # Reverse strand: coordinates count from the 3' end.
                        end = seq_len - 1 - fm - aa_start * 3
                        start = end + 1 - (aa_end - aa_start) * 3 - 3
                    orfs.append((start, end, strand, trans[aa_start:aa_end]))
                aa_start = aa_end + 1
    if len(orfs) > 0:
        if not all:
            # Python 2 cmp-style sort: longest protein first.
            orfs.sort(lambda x, y: len(y[3]) - len(x[3]))
        if not full:
            return orfs[0][-1]
        else:
            return orfs[0]
    else:
        return None
class TwoPrimers(object):
    """A container for the two primers of a pool"""

    def __repr__(self):
        return '<%s object for pool %s>' % (self.__class__.__name__, self.parent.id_name)

    def __len__(self):
        return 2

    def __init__(self, parent):
        self.parent, self.pool = parent, parent
        self.info = parent.info['primers']
        # Pool-level primer-set name #
        self.name = self.info.get('name')
        fwd_info, rev_info = self.info['forward'], self.info['reverse']
        # Per-direction names #
        self.fwd_name, self.rev_name = fwd_info['name'], rev_info['name']
        # Raw strings and lengths #
        self.fwd_str, self.rev_str = fwd_info['sequence'], rev_info['sequence']
        self.fwd_len, self.rev_len = len(self.fwd_str), len(self.rev_str)
        # Biopython sequence objects (IUPAC ambiguity codes allowed) #
        self.fwd_seq = Seq(self.fwd_str, IUPAC.ambiguous_dna)
        self.rev_seq = Seq(self.rev_str, IUPAC.ambiguous_dna)
        # Regex patterns: forward as-is, reverse as its reverse complement #
        self.fwd_pattern = ''.join('[' + iupac[char] + ']' for char in self.fwd_seq)
        self.rev_pattern = ''.join('[' + iupac[char] + ']' for char in self.rev_seq.reverse_complement())
        # Compiled expressions #
        self.fwd_regex = re.compile(self.fwd_pattern)
        self.rev_regex = re.compile(self.rev_pattern)
        # Same, for RNA input (uracil instead of thymine) #
        self.fwd_regex_uracil = re.compile(self.fwd_pattern.replace('T', 'U'))
        self.rev_regex_uracil = re.compile(self.rev_pattern.replace('T', 'U'))
def stitch(fragments):
    """Build stitching primers for an ordered list of fragment SeqRecords.

    Concatenates the fragments into the donor cassette and derives the
    outer (Lup/Rup, Ldown/Rdown) and inner (L/R) primers. Returns the six
    primer strings plus cassette length/sequence summary strings. With
    more than three fragments only the last inner L/R pair is returned.
    """
    n_frags = len(fragments)
    donor = Seq("")
    index = []
    print("")
    # Assemble the full donor cassette.
    for frag in fragments:
        donor = donor + frag
    # Ensure every slot exists even for short fragment lists.
    Lup = Rup = Ldown = Rdown = L = R = ""
    for i in range(n_frags):
        if i == 0:
            # First fragment: left outer primer plus right overhang primer.
            Lup = "Lup" + fragments[i].id + " " + getPrimer(donor)
            Rup = ("Rup" + fragments[i].id + "(" + fragments[i + 1].id + ") "
                   + overhangPrimer(fragments[i].reverse_complement(), fragments[i + 1].reverse_complement()))
        elif i == n_frags - 1:
            # Last fragment: left overhang primer plus right outer primer.
            Ldown = ("Ldown" + fragments[i].id + "(" + fragments[i - 1].id + ") "
                     + overhangPrimer(fragments[i], fragments[i - 1]))
            Rdown = "Rdown" + fragments[i].id + " " + getPrimer(donor.reverse_complement())
        else:
            # Interior fragments: overhang primers towards both neighbours.
            L = ("L" + fragments[i].id + "(" + fragments[i - 1].id + ") "
                 + overhangPrimer(fragments[i], fragments[i - 1]))
            R = ("R" + fragments[i].id + "(" + fragments[i + 1].id + ") "
                 + overhangPrimer(fragments[i].reverse_complement(), fragments[i + 1].reverse_complement()))
    sequenceLength = len(donor.seq)
    donorSequence = donor.seq
    return (str(Lup), str(Rup), str(Ldown), str(Rdown), str(L), str(R),
            "Sequence Length: " + str(sequenceLength),
            "Sequence: " + str(donorSequence))
def insert_element(pos, ref, ofile):
    """Insert a transposable element into a pseudo-genome chromosome.

    *pos* is a tuple (chrn, start, insert_pos, strand, rep_id, rep_seq,
    rep_tsd, rep_name); *ref* maps chromosome name -> sequence and is
    modified in place; a GFF line describing the insertion is written to
    *ofile*. Python 2 only (uses `print >>`).
    """
    chrn = pos[0]
    chrseq = ref[chrn]
    # Split the chromosome at the insertion point.
    half1 = chrseq[:pos[2]]
    half2 = chrseq[pos[2]:]
    repid = pos[4]
    repseq = pos[5]
    reptsd = pos[6]
    repname = pos[7]
    #print 'insert: %s, %s' %(reptsd, repseq)
    #Chr1 not.give transposable_element_attribute 1132975 1132977 - . . ID=Chr1.1132977.spanners;avg_flankers=17;spanners=0;type=homozygous;TE=mping;TSD=TAA
    gff_newline = '%s\tPseudoGenome\tTransposable_element\t%s\t%s\t%s\t.\t.\tID=%s_%s_%s;Original_ID=%s;TE=%s;TSD=%s;' %(chrn, pos[1], pos[2], pos[3], chrn, pos[1], pos[2], repid, repname, reptsd)
    print >> ofile, gff_newline
    ##we choose sequence at target site as tsd, not use tsd provided
    tsdstart = pos[2] - len(reptsd)
    tsdseq = chrseq[tsdstart:pos[2]]
    newseq = ''
    if pos[3] == '+':
        newseq = half1 + repseq + tsdseq + half2
        #print tsdseq, repseq
    else:
        # Minus strand: insert the reverse complement of the element.
        repseq_seq = Seq(repseq)
        repseq_rec = repseq_seq.reverse_complement()
        #print tsdseq, str(repseq_rec)
        newseq = half1 + str(repseq_rec) + tsdseq + half2
    # Replace the chromosome with the modified sequence (in-place update).
    ref[chrn] = newseq
def main(inputtable, referencefile, window, k_min, k_max, outputdir, outputtable):
    """Count k-mers (k_min..k_max) in the *window* bases adjacent to novel
    insertion sites and write a KMER/AMOUNT table to outputdir/outputtable.
    """
    outputdir += "/"
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)
    megatable = pd.read_table(inputtable)
    reference = pysam.Fastafile(referencefile)
    table = open(outputdir + outputtable, 'w')
    table.write('KMER' + '\t' + 'AMOUNT' + '\n')
    kmer = []
    for index, row in log_progress(megatable.iterrows(), name=inputtable, every=250, size=len(megatable)):
        # Only consider sites absent from both Alu annotation sources.
        if (str(row['Alu_hg38']) == 'Unknown') and (str(row['Alu_dbRIP_hg38']) == 'Unknown'):
            pos = int(row['POS'])
            if row['STRAND'] == '+':
                # Plus strand: take the window downstream, reverse-complemented.
                fetched = Seq(reference.fetch(row['CHR'], pos, pos + window))
                seq = str(fetched.reverse_complement()).upper()
            else:
                # Minus strand: take the window upstream as-is.
                seq = reference.fetch(row['CHR'], pos - window - 1, pos - 1).upper()
            # Collect every k-mer of each requested length.
            for k in range(k_min, k_max + 1):
                for offset in range(len(seq) - k):
                    kmer.append(seq[offset:offset + k])
    for key, value in dict(Counter(kmer)).items():
        table.write(str(key) + '\t' + str(value) + '\n')
    table.close()
def _process_single_end(self, input_fh, output_fh):
    """Trim/clip single-end reads from *input_fh* and write FASTA to *output_fh*.

    Applies, in order: quality trimming (FASTQ only), optional reverse
    complement, adapter clipping and poly(A) clipping, while updating the
    self._stats counters. Reads shorter than self._min_read_length after
    processing are dropped.
    """
    for header, seq, qualities in self._parse_sequences(input_fh):
        raw_seq_len = len(seq)
        self._stats["total_no_of_reads"] += 1
        # Quality trimming only applies to FASTQ input with a threshold set.
        if self._fastq and not self._min_phred_score is None:
            seq = self._trim_by_quality(seq, qualities)
        if self._reverse_complement:
            seq = Seq(seq)
            seq = str(seq.reverse_complement())
        if not self._adapter is None:
            seq = self._clip_adapter(seq)
        if self._poly_a_clipping:
            seq = self._poly_a_clipper.clip_poly_a_strech(seq)
            seq = self._poly_a_clipper.remove_3_prime_a(seq)
        clipped_seq_len = len(seq)
        # Classify how much was removed; NOTE(review): the counters attribute
        # ALL length change to poly(A)/single-A removal even when quality or
        # adapter clipping did the trimming.
        if clipped_seq_len == raw_seq_len - 1:
            self._stats["single_a_removed"] += 1
        elif clipped_seq_len < raw_seq_len - 1:
            self._stats["polya_removed"] += 1
        else:
            self._stats["unmodified"] += 1
        # Drop reads that became too short.
        if clipped_seq_len < self._min_read_length:
            self._stats["too_short"] += 1
            continue
        self._stats["long_enough"] += 1
        self._stats["read_length_before_processing_and_freq"][
            raw_seq_len] += 1
        self._stats["read_length_after_processing_and_freq"][
            clipped_seq_len] += 1
        # Encoding to bytes is necessary due to saving via gzip
        output_fh.write(str.encode(">%s\n%s\n" % (header, seq)))
def findFragendSites(fasta, resite):
    '''
    Build a FragendDict: the 1-based positions of all fragends for each
    strand of every chromosome in a FASTA file.
    '''
    # Normalise the restriction site and seed the output dictionary.
    resite = resite.upper()
    frags = {'resite': resite}
    # Recognition site for each strand.
    standard = Seq(resite)
    revcomp = standard.reverse_complement()
    # Parse the FASTA file chromosome by chromosome.
    fastaHandle = open(fasta)
    for record in SeqIO.parse(fastaHandle, 'fasta'):
        fName = str(record.id)
        fSequence = str(record.seq).upper()
        # nt_search returns [pattern, hit0, hit1, ...]; skip the pattern.
        forward = nt_search(fSequence, standard)[1:]
        frags[(fName, '+')] = [hit + len(resite) for hit in forward] if forward else []
        reverse = nt_search(fSequence, revcomp)[1:]
        frags[(fName, '-')] = [hit + 1 for hit in reverse] if reverse else []
    fastaHandle.close()
    return(frags)
def scanSequences(title, sequence, quality):
    """Run hmmscan on a read (both strands) and trim it to the NS5B region.

    Writes the sequence and its reverse complement to a temp file, scans it,
    and uses the reported conserved-region coordinates to slice sequence and
    quality strings. Returns (title, trimmed_seq, trimmed_quality).
    """
    # Random suffix to keep concurrent temp files apart.
    tmp = str(random.random())[2:]
    seq = Seq(sequence)
    tempfile = open("hmm.seq"+tmp,"w")
    tempfile.write(">forward\n"+sequence+"\n>reverse\n"+seq.reverse_complement().tostring()) # writing full length seq to file for hmmscan
    tempfile.close()
    local_path = os.getcwd()+"/"
    hmmscan_bin = "/usr/local/bin/hmmscan"
    hmmresult_filename = doHMMScan("hmm.seq"+tmp,local_path,hmmscan_bin)
    target_cregions = processHMMresult(hmmresult_filename,local_path)
    s = q = ''
    # NOTE(review): every `target_cregions=='forward'`/`=='reverse'` below
    # compares the whole dict to a string and is therefore always False, so
    # only the first branch can ever fire -- presumably a strand flag inside
    # target_cregions was intended; confirm against processHMMresult().
    if target_cregions['ns5b_5prime'] =='' and target_cregions['ns5b_3prime'] =='':
        # No conserved region found: keep the read untouched.
        s = sequence
        q = quality
    elif target_cregions['ns5b_5prime'] != '' and target_cregions=='forward':
        x = target_cregions['ns5b_5prime'][0]
        s = sequence[x:]
        q = quality[x:]
    elif target_cregions['ns5b_3prime'] != '' and target_cregions=='forward':
        x = target_cregions['ns5b_3prime'][0]
        s = sequence[:x]
        q = quality[:x]
    elif target_cregions['ns5b_5prime'] and target_cregions=='reverse':
        x = len(sequence)-target_cregions['ns5b_5prime'][0]
        s = sequence[:x]
        q = quality[:x]
    elif target_cregions['ns5b_3prime'] and target_cregions=='reverse':
        x = len(sequence)-target_cregions['ns5b_3prime'][0]+1 # This is a little tricky to compensate for 0-based index of string
        s = sequence[x:]
        q = quality[x:]
    # print "title: ",title," q: ",q," s: ",s
    return (title,s,q)
def stitch(fragments):
    """Print stitching primers for an ordered list of fragment SeqRecords,
    followed by the assembled donor cassette length and sequence."""
    n_frags = len(fragments)
    donor = Seq("")
    index = []
    print("")
    # Assemble the full donor cassette.
    for frag in fragments:
        donor = donor + frag
    for i in range(n_frags):
        if i == 0:
            # First fragment: left outer primer plus right overhang primer.
            print("Lup" + fragments[i].name + " " + getPrimer(donor))
            print("Rup" + fragments[i].name + "(" + fragments[i + 1].name + ") "
                  + overhangPrimer(fragments[i].reverse_complement(), fragments[i + 1].reverse_complement()))
        elif i == n_frags - 1:
            # Last fragment: left overhang primer plus right outer primer.
            print("Ldown" + fragments[i].name + "(" + fragments[i - 1].name + ") "
                  + overhangPrimer(fragments[i], fragments[i - 1]))
            print("Rdown" + fragments[i].name + " " + getPrimer(donor.reverse_complement()))
        else:
            # Interior fragments: overhang primers towards both neighbours.
            print("L" + fragments[i].name + "(" + fragments[i - 1].name + ") "
                  + overhangPrimer(fragments[i], fragments[i - 1]))
            print("R" + fragments[i].name + "(" + fragments[i + 1].name + ") "
                  + overhangPrimer(fragments[i].reverse_complement(), fragments[i + 1].reverse_complement()))
    print("")
    print("Your donor DNA cassette, has the following bp length and sequence:")
    print("")
    print(len(donor.seq))
    print("")
    print(donor.seq)
    print("")
    print("You might want to copy this entire prompt and save it for your records.")
def extractRegion(bamfile, start, stop, output):
    """Extract reads overlapping [start, stop) from a BAM file and write
    them as FASTQ or FASTA (chosen by *output*) next to the input file."""
    pysam.index(bamfile)  # must create a .bai index for any bam file to be read or fetch won't work
    bam = pysam.Samfile(bamfile, 'rb')  # and must be done before bamfile is opened
    ref = bam.references[0]  # Get name of reference reads aligned to in bam
    outfile = open(bamfile + ".extracted." + output, 'w')
    # Fetch and process the reads in the region of interest.
    for read in bam.fetch(bam.references[0], start, stop):
        if read.is_reverse == True:
            # Reverse reads are stored already reverse-complemented, so undo
            # that for both sequence and quality string before writing.
            rc = Seq(read.query).reverse_complement().tostring()
            rq = reverseString(read.qqual)
            if output == 'fastq':
                outfile.write("@" + read.qname + "\n" + rc + "\n+\n" + rq + "\n")
            elif output == 'fasta':
                outfile.write('>' + read.qname + '\n' + rc + '\n')
        else:
            if output == 'fastq':
                outfile.write("@" + read.qname + "\n" + read.query + "\n+\n" + read.qqual + "\n")
            elif output == 'fasta':
                outfile.write('>' + read.qname + '\n' + read.query + '\n')
    outfile.close()
    return
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend each read's barcode (from *bcfile*) to its sequence in *seqfile*.

    Both files are FASTQ; records are matched by the first whitespace-split
    token of the header. With rc=True the barcode is reverse-complemented
    (and its qualities reversed) first; *text* is appended to the header.
    The result replaces *seqfile* in place. Python 2 only (uses .next()).
    """
    tmph = open(seqfile+'.tmp', 'w')
    itr1 = FastqGeneralIterator(open(seqfile))
    itr2 = FastqGeneralIterator(open(bcfile))
    (h1, s1, q1) = itr1.next()
    (h2, s2, q2) = itr2.next()
    while 1:
        h1 = h1.split()[0]
        h2 = h2.split()[0]
        # Advance the barcode file until its header matches the read's.
        # NOTE(review): if itr2 is exhausted before a match is found, the
        # break leaves h1 != h2 and the record is still written with the
        # last (mismatched) barcode -- confirm this is intended.
        while h1 != h2:
            try:
                (h2, s2, q2) = itr2.next()
                h2 = h2.split()[0]
            except (StopIteration, IOError):
                break
        if rc:
            rcs = Seq(s2, generic_dna)
            s2 = rcs.reverse_complement()
            q2 = q2[::-1]  # qualities must be reversed alongside the sequence
        if text:
            h1 = h1+'.'+text
        # Barcode goes in front of both sequence and quality strings.
        tmph.write("@%s\n%s%s\n+\n%s%s\n" %(h1, s2, s1, q2, q1))
        try:
            (h1, s1, q1) = itr1.next()
            (h2, s2, q2) = itr2.next()
        except (StopIteration, IOError):
            break
    tmph.close()
    # Atomically replace the original file with the annotated copy.
    os.rename(seqfile+'.tmp', seqfile)
def generateSeqHandles(anIndexCfg):
    """
    The YAML config file to parse is like:
        handles:
          prefix: "TTAGTCTCCGACGGCAGGCTTCAAT"
          postfix: "ACGCACCCACCGGGACTCAG"
        indexes: [ "ACAGTC", "TGATGC", "TCTCAG" ]
    There is a handle at one end of each sequence which is as follows:
        TTAGTCTCCGACGGCAGGCTTCAAT-ACAGTC-ACGCACCCACCGGGACTCAG
        prefix                   -index - postfix
    Returns (forward_handles, reverse_complement_handles).
    """
    prefix = anIndexCfg["handles"]["prefix"]
    postfix = anIndexCfg["handles"]["postfix"]
    # One handle string per configured index.
    forwardIdx = [prefix + index + postfix for index in anIndexCfg["indexes"]]
    # And the reverse complement of each handle.
    reverseIdx = [str(Seq(handle).reverse_complement()) for handle in forwardIdx]
    return (forwardIdx, reverseIdx)
def make_consensus(rev_string, for_string, seqfile):
    """Align a reverse and a forward read with clustalw and return their
    consensus sequence (ambiguous positions called as N)."""
    # Strip whitespace from the raw chromatogram strings.
    clean = lambda raw: raw.replace("\n", "").replace('\r', '').replace(' ', '')
    rev_sequence = Seq(clean(rev_string), IUPAC.ambiguous_dna).reverse_complement()
    for_sequence = Seq(clean(for_string), IUPAC.ambiguous_dna)
    paired_sequences = [SeqRecord(rev_sequence, id="rev"), SeqRecord(for_sequence, id="for")]
    if not os.path.exists("results/"):
        os.makedirs("results/")
    # Write the pair to FASTA and align it; huge gap penalties force an
    # essentially gap-free alignment.
    fasta_file = "results/" + seqfile + ".fasta"
    SeqIO.write(paired_sequences, fasta_file, "fasta")
    aln_file = "results/" + seqfile + ".aln"
    clustalw_cline = ClustalwCommandline("clustalw", infile=fasta_file, outfile=aln_file, pwgapopen=100, gapopen=100)
    clustalw_cline()
    # Hack: mask every N so dumb_consensus accepts a single real base call
    # against an N in the other read.
    with open(aln_file, 'r+') as handle:
        contents = handle.read()
    with open(aln_file, 'w') as handle:
        handle.write(contents.replace('N', '.'))
    # Read the alignment back and call the consensus.
    alignment = AlignIO.read(aln_file, "clustal")
    summary_align = AlignInfo.SummaryInfo(alignment)
    return summary_align.dumb_consensus(ambiguous="N", threshold=0.0, require_multiple=0)
def main(args):
    """Read an index FASTQ file and write its distinct barcodes, one per
    line, optionally reverse-complemented and/or name-prefixed."""
    usage = "usage: %prog [options] -i <input index file> -o <output barcode file>"+__doc__
    parser = OptionParser(usage)
    parser.add_option("-i", "--input", dest="input", default=None, help="Input index fastq file.")
    parser.add_option("-o", "--output", dest="output", default=None, help="Output barcode file.")
    parser.add_option("-p", "--prefix", dest="prefix", default=None, help="Optional string to prepend to names.")
    parser.add_option("-r", "--revcomp", dest="revcomp", action="store_true", default=False, help="Print reverse complement of index sequences for barcodes [default is same].")
    (opts, args) = parser.parse_args()
    if not (opts.input and os.path.isfile(opts.input) and opts.output):
        parser.error("Missing input and/or output")
    # parse index file - dict keys give us the distinct barcode sequences
    barcodes = {}
    input_hdl = open(opts.input, 'rU')
    for rec in FastqGeneralIterator(input_hdl):
        barcodes[rec[1].upper()] = 1
    input_hdl.close()
    # write one barcode per line
    output_hdl = open(opts.output, 'w')
    for i, bc in enumerate(barcodes.keys()):
        if opts.revcomp:
            bcseq = Seq(bc, generic_dna)
            bc = bcseq.reverse_complement()
        if opts.prefix:
            output_hdl.write("%s.%d\t%s\n"%(opts.prefix, i+1, bc))
        else:
            output_hdl.write(bc+"\n")
    output_hdl.close()
    return 0
def calc_repeat_rev_comp(seq, window):
    """Find length-*window* subsequences shared between *seq* and its
    reverse complement.

    Returns (repeats_list, repeats_binary_list): a list of
    {subsequence: sorted forward-strand positions} dicts, and a 0/1 mask
    over the sequence where 0 marks a reverse-complement repeat position.
    """
    forward = Seq(seq)
    revcomp = forward.reverse_complement()
    seq_len = len(forward)
    # Index every window-length word on each strand by its start positions.
    fwd_index = {}
    rev_index = {}
    for (strand_seq, index) in [(str(forward).upper(), fwd_index), (str(revcomp).upper(), rev_index)]:
        for i in range(len(strand_seq) - window + 1):
            index.setdefault(strand_seq[i:i + window], []).append(i)
    # Words present on both strands are the repeats.
    matches = set(fwd_index).intersection(rev_index)
    repeats_list = []
    repeats_binary_list = [1] * seq_len
    for word in matches:
        repeat_positions = set()
        for i in fwd_index[word]:
            for j in rev_index[word]:
                # Map the reverse-strand hit back onto forward coordinates.
                repeat_positions.add(seq_len - j - 1)
                repeats_binary_list[seq_len - j - 1] = 0
        if repeat_positions != set():
            repeats_list.append({word: sorted(list(repeat_positions))})
    return repeats_list, repeats_binary_list
def get_MSA(coords, method, species_set, query_species, version, force_strand=True):
    '''
    Get the genome alignments that overlap a particular sequence region.
    Returns {species_name: {coordinate_string: aligned_sequence}}.
    '''
    # Reverse-complement the aligned sequences when the query region is on
    # the minus strand (unless strand forcing is disabled).
    reverse = force_strand and coords[6] == "-"
    raw = run_process(["perl", "MSA.pl", method, species_set, version, coords[0], coords[2], coords[3], query_species])
    # "|||" separates alignment blocks (GABs); ">" separates species records;
    # each record is a header line followed by wrapped sequence lines.
    blocks = [[record.split("\n") for record in gab.split(">") if record]
              for gab in raw.split("|||") if gab]
    MSA_dict = {}
    for gab in blocks:
        for record in gab:
            header_parts = record[0].split("/")
            species_name = header_parts[0]
            span = "-".join(header_parts[1:])
            if species_name not in MSA_dict:
                MSA_dict[species_name] = {}
            aligned = "".join(record[1:]).upper()
            if reverse:
                aligned = str(Seq(aligned, IUPAC.unambiguous_dna).reverse_complement())
            MSA_dict[species_name][span] = aligned
    return(MSA_dict)
def is_site_confirmed(self, mt_id2sites_ls, line, max_mis_match_perc, min_no_of_mismatches, max_esc_length):
    """Parse one binding-site record line and check it against the known
    sites/consensus for its matrix id.

    `line` is '|'-separated: matrix id, display-start+strand, core similarity,
    matrix similarity, sequence.  Delegates to the consensus or site matcher
    depending on sites_ls[0]; returns None for an unrecognised type.
    """
    ### 1st parse (copied from transfacdb.py
    ls = line[:-1].split('|')
    mt_id = ls[0].strip()  #remove spaces
    bs_disp_start_strand = ls[1].strip()
    #bs_disp_start = int(bs_disp_start_strand[:-3])
    strand = bs_disp_start_strand[-2]
    #core_similarity_score = float(ls[2])
    #matrix_similarity_score = float(ls[3])
    sequence = ls[4].strip()
    if strand=='-':
        # take the reverse complement; str() replaces Seq.tostring(), which was
        # deprecated and later removed from Biopython
        seq = Seq(sequence)
        sequence = str(seq.reverse_complement())
    #transform it into upper case
    sequence = sequence.upper()
    no_of_mismatches_allowed = self.get_no_of_mismatches_allowed(sequence, max_mis_match_perc, \
        min_no_of_mismatches, max_esc_length)
    #check the no_of_mismatches
    sites_ls = mt_id2sites_ls[mt_id]
    if sites_ls[0] == 0:  #it's consensus
        return self.get_no_of_mismatches_for_consensus(sequence, sites_ls[1], no_of_mismatches_allowed,\
            max_esc_length)
    elif sites_ls[0] == 1:  #it's the sequence where the consensus is derived
        return self.get_no_of_mismatches_for_site(sequence, sites_ls, no_of_mismatches_allowed,\
            max_esc_length)
    else:
        sys.stderr.write("Wrong type of sites_ls of mt_id2sites_ls: %s.\n"%sites_ls[0])
        return None
def translate(seq, trans_table=CodonTable.unambiguous_dna_by_name["Standard"], min_prot_len=128):
    '''Translate the supplied nucleotide sequence in all 6 reading frames.

    Returns a list of (start, end, strand, frame, length, peptide) tuples for
    every stop-free stretch of at least `min_prot_len` residues; start/end are
    0-based nucleotide offsets on the given strand ('+' or '-').
    '''
    result = []
    seq = Seq(seq)
    for strand, nuc in [('+', seq), ('-', seq.reverse_complement())]:
        for frame in range(3):
            # Trim the frame-shifted sequence to a whole number of codons.
            # BUG FIX: the previous slice nuc[frame:-(len(nuc[frame:]) % 3)]
            # became nuc[frame:0] (empty) whenever len(nuc[frame:]) was already
            # a multiple of 3 (since -0 == 0), silently skipping that frame.
            trim_end = len(nuc) - (len(nuc) - frame) % 3
            trans = str(nuc[frame:trim_end].translate(trans_table))
            trans_len = len(trans)
            aa_start = 0
            aa_end = 0
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # no further stop codon: ORF runs to the end of the frame
                    aa_end = trans_len
                if aa_end - aa_start >= min_prot_len:
                    start = frame + aa_start * 3
                    end = frame + aa_end * 3
                    result.append((start, end, strand, frame,
                                   len(trans[aa_start:aa_end]),
                                   trans[aa_start:aa_end]))
                aa_start = aa_end + 1
    return result
def find_and_score(sequence):
    """Scan both strands of `sequence` for 30-mer windows whose bases 26-27
    are 'GG' (NGG PAM at positions 25-27) and score them.

    Returns (scores, indices, complements, PAMs); indices are cut-site
    positions relative to the forward strand.
    """
    scores = []
    indices = []
    complements = []
    PAMs = []
    # Load scoring file; close the handle when done (was previously leaked)
    with open('crispr_app/V3_model_nopos.pickle', 'rb') as model_file:
        model = pickle.load(model_file)

    def _full_windows(seq_str):
        # yield (offset, 30-mer) for every complete window with a GG at 25:27.
        # BUG FIX: range(len - 29), not range(len - 30) -- the old bound
        # silently dropped the final full window on each strand.
        for i in range(len(seq_str) - 29):
            window = seq_str[i:i+30]
            if window[25:27] == 'GG':
                yield i, window

    # Score in 5-->3 direction
    for i, toScore in _full_windows(sequence):
        complements.append(toScore[4:24])
        PAMs.append(toScore[24:27])
        scores.append(calculateScore(toScore, model))
        indices.append(i+21)
    # Score in 3-->5 (Reverse complement) direction
    mySeq = Seq(sequence)
    reverseComp = str(mySeq.reverse_complement())
    for i, toScore in _full_windows(reverseComp):
        complements.append(toScore[4:24])
        PAMs.append(toScore[24:27])
        scores.append(calculateScore(toScore, model))
        indices.append(len(sequence)-(i+21))
    return scores, indices, complements, PAMs
def detect_orfs(seq):
    """Find ORFs on both strands of `seq` and return them as a list of dicts
    with keys name/start/end/strand (1-based coordinates).

    NOTE(review): relies on module-level globals `trans_table` and
    `min_protein_len` defined elsewhere in the file.  The "doubled up"
    comments suggest callers pass the sequence concatenated with itself to
    catch ORFs crossing a circular boundary -- confirm against the caller.
    """
    orf_list = []
    seq = Seq(seq)
    seq_len = len(seq)
    # max ORF length in residues, relative to the (possibly doubled) input
    aa_len = int(math.floor(seq_len/3.0))
    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
        for frame in range(3):
            trans = str(nuc[frame:].translate(trans_table))
            trans_len = len(trans)
            aa_start = 0
            aa_end = 0
            # go through the translation and find end codons that follow a
            # start codon.
            while aa_start < trans_len and aa_start < aa_len:
                aa_end = trans.find("*", aa_start)
                has_stop = 1
                if aa_end == -1:
                    # no more stop codon, just abort...
                    break
                # we start looking for a M at the earliest at aa_end-aa_len+1,
                # since we don't want an ORF that's actually bigger than the
                # original sequence
                if aa_start < aa_end-aa_len+1:
                    aa_start = aa_end-aa_len+1
                start_codon = trans.find('M', aa_start, aa_end)
                # is there a start codon? and is it before end of sequence
                # (remember we doubled up the sequence earlier to detect orfs
                # crossing boundaries)
                if start_codon == -1 or start_codon >= aa_len:
                    assert(aa_end != -1)
                    aa_start = aa_end+1
                    continue
                if aa_end-start_codon >= min_protein_len:
                    # the following start and end need to start with
                    # 1, not 0.
                    if strand == 1:
                        start = frame+start_codon*3+1
                        end = frame+aa_end*3+has_stop*3
                        size = end-start+1
                        # wrap coordinates that run past the original length
                        if end > seq_len:
                            end = end % seq_len
                    else:
                        # map reverse-strand residue positions back onto
                        # forward-strand coordinates
                        start = seq_len-frame-aa_end*3-has_stop*3+1
                        end = seq_len-frame-start_codon*3
                        size = end-start+1
                        if start < 0:
                            start = seq_len+start
                    f = dict(name='ORF frame '+str(frame+1), start=start,
                             end=end, strand=strand)
                    orf_list.append(f)
                aa_start = aa_end+1
    return orf_list
]) total = len(extract) current = 0 genomeFile = open('../db/blumeria/latest/Bgt_genome_v2_1.fa', 'r') genomeLines = genomeFile.readlines() genomeFile.close() for start, stop, contig, name, strand in extract: print(str(round(((current / total) * 100), 4)) + '% Done', end='\r') place = 1 thisContig = False output = [genomeSlicer(start, stop, x, contig) for x in genomeLines] selection = [x for x in output if x is not None] bluSeq = ''.join(selection) if strand == '-': bluSeqC = Seq(bluSeq) bluSeqC = bluSeqC.reverse_complement() geneGenie.append([name, str(bluSeqC)]) else: geneGenie.append([name, str(bluSeq)]) current += 1 # print (upstreamGirl) outfile = open('bluGenes.fa', 'w') for name, seq in geneGenie: outfile.write('>' + name + '\n') outfile.write(seq + '\n') outfile.close() toc = time.clock() print('And it only took ' + str(toc - tic) + ' seconds')
def rev_comp(seq):
    """Return the reverse complement of a DNA string, as a plain string."""
    return str(Seq(seq, generic_dna).reverse_complement())
def reverse_complement(seq):
    """Reverse-complement a DNA string via Biopython and return a str."""
    return str(Seq(seq, generic_dna).reverse_complement())
# Without BioPython: reverse-complement via a translation table.
# str.translate does the base mapping in one C-level pass; the original
# per-character `+=` loop built the result string quadratically.  (The input
# literal contains only A/C/G/T, so dropping the per-character "error" print
# does not change the output.)
dna = "AACTCCGTATCGGCTTAGCGCCTGACTTAACCACAGACCCGCCTTATGAGCTTCAACGGAAAGTATGTATCGGCCCCTTCCTATTTGATGTTAATCGCTACTTGGTATTGGCTGATTGCTTCCCTATTTATTTGAAGAGAAGTACCTGTTTCCTTGAAAGTTTGTATTCTTCCCATAAGTACCTTTTCCAAACTTACAGGACACCTCGTTGAATGGCCTAAGCCTAGCTGGCATACTAAGGCTAGTGTGTTAGGTTACAAGTTGGCTCCCTCCCGACGTAGCAGGCGGGTTGGTCTGAGCAGGTCAACCTCGTTCAGTGGCGATTTGAGAGCGAGGTTCTCTGACAAGCGCCCTTTGCCGTATCGTGGCCGCAAGGAGTTCCCGATATCGCGTTGAGTGTCGTAGGAGAGACTCGGAGCTAATGATGCACTTCCTGGGCACCATGGGGCAGCCCCCTTGGGTAACGCCGGAGCATAATAATATCCCCAAAGTAGCAGTGTATACGAATGGCTACGGTCGACATAGCATTATCAATTAAGTGATTTTATGTAAAAAGCGACCTTTTTTTGCCCTTGTACCCGGGCTGAGTCCTGTCGCGGCGGTGGGAGCCCCACTGTAGTCGGGGTTATGTGCTAGTACACCTAAAGTTAGATGGATGTCTAGTCCCTCCAACAATACCCCTAGCGCTGAGGTTCTTTGACTTCCTTTGATTTTTCAACCGAGCTTAATCACACAAACGGTCAGGATAAGTTATCAAACATTCCCTCGTGTAATTCCTCAACGCACTCGTCATACACGGATGGGCAGTACACGCAGCCCTGCGTCCGGCCACCTTGCAAGCCATGGGCGCATTCCCATGTGGAATTCCAGTGTAAGCACCACAGGCAGTGGTTTATATCCTATACCACTCTTGTTAGTGCGAACCTAGGTACGCGACAGCCTCGACCGAGGTCCCTATCACAACCGGAAATTTGCCGATGA"
complement_table = str.maketrans("ATCG", "TAGC")
dna_reverse_complement = dna[::-1].translate(complement_table)
print(dna_reverse_complement)

# With Biopython
from Bio.Seq import Seq
dna = "ACATCAGCATGCATGCATGCATCGATCGATGCATCGATGCATCGATGCATGCATCGATCGATCGATCT"
dna = Seq(dna)
print(dna.reverse_complement())
def _reverse_complement(self, sequence):
    """Return the reverse complement of `sequence` as a Bio.Seq.Seq object
    (note: not converted to str, matching this class's callers)."""
    return Seq(sequence).reverse_complement()
def reverse(seq):
    """Reverse-complement an unambiguous-DNA string; returns a str."""
    seq_obj = Seq(seq, IUPAC.unambiguous_dna)
    rc = seq_obj.reverse_complement()
    return str(rc)
def bam_5prime_stranded(file, minlength=0, maxlength=1000, unique=False):
    """Tally the 5' (first-base) nucleotide of BAM reads per read length and
    strand, and return the counts as a pandas DataFrame.

    Antisense counts are stored as negative numbers so they plot below the
    axis.  NOTE(review): when `unique` is False the count is parsed from the
    read name as `name-count` (collapsed-reads convention) -- confirm the
    input BAM follows it.  The BAM handle is not closed here.
    """
    # lists for read sequences and read counts, split by strand
    sense_seqs = []
    sense_counts = []
    antisense_seqs = []
    antisense_counts = []
    # read in bamfile
    bamfile = pysam.AlignmentFile(file, "rb")
    for read in bamfile:
        # convert line into string to make it splittable
        line = str(read)
        # split line on tabs (SAM-style fields)
        linesplit = line.split("\t")
        # this filters out reads with flag 4 (i.e. unmapped reads):
        # only mapped reads have a CIGAR ending in 'M'
        cigar = linesplit[5]
        if cigar.endswith('M'):
            seq = linesplit[9]
            if unique == True:
                count = 1
            else:
                # read-name suffix after '-' carries the collapsed-read count
                count = int(linesplit[0].split('-')[1])
            # this is where the length filtering happens
            if minlength <= len(seq) <= maxlength:
                # this splits up sense and antisense reads (SAM flag 16)
                if linesplit[1] == '16':
                    # convert seq string to Biopython Seq object
                    tempseq = Seq(seq)
                    # set reverse-complement of Seq object as string
                    tempseqrevcomp = tempseq.reverse_complement()
                    antisense_seqs.append(tempseqrevcomp)
                    antisense_counts.append(count)
                else:
                    sense_seqs.append(seq)
                    sense_counts.append(count)
    # go through all of the sequences, creating a list of all read lengths
    lengths = []
    for i in sense_seqs:
        if len(i) not in lengths:
            lengths.append(len(i))
    for i in antisense_seqs:
        if len(i) not in lengths:
            lengths.append(len(i))
    # sort the lengths list in ascending order
    lengths.sort()
    # for each base, the total number of the shortest reads starting with that
    # base will be appended, followed by the next length, and the next length...
    senseA = []
    senseC = []
    senseG = []
    senseT = []
    senseN = []
    antisenseA = []
    antisenseC = []
    antisenseG = []
    antisenseT = []
    antisenseN = []
    # go through the lengths, looking at the first base of each sense read of
    # that length, and totalling them up
    for readlength in lengths:
        Acount = 0
        Ccount = 0
        Gcount = 0
        Tcount = 0
        Ncount = 0
        for i in range(len(sense_seqs)):
            read = sense_seqs[i]
            if len(read) == readlength:
                firstbase = read[0]
                if unique == True:
                    count = 1
                elif unique == False:
                    count = sense_counts[i]
                if firstbase == 'A':
                    Acount = Acount + count
                elif firstbase == 'C':
                    Ccount = Ccount + count
                elif firstbase == 'G':
                    Gcount = Gcount + count
                elif firstbase == 'T':
                    Tcount = Tcount + count
                elif firstbase == 'N':
                    Ncount = Ncount + count
        senseA.append(Acount)
        senseC.append(Ccount)
        senseG.append(Gcount)
        senseT.append(Tcount)
        senseN.append(Ncount)
    # go through the lengths, looking at the first base of each antisense read
    # of that length, and totalling them up
    for readlength in lengths:
        Acount = 0
        Ccount = 0
        Gcount = 0
        Tcount = 0
        Ncount = 0
        for i in range(len(antisense_seqs)):
            read = antisense_seqs[i]
            if len(read) == readlength:
                firstbase = read[0]
                # NB the counts here are subtracted rather than added to make
                # the plotting on the bottom of the plot work
                if unique == True:
                    count = 1
                elif unique == False:
                    count = antisense_counts[i]
                if firstbase == 'A':
                    Acount = Acount - count
                elif firstbase == 'C':
                    Ccount = Ccount - count
                elif firstbase == 'G':
                    Gcount = Gcount - count
                elif firstbase == 'T':
                    Tcount = Tcount - count
                elif firstbase == 'N':
                    Ncount = Ncount - count
        antisenseA.append(Acount)
        antisenseC.append(Ccount)
        antisenseG.append(Gcount)
        antisenseT.append(Tcount)
        antisenseN.append(Ncount)
    print('Bases counted')
    # format the dataframe: one row per read length, one column per
    # strand/base combination
    formatted = {}
    formatted['Length'] = lengths
    formatted['senseA'] = senseA
    formatted['senseC'] = senseC
    formatted['senseG'] = senseG
    formatted['senseT'] = senseT
    formatted['senseN'] = senseN
    formatted['antisenseA'] = antisenseA
    formatted['antisenseC'] = antisenseC
    formatted['antisenseG'] = antisenseG
    formatted['antisenseT'] = antisenseT
    formatted['antisenseN'] = antisenseN
    basecounts = pd.DataFrame(formatted, columns=[
        'Length', 'senseA', 'senseC', 'senseG', 'senseT', 'senseN',
        'antisenseA', 'antisenseC', 'antisenseG', 'antisenseT', 'antisenseN'
    ])
    return (basecounts)
'C') - 16.4) / (DNA_Sequence.count('A') + DNA_Sequence.count('T') + DNA_Sequence.count('G') + DNA_Sequence.count('C')) except: Tm_more = 0 st.write("-The Melting Temperature(Tm) of the given DNA sequence:", Tm_more, "°C") st.write(""" *** """) ###Reverse Complement of the given DNA Sequence### DNA_Sequence = Seq(DNA_Sequence) st.subheader("[7] Reverse Complement of the given DNA Sequence") st.write(DNA_Sequence.reverse_complement()) st.write(""" *** """) ###Transcribed Sequence of the given data### st.subheader("[8] Transcribed Sequence (DNA -> RNA)") st.write(DNA_Sequence.transcribe()) st.write(""" *** """) ###Translated Sequence of the given data### st.subheader("[9] Translated Sequence (RNA -> Protein)")
# Set up per-barcode output BAM files keyed by barcode label.
barcode_type = int(sys.argv[4])
# dictionary to point to the output files
pointer_dict = {'F': out_f, 'R': out_r}
fcount = 0
rcount = 0
# setup to search for the reverse complement
# sequences of the barcodes
if barcode_type == 2:
    print('two barcodes:' 'Remember: input both barcode seqences in forward orientation')
    # store barcode Forward (reverse complement)
    fr_barcode = f_barcode.reverse_complement()
    out_fr = out_base+'_FR.bam'
    out_fr = pysam.Samfile(out_fr, "wb", template = bamfile)
    # store barcode Reverse (reverse complement)
    rr_barcode = r_barcode.reverse_complement()
    out_rr = out_base+'_RR.bam'
    out_rr = pysam.Samfile(out_rr, "wb", template = bamfile)
    # update the pointer to the add two more output files
    pointer_dict['FR'] = out_fr
    # BUG FIX: 'RR' previously pointed at out_fr, so reverse-barcode
    # reverse-complement reads were written to the FR file and the RR file
    # stayed empty
    pointer_dict['RR'] = out_rr
    frcount = 0
    rrcount = 0
elif barcode_type == 1:
    print('one barcode:' 'Remember: input the forward and reverse complement sequences')
else:
    print('something odd, i shuld not be here with barcode type')
print "\nmismatches_in_hsp: " + str(mismatches_in_hsp) print "\nQuery: " + query.upper() print "Subject: " + str(lookup_mature[mature_title]) print rec.query count = count + 1 found = True counter = 1 if (int( strand.split(',')[1].split(' ')[1].split(')') [0]) == -1): mature_fasta.write('>>' + str(rec.query) + '-rc' + '\n') mature_fasta.write( (str(query_seq.reverse_complement()) + '\n')) else: mature_fasta.write('>>' + str(rec.query) + '\n') mature_fasta.write( str(record_index[rec.query].seq) + '\n') mature_fasta.write(str(hit) + '\n') mature_fasta.write( str(lookup_mature[mature_title]) + '\n') mature_summary.write( str(count) + ',>>' + str(rec.query) + ',' + hit + ',' + str(hsps[10]) + ',' + str(hsps[12]) + ',' + str(hsps[-3]) + ',' + str(hsps[-1]) + ',' + strand_csv + ',' + out_score + ',' + str(hsps[5]).split(',')[0] + ',' + str(len(query)) + ',' + str(len(x.match)) + ',' +
# Pri načrtovanju oligonukleotidov je priporočljivo upoštevati nekaj osnovnih pravil, ki pripomorejo k večji uspešnosti reakcije in manjši količini nespecifičnih produktov. Z iskanjem po spletu lahko najdemo kar nekaj priporočil, na primer [navodila podjetja Addgene](https://www.addgene.org/protocols/primer-design/), nekaj osnovnih pa je tukaj: # * temperatura tališča ($T_m$) smernega in protismernega oligonukleotida se naj ne bi razlikovala za več kot 5 °C (kar nam omogoča njuno optimalno vezavo na matrico pri enaki temperaturi prileganja ($T_a$)), hkrati pa naj bo njuna temperatura tališča nekje med 50 in 60 °C, # * delež parov GC naj bo med 40 in 60 %. # # --- # ## Primeri kode # # Spodaj je predstavljen zgled, kako definiramo nukleotidno zaporedje in dobimo komplementarno ter obratno-komplementarno zaporedje. Prikazano je tudi, kako preštejemo določen nukleotid v zaporedju. # In[1]: from Bio.Seq import Seq my_seq = Seq('AGTACACTGGT') print(my_seq) print(my_seq.complement()) print(my_seq.reverse_complement()) print(my_seq.count('A')) # preštejemo nek nukleotid print(len(my_seq)) # dolžina zaporedja # Drug način za štetje nukleotidov je, da si pripravimo slovar, na primer: # In[2]: freq = {} for x in my_seq: freq[x] = my_seq.count(x) print(freq) # izpiše slovar print('A:', freq['A']) # izpiše, koliko A je v zaporedju # Pogosto želimo, so vsi nukleotidi napisani bodisi z velikimi ali malimi črkami. Za ta namen lahko uporabimo `upper` ali `lower`:
# Demo: string formatting and (im)mutability of Biopython Seq objects.
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Seq import MutableSeq

# Seq doesn't inherit from String
print(Seq.__bases__)

# Example of f string formatting using Seq
my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
print(f"The value of my_seq is {my_seq}.")

# Example of using the format method on a Seq object
print("my_seq can be expressed as a string using the format method: {}".format(
    my_seq))

# Can Seq objects be overwritten by reverse complements or are they immutable?
# (reverse_complement returns a NEW Seq; the name is simply rebound here)
my_seq = my_seq.reverse_complement()
print(f"The value of my_seq is {my_seq}.")

# What methods are available from MutableSeq
print(f"MutableSeq attributes and methods are {dir(MutableSeq)}.")
def simulate_read_with_errors(self, right_read, left_read, common_id, ins_rate1,
                              ins_rate2, del_rate1, del_rate2, pid):
    """Simulate a sequenced read pair with art_illumina error/quality models.

    Writes each read to a one-record FASTA in temp_files_<pid>, runs
    art_illumina on it, and re-reads the resulting FASTQ.  Returns
    (left_record, right_record) as SeqRecords, or None if parsing the
    simulated FASTQ fails.  NOTE(review): relies on self.process and
    self.read_length set elsewhere, and on `art_illumina` being on PATH.
    """
    # put all together
    # unique identifiers for right and left reads; 'space' is a placeholder
    # restored to a real space after simulation (FASTA ids cannot hold spaces)
    dir = os.getcwd()
    os.chdir("temp_files_%s" % pid)
    right_read_id = "2:N:0:CGCTGTG"
    right_id = common_id + "space" + right_read_id
    left_read_id = "1:N:0:CGCTGTG"
    left_id = common_id + "space" + left_read_id
    # attemp to use art to simulate the quality scores and the error rate
    #create a one read genome
    left_fasta = open("left_read_%s.fa" % (self.process), "w")
    left_fasta.write(">" + left_id + "\n" + str(left_read) + "\n")
    # sim the read with art
    left_fasta.close()
    sp.call(
        "art_illumina -q -na -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i left_read_%s.fa -l %s -f 1 -o left%s"
        % (ins_rate1, ins_rate2, del_rate1, del_rate2, self.process,
           self.read_length, self.process),
        shell=True, stdout=sp.DEVNULL, stderr=sp.STDOUT)
    with open("left%s.fq" % (self.process), 'r') as left:
        left_read = left.read().replace('space', ' ').replace(
            '1:N:0:CGCTGTG-1', '1:N:0:CGCTGTG')
    # get the reverse complement of the right read
    right_read = Seq(right_read, generic_dna)
    right_read = right_read.reverse_complement()
    right_fasta = open("right_read_%s.fa" % (self.process), "w")
    right_fasta.write(">" + right_id + "\n" + str(right_read) + "\n")
    right_fasta.close()
    # sim the read with art
    sp.call(
        "art_illumina -na -q -ss HS25 -ir %s -ir2 %s -dr %s -dr2 %s -nf 0 -i right_read_%s.fa -l %s -f 1 -o right%s"
        % (ins_rate1, ins_rate2, del_rate1, del_rate2, self.process,
           self.read_length, self.process),
        shell=True, stdout=sp.DEVNULL, stderr=sp.STDOUT)
    with open("right%s.fq" % (self.process), 'r') as right:
        right_read = right.read().replace('space', ' ').replace(
            '1:N:0:CGCTGTG-1', '2:N:0:CGCTGTG')
    #sometimes the reading fails. I introduce this to capture it
    try:
        right_record = SeqIO.read(StringIO(right_read), "fastq")
        left_record = SeqIO.read(StringIO(left_read), "fastq")
        os.chdir(dir)
        return (left_record, right_record)
    except ValueError as v:
        warnings.warn('Catched ValueError in a sampling round. Skipping')
        # always restore the working directory before returning
        os.chdir(dir)
        return (None)
def reverse_complement(seq):
    """Return the reverse complement of an (ambiguous) DNA string as a str."""
    return str(Seq(seq, IUPAC.ambiguous_dna).reverse_complement())
# NOTE(review): mySequence / mySecondSequence are defined earlier in the file.
print(mySequence[0])
stringSequence = str(mySecondSequence)
format_fast_string = ">Genname\n%s\n" % mySecondSequence
print(format_fast_string)

# Joining / concatenating sequences.
# CAUTION when working with different Seq types -- check the type first
dna_seq = Seq("ACGTA")
protein_seq = Seq("EVRNAK")
print ("Sum: ", protein_seq + dna_seq)
print(dna_seq)
print(dna_seq.complement())
print(dna_seq.reverse_complement())

# Transcription and Translation
coding_dna = Seq("ATGGCCATTGTAATG")
template_dna = coding_dna.reverse_complement()
messenger_rna = transcribe(coding_dna)
print(messenger_rna)
print(back_transcribe(messenger_rna))
print(translate(messenger_rna))
myThirdSequence = Seq("GATCGATGGGGGCTATCC")
print(GC(myThirdSequence))

# MutableSeq objects
print(dna_seq)
"""Rosalind REVC: read a DNA string and write its reverse complement."""
from Bio.Seq import Seq
import Bio.Alphabet

# read the sequence, removing newline characters
# (context managers close the handles even if an error occurs)
with open('rosalind_revc.txt', 'r') as f:
    data = f.read().replace('\n', '')

# assign sequence as a DNA sequence
t = Seq(data, Bio.Alphabet.IUPAC.unambiguous_dna)

# write the reverse complement (the original comment said "transcribe",
# but reverse_complement is what is actually used)
with open('_REVC.txt', 'w') as o:
    o.write(str(t.reverse_complement()))
def rev_seq(seq):
    """Reverse and complement a DNA sequence string."""
    return str(Seq(seq, generic_dna).reverse_complement())
# Print the NUM bases upstream of each gene's 3' end, reverse-complemented
# for minus-strand genes.  (Python 2 print statements.)
# NOTE(review): SEQUENCE_FILE, args, NUM, geneTools and random are defined
# earlier in the file.
sequence_lines = SEQUENCE_FILE.read().splitlines()
seq = "".join(sequence_lines).replace('\n', '')
GENES_FILE = open(args['GENES_FILE'], 'r')
gene_lines = GENES_FILE.read().splitlines()
for line in gene_lines:
    # each line: start<TAB>end<TAB>strand
    data = line.split('\t')
    start, end, strand = int(data[0]), int(data[1]), data[2]
    if strand == "+":
        print "gene: " + str(start) + "-" + str(
            end) + " DIRECT: ..." + seq[end - NUM:end]
    elif strand == "-":
        #make it a Seq object and reverse complement
        subseq = Seq(seq[start - 1:start + NUM - 1], unambiguous_dna)
        subseq = str(Seq.reverse_complement(subseq))
        print "gene: " + str(start) + "-" + str(
            end) + " COMPLEMENTARY: ..." + subseq
############END########
sys.exit(0)
# NOTE(review): everything below sys.exit(0) is unreachable dead code
# (a random-gene-sampling experiment left in place).
gene_lines, gene_filetype = geneTools.readORFLines(args['GENES_FILE'])
NUM_GENES = 100
gene_lines_rand = []
attempts = 0
while len(gene_lines_rand) < NUM_GENES:
    index = random.randint(0, len(gene_lines) - 1)
    gene = geneTools.getLineData(gene_lines[index], gene_filetype)
    if gene not in gene_lines_rand:
        gene_lines_rand.append(gene)
def get_reverse_complement(nuc_seq):
    '''Get reverse complement'''
    return str(Seq(nuc_seq, generic_dna).reverse_complement())
def str_reverse_comp(str_seq):
    """Return the reverse complement of a string sequence, as a string."""
    return str(Seq(str_seq).reverse_complement())
def main():
    """Build a McDonald-Kreitman table for genes on one scaffold.

    argv: [1] gffutils FeatureDB path, [2] reference genome FASTA,
    [3] allele table file, [4] scaffold/chromosome name.  Writes
    MKtable_<chrom>.txt and out_<chrom>.txt, then exits the process.
    NOTE(review): depends on the project helper module `shFn` and on
    areExonsAtLeast3bp, defined elsewhere.
    """
    # initialization
    # parse command-line arguments
    if len(sys.argv) == 5:
        # have comma separated lists for each field, split on comma
        gff3DB = sys.argv[1]
        refGenomeFile = sys.argv[2]
        alleleTableFilename = sys.argv[3]
        chromToGenotype = sys.argv[4]
    else:
        print("Didn't specify right number of input arguments, exiting...")
        sys.exit()
    db = gffutils.FeatureDB(
        gff3DB)  # this database was created in previous step
    # Assumption in analyses below is that exons are at least 1 a.a. residue long, 3 bp. confirm this
    areExonsAtLeast3bp(db)
    # create data structs to convert position -> functional effect
    Pos_2_CDS, CDSinfo = shFn.constructCdsDicts(
        db, chromToGenotype)  # Pos_2_CDS[pos] = [ sorted CDS ids ]
    # Pos_2_CDS[pos] = [ sorted CDS ids ]
    # CDSinfo[firstCDS] = (0:geneName, 1:strand, 2:frame, 3:startPos, 4:endPos, 5:RefSeqScaffold, 6:UniqueGeneName)
    refGenomeSeqDict = SeqIO.to_dict(
        SeqIO.parse(refGenomeFile, "fasta")
    )  # refGenomeSeqDict[ scaffoldName ].seq = ATG...GGC, sequence of that scaffold
    translateCodonDict = shFn.constructTranslateCodonDict()
    mkDict = {}  # mkDict[gene] = [SynPoly, NonsynPoly, SynFixed, NonsynFixed]
    for gene in db.features_of_type('gene'):
        if gene.seqid == chromToGenotype:
            mkDict[gene['Name'][0]] = [0, 0, 0, 0]
    if not os.path.isfile(alleleTableFilename):
        print(
            "Alleletable -> MKtable script cannot locate the allele table! exiting..."
        )
        sys.exit()
    f = open(alleleTableFilename, 'r')
    # File structure:
    # Col 1: position
    # Col 2: Refbase
    # Col 3: Species1 allele
    # Col 4: Species1 allele frequency
    # ...
    # Col n-1: SpeciesX allele
    # Col n: SpeciesX allele frequency
    total = 0
    biallelic = 0
    biCoding = 0
    numOutgroupPolymorphic = 0
    numOutgroupFixed = 0
    for line in f:
        # skip the header line starting with "Position"
        if not re.match(r'^Position', line):
            mutInfo = line.split()
            posInScaff = int(mutInfo[0])
            refBase = str(mutInfo[1])
            #########
            # NOTE: only analysing ingroup and a single outgroup, for now
            #########
            ingroupBase = str(mutInfo[2])
            ingroupAF = float(mutInfo[3])
            outgroupBase = str(mutInfo[4])
            outgroupAF = float(mutInfo[5])
            allelesAllDict = {
            }  # used to check how many alleles at this site, across all species
            allelesAllDict[
                refBase] = 1  # also use ref base to see if site biallelic
            allelesAllDict[ingroupBase] = 1
            allelesAllDict[outgroupBase] = 1
            #allelesOutgroupDict = {} # used to check how many alleles at this site, ONLY in outgroup species
            #i = 2 # ALT alleles start at index 2 (Col 3), and proceed until end, skipping a col for frequency each time
            #while i < len(mutInfo)-1:
            #    allelesAllDict[mutInfo[i]] = 1
            #    allelesOutgroupDict[mutInfo[i]] = 1
            #    i+=2
            #if len(allelesAllDict.keys()) == 2:
            #    biallelic+=1
            #if len(allelesOutgroupDict.keys()) == 1: # IMPORTANT FILTERING STEP: OUTGROUP SPECIES MUST AGREE ON ANCESTRAL BASE, CHANGE ME
            #outgroupBase = next(iter(allelesOutgroupDict.keys()))
            if posInScaff in Pos_2_CDS.keys():  # is position in coding region
                biCoding += 1
                inPolymorphic = None
                inFixed = None
                outPolymorphic = None
                outFixed = None
                # is ingroup polymorphic or fixed?
                if ingroupAF > 0.0 and ingroupAF < 1.0:
                    inPolymorphic = 1
                    inFixed = 0
                else:
                    inPolymorphic = 0
                    inFixed = 1
                # is outgorup polymorphic or fixed?
                if outgroupAF > 0.0 and outgroupAF < 1.0:
                    outPolymorphic = 1
                    outFixed = 0
                    numOutgroupPolymorphic += 1
                else:
                    outPolymorphic = 0
                    outFixed = 1
                    numOutgroupFixed += 1
                firstCDS = Pos_2_CDS[posInScaff][
                    0]  # first of sorted mRNAs position corresponds to
                # Reminder: CDSinfo[firstCDS] = (0:geneName, 1:strand, 2:frame, 3:startPos, 4:endPos, 5:RefSeqScaffold, 6:UniqueGeneName)
                geneName = CDSinfo[firstCDS][6]
                strand = CDSinfo[firstCDS][1]
                refseqScaffold = CDSinfo[firstCDS][5]
                if strand == "+":
                    p1, p2, p3 = shFn.getCodonPositionsInReference_forwardStrand(
                        posInScaff, firstCDS, CDSinfo)
                elif strand == "-":
                    p1, p2, p3 = shFn.getCodonPositionsInReference_reverseStrand(
                        posInScaff, firstCDS, CDSinfo)
                # ignore sites in which flanking bases could not be found, i.e. flanking exon not located from partial gene model
                if p1 == None:
                    continue
                posInCodonToChange = shFn.getPosInCodon(p1, p2, p3, posInScaff)
                # scaffolds in gff3 RefSeq IDs, scaffolds in reference genome Genbank IDs
                b1 = str(
                    refGenomeSeqDict[refseqScaffold].seq[(p1 - 1):(p1)]).upper()
                b2 = str(
                    refGenomeSeqDict[refseqScaffold].seq[(p2 - 1):(p2)]).upper()
                b3 = str(
                    refGenomeSeqDict[refseqScaffold].seq[(p3 - 1):(p3)]).upper()
                codonRef = [b1, b2, b3]
                if "N" not in codonRef:
                    codonIngroup = codonRef.copy(
                    )  # need to use .copy method, otherwise it creates new pointer to same object
                    codonOutgroup = codonRef.copy()
                    codonIngroup[posInCodonToChange] = ingroupBase.upper()
                    codonOutgroup[posInCodonToChange] = outgroupBase.upper()
                    codonRefStr = ''.join(codonRef)
                    codonIngroupStr = ''.join(codonIngroup)
                    codonOutgroupStr = ''.join(codonOutgroup)
                    if strand == "-":
                        # minus-strand codons must be reverse-complemented
                        # before translation lookup
                        ref = Seq(codonRefStr, generic_dna)
                        ref = ref.reverse_complement()
                        inG = Seq(codonIngroupStr, generic_dna)
                        inG = inG.reverse_complement()
                        outG = Seq(codonOutgroupStr, generic_dna)
                        outG = outG.reverse_complement()
                        codonRefStr = str(ref)
                        codonIngroupStr = str(inG)
                        codonOutgroupStr = str(outG)
                    if inFixed:
                        if outFixed and codonIngroupStr != codonOutgroupStr:
                            #ingroup and outGroup fixed, can only be 2 bases
                            if translateCodonDict[
                                    codonIngroupStr] == translateCodonDict[
                                        codonOutgroupStr]:
                                mkDict[geneName][2] += 1  #synonymous
                            else:
                                mkDict[geneName][3] += 1  #nonsynonymous
                        elif outPolymorphic:
                            continue
                            # WE ARE IGNORING THIS FOR NOW;
                            # this could represent a fixed diff in ingroup that's still polymorphic in outgroup
                            # OR, it could be a a polymorphic in the outgroup where NOTHING happened in ingroup (unmutated, monomorphic)
                        else:
                            continue
                            #inPolymorphic
                            #outPolymorphic
                            #codonIngroupStr == codonOutgroupStr, b/c the second outgroup has a different base
                    if inPolymorphic:
                        #is site biallelic?
                        if len(allelesAllDict.keys()) == 2:
                            if translateCodonDict[
                                    codonIngroupStr] == translateCodonDict[
                                        codonOutgroupStr]:
                                mkDict[geneName][0] += 1  #synonymous
                            else:
                                mkDict[geneName][1] += 1  #nonsynonymous
                        # if multiallelic, then Ingroup alternate allele != outgroup allele
                        # these sites count as 1 polymorphism, 1 divergence
                        elif len(allelesAllDict.keys()) > 2:
                            # add 1 to polymorphism
                            if translateCodonDict[
                                    codonIngroupStr] == translateCodonDict[
                                        codonRefStr]:
                                mkDict[geneName][0] += 1  #synonymous
                            else:
                                mkDict[geneName][1] += 1  #nonsynonymous
                            # add 1 to divergence (split as two half-counts
                            # over the ingroup/ref and ref/outgroup contrasts)
                            if translateCodonDict[
                                    codonIngroupStr] == translateCodonDict[
                                        codonOutgroupStr]:
                                mkDict[geneName][2] += 0.5  #synonymous
                            else:
                                mkDict[geneName][3] += 0.5  #nonsynonymous
                            if translateCodonDict[
                                    codonRefStr] == translateCodonDict[
                                        codonOutgroupStr]:
                                mkDict[geneName][2] += 0.5  #synonymous
                            else:
                                mkDict[geneName][3] += 0.5  #nonsynonymous
    # write the per-gene MK table
    mkFile = open('MKtable_%s.txt' % chromToGenotype, 'w')
    print("GeneName\tSynPolymorphic\tNonsynPolymorphic\tSynFixed\tNonsynFixed",
          file=mkFile)
    for gene in mkDict.keys():
        print(gene, "\t", end="", file=mkFile)
        for i in (0, 1, 2, 3):
            print(mkDict[gene][i], end="\t", file=mkFile)
        print(file=mkFile)
    # write summary counters
    outFile = open('out_%s.txt' % chromToGenotype, 'w')
    print("Coding biallelic polymorphic sites: :", biCoding, file=outFile)
    print("Number of sites with polymorphic outgroup: ",
          numOutgroupPolymorphic, file=outFile)
    print("Number of sites with fixed outgroup: ", numOutgroupFixed,
          file=outFile)
    sys.exit()
class NextOrf:
    """ORF finder over a FASTA file (Python 2 code; cf. Biopython's
    Scripts/nextorf.py).  Options dict keys used: table, strand, start, stop,
    nostart, minlength, maxlength, output, gc."""

    def __init__(self, file, options):
        self.options = options
        self.file = file
        self.genetic_code = int(self.options['table'])
        # NOTE(review): makeTableX is defined elsewhere in the file
        self.table = makeTableX(CodonTable.ambiguous_dna_by_id[self.genetic_code])
        self.counter = 0
        self.ReadFile()

    def ReadFile(self):
        """Iterate the FASTA records, collect CDS hits on the requested
        strand(s), and print them via Output()."""
        handle = open(self.file)
        for record in SeqIO.parse(handle, "fasta"):
            self.header = record.id
            frame_coordinates = ''
            dir = self.options['strand']
            plus = dir in ['both', 'plus']
            minus = dir in ['both', 'minus']
            # optional sub-range of the record to scan
            start, stop = int(self.options['start']), int(self.options['stop'])
            s = str(record.seq).upper()
            if stop > 0:
                s = s[start:stop]
            else:
                s = s[start:]
            self.seq = Seq(s,IUPAC.ambiguous_dna)
            self.length = len(self.seq)
            self.rseq = None
            CDS = []
            if plus: CDS.extend(self.GetCDS(self.seq))
            if minus:
                self.rseq = self.seq.reverse_complement()
                CDS.extend(self.GetCDS(self.rseq, strand = -1))
            self.Output(CDS)

    def ToFasta(self, header, seq):
        """Wrap `seq` to 60-character lines and prepend a FASTA header."""
        seq = re.sub('(............................................................)','\\1\n',seq)
        return '>%s\n%s' % (header, seq)

    def Gc(self, seq):
        """Overall GC percentage of `seq` (A/T/G/C only), one decimal."""
        d = {}
        for nt in 'ATGC': d[nt] = seq.count(nt)
        gc = d['G'] + d['C']
        if gc == 0: return 0
        return round(gc*100.0/(d['A'] +d['T'] + gc),1)

    def Gc2(self,seq):
        """GC percentage overall and per codon position, as a display string."""
        l = len(seq)
        d= {}
        for nt in ['A','T','G','C']:
            d[nt] = [0,0,0]
        for i in range(0,l,3):
            codon = seq[i:i+3]
            # pad a trailing partial codon so indexing below stays safe
            if len(codon) < 3: codon = codon + '  '
            for pos in range(0,3):
                for nt in ['A','T','G','C']:
                    if codon[pos] == nt: d[nt][pos] = d[nt][pos] +1
        gc = {}
        gcall = 0
        nall = 0
        for i in range(0,3):
            try:
                n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i]
                gc[i] = (d['G'][i] + d['C'][i])*100.0/n
            except:
                gc[i] = 0
            gcall = gcall + d['G'][i] + d['C'][i]
            nall = nall + n
        gcall = 100.0*gcall/nall
        res = '%.1f%%, %.1f%%, %.1f%%, %.1f%%' % (gcall, gc[0], gc[1], gc[2])
        return res

    def GetOrfCoordinates(self, seq):
        """Return, per frame, a list of (1-based pos, 1=start/0=stop, codon)
        for every start/stop codon in that frame."""
        s = seq.data
        letters = []
        table = self.table
        get = self.table.forward_table.get
        n = len(seq)
        start_codons = self.table.start_codons
        stop_codons = self.table.stop_codons
        # print 'Start codons', start_codons
        # print 'Stop codons', stop_codons
        frame_coordinates = []
        for frame in range(0,3):
            coordinates = []
            for i in range(0+frame, n-n%3, 3):
                codon = s[i:i+3]
                if codon in start_codons:
                    coordinates.append((i+1,1,codon))
                elif codon in stop_codons:
                    coordinates.append((i+1,0,codon))
            frame_coordinates.append(coordinates)
        return frame_coordinates

    def GetCDS(self, seq, strand = 1):
        """Pair starts with the following stop in each frame and return
        (start, stop, length, subsequence, signed frame) tuples that satisfy
        the min/max length options."""
        frame_coordinates = self.GetOrfCoordinates(seq)
        START, STOP = 1,0
        so = self.options
        nostart = so['nostart']
        minlength, maxlength = int(so['minlength']), int(so['maxlength'])
        CDS = []
        f = 0
        for frame in frame_coordinates:
            f+=1
            start_site = 0
            if nostart == '1': start_site = 1
            # sentinel stop at the end so a trailing ORF is still flushed
            frame.append((self.length, 0, 'XXX'))
            for pos, codon_type, codon in frame:
                if codon_type == START:
                    # remember only the first start after the previous stop
                    if start_site == 0: start_site = pos
                elif codon_type == STOP:
                    if start_site == 0: continue
                    # if codon == 'XXX': print 'do something'
                    stop = pos + 2
                    # print stop
                    length = stop - start_site +1
                    if length >= minlength and length <= maxlength:
                        if nostart == '1' and start_site == 1:
                            start_site = start_site + f - 1
                        if codon == 'XXX':
                            # sentinel stop: trim to a codon multiple
                            stop = start_site + 3*((int((stop-1)-start_site)/3))
                        s = seq[start_site -1 : stop]
                        CDS.append((start_site, stop, length, s, strand*f))
                        start_site = 0
                        if nostart == '1': start_site = stop + 1
                    elif length < minlength or length > maxlength:
                        start_site = 0
                        if nostart == '1': start_site = stop + 1
                    del stop
        return CDS

    def Output(self, CDS):
        """Print each CDS as amino acids, nucleotides, or just its header,
        depending on the 'output' option."""
        out = self.options['output']
        seqs = (self.seq, self.rseq)
        n = len(self.seq)
        for start, stop, length, subs, strand in CDS:
            self.counter += 1
            if strand > 0:
                head = 'orf_%s:%s:%d:%d:%d' % (self.counter, self.header, strand, start,stop)
            if strand < 0:
                # map reverse-strand coordinates back to the forward strand
                head = 'orf_%s:%s:%d:%d:%d' % (self.counter, self.header, strand, n-stop+1,n-start+1)
            if self.options['gc']: head = '%s:%s' % (head, self.Gc2(subs.data))
            if out == 'aa':
                orf = subs.translate(table=self.genetic_code)
                print self.ToFasta(head, orf.data)
            elif out == 'nt':
                print self.ToFasta(head, subs.data)
            elif out == 'pos':
                print head
from Bio.Seq import Seq my_dna = Seq( "AACATGCGTCGAATTCCGGTCCAAAACCAAGAAGCTATGGAGAAGCTTGGTGCAAAAGGAGAATCTCGTAATCGTTGGTATACAAAACCATGTTCTTGGATCGAAATGAGTTGGACTTTTAACACTGAGCTGCTAACTGATGTCTCTTACTAGCGATTCGACGTCCATGGTCGTGCAGCGGCATTAGCCTGACCGCATGATGCACTCTTTCTAGTGCGTCTGTCGGTGACTACTTAACTTGGTTGGTTCACATGATCCACTAAGGGCGTTTCTGCGGACCTGAGAACTCCGGCAATGTTAGTTACGCTGAGCTATTATGGTGAGTCCACCGTCGGGACAGCCACGCAGACGCTGGTTTGGAACCCTTGAAATATCCTGCACGCGATAGGATGTCAATATTGAATTATTAATCAACACCGTCCTTCCAGTTTTGCGCTCGCACTGCCAGTATGTACGAACAATACCTTTGTGATGCAAATACGTAAAAGTTGTGATCTGATCTCAACACCTGGCGCTTTCCTGCCGGAAAGATTCTCTTTTGAATGCCGCGGCGGACCCTAGAGTAGGACTAGTTCCTACTTGCGCGGCAAGTTTCAAATCTACAAGAATTAACGCATTCACCTCACACGAACGAGCCTGGTCGACTCACTATTACTCCCATCCGGAGCCTCCTACCCATTCTAGTGATATATTCCGGCAGTAGAGACGGATGGCTTGCCCAAGGTTGACGGCAGCGATTAAATCGTTGAGGGTGTTTAGGACCTGAAATACGGACTGATTCACGCGTTTTTGGCTGTTTCGTTTGAGACACCCTTCTCGCGCTCTGGCATTTATGAACCTAGTTTCACTGAGGCAACTACCGCAGGAACTTCTGATTCGCCTTCCACACAATATCTGGACATGTAGCCATCTTAATTTGCAGTGGCACAAGACAAATTACCCACGGTGATGCCCCAGTTATTCAGATCGCCCAACCCTAGTCACCGTAAACTGTCACCGTACGCTTAATTGGTTCGATACTTTCGCCTAACTTAAACTACCGGGGACTCGGTCTGGTACGGGAATTGCGAACGTAGATCCTATGAGCTTCGCAGATATGGCCCAACCACCAAAGACCTTACAGAGATACGGCTGATGGCCATGAGTGATCGATCCTACCAACGCGGGCAATCGATCTTTAGTAGTGCTCTCGGGAAGAGCATACAGCCGGCGAGCAGAATCTGGGTCGGAACTCAACAAGAGTGGTCACTGAGCAATAACAGTCGAACTCACAGATGAATTTATCAAACGGGGTATCCGCTGTGGCGGCCATCCAGACGCGGGTAGTAAGAGGTGCTCTACGCAGCCCTCTCGACGATTATTGTATCGATTTTCGACTCCAGTTATCAGGTTTCTATATCAAGGCTATATATTTTGACCTGGCCCCTCAGTACTCATATAGTCTCATCGAAAGGTGGTTGTCTGAGCTGTCAAAAAGCACCCGATCTGCCCCGCTCAACCCATGCCTATCGTCTTGGTTGCGTGGCGCGTTTCTGTAGTGGCTGGCAAGTTGCGATCGTAGCCTCCCGGTCTTGCCGGGACGCGGCTTTACTCCGGAGGCAAGGAATGTGTCTCTGGCTGTGGCGGAAGGATTTGACGTTCAAGGTTAACCATAATCTCCATGCGTGAGTGTTCAGCGCATGTAAGATGAGAAGATTTCCGACCTAATGGATCGTCGTCCAGCAGCGAGCGCCGCGATCAGACTAGGCATATACAAAGTTCCATGCTATTGAATCGCCCGACGTAAGACTGCCAACCAGCCTTTTCGTGCGTATCTAACGCGTTACTATGTTGCAACACCCATGGTTAGGTATAGTATATCTACACATTGAGGGCACTATAAGAGTGACGGGCGAGGCTAAAAGCAACACTTATTGTGCTGGCGTCATCGAGGACGTAACACAATACCTCAGCTACCCGAT
TAGATGGGTATCTTGGGAGTAGTCGTAAGCTAGACATGAAATCTAGGCCACTCTCGCTCTCTTTCGTCGTTAGAAATACTTATGACGCATTTTATTAACAAGACAGCGGCCATCTGCGAGCGCTGCGAGATTCACCCAGGTTTCATCTCACTCGGGCTTGGCTGGAGACGTACAAGGAGCAGGGGGAGGCTACAGATAGTTGCGCAAATGGGCTTGAGTAGCAAGTCCTTGGCACGATACTAATTACCCAATAGTTAATCTAAAGCATCTCTCGGATGTAAAGCTTAACTTAGAAATCTCTACGATCTATAGAACAAAGAGATTTACCTAGCGCTAAGTTTTTTCATAGGAGAAAGTACACCCCGGATAGGAGATTGGCACTACTTAGAGATACTGCGAACTTCCCTCACTCCTTGTGTTCTCGTGGAGTATACTCTACTTTCGAGAACAAATTGACACGGGGCGTCAACTCCGTATCTAACTGTAATATACGTCTCATCGAGCGAACGACGCGTATCAAACATGAGATTCGACATTGTCGCGCTGAAGGATTGGTGTTGGGATCCTGAACAAAAGTTCCCTGAGCGCGCTAAGCGTGATGTATAGTCGAGTTTTGGGACCACTAGACTAACTGGTCCTGTGCGGGAGGCACTAATTTGAGCGACACCGCGAACCCCGCGCCCCATTTACTTGGGTCCAAATTACCCACCCAGAACAGGGCGGACCAGTATGGACTTTAATCACAACGGGTGCCCCTTCAACGCTCATGGTGGGGCCCCAACCCCACACGACAATTTGGGTAAGCGCCGAGCGTGCTCGTTGGTCCGAAGCTTTCGTTTAGGATCAATTTGCTGGAAGAATTCTGTACGACCATCAAAATCCCCCATAGCTATCCAGTTCAGTACGACAGCCAGGGGACGCGGGAGGTCTCGTCGCGCTAGGAATTAGCCAGAATTTTATGGTACAACAATGCTAGTCTACGTTCCCAGATCAATCACATGGCGCCCGACCCCACGCAGTATGAACCTAGCGCCTCACGCAACGATTCAGCTATGGCGCTCAAAACTTTGCAAGGAACGGCTTGCCAGGTCTTCAGTACGAATCATAAAATACTGTGCCGTACTGCCTCTAGTGAACCTTCGGTGGCGACCGGTTCCGTGGTACTTCATCTAGTTAGCCTGGGCTCAGCTAAAATGTAATCCGATATGTCGTTCGCGCTCCCGTTAGGGCATACTTACCTCAGAGCGGGGAAGGGATAAAATTTGAAAGCACCCGGGCCCAGGACTCTCTTTGTCTGAACGAATTGCTGCGAGTGCTGTGGCTGAGCTGGCCGTCACCACCCTAGCTGCCATGTAAATGAACTATTGGCATTTATTATAAGTTCCCCCCCTGAGTACCCGATGTTGGTTCTCGCGGACACTAAGCGGTGCGGAGACGCGTTTTCGCTGGAATGAATGGGCCACAAAAGCAAGCGCCGATCATACCGTTCTCATGCATCGGTCGTGCGGGATACCAGGTCGAAAAGCGCACCGGAGTTAATTCTCGGCATGCTTAGAGTGGGCCTGGTTACGTCGAGACGCTATCTGCCCCTACCGGCTGATGTATTTCGATAATCCGAGTCTCGAGGCCTTGGATACACCCCAGTGTACAATTGTAGGACGTAGAACGTGATGTCTGACGCGTTGAGTGCTTTATCATGCGGACACTCCCATGATTTCTATGATGGGACGTCTGAGGTGTCCGCTGGCGAGTATATGATCAACCGTCGGGTTATTTTGAGTGGTGGGTTGTGCGCGAAGTAGTTATTGTGCTTGGAAATTTAGGTAGATGGTTTTCTGCCGAGACATAGAGCGCTTCTTAGTATTTTTGGGGCCGGGTCAAACCTTCCGACCCCGCGTAACTTCAAAGTGCAAGGACTACCTAGCAACCGATTAGCTTCAAGTGCGGGCGTCAGAGTTTAGATAAAGGGGCCGTTAATGCGCGTATCACCACATGCTATA
TACACTCGGCCCTGTATACTCCTCCTAATATCGCTTGATGAACGCGTTCTACGAGCGCCTCGTACATAACCGAGGAGCCCCCCTGCCCCTGGCTATCCCCGCCACCAAGCTGTCCAAAACCCTACCCCAGGCCGGAACACCTTCTGGCATTAACCAGCAGCGCCAGTGGTAGCAATCTCCTGGGATCCTATGAGACGACGTATCGCTGTTTTTAAGCTTGCGACTGTCGGCCCGACTTCCCGGCAAGAAAAGTTAGGGTATGTGATCCGCCACACGAATCTGGTAGTTCATGCCTTTGCGCGACCGTGAGATACGCAGACTTGAAACCTCTTAGTAATCCATAACGACAATCCCTGCACGGCCACCAGCGAAAACTCTGTATAGTCTAACCGATATTGAACCATGGACACATATTTGCATGGCCGCTGGTTTTTCTCCGTATAATACTCCTTTGCGCTCCCCGATCATGATAAGGCGGGCCTCATAGTGAAACGCTGCCCGGACGCGTTCTGGATCTCGAGTCTCAACCTTTAGGTGCGCTCATGGGACGCCCCACCTTTCGTAGCAGGGGTTAGCGTTTGAACGGGACGCCCAGTGCGCCTATCTCCAGCACGGTAACCTCAACGAATCTTGCGGTGTTGTGAGATTATATAGATGTTCGGTACTTGTGTAAGATGGCAATAGGGACATAGATCTCAACTCAGTCTGCGGACGCCTCCCGGTGCGTGGCTTCAGCCGGGCGGGAGGTCGGGCAGCGTTAGGCCCTGATCACAAGTCATAGAAAGGGGGAGTGTCTGGTCTTCGAGGTAACACTTTGGTTTGACAGAACAATACCCAATAAATGTGTACTAACCACCGCAACATCGAAAGTCAACCAGCGCAGCTGAAATGATCATAGTGGGGTAGTGCGCGACTACTATAAGCACTTACCCGTTACGTTGTTATGTAGACGGTAATTCTTCCTTGGGCACCGCCGCATAGTATCTCCGATTGCGTCTCTGACAACGGTCTCGAGTCTAGCTAGTACCGGTGTCTAAGTGCATGCCTACTCTAGTGTGGACGTCTCTCAGTGTTTAGTCGAATGGTCACCCACCATGTTAGATGGGCCGTAAGTTTTAGTGGTGACGTGTGCTCGTTGTAAACCCCGAACAACCGGTTACGCCATATCTAAACCCGTTCCACGGATTCGAGCCCGAGCATGATGGTGCCCTAACCGCCAATGACGTGCCGAAACCGTATTAGATCCGCTATTACCATAAACTCCCCGGGTTTCTTACAACACATGGTCTCTACCAATATATTGTACCTAGTCGCGAATTGGACACGTTTGCCTGCTTTTTTTAGTTCCACGAATGTGCTTAGCAGCTTTACCAAAAGCGACCTCCGTAATATCGAGAAGTTTAGACTGCCTCCGTGCCTCGCAACTTGGTAAATCTGTCCGGTACTACTTAGAGTGATTGATATCGGCCTATCCCGTACACACAGTACTACAGAAATCGTTGTATCCGTACGAATACAGACCACTCGATAATGATGGGTAAACAGTCAGATTACACACCTCCTAACCCAGTCCAGTGGGGGTCTAGAACGACGTTTCTATGCAATAATGAACGAAGAACTGAGCGATAGAGACTTTAAGAGTGCCACACACGCCGCATGGGTGTTTAGAACGCTTACCGGTAGAACAGCTGTCGAGAACTGTAAAAGAAAACACAAACGTTAGCGTGCACTAAGCAATAGCTCAGATGCTATTACCATCTCTAGGATAGCGCTACAACAGGACCCTCTGGACCACCGCGCAACGTATGCGTACTTCGATCGGGGGTAGGACTCCGTTAGCTGAGGCTGCGGCATGCGGAGCACTGTAGTTTCCTGGCTGCATGTTACTCTATGTGCATGATTGTATGACGTGACATTCTCTTGAGGTAAACCGAATAAAAAGTAAATTCTACTTTAACATCACCTGCTCGACTGTTCCGCAACGCCCCCTTGGCGTCGGA
GACTGCGATTAGCTCGACTAAATCCTATGTGCGATTATGATTGGGCTACAGCGACCCGTGCTAGACCTCGTGATCTGGAAAGGGCCTGCACAGGGAGAACATGGTGAGCCGTTTGCTGGTAATGTACCGAGACGCACGTGTGCACTTATACGCAATATGGAAGTAGGCTCGCACTAGTGCACGCTGGACCAATCGGTGTTTCCCCTAACCCCAGAAGCAGTGCATCTCTCATCGATTCATCGGACGTACATACACGGCCCTTGTTCACCATATCCGTGGATCCATGTCGCCTTACCCTCAAGGGCAGCTCCCGGGACAGTCTATAGGAAAAGGGACAGCCGGTCCCAGGTTCTATCCATAGTAGAGATAAGACCTAAAGCATTAAACTACTGAGGGTGAGCCTGAGCTAATCCCTGCATATAAGACCATAAAAGCTGAGCAAGGAGCTTAGATTTAGCTAAGCCTCGGAAACGGATCTATTTAGTCTCAGGTGAACTGCCTCATGGGGTTCACAAGCAAGGCGTCCCTAAGGCGTTTATGCACGTTCTATTAAGCCTTCGTGCTTATAGGTCTACAGCGCATGGCTTATGAGAGCGAGCGGCGGAACGTAATCCCAGCGCAAGGTACGTCTTAGCCTCCTTCGCTCGCCACGAAGATCTTATCGATTCCGTATTCTTGGAGCATACGGAGTTCTTGCATCAGTAGAATATTGCTGAGCAAGATCTGACTTTACGTTCCAGGACGCCGCAAGACGACTATAGCGTAACGTGGCCAAAGATTCCGTCCCTTCCTCGTAAGTTTTCATGGCAAGCTAGATTTTTCGACAACATTTACGACTGAGCAGCTAGTCCAGAGGGCTACCCGGATGTATCGACGGAGAAGCAGATTATTCTTCTGGCTCTCCTGAGAGAGGCTCACCCGGCTACCTCATAGCTGTGCAAAGCTCCCAGGTAGTTAAGAGCTGGATGTATATCTATCTATACGGGTAGAGGGGAGTTGCCATCGGCATTAACCGTAGACTGTAGGGCAAGACTCGCGTTTGGAGAACTTGGCGGATGCGCTTGTTCGGTAGGCAGGTGTCCACATTTAGATCACTGGTTTGGTTTCGGTTGGGGGGATTTTATGGCTGTGAAAGACTAAGTGTCCCGTTTCGCGTGTTCGTGTTGGGGTCTGCCGCCTAGGTGCACGCAACTTTCATCGTTTCGCCTCCTGTGAAGAACCCGCTATGCCAGCTAGAACAACATCCAAAGGACTCTTGTCATTAAGTCGTAGCGAGGTCCTTCGCTTACGCTTCAATGTCGAGTGTCAGGCTCACATTCGGGCCAAAGGTACCCGTCCCATTAAGTGCATTCGAACTCATACTGGCCCTGTCGTGTAGCAAAGACACACACTCTTCGTTTCCTCTTCCTAGTACGACCCTGGAATGAGTATGTGATCATTCACAGGCTGACTTGACTGCAAGGCGGCCCGCGTTAATTTAAGTGAATTACGAGTAATAACGCCCTCCTTGGGTCCTTGTGGGGAGGTATGATAATCAGCAATCTACCGATACACAAGGTCCGAGGTCGCGTCACGAAACGTCGGCTGCCTAGGGGACCGGCATGATACGTAATACGTCTACCTGCCGGCATCGCTATGCCGGTGGTTAGTAGGGGGTCGATATTTTTGTTTCTCCTTGCGTCTCAGTCAGCGGGTTCTACCTGTTGGATATCCTATTCAGATTGTCAGAGCAGTCCTTCTATTCCAGGATCATGCTTTATTTCCATCGCTCCAGTCTTGTCCGGGCCGACGGCCCGACTATCGTGGGCTGAAGGGTCAGCTGAATTTGTGTACTTGAATTACCATGAACTGTGAAAATCTATGACTACGAGTATAACGTTTAAAAGATGAATGCTTTTCGCACACTGTACACTGCTCATAAACTAAATGCAGGCTCGTTATCAGTTTCTGTTACATCGTTTACACTTGGTACATAGTTAAAACGGTTCCCTTAGGGGGAA
CATATCTACTACCTATTTCACGACAGACGCAAACTGAGTAAACATTGGATTGAGCCTTCCTTGAGTTTCCAATAGCGAGGTACTTTATTAGAGGACGTAGGAATGCTCTTTCACGAACCACTGACGGTCGTGCAGATAGTGCTTAGATTTTTGTGCTCTGGGCCTCACGATTGTAGCGTCTAACCAGGCGCCCATTTAACTCGCAGGCCCTTTCAATAATCTACCTTTTAAGACCCGGCTAGCCAGCTAAAATTAGATACTTCGTCACTTTGTGCTCGGTAGGCGTTTGGGCATCGCGAAATGACTATACTAACTTTTTTCACTACGACTGACACGCGGATAGAGACATGCATGTAAAAAGGTTATCAAAATGATGGTCTTCGGG" ) #could use any DNA sequence #method reversecomplement returns the reverse complement, recall that this is a reversal of the string and then swapping C for G, G for C, A for T and T for A. A = my_dna.reverse_complement() print(A)
#!/usr/bin/env python3 from Bio.Seq import Seq from Bio.Alphabet import IUPAC # In ORF (Stronghold) we did enumerate ORFs in a straightforward O(n^2) way # (as there are anyway O(n) of them of total size O(n^2) in the worst case). # Here we enumerate *maximal* ORFs (that cannot be extended to the left # from an earlier start) in O(n) (there are O(n) of them of total size O(n) # in the worst case). start = 'M' stop = '*' DNA = Seq(input(), IUPAC.unambiguous_dna) DNArc = DNA.reverse_complement() N = len(DNA) # on traduit integralement les 3 decalages (reading frames) de la chaine # et de son complementaire inverse P = [] for i0 in range(3): P.append(DNA[i0:N - ((N - i0) % 3)].translate()) P.append(DNArc[i0:N - ((N - i0) % 3)].translate()) # on recherche les ORF maximaux (non prolongeables sur la gauche) ir, ml, mr = None, 0, 0 for rf in range(6): l = None for i in range(len(P[rf])): if P[rf][i] == start and l == None: l = i elif P[rf][i] == stop and l != None:
from Bio.Seq import Seq

# Basic Biopython sequence-manipulation demo.
seq_obj = Seq("AGTACACTGGT")
print(seq_obj)

# Complementary strand
print(seq_obj.complement())

# Reverse-complement strand
print(seq_obj.reverse_complement())

# Transcription (DNA -> RNA) and back-transcription (RNA -> DNA)
dna = Seq("ATGGCCATTCGCAAGGGTGCCCGATAG")
print("DNA:" + dna)
rna = dna.transcribe()
print("RNA:" + rna)
dna2 = rna.back_transcribe()
print("DNA:" + dna2)

# Translation to protein works from either the RNA or the DNA object
print(rna.translate())
print(dna.translate())
# Because python numbering starts at 0:
start_pos_2 = start_pos - 1
end_pos_2 = end_pos - 1

# Extract the invertible region and write it to out_file.fa.
with open("out_file.fa", "w") as f:
    for seq_record in SeqIO.parse(in_file, "fasta"):
        # writes sequence from start to end pos
        f.write(str(seq_record.seq[start_pos_2:end_pos_2]))

# FIX: the mode string here was the malformed "w'" (stray quote inside the
# mode), which makes open() raise ValueError at runtime -- it must be "w".
with open("reverse_file.fa", "w") as f:
    # Re-read the region just written and reverse-complement it.
    # FIX: the handle on out_file.fa was opened but never closed; a
    # with-block releases it deterministically.
    with open("out_file.fa") as q:
        r = Seq(q.read(), generic_dna)
    inv_region = r.reverse_complement()  # reverse complement of the invertible region
    f.write(str(inv_region))

# Now we should have two files - out_file.fa containing the invertible DNA
# region, and reverse_file.fa containing the reverse complement of the former.

# Extract flanking regions as well (1 kb on each side):
left_flank = start_pos - 1000
right_flank = end_pos + 1000
with open("left_flank.fa", "w") as f:
    for seq_record in SeqIO.parse(in_file, "fasta"):
        f.write(str(seq_record.seq[left_flank:start_pos_2]))
with open("right_flank.fa", "w") as f:
    # NOTE(review): the source chunk is truncated here -- the body that fills
    # right_flank.fa is not visible, so only a placeholder is emitted.
    pass
def get_custom_fasta(ref_fasta, subsectionlist, args, model_kmer_means, kmer_len):
    # Build per-sequence k-mer mean-current profiles for selected subsections
    # of a reference FASTA, z-score them, and pack them into a shared-memory
    # 3D array for multiprocessing.
    #
    # ref_fasta        -- path to the reference FASTA file
    # subsectionlist   -- dict: sequence id -> list of (start, end) sections
    # args             -- options namespace; only args.verbose is read here
    # model_kmer_means -- dict: k-mer string -> model mean current
    # kmer_len         -- k-mer length
    # Returns (seqids, shared_3d_array).
    #
    # NOTE(review): the source stored this function collapsed onto single
    # lines; it is re-indented here with comments added, code tokens unchanged.
    if (args.verbose is True):
        print "Generating a custom fasta"
    sequencedict = dict()
    for sequence in subsectionlist:
        if (args.verbose is True):
            print sequence
        for record in SeqIO.parse(ref_fasta, 'fasta'):
            if (record.id == sequence):
                if (sequence not in sequencedict):
                    sequencedict[sequence] = list()
                for sections in subsectionlist[sequence]:
                    start = sections[0]
                    end = sections[1]
                    # concatenate all requested sections of this record
                    if (len(sequencedict[sequence]) > 0):
                        sequencedict[sequence] = str(
                            sequencedict[sequence]) + str(
                            record.seq[sections[0] - 1:sections[1] - 1])
                    else:
                        sequencedict[sequence] = str(
                            record.seq[sections[0] - 1:sections[1] - 1])
    if (args.verbose is True):
        print "processing the custom fasta"
    kmer_means = dict()
    for sequence in sequencedict:
        # NOTE(review): `record` here is whatever the *last* FASTA record of
        # the loop above happened to be, so every iteration keys kmer_means on
        # the same record.id instead of `sequence` -- looks like a bug (the
        # trailing docstring below also hints at ordering trouble); confirm
        # before relying on multi-sequence input.
        kmer_means[record.id] = dict()
        tmp = dict()
        tmp2 = dict()
        tmp["F"] = list()
        tmp["R"] = list()
        tmp["Fprime"] = list()
        tmp["Rprime"] = list()
        print "ID", record.id
        print "length", len(record.seq)
        print "FORWARD STRAND"
        seq = Seq(sequencedict[sequence], generic_dna)
        # slide a window of kmer_len over the forward strand
        for x in range(len(seq) + 1 - kmer_len):
            kmer = str(seq[x:x + kmer_len])
            tmp["F"].append(float(model_kmer_means[kmer]))
        print "REVERSE STRAND"
        # same scan over the reverse complement
        seq = revcomp = seq.reverse_complement()
        for x in range(len(seq) + 1 - kmer_len):
            kmer = str(seq[x:x + kmer_len])
            tmp["R"].append(float(model_kmer_means[kmer]))
        # z-score the raw model means per strand
        tmp2["Fprime"] = sklearn.preprocessing.scale(tmp["F"],
                                                     axis=0,
                                                     with_mean=True,
                                                     with_std=True,
                                                     copy=True)
        tmp2["Rprime"] = sklearn.preprocessing.scale(tmp["R"],
                                                     axis=0,
                                                     with_mean=True,
                                                     with_std=True,
                                                     copy=True)
        kmer_means[record.id] = tmp2
    '''From this dictionary we will return a pair consisting of a list of keys(lookup for sequence name) and a 3D array each slice of which relates to the seqid,forward and reverse and then the values. This will then be used as a numpy shared memory multiprocessing array. We hope. Caution - the dictionary returns in the wrong order. '''
    items = kmer_means.items()
    '''for k,v in kmer_means.items(): for x,y in kmer_means[k].items(): print "idiot check",k,x '''
    items_ = map(processItems, items)
    seqids, arrays = zip(*items_)
    z = len(seqids)
    print arrays
    r, c = list(arrays)[0].shape
    # allocate a flat shared array sized seqids x rows x cols
    threedarray = multiprocessing.Array(ctypes.c_double, z * r * c)
    threedarrayshared_array = np.ctypeslib.as_array(threedarray.get_obj())
    a = np.array(arrays, dtype=np.float32)
    # NOTE(review): this rebinds the local name to `a` rather than copying into
    # the shared buffer (threedarrayshared_array[:] = a), so the shared memory
    # allocated above is never filled; confirm against the consumer code.
    threedarrayshared_array = a
    return seqids, threedarrayshared_array
from Bio.Seq import Seq from Bio.Alphabet import IUPAC from snakemake import shell if snakemake.params['amplicon_type'] == "ITS": print("ITS Trimming") forward_primer_compl = Seq.reverse_complement( Seq(snakemake.params['forward_primer'], IUPAC.ambiguous_dna)) shell("""cutadapt \ --cores {snakemake.threads} \ --error-rate 0.1 \ --times 2 \ --overlap 3 \ -o {snakemake.output[R1_trimmed_reads]} \ -g '{snakemake.params[forward_primer]}' \ -a '{reverse_primer_compl}' \ --match-read-wildcards \ --discard-untrimmed \ {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""") elif snakemake.params['amplicon_type'] == "16S": print("16S Trimming") reverse_primer_compl = Seq.reverse_complement( Seq(snakemake.params['reverse_primer'], IUPAC.ambiguous_dna)) shell("""cutadapt \ --cores {snakemake.threads} \ --error-rate 0.1 \ --times 1 \ --overlap 3 \ -o {snakemake.output[R1_trimmed_reads]} \ -g '{snakemake.params[forward_primer]}' \
def bam_seqlogo(file, outfilename, minlength=0, maxlength=1000, unique=False):
    """Build sense/antisense sequence logos from a BAM file via weblogo.

    Mapped reads (CIGAR ending in 'M') with length in [minlength, maxlength]
    are split by strand (SAM flag field '16' = antisense, reverse-complemented
    so everything reads 5'->3'), N-padded to a common length, written to a
    temporary FASTA file and fed to the external ``weblogo`` command.  Two
    PDFs are produced: ``<outfilename>_sense.pdf`` and
    ``<outfilename>_antisense.pdf``.

    file        -- path to the BAM file
    outfilename -- prefix for the output PDF files
    minlength   -- minimum read length kept (inclusive)
    maxlength   -- maximum read length kept (inclusive)
    unique      -- True: write each read once; False: repeat each read by the
                   count encoded after '-' in its name
    Returns the string 'SeqLogos constructed'.
    """
    # (names, sequences, counts) collected per strand
    posnames, posseqs, poscounts = [], [], []
    negnames, negseqs, negcounts = [], [], []
    postotal = negtotal = mappedtotal = 0

    bamfile = pysam.AlignmentFile(file, "rb")
    for read in bamfile:
        # Parse the SAM text form of the read: tab-separated fields with the
        # name at [0], flag at [1], CIGAR at [5] and sequence at [9].
        fields = str(read).split("\t")
        cigar = fields[5]
        # a CIGAR ending in 'M' filters out unmapped reads (flag 4)
        if cigar.endswith('M'):
            seq = fields[9].replace('T', 'U')  # report reads as RNA
            # read names follow the pattern '<name>-<count>'
            count = int(fields[0].split('-')[1])
            name = fields[0].split('-')[0]
            strand = fields[1]
            # length filtering
            if minlength <= len(seq) <= maxlength:
                mappedtotal += 1
                if strand == '16':
                    # negative sense: store the reverse complement
                    negtotal += 1
                    negnames.append(name)
                    negseqs.append(str(Seq(seq).reverse_complement()))
                    negcounts.append(count)
                else:
                    # positive sense
                    postotal += 1
                    posnames.append(name)
                    posseqs.append(seq)
                    poscounts.append(count)
    # FIX: release the BAM handle (the original never closed it)
    bamfile.close()

    print('Total mapped reads = ' + str(mappedtotal) + '\nSense reads = ' +
          str(postotal) + '\nAntisense reads = ' + str(negtotal) +
          '\nMissing mapped reads = ' + str(mappedtotal - (postotal + negtotal)))

    # seqlogo needs all reads the same length, so pad shorter reads with Ns
    # (which weblogo ignores).
    # FIX: the original found the maximum length with
    # sorted(seqs, key=len)[len(seqs) - 1], which raises IndexError when a
    # strand collected no reads and sorts needlessly; max() with default=0
    # plus str.ljust is linear and safe for empty lists.
    _pad_with_n(posseqs)
    _pad_with_n(negseqs)

    # output the positive sense reads and call weblogo
    _write_fasta_and_run_weblogo(posnames, posseqs, poscounts, unique,
                                 'pos.fas', outfilename + '_sense.pdf')
    # output the negative sense reads and call weblogo
    _write_fasta_and_run_weblogo(negnames, negseqs, negcounts, unique,
                                 'neg.fas', outfilename + '_antisense.pdf')
    return ('SeqLogos constructed')


def _pad_with_n(seqs):
    """Right-pad every sequence in *seqs* in place with 'N' to the longest length."""
    maxlen = max((len(s) for s in seqs), default=0)
    for i in range(len(seqs)):
        seqs[i] = seqs[i].ljust(maxlen, 'N')


def _write_fasta_and_run_weblogo(names, seqs, counts, unique, fasta_name, pdf_name):
    """Write the reads as FASTA to *fasta_name*, run weblogo into *pdf_name*,
    then delete the temporary FASTA file."""
    # FIX: build the FASTA in a list and join once -- the original concatenated
    # strings in a loop, which is quadratic for many reads.
    records = []
    for i in range(len(seqs)):
        if unique == True:
            records.append('>' + names[i] + '\n' + seqs[i] + '\n')
        elif unique == False:
            # expand each read according to its abundance count
            for j in range(counts[i]):
                records.append('>' + names[i] + '_' + str(j) + '\n' + seqs[i] + '\n')
    with open(fasta_name, 'wt') as outfile:
        outfile.write(''.join(records))
    command = ('weblogo -f ' + fasta_name + ' -D fasta -o ' + pdf_name +
               ' -F pdf -A RNA -a \'ACGU\' -c classic --yaxis 1 --errorbars NO')
    subprocess.run(command, shell=True)
    subprocess.run('rm ' + fasta_name, shell=True)
print ">forward", name , #frwd_raw_res #prints the fasta tag for forwward primer with name of sequance and the cuting site print frwd #prints the forward primer (eventully with the primer) frwd_G = frwd.count("G") #counts G's #print "G: ", frwd_G #prints number of G's frwd_C = frwd.count("C") #counts the number of C's #print "C: ", frwd_C #prints number of C's frwd_A = frwd.count("A") #counts number of A's frwd_T = frwd.count("T") #counts the number of T's frwdG_C = 100 * float(frwd_G + frwd_C) / len(frwd) # GC content (G/C)/len*100 print "GC content", frwdG_C, "%" #prints the gc contents in a precent frwd_MT = 64.9 + 41 *float( (frwd_G + frwd_C - 16.4) / (frwd_A + frwd_T + frwd_G + frwd_C)) #http://www.biophp.org/minitools/melting_temperature/demo.php?formula=basic #the melting temp equation print "melting temp", frwd_MT #prints the melting temp revs = Seq(sequance_sites[-20:]) #sequace for the reverse primer revs_revs_comp = revs.reverse_complement() #reverse complement of the last 20 characters #documentation same as forward print ">reverse", name print revs_revs_comp revs_G = revs_revs_comp.count("G") revs_C = revs_revs_comp.count("C") revs_A = revs_revs_comp.count("A") revs_T = revs_revs_comp.count("T") revsG_C = 100 * float (revs_G + revs_C ) / len(revs_revs_comp) print "GC content: ",revsG_C, "%" revs_MT = 64.9 + 41 *float( (revs_G + revs_C - 16.4) / (revs_A + revs_T + revs_G + revs_C)) #http://www.biophp.org/minitools/melting_temperature/demo.php?formula=basic print "melting temp", revs_MT