def _as_seq_object(dna, alphabet=IUPAC.ambiguous_dna):
    """Return *dna* as a ``Seq``.

    An existing ``Seq`` instance is handed back untouched; anything else is
    wrapped in a new ``Seq`` built with *alphabet*.
    """
    if isinstance(dna, Seq):
        return dna
    return Seq(dna, alphabet)
def test_non_iupac_letters(self):
    """Test if non-IUPAC letters raise a TypeError."""
    # "Z" is the offending letter; construction must happen inside the
    # context manager so an error from either constructor is captured.
    with self.assertRaises(TypeError):
        FormattedSeq(Seq("GATCZ"))
#!/usr/bin/env python
# encoding: utf-8
"""
Created by John DiBaggio on 2016-11-30

Count the A, C, G and T symbols of a DNA sequence.

Usage: <script> INPUT_FILE OUTPUT_FILE

The whole content of INPUT_FILE is treated as the sequence.  OUTPUT_FILE
receives the sequence followed by one ``<base>: <count>`` line per base.
"""
__author__ = 'johndibaggio'

import sys
import os

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

argv = list(sys.argv)

# Context managers guarantee both files are closed even if reading,
# counting or writing raises.
with open(argv[1]) as input_file:
    dna = input_file.read()

# The input is DNA (T, not U, is counted below), so tag the sequence with
# the DNA alphabet rather than the RNA one.
dna_sequence = Seq(dna, IUPAC.unambiguous_dna)

a_count = dna_sequence.count("A")
c_count = dna_sequence.count("C")
g_count = dna_sequence.count("G")
t_count = dna_sequence.count("T")

with open(argv[2], 'w+') as output_file:
    output_file.write("DNA: " + dna +
                      "\nA: " + str(a_count) +
                      "\nC: " + str(c_count) +
                      "\nG: " + str(g_count) +
                      "\nT: " + str(t_count))
feature_type = seq_record.features[feat].type if feature_type == 'gene': try: feature_start_zero_based_numbering = seq_record.features[ feat].location.nofuzzy_start feature_end_zero_based_numbering = seq_record.features[ feat].location.nofuzzy_end feature_strand = seq_record.features[feat].strand gene_id = str(seq_record.features[feat].qualifiers['gene'][0]) sequence_slice = gb_entire_sequence_joined[ feature_start_zero_based_numbering: feature_end_zero_based_numbering] if feature_strand == 1: gene_array.at[accession, gene_id] = sequence_slice if feature_strand == -1: sequence_slice_BP = Seq(sequence_slice) sequence_slice_rvscomp = str( sequence_slice_BP.reverse_complement()) gene_array.at[accession, gene_id] = sequence_slice_rvscomp except KeyError: pass # Count and record instances of each gene in the dataframe gene_instances_list = [] #print('Counts of each gene extracted from GenBank file:') for gene in gene_list_rem_dups: gene_to_append = gene, len(gene_array) - gene_array[gene].isnull().sum() gene_instances_list.append(gene_to_append) #print(gene, len(gene_array)-gene_array[gene].isnull().sum()) gene_instances_list_DF = pd.DataFrame(gene_instances_list,
def test_overlapping_cut_sites(self):
    """Check if overlapping recognition sites are properly handled."""
    target = Seq("CATGCACGCATGCATGCACGC")
    cut_positions = SphI.search(target)
    self.assertEqual(cut_positions, [13, 17])
def revcomp(seq):
    """Return the reverse complement of *seq* as a plain string."""
    # Wrap in a Seq object only to borrow its reverse_complement().
    return str(Seq(seq).reverse_complement())
def cluster(file_names, candidates, min_copy_number, FSL, workers):
    """Cluster MITE candidates with vsearch, filter the clusters, write results.

    Steps, in order:

    1. Run ``vsearch --cluster_fast`` on ``file_names['file_candidates_fasta']``.
    2. Read every file found under ``file_names['file_temp_cluster_dir']`` and
       collect the FASTA header ids it contains, keyed by file name.
    3. Pass the clusters through ``filtercluster``.
    4. Drop every cluster that has fewer than ``min_copy_number`` members whose
       flanking sequences are dissimilar to those of all other members
       (best local-alignment score divided by ``FSL`` must be <= 0.5).
    5. Write a GFF3 file, a one-record-per-family FASTA, an all-records FASTA
       and finally call ``cluster2seq``.

    Args:
        file_names: mapping holding the paths used above
            (``file_candidates_fasta``, ``file_temp_cluster``,
            ``file_temp_cluster_dir``, ``file_gff``, ``file_representative``,
            ``all_file``, ``families_file``).
        candidates: mapping from candidate id to a dict-like record with at
            least ``fs_left``, ``fs_right``, ``seq``, ``candidate_id``,
            ``record``, ``start``, ``end``, ``tsd``, ``tir_len`` and
            ``description``. Records are updated in place with ``id`` and a
            new ``description``.
        min_copy_number: minimum number of members with distinct flanks.
        FSL: flanking sequence length, used to normalise alignment scores.
        workers: thread count handed to vsearch.
    """
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import SeqIO
    from Bio import pairwise2
    from subprocess import Popen, PIPE
    from collections import OrderedDict
    from statistics import mode, StatisticsError
    import os

    makelog("Clustering")
    cmd_list = [
        './vsearch-2.7.1/bin/vsearch',
        '--cluster_fast', file_names['file_candidates_fasta'],
        '--threads', str(workers),
        '--strand', 'both',
        '--clusters', file_names['file_temp_cluster'],
        '--iddef', '1',
        '--id', '0.8']
    makelog(' '.join(cmd_list))
    p = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    if p.returncode != 0:
        # Keep going (best effort, as before) but do not hide the failure.
        makelog("vsearch exited with code %i: %s"
                % (p.returncode, err.decode('utf-8', 'replace').strip()))
    makelog("Clustering done")
    makelog("Filtering clusters")

    # Map each cluster file name to the sequence ids listed in it.
    clusters_dic = {}
    list_dir = os.listdir(file_names['file_temp_cluster_dir'])
    makelog("Initial clusters: %i" % (len(list_dir),))
    for fn in list_dir:
        # Plain concatenation is kept on purpose: it is how the original
        # builds the path from 'file_temp_cluster_dir'.
        cluster_path = file_names['file_temp_cluster_dir'] + fn
        if os.path.isfile(cluster_path):
            with open(cluster_path) as fh:
                for line in fh:
                    if line.startswith(">"):
                        id_seq = line[1:line.find(" ")]
                        clusters_dic.setdefault(fn, []).append(id_seq)

    filtered_clusters = filtercluster(clusters_dic, min_copy_number, candidates)
    # Iterate over a snapshot of the keys: clusters are deleted in the loop.
    unique_clusters = set(filtered_clusters.keys())

    for current_cluster in unique_clusters:
        candidates_in_cluster = filtered_clusters[current_cluster]
        new_min_copy_number = min_copy_number
        sum_diff_fs_cluster = 0
        for x in candidates_in_cluster:
            cand_x = candidates[x]
            fs_right_1 = cand_x['fs_right']
            fs_left_1 = cand_x['fs_left']
            # Empty strings in some versions of pandas are returned as nan,
            # so make sure the flanking seqs really are strings.
            if (fs_left_1 == '' or fs_right_1 == ''
                    or not isinstance(fs_left_1, str)
                    or not isinstance(fs_right_1, str)):
                continue
            if not complex_enough(fs_right_1) or not complex_enough(fs_left_1):
                continue
            fs_right_1 = fs_right_1.upper()
            fs_left_1 = fs_left_1.upper()
            # These depend on x only, so compute them once per x.
            fs_left_1_rc = Seq(fs_left_1).reverse_complement()
            fs_right_1_rc = Seq(fs_right_1).reverse_complement()

            totally_different_fs = True
            at_least_one = False
            for y in candidates_in_cluster:
                cand_y = candidates[y]
                if cand_x['candidate_id'] == cand_y['candidate_id']:
                    continue
                # Some MITEs sit at the very start or end of a sequence and
                # therefore have no flanking sequence.
                fs_right_2 = cand_y['fs_right']
                fs_left_2 = cand_y['fs_left']
                if (fs_left_2 == '' or fs_right_2 == ''
                        or not isinstance(fs_right_2, str)
                        or not isinstance(fs_left_2, str)):
                    continue
                if not complex_enough(fs_right_2) or not complex_enough(fs_left_2):
                    continue
                fs_right_2 = fs_right_2.upper()
                fs_left_2 = fs_left_2.upper()
                # Since a MITE might be longer, also look a few nt inside it.
                fs_right_2_plus_mite = cand_y['seq'][-FSL:].upper() + fs_right_2
                fs_left_2_plus_mite = fs_left_2 + cand_y['seq'][0:FSL].upper()

                # R1 x R2, L1 x L2, L1RC x R2, R1RC x L2, then the same four
                # against the flanks extended into the MITE.
                comparisons = (
                    (fs_right_1, fs_right_2),
                    (fs_left_1, fs_left_2),
                    (fs_left_1_rc, fs_right_2),
                    (fs_right_1_rc, fs_left_2),
                    (fs_right_1, fs_right_2_plus_mite),
                    (fs_left_1, fs_left_2_plus_mite),
                    (fs_left_1_rc, fs_right_2_plus_mite),
                    (fs_right_1_rc, fs_left_2_plus_mite),
                )
                max_score = max(
                    pairwise2.align.localms(a, b, 1, -1, -1, -1, score_only=True)
                    for a, b in comparisons)
                if max_score == []:
                    max_score = 0
                max_score /= FSL
                at_least_one = True
                if max_score > 0.5:
                    # A similar flank was found: x is not an independent copy.
                    totally_different_fs = False
                    break
            if totally_different_fs and at_least_one:
                sum_diff_fs_cluster += 1
            if sum_diff_fs_cluster >= new_min_copy_number:
                break
        if sum_diff_fs_cluster < new_min_copy_number:
            del filtered_clusters[current_cluster]

    ordered_cluster = OrderedDict(sorted(filtered_clusters.items(), key=lambda t: t[0]))
    makelog("Clusters: " + str(len(filtered_clusters)))

    buffer_rec = []  # every accepted candidate
    buffer_nr = []   # first candidate of each family
    count = 1
    family_number = 1
    # The context manager makes sure the GFF is flushed and closed.
    with open(file_names['file_gff'], "w") as output_gff:
        output_gff.write("##gff-version 3\n")
        for clus, seqs in ordered_cluster.items():
            one_per_family = False
            tsd_family = []
            for seq in seqs:
                candidate = candidates[seq]
                candidate['id'] = "MITE_T_%s|%s|%s|%s|%s|%s|F%s" % (str(count),candidate['record'],candidate['start'],candidate['end'],candidate['tsd'],candidate['tir_len'],family_number)
                candidate['description'] = "%s CANDIDATE_ID:%s" % (candidate['description'], candidate['candidate_id'].split('|')[0])
                record = SeqRecord(Seq(candidate['seq']), id=candidate['id'], description=candidate['description'])
                buffer_rec.append(record)
                write_row = '\t'.join([candidate['record'], 'MITE_Tracker', 'MITE',
                                       str(candidate['start']), str(candidate['end']),
                                       '.', '+', '.', 'ID=' + candidate['id']])
                output_gff.write(write_row + '\n')
                tsd_family.append(candidate['tsd'])
                if not one_per_family:
                    one_per_family = True
                    record_family = record
                    buffer_nr.append(record_family)
                count += 1
            try:
                tsd_consensus = mode(tsd_family)
                record_family.description = '%s COMMON_TSD:%s' % (record_family.description, tsd_consensus)
            except StatisticsError:
                # No usable mode: leave the description unchanged.
                pass
            family_number += 1

    SeqIO.write(buffer_nr, file_names['file_representative'], "fasta")
    SeqIO.write(buffer_rec, file_names['all_file'], "fasta")
    cluster2seq(ordered_cluster, candidates, file_names['families_file'])
def setUp(self):
    """Create the three short records (two of them gapped) used as fixture."""
    fixture_rows = (("AAA", "s1"), ("A-G", "s2"), ("-A-", "s3"))
    self.sequences = [
        SeqRecord(Seq(letters), id=record_id)
        for letters, record_id in fixture_rows
    ]
def _print_fasta_body(nucstr, outfmt, outh):
    """Print *nucstr* wrapped, translated first unless *outfmt* is 'nuc_fa'."""
    if outfmt == 'nuc_fa':
        print(sequtils.wrap(nucstr), file=outh)
    else:
        # Trim to a whole number of codons before translating.
        s = Seq(nucstr[:(old_div(len(nucstr),3))*3])
        print(sequtils.wrap(str(s.translate())), file=outh)


def extract_pairwise(align_json=None, outfile=None, outfmt=None, refreg=None,
                     debug=False,
    ):
    """Write data extracted from a pairwise-alignment JSON in one of several formats.

    Args:
        align_json: passed to ``load_slot_json`` to obtain either the
            'padded_alignments' or the 'padded_gtf' slot.
        outfile: path to write to; ``None`` writes to ``sys.stdout``.
        outfmt: one of

            * 'nuc_fa'  - nucleotide FASTA of the aligned sequence,
            * 'prot_fa' - same sequence, translated,
            * 'aln_fa'  - reference and query rows of every alignment,
            * 'amp_gtf' - the 'padded_gtf' lines,
            * 'tsv'     - every alignment tuple as a tab-separated row.

            Any other value writes nothing.
        refreg: for 'nuc_fa'/'prot_fa' only, a region string parsed by
            ``sequtils.region_to_tuple``; output is restricted to it.
        debug: unused in this function.

    Alignment positions whose fourth field is -1 are left out of the
    nucleotide string, and '*' is replaced by 'N'.
    """
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    try:
        if outfmt == 'nuc_fa' or outfmt == 'prot_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            if refreg is None:
                for newname, alignment in list(jaln.items()):
                    nucstr = ''.join(t[2] for t in alignment if t[3] != -1)
                    nucstr = nucstr.replace('*', 'N')
                    print('>%s' % newname, file=outh)
                    _print_fasta_body(nucstr, outfmt, outh)
            else:
                refmap = {sequtils.parse_seq_id(k)['ref']:k for k in list(jaln.keys())}
                chrom, ref_s, ref_e = sequtils.region_to_tuple(refreg)
                ref_s = ref_s - 1
                alignment = jaln[refmap[chrom]]

                # Get alignment start: first tuple at ref_s, then move right
                # past positions flagged -1.
                for aln_s in range(len(alignment)):
                    if alignment[aln_s][0] == ref_s:
                        break
                while alignment[aln_s][3] == -1:
                    aln_s += 1

                # Get alignment end: last tuple at ref_e, then move left
                # past positions flagged -1.
                for aln_e in range(len(alignment)-1, -1, -1):
                    if alignment[aln_e][0] == ref_e:
                        break
                while alignment[aln_e][3] == -1:
                    aln_e += -1

                nucstr = ''.join(t[2] for t in alignment[aln_s:aln_e] if t[3] != -1)
                nucstr = nucstr.replace('*', 'N')
                print('>%s (%s)' % (refmap[chrom], refreg), file=outh)
                _print_fasta_body(nucstr, outfmt, outh)
        elif outfmt == 'aln_fa':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in list(jaln.items()):
                aid = sequtils.parse_seq_id(newname)
                rstr = ''.join(t[1] for t in alignment).replace('*', 'N')
                qstr = ''.join(t[2] for t in alignment).replace('*', 'N')
                print('>ref|%s|' % aid['ref'], file=outh)
                print(sequtils.wrap(rstr), file=outh)
                print('>sid|%s|' % aid['sid'], file=outh)
                print(sequtils.wrap(qstr), file=outh)
        elif outfmt == 'amp_gtf':
            jgtf = load_slot_json(align_json, 'padded_gtf')
            print('\n'.join(_ for _ in jgtf), file=outh)
        elif outfmt == 'tsv':
            jaln = load_slot_json(align_json, 'padded_alignments')
            for newname, alignment in list(jaln.items()):
                print('# %s' % newname, file=outh)
                for l in alignment:
                    print('\t'.join(str(_) for _ in l), file=outh)
    finally:
        # Close only what this function opened; never close stdout.
        if outh is not sys.stdout:
            outh.close()
def _alignment_record(sequence):
    """Wrap *sequence* in a ``SeqRecord`` whose ``Seq`` uses a gapped generic-DNA alphabet."""
    gapped_dna = Alphabet.Gapped(Alphabet.generic_dna)
    return SeqRecord(Seq(sequence, alphabet=gapped_dna))
def seqrecord(sequence_id, sequence_text, alphabet=Alphabet.generic_dna):
    """
    Quick shortcut to make a SeqRecord

    *sequence_text* becomes the record's ``Seq`` (built with *alphabet*) and
    *sequence_id* becomes its ``id``.
    """
    sequence = Seq(sequence_text, alphabet)
    return SeqRecord(sequence, id=sequence_id)
def proRC(self):
    """Return the reverse complement of ``self.seq`` as a string."""
    dna = Seq(self.seq, IUPAC.unambiguous_dna)
    return str(dna.reverse_complement())
from Bio.Seq import Seq

# Read a DNA sequence from the user and print the protein it encodes.
Dna = input("enter DNA sequence ")
Dna = Seq(Dna)
# DNA -> mRNA is transcription.  The previous code called translate() here,
# which already yields a protein and then translated that protein again.
Mrna = Dna.transcribe()
# mRNA -> protein is translation.
protein = Mrna.translate()
print(protein)
from Bio.Alphabet import generic_dna from Bio.SeqUtils import molecular_weight, MeltingTemp from Bio.Restriction import AllEnzymes from pathlib import Path from exmemo import Workspace from functools import lru_cache from more_itertools import one work = Workspace.from_path(__file__) work.sequence_dir = work.root_dir / 'sequences' work.plasmid_dir = work.sequence_dir / 'plasmids' work.plasmid_db = work.sequence_dir / 'plasmids.xlsx' work.fragment_db = work.sequence_dir / 'fragments.xlsx' work.oligo_db = work.sequence_dir / 'oligos.xlsx' DnaSeq = lambda x: Seq(x.upper(), generic_dna) param_pattern = r'(?P<key>\w+)=((?P<value>[^"]\S*)|(?P<value_quoted>".*?"))(\s|$)' def get_seq(tag): return dispatch_to_tag( tag, p=get_plasmid_seq, f=get_fragment_seq, o=get_oligo_seq, ) def get_mw(tag): return molecular_weight( seq=get_seq(tag),
def simulate_read_with_error_model(cls, genome, ErrorModel, i, always_forward=True):
    """Form a read from one genome (or sequence) according to an ErrorModel

    returns a string

    The random state is seeded with ``i``, so the same arguments always pick
    the same read position. A genome that is not longer than the read length
    is padded with ``N`` until it is one base longer than a read.

    Args:
        genome (string): sequence or genome of reference
        ErrorModel (ErrorModel): an ErrorModel class; this function uses its
            ``read_length``, ``random_insert_size()``, ``introduce_indels()``,
            ``introduce_error_scores()`` and ``mut_sequence()``
        i (int): a number identifying the read (also the random seed)
        always_forward (bool): when true a forward read is always produced;
            otherwise forward or reverse is chosen at random

    Returns:
        string: a string representing a single read
    """
    np_random = np.random.RandomState(seed=i)
    read_length = ErrorModel.read_length

    if len(genome) <= read_length:
        genome = "".join([genome, "N" * (read_length - len(genome) + 1)])

    record = SeqRecord(Seq(genome, IUPAC.unambiguous_dna),
                       id=f'genome_{i}', description='')
    sequence = record.seq
    header = record.id

    # Position of the forward read; also anchors the reverse read below.
    forward_start = np_random.randint(
        low=0, high=max(len(record.seq) - read_length + 1, 1))
    forward_end = forward_start + read_length

    # Drawn even when always_forward is set, so the random stream consumed
    # per read does not depend on that flag.
    generate_forward = np_random.randint(low=0, high=2)

    if generate_forward or always_forward:
        orientation = 'forward'
        bounds = (forward_start, forward_end)
        perfect_read = str(sequence[forward_start:forward_end])
    else:
        insert_size = ErrorModel.random_insert_size()
        reverse_start = forward_end + insert_size
        reverse_end = reverse_start + read_length
        # Explicit bounds check instead of assert/except AssertionError:
        # asserts are removed under ``python -O``, which would let the read
        # run past the end of the sequence.
        if not reverse_end < len(record.seq):
            reverse_end = np_random.randint(low=read_length,
                                            high=len(record.seq))
            reverse_start = reverse_end - read_length
        orientation = 'reverse'
        bounds = (reverse_start, reverse_end)
        perfect_read = rev_comp(str(sequence[reverse_start:reverse_end]))

    # create a perfect read
    read = SeqRecord(Seq(perfect_read, IUPAC.unambiguous_dna),
                     id='%s_%s' % (header, i), description='')
    # add the indels, the qual scores and modify the record accordingly
    read.seq = ErrorModel.introduce_indels(read, orientation, sequence, bounds)
    read = ErrorModel.introduce_error_scores(read, orientation)
    read.seq = ErrorModel.mut_sequence(read, orientation)
    return str(read.seq)
style = '' end = i - 1 features.append({ 'style': '-->', 'sel': [begin, end], 'position': position }) return features #prof=cons_prof(alignment) #pylab.plot(prof) if __name__ == '__main__': human_h2a_z_core = Seq( 'SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLI-KATIAGGGVIPHIHKSLIG' ) xenopus_h2a_core = Seq( 'TRSSRAGLQFPVGRVHRLLRKGNYAE-RVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLP' ) # human_h2a_z_core=Seq('SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLIKATIAGGGVIPHIHKSLIG') msa = MultipleSeqAlignment( [SeqRecord(xenopus_h2a_core, id='H2A', name='H2A')]) features = get_hist_ss_in_aln_for_shade(msa, below=True) # features=[{'style':'fill:$\uparrow$','sel':[5,10],'text':'test'}] print features shade_aln2png(msa, filename='default', shading_modes=['charge_functional'], legend=False,
def write_novel_report(novelmiRNALListFile, featureFile, clusterFile, rnafoldCmdTmp):
    """Write the novel miRNA CSV report and one structure PDF per novel miRNA.

    Passes performed, in order:

    1. Read cluster names and probabilities from ``novelmiRNALListFile``
       (comma-separated, needs 'probability' and 'clusterName' columns).
    2. Read ``featureFile`` (tab-separated) and, for paired clusters, copy the
       pruned precursor sequence/structure onto the neighbouring line.
    3. Write ``featureFile[:-4] + '_corrected.tsv'`` with armType forced to
       'loop' when the cluster's structure contains a '(...)' span.
    4. Re-read the corrected file to build per-cluster features and a map
       from (precursor sequence, chr, strand) to cluster names.
    5. Parse ``clusterFile`` to collect the reads of each retained cluster.
    6. Write ``<sample>_novel_miRNAs_report.csv`` next to ``clusterFile`` and,
       for every non-loop entry, run the command in ``rnafoldCmdTmp`` through
       the shell and call ``creatPDF``.

    NOTE(review): the nesting of a few statements was inferred from the code's
    logic -- confirm against the original layout.
    NOTE(review): file paths are interpolated unquoted into an ``os.system``
    command line; paths containing spaces or shell metacharacters will break
    (or be interpreted by the shell).
    """
    dir_tmp = os.path.split(os.path.abspath(clusterFile))[0]
    samppleName_tmp = '_'.join(novelmiRNALListFile.split('_')[:-3])
    # Cluster names still waiting to be reported, in file order.
    novelmiRNALOriginalList = []
    clusterNameProbabilityDic = {}
    with open(novelmiRNALListFile, 'r') as inf:
        # Header line gives the column positions.
        line = inf.readline()
        probabilityIndex = line.strip().split(',').index('probability')
        clusterNameIndex = line.strip().split(',').index('clusterName')
        line = inf.readline()
        while line != '':
            content = line.strip().split(',')
            probability = content[probabilityIndex]
            clusterName = content[clusterNameIndex]
            if clusterName not in novelmiRNALOriginalList:
                novelmiRNALOriginalList.append(clusterName)
                clusterNameProbabilityDic.update({clusterName: probability})
            line = inf.readline()
    # The majority of the paired miRNAs' pruned precursor RNA sequences should
    # be identical. However, they might differ slightly at the head and tail.
    # This happens with quite low probability, due to pruning error.
    # Therefore the precursor of the paired miRNA of an identified novel miRNA
    # is corrected here so that it can be plotted.
    with open(featureFile, 'r') as inf:
        totalContent = inf.readlines()
    with open(featureFile, 'r') as inf:
        line = inf.readline()
        content = line.strip().split('\t')
        clusterNameIndex = content.index('clusterName')
        pruned_precusor_seqIndex = content.index('pruned_precusor_seq')
        pruned_precusor_strIndex = content.index('pruned_precusor_str')
        upstreamDistanceIndex = content.index('upstreamDistance')
        downstreamDistanceIndex = content.index('downstreamDistance')
        armTypeIndex = content.index('armType')
        pair_stateIndex = content.index('pair_state')
        # i is the index in totalContent of the line currently held in `line`
        # (index 0 is the header).
        i = 1
        line = inf.readline()
        while line != '':
            content = line.strip().split('\t')
            if content[clusterNameIndex] in novelmiRNALOriginalList:
                armTypeTmp = content[armTypeIndex]
                pair_state = content[pair_stateIndex]
                pruned_precusor_seq = content[pruned_precusor_seqIndex]
                pruned_precusor_str = content[pruned_precusor_strIndex]
                upstreamDistance = content[upstreamDistanceIndex]
                downstreamDistance = content[downstreamDistanceIndex]
                if pair_state == 'Yes':
                    if upstreamDistance != 'None':
                        if int(upstreamDistance) <= 44:
                            # Candidate partner is the previous line.
                            correctLineContent = totalContent[
                                i - 1].strip().split('\t')
                            if (correctLineContent[pair_stateIndex]
                                ) == 'Yes' and (
                                    correctLineContent[armTypeIndex] in [
                                        'arm5', 'arm3'
                                    ]) and (correctLineContent[armTypeIndex] !=
                                            armTypeTmp):
                                correctLineContent[
                                    pruned_precusor_seqIndex] = pruned_precusor_seq
                                correctLineContent[
                                    pruned_precusor_strIndex] = pruned_precusor_str
                                totalContent[
                                    i - 1] = '\t'.join(correctLineContent) + '\n'
                    elif int(downstreamDistance) <= 44:
                        # Candidate partner is the next line.
                        correctLineContent = totalContent[
                            i + 1].strip().split('\t')
                        if (correctLineContent[pair_stateIndex]
                            ) == 'Yes' and (
                                correctLineContent[armTypeIndex] in [
                                    'arm5', 'arm3'
                                ]) and (correctLineContent[armTypeIndex] !=
                                        armTypeTmp):
                            correctLineContent[
                                pruned_precusor_seqIndex] = pruned_precusor_seq
                            correctLineContent[
                                pruned_precusor_strIndex] = pruned_precusor_str
                            totalContent[
                                i + 1] = '\t'.join(correctLineContent) + '\n'
                    else:
                        pass
            line = inf.readline()
            i = i + 1
    with open(featureFile[:-4] + '_corrected.tsv', 'w') as outf:
        for t, totalContentTmp in enumerate(totalContent):
            # Correct the armType if it was wrongly allocated: the armType is
            # based on the stableClusterSeq, and this sequence may be too short.
            # So use the clusterSeq to re-evaluate the armType.
            if t == 0:
                # Header row: record column positions and copy it through.
                content = totalContentTmp.strip().split('\t')
                clusterNameIndex = content.index('clusterName')
                clusterSeqIndex = content.index('clusterSeq')
                stableClusterSeqIndex = content.index('stableClusterSeq')
                pruned_precusor_seqIndex = content.index('pruned_precusor_seq')
                pruned_precusor_strIndex = content.index('pruned_precusor_str')
                armTypeIndex = content.index('armType')
                outf.write(totalContentTmp)
            else:
                content = totalContentTmp.strip().split('\t')
                clusterSeq = content[clusterSeqIndex]
                # Replace every T with U, one character at a time.
                clusterSeqNew = ''
                for nucl in clusterSeq:
                    if nucl == 'T':
                        clusterSeqNew = clusterSeqNew + 'U'
                    else:
                        clusterSeqNew = clusterSeqNew + nucl
                stableClusterSeq = content[stableClusterSeqIndex]
                pruned_precusor_seq = content[pruned_precusor_seqIndex]
                pruned_precusor_str = content[pruned_precusor_strIndex]
                clusterSeqStr = getMiRNAStructure(clusterSeqNew,
                                                  pruned_precusor_seq,
                                                  pruned_precusor_str)
                # Matches a '(' followed later by a ')' in the structure string.
                pattern = re.compile('\(.*\)')
                if pattern.search(clusterSeqStr) is not None:
                    if content[armTypeIndex] == 'loop':
                        outf.write(totalContentTmp)
                    else:
                        content[armTypeIndex] = 'loop'
                        outf.write('\t'.join(content) + '\n')
                else:
                    outf.write(totalContentTmp)
    # clusterName -> [chr, startPos, endPos, strand, stableClusterSeq,
    #                 seqCount, readCountSum, armType, precusorSeq, precusorStr]
    clusterNameFeatureDic = {}
    # (precusorSeq, chr, strand) -> list of cluster names sharing that precursor
    precursorSeqclusterNameDic = {}
    with open(featureFile[:-4] + '_corrected.tsv', 'r') as inf:
        line = inf.readline()
        content = line.strip().split('\t')
        clusterNameIndex = content.index('clusterName')
        seqCountIndex = content.index('seqCount')
        readCountSumIndex = content.index('readCountSum')
        stableClusterSeqIndex = content.index('stableClusterSeq')
        alignedClusterSeqLabel = content.index('alignedClusterSeq')
        headUnstableLengthLabel = content.index('headUnstableLength')
        tailUnstableLengthLabel = content.index('tailUnstableLength')
        precusorSeqIndex = content.index('pruned_precusor_seq')
        precusorStrIndex = content.index('pruned_precusor_str')
        armTypeIndex = content.index('armType')
        line = inf.readline()
        while line != '':
            content = line.strip().split('\t')
            clusterName = content[clusterNameIndex]
            seqCount = content[seqCountIndex]
            readCountSum = content[readCountSumIndex]
            stableClusterSeq = str(
                Seq(content[stableClusterSeqIndex], generic_dna).transcribe())
            alignedClusterSeq = content[alignedClusterSeqLabel]
            headUnstableLength = int(content[headUnstableLengthLabel])
            tailUnstableLength = int(content[tailUnstableLengthLabel])
            precusorSeq = content[precusorSeqIndex]
            precusorStr = content[precusorStrIndex]
            armType = content[armTypeIndex]
            # The cluster name is parsed as ':'-separated fields; the last
            # character is the strand, field 2 the chromosome and field 3
            # '<start>_<end><strand>'.
            strand = clusterName[-1]
            chr = clusterName.split(':')[2]
            startPos = int(
                clusterName.split(':')[3][:-1].split('_')[0].strip())
            endPos = int(clusterName.split(':')[3][:-1].split('_')[1].strip())
            headDashCountTmp = headDashCount2(alignedClusterSeq)
            tailDashCountTmp = tailDashCount2(alignedClusterSeq)
            # Head and tail swap roles on the minus strand.
            if strand == '+':
                startPos = startPos - headDashCountTmp + headUnstableLength
                endPos = endPos + tailDashCountTmp - tailUnstableLength
            else:
                startPos = startPos - tailDashCountTmp + tailUnstableLength
                endPos = endPos + headDashCountTmp - headUnstableLength
                # Earlier variant, kept for reference:
                #startPos = startPos - headDashCountTmp + tailUnstableLength
                #endPos = endPos + tailDashCountTmp - headUnstableLength
            if precusorSeq != 'None':
                if clusterName not in clusterNameFeatureDic.keys():
                    clusterNameFeatureDic.update({
                        clusterName: [
                            chr, startPos, endPos, strand, stableClusterSeq,
                            seqCount, readCountSum, armType, precusorSeq,
                            precusorStr
                        ]
                    })
                if (precusorSeq, chr,
                        strand) not in precursorSeqclusterNameDic.keys():
                    precursorSeqclusterNameDic.update({
                        (precusorSeq, chr, strand): [clusterName]
                    })
                else:
                    precursorSeqclusterNameDic[(precusorSeq, chr,
                                                strand)].append(clusterName)
            line = inf.readline()
    # clusterName -> list of (readSeq, ReadSeqStart, ReadSeqEnd, readCount)
    clusterNameClusterSeqDic = {}
    # Parse the clusterFile to get the detailed reads information for each cluster.
    with open(clusterFile, 'r') as inf:
        contentTmp = inf.readlines()
    # Indices of the lines that start a cluster section.
    labelList = []
    for index, item in enumerate(contentTmp):
        if 'Cluster Name:' in item:
            labelList.append(index)
    for k in range(len(labelList)):
        # A section runs up to the next label, or to the end of the file.
        if k != len(labelList) - 1:
            subContentTmp = contentTmp[labelList[k]:labelList[k + 1]]
        else:
            subContentTmp = contentTmp[labelList[k]:]
        clusterNameTmp = subContentTmp[0].strip().split(' ')[2]
        if clusterNameTmp in clusterNameFeatureDic.keys():
            startTmp = int(clusterNameTmp[:-1].split(':')[-1].split('_')[0])
            endTmp = int(clusterNameTmp[:-1].split(':')[-1].split('_')[1])
            # Line 4 of the section is used as the dashed cluster sequence.
            if clusterNameTmp[-1] == '+':
                dashedClusterSeqStart = startTmp - headDashCount2(
                    subContentTmp[4].strip())
                dashedClusterSeqEnd = endTmp + tailDashCount2(
                    subContentTmp[4].strip())
            else:
                dashedClusterSeqStart = startTmp - tailDashCount2(
                    subContentTmp[4].strip())
                dashedClusterSeqEnd = endTmp + headDashCount2(
                    subContentTmp[4].strip())
            readContentlist = []
            # Lines from index 5 on are '<dashed read>\t<count>'.
            for subitem in subContentTmp[5:]:
                if clusterNameTmp[-1] == '+':
                    ReadSeqStart = dashedClusterSeqStart + headDashCount2(
                        subitem.split('\t')[0])
                    ReadSeqEnd = dashedClusterSeqEnd - tailDashCount2(
                        subitem.split('\t')[0])
                else:
                    # Earlier variant, kept for reference:
                    #ReadSeqStart = dashedClusterSeqStart + headDashCount2(subitem.split('\t')[0])-2
                    #ReadSeqEnd = dashedClusterSeqEnd - tailDashCount2(subitem.split('\t')[0])-2
                    ReadSeqStart = dashedClusterSeqStart + tailDashCount2(
                        subitem.split('\t')[0])
                    ReadSeqEnd = dashedClusterSeqEnd - headDashCount2(
                        subitem.split('\t')[0])
                readCount = int(subitem.strip().split('\t')[1])
                readSeq = str(
                    Seq(removeDash(subitem.split('\t')[0]),
                        generic_dna).transcribe())
                readContentlist.append(
                    (readSeq, ReadSeqStart, ReadSeqEnd, readCount))
            clusterNameClusterSeqDic.update({clusterNameTmp: readContentlist})
    # Output the novel miRNA report csv file
    outf1 = open(
        os.path.join(
            os.path.join(dir_tmp,
                         samppleName_tmp + '_novel_miRNAs_report.csv')), 'w')
    outf1.write(
        'Novel miRNA name,Probability,Chr,Start Pos,End Pos,Strand,Mature miRNA sequence,Arm type,Passenger miRNA sequence,Mature miRNA read Count,Passenger miRNA read Count,Precursor miRNA sequence,Precursor miRNA structure\n'
    )
    # Running number used in the 'novel_miRNA_<i>' names.
    i = 1
    # Work through the list until every cluster name has been consumed;
    # names are removed at the end of each chunk below.
    while len(novelmiRNALOriginalList) >= 1:
        novelmiRNA = novelmiRNALOriginalList[0]
        sampleName = '_'.join(novelmiRNA.split(':')[0].split('_')[2:])
        precursorSeq = clusterNameFeatureDic[novelmiRNA][-2]
        clusterNameList = precursorSeqclusterNameDic[(
            precursorSeq, clusterNameFeatureDic[novelmiRNA][0],
            clusterNameFeatureDic[novelmiRNA][3])]
        for clusterNameListTmp in chunkInto2(clusterNameList):
            # Decide which cluster is the mature miRNA and which the passenger.
            if len(clusterNameListTmp) == 1:
                matureMiRNAName = clusterNameListTmp[0]
                passengerMiRNAName = 'None'
            elif len(clusterNameListTmp) == 2:
                if clusterNameListTmp[0] in clusterNameProbabilityDic.keys(
                ) and clusterNameListTmp[1] in clusterNameProbabilityDic.keys(
                ):
                    # Both are predicted: the one with more reads is mature.
                    if int(
                            clusterNameFeatureDic[clusterNameListTmp[0]][6]
                    ) >= int(clusterNameFeatureDic[clusterNameListTmp[1]][6]):
                        matureMiRNAName = clusterNameListTmp[0]
                        passengerMiRNAName = clusterNameListTmp[1]
                    else:
                        matureMiRNAName = clusterNameListTmp[1]
                        passengerMiRNAName = clusterNameListTmp[0]
                elif clusterNameListTmp[0] in clusterNameProbabilityDic.keys(
                ) and clusterNameListTmp[
                        1] not in clusterNameProbabilityDic.keys():
                    matureMiRNAName = clusterNameListTmp[0]
                    passengerMiRNAName = clusterNameListTmp[1]
                else:
                    matureMiRNAName = clusterNameListTmp[1]
                    passengerMiRNAName = clusterNameListTmp[0]
                # A passenger whose armType is 'loop' is discarded.
                if clusterNameFeatureDic[passengerMiRNAName][7] == 'loop':
                    passengerMiRNAName = 'None'
            chr = clusterNameFeatureDic[matureMiRNAName][0]
            startPos = clusterNameFeatureDic[matureMiRNAName][1]
            endPos = clusterNameFeatureDic[matureMiRNAName][2]
            strand = clusterNameFeatureDic[matureMiRNAName][3]
            matureMiRNASeq = clusterNameFeatureDic[matureMiRNAName][4]
            readCountSumMatureMiRNA = clusterNameFeatureDic[matureMiRNAName][6]
            armType = clusterNameFeatureDic[matureMiRNAName][7]
            matureMiRNAPrecusorSeq = clusterNameFeatureDic[matureMiRNAName][8]
            matureMiRNAPrecusorStr = clusterNameFeatureDic[matureMiRNAName][9]
            matureReadContentlistRaw = clusterNameClusterSeqDic[
                matureMiRNAName]
            if passengerMiRNAName == 'None':
                # The string 'None' (not the None object) is the placeholder
                # written to the report.
                passengerMiRNASeq = 'None'
                passengerReadContentlistRaw = 'None'
                passengerReadContentlist = 'None'
                readCountSumPassengerMiRNA = 'None'
                passengerStrand = 'None'
            else:
                passengerMiRNASeq = clusterNameFeatureDic[passengerMiRNAName][
                    4]
                readCountSumPassengerMiRNA = clusterNameFeatureDic[
                    passengerMiRNAName][6]
                passengerReadContentlistRaw = clusterNameClusterSeqDic[
                    passengerMiRNAName]
                passengerStartPos = clusterNameFeatureDic[passengerMiRNAName][
                    1]
                passengerEndPos = clusterNameFeatureDic[passengerMiRNAName][2]
                passengerStrand = clusterNameFeatureDic[passengerMiRNAName][3]
            # Pad the head and tail part with '.' according to the alignment to the precursor miRNA.
            matureReadContentlist = padClusteredList(matureReadContentlistRaw,
                                                     matureMiRNASeq,
                                                     matureMiRNAPrecusorSeq,
                                                     startPos, endPos, strand)
            if passengerReadContentlistRaw != 'None':
                passengerReadContentlist = padClusteredList(
                    passengerReadContentlistRaw, passengerMiRNASeq,
                    matureMiRNAPrecusorSeq, passengerStartPos, passengerEndPos,
                    passengerStrand)
                totalReadCountSum = int(readCountSumMatureMiRNA) + int(
                    readCountSumPassengerMiRNA)
            else:
                passengerReadContentlist = 'None'
                totalReadCountSum = int(readCountSumMatureMiRNA)
            if armType != 'loop':
                novelmiRNANameNew = 'novel_miRNA_' + str(i)
                outf1.write(','.join([
                    novelmiRNANameNew,
                    clusterNameProbabilityDic[matureMiRNAName], chr,
                    str(startPos),
                    str(endPos), strand, matureMiRNASeq, armType,
                    passengerMiRNASeq,
                    str(readCountSumMatureMiRNA),
                    str(readCountSumPassengerMiRNA), matureMiRNAPrecusorSeq,
                    matureMiRNAPrecusorStr
                ]))
                outf1.write('\n')
                # Prepare to plot the precursor structure and cluster sequences into a pdf file.
                with open(os.path.join(dir_tmp, 'precusorTmp.fa'),
                          'w') as outf:
                    fa_tmp = '\n'.join([
                        '>' + novelmiRNANameNew, matureMiRNAPrecusorSeq,
                        matureMiRNAPrecusorStr
                    ])
                    outf.write(fa_tmp + '\n')
                f1 = os.path.join(dir_tmp, 'precusorTmp.fa')
                f2 = os.path.join(dir_tmp, 'precusorTmp.str')
                # Run from dir_tmp; f3 below is presumably produced there by
                # this command -- TODO confirm.
                os.system('cd %s && %s -d 0 < %s > %s' %
                          (dir_tmp, rnafoldCmdTmp, f1, f2))
                f3 = os.path.join(dir_tmp, 'novel_miRNA_' + str(i) + '_ss.ps')
                f4 = os.path.join(dir_tmp, 'novel_miRNA_' + str(i) + '.pdf')
                creatPDF(sampleName, novelmiRNANameNew,
                         clusterNameProbabilityDic[matureMiRNAName], chr,
                         startPos, endPos, strand, armType,
                         readCountSumMatureMiRNA, totalReadCountSum,
                         matureMiRNAPrecusorSeq, matureMiRNAPrecusorStr,
                         matureMiRNASeq, passengerMiRNASeq, f3, f4,
                         matureReadContentlist, passengerReadContentlist)
                i = i + 1
            # Delete the clusterNames in clusterNameListTmp
            for clusterName in clusterNameListTmp:
                if clusterName in novelmiRNALOriginalList:
                    novelmiRNALOriginalList.remove(clusterName)
    outf1.close()
def main(alignment_file, edlevel_step, confint_p, spec_ids_str, permut_n, orf_crd_table, nucl_list):
    """Count synonymous/non-synonymous sites and substitutions per editing-level bin.

    For every alignment yielded by ``align_parse(alignment_file)`` that
    contains both species listed in ``spec_ids_str``, the ORF positions of the
    first species are scanned.  Positions that are ``'A'`` in the first
    species, or that have an entry in its ``edinfo_dict`` (looked up with the
    key ``i + 1``), are classified as synonymous or non-synonymous and added
    to the counters of the objects stored in ``edlevel_dict``.  Finally every
    bin object prints its statistics via ``p_and_confint_print``.

    Args:
        alignment_file: passed unchanged to ``align_parse``.
        edlevel_step: passed unchanged to ``edlevel_dict_init``.
        confint_p: forwarded to ``p_and_confint_print``.
        spec_ids_str: comma-separated species ids; only the first two are used.
        permut_n: forwarded to ``p_and_confint_print``.
        orf_crd_table: passed unchanged to ``make_orf_crd_dict``.
        nucl_list: iterable of nucleotide letters tried at the scanned position.

    Raises:
        IndexError: if ``spec_ids_str`` holds fewer than two ids or no
            editing-level bins exist.
        ZeroDivisionError: if a denominator of the baseline dN/dS ratio is 0.

    NOTE(review): the statement nesting below was reconstructed from a source
    whose indentation had been lost; the two spots marked NOTE(review) are
    the ambiguous ones and should be confirmed against the original file.
    """
    global aminoacids
    nucl_list = list(nucl_list)
    aminoacids = list(aminoacids)
    spec_ids_list = spec_ids_str.split(',')
    spec1, spec2 = spec_ids_list[0], spec_ids_list[1]
    edlevel_dict = edlevel_dict_init(edlevel_step)
    edlevels = sorted(edlevel_dict.keys())
    n_levels = len(edlevels)
    orf_crd_dict = make_orf_crd_dict(orf_crd_table)

    for align_obj in align_parse(alignment_file):
        if (spec1 not in align_obj.species_list) or (spec2 not in align_obj.species_list):
            continue
        # list(...) keeps this working on Python 3, where dict.keys() is a
        # view that cannot be indexed.
        seq_id = list(align_obj.seqinfo_dict[spec1].keys())[0]
        orf_crds = orf_crd_dict[seq_id]
        seq1 = align_obj.align_dict[spec1]
        seq2 = align_obj.align_dict[spec2]
        edinfo1 = align_obj.edinfo_dict[spec1]
        edinfo2 = align_obj.edinfo_dict[spec2]

        for i in range(orf_crds[0], orf_crds[1]):
            ed1 = edinfo1.get(i + 1)
            # Only 'A' positions and positions with editing info are scanned.
            if seq1[i] != 'A' and not ed1:
                continue
            # NOTE(review): assumed to sit at loop level, not inside the
            # "!= 'A'" branch above.
            if seq2[i] == "-":
                continue
            # A site with editing info in the second species is treated as 'A'.
            if edinfo2.get(i + 1):
                let2 = 'A'
            else:
                let2 = seq2[i]
            let1 = 'A'

            codon, aacid, shift = get_codon(align_obj, spec1, orf_crds[0], i)

            # The site is synonymous only if every letter of nucl_list at
            # this codon position translates to the same amino acid.
            syn = True
            for nucl in nucl_list:
                codon_new = codon[:]
                codon_new[shift] = nucl
                if aacid != str(Seq(''.join(codon_new)).translate()):
                    syn = False
            if syn and aacid not in aminoacids:
                continue

            # Site totals: edited sites go to the bin whose [lower, upper)
            # range holds their editing level; other sites are added to
            # every bin.
            if ed1:
                edlevel = ed1.edlevel
                if syn:
                    for j in range(1, n_levels):
                        if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
                            edlevel_dict[edlevels[j - 1]].total_S_E += 1
                else:
                    for j in range(1, n_levels):
                        if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
                            edlevel_dict[edlevels[j - 1]].total_N_E += 1
            else:
                if syn:
                    for j in range(1, n_levels):
                        edlevel_dict[edlevels[j - 1]].total_S_A += 1
                else:
                    for j in range(1, n_levels):
                        edlevel_dict[edlevels[j - 1]].total_N_A += 1

            if let1 == let2:
                continue

            # NOTE(review): the substitution counting is assumed to be
            # nested inside this membership test.
            if let2 in nucl_list:
                codon_new = codon[:]
                codon_new[shift] = let2
                aacid_new = str(Seq(''.join(codon_new)).translate())
                syn = aacid == aacid_new
                if ed1:
                    edlevel = ed1.edlevel
                    if syn:
                        for j in range(1, n_levels):
                            if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
                                edlevel_dict[edlevels[j - 1]].dS_e += 1
                    else:
                        for j in range(1, n_levels):
                            if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
                                edlevel_dict[edlevels[j - 1]].dN_e += 1
                else:
                    if syn:
                        for j in range(1, n_levels):
                            edlevel_dict[edlevels[j - 1]].dS += 1
                    else:
                        for j in range(1, n_levels):
                            edlevel_dict[edlevels[j - 1]].dN += 1

    # Baseline dN/dS of unedited sites, read from the lowest bin.
    s = edlevel_dict[edlevels[0]]
    dnds_a = (float(s.dN) / s.total_N_A) / (float(s.dS) / s.total_S_A)
    for i in edlevels:
        edlevel_dict[i].p_and_confint_print(confint_p, spec_ids_str, permut_n, dnds_a)
print refseq sys.exit('Read %s is empty' % f_align.get_all_seqs()[1].id) if options.threshold < idencount / basecount and basecount > min_length: reads[ID] = tmp ref[ID] = refseq if len(refseq) > maxlen[0]: maxlen[0] = len(refseq) maxlen[1] = ID ID += 1 countreads_afterreverse += 1 else: ts = r_align.get_seq_by_num(1).tostring() tid = f_align.get_all_seqs()[1].id disc_seq.append( SeqRecord(Seq(ts, generic_nucleotide), id=tid, description='')) tseq = f_align.get_all_seqs()[1] disc_h.write('>%s\n' % tseq.id) disc_h.write(tseq.seq.tostring().replace('-', '') + '\n') disc_h.close() print >> sys.stderr, '%d reads were above the threshold (discarded), %d reads left' % ( countreads - countreads_afterreverse, countreads_afterreverse) try: print >> sys.stderr, 'dropped %d reads with wron length' % sff_droppedreads_length except: pass print >> sys.stderr, 'forward: %d, reverse: %d' % (count_forward, count_reverse)
pHash[pid] = 1 #print "############################ pid = " + str(pid) if pid != int(patternId) and int(patternId) != 0: #i += 1 # result query number may not equal to num !!! continue box = random.randrange(r, r + klength) for p in range(r, r + klength): if boxsize >= 2 and p == box: query += "(" for t in range(0, boxsize): query += alphabet[t] query += ")" else: query += seqStr[p] # id is very important for multiple K but name and desc is optional # and only for user to check query info query_seq = SeqIO.SeqRecord(Seq(query, generic_dna), id=str(i), description="dim=" + str(klength)) query_list.append(query_seq) i += 1 SeqIO.write(query_list, query_file, "fasta") query_file.close() for key in pHash: patternDistributionFile.write(str(key) + " " + str(pHash[key]) + "\n") patternDistributionFile.close() #print aln_ref+'\n'+aln_sample
# Per-variant classification of K562 RNA reads.  Reads containing `exon2` are
# kept, positions 114-133 of each kept read are extracted, and the extracted
# windows are counted against three reference 20-mers.
rnak562 = pd.DataFrame()
exon2 = 'AGAACTCCACAAACCCATC'
exon1 = 'CTTGGAAGGCCGTCTCGTGG'
cryptic = pd.Series()
downstream1 = 'CTCTCTAAAAAAAATCCTTC'

for var in rnareads.index:
    # The cell is a space-separated string; the first token is discarded.
    rr = pd.Series(rnareads.loc[var, 'K562'].split(' '))
    rr.drop(0, inplace=True)

    # Discard reads that do not contain exon2.
    for i in rr.index:
        if exon2 in rr.loc[i]:
            continue
        rr.drop(i, inplace=True)

    rrup = rr.apply(lambda x: x[114:134])
    # Reverse complement of the variant's 142-161 window, upper-cased.
    casexon = str(Seq(
        lib300.varseq[var][142:162]).reverse_complement()).upper()

    # Count how often each reference 20-mer occurs among the read windows.
    counts = rrup.value_counts()
    for column, target in (('incl', casexon),
                           ('excl', exon1),
                           ('cryptic', downstream1)):
        if target in rrup.values:
            rnak562.loc[var, column] = counts[target]
        else:
            rnak562.loc[var, column] = 0

    rnak562.loc[var, 'rnareads'] = len(rr)
    rnak562.loc[var, 'rawreads'] = ' '.join(rr)
trained_mm = trainer.train([known_training_seq]) if VERBOSE: print trained_mm.transition_prob print trained_mm.emission_prob test_rolls, test_states = generate_rolls(300) predicted_states, prob = trained_mm.viterbi(test_rolls, DiceTypeAlphabet()) if VERBOSE: print "Prediction probability:", prob Utilities.pretty_print_prediction(test_rolls, test_states, predicted_states) # -- Baum-Welch training without known state sequences print "Training with Baum-Welch..." training_seq = Trainer.TrainingSequence(rolls, Seq("", DiceTypeAlphabet())) trainer = Trainer.BaumWelchTrainer(baum_welch_mm) trained_mm = trainer.train([training_seq], stop_training) if VERBOSE: print trained_mm.transition_prob print trained_mm.emission_prob test_rolls, test_states = generate_rolls(300) predicted_states, prob = trained_mm.viterbi(test_rolls, DiceTypeAlphabet()) if VERBOSE: print "Prediction probability:", prob Utilities.pretty_print_prediction(test_rolls, test_states, predicted_states)
""" dirFrameCheck = src.check_create_dir(dirTmp + os.sep + "framecheck") strTmpMarkers = dirFrameCheck + os.sep + "FirstMarkers.faa" fOut = open(strTmpMarkers, 'w') iMLength = args.iMLength iTotLength = args.iTotLength for gene in SeqIO.parse(open(dirTmp + os.sep + 'premarkers.txt'), "fasta"): iCount = 1 iRemSeq = iTotLength mtch = re.search('\+', str(gene.seq)) if not mtch: strMarker = str(gene.seq) geneMarker = SeqRecord(Seq(strMarker[0:min(iRemSeq, len(strMarker))]), id=gene.id + "_#" + str(iCount).zfill(2) + '\n', description="") SeqIO.write(geneMarker, fOut, "fasta") iRemSeq = iRemSeq - len(geneMarker.seq) else: for strMarker in (re.split('\++', str(gene.seq))): if (iRemSeq >= iMLength and len(strMarker) >= iMLength): geneMarker = SeqRecord( Seq(strMarker[0:min(iRemSeq, len(strMarker))]), id=gene.id + "_#" + str(iCount).zfill(2) + '\n', description="") SeqIO.write(geneMarker, fOut, "fasta") iCount += 1 iRemSeq = iRemSeq - len(geneMarker)
def mapgenome(genome_fn, reference_fn, delta_prefix):
    """Map a genome onto the first reference sequence with nucmer/show-aligns.

    Runs ``nucmer`` on ``reference_fn`` versus ``genome_fn``, then
    ``show-aligns`` for every record of ``genome_fn`` against the first
    record of ``reference_fn``.  The aligned genome letters are written into
    a reference-length array; positions covered by more than one alignment
    or by none end up as ``'-'``.

    Args:
        genome_fn: path of the genome FASTA file.
        reference_fn: path of the reference FASTA file; only its first
            record is used.
        delta_prefix: prefix for the temporary ``.delta`` and ``.align``
            files, which are removed again.

    Returns:
        A ``SeqRecord`` holding the upper-cased mapped sequence, whose id is
        the genome file's base name with whitespace runs replaced by ``_``.

    Raises:
        Exception: carrying the tool's stderr if nucmer or show-aligns exits
            non-zero (except the "no alignments" case of show-aligns, which
            just skips that record).
        StopIteration: if ``reference_fn`` contains no record.
    """
    NO_ALIGN_STR = "ERROR: Could not find any alignments for"

    def _as_text(data):
        # Pipes yield bytes on Python 3; decode so the str comparison and
        # the exception message work on both major versions.
        if isinstance(data, bytes) and not isinstance(data, str):
            return data.decode('utf-8', 'replace')
        return data

    # run nucmer
    cmd = [nucmer_path, '--prefix', delta_prefix, reference_fn, genome_fn]
    with open(os.devnull, 'w') as devnull:
        proc = subprocess.Popen(cmd, stdout=devnull, stderr=subprocess.PIPE)
        _, out_stderr = proc.communicate()
    if proc.returncode:
        raise Exception(_as_text(out_stderr))
    delta_fn = delta_prefix + '.delta'

    # get the first reference sequence only
    ref_record = next(SeqIO.parse(reference_fn, 'fasta'))
    ref_seqid = ref_record.id
    ref_len = len(ref_record)

    # '.' marks reference positions not (yet) covered by any alignment.
    genome_seq_mapped = np.asarray(['.'] * ref_len)
    # Builtin int replaces the np.int alias removed in NumPy 1.24.
    genome_count_mapped = np.zeros(ref_len, dtype=int)

    for record in SeqIO.parse(genome_fn, 'fasta'):
        genome_seqid = record.id

        # run show-aligns
        align_fn = delta_prefix + '.align'
        with open(align_fn, 'wb') as align_handler:
            cmd = [show_aligns_path, delta_fn, ref_seqid, genome_seqid]
            proc = subprocess.Popen(cmd, stdout=align_handler,
                                    stderr=subprocess.PIPE)
            _, out_stderr = proc.communicate()
        if proc.returncode:
            out_stderr = _as_text(out_stderr)
            if out_stderr.startswith(NO_ALIGN_STR):
                # No alignment for this record is not an error.
                os.remove(align_fn)
                continue
            raise Exception(out_stderr)

        # load alignments
        with open(align_fn, 'rb') as align_handler:
            mar = strainest.mummer.MummerAlignmentReader(align_handler)
            for al in mar:
                ref_seq = np.asarray(list(al.seq1))
                genome_seq = np.asarray(list(al.seq2))
                # Drop columns where the reference row holds '.', so the
                # remainder lines up with reference coordinates.
                genome_seq = genome_seq[ref_seq != '.']
                genome_seq_mapped[al.start1 - 1:al.end1] = genome_seq
                genome_count_mapped[al.start1 - 1:al.end1] += 1
        os.remove(align_fn)
    os.remove(delta_fn)

    # remove repeats
    genome_seq_mapped[genome_count_mapped > 1] = '.'

    # sub '.' with '-'
    genome_seq_mapped[genome_seq_mapped == '.'] = '-'

    # write mapped genome
    out_seq = Seq(''.join(genome_seq_mapped)).upper()
    out_id = re.sub(r'\s+', '_', os.path.basename(genome_fn))
    out_record = SeqRecord(out_seq, id=out_id, description='')
    return out_record
def createAnalysis(self, seq_str, batch_ary):
    """Return a Restriction.Analysis of ``seq_str`` for the enzymes in ``batch_ary``."""
    return Restriction.Analysis(Restriction.RestrictionBatch(batch_ary),
                                Seq(seq_str))
# Walk-through of basic Bio.Seq operations: construction, transcription,
# translation, mutable sequences and substring search.
# NOTE(review): Bio.Alphabet and Seq.tomutable() are not available in recent
# Biopython releases -- confirm the Biopython version this script targets.
import Bio
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein, generic_rna

# Build a DNA sequence and list its public (non-underscore) attributes.
my_gene = Seq("ACTAGCAGCGGA", generic_dna)
print(type(my_gene))
attributes = [a for a in dir(my_gene) if not a.startswith("_")]
print(attributes)

# Transcribe the DNA and show the alphabet attached to the result.
my_transcript = my_gene.transcribe()
print(my_transcript)
print(my_transcript.alphabet)

# Translate the DNA and show the alphabet attached to the result.
my_protein = my_gene.translate()
print(my_protein)
print(my_protein.alphabet)

# Translate a longer coding sequence with to_stop=True.
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", generic_dna)
myprot = coding_dna.translate(to_stop=True)
print(myprot)

# seq2 is created but never used below.
seq1 = Seq("AAACGGA", generic_dna)
seq2 = Seq("GGAGAT", generic_dna)

# Obtain a mutable copy of seq1 and replace its first letter.
mut_seq = seq1.tomutable()
# Bare expression: has no effect when run as a script.
mut_seq
mut_seq[0] = "G"
print(mut_seq)

myseq = Seq("CCAGAAACCCGGAA", generic_dna)
# Find the first occurrence of the pattern.
print(myseq.find("GAA"))
def setUp(self):
    """Store EcoRI, SmaI and KpnI recognition sites, each flanked by AAAA."""
    flank = Seq("AAAA")
    fixtures = (
        ("ecosite_seq", EcoRI),
        ("smasite_seq", SmaI),
        ("kpnsite_seq", KpnI),
    )
    for attr_name, enzyme in fixtures:
        setattr(self, attr_name, flank + Seq(enzyme.site) + flank)
def get_custom_fasta(ref_fasta,subsectionlist,args,model_kmer_means,kmer_len):
    """Build scaled k-mer level profiles for selected sub-regions of a FASTA.

    For every sequence id in ``subsectionlist`` that is present in
    ``ref_fasta``, the requested sections are cut out and concatenated.  The
    forward strand and its reverse complement are then turned into lists of
    model means (one per k-mer, looked up in ``model_kmer_means``) and
    standardised with ``sklearn.preprocessing.scale``.

    Args:
        ref_fasta: FASTA file (path or handle) readable by ``SeqIO.parse``.
        subsectionlist: mapping of sequence id to an iterable of sections;
            each section is indexed as ``section[0]`` (start) and
            ``section[1]`` (end).
        args: object with a ``verbose`` attribute.
        model_kmer_means: mapping of k-mer string to a value convertible to
            ``float``.
        kmer_len: k-mer length.

    Returns:
        ``(seqids, array)`` where ``seqids`` is a tuple of the first elements
        returned by ``processItems`` and ``array`` is a ``float32`` numpy
        array stacked from the second elements.

    Raises:
        KeyError: if a k-mer is missing from ``model_kmer_means``.
        ValueError: if no requested sequence was found (nothing to unpack).
    """
    if (args.verbose is True):
        print ("Generating a custom fasta")

    # Read the reference once instead of once per requested sequence.
    extracted = dict()
    record_lengths = dict()
    for record in SeqIO.parse(ref_fasta, 'fasta'):
        if record.id not in subsectionlist:
            continue
        record_lengths[record.id] = len(record.seq)
        pieces = extracted.setdefault(record.id, [])
        for sections in subsectionlist[record.id]:
            # NOTE(review): with 1-based inclusive coordinates this slice
            # omits the base at `end`; kept as in the original -- confirm.
            pieces.append(str(record.seq[sections[0]-1:sections[1]-1]))

    # Keep the order in which the ids appear in subsectionlist.
    sequencedict = dict()
    for sequence in subsectionlist:
        if (args.verbose is True):
            print (sequence)
        if sequence in extracted:
            sequencedict[sequence] = ''.join(extracted[sequence])

    if (args.verbose is True):
        print ("processing the custom fasta")

    def _kmer_levels(strand):
        # Model mean for every k-mer window along the strand.
        return [float(model_kmer_means[str(strand[x:x+kmer_len])])
                for x in range(len(strand)+1-kmer_len)]

    kmer_means = dict()
    for sequence in sequencedict:
        # Key and report by the sequence being processed; the previous code
        # used the loop variable left over from reading the FASTA, so every
        # result was stored under the id of the last record in the file.
        print ("ID", sequence)
        print ("length", record_lengths[sequence])
        print ("FORWARD STRAND")
        seq = Seq(sequencedict[sequence])
        forward = _kmer_levels(seq)
        print ("REVERSE STRAND")
        reverse = _kmer_levels(seq.reverse_complement())
        scaled = dict()
        scaled["Fprime"] = sklearn.preprocessing.scale(forward, axis=0, with_mean=True, with_std=True, copy=True)
        scaled["Rprime"] = sklearn.preprocessing.scale(reverse, axis=0, with_mean=True, with_std=True, copy=True)
        kmer_means[sequence] = scaled

    # From this dictionary we return a pair: the sequence ids (lookup keys)
    # and a stacked array built from what processItems returns per entry.
    # Caution (from the original author): the dictionary may not come back
    # in the expected order.
    items_ = map(processItems, kmer_means.items())
    seqids, arrays = zip(*items_)
    print (arrays)
    # NOTE(review): the original allocated a multiprocessing.Array here and
    # then immediately rebound the name to this plain float32 array, so the
    # returned value was never shared memory.  The dead allocation has been
    # dropped and the returned value is unchanged; if shared memory is
    # wanted, the data must be copied into the shared buffer instead.
    threedarrayshared_array = np.array(arrays, dtype=np.float32)
    return seqids, threedarrayshared_array
def test_recognition_site_on_both_strands(self):
    """EarI.search must report the cut positions for sites on either strand."""
    cut_positions = EarI.search(Seq("CTCTTCGAAGAG"))
    self.assertEqual(cut_positions, [3, 8])
def extract_kmers(name, fasta, length, pams, pampos, filename, chroms=None,
                  minchrlen=10000, processes=1):
    """Extract candidate k-mer guideRNAs with their coordinates from FASTA.

    Convention: coordinate reported is for start position of the whole probe
    in the genome in 0-based coordinates; probe includes guideRNA and PAM
    (PAM can be before or after the guide); then for plus strand probe
    continues to the right, for minus strand probe continues to the left

    Args:
    name: project name, a folder with this name containing intermediate
          and final files in it
    fasta: iterator over Bio.SeqRecord.SeqRecord objects containing
           chromosomes as returned by load_fasta()
    length: length of guideRNAs (not including PAM sequence)
    pams: list of primary and alternative PAM sequences
    pampos: position of PAM ('start' or 'end')
    filename: all k-mers will be written in this file in the format
              '<k-mer followed by PAM> <coordinates>'
    chroms: if given and not empty, include in analysis only chromosomes
            with names from this list
    minchrlen: include in analysis only chromosomes not shorter than this
    processes: how many processes to use; do not specify more than you have
               on your system

    Return:
    genome info in the format [(<chromosome name>, <chromosome length>)]
    for all processed chromosomes in the order of processing

    Raises:
    util.iGuideError: if pampos is neither 'start' nor 'end'.
    Any exception raised inside a worker process is re-raised here instead
    of being silently dropped.
    """
    if pampos not in ('start', 'end'):
        raise util.iGuideError("'pampos' argument should be 'start' or 'end'")

    pams_extend = [(pam, pam_seq) for pam in pams
                   for pam_seq in util.expand_dna_n(pam)]
    # Reverse complements are derived from the list built above, so the PAMs
    # are expanded only once.
    pams_extend_rev = [(pam, str(Seq(pam_seq).reverse_complement()))
                       for pam, pam_seq in pams_extend]

    genome = []
    fasta_temp = []
    for chrom in fasta:
        if len(chrom) < minchrlen:
            continue
        if chroms and chrom.id not in chroms:
            continue
        genome.append((chrom.id, len(chrom)))
        fasta_temp.append(chrom)

    # Parallelize extracting kmers from the reference genome sequences by
    # user defined processes or the number of reference sequences
    parts = len(fasta_temp)
    if processes > parts:
        processes = parts

    # One temporary file per chromosome; each worker writes to it by name.
    kmersfiles_temp = [
        tempfile.NamedTemporaryFile(dir=name, suffix='.temp%s' % i)
        for i in range(parts)
    ]
    try:
        # With no chromosome left there is nothing to run (and Pool(0)
        # would raise); an empty output file is written instead.
        if parts:
            pool = Pool(processes)
            util.print_log('poolSize %s...' % processes)
            results = [
                pool.apply_async(extract_kmers_pool,
                                 (chrom, length, pampos, pams_extend,
                                  pams_extend_rev, kmers_file.name))
                for chrom, kmers_file in zip(fasta_temp, kmersfiles_temp)
            ]
            util.print_log('Waiting for all subprocesses done...')
            pool.close()
            pool.join()
            # Surface worker failures; apply_async stores them silently.
            for result in results:
                result.get()
        util.print_log('all chromosomes processed')

        util.print_log('done, merge all kmers...')
        total_count = 0
        util.warn_file_exists(filename)
        with gzip.open(filename, 'w') as f:
            for kmers_file in kmersfiles_temp:
                for line in kmers_file:
                    f.write(line)
                    total_count += 1
    finally:
        # Closing a NamedTemporaryFile also deletes it.
        for kmers_file in kmersfiles_temp:
            kmers_file.close()

    util.print_log('total k-mers written: %s' % total_count)
    return genome