def write_fasta_file(cs_file, target, args, configs):
    """Write ``<target>.fasta`` from the sequence found in a chemical-shift file.

    The sequence is read from ``cs_file`` with ``TalosCSFile``.  If
    ``args.fasta`` is set, that FASTA sequence must pass
    ``fasta.compare_fasta(..., strict_length=True)`` against the chemical-shift
    sequence and is then used to fill its gaps.  If ``<target>.fasta`` already
    exists with a different sequence, it is copied to ``<target>.fasta.bak``
    before being overwritten.

    Args:
        cs_file: path of the chemical-shift file.
        target: output prefix; the file written is ``target + '.fasta'``.
        args: options object; only ``args.fasta`` is read here.
        configs: object whose ``target_name`` is passed to ``fasta.write_fasta``.

    Returns:
        The path of the FASTA file that was written.

    The process is terminated via ``sys.exit`` when the two sequences do not
    match, or when the final sequence still contains ``'-'`` gaps.
    """
    from cs import TalosCSFile

    tab = TalosCSFile()
    tab.read_file(cs_file)
    sequence = tab.sequence
    fasta_file = target + '.fasta'

    if args.fasta:
        print('read fasta sequence from %s' % args.fasta)
        target_fasta = fasta.read_fasta(args.fasta)
        print(target_fasta)
        if not fasta.compare_fasta(target_fasta, sequence, strict_length=True):
            sys.exit("\n".join([
                'fasta-sequence does not match sequence in chemical shift file!!! ',
                'FASTA: ' + target_fasta,
                'CS: ' + sequence]))
        sequence = fasta.fill_gaps(target_fasta, sequence)

    if path.exists(fasta_file):
        target_fasta = fasta.read_fasta(fasta_file)
        if '-' not in sequence and target_fasta != sequence:
            print("inconsistent fasta sequence: between chemical shifts %(cs_file)s and fasta file %(fasta_file)s"
                  % {'cs_file': cs_file, 'fasta_file': fasta_file})
            print("will overwrite fasta file, and create backup of original fasta file .bak")
            shutil.copy(fasta_file, fasta_file + ".bak")
            fasta.write_fasta(fasta_file, sequence, configs.target_name)

    if '-' in sequence:
        # sys.exit rather than the interactive-only `exit` helper installed by
        # the `site` module, which is not guaranteed to exist.
        sys.exit('require full sequence information, provide fasta file with -fasta')
    fasta.write_fasta(fasta_file, sequence, configs.target_name)
    return fasta_file
def testGcContentPercentageFromFile(self):
    """The record with the highest GC percentage in the fixture is Rosalind_6020."""
    with open('data/gcContent.fasta') as fp:
        gcContents = {name: gcContentPercentage(seq)
                      for name, seq in read_fasta(fp)}
    # dict.items() exists on Python 2 and 3; iteritems() is Python 2 only.
    maxName, maxGc = max(gcContents.items(), key=operator.itemgetter(1))
    self.assertEqual('Rosalind_6020', maxName)
    self.assertAlmostEqual(51.881994, maxGc, places=5)
def main(m5, fa, method, trim_suffix=False):
    """Print an m5 alignment file with query names mapped back to original read names.

    Each non-blank line of ``m5`` is split on whitespace, the first field
    (the query name) is rewritten according to ``method``, and the fields are
    printed joined by single spaces.

    Args:
        m5: path of the alignment file to read.
        fa: path of the uncorrected FASTA file; required for ``method="sprai"``.
        method: name of the correction tool whose naming scheme is undone:
            ``"ectools"`` (strip from the last ``_corrected``), ``"sprai"``
            (numeric index into the FASTA read names), ``"nanocorr"`` (strip
            from the last ``_consensus``), ``"lorma"`` (strip the trailing
            ``_<index>``), or ``None`` for no renaming.
        trim_suffix: when true, drop everything from the last ``/`` in the name.

    Raises:
        Exception: if ``method`` is ``"sprai"`` and ``fa`` is ``None``.
        ValueError: if an ectools/nanocorr name lacks its expected marker.
    """
    read_names = None
    if method == "sprai":
        if fa is None:
            raise Exception("A matching uncorrected fasta file is required to convert sprai indexed names back to their original")
        import fasta
        _, read_names = fasta.read_fasta(fa)

    with open(m5) as alignments:
        for line in alignments:
            fields = line.split()
            if not fields:
                # blank line: nothing to rename or print
                continue
            n = fields[0]
            if trim_suffix and '/' in n:
                n = n[:n.rindex('/')]
            if method == "ectools":
                n = n[:n.rindex("_corrected")]
            elif method == "sprai":
                # !! reads are 1-indexed as of sprai v0.9.9.23
                # partition() also works when trim_suffix already removed the
                # '/...' part, where index('/') used to raise ValueError.
                n = read_names[int(n.partition('/')[0]) - 1]
            elif method == "nanocorr":
                n = n[:n.rindex("_consensus")]
            elif method == "lorma":
                # lorma adds a "_<index>" to each read, which increments from 1
                # for each subread it was split into
                if '_' in n:
                    n = n[:n.rindex("_")]
            fields[0] = n
            print(' '.join(fields))
def testGcContentPercentageFromFile(self):
    """Check name and GC percentage of the highest-GC record in the fixture."""
    with open('data/gcContent.fasta') as fp:
        gcContents = {
            name: gcContentPercentage(seq)
            for name, seq in read_fasta(fp)
        }
    # items() instead of the Python-2-only iteritems(); the result is the same.
    maxName, maxGc = max(gcContents.items(), key=operator.itemgetter(1))
    self.assertEqual('Rosalind_6020', maxName)
    self.assertAlmostEqual(51.881994, maxGc, places=5)
def setUpClass(cls):
    """Build the fixtures shared by all tests of this class.

    Sets ``cls.sequence``, ``cls.molecule``, ``cls.peak_collection`` (loaded
    from the two assigned peak lists), ``cls.assignments`` and
    ``cls.some_to_remove`` (the first 100 assignments).
    """
    # setup code that really should only run once
    print('initialize ScoreFunctionTestCase...')
    cls.sequence = fasta.read_fasta(data_path + 'gmr137.fasta')
    cls.molecule = AtomTree.from_sequence(cls.sequence)
    cls.peak_collection = PeakCollection()
    for peak_file in ('aroC.peaks', 'n.peaks'):
        peak_path = data_path + 'assigned/' + peak_file
        name = path.splitext(path.basename(peak_path))[0]
        # Close each peak file once it has been parsed instead of leaking it.
        # NOTE(review): assumes read_from_stream consumes the stream eagerly.
        with open(peak_path, 'r') as stream:
            cls.peak_collection.add_experiment(
                PeakList.read_from_stream(name, stream, False))
    cls.assignments = AssignmentCollection.from_hard_assignments(
        cls.peak_collection, cls.molecule)
    cls.some_to_remove = [x for i, x in enumerate(cls.assignments) if i < 100]
def GenerateLastNts(fasta_file, length=150):
    """Write the upper-cased last ``length`` characters of every FASTA record.

    The output is a FASTA file named ``<fasta_file>.-<length>`` with one
    ``>id`` line and one sequence line per record.

    Args:
        fasta_file: path handed to ``fasta.read_fasta``; iterating the result
            yields record ids and indexing it by id yields the sequence.
        length: number of trailing characters to keep from each sequence.

    Returns:
        The path of the file that was written.
    """
    output_file = fasta_file
    output_file += ".-%d" % length
    fa = fasta.read_fasta(fasta_file)
    # `with` guarantees the handle is closed even if a record fails to format.
    with open(output_file, "w") as fout:
        for seq_id in fa:
            tail = fa[seq_id][-length:].upper()
            fout.write(">%s\n%s\n" % (seq_id, tail))
    return output_file
def RNAMapping(fastq_file, fasta_file, output_file):
    '''
    Map reads to RNA ids by exact prefix match and write BLAST-like tabular hits.

    Input format: fastq.gz, fa file
    Output format: Blast matrix format

    python library_checker.py RNA_mapping ../2017-10-19_MiSeq/S1_W_R1.fastq.gz ../data/KRP_set.fa > ./output/2017-10-19_MiSeq/Blastn/S1_W_R1.exact
    python library_checker.py RNA_mapping ../2017-10-19_MiSeq/S2_WH_R1.fastq.gz ../data/KRP_set.fa > ./output/2017-10-19_MiSeq/Blastn/S2_WH_R1.exact

    For every record of ``fasta_file`` a motif is built from a fixed prefix
    plus the first 50 nt of the reverse complement of the upper-cased
    sequence.  Each read whose sequence starts with a motif produces one
    output line; at most one hit is written per read.
    '''
    #===============================
    # Read Fasta file (RNA) and make motifs for exact search
    #===============================
    # GCAGGCATGCAAGCTGCC
    #ggcagcttgcatgcctg
    #gctagaactagtggatccc
    # GCAGGCATGCAAGCTGCC TCCTCGTTCATGGGGAATAATTGCAATCCCCGATCCCCAT GGGATCCACTAGTTCTAGCCGG "
    prefix = "GCAGGCATGCAAGCTGCCCGGG"
    motifs = {}
    fa = fasta.read_fasta(fasta_file)  # RNA fasta file
    for seq_id in fa:
        rc_seq = ReverseComplement(fa[seq_id].upper())
        # using only 50 nt for all cases (This can be changed)
        motifs[prefix + rc_seq[:50]] = seq_id

    #===============================
    # Read Fastq file (RNA, Read1) and search ids by exact match of the motif
    #===============================
    with open(output_file, "w") as fo:
        read1 = ReadSequenceFile(fastq_file)
        try:
            stream = iter(read1.stdout)
            for header in stream:
                read_id = header.split()[0]
                seq = next(stream)[:-1]  # drop the trailing newline
                next(stream)  # third line of the FASTQ record, unused
                next(stream)  # fourth line of the FASTQ record, unused
                for motif in motifs:
                    if seq.startswith(motif):
                        # @M03766:67:000000000-BGPDM:1:1101:14936:1730 1NYB_A:B 100.00 24 0 0 5 28 24 1 9e-10 48.1
                        # @M03766:67:000000000-BGPDM:1:1101:16561:1732 2PJP_A:B 100.00 23 0 0 5 27 23 1 3e-09 46.1
                        fo.write("%s\t%s\t100.00\t%d\t0\t0\t1\t50\t50\t1\t1e-10\t100.0\n" % (
                            read_id, motifs[motif], len(motif)))
                        break
        finally:
            # Release the reader even when parsing fails part-way through.
            read1.stdout.close()
            read1.kill()
res_to_add.append(noesy.Resonance(atom=Atom(name,res.resid()),freq=res.freq(),error=res.error())) for res in res_to_add: #print res.atom().resid() resonances.add_resonance(res) def initial_assign(peak,molecule,fm,known_dist): for match in random_items( peak.matches( molecule, frequency_matcher=fm, distance_matcher=known_dist ), 1 ): if match: return match return None #ref_resonances = noesy.ResonanceList.read_from_stream( open(args.ref_prot,'r') ) resonances = noesy.ResonanceList.read_from_stream( open(args.prot,'r') ) unpack_unmethyl_atom_pool(resonances) #resonances = filter_resonances( ref_resonances ) sequence=fasta.read_fasta(args.fasta) resonances.set_sequence(sequence) peaks = PeakCollection.from_peak_files( args.peaks, ignore=True ) molecule=AtomTree.from_sequence( resonances.sequence() ) state=AssignmentCollection( peaks, molecule ) scorefxn=ScoreFunction(bmrb=1,consistency=1,symmetry=1) import random from assignment import ConstantFreqMatcher peak_order = [ peak for peak in peaks ] #random.shuffle( peak_order ) fm=ConstantFreqMatcher( resonances ) known_dist=ScoreDistanceMatcher( ConformationDistanceScore(), abs(math.log(0.3)), 0 ) #known_dist.max_sequence_separation=9 count=0 for peak in peak_order:
def main(unc, cor, fa, method, verbose=False):
    """Compare alignments of uncorrected and corrected reads and print gain statistics.

    Both m5 files are walked in parallel, grouped by query name; the walk
    assumes both iterate queries in the same sorted order.  For each query the
    "best" alignment (largest accuracy * aligned query length) is reduced to a
    set of correct and a set of incorrect target loci.  When the same query
    aligns to the same target in both files, the sets are intersected to
    accumulate TP/FP/FN/TN.

    Measures (from the Error Correction Toolkit paper): TP = erroneous bases
    corrected, FP = correct bases wrongly changed, FN = erroneous bases left
    uncorrected or falsely corrected; gain = (TP - FP) / (TP + FN).

    Args:
        unc: path of the m5 alignments of the uncorrected reads.
        cor: path of the m5 alignments of the corrected reads.
        fa: path of the uncorrected FASTA file (also printed as "Sample").
        method: unused here; kept for interface compatibility.
        verbose: print per-read progress when true.

    Raises:
        Exception: if no query aligned in both files (TP + FN == 0).
    """
    if verbose:
        print("Reading pacbio fasta")
    # NOTE(review): the parsed reads are not used below; the call is kept so a
    # missing or malformed FASTA still fails up front, as before.
    pacbio_reads, names = fasta.read_fasta(fa)

    cor_aligned = 0
    unc_aligned = 0
    tp = 0
    fp = 0
    fn = 0
    tn = 0

    cor_iter = aln_formats.iter_m5(cor, 0, 0, 1000000000, by_query=True)
    uncor_iter = aln_formats.iter_m5(unc, 0, 0, 1000000000, by_query=True)
    cor_query = None
    uncor_query = None

    correct_uncorrected = 0
    incorrect_uncorrected = 0
    correct_corrected = 0
    incorrect_corrected = 0

    # Loci are tracked on the target sequence, because coordinates on the
    # query change dramatically during correction.  If the two versions of a
    # read align to different places the comparison is meaningless.
    while uncor_iter is not None or cor_iter is not None:
        advance_uncor = uncor_iter is not None and (
            uncor_query is None
            or cor_iter is None
            or (cor_query is not None and uncor_query <= cor_query))

        if advance_uncor:
            try:
                uncor_query, uncor_aln = next(uncor_iter)
            except StopIteration:
                # Exhausted: do not fall through and re-count the stale read.
                uncor_iter = None
                continue
            unc_aligned += 1
            uncor_best_aln = _best_alignment(uncor_aln)
            # NOTE(review): mismatches are tagged "x<pos>" here but "<pos>" for
            # the corrected read, so the two never intersect — confirm intent.
            incorrect_loci_in_uncorrected, correct_loci_in_uncorrected = \
                _alignment_loci(uncor_best_aln, "x{}")
            prev_incorrect = set(incorrect_loci_in_uncorrected)
            prev_correct = set(correct_loci_in_uncorrected)
            incorrect_uncorrected += len(prev_incorrect)
            correct_uncorrected += len(prev_correct)
        else:
            try:
                cor_query, cor_aln = next(cor_iter)
            except StopIteration:
                cor_iter = None
                continue
            cor_aligned += 1
            cor_best_aln = _best_alignment(cor_aln)
            incorrect_loci_in_corrected, correct_loci_in_corrected = \
                _alignment_loci(cor_best_aln, "{}")
            # these should already be unique, but set operations need sets
            now_incorrect = set(incorrect_loci_in_corrected)
            now_correct = set(correct_loci_in_corrected)
            incorrect_corrected += len(now_incorrect)
            correct_corrected += len(now_correct)

        # Checked only after a successful advance, so no pair is counted twice.
        if (cor_query is not None and cor_query == uncor_query
                and cor_best_aln.target.name == uncor_best_aln.target.name):
            if verbose:
                print("")
                print("{} aligned for both uncorrected and corrected".format(cor_query))
                print("{} errors in uncorrected read".format(
                    len(incorrect_loci_in_uncorrected)))
                print("{} errors in corrected read".format(
                    len(incorrect_loci_in_corrected)))
            tp += len(prev_incorrect & now_correct)
            fp += len(prev_correct & now_incorrect)
            fn += len(prev_incorrect & now_incorrect)
            tn += len(prev_correct & now_correct)

    if tp + fn == 0:
        raise Exception("No read names matched (uncor: {}, cor: {})".format(
            uncor_query, cor_query))

    gain = float(tp - fp) / (tp + fn)
    sensitivity = float(tp) / (tp + fn)
    # Specificity is undefined without any negatives; report nan, not a crash.
    specificity = float(tn) / (tn + fp) if (tn + fp) else float('nan')
    print("Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tUncorrected wrong bp\tUncorrected right bp\tCorrected wrong bp\tCorrected right bp\tTP\tFP\tTN\tFN\tSensitivity\tSpecificity\tGain")
    print("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % (
        fa, cor, unc_aligned, cor_aligned, (cor_aligned - unc_aligned),
        incorrect_uncorrected, correct_uncorrected,
        incorrect_corrected, correct_corrected,
        tp, fp, tn, fn, sensitivity, specificity, gain))


def _best_alignment(alignments):
    """Return the alignment with the most correct bp (last one wins on ties)."""
    return sorted(
        alignments,
        key=lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1]


def _alignment_loci(aln, mismatch_format):
    """Return ``(incorrect, correct)`` lists of target loci for one alignment.

    Correct loci are target positions of ``'|'`` columns.  Insertions are
    recorded as ``"<base>i<pos>"`` and mismatches/deletions with
    ``mismatch_format.format(pos)``.  The target ranges the unaligned read ends
    would cover are appended to the incorrect list as plain integers.
    """
    correct = []
    incorrect = []
    tpos = aln.target.start
    for i in range(len(aln.alignment)):
        target_base = aln.target.alignment[i]
        if aln.alignment[i] == '|':
            correct.append(tpos)
        elif target_base == '-':
            # <nucleotide> inserted before tpos
            incorrect.append("{}i{}".format(aln.query.alignment[i], tpos))
        else:
            # <nucleotide> mismatch or deleted at tpos
            incorrect.append(mismatch_format.format(tpos))
        if target_base != '-':
            tpos += 1
    # estimate of the target regions the unaligned read ends should cover
    incorrect.extend(range(aln.target.start - aln.query.start, aln.target.start))
    incorrect.extend(range(
        aln.target.end,
        aln.target.end + aln.query.length - aln.query.end))
    return incorrect, correct
r += 1 return r complement = {"A":"T","T":"A","G":"C","C":"G"} def revcomp(s): r = "" for i in range(len(s)): r = complement[s[i]] + r return r infile = "rosalind_corr.txt" with open(infile,"r") as f: dnas,_ = fasta.read_fasta(f) count = {} for s in dnas.values(): if count.get(revcomp(s),0) != 0: count[revcomp(s)] += 1 else: count.setdefault(s,0) count[s] += 1 adjs = {} for s in count.keys(): adjs[s] = set() for s2 in count.keys(): if s == s2: continue
import fasta infile = "rosalind_mult.txt" with open(infile,"r") as f: dnas,keys = fasta.read_fasta(f) ss = [] for k in keys: ss.append(dnas[k]) d = {} p = {} INF = 10000000 print(ss) def score(c): r = 0 for i in range(len(c)): for j in range(i+1,len(c)): if c[i] != c[j]: r -= 1 return r for i0 in range(-1,len(ss[0])): for i1 in range(-1,len(ss[1])): for i2 in range(-1,len(ss[2])): for i3 in range(-1,len(ss[3])):
def testReadFasta(self):
    """Reading the GC-content fixture yields exactly seven records."""
    with open('data/gcContent.fasta') as fixture:
        records = list(read_fasta(fixture))
    record_count = len(records)
    self.assertIsNotNone(records)
    self.assertEqual(7, record_count)
#!/usr/bin/env python
import fasta
import rna_transcription
import protein_translation


def rna_splicing(dnas):
    """Remove every intron from the first strand and translate the result.

    Args:
        dnas: ordered mapping of name -> DNA string; the first value is the
            strand to splice, all remaining values are introns removed from it.

    Returns:
        Whatever ``protein_translation.encode_strand`` returns for the
        transcribed, spliced strand.

    Raises:
        KeyError: if ``dnas`` is empty.
    """
    if not dnas:
        raise KeyError('dictionary is empty')
    # Iterate instead of popitem() so the caller's mapping is not modified.
    strands = iter(dnas.values())
    s = next(strands)
    for intron in strands:
        s = s.replace(intron, '')
    return protein_translation.encode_strand(
        rna_transcription.transcribe_rna(s))


if __name__ == '__main__':
    import sys
    if len(sys.argv) == 1:
        print(rna_splicing(fasta.read_fasta(sys.stdin)))
    else:
        with open(sys.argv[1]) as f:
            print(rna_splicing(fasta.read_fasta(f)))
def motif_locations(fasta_file):
    """Return 1-based start positions of all ``n_glycosylation_motif`` matches.

    One record is popped from the mapping returned by ``fasta.read_fasta`` and
    its sequence is searched with overlapping matches enabled.
    """
    records = fasta.read_fasta(fasta_file)
    _name, sequence = records.popitem()
    positions = []
    for hit in n_glycosylation_motif.finditer(sequence, overlapped=True):
        positions.append(hit.start(0) + 1)
    return positions
#!/usr/bin/env python from __future__ import division from fasta import read_fasta def gc_percentage(strand): return sum(c in ['C', 'G'] for c in strand) / len(strand) * 100 \ if strand else 0 if __name__ == '__main__': import sys strands = read_fasta(open(sys.argv[1])) gc_content = dict((name, gc_percentage(strands[name])) for name in strands) max_strand = max(gc_content, key=gc_content.get) print max_strand print "%2.6f%%" % gc_content[max_strand]
#!/usr/bin/env python
"""Split a FASTA file into one file per record.

Usage: script.py <input.ext>

Record N (1-based) is written to ``<input>_N.ext`` as the record name, a
newline, and the sequence.
"""
import os
import sys
import fasta

input_path = sys.argv[1]
# splitext handles paths without an extension or with several dots
# (e.g. "./reads.v2.fasta"), which split('.') mis-parsed.
filename_base, extension = os.path.splitext(input_path)

with open(input_path, 'r') as handle:
    sequences = fasta.read_fasta(handle.readlines())

for count, record in enumerate(sequences, 1):
    out_path = '%s_%d%s' % (filename_base, count, extension)
    # `with` flushes and closes every output file.
    with open(out_path, 'w') as output:
        output.write(record.name + '\n')
        output.write(record.sequence)
def main(sam, tef, rename, fa):
    """Convert alignments into a TEF (error list) file, one line per read.

    For each query the "best" alignment (largest accuracy * aligned query
    length) is scanned column by column and every difference between read and
    target is written as ``pos tb wb ind``.

    TEF format: ``readid n-errors [pos tb wb ind]+``
        readid    ID of the read
        n-errors  number of errors listed for the read
        pos       position of the fix (0 <= pos < length of the read)
        tb        true value of the base at pos
        wb        wrong value of the base at pos (the current base in the read)
                  tb, wb are one of 0='A', 1='C', 2='G', 3='T', 4='N', 5='-'
        ind       0 substitution (bad char in the read at pos),
                  1 deletion (missing char in the read after pos),
                  2 insertion (extra char in the read at pos)

    Args:
        sam: alignment file passed to ``aln_formats.iter_maf``.
            NOTE(review): the body used an undefined name ``maf`` here, which
            raised NameError; the parameter is now used — confirm the format.
        tef: path of the TEF file to write.
        rename: naming scheme to undo (``"ectools"``, ``"sprai"``,
            ``"nanocorr"``) or ``None``.
        fa: uncorrected FASTA file; required when ``rename == "sprai"``.

    Raises:
        Exception: if ``rename`` is ``"sprai"`` and ``fa`` is ``None``.
    """
    read_names = None
    if rename == "sprai":
        if fa is None:
            raise Exception("A matching uncorrected fasta file is required to convert sprai indexed names back to their original")
        import fasta
        _, read_names = fasta.read_fasta(fa)

    # whole bunch of ambiguity codes will all map to N (they are present in
    # the a_thaliana reference...)
    almap = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5,
             'R': 4, 'Y': 4, 'S': 4, 'W': 4, 'K': 4, 'M': 4,
             'B': 4, 'V': 4, 'D': 4, 'H': 4}

    with open(tef, 'w') as fout:
        queries = aln_formats.iter_maf(sam, 0, 0, 1000000000, by_query=True)
        for l, (query_name, alignments) in enumerate(queries):
            # keep only "best" alignment, by total correct bp
            al = sorted(
                alignments,
                key=lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1]

            n_errs = 0
            err_strings = []
            q = al.query.start - 1
            t = al.target.start - 1
            for i in range(len(al.query.alignment)):
                qlocus = al.query.alignment[i]
                tlocus = al.target.alignment[i]
                if qlocus == tlocus:
                    # A matching column consumes one base on both sequences;
                    # previously it was skipped, so later positions were wrong.
                    q += 1
                    t += 1
                    continue
                n_errs += 1
                if tlocus == '-':
                    ind = 2
                    q += 1
                elif qlocus == '-':
                    ind = 1
                    t += 1
                else:
                    ind = 0
                    q += 1
                    t += 1
                err_strings.append("%i %i %i %i" % (
                    q, almap[tlocus.upper()], almap[qlocus.upper()], ind))

            if rename == "ectools":
                query_name = query_name[:query_name.rindex("_corrected")]
            elif rename == "sprai":
                # NOTE(review): used as a 0-based index; verify against the
                # sprai version that produced the names.
                query_name = read_names[int(query_name[:query_name.index('/')])]
            elif rename == "nanocorr":
                query_name = query_name[:query_name.rindex("_consensus")]

            tef_line = "%s %i %s" % (query_name, n_errs, ' '.join(err_strings))
            fout.write(('\n' if l > 0 else '') + tef_line)
#!/usr/bin/env python
''' simple to translate dna into proteins '''
# importing the dnatranslate module
import dnatranslate
import sys
import fasta

# read the whole file in one take, then close it
with open(sys.argv[1], 'r') as handle:
    dna = fasta.read_fasta(handle.readlines())

# iterate over the sequences and translate them
for item in dna:
    protein = dnatranslate.translate_dna(item.sequence)
    print(item.name)
    print(protein)
#import fasta import read_fasta import fasta import codon data = fasta.read_fasta('DNA.fasta') nama = data[0][0] sekuens = data[0][1] #transkripsi mRNA = sekuens.replace('T','U') print mRNA #cari posisi start codon start = mRNA.find('AUG') print "Start codon ada di posisi %d" %(start) #pecahin mRNA per tiga huruf, mulai dari start kodon prot = "" while start <= len(mRNA): kodon = mRNA[start:start+3] print kodon
"""Print nA! * nG! for the first RNA record of the input file."""
import math

import fasta

infile = "rosalind_pmch.txt"
#infile = "rosalind_cat.txt"
with open(infile, "r") as f:
    rnas, key = fasta.read_fasta(f)
rna = rnas[key[0]]
n = len(rna)
# str.count works on Python 2 and 3; len(filter(...)) fails on Python 3.
nA = rna.count("A")
nG = rna.count("G")


def fac(n):
    """Return n! (delegates to math.factorial, so there is no recursion limit)."""
    return math.factorial(n)


# printed as a tuple, matching the Python 2 output of print(nA,nG)
print((nA, nG))
print(fac(nA) * fac(nG))
#!/usr/bin/env python2.7
##-*- mode:python;tab-width:2;indent-tabs-mode:t;show-trailing-whitespace:t;rm-trailing-spaces:t;python-indent:2 -*-'
# Reformat an RDC table, adding residue names from a FASTA sequence.
# Usage: script.py <rdc_file> <fasta_file> <orientation>
import sys
from sys import argv
import fasta
from PDB.Polypeptide import one_to_three

# Explicit check instead of `assert`, which is stripped under `python -O`.
if len(argv) <= 3:
    sys.exit('usage: %s <rdc_file> <fasta_file> <orientation>' % argv[0])

rdc_file = argv[1]
fasta_file = argv[2]
orientation = int(argv[3])

# Error value printed for each supported orientation.
errors = {1: 3.2, 2: 4.5}
if orientation not in errors:
    # Previously `error` stayed undefined and the first data line raised NameError.
    sys.exit('unsupported orientation %d (expected 1 or 2)' % orientation)
error = errors[orientation]

with open(rdc_file, 'r') as handle:
    rdc_line = handle.readlines()
seq = fasta.read_fasta(fasta_file)

print("# First atom Second atom RDC Error Weight Orientation")
for line in rdc_line:
    tags = line.split()
    if not tags:
        continue  # skip blank lines
    resid1 = int(tags[0])
    atom1 = tags[1]
    resid2 = int(tags[2])
    atom2 = tags[3]
    rdc_value = float(tags[4])
    # residue ids are 1-based indices into the sequence
    aa1 = one_to_three(seq[resid1 - 1])
    aa2 = one_to_three(seq[resid2 - 1])
    print('%5d %4s %3s %5d %4s %3s %8.3f %5.3f 1.000 %5d' % (
        resid1, aa1, atom1, resid2, aa2, atom2, rdc_value, error, orientation))
#!/usr/bin/python/ import fasta import sys fastafile = open("test.fas", "r").readlines() my_sequences = fasta.read_fasta(fastafile) #test_list = ['ATATAG', 'TATA', 'GGGTGA'] #to make a dictionary of all possible hexamers all_6mer = {} base = ['A','C','G','T'] for C1 in base: for C2 in base: for C3 in base: for C4 in base: for C5 in base: for C6 in base: all_6mer[''.join([C1,C2,C3,C4,C5,C6])] = 0 length = len(all_6mer) keys = all_6mer.keys() for i in my_sequences: ind_sequence = i.sequence print i.name for keys in all_6mer: match = {} if keys in ind_sequence:
def load_fasta_db_into_proteins(
        proteins, fasta_db, clean_seqid=None, iso_leu_isomerism=False):
    """Read ``fasta_db`` and pass its records to ``load_fastas_into_proteins``.

    ``fasta.read_fasta`` must return a ``(seqids, fastas)`` pair; only the
    second element is used.  All other arguments are forwarded unchanged.
    """
    parsed = fasta.read_fasta(fasta_db)
    _seqids, fasta_records = parsed
    load_fastas_into_proteins(
        proteins, fasta_records, clean_seqid, iso_leu_isomerism)
# Per-residue count, keyed by one-letter amino-acid code.
# NOTE(review): going by the name this looks like a C-H count per residue — confirm.
number_C_H = {
    'A': 1, 'R': 3, 'N': 1, 'D': 1, 'C': 1,
    'Q': 2, 'E': 2, 'G': 0, 'H': 4, 'I': 4,
    'L': 4, 'K': 4, 'M': 3, 'F': 6, 'P': 3,
    'S': 1, 'T': 2, 'W': 6, 'Y': 5, 'V': 3,
}

assert len(argv) > 1
fasta_file = argv[1]
sequence = fasta.read_fasta(fasta_file)

# Sum the table value of every residue in the sequence and print the total.
all_num = 0
for aa in sequence:
    all_num = all_num + number_C_H[aa]
print(all_num)
def main(maf, tef, rename, fa, untef=None):
    """Convert MAF alignments into a TEF file and optionally print correction stats.

    For every query, each of its alignments is scanned column by column and
    every difference between read and target is appended to that read's
    single TEF line as ``pos tb wb ind``.

    TEF format: ``readid n-errors [pos tb wb ind]+``
        readid    ID of the read
        n-errors  number of errors listed for the read
        pos       position of the fix (0 <= pos < length of the read)
        tb        true value of the base at pos
        wb        wrong value of the base at pos (the current base in the read)
                  tb, wb are one of 0='A', 1='C', 2='G', 3='T', 4='N', 5='-'
        ind       0 substitution, 1 deletion (missing char in the read after
                  pos), 2 insertion (extra char in the read at pos)

    Args:
        maf: alignment file passed to ``aln_formats.iter_maf``.
        tef: path of the TEF file to write.
        rename: naming scheme to undo (``"ectools"``, ``"sprai"``,
            ``"nanocorr"``) or ``None``.
        fa: uncorrected FASTA file; required for ``rename == "sprai"`` and
            whenever ``untef`` is given.
        untef: optional TEF file of the uncorrected reads; when given, the
            error positions are compared with it and TP/FP/TN/FN, sensitivity,
            specificity and gain are printed.

    Raises:
        Exception: if ``rename`` is ``"sprai"`` and ``fa`` is ``None``.
        AssertionError: if an ``untef`` line's error count does not match its list.
    """
    collect_stats = untef is not None

    # ------ stats ------
    if collect_stats:
        import fasta
        reads, names = fasta.read_fasta(fa)
        tp = 0
        fp = 0
        fn = 0
        tn = 0
        uncor = {}
        with open(untef) as untef_file:
            for line in untef_file:
                data = line.strip().split(' ')
                fields = [int(a) for a in data[1:]]
                if not fields:
                    continue  # blank line
                # Exact check; integer division used to accept ragged lists.
                if len(fields) - 1 != 4 * fields[0]:
                    raise AssertionError("Number of errors does not match list")
                uncor[data[0]] = [fields[i:i + 4]
                                  for i in range(1, len(fields), 4)]
        cor_aligned = 0
    # -------------------

    read_names = None
    if rename == "sprai":
        if fa is None:
            raise Exception(
                "A matching uncorrected fasta file is required to convert sprai indexed names back to their original"
            )
        import fasta
        _, read_names = fasta.read_fasta(fa)

    # whole bunch of ambiguity codes will all map to N (they are present in
    # the a_thaliana reference...)
    almap = {
        'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4, '-': 5,
        'R': 4, 'Y': 4, 'S': 4, 'W': 4, 'K': 4, 'M': 4,
        'B': 4, 'V': 4, 'D': 4, 'H': 4
    }

    with open(tef, 'w') as fout:
        queries = aln_formats.iter_maf(maf, 0, 0, 1000000000, by_query=True)
        for l, (query_name, alignments) in enumerate(queries):
            n_errs = 0
            err_strings = []
            if collect_stats:
                cor_aligned += 1
                cpos = set()

            for al in alignments:
                q = al.query.start - 1
                t = al.target.start - 1
                for i in range(len(al.query.alignment)):
                    qlocus = al.query.alignment[i]
                    tlocus = al.target.alignment[i]
                    if qlocus == tlocus:
                        # A match consumes one base on both sequences; it was
                        # previously skipped, so later positions were wrong.
                        q += 1
                        t += 1
                        continue
                    n_errs += 1
                    if tlocus == '-':
                        ind = 2
                        q += 1
                    elif qlocus == '-':
                        ind = 1
                        t += 1
                    else:
                        ind = 0
                        q += 1
                        t += 1
                    err_strings.append("%i %i %i %i" % (
                        q, almap[tlocus.upper()], almap[qlocus.upper()], ind))
                    if collect_stats:
                        cpos.add(q)

            if rename == "ectools":
                query_name = query_name[:query_name.rindex("_corrected")]
            elif rename == "sprai":
                # NOTE(review): used as a 0-based index; verify against the
                # sprai version that produced the names.
                query_name = read_names[int(query_name[:query_name.index('/')])]
            elif rename == "nanocorr":
                query_name = query_name[:query_name.rindex("_consensus")]

            tef_line = "%s %i %s" % (query_name, n_errs, ' '.join(err_strings))
            fout.write(('\n' if l > 0 else '') + tef_line)

            # ------ stats ------
            if collect_stats and query_name in uncor:
                upos = set(u[0] for u in uncor[query_name])
                read_tp = len(upos - cpos)
                read_fp = len(cpos - upos)
                read_fn = len(cpos & upos)
                read_tn = len(reads[query_name]) - read_tp - read_fp - read_fn
                tp += read_tp
                fp += read_fp
                fn += read_fn
                tn += read_tn
            # -------------------

    # ------ stats ------
    if collect_stats:
        print("%s %s %s %s" % (tp, fp, tn, fn))
        nan = float('nan')
        # Ratios are undefined when nothing was counted; report nan, not a crash.
        gain = float(tp - fp) / (tp + fn) if (tp + fn) else nan
        sensitivity = float(tp) / (tp + fn) if (tp + fn) else nan
        specificity = float(tn) / (tn + fp) if (tn + fp) else nan
        # Header order now matches the values (sensitivity, specificity, gain).
        print("Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tTP\tFP\tTN\tFN\tSensitivity\tSpecificity\tGain")
        print("%s\t%s\tN/A\t%i\tN/A\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % (
            fa, tef, cor_aligned, tp, fp, tn, fn,
            sensitivity, specificity, gain))
def __call__(self, infile, outfile, fasta=None):
    """Read a prot chemical-shift stream, combine resonances, and write the result.

    Args:
        infile: stream read with ``ProtCSFile.read_stream``.
        outfile: destination passed to ``ProtCSFile.write``.
        fasta: optional sequence, used when the input file carries none.
            If it is also missing, the file named by ``self._args.fasta`` is read.

    Each residue's resonances go through two rounds of ``get_pools`` /
    ``combine_pools`` / ``generate_combined_resonances``.  The constraints
    returned by the second round are collected and, if
    ``self._args.cyana_ssa`` is set, written to that file one per line.
    ``STEREO`` and ``AMBIGUITY`` columns are added to the output when
    ``self._args.stereo`` / ``self._args.ambiguity`` are set.
    """
    from cs import ProtCSFile

    tab = ProtCSFile()
    tab.read_stream(infile)
    sequence = tab.sequence
    if not sequence and fasta:
        sequence = fasta
        tab.set_sequence(sequence)
    if not sequence and self._args.fasta:
        # The `fasta` parameter shadows the `fasta` module, so calling
        # fasta.read_fasta() here always raised AttributeError; import the
        # module under another name instead.
        import fasta as fasta_module
        sequence = fasta_module.read_fasta(self._args.fasta)
        tab.set_sequence(sequence)

    # combine atoms into QX if possible/necessary
    res_in = noesy.ResonanceList.read_from_prot(tab)
    self.clean_up_names(res_in)
    res_out = noesy.ResonanceList()
    res_out.set_sequence(sequence)
    cyana_ss_constraints = []
    for resid, resonances in res_in.iter_residues():
        aa = res_in.sequence()[resid - 1]
        # copy heavy atoms
        if self._args.v >= 2:
            print('residue %d %s round 1...' % (resid, aa))
        pools = self.get_pools(resonances, aa)
        combined_pools, cya_ss = self.combine_pools(pools, aa)
        new_resonances = self.generate_combined_resonances(pools, combined_pools)
        if self._args.v >= 2:
            print('residue %d %s round 2...' % (resid, aa))
        pools = self.get_pools(new_resonances, aa)
        # NOTE(review): round 1's cya_ss is overwritten here — confirm intent.
        combined_pools, cya_ss = self.combine_pools(pools, aa)
        new_resonances = self.generate_combined_resonances(pools, combined_pools)
        cyana_ss_constraints += cya_ss
        for r in new_resonances:
            res_out.add_resonance(r)

    prot_data = res_out.generate_dict()
    floats = []
    ambiguity = []
    for r in res_out.itervalues():
        ambiguity.append(r.ambiguity)
        try:
            floats.append(r.float_partners_str())
        except AttributeError:
            # resonance type without float partners
            floats.append(None)
    if self._args.stereo:
        prot_data['STEREO'] = floats
    if self._args.ambiguity:
        prot_data['AMBIGUITY'] = ambiguity

    nih_table = cs.NIH_table().from_dict(prot_data)
    prot_file = cs.ProtCSFile().from_table(nih_table)
    prot_file.write(outfile, header=self._args.header)

    if self._args.cyana_ssa:
        # `with` makes sure the constraint file is flushed and closed.
        with open(self._args.cyana_ssa, 'w') as fd:
            for line in cyana_ss_constraints:
                fd.write('%s\n' % line)
def cluster(read_fa, ref_fa, aln_file, out_prefix, verbosity=0, st=None, en=None):
    """Cluster aligned reads hierarchically and write plots and consensus sequences.

    Steps: load reads and reference; load cached features and distance matrix
    or compute and cache them; drop reads whose distance row contains -2; run
    Ward linkage; cut the tree at the cutoff from ``get_cutoff``; draw a
    heatmap and PCoA scatter plots; finally call ``generate_cluster_seqs``.

    Files are written with the prefix ``out_prefix``
    (``.pairwise_distance.npy``, ``.aligned_reads.txt``, ``.features.npy``,
    ``.linkage.npy``, ``.aligned_indices.npy``, ``.cluster_indices.npy``,
    ``.pcoa_1_<pc>.png``).

    Args:
        read_fa: FASTA file of reads (names are split at the first space).
        ref_fa: FASTA file of the reference; only its first sequence is used.
        aln_file: alignment file passed on to the feature/consensus helpers.
        out_prefix: prefix of every output file.
        verbosity: >0 prints the assessed region, >1 prints PCoA details.
        st: start of the assessed reference region (default 0).
        en: end of the assessed region (default ``len(ref_seq) - 1``).

    Returns:
        None.  Returns early when no read survives the distance filter.
    """
    # load read and reference sequences
    reads, read_names = fasta.read_fasta(read_fa, split_at_space=True)
    ref, ref_names = fasta.read_fasta(ref_fa)
    ref_name = ref_names[0]
    # we assume the ref has only one sequence, or the first is the primary
    ref_seq = ref[ref_name]
    ref_name = ref_name.split()[0]
    if st is None:
        st = 0
    if en is None:
        en = len(ref_seq) - 1
    if verbosity > 0:
        print("Assessing {}-{} of {} bp in {}".format(st, en, len(ref_seq),
                                                      ref_names[0]))

    # load or build distance matrix
    dist_matrix_file = "{}.pairwise_distance.npy".format(out_prefix)
    read_name_file = "{}.aligned_reads.txt".format(out_prefix)
    feature_file = "{}.features.npy".format(out_prefix)
    features = None
    try:
        print("Trying to load features and distance matrix...")
        dist = np.load(dist_matrix_file).astype('i4')
        np.save(dist_matrix_file, dist)
        with open(read_name_file) as names_in:
            aligned_read_names = names_in.read().strip().split('\n')
        features = np.load(feature_file)
    except Exception:
        # Best-effort cache: any failure to load means "recompute everything".
        print("Missing.")
        print("Computing features...")
        features, aligned_read_names = compute_features(
            aln_file, read_names, ref_seq, ref_name, feature_file,
            binary=True, st=st, en=en)
        with open(read_name_file, 'w') as names_out:
            names_out.write('\n'.join(aligned_read_names) + '\n')
        print("Computing distances...")
        dist = distance(features)
        np.save(dist_matrix_file, dist)

    # only distances between aligned reads (filter out any rows/cols with any -2)
    aln_indices = [d for d in range(dist.shape[0]) if -2 not in dist[d, :]]
    aln_dist = dist[aln_indices, :][:, aln_indices]
    if aln_dist.shape[0] == 0:
        print("No reads aligned to {} ({} bp) from {} - {}".format(
            ref_name, len(ref_seq), st, en))
        return

    print("Plotting distance distribution...")
    plot_distance_distr(aln_dist, out_prefix)

    compressed_dist_matrix = spadist.squareform(aln_dist)
    print("Agglomerative clustering (linkage)...")
    # simple agglomerative hierarchical clustering (think neighbor-joining);
    # same as ward(compressed_dist_matrix)
    linkage = sch.linkage(compressed_dist_matrix, method="ward",
                          metric="euclidean")
    np.save("{}.linkage.npy".format(out_prefix), linkage)

    # convert hierarchical clustering (from linkage) to flat clustering
    n_clusters, cutoff = get_cutoff(linkage, aln_dist, out_prefix,
                                    threshold=1000)
    print("Cutoff: {}".format(cutoff))
    # this is the default behavior of dendrogram
    cluster_indices = sch.fcluster(linkage, cutoff - 1, 'distance')
    ai = np.array(aln_indices)
    np.save("{}.aligned_indices.npy".format(out_prefix), ai)
    np.save("{}.cluster_indices.npy".format(out_prefix), cluster_indices)

    print("Drawing heatmap...")
    draw_heatmap(aln_dist, linkage, out_prefix, cutoff)
    n_indices = len(set(cluster_indices))
    print("{} clusters (indices) found".format(n_indices))

    # ------ PCoA and plot colored by cluster_indices ------
    d = DistanceMatrix(aln_dist)
    pcoa_result = pcoa(d)
    if verbosity > 1:
        print("Proportion explained:", pcoa_result.proportion_explained)
        print("Eigenvalues:", pcoa_result.eigvals)
        print("Samples:", pcoa_result.samples)
        print("Features:", pcoa_result.features)
    x = pcoa_result.samples["PC1"]
    for pc in [2, 3, 4]:
        y = pcoa_result.samples["PC{}".format(pc)]
        f, ax = plt.subplots(figsize=(8, 8))
        sn.despine(f)
        # x/y by keyword: newer seaborn releases reject positional data args.
        sn.scatterplot(x=x, y=y, hue=cluster_indices,
                       palette=sn.color_palette("husl", n_indices))
        lgd = plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.xlabel("PC1: {:.2f}%".format(
            pcoa_result.proportion_explained["PC1"] * 100))
        plt.ylabel("PC{}: {:.2f}%".format(
            pc, pcoa_result.proportion_explained["PC{}".format(pc)] * 100))
        plt.savefig("{}.pcoa_1_{}.png".format(out_prefix, pc),
                    bbox_extra_artists=(lgd, ), bbox_inches='tight')
        # Release the figure; otherwise one accumulates per component.
        plt.close(f)

    # ------ Generate cluster consensus seqs ------
    generate_cluster_seqs(ref_seq, ref_name, aligned_read_names, aln_file,
                          cluster_indices, out_prefix, st, en, verbosity)
from collections import OrderedDict
from itertools import chain
from fasta import read_fasta


def directed_edges(node, nodes):
    """Return names of all ``nodes`` whose strand starts with ``node``'s last 3 chars.

    Args:
        node: ``(name, strand)`` pair of the tail node.
        nodes: iterable of ``(name, strand)`` candidate head nodes.
    """
    _, strand = node
    suffix = strand[-3:]
    return [n for n, s in nodes if suffix == s[:3]]


def all_adjacencies(nodes):
    """Map every node name to the names it has a directed edge to.

    Args:
        nodes: mapping of name -> strand.

    Returns:
        An ``OrderedDict`` in the iteration order of ``nodes``; a node is
        never adjacent to itself.
    """
    adjacencies = OrderedDict()
    # items() instead of the Python-2-only iteritems(); same pairs, same order.
    for name, strand in nodes.items():
        others = ((n, s) for n, s in nodes.items() if n != name)
        adjacencies[name] = directed_edges((name, strand), others)
    return adjacencies


if __name__ == '__main__':
    import sys
    with open(sys.argv[1]) as handle:
        nodes = read_fasta(handle)
    adjacencies = all_adjacencies(nodes)
    for tail in (name for name in adjacencies if adjacencies[name]):
        print('\n'.join("%s %s" % (tail, head) for head in adjacencies[tail]))
#input: lines = open( args.infile,'r').readlines(); #output: verbose=1 if args.outfile=="stdout": outfile=sys.stdout verbose=0 else: outfile=open(args.outfile,'w'); library.hello( __file__ ) fasta=None if args.fasta: fasta=fasta_lib.read_fasta(args.fasta) sequence=""; end=args.end if end<=0: end=1000000 start=args.start format="%5d %5s %5s %5d %5s %5s %8.3f %s\n" try: if fasta: upl_fasta=fasta_lib.upl2fasta( lines ) offset=fasta_lib.find_fasta_offset( upl_fasta, fasta, verbose ) start=offset+1 end=start+len(fasta)-1 for line in lines:
def _first_accepted_hits(blast_path):
    """Map each query name to the subject of its first accepted BLASTN hit.

    The file is tab-separated with exactly twelve columns per line, e.g.::

        @M03766:53:000000000-B63MG:1:1101:13982:1738 cask_p142 98.969 97 1 0 1 97 99 3 3.06e-50 184

    (QNAME, TARGET, PERCENT, LENGTH, MISMATCH, GAPOPEN, QSTART, QEND, SSTART,
    SEND, EVALUE, BITSCORE).  Hits whose subject end lies after the subject
    start are rejected, and once a hit has been accepted for a query the
    directly following lines with the same query name are ignored.

    Raises:
        ValueError: if a non-blank line does not have twelve columns or its
            subject coordinates are not integers.
    """
    hits = {}
    prev_qname = ""
    with open(blast_path) as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if not line:
                continue  # a blank (e.g. trailing) line is not a record
            (qname, target, _percent, _length, _mismatch, _gapopen,
             _qstart, _qend, sstart, send, _evalue, _bitscore) = line.split("\t")
            if qname == prev_qname:
                continue
            if int(send) > int(sstart):
                continue  # only one subject orientation is allowed
            hits[qname] = target
            prev_qname = qname
    return hits


def BLASTN_RNA(fasta1, fasta2, exact_file1, blastn_file2, output_file):
    """Parse BLASTN output for both reads and write the pair-count matrix.

    Args:
        fasta1: FASTA file whose record ids become the matrix rows
            (read 1 / bait / RNA part).
        fasta2: FASTA file whose record ids become the matrix columns
            (read 2 / prey / protein part).
        exact_file1: tabular BLASTN result for read 1.
        blastn_file2: tabular BLASTN result for read 2.
        output_file: path of the tab-separated matrix to write.

    A read pair is counted when the same query name has an accepted hit in
    both result files; the count is stored under ``(target of read 1,
    target of read 2)``.  Pairs whose targets are not ids of the FASTA files
    do not appear in the output.
    """
    RNA_fa = fasta.read_fasta(fasta1)
    fa = fasta.read_fasta(fasta2)

    read1_dic = _first_accepted_hits(exact_file1)   # read 1 = bait
    read2_dic = _first_accepted_hits(blastn_file2)  # read 2 = prey

    ppi_cnt_dic = {}
    for qname, target1 in read1_dic.items():
        target2 = read2_dic.get(qname, "")
        if target2 == "":
            continue
        pair = (target1, target2)
        ppi_cnt_dic[pair] = ppi_cnt_dic.get(pair, 0) + 1

    # sorted() works on Python 2 lists and Python 3 key views alike.
    RNA_id_list = sorted(RNA_fa.keys())
    id_list = sorted(fa.keys())

    # The with-block closes the output even if writing fails.
    with open(output_file, "w") as fo:
        fo.write("# This file is generated by BLASTN_RNA\n")
        fo.write("DB(Read 1) \\ AD(Read 2)\t" + "\t".join(id_list) + "\n")
        for RNA_id1 in RNA_id_list:
            counts = "".join(
                "\t%d" % ppi_cnt_dic.get((RNA_id1, id2), 0) for id2 in id_list)
            fo.write(RNA_id1 + counts + "\n")
from collections import OrderedDict
from itertools import chain


def directed_edges(node, nodes, k=3):
    """List the names of the candidates that `node` overlaps into.

    `node` is a ``(name, strand)`` pair and `nodes` an iterable of such
    pairs.  A candidate is reported when its strand starts with the last `k`
    characters of `node`'s strand (`k` defaults to 3).  Names are returned in
    the order the candidates are supplied.

    Raises:
        ValueError: if `k` is below 1, because a ``[-0:]`` slice would take
            the whole strand rather than an empty suffix.
    """
    if k < 1:
        raise ValueError("overlap length k must be at least 1")
    _name, strand = node
    tail_kmer = strand[-k:]
    return [n for n, s in nodes if s[:k] == tail_kmer]


def all_adjacencies(nodes, k=3):
    """Return an OrderedDict ``name -> [names it points to]`` for `nodes`.

    `nodes` is a mapping ``name -> strand``.  Every name appears as a key, in
    the mapping's iteration order, and is never compared against itself.
    """
    # items() is available on Python 2 and 3; snapshot it once for the loop.
    entries = list(nodes.items())
    adjacencies = OrderedDict()
    for name, strand in entries:
        candidates = ((n, s) for n, s in entries if n != name)
        adjacencies[name] = directed_edges((name, strand), candidates, k)
    return adjacencies


if __name__ == '__main__':
    import sys
    # Project-local parser; only needed when running as a script.
    from fasta import read_fasta

    # The with-block guarantees the FASTA file is closed after parsing.
    with open(sys.argv[1]) as handle:
        nodes = read_fasta(handle)
    adjacencies = all_adjacencies(nodes)
    for tail, heads in adjacencies.items():
        if heads:
            print('\n'.join("%s %s" % (tail, head) for head in heads))
import fasta
import codon


def _percentage(count, total):
    """Return `count` as a percentage of `total`; 0.0 when `total` is zero."""
    if not total:
        return 0.0
    # float() forces true division on Python 2, where int / int truncates
    # and every percentage would otherwise be printed as 0.00%.
    return (float(count) / total) * 100


data = fasta.read_fasta('flu_A.fasta')
# Only the first record is analysed; presumably each record is a
# (name, sequence) pair -- confirm against fasta.read_fasta.
nama = data[0][0]
sekuens = data[0][1]

# Compute the percentage of A, T, G and C
sum_basa_adenin = sekuens.count("A")
sum_basa_timin = sekuens.count("T")
sum_basa_guanin = sekuens.count("G")
sum_basa_citocin = sekuens.count("C")
# Characters other than A/T/G/C are left out of the total on purpose.
total_basa = sum_basa_adenin + sum_basa_timin + sum_basa_guanin + sum_basa_citocin

print("\n================Persentase A, T, G, C===============")
print("Persentase A = %.2f%%" % _percentage(sum_basa_adenin, total_basa))
print("Persentase T = %.2f%%" % _percentage(sum_basa_timin, total_basa))
print("Persentase G = %.2f%%" % _percentage(sum_basa_guanin, total_basa))
print("Persentase C = %.2f%%" % _percentage(sum_basa_citocin, total_basa))

# Transcription
def main(unc, cor, fa, sorted=False, verbose=False): if verbose: print "Reading pacbio fasta" reads, names = fasta.read_fasta(fa) cor_aligned = 0 unc_aligned = 0 tp = 0 fp = 0 fn = 0 #ne = 0 tn = 0 ''' From ec_toolkit compute-stats.py: errorStats['TP'] += len(errPreCorrect.difference(errPostCorrect)) errorStats['FP'] += len(errPostCorrect.difference(errPreCorrect)) errorStats['FN'] += len(errPreCorrect.intersection(errPostCorrect)) errorStats['NE'] += getNumWrongBase(errPreCorrect,errPostCorrect) # apparently, NE is the number of bases changed, but still incorrect ''' ''' From Error Correction Toolkit paper: We use the following measures for each program: number of erroneous bases identified and successfully corrected (true positives, TP), correct bases wrongly identified as errors and changed (false positives, FP), and erroneous bases that were either uncorrected or falsely corrected (false negatives, FN). We report sensitivity and specificity for each program. Then, we combine these into the gain metric [21], defined by gain = (TP - FP) / (TP + FN), which is the percentage of errors removed from the data set by the error-correction program. A negative gain value indicates that more errors have been introduced due to false corrections, which is not captured by measures such as sensitivity and specificity. 
''' if not sorted: uncor = {} corr = {} if verbose: print "Reading uncorrected TEF" for line in open(unc): data = line.strip().split(' ') fields = [int(a) for a in data[1:]] assert fields[0] == (len(fields) - 1) / 4, "Number of errors does not match list" uncor[data[0]] = [ fields[i:i + 4] for i in xrange(1, len(fields), 4) ] if verbose: print "Reading corrected TEF" for line in open(cor): data = line.strip().split(' ') fields = [int(a) for a in data[1:]] assert fields[0] == (len(fields) - 1) / 4, "Number of errors does not match list" corr[data[0]] = [ fields[i:i + 4] for i in xrange(1, len(fields), 4) ] if verbose: print "Some uncorrected reads:" print uncor.keys()[:10] print print "Some corrected reads:" print corr.keys()[:10] for n in names: if not uncor.has_key(n) and not corr.has_key(n): continue if not uncor.has_key(n) and corr.has_key(n): cor_aligned += 1 continue if uncor.has_key(n) and not corr.has_key(n): unc_aligned += 1 continue cor_aligned += 1 unc_aligned += 1 un = uncor[n] co = corr[n] if verbose: print print n print "%i errors in uncorrected read" % len(un) print "%i errors in corrected read" % len(co) cpos = set([c[0] for c in co]) upos = set([u[0] for u in un]) read_tp = len(upos - cpos) read_fp = len(cpos - upos) read_fn = len(cpos & upos) read_tn = len(reads[n]) - read_tp - read_fp - read_fn # not obviously trivial to compute this using set operations #read_ne = 0 tp += read_tp fp += read_fp fn += read_fn tn += read_tn else: # sorted TEF cor_in = open(cor) uncor_in = open(unc) cor_aligned += 1 cor_line = cor_in.readline() cor_data = cor_line.strip().split(' ') #cor_fields = [int(a) for a in cor_data[1:]] unc_aligned += 1 uncor_line = uncor_in.readline() uncor_data = uncor_line.strip().split(' ') #uncor_fields = [int(a) for a in uncor_data[1:]] while len(cor_line) > 0 and len(uncor_line) > 0: if cor_data[0] == uncor_data[0]: n = cor_data[0] #co = [cor_fields[i:i+4] for i in xrange(1, len(cor_fields), 4)] #un = [uncor_fields[i:i+4] for i in 
xrange(1, len(uncor_fields), 4)] co = [cor_data[i] for i in xrange(1, len(cor_data), 4)] un = [uncor_data[i] for i in xrange(1, len(uncor_data), 4)] if verbose: print print n print "%i errors in uncorrected read" % len(un) print "%i errors in corrected read" % len(co) #cpos = set([c[0] for c in co]) #upos = set([u[0] for u in un]) cpos = set(co) upos = set(un) read_tp = len(upos - cpos) read_fp = len(cpos - upos) read_fn = len(cpos & upos) read_tn = len(reads[n]) - read_tp - read_fp - read_fn # not obviously trivial to compute this using set operations #read_ne = 0 tp += read_tp fp += read_fp fn += read_fn tn += read_tn if uncor_data[0] <= cor_data[0]: unc_aligned += 1 uncor_line = uncor_in.readline() if len(uncor_line) > 0: uncor_data = uncor_line.strip().split(' ') #uncor_fields = [int(a) for a in uncor_data[1:]] else: #if cor_data[0] < uncor_data[0]: cor_aligned += 1 cor_line = cor_in.readline() if len(cor_line) > 0: cor_data = cor_line.strip().split(' ') #cor_fields = [int(a) for a in cor_data[1:]] if cor_line is None: while uncor_line is not None: unc_aligned += 1 uncor_line = uncor_in.readline() if uncor_line is None: while cor_line is not None: cor_aligned += 1 cor_line = cor_in.readline() cor_in.close() uncor_in.close() if tp + fn == 0: raise Exception("No read names matched (uncor: {}, cor: {})".format( uncor_data[0], cor_data[0])) gain = float(tp - fp) / (tp + fn) print "Sample\tMethod\tUncorrected reads\tCorrected reads\tRead gain/loss\tTP\tFP\tTN\tFN\tSensitivity\tSpecificity\tGain" print "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%.4f\t%.4f\t%.4f" % ( fa, cor, unc_aligned, cor_aligned, (cor_aligned - unc_aligned), tp, fp, tn, fn, (float(tp) / (tp + fn)), (float(tn) / (tn + fp)), gain) '''
def load_fasta_db_into_proteins(proteins,
                                fasta_db,
                                clean_seqid=None,
                                iso_leu_isomerism=False):
    """Read `fasta_db` and load its sequences into `proteins`.

    `fasta.read_fasta` must return a two-item result; only the second item
    is used.  It is handed to `load_fastas_into_proteins` together with
    `proteins`, `clean_seqid` and `iso_leu_isomerism`, unchanged.
    """
    parsed = fasta.read_fasta(fasta_db)
    # Unpacking still enforces the two-item shape; the first item is unused.
    _seqids, fastas = parsed
    load_fastas_into_proteins(proteins, fastas, clean_seqid, iso_leu_isomerism)
if verbose: library.hello( __file__ ) try: target_fasta=0 sequence=""; start=args.start end=args.end from cs import ProtCSFile tab=ProtCSFile() tab.read_file( args.infile ) sequence=tab.sequence if not sequence and args.correct_fasta: sequence=fasta.read_fasta(args.correct_fasta) tab.set_sequence(sequence) elif sequence and args.correct_fasta: sys.stderr('WARNING: overwriting sequence in .prot file with input from -correct_fasta is this really intended?\n') if args.fasta: if start or end: exit('cannot choose -fasta together with -start and -end for trimming') if sequence: target_fasta=fasta.read_fasta(args.fasta) start=-fasta.find_fasta_offset(target_fasta,sequence,verbose)+1 end=start+len(target_fasta)-1; else: exit('WARNING: cannot use fasta to trim since there is no sequence information in the .prot file') if args.rigid: if start or end: exit('cannot choose -fasta together with -start and -end for trimming')
#!/usr/bin/env python
# Translate every DNA record of the FASTA file named on the command line
# and print each record name followed by its protein sequence.
import sys

import dnatranslate
import fasta

# Read all lines, then let the with-block close the file instead of leaving
# the handle open for the rest of the run.
with open(sys.argv[1], 'r') as handle:
    dna = fasta.read_fasta(handle.readlines())

for item in dna:
    # translate the DNA
    protein = dnatranslate.translate_dna(item.sequence)
    print(item.name)
    # format and print the protein (60 is passed through to format_output)
    print(fasta.format_output(protein, 60))
#search all entries of the pattern sites = searchpattern.findall(sequence) temppos = searchpattern.finditer(sequence) for i in temppos: begin, end = i.span() positions.append(begin) return sites, positions #read the enzyme name enzyme = sys.argv[1] #read the list enzymeset = read_enzymes(open('bionet.709', 'r')) isname = check_enzyme(enzyme, enzymeset) if isname: print 'Name found' #if we found the enzyme name we read the sequence file sequences = fasta.read_fasta(open(sys.argv[2], 'r').readlines()) for item in sequences: #let's search sites, positions = find_sites(enzyme, enzymeset, item.sequence) #print the sequence name print item.name[:20]+'...' #and use the zip function to combine the lists and print for i in zip(sites,positions): print i[0], '->', i[1] #if the name is not found, we bail out else: print 'Enzyme name not found, please try again'
elif star[i] == '-': # gap di star, maka seluruh curr disisipkan gap aligns[k][0] = insert_gap(aligns[k][0], i) aligns[k][1] = insert_gap(aligns[k][1], i) curr = aligns[k][0] elif curr[i] == star[i]: continue merged.append(aligns[k][1]) return merged if __name__ == "__main__": #seq = ["ATTGCCATT", "ATGGCCATT", "ATCCAATTTT", "ATCTTCTT", "ATTGCCGATT"] import fasta fa = fasta.read_fasta('h5n1.fasta') seq = [fa[0][1][:100],fa[1][1][:100],fa[2][1][:100]] sim_matrix = { ('A','A'): +2, ('G','A'): -1, ('G','G'): +2, ('C','A'): -1, ('C','G'): -1, ('C','C'): +2, ('T','A'): -1, ('T','G'): -1, ('T','C'): -1, ('T','T'): +2 } gap_penalty = -1 star, aligns = all_pairs(seq, sim_matrix, gap_penalty) merged = merge_alignments(star, aligns) for m in merged:
#output: verbose=1 if args.outfile=="stdout": outfile=sys.stdout verbose=0 else: outfile=open(args.outfile,'w'); ####### program start if verbose: library.hello( __file__ ) try: sequence=None if args.fasta: sequence=read_fasta( args.fasta ) if args.seq: sequence=read_aa3_sequence( args.seq) prot = ProtCSFile() prot.read_file( args.infile, sequence ) if not sequence: sequence = prot.sequence talos = TalosCSFile() talos.from_table( prot, sequence=sequence ) talos.write( outfile ) except library.LibException as inst:
def main(unc, cor, fa): print "Reading pacbio fasta" reads, names = fasta.read_fasta(fa) uncor = {} corr = {} print "Reading uncorrected alignments" for query_name, alignments in aln_formats.iter_maf(unc, 0, 0, 1000000000, by_query=True): al = sorted(alignments, key=lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[ -1] # keep only "best" alignment, by total correct bp uncor[al.query.name] = al print "Reading corrected alignments" for query_name, alignments in aln_formats.iter_maf(cor, 0, 0, 1000000000, by_query=True): al = sorted(alignments, key=lambda al: (al.accuracy() * abs(al.query.end - al.query.start)))[-1] corr[al.query.name] = al new_aligned = 0 new_unaligned = 0 for n in names: if not uncor.has_key(n) and not corr.has_key(n): continue if not uncor.has_key(n) and corr.has_key(n): new_aligned += 1 continue if uncor.has_key(n) and not corr.has_key(n): new_unaligned += 1 continue un = uncor[n] co = corr[n] print print n print un print co for i in xrange(len(un.query.alignment)): qlocus = un.query.alignment[i] tlocus = un.target.alignment[i] if qlocus == tlocus: continue n_errs += 1 if tlocus == '-': q += 1 elif qlocus == '-': t += 1 else: q += 1 t += 1
from utility import GaussianDistribution from assignment.noesy import Resonance, Atom parser = ExampleArgumentParser(prog=basename(__file__), description="make autoNOE-Rosetta readable chemical shift list from any column based format", examples=['%(prog)s input.prot -fasta input.fasta | awk \'NF>1{print}\'| sort -n -k 5 > proper.prot']) parser.add_argument("input", help="A shift file"); parser.add_argument("-check", help="check the CS of protons matching bmrb statistics or not, if not, delete.", action='store_true', default=False); parser.add_argument("-threshold", help="threshold",type=float,default=0.1); mutex=parser.add_mutually_exclusive_group() mutex.add_argument("-fasta",help="figure out the sequence"); library.add_standard_args( parser ) args = parser.parse_args() target_seq=fasta.read_fasta(args.fasta) resonance_list=noesy.ResonanceList.read_from_stream( open(args.input,'r') ) resonance_list.set_sequence(target_seq) if args.check: delete_res=[] data_library=AATypeShiftDistributionLibrary(target_seq) if 'csrosettaDir' not in environ: print 'Please setup csrosettaDir to your environment' exit() list=open(environ['csrosettaDir']+"/database/cs_distribution.txt",'r').readlines() for line in list: tags=line.split() if tags[0]=='Res': continue data_library.add_distribution(tags[1],three_to_one(tags[0]),GaussianDistribution(float(tags[6]),float(tags[7]))) for resonance in resonance_list.itervalues():
verbose=0 else: outfile=open(args.outfile,'w'); ####### program start if verbose: library.hello( __file__ ) try: target=0 start=args.start end=args.end pdb_fasta=fasta.pdb2fasta(args.infile) if args.fasta: target=fasta.read_fasta(args.fasta) start=-fasta.find_fasta_offset(target,pdb_fasta)+1 end=start+len(target)-1; print pdb_fasta print '-'*(start-1)+target if verbose: print "worked out trimming from fasta-sequences: start: %d end: %d"%(start,end) if args.rigid: start,end=library.read_rigid_file( args.rigid ) if verbose>0: print 'Will trim from %d to %d'%(start,end) if pdb_fasta: pdb_fasta, end=fasta.cut_sequence(pdb_fasta,start,end,verbose) #input: