def test_stockholm_from_alignment(self):
    """stockholm_from_alignment should serialize a dict alignment.

    Checks an empty alignment, the default call, and a call with
    interleave_len=2 against the stored expected strings.
    """
    # An empty alignment is expected to serialize to an empty string.
    empty_result = stockholm_from_alignment({})
    self.assertEqual(empty_result, '')

    # Default arguments.
    default_result = stockholm_from_alignment(self.alignment_dict)
    self.assertEqual(default_result, self.stockholm_with_label)

    # Explicit interleave length.
    interleaved_result = stockholm_from_alignment(self.alignment_dict,
                                                  interleave_len=2)
    self.assertEqual(interleaved_result, self.stockholm_with_label_lw2)
def test_stockholm_from_alignment_reordered(self):
    """stockholm_from_alignment should serialize an alignment object.

    Compares the default call and a call with interleave_len=2 against
    the stored "reordered" expected strings.
    """
    # Default arguments.
    default_result = stockholm_from_alignment(self.alignment_object)
    self.assertEqual(default_result, self.stockholm_with_label_reordered)

    # Explicit interleave length.
    interleaved_result = stockholm_from_alignment(self.alignment_object,
                                                  interleave_len=2)
    self.assertEqual(interleaved_result,
                     self.stockholm_with_label_lw2_reordered)
def test_stockholm_from_alignment(self):
    """stockholm_from_alignment should serialize a dict alignment.

    The empty alignment is checked first; the remaining cases are
    table-driven: keyword options paired with the name of the fixture
    attribute holding the expected string.
    """
    self.assertEqual(stockholm_from_alignment({}), '')

    cases = (
        ({}, 'stockholm_with_label'),
        ({'interleave_len': 2}, 'stockholm_with_label_lw2'),
    )
    for options, expected_attr in cases:
        observed = stockholm_from_alignment(self.alignment_dict, **options)
        self.assertEqual(observed, getattr(self, expected_attr))
def test_stockholm_from_alignment_struct(self):
    """stockholm_from_alignment should honor a GC_annotation argument.

    Checks an empty alignment, the default call, and a call with
    interleave_len=2, each passing self.gc_annotation as GC_annotation.
    """
    # Empty alignment plus annotation is still expected to give ''.
    empty_result = stockholm_from_alignment(
        {}, GC_annotation=self.gc_annotation)
    self.assertEqual(empty_result, '')

    # Annotated alignment, default interleaving.
    annotated = stockholm_from_alignment(
        self.alignment_dict, GC_annotation=self.gc_annotation)
    self.assertEqual(annotated, self.stockholm_with_label_struct)

    # Annotated alignment, explicit interleave length.
    annotated_interleaved = stockholm_from_alignment(
        self.alignment_dict,
        GC_annotation=self.gc_annotation,
        interleave_len=2)
    self.assertEqual(annotated_interleaved,
                     self.stockholm_with_label_struct_lw2)
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Function for multithreading: folds one group with BayesFold and
    writes the alignment, log and R2R structure files.

    currgroup:  group identifier; used (via str()) to name the output
                folder "<basefolder>group_<currgroup>"
    groupfasta: path to the FASTA file holding the group's sequences.
                The second "_"-separated field of each header is parsed
                as an integer sequence count.
    basefolder: prefix the output folder name is appended to
    minseqs:    minimum summed sequence count required to process the group

    Returns "" when the group is skipped (output folder already exists or
    too few sequences) and None otherwise. Any exception is printed to
    stdout instead of being raised.
    '''
    try:
        # skip if already run and the program just crashed or whatever
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""

        seqs = []
        count = 0
        out = "group " + str(currgroup) + ": "
        # context manager so the input handle is always closed
        with open(groupfasta, 'rU') as fasta_in:
            for header, seq in MinimalFastaParser(fasta_in):
                abundance = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + abundance, seq))
                count += int(abundance)
        out += "\n" + str(count) + " sequences\n"

        # make sure group has enough sequences before continuing
        if count < minseqs:
            # str() guards against non-string group ids, matching the
            # str(currgroup) usage everywhere else in this function
            print(str(currgroup) + " has less than " + str(minseqs) +
                  " sequences, skipping")
            return ""

        # run BayesFold on the group's sequences
        aln, struct = bayesfold(seqs)

        # create output folder for group
        mkdir(currotufolder)
        if aln.getNumSeqs() < 50:
            # small alignment: write it out directly as the unique set
            out += str(aln.getNumSeqs()) + " unique sequences\n"
            with open(currotufolder + "/unique.fasta", 'w') as fout:
                fout.write(aln.toFasta())
        else:
            # only the first value returned by remove_duplicates is used
            s, _ = remove_duplicates(seqs)
            out += str(len(s)) + " unique sequences\n"
            write_fasta_list(s, currotufolder + "/unique.fasta")
        out += "Structure: " + struct + "\n"

        # write out the log, then the alignment and structure in fasta
        # and stockholm formats
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        struct_dict = {'SS_cons': struct}
        with open(currotufolder + "/bayesfold-aln.sto", 'w') as alnout:
            alnout.write(stockholm_from_alignment(aln,
                                                  GC_annotation=struct_dict))

        # make R2R secondary structure for alignment
        make_r2r(currotufolder + "/bayesfold-aln.sto", currotufolder,
                 "group_" + str(currgroup))
    except Exception as e:
        # best effort: report the failure and let the caller carry on
        print(str(e))
        stdout.flush()
def run_locarnap_for_infernal(currgroup, clusters, otus, basefolder):
    '''Function for multithreading: creates the final locarna-p alignment
    for one group and writes it to files, then builds the R2R structure.

    currgroup:  group identifier; used (via str()) to name the output
                folder "<basefolder>group_<currgroup>"
    clusters:   iterable of cluster names (strings) belonging to the group
    otus:       mapping of cluster name -> path of that cluster's FASTA
                file. The second "_"-separated field of each header is
                parsed as an integer sequence count.
    basefolder: prefix the output folder name is appended to

    Returns "" when the output folder already exists, None otherwise.
    '''
    # skip if already run and the program just crashed or whatever
    currotufolder = basefolder + "group_" + str(currgroup)
    if exists(currotufolder):
        return ""

    seqs = []
    # Initialised once, before the loop, so the logged figure is the total
    # over every cluster in the group (and is defined for an empty group).
    count = 0
    out = "group " + str(currgroup) + ": "
    for cluster in clusters:
        out += cluster + " "
        # context manager so each input handle is closed after parsing
        with open(otus[cluster], 'rU') as fasta_in:
            for header, seq in MinimalFastaParser(fasta_in):
                seqs.append((header.split()[0], seq))
                count += int(header.split("_")[1])
    out += "\n" + str(count) + " sequences\n"

    # run locarna-p on the group's sequences
    # NOTE(review): the positional 50 looks like a cap on the number of
    # sequences used - confirm against run_locarnap
    aln, struct = run_locarnap(seqs, 50, cpus=2, foldless=True)

    # create output folder for group
    mkdir(currotufolder)
    if aln.getNumSeqs() < 50:
        # small alignment: write it out directly as the unique set
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        with open(currotufolder + "/unique.fasta", 'w') as fout:
            fout.write(aln.toFasta())
    else:
        # only the first value returned by remove_duplicates is used
        s, _ = remove_duplicates(seqs)
        out += str(len(s)) + " unique sequences\n"
        write_fasta_list(s, currotufolder + "/unique.fasta")
    out += "Structure: " + struct + "\n"

    # write out the log, then the alignment and structure in fasta and
    # stockholm formats
    with open(currotufolder + "/log.txt", 'w') as logout:
        logout.write(out)
    with open(currotufolder + "/locarnap-aln.fasta", 'w') as alnout:
        alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
    struct_dict = {'SS_cons': struct}
    with open(currotufolder + "/locarnap-aln.sto", 'w') as alnout:
        alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))

    # make R2R secondary structure for alignment
    make_r2r(currotufolder + "/locarnap-aln.sto", currotufolder,
             "group_" + str(currgroup))
def run_fold_for_infernal(currgroup, groupfasta, basefolder, minseqs=1):
    '''Function for multithreading. Creates the final BayesFold alignment
    for one group and writes it to files, then builds the R2R structure.

    currgroup:  group identifier; used (via str()) to name the output
                folder "<basefolder>group_<currgroup>"
    groupfasta: path to the FASTA file holding the group's sequences.
                The second "_"-separated field of each header is parsed
                as an integer sequence count.
    basefolder: prefix the output folder name is appended to
    minseqs:    minimum summed sequence count required to process the group

    Returns "" when the group is skipped (output folder already exists or
    too few sequences) and None otherwise. Any exception is printed to
    stdout instead of being raised.
    '''
    try:
        # skip if already run and the program just crashed or whatever
        currotufolder = basefolder + "group_" + str(currgroup)
        if exists(currotufolder):
            return ""

        seqs = []
        count = 0
        out = "group " + str(currgroup) + ": "
        # context manager so the input handle is always closed
        with open(groupfasta, 'rU') as fasta_in:
            for header, seq in MinimalFastaParser(fasta_in):
                abundance = header.split("_")[1]
                seqs.append((header.split()[0] + "_" + abundance, seq))
                count += int(abundance)
        out += "\n" + str(count) + " sequences\n"

        # make sure group has enough sequences before continuing
        if count < minseqs:
            return ""
        stdout.flush()

        # hard limit of 500 sequences to align and fold for memory reasons
        # (slicing a shorter list leaves its contents unchanged)
        seqs = seqs[:500]

        # run BayesFold on the sequences in the group; "-diags" is the
        # only option passed through
        aln, struct = bayesfold(seqs, params={"-diags": True})

        # create output folder for group
        mkdir(currotufolder)
        out += str(aln.getNumSeqs()) + " unique sequences\n"
        out += "Structure: " + struct + "\n"

        # write out the log, then the alignment and structure in fasta
        # and stockholm formats
        with open(currotufolder + "/log.txt", 'w') as logout:
            logout.write(out)
        with open(currotufolder + "/bayesfold-aln.fasta", 'w') as alnout:
            alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
        struct_dict = {'SS_cons': struct}
        with open(currotufolder + "/bayesfold-aln.sto", 'w') as alnout:
            alnout.write(stockholm_from_alignment(aln,
                                                  GC_annotation=struct_dict))

        # make R2R secondary structure for alignment
        make_r2r(currotufolder + "/bayesfold-aln.sto", currotufolder,
                 "group_" + str(currgroup))
    except Exception as e:
        # best effort: report the failure and let the caller carry on
        print(str(e))
        stdout.flush()
def test_stockholm_from_alignment_reordered(self):
    """stockholm_from_alignment should serialize an alignment object.

    Table-driven: each case pairs the keyword options for the call with
    the name of the fixture attribute holding the expected string.
    """
    cases = (
        ({}, 'stockholm_with_label_reordered'),
        ({'interleave_len': 2}, 'stockholm_with_label_lw2_reordered'),
    )
    for options, expected_attr in cases:
        observed = stockholm_from_alignment(self.alignment_object, **options)
        self.assertEqual(observed, getattr(self, expected_attr))
def create_final_output(groupfasta, basefolder, minseqs=1, cpus=1):
    '''Function for multithreading. Folds one group alignment with
    BayesFold and writes the log, FASTA and Stockholm files, then the R2R
    structure and a calibrated Infernal CM file.

    groupfasta: path to the aligned FASTA file of the group; the file
                name up to its first "." is used as the group name
    basefolder: prefix the group name is appended to for the output folder
    minseqs:    minimum sequence count (per count_seqs) needed to proceed
    cpus:       passed through to calibrate_cmfile

    Returns None. Nothing is done when the output folder already exists
    or the group has fewer than minseqs sequences.
    '''
    group_name = groupfasta.split("/")[-1].split(".")[0]
    out_dir = basefolder + group_name
    # skip if already run and program just crashed or whatever
    if exists(out_dir):
        return

    # load seqs and make sure we have enough
    aln = LoadSeqs(groupfasta, moltype=RNA, aligned=True)
    total = count_seqs(aln.Names)
    if total < minseqs:
        return

    # one (name, weight-as-text) pair per sequence; weight == count
    weighted = []
    heaviest = 0
    for header in aln.Names:
        weight = count_seqs(header)
        if weight > heaviest:
            heaviest = weight
        weighted.append((header.split()[0], str(weight)))

    # fold alignment with bayesfold
    aln, struct = bayesfold(aln, align=False)

    # write log information
    mkdir(out_dir)
    with open(out_dir + "/log.txt", 'w') as logout:
        log_fields = [
            group_name,
            ":\n",
            str(total),
            "sequences\n",
            str(aln.getNumSeqs()),
            "unique sequences\nStructure: ",
            struct,
            "\n",
        ]
        logout.write(' '.join(log_fields))

    # write out alignment and structure in fasta format
    with open(out_dir + "/bayesfold-aln.fasta", 'w') as alnout:
        alnout.write(">SS_cons\n%s\n%s" % (struct, aln.toFasta()))

    def first_token(label):
        # shave off info in header for stockholm
        return label.split()[0]

    aln = LoadSeqs(data=aln, moltype=RNA, label_to_name=first_token)

    # create stockholm formatted alignment, one list entry per line
    sto_lines = stockholm_from_alignment(
        aln, GC_annotation={'SS_cons': struct}).split("\n")
    del aln

    # standard weights for infernal: each weight relative to the largest
    gs_lines = []
    for name, weight_text in weighted:
        relative = str(float(weight_text) / heaviest)
        gs_lines.append('# =GS %s WT %s\n' % (name, relative))

    # weights for r2r: flat "name weight name weight ..." list
    flat_weights = []
    for pair in weighted:
        flat_weights.extend(pair)

    # the last line of the stockholm text is replaced by the infernal
    # weights, followed by the r2r weights and the terminator
    sto_lines[-1] = ''.join(gs_lines).strip()
    sto_lines.append("# =GF USE_THIS_WEIGHT_MAP " + ' '.join(flat_weights))
    sto_lines.append("//\n")

    stofile = out_dir + "/bayesfold-aln.sto"
    with open(stofile, 'w') as alnout:
        alnout.write('\n'.join(sto_lines))

    # make R2R secondary structure for alignment
    make_r2r(stofile, out_dir, group_name)

    # create CM file for infernal from group
    cm_path = out_dir + "/cmfile.cm"
    cmbuild_from_file(stofile, cm_path, params={'--wgiven': True})
    calibrate_cmfile(cm_path, cpus=cpus)
def setUp(self):
    """Infernal general setUp method for all tests.

    Builds the sequence/structure fixtures, creates a temporary working
    directory plus a directory whose name contains spaces, and writes the
    Stockholm alignment and CM files used by the tests. OSError raised
    while creating the directory or writing the files is ignored.
    """
    self.seqs1_unaligned = {'1':'ACUGCUAGCUAGUAGCGUACGUA',
                            '2':'GCUACGUAGCUAC',
                            '3':'GCGGCUAUUAGAUCGUA'}
    self.struct1_unaligned_string = '....(((...)))....'
    self.seqs1_unaligned_gaps = {'1':'ACUGCUAGCUAGU-AGCGUAC--GUA',
                                 '2':'--GCUACGUAGCUAC',
                                 '3':'GCGGCUAUUAGAUCGUA--'}
    self.seqs2_aligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC---------',
                          'c': '------------UGACUACGCAU---------',
                          'b': '----UAUCGCUUCGACGAUUCUCUGAUAGAGA'}
    self.seqs2_unaligned = {'a': 'UAGGCUCUGAUAUAAUAGCUCUC',
                            'c': 'UGACUACGCAU',
                            'b': 'UAUCGCUUCGACGAUUCUCUGAUAGAGA'}
    self.struct2_aligned_string = '............((.(...)))..........'
    self.struct2_aligned_dict = {'SS_cons':self.struct2_aligned_string}
    self.lines2 = stockholm_from_alignment(
        aln=self.seqs2_aligned,
        GC_annotation=self.struct2_aligned_dict)

    #self.seqs1 aligned to self.seqs2 with self.seqs2 included.
    self.seqs1_and_seqs2_aligned = {
        'a': 'UAGGCUCUGAUAUAAUAGC-UCUC---------',
        'b': '----UAUCGCUUCGACGAU-UCUCUGAUAGAGA',
        'c': '------------UGACUAC-GCAU---------',
        '1': '-ACUGCUAGCUAGUAGCGUACGUA---------',
        '2': '----------GCUACGUAG-CUAC---------',
        '3': '-----GCGGCUAUUAG-AU-CGUA---------',
    }
    self.seqs1_and_seqs2_aligned_struct_string = \
        '............((.(....)))..........'

    #self.seqs1 aligned to self.seqs2 without self.seqs2 included.
    self.seqs1_aligned = {
        '1': 'ACUGCUAGCUAGUAGCGUACGUA',
        '2': '---------GCUACGUAG-CUAC',
        '3': '----GCGGCUAUUAG-AU-CGUA',
    }
    self.seqs1_aligned_struct_string = \
        '...........((.(....))).'

    self.temp_dir = tempfile.mkdtemp()
    self.temp_dir_spaces = '/tmp/test for infernal/'
    try:
        mkdir(self.temp_dir_spaces)
    except OSError:
        # best effort: presumably the directory is left over from an
        # earlier run
        pass
    try:
        # Context managers close every handle even if a write fails.
        #create sequence files
        with open(path.join(self.temp_dir, 'seqs1.sto'), 'w') as f:
            f.write(self.lines2)
        #create cm file.
        self.cmfile = path.join(self.temp_dir, 'aln2.cm')
        with open(self.cmfile, 'w') as cm:
            cm.write(ALN1_CM)
        #create alignment file used to create cm file.
        self.aln2_file = path.join(self.temp_dir, 'aln2.sto')
        with open(self.aln2_file, 'w') as af:
            af.write(self.lines2)
    except OSError:
        pass
args = {"--cpus": "24"} aln, struct = create_locarnap_alignment(seqs, RNA, struct=True, params=args) # create output folder for OTU otufolder = "/Users/Ely/Desktop/Ely_selection/R7/lead_clusters/" if not exists(otufolder): mkdir(otufolder) otufolder += otu if not exists(otufolder): mkdir(otufolder) # print out alignment and structure in fasta and stockholm formats alnout = open(otufolder + "/locarnap-aln.fasta", "w") alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n") alnout.close() alnout = open(otufolder + "/locarnap-aln.sto", "w") struct_dict = {"SS_cons": struct} alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict)) alnout.close() print struct # CLUSTER THE SECONDA print "Creating CM and running Infernal over all rounds" # create the cm file. Could call cmsearch_from_alignment but dont want to build # cm file multiple times since is time consuming and processor intensive cmfile = cmbuild_from_alignment(aln, struct, calibrate=True) for i in range(7, 0, -1): # run cmsearch over every round of SELEX # Only search unique sequences to save time seqs = LoadSeqs( "/Users/Ely/Desktop/Ely_selection/R" + str(i) + "/R" + str(i) + "-Unique.fasta", moltype=RNA, aligned=False,