def run_locarnap(seqsin, numkept, cpus=1, foldless=False):
    """Run locarna-p on the most abundant unique sequences of a group.

    seqsin: sequences in MinimalFastaParser format,
        [(header, seq), (header, seq), ...].
    numkept: maximum number of sequences handed to locarna-p; also the
        minimum number of unique sequences required unless ``foldless``
        is true.
    cpus: value passed to locarna-p as its ``--cpus`` parameter.
    foldless: when true, the minimum-group-size check is skipped.

    Returns whatever ``create_locarnap_alignment`` returns when called with
    ``struct=True`` (the alignment and structure), or ``("", "")`` when the
    group has too few unique sequences.
    """
    # Only the de-duplicated sequences are needed; indexing instead of
    # unpacking means the header data is not kept alive in a local name.
    seqs = remove_duplicates(seqsin)[0]
    # make sure group has enough sequences before continuing
    if len(seqs) < numkept and not foldless:
        return "", ""
    # headers come out in format Header_# so split to get # and rank the
    # sequences by abundance, most abundant first
    ranked = sorted(
        seqs,
        key=lambda record: int(record[0].split("_")[1]),
        reverse=True,
    )
    # slicing keeps at most the numkept most abundant sequences
    return create_locarnap_alignment(
        ranked[:numkept], RNA, struct=True, params={"--cpus": cpus}
    )
def run_locarnap(seqsin, numkept, cpus=1, foldless=False):
    """Run locarna-p on a set of sequences and return alignment and structure.

    seqsin: sequences in MinimalFastaParser format,
        [(header, seq), (header, seq), ...]. The list is not modified.
    numkept: maximum number of sequences handed to locarna-p; also the
        minimum number of sequences required unless ``foldless`` is true.
    cpus: value passed to locarna-p as its ``--cpus`` parameter.
    foldless: when true, the minimum-group-size check is skipped.

    Returns ``(aln, struct)``. Every ``-`` in the structure string is
    replaced by ``.``. Returns ``("", "")`` when the group has too few
    sequences.
    """
    # make sure group has enough sequences before continuing
    if len(seqsin) < numkept and not foldless:
        return "", ""
    if len(seqsin) == 1:
        # locarna-p needs at least two sequences, so a lone sequence is
        # wrapped as-is and paired with element [1] of
        # get_secondary_structure's result (presumably the structure
        # string -- confirm against that helper).
        return (
            LoadSeqs(data=seqsin, moltype=RNA),
            get_secondary_structure(seqsin[0][1])[1],
        )
    # headers come out in format Header_# so split to get # and rank by
    # abundance; sorted() builds a new list so the caller's list keeps its
    # original order
    ranked = sorted(
        seqsin,
        key=lambda record: int(record[0].split("_")[1]),
        reverse=True,
    )
    # slicing keeps at most the numkept most abundant sequences
    aln, struct = create_locarnap_alignment(
        ranked[:numkept], RNA, struct=True, params={"--cpus": cpus}
    )
    struct = struct.replace("-", ".")
    return aln, struct
# Per-OTU step: align the 30 most abundant unique sequences of the cluster
# with locarna-p and write the alignment plus structure to disk.
# currotu[0] is used as the OTU name and currotu[1] as the FASTA path.
otu = currotu[0]
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print("==" + otu + "==")
print("Reading in 30 most abundant sequences")
# assuming that the fasta has more than 30 sequences in it. Safe assumption
# if this is a significant cluster
# The context manager closes the input handle once parsing is finished.
with open(currotu[1], "rU") as fasta_in:
    seqs = [(header, seq) for header, seq in MinimalFastaParser(fasta_in)]
seqs, headers = remove_duplicates(seqs)
# blank headers to save memory (the name stays bound for any later code)
headers = 0
# headers come out in format Header_# so split to get # and sort by abundance
seqs.sort(reverse=True, key=lambda record: int(record[0].split("_")[1]))
# cut to 30 most abundant sequences
seqs = seqs[:30]
print("Running locarna-p on sequences")
args = {"--cpus": "24"}
aln, struct = create_locarnap_alignment(seqs, RNA, struct=True, params=args)
# create output folder for OTU (parent first, then the per-OTU subfolder)
otufolder = "/Users/Ely/Desktop/Ely_selection/R7/lead_clusters/"
if not exists(otufolder):
    mkdir(otufolder)
otufolder += otu
if not exists(otufolder):
    mkdir(otufolder)
# print out alignment and structure in fasta and stockholm formats;
# the with-blocks close each file even if building the output raises
with open(otufolder + "/locarnap-aln.fasta", "w") as alnout:
    alnout.write(aln.toFasta() + "\n>SS_struct\n" + struct + "\n")
with open(otufolder + "/locarnap-aln.sto", "w") as alnout:
    struct_dict = {"SS_cons": struct}
    alnout.write(stockholm_from_alignment(aln, GC_annotation=struct_dict))