def get_longest(contig_file, outfile, min_length):
    #returns a list of (header, sequence) pairs of the longest contig in
    #each locus
    locus_dict = defaultdict(list)
    contig_dict = {}
    for h, s in FastaReader(contig_file):
        if len(h.split("_")) > 1:
            locus_dict[h.split("_")[1]].append((h, s))
        else:
            contig_dict[h] = s
    longest_contigs = []
    for locus in locus_dict:
        max_len = 0
        for h, s in locus_dict[locus]:
            if len(s) > max_len:
                max_len = len(s)
                curr = (h, s)
        longest_contigs.append(curr)
    with open(outfile, 'w') as outfas:
        for contig in longest_contigs:
            if len(contig[1]) >= min_length:
                outfas.write(">" + contig[0] + "\n")
                outfas.write(contig[1] + "\n")
        for contig in contig_dict:
            if len(contig_dict[contig]) >= min_length:
                outfas.write(">" + contig + "\n")
                outfas.write(contig_dict[contig] + "\n")
    return longest_contigs
def split_genome(fastafile, outputprefix, n_splits, n_reps):
    seqs = []
    total_length = 0
    for h, s in FastaReader(fastafile):
        seqs.append((h, s))
    if len(seqs) > 1:
        raise ValueError(
            "Multiple contigs in fasta file, which isn't currently supported!")
    h = ""
    s = seqs[0][1]
    print "Genome length:", len(s)
    for r in range(n_reps):
        outputfile = outputprefix + "_rep_" + str(r) + ".fasta"
        # generate random split locations
        cuts = np.random.choice(len(s), n_splits)
        cuts = sorted(np.append(cuts, len(s)))
        print "cuts:", cuts
        prev = 0
        with open(outputfile, 'w') as outfile:
            for i, cut in enumerate(cuts):
                outfile.write(">split_" + str(i) + "\n")
                outfile.write(s[prev:cut] + "\n")
                prev = cut
    print "Final contig end position:", len(s)
    return
def remove_shorter_sequences(fasta_file, per_within_max, len_cutoff, outputfile):
    locus_dict = {}
    #Load in fasta file
    for h, s in FastaReader(fasta_file):
        locus_name = h.split("_")[1]
        if locus_name not in locus_dict:
            locus_dict[locus_name] = Locus(h)
        locus_dict[locus_name].transcripts.append(Transcript(h, s))
        if locus_dict[locus_name].max_trans_length < len(s):
            locus_dict[locus_name].max_trans_length = len(s)
    #now output with cleaning
    with open(outputfile, 'w') as outfile:
        for locus_name in locus_dict:
            for t in locus_dict[locus_name].transcripts:
                if ((len(t.sequence)
                     / float(locus_dict[locus_name].max_trans_length))
                        < (1 - per_within_max)):
                    #too far away from the longest
                    continue
                if len(t.sequence) < len_cutoff:
                    continue
                outfile.write(">" + t.name + "\n")
                outfile.write(t.sequence + "\n")
def get_rask_var(raskFasta, contig_file, fileName, outdir, verbose):
    #run blast against the rask VAR genes
    blast_out = run_blast(raskFasta, contig_file, outdir, verbose)
    rask_hits = set()
    with open(blast_out, 'r') as blastfile:
        for line in blastfile:
            rask_hits.add(line.split()[0])
    rask = outdir + fileName + "_rask.fa"
    non_rask = outdir + fileName + "_nonrask.fa"
    count_rask = 0
    count_non_rask = 0
    with open(rask, 'w') as raskout:
        with open(non_rask, 'w') as nonraskout:
            for h, s in FastaReader(contig_file):
                if h in rask_hits:
                    raskout.write(">" + h + "\n")
                    raskout.write(s + "\n")
                    count_rask += 1
                else:
                    nonraskout.write(">" + h + "\n")
                    nonraskout.write(s + "\n")
                    count_non_rask += 1
    if verbose:
        print count_rask, " annotated to rask DB..."
        print count_non_rask, " remaining"
    return rask, non_rask
def merge_files(easy, hmmMatch, outputdir):
    out_file = (outputdir + os.path.splitext(os.path.basename(easy))[0]
                + "_finalTranslated.fa")
    with open(out_file, 'w') as outfile:
        for h, s in FastaReader(easy):
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")
        for h, s in FastaReader(hmmMatch):
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")
    print "Success!"
    return
def translate_6_frame(in_fasta, out_fasta):
    with open(out_fasta, 'w') as outfile:
        for h, s in FastaReader(in_fasta):
            for frame, seq in sixFrameTranslation(s).items():
                outfile.write(">" + h + " _frame_" + str(frame) + "\n")
                outfile.write(seq + "\n")
    return out_fasta
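
# --- Hypothetical sketch, not part of the original pipeline ---
# translate_6_frame() above assumes a sixFrameTranslation(seq) helper that
# returns a dict mapping each of the six reading frames to its amino-acid
# translation. The exact frame numbering used by the original helper is not
# shown here; a minimal stand-in using Biopython (already used elsewhere in
# this repository) might look like the following, with frames 1-3 on the
# forward strand and 4-6 on the reverse complement.
def sixFrameTranslation_sketch(seq):
    from Bio.Seq import Seq
    translations = {}
    rc = str(Seq(seq).reverse_complement())
    for offset in range(3):
        # trim to a whole number of codons to avoid partial-codon warnings
        fwd = seq[offset:offset + 3 * ((len(seq) - offset) // 3)]
        rev = rc[offset:offset + 3 * ((len(rc) - offset) // 3)]
        translations[offset + 1] = str(Seq(fwd).translate())
        translations[offset + 4] = str(Seq(rev).translate())
    return translations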
def filter_Locus_1(outputdir, folder_prefix, verbose=False):
    final_transcripts = []
    locus_dict = defaultdict(list)
    for transcript_file in glob.glob(outputdir + folder_prefix
                                     + '*/transcripts.fa'):
        #now need to extract all transcripts that are the only member of
        #their Locus
        name = transcript_file.strip('/transcripts.fa')[-2:]
        print name
        print transcript_file
        for h, s in FastaReader(transcript_file):
            locus = h.split("_")[1] + "_" + name
            locus_dict[locus].append((h + "_K" + name, s))
    #Now we want to write out all Locus' of length 1 to the outfile
    out_fileA = outputdir + "filter_locus1_transcripts_keep61.fa"
    with open(out_fileA, 'w') as outfile:
        for locus in locus_dict:
            if (locus.split("_")[-1] == '61') or (len(locus_dict[locus]) == 1):
                for t in locus_dict[locus]:
                    outfile.write(">" + t[0] + "\n")
                    outfile.write(t[1] + "\n")
    # out_fileB = outputdir + "filter_locus1_transcripts.fa"
    # with open(out_fileB, 'w') as outfile:
    #     for locus in locus_dict:
    #         if len(locus_dict[locus]) == 1:
    #             for t in locus_dict[locus]:
    #                 outfile.write(">" + t[0] + "\n")
    #                 outfile.write(t[1] + "\n")
    return out_fileA
def filter_with_HMMER(orfFile, hmmfile, hmmThresh, outputdir):
    outname = os.path.splitext(os.path.basename(orfFile))[0]
    search_file = searchhmmer(orfFile, hmmfile, 9.97, outputdir, outname, True)
    output_file = (outputdir + os.path.splitext(os.path.basename(orfFile))[0]
                   + "_matchedHMMER.fa")
    # domtblout columns:
    # target name, accession, tlen, query name, accession, qlen, E-value,
    # score, bias, #, of, c-Evalue, i-Evalue, score, bias, from, to, from,
    # to, from, to, acc, description of target
    head_hits = defaultdict(set)
    with open(search_file, 'r') as searchfile:
        for line in searchfile:
            if line[0] == '#':
                continue
            line = line.strip().split()
            sequence = line[0]
            hmmHit = line[3]
            head_hits[sequence].add(hmmHit)
    num_orfs = 0
    num_keep = 0
    with open(output_file, 'w') as outfile:
        for h, s in FastaReader(orfFile):
            num_orfs += 1
            if len(head_hits[h]) >= hmmThresh:
                outfile.write(">" + h + "\n")
                outfile.write(s + "\n")
                num_keep += 1
    print str(num_keep) + " orfs kept out of " + str(num_orfs)
    return output_file
def filterWithHMMER(inputfile, prefix, cpu, verbose):
    hmmer_cmd = (HMMERSEARCH
                 + " -o /dev/null"
                 + " --domT 80"
                 + " --domtblout " + "hmmerDBLalphaSearch.txt"
                 + " --cpu " + str(cpu)
                 + " " + DBLA_HMM
                 + " " + inputfile)
    if verbose:
        print "running... ", hmmer_cmd
    check_call(hmmer_cmd, shell=True)
    #Now run through and get DBLalpha seqs
    keep = set()
    with open("hmmerDBLalphaSearch.txt", 'rU') as infile:
        for line in infile:
            if line[0] == "#":
                continue
            keep.add(line.split()[0])
    with open(prefix + "_DBLa_cleaned.fasta", 'w') as dblfile:
        with open(prefix + "_NOT_dblalpha.fasta", 'w') as contamfile:
            for h, s in FastaReader(inputfile):
                if h in keep:
                    dblfile.write(">" + h + "\n" + s + "\n")
                else:
                    contamfile.write(">" + h + "\n" + s + "\n")
def trim_contigs(length, contig_file, outdir):
    out_file = outdir + "trim_transcripts.fa"
    with open(out_file, 'w') as outfile:
        for h, s in FastaReader(contig_file):
            if len(s) < length:
                continue
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")
    return out_file
def reNameContigs(contig_file, fileName, outputdir):
    renamed = outputdir + fileName + "renamed.fa"
    with open(renamed, 'w') as outfile:
        for h, s in FastaReader(contig_file):
            h = h.split()[0]
            if len(h.split("_")) > 4:
                #oases transcript
                h = h.split("_")
                h = "_".join([h[0], h[1], h[3]])
            outfile.write(">" + h + "\n")
            outfile.write(s + "\n")
    return renamed
def convert(gfffile, fastafile, outputfile):
    with open(outputfile, 'w') as outfile:
        outfile.write("##gff-version 3\n")
        for h, s in FastaReader(fastafile):
            h = h.split()[0]
            outfile.write(" ".join(["##sequence-region", h, "1",
                                    str(len(s))]) + "\n")
        with open(gfffile, 'rU') as infile:
            for line in infile:
                if line[0] != "#":
                    outfile.write(line)
        outfile.write("##FASTA\n")
        for h, s in FastaReader(fastafile):
            h = h.split()[0]
            outfile.write(">" + h + "\n" + s + "\n")
    return
def get_contaminants(fasta_ref_files, contig_file, fileName, percent_overlap,
                     outdir, verbose):
    #first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s
    if verbose:
        print "Number of contigs before contaminant filtering: ", len(contigs.keys())
    #now run blast against the reference files which we do not want to be
    #present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, verbose))
    #now iterate through the blast results files removing contigs that have
    #too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.strip().split()
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    #we don't want this contig
                    print "removing", name, "overlapped", blast_name
                    bad_contigs.add(name)
    #now write out a fasta file of contaminant sequences
    contaminant_file = outdir + fileName + "contaminants.fa"
    with open(contaminant_file, 'w') as outfile:
        for contig in bad_contigs:
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")
    #now write contigs without contaminants to a file
    non_contaminant_file = outdir + fileName + "Non_contaminants.fa"
    with open(non_contaminant_file, 'w') as outfile:
        for contig in contigs:
            if contig not in bad_contigs:
                outfile.write(">" + contig + "\n")
                outfile.write(contigs[contig] + "\n")
    if verbose:
        print "Number of contigs after filtering: ", len(contigs.keys()) - len(bad_contigs)
    return non_contaminant_file, contaminant_file
def combineReadFiles(outputfile, verbose):
    if verbose:
        print "combining sample files..."
    #combine read files, appending the sample name to read headers
    with open(outputfile, 'w') as outfile:
        for f in glob.glob("*_lowSupportFiltered.fasta"):
            for h, s in FastaReader(f):
                outfile.write(">" + h + "sample="
                              + f.split("_demultiplex")[0] + "\n")
                outfile.write(s + "\n")
    return
def main_new(fastafile, bkp):
    distance_name = ["ab", "ac", "bc"]
    seq_name = []
    for h, s in FastaReader(fastafile):
        seq_name.append(h)
    aln = AlignIO.read(open(fastafile), 'fasta')
    calculator = DistanceCalculator('blosum62')
    segment_1 = calculator.get_distance(aln[:, :bkp])
    segment_2 = calculator.get_distance(aln[:, bkp:])
    # pairwise distances for each segment, in order ab, ac, bc
    distance = [segment_1[seq_name[1]][0], segment_1[seq_name[2]][0],
                segment_1[seq_name[2]][1], segment_2[seq_name[1]][0],
                segment_2[seq_name[2]][0], segment_2[seq_name[2]][1]]
    # absolute change in distance across the breakpoint, in order ab, ac, bc
    compare_distance = [abs(distance[0] - distance[3]),
                        abs(distance[1] - distance[4]),
                        abs(distance[2] - distance[5])]
    # the pair whose distance changes least across the breakpoint are the two
    # non-recombinants; the remaining sequence is returned as the putative
    # recombinant
    temp2 = distance_name[compare_distance.index(min(compare_distance))]
    string = "abc"
    string = string.replace(temp2[0], "")
    string = string.replace(temp2[1], "")
    rec = seq_name["abc".index(string)]
    return rec
def filter_length(contig_file, length_filter, fileName, outdir, verbose):
    length_file = outdir + fileName + "lenFilt.fa"
    short_count = 0
    with open(length_file, 'w') as outfile:
        for h, s in FastaReader(contig_file):
            if len(s) < length_filter:
                short_count += 1
            else:
                outfile.write(">" + h + "\n")
                outfile.write(s + "\n")
    if verbose:
        print short_count, " contigs removed as too short..."
    return length_file
def annotate_w_ntDB(contig_file, fileName, outdir, verbose):
    blastOut = outdir + fileName + "nonRaskBlast.txt"
    #first run a special blast using the nt database
    blast_cmd = (
        "blastn "
        + "-evalue 10 "
        + """-outfmt "6 qseqid sseqid stitle length pident qstart qend sstart send evalue" """
        # + "-num_alignments " + str(num_hits) + " "
        + "-num_threads 10 -max_target_seqs 3 "
        + "-db " + BLAST_NT_DB + " "
        + "-query " + contig_file + " "
        + "-out " + blastOut)
    if verbose:
        print blast_cmd
    check_call(blast_cmd, shell=True)
    #now retrieve annotation information
    contigs = defaultdict(str)
    contigs_perID = defaultdict(float)
    with open(blastOut, 'r') as blastfile:
        for line in blastfile:
            line = line.strip().split("\t")
            contigs[line[0]] = (contigs[line[0]] + " [" + line[2]
                                + "_alignLen_" + line[3]
                                + "_perID_" + line[4] + "] ")
            contigs_perID[line[0]] = max(contigs_perID[line[0]],
                                         float(line[4]))
    #now re-write the fasta file with the annotations in the headers
    annotated = outdir + fileName + "nonRask_annotated.fa"
    unknown_blastOut = outdir + fileName + "ForManualInspection.fa"
    with open(annotated, 'w') as outfileKnown:
        with open(unknown_blastOut, 'w') as outfileUnknown:
            for h, s in FastaReader(contig_file):
                if h in contigs:
                    if contigs_perID[h] > 97:
                        outfileKnown.write(">" + h + " " + contigs[h] + "\n")
                        outfileKnown.write(s + "\n")
                        continue
                if h in contigs:
                    outfileUnknown.write(">" + h + " " + contigs[h] + "\n")
                else:
                    outfileUnknown.write(">" + h + " none\n")
                outfileUnknown.write(s + "\n")
    return annotated
def pull_out_long_ORFs(bad_file, len_cutoff, outputdir):
    out_file = (outputdir + os.path.splitext(os.path.basename(bad_file))[0]
                + "_TranslongORFS.fa")
    num_seqs = 0
    num_orfs = 0
    with open(out_file, 'w') as outfile:
        for h, s in FastaReader(bad_file):
            num_seqs += 1
            translation = sixFrameTranslation(s)
            orfs = get_long_ORFS(translation, len_cutoff)
            for o in orfs:
                num_orfs += 1
                outfile.write(">" + h + o[0] + "\n")
                outfile.write(o[1] + "\n")
    print str(num_seqs) + " sequences translated into " + str(num_orfs) + " ORFs"
    return out_file
def filter_ref_with_blast(fasta_ref_files, contig_file, percent_overlap,
                          outfile, outdir):
    #first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s
    print "Number of contigs before filtering: ", len(contigs.keys())
    #now run blast against the reference files which we do not want to be
    #present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, True))
    #now iterate through the blast results files removing contigs that have
    #too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.strip().split()
                name = tokens[0]
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    #we don't want this contig
                    print "removing", name, "overlapped", blast_name
                    bad_contigs.add(name)
    for name in bad_contigs:
        del contigs[name]
    #now write resulting contigs to a file
    with open(outfile, 'w') as outfas:
        for contig in contigs:
            outfas.write(">" + contig + "\n")
            outfas.write(contigs[contig] + "\n")
    print "Number of contigs after filtering: ", len(contigs.keys())
def split_easy_from_hard(inputfile, outputdir):
    seqCount = 0
    badSeqs = 0
    bad_lengths = []
    output_file = (outputdir + os.path.splitext(os.path.basename(inputfile))[0]
                   + "_translatedL2stops.fa")
    with open(output_file + "_BadSeqs", 'w') as badfile:
        with open(output_file, 'w') as outfile:
            for h, s in FastaReader(inputfile):
                stops = 9999
                translation = sixFrameTranslation(s)
                #keep the frame with the fewest stop codons
                for frame in translation:
                    st = translation[frame].count('*')
                    if st < stops:
                        best = frame
                        stops = st
                if stops <= 2:
                    outfile.write(">" + h + " frame_" + str(best) + "\n")
                    outfile.write(translation[best] + "\n")
                else:
                    badSeqs += 1
                    bad_lengths.append(len(s))
                    badfile.write(">" + h + "\n")
                    badfile.write(s + "\n")
                seqCount += 1
    print (str((100.0 * badSeqs) / seqCount) + " percent or " + str(badSeqs)
           + " out of " + str(seqCount) + " were not translated.")
    return output_file, output_file + "_BadSeqs"
        is_fasta = True
    else:
        is_fasta = False

    if uproc:
        uprocOut = run_uproc(outdir + prefix + "_uprocList.csv", read1, read2)
        hmmerIn = process_uproc_results(uprocOut,
                                        outdir + prefix + "_UprocReads.fa",
                                        read1, read2)
        # hmmerIn = outdir+prefix+"_UprocReads.fa"
    else:
        hmmerIn = outdir + prefix + "_NoFilterList.fa"
        with open(hmmerIn, 'w') as outfile:
            if not is_fasta:
                if read2:
                    for h, s in FastaReader(read2):
                        outfile.write(">" + h + "\n" + s + "\n")
                for h, s in FastaReader(read1):
                    outfile.write(">" + h + "\n" + s + "\n")
            else:
                if read2:
                    for h, s, q in FastqReader(read2):
                        outfile.write(">" + h + "\n" + s + "\n")
                for h, s, q in FastqReader(read1):
                    outfile.write(">" + h + "\n" + s + "\n")

    uproc_reads_6frame = translate_6_frame(hmmerIn,
                                           outdir + prefix + "_Uproc_6frame.fa")
    hmm_out = allocate_w_hmmer(uproc_reads_6frame,
                               outdir + prefix + "_nhmmOut.txt", evalue)
input = dir + "complement_chunks"
output = dir + "results/result_all.csv"
headers = ["chunk", "target", "db1", "db2", "rec", "sv"]
with open(output, 'a+') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(headers)
    for fasta_file in glob.glob(input + "/*.fasta"):
        bkp = int(fasta_file.split(".fasta")[0].split("_")[-1])
        chunk = fasta_file.split("/chunk")[1].split("_")[0]
        initial = main_new(fasta_file, bkp)
        # print initial
        mapping_dict = {}
        seq_name = []
        pvalue = 0
        permutation = 100
        for h, s in FastaReader(fasta_file):
            mapping_dict[h] = s
            seq_name.append(h)
            full_alignment_length = len(s)
        for m in range(permutation):
            # bootstrap the alignment columns on either side of the breakpoint
            index_1 = np.random.choice(bkp, bkp, replace=True)
            shuffled_sequences_1 = []
            for n in [mapping_dict[seq_name[0]][0:bkp],
                      mapping_dict[seq_name[1]][0:bkp],
                      mapping_dict[seq_name[2]][0:bkp]]:
                temp_list_1 = ""
                for j in index_1:
                    temp_list_1 = temp_list_1 + n[j]
                shuffled_sequences_1.append(temp_list_1)
            s2_length = full_alignment_length - bkp
            index_2 = np.random.choice(s2_length, s2_length, replace=True)
            shuffled_sequences_2 = []
            for n in [mapping_dict[seq_name[0]][bkp:],
                      mapping_dict[seq_name[1]][bkp:],
                      mapping_dict[seq_name[2]][bkp:]]:
                temp_list_2 = ""
                for j in index_2:
                    temp_list_2 = temp_list_2 + n[j]
                shuffled_sequences_2.append(temp_list_2)
import sys, os

from third_party_runners import run_blast
from mungo.fasta import FastaReader

contig_file = sys.argv[1]
blastdb_file = sys.argv[2]
outdir = sys.argv[3]

name = os.path.splitext(os.path.basename(contig_file))[0]

blast_out = run_blast(blastdb_file, contig_file, outdir, True)

rask_hits = set()
with open(blast_out, 'r') as blastfile:
    for line in blastfile:
        rask_hits.add(line.split()[0])

with open(outdir + name + "_rask.fa", 'w') as raskout:
    with open(outdir + name + "_nonrask.fa", 'w') as nonraskout:
        for h, s in FastaReader(contig_file):
            if h in rask_hits:
                raskout.write(">" + h + "\n")
                raskout.write(s + "\n")
            else:
                nonraskout.write(">" + h + "\n")
                nonraskout.write(s + "\n")
                else:
                    with open(dir + "temp/chunk_" + str(i)
                              + "/original_chunk_db.txt", "rU") as db_infile:
                        for n, db_line in enumerate(db_infile.readlines()):
                            line1 = db_line.strip().split()
                            h1 = line1[0]
                            s1 = line1[1]
                            if n == k:
                                outfile.write(">" + h + "\n"
                                              + s[bkp[k - 1]:bkp[k]] + "\n"
                                              + ">" + h1 + "\n" + s1 + "\n")

### Step 3: add the part for the mafft alignment. Use replace, not split.
mapping_dict_seq_identifier = {}
for h, s in FastaReader(dir + "data/simulated_seqs_recombined.fasta"):
    mapping_dict_seq_identifier[h] = s

for i in range(Chunk_count):
    db_line_index = range(startline[i] + 2, endline[i] + 1)
    line_index = [startline[i]] + range(startline[i] + 2, endline[i] + 1)
    for k in range(mosaic_output_dbcount[i] - 1):
        with open(input, 'rU') as infile:
            for j, line in enumerate(infile.readlines()):
                if j == db_line_index[k]:
                    line1 = line.strip().split()
                    h1 = line1[0]
                    s1 = line1[1]
                    if len(s1) >= 10:
                        original_seq = mapping_dict_seq_identifier[h1]
                        s = line1[1].replace("-", "")
                        with open(
def countReads(fastafile):
    count = 0
    for h, s in FastaReader(fastafile):
        count += 1
    return count
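
# --- Hypothetical sketch, not part of the original pipeline ---
# The helpers in this module assume mungo.fasta.FastaReader yields
# (header, sequence) tuples, one per record. For reference, a minimal
# generator with the same interface could be sketched as follows (the name
# fasta_reader_sketch is illustrative only).
def fasta_reader_sketch(filename):
    header = None
    seq_parts = []
    with open(filename) as infile:
        for line in infile:
            line = line.strip()
            if line.startswith(">"):
                # yield the previous record before starting a new one
                if header is not None:
                    yield header, "".join(seq_parts)
                header = line[1:]
                seq_parts = []
            elif line:
                seq_parts.append(line)
    if header is not None:
        yield header, "".join(seq_parts)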
import sys

from third_party_runners import run_blast
from mungo.fasta import FastaReader

OVERLAP = 0.7
IDENTITY = 95

sequences = sys.argv[1]
outdir = sys.argv[2]

#first run all-vs-all blast
bfile = run_blast(sequences, sequences, outdir, True)

#now get the lengths of all the contigs
contig_len = {}
for h, s in FastaReader(sequences):
    contig_len[h] = len(s)

#now iterate through the search, finding redundant contigs
redundant_contigs = set()
with open(bfile, 'r') as blastsearch:
    for line in blastsearch:
        tokens = line.strip().split()
        if tokens[0] == tokens[1]:
            #matching itself
            continue
        if float(tokens[2]) >= IDENTITY:
            # print "identity ", tokens[2]
            if ((float(tokens[3]) / contig_len[tokens[0]] >= OVERLAP)
                    or (float(tokens[3]) / contig_len[tokens[1]] >= OVERLAP)):
                # print "overlap", max(float(tokens[3])/contig_len[tokens[0]], float(tokens[3])/contig_len[tokens[1]])
                #we want to remove the shorter contig
def assemble_paired_reads_soapDeNovoTrans(fasta_single, fasta_paired,
                                          outputdir, ins_length=False,
                                          verbose=False):
    #first move into a temp directory so the output is kept tidy
    curr_dir = os.getcwd()
    os.chdir(outputdir)
    scriptPath = getScriptPath()
    if fasta_paired == None:
        #single-end only assembly
        config_str = ("""#maximal read length
max_rd_len=250
[LIB]
#maximal read length in this lib
rd_len_cutof=250
#average insert size
avg_ins=0
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#fasta file for single reads
q=""" + fasta_single)
    elif (fasta_single == None) or (os.stat(fasta_single).st_size == 0):
        #paired-end only assembly
        script = ("python " + scriptPath
                  + "/third-party/khmer/scripts/split-paired-reads.py")
        #first split the interleaved file so SOAPdenovo-Trans can use it
        split_cmd = (script + " " + fasta_paired)
        if verbose:
            print split_cmd
        check_call(split_cmd, shell=True)
        config_str = ("""#maximal read length
max_rd_len=250
[LIB]
#maximal read length in this lib
rd_len_cutof=250
#average insert size
avg_ins=0
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#fasta files for paired reads
q1=""" + fasta_paired + ".1\n" + "q2=" + fasta_paired + ".2")
    else:
        script = ("python " + scriptPath
                  + "/third-party/khmer/scripts/split-paired-reads.py")
        #first split the interleaved file so SOAPdenovo-Trans can use it
        split_cmd = (script + " " + fasta_paired)
        if verbose:
            print split_cmd
        check_call(split_cmd, shell=True)
        config_str = ("""#maximal read length
max_rd_len=250
[LIB]
#maximal read length in this lib
rd_len_cutof=250
#average insert size
avg_ins=0
#if sequence needs to be reversed
reverse_seq=0
#in which part(s) the reads are used
asm_flags=3
#minimum aligned length to contigs for a reliable read location (at least 32 for short insert size)
map_len=32
#fasta files for paired and single reads
q1=""" + fasta_paired + ".1\n" + "q2=" + fasta_paired + ".2\n"
                      + "q=" + fasta_single)

    scriptPath = getScriptPath()
    script = (scriptPath
              + "/third-party/SOAPdenovo-Trans-bin-v1.03/SOAPdenovo-Trans-127mer")
    for K in ["21", "31", "41", "51", "61"]:
        config_file = outputdir + "soapConfigK" + K
        outputGraph = outputdir + "soapGraphK" + K
        soap_cmd = (script + " all"
                    + " -s " + config_file
                    + " -o " + outputGraph
                    + " -K " + K
                    + " -L 50"  #not actually important as we only use the contigs
                    )
        with open(config_file, 'w') as outfile:
            outfile.write(config_str)
        if verbose:
            print soap_cmd
        check_call(soap_cmd, shell=True)

    #now combine the contig files
    outfile = outputdir + "combined_soapDeNovo.fa"
    #rename transcripts to include which K-mer run they came from and combine
    #them into one file
    with open(outfile, 'w') as output:
        for transcript_file in glob.glob("*.contig"):
            fname = os.path.splitext(os.path.basename(transcript_file))[0]
            for h, s in FastaReader(transcript_file):
                h = h.strip().split()
                output.write((">" + h[0] + "_" + fname + " "
                              + " ".join(h[1:]) + "\n"))
                output.write(s + "\n")

    #return to previous directory
    os.chdir(curr_dir)
    return outfile
def calculateSummaryStatistics(pairedMIDs, outputfile, total_reads,
                               total_reads_before_contaminant_filtering,
                               total_reads_after_contaminant_filtering,
                               blast_isolate_counts, chimeric_filt,
                               cleaned_read_file, verbose):
    ##Calculate flexbar summary statistics
    flexBarStats = readFlexBarLog("flexbarRun1.log")
    logfiles = glob.glob("*flexbarRun2.log")
    for f in logfiles:
        tempLog = readFlexBarLog(f)
        flexBarStats["processed_reads"] += tempLog["processed_reads"]
        flexBarStats["skip_uncalled_bases"] += tempLog["skip_uncalled_bases"]
        flexBarStats["skip_unassigned_reads"] += tempLog["skip_unassigned_reads"]
        flexBarStats["skip_short_reads"] += tempLog["skip_short_reads"]
        flexBarStats["skip_single_reads"] += tempLog["skip_single_reads"]
        flexBarStats["discarded_reads"] += tempLog["discarded_reads"]

    ##Calculate pear merging statistics
    #Now merge files based on MID combinations
    pearStats = {}
    for mid in pairedMIDs:
        filenames = glob.glob("*B_" + mid[0] + "_*B_" + mid[1]
                              + "_*_pearOut.log")
        if mid[0] != mid[1]:
            #add other direction
            filenames += glob.glob("*B_" + mid[1] + "_*B_" + mid[0]
                                   + "_*_pearOut.log")
        sample = pairedMIDs[mid]
        if len(filenames) > 0:
            pearStats[sample] = readPearLog(filenames[0])
        else:
            pearStats[sample] = {}
            pearStats[sample]["total"] = 0
            pearStats[sample]["assembled"] = 0
            pearStats[sample]["discarded"] = 0
            pearStats[sample]["not_assembled"] = 0
            print "Did not find", sample, mid
        for f in filenames[1:]:
            tempPear = readPearLog(f)
            pearStats[sample]["total"] += tempPear["total"]
            pearStats[sample]["assembled"] += tempPear["assembled"]
            pearStats[sample]["discarded"] += tempPear["discarded"]
            pearStats[sample]["not_assembled"] += tempPear["not_assembled"]

    ##Calculate support filtering statistics
    #Count number of reads before support filtering
    initalCounts = {}
    for f in glob.glob("*_demultiplexTrimMerged.fasta"):
        sample = f.split("_demultiplexTrimMerged")[0]
        initalCounts[sample] = countReads(f)

    #Count number of chimeric reads
    chimericCounts = Counter()
    if chimeric_filt:
        for f in glob.glob("*_ncdenovo.fasta"):
            sample = f.split("_demultiplexTrimMerged")[0]
            chimericCounts[sample] = countReads(f)
    else:
        for sample in initalCounts:
            chimericCounts[sample] = 0

    #Count number of centroids
    centroidCounts = Counter()
    for f in glob.glob("*_demultiplexTrimMerged_centroids.fasta"):
        sample = f.split("_demultiplexTrimMerged")[0]
        centroidCounts[sample] = countReads(f)

    #Count number of centroids with sufficient support
    supportCentroids = Counter()
    for f in glob.glob("*_demultiplexTrimMerged_lowSupportFiltered.fasta"):
        sample = f.split("_demultiplexTrimMerged")[0]
        supportCentroids[sample] = countReads(f)

    #Count number of reads remaining after contaminant filtering
    contamFilteredCounts = Counter()
    for h, s in FastaReader(cleaned_read_file):
        sample = h.strip().split("sample=")[1]
        contamFilteredCounts[sample] += 1

    ##write to outputfile
    with open(outputfile, 'w') as outfile:
        outfile.write("# Total paired end reads: " + str(total_reads) + "\n")
        outfile.write("# Total reads prior to contaminant filtering: "
                      + str(total_reads_before_contaminant_filtering) + "\n")
        outfile.write("# Total reads after contaminant filtering: "
                      + str(total_reads_after_contaminant_filtering) + "\n")
        outfile.write("####### FlexBar Summary statistics #######\n")
        outfile.write("# Total reads processed: "
                      + str(flexBarStats["processed_reads"]) + "\n")
        outfile.write("# Reads skipped due to uncalled bases: "
                      + str(flexBarStats["skip_uncalled_bases"]) + "\n")
        outfile.write("# Unassigned reads: "
                      + str(flexBarStats["skip_unassigned_reads"]) + "\n")
        outfile.write("# Short reads skipped: "
                      + str(flexBarStats["skip_short_reads"]) + "\n")
        outfile.write("# Single reads skipped: "
                      + str(flexBarStats["skip_single_reads"]) + "\n")
        outfile.write("# Total reads discarded: "
                      + str(flexBarStats["discarded_reads"]) + "\n")
        p = 100.0 * (1 - flexBarStats["discarded_reads"]
                     / float(flexBarStats["processed_reads"]))
        outfile.write("# Proportion of reads kept: " + str(p) + "\n")
        outfile.write("####### Sample specific summary statistics #######\n")
        #now write out sample specific statistics
        outfile.write(",".join([
            "Sample", "PreMerge", "Merged", "Discarded", "Not Assembled",
            "Filtered", "Chimeric", "Centroids", "Centroids with support",
            "After contaminant filtering", "3D7", "DD2", "HB3"
        ]) + "\n")
        for sample in pearStats:
            outfile.write(",".join([
                sample,
                str(pearStats[sample]["total"]),
                str(pearStats[sample]["assembled"]),
                str(pearStats[sample]["discarded"]),
                str(pearStats[sample]["not_assembled"]),
                str(pearStats[sample]["total"] - initalCounts[sample]),
                str(chimericCounts[sample]),
                str(centroidCounts[sample]),
                str(supportCentroids[sample]),
                str(contamFilteredCounts[sample]),
                str(blast_isolate_counts[sample]["3D7"]),
                str(blast_isolate_counts[sample]["DD2"]),
                str(blast_isolate_counts[sample]["HB3"])
            ]) + "\n")
        for sample in pearStats:
            if pearStats[sample]["total"] == 0:
                outfile.write("WARNING: Sample " + sample
                              + " resulted in 0 reads!\n")
    return
#######################################################################
from mungo.fasta import FastaReader
import sys, os
import random

input_par1 = int(sys.argv[1])
input_par2 = int(sys.argv[2])
input_par3 = int(sys.argv[3])
input_par4 = int(sys.argv[4])
input_fasta = sys.argv[5]

seqs = {}
count = 0
seq_length = 0
temp_length = []
for h, s in FastaReader(input_fasta):
    seqs[h] = s
    count += 1
    temp_length.append(len(s))
## length should always be the same without considering indel events
seq_length = min(temp_length)

if input_par1 == 0:
    ## e.g. with 25 single-jump seqs and 25 non-recombinants, input_par4 == 50
    need_seqs_num = 2 * input_par2 + input_par4 - input_par2
    seq_index = random.sample(range(need_seqs_num), k=2 * input_par2)
    with open(input_fasta[:-20] + "simulated_seqs_recombined.fasta",
              'w') as outfile:
        for count in range(input_par2):
            left_source = seq_index[2 * count]
#!/usr/bin/env python
"""
calcATContent.py

Author: Tony Papenfuss
Date: Wed Jul 8 10:18:32 EST 2009

"""

import os, sys
from mungo.fasta import FastaReader

iFilename = sys.argv[1]

nAT = 0
total = 0
for h, s in FastaReader(iFilename):
    nAT += s.count("A") + s.count("T")
    total += len(s)

print "AT content %0.2f" % (100.0 * nAT / total)
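
# Example invocation (hypothetical filename):
#   python calcATContent.py contigs.fasta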