def get_rask_var(raskFasta, contig_file, fileName, outdir, verbose):
    """Split contigs into those with a BLAST hit against the rask VAR genes
    and those without.

    Args:
        raskFasta: reference passed to run_blast as the first argument
            (presumably a FASTA of rask VAR genes -- confirm with run_blast).
        contig_file: FASTA file of contigs; read with FastaReader.
        fileName: prefix used for the two output file names.
        outdir: output location. It is concatenated directly with fileName,
            so it must already end with a path separator.
        verbose: forwarded to run_blast; when true the two counts are printed.

    Returns:
        Tuple (rask, non_rask) with the paths of the two FASTA files written.
    """
    # run blast against the rask VAR genes
    blast_out = run_blast(raskFasta, contig_file, outdir, verbose)

    # The first whitespace-separated field of every result line is taken as
    # the identifier of a contig that produced a hit.
    rask_hits = set()
    with open(blast_out, 'r') as blastfile:
        for line in blastfile:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record (used to raise IndexError).
                continue
            rask_hits.add(fields[0])

    rask = outdir + fileName + "_rask.fa"
    non_rask = outdir + fileName + "_nonrask.fa"

    count_rask = 0
    count_non_rask = 0
    with open(rask, 'w') as raskout:
        with open(non_rask, 'w') as nonraskout:
            for h, s in FastaReader(contig_file):
                record = ">" + h + "\n" + s + "\n"
                if h in rask_hits:
                    raskout.write(record)
                    count_rask += 1
                else:
                    nonraskout.write(record)
                    count_non_rask += 1

    if verbose:
        # Single-argument print() behaves the same on Python 2 and 3; the
        # double space reproduces the output of the old print statement.
        print("%s  annotated to rask DB..." % count_rask)
        print("%s  remaining" % count_non_rask)

    return rask, non_rask
def analyse_contigs(contig_file, read1, read2, outputdir,
                    fasta_ref_files=None, verbose=False):
    """Run the contig analysis pipeline and return the summary file.

    Steps, in order: rename the contigs, BLAST the renamed contigs against
    each reference, align the reads to the renamed contigs, convert the
    alignment to BAM (the helper name suggests it also builds an index),
    and generate the summary.

    Args:
        contig_file: contig file handed to reNameContigs.
        read1, read2: read files handed to align_w_subread (presumably the
            two mates of a paired-end run -- confirm with align_w_subread).
        outputdir: output location passed to every helper.
        fasta_ref_files: optional iterable of references to BLAST against.
            None (the default) means no references.
        verbose: forwarded to the helpers that accept it.

    Returns:
        Whatever generate_summary returns (named ``outfile`` here).
    """
    # None instead of a shared mutable [] default; also lets callers pass
    # None explicitly to mean "no references".
    if fasta_ref_files is None:
        fasta_ref_files = []

    # first prepare blast files for analysis
    renamedContigs = reNameContigs(contig_file, outputdir)
    blast_files = [run_blast(reference, renamedContigs, outputdir, verbose)
                   for reference in fasta_ref_files]

    # now align reads to contigs
    samfile = align_w_subread(read1, read2, renamedContigs, outputdir,
                              verbose)

    # now convert to bam
    bamfile = convert_to_bam_create_index(renamedContigs, samfile, verbose)

    # now compute analytics
    outfile = generate_summary(bamfile, outputdir, renamedContigs,
                               blast_files)

    return outfile
def get_rask_var(raskFasta, contig_file, fileName, outdir, verbose):
    """Partition contigs by whether they hit the rask VAR genes in BLAST.

    Args:
        raskFasta: reference passed to run_blast as the first argument
            (presumably a FASTA of rask VAR genes -- confirm with run_blast).
        contig_file: FASTA file of contigs; read with FastaReader.
        fileName: prefix used for the two output file names.
        outdir: output location. It is concatenated directly with fileName,
            so it must already end with a path separator.
        verbose: forwarded to run_blast; when true the two counts are printed.

    Returns:
        Tuple (rask, non_rask) with the paths of the two FASTA files written.
    """
    # run blast against the rask VAR genes
    blast_out = run_blast(raskFasta, contig_file, outdir, verbose)

    # Collect the first whitespace-separated field of each result line: the
    # identifier of a contig that produced a hit.
    rask_hits = set()
    with open(blast_out, 'r') as blastfile:
        for line in blastfile:
            fields = line.split()
            if fields:  # tolerate blank lines (used to raise IndexError)
                rask_hits.add(fields[0])

    rask = outdir + fileName + "_rask.fa"
    non_rask = outdir + fileName + "_nonrask.fa"

    count_rask = 0
    count_non_rask = 0
    with open(rask, 'w') as raskout:
        with open(non_rask, 'w') as nonraskout:
            for h, s in FastaReader(contig_file):
                record = ">" + h + "\n" + s + "\n"
                if h in rask_hits:
                    raskout.write(record)
                    count_rask += 1
                else:
                    nonraskout.write(record)
                    count_non_rask += 1

    if verbose:
        # Single-argument print() is valid on Python 2 and 3; the double
        # space matches what the former print statement produced.
        print("%s  annotated to rask DB..." % count_rask)
        print("%s  remaining" % count_non_rask)

    return rask, non_rask
def get_contaminants(fasta_ref_files, contig_file, fileName, percent_overlap,
                     outdir, verbose):
    """Separate contigs that overlap unwanted references from the rest.

    Every contig is BLASTed against each reference in fasta_ref_files. A
    contig is flagged as a contaminant when, for any single result line,
    (4th column) / (contig length) exceeds percent_overlap.

    Args:
        fasta_ref_files: iterable of references that should NOT be present
            in the data (e.g. human); each is passed to run_blast.
        contig_file: FASTA file of contigs; read with FastaReader.
        fileName: prefix used for the two output file names.
        percent_overlap: threshold compared against a ratio, so despite the
            name it is a fraction (e.g. 0.7), not a percentage.
        outdir: output location. It is concatenated directly with fileName,
            so it must already end with a path separator.
        verbose: forwarded to run_blast; when true the before/after contig
            counts are printed.

    Returns:
        Tuple (non_contaminant_file, contaminant_file) of output paths.

    Raises:
        KeyError: if a result line names a contig absent from contig_file.
    """
    # first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s

    if verbose:
        # Formatted string rather than print("...", n), which prints a tuple
        # repr under Python 2.
        print("Number of contigs before contaminant filtering: %d"
              % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = [run_blast(reference, contig_file, outdir, verbose)
                   for reference in fasta_ref_files]

    # now iterate through the blast result files, collecting contigs that
    # have too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    # Blank line: skip (used to raise IndexError).
                    continue
                name = tokens[0]
                # tokens[3] is presumably the alignment length of tabular
                # BLAST output -- confirm against run_blast's output format.
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    # we don't want this contig
                    # NOTE(review): printed regardless of `verbose`, as in
                    # the original.
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    # now write out a fasta file of contaminant sequences
    contaminant_file = outdir + fileName + "contaminants.fa"
    with open(contaminant_file, 'w') as outfile:
        for contig in bad_contigs:
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")

    # now write contigs without contaminants to a file
    non_contaminant_file = outdir + fileName + "Non_contaminants.fa"
    with open(non_contaminant_file, 'w') as outfile:
        for contig in contigs:
            if contig not in bad_contigs:
                outfile.write(">" + contig + "\n")
                outfile.write(contigs[contig] + "\n")

    if verbose:
        print("Number of contigs after filtering: %d"
              % (len(contigs) - len(bad_contigs)))

    return non_contaminant_file, contaminant_file
def get_contaminants(fasta_ref_files, contig_file, fileName, percent_overlap,
                     outdir, verbose):
    """Write contaminant and non-contaminant contigs to separate FASTA files.

    Every contig is BLASTed against each reference in fasta_ref_files. A
    contig counts as a contaminant when, for any single result line,
    (4th column) / (contig length) exceeds percent_overlap.

    Args:
        fasta_ref_files: iterable of references that should NOT be present
            in the data (e.g. human); each is passed to run_blast.
        contig_file: FASTA file of contigs; read with FastaReader.
        fileName: prefix used for the two output file names.
        percent_overlap: threshold compared against a ratio, so despite the
            name it is a fraction (e.g. 0.7), not a percentage.
        outdir: output location. It is concatenated directly with fileName,
            so it must already end with a path separator.
        verbose: forwarded to run_blast; when true the before/after contig
            counts are printed.

    Returns:
        Tuple (non_contaminant_file, contaminant_file) of output paths.

    Raises:
        KeyError: if a result line names a contig absent from contig_file.
    """
    # first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s

    if verbose:
        # A formatted string avoids the tuple repr that print("...", n)
        # produces under Python 2.
        print("Number of contigs before contaminant filtering: %d"
              % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, verbose))

    # now iterate through the blast result files, collecting contigs that
    # have too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    continue  # blank line (used to raise IndexError)
                name = tokens[0]
                # tokens[3] is presumably the alignment length of tabular
                # BLAST output -- confirm against run_blast's output format.
                contig_length = float(len(contigs[name]))
                overlap = int(tokens[3]) / contig_length
                if overlap > percent_overlap:
                    # we don't want this contig
                    # NOTE(review): printed regardless of `verbose`, as in
                    # the original.
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    # now write out a fasta file of contaminant sequences
    contaminant_file = outdir + fileName + "contaminants.fa"
    with open(contaminant_file, 'w') as outfile:
        for contig in bad_contigs:
            outfile.write(">" + contig + "\n")
            outfile.write(contigs[contig] + "\n")

    # now write contigs without contaminants to a file
    non_contaminant_file = outdir + fileName + "Non_contaminants.fa"
    with open(non_contaminant_file, 'w') as outfile:
        for contig in contigs:
            if contig not in bad_contigs:
                outfile.write(">" + contig + "\n")
                outfile.write(contigs[contig] + "\n")

    if verbose:
        print("Number of contigs after filtering: %d"
              % (len(contigs) - len(bad_contigs)))

    return non_contaminant_file, contaminant_file
def filter_ref_with_blast(fasta_ref_files, contig_file, percent_overlap,
                          outfile, outdir):
    """Write to `outfile` the contigs that do not overlap unwanted references.

    Every contig is BLASTed against each reference in fasta_ref_files. A
    contig is dropped when, for any single result line,
    (4th column) / (contig length) exceeds percent_overlap.

    Args:
        fasta_ref_files: iterable of references that should NOT be present
            in the data (e.g. human); each is passed to run_blast.
        contig_file: FASTA file of contigs; read with FastaReader.
        percent_overlap: threshold compared against a ratio, so despite the
            name it is a fraction (e.g. 0.7), not a percentage.
        outfile: path of the FASTA file to write the surviving contigs to.
        outdir: output location passed to run_blast.

    Returns:
        None. Progress is always printed (run_blast is called with True as
        its last argument).

    Raises:
        KeyError: if a result line names a contig absent from contig_file.
    """
    # first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s

    # Single-argument print() works on Python 2 and 3; the double space
    # reproduces the output of the former print statement.
    print("Number of contigs before filtering:  %d" % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = [run_blast(reference, contig_file, outdir, True)
                   for reference in fasta_ref_files]

    # now iterate through the blast result files, collecting contigs that
    # have too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    # Blank line: skip (used to raise IndexError).
                    continue
                name = tokens[0]
                # tokens[3] is presumably the alignment length of tabular
                # BLAST output -- confirm against run_blast's output format.
                overlap = int(tokens[3]) / float(len(contigs[name]))
                if overlap > percent_overlap:
                    # we don't want this contig
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    for name in bad_contigs:
        del contigs[name]

    # now write resulting contigs to a file
    with open(outfile, 'w') as outfas:
        for contig in contigs:
            outfas.write(">" + contig + "\n")
            outfas.write(contigs[contig] + "\n")

    print("Number of contigs after filtering:  %d" % len(contigs))
def filter_ref_with_blast(fasta_ref_files, contig_file, percent_overlap,
                          outfile, outdir):
    """Filter out contigs that overlap unwanted references; save the rest.

    Every contig is BLASTed against each reference in fasta_ref_files. A
    contig is dropped when, for any single result line,
    (4th column) / (contig length) exceeds percent_overlap.

    Args:
        fasta_ref_files: iterable of references that should NOT be present
            in the data (e.g. human); each is passed to run_blast.
        contig_file: FASTA file of contigs; read with FastaReader.
        percent_overlap: threshold compared against a ratio, so despite the
            name it is a fraction (e.g. 0.7), not a percentage.
        outfile: path of the FASTA file to write the surviving contigs to.
        outdir: output location passed to run_blast.

    Returns:
        None. Progress is always printed (run_blast is called with True as
        its last argument).

    Raises:
        KeyError: if a result line names a contig absent from contig_file.
    """
    # first get list of contigs
    contigs = {}
    for h, s in FastaReader(contig_file):
        contigs[h] = s

    # Single-argument print() is valid on Python 2 and 3; the double space
    # matches what the former print statement produced.
    print("Number of contigs before filtering:  %d" % len(contigs))

    # now run blast against the reference files which we want not to be
    # present in the data i.e. human
    blast_files = []
    for reference in fasta_ref_files:
        blast_files.append(run_blast(reference, contig_file, outdir, True))

    # now iterate through the blast result files, collecting contigs that
    # have too high a proportion of hits
    bad_contigs = set()
    for blast_file in blast_files:
        blast_name = os.path.splitext(os.path.basename(blast_file))[0]
        with open(blast_file, 'r') as bfile:
            for line in bfile:
                tokens = line.split()
                if not tokens:
                    continue  # blank line (used to raise IndexError)
                name = tokens[0]
                # tokens[3] is presumably the alignment length of tabular
                # BLAST output -- confirm against run_blast's output format.
                contig_length = float(len(contigs[name]))
                overlap = int(tokens[3]) / contig_length
                if overlap > percent_overlap:
                    # we don't want this contig
                    print("removing %s overlapped %s" % (name, blast_name))
                    bad_contigs.add(name)

    for name in bad_contigs:
        del contigs[name]

    # now write resulting contigs to a file
    with open(outfile, 'w') as outfas:
        for contig in contigs:
            outfas.write(">" + contig + "\n")
            outfas.write(contigs[contig] + "\n")

    print("Number of contigs after filtering:  %d" % len(contigs))
def analyse_contigs(contig_file, read1, read2, outputdir,
                    fasta_ref_files=None, verbose=False):
    """Rename, BLAST, align and summarise a set of contigs.

    Steps, in order: rename the contigs, BLAST the renamed contigs against
    each reference, align the reads to the renamed contigs, convert the
    alignment to BAM (the helper name suggests it also builds an index),
    and generate the summary.

    Args:
        contig_file: contig file handed to reNameContigs.
        read1, read2: read files handed to align_w_subread (presumably the
            two mates of a paired-end run -- confirm with align_w_subread).
        outputdir: output location passed to every helper.
        fasta_ref_files: optional iterable of references to BLAST against.
            None (the default) means no references.
        verbose: forwarded to the helpers that accept it.

    Returns:
        Whatever generate_summary returns (named ``outfile`` here).
    """
    # Avoid a shared mutable default; None is treated as "no references".
    references = [] if fasta_ref_files is None else fasta_ref_files

    # first prepare blast files for analysis
    renamedContigs = reNameContigs(contig_file, outputdir)
    blast_files = []
    for reference in references:
        blast_files.append(
            run_blast(reference, renamedContigs, outputdir, verbose))

    # now align reads to contigs
    samfile = align_w_subread(read1, read2, renamedContigs, outputdir,
                              verbose)

    # now convert to bam
    bamfile = convert_to_bam_create_index(renamedContigs, samfile, verbose)

    # now compute analytics
    return generate_summary(bamfile, outputdir, renamedContigs, blast_files)
import sys, os from third_party_runners import run_blast from mungo.fasta import FastaReader OVERLAP = 0.7 IDENTITY = 95 sequences = sys.argv[1] outdir = sys.argv[2] #first run all-vs-all blast bfile = run_blast(sequences, sequences, outdir, True) #now get the lengths of all the contigs contig_len = {} for h,s in FastaReader(sequences): contig_len[h]=len(s) #now iterate through search finding redundant contigs redundant_contigs = set() with open(bfile, 'r') as blastsearch: for line in blastsearch: tokens = line.strip().split() if tokens[0]==tokens[1]: #matching itself continue if float(tokens[2]) >= IDENTITY: # print "identity ",tokens[2] if ((float(tokens[3])/contig_len[tokens[0]] >= OVERLAP) or (float(tokens[3])/contig_len[tokens[1]] >= OVERLAP)): # print "overlap", max(float(tokens[3])/contig_len[tokens[0]],float(tokens[3])/contig_len[tokens[1]])
# Split a contig FASTA into sequences with and without a BLAST hit.
#
# Command-line arguments (positional):
#   1. contig_file  - FASTA file of contigs
#   2. blastdb_file - reference passed to run_blast as its first argument
#   3. outdir       - output location; concatenated directly with the file
#                     name, so it must end with a path separator
#
# Writes <outdir><name>_rask.fa and <outdir><name>_nonrask.fa, where <name>
# is the contig file's base name without its extension.
import sys
import os

from third_party_runners import run_blast
from mungo.fasta import FastaReader

contig_file = sys.argv[1]
blastdb_file = sys.argv[2]
outdir = sys.argv[3]

name = os.path.splitext(os.path.basename(contig_file))[0]

blast_out = run_blast(blastdb_file, contig_file, outdir, True)

# The first whitespace-separated field of every result line is taken as the
# identifier of a contig that produced a hit.
rask_hits = set()
with open(blast_out, 'r') as blastfile:
    for line in blastfile:
        fields = line.split()
        if not fields:
            # Blank line: nothing to record (used to raise IndexError).
            continue
        rask_hits.add(fields[0])

with open(outdir + name + "_rask.fa", 'w') as raskout:
    with open(outdir + name + "_nonrask.fa", 'w') as nonraskout:
        for h, s in FastaReader(contig_file):
            if h in rask_hits:
                raskout.write(">" + h + "\n")
                raskout.write(s + "\n")
            else:
                nonraskout.write(">" + h + "\n")
                nonraskout.write(s + "\n")
# Partition the contigs of a FASTA file by whether they have a BLAST hit.
#
# Command-line arguments (positional):
#   1. contig_file  - FASTA file of contigs
#   2. blastdb_file - reference passed to run_blast as its first argument
#   3. outdir       - output location; concatenated directly with the file
#                     name, so it must end with a path separator
#
# Writes <outdir><name>_rask.fa and <outdir><name>_nonrask.fa, where <name>
# is the contig file's base name without its extension.
import sys
import os

from third_party_runners import run_blast
from mungo.fasta import FastaReader

contig_file = sys.argv[1]
blastdb_file = sys.argv[2]
outdir = sys.argv[3]

name = os.path.splitext(os.path.basename(contig_file))[0]

blast_out = run_blast(blastdb_file, contig_file, outdir, True)

# Collect the first whitespace-separated field of each result line: the
# identifier of a contig that produced a hit.
rask_hits = set()
with open(blast_out, 'r') as blastfile:
    for line in blastfile:
        fields = line.split()
        if fields:  # tolerate blank lines (used to raise IndexError)
            rask_hits.add(fields[0])

with open(outdir + name + "_rask.fa", 'w') as raskout:
    with open(outdir + name + "_nonrask.fa", 'w') as nonraskout:
        for h, s in FastaReader(contig_file):
            # Pick the destination once, then emit the two-line record.
            target = raskout if h in rask_hits else nonraskout
            target.write(">" + h + "\n")
            target.write(s + "\n")