import os
import re
from datetime import datetime
from os import listdir, path
from shutil import copyfile

import numpy
from Bio.SeqRecord import SeqRecord
# NB: assumed alias -- this module plots via matplotlib.pyplot as "pylot"
from matplotlib import pyplot as pylot

# Project helpers and config objects (read_array, ensure_dir, load_fasta,
# write_fasta, dirs, references, bin_types, blast_dtypes, cpm) are assumed
# to come from this repository's shared modules.


def glompX_blast_out(genomes, run_ref, blast_mode, r_root_dir, run_dirs,
                     run_id, fixed_dirs, blast_dtypes, references,
                     min_nt_match, min_nt_score, min_nt_idp,
                     min_aa_match, min_aa_score, min_aa_idp,
                     capture_span, timestamp):
    """Collect Blast results and extract match contigs."""
    # load inputs
    ref_n = run_ref.name
    run_root = r_root_dir+run_id+"/"
    match_root = run_root+run_dirs['match_out_dir']+ref_n+"/"
    capture_root = run_root+run_dirs['capture_dir']+ref_n+"/"
    print " ", ref_n
    # log
    logstring = "".join(["\n\n# Collect Blast results @", timestamp, "\n\n"])
    run_ref.log(logstring)
    # collect results
    ref_hits = {}
    control_scores = []
    run_ref.log("Segs/Gs\t")
    run_ref.log("\t".join([genome['name'] for genome in genomes]))
    # pick the thresholds that match the Blast mode (constant for the whole
    # run, so this is decided once instead of once per hit)
    if blast_mode in ('n', 'tx'):
        min_match, min_score, min_idp = min_nt_match, min_nt_score, min_nt_idp
    elif blast_mode == 'tn':
        min_match, min_score, min_idp = min_aa_match, min_aa_score, min_aa_idp
    else:  # default to nucleotide mode
        min_match, min_score, min_idp = min_nt_match, min_nt_score, min_nt_idp
    for seg in run_ref.segs:
        seg_n = seg['name']
        print "\t", seg_n, "...",
        run_ref.log("".join(["\n", seg_n]))
        blast_dir = run_root+run_dirs['blast_out_dir']+ref_n+"/"+seg_n+"/"
        capture_dir = capture_root+seg_n+"/"
        ensure_dir([blast_dir, capture_dir])
        # decide whether this segment is flagged for context capture;
        # segment names may be numeric strings or plain strings
        try:
            capture_flag = int(seg_n) in run_ref.capture
        except ValueError:
            capture_flag = seg_n in run_ref.capture
        ref_flag = True
        for genome in genomes:
            g_name = genome['name']
            print "|",
            # process
            if g_name not in ref_hits:
                ref_hits[g_name] = {}
            matches_dir = match_root+g_name+"/"
            ensure_dir([matches_dir])
            blast_infile = blast_dir+g_name+"_out.txt"
            genome_ctg_dir = fixed_dirs['fas_contigs_dir']+g_name+"/"
            rec_array = read_array(blast_infile, blast_dtypes)
            if len(rec_array) > 0:  # take qualified hits
                p_cnt = 0
                n_cnt = 0
                if g_name in [ref['name'] for ref in references]:
                    copyfile(genome_ctg_dir+g_name+"_1.fas",
                             matches_dir+g_name+".fas")
                    if ref_flag:  # positive control TODO: better solution
                        control_scores.append(rec_array[0][11])
                        ref_flag = False
                for line in rec_array:
                    idp = line[2]
                    q_start, q_stop = line[8], line[9]
                    score = line[11]
                    length = abs(q_stop-q_start)
                    if (length > min_match and score > min_score
                            and idp > min_idp):
                        print "+",
                        p_cnt += 1
                        contig_id = line[1]
                        if contig_id not in ref_hits[g_name]:
                            ref_hits[g_name][contig_id] = {seg_n: score}
                        else:
                            ref_hits[g_name][contig_id][seg_n] = score
                        pattern = re.compile(
                            r'('+re.escape(contig_id)+r')\.fas')
                        for item in listdir(genome_ctg_dir):
                            match = re.match(pattern, item)
                            if match:
                                fas_file = matches_dir+match.group(1)+".fas"
                                if not path.exists(fas_file):
                                    copyfile(genome_ctg_dir+item, fas_file)
                        # context capture
                        if capture_flag:
                            # load the sequence
                            contig_file = matches_dir+contig_id+".fas"
                            contig_rec = load_fasta(contig_file)
                            # normalize orientation, then pad both ends
                            if q_start < q_stop:
                                c_start = q_start-capture_span
                                c_stop = q_stop+capture_span
                            else:
                                c_start = q_stop-capture_span
                                c_stop = q_start+capture_span
                            print c_start, c_stop
                            # clamp to the contig limits
                            if c_start < 0:
                                c_start = 0
                            if c_stop > len(contig_rec.seq):
                                c_stop = len(contig_rec.seq)
                            # write out the captured context
                            cxt_file = capture_dir+g_name+"_"+contig_id+".fas"
                            cxt_rec = SeqRecord(
                                id=contig_id+"_"+str(c_start)
                                   +"_"+str(c_stop),
                                seq=contig_rec.seq[c_start:c_stop])
                            write_fasta(cxt_file, cxt_rec)
                    else:
                        print "-",
                        n_cnt += 1
                if n_cnt > 0:
                    logstring = "".join(["\t", str(p_cnt),
                                         " (", str(n_cnt), ")"])
                else:
                    logstring = "".join(["\t", str(p_cnt)])
                run_ref.log(logstring)
            else:
                print "-",
                run_ref.log("".join(["\t", "0"]))
        print ""
    return ref_hits, control_scores
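
# A minimal, self-contained sketch (illustrative, not part of the pipeline)
# of the context-capture coordinate logic above: the hit span is normalized
# to forward orientation, padded by capture_span on both sides, then
# clamped to the contig boundaries.
def _demo_capture_coords(q_start, q_stop, capture_span, contig_len):
    # normalize orientation, then pad both ends
    c_start = min(q_start, q_stop)-capture_span
    c_stop = max(q_start, q_stop)+capture_span
    # clamp to the contig boundaries
    c_start = max(c_start, 0)
    c_stop = min(c_stop, contig_len)
    return c_start, c_stop

# e.g. a reverse-orientation hit near the contig start:
# _demo_capture_coords(120, 80, 100, 500) -> (0, 220)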

def glomp_good_reads(dataset, bin_type):
    """Use matching reads lists to glomp good reads."""
    # Identify dataset
    nickname = dataset['nickname']
    # Identify input directory and filenames root
    match_dir_root = dirs['match_dir']+nickname+"/"
    # Signal process start
    print "-- Glomping matches against all references for", nickname, "--"
    print datetime.now()
    # sort the per-reference match lists into filter and rescue sets
    filter_files = []
    rescue_files = []
    for ref_nick in dataset['ref_nicks']:
        infile = match_dir_root+ref_nick+"/"+nickname+bin_type+"_match.npy"
        ref_type = [reference['type'] for reference in references
                    if reference['nickname'] == ref_nick]
        if ref_type[0] == 'filter':
            filter_files.append({'ref_nick': ref_nick, 'matches': infile})
        elif ref_type[0] == 'rescue':
            rescue_files.append({'ref_nick': ref_nick, 'matches': infile})
    # process filter files
    print "\tprocessing filter references", \
        [filter_file['ref_nick'] for filter_file in filter_files]
    filter_arrays = []
    for match_file in filter_files:
        filter_arrays.append(numpy.load(match_file['matches']))
    # fold the arrays into an intersection (IRA) and a union (URA)
    filter_IRA = filter_arrays[0]
    filter_URA = filter_arrays[0]
    for data_array in filter_arrays[1:]:
        filter_IRA = numpy.intersect1d(filter_IRA, data_array)
        filter_URA = numpy.union1d(filter_URA, data_array)
    print "\t\t"+str(len(filter_IRA)), "present in all filter references"
    print "\t\t"+str(len(filter_URA)), "matching reads all together (union)"
    # process rescue files
    print "\tprocessing rescue references", \
        [rescue_file['ref_nick'] for rescue_file in rescue_files]
    rescue_arrays = []
    for match_file in rescue_files:
        rescue_arrays.append(numpy.load(match_file['matches']))
    rescue_IRA = rescue_arrays[0]
    rescue_URA = rescue_arrays[0]
    for data_array in rescue_arrays[1:]:
        rescue_IRA = numpy.intersect1d(rescue_IRA, data_array)
        rescue_URA = numpy.union1d(rescue_URA, data_array)
    print "\t\t"+str(len(rescue_IRA)), "present in all rescue references"
    print "\t\t"+str(len(rescue_URA)), "matching reads all together (union)"
    # prepare for masking
    print "\tpreparing selection masks"
    q2a_file = dirs['mft_dir']+nickname+"/"+nickname+bin_type+"_track.txt"
    dtype = numpy.dtype([('title', 'S50'), ('bincode', 'S15')])
    pair_array = read_array(q2a_file, dtype, separator='\t')
    # create masking array, starting with everything accepted
    mask = numpy.ones(len(pair_array), bool)
    # filter out baddies - flip to False (a read and its mate go together:
    # read numbers are 1-based, so an even number is a /2 read whose mate
    # precedes it, and an odd number is a /1 read whose mate follows it)
    for item in filter_URA:
        mask[item-1] = False        # False means reject
        if item%2 == 0:
            mask[item-2] = False    # even numbers are /2, flip previous
        else:
            mask[item] = False      # odd numbers are /1, flip next
    # rescue goodies - flip to True (runs second, so rescue takes precedence)
    for item in rescue_URA:
        mask[item-1] = True         # True means accept
        if item%2 == 0:
            mask[item-2] = True     # even numbers are /2, flip previous
        else:
            mask[item] = True       # odd numbers are /1, flip next
    # save mask to file (True means keep, False means reject)
    mask_file = dirs['trim_dir']+nickname+bin_type+"_mask.npy"
    numpy.save(mask_file, mask)
    # separate the two sets and write to file
    bin_accept_titles = pair_array[mask]
    accept_file = dirs['select_dir']+nickname+bin_type+"_accept.npy"
    numpy.save(accept_file, bin_accept_titles)
    inv_mask = numpy.invert(mask)
    bin_reject_titles = pair_array[inv_mask]
    reject_file = dirs['select_dir']+nickname+bin_type+"_reject.npy"
    numpy.save(reject_file, bin_reject_titles)
    print "\t\t"+str(len(bin_accept_titles)), "reads to accept"
    print "\t\t"+str(len(bin_reject_titles)), "reads to reject"
    print "-- Done! --"
    print datetime.now()

def glomp_blast_out(dataset, ref_nick):
    """Consolidate Blast output files."""
    # Identify the genome
    nickname = dataset['nickname']
    # Determine the input file root
    root_dir = dirs['blast_out_dir']+nickname+"/"+ref_nick+"/"
    file_root = root_dir+nickname
    # Signal process start
    print "-- Consolidating B_out for", nickname, "against", ref_nick, "--"
    print datetime.now()
    # Cycle through bin types
    series_index = 0
    averages = []       # for comparing series later
    binned_pos = []
    for bin_type in bin_types:
        # collect all non-empty per-chunk Blast outputs for this bin type
        index = 1
        bin_arrays = []
        while os.path.isfile(file_root+bin_type+"_"+str(index)+"_blast.out"):
            infile = file_root+bin_type+"_"+str(index)+"_blast.out"
            rec_array = read_array(infile, blast_dtypes)
            if len(rec_array) > 0:
                bin_arrays.append(rec_array)
            index += 1
        print "\t\t"+str(len(bin_arrays)), "arrays for", \
            nickname+bin_type, "series"
        if len(bin_arrays) > 0:
            series = numpy.hstack(bin_arrays)
        else:
            series = []
        print "\t\t"+str(len(series)), "total records in", \
            nickname+bin_type, "series"
        # Save to file
        cons_outfile = file_root+bin_type+"_cons_out.npy"
        numpy.save(cons_outfile, series)
        # Evaluate match positions on reference
        positions = []
        match_read = []
        for row in series:
            # collect match read info while we're at it:
            # use regex to extract the query index
            query_pattern = re.compile(r'\w*_(\d*)')
            query_match = query_pattern.match(row[0])
            query_index = int(query_match.group(1))
            match_read.append(query_index)
            # use regex to extract the reference coordinate
            ref_pattern = re.compile(r'\w*_\d*_(\d*)')
            ref_match = ref_pattern.match(row[1])
            ref_pos = int(ref_match.group(1))
            pos_scaled = ref_pos/cpm['size']  # adjust to db segment length
            positions.append(pos_scaled)
        # uniquify the match read array
        unique_matches = numpy.unique(match_read)
        print "\t"+str(len(unique_matches)), "unique matches for", bin_type
        # write to file for future use
        match_dir_root = dirs['match_dir']+nickname+"/"+ref_nick+"/"
        ensure_dir(match_dir_root)
        match_outfile = match_dir_root+nickname+bin_type+"_match.npy"
        numpy.save(match_outfile, unique_matches)
        # now count occurrences per position
        pos_np = numpy.array(positions, dtype=int)
        binned = numpy.bincount(pos_np)
        binned_pos.append(binned)
        pos_count_average = numpy.average(binned)
        averages.append((pos_count_average, series_index))
        series_index += 1
    # compare series: plot the densest series first
    averages.sort(reverse=True)
    order_indices = [pair[1] for pair in averages]
    # identify reference
    ref_name = [reference['full_name'] for reference in references
                if reference['nickname'] == ref_nick][0]
    # prep directory & file
    fig_root = dirs['reports_dir']+"match_figs/"
    fig_file = fig_root+nickname+"_"+ref_nick+".png"
    ensure_dir(fig_root)
    # generate a figure
    pylot.autoscale(enable=True, axis='both', tight=True)
    pylot.xlabel('Position on the chromosome (/'+str(cpm['size'])+')')
    pylot.ylabel('Number of matches (includes multiples)')
    pylot.title(nickname+' matches to '+ref_name)
    pylot.grid(True)
    for index in order_indices:
        label_root = nickname+bin_types[index]
        label_str = label_root+" ("+str(numpy.sum(binned_pos[index]))+")"
        pylot.plot(binned_pos[index], label=label_str)
    pylot.legend(loc=1)
    pylot.savefig(fig_file, dpi=None, facecolor='w', edgecolor='w',
                  orientation='portrait', papertype=None, format=None)
    pylot.clf()
    print "\t"+str(series_index), "series consolidated and parsed"
    print "-- Done, see plot --"
    print datetime.now()
    return "OK"
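
# A minimal, self-contained sketch (toy values; the 1000 bp bin size is an
# assumption standing in for cpm['size']) of the position-binning step in
# glomp_blast_out: reference positions are integer-divided into bin indices
# and tallied with numpy.bincount.
def _demo_position_binning():
    bin_size = 1000                   # stand-in for cpm['size']
    ref_positions = [150, 980, 1020, 2500, 2990, 3001]
    scaled = [pos/bin_size for pos in ref_positions]  # int division (py2)
    binned = numpy.bincount(numpy.array(scaled))
    print binned  # -> [2 1 2 1]: matches per 1 kb bin along the reference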