def write_swarms(output_fasta_file, output_counts_file, output_map_file, min_samples, min_count):
    """Write the swarm FASTA, the per-sample counts table and the id-to-swarm map.

    A swarm is written only when ``dict_swarm_num_samples[swarm_id] >= min_samples``
    and ``dict_swarm_counts[swarm_id] >= min_count``.

    Args:
        output_fasta_file: FASTA output path; records go to stdout when empty.
        output_counts_file: Tab-delimited counts table path; skipped when empty.
        output_map_file: "swarm_id<TAB>member_id" map path; skipped when empty.
        min_samples: Minimum value of ``dict_swarm_num_samples`` for a swarm to be kept.
        min_count: Minimum value of ``dict_swarm_counts`` for a swarm to be kept.
    """
    # Swarms with no recorded sample number are treated as seen in one sample.
    for swarm_id in dict_swarm_counts:
        dict_swarm_num_samples.setdefault(swarm_id, 1)

    def passes_filters(swarm_id):
        return (dict_swarm_num_samples[swarm_id] >= min_samples
                and dict_swarm_counts[swarm_id] >= min_count)

    fasta_handle = sys.stdout
    if output_fasta_file:
        fasta_handle = happyfile.hopen_write_or_else(output_fasta_file)
        if verbose:
            print("Writing FASTA file: " + output_fasta_file, file=sys.stderr)
    for swarm_id in dict_swarm_counts:
        if passes_filters(swarm_id):
            print(">" + swarm_id + "\n" + dict_swarm_seq[swarm_id], file=fasta_handle)
    # Only close a handle this function opened: closing sys.stdout would make
    # every later write to stdout in the process fail.
    if output_fasta_file:
        fasta_handle.close()

    if output_counts_file:
        counts_handle = happyfile.hopen_write_or_else(output_counts_file)
        if verbose:
            print("Writing counts file: " + output_counts_file, file=sys.stderr)
        # Use the display name of a sample when one is registered.
        column_names = ['id'] + [dict_sample_name.get(name, name) for name in sample_list]
        print("\t".join(column_names), file=counts_handle)
        for swarm_id in dict_swarm_counts:
            if passes_filters(swarm_id):
                samplecounts = [dict_swarm_sample_counts.get((swarm_id, i), 0)
                                for i in range(len(sample_list))]
                print(swarm_id + "\t" + "\t".join(str(x) for x in samplecounts),
                      file=counts_handle)
        counts_handle.close()

    if output_map_file:
        map_handle = happyfile.hopen_write_or_else(output_map_file)
        if verbose:
            print("Writing map file: " + output_map_file, file=sys.stderr)
        # Sorting by swarm id keeps the members of one swarm on adjacent lines.
        for member_id in sorted(dict_id_swarm, key=dict_id_swarm.get):
            swarm_id = dict_id_swarm[member_id]
            if passes_filters(swarm_id):
                print(swarm_id + "\t" + member_id, file=map_handle)
        map_handle.close()
def write_swarms(output_counts_file):
    """Write the swarm counts table with best hit and taxonomy columns.

    Each row holds the swarm id, its entry in ``dict_swarm_best_hit`` (or ""),
    the taxonomy of that best hit (or "") and one count per entry of
    ``sample_list`` (0 when no count is recorded).

    Args:
        output_counts_file: Output path; the table goes to stdout when empty.
    """
    out_handle = sys.stdout
    if output_counts_file:
        out_handle = happyfile.hopen_write_or_else(output_counts_file)
        if verbose:
            # .write() instead of the Python 2-only "print >>handle" statement,
            # which raises TypeError under Python 3.
            sys.stderr.write("Writing counts file: " + output_counts_file + "\n")

    column_names = ['id', 'besthit', 'taxonomy']
    # Use the display name of a sample when one is registered.
    column_names.extend(dict_sample_name.get(name, name) for name in sample_list)
    out_handle.write("\t".join(column_names) + "\n")

    for swarm_id in dict_swarm_counts:
        besthit = dict_swarm_best_hit.get(swarm_id, "")
        tax = dict_id_taxonomy.get(besthit, "") if besthit else ""
        row = [swarm_id, besthit, tax]
        row.extend(dict_swarm_sample_counts.get((swarm_id, i), 0)
                   for i in range(len(sample_list)))
        out_handle.write("\t".join(str(x) for x in row) + "\n")

    # Leave sys.stdout open; only close a file opened here.
    if output_counts_file:
        out_handle.close()
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose id is in ``dict_derep_ids`` to a new file.

    The record id is the header text after ">" up to the first whitespace
    character. Lines are written with trailing whitespace stripped.

    Args:
        fasta_file: Input FASTA path.
        swarm_content_fasta_file: Output FASTA path.

    Returns:
        The number of records (header lines) copied.
    """
    swarm_content_size = 0
    in_handle = happyfile.hopen_or_else(fasta_file)
    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)
    try:
        out_handle = happyfile.hopen_write_or_else(swarm_content_fasta_file)
        if verbose:
            print("Writing swarm content FASTA file: " + swarm_content_fasta_file,
                  file=sys.stderr)
        try:
            write_out = False
            while True:
                line = in_handle.readline()
                if not line:
                    break
                line = line.rstrip()
                if line.startswith(">"):
                    # A header decides whether this whole record is copied.
                    seq_id = re.split(r'\s', line[1:])[0]
                    write_out = seq_id in dict_derep_ids
                    if write_out:
                        swarm_content_size += 1
                if write_out:
                    print(line, file=out_handle)
        finally:
            out_handle.close()
    finally:
        # Release the handles even when reading or writing fails part-way.
        in_handle.close()
    return swarm_content_size
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len, max_seq_len):
    """Read a FASTQ file record by record and pass each record to ``filter_line``.

    The file is read as 4-line records: line 1 supplies the id (text after the
    first character up to the first whitespace), line 2 the sequence and
    line 4 the quality string. A trailing record with fewer than four lines is
    never passed on.

    Args:
        fastq_file: Input FASTQ path.
        output_file: Output path; ``sys.stdout`` is used when empty.
        min_quality: Forwarded unchanged to ``filter_line``.
        min_seq_len: Forwarded unchanged to ``filter_line``.
        max_seq_len: Forwarded unchanged to ``filter_line``.
    """
    in_handle = happyfile.hopen_or_else(fastq_file)
    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    out_handle = sys.stdout
    if output_file:
        out_handle = happyfile.hopen_write_or_else(output_file)
        if verbose:
            print("Writing FASTA file: " + output_file, file=sys.stderr)

    rnum = 1  # position (1-4) of the current line within its record
    seq_id = ""
    seq = ""
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                # The quality line completes the record.
                filter_line(out_handle, seq_id, seq, line,
                            min_quality, min_seq_len, max_seq_len)
            rnum = rnum % 4 + 1
    finally:
        # The handles were previously left open; close what was opened here
        # but leave sys.stdout alone.
        in_handle.close()
        if output_file:
            out_handle.close()
def write_swarms(output_counts_file):
    """Emit the tab-delimited swarm counts table, annotated with best hit and taxonomy.

    Columns are: id, besthit, taxonomy, then one column per entry of
    ``sample_list``. The table is printed to stdout when ``output_counts_file``
    is empty, otherwise to that file.
    """
    to_file = bool(output_counts_file)
    sink = happyfile.hopen_write_or_else(output_counts_file) if to_file else sys.stdout
    if verbose and to_file:
        print("Writing counts file: " + output_counts_file, file=sys.stderr)

    header = ['id', 'besthit', 'taxonomy']
    # A registered display name replaces the raw sample name in the header.
    header += [dict_sample_name[name] if name in dict_sample_name else name
               for name in sample_list]
    print("\t".join(header), file=sink)

    sample_indices = range(len(sample_list))
    for swarm_id in dict_swarm_counts:
        best = dict_swarm_best_hit.get(swarm_id, "")
        taxonomy = dict_id_taxonomy.get(best, "") if best else ""
        fields = [swarm_id, best, taxonomy]
        fields += [dict_swarm_sample_counts.get((swarm_id, idx), 0)
                   for idx in sample_indices]
        print("\t".join(map(str, fields)), file=sink)

    if to_file:
        sink.close()
def write_group_counts(output_groups_file):
    """Write the per-sample counts of every group as a tab-delimited table.

    Groups are listed in descending order of their value in
    ``dict_group_counts``; a missing (group, sample index) count is written as 0.

    Args:
        output_groups_file: Output path; the table goes to stdout when empty.
    """
    out_handle = sys.stdout
    if output_groups_file:
        out_handle = happyfile.hopen_write_or_else(output_groups_file)
        if verbose:
            # .write() instead of the Python 2-only "print >>handle" statement,
            # which raises TypeError under Python 3.
            sys.stderr.write("Writing group counts file: " + output_groups_file + "\n")

    column_names = ['group'] + list(sample_list)
    out_handle.write("\t".join(column_names) + "\n")

    for group_name in sorted(dict_group_counts, key=dict_group_counts.get, reverse=True):
        row = [group_name]
        row.extend(dict_group_sample_counts.get((group_name, i), 0)
                   for i in range(len(sample_list)))
        out_handle.write("\t".join(str(x) for x in row) + "\n")

    # Leave sys.stdout open; only close a file opened here.
    if output_groups_file:
        out_handle.close()
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose id is in ``dict_derep_ids`` to a new file.

    The record id is the header text after ">" up to the first whitespace
    character. Lines are written with trailing whitespace stripped.

    Args:
        fasta_file: Input FASTA path.
        swarm_content_fasta_file: Output FASTA path.

    Returns:
        The number of records (header lines) copied.
    """
    swarm_content_size = 0
    in_handle = happyfile.hopen_or_else(fasta_file)
    # .write() is used throughout instead of the Python 2-only
    # "print >>handle" statement, which raises TypeError under Python 3.
    if verbose:
        sys.stderr.write("Reading FASTA file: " + fasta_file + "\n")
    try:
        out_handle = happyfile.hopen_write_or_else(swarm_content_fasta_file)
        if verbose:
            sys.stderr.write("Writing swarm content FASTA file: "
                             + swarm_content_fasta_file + "\n")
        try:
            write_out = False
            while True:
                line = in_handle.readline()
                if not line:
                    break
                line = line.rstrip()
                if line.startswith(">"):
                    # A header decides whether this whole record is copied.
                    seq_id = re.split(r'\s', line[1:])[0]
                    write_out = seq_id in dict_derep_ids
                    if write_out:
                        swarm_content_size += 1
                if write_out:
                    out_handle.write(line + "\n")
        finally:
            out_handle.close()
    finally:
        # Release the handles even when reading or writing fails part-way.
        in_handle.close()
    return swarm_content_size
def write_group_counts(output_groups_file):
    """Emit one tab-delimited row of per-sample counts for each group.

    Rows are ordered by descending ``dict_group_counts`` value. Output goes to
    ``output_groups_file`` when given, otherwise to stdout.
    """
    writing_to_file = bool(output_groups_file)
    if writing_to_file:
        sink = happyfile.hopen_write_or_else(output_groups_file)
    else:
        sink = sys.stdout
    if verbose and writing_to_file:
        xprint("Writing group counts file: " + output_groups_file)

    header = ['group'] + list(sample_list)
    sink.write("\t".join(header) + "\n")

    sample_indices = range(len(sample_list))
    ranked_groups = sorted(dict_group_counts, key=dict_group_counts.get, reverse=True)
    for group_name in ranked_groups:
        row = [group_name]
        # Samples with no recorded count for this group contribute 0.
        row += [dict_group_sample_counts.get((group_name, idx), 0)
                for idx in sample_indices]
        sink.write("\t".join(map(str, row)) + "\n")

    if writing_to_file:
        sink.close()
def write_swarms(output_fasta_file, output_counts_file, output_map_file, min_samples, min_count):
    """Write the swarm FASTA, the per-sample counts table and the id-to-swarm map.

    A swarm is written only when ``dict_swarm_num_samples[swarm_id] >= min_samples``
    and ``dict_swarm_counts[swarm_id] >= min_count``.

    Args:
        output_fasta_file: FASTA output path; records go to stdout when empty.
        output_counts_file: Tab-delimited counts table path; skipped when empty.
        output_map_file: "swarm_id<TAB>member_id" map path; skipped when empty.
        min_samples: Minimum value of ``dict_swarm_num_samples`` for a swarm to be kept.
        min_count: Minimum value of ``dict_swarm_counts`` for a swarm to be kept.
    """
    # Swarms with no recorded sample number are treated as seen in one sample.
    for swarm_id in dict_swarm_counts:
        dict_swarm_num_samples.setdefault(swarm_id, 1)

    def passes_filters(swarm_id):
        return (dict_swarm_num_samples[swarm_id] >= min_samples
                and dict_swarm_counts[swarm_id] >= min_count)

    fasta_handle = sys.stdout
    if output_fasta_file:
        fasta_handle = happyfile.hopen_write_or_else(output_fasta_file)
        if verbose:
            print("Writing FASTA file: " + output_fasta_file, file=sys.stderr)
    for swarm_id in dict_swarm_counts:
        if passes_filters(swarm_id):
            print(">" + swarm_id + "\n" + dict_swarm_seq[swarm_id], file=fasta_handle)
    # Only close a handle this function opened: closing sys.stdout would make
    # every later write to stdout in the process fail.
    if output_fasta_file:
        fasta_handle.close()

    if output_counts_file:
        counts_handle = happyfile.hopen_write_or_else(output_counts_file)
        if verbose:
            print("Writing counts file: " + output_counts_file, file=sys.stderr)
        # Use the display name of a sample when one is registered.
        column_names = ['id'] + [dict_sample_name.get(name, name) for name in sample_list]
        print("\t".join(column_names), file=counts_handle)
        for swarm_id in dict_swarm_counts:
            if passes_filters(swarm_id):
                samplecounts = [dict_swarm_sample_counts.get((swarm_id, i), 0)
                                for i in range(len(sample_list))]
                print(swarm_id + "\t" + "\t".join(str(x) for x in samplecounts),
                      file=counts_handle)
        counts_handle.close()

    if output_map_file:
        map_handle = happyfile.hopen_write_or_else(output_map_file)
        if verbose:
            print("Writing map file: " + output_map_file, file=sys.stderr)
        # Sorting by swarm id keeps the members of one swarm on adjacent lines.
        for member_id in sorted(dict_id_swarm, key=dict_id_swarm.get):
            swarm_id = dict_id_swarm[member_id]
            if passes_filters(swarm_id):
                print(swarm_id + "\t" + member_id, file=map_handle)
        map_handle.close()
def write_purity(output_swarm_content_tax_file, output_swarm_purity_file, output_purity_pdf):
    """Write the swarm purity tables and run the R plotting script on them.

    Steps:
      1. When ``output_swarm_content_tax_file`` is given, list every id of
         ``dict_derep_ids`` with its swarm, best hit and best-hit taxonomy.
      2. Write, per swarm, its total size, the size sharing the seed's
         taxonomy, and the ratio of the two to ``output_swarm_purity_file``.
      3. Run ``R_script`` with the purity file and ``output_purity_pdf`` as
         arguments; exit with status 2 when the command fails.

    Ids are expected to look like "<key>_<size>"; the size weights each id.
    """
    if output_swarm_content_tax_file:
        tax_handle = happyfile.hopen_write_or_else(output_swarm_content_tax_file)
        if verbose:
            print("Writing swarm content taxonomy file: " + output_swarm_content_tax_file,
                  file=sys.stderr)
        print("\t".join(['id', 'swarm', 'besthit', 'taxonomy']), file=tax_handle)
        for derep_id in dict_derep_ids:
            hit = dict_id_best_hit.get(derep_id, "")
            row = [
                derep_id,
                dict_id_swarm.get(derep_id, ""),
                hit,
                dict_id_taxonomy.get(hit, "") if hit else "",
            ]
            print("\t".join(row), file=tax_handle)
        tax_handle.close()

    purity_handle = happyfile.hopen_write_or_else(output_swarm_purity_file)
    if verbose:
        print("Writing swarm purity file: " + output_swarm_purity_file, file=sys.stderr)

    swarm_total = {}     # swarm id -> summed size of all members
    swarm_matching = {}  # swarm id -> summed size of members sharing the seed taxonomy
    for derep_id in dict_derep_ids:
        _, size_text = derep_id.split('_')[:2]
        owner = dict_id_swarm.get(derep_id, "")
        abundance = max(int(size_text), 1)  # every id weighs at least 1
        if not owner:
            continue
        swarm_total[owner] = swarm_total.get(owner, 0) + abundance

        if derep_id == owner:
            # The seed always agrees with itself.
            same_taxon = True
        else:
            same_taxon = False
            hit = dict_id_best_hit.get(derep_id, "")
            owner_hit = dict_id_best_hit.get(owner, "")
            if hit and owner_hit:
                member_tax = dict_id_taxonomy.get(hit, "")
                owner_tax = dict_id_taxonomy.get(owner_hit, "")
                same_taxon = bool(member_tax) and member_tax == owner_tax
        if same_taxon:
            swarm_matching[owner] = swarm_matching.get(owner, 0) + abundance

    print("\t".join(['swarm_id', 'taxonomy', 'size', 'same_taxon', 'purity']),
          file=purity_handle)
    pure_swarms = 0
    for owner, total in swarm_total.items():
        matching = swarm_matching.get(owner, 0)
        # NOTE(review): taxonomy is looked up by swarm id here, while elsewhere
        # it is looked up by best-hit id -- confirm this is intended.
        fields = [
            owner,
            dict_id_taxonomy.get(owner, ""),
            str(total),
            str(matching),
            str(1.0 * matching / total),
        ]
        print("\t".join(fields), file=purity_handle)
        if matching == total:
            pure_swarms += 1
    purity_handle.close()

    swarm_number = len(swarm_total)
    if swarm_number:
        percent_pure = round(100.0 * pure_swarms / swarm_number, 1)
        print("OTUs 100% purity: " + str(pure_swarms) + " / " + str(swarm_number)
              + " (" + str(percent_pure) + "%)", file=sys.stderr)

    cmd = " ".join([R_script, output_swarm_purity_file, output_purity_pdf])
    if verbose:
        print(cmd, file=sys.stderr)
    if os.system(cmd + " >/dev/null") != 0:
        print("[purity_plot] ERROR: " + R_script, file=sys.stderr)
        sys.exit(2)
def _split_fasta_by_plastid(fasta_path, keep_path, plastid_path, plastid_ids):
    """Write each FASTA record to plastid_path or keep_path depending on its header id."""
    in_handle = happyfile.hopen_or_else(fasta_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    seq_id = ""
    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()
        if line.startswith(">"):
            seq_id = re.split(r'\s', line[1:])[0]
        # Lines seen before any usable header id are dropped.
        if seq_id:
            target = plastid_handle if seq_id in plastid_ids else keep_handle
            target.write(line + "\n")
    in_handle.close()
    keep_handle.close()
    plastid_handle.close()


def _split_counts_by_plastid(counts_path, keep_path, plastid_path, plastid_ids):
    """Copy the header to both outputs, then route each row by its first column."""
    in_handle = happyfile.hopen_or_else(counts_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    is_header = True
    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()
        if is_header:
            keep_handle.write(line + "\n")
            plastid_handle.write(line + "\n")
            is_header = False
        elif line.split('\t')[0] in plastid_ids:
            plastid_handle.write(line + "\n")
        else:
            keep_handle.write(line + "\n")
    in_handle.close()
    keep_handle.close()
    plastid_handle.close()


def remove_plastid_seqs(output_base_file):
    """Move chloroplast swarms out of the derep/swarm files into ".plastid" files.

    Swarms whose taxonomy (third column of "<base>.swarm.tax") starts with
    "Bacteria;Cyanobacteria;Chloroplast" and all ids listed on their line of
    "<base>.swarm" are written to "<base>.plastid.*" files. The original
    ".derep.fa", ".derep.counts", ".swarm", ".swarm.tax", ".swarm.fa" and
    ".swarm.counts" files are then replaced by versions without those entries.

    Nothing is done when "<base>.plastid.swarm.fa" exists and ``overwrite`` is
    false. Exits with status 2 if a temporary output file is missing.

    Args:
        output_base_file: Common path prefix of all input and output files.
    """
    plastid_ids = set()
    swarm_tax = output_base_file + ".swarm.tax"
    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"
    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"
    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    # .write() is used throughout instead of the Python 2-only
    # "print >>handle" statement, which raises TypeError under Python 3.
    if verbose:
        sys.stderr.write("Filtering chloroplast sequences\n")

    # split 16S/Plastid swarm taxonomy table
    in_handle_tax = happyfile.hopen_or_else(swarm_tax)
    out_handle_16S_tax = happyfile.hopen_write_or_else(tmp_swarm_16S_tax)
    is_header = True
    while True:
        line = in_handle_tax.readline()
        if not line:
            break
        line = line.rstrip()
        cols = line.split('\t')
        # Rows with fewer than three columns cannot be chloroplast hits and
        # are kept, instead of raising IndexError.
        if (not is_header and len(cols) > 2
                and cols[2].startswith('Bacteria;Cyanobacteria;Chloroplast')):
            plastid_ids.add(cols[0])
        else:
            out_handle_16S_tax.write(line + "\n")
        is_header = False
    in_handle_tax.close()
    out_handle_16S_tax.close()

    # split 16S/Plastid swarm file
    in_handle_table = happyfile.hopen_or_else(swarm_table)
    out_handle_16S_table = happyfile.hopen_write_or_else(tmp_swarm_16S_table)
    out_handle_plastid_table = happyfile.hopen_write_or_else(swarm_plastid_table)
    while True:
        line = in_handle_table.readline()
        if not line:
            break
        line = line.rstrip()
        id_list = re.split(r'\s', line)
        if id_list[0] in plastid_ids:
            out_handle_plastid_table.write(line + "\n")
            # Every id on a plastid swarm's line is a plastid sequence.
            plastid_ids.update(id_list)
        else:
            out_handle_16S_table.write(line + "\n")
    in_handle_table.close()
    out_handle_16S_table.close()
    out_handle_plastid_table.close()

    # split 16S/Plastid derep and swarm FASTA files and counts tables
    _split_fasta_by_plastid(derep_fa, tmp_derep_16S_fa, derep_plastid_fa, plastid_ids)
    _split_counts_by_plastid(derep_counts, tmp_derep_16S_counts, derep_plastid_counts,
                             plastid_ids)
    _split_fasta_by_plastid(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa, plastid_ids)
    _split_counts_by_plastid(swarm_counts, tmp_swarm_16S_counts, swarm_plastid_counts,
                             plastid_ids)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        for tmp_path, final_path in replacements:
            replace_file(tmp_path, final_path)
    else:
        sys.stderr.write("Not all tmp_ files found\n")
        sys.exit(2)
def write_dereps(output_fasta_file, output_counts_file, output_map_file, id_format, min_samples, min_count):
    """Write the dereplicated FASTA, the per-file counts table and the id map.

    A key of ``dict_id_counts`` is written only when it has a positive count in
    at least ``min_samples`` of ``good_fasta_files`` and a total count of at
    least ``min_count``.

    Args:
        output_fasta_file: FASTA output path; records go to stdout when empty.
        output_counts_file: Tab-delimited counts table path; skipped when empty.
        output_map_file: "output_id<TAB>original_id" map path; skipped when empty.
        id_format: ``Format.swarm`` names records "<key>_<count>";
            ``Format.bestid`` names them after the first original id mapped to
            the key.
        min_samples: Minimum number of files with a positive count.
        min_count: Minimum total count.
    """
    dict_bestid = {}
    dict_id_num_samples = {}
    for key in dict_id_counts:
        num_samples = sum(1 for filenum in range(len(good_fasta_files))
                          if dict_id_file_counts.get((key, filenum), 0) > 0)
        if num_samples:
            dict_id_num_samples[key] = num_samples

    def passes_filters(key):
        return (dict_id_num_samples.get(key, 0) >= min_samples
                and dict_id_counts[key] >= min_count)

    out_handle1 = sys.stdout
    if output_fasta_file:
        out_handle1 = happyfile.hopen_write_or_else(output_fasta_file)
        if verbose:
            xprint("Writing FASTA file: " + output_fasta_file)

    if id_format == Format.bestid:
        # The first original id seen for a key becomes its output id.
        for orig_id in dict_id_map:
            key = dict_id_map[orig_id]
            if key not in dict_bestid and dict_id_counts.get(key, 0) > 0:
                dict_bestid[key] = orig_id

    for key in dict_id_counts:
        if passes_filters(key) and key in dict_id_seq:
            if id_format == Format.swarm:
                out_handle1.write(">" + key + "_" + str(dict_id_counts[key]) + "\n"
                                  + dict_id_seq[key] + "\n")
            elif id_format == Format.bestid and key in dict_bestid:
                out_handle1.write(">" + dict_bestid[key] + "\n" + dict_id_seq[key] + "\n")
    # Only close a handle this function opened: closing sys.stdout would make
    # every later write to stdout in the process fail.
    if output_fasta_file:
        out_handle1.close()

    if output_counts_file:
        out_handle2 = happyfile.hopen_write_or_else(output_counts_file)
        if verbose:
            xprint("Writing counts file: " + output_counts_file)
        column_names = ['id']
        for file in good_fasta_files:
            if file in dict_sample_name:
                column_names.append(dict_sample_name[file])
            else:
                column_names.append(re.sub(r'\.filtered\.fa$', '', file))
        out_handle2.write("\t".join(column_names) + "\n")
        for key in dict_id_counts:
            if not passes_filters(key):
                continue
            out_id = key + "_" + str(dict_id_counts[key])
            if id_format == Format.bestid:
                # Keys without a best id are skipped, as in the FASTA output,
                # instead of raising KeyError.
                if key not in dict_bestid:
                    continue
                out_id = re.split(r'\s', dict_bestid[key])[0]
            samplecounts = [dict_id_file_counts.get((key, filenum), 0)
                            for filenum in range(len(good_fasta_files))]
            out_handle2.write(out_id + "\t" + "\t".join(str(x) for x in samplecounts) + "\n")
        out_handle2.close()

    if output_map_file:
        out_handle3 = happyfile.hopen_write_or_else(output_map_file)
        if verbose:
            xprint("Writing map file: " + output_map_file)
        # Sorting by key keeps the ids of one dereplicated sequence adjacent.
        for orig_id in sorted(dict_id_map, key=dict_id_map.get):
            key = dict_id_map[orig_id]
            if not passes_filters(key):
                continue
            if id_format == Format.swarm:
                out_handle3.write(key + "_" + str(dict_id_counts[key]) + "\t" + orig_id + "\n")
            elif id_format == Format.bestid and key in dict_bestid:
                out_handle3.write(re.split(r'\s', dict_bestid[key])[0] + "\t" + orig_id + "\n")
        out_handle3.close()
def write_dereps(output_fasta_file, output_counts_file, output_map_file, id_format, min_samples, min_count):
    """Write the dereplicated FASTA, the per-file counts table and the id map.

    A key of ``dict_id_counts`` is written only when it has a positive count in
    at least ``min_samples`` of ``good_fasta_files`` and a total count of at
    least ``min_count``.

    Args:
        output_fasta_file: FASTA output path; records go to stdout when empty.
        output_counts_file: Tab-delimited counts table path; skipped when empty.
        output_map_file: "output_id<TAB>original_id" map path; skipped when empty.
        id_format: ``Format.swarm`` names records "<key>_<count>";
            ``Format.bestid`` names them after the first original id mapped to
            the key.
        min_samples: Minimum number of files with a positive count.
        min_count: Minimum total count.
    """
    dict_bestid = {}
    dict_id_num_samples = {}
    for key in dict_id_counts:
        num_samples = sum(1 for filenum in range(len(good_fasta_files))
                          if dict_id_file_counts.get((key, filenum), 0) > 0)
        if num_samples:
            dict_id_num_samples[key] = num_samples

    def passes_filters(key):
        return (dict_id_num_samples.get(key, 0) >= min_samples
                and dict_id_counts[key] >= min_count)

    out_handle1 = sys.stdout
    if output_fasta_file:
        out_handle1 = happyfile.hopen_write_or_else(output_fasta_file)
        if verbose:
            print("Writing FASTA file: " + output_fasta_file, file=sys.stderr)

    if id_format == Format.bestid:
        # The first original id seen for a key becomes its output id.
        for orig_id in dict_id_map:
            key = dict_id_map[orig_id]
            if key not in dict_bestid and dict_id_counts.get(key, 0) > 0:
                dict_bestid[key] = orig_id

    for key in dict_id_counts:
        if passes_filters(key) and key in dict_id_seq:
            if id_format == Format.swarm:
                print(">" + key + "_" + str(dict_id_counts[key]) + "\n" + dict_id_seq[key],
                      file=out_handle1)
            elif id_format == Format.bestid and key in dict_bestid:
                print(">" + dict_bestid[key] + "\n" + dict_id_seq[key], file=out_handle1)
    # Only close a handle this function opened: closing sys.stdout would make
    # every later write to stdout in the process fail.
    if output_fasta_file:
        out_handle1.close()

    if output_counts_file:
        out_handle2 = happyfile.hopen_write_or_else(output_counts_file)
        if verbose:
            print("Writing counts file: " + output_counts_file, file=sys.stderr)
        column_names = ['id']
        for file in good_fasta_files:
            if file in dict_sample_name:
                column_names.append(dict_sample_name[file])
            else:
                column_names.append(re.sub(r'\.filtered\.fa$', '', file))
        print("\t".join(column_names), file=out_handle2)
        for key in dict_id_counts:
            if not passes_filters(key):
                continue
            out_id = key + "_" + str(dict_id_counts[key])
            if id_format == Format.bestid:
                # Keys without a best id are skipped, as in the FASTA output,
                # instead of raising KeyError.
                if key not in dict_bestid:
                    continue
                out_id = re.split(r'\s', dict_bestid[key])[0]
            samplecounts = [dict_id_file_counts.get((key, filenum), 0)
                            for filenum in range(len(good_fasta_files))]
            print(out_id + "\t" + "\t".join(str(x) for x in samplecounts), file=out_handle2)
        out_handle2.close()

    if output_map_file:
        out_handle3 = happyfile.hopen_write_or_else(output_map_file)
        if verbose:
            print("Writing map file: " + output_map_file, file=sys.stderr)
        # Sorting by key keeps the ids of one dereplicated sequence adjacent.
        for orig_id in sorted(dict_id_map, key=dict_id_map.get):
            key = dict_id_map[orig_id]
            if not passes_filters(key):
                continue
            if id_format == Format.swarm:
                print(key + "_" + str(dict_id_counts[key]) + "\t" + orig_id, file=out_handle3)
            elif id_format == Format.bestid and key in dict_bestid:
                print(re.split(r'\s', dict_bestid[key])[0] + "\t" + orig_id, file=out_handle3)
        out_handle3.close()
def _split_fasta_by_plastid(fasta_path, keep_path, plastid_path, plastid_ids):
    """Write each FASTA record to plastid_path or keep_path depending on its header id."""
    in_handle = happyfile.hopen_or_else(fasta_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    seq_id = ""
    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()
        if line.startswith(">"):
            seq_id = re.split(r'\s', line[1:])[0]
        # Lines seen before any usable header id are dropped.
        if seq_id:
            target = plastid_handle if seq_id in plastid_ids else keep_handle
            target.write(line + "\n")
    in_handle.close()
    keep_handle.close()
    plastid_handle.close()


def _split_counts_by_plastid(counts_path, keep_path, plastid_path, plastid_ids):
    """Copy the header to both outputs, then route each row by its first column."""
    in_handle = happyfile.hopen_or_else(counts_path)
    keep_handle = happyfile.hopen_write_or_else(keep_path)
    plastid_handle = happyfile.hopen_write_or_else(plastid_path)
    is_header = True
    while True:
        line = in_handle.readline()
        if not line:
            break
        line = line.rstrip()
        if is_header:
            keep_handle.write(line + "\n")
            plastid_handle.write(line + "\n")
            is_header = False
        elif line.split('\t')[0] in plastid_ids:
            plastid_handle.write(line + "\n")
        else:
            keep_handle.write(line + "\n")
    in_handle.close()
    keep_handle.close()
    plastid_handle.close()


def remove_plastid_seqs(output_base_file):
    """Move chloroplast swarms out of the derep/swarm files into ".plastid" files.

    Swarms whose taxonomy (third column of "<base>.swarm.tax") starts with
    "Bacteria;Cyanobacteria;Chloroplast" and all ids listed on their line of
    "<base>.swarm" are written to "<base>.plastid.*" files. The original
    ".derep.fa", ".derep.counts", ".swarm", ".swarm.tax", ".swarm.fa" and
    ".swarm.counts" files are then replaced by versions without those entries.

    Nothing is done when "<base>.plastid.swarm.fa" exists and ``overwrite`` is
    false. Exits with status 2 if a temporary output file is missing.

    Args:
        output_base_file: Common path prefix of all input and output files.
    """
    plastid_ids = set()
    swarm_tax = output_base_file + ".swarm.tax"
    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"
    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"
    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    # .write() is used throughout instead of the Python 2-only
    # "print >>handle" statement, which raises TypeError under Python 3.
    if verbose:
        sys.stderr.write("Filtering chloroplast sequences\n")

    # split 16S/Plastid swarm taxonomy table
    in_handle_tax = happyfile.hopen_or_else(swarm_tax)
    out_handle_16S_tax = happyfile.hopen_write_or_else(tmp_swarm_16S_tax)
    is_header = True
    while True:
        line = in_handle_tax.readline()
        if not line:
            break
        line = line.rstrip()
        cols = line.split('\t')
        # Rows with fewer than three columns cannot be chloroplast hits and
        # are kept, instead of raising IndexError.
        if (not is_header and len(cols) > 2
                and cols[2].startswith('Bacteria;Cyanobacteria;Chloroplast')):
            plastid_ids.add(cols[0])
        else:
            out_handle_16S_tax.write(line + "\n")
        is_header = False
    in_handle_tax.close()
    out_handle_16S_tax.close()

    # split 16S/Plastid swarm file
    in_handle_table = happyfile.hopen_or_else(swarm_table)
    out_handle_16S_table = happyfile.hopen_write_or_else(tmp_swarm_16S_table)
    out_handle_plastid_table = happyfile.hopen_write_or_else(swarm_plastid_table)
    while True:
        line = in_handle_table.readline()
        if not line:
            break
        line = line.rstrip()
        id_list = re.split(r'\s', line)
        if id_list[0] in plastid_ids:
            out_handle_plastid_table.write(line + "\n")
            # Every id on a plastid swarm's line is a plastid sequence.
            plastid_ids.update(id_list)
        else:
            out_handle_16S_table.write(line + "\n")
    in_handle_table.close()
    out_handle_16S_table.close()
    out_handle_plastid_table.close()

    # split 16S/Plastid derep and swarm FASTA files and counts tables
    _split_fasta_by_plastid(derep_fa, tmp_derep_16S_fa, derep_plastid_fa, plastid_ids)
    _split_counts_by_plastid(derep_counts, tmp_derep_16S_counts, derep_plastid_counts,
                             plastid_ids)
    _split_fasta_by_plastid(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa, plastid_ids)
    _split_counts_by_plastid(swarm_counts, tmp_swarm_16S_counts, swarm_plastid_counts,
                             plastid_ids)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        for tmp_path, final_path in replacements:
            replace_file(tmp_path, final_path)
    else:
        sys.stderr.write("Not all tmp_ files found\n")
        sys.exit(2)
def write_purity(output_swarm_content_tax_file, output_swarm_purity_file, output_purity_pdf):
    """Write the swarm purity tables and run the R plotting script on them.

    Steps:
      1. When ``output_swarm_content_tax_file`` is given, list every id of
         ``dict_derep_ids`` with its swarm, best hit and best-hit taxonomy.
      2. Write, per swarm, its total size, the size sharing the seed's
         taxonomy, and the ratio of the two to ``output_swarm_purity_file``.
      3. Run ``R_script`` with the purity file and ``output_purity_pdf`` as
         arguments; exit with status 2 when the command fails.

    Ids are expected to look like "<key>_<size>"; the size weights each id.

    Args:
        output_swarm_content_tax_file: Per-id taxonomy table path; skipped when empty.
        output_swarm_purity_file: Per-swarm purity table path.
        output_purity_pdf: Path handed to the R script as its second argument.
    """
    # .write() is used throughout instead of the Python 2-only
    # "print >>handle" statement, which raises TypeError under Python 3.
    if output_swarm_content_tax_file:
        out_handle1 = happyfile.hopen_write_or_else(output_swarm_content_tax_file)
        if verbose:
            sys.stderr.write("Writing swarm content taxonomy file: "
                             + output_swarm_content_tax_file + "\n")
        out_handle1.write("\t".join(['id', 'swarm', 'besthit', 'taxonomy']) + "\n")
        for derep_id in dict_derep_ids:
            swarm_id = dict_id_swarm.get(derep_id, "")
            besthit = dict_id_best_hit.get(derep_id, "")
            tax = dict_id_taxonomy.get(besthit, "") if besthit else ""
            out_handle1.write("\t".join([derep_id, swarm_id, besthit, tax]) + "\n")
        out_handle1.close()

    out_handle2 = happyfile.hopen_write_or_else(output_swarm_purity_file)
    if verbose:
        sys.stderr.write("Writing swarm purity file: " + output_swarm_purity_file + "\n")

    count_all = {}       # swarm id -> summed size of all members
    count_same_tax = {}  # swarm id -> summed size of members sharing the seed taxonomy
    for derep_id in dict_derep_ids:
        _, id_size = derep_id.split('_')[:2]
        swarm_id = dict_id_swarm.get(derep_id, "")
        derep_size = max(int(id_size), 1)  # every id weighs at least 1
        if not swarm_id:
            continue
        count_all[swarm_id] = count_all.get(swarm_id, 0) + derep_size

        if derep_id == swarm_id:
            # The seed always agrees with itself.
            same_taxon = True
        else:
            same_taxon = False
            besthit = dict_id_best_hit.get(derep_id, "")
            besthit_swarm = dict_id_best_hit.get(swarm_id, "")
            if besthit and besthit_swarm:
                id_tax = dict_id_taxonomy.get(besthit, "")
                swarm_tax = dict_id_taxonomy.get(besthit_swarm, "")
                same_taxon = bool(id_tax) and id_tax == swarm_tax
        if same_taxon:
            count_same_tax[swarm_id] = count_same_tax.get(swarm_id, 0) + derep_size

    out_handle2.write("\t".join(['swarm_id', 'taxonomy', 'size', 'same_taxon', 'purity']) + "\n")
    count_pure_OTUs = 0
    for swarm_id in count_all:
        total = count_all[swarm_id]
        same = count_same_tax.get(swarm_id, 0)
        # NOTE(review): taxonomy is looked up by swarm id here, while elsewhere
        # it is looked up by best-hit id -- confirm this is intended.
        out_handle2.write("\t".join([swarm_id,
                                     dict_id_taxonomy.get(swarm_id, ""),
                                     str(total),
                                     str(same),
                                     str(1.0 * same / total)]) + "\n")
        if same == total:
            count_pure_OTUs += 1
    out_handle2.close()

    if count_all:
        sys.stderr.write("OTUs 100% purity: " + str(count_pure_OTUs) + " / "
                         + str(len(count_all)) + " ("
                         + str(round(100.0 * count_pure_OTUs / len(count_all), 1))
                         + "%)\n")

    cmd = " ".join([R_script, output_swarm_purity_file, output_purity_pdf])
    if verbose:
        sys.stderr.write(cmd + "\n")
    rc = os.system(cmd + " >/dev/null")
    if rc != 0:
        sys.stderr.write("[purity_plot] ERROR: " + R_script + "\n")
        sys.exit(2)