def get_taxonomy(fasta_file, ggsearch_file, database_file, cpus):
    """Collect the best search hit per query and the database taxonomy strings.

    When ``fasta_file`` is given and ``ggsearch_file`` does not exist yet,
    ``glsearch36`` is run through the shell to create it.  The tab-separated
    result is then read into ``dict_swarm_best_hit`` / ``dict_swarm_best_bs``
    (subject id and bit score of the highest-scoring row per query id), and
    every ``>id description`` header of ``database_file`` is stored in
    ``dict_id_taxonomy`` with ``|`` replaced by ``;`` and ``+`` by a space.

    Args:
        fasta_file: Query FASTA path; a falsy value skips running the search.
        ggsearch_file: Search output path (created when missing, then read).
        database_file: Reference FASTA given to the search and parsed for
            its header lines.
        cpus: Thread count passed to ``-T``; values below 1 are raised to 1.

    Raises:
        SystemExit: With status 2 when the search command returns non-zero.
    """
    global dict_swarm_best_hit
    global dict_swarm_best_bs
    global dict_id_taxonomy

    if fasta_file:
        if os.path.exists(ggsearch_file):
            if verbose:
                print("Ignoring FASTA file: " + fasta_file, file=sys.stderr)
        else:
            print("[swarm_classify_taxonomy] running ggsearch", file=sys.stderr)
            if cpus < 1:
                cpus = 1
            cmd = " ".join(["glsearch36 -b 1 -m 8 -T", str(cpus), fasta_file,
                            database_file, ">", ggsearch_file])
            if verbose:
                print(cmd, file=sys.stderr)
            rc = os.system(cmd)
            if rc != 0:
                print("[swarm_classify_taxonomy] ERROR: ggsearch", file=sys.stderr)
                sys.exit(2)

    in_handle1 = happyfile.hopen_or_else(ggsearch_file)
    if verbose:
        print("Reading ggsearch file: " + ggsearch_file, file=sys.stderr)
    try:
        while True:
            line = in_handle1.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines carry no hit; skip them instead of failing the unpack.
                continue
            qid, sid, pid, alen, mm, go, qs, qe, ss, se, e, bs = line.split("\t")[:12]
            # Scores arrive as text.  Compare them numerically so that, e.g.,
            # "99.5" does not outrank "120.3"; the stored value stays a string.
            if qid not in dict_swarm_best_bs or float(bs) > float(dict_swarm_best_bs[qid]):
                dict_swarm_best_hit[qid] = sid
                dict_swarm_best_bs[qid] = bs
    finally:
        in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(database_file)
    if verbose:
        print("Reading database file: " + database_file, file=sys.stderr)
    try:
        while True:
            line = in_handle2.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    taxstr = m.group(2).replace('|', ';').replace('+', ' ')
                    dict_id_taxonomy[m.group(1)] = taxstr
    finally:
        in_handle2.close()
def get_swarms(fasta_file, swarm_file, cpus):
    """Run swarm if needed and map every sequence id to its swarm seed id.

    When ``fasta_file`` is given and ``swarm_file`` does not exist, ``swarm``
    is run through the shell to create it.  Every FASTA header id is first
    mapped to itself in ``dict_id_swarm``; then each line of ``swarm_file``
    maps all of its whitespace-separated ids to the first id on that line.

    Args:
        fasta_file: FASTA path used as swarm input and read for its ids.
        swarm_file: Swarm output path (created when missing, then read).
        cpus: Thread count passed to ``-t``; values below 1 are raised to 1.

    Raises:
        SystemExit: With status 2 when the swarm command returns non-zero.
    """
    global dict_id_swarm

    if fasta_file and not os.path.exists(swarm_file):
        print("[swarm_map] running swarm", file=sys.stderr)
        if cpus < 1:
            cpus = 1
        cmd = " ".join(["swarm -f -t", str(cpus), "-o", swarm_file, fasta_file])
        if verbose:
            print(cmd, file=sys.stderr)
        else:
            # os.system() runs /bin/sh.  "&>" is a bash extension that a POSIX
            # shell parses as "run in background", which would make rc always 0.
            cmd += " >/dev/null 2>&1"
        rc = os.system(cmd)
        if rc != 0:
            print("[swarm_map] ERROR: swarm", file=sys.stderr)
            sys.exit(2)

    # NOTE(review): indentation was lost in this copy of the source; the two
    # read passes are assumed to run unconditionally - confirm.
    in_handle1 = happyfile.hopen_or_else(fasta_file)
    try:
        while True:
            line = in_handle1.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                seq_id = line[1:]
                # set any IDs not returned by swarm, to their own cluster
                dict_id_swarm[seq_id] = seq_id
    finally:
        in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(swarm_file)
    if verbose:
        print("Reading swarm file: " + swarm_file, file=sys.stderr)
    try:
        while True:
            line = in_handle2.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # A blank line would otherwise register an empty id.
                continue
            id_list = re.split(r'\s', line)
            for seq_id in id_list:
                dict_id_swarm[seq_id] = id_list[0]
    finally:
        in_handle2.close()
def get_swarms(fasta_file, swarm_file, cpus):
    """Run swarm if needed and map every sequence id to its swarm seed id.

    When ``fasta_file`` is given and ``swarm_file`` does not exist, ``swarm``
    is run through the shell to create it.  Every FASTA header id is first
    mapped to itself in ``dict_id_swarm``; then each line of ``swarm_file``
    maps all of its whitespace-separated ids to the first id on that line.

    Args:
        fasta_file: FASTA path used as swarm input and read for its ids.
        swarm_file: Swarm output path (created when missing, then read).
        cpus: Thread count passed to ``-t``; values below 1 are raised to 1.

    Raises:
        SystemExit: With status 2 when the swarm command returns non-zero.
    """
    global dict_id_swarm

    if fasta_file and not os.path.exists(swarm_file):
        print("[swarm_map] running swarm", file=sys.stderr)
        if cpus < 1:
            cpus = 1
        cmd = " ".join(["swarm -f -t", str(cpus), "-o", swarm_file, fasta_file])
        if verbose:
            print(cmd, file=sys.stderr)
        else:
            # os.system() runs /bin/sh.  "&>" is a bash extension that a POSIX
            # shell parses as "run in background", which would make rc always 0.
            cmd += " >/dev/null 2>&1"
        rc = os.system(cmd)
        if rc != 0:
            print("[swarm_map] ERROR: swarm", file=sys.stderr)
            sys.exit(2)

    # NOTE(review): indentation was lost in this copy of the source; the two
    # read passes are assumed to run unconditionally - confirm.
    fasta_handle = happyfile.hopen_or_else(fasta_file)
    try:
        while True:
            line = fasta_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                seq_id = line[1:]
                # set any IDs not returned by swarm, to their own cluster
                dict_id_swarm[seq_id] = seq_id
    finally:
        fasta_handle.close()

    swarm_handle = happyfile.hopen_or_else(swarm_file)
    if verbose:
        print("Reading swarm file: " + swarm_file, file=sys.stderr)
    try:
        while True:
            line = swarm_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # A blank line would otherwise register an empty id.
                continue
            id_list = re.split(r'\s', line)
            for seq_id in id_list:
                dict_id_swarm[seq_id] = id_list[0]
    finally:
        swarm_handle.close()
def read_swarm_fasta(fasta_file):
    """Read a FASTA file into ``dict_swarm_seq`` (header text -> sequence).

    The key is the header line without its leading ``>``; sequence lines are
    concatenated with all whitespace removed.  Records whose sequence is
    empty are not stored.
    """
    handle = happyfile.hopen_or_else(fasta_file)
    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    header = ""
    residues = ""
    raw = handle.readline()
    while raw:
        text = raw.rstrip()
        if text.startswith(">"):
            # A new record starts: flush the previous one first.
            if residues:
                dict_swarm_seq[header] = residues
            header = text[1:]
            residues = ""
        else:
            residues += re.sub(r'\s', '', text)
        raw = handle.readline()

    # Flush the final record.
    if residues:
        dict_swarm_seq[header] = residues
    handle.close()
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len, max_seq_len):
    """Feed every 4-line FASTQ record of ``fastq_file`` to ``filter_line``.

    Line 1 of each record supplies the id (text after the first character, up
    to the first whitespace), line 2 the sequence and line 4 the quality
    string; line 3 is ignored.  Output goes to ``output_file`` when given,
    otherwise to standard output.

    Args:
        fastq_file: Input FASTQ path.
        output_file: Output path; falsy to write to ``sys.stdout``.
        min_quality, min_seq_len, max_seq_len: Passed through unchanged to
            ``filter_line``.
    """
    in_handle = happyfile.hopen_or_else(fastq_file)
    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    out_handle = sys.stdout
    if output_file:
        out_handle = happyfile.hopen_write_or_else(output_file)
        if verbose:
            print("Writing FASTA file: " + output_file, file=sys.stderr)

    try:
        rnum = 1
        seq_id = ""
        seq = ""
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                filter_line(out_handle, seq_id, seq, line,
                            min_quality, min_seq_len, max_seq_len)
            # Cycle 1 -> 2 -> 3 -> 4 -> 1 over the record's four lines.
            rnum = rnum + 1 if rnum < 4 else 1
    finally:
        # The original never closed either handle; make sure the output file
        # is flushed and released, but leave the shared stdout stream open.
        in_handle.close()
        if out_handle is not sys.stdout:
            out_handle.close()
def read_sample_names(sample_names_file):
    """Load a ``name<TAB>file`` table into ``dict_sample_name``.

    Each name is registered under the file as written and under its
    counterpart with the ``.filtered.fa`` suffix removed (if present) or
    appended (if absent).  Names are also recorded in
    ``dict_all_sample_names`` to detect duplicates.

    Args:
        sample_names_file: Path of the table; falsy to do nothing.

    Raises:
        SystemExit: With status 2 when a sample name occurs twice.
        ValueError: When a non-blank line does not have exactly two
            tab-separated fields.
    """
    global dict_sample_name

    if not sample_names_file:
        return

    in_handle = happyfile.hopen_or_else(sample_names_file)
    if verbose:
        xprint("Reading sample names file: " + sample_names_file)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank or trailing empty lines used to crash the unpack below.
                continue
            name, file_name = line.split("\t")
            if name in dict_all_sample_names:
                xprint("Duplicate sample name found: " + name)
                sys.exit(2)
            dict_sample_name[file_name] = name
            dict_all_sample_names[name] = 1
            m = re.search(r'^(.+)\.filtered\.fa$', file_name)
            if m:
                dict_sample_name[m.group(1)] = name
            else:
                dict_sample_name[file_name + ".filtered.fa"] = name
    finally:
        # Also runs on the sys.exit() path, which previously leaked the handle.
        in_handle.close()
def read_sample_names(sample_names_file):
    """Load a ``name<TAB>file`` table into ``dict_sample_name``.

    Each name is registered under the file as written and under its
    counterpart with the ``.filtered.fa`` suffix removed (if present) or
    appended (if absent).  Names are also recorded in
    ``dict_all_sample_names`` to detect duplicates.

    Args:
        sample_names_file: Path of the table; falsy to do nothing.

    Raises:
        SystemExit: With status 2 when a sample name occurs twice.
        ValueError: When a non-blank line does not have exactly two
            tab-separated fields.
    """
    global dict_sample_name

    if not sample_names_file:
        return

    in_handle = happyfile.hopen_or_else(sample_names_file)
    if verbose:
        # print() call instead of the "print >>stream" statement, which is a
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading sample names file: " + sample_names_file, file=sys.stderr)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank or trailing empty lines used to crash the unpack below.
                continue
            name, file_name = line.split("\t")
            if name in dict_all_sample_names:
                print("Duplicate sample name found: " + name, file=sys.stderr)
                sys.exit(2)
            dict_sample_name[file_name] = name
            dict_all_sample_names[name] = 1
            m = re.search(r'^(.+)\.filtered\.fa$', file_name)
            if m:
                dict_sample_name[m.group(1)] = name
            else:
                dict_sample_name[file_name + ".filtered.fa"] = name
    finally:
        in_handle.close()
def filter_fastq(fastq_file, output_file, min_quality, min_seq_len, max_seq_len):
    """Feed every 4-line FASTQ record of ``fastq_file`` to ``filter_line``.

    Line 1 of each record supplies the id (text after the first character, up
    to the first whitespace), line 2 the sequence and line 4 the quality
    string; line 3 is ignored.  Output goes to ``output_file`` when given,
    otherwise to standard output.

    Args:
        fastq_file: Input FASTQ path.
        output_file: Output path; falsy to write to ``sys.stdout``.
        min_quality, min_seq_len, max_seq_len: Passed through unchanged to
            ``filter_line``.
    """
    reader = happyfile.hopen_or_else(fastq_file)
    if verbose:
        print("Reading FASTQ file: " + fastq_file, file=sys.stderr)

    writer = sys.stdout
    if output_file:
        writer = happyfile.hopen_write_or_else(output_file)
        if verbose:
            print("Writing FASTA file: " + output_file, file=sys.stderr)

    try:
        rnum = 1
        seq_id = ""
        seq = ""
        while True:
            line = reader.readline()
            if not line:
                break
            line = line.rstrip()
            if rnum == 1:
                seq_id = re.split(r'\s', line[1:])[0]
            elif rnum == 2:
                seq = line
            elif rnum == 4:
                filter_line(writer, seq_id, seq, line,
                            min_quality, min_seq_len, max_seq_len)
            # Cycle 1 -> 2 -> 3 -> 4 -> 1 over the record's four lines.
            rnum = rnum + 1 if rnum < 4 else 1
    finally:
        # The original never closed either handle; make sure the output file
        # is flushed and released, but leave the shared stdout stream open.
        reader.close()
        if writer is not sys.stdout:
            writer.close()
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose id is in ``dict_derep_ids``.

    The id of a record is its header text after ``>`` up to the first
    whitespace.  Header and sequence lines of selected records are written
    (right-stripped) to ``swarm_content_fasta_file``.

    Returns:
        The number of records written.
    """
    kept = 0

    source = happyfile.hopen_or_else(fasta_file)
    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    sink = happyfile.hopen_write_or_else(swarm_content_fasta_file)
    if verbose:
        print("Writing swarm content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)

    copying = False
    raw = source.readline()
    while raw:
        text = raw.rstrip()
        if text.startswith(">"):
            # Decide once per record; sequence lines inherit the decision.
            record_id = re.split(r'\s', text[1:])[0]
            copying = record_id in dict_derep_ids
            if copying:
                kept += 1
        if copying:
            print(text, file=sink)
        raw = source.readline()

    source.close()
    sink.close()
    return kept
def read_sample_names(sample_names_file):
    """Load a ``name<TAB>file`` table into ``dict_sample_name``.

    Each name is registered under the file as written and under its
    counterpart with the ``.filtered.fa`` suffix removed (if present) or
    appended (if absent).

    Args:
        sample_names_file: Path of the table; falsy to do nothing.

    Raises:
        ValueError: When a non-blank line does not have exactly two
            tab-separated fields.
    """
    global dict_sample_name

    if not sample_names_file:
        return

    in_handle = happyfile.hopen_or_else(sample_names_file)
    if verbose:
        # print() call instead of the "print >>stream" statement, which is a
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading sample names file: " + sample_names_file, file=sys.stderr)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank or trailing empty lines used to crash the unpack below.
                continue
            name, file_name = line.split("\t")
            dict_sample_name[file_name] = name
            m = re.search(r'^(.+)\.filtered\.fa$', file_name)
            if m:
                dict_sample_name[m.group(1)] = name
            else:
                dict_sample_name[file_name + ".filtered.fa"] = name
    finally:
        in_handle.close()
def read_counts(counts_file):
    """Load a tab-separated counts table into the swarm count globals.

    The first line sets ``sample_list`` (all columns after the first).  For
    every later line, column 1 is the row id and each following column an
    integer count: ``dict_swarm_sample_counts[id, sample_index]`` receives the
    count and ``dict_swarm_counts[id]`` accumulates the row total.

    Args:
        counts_file: Path of the table; falsy to do nothing.

    Raises:
        ValueError: When a count column is not an integer.
    """
    global dict_swarm_counts
    global dict_swarm_sample_counts
    global sample_list

    if not counts_file:
        return

    in_handle = happyfile.hopen_or_else(counts_file)
    if verbose:
        # print() call instead of the "print >>stream" statement, which is a
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading counts file: " + counts_file, file=sys.stderr)
    try:
        is_header = True
        while True:
            line = in_handle.readline()
            if not line:
                break
            cols = line.rstrip().split("\t")
            if is_header:
                sample_list = cols[1:]
                is_header = False
                continue
            row_id = cols[0]
            for sample_index, value in enumerate(cols[1:]):
                count = int(value)
                dict_swarm_sample_counts[row_id, sample_index] = count
                dict_swarm_counts[row_id] = dict_swarm_counts.get(row_id, 0) + count
    finally:
        in_handle.close()
def write_swarm_content(fasta_file, swarm_content_fasta_file):
    """Copy the FASTA records whose id is in ``dict_derep_ids``.

    The id of a record is its header text after ``>`` up to the first
    whitespace.  Header and sequence lines of selected records are written
    (right-stripped) to ``swarm_content_fasta_file``.

    Args:
        fasta_file: Input FASTA path.
        swarm_content_fasta_file: Output FASTA path.

    Returns:
        The number of records written.
    """
    swarm_content_size = 0

    in_handle = happyfile.hopen_or_else(fasta_file)
    if verbose:
        # print() calls replace the "print >>stream" statements, which raise
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    out_handle = happyfile.hopen_write_or_else(swarm_content_fasta_file)
    if verbose:
        print("Writing swarm content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)

    try:
        write_out = False
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                # Decide once per record; sequence lines inherit the decision.
                seq_id = re.split(r'\s', line[1:])[0]
                write_out = seq_id in dict_derep_ids
                if write_out:
                    swarm_content_size += 1
            if write_out:
                print(line, file=out_handle)
    finally:
        in_handle.close()
        out_handle.close()

    return swarm_content_size
def read_swarm_fasta(fasta_file):
    """Populate ``dict_swarm_seq`` from a FASTA file.

    Keys are header lines without the leading ``>``; values are the record's
    sequence lines joined together with whitespace stripped out.  A record
    with no sequence text is skipped.
    """
    fasta = happyfile.hopen_or_else(fasta_file)
    if verbose:
        print("Reading FASTA file: " + fasta_file, file=sys.stderr)

    name = ""
    sequence = ""
    while True:
        raw = fasta.readline()
        if not raw:
            break
        text = raw.rstrip()
        if not text.startswith(">"):
            sequence += re.sub(r'\s', '', text)
            continue
        # Header line: store what was collected for the previous record.
        if sequence:
            dict_swarm_seq[name] = sequence
        name = text[1:]
        sequence = ""

    # The last record has no following header to trigger its storage.
    if sequence:
        dict_swarm_seq[name] = sequence
    fasta.close()
def read_sample_names(sample_names_file):
    """Load a ``name<TAB>file`` table into ``dict_sample_name``.

    Each name is registered under the file as written and under its
    counterpart with the ``.filtered.fa`` suffix removed (if present) or
    appended (if absent).

    Args:
        sample_names_file: Path of the table; falsy to do nothing.

    Raises:
        ValueError: When a non-blank line does not have exactly two
            tab-separated fields.
    """
    global dict_sample_name

    if not sample_names_file:
        return

    in_handle = happyfile.hopen_or_else(sample_names_file)
    if verbose:
        print("Reading sample names file: " + sample_names_file, file=sys.stderr)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank or trailing empty lines used to crash the unpack below.
                continue
            name, file_name = line.split("\t")
            dict_sample_name[file_name] = name
            m = re.search(r'^(.+)\.filtered\.fa$', file_name)
            if m:
                dict_sample_name[m.group(1)] = name
            else:
                dict_sample_name[file_name + ".filtered.fa"] = name
    finally:
        # Release the handle even when a malformed line raises.
        in_handle.close()
def read_counts(counts_file):
    """Load a tab-separated counts table, then call ``calc_swarm_counts``.

    The first line sets ``sample_list`` (all columns after the first).  For
    every later line, column 1 is the row id and each following column an
    integer count: ``dict_id_sample_counts[id, sample_index]`` receives the
    count and ``dict_id_counts[id]`` accumulates the row total.
    """
    global dict_id_counts
    global dict_id_sample_counts
    global sample_list

    if counts_file:
        table = happyfile.hopen_or_else(counts_file)
        if verbose:
            print("Reading counts file: " + counts_file, file=sys.stderr)

        header = table.readline()
        if header:
            sample_list = header.rstrip().split("\t")[1:]
            row = table.readline()
            while row:
                fields = row.rstrip().split("\t")
                row_id = fields[0]
                for sample_index, value in enumerate(fields[1:]):
                    dict_id_sample_counts[row_id, sample_index] = int(value)
                    dict_id_counts[row_id] = dict_id_counts.get(row_id, 0) + int(value)
                row = table.readline()
        table.close()

    # NOTE(review): indentation was lost in this copy of the source; this call
    # is assumed to run even when no counts file is given - confirm.
    calc_swarm_counts()
def read_taxa_counts(swarm_tax_file):
    """Sum per-taxon and per-group counts from a swarm taxonomy table.

    The first line sets ``sample_list`` (columns from the fourth onward).  On
    every later line, column 3 is the taxonomy string and the remaining
    columns are integer counts, accumulated into ``dict_taxa_sample_counts``
    and ``dict_taxa_counts``.  Each taxon is then assigned to the group whose
    key in ``dict_taxonomy_group`` is its longest matching prefix
    (``"Unclassified"`` when none matches) and the group totals are
    accumulated into ``dict_group_sample_counts`` / ``dict_group_counts``.

    Args:
        swarm_tax_file: Path of the tab-separated table.

    Raises:
        IndexError: When a data line has fewer than three columns.
        ValueError: When a count column is not an integer.
    """
    global dict_taxa_counts
    global dict_taxa_sample_counts
    global dict_group_counts
    global dict_group_sample_counts
    global sample_list

    in_handle = happyfile.hopen_or_else(swarm_tax_file)
    if verbose:
        # print() call instead of the "print >>stream" statement, which is a
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading taxa counts file: " + swarm_tax_file, file=sys.stderr)
    try:
        is_header = True
        while True:
            line = in_handle.readline()
            if not line:
                break
            cols = line.rstrip().split("\t")
            if is_header:
                sample_list = cols[3:]
                is_header = False
                continue
            taxstr = cols[2]
            for sample_index, value in enumerate(cols[3:]):
                count = int(value)
                key = (taxstr, sample_index)
                dict_taxa_sample_counts[key] = dict_taxa_sample_counts.get(key, 0) + count
                dict_taxa_counts[taxstr] = dict_taxa_counts.get(taxstr, 0) + count
    finally:
        in_handle.close()

    for id_tax in dict_taxa_counts:
        # The most specific (longest) group prefix wins.
        best_grp_tax = ""
        for grp_tax in dict_taxonomy_group:
            if id_tax[:len(grp_tax)] == grp_tax:
                if (not best_grp_tax) or len(grp_tax) > len(best_grp_tax):
                    best_grp_tax = grp_tax
        best_grp_name = dict_taxonomy_group.get(best_grp_tax, "Unclassified")

        for i in range(len(sample_list)):
            dict_group_sample_counts[best_grp_name, i] = (
                dict_group_sample_counts.get((best_grp_name, i), 0)
                + dict_taxa_sample_counts.get((id_tax, i), 0))
        # NOTE(review): assumed to run once per taxon (outside the sample
        # loop); indentation was lost in this copy of the source - confirm.
        dict_group_counts[best_grp_name] = (
            dict_group_counts.get(best_grp_name, 0) + dict_taxa_counts.get(id_tax, 0))
def read_index_file(index_file):
    """Mark the first tab-separated column of every line as a known index.

    Each value is stored as ``dict_is_index[value] = True`` and reported
    through ``xprint_err``.

    Args:
        index_file: Path of the index list.
    """
    global dict_is_index

    in_handle = happyfile.hopen_or_else(index_file)
    xprint_err("reading file: " + index_file)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            index_seq = line.rstrip().split('\t')[0]
            dict_is_index[index_seq] = True
            xprint_err("found: " + index_seq)
    finally:
        # The handle was previously left open.
        in_handle.close()
def read_swarms(swarm_file):
    """Map every id in a swarm file to the first id on its line.

    Lines are right-stripped and split on single whitespace characters; the
    result is stored in ``dict_id_swarm``.
    """
    swarms = happyfile.hopen_or_else(swarm_file)
    if verbose:
        print("Reading swarm file: " + swarm_file, file=sys.stderr)

    raw = swarms.readline()
    while raw:
        members = re.split(r'\s', raw.rstrip())
        seed = members[0]
        for member in members:
            dict_id_swarm[member] = seed
        raw = swarms.readline()

    swarms.close()
def read_swarms(swarm_file):
    """Map every id in a swarm file to the first id on its line.

    Non-blank lines are right-stripped and split on single whitespace
    characters; the result is stored in ``dict_id_swarm``.

    Args:
        swarm_file: Path of the swarm output file.
    """
    in_handle = happyfile.hopen_or_else(swarm_file)
    if verbose:
        # print() call instead of the "print >>stream" statement, which is a
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading swarm file: " + swarm_file, file=sys.stderr)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # A blank line would otherwise register an empty id.
                continue
            id_list = re.split(r'\s', line)
            for seq_id in id_list:
                dict_id_swarm[seq_id] = id_list[0]
    finally:
        in_handle.close()
def read_swarm_counts(swarm_counts_file, min_swarm_count, top_swarms):
    """Select the ids belonging to the largest swarms into ``dict_derep_ids``.

    Row totals are summed from the tab-separated counts table (the first line
    is skipped).  The ``top_swarms`` rows with the highest totals are kept,
    and every id in ``dict_id_swarm`` whose swarm is among them and has a
    total of at least ``min_swarm_count`` is flagged in ``dict_derep_ids``.
    """
    global dict_derep_ids
    totals = {}

    table = happyfile.hopen_or_else(swarm_counts_file)
    if verbose:
        print("Reading swarm counts file: " + swarm_counts_file, file=sys.stderr)

    table.readline()  # header line carries no counts
    row = table.readline()
    while row:
        fields = row.rstrip().split("\t")
        swarm = fields[0]
        for value in fields[1:]:
            totals[swarm] = totals.get(swarm, 0) + int(value)
        row = table.readline()
    table.close()

    # Keep the leading entries of the ranking, largest total first.
    dict_top_swarms = {}
    ranking = sorted(totals, key=totals.get, reverse=True)
    for rank, swarm in enumerate(ranking):
        if not rank < top_swarms:
            break
        dict_top_swarms[swarm] = 1

    for member in dict_id_swarm:
        swarm = dict_id_swarm[member]
        if swarm not in dict_top_swarms:
            continue
        if totals.get(swarm, 0) >= min_swarm_count:
            dict_derep_ids[member] = 1

    if verbose:
        print("Top purity content, swarms: " + str(len(dict_top_swarms)) + " derep ids: " + str(len(dict_derep_ids)), file=sys.stderr)
def read_chimeras(chimera_file):
    """Flag chimeric ids from a tab-separated chimera report.

    For every line whose 17th column equals ``'Y'``, the value of the 2nd
    column is stored as a key of ``dict_chimera_ids``.

    Args:
        chimera_file: Path of the report.

    Raises:
        IndexError: When a non-blank line has fewer than 17 columns.
    """
    global dict_chimera_ids

    in_handle = happyfile.hopen_or_else(chimera_file)
    if verbose:
        print("Reading chimera file: " + chimera_file, file=sys.stderr)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines have no columns to inspect.
                continue
            cols = line.split('\t')
            if cols[16] == 'Y':
                dict_chimera_ids[cols[1]] = 1
    finally:
        # The handle was previously left open.
        in_handle.close()
def read_chimeras(chimera_file):
    """Flag chimeric ids from a tab-separated chimera report.

    For every line whose 17th column equals ``'Y'``, the value of the 2nd
    column is stored as a key of ``dict_chimera_ids``.

    Args:
        chimera_file: Path of the report.

    Raises:
        IndexError: When a non-blank line has fewer than 17 columns.
    """
    global dict_chimera_ids

    report = happyfile.hopen_or_else(chimera_file)
    if verbose:
        print("Reading chimera file: " + chimera_file, file=sys.stderr)
    try:
        while True:
            line = report.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines have no columns to inspect.
                continue
            cols = line.split('\t')
            if cols[16] == 'Y':
                dict_chimera_ids[cols[1]] = 1
    finally:
        # The handle was previously left open.
        report.close()
def read_taxa_counts(swarm_tax_file):
    """Sum per-taxon and per-group counts from a swarm taxonomy table.

    The first line sets ``sample_list`` (columns from the fourth onward).  On
    every later line, column 3 is the taxonomy string and the remaining
    columns are integer counts, accumulated into ``dict_taxa_sample_counts``
    and ``dict_taxa_counts``.  Each taxon is then assigned to the group whose
    key in ``dict_taxonomy_group`` is its longest matching prefix
    (``"Unclassified"`` when none matches) and the group totals are
    accumulated into ``dict_group_sample_counts`` / ``dict_group_counts``.
    """
    global dict_taxa_counts
    global dict_taxa_sample_counts
    global dict_group_counts
    global dict_group_sample_counts
    global sample_list

    table = happyfile.hopen_or_else(swarm_tax_file)
    if verbose:
        xprint("Reading taxa counts file: " + swarm_tax_file)

    header = table.readline()
    if header:
        sample_list = header.rstrip().split("\t")[3:]
        row = table.readline()
        while row:
            fields = row.rstrip().split("\t")
            taxon = fields[2]
            for sample_index, value in enumerate(fields[3:]):
                cell = (taxon, sample_index)
                dict_taxa_sample_counts[cell] = dict_taxa_sample_counts.get(cell, 0) + int(value)
                dict_taxa_counts[taxon] = dict_taxa_counts.get(taxon, 0) + int(value)
            row = table.readline()
    table.close()

    for taxon in dict_taxa_counts:
        # The most specific (longest) group prefix wins.
        prefixes = [grp for grp in dict_taxonomy_group if taxon[:len(grp)] == grp]
        longest = max(prefixes, key=len) if prefixes else ""
        group = dict_taxonomy_group.get(longest, "Unclassified")

        for sample_index in range(len(sample_list)):
            cell = (group, sample_index)
            dict_group_sample_counts[cell] = (
                dict_group_sample_counts.get(cell, 0)
                + dict_taxa_sample_counts.get((taxon, sample_index), 0))
        # NOTE(review): assumed to run once per taxon (outside the sample
        # loop); indentation was lost in this copy of the source - confirm.
        dict_group_counts[group] = dict_group_counts.get(group, 0) + dict_taxa_counts.get(taxon, 0)
def read_groups(taxa_group_file):
    """Load ``group<TAB>taxonomy`` pairs into ``dict_taxonomy_group``.

    The mapping is keyed by the taxonomy string (second column) and holds the
    group name (first column).  Blank lines and pairs with an empty member
    are ignored; extra columns are dropped.
    """
    global dict_taxonomy_group

    groups = happyfile.hopen_or_else(taxa_group_file)
    if verbose:
        xprint("Reading taxa groups file: " + taxa_group_file)

    raw = groups.readline()
    while raw:
        text = raw.rstrip()
        if text:
            label, taxonomy = text.split("\t")[:2]
            if label and taxonomy:
                dict_taxonomy_group[taxonomy] = label
        raw = groups.readline()

    groups.close()
def read_groups(taxa_group_file):
    """Load ``group<TAB>taxonomy`` pairs into ``dict_taxonomy_group``.

    The mapping is keyed by the taxonomy string (second column) and holds the
    group name (first column).  Blank lines and pairs with an empty member
    are ignored; extra columns are dropped.

    Args:
        taxa_group_file: Path of the tab-separated groups table.

    Raises:
        ValueError: When a non-blank line has only one column.
    """
    global dict_taxonomy_group

    in_handle = happyfile.hopen_or_else(taxa_group_file)
    if verbose:
        # print() call instead of the "print >>stream" statement, which is a
        # TypeError wherever print is a function (as elsewhere in this file).
        print("Reading taxa groups file: " + taxa_group_file, file=sys.stderr)
    try:
        while True:
            line = in_handle.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                continue
            group_name, taxstr = line.split("\t")[:2]
            if taxstr and group_name:
                dict_taxonomy_group[taxstr] = group_name
    finally:
        in_handle.close()
def derep_fasta(fasta_files, min_fasta):
    """Pass every record of the given FASTA files to ``derep_line``.

    Files are numbered by their position in ``fasta_files``.  A file with
    fewer than ``min_fasta`` header lines is reported as excluded and its
    contribution is subtracted from ``dict_id_counts`` and zeroed in
    ``dict_id_file_counts``; all other files are appended to
    ``good_fasta_files``.
    """
    global good_fasta_files

    for file_index, path in enumerate(fasta_files):
        header_count = 0
        fasta = happyfile.hopen_or_else(path)
        if verbose:
            print("Reading FASTA file: " + path, file=sys.stderr)

        header = ""
        residues = ""
        raw = fasta.readline()
        while raw:
            text = raw.rstrip()
            if text.startswith(">"):
                header_count += 1
                # Hand over the record collected so far (empty on the first header).
                derep_line(header, residues, file_index)
                header = text[1:]
                residues = ""
            else:
                residues += re.sub(r'\s', '', text)
            raw = fasta.readline()
        derep_line(header, residues, file_index)
        fasta.close()

        # Remove counts for this file if below minimum
        if header_count < min_fasta:
            print("[fasta_dereplicate] Excluding: " + path, file=sys.stderr)
            for key in dict_id_counts:
                dict_id_counts[key] -= dict_id_file_counts.get((key, file_index), 0)
                dict_id_file_counts[key, file_index] = 0
        else:
            good_fasta_files.append(path)
def read_swarm_counts(swarm_counts_file, min_swarm_count, top_swarms):
    """Flag in ``dict_derep_ids`` the ids that belong to the top swarms.

    The counts table is tab-separated with one header line, which is skipped;
    each remaining row contributes the sum of its integer columns to its
    swarm's total.  Swarms are ranked by total, the first ``top_swarms`` are
    retained, and ids from ``dict_id_swarm`` are flagged when their swarm is
    retained and its total reaches ``min_swarm_count``.
    """
    global dict_derep_ids
    swarm_totals = {}

    counts = happyfile.hopen_or_else(swarm_counts_file)
    if verbose:
        print("Reading swarm counts file: " + swarm_counts_file, file=sys.stderr)

    past_header = False
    while True:
        raw = counts.readline()
        if not raw:
            break
        if past_header:
            fields = raw.rstrip().split("\t")
            for value in fields[1:]:
                swarm_totals[fields[0]] = swarm_totals.get(fields[0], 0) + int(value)
        past_header = True
    counts.close()

    # Walk the ranking (largest total first) until the quota is used up.
    dict_top_swarms = {}
    taken = 0
    for swarm_id in sorted(swarm_totals, key=swarm_totals.get, reverse=True):
        if not taken < top_swarms:
            break
        dict_top_swarms[swarm_id] = 1
        taken += 1

    for seq_id, swarm_id in dict_id_swarm.items():
        big_enough = swarm_totals.get(swarm_id, 0) >= min_swarm_count
        if swarm_id in dict_top_swarms and big_enough:
            dict_derep_ids[seq_id] = 1

    if verbose:
        print("Top purity content, swarms: " + str(len(dict_top_swarms)) + " derep ids: " + str(len(dict_derep_ids)), file=sys.stderr)
def derep_fasta(fasta_files, min_fasta):
    """Pass every record of the given FASTA files to ``derep_line``.

    Files are numbered by their position in ``fasta_files``.  A file with
    fewer than ``min_fasta`` header lines is reported as excluded and its
    contribution is subtracted from ``dict_id_counts`` and zeroed in
    ``dict_id_file_counts``; all other files are appended to
    ``good_fasta_files``.
    """
    global good_fasta_files

    for file_index, path in enumerate(fasta_files):
        header_count = 0
        fasta = happyfile.hopen_or_else(path)
        if verbose:
            xprint("Reading FASTA file: " + path)

        header = ""
        residues = ""
        raw = fasta.readline()
        while raw:
            text = raw.rstrip()
            if text.startswith(">"):
                header_count += 1
                # Hand over the record collected so far (empty on the first header).
                derep_line(header, residues, file_index)
                header = text[1:]
                residues = ""
            else:
                residues += re.sub(r'\s', '', text)
            raw = fasta.readline()
        derep_line(header, residues, file_index)
        fasta.close()

        # Remove counts for this file if below minimum
        if header_count < min_fasta:
            xprint("[fasta_dereplicate] Excluding: " + path)
            for key in dict_id_counts:
                dict_id_counts[key] -= dict_id_file_counts.get((key, file_index), 0)
                dict_id_file_counts[key, file_index] = 0
        else:
            good_fasta_files.append(path)
def read_fastq(fastq_file):
    """Tally the index substring found in every FASTQ sequence line.

    The file is read as 4-line records.  From the second line of each record
    the characters at positions 4-11 (``line[4:12]``) are taken as the index:
    its count in ``dict_index_count`` and ``count_total`` are incremented, and
    ``count_index`` is incremented as well when ``dict_is_index`` marks it as
    known.

    Args:
        fastq_file: Input FASTQ path.
    """
    global dict_index_count
    global count_total
    global count_index

    in_handle = happyfile.hopen_or_else(fastq_file)
    xprint_err("reading file: " + fastq_file)
    try:
        rnum = 1
        while True:
            line = in_handle.readline()
            if not line:
                break
            if rnum == 2:
                idx = line.rstrip()[4:12]
                dict_index_count[idx] = dict_index_count.get(idx, 0) + 1
                count_total += 1
                if dict_is_index.get(idx, False):
                    count_index += 1
            # Cycle 1 -> 2 -> 3 -> 4 -> 1 over the record's four lines.
            rnum = rnum + 1 if rnum < 4 else 1
    finally:
        # The handle was previously left open.
        in_handle.close()
def get_taxonomy(swarm_content_fasta_file, swarm_content_ggsearch_file, database_file, cpus):
    """Collect the best search hit per query and the database taxonomy strings.

    When ``swarm_content_fasta_file`` is given and
    ``swarm_content_ggsearch_file`` does not exist yet, ``glsearch36`` is run
    through the shell to create it.  The tab-separated result is then read
    into ``dict_id_best_hit`` / ``dict_id_best_bs`` (subject id and bit score
    of the highest-scoring row per query id), and every ``>id description``
    header of ``database_file`` is stored in ``dict_id_taxonomy`` with ``|``
    replaced by ``;`` and ``+`` by a space.

    Args:
        swarm_content_fasta_file: Query FASTA path; falsy to skip the search.
        swarm_content_ggsearch_file: Search output path (created when
            missing, then read).
        database_file: Reference FASTA given to the search and parsed for
            its header lines.
        cpus: Thread count passed to ``-T``; values below 1 are raised to 1.

    Raises:
        SystemExit: With status 2 when the search command returns non-zero.
    """
    global dict_id_best_hit
    global dict_id_best_bs
    global dict_id_taxonomy

    # print() calls replace the "print >>stream" statements, which raise
    # TypeError wherever print is a function (as elsewhere in this file).
    if swarm_content_fasta_file:
        if os.path.exists(swarm_content_ggsearch_file):
            if verbose:
                print("Ignoring content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)
        else:
            print("[purity_plot] running ggsearch", file=sys.stderr)
            if cpus < 1:
                cpus = 1
            cmd = " ".join(["glsearch36 -b 1 -m 8 -T", str(cpus), swarm_content_fasta_file,
                            database_file, ">", swarm_content_ggsearch_file])
            if verbose:
                print(cmd, file=sys.stderr)
            else:
                cmd += " 2>/dev/null"
            rc = os.system(cmd)
            if rc != 0:
                print("[purity_plot] ERROR: ggsearch", file=sys.stderr)
                sys.exit(2)

    in_handle1 = happyfile.hopen_or_else(swarm_content_ggsearch_file)
    if verbose:
        print("Reading ggsearch file: " + swarm_content_ggsearch_file, file=sys.stderr)
    try:
        while True:
            line = in_handle1.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines carry no hit; skip them instead of failing the unpack.
                continue
            qid, sid, pid, alen, mm, go, qs, qe, ss, se, e, bs = line.split("\t")[:12]
            # Scores arrive as text.  Compare them numerically so that, e.g.,
            # "99.5" does not outrank "120.3"; the stored value stays a string.
            if qid not in dict_id_best_bs or float(bs) > float(dict_id_best_bs[qid]):
                dict_id_best_hit[qid] = sid
                dict_id_best_bs[qid] = bs
    finally:
        in_handle1.close()

    in_handle2 = happyfile.hopen_or_else(database_file)
    if verbose:
        print("Reading database file: " + database_file, file=sys.stderr)
    try:
        while True:
            line = in_handle2.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    taxstr = m.group(2).replace('|', ';').replace('+', ' ')
                    dict_id_taxonomy[m.group(1)] = taxstr
    finally:
        in_handle2.close()
def remove_plastid_seqs(output_base_file):
    """Split chloroplast swarms out of the pipeline's result files.

    Swarms whose taxonomy (third column of ``<base>.swarm.tax``) starts with
    ``Bacteria;Cyanobacteria;Chloroplast`` are treated as plastid.  Their
    rows and records are written to ``<base>.plastid.*`` files, everything
    else to ``*.tmp`` files which finally replace the originals through
    ``replace_file``.  Nothing is done when ``<base>.plastid.swarm.fa``
    already exists and ``overwrite`` is not set.

    Args:
        output_base_file: Common path prefix of all input and output files.

    Raises:
        SystemExit: With status 2 when a temporary file is missing before the
            originals are replaced.
    """
    dict_plastid = {}

    swarm_tax = output_base_file + ".swarm.tax"
    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"

    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"

    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    def split_fasta(source, path_16s, path_plastid):
        """Route each FASTA record to the plastid or the 16S file by its id."""
        src = happyfile.hopen_or_else(source)
        out_16s = happyfile.hopen_write_or_else(path_16s)
        out_plastid = happyfile.hopen_write_or_else(path_plastid)
        seq_id = ""
        while True:
            line = src.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                seq_id = re.split(r'\s', line[1:])[0]
            # Lines before the first header have no id and are dropped.
            if seq_id:
                target = out_plastid if seq_id in dict_plastid else out_16s
                print(line, file=target)
        src.close()
        out_16s.close()
        out_plastid.close()

    def split_counts(source, path_16s, path_plastid):
        """Route each counts row by its first column; the header goes to both."""
        src = happyfile.hopen_or_else(source)
        out_16s = happyfile.hopen_write_or_else(path_16s)
        out_plastid = happyfile.hopen_write_or_else(path_plastid)
        is_header = True
        while True:
            line = src.readline()
            if not line:
                break
            line = line.rstrip()
            if is_header:
                print(line, file=out_16s)
                print(line, file=out_plastid)
                is_header = False
            elif line.split('\t')[0] in dict_plastid:
                print(line, file=out_plastid)
            else:
                print(line, file=out_16s)
        src.close()
        out_16s.close()
        out_plastid.close()

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    # print() calls replace the "print >>stream" statements, which raise
    # TypeError wherever print is a function (as elsewhere in this file).
    if verbose:
        print("Filtering chloroplast sequences", file=sys.stderr)

    # split 16S/Plastid swarm taxonomy table
    in_handle_tax = happyfile.hopen_or_else(swarm_tax)
    out_handle_16S_tax = happyfile.hopen_write_or_else(tmp_swarm_16S_tax)
    is_header = True
    while True:
        line = in_handle_tax.readline()
        if not line:
            break
        line = line.rstrip()
        if is_header:
            print(line, file=out_handle_16S_tax)
            is_header = False
            continue
        cols = line.split('\t')
        if cols[2].startswith('Bacteria;Cyanobacteria;Chloroplast'):
            # Plastid rows are only remembered, not copied anywhere.
            dict_plastid[cols[0]] = 1
        else:
            print(line, file=out_handle_16S_tax)
    in_handle_tax.close()
    out_handle_16S_tax.close()

    # split 16S/Plastid swarm file
    in_handle_table = happyfile.hopen_or_else(swarm_table)
    out_handle_16S_table = happyfile.hopen_write_or_else(tmp_swarm_16S_table)
    out_handle_plastid_table = happyfile.hopen_write_or_else(swarm_plastid_table)
    while True:
        line = in_handle_table.readline()
        if not line:
            break
        line = line.rstrip()
        id_list = re.split(r'\s', line)
        if id_list[0] in dict_plastid:
            print(line, file=out_handle_plastid_table)
            # Every member of a plastid swarm is plastid too.
            for member_id in id_list:
                dict_plastid[member_id] = 1
        else:
            print(line, file=out_handle_16S_table)
    in_handle_table.close()
    out_handle_16S_table.close()
    out_handle_plastid_table.close()

    # split 16S/Plastid derep FASTA and counts, then swarm FASTA and counts
    split_fasta(derep_fa, tmp_derep_16S_fa, derep_plastid_fa)
    split_counts(derep_counts, tmp_derep_16S_counts, derep_plastid_counts)
    split_fasta(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa)
    split_counts(swarm_counts, tmp_swarm_16S_counts, swarm_plastid_counts)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        for tmp_path, final_path in replacements:
            replace_file(tmp_path, final_path)
    else:
        print("Not all tmp_ files found", file=sys.stderr)
        sys.exit(2)
def remove_plastid_seqs(output_base_file):
    """Split chloroplast swarms out of the pipeline's result files.

    Swarms whose taxonomy (third column of ``<base>.swarm.tax``) starts with
    ``Bacteria;Cyanobacteria;Chloroplast`` are treated as plastid.  Their
    rows and records are written to ``<base>.plastid.*`` files, everything
    else to ``*.tmp`` files which finally replace the originals through
    ``replace_file``.  Nothing is done when ``<base>.plastid.swarm.fa``
    already exists and ``overwrite`` is not set.

    Args:
        output_base_file: Common path prefix of all input and output files.

    Raises:
        SystemExit: With status 2 when a temporary file is missing before the
            originals are replaced.
    """
    dict_plastid = {}

    swarm_tax = output_base_file + ".swarm.tax"
    derep_fa = output_base_file + ".derep.fa"
    derep_counts = output_base_file + ".derep.counts"
    swarm_table = output_base_file + ".swarm"
    swarm_fa = output_base_file + ".swarm.fa"
    swarm_counts = output_base_file + ".swarm.counts"

    derep_plastid_fa = output_base_file + ".plastid.derep.fa"
    derep_plastid_counts = output_base_file + ".plastid.derep.counts"
    swarm_plastid_table = output_base_file + ".plastid.swarm"
    swarm_plastid_fa = output_base_file + ".plastid.swarm.fa"
    swarm_plastid_counts = output_base_file + ".plastid.swarm.counts"

    tmp_derep_16S_fa = derep_fa + ".tmp"
    tmp_derep_16S_counts = derep_counts + ".tmp"
    tmp_swarm_16S_table = swarm_table + ".tmp"
    tmp_swarm_16S_fa = swarm_fa + ".tmp"
    tmp_swarm_16S_counts = swarm_counts + ".tmp"
    tmp_swarm_16S_tax = swarm_tax + ".tmp"

    def split_fasta(source, path_16s, path_plastid):
        """Route each FASTA record to the plastid or the 16S file by its id."""
        src = happyfile.hopen_or_else(source)
        out_16s = happyfile.hopen_write_or_else(path_16s)
        out_plastid = happyfile.hopen_write_or_else(path_plastid)
        seq_id = ""
        while True:
            line = src.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                seq_id = re.split(r'\s', line[1:])[0]
            # Lines before the first header have no id and are dropped.
            if seq_id:
                target = out_plastid if seq_id in dict_plastid else out_16s
                print(line, file=target)
        src.close()
        out_16s.close()
        out_plastid.close()

    def split_counts(source, path_16s, path_plastid):
        """Route each counts row by its first column; the header goes to both."""
        src = happyfile.hopen_or_else(source)
        out_16s = happyfile.hopen_write_or_else(path_16s)
        out_plastid = happyfile.hopen_write_or_else(path_plastid)
        is_header = True
        while True:
            line = src.readline()
            if not line:
                break
            line = line.rstrip()
            if is_header:
                print(line, file=out_16s)
                print(line, file=out_plastid)
                is_header = False
            elif line.split('\t')[0] in dict_plastid:
                print(line, file=out_plastid)
            else:
                print(line, file=out_16s)
        src.close()
        out_16s.close()
        out_plastid.close()

    if not overwrite and os.path.exists(swarm_plastid_fa):
        return

    # print() calls replace the "print >> stream" statements, which raise
    # TypeError wherever print is a function (as elsewhere in this file).
    if verbose:
        print("Filtering chloroplast sequences", file=sys.stderr)

    # split 16S/Plastid swarm taxonomy table
    in_handle_tax = happyfile.hopen_or_else(swarm_tax)
    out_handle_16S_tax = happyfile.hopen_write_or_else(tmp_swarm_16S_tax)
    is_header = True
    while True:
        line = in_handle_tax.readline()
        if not line:
            break
        line = line.rstrip()
        if is_header:
            print(line, file=out_handle_16S_tax)
            is_header = False
            continue
        cols = line.split('\t')
        if cols[2].startswith('Bacteria;Cyanobacteria;Chloroplast'):
            # Plastid rows are only remembered, not copied anywhere.
            dict_plastid[cols[0]] = 1
        else:
            print(line, file=out_handle_16S_tax)
    in_handle_tax.close()
    out_handle_16S_tax.close()

    # split 16S/Plastid swarm file
    in_handle_table = happyfile.hopen_or_else(swarm_table)
    out_handle_16S_table = happyfile.hopen_write_or_else(tmp_swarm_16S_table)
    out_handle_plastid_table = happyfile.hopen_write_or_else(swarm_plastid_table)
    while True:
        line = in_handle_table.readline()
        if not line:
            break
        line = line.rstrip()
        id_list = re.split(r'\s', line)
        if id_list[0] in dict_plastid:
            print(line, file=out_handle_plastid_table)
            # Every member of a plastid swarm is plastid too.
            for member_id in id_list:
                dict_plastid[member_id] = 1
        else:
            print(line, file=out_handle_16S_table)
    in_handle_table.close()
    out_handle_16S_table.close()
    out_handle_plastid_table.close()

    # split 16S/Plastid derep FASTA and counts, then swarm FASTA and counts
    split_fasta(derep_fa, tmp_derep_16S_fa, derep_plastid_fa)
    split_counts(derep_counts, tmp_derep_16S_counts, derep_plastid_counts)
    split_fasta(swarm_fa, tmp_swarm_16S_fa, swarm_plastid_fa)
    split_counts(swarm_counts, tmp_swarm_16S_counts, swarm_plastid_counts)

    # replace original swarm files with 16S only
    replacements = [
        (tmp_derep_16S_fa, derep_fa),
        (tmp_derep_16S_counts, derep_counts),
        (tmp_swarm_16S_table, swarm_table),
        (tmp_swarm_16S_tax, swarm_tax),
        (tmp_swarm_16S_fa, swarm_fa),
        (tmp_swarm_16S_counts, swarm_counts),
    ]
    if all(os.path.exists(tmp_path) for tmp_path, _ in replacements):
        for tmp_path, final_path in replacements:
            replace_file(tmp_path, final_path)
    else:
        print("Not all tmp_ files found", file=sys.stderr)
        sys.exit(2)
def get_taxonomy(swarm_content_fasta_file, swarm_content_ggsearch_file, database_file, cpus):
    """Collect the best search hit per query and the database taxonomy strings.

    When ``swarm_content_fasta_file`` is given and
    ``swarm_content_ggsearch_file`` does not exist yet, ``glsearch36`` is run
    through the shell to create it.  The tab-separated result is then read
    into ``dict_id_best_hit`` / ``dict_id_best_bs`` (subject id and bit score
    of the highest-scoring row per query id), and every ``>id description``
    header of ``database_file`` is stored in ``dict_id_taxonomy`` with ``|``
    replaced by ``;`` and ``+`` by a space.

    Args:
        swarm_content_fasta_file: Query FASTA path; falsy to skip the search.
        swarm_content_ggsearch_file: Search output path (created when
            missing, then read).
        database_file: Reference FASTA given to the search and parsed for
            its header lines.
        cpus: Thread count passed to ``-T``; values below 1 are raised to 1.

    Raises:
        SystemExit: With status 2 when the search command returns non-zero.
    """
    global dict_id_best_hit
    global dict_id_best_bs
    global dict_id_taxonomy

    if swarm_content_fasta_file:
        if os.path.exists(swarm_content_ggsearch_file):
            if verbose:
                print("Ignoring content FASTA file: " + swarm_content_fasta_file, file=sys.stderr)
        else:
            print("[purity_plot] running ggsearch", file=sys.stderr)
            if cpus < 1:
                cpus = 1
            cmd = " ".join(["glsearch36 -b 1 -m 8 -T", str(cpus), swarm_content_fasta_file,
                            database_file, ">", swarm_content_ggsearch_file])
            if verbose:
                print(cmd, file=sys.stderr)
            else:
                cmd += " 2>/dev/null"
            rc = os.system(cmd)
            if rc != 0:
                print("[purity_plot] ERROR: ggsearch", file=sys.stderr)
                sys.exit(2)

    hits = happyfile.hopen_or_else(swarm_content_ggsearch_file)
    if verbose:
        print("Reading ggsearch file: " + swarm_content_ggsearch_file, file=sys.stderr)
    try:
        while True:
            line = hits.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines carry no hit; skip them instead of failing the unpack.
                continue
            qid, sid, pid, alen, mm, go, qs, qe, ss, se, e, bs = line.split("\t")[:12]
            # Scores arrive as text.  Compare them numerically so that, e.g.,
            # "99.5" does not outrank "120.3"; the stored value stays a string.
            if qid not in dict_id_best_bs or float(bs) > float(dict_id_best_bs[qid]):
                dict_id_best_hit[qid] = sid
                dict_id_best_bs[qid] = bs
    finally:
        hits.close()

    database = happyfile.hopen_or_else(database_file)
    if verbose:
        print("Reading database file: " + database_file, file=sys.stderr)
    try:
        while True:
            line = database.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    taxstr = m.group(2).replace('|', ';').replace('+', ' ')
                    dict_id_taxonomy[m.group(1)] = taxstr
    finally:
        database.close()
def get_taxonomy(fasta_file, ggsearch_file, database_file, cpus):
    """Collect the best search hit per query and the database taxonomy strings.

    When ``fasta_file`` is given and ``ggsearch_file`` does not exist yet,
    ``glsearch36`` is run through the shell to create it.  The tab-separated
    result is then read into ``dict_swarm_best_hit`` / ``dict_swarm_best_bs``
    (subject id and bit score of the highest-scoring row per query id), and
    every ``>id description`` header of ``database_file`` is stored in
    ``dict_id_taxonomy`` with ``|`` replaced by ``;`` and ``+`` by a space.

    Args:
        fasta_file: Query FASTA path; a falsy value skips running the search.
        ggsearch_file: Search output path (created when missing, then read).
        database_file: Reference FASTA given to the search and parsed for
            its header lines.
        cpus: Thread count passed to ``-T``; values below 1 are raised to 1.

    Raises:
        SystemExit: With status 2 when the search command returns non-zero.
    """
    global dict_swarm_best_hit
    global dict_swarm_best_bs
    global dict_id_taxonomy

    if fasta_file:
        if os.path.exists(ggsearch_file):
            if verbose:
                print("Ignoring FASTA file: " + fasta_file, file=sys.stderr)
        else:
            print("[swarm_classify_taxonomy] running ggsearch", file=sys.stderr)
            if cpus < 1:
                cpus = 1
            cmd = " ".join([
                "glsearch36 -b 1 -m 8 -T",
                str(cpus), fasta_file, database_file, ">", ggsearch_file
            ])
            if verbose:
                print(cmd, file=sys.stderr)
            rc = os.system(cmd)
            if rc != 0:
                print("[swarm_classify_taxonomy] ERROR: ggsearch", file=sys.stderr)
                sys.exit(2)

    hits = happyfile.hopen_or_else(ggsearch_file)
    if verbose:
        print("Reading ggsearch file: " + ggsearch_file, file=sys.stderr)
    try:
        while True:
            line = hits.readline()
            if not line:
                break
            line = line.rstrip()
            if not line:
                # Blank lines carry no hit; skip them instead of failing the unpack.
                continue
            qid, sid, pid, alen, mm, go, qs, qe, ss, se, e, bs = line.split("\t")[:12]
            # Scores arrive as text.  Compare them numerically so that, e.g.,
            # "99.5" does not outrank "120.3"; the stored value stays a string.
            if qid not in dict_swarm_best_bs or float(bs) > float(dict_swarm_best_bs[qid]):
                dict_swarm_best_hit[qid] = sid
                dict_swarm_best_bs[qid] = bs
    finally:
        hits.close()

    database = happyfile.hopen_or_else(database_file)
    if verbose:
        print("Reading database file: " + database_file, file=sys.stderr)
    try:
        while True:
            line = database.readline()
            if not line:
                break
            line = line.rstrip()
            if line.startswith(">"):
                m = re.match(r'>(\S+)\s+(.+)$', line)
                if m:
                    taxstr = m.group(2).replace('|', ';').replace('+', ' ')
                    dict_id_taxonomy[m.group(1)] = taxstr
    finally:
        database.close()