def shogun_utree_lca(input, output, utree_indx, threads):
    verify_make_dir(output)

    # Align each per-sample FASTA against the UTree index, unless a result already exists.
    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]
    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    counts = []
    utree_outf = os.path.join(output, 'taxon_counts.csv')

    # Tabulate per-sample taxon counts from the UTree assignments.
    if not os.path.isfile(utree_outf):
        for basename in basenames:
            lcas = []
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lcas.append(';'.join(line[1].split('; ')))
            counts.append(Counter(filter(None, lcas)))

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(utree_outf)
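# `verify_make_dir` is used throughout this module but is not defined in this
# section. A minimal sketch of what it presumably does (an assumption, not the
# project's actual implementation): create the output directory if it is missing.
import os


def verify_make_dir(path):
    # Create the directory (and any missing parents); do nothing if it already exists.
    os.makedirs(path, exist_ok=True)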
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    verify_make_dir(output)

    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
def shogun_embalmer_lca(input_dir, output_dir, embalmer_db, threads, pct_id, mincount, taxa_ncbi):
    if output_dir is None:
        output_dir = input_dir
    verify_make_dir(output_dir)

    inputfiles = [filename for filename in os.listdir(input_dir)
                  if filename.endswith('.fna') or filename.endswith('.fasta')]
    basenames = [os.path.splitext(filename)[0] for filename in inputfiles]
    outputfps = []

    # Align each per-sample FASTA against the embalmer database, unless a result already exists.
    for i, filename in enumerate(inputfiles):
        input_fp = os.path.join(input_dir, filename)
        tsv_outf = os.path.join(output_dir, basenames[i] + '.embalmer.tsv')
        outputfps.append(tsv_outf)
        if not os.path.isfile(tsv_outf):
            print("Did not find the output file \"%s\". Running the alignment phase for this file." % tsv_outf)
            print(embalmer_search(input_fp, tsv_outf, embalmer_db + ".edb", embalmer_db + ".tax",
                                  embalmer_db + ".acc", threads, pct_id, taxa_ncbi))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    counts = []

    # Tabulating
    print("Tabulating and filtering hits...")

    # print a row of "-" for every 10 samples
    if len(inputfiles) >= 100:
        for i in range(floor(len(basenames) / 10)):
            sys.stdout.write('-')
        sys.stdout.write('\n')
        sys.stdout.flush()

    taxon_outf = os.path.join(output_dir, 'taxon_counts.tsv')
    if os.path.isfile(taxon_outf):
        print("Skipping tabulation step, output file %s already exists." % taxon_outf)
    else:
        for outputfp in outputfps:
            with open(outputfp) as output_file:
                tsv_parser = csv.reader(output_file, delimiter='\t')
                taxon_counts = Counter()
                for line in tsv_parser:
                    taxon = line[12]
                    # drop trailing t__ in redistribute
                    taxon = re.sub('; t__$', '', taxon)
                    taxon = re.sub('; t__None$', '', taxon)
                    taxon_counts[taxon] += 1
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        # filter by mincount
        df[df < mincount] = 0
        # drop spaces in column names
        df.columns = [colname.replace('; ', ';') for colname in df.columns]
        # drop columns that sum to zero
        df = df.loc[:, (df.sum(axis=0) != 0)]
        df.T.to_csv(taxon_outf, index_label='Taxon', na_rep='0', sep='\t')
        get_rank_specific_taxonomy_tables(df, output_dir)
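# `get_rank_specific_taxonomy_tables` is called above but not defined in this
# section. A hedged sketch under the assumption that the column labels are
# Greengenes-style lineages ('k__...;p__...;...;s__...') and that one summed
# table per rank is wanted; the project's real implementation and file names
# may differ.
import os

RANK_NAMES = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']


def get_rank_specific_taxonomy_tables(df, output_dir):
    # df: samples x full-lineage counts, as built above before transposing.
    for level, rank in enumerate(RANK_NAMES, start=1):
        # Truncate every lineage to its first `level` semicolon-delimited fields.
        truncated = [';'.join(col.split(';')[:level]) for col in df.columns]
        # Sum counts that collapse onto the same rank-level taxon.
        rank_df = df.T.groupby(truncated).sum()  # taxa (at this rank) x samples
        rank_df.to_csv(os.path.join(output_dir, 'taxon_counts_%s.tsv' % rank),
                       index_label='Taxon', sep='\t')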
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    verify_make_dir(output)

    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')

    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
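# `yield_alignments_from_sam_inf` is imported from elsewhere in the project.
# A minimal sketch of the assumed behaviour, based on how it is used above:
# read a SAM file and yield (query name, reference name) pairs for aligned reads.
def yield_alignments_from_sam_inf(sam_path):
    with open(sam_path) as sam:
        for line in sam:
            if line.startswith('@'):
                continue  # skip SAM header lines
            fields = line.rstrip('\n').split('\t')
            qname, rname = fields[0], fields[2]
            if rname != '*':  # '*' marks an unaligned read in SAM
                yield qname, rname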
def shogun_bugbase(input, output, img_database_folder):
    verify_make_dir(output)
    utree_outf = os.path.join(output, 'taxa_counts.txt')

    # Align each sample against the IMG UTree index and map the assignments to IMG OIDs.
    if not os.path.isfile(utree_outf):
        utree_indx = os.path.join(img_database_folder, 'img.genes.ctr')
        with open(os.path.join(img_database_folder, 'img_map.pkl'), 'rb') as inf:
            gg2img_oid = pickle.load(inf)

        basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]
        for basename in basenames:
            fna_file = os.path.join(input, basename + '.fna')
            tsv_outf = os.path.join(output, basename + '.utree.tsv')
            if not os.path.isfile(tsv_outf):
                print(utree_search(utree_indx, fna_file, tsv_outf))
            else:
                print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

        counts = []
        for basename in basenames:
            lcas = []
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        taxon = line[1].replace('; ', ';')
                        if taxon in gg2img_oid:
                            lcas.append(gg2img_oid[taxon])
            counts.append(Counter(filter(None, lcas)))

        df = pd.DataFrame(counts, index=basenames).fillna(0).astype(int).T
        df.to_csv(utree_outf, sep='\t', index_label='#OTU ID')
    else:
        print("Found the output file \"%s\". Skipping all steps." % utree_outf)
def annotate_fasta(input, output, extract_refseq_id, prefixes, depth, depth_force):
    verify_make_dir(output)

    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    with (open(input, 'r') if input != '-' else sys.stdin) as inf:
        with open(os.path.join(output, output_fn + '.annotated.fna'), 'w') as output_fna:
            with open(os.path.join(output, output_fn + '.annotated.map'), 'w') as output_map:
                inf_fasta = FASTA(inf)
                annotater = refseq_annotater(inf_fasta.read(), prefixes, extract_refseq_id, depth=depth, depth_force=depth_force)
                for lines_fna, lines_map in annotater:
                    output_fna.write(lines_fna)
                    output_map.write(lines_map)
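# `FASTA` comes from elsewhere in the project. A hedged sketch of the assumed
# interface, based on how it is used above: wrap an open text handle and have
# `.read()` yield (header, sequence) tuples, one per record.
class FASTA:
    def __init__(self, handle):
        self.handle = handle

    def read(self):
        header, seq_chunks = None, []
        for line in self.handle:
            line = line.strip()
            if line.startswith('>'):
                # Emit the previous record before starting a new one.
                if header is not None:
                    yield header, ''.join(seq_chunks)
                header, seq_chunks = line[1:], []
            elif line:
                seq_chunks.append(line)
        if header is not None:
            yield header, ''.join(seq_chunks)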
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)
            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
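# `build_lca_map` is imported from elsewhere. A sketch of the assumed behaviour,
# mirroring the inline loop in the first shogun_bt2_lca variant above: collapse
# all alignments of each query read down to the lowest common ancestor of their
# taxa. `extract_ncbi_tid` is the callable passed in above (rname -> taxid).
def build_lca_map(sam_path, extract_ncbi_tid, tree):
    lca_map = {}
    for qname, rname in yield_alignments_from_sam_inf(sam_path):
        ncbi_tid = extract_ncbi_tid(rname)
        if qname in lca_map:
            current_ncbi_tid = lca_map[qname]
            # Only update when the new hit disagrees with the running assignment.
            if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
        else:
            lca_map[qname] = ncbi_tid
    return lca_map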
def shogun_utree_lca(input, output, utree_indx, threads, confidence, support, mincount):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    counts = []
    utree_outf = os.path.join(output, 'taxon_counts.csv')

    # Tabulating
    print("Tabulating and filtering hits...")

    # print a row of "-" for every 10 samples
    if len(basenames) >= 100:
        for i in range(floor(len(basenames) / 10)):
            sys.stdout.write('-')
        sys.stdout.write('\n')
        sys.stdout.flush()

    if not os.path.isfile(utree_outf):
        n_fail_confidence_only = 0
        n_fail_support_only = 0
        n_fail_both = 0
        n = 0
        n_pass = 0
        for i, basename in enumerate(basenames):
            if len(basenames) >= 100:
                if (i + 1) % 10 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            lcas = []
            # Each UTree row carries the assigned taxonomy, its confidence, and its support.
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        taxonomy = line[1]
                        is_confident = float(line[2]) >= confidence
                        is_supported = int(line[3]) >= support
                        n += 1
                        if not is_confident and not is_supported:
                            n_fail_both += 1
                        elif not is_confident:
                            n_fail_confidence_only += 1
                        elif not is_supported:
                            n_fail_support_only += 1
                        else:
                            n_pass += 1
                            lcas.append(taxonomy)
            counts.append(Counter(lcas))

        print('%d total assignments\n%d failed confidence only\n%d failed support only\n%d failed both\n%d remaining'
              % (n, n_fail_confidence_only, n_fail_support_only, n_fail_both, n_pass))
        sys.stdout.write('\n')
        sys.stdout.flush()

        df = pd.DataFrame(counts, index=basenames)
        # filter by mincount
        df[df < mincount] = 0
        # drop spaces in column names
        df.columns = [colname.replace('; ', ';') for colname in df.columns]
        # drop trailing t__ in redistribute
        df.columns = [re.sub(';t__$', '', colname) for colname in df.columns]
        df.T.to_csv(utree_outf, index_label='Taxon', na_rep='0', sep='\t')
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]

    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():
            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
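# `reverse_collision_dict` is imported from elsewhere. A sketch of the assumed
# behaviour, based on how its result is consumed above: invert a
# {read_name: lineage} mapping into {lineage: [read_names]} so that reads
# colliding on the same lineage are grouped together.
from collections import defaultdict


def reverse_collision_dict(mapping):
    inverted = defaultdict(list)
    for key, value in mapping.items():
        inverted[value].append(key)
    return dict(inverted)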
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        step_outf = 'test'  # FIXME: placeholder path; the per-sample KEGG output is never actually written
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            # FIXME: this call appears incomplete (a trailing argument is missing)
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf), )

    # FIXME: the remainder appears to have been pasted from a standalone script and
    # still references an argparse-style `args` namespace that is not defined here.
    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()
    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)

    with (open(args.output, 'w') if args.output else sys.stdout) as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])
def shogun_utree_capitalist(input, output, utree_indx, reference_fasta, reference_map, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')

    # Indexing for embalmer
    if not os.path.isfile(embalmer_outf):
        lca_maps = defaultdict(lambda: defaultdict(list))
        for basename in basenames:
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lca_maps[';'.join(line[1].split('; '))][basename].append(line[0])

        fna_faidx = {}
        for basename in basenames:
            fna_faidx[basename] = pyfaidx.Fasta(os.path.join(input, basename + '.fna'))

        dict_reference_map = defaultdict(list)
        with open(reference_map) as inf:
            tsv_in = csv.reader(inf, delimiter='\t')
            for line in tsv_in:
                dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

        # reverse the dict to feed into embalmer
        references_faidx = pyfaidx.Fasta(reference_fasta)

        tmpdir = tempfile.mkdtemp()
        print(tmpdir)
        with open(embalmer_outf, 'w') as embalmer_cat:
            for species in lca_maps.keys():
                queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
                references_fna_filename = os.path.join(tmpdir, 'reference.fna')
                output_filename = os.path.join(tmpdir, 'output.txt')

                with open(queries_fna_filename, 'w') as queries_fna:
                    for basename in lca_maps[species].keys():
                        for header in lca_maps[species][basename]:
                            record = fna_faidx[basename][header][:]
                            queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

                with open(references_fna_filename, 'w') as references_fna:
                    for i in dict_reference_map[species]:
                        record = references_faidx[i][:]
                        references_fna.write('>%s\n%s\n' % (record.name, record.seq))

                print(embalmer_align(queries_fna_filename, references_fna_filename, output_filename))

                with open(output_filename) as embalmer_out:
                    for line in embalmer_out:
                        embalmer_cat.write(line)

                os.remove(queries_fna_filename)
                os.remove(references_fna_filename)
                os.remove(output_filename)

        os.rmdir(tmpdir)
    else:
        print("Found the output file \"%s\". Skipping the strain alignment phase for this file." % embalmer_outf)

    # Convert the results from embalmer into CSV
    sparse_ncbi_dict = defaultdict(dict)
    begin, end = extract_ncbi_tid.split(',')

    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
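# `find_between` is imported from elsewhere. A sketch of the assumed behaviour,
# based on how it is used above: `extract_ncbi_tid` is split on ',' into a begin
# and an end marker, and find_between returns the substring of `s` between the
# first occurrence of `begin` and the next occurrence of `end` (empty string if
# either marker is missing).
def find_between(s, begin, end):
    start = s.find(begin)
    if start == -1:
        return ''
    start += len(begin)
    stop = s.find(end, start)
    if stop == -1:
        return ''
    return s[start:stop]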