def annotate(self, gen_fasta):
    """
    Annotate FASTA records with an NCBI taxonomy id and a lineage string.

    For every record the accession is cut out of the title with
    ``find_between(title, self.begin, self.end)`` and resolved through
    ``self.db``:

    * accessions containing ``_`` go through
      ``get_ncbi_tid_from_refseq_accession_version``, but only when their
      first two characters are in ``self.set_prefix``;
    * other accessions containing ``.`` go through
      ``get_ncbi_tid_from_genbank_accession_version``;
    * everything else goes through ``get_ncbi_tid_from_genbank_accession``.

    Accessions that cannot be resolved are printed and skipped. Records
    whose lineage lookup returns a falsy value are skipped silently.

    :param gen_fasta: iterable of ``(title, seq)`` pairs; titles come
        without the leading ``>``
    :return: generator of ``(fasta_record, mapping_line)`` string tuples
    """
    for title, seq in gen_fasta:
        title = '>' + title
        accession_version = find_between(title, self.begin, self.end)

        # Reset for every record. Without this, an accession that contains
        # "_" but has an unknown prefix would either hit an unbound local
        # (first record) or silently reuse the previous record's taxonomy id.
        ncbi_tid = None
        if "_" in accession_version:
            if accession_version[:2] in self.set_prefix:
                ncbi_tid = self.db.get_ncbi_tid_from_refseq_accession_version(
                    accession_version)
        elif '.' in accession_version:
            ncbi_tid = self.db.get_ncbi_tid_from_genbank_accession_version(
                accession_version)
        else:
            ncbi_tid = self.db.get_ncbi_tid_from_genbank_accession(
                accession_version)

        if not ncbi_tid:
            # Report the accession that could not be mapped to a taxonomy id.
            print(accession_version)
            continue

        gg = self.tree.green_genes_lineage(
            ncbi_tid[0], depth=self.depth, depth_force=self.depth_force)
        if gg:
            # Re-join the lineage with '; ' instead of ';'.
            gg = '; '.join(gg.split(';'))
            header = 'ncbi_tid|%d|%s' % (ncbi_tid[0], title[1:])
            yield '>%s\n%s\n' % (header, seq), '%s\t%s\n' % (
                header.split()[0], gg)
def annotate(self, gen_fasta):
    """
    Annotate FASTA records whose title carries a GI number.

    The GI is cut out of the title with ``find_between`` using
    ``self.begin``/``self.end`` and resolved with
    ``self.db.get_ncbi_tid_from_gi``. Records without a taxonomy id or
    without a lineage are skipped.

    :param gen_fasta: iterable of ``(title, seq)`` pairs; titles come
        without the leading ``>``
    :return: generator of ``(fasta_record, mapping_line)`` string tuples
    """
    for raw_title, seq in gen_fasta:
        title = '>' + raw_title
        gi = find_between(title, self.begin, self.end)
        tids = self.db.get_ncbi_tid_from_gi(gi)
        if not tids:
            continue

        lineage = self.tree.green_genes_lineage(
            tids[0], depth=self.depth, depth_force=self.depth_force)
        if not lineage:
            continue

        # Re-join the lineage with '; ' instead of ';'.
        lineage = '; '.join(lineage.split(';'))
        header = 'ncbi_tid|%d|%s' % (tids[0], title[1:])
        fasta_record = '>%s\n%s\n' % (header, seq)
        mapping_line = '%s\t%s\n' % (header.split()[0], lineage)
        yield fasta_record, mapping_line
def annotate(self, gen_fasta):
    """
    Annotate FASTA records whose title carries a RefSeq accession.version.

    Only accessions whose first two characters are in ``self.set_prefix``
    are looked up (``self.db.get_ncbi_tid_from_refseq_accession_version``).
    Records without a taxonomy id or without a lineage are skipped.

    :param gen_fasta: iterable of ``(title, seq)`` pairs; titles come
        without the leading ``>``
    :return: generator of ``(fasta_record, mapping_line)`` string tuples
    """
    for raw_title, seq in gen_fasta:
        title = '>' + raw_title
        accession = find_between(title, self.begin, self.end)
        if accession[:2] not in self.set_prefix:
            continue

        tids = self.db.get_ncbi_tid_from_refseq_accession_version(accession)
        if not tids:
            continue

        lineage = self.tree.green_genes_lineage(
            tids[0], depth=self.depth, depth_force=self.depth_force)
        if not lineage:
            continue

        # Re-join the lineage with '; ' instead of ';'.
        lineage = '; '.join(lineage.split(';'))
        header = 'ncbi_tid|%d|%s' % (tids[0], title[1:])
        fasta_record = '>%s\n%s\n' % (header, seq)
        mapping_line = '%s\t%s\n' % (header.split()[0], lineage)
        yield fasta_record, mapping_line
def build_cluster_map(inf, bread='ref|,|'):
    """
    Group cluster members by cluster key for every ``.cluster`` header.

    Headers that do not contain ``.cluster`` are ignored. For the others,
    ``.cluster`` is rewritten to ``_cluster``, the reference id is cut out
    with ``find_between`` and split on ``_``: the first three fields form
    the key, the last two fields form the value.

    :param inf: input handed to ``FASTA``
    :param bread: ``'begin,end'`` delimiters passed to ``find_between``
    :return: ``defaultdict(set)`` mapping key -> set of values
    """
    left, right = bread.split(',')
    clusters = defaultdict(set)
    reader = FASTA(inf)
    for header, _ in reader.read():
        if '.cluster' not in header:
            continue
        renamed = header.replace('.cluster', '_cluster')
        fields = find_between(renamed, left, right).split('_')
        cluster_key = '_'.join(fields[:3])
        member = '_'.join(fields[-2:])
        clusters[cluster_key].add(member)
    return clusters
def build_cluster_map(inf, bread='ref|,|'):
    """
    Group cluster members by cluster key for every ``.cluster`` header.

    Headers that do not contain ``.cluster`` are ignored. For the others,
    ``.cluster`` is rewritten to ``_cluster``, the reference id is cut out
    with ``find_between`` and split on ``_``: the first three fields form
    the key, the last field is the value.

    :param inf: input handed to ``FASTA``
    :param bread: ``'begin,end'`` delimiters passed to ``find_between``
    :return: ``defaultdict(set)`` mapping key -> set of values
    """
    left, right = bread.split(',')
    clusters = defaultdict(set)
    reader = FASTA(inf)
    for header, _ in reader.read():
        if '.cluster' not in header:
            continue
        renamed = header.replace('.cluster', '_cluster')
        fields = find_between(renamed, left, right).split('_')
        cluster_key = '_'.join(fields[:3])
        clusters[cluster_key].add(fields[-1])
    return clusters
def annotate(self, gen_fasta):
    """
    Annotate FASTA records whose title already carries an NCBI taxonomy id.

    The id is cut out of the title with ``find_between`` using
    ``self.begin``/``self.end``. Empty ids and the literal ``'NA'`` are
    skipped, as are records for which no lineage is returned.

    :param gen_fasta: iterable of ``(title, seq)`` pairs; titles come
        without the leading ``>``
    :return: generator of ``(fasta_record, mapping_line)`` string tuples
    """
    for raw_title, seq in gen_fasta:
        title = '>' + raw_title
        raw_tid = find_between(title, self.begin, self.end)
        if not raw_tid or raw_tid == 'NA':
            continue

        # The extracted id is text; the tree lookup and '%d' need an int.
        tid = int(raw_tid)
        lineage = self.tree.green_genes_lineage(
            tid, depth=self.depth, depth_force=self.depth_force)
        if not lineage:
            continue

        # Re-join the lineage with '; ' instead of ';'.
        lineage = '; '.join(lineage.split(';'))
        header = 'ncbi_tid|%d|%s' % (tid, title[1:])
        fasta_record = '>%s\n%s\n' % (header, seq)
        mapping_line = '%s\t%s\n' % (header.split()[0], lineage)
        yield fasta_record, mapping_line
def annotate(self, gen_fasta):
    """
    Yield annotated FASTA text and mapping lines for GI-tagged records.

    Each title is prefixed with ``>``, the GI is extracted between
    ``self.begin`` and ``self.end`` and translated to a taxonomy id via
    ``self.db.get_ncbi_tid_from_gi``. Nothing is yielded for a record when
    either the taxonomy id or the lineage is missing.

    :param gen_fasta: iterable of ``(title, seq)`` pairs; titles come
        without the leading ``>``
    :return: generator of ``(fasta_record, mapping_line)`` string tuples
    """
    lookup_tid = self.db.get_ncbi_tid_from_gi
    for bare_title, sequence in gen_fasta:
        title = '>' + bare_title
        taxon_ids = lookup_tid(find_between(title, self.begin, self.end))
        if taxon_ids:
            lineage = self.tree.green_genes_lineage(
                taxon_ids[0], depth=self.depth, depth_force=self.depth_force)
            if lineage:
                # Re-join the lineage with '; ' instead of ';'.
                spaced = '; '.join(lineage.split(';'))
                header = 'ncbi_tid|%d|%s' % (taxon_ids[0], title[1:])
                record_text = '>%s\n%s\n' % (header, sequence)
                map_text = '%s\t%s\n' % (header.split()[0], spaced)
                yield record_text, map_text
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    """
    Align every ``.fna`` file in ``input`` with bowtie2 and, when requested,
    write per-sample taxon counts based on lowest common ancestors.

    SAM files already present in ``output`` are reused. When ``run_lca`` is
    truthy, the counts are written to ``<output>/taxon_counts.csv``.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for the SAM files and the count table
    :param bt2_indx: index handed to ``bowtie2_align``
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of a reference name
    :param depth: lineage depth; also selects the rank name (``depth - 1``)
    :param threads: thread count handed to ``bowtie2_align``
    :param annotate_lineage: count lineage strings when truthy, otherwise
        count taxon ids whose rank equals the selected rank
    :param run_lca: skip the LCA/count phase entirely when falsy
    """
    verify_make_dir(output)

    basenames = []
    for filename in os.listdir(input):
        if filename.endswith('.fna'):
            basenames.append(os.path.basename(filename)[:-4])

    # Alignment phase: one SAM file per input FASTA file.
    for name in basenames:
        fna_path = os.path.join(input, name + '.fna')
        sam_path = os.path.join(output, name + '.sam')
        if not os.path.isfile(sam_path):
            print(bowtie2_align(fna_path, sam_path, bt2_indx, num_threads=threads))
        else:
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_path)

    if not run_lca:
        return

    tree = NCBITree()
    rank_names = list(tree.lineage_ranks.keys())
    rank_name = rank_names[depth-1]
    if not rank_name:
        raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

    begin, end = extract_ncbi_tid.split(',')

    counts = []
    for name in basenames:
        sam_path = os.path.join(output, name + '.sam')

        # query name -> taxonomy id, collapsed to the LCA on conflicting hits
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_path):
            hit_tid = int(find_between(rname, begin, end))
            if qname not in lca_map:
                lca_map[qname] = hit_tid
                continue
            known_tid = lca_map[qname]
            if known_tid and known_tid != hit_tid:
                lca_map[qname] = tree.lowest_common_ancestor(hit_tid, known_tid)

        if annotate_lineage:
            lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        else:
            lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
        # Falsy values (no taxon / no lineage) are not counted.
        counts.append(Counter(filter(None, lca_map.values())))

    table = pd.DataFrame(counts, index=basenames)
    table.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """
    Write the records of a FASTA file whose RefSeq prefix is selected,
    prepending ``ncbi_tid|<id>|`` to the title when a taxonomy id is found.

    :param input: input handed to ``FASTA``
    :param output: writable text handle receiving the FASTA records
    :param extract_refseq_id: ``'begin,end'`` delimiters used to cut the
        accession.version out of a title
    :param prefixes: comma-separated two-character prefixes; ``*`` selects
        every key of ``db.refseq_prefix_mapper``
    """
    db = RefSeqDatabase()

    requested = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # '*' acts as a glob over all known prefixes.
    if '*' in requested:
        prefix_set = set(db.refseq_prefix_mapper.keys())
    else:
        prefix_set = set(requested)

    reader = FASTA(input)
    for title, seq in reader.read():
        title = '>' + title
        accession = find_between(title, begin, end)
        if accession[:2] not in prefix_set:
            continue
        tids = db.get_ncbi_tid_from_refseq_accession_version(accession)
        if tids:
            title = '>ncbi_tid|%d|%s' % (tids[0], title[1:])
        output.write('%s\n%s\n' % (title, seq))
def binary_fasta(fh, db, prefix_set):
    """
    Stream taxonomy-annotated FASTA records from a binary file handle.

    Header lines (starting with ``>``) are kept only when the accession
    between ``ref|`` and ``|`` has its first two bytes in ``prefix_set``
    and ``db.get_ncbi_tid_from_refseq_accession_version`` returns a
    taxonomy id. Kept titles are rewritten to ``ncbi_tid|<id>|<header>``.
    Sequence lines that follow a rejected header are dropped.

    :param fh: iterable of ``bytes`` lines
    :param db: object providing ``get_ncbi_tid_from_refseq_accession_version``
    :param prefix_set: container of two-byte prefixes; presumably ``bytes``
        members, since they are compared against a ``bytes`` slice - confirm
        against the caller
    :return: tuples of (title, seq)
    """
    title = b''
    chunks = []
    for line in fh:
        if line[:1] == b'>':
            if title:
                yield title.strip(), b''.join(chunks)
            # Start every record from a clean slate. Otherwise a header that
            # is rejected below would leave the previous title/sequence in
            # place, so the previous record would be emitted a second time
            # with the rejected record's sequence appended to it.
            title = b''
            chunks = []
            refseq_accession_version = find_between(line, b'ref|', b'|')
            if refseq_accession_version[:2] in prefix_set:
                ncbi_tid = db.get_ncbi_tid_from_refseq_accession_version(
                    refseq_accession_version.decode())
                if ncbi_tid:
                    title = b'ncbi_tid|%d|%s' % (ncbi_tid[0], line[1:])
        elif title:
            # Collect pieces and join once instead of repeated bytes '+='.
            chunks.append(line.strip())
    if title:
        yield title.strip(), b''.join(chunks)
def refseq_annotate(input, output, extract_refseq_id, prefixes):
    """
    Copy prefix-selected FASTA records to ``output``, tagging the title
    with ``ncbi_tid|<id>|`` whenever the RefSeq accession resolves to a
    taxonomy id.

    :param input: input handed to ``FASTA``
    :param output: writable text handle receiving the FASTA records
    :param extract_refseq_id: ``'begin,end'`` delimiters used to cut the
        accession.version out of a title
    :param prefixes: comma-separated two-character prefixes; ``*`` selects
        every key of ``db.refseq_prefix_mapper``
    """
    db = RefSeqDatabase()

    wanted = prefixes.split(',')
    begin, end = extract_refseq_id.split(',')

    # '*' acts as a glob over all known prefixes.
    use_all = '*' in wanted
    prefix_set = set(db.refseq_prefix_mapper.keys()) if use_all else set(wanted)

    for bare_title, seq in FASTA(input).read():
        title = '>' + bare_title
        accession_version = find_between(title, begin, end)
        if accession_version[:2] in prefix_set:
            taxon_ids = db.get_ncbi_tid_from_refseq_accession_version(
                accession_version)
            if taxon_ids:
                title = '>ncbi_tid|%d|%s' % (taxon_ids[0], title[1:])
            output.write('%s\n%s\n' % (title, seq))
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    """
    Align every ``.fna`` file in ``input`` with bowtie2 and, when requested,
    write per-sample taxon counts derived from ``build_lca_map``.

    SAM files already present in ``output`` are reused. When ``run_lca`` is
    truthy, the counts are written to ``<output>/taxon_counts.csv``.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for the SAM files and the count table
    :param bt2_indx: index handed to ``bowtie2_align``
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of a reference name
    :param depth: lineage depth; also selects the rank name (``depth - 1``)
    :param threads: thread count handed to ``bowtie2_align``
    :param annotate_lineage: count lineage strings when truthy, otherwise
        count taxon ids whose rank equals the selected rank
    :param run_lca: skip the LCA/count phase entirely when falsy
    """
    verify_make_dir(output)

    basenames = []
    for filename in os.listdir(input):
        if filename.endswith('.fna'):
            basenames.append(os.path.basename(filename)[:-4])

    # Alignment phase: one SAM file per input FASTA file.
    for name in basenames:
        fna_path = os.path.join(input, name + '.fna')
        sam_path = os.path.join(output, name + '.sam')
        if not os.path.isfile(sam_path):
            print(bowtie2_align(fna_path, sam_path, bt2_indx, num_threads=threads))
        else:
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_path)

    if not run_lca:
        return

    tree = NCBITree()
    rank_names = list(tree.lineage_ranks.keys())
    rank_name = rank_names[depth-1]
    if not rank_name:
        raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

    begin, end = extract_ncbi_tid.split(',')

    def extract_tid(rname):
        # Reference names carry the taxonomy id between the two delimiters.
        return int(find_between(rname, begin, end))

    counts = []
    for name in basenames:
        sam_path = os.path.join(output, name + '.sam')
        lca_map = build_lca_map(sam_path, extract_tid, tree)
        if annotate_lineage:
            lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        else:
            lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
        # Falsy values (no taxon / no lineage) are not counted.
        counts.append(Counter(filter(None, lca_map.values())))

    table = pd.DataFrame(counts, index=basenames)
    table.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def shogun_utree_capitalist(input, output, utree_indx, reference_fasta, reference_map, extract_ncbi_tid, threads):
    """
    Run ``utree_search`` on every ``.fna`` file in ``input``, align the reads
    of each assigned taxonomy against that taxonomy's references with
    ``embalmer_align`` and tabulate the result.

    Existing ``<basename>.utree.tsv`` files and an existing
    ``embalmer_out.txt`` in ``output`` are reused instead of recomputed.
    The final table is written to ``<output>/strain_alignments.csv``.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for all result files
    :param utree_indx: index handed to ``utree_search``
    :param reference_fasta: FASTA file opened with ``pyfaidx.Fasta``
    :param reference_map: tab-separated file; column 0 is a name looked up in
        ``reference_fasta``, column 1 a ``'; '``-separated grouping key
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of an embalmer reference name
    :param threads: accepted but not used by this function
    """
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')

    # Indexing for embalmer
    if not os.path.isfile(embalmer_outf):
        # taxonomy key -> sample basename -> list of query names
        lca_maps = defaultdict(lambda: defaultdict(list))
        for basename in basenames:
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lca_maps[';'.join(line[1].split('; '))][basename].append(line[0])

        fna_faidx = {}
        for basename in basenames:
            fna_faidx[basename] = pyfaidx.Fasta(os.path.join(input, basename + '.fna'))

        # taxonomy key -> list of reference sequence names
        dict_reference_map = defaultdict(list)
        with open(reference_map) as inf:
            tsv_in = csv.reader(inf, delimiter='\t')
            for line in tsv_in:
                dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

        references_faidx = pyfaidx.Fasta(reference_fasta)

        # TemporaryDirectory removes the scratch directory even when a step
        # below raises; mkdtemp + os.rmdir leaked it on any failure.
        with tempfile.TemporaryDirectory() as tmpdir:
            print(tmpdir)
            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(embalmer_outf, 'w') as embalmer_cat:
                for species in lca_maps:
                    with open(queries_fna_filename, 'w') as queries_fna:
                        for basename in lca_maps[species]:
                            for header in lca_maps[species][basename]:
                                record = fna_faidx[basename][header][:]
                                queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

                    with open(references_fna_filename, 'w') as references_fna:
                        for i in dict_reference_map[species]:
                            record = references_faidx[i][:]
                            references_fna.write('>%s\n%s\n' % (record.name, record.seq))

                    print(embalmer_align(queries_fna_filename, references_fna_filename, output_filename))

                    # Concatenate this taxonomy's alignments into the combined file.
                    with open(output_filename) as embalmer_out:
                        for line in embalmer_out:
                            embalmer_cat.write(line)

                    # Start the next taxonomy with fresh scratch files.
                    os.remove(queries_fna_filename)
                    os.remove(references_fna_filename)
                    os.remove(output_filename)
    else:
        print("Found the output file \"%s\". Skipping the strain alignment phase for this file." % embalmer_outf)

    # Convert the results from embalmer into CSV
    sparse_ncbi_dict = defaultdict(dict)
    begin, end = extract_ncbi_tid.split(',')

    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # Builtin int/float: the np.int/np.float aliases were removed
            # from NumPy and raise AttributeError on current releases.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    """
    Align every ``.fna`` file in ``input`` with bowtie2, then run IMG/NCBI
    and LCA post-processing on the resulting SAM files.

    NOTE(review): this function looks unfinished. It reads ``args``,
    ``run_lca``, ``depth`` and ``annotate_lineage``, none of which are
    parameters; unless they exist at module level the corresponding
    statements raise ``NameError``. The nesting below was reconstructed from
    a whitespace-collapsed source - confirm it against the original file.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for the SAM files and ``taxon_counts.csv``
    :param bt2_indx: index handed to ``bowtie2_align``
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of a reference name
    :param threads: thread count handed to ``bowtie2_align``
    """
    verify_make_dir(output)
    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]
    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))
    img_map = IMGMap()
    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        # NOTE(review): 'test' looks like a placeholder; the message below
        # mentions "<step_outf>.kegg.csv" but the check is on 'test' itself.
        step_outf = 'test'
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            # NOTE(review): this result is overwritten further down without
            # being read in between.
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf), )
    # NOTE(review): `args` is not defined in this function.
    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]
    img_map = IMGMap()
    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)
    # Write one CSV row per query sequence, to args.output or stdout.
    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                # NOTE(review): here yield_alignments_from_sam_inf receives an
                # open handle, elsewhere in this function a path - confirm
                # which one it expects.
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    # sample id = file name up to its first '.'
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])
    # NOTE(review): `run_lca`, `depth` and `annotate_lineage` are not
    # defined in this function.
    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)
        begin, end = extract_ncbi_tid.split(',')
        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            # query name -> taxonomy id, collapsed to the LCA on conflicting hits
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid
            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)
        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    """
    Align every ``.fna`` file in ``input`` with bowtie2, assign each read a
    lineage via lowest common ancestor, align the reads of each lineage
    against that lineage's references with ``embalmer_align`` and tabulate
    the result.

    Writes ``<output>/embalmer_out.txt`` and
    ``<output>/strain_alignments.csv``.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for all result files
    :param bt2_indx: index handed to ``bowtie2_align``
    :param reference_fasta: FASTA file opened with ``pyfaidx.Fasta``
    :param reference_map: tab-separated file; column 0 is a name looked up in
        ``reference_fasta``, column 1 a ``'; '``-separated grouping key
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of a reference name
    :param depth: depth handed to ``tree.green_genes_lineage``
    :param threads: thread count handed to ``bowtie2_align``
    """
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]

    lca_maps = {}
    for sam_file in sam_files:
        # query name -> taxonomy id, collapsed to the LCA on conflicting hits
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                    lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    # lineage -> list of (sample basename, reversed-map value); entries with
    # a falsy lineage key are filtered out.
    lca_map_2 = defaultdict(list)
    for basename, by_lineage in lca_maps.items():
        for key, val in by_lineage.items():
            if key:
                lca_map_2[key].append((basename, val))

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    # lineage key -> list of reference sequence names
    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    references_faidx = pyfaidx.Fasta(reference_fasta)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')

    # TemporaryDirectory removes the scratch directory even when a step below
    # raises; mkdtemp + os.rmdir leaked it on any failure.
    with tempfile.TemporaryDirectory() as tmpdir:
        queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
        references_fna_filename = os.path.join(tmpdir, 'reference.fna')
        output_filename = os.path.join(tmpdir, 'output.txt')

        with open(embalmer_outf, 'w') as embalmer_cat:
            for key in lca_map_2:
                with open(queries_fna_filename, 'w') as queries_fna:
                    for basename, headers in lca_map_2[key]:
                        for header in headers:
                            record = fna_faidx[basename][header][:]
                            queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

                with open(references_fna_filename, 'w') as references_fna:
                    for i in dict_reference_map[key]:
                        record = references_faidx[i][:]
                        references_fna.write('>%s\n%s\n' % (record.name, record.seq))

                embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

                # Concatenate this lineage's alignments into the combined file.
                with open(output_filename) as embalmer_out:
                    for line in embalmer_out:
                        embalmer_cat.write(line)

                # Start the next lineage with fresh scratch files.
                os.remove(queries_fna_filename)
                os.remove(references_fna_filename)
                os.remove(output_filename)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # Builtin int/float: the np.int/np.float aliases were removed
            # from NumPy and raise AttributeError on current releases.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    """
    Align every ``.fna`` file in ``input`` with bowtie2, assign each read a
    lineage via lowest common ancestor, align the reads of each lineage
    against that lineage's references with ``embalmer_align`` and tabulate
    the result.

    Writes ``<output>/embalmer_out.txt`` and
    ``<output>/strain_alignments.csv``.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for all result files
    :param bt2_indx: index handed to ``bowtie2_align``
    :param reference_fasta: FASTA file opened with ``pyfaidx.Fasta``
    :param reference_map: tab-separated file; column 0 is a name looked up in
        ``reference_fasta``, column 1 a ``'; '``-separated grouping key
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of a reference name
    :param depth: depth handed to ``tree.green_genes_lineage``
    :param threads: thread count handed to ``bowtie2_align``
    """
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]

    lca_maps = {}
    for sam_file in sam_files:
        # query name -> taxonomy id, collapsed to the LCA on conflicting hits
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid and current_ncbi_tid != ncbi_tid:
                    lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    # lineage -> list of (sample basename, reversed-map value); entries with
    # a falsy lineage key are filtered out.
    lca_map_2 = defaultdict(list)
    for basename, by_lineage in lca_maps.items():
        for key, val in by_lineage.items():
            if key:
                lca_map_2[key].append((basename, val))

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    # lineage key -> list of reference sequence names
    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    references_faidx = pyfaidx.Fasta(reference_fasta)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')

    # TemporaryDirectory removes the scratch directory even when a step below
    # raises; mkdtemp + os.rmdir leaked it on any failure.
    with tempfile.TemporaryDirectory() as tmpdir:
        queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
        references_fna_filename = os.path.join(tmpdir, 'reference.fna')
        output_filename = os.path.join(tmpdir, 'output.txt')

        with open(embalmer_outf, 'w') as embalmer_cat:
            for key in lca_map_2:
                with open(queries_fna_filename, 'w') as queries_fna:
                    for basename, headers in lca_map_2[key]:
                        for header in headers:
                            record = fna_faidx[basename][header][:]
                            queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

                with open(references_fna_filename, 'w') as references_fna:
                    for i in dict_reference_map[key]:
                        record = references_faidx[i][:]
                        references_fna.write('>%s\n%s\n' % (record.name, record.seq))

                embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

                # Concatenate this lineage's alignments into the combined file.
                with open(output_filename) as embalmer_out:
                    for line in embalmer_out:
                        embalmer_cat.write(line)

                # Start the next lineage with fresh scratch files.
                os.remove(queries_fna_filename)
                os.remove(references_fna_filename)
                os.remove(output_filename)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # Builtin int/float: the np.int/np.float aliases were removed
            # from NumPy and raise AttributeError on current releases.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
def shogun_utree_capitalist(input, output, utree_indx, reference_fasta, reference_map, extract_ncbi_tid, threads):
    """
    Run ``utree_search`` on every ``.fna`` file in ``input``, align the reads
    of each assigned taxonomy against that taxonomy's references with
    ``embalmer_align`` and tabulate the result.

    Existing ``<basename>.utree.tsv`` files and an existing
    ``embalmer_out.txt`` in ``output`` are reused instead of recomputed.
    The final table is written to ``<output>/strain_alignments.csv``.

    :param input: directory containing the query ``.fna`` files
    :param output: directory for all result files
    :param utree_indx: index handed to ``utree_search``
    :param reference_fasta: FASTA file opened with ``pyfaidx.Fasta``
    :param reference_map: tab-separated file; column 0 is a name looked up in
        ``reference_fasta``, column 1 a ``'; '``-separated grouping key
    :param extract_ncbi_tid: ``'begin,end'`` delimiters used to cut the
        taxonomy id out of an embalmer reference name
    :param threads: accepted but not used by this function
    """
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_file = os.path.join(input, basename + '.fna')
        tsv_outf = os.path.join(output, basename + '.utree.tsv')
        if not os.path.isfile(tsv_outf):
            print(utree_search(utree_indx, fna_file, tsv_outf))
        else:
            print("Found the output file \"%s\". Skipping the alignment phase for this file." % tsv_outf)

    embalmer_outf = os.path.join(output, 'embalmer_out.txt')

    # Indexing for embalmer
    if not os.path.isfile(embalmer_outf):
        # taxonomy key -> sample basename -> list of query names
        lca_maps = defaultdict(lambda: defaultdict(list))
        for basename in basenames:
            utree_tsv = os.path.join(output, basename + '.utree.tsv')
            with open(utree_tsv) as inf:
                tsv_parser = csv.reader(inf, delimiter='\t')
                for line in tsv_parser:
                    if line[1]:
                        lca_maps[';'.join(line[1].split('; '))][basename].append(line[0])

        fna_faidx = {}
        for basename in basenames:
            fna_faidx[basename] = pyfaidx.Fasta(os.path.join(input, basename + '.fna'))

        # taxonomy key -> list of reference sequence names
        dict_reference_map = defaultdict(list)
        with open(reference_map) as inf:
            tsv_in = csv.reader(inf, delimiter='\t')
            for line in tsv_in:
                dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

        references_faidx = pyfaidx.Fasta(reference_fasta)

        # TemporaryDirectory removes the scratch directory even when a step
        # below raises; mkdtemp + os.rmdir leaked it on any failure.
        with tempfile.TemporaryDirectory() as tmpdir:
            print(tmpdir)
            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(embalmer_outf, 'w') as embalmer_cat:
                for species in lca_maps:
                    with open(queries_fna_filename, 'w') as queries_fna:
                        for basename in lca_maps[species]:
                            for header in lca_maps[species][basename]:
                                record = fna_faidx[basename][header][:]
                                queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

                    with open(references_fna_filename, 'w') as references_fna:
                        for i in dict_reference_map[species]:
                            record = references_faidx[i][:]
                            references_fna.write('>%s\n%s\n' % (record.name, record.seq))

                    print(embalmer_align(queries_fna_filename, references_fna_filename, output_filename))

                    # Concatenate this taxonomy's alignments into the combined file.
                    with open(output_filename) as embalmer_out:
                        for line in embalmer_out:
                            embalmer_cat.write(line)

                    # Start the next taxonomy with fresh scratch files.
                    os.remove(queries_fna_filename)
                    os.remove(references_fna_filename)
                    os.remove(output_filename)
    else:
        print("Found the output file \"%s\". Skipping the strain alignment phase for this file." % embalmer_outf)

    # Convert the results from embalmer into CSV
    sparse_ncbi_dict = defaultdict(dict)
    begin, end = extract_ncbi_tid.split(',')

    # build query by NCBI_TID DataFrame
    with open(embalmer_outf) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            # Builtin int/float: the np.int/np.float aliases were removed
            # from NumPy and raise AttributeError on current releases.
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))