def refseq_get_ftp_links_from_file(input, output):
    db = RefSeqDatabase()
    tree = NCBITree()

    # Collect NCBI taxon IDs for every organism name in the input file.
    ncbi_tid_set = set()
    for line in input:
        line = line.replace(' unclassified', '')
        line = line.replace('cf', '')
        for row in db.yield_ncbi_tid_row_from_name(line.strip()):
            ncbi_tid_set.add(row[0])

    # Add the immediate child taxa of each taxon ID.
    # How many total strains are there in HMP?
    # TODO Switch the tree around - predecessor and successors
    ncbi_tid_successors = set()
    for ncbi_tid in ncbi_tid_set:
        for successor in tree.tree.predecessors_iter(ncbi_tid):
            if successor not in ncbi_tid_set:
                ncbi_tid_successors.add(successor)
    ncbi_tid_set = set.union(ncbi_tid_set, ncbi_tid_successors)

    # Write one CSV row per (taxon ID, FTP link) pair.
    output.write('ncbi_tid,gg_lineage,ftp_link\n')
    for ncbi_tid in ncbi_tid_set:
        for ftp_link in db.yield_ftp_links(ncbi_tid):
            output.write('%s,%s,%s\n' % (ncbi_tid, tree.gg_lineage(ncbi_tid), ftp_link))
class TestGGLineage(unittest.TestCase):
    ncbi_tree = NCBITree()

    def test(self):
        # Known taxon IDs (11788, 391904) plus an invalid ID (-10).
        self.check_strains(11788)
        self.check_strains(391904)
        self.check_strains(-10)

    # Helper renamed so unittest does not try to collect it as a zero-argument test;
    # it exercises green_genes_lineage across depth/depth_force combinations.
    def check_strains(self, taxid):
        strain_name_1 = self.ncbi_tree.green_genes_lineage(taxid, depth=5, depth_force=True)
        strain_name_2 = self.ncbi_tree.green_genes_lineage(taxid, depth_force=True)
        strain_name_3 = self.ncbi_tree.green_genes_lineage(taxid, depth=5, depth_force=False)
        strain_name_4 = self.ncbi_tree.green_genes_lineage(taxid, depth_force=False)
        strain_name_5 = self.ncbi_tree.green_genes_lineage(taxid, depth=8, depth_force=True)
        strain_name_6 = self.ncbi_tree.green_genes_lineage(taxid, depth=8, depth_force=False)
        # Test the null pointer
        strain_name_7 = self.ncbi_tree.green_genes_lineage(taxid, depth=8)
def main():
    parser = make_arg_parser()
    args = parser.parse_args()

    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()
    ncbi_tree = NCBITree()

    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), ncbi_tree, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file)[:-4], key, ncbi_tid, ','.join(img_ids)])
def add_green_genes_tax_to_gb_accession(input, output):
    # Load the taxonomy
    nt = NCBITree()
    # Skip header
    next(input)
    output_csv = csv.writer(output, delimiter="\t")
    # Write header
    output_csv.writerow(["gb_accession", "taxid", "green_genes_taxonomy"])
    csv_input = csv.reader(input, delimiter="\t")
    for row in csv_input:
        out_row = row + [0]
        taxid = int(row[1])
        out_row[2] = nt.green_genes_lineage(taxid, depth=8, depth_force=True)
        output_csv.writerow(out_row)
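# Hedged usage sketch (not part of the original module): add_green_genes_tax_to_gb_accession
# above expects an already-open, tab-delimited input handle with a header row and an NCBI
# taxid in the second column, plus a writable output handle. The file names below are
# hypothetical placeholders.
with open('gb_accession2taxid.tsv') as inf, open('gb_accession2taxonomy.tsv', 'w', newline='') as outf:
    add_green_genes_tax_to_gb_accession(inf, outf)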
def get_tids(gbk, gbk_file, outf, nt=NCBITree()):
    dict_list = []
    with open(gbk_file, 'r') as inf:
        y = re.compile(r"^(DEFINITION)\s\s(.*)$")
        p = re.compile(r"^(\s+)\/db_xref=\"taxon:(\d+)\"")
        for line in inf:
            if line.startswith('DEFINITION'):
                y_m = y.search(line)
                prod = str(y_m.group(2))
                prod = '_'.join(prod.split(' '))
                continue
            # GenBank qualifier lines are indented; strip the leading whitespace before matching.
            if line.lstrip().startswith('/db_xref="taxon:'):
                m = p.search(line)
                ncbi_tid = int(m.group(2))
                organism = nt.green_genes_lineage(ncbi_tid, depth=8, depth_force=True)
                dict_list = [ncbi_tid, organism]
                outf.write('ncbi_tid|' + str(ncbi_tid) + '|mibig|' + gbk + '.1_cluster001' + '|organism|' + organism + '\t' + prod + '\n')
                break
    if not dict_list:
        print(gbk + ' failed to find tid')
        return ['None', 'None']
    else:
        return dict_list
def shogun_bt2_db(input, output, annotater, extract_id, prefixes, depth, depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (outf_fasta, outf_map))

    # Build the output BT2 database
    verify_make_dir(os.path.join(output, 'bt2'))
    print(bowtie2_build(outf_fasta, os.path.join(output, 'bt2', output_fn)))
def main():
    parser = make_arg_parser()
    args = parser.parse_args()

    nt_cat = os.path.join(args.nt_cat)
    gbkpath = os.path.join(args.input)
    outpath = os.path.join(args.output)

    if args.just_compile:
        compile_files(outpath)
        sys.exit()

    if not os.path.isdir(outpath):
        os.mkdir(os.path.join(outpath))
    if not os.path.isdir(outpath):
        print('\nError creating output directory; check given path and try again\n')
        sys.exit()

    logfile = os.path.join(outpath, 'scrapelog.log')
    logging.basicConfig(filename=logfile, level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')

    gbks = os.listdir(gbkpath)
    gbks = [f for f in gbks if f.endswith('gbk')]

    with open(nt_cat, 'r') as nt_catalog:
        gbk_dd = defaultdict(list)
        reader = csv.reader(nt_catalog, delimiter='\t')
        next(reader)
        nt = NCBITree()
        gbk_set = set()
        for gbk_file in gbks:
            gbk_id = gbk_file.split('.cluster')[0]
            gbk_set.add(gbk_id)
        for line in reader:
            if line[1] in gbk_set:
                tid = line[2]
                organism = tid_to_name(tid, nt=nt)
                # print(line[1] + tid + organism)
                gbk_dd[line[1]] = [tid, organism]

    i = 0
    for gbk_file in gbks:
        gbk_id = gbk_file.split('.cluster')[0]
        tid_org = gbk_dd[gbk_id]
        if not tid_org:
            print('Error getting taxonomy for %s for cluster file %s' % (gbk_id, gbk_file))
            logging.warning('Error getting taxonomy for %s for cluster file %s' % (gbk_id, gbk_file))
            tid_org = ['na', 'k__None;p__None;c__None;o__None;f__None;g__None;s__None;t__None']
            i += 1
        # print(tid_org)
        # ncbi_tid = str(tid_org[0])
        # organism = str(tid_org[1])
        gbk_filepath = os.path.join(gbkpath, gbk_file)
        parse_aa_seqs(gbk_file, tid_org, gbk_filepath, outpath)
        parse_dna_seqs(gbk_file, tid_org, gbk_filepath, outpath)

    parse_cluster_types(gbkpath, outpath, gbk_dd)

    if not args.no_compile:
        compile_files(outpath)

    logging.warning('DOJO could not acquire NCBI tid information for %s clusters' % i)
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)
            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
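# Illustrative sketch (an assumption, not the project's own helper): extract_ncbi_tid is a
# comma-separated delimiter pair such as 'ncbi_tid|,|', and find_between pulls the substring
# between those delimiters, so a reference name like 'ncbi_tid|562|ref|NC_000913.3|' would
# yield the taxid 562. A minimal version of that extraction could look like this:
def _find_between_example(s, begin, end):
    # Take the text after the first `begin` and before the next `end`.
    start = s.index(begin) + len(begin)
    return s[start:s.index(end, start)]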
def download_refseq_all(verbose):
    pool = multiprocessing.Pool(processes=4)
    rf = RefSeqDatabase()
    data = rf.get_blaze()
    tree = NCBITree()
    specified_kingdoms = {'k__Bacteria', 'k__Viruses', 'k__Archaea'}
    kingdoms = []
    # Use element-wise & (not the Python `and` keyword) to combine the two column filters.
    ftp_view = data.tree[(data.tree.ftp != '') & (data.tree.refseq_version != '')]
    ftp_links = yield_ftp_links(ftp_view, specified_kingdoms, tree)
    # ftp_test = [next(ftp_links) for _ in range(10)]
    pool.map(download_ftp_link, ftp_links)
    print('Done')
def shogun_utree_db(input, output, annotater, extract_id, threads, prefixes, depth, depth_force):
    verify_make_dir(output)
    # Verify the FASTA is annotated
    if input == '-':
        output_fn = 'stdin'
    else:
        output_fn = '.'.join(str(os.path.basename(input)).split('.')[:-1])

    outf_fasta = os.path.join(output, output_fn + '.annotated.fna')
    outf_map = os.path.join(output, output_fn + '.annotated.map')
    if not os.path.isfile(outf_fasta) or not os.path.isfile(outf_map):
        tree = NCBITree()
        db = RefSeqDatabase()

        if annotater == 'refseq':
            annotater_class = RefSeqAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'nt':
            annotater_class = NTAnnotater(extract_id, prefixes, db, tree, depth=depth, depth_force=depth_force)
        elif annotater == 'ncbi':
            annotater_class = NCBIAnnotater(extract_id, tree, depth=depth, depth_force=depth_force)
        else:
            annotater_class = GIAnnotater(extract_id, db, tree, depth=depth, depth_force=depth_force)

        with open(outf_fasta, 'w') as output_fna:
            with open(outf_map, 'w') as output_map:
                with open(input) as inf:
                    inf_fasta = FASTA(inf)
                    for lines_fna, lines_map in annotater_class.annotate(inf_fasta.read()):
                        output_fna.write(lines_fna)
                        output_map.write(lines_map)
    else:
        print("Found the output files \"%s\" and \"%s\". Skipping the annotation phase for this file." % (outf_fasta, outf_map))

    # Build the output CTR
    verify_make_dir(os.path.join(output, 'utree'))
    path_uncompressed_tree = os.path.join(output, 'utree', output_fn + '.utr')
    path_compressed_tree = os.path.join(output, 'utree', output_fn + '.ctr')

    if os.path.exists(path_compressed_tree):
        print('Compressed tree database file %s exists, skipping this step.' % path_compressed_tree)
    else:
        if not os.path.exists(path_uncompressed_tree):
            print(utree_build(outf_fasta, outf_map, path_uncompressed_tree, threads=threads))
        print(utree_compress(path_uncompressed_tree, path_compressed_tree))
        os.remove(path_uncompressed_tree)
def test(self):
    ncbi_tree = NCBITree()
    # Try LCA with a null pointer (an invalid taxon ID)
    lca = ncbi_tree.lowest_common_ancestor(391904, -10)
    print(lca)
def tid_to_name(tid, nt=NCBITree()):
    # Note: the default NCBITree() is built once, at function-definition time, and reused
    # across calls; pass nt explicitly to supply a different tree.
    tid = int(tid)
    organism = nt.green_genes_lineage(tid, depth=8, depth_force=True)
    return organism
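# Hedged usage sketch (not part of the original module): tid_to_name above maps an NCBI
# taxon ID to an eight-rank GreenGenes-style lineage string. Taxid 562 (Escherichia coli)
# is used here purely as an illustrative input, and the printed string is indicative only.
lineage = tid_to_name(562)
print(lineage)  # e.g. 'k__Bacteria;p__Proteobacteria;...;s__Escherichia_coli;t__'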
def shogun_functional(input, output, bt2_indx, extract_ncbi_tid, threads):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    # Create a SAM file for each input FASTA file
    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    img_map = IMGMap()

    for basename in basenames:
        sam_inf = os.path.join(output, basename + '.sam')
        step_outf = 'test'
        if os.path.isfile(step_outf):
            print("Found the \"%s.kegg.csv\". Skipping the LCA phase for this file." % step_outf)
        else:
            lca_map = build_img_ncbi_map(yield_alignments_from_sam_inf(sam_inf), )

    # NOTE: the remainder of this function references names (args, lca, run_lca,
    # annotate_lineage, depth) that are not defined in this scope; it appears to be
    # carried over from the standalone LCA scripts and will fail if reached.
    sam_files = [os.path.join(args.input, filename) for filename in os.listdir(args.input) if filename.endswith('.sam')]

    img_map = IMGMap()
    ncbi_tree = NCBITree()
    lca = LCA(ncbi_tree, args.depth)

    with open(args.output, 'w') if args.output else sys.stdout as outf:
        csv_outf = csv.writer(outf, quoting=csv.QUOTE_ALL, lineterminator='\n')
        csv_outf.writerow(['sample_id', 'sequence_id', 'ncbi_tid', 'img_id'])
        for file in sam_files:
            with open(file) as inf:
                lca_map = build_lca_map(yield_alignments_from_sam_inf(inf), lca, img_map)
                for key in lca_map:
                    img_ids, ncbi_tid = lca_map[key]
                    csv_outf.writerow([os.path.basename(file).split('.')[0], key, ncbi_tid, ','.join(img_ids)])

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth - 1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]

    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():
            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))