def download(args):
    """Fetch the database files selected by ``args.db``.

    Dispatches on the value of ``args.db``:

    * ``"taxonomy"``: download the NCBI taxonomy dump into ``args.taxdir``
      and initialize the ete3 sqlite database ``args.sqlitedb`` there.
    * ``"idmap"``: download the seqid->taxid mapfile from NCBI.
    * anything else: download the protein fasta file named by ``args.db``
      (uniref50, uniref90, uniref100 or nr).

    Parameters
    ----------
    args:
        Parsed command line namespace. Only the attributes needed by the
        selected branch are read (``db``, ``force`` and, depending on the
        branch, ``taxdir``, ``sqlitedb``, ``dldir``, ``tmpdir``,
        ``skip_check``, ``skip_idmap``).
    """
    target = args.db

    if target == "taxonomy":
        prepare.download_ncbi_taxonomy(args.taxdir, args.force)
        prepare.init_sqlite_taxdb(args.taxdir, args.sqlitedb, args.force)
        return

    if target == "idmap":
        prepare.download_nr_idmap(args.dldir, args.tmpdir, args.force)
        return

    # Every other value is treated as the name of a protein fasta database
    prepare.download_fasta(
        args.dldir,
        target,
        args.tmpdir,
        args.force,
        args.skip_check,
        args.skip_idmap,
    )
def make_lineage_df(taxids, taxdir, dbname, ranks, cpus=1):
    """
    Creates a lineage dataframe with full taxonomic information for a list of
    taxids.

    Example (frame as assembled before the final column selection):

    taxid   species  phylum  genus  genus.name     phylum.name     species.name
    859655  305      1224    48736  Ralstonia      Proteobacteria  Ralstonia solanacearum
    387344  1580     1239    1578   Lactobacillus  Firmicutes      Lactobacillus brevis
    358681  1393     1239    55080  Brevibacillus  Firmicutes      Brevibacillus brevis

    Taxids that are not found directly in the taxonomy database are looked up
    among merged (renumbered) taxids; if a replacement exists its lineage is
    used, and the row is indexed by the taxid that was originally requested.

    Parameters
    ----------
    taxids: list
        List of taxonomic ids to obtain information for
    taxdir: str
        Path to directory holding taxonomic info
    dbname: str
        Name of ete3 sqlite database within taxdir
    ranks: list
        Ranks to store information for
    cpus: int
        Number of worker processes used to build the per-taxid lineages

    Returns
    -------
    lineage_df: pandas.DataFrame
        Data Frame indexed by taxid, restricted to the integer-typed columns
    name_dict:
        The value returned by ``make_name_dict(lineage_df, ranks)``, built
        from the frame before the integer-column selection
    """
    # Read the taxonomy db
    ncbi_taxa = init_sqlite_taxdb(taxdir, dbname)
    lineages = ncbi_taxa.get_lineage_translator(taxids)

    # Requested taxids without a direct hit. Normalized to int once so the
    # merged lookup and the warning below work from the same set.
    unresolved = {int(x) for x in taxids}.difference(lineages)

    # Get possible translations for taxids that have been changed
    _, translate_dict = ncbi_taxa._translate_merged(list(unresolved))
    # Maps replacement taxid -> requested taxid, to restore the index later
    rename = {new: old for old, new in translate_dict.items()}
    # Update lineages with the replacement taxids
    lineages.update(ncbi_taxa.get_lineage_translator(translate_dict.values()))

    # Only taxids that still have no lineage after translation are missing;
    # successfully translated ones end up in the output and must not be
    # reported.
    missing_taxids = {
        taxid for taxid in unresolved
        if translate_dict.get(taxid) not in lineages
    }

    items = [
        [taxid, ranks, taxdir, dbname, lineage]
        for taxid, lineage in lineages.items()
    ]
    with Pool(processes=cpus) as pool:
        res = list(
            tqdm.tqdm(
                pool.imap(process_lineages, items),
                desc="Making lineages",
                total=len(items),
                unit=" taxids",
                ncols=100,
            )
        )
    lineage_df = pd.concat(res, sort=False)
    # Put translated rows back under the taxid that was asked for
    lineage_df.rename(index=rename, inplace=True)
    lineage_df.rename(index=lambda x: int(x), inplace=True)
    for rank in ranks:
        lineage_df[rank] = pd.to_numeric(lineage_df[rank])
    name_dict = make_name_dict(lineage_df, ranks)

    if missing_taxids:
        sys.stderr.write("#WARNING: Missing taxids found:\n")
        sys.stderr.write(
            "#{}\n".format(",".join(str(x) for x in sorted(missing_taxids)))
        )
        sys.stderr.write(
            "#To fix this, you can try to update the taxonomy database using\n"
        )
        sys.stderr.write("#tango download taxonomy --force\n")

    # Keep only the integer-typed columns
    # NOTE(review): presumably this drops the '<rank>.name' text columns,
    # which name_dict already captures -- confirm against make_name_dict.
    return lineage_df.loc[:, lineage_df.dtypes == int], name_dict
def process_lineages(items):
    """
    Builds the single-row lineage frame for one taxid.

    Parameters
    ----------
    items: list
        ``[taxid, ranks, taxdir, dbname, lineage]`` where ``lineage`` is the
        list of taxonomic ids making up the full lineage of ``taxid``. The
        arguments are packed into one object so the function can be mapped
        over a multiprocessing pool.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``taxid``, after ``propagate_lower`` and
        ``add_names`` have been applied to it.
    """
    taxid, ranks, taxdir, dbname, lineage = items

    # Each call opens the taxonomy db itself
    taxonomy = init_sqlite_taxdb(taxdir, dbname)

    # Rank of every taxid in the lineage, as a one-column ("rank") table
    rank_by_taxid = taxonomy.get_rank(lineage)
    rank_table = pd.DataFrame(rank_by_taxid, index=["rank"]).T

    # Keep the requested ranks only, then pivot so ranks become the columns
    is_wanted = rank_table["rank"].isin(ranks)
    row = rank_table.loc[is_wanted].reset_index().T
    row.columns = row.loc["rank"]
    row = row.drop("rank")
    row.index = [taxid]

    # Add taxids for lower ranks in the hierarchy
    row = propagate_lower(row, taxid, ranks)
    # Add names for taxids
    return add_names(row, taxid, taxonomy)