def main(infile, ofile, force=False):
    """Write a tab-separated protein -> assembly/nuccore table for the IDs in `infile`.

    Each output row holds the original protein ID, its GI, the assembly ID and
    the items of the `nuc_info` entry produced by
    `NCBI_convertor.pid2assembly()` (written under the header columns
    nuccore ID / start / end / strand).

    Args:
        infile: Path handed to `parse_id` to obtain the ordered ID list.
        ofile: Output path; normalised through `process_path` before use.
        force: When False and `ofile` already exists, nothing is converted or
            written and the function returns after reporting it.

    Returns:
        None.
    """
    ofile = process_path(ofile)

    # Check the overwrite guard *before* the conversion: the outcome (quit
    # without writing) is the same, but the conversion work is no longer
    # performed just to be thrown away.
    if exists(ofile) and not force:
        tqdm.write("detect existing " + ofile +
                   ' no force param input, so it quit instead of writing.')
        return

    # The second value returned by parse_id is not needed here.
    order_id_list, _ = parse_id(infile)
    convertor = NCBI_convertor(order_id_list, db='protein')
    pid2assembly_dict = convertor.pid2assembly()

    # dirname() is '' for a bare file name; os.makedirs('') would raise, and
    # exist_ok avoids failing if the directory appears in the meantime.
    out_dir = dirname(ofile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(ofile, 'w') as f1:
        print('#accession ID\tGI\tassembly_ID\tnuccore ID\tstart\tend\tstrand',
              file=f1)
        for pid in convertor.origin_ids:
            GI = convertor.GI[pid]
            _dict = pid2assembly_dict[pid]
            assembly_id = _dict['assembly']
            info = _dict['nuc_info']
            print(f'{pid}\t{GI}\t{assembly_id}\t' + '\t'.join(map(str, info)),
                  file=f1)
    tqdm.write('finish writing into ' + ofile)
def main(infile, ofile, db='protein', force=False, redo=False):
    """Write assembly, BioProject and BioSample metadata for the IDs in `infile`.

    With ``db='protein'`` the IDs are first mapped to assembly IDs through
    `NCBI_convertor.pid2assembly()` (empty assemblies are dropped); with
    ``db='genome'`` they are used as assembly IDs directly. The three tables
    returned by `genomeID2Bio` are then joined column-wise on the assembly
    index and written to `ofile` as a tab-separated file.

    Args:
        infile: Path handed to `parse_id` to obtain the ordered ID list.
        ofile: Output path of the tab-separated table.
        db: Kind of the input IDs, either 'protein' or 'genome'.
        force: When False and `ofile` already exists, nothing is retrieved or
            written and the function returns after reporting it.
        redo: Accepted for interface compatibility; not used by this function.

    Returns:
        None.

    Raises:
        SyntaxError: If `db` is neither 'protein' nor 'genome'.
        Exception: If more than 15000 assemblies would have to be joined.
    """
    # Validate the cheap preconditions first so that nothing is created or
    # downloaded for a call that cannot succeed.
    if db not in ('protein', 'genome'):
        # SyntaxError (rather than ValueError) is kept so that callers which
        # already catch it keep working; the message now names the real
        # parameter instead of the non-existent `start_at`.
        raise SyntaxError(
            f"wrong input of db: {db!r} (expected 'protein' or 'genome')")

    # Same outcome as checking right before writing, minus the wasted work.
    if exists(ofile) and not force:
        tqdm.write("detect existing " + ofile +
                   ' no force param input, so it quit instead of writing.')
        return

    # dirname() is '' for a bare file name; os.makedirs('') would raise.
    out_dir = dirname(ofile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # The second value returned by parse_id is not needed here.
    order_id_list, _ = parse_id(infile)
    if db == 'protein':
        convertor = NCBI_convertor(order_id_list, db='protein')
        pid2assembly_dict = convertor.pid2assembly()
        # Keep only proteins for which an assembly was actually found.
        genome_IDs = [_dict['assembly']
                      for _dict in pid2assembly_dict.values()
                      if _dict['assembly']]
    else:
        genome_IDs = order_id_list[:]

    gid2assembly_info, bp2info, bs2info = genomeID2Bio(genome_IDs)

    # Only moderately sized results are joined in memory with pandas; larger
    # inputs are rejected rather than handled.
    if len(gid2assembly_info) > 15000:
        raise Exception('too much genomes to process')

    ginfo_df = pd.DataFrame.from_dict(gid2assembly_info, orient='index')
    bp_df = pd.DataFrame.from_dict(bp2info, orient='index')
    bs_df = pd.DataFrame.from_dict(bs2info, orient='index')

    # Align the BioProject / BioSample rows to the assemblies through the
    # accession columns, then re-label them with the assembly index so the
    # three frames can be concatenated side by side.
    _df1 = bp_df.reindex(ginfo_df.loc[:, 'BioprojectAccn'])
    _df1.index = ginfo_df.index
    _df2 = bs_df.reindex(ginfo_df.loc[:, 'BioSampleAccn'])
    _df2.index = ginfo_df.index
    full_df = pd.concat([ginfo_df, _df1, _df2], axis=1)

    # Embedded newlines would break the one-record-per-line output format.
    # NOTE(review): applymap is deprecated in recent pandas in favour of
    # DataFrame.map; kept because the pandas version in use is unknown here.
    full_df = full_df.applymap(
        lambda x: x.replace('\n', ' ') if isinstance(x, str) else x)
    full_df = full_df.drop(['GI', 'relative biosample'], axis=1)

    full_df.to_csv(ofile, sep='\t', index=True,
                   index_label='AssemblyAccn raw')
    tqdm.write('finish writing into ' + ofile +
               ' with tab separator format.')
def id2tax(id_list, redo=False, db="protein"):
    """Collect GI, taxid and accession for every ID handled by the converter.

    Args:
        id_list: IDs passed to `NCBI_convertor`.
        redo: Forwarded to `NCBI_convertor.check_cache`.
        db: Database name passed to `NCBI_convertor`.

    Returns:
        A `defaultdict(dict)` keyed by original ID; each value carries the
        keys 'GI', 'taxid' and 'accession'.
    """
    cache_suffix = 'protein2GI'
    ncbi = NCBI_convertor(id_list, db)
    ncbi.check_cache(suffix=cache_suffix, redo=redo)
    ncbi.get_taxon()

    pid2info_dict = defaultdict(dict)
    for origin_id in ncbi.origin_ids:
        record = {
            'GI': ncbi.GI[origin_id],
            'taxid': ncbi.tids[origin_id],
            'accession': origin_id,
        }
        pid2info_dict[origin_id].update(record)

    # Hand the collected mapping to access_intermedia under the same suffix
    # that was used for the cache check.
    access_intermedia(pid2info_dict, suffix=cache_suffix)
    return pid2info_dict
def genomeID2Bio(genome_IDs):
    """
    Retrieve assembly summaries together with the BioSample and BioProject
    records they reference.

    Returns a 3-tuple ``(aid2info, bp2info, bs2info)``: the converter's
    `dbsummary`, the result of `get_bioproject` and the result of
    `get_biosample`.
    """
    tqdm.write("with genome ID, start to retrieve genome information")
    assembly_convertor = NCBI_convertor(genome_IDs, db='assembly')
    assembly_convertor.get_GI()
    assembly_convertor.get_db_summary()
    aid2info = assembly_convertor.dbsummary

    def _unique_accessions(field):
        # De-duplicated, non-empty values of `field` across all summaries.
        return list({
            summary.get(field)
            for summary in aid2info.values()
            if summary.get(field)
        })

    bs_list = _unique_accessions('BioSampleAccn')
    bp_list = _unique_accessions('BioprojectAccn')

    tqdm.write("retrieving relative Bioproject and its Biosample info")
    bp2info = get_bioproject(bp_list)
    bs2info = get_biosample(bs_list)
    return aid2info, bp2info, bs2info
def aid2taxon(id_list, redo=False):
    """Return the taxon mapping produced by an assembly `NCBI_convertor`.

    Args:
        id_list: IDs passed to `NCBI_convertor` with database "assembly".
        redo: Forwarded to `NCBI_convertor.check_cache`.

    Returns:
        Whatever `NCBI_convertor.get_taxons_from_tid()` returns.
    """
    assembly_convertor = NCBI_convertor(id_list, "assembly")
    # NOTE(review): the cache suffix is 'protein2GI' although the converter is
    # built for the assembly database -- confirm this is intended.
    assembly_convertor.check_cache(suffix='protein2GI', redo=redo)
    return assembly_convertor.get_taxons_from_tid()
from bin.ncbi_convertor import NCBI_convertor


def _read_ids(path):
    """Return the non-blank lines of `path`, stripped of surrounding whitespace.

    The file is closed deterministically, and blank lines (including the empty
    string a trailing newline would otherwise produce) are not returned as IDs.
    """
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


if __name__ == "__main__":
    # Manual test run against ID lists stored in the working directory.
    # check_cache() is not called for any of the converters below.

    # Protein IDs: taxonomy lookup, then protein -> assembly mapping.
    pids = _read_ids('./protein_ids')
    convertor = NCBI_convertor(pids, db='protein')
    convertor.get_taxons_from_tid()
    pid2assembly_dict = convertor.pid2assembly()

    # Assembly IDs: taxonomy lookup only.
    aids = _read_ids('./assembly_ids')
    convertor = NCBI_convertor(aids, db='assembly')
    convertor.get_taxons_from_tid()

    # Nucleotide IDs: GI and summary retrieval before the taxonomy lookup.
    nids = _read_ids('./nucleotide_ids')
    convertor = NCBI_convertor(nids, db='nuccore')
    convertor.get_GI()
    convertor.get_db_summary()
    convertor.get_taxons_from_tid()