예제 #1
0
def main(infile, ofile, force=False):
    """Write a tab-separated table linking protein IDs to assembly locations.

    IDs are read from ``infile`` with ``parse_id``, resolved through
    ``NCBI_convertor(..., db='protein').pid2assembly()`` and written to
    ``ofile``: a header line followed by one row per entry of
    ``convertor.origin_ids``.

    Args:
        infile: Path handed to ``parse_id``.
        ofile: Output path; normalised with ``process_path`` first.
        force: When false and ``ofile`` already exists, a message is emitted
            and the function returns without touching the file.
    """
    ofile = process_path(ofile)

    # Decide whether to write *before* doing any ID conversion, so that a
    # no-op run does not perform the lookups just to throw them away.
    if exists(ofile) and not force:
        tqdm.write("detect existing " + ofile +
                   ' no force param input, so it quit instead of writing.')
        return

    # exist_ok avoids the check-then-create race; the guard skips the
    # empty dirname of a bare file name, which os.makedirs rejects.
    out_dir = dirname(ofile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # parse_id also returns an annotation mapping that is not needed here.
    order_id_list, _ = parse_id(infile)
    convertor = NCBI_convertor(order_id_list, db='protein')
    pid2assembly_dict = convertor.pid2assembly()

    with open(ofile, 'w') as f1:
        print('#accession ID\tGI\tassembly_ID\tnuccore ID\tstart\tend\tstrand',
              file=f1)
        for pid in convertor.origin_ids:
            GI = convertor.GI[pid]
            _dict = pid2assembly_dict[pid]
            assembly_id = _dict['assembly']
            info = _dict['nuc_info']
            print(f'{pid}\t{GI}\t{assembly_id}\t' + '\t'.join(map(str, info)),
                  file=f1)
    tqdm.write('finish writing into ' + ofile)
예제 #2
0
def main(infile, ofile, db='protein', force=False, redo=False):
    """Collect assembly, BioProject and BioSample metadata into one TSV file.

    IDs are read from ``infile`` with ``parse_id``. With ``db='protein'``
    they are first mapped to assembly IDs through
    ``NCBI_convertor.pid2assembly()``; with ``db='genome'`` they are used as
    assembly IDs directly. ``genomeID2Bio`` then supplies three mappings that
    are joined column-wise (one row per assembly) and written to ``ofile``.

    Args:
        infile: Path handed to ``parse_id``.
        ofile: Output path of the tab-separated table.
        db: Either ``'protein'`` or ``'genome'``.
        force: When false and ``ofile`` already exists, a message is emitted
            and the function returns without touching the file.
        redo: Not used by this function; kept so existing callers that pass
            it keep working.

    Raises:
        SyntaxError: If ``db`` is not one of the supported values. (The
            exception type is unchanged from the previous implementation so
            existing ``except`` clauses still match.)
        Exception: If more than 15000 assemblies were retrieved.
    """
    # Validate the cheap argument first, before any filesystem side effect.
    if db not in ('protein', 'genome'):
        raise SyntaxError(
            f"wrong input of db: {db!r} (expected 'protein' or 'genome')")

    # Decide whether to write before retrieving anything.
    if exists(ofile) and not force:
        tqdm.write("detect existing " + ofile +
                   ' no force param input, so it quit instead of writing.')
        return

    # A bare file name has an empty dirname, which os.makedirs rejects.
    out_dir = dirname(ofile)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # parse_id also returns an annotation mapping that is not needed here.
    order_id_list, _ = parse_id(infile)

    if db == 'protein':
        convertor = NCBI_convertor(order_id_list, db='protein')
        pid2assembly_dict = convertor.pid2assembly()
        # Keep only proteins for which an assembly ID was found.
        genome_IDs = [
            info['assembly'] for info in pid2assembly_dict.values()
            if info['assembly']
        ]
    else:
        genome_IDs = list(order_id_list)

    gid2assembly_info, bp2info, bs2info = genomeID2Bio(genome_IDs)

    # Only small result sets are joined in memory with pandas.
    if len(gid2assembly_info) > 15000:
        raise Exception('too much genomes to process')

    ginfo_df = pd.DataFrame.from_dict(gid2assembly_info, orient='index')
    bp_df = pd.DataFrame.from_dict(bp2info, orient='index')
    bs_df = pd.DataFrame.from_dict(bs2info, orient='index')

    # Look up each assembly's BioProject / BioSample row, then re-key those
    # rows by assembly so all three frames share one index.
    _df1 = bp_df.reindex(ginfo_df.loc[:, 'BioprojectAccn'])
    _df1.index = ginfo_df.index
    _df2 = bs_df.reindex(ginfo_df.loc[:, 'BioSampleAccn'])
    _df2.index = ginfo_df.index

    full_df = pd.concat([ginfo_df, _df1, _df2], axis=1)
    # Embedded newlines would break the one-row-per-line TSV layout.
    full_df = full_df.applymap(lambda x: x.replace('\n', ' ')
                               if isinstance(x, str) else x)
    # errors='ignore': tolerate results that lack one of these columns.
    full_df = full_df.drop(['GI', 'relative biosample'],
                           axis=1,
                           errors='ignore')

    full_df.to_csv(ofile,
                   sep='\t',
                   index=True,
                   index_label='AssemblyAccn raw')
    tqdm.write('finish writing into ' + ofile +
               ' with tab separator format.')
예제 #3
0
def id2tax(id_list, redo=False, db="protein"):
    """Return a per-ID mapping holding the GI, taxid and accession.

    A ``NCBI_convertor`` is built for ``id_list`` on ``db``, its cache is
    checked with suffix ``'protein2GI'`` and ``get_taxon()`` is called. For
    every entry of ``convertor.origin_ids`` the result contains the keys
    ``'GI'``, ``'taxid'`` and ``'accession'``. The mapping is also handed to
    ``access_intermedia`` under the same suffix before being returned.
    """
    cache_suffix = 'protein2GI'
    converter = NCBI_convertor(id_list, db)
    converter.check_cache(suffix=cache_suffix, redo=redo)
    converter.get_taxon()

    info_by_id = defaultdict(dict)
    for origin_id in converter.origin_ids:
        info_by_id[origin_id].update({
            'GI': converter.GI[origin_id],
            'taxid': converter.tids[origin_id],
            'accession': origin_id,
        })

    access_intermedia(info_by_id, suffix=cache_suffix)
    return info_by_id
예제 #4
0
def genomeID2Bio(genome_IDs):
    """
    Fetch assembly summaries plus the BioSample and BioProject info they name.

    Returns a 3-tuple ``(aid2info, bp2info, bs2info)``: the convertor's
    ``dbsummary`` for the given assembly IDs, the result of
    ``get_bioproject`` on the distinct ``'BioprojectAccn'`` values, and the
    result of ``get_biosample`` on the distinct ``'BioSampleAccn'`` values.
    """
    tqdm.write("with genome ID, start to retrieve genome information")
    assembly_convertor = NCBI_convertor(genome_IDs, db='assembly')
    assembly_convertor.get_GI()
    assembly_convertor.get_db_summary()
    aid2info = assembly_convertor.dbsummary

    def distinct_values(field):
        # De-duplicated, truthy values of `field` across all summaries.
        return list({
            summary.get(field)
            for summary in aid2info.values()
            if summary.get(field)
        })

    bs_list = distinct_values('BioSampleAccn')
    bp_list = distinct_values('BioprojectAccn')

    tqdm.write("retrieving relative Bioproject and its Biosample info")
    bp2info = get_bioproject(bp_list)
    bs2info = get_biosample(bs_list)
    return aid2info, bp2info, bs2info
예제 #5
0
def aid2taxon(id_list, redo=False):
    """Return ``get_taxons_from_tid()`` of an assembly-db convertor.

    The convertor is built for ``id_list`` on the ``"assembly"`` database and
    its cache is checked (honouring ``redo``) before the taxon lookup.
    """
    assembly_convertor = NCBI_convertor(id_list, "assembly")
    # NOTE(review): the cache suffix is 'protein2GI' although the database is
    # "assembly" -- confirm this is intentional.
    assembly_convertor.check_cache(suffix='protein2GI', redo=redo)
    return assembly_convertor.get_taxons_from_tid()
예제 #6
0
from bin.ncbi_convertor import NCBI_convertor

def read_ids(path):
    """Return the non-blank lines of ``path``, stripped of whitespace.

    The file is closed on return. Blank lines, including the empty string a
    trailing newline would otherwise produce, are skipped so they are never
    passed on as IDs.
    """
    with open(path) as handle:
        return [line.strip() for line in handle if line.strip()]


if __name__ == "__main__":
    # Manual smoke test: run the convertor against one ID list per database.
    pids = read_ids('./protein_ids')
    convertor = NCBI_convertor(pids, db='protein')
    # convertor.check_cache(suffix=suffix, redo=redo)
    convertor.get_taxons_from_tid()
    pid2assembly_dict = convertor.pid2assembly()

    aids = read_ids('./assembly_ids')
    convertor = NCBI_convertor(aids, db='assembly')
    # convertor.check_cache(suffix=suffix, redo=redo)
    convertor.get_taxons_from_tid()

    nids = read_ids('./nucleotide_ids')
    convertor = NCBI_convertor(nids, db='nuccore')
    convertor.get_GI()
    convertor.get_db_summary()
    # convertor.check_cache(suffix=suffix, redo=redo)
    convertor.get_taxons_from_tid()