import pandas as pd

# CHUNK_SIZE and CLASS_EXCLUSIONS are module-level constants assumed to be
# defined elsewhere in this module (streaming chunk size; name classes to drop).


def create_bacterial_df(nodes_ncbi_path, names_ncbi_path):
    # names.dmp/nodes.dmp use '\t|\t' as the field separator, so with the default
    # tab delimiter the real fields land in the even-numbered columns:
    # usecols=[0, 2, 6] is tax_id, name_txt and name class in names.dmp;
    # usecols=[0, 2, 4] is tax_id, parent tax_id and rank in nodes.dmp.
    ncbi_names_iter = pd.read_table(names_ncbi_path, names=['id', 'name', 'class'], usecols=[0, 2, 6], header=None,
                                    chunksize=CHUNK_SIZE)
    ncbi_nodes = pd.read_table(nodes_ncbi_path, names=['id', 'parent_id', 'rank'], usecols=[0, 2, 4], header=None)
    # Stream names.dmp in chunks, dropping excluded name classes as we go.
    ncbi_names = pd.concat([chunk[~chunk['class'].isin(CLASS_EXCLUSIONS)] for chunk in ncbi_names_iter])

    # Collect all bacterial tax ids, keep only their names, and attach the
    # catalog columns (dropping duplicate spellings of the same name).
    ids_all = create_all_bact_catalog(ncbi_nodes)
    names_all_bact = ncbi_names[ncbi_names['id'].isin(ids_all['id'].tolist())]
    names_all_bact = pd.merge(names_all_bact, ids_all, how='left', on='id', copy=False).drop_duplicates('name')

    # Expand each name into additional lookup variants (module helper), then
    # deduplicate by name and force plain integer ids.
    names_all_bact = generate_excessive_dictionary_bact(names_all_bact)
    names_all_bact = names_all_bact.drop_duplicates(subset=['name'])
    names_all_bact['id'] = names_all_bact['id'].astype(int)

    # A scientific-name description table could also be returned:
    # description = create_scientific_table(ncbi_nodes, ncbi_names)
    # return [names_all_bact, description]
    return [names_all_bact]
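

# Example call (a sketch): 'nodes.dmp' and 'names.dmp' are the conventional
# member names of an NCBI taxdump archive; the paths below are assumptions.
#
#     names_all_bact, = create_bacterial_df('taxdump/nodes.dmp', 'taxdump/names.dmp')
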
def create_gut_bacterial_df(nodes_ncbi_path, names_ncbi_path, gut_bact_list_path):
    # Names missing from NCBI get synthetic ids far above the real tax id range.
    SYNTHETIC_ID = 1000000000
    gut_names = pd.read_csv(gut_bact_list_path, names=['name'])
    # Load the NCBI dumps the same way as in create_bacterial_df.
    ncbi_names_iter = pd.read_table(names_ncbi_path, names=['id', 'name', 'class'], usecols=[0, 2, 6], header=None,
                                    chunksize=CHUNK_SIZE)
    ncbi_nodes = pd.read_table(nodes_ncbi_path, names=['id', 'parent_id', 'rank'], usecols=[0, 2, 4], header=None)
    ncbi_names = pd.concat([chunk[~chunk['class'].isin(CLASS_EXCLUSIONS)] for chunk in ncbi_names_iter])

    # First word of each name (usually the genus), kept as a fallback match.
    gut_names_first = gut_names['name'].str.split(' ').str[0]
    gut_names = pd.merge(gut_names, ncbi_names[['name', 'id']], how='left', on='name')

    # Fallback for names not found in the NCBI base: retry the match using only
    # the first word of each name; anything still unmatched gets a synthetic id.
    missing = gut_names['id'].isna()
    gut_names_unknown = gut_names[missing].copy()
    gut_names.loc[missing, 'name'] = gut_names_first[missing]
    gut_names = pd.merge(gut_names[['name']], ncbi_names, how='left', on='name')
    gut_names_unknown = pd.concat([gut_names_unknown, gut_names[gut_names['id'].isna()].copy()])
    gut_names_unknown['id'] = range(SYNTHETIC_ID, SYNTHETIC_ID + len(gut_names_unknown))
    gut_names_unknown['class'] = 'unknown'
    gut_names_unknown['rank'] = 'unknown'

    # Keep only the names that matched an NCBI id, one row per id.
    gut_names = gut_names[gut_names['id'].notna()]
    gut_names = gut_names.drop_duplicates(subset='id')

    # Module helpers: filter the ids by rank, then pull in their linked parent
    # ids so the catalog also covers the enclosing taxa.
    gut_ids = clear_ids_by_rank(gut_names['id'].values, ncbi_nodes)

    gut_parent_ids = get_bind_ids(gut_ids['id'].values, ncbi_nodes)
    gut_ids_table = pd.concat([gut_parent_ids, gut_ids]).drop_duplicates('id')
    # Rebuild the names table for the combined id set and attach rank/parent.
    gut_names = ncbi_names[ncbi_names['id'].isin(gut_ids_table['id'].tolist())]
    gut_names = pd.merge(gut_names, gut_ids_table, how='left', on='id', copy=False).drop_duplicates('name')

    # Re-attach the synthetic (unmatched) entries, expand the name variants,
    # deduplicate by name and force plain integer ids.
    gut_names = pd.concat([gut_names, gut_names_unknown])
    gut_names = generate_excessive_dictionary_bact(gut_names)
    gut_names = gut_names.drop_duplicates(subset=['name'])
    gut_names['id'] = gut_names['id'].astype(int)
    return gut_names
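

# Example call (a sketch): the gut list is assumed to be a plain text file
# with one bacterium name per line; the file names are assumptions.
#
#     gut_df = create_gut_bacterial_df('taxdump/nodes.dmp', 'taxdump/names.dmp',
#                                      'gut_bacteria_list.txt')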