Exemple #1
0
def dbquery(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
    null: bool,
):
    '''
    Open a database session and optionally embed an IPython shell in it.

    The function reads the application config, opens a session on
    ctx.obj['database'] and, when asked to, starts IPython while the
    session is still open. It does not run any query itself.

    Example query that can be run against the session:

    session.bind.execute("select column_name,
                          data_type,
                          character_maximum_length from INFORMATION_SCHEMA.COLUMNS where table_name = 'pubchem'").fetchall()

    Args:
        ctx: context object whose ``obj`` mapping holds the 'database'
            and 'appname' entries.
        verbose: when true, print the config, its mtime and the session
            with ic(); also forwarded to click_read_config().
        debug: forwarded to click_read_config().
        ipython: when true, start an embedded IPython shell inside the
            open session.
        null: accepted but not used by this function.
    '''

    database = ctx.obj['database']

    # Collect the keyword arguments first so the call itself stays short.
    read_options = {
        'click_instance': click,
        'app_name': ctx.obj['appname'],
        'verbose': verbose,
        'debug': debug,
    }
    config, config_mtime = click_read_config(**read_options)
    if verbose:
        ic(config, config_mtime)

    with self_contained_session(db_url=database) as session:
        if verbose:
            ic(session)

        # Nothing else to do unless an interactive shell was requested.
        if not ipython:
            return

        import IPython
        IPython.embed()
Exemple #2
0
def dumpconfig(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
    null: bool,
):
    '''
    Pretty-print the application config, then list the pubchem columns.

    After printing the config, every row that INFORMATION_SCHEMA.COLUMNS
    holds for the 'pubchem' table is printed with ic(), one row at a time.

    Args:
        ctx: context object whose ``obj`` mapping holds the 'database'
            and 'appname' entries.
        verbose: forwarded to click_read_config().
        debug: forwarded to click_read_config().
        ipython: when true, start an embedded IPython shell inside the
            open session after the rows were printed.
        null: accepted but not used by this function.
    '''

    database = ctx.obj['database']

    config, config_mtime = click_read_config(click_instance=click,
                                             app_name=ctx.obj['appname'],
                                             verbose=verbose,
                                             debug=debug)
    pprint.pprint(config)

    query = "select * from INFORMATION_SCHEMA.COLUMNS where table_name = 'pubchem'"

    with self_contained_session(db_url=database) as session:
        # Fetch everything up front, then print row by row.
        rows = session.bind.execute(query).fetchall()
        for index, match in enumerate(rows):
            ic(index, match)

        if ipython:
            import IPython
            IPython.embed()
Exemple #3
0
def describe(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
):
    '''
    Print the column layout of the pubchem table.

    A header naming the selected fields is printed first, followed by one
    ic() line per row of INFORMATION_SCHEMA.COLUMNS for the 'pubchem'
    table (name, data type, maximum length, default and nullability).

    Args:
        ctx: context object whose ``obj`` mapping holds the 'database'
            and 'appname' entries.
        verbose: when true, print the config and its mtime with ic();
            also forwarded to click_read_config().
        debug: forwarded to click_read_config().
        ipython: when true, start an embedded IPython shell inside the
            open session after the rows were printed.
    '''

    database = ctx.obj['database']
    config, config_mtime = click_read_config(click_instance=click,
                                             app_name=ctx.obj['appname'],
                                             verbose=verbose,
                                             debug=debug)
    if verbose:
        ic(config, config_mtime)

    query = "select column_name, data_type, character_maximum_length, column_default, is_nullable from INFORMATION_SCHEMA.COLUMNS where table_name = 'pubchem';"

    # Header line: the same field list, in the same order, as the SELECT above.
    ic('column_name, data_type, character_maximum_length, column_default, is_nullable')

    with self_contained_session(db_url=database) as session:
        column_rows = session.bind.execute(query).fetchall()
        for index, match in enumerate(column_rows):
            ic(index, match)

        if ipython:
            import IPython
            IPython.embed()
Exemple #4
0
def last_cid(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
):
    '''
    Print the largest pubchem_compound_cid stored in the pubchem table.

    Args:
        ctx: context object whose ``obj`` mapping holds the 'database'
            and 'appname' entries.
        verbose: when true, print the config and its mtime with ic();
            also forwarded to click_read_config().
        debug: forwarded to click_read_config().
        ipython: when true, start an embedded IPython shell inside the
            open session after the value was printed.
    '''

    database = ctx.obj['database']

    read_options = dict(
        click_instance=click,
        app_name=ctx.obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    config, config_mtime = click_read_config(**read_options)
    if verbose:
        ic(config, config_mtime)

    # MAX() returns a single row instead of an ordered list of every CID.
    query = "SELECT MAX(pubchem.pubchem_compound_cid) from pubchem"

    with self_contained_session(db_url=database) as session:
        row = session.bind.execute(query).fetchone()
        # The row has one column per selected expression; print each of them.
        for index, match in enumerate(row):
            ic(index, match)

        if ipython:
            import IPython
            IPython.embed()
Exemple #5
0
def find(
    ctx,
    match: str,
    verbose: bool,
    cid: bool,
    debug: bool,
    ipython: bool,
):
    '''
    Search the pubchem table by IUPAC-name substring or by compound CID.

    Each matching row is turned into a dict of its truthy columns, passed
    through humanize_result_dict() and printed with ic().

    Args:
        ctx: context object whose ``obj`` mapping holds the 'database'
            and 'appname' entries.
        match: the search term. With ``cid`` false it is matched as a
            substring of pubchem_iupac_name (LIKE wildcards in the term
            stay active); with ``cid`` true it is compared to
            pubchem_compound_cid.
        verbose: when true, print the config and its mtime with ic();
            also forwarded to click_read_config().
        cid: select the CID lookup instead of the name search.
        debug: forwarded to click_read_config().
        ipython: when true, start an embedded IPython shell inside the
            open session after the results were printed.

    Raises:
        ValueError: if ``match`` is empty.
    '''
    # Imported here so the block does not depend on a file-level import
    # that may or may not exist.
    from sqlalchemy import text

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not match:
        raise ValueError('match must be a non-empty search term')

    database = ctx.obj['database']
    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=ctx.obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    # The search term comes from the command line, so it is sent as a bound
    # parameter and never spliced into the SQL text.
    if not cid:
        query = text(
            "SELECT * from pubchem "
            "WHERE pubchem.pubchem_iupac_name LIKE :pattern "
            "ORDER BY pubchem_exact_mass DESC")
        params = {'pattern': '%{}%'.format(match)}
    else:
        query = text(
            "SELECT * from pubchem WHERE pubchem_compound_cid = :cid")
        # NOTE(review): the value is passed as a string, matching the quoted
        # literal the previous string-built query used; switch to int(match)
        # if the driver in use requires a typed integer parameter.
        params = {'cid': match}

    with self_contained_session(db_url=database) as session:
        result = session.bind.execute(query, params)
        result_keys = result.keys()
        # `row` rather than `match`, so the search term is not shadowed.
        for index, row in enumerate(result.fetchall()):
            # Only truthy columns are kept (this also drops 0 and False).
            result_dict = {k: v for (k, v) in zip(result_keys, row) if v}
            humanized_result_dict = humanize_result_dict(result_dict)
            ic(index, humanized_result_dict)

        if ipython:
            import IPython
            IPython.embed()
Exemple #6
0
def dbimport(
    ctx,
    paths,
    add: bool,
    verbose: bool,
    debug: bool,
    ipython: bool,
    simulate: bool,
    count: int,
    start_cid: int,
    delete_database: bool,
    null: bool,
):
    '''
    Import PubChem SDF files into the pubchem table.

    For every input path the file's md5 is checked against the companion
    ``<path>.md5`` file, then each molecule record is normalised, wrapped
    in a PubChem row and added to the session. The session is committed
    every 1000 records and once more after each file, so the last partial
    batch is not left pending.

    Args:
        ctx: context object whose ``obj`` mapping holds the 'database'
            and 'appname' entries.
        paths: input paths, handed to enumerate_input(). The last
            '_'-separated part of each file name (up to the first '.')
            must be an integer; it is read as the last CID in that file.
        add: accepted but not used by this function.
        verbose: when true, print the config, the session and every raw
            record with ic(); also forwarded to the helpers.
        debug: forwarded to click_read_config() and enumerate_input().
        ipython: when true, start an embedded IPython shell after the
            first imported file and stop the import afterwards.
        simulate: when true, only print which files would be handled;
            nothing is deleted, hashed or imported.
        count: when non-zero, stop once this many records of a file have
            been seen (records skipped through start_cid count as seen).
        start_cid: when non-zero, skip files whose last CID is below it
            and skip individual records whose CID is below it.
        delete_database: when true (and not simulating), call
            really_delete_database() before importing.
        null: forwarded to enumerate_input().

    Raises:
        ValueError: if the config lacks the PUBCHEM_XLOGP3 sdf key or a
            file's md5 does not match its .md5 companion.
        SystemExit: with status 1 once ``count`` records were seen.
    '''

    # Presumably the approximate number of compounds to import; it is only
    # used for the ETA estimate below.
    total_records = 155000000

    database = ctx.obj['database']
    if delete_database and not simulate:
        really_delete_database(database)

    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=ctx.obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    with self_contained_session(db_url=database) as session:
        if verbose:
            ic(session)

        ic(BASE)
        BASE.metadata.create_all(session.bind)

        if not paths:
            ic('waiting for input')

        all_sdf_keys = config['sdf_keys'].keys()
        # Sanity check on the config; explicit raise because `assert` is
        # stripped under `python -O`.
        if "PUBCHEM_XLOGP3" not in all_sdf_keys:
            raise ValueError(
                "config['sdf_keys'] does not contain PUBCHEM_XLOGP3")

        for index, path in enumerate_input(iterator=paths,
                                           null=null,
                                           debug=debug,
                                           skip=None,
                                           head=None,
                                           tail=None,
                                           verbose=verbose):
            path = Path(path).expanduser()
            # File names end in "_<last cid>.<extension>".
            last_cid_in_file = int(path.name.split("_")[-1].split('.')[0])
            ic(last_cid_in_file)
            if start_cid and last_cid_in_file < start_cid:
                ic('skipping:', path)
                continue

            ic(index, path)
            if simulate:
                continue

            import_start_time = time.time()  # per sdf.gz
            md5_hash = md5_hash_file(path)
            expected_md5 = Path(path.as_posix() +
                                '.md5').read_text().split()[0]
            ic(md5_hash)
            ic(expected_md5)
            # Integrity check must survive `python -O`, so no `assert` here.
            if md5_hash != expected_md5:
                raise ValueError(
                    'md5 mismatch for {}: got {}, expected {}'.format(
                        path, md5_hash, expected_md5))

            for mindex, mdict in enumerate(
                    molecule_dict_generator(path=path.as_posix(),
                                            verbose=verbose)):
                if start_cid and int(mdict['PUBCHEM_COMPOUND_CID']) < start_cid:
                    continue

                # Stop once `count` records of this file were seen. The
                # previous test (`count > mindex + 1`) was inverted and left
                # on the very first record for any count above 1.
                if count and mindex >= count:
                    ic(count)
                    # Keep what was imported so far before leaving.
                    session.commit()
                    # NOTE(review): exit status 1 is kept from the original
                    # code even though reaching the limit is not an error.
                    sys.exit(1)

                # Every configured SDF key must be present on the row.
                for key in all_sdf_keys:
                    mdict.setdefault(key, '')

                if verbose:
                    ic(mdict)

                mdict = _normalize_sdf_record(mdict)

                pubchem_row = PubChem(**mdict)
                cid = mdict['pubchem_compound_cid']
                # Both values are clamped to 1 to avoid dividing by zero.
                elapsed_time = max(int(time.time() - import_start_time), 1)
                records_per_sec = max(int((mindex + 1) / elapsed_time), 1)
                records_remaning = total_records - cid
                seconds_eta = records_remaning / records_per_sec
                hours_eta = seconds_eta / (60 * 60)
                days_eta = round(hours_eta / 24, 3)

                session.add(pubchem_row)
                # Commit in batches of 1000 and report progress.
                if mindex % 1000 == 0:
                    session.commit()
                    name = mdict['pubchem_iupac_name']
                    ic(days_eta, records_per_sec, records_remaning, mindex,
                       cid, name)

            # Commit the records added since the last full batch.
            session.commit()

            if ipython:
                import IPython
                IPython.embed()
                break


def _normalize_sdf_record(record):
    '''
    Return a copy of an SDF record with column-style keys and typed values.

    Keys are lower-cased and spaces become underscores. Empty values become
    None; other values are converted according to SDF_FIELD_TYPES
    ('Integer' -> int, 'Boolean' -> bool(int(value))), everything else is
    kept unchanged.

    Raises:
        KeyError: if a normalised key is missing from SDF_FIELD_TYPES.
    '''
    normalized = {}
    for raw_key, value in record.items():
        key = raw_key.lower().replace(' ', '_')
        field_type = SDF_FIELD_TYPES[key]
        if not value:  # '' marks a missing field
            normalized[key] = None
        elif field_type == 'Integer':
            normalized[key] = int(value)
        elif field_type == 'Boolean':
            normalized[key] = bool(int(value))
        else:
            normalized[key] = value
    return normalized