def dbquery(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
    null: bool,
) -> None:
    """Open a session on the configured database for interactive queries.

    Reads the application config, opens a self-contained session and, when
    ``ipython`` is set, embeds an IPython shell while the session is open.

    Example query to run from the embedded shell::

        session.bind.execute("select column_name, data_type, character_maximum_length from INFORMATION_SCHEMA.COLUMNS where table_name = 'pubchem'").fetchall()

    Args:
        ctx: Context object; ``ctx.obj`` supplies ``'database'`` and
            ``'appname'``.
        verbose: Print the loaded config and the session object via ``ic``.
        debug: Forwarded to ``click_read_config``.
        ipython: Embed an IPython shell inside the open session.
        null: Not used by this command.
    """
    context_obj = ctx.obj
    database = context_obj['database']
    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=context_obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    with self_contained_session(db_url=database) as session:
        if verbose:
            ic(session)

        if not ipython:
            return

        # The shell is started inside the ``with`` so ``session`` is usable.
        from IPython import embed
        embed()
def dumpconfig(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
    null: bool,
) -> None:
    """Pretty-print the config and dump the ``pubchem`` column metadata.

    The config returned by ``click_read_config`` is printed with
    ``pprint``; then every INFORMATION_SCHEMA.COLUMNS row for the
    ``pubchem`` table is printed via ``ic`` together with its index.

    Args:
        ctx: Context object; ``ctx.obj`` supplies ``'database'`` and
            ``'appname'``.
        verbose: Forwarded to ``click_read_config``.
        debug: Forwarded to ``click_read_config``.
        ipython: Embed an IPython shell after the rows have been printed.
        null: Not used by this command.
    """
    context_obj = ctx.obj
    database = context_obj['database']
    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=context_obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    pprint.pprint(config)

    with self_contained_session(db_url=database) as session:
        query = "select * from INFORMATION_SCHEMA.COLUMNS where table_name = 'pubchem'"
        rows = session.bind.execute(query).fetchall()
        for index, match in enumerate(rows):
            ic(index, match)

        if ipython:
            from IPython import embed
            embed()
def describe(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
) -> None:
    """Print name, type, length, default and nullability of each column.

    Queries INFORMATION_SCHEMA.COLUMNS for the ``pubchem`` table and prints
    a header line followed by one ``ic`` line per column row.

    Args:
        ctx: Context object; ``ctx.obj`` supplies ``'database'`` and
            ``'appname'``.
        verbose: Also print the loaded config and its mtime.
        debug: Forwarded to ``click_read_config``.
        ipython: Embed an IPython shell after the rows have been printed.
    """
    context_obj = ctx.obj
    database = context_obj['database']
    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=context_obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    query = "select column_name, data_type, character_maximum_length, column_default, is_nullable from INFORMATION_SCHEMA.COLUMNS where table_name = 'pubchem';"
    # Header naming the selected columns, in the order they appear per row.
    ic('column_name, data_type, character_maximum_length, column_default, is_nullable')

    with self_contained_session(db_url=database) as session:
        rows = session.bind.execute(query).fetchall()
        for index, match in enumerate(rows):
            ic(index, match)

        if ipython:
            from IPython import embed
            embed()
def last_cid(
    ctx,
    verbose: bool,
    debug: bool,
    ipython: bool,
) -> None:
    """Print the largest ``pubchem_compound_cid`` stored in ``pubchem``.

    Runs a ``SELECT MAX(...)`` query, fetches the single result row and
    prints each of its columns (with its index) via ``ic``.

    Args:
        ctx: Context object; ``ctx.obj`` supplies ``'database'`` and
            ``'appname'``.
        verbose: Also print the loaded config and its mtime.
        debug: Forwarded to ``click_read_config``.
        ipython: Embed an IPython shell after the result has been printed.
    """
    context_obj = ctx.obj
    database = context_obj['database']
    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=context_obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    query = "SELECT MAX(pubchem.pubchem_compound_cid) from pubchem"

    with self_contained_session(db_url=database) as session:
        # fetchone() yields one row; enumerating it walks that row's columns.
        single_row = session.bind.execute(query).fetchone()
        for index, match in enumerate(single_row):
            ic(index, match)

        if ipython:
            from IPython import embed
            embed()
def find(
    ctx,
    match: str,
    verbose: bool,
    cid: bool,
    debug: bool,
    ipython: bool,
) -> None:
    """Search the ``pubchem`` table and print every matching record.

    Without ``cid`` the search is a substring match (SQL ``LIKE``) on
    ``pubchem_iupac_name``, ordered by ``pubchem_exact_mass`` descending.
    With ``cid`` the search is an equality match on ``pubchem_compound_cid``.

    The user-supplied ``match`` is sent as a bound parameter rather than
    being formatted into the SQL text, so quotes in the search term cannot
    break or alter the statement. ``%`` and ``_`` inside ``match`` still
    act as LIKE wildcards.

    Args:
        ctx: Context object; ``ctx.obj`` supplies ``'database'`` and
            ``'appname'``.
        match: Search term (name fragment, or a CID when ``cid`` is set).
            Must be non-empty.
        verbose: Also print the loaded config and its mtime.
        cid: Treat ``match`` as a compound CID instead of a name fragment.
        debug: Forwarded to ``click_read_config``.
        ipython: Embed an IPython shell after the results have been printed.

    Raises:
        AssertionError: If ``match`` is empty.
    """
    assert match
    database = ctx.obj['database']
    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=ctx.obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    # Function-scope import, like the IPython import used in this file.
    from sqlalchemy import text

    if not cid:
        query = text(
            "SELECT * from pubchem "
            "WHERE pubchem.pubchem_iupac_name LIKE :pattern "
            "ORDER BY pubchem_exact_mass DESC"
        )
        params = {'pattern': '%' + match + '%'}
    else:
        query = text(
            "SELECT * from pubchem WHERE pubchem_compound_cid = :cid"
        )
        # Passed as a string, as the original quoted literal was.
        params = {'cid': match}

    with self_contained_session(db_url=database) as session:
        result = session.bind.execute(query, params)
        result_keys = result.keys()
        # ``row`` (not ``match``) so the search-term parameter is not shadowed.
        for index, row in enumerate(result.fetchall()):
            # Falsy column values (None, '', 0) are left out of the output.
            result_dict = {k: v for (k, v) in zip(result_keys, row) if v}
            humanized_result_dict = humanize_result_dict(result_dict)
            ic(index, humanized_result_dict)

        if ipython:
            import IPython
            IPython.embed()
def dbimport(
    ctx,
    paths,
    add: bool,
    verbose: bool,
    debug: bool,
    ipython: bool,
    simulate: bool,
    count: int,
    start_cid: int,
    delete_database: bool,
    null: bool,
) -> None:
    """Import PubChem SDF files into the ``pubchem`` table.

    For every input path the file's md5 is checked against the sibling
    ``<path>.md5`` file, then each molecule record is normalized (missing
    SDF keys filled in, keys lower-cased with spaces replaced by
    underscores, values coerced according to ``SDF_FIELD_TYPES``, empty
    values stored as ``None``) and added to the session. The session is
    committed every 1000 records and once more at the end of each file.

    Args:
        ctx: Context object; ``ctx.obj`` supplies ``'database'`` and
            ``'appname'``.
        paths: Input paths; passed to ``enumerate_input``. The last
            ``_``-separated component of each file name (up to the first
            ``.``) is parsed as the last CID contained in that file.
        add: Not used by this command.
        verbose: Print config, session and each raw record.
        debug: Forwarded to ``click_read_config`` and ``enumerate_input``.
        ipython: After the first imported file, embed an IPython shell and
            stop processing further files.
        simulate: Only list the files that would be imported; also
            suppresses ``delete_database``.
        count: If non-zero, stop (exit status 1) once more than ``count``
            records of a file have been read. The position within the file
            is what is counted, so records skipped via ``start_cid`` count
            as well.
        start_cid: If non-zero, skip files and records whose CID is below
            this value.
        delete_database: Delete the database before importing.
        null: Forwarded to ``enumerate_input``.

    Raises:
        AssertionError: If the config lacks the ``PUBCHEM_XLOGP3`` SDF key
            or a file's md5 does not match its ``.md5`` companion.
        SystemExit: When the ``count`` limit is reached.
    """
    # Approximate size of the full data set; only used for the ETA estimate.
    total_records = 155000000
    database = ctx.obj['database']
    if delete_database and not simulate:
        really_delete_database(database)

    config, config_mtime = click_read_config(
        click_instance=click,
        app_name=ctx.obj['appname'],
        verbose=verbose,
        debug=debug,
    )
    if verbose:
        ic(config, config_mtime)

    with self_contained_session(db_url=database) as session:
        if verbose:
            ic(session)

        ic(BASE)
        BASE.metadata.create_all(session.bind)

        if not paths:
            ic('waiting for input')

        all_sdf_keys = config['sdf_keys'].keys()
        assert "PUBCHEM_XLOGP3" in all_sdf_keys

        for index, path in enumerate_input(
            iterator=paths,
            null=null,
            debug=debug,
            skip=None,
            head=None,
            tail=None,
            verbose=verbose,
        ):
            path = Path(path).expanduser()
            last_cid_in_file = int(path.name.split("_")[-1].split('.')[0])
            ic(last_cid_in_file)
            if start_cid and last_cid_in_file < start_cid:
                ic('skipping:', path)
                continue

            ic(index, path)
            if simulate:
                continue

            import_start_time = time.time()  # per sdf.gz
            md5_hash = md5_hash_file(path)
            expected_md5 = Path(path.as_posix() + '.md5').read_text().split()[0]
            ic(md5_hash)
            ic(expected_md5)
            assert md5_hash == expected_md5

            records = molecule_dict_generator(path=path.as_posix(), verbose=verbose)
            for mindex, mdict in enumerate(records):
                if start_cid and int(mdict['PUBCHEM_COMPOUND_CID']) < start_cid:
                    continue

                # Stop once more than `count` records have been seen. The
                # previous test (`count > mindex + 1`) was inverted and
                # exited on the very first record for any count > 1.
                if count and (mindex + 1) > count:
                    # Persist the rows added since the last periodic commit.
                    session.commit()
                    ic(count)
                    sys.exit(1)

                # Every configured SDF key must be present on the row.
                for key in all_sdf_keys:
                    mdict.setdefault(key, '')

                if verbose:
                    ic(mdict)

                # Normalize keys: lower-case, spaces replaced by underscores.
                mdict = {
                    k.lower().replace(' ', '_'): v for k, v in mdict.items()
                }

                for key in mdict:
                    key_type = SDF_FIELD_TYPES[key]
                    if mdict[key]:
                        if key_type in ['Integer', 'Boolean']:
                            mdict[key] = int(mdict[key])
                        if key_type in ['Boolean']:
                            mdict[key] = bool(mdict[key])
                    else:
                        # '' (or any other falsy value) is stored as NULL.
                        mdict[key] = None

                pubchem_row = PubChem(**mdict)
                session.add(pubchem_row)

                if mindex % 1000 == 0:
                    session.commit()
                    # Progress / ETA report, computed only when it is shown.
                    cid = mdict['pubchem_compound_cid']
                    elapsed_time = max(int(time.time() - import_start_time), 1)
                    records_per_sec = max(int((mindex + 1) / elapsed_time), 1)
                    records_remaning = total_records - cid
                    seconds_eta = records_remaning / records_per_sec
                    hours_eta = seconds_eta / (60 * 60)
                    days_eta = round(hours_eta / 24, 3)
                    name = mdict['pubchem_iupac_name']
                    ic(days_eta, records_per_sec, records_remaning, mindex, cid, name)

            # Commit the trailing partial batch of this file.
            session.commit()

            if ipython:
                import IPython
                IPython.embed()
                break