"Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print("Connected to TCRD database {} (schema ver {}; data ver {})". format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) load(args, dba, logger, logfile) # Dataset and Provenance dataset_id = dba.ins_dataset({ 'name': 'IDG Eligible Targets List', 'source': f'IDG generated data in file {IDG_LIST_FILE}.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'IDG Target Flags are archived on GitHub in repo https://github.com/druggablegenome/IDGTargets.', 'url': 'https://github.com/druggablegenome/IDGTargets' }) assert dataset_id, f"Error inserting dataset See logfile {logfile} for details." # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'idg' }, { 'dataset_id': dataset_id, 'table_name': 'target',
        format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
download_eco(args)
download_uniprots(args)
# UniProt uses ECO IDs in GOAs, not GO evidence codes, so get a mapping of
# ECO IDs to GO evidence codes
eco_map = mk_eco_map(args)  # a sketch of this helper follows this excerpt
# Human proteins
# Dataset and Provenance
# This has to be done first because the dataset id is needed for xrefs and aliases
dataset_id = dba.ins_dataset({'name': 'UniProt',
                              'source': f"UniProt XML file {UP_HUMAN_FILE} from {UP_BASE_URL}",
                              'app': PROGRAM, 'app_version': __version__,
                              'url': 'https://www.uniprot.org'})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
provs = [{'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'name'},
         {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'description'},
         {'dataset_id': dataset_id, 'table_name': 'target',
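# mk_eco_map() is not shown in this excerpt. A minimal sketch, assuming
# download_eco() fetched the ECO ontology in OBO format (the ECO_OBO_FILE
# constant here is hypothetical) and that GO evidence codes appear as
# GOECO xrefs on ECO terms, as they do in eco.obo:
import re

def mk_eco_map(args):
  # Map ECO IDs (e.g. 'ECO:0000501') to GO evidence codes (e.g. 'IEA')
  eco_map = {}
  eco_id = None
  with open(ECO_OBO_FILE) as ifh:  # hypothetical filename constant
    for line in ifh:
      line = line.strip()
      if line == '[Term]':
        eco_id = None
      m = re.match(r'^id: (ECO:\d+)', line)
      if m:
        eco_id = m.group(1)
      m = re.match(r'^xref: GOECO:([A-Z]{2,3})', line)
      if m and eco_id:
        eco_map[eco_id] = m.group(1)
  return eco_map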
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
  print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
start_time = time.time()
#do_glygen(dba, logger, logfile)
do_tiga(dba, logger, logfile)
# Dataset
dataset_id = dba.ins_dataset({'name': 'ExtLinks',
                              'source': 'Tested links to target/protein info in external resources.',
                              'app': PROGRAM, 'app_version': __version__})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
# Provenance
rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'extlink'})
assert rv, f"Error inserting provenance. See logfile {logfile} for details."
elapsed = time.time() - start_time
print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
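# slmf.secs2str() formats the elapsed seconds printed above. Its implementation
# is not part of this excerpt; a minimal sketch consistent with how it is
# called (duration in, display string out) might be:
import time

def secs2str(t):
  # Render a duration (< 24h) as HH:MM:SS; the real slmf helper may differ
  return time.strftime("%H:%M:%S", time.gmtime(t))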
if type(rv) == int:
  # an int return is a row count; a boolean False signals an error, so
  # isinstance() (under which False counts as an int) is avoided here
  print(f"\nSet tdl to NULL for {rv} target rows")
else:
  print(f"Error setting target.tdl values to NULL. See logfile {logfile} for details.")
  exit(1)
rv = dba.del_dataset('TDLs')
if rv:
  print("Deleted previous 'TDLs' dataset")
else:
  print(f"Error deleting 'TDLs' dataset. See logfile {logfile} for details.")
  exit(1)
load_tdls(dba, logfile, logger)
# Dataset
dataset_id = dba.ins_dataset({'name': 'TDLs',
                              'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                              'app': PROGRAM, 'app_version': __version__,
                              'comments': 'TDLs are calculated by the loading app from data in TCRD.'})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
# Provenance
rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'tdl'})
assert rv, f"Error inserting provenance. See logfile {logfile} for details."
# Add version number to filename and archive mapping file to old_versions dir
mmver = '.'.join(dbi['data_ver'].split('.')[:2])
outfn = OUTFILE_PAT.format(mmver)
export_uniprot_mapping(dba, outfn)
shutil.copy(outfn, '/usr/local/apache2/htdocs/tcrd/download/PharosTCRD_UniProt_Mapping.tsv')
print(f"Copied {outfn} to /usr/local/apache2/htdocs/tcrd/download/PharosTCRD_UniProt_Mapping.tsv")
elapsed = time.time() - start_time
print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
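# Quick sanity check of the major.minor extraction above (the version value
# is hypothetical; OUTFILE_PAT itself is defined elsewhere in the script):
assert '.'.join('6.12.4'.split('.')[:2]) == '6.12'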
if not args['--debug']:
  logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
  print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
# Dataset and Provenance
# This has to be done first because the dataset id is needed for xrefs
dataset_id = dba.ins_dataset({'name': 'HGNC',
                              'source': 'Custom download file from https://www.genenames.org/download/custom/',
                              'app': PROGRAM, 'app_version': __version__,
                              'url': 'http://www.genenames.org/',
                              'comments': 'File downloaded with the following column data: HGNC ID, Approved symbol, Approved name, Status, Chromosome, UniProt ID, NCBI Gene ID, Mouse genome database ID'})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
provs = [{'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'sym',
          'comment': "This is only updated with HGNC data if data from UniProt is absent."},
         {'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'geneid',
          'comment': "This is only updated with HGNC data if data from UniProt is absent."},
         {'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'chr'},
         {'dataset_id': dataset_id, 'table_name': 'xref',
          'where_clause': f"dataset_id = {dataset_id}", 'comment': 'These are MGI xrefs only.'}]
for prov in provs:
  rv = dba.ins_provenance(prov)
  assert rv, f"Error inserting provenance. See logfile {logfile} for details."
load(args, dba, dataset_id, logger, logfile)
elapsed = time.time() - start_time
print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
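# The DBAdaptor internals are not shown in these excerpts. A rough sketch of
# what an ins_provenance()-style method presumably does with the dicts above,
# namely one INSERT per dict into the provenance table (the column list and
# connection handling are assumptions; the real TCRD DBAdaptor may differ):
def ins_provenance(conn, init):
  cols = ['dataset_id', 'table_name', 'column_name', 'where_clause', 'comment']
  sql = "INSERT INTO provenance ({}) VALUES ({})".format(
      ", ".join(cols), ", ".join(["%s"] * len(cols)))
  with conn.cursor() as curs:  # conn: a MySQLdb-style connection (assumed)
    curs.execute(sql, [init.get(c) for c in cols])
  conn.commit()
  return True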
download_mappings()
up2chembl = parse_mappings(DOWNLOAD_DIR + UNIPROT2CHEMBL_FILE)
if not args['--quiet']:
  print("  Got {} UniProt to ChEMBL 'SINGLE PROTEIN' mappings".format(len(up2chembl)))
# process and load new data
load(args, dba, up2chembl, chembldb, logfile, logger)
# Dataset
dataset_id = dba.ins_dataset({'name': 'ChEMBL',
                              'source': 'ChEMBL MySQL database {}'.format(CHEMBL_DB),
                              'app': PROGRAM, 'app_version': __version__,
                              'url': 'https://www.ebi.ac.uk/chembl/',
                              'comments': "The ChEMBL activities in TCRD are from two sources only: 'Scientific Literature' and 'Patent Bioactivity Data', and are also filtered for family-specific cutoffs."})
assert dataset_id, f"Error inserting ChEMBL dataset. See logfile {logfile} for details."
dataset_id2 = dba.ins_dataset({'name': 'ChEMBL Info',
                               'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                               'app': PROGRAM, 'app_version':
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
  print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
start_time = time.time()
load(args, dba, logger, logfile)
# Dataset
dataset_id = dba.ins_dataset({'name': 'DRGC Resources',
                              'source': 'RSS APIs at ',
                              'app': PROGRAM, 'app_version': __version__})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
# Provenance
rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'drgc_resource'})
assert rv, f"Error inserting provenance. See logfile {logfile} for details."
elapsed = time.time() - start_time
print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
  print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
start_time = time.time()
load(args, dba, logger, logfile)
# Dataset
dataset_id = dba.ins_dataset({'name': 'PubMed',
                              'source': 'NCBI E-Utils',
                              'app': PROGRAM, 'app_version': __version__,
                              'url': 'https://www.ncbi.nlm.nih.gov/pubmed'})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
# Provenance
rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pubmed'})
assert rv, f"Error inserting provenance. See logfile {logfile} for details."
rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein2pubmed'})
assert rv, f"Error inserting provenance. See logfile {logfile} for details."
elapsed = time.time() - start_time
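# The 'NCBI E-Utils' source above is the Entrez Programming Utilities. A
# minimal sketch of pulling PubMed records the way a loader like this might;
# the efetch URL and the db/id/retmode parameters are the public E-Utils
# interface, while the function name and batching are assumptions:
import requests

EFETCH_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

def fetch_pubmed_xml(pmids):
  # Fetch PubMed records as XML for an iterable of PMIDs
  params = {'db': 'pubmed', 'id': ','.join(str(p) for p in pmids), 'retmode': 'xml'}
  resp = requests.get(EFETCH_URL, params=params)
  resp.raise_for_status()
  return resp.text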
loglevel = int(args['--loglevel'])
logger = logging.getLogger(__name__)
logger.setLevel(loglevel)
if not args['--debug']:
  logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
  print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
download(args)
load(args, dba, logger, logfile)
# Dataset and Provenance
dataset_id = dba.ins_dataset({'name': 'IMPC Phenotypes',
                              'source': "Files %s and %s from ftp://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/" % (os.path.basename(GENO_PHENO_FILE), os.path.basename(STAT_RES_FILE)),
                              'app': PROGRAM, 'app_version': __version__})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
provs = [{'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'IMPC'"}]
for prov in provs:
  rv = dba.ins_provenance(prov)
  assert rv, f"Error inserting provenance. See logfile {logfile} for details."
elapsed = time.time() - start_time
print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
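# A plausible sketch of the download(args) step above, assuming it simply
# pulls the two flat files from the EBI FTP directory named in the dataset
# source (urlretrieve handles ftp:// URLs; the exact retrieval logic in the
# real loader may differ):
import os
import urllib.request

IMPC_BASE_URL = 'ftp://ftp.ebi.ac.uk/pub/databases/impc/all-data-releases/latest/results/'

def download(args):
  for fn in [GENO_PHENO_FILE, STAT_RES_FILE]:
    url = IMPC_BASE_URL + os.path.basename(fn)
    if not args['--quiet']:
      print(f"Downloading {url}")
    urllib.request.urlretrieve(url, fn)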
# --2021-10-27 14:52:43--  https://unmtid-shinyapps.net/download/TIGA/20210915/tiga_gene-trait_stats.tsv
# Resolving unmtid-shinyapps.net... 3.129.66.110
# Connecting to unmtid-shinyapps.net|3.129.66.110|:443... connected.
# ERROR: cannot verify unmtid-shinyapps.net's certificate, issued by "/C=US/O=Let's Encrypt/CN=R3":
#   Issued certificate has expired.
# -SLM 20211027
start_time = time.time()
load(dba, logger, logfile)
# Dataset
dataset_id = dba.ins_dataset({'name': 'TIGA',
                              'source': f'TIGA Download files {TIGA_FILE} and {TIGA_PROV_FILE} from {BASE_URL}',
                              'app': PROGRAM, 'app_version': __version__,
                              'url': 'https://unmtid-shinyapps.net/shiny/tiga/'})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
# Provenance
provs = [{'dataset_id': dataset_id, 'table_name': 'tiga'},
         {'dataset_id': dataset_id, 'table_name': 'tiga_provenance'}]
for prov in provs:
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
  print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
download(args)
start_time = time.time()
load(args, dba, logger, logfile)
# Dataset
dataset_id = dba.ins_dataset({'name': 'OMIM',
                              'source': 'Files {} downloaded from omim.org'.format(", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE])),
                              'app': PROGRAM, 'app_version': __version__,
                              'url': 'http://omim.org/'})
assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
# Provenance
provs = [{'dataset_id': dataset_id, 'table_name': 'omim'},
         {'dataset_id': dataset_id, 'table_name': 'omim_ps'},
         {'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'OMIM'"}]
for prov in provs:
  rv = dba.ins_provenance(prov)
  assert rv, f"Error inserting provenance. See logfile {logfile} for details."
elapsed = time.time() - start_time
print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))