logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
#download(args)
start_time = time.time()
load(args, dba, logger, logfile)
# Dataset
dataset_id = dba.ins_dataset({
    'name': 'TIGA',
logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
load_ids_classifications(args, dba, logger, logfile)
dto = parse_dto_owl(args, DTO_OWL_FILE)
load_dto(args, dba, logfile, dto)
# Dataset
dataset_id = dba.ins_dataset({
    'name':
logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(logfile)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
print "\nWorking on JensenLab DISEASES..."
download_DISEASES(args)
start_time = time.time()
load_DISEASES(args, dba, logger, logfile)
elapsed = time.time() - start_time
print "Done with DISEASES. Elapsed time: {}".format(slmf.secs2str(elapsed))
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)
    dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
                  'logger_name': __name__}
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'HGNC',
        'source': 'Custom download file from https://www.genenames.org/download/custom/',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.genenames.org/',
        'comments': 'File downloaded with the following column data: HGNC ID Approved symbol Approved name Status UniProt ID NCBI Gene ID Mouse genome database ID'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile {} for details.".format(logfile)
        sys.exit(1)
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'sym',
              'comment': "This is only updated with HGNC data if data from UniProt is absent."},
             {'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'geneid',
              'comment': "This is only updated with HGNC data if data from UniProt is absent."},
             {'dataset_id': dataset_id, 'table_name': 'xref',
              'where_clause': "dataset_id = %d" % dataset_id}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
            sys.exit(1)

    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE)
    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}
    hgnc_ct = 0
    mgi_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    nf_ct = 0
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'rU') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: NCBI Gene ID
            # 5: UniProt ID
            # 6: Mouse genome database ID
            ct += 1
            pbar.update(ct)
            sym = row[1]
            if row[4] != '':
                geneid = int(row[4])
            else:
                geneid = None
            if row[5] != '':
                up = row[5]
            else:
                up = None
            targets = dba.find_targets({'sym': sym})
            if not targets and up:
                targets = dba.find_targets({'uniprot': up})
            if not targets and geneid:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                nf_ct += 1
                logger.warn("No target found for {}|{}|{}".format(sym, up, geneid))
                continue
            for t in targets:
                p = t['components']['protein'][0]
                pid = p['id']
                tmark[pid] = True
                # HGNC xref
                rv = dba.ins_xref({'protein_id': pid, 'xtype': 'HGNC',
                                   'dataset_id': dataset_id, 'value': row[0]})
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                if row[6] != '':
                    rv = dba.ins_xref({'protein_id': pid, 'xtype': 'MGI ID',
                                       'dataset_id': dataset_id, 'value': row[6]})
                    if rv:
                        mgi_ct += 1
                    else:
                        db_err_ct += 1
                # Add missing syms
                if p['sym'] is None:
                    rv = dba.upd_protein(pid, 'sym', sym)
                    if rv:
                        logger.info("Inserted new sym {} for protein {}, {}".format(
                            sym, pid, p['uniprot']))
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                else:
                    # Check for symbol discrepancies
                    if p['sym'] != sym:
                        logger.warn("Symbol discrepancy: UniProt=%s, HGNC=%s" % (p['sym'], sym))
                        symdiscr_ct += 1
                if geneid:
                    # Add missing geneids
                    if p['geneid'] is None:
                        rv = dba.upd_protein(pid, 'geneid', geneid)
                        if rv:
                            logger.info("Inserted new geneid {} for protein {}, {}".format(
                                geneid, pid, p['uniprot']))
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    else:
                        # Check for geneid discrepancies
                        if p['geneid'] != geneid:
                            logger.warn("GeneID discrepancy: UniProt={}, HGNC={}".format(
                                p['geneid'], geneid))
                            geneiddiscr_ct += 1
    pbar.finish()
    print "Processed {} lines - {} targets annotated.".format(ct, len(tmark))
    print "No target found for {} lines.".format(nf_ct)
    print "  Inserted {} HGNC ID xrefs".format(hgnc_ct)
    print "  Inserted {} MGI ID xrefs".format(mgi_ct)
    if sym_ct > 0:
        print "  Added {} new HGNC symbols".format(sym_ct)
    if symdiscr_ct > 0:
        print "WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(
            symdiscr_ct, logfile)
    if geneid_ct > 0:
        print "  Added {} new NCBI Gene IDs".format(geneid_ct)
    if geneiddiscr_ct > 0:
        print "WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(
            geneiddiscr_ct, logfile)
    if db_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_err_ct, logfile)
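# Usage sketch (not part of the loader): these scripts parse docopt-style
# option dicts, so load() can be exercised directly with a hand-built dict.
# The keys below are exactly the ones load() reads; the values are
# illustrative assumptions, not defaults taken from the source.
if __name__ == '__main__':
    test_args = {
        '--dbhost': 'localhost',  # assumed host
        '--dbname': 'tcrd',       # assumed database name
        '--logfile': None,        # falls back to LOGFILE
        '--loglevel': '20',       # logging.INFO
        '--debug': False,
        '--quiet': False,
    }
    load(test_args)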
if args['--logfile']:
    logfile = args['--logfile']
else:
    logfile = LOGFILE
loglevel = int(args['--loglevel'])
logger = logging.getLogger(__name__)
logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(logfile)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
download(args)
start_time = time.time()
load(args, dba, logger, logfile)
elapsed = time.time() - start_time
# Dataset
dataset_id = dba.ins_dataset(
    {'name': 'HomoloGene',
     'source': 'File %s' % (BASE_URL + FILENAME),
     'app': PROGRAM,
     'app_version': __version__,
     'url': 'https://www.ncbi.nlm.nih.gov/homologene',
     'comments': 'Only Human, Mouse and Rat members of HomoloGene groups are loaded. These relate protein to nhprotein.'})
assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
# Provenance
rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'homology'})
logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
download(args)
start_time = time.time()
load(args, dba, logger, logfile)
elapsed = time.time() - start_time
# Dataset
dataset_id = dba.ins_dataset({
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)
    dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
                  'logger_name': __name__}
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI GI Numbers',
        'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.uniprot.org/'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'xref',
                             'where_clause': "dataset_id = %d" % dataset_id})
    if not rv:
        print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
        sys.exit(1)

    start_time = time.time()
    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    #  1. UniProtKB-AC
    #  2. UniProtKB-ID
    #  3. GeneID (EntrezGene)
    #  4. RefSeq
    #  5. GI
    #  6. PDB
    #  7. GO
    #  8. UniRef100
    #  9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print "\nProcessing {} rows in file {}".format(line_ct, infile)
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        tmark = {}
        xref_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for line in tsv:
            data = line.split('\t')
            ct += 1
            up = data[0]
            if not data[4]:  # no GI
                skip_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                skip_ct += 1
                continue
            target = targets[0]
            tmark[target['id']] = True
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({'protein_id': pid, 'xtype': 'NCBI GI',
                                   'dataset_id': dataset_id, 'value': gi})
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "\n{} rows processed".format(ct)
    print "  Inserted {} new GI xref rows for {} targets".format(xref_ct, len(tmark))
    print "  Skipped {} rows with no GI".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
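# For reference, a tiny self-contained demo (sample values invented for
# illustration) of how the '; '-delimited GI field at index 4 yields one
# xref per GI number, mirroring the split used in the loop above.
def _demo_gi_split():
    sample = "P31946\t1433B_HUMAN\t7529\tNP_003395.1\t4507949; 67464628"
    fields = sample.split('\t')
    return fields[4].split('; ')  # ['4507949', '67464628']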
logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
download_mappings(args)
start_time = time.time()
load(args, dba, logfile, logger)
elapsed = time.time() - start_time
# Dataset
dataset_id = dba.ins_dataset({
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)
    dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
                  'logger_name': __name__}
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Eligible Targets List',
        'source': 'IDG generated data in file %s.' % IDG_LIST_FILE,
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'IDG Flags and Families set from list of targets on GitHub.',
        'url': 'https://github.com/druggablegenome/IDGTargets'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'idg',
              'where_clause': 'column_name == "idg"'},
             {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'fam',
              'where_clause': 'idg == 1'},
             {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'famext',
              'where_clause': 'idg == 1'}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print '\nProcessing {} lines in list file {}'.format(line_ct, IDG_LIST_FILE)
    logger.info("Processing {} lines in list file {}".format(line_ct, IDG_LIST_FILE))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    notfnd = []
    multfnd = []
    ct = 0
    idg_ct = 0
    fam_ct = 0
    famext_ct = 0
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'rU') as ifh:
        csvreader = csv.reader(ifh)
        #header = csvreader.next()  # skip header line
        #ct += 1
        for row in csvreader:
            ct += 1
            sym = row[0]
            fam = row[2]
            targets = dba.find_targets({'sym': sym}, idg=False, include_annotations=False)
            if not targets:
                notfnd.append(sym)
                continue
            if len(targets) > 1:
                multfnd.append(sym)
            for t in targets:
                rv = dba.upd_target(t['id'], 'idg', 1)
                if rv:
                    idg_ct += 1
                else:
                    dba_err_ct += 1
                rv = dba.upd_target(t['id'], 'fam', fam)
                if rv:
                    fam_ct += 1
                else:
                    dba_err_ct += 1
                if row[3]:
                    famext = row[3]
                    rv = dba.upd_target(t['id'], 'famext', famext)
                    if rv:
                        famext_ct += 1
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  {} targets updated with IDG flags".format(idg_ct)
    print "  {} targets updated with fams".format(fam_ct)
    print "  {} targets updated with famexts".format(famext_ct)
    if notfnd:
        print "No target found for {} symbols: {}".format(len(notfnd), ", ".join(notfnd))
    if multfnd:
        print "Multiple targets found for {} symbols: {}".format(len(multfnd), ", ".join(multfnd))
    if dba_err_ct > 0:
        print "WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
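# Column layout of IDG_LIST_FILE as inferred from the indices used above
# (0 = HGNC symbol, 2 = family, 3 = extended family); the example row is
# hypothetical, not taken from the actual GitHub list:
#   row = ['GPR88', '...', 'GPCR', 'GPCR;Orphan']
#   sym, fam, famext = row[0], row[2], row[3]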
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                             datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)
    # DBAdaptor uses same logger as main()
    dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
                  'logger_name': __name__}
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    # Dataset
    #dataset_id = 8
    dataset_id = dba.ins_dataset({
        'name': 'NCBI Gene',
        'source': 'EUtils web API at %s' % EFETCH_GENE_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.ncbi.nlm.nih.gov/gene'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'tdl_info',
              'where_clause': "itype = 'NCBI Gene Summary'"},
             {'dataset_id': dataset_id, 'table_name': 'tdl_info',
              'where_clause': "itype = 'NCBI Gene PubMed Count'"},
             {'dataset_id': dataset_id, 'table_name': 'generif'},
             {'dataset_id': dataset_id, 'table_name': 'xref',
              'where_clause': "dataset_id = %d" % dataset_id},
             {'dataset_id': dataset_id, 'table_name': 'alias',
              'where_clause': "dataset_id = %d" % dataset_id}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
            sys.exit(1)

    s = shelve.open(SHELF_FILE, writeback=True)
    s['loaded'] = []
    s['retries'] = {}
    s['counts'] = defaultdict(int)

    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    ct = 0
    skip_ct = 0
    if args['--pastid']:
        past_id = args['--pastid']
        tct = dba.get_target_count(past_id=past_id)
    else:
        past_id = None
        tct = dba.get_target_count()
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    if not args['--quiet']:
        print "\nLoading NCBI Gene annotations for %d TCRD targets" % tct
    logger.info("Loading NCBI Gene annotations for %d TCRD targets\n" % tct)
    for t in dba.get_targets(past_id=past_id):
        ct += 1
        tid = t['id']
        if tid in s['loaded']:
            logger.info("Skipping previously loaded target %d" % tid)
            continue
        p = t['components']['protein'][0]
        pid = p['id']
        if p['geneid'] is None:
            skip_ct += 1
            continue
        geneid = str(p['geneid'])
        logger.info("Processing target %d: geneid %s" % (tid, geneid))
        (status, headers, xml) = get_ncbigene(geneid)
        if not status:
            logger.warn("Failed getting Gene ID %s" % geneid)
            s['retries'][tid] = True
            continue
        if status != 200:
            logger.warn("Bad API response for Gene ID %s: %s" % (geneid, status))
            s['retries'][tid] = True
            continue
        gene_annotations = parse_genexml(xml)
        if not gene_annotations:
            s['counts']['xml_err'] += 1
            logger.error("XML Error for Gene ID %s" % geneid)
            s['retries'][tid] = True
            continue
        load_annotations(dba, t, dataset_id, gene_annotations, s)
        time.sleep(0.5)
        pbar.update(ct)
    pbar.finish()
    print "Processed %d targets." % ct
    if skip_ct > 0:
        print "Skipped %d targets with no geneid" % skip_ct
    print "Loaded NCBI annotations for %d targets" % len(s['loaded'])
    if len(s['retries']) > 0:
        print "Total targets remaining for retries: %d" % len(s['retries'])

    loop = 1
    while len(s['retries']) > 0:
        print "\nRetry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (
            loop, len(s['retries']))
        logger.info("Retry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (
            loop, len(s['retries'])))
        pbar_widgets = ['Progress: ', Percentage(), ' ',
                        Bar(marker='#', left='[', right=']'), ' ', ETA()]
        pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['retries'])).start()
        ct = 0
        act = 0
        for tid, _ in s['retries'].items():
            ct += 1
            t = dba.get_target(tid, include_annotations=False)
            geneid = str(t['components']['protein'][0]['geneid'])
            logger.info("Processing target %d: geneid %s" % (tid, geneid))
            (status, headers, xml) = get_ncbigene(geneid)
            if not status:
                logger.warn("Failed getting Gene ID %s" % geneid)
                continue
            if status != 200:
                logger.warn("Bad API response for Gene ID %s: %s" % (geneid, status))
                continue
            gene_annotations = parse_genexml(xml)
            if not gene_annotations:
                s['counts']['xml_err'] += 1
                logger.error("XML Error for Gene ID %s" % geneid)
                continue
            load_annotations(dba, t, dataset_id, gene_annotations, s)
            act += 1
            del s['retries'][tid]
            time.sleep(0.5)
            pbar.update(ct)
        loop += 1
        if loop == 5:
            print "Completed 5 retry loops. Aborting."
            break
        pbar.finish()
        print "Processed %d targets." % ct
        print "  Annotated %d additional targets" % act
        print "  Total annotated targets: %d" % len(s['loaded'])
        if len(s['retries']) > 0:
            print "Total targets remaining for retries: %d" % len(s['retries'])

    print "\nInserted %d aliases" % s['counts']['alias']
    print "Inserted %d NCBI Gene Summary tdl_infos" % s['counts']['summary']
    print "Inserted %d NCBI Gene PubMed Count tdl_infos" % s['counts']['pmc']
    print "Inserted %d GeneRIFs" % s['counts']['generif']
    print "Inserted %d PubMed xrefs" % s['counts']['pmxr']
    #print "Inserted %d other xrefs" % s['counts']['xref']
    if s['counts']['xml_err'] > 0:
        print "WARNING: %d XML parsing errors occurred. See logfile %s for details." % (
            s['counts']['xml_err'], logfile)
    if s['counts']['dba_err'] > 0:
        print "WARNING: %d DB errors occurred. See logfile %s for details." % (
            s['counts']['dba_err'], logfile)
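# A minimal sketch (assuming the SHELF_FILE written above) for inspecting
# the loader's shelve bookkeeping after a run or a crash; the 'loaded',
# 'retries', and 'counts' keys match those initialized in load().
def dump_shelf():
    s = shelve.open(SHELF_FILE)
    print "loaded: %d targets" % len(s['loaded'])
    print "pending retries: %d" % len(s['retries'])
    for key, n in s['counts'].items():
        print "  %s: %d" % (key, n)
    s.close()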
logger.setLevel(loglevel)
if not args['--debug']:
    logger.propagate = False  # turns off console logging
fh = logging.FileHandler(LOGFILE)
fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                         datefmt='%Y-%m-%d %H:%M:%S')
fh.setFormatter(fmtr)
logger.addHandler(fh)
dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
              'logger_name': __name__}
dba = DBAdaptor(dba_params)
dbi = dba.get_dbinfo()
logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
if not args['--quiet']:
    print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
start_time = time.time()
load(args, dba, logfile, logger)
elapsed = time.time() - start_time
# Dataset
dataset_id = dba.ins_dataset({
    'name':
def load(args):
    # logfile is referenced in the messages below; derive it as the sibling loaders do
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'],
                  'logger_name': __name__}
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    if not args['--quiet']:
        print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Families',
        'source': 'IDG-KMC generated data from file %s' % os.path.basename(INFILE),
        'app': PROGRAM,
        'app_version': __version__,
        'comments': "Target family designations generated by IDG-KMC groups at UNM and UMiami."
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'target',
                             'column_name': 'tiofam'})
    if not rv:
        print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
        sys.exit(1)

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, INFILE)
    pbar_widgets = ['Progress: ', Percentage(), ' ',
                    Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    idg_ct = 0
    upd_ct1 = 0
    upd_ct2 = 0
    null_ct = 0
    notfnd = []
    mulfnd = []
    dba_err_ct = 0
    with open(INFILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct += 1
        for row in csvreader:
            ct += 1
            pbar.update(ct)
            up = row[2].strip()
            fam = row[3].strip()
            famext = row[4].strip()
            if not fam:
                null_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd.append(up)
                continue
            if len(targets) > 1:
                mulfnd.append(up)
                continue
            t = targets[0]
            # only update fam for non-IDG targets
            # IDG target fams are set by load-IDGList.py
            if t['fam']:
                idg_ct += 1
                continue
            rv = dba.upd_target(t['id'], 'fam', fam)
            if not rv:
                print "ERROR updating target.fam: %d to %s" % (t['id'], fam)
                dba_err_ct += 1
            else:
                upd_ct1 += 1
            if famext:
                rv = dba.upd_target(t['id'], 'famext', famext)
                if not rv:
                    print "ERROR updating target.famext: %d to %s" % (t['id'], famext)
                    dba_err_ct += 1
                else:
                    upd_ct2 += 1
    pbar.finish()
    print "{} rows processed.".format(ct)
    print "{} IDG family designations loaded into TCRD.".format(upd_ct1)
    print "{} IDG extended family designations loaded into TCRD.".format(upd_ct2)
    print "Skipped {} IDG2 targets.".format(idg_ct)
    if notfnd:
        print "[WARNING] No target found for {} UniProt accessions: {}".format(
            len(notfnd), ", ".join(notfnd))
    if mulfnd:
        print "[WARNING] Multiple targets found for {} UniProt accessions: {}".format(
            len(mulfnd), ", ".join(mulfnd))
    if dba_err_ct > 0:
        print "WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)