def load(args):
    """Load HGNC xrefs, symbols and NCBI Gene IDs into TCRD.

    Sets up file logging, connects to the database, registers the 'HGNC'
    dataset and its provenance rows, then walks HGNC_TSV_FILE row by row.
    For every target matched by symbol (falling back to UniProt accession,
    then NCBI Gene ID) it inserts an HGNC xref, an MGI xref when the row
    has one, and fills in a missing protein.sym / protein.geneid. Values
    that disagree with what is already stored are only logged and counted.

    Args:
        args: option mapping; reads '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Exits the process with status 1 if the dataset or a provenance row
    cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database %s (schema ver %s; data ver %s)",
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'HGNC',
        'source': 'Custom download file from https://www.genenames.org/download/custom/',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.genenames.org/',
        'comments': 'File downloaded with the following column data: HGNC ID Approved symbol Approved name Status UniProt ID NCBI Gene ID Mouse genome database ID'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile))
        sys.exit(1)

    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'protein',
        'column_name': 'sym',
        'comment': "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id': dataset_id,
        'table_name': 'protein',
        'column_name': 'geneid',
        'comment': "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print("WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile))
            sys.exit(1)

    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}  # protein ids that received at least one annotation
    hgnc_ct = 0
    mgi_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    nf_ct = 0
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'rU') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        next(tsvreader)  # skip header line
        ct += 1
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: NCBI Gene ID
            # 5: UniProt ID
            # 6: Mouse genome database ID
            # NOTE(review): the dataset 'comments' string above lists
            # "UniProt ID" before "NCBI Gene ID"; confirm the real column
            # order of the download file matches the indexes used here.
            ct += 1
            pbar.update(ct)
            sym = row[1]
            geneid = int(row[4]) if row[4] != '' else None
            up = row[5] if row[5] != '' else None

            # Match by symbol first, then UniProt accession, then Gene ID.
            targets = dba.find_targets({'sym': sym})
            if not targets and up:
                targets = dba.find_targets({'uniprot': up})
            if not targets and geneid:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                nf_ct += 1
                logger.warning("No target found for %s|%s|%s", sym, up, geneid)
                continue

            for t in targets:
                p = t['components']['protein'][0]
                pid = p['id']
                tmark[pid] = True
                # HGNC xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'HGNC',
                    'dataset_id': dataset_id,
                    'value': row[0]
                })
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                if row[6] != '':
                    rv = dba.ins_xref({
                        'protein_id': pid,
                        'xtype': 'MGI ID',
                        'dataset_id': dataset_id,
                        'value': row[6]
                    })
                    if rv:
                        mgi_ct += 1
                    else:
                        db_err_ct += 1
                if p['sym'] is None:
                    # Add missing syms
                    rv = dba.upd_protein(pid, 'sym', sym)
                    if rv:
                        logger.info("Inserted new sym %s for protein %s, %s",
                                    sym, pid, p['uniprot'])
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                elif p['sym'] != sym:
                    # Check for symbol discrepancies
                    logger.warning("Symbol discrepancy: UniProt=%s, HGNC=%s",
                                   p['sym'], sym)
                    symdiscr_ct += 1
                if geneid:
                    if p['geneid'] is None:
                        # Add missing geneids
                        rv = dba.upd_protein(pid, 'geneid', geneid)
                        if rv:
                            logger.info(
                                "Inserted new geneid %s for protein %s, %s",
                                geneid, pid, p['uniprot'])
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    elif p['geneid'] != geneid:
                        # Check for geneid discrepancies
                        logger.warning("GeneID discrepancy: UniProt=%s, HGNC=%s",
                                       p['geneid'], geneid)
                        geneiddiscr_ct += 1
    pbar.finish()

    print("Processed {} lines - {} targets annotated.".format(ct, len(tmark)))
    print("No target found for {} lines.".format(nf_ct))
    print("  Inserted {} HGNC ID xrefs".format(hgnc_ct))
    print("  Inserted {} MGI ID xrefs".format(mgi_ct))
    if sym_ct > 0:
        print("  Added {} new HGNC symbols".format(sym_ct))
    if symdiscr_ct > 0:
        print("WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(
            symdiscr_ct, logfile))
    if geneid_ct > 0:
        print("  Added {} new NCBI Gene IDs".format(geneid_ct))
    if geneiddiscr_ct > 0:
        print("WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(
            geneiddiscr_ct, logfile))
    if db_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_err_ct, logfile))
def load(args):
    """Flag IDG-eligible targets and set their family fields in TCRD.

    Sets up file logging, connects to the database, registers the
    'IDG Eligible Targets List' dataset and its provenance rows, then reads
    IDG_LIST_FILE as CSV. For each row, every target found for the symbol
    in column 0 gets idg=1, fam set from column 2 and, when column 3 is
    non-empty, famext set from column 3. Symbols with no target or with
    more than one target are collected and reported at the end.

    Args:
        args: option mapping; reads '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: if the dataset or a provenance row cannot be
            inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Eligible Targets List',
        'source': 'IDG generated data in file %s.' % IDG_LIST_FILE,
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'IDG Flags and Families set from list of targets on GitHub.',
        'url': 'https://github.com/druggablegenome/IDGTargets'
    })
    # Raised explicitly (rather than via `assert`) so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if not dataset_id:
        raise AssertionError(
            "Error inserting dataset See logfile {} for details.".format(logfile))

    # Provenance
    # The fam/famext entries used to repeat the 'where_clause' key; only the
    # last value ('idg == 1') ever reached the database, so that is the one
    # kept here.
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'idg',
        'where_clause': 'column_name == "idg"'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'fam',
        'where_clause': 'idg == 1'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'famext',
        'where_clause': 'idg == 1'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            raise AssertionError(
                "Error inserting provenance. See logfile {} for details.".format(
                    logfile))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print('\nProcessing {} lines in list file {}'.format(
        line_ct, IDG_LIST_FILE))
    logger.info("Processing %s lines in list file %s", line_ct, IDG_LIST_FILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    notfnd = []
    multfnd = []
    ct = 0
    idg_ct = 0
    fam_ct = 0
    famext_ct = 0
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'rU') as ifh:
        csvreader = csv.reader(ifh)
        # No header row is skipped: the first line is processed as data.
        for row in csvreader:
            ct += 1
            # Update before any `continue` so skipped rows still advance
            # the progress bar.
            pbar.update(ct)
            sym = row[0]
            fam = row[2]
            targets = dba.find_targets({'sym': sym},
                                       idg=False,
                                       include_annotations=False)
            if not targets:
                notfnd.append(sym)
                continue
            if len(targets) > 1:
                # Recorded for the report; all matches are still updated.
                multfnd.append(sym)
            for t in targets:
                rv = dba.upd_target(t['id'], 'idg', 1)
                if rv:
                    idg_ct += 1
                else:
                    dba_err_ct += 1
                rv = dba.upd_target(t['id'], 'fam', fam)
                if rv:
                    fam_ct += 1
                else:
                    dba_err_ct += 1
                if row[3]:
                    famext = row[3]
                    rv = dba.upd_target(t['id'], 'famext', famext)
                    if rv:
                        famext_ct += 1
                    else:
                        dba_err_ct += 1
    pbar.finish()

    print("{} lines processed".format(ct))
    print("{} targets updated with IDG flags".format(idg_ct))
    print("{} targets updated with fams".format(fam_ct))
    print("  {} targets updated with famexts".format(famext_ct))
    if notfnd:
        print("No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
def load(args):
    """Load NCBI GI xrefs from the UniProt ID mapping file into TCRD.

    Sets up file logging, connects to the database, registers the
    'NCBI GI Numbers' dataset and its provenance row, then reads the
    uncompressed mapping file line by line. For each line that has a GI
    field and whose UniProt accession matches a target, one 'NCBI GI' xref
    is inserted per '; '-separated GI value on the first matching target's
    first protein.

    Args:
        args: option mapping; reads '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Exits the process with status 1 if the dataset or the provenance row
    cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database %s (schema ver %s; data ver %s)",
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI GI Numbers',
        'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.uniprot.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    # 1. UniProtKB-AC
    # 2. UniProtKB-ID
    # 3. GeneID (EntrezGene)
    # 4. RefSeq
    # 5. GI
    # 6. PDB
    # 7. GO
    # 8. UniRef100
    # 9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print("\nProcessing {} rows in file {}".format(line_ct, infile))
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        tmark = {}  # target ids that received at least one xref
        xref_ct = 0
        skip_ct = 0  # rows with an empty GI field
        nf_ct = 0  # rows whose UniProt accession matched no target
        dba_err_ct = 0
        for line in tsv:
            data = line.split('\t')
            ct += 1
            # Update before any `continue` so skipped rows still advance
            # the progress bar.
            pbar.update(ct)
            up = data[0]
            if not data[4]:  # no gi
                skip_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                # Counted separately so the "no GI" total stays accurate.
                nf_ct += 1
                continue
            target = targets[0]
            tmark[target['id']] = True
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'NCBI GI',
                    'dataset_id': dataset_id,
                    'value': gi
                })
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()

    print("\n{} rows processed".format(ct))
    print("  Inserted {} new GI xref rows for {} targets".format(
        xref_ct, len(tmark)))
    print("  Skipped {} rows with no GI".format(skip_ct))
    if nf_ct > 0:
        print("  Skipped {} rows with no matching target".format(nf_ct))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
def load(args):
    """Load IDG family designations from INFILE into TCRD targets.

    Connects to the database, registers the 'IDG Families' dataset and its
    provenance row, then reads INFILE as CSV (first line is a header).
    For each row with a non-empty family (column 3), the single target
    matching the UniProt accession in column 2 gets its fam set, and its
    famext set from column 4 when that is non-empty. Targets whose fam is
    already set are skipped, as are accessions matching zero or several
    targets; the latter two are reported at the end.

    Args:
        args: option mapping; reads '--dbhost', '--dbname', '--quiet' and,
            when present, '--logfile'.

    Exits the process with status 1 if the dataset or the provenance row
    cannot be inserted.
    """
    # `logfile` was referenced by the messages below but never assigned, so
    # every error path died with NameError. Resolve it the same way the
    # other loaders do.
    # NOTE(review): this function never attaches a log handler itself;
    # confirm the named logfile is actually written by the DB adaptor.
    logfile = args.get('--logfile') or LOGFILE

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Families',
        'source': 'IDG-KMC generated data from file %s' % os.path.basename(INFILE),
        'app': PROGRAM,
        'app_version': __version__,
        'comments': "Target family designations generated by IDG-KMC groups at UNM and UMiami."
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'tiofam'
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, INFILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    idg_ct = 0
    upd_ct1 = 0
    upd_ct2 = 0
    null_ct = 0
    notfnd = []
    mulfnd = []
    dba_err_ct = 0
    with open(INFILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader)  # skip header line
        ct += 1
        for row in csvreader:
            ct += 1
            pbar.update(ct)
            up = row[2].strip()
            fam = row[3].strip()
            famext = row[4].strip()
            if not fam:
                null_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd.append(up)
                continue
            if len(targets) > 1:
                mulfnd.append(up)
                continue
            t = targets[0]
            # only update fam for non-IDG targets
            # IDG target fams are set by load-IDGList.py
            if t['fam']:
                idg_ct += 1
                continue
            # Previously `rv = dba.rv = ...`, which also stored the result
            # as an attribute on the adaptor object by accident.
            rv = dba.upd_target(t['id'], 'fam', fam)
            if not rv:
                print("ERROR updating target.fam: %d to %s" % (t['id'], fam))
                dba_err_ct += 1  # was never counted, so the summary warning could not fire
            else:
                upd_ct1 += 1
            if famext:
                rv = dba.upd_target(t['id'], 'famext', famext)
                if not rv:
                    print("ERROR updating target.famext: %d to %s" % (t['id'], famext))
                    dba_err_ct += 1
                else:
                    upd_ct2 += 1
    pbar.finish()

    print("{} rows processed.".format(ct))
    print("{} IDG family designations loaded into TCRD.".format(upd_ct1))
    print("{} IDG extended family designations loaded into TCRD.".format(
        upd_ct2))
    print("Skipped {} IDG2 targets.".format(idg_ct))
    if notfnd:
        print("[WARNING] No target found for {} UniProt accessions: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if mulfnd:
        print("[WARNING] Multiple targets found for {} UniProt accessions: {}".format(
            len(mulfnd), ", ".join(mulfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))