def main():
  argparser = argparse.ArgumentParser(description="Export TCRD target data to a CSV file")
  argparser.add_argument("-o", "--outfile", help='Output file [path/]name', default=OUTFILE)
  argparser.add_argument('-db', '--dbname', help='MySQL database name', default=DBNAME)
  argparser.add_argument("-i", "--idg", help="Export only IDG-Eligible targets", action="store_true", default=False)
  argparser.add_argument("-e", "--expand", help="Export expanded (a LOT of data) CSV version", action="store_true", default=False)
  args = argparser.parse_args()
  dba = DBAdaptor({'dbname': args.dbname})
  dbi = dba.get_dbinfo()
  print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
  print "\nConnected to TCRD database %s (schema ver %s, data ver %s)\n" % (dbi['dbname'], dbi['schema_ver'], dbi['data_ver'])
  if args.idg:
    tct = dba.get_target_count(idg=True)
    print "Exporting CSV for %d IDG-Eligible targets from TCRD to file %s" % (tct, args.outfile)
  else:
    tct = dba.get_target_count(idg=False)
    print "Exporting CSV for all %d targets from TCRD to file %s" % (tct, args.outfile)
  header = ['TCRD ID', 'Name', 'Description', 'HGNC Sym', 'NCBI Gene ID', 'UniProt', 'STRING ID',
            'TDL', 'IDG Eligible', 'DTO ID', 'DTO Class']
  if args.expand:
    header = header + ['PANTHER Class(es)', 'GeneRIF Count', 'NCBI Gene PubMed Count',
                       'JensenLab PubMed Score', 'PubTator Score', 'Ab Count', 'Monoclonal Ab Count',
                       'Activity Count', 'ChEMBL Selective Compound', 'ChEMBL First Reference Year',
                       'DrugCentral Activity Count', 'PDB Count', 'PDBs', 'GO Annotation Count',
                       'Experimental MF/BP Leaf Term GOA(s)', 'OMIM Phenotype Count', 'OMIM Phenotype(s)',
                       'JAX/MGI Human Ortholog Phenotype Count', 'JAX/MGI Human Ortholog Phenotype(s)',
                       'IMPC Ortholog Phenotype Count', 'IMPC Ortholog Phenotype(s)', 'GWAS Count',
                       'GWAS Phenotype(s)', 'Pathway Count', 'Pathways', 'Total Disease Count',
                       'Top 5 Text-Mining DISEASES', 'eRAM Diseases', 'EBI Patent Count',
                       'Is Transcription Factor', 'TMHMM Prediction', 'HPA Tissue Specificity Index',
                       'HPM Gene Tissue Specificity Index', 'HPM Protein Tissue Specificity Index',
                       'TIN-X Novelty', 'Top 5 TIN-X Importance(s)']
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  with open(args.outfile, 'wb') as csvout:
    csvwriter = csv.writer(csvout, quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header)
    ct = 0
    if args.idg:
      for t in dba.get_targets(idg=True, include_annotations=args.expand):
        ct += 1
        if args.expand:
          csvwriter.writerow(target2csv_exp(t))
        else:
          csvwriter.writerow(target2csv(t))
        pbar.update(ct)
    else:
      for t in dba.get_targets(idg=False, include_annotations=args.expand):
        #for tid in [9]:
        #  t = dba.get_target(tid, True)
        ct += 1
        if args.expand:
          csvwriter.writerow(target2csv_exp(t))
        else:
          csvwriter.writerow(target2csv(t))
        pbar.update(ct)
  pbar.finish()
  print "%d CSV rows exported" % ct
  print "\n%s: Done." % PROGRAM
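# target2csv() and target2csv_exp() are defined elsewhere in this script.
# A minimal sketch of the non-expanded row builder, matching the header
# above; the exact dict keys (e.g. 'dtoid', 'dtoclass', 'idg') are
# assumptions, not confirmed from this file.
def target2csv(t):
  p = t['components']['protein'][0]
  return [t['id'], p['name'], p['description'], p['sym'], p['geneid'],
          p['uniprot'], p['stringid'], t['tdl'], t['idg'],
          p.get('dtoid'), p.get('dtoclass')]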
def calc_and_load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'KEGG Nearest Tclins',
                                'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'comments': 'Nearest upstream and downstream Tclin targets are found and stored based on KEGG Distances.'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'kegg_nearest_tclin'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nProcessing {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  uct = 0
  umark = set()
  dct = 0
  dmark = set()
  dba_err_ct = 0
  for target in dba.get_targets():
    #tids = [1983, 7166]
    #for tid in tids:
    #  target = dba.get_target(tid)
    ct += 1
    if target['tdl'] == 'Tclin':
      continue
    pid = target['components']['protein'][0]['id']
    ups = dba.get_nearest_kegg_tclins(pid, 'upstream')
    if ups:
      umark.add(pid)
      for d in ups:
        d['tclin_id'] = d['protein_id']
        d['protein_id'] = pid
        d['direction'] = 'upstream'
        rv = dba.ins_kegg_nearest_tclin(d)
        if rv:
          uct += 1
        else:
          dba_err_ct += 1
    dns = dba.get_nearest_kegg_tclins(pid, 'downstream')
    if dns:
      dmark.add(pid)
      for d in dns:
        d['tclin_id'] = d['protein_id']
        d['protein_id'] = pid
        d['direction'] = 'downstream'
        rv = dba.ins_kegg_nearest_tclin(d)
        if rv:
          dct += 1
        else:
          dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  if not args['--quiet']:
    print "\n{} targets processed.".format(ct)
    print "  {} non-Tclin targets have upstream Tclin target(s)".format(len(umark))
    print "    Inserted {} upstream kegg_nearest_tclin rows".format(uct)
    print "  {} non-Tclin targets have downstream Tclin target(s)".format(len(dmark))
    print "    Inserted {} downstream kegg_nearest_tclin rows".format(dct)
    if dba_err_ct > 0:
      print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'STRING IDs',
                                'source': 'Files %s and %s from http://string-db.org/' % (os.path.basename(INFILE1), os.path.basename(INFILE2)),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://string-db.org/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  aliasmap = {}
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  ct = 0
  skip_ct = 0
  mult_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid  uniprot_ac|uniprot_id  string_id  identity  bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)
  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ## string_protein_id ## alias ## source ##
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)
  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid for protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
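# slmf.wcl ("wc -l") counts input lines so progress bars can be sized
# before parsing. A minimal sketch, assuming slmf is a small local
# utility module shared by these loaders:
def wcl(fname):
  with open(fname) as f:
    return sum(1 for _ in f)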
def load(ortho_df, args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Orthologs',
                                'source': 'File %s' % (BASE_URL + FILENAME),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://www.genenames.org/cgi-bin/hcop'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'ortholog',
            'comment': "Orthologs are majority vote from the OMA, EggNOG and InParanoid resources as per HGNC."}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nLoading ortholog data for {} TCRD targets".format(tct)
  logger.info("Loading ortholog data for {} TCRD targets".format(tct))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  ortho_ct = 0
  tskip_ct = 0
  skip_ct = 0
  notfnd = set()
  dba_err_ct = 0
  for target in dba.get_targets():
    ct += 1
    pbar.update(ct)
    logger.info("Processing target %d" % target['id'])
    p = target['components']['protein'][0]
    if p['sym']: # try first by symbol
      to_df = ortho_df.loc[ortho_df['human_symbol'] == p['sym']]
    elif p['geneid']: # then try by GeneID
      to_df = ortho_df.loc[ortho_df['human_entrez_gene'] == p['geneid']]
    else:
      tskip_ct += 1
      continue
    if len(to_df) == 0:
      continue
    for idx, row in to_df.iterrows():
      if row['ortholog_species_symbol'] == '-' and row['ortholog_species_name'] == '-':
        skip_ct += 1
        continue
      sp = TAXID2SP[row['ortholog_species']]
      init = {'protein_id': p['id'], 'taxid': row['ortholog_species'], 'species': sp,
              'sources': row['sources'], 'symbol': row['ortholog_species_symbol'],
              'name': row['ortholog_species_name']}
      # Add MOD DB ID if it's there
      if row['ortholog_species_db_id'] != '-':
        init['db_id'] = row['ortholog_species_db_id']
      # Add NCBI Gene ID if it's there
      if row['ortholog_species_entrez_gene'] != '-':
        init['geneid'] = row['ortholog_species_entrez_gene']
      # Construct MOD URLs for mouse, rat, zebrafish, fly, worm and yeast
      if sp == 'Mouse':
        init['mod_url'] = 'http://www.informatics.jax.org/marker/' + row['ortholog_species_db_id']
      elif sp == 'Rat':
        rgdid = row['ortholog_species_db_id'].replace('RGD:', '')
        init['mod_url'] = 'http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=' + rgdid
      elif sp == 'Zebrafish':
        init['mod_url'] = 'http://zfin.org/' + row['ortholog_species_db_id']
      elif sp == 'Fruitfly':
        init['mod_url'] = "http://flybase.org/reports/%s.html" % row['ortholog_species_db_id']
      elif sp == 'C. elegans':
        init['mod_url'] = 'http://www.wormbase.org/search/gene/' + row['ortholog_species_symbol']
      elif sp == 'S.cerevisiae':
        init['mod_url'] = 'https://www.yeastgenome.org/locus/' + row['ortholog_species_db_id']
      rv = dba.ins_ortholog(init)
      if rv:
        ortho_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "Processed {} targets.".format(ct)
  print "Loaded {} new ortholog rows".format(ortho_ct)
  print "  Skipped {} empty ortholog entries".format(skip_ct)
  print "  Skipped {} targets with no sym/geneid".format(tskip_ct)
  if len(notfnd) > 0:
    print "  No orthologs found for {} targets.".format(len(notfnd))
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
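# TAXID2SP is assumed to be a module-level dict mapping NCBI taxonomy IDs
# to the species labels tested above. A sketch with the standard taxids
# for the six MODs handled; whether the HCOP file supplies the IDs as
# ints or strings is not confirmed here:
TAXID2SP = {10090: 'Mouse', 10116: 'Rat', 7955: 'Zebrafish',
            7227: 'Fruitfly', 6239: 'C. elegans', 4932: 'S.cerevisiae'}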
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'TDLs',
                                'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'comments': 'TDLs are generated by the loading app from data in TCRD.'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile {} for details.".format(logfile)
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'tdl'})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
    sys.exit(1)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nProcessing {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  tdl_cts = {'Tclin': 0, 'Tchem': 0, 'Tbio': 0, 'Tdark': 0}
  bump_ct = 0
  dba_err_ct = 0
  upd_ct = 0
  for target in dba.get_targets(idg=False, include_annotations=True):
    ct += 1
    pbar.update(ct)
    (tdl, bump_flag) = get_tdl(target)
    tdl_cts[tdl] += 1
    if bump_flag:
      bump_ct += 1
    rv = dba.upd_target(target['id'], 'tdl', tdl)
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "{} TCRD targets processed.".format(ct)
  print "Set TDL values for {} targets:".format(upd_ct)
  print "  {} targets are Tclin".format(tdl_cts['Tclin'])
  print "  {} targets are Tchem".format(tdl_cts['Tchem'])
  print "  {} targets are Tbio - {} bumped from Tdark".format(tdl_cts['Tbio'], bump_ct)
  print "  {} targets are Tdark".format(tdl_cts['Tdark'])
  if dba_err_ct > 0:
    print "WARNING: {} database errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
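# get_tdl() is defined elsewhere; a rough sketch of the classification it
# implies, based on the published IDG TDL rules (Tclin: approved-drug MoA;
# Tchem: cutoff-satisfying small-molecule activity; Tdark: PubMed score < 5,
# <= 3 GeneRIFs, <= 50 antibodies; else Tbio). The dict keys and the exact
# "bump" criteria here are assumptions, not confirmed from this file.
def get_tdl(target):
  p = target['components']['protein'][0]
  if target.get('drug_activities') and any(da.get('has_moa') for da in target['drug_activities']):
    return ('Tclin', False)
  if target.get('chembl_activities'):
    return ('Tchem', False)
  dark_signals = [p.get('pubmed_score', 0) < 5,
                  len(p.get('generifs', [])) <= 3,
                  p.get('ab_count', 0) <= 50]
  if all(dark_signals):
    return ('Tdark', False)
  # Tbio; flag targets that only just escaped the Tdark criteria as "bumped"
  return ('Tbio', sum(dark_signals) == 2)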
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Harmonogram CDFs',
                                'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'comments': 'CDFs are calculated by the loader app based on gene_attribute data in TCRD.'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'hgram_cdf'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  # Create a dictionary of gene_attribute_type.name => [] pairs
  counts = {}
  # Create a dictionary of gene_attribute_type.name => {} pairs
  stats = {}
  gatypes = dba.get_gene_attribute_types()
  for ga in gatypes:
    counts[ga] = []
    stats[ga] = {}
  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nCollecting counts for {} gene attribute types on {} TCRD targets".format(len(gatypes), tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  for t in dba.get_targets(idg=False, include_annotations=True, get_ga_counts=True):
    ct += 1
    pbar.update(ct)
    p = t['components']['protein'][0]
    pid = p['id']
    if not 'gene_attribute_counts' in p:
      continue
    for type, attr_count in p['gene_attribute_counts'].items():
      counts[type].append(attr_count)
  pbar.finish()
  print "\nCalculating Gene Attribute stats. See logfile {}.".format(logfile)
  logger.info("Calculating Gene Attribute stats:")
  for type, l in counts.items():
    if len(l) == 0:
      del counts[type]
      continue
    npa = numpy.array(l)
    logger.info("  %s: %d counts; mean: %.2f; std: %.2f" % (type, len(l), npa.mean(), npa.std()))
    stats[type]['mean'] = npa.mean()
    stats[type]['std'] = npa.std()
  print "\nLoading HGram CDFs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  nan_ct = 0
  cdf_ct = 0
  dba_err_ct = 0
  for t in dba.get_targets(idg=False, include_annotations=True, get_ga_counts=True):
    ct += 1
    p = t['components']['protein'][0]
    pid = p['id']
    if not 'gene_attribute_counts' in p:
      continue
    for type, attr_count in p['gene_attribute_counts'].items():
      attr_cdf = gaussian_cdf(attr_count, stats[type]['mean'], stats[type]['std'])
      if math.isnan(attr_cdf):
        attr_cdf = 1.0 / (1.0 + math.exp(-1.702 * ((attr_count - stats[type]['mean']) / stats[type]['std'])))
      if math.isnan(attr_cdf):
        nan_ct += 1
        continue
      rv = dba.ins_hgram_cdf({'protein_id': p['id'], 'type': type,
                              'attr_count': attr_count, 'attr_cdf': attr_cdf})
      if not rv:
        dba_err_ct += 1
        continue
      cdf_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "Processed {} targets.".format(ct)
  print "  Loaded {} new hgram_cdf rows".format(cdf_ct)
  print "  Skipped {} NaN CDFs".format(nan_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
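# gaussian_cdf() is not shown here; a sketch consistent with the
# gaussian_cdf(attr_count, mean, std) calls above, using the closed-form
# normal CDF via the error function. Note the loader falls back to the
# logistic approximation 1/(1 + exp(-1.702*z)) whenever this yields NaN.
import math

def gaussian_cdf(x, mu, sigma):
  return 0.5 * (1.0 + math.erf((x - mu) / (sigma * math.sqrt(2.0))))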
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
              args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Antibodypedia.com',
                                'source': 'Web API at %s' % ABPC_API_URL,
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://www.antibodypedia.com'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': 'itype = "Ab Count"'},
           {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': 'itype = "MAb Count"'},
           {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': 'itype = "Antibodypedia.com URL"'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
      sys.exit(1)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nLoading Antibodypedia annotations for {} TCRD targets".format(tct)
  logger.info("Loading Antibodypedia annotations for {} TCRD targets".format(tct))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  tiab_ct = 0
  timab_ct = 0
  tiurl_ct = 0
  dba_err_ct = 0
  net_err_ct = 0
  for target in dba.get_targets():
    ct += 1
    pbar.update(ct)
    tid = target['id']
    p = target['components']['protein'][0]
    pid = p['id']
    url = ABPC_API_URL + p['uniprot']
    r = None
    attempts = 1
    while attempts <= 5:
      try:
        logger.info("Getting {} [Target {}, attempt {}]".format(url, tid, attempts))
        r = requests.get(url)
        break
      except:
        attempts += 1
        time.sleep(1)
    if not r:
      net_err_ct += 1
      logger.error("No response for {} [Target {}, attempt {}]".format(url, tid, attempts))
      continue
    if r.status_code != 200:
      net_err_ct += 1
      logger.error("Bad response: {} for {} [Target {}, attempt {}]".format(r.status_code, url, tid, attempts))
      continue
    abpd = json.loads(r.text)
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Ab Count',
                           'integer_value': int(abpd['num_antibodies'])})
    if rv:
      tiab_ct += 1
    else:
      dba_err_ct += 1
    if 'ab_type_monoclonal' in abpd:
      mab_ct = int(abpd['ab_type_monoclonal'])
    else:
      mab_ct = 0
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'MAb Count', 'integer_value': mab_ct})
    if rv:
      timab_ct += 1
    else:
      dba_err_ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Antibodypedia.com URL',
                           'string_value': abpd['url']})
    if rv:
      tiurl_ct += 1
    else:
      dba_err_ct += 1
    time.sleep(1)
    pbar.update(ct)
  pbar.finish()
  print "{} TCRD targets processed.".format(ct)
  print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
  print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
  print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
  if net_err_ct > 0:
    print "WARNING: Network error for {} targets. See logfile {} for details.".format(net_err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def run_and_load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'TMHMM Predictions',
                                'source': 'Results of running TMHMM on protein sequences.',
                                'app': PROGRAM, 'app_version': __version__})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info',
                           'where_clause': "itype = 'TMHMM Prediction'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  tct = dba.get_target_count(idg=False)
  print "\nProcessing {} TCRD targets".format(tct)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  regex = re.compile(r'PredHel=(\d+)')
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for t in dba.get_targets(idg=False, include_annotations=False):
    ct += 1
    p = t['components']['protein'][0]
    fasta = ">%s|%s %s\n%s\n" % (t['id'], p['name'], p['description'], p['seq'])
    #print "[DEBUG] Fasta:\n%s" % fasta
    fasta_filename = "/tmp/%s.fa" % t['id']
    f = open(fasta_filename, 'w')
    f.write(fasta)
    f.close()
    cmd = '%s --short --noplot %s' % (TMHMM_BIN, fasta_filename)
    #print "[DEBUG] Cmd: %s" % cmd
    output = ''
    for line in runProcess(cmd.split()):
      output += line
    os.remove(fasta_filename)
    #print "[DEBUG] Output: %s" % output
    pred = regex.findall(output)[0]
    #print "[DEBUG] PredHel: %s" % pred
    if pred != '0':
      rv = dba.ins_tdl_info({'protein_id': p['id'], 'itype': 'TMHMM Prediction',
                             'string_value': output})
      if not rv:
        dba_err_ct += 1
        continue
      ti_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} targets processed.".format(ct)
  print "  Inserted {} new TMHMM Prediction tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
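# runProcess() streams the TMHMM binary's stdout line by line; a minimal
# sketch using subprocess, matching the generator behavior the
# `for line in runProcess(cmd.split())` loop above relies on.
import subprocess

def runProcess(cmd_args):
  p = subprocess.Popen(cmd_args, stdout=subprocess.PIPE)
  for line in iter(p.stdout.readline, ''):
    yield line
  p.stdout.close()
  p.wait()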
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
              args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'NCBI Gene',
                                'source': 'EUtils web API at %s' % EFETCH_GENE_URL,
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://www.ncbi.nlm.nih.gov/gene'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'NCBI Gene Summary'"},
           {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'NCBI Gene PubMed Count'"},
           {'dataset_id': dataset_id, 'table_name': 'generif'},
           {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id},
           {'dataset_id': dataset_id, 'table_name': 'alias', 'where_clause': "dataset_id = %d" % dataset_id}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)
  s = shelve.open(SHELF_FILE, writeback=True)
  s['loaded'] = []
  s['retries'] = {}
  s['counts'] = defaultdict(int)
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nLoading NCBI Gene annotations for %d TCRD targets" % tct
  logger.info("Loading NCBI Gene annotations for %d TCRD targets\n" % tct)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  skip_ct = 0
  for t in dba.get_targets(include_annotations=False):
    tid = t['id']
    ct += 1
    p = t['components']['protein'][0]
    pid = p['id']
    if p['geneid'] is None:
      skip_ct += 1
      continue
    geneid = str(p['geneid'])
    logger.info("Processing target %d: geneid %s" % (tid, geneid))
    (status, headers, xml) = get_ncbigene(geneid)
    if not status:
      logger.warn("Failed getting Gene ID %s" % geneid)
      s['retries'][tid] = True
      continue
    if status != 200:
      logger.warn("Bad API response for Gene ID %s: %s" % (geneid, status))
      s['retries'][tid] = True
      continue
    gene_annotations = parse_genexml(xml)
    if not gene_annotations:
      s['counts']['xml_err'] += 1
      logger.error("XML Error for Gene ID %s" % geneid)
      s['retries'][tid] = True
      continue
    load_annotations(dba, t, dataset_id, gene_annotations, s)
    time.sleep(0.5)
    pbar.update(ct)
  pbar.finish()
  print "Processed %d targets." % ct
  if skip_ct > 0:
    print "Skipped %d targets with no geneid" % skip_ct
  print "Loaded NCBI annotations for %d targets" % len(s['loaded'])
  if len(s['retries']) > 0:
    print "Total targets remaining for retries: %d" % len(s['retries'])
  loop = 1
  while len(s['retries']) > 0:
    print "\nRetry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (loop, len(s['retries']))
    logger.info("Retry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (loop, len(s['retries'])))
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['retries'])).start()
    ct = 0
    act = 0
    for tid, _ in s['retries'].items():
      ct += 1
      t = dba.get_target(tid, include_annotations=False)
      geneid = str(t['components']['protein'][0]['geneid'])
      logger.info("Processing target %d: geneid %s" % (tid, geneid))
      (status, headers, xml) = get_ncbigene(geneid)
      if not status:
        logger.warn("Failed getting Gene ID %s" % geneid)
        continue
      if status != 200:
        logger.warn("Bad API response for Gene ID %s: %s" % (geneid, status))
        continue
      gene_annotations = parse_genexml(xml)
      if not gene_annotations:
        s['counts']['xml_err'] += 1
        logger.error("XML Error for Gene ID %s" % geneid)
        continue
      load_annotations(dba, t, dataset_id, gene_annotations, s)
      act += 1
      del s['retries'][tid]
      time.sleep(0.5)
      pbar.update(ct)
    loop += 1
    if loop == 5:
      print "Completed 5 retry loops. Aborting."
      break
    pbar.finish()
    print "Processed %d targets." % ct
    print "  Annotated %d additional targets" % act
    print "  Total annotated targets: %d" % len(s['loaded'])
    if len(s['retries']) > 0:
      print "Total targets remaining for retries: %d" % len(s['retries'])
  print "\nInserted %d aliases" % s['counts']['alias']
  print "Inserted %d NCBI Gene Summary tdl_infos" % s['counts']['summary']
  print "Inserted %d NCBI Gene PubMed Count tdl_infos" % s['counts']['pmc']
  print "Inserted %d GeneRIFs" % s['counts']['generif']
  print "Inserted %d PubMed xrefs" % s['counts']['pmxr']
  #print "Inserted %d other xrefs" % s['counts']['xref']
  if s['counts']['xml_err'] > 0:
    print "WARNING: %d XML parsing errors occurred. See logfile %s for details." % (s['counts']['xml_err'], logfile)
  if s['counts']['dba_err'] > 0:
    print "WARNING: %d DB errors occurred. See logfile %s for details." % (s['counts']['dba_err'], logfile)
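# get_ncbigene() is defined elsewhere; a minimal sketch returning the
# (status, headers, xml) triple consumed above. It assumes EFETCH_GENE_URL
# is a complete NCBI EFetch URL ending in '&id=' (an assumption).
import requests

def get_ncbigene(geneid):
  url = EFETCH_GENE_URL + str(geneid)
  try:
    r = requests.get(url, timeout=30)
  except Exception:
    return (False, None, None)
  return (r.status_code, r.headers, r.text)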
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'PubMed', 'source': 'NCBI E-Utils',
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://www.ncbi.nlm.nih.gov/pubmed'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pubmed'})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein2pubmed'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  s = shelve.open(SHELF_FILE, writeback=True)
  s['loaded'] = [] # list of target IDs that have been successfully processed
  s['pmids'] = [] # list of stored pubmed ids
  s['p2p_ct'] = 0
  s['errors'] = defaultdict(list)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  if args['--pastid']:
    tct = dba.get_target_count(idg=False, past_id=args['--pastid'])
  else:
    tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading pubmeds for {} TCRD targets".format(tct)
  logger.info("Loading pubmeds for {} TCRD targets".format(tct))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  dba_err_ct = 0
  if args['--pastid']:
    past_id = args['--pastid']
  else:
    past_id = 0
  for target in dba.get_targets(include_annotations=True, past_id=past_id):
    ct += 1
    logger.info("Processing target {}: {}".format(target['id'], target['name']))
    p = target['components']['protein'][0]
    if 'PubMed' not in p['xrefs']:
      continue
    pmids = [d['value'] for d in p['xrefs']['PubMed']]
    chunk_ct = 0
    err_ct = 0
    for chunk in chunker(pmids, 200):
      chunk_ct += 1
      r = get_pubmed(chunk)
      if not r or r.status_code != 200:
        # try again...
        r = get_pubmed(chunk)
        if not r or r.status_code != 200:
          logger.error("Bad E-Utils response for target {}, chunk {}".format(target['id'], chunk_ct))
          s['errors'][target['id']].append(chunk_ct)
          err_ct += 1
          continue
      soup = BeautifulSoup(r.text, "xml")
      pmas = soup.find('PubmedArticleSet')
      for pma in pmas.findAll('PubmedArticle'):
        pmid = pma.find('PMID').text
        if pmid not in s['pmids']: # only store each pubmed once
          logger.debug("  parsing XML for PMID: %s" % pmid)
          init = parse_pubmed_article(pma)
          rv = dba.ins_pubmed(init)
          if not rv:
            dba_err_ct += 1
            continue
          s['pmids'].append(pmid) # add pubmed id to list of saved ones
        rv = dba.ins_protein2pubmed({'protein_id': p['id'], 'pubmed_id': pmid})
        if not rv:
          dba_err_ct += 1
          continue
        s['p2p_ct'] += 1
      time.sleep(0.5)
    if err_ct == 0:
      s['loaded'].append(target['id'])
    pbar.update(ct)
  pbar.finish()
  print "Processed {} targets.".format(ct)
  print "  Successfully loaded all PubMeds for {} targets".format(len(s['loaded']))
  print "  Inserted {} new pubmed rows".format(len(s['pmids']))
  print "  Inserted {} new protein2pubmed rows".format(s['p2p_ct'])
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  if len(s['errors']) > 0:
    print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(len(s['errors']), logfile)
  loop = 1
  while len(s['errors']) > 0:
    print "\nRetry loop {}: Trying to load PubMeds for {} proteins".format(loop, len(s['errors']))
    logger.info("Retry loop {}: Trying to load data for {} proteins".format(loop, len(s['errors'])))
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['errors'])).start()
    ct = 0
    dba_err_ct = 0
    for tid, chunk_cts in s['errors'].items():
      ct += 1
      target = dba.get_target(tid, include_annotations=True)
      logger.info("Processing target {}: {}".format(target['id'], target['name']))
      p = target['components']['protein'][0]
      pmids = [d['value'] for d in p['xrefs']['PubMed']]
      chunk_ct = 0
      err_ct = 0
      for chunk in chunker(pmids, 200):
        chunk_ct += 1
        # only process chunks that are in the errors lists
        if chunk_ct not in chunk_cts:
          continue
        r = get_pubmed(chunk)
        if not r or r.status_code != 200:
          # try again...
          r = get_pubmed(chunk)
          if not r or r.status_code != 200:
            logger.error("Bad E-Utils response for target {}, chunk {}".format(target['id'], chunk_ct))
            err_ct += 1
            continue
        soup = BeautifulSoup(r.text, "xml")
        pmas = soup.find('PubmedArticleSet')
        for pma in pmas.findAll('PubmedArticle'):
          pmid = pma.find('PMID').text
          if pmid not in s['pmids']: # only store each pubmed once
            logger.debug("  parsing XML for PMID: %s" % pmid)
            init = parse_pubmed_article(pma)
            rv = dba.ins_pubmed(init)
            if not rv:
              dba_err_ct += 1
              continue
            s['pmids'].append(pmid) # add pubmed id to list of saved ones
          rv = dba.ins_protein2pubmed({'protein_id': p['id'], 'pubmed_id': pmid})
          if not rv:
            dba_err_ct += 1
            continue
          s['p2p_ct'] += 1
        # remove chunk number from this target's error list
        s['errors'][tid].remove(chunk_ct)
        # if this target has no more errors, delete it from errors
        if len(s['errors'][tid]) == 0:
          del s['errors'][tid]
        time.sleep(0.5)
      if err_ct == 0:
        s['loaded'].append(target['id'])
      pbar.update(ct)
    pbar.finish()
    loop += 1
    print "Processed {} targets.".format(ct)
    print "  Successfully loaded all PubMeds for a total {} targets".format(len(s['loaded']))
    print "  Inserted {} new pubmed rows".format(len(s['pmids']))
    print "  Inserted {} new protein2pubmed rows".format(s['p2p_ct'])
    if dba_err_ct > 0:
      print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
    if len(s['errors']) > 0:
      print "  {} targets remaining for next retry loop.".format(len(s['errors']))
  s.close()
  # Find the set of TIN-X PubMed IDs not already stored in TCRD
  tinx_pmids = [str(pmid) for pmid in dba.get_tinx_pmids()]
  tinx_pmid_ct = len(tinx_pmids)
  pmids = [str(pmid) for pmid in dba.get_pmids()]
  if not args['--quiet']:
    print "\nChecking for {} TIN-X PubMed IDs in TCRD".format(tinx_pmid_ct)
  logger.info("Checking for {} TIN-X PubMed IDs in TCRD".format(tinx_pmid_ct))
  not_in_tcrd = list(set(tinx_pmids) - set(pmids))
  # for pmid in tinx_pmids:
  #   rv = dba.get_pubmed(pmid)
  #   if not rv:
  #     not_in_tcrd.add(pmid)
  not_in_tcrd_ct = len(not_in_tcrd)
  if not args['--quiet']:
    print "\nProcessing {} TIN-X PubMed IDs not in TCRD".format(not_in_tcrd_ct)
  logger.info("Processing {} TIN-X PubMed IDs".format(not_in_tcrd_ct))
  ct = 0
  pm_ct = 0
  net_err_ct = 0
  dba_err_ct = 0
  chunk_ct = 0
  for chunk in chunker(list(not_in_tcrd), 200):
    chunk_ct += 1
    logger.info("Processing TIN-X PubMed IDs chunk {}".format(chunk_ct))
    r = get_pubmed(chunk)
    if not r or r.status_code != 200:
      # try again...
      r = get_pubmed(chunk)
      if not r or r.status_code != 200:
        logger.error("Bad E-Utils response for chunk {}".format(chunk_ct))
        net_err_ct += 1
        continue
    soup = BeautifulSoup(r.text, "xml")
    pmas = soup.find('PubmedArticleSet')
    for pma in pmas.findAll('PubmedArticle'):
      ct += 1
      pmid = pma.find('PMID').text
      logger.debug("  parsing XML for PMID: {}".format(pmid))
      init = parse_pubmed_article(pma)
      rv = dba.ins_pubmed(init)
      if not rv:
        dba_err_ct += 1
        continue
      pm_ct += 1
    time.sleep(0.5)
  print "Processed {} TIN-X PubMed IDs.".format(ct)
  print "  Inserted {} new pubmed rows".format(pm_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  if net_err_ct > 0:
    print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(net_err_ct, logfile)
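# chunker() and get_pubmed() are helpers used throughout this loader.
# Minimal sketches: a slicing generator, and an E-Utilities efetch POST
# whose URL and parameters are assumptions, not confirmed from this file.
import requests

def chunker(l, n):
  # Yield successive n-sized slices of list l, as in chunker(pmids, 200)
  return (l[i:i + n] for i in xrange(0, len(l), n))

def get_pubmed(pmids):
  url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
  params = {'db': 'pubmed', 'retmode': 'xml', 'id': ','.join(pmids)}
  try:
    return requests.post(url, data=params, timeout=30)
  except Exception:
    return None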
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'GO Experimental Leaf Term Flags',
                                'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'comments': 'These values are calculated by the loader app and indicate that a protein is annotated with a GO leaf term in either the Molecular Function or Biological Process branch with an experimental evidence code.'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile {} for details.".format(logfile)
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info',
                           'where_clause': "itype = 'Experimental MF/BP Leaf Term GOA'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
    sys.exit(1)
  gofile = DOWNLOAD_DIR + FILENAME
  logger.info("Parsing GO OBO file: %s" % gofile)
  godag = GODag(gofile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nProcessing {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  ti_ct = 0
  notfnd = {}
  dba_err_ct = 0
  exp_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
  for t in dba.get_targets(idg=False, include_annotations=True):
    ct += 1
    p = t['components']['protein'][0]
    if 'goas' in p:
      lfe_goa_strs = []
      for d in p['goas']:
        if d['go_term'].startswith('C'):
          continue # only want MF/BP terms
        ev = d['evidence']
        if ev not in exp_codes:
          continue # only want experimental evidence GOAs
        gt = godag.query_term(d['go_id'])
        if not gt:
          k = "%s:%s" % (d['go_id'], d['go_term'])
          notfnd[k] = True
          logger.error("GO term %s not found in GODag" % k)
          continue
        if len(gt.children) == 0: # if it's a leaf node
          lfe_goa_strs.append("%s|%s|%s" % (d['go_id'], d['go_term'], ev))
      if lfe_goa_strs:
        rv = dba.ins_tdl_info({'protein_id': p['id'],
                               'itype': 'Experimental MF/BP Leaf Term GOA',
                               'string_value': "; ".join(lfe_goa_strs)})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} TCRD targets processed.".format(ct)
  print "  Inserted {} new tdl_info rows".format(ti_ct)
  if len(notfnd.keys()) > 0:
    print "WARNING: {} GO terms not found in GODag. See logfile {} for details.".format(len(notfnd.keys()), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
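# GODag and its query_term()/children attributes match the goatools OBO
# parser; the assumed import for the function above would be:
# from goatools.obo_parser import GODag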
def calc_and_load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Consensus Expression Values',
                                'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'comments': 'Consensus of GTEx, HPM and HPA expression values are calculated by the loader app.'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'expression',
                           'where_clause': "etype = 'Consensus'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  tmap = {} # tissue name to Tissue Type as per TIO
  line_ct = slmf.wcl(TISSUESTYPED_FILE)
  line_ct -= 1
  if not args['--quiet']:
    print '\nProcessing {} lines in tissue mapping file: {}'.format(line_ct, TISSUESTYPED_FILE)
  with open(TISSUESTYPED_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    ct = 0
    for row in csvreader:
      ct += 1
      tissue = row[0].lower()
      tmap[tissue] = row[2]
  if not args['--quiet']:
    print '  Got {} tissue name mappings'.format(len(tmap))
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nCalculating/Loading Consensus expressions for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  nouid = set()
  exp_ct = 0
  dba_err_ct = 0
  for t in dba.get_targets(include_annotations=True):
    ct += 1
    p = t['components']['protein'][0]
    if not 'expressions' in p and not 'gtexs' in p:
      continue
    want = ['HPA', 'HPM Gene', 'HPM Protein']
    exps = [e for e in p['expressions'] if e['etype'] in want]
    gtexs = None
    if 'gtexs' in p:
      gtexs = p['gtexs']
    aggexps = aggregate_exps(exps, gtexs, tmap)
    for tissue, vals in aggexps.items():
      (cons, conf) = calculate_consensus(vals)
      init = {'protein_id': p['id'], 'etype': 'Consensus', 'tissue': tissue,
              'qual_value': cons, 'confidence': conf}
      rv = dba.ins_expression(init)
      if rv:
        exp_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  if not args['--quiet']:
    print "Processed {} targets.".format(ct)
    print "  Inserted {} new Consensus expression rows.".format(exp_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
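# aggregate_exps() and calculate_consensus() are defined elsewhere. A
# hypothetical sketch of the consensus step only: a majority vote over the
# qualitative calls aggregated per tissue, with a simple agreement-based
# confidence. The real scoring used by the loader may differ.
from collections import Counter

def calculate_consensus(vals):
  ctr = Counter(vals)
  cons, n = ctr.most_common(1)[0]
  conf = int(round(5.0 * n / len(vals))) # 0-5 scale from fraction agreeing
  return (cons, conf)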