Example 1
def main():
  argparser = argparse.ArgumentParser(description="Export TCRD target data to a CSV file")
  argparser.add_argument("-o", "--outfile", help='Output file [path/]name', default=OUTFILE)
  argparser.add_argument('-db', '--dbname', help='MySQL database name', default=DBNAME)
  argparser.add_argument("-i", "--idg", help="Export only IDG-Eligible tagets", action="store_true", default=False)
  argparser.add_argument("-e", "--expand", help="Export expanded (a LOT of data) CSV version", action="store_true", default=False)
  args = argparser.parse_args()
  
  dba = DBAdaptor({'dbname': args.dbname})
  dbi = dba.get_dbinfo()
  print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
  print "\nConnected to TCRD database %s (schema ver %s, data ver %s)\n" % (dbi['dbname'], dbi['schema_ver'], dbi['data_ver'])

  if args.idg:
    tct = dba.get_target_count(idg=True)
    print "Exporting CSV for %d IDG-Eligible targets from TCRD to file %s" % (tct, args.outfile)
  else:
    tct = dba.get_target_count(idg=False)
    print "Exporting CSV for all %d targets from TCRD to file %s" % (tct, args.outfile)

  header = ['TCRD ID', 'Name', 'Description', 'HGNC Sym', 'NCBI Gene ID', 'UniProt', 'STRING ID', 'TDL', 'IDG Eligible', 'DTO ID', 'DTO Class']
  if args.expand:
    header = header + ['PANTHER Class(es)', 'GeneRIF Count', 'NCBI Gene PubMed Count', 'JensenLab PubMed Score', 'PubTator Score', 'Ab Count', 'Monoclonal Ab Count', 'Activity Count', 'ChEMBL Selective Compound', 'ChEMBL First Reference Year', 'DrugCentral Activity Count', 'PDB Count', 'PDBs', 'GO Annotation Count', 'Experimental MF/BP Leaf Term GOA(s)', 'OMIM Phenotype Count', 'OMIM Phenotype(s)', 'JAX/MGI Human Ortholog Phenotype Count', 'JAX/MGI Human Ortholog Phenotype(s)', 'IMPC Ortholog Phenotype Count', 'IMPC Ortholog Phenotype(s)', 'GWAS Count', 'GWAS Phenotype(s)', 'Pathway Count', 'Pathways', 'Total Disease Count', 'Top 5 Text-Mining DISEASES', 'eRAM Diseases', 'EBI Patent Count', 'Is Transcription Factor', 'TMHMM Prediction', 'HPA Tissue Specificity Index', 'HPM Gene Tissue Specificity Index', 'HPM Protein Tissue Specificity Index', 'TIN-X Novelty', 'Top 5 TIN-X Importance(s)']
    
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  with open(args.outfile, 'wb') as csvout:
    csvwriter = csv.writer(csvout, quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header)
    ct = 0
    if args.idg:
      for t in dba.get_targets(idg=True, include_annotations=args.expand):
        ct += 1
        if args.expand:
          csvwriter.writerow( target2csv_exp(t) )
        else:
          csvwriter.writerow( target2csv(t) )
        pbar.update(ct)
    else:
      for t in dba.get_targets(idg=False, include_annotations=args.expand):
      #for tid in [9]:
      #  t = dba.get_target(tid, True)
        ct += 1
        if args.expand:
          csvwriter.writerow(target2csv_exp(t))
        else:
          csvwriter.writerow(target2csv(t))
        pbar.update(ct)
  pbar.finish()

  print "%d CSV rows exported" % ct
  print "\n%s: Done." % PROGRAM
Example 2
def calc_and_load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'KEGG Nearest Tclins',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'Nearest upstream and downstream Tclin targets are found and stored based on KEGG Distances.'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'kegg_nearest_tclin'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    uct = 0
    umark = set()
    dct = 0
    dmark = set()
    dba_err_ct = 0
    for target in dba.get_targets():
        #tids = [1983, 7166]
        #for tid in tids:
        #  target = dba.get_target(tid)
        ct += 1
        if target['tdl'] == 'Tclin':
            continue
        pid = target['components']['protein'][0]['id']
        ups = dba.get_nearest_kegg_tclins(pid, 'upstream')
        if ups:
            umark.add(pid)
            for d in ups:
                d['tclin_id'] = d['protein_id']
                d['protein_id'] = pid
                d['direction'] = 'upstream'
                rv = dba.ins_kegg_nearest_tclin(d)
                if rv:
                    uct += 1
                else:
                    dba_err_ct += 1
        dns = dba.get_nearest_kegg_tclins(pid, 'downstream')
        if dns:
            dmark.add(pid)
            for d in dns:
                d['tclin_id'] = d['protein_id']
                d['protein_id'] = pid
                d['direction'] = 'downstream'
                rv = dba.ins_kegg_nearest_tclin(d)
                if rv:
                    dct += 1
                else:
                    dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()

    if not args['--quiet']:
        print "\n{} targets processed.".format(ct)
        print "  {} non-Tclin targets have upstream Tclin target(s)".format(
            len(umark))
        print "    Inserted {} upstream kegg_nearest_tclin rows".format(uct)
        print "  {} non-Tclin targets have downstream Tclin target(s)".format(
            len(dmark))
        print "    Inserted {} upstream kegg_nearest_tclin rows".format(dct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
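
calc_and_load() expects a docopt-style dict of command-line options rather than an argparse namespace. A hypothetical invocation, with all values illustrative:

args = {
    '--dbhost': 'localhost',
    '--dbname': 'tcrd',
    '--logfile': None,     # falls back to the module-level LOGFILE
    '--loglevel': '20',    # logging.INFO; the function parses it with int()
    '--debug': False,      # True keeps logger output on the console
    '--quiet': False,      # True suppresses the progress printouts
}
calc_and_load(args)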
Example 3
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/'%(os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  aliasmap = {}
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ct = 0
  skip_ct = 0
  mult_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid   uniprot_ac|uniprot_id   string_id   identity   bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ## string_protein_id ## alias ## source ##
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid fo protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp} )
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example 4
def load(ortho_df, args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Orthologs',
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.genenames.org/cgi-bin/hcop'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'ortholog',
        'comment': "Orthologs are majority vote from the OMA, EggNOG and InParanoid resources as per HGNC."
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nLoading ortholog data for {} TCRD targets".format(tct)
    logger.info("Loading ortholog data for {} TCRD targets".format(tct))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ortho_ct = 0
    tskip_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    for target in dba.get_targets():
        ct += 1
        pbar.update(ct)
        logger.info("Processing target %d" % target['id'])
        p = target['components']['protein'][0]
        if p['sym']:  # try first by symbol
            to_df = ortho_df.loc[ortho_df['human_symbol'] == p['sym']]
        elif p['geneid']:  # then try by GeneID
            to_df = ortho_df.loc[ortho_df['human_entrez_gene'] == p['geneid']]
        else:
            tskip_ct += 1
            continue
        if len(to_df) == 0:
            continue
        for idx, row in to_df.iterrows():
            if row['ortholog_species_symbol'] == '-' and row['ortholog_species_name'] == '-':
                skip_ct += 1
                continue
            sp = TAXID2SP[row['ortholog_species']]
            init = {
                'protein_id': p['id'],
                'taxid': row['ortholog_species'],
                'species': sp,
                'sources': row['sources'],
                'symbol': row['ortholog_species_symbol'],
                'name': row['ortholog_species_name']
            }
            # Add MOD DB ID if it's there
            if row['ortholog_species_db_id'] != '-':
                init['db_id'] = row['ortholog_species_db_id']
            # Add NCBI Gene ID if it's there
            if row['ortholog_species_entrez_gene'] != '-':
                init['geneid'] = row['ortholog_species_entrez_gene']
            # Construct MOD URLs for mouse, rat, zebrafish, fly, worm and yeast
            if sp == 'Mouse':
                init['mod_url'] = 'http://www.informatics.jax.org/marker/' + row['ortholog_species_db_id']
            elif sp == 'Rat':
                rgdid = row['ortholog_species_db_id'].replace('RGD:', '')
                init['mod_url'] = 'http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=' + rgdid
            elif sp == 'Zebrafish':
                init['mod_url'] = 'http://zfin.org/' + row['ortholog_species_db_id']
            elif sp == 'Fruitfly':
                init['mod_url'] = "http://flybase.org/reports/%s.html" % row['ortholog_species_db_id']
            elif sp == 'C. elegans':
                init['mod_url'] = 'http://www.wormbase.org/search/gene/' + row['ortholog_species_symbol']
            elif sp == 'S.cerevisiae':
                init['mod_url'] = 'https://www.yeastgenome.org/locus/' + row['ortholog_species_db_id']
            rv = dba.ins_ortholog(init)
            if rv:
                ortho_ct += 1
            else:
                dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "Processed {} targets.".format(ct)
    print "Loaded {} new ortholog rows".format(ortho_ct)
    print "  Skipped {} empty ortholog entries".format(skip_ct)
    print "  Skipped {} targets with no sym/geneid".format(tskip_ct)
    if len(notfnd) > 0:
        print "  No orthologs found for {} targets.".format(len(notfnd))
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
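
TAXID2SP maps the HCOP ortholog_species taxid to the species labels tested above. A mapping consistent with those labels, using standard NCBI Taxonomy IDs (the exact table in the original module is not shown and may differ):

TAXID2SP = {
    10090: 'Mouse',
    10116: 'Rat',
    7955: 'Zebrafish',
    7227: 'Fruitfly',
    6239: 'C. elegans',
    4932: 'S.cerevisiae',
}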
Example 5
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'TDLs',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'TDLs are generated by the loading app from data in TCRD.'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'tdl'
    })
    if not rv:
        print "WARNING: Error inserting provenance. See logfile {} for details.".format(
            logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    tdl_cts = {'Tclin': 0, 'Tchem': 0, 'Tbio': 0, 'Tdark': 0}
    bump_ct = 0
    dba_err_ct = 0
    upd_ct = 0
    for target in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        pbar.update(ct)
        (tdl, bump_flag) = get_tdl(target)
        tdl_cts[tdl] += 1
        if bump_flag:
            bump_ct += 1
        rv = dba.upd_target(target['id'], 'tdl', tdl)
        if rv:
            upd_ct += 1
        else:
            dba_err_ct += 1
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "Set TDL values for {} targets:".format(upd_ct)
    print "  {} targets are Tclin".format(tdl_cts['Tclin'])
    print "  {} targets are Tchem".format(tdl_cts['Tchem'])
    print "  {} targets are Tbio - {} bumped from Tdark".format(
        tdl_cts['Tbio'], bump_ct)
    print "  {} targets are Tdark".format(tdl_cts['Tdark'])
    if dba_err_ct > 0:
        print "WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile)
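
get_tdl(target) returns a (tdl, bump_flag) pair. The published IDG criteria classify a target as Tclin (approved drug with a known mechanism of action), Tchem (satisfies ChEMBL activity cutoffs), Tdark (meets at least two of: JensenLab PubMed score < 5, <= 3 GeneRIFs, <= 50 antibodies), and Tbio otherwise; bump_flag marks targets that met the Tdark criteria but were promoted to Tbio. A skeleton under those assumptions, with all field names hypothetical since the real function reads the target's annotations:

def get_tdl(target):
    p = target['components']['protein'][0]
    if p.get('moa_drugs'):               # hypothetical field
        return ('Tclin', False)
    if p.get('meets_chembl_cutoffs'):    # hypothetical field
        return ('Tchem', False)
    dark_pts = 0
    if p.get('jl_pubmed_score', 0) < 5: dark_pts += 1
    if p.get('generif_count', 0) <= 3: dark_pts += 1
    if p.get('ab_count', 0) <= 50: dark_pts += 1
    if dark_pts >= 2:
        if p.get('has_exp_goa_or_phenotype'):  # hypothetical bump rule
            return ('Tbio', True)              # bumped from Tdark to Tbio
        return ('Tdark', False)
    return ('Tbio', False)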
Example 6
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Harmonogram CDFs',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'CDFs are calculated by the loader app based on gene_attribute data in TCRD.'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'hgram_cdf'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    # Create a dictionary of gene_attribute_type.name => [] pairs
    counts = {}
    # Create a dictionary of gene_attribute_type.name => {} pairs
    stats = {}
    gatypes = dba.get_gene_attribute_types()
    for ga in gatypes:
        counts[ga] = []
        stats[ga] = {}

    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nCollecting counts for {} gene attribute types on {} TCRD targets".format(
            len(gatypes), tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    for t in dba.get_targets(idg=False,
                             include_annotations=True,
                             get_ga_counts=True):
        ct += 1
        pbar.update(ct)
        p = t['components']['protein'][0]
        pid = p['id']
        if 'gene_attribute_counts' not in p: continue
        for type, attr_count in p['gene_attribute_counts'].items():
            counts[type].append(attr_count)
    pbar.finish()

    print "\nCalculatig Gene Attribute stats. See logfile {}.".format(logfile)
    logger.info("Calculatig Gene Attribute stats:")
    for type, l in counts.items():
        if len(l) == 0:
            del counts[type]
            continue
        npa = numpy.array(l)
        logger.info("  %s: %d counts; mean: %.2f; std: %.2f" %
                    (type, len(l), npa.mean(), npa.std()))
        stats[type]['mean'] = npa.mean()
        stats[type]['std'] = npa.std()

    print "\nLoading HGram CDFs for {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    nan_ct = 0
    cdf_ct = 0
    dba_err_ct = 0
    for t in dba.get_targets(idg=False,
                             include_annotations=True,
                             get_ga_counts=True):
        ct += 1
        p = t['components']['protein'][0]
        pid = p['id']
        if 'gene_attribute_counts' not in p: continue
        for type, attr_count in p['gene_attribute_counts'].items():
            attr_cdf = gaussian_cdf(attr_count, stats[type]['mean'],
                                    stats[type]['std'])
            if math.isnan(attr_cdf):
                attr_cdf = 1.0 / (1.0 + math.exp(-1.702 * (
                    (attr_count - stats[type]['mean']) / stats[type]['std'])))
            if math.isnan(attr_cdf):
                nan_ct += 1
                continue
            rv = dba.ins_hgram_cdf({
                'protein_id': p['id'],
                'type': type,
                'attr_count': attr_count,
                'attr_cdf': attr_cdf
            })
            if not rv:
                dba_err_ct += 1
                continue
            cdf_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "Processed {} targets.".format(ct)
    print "  Loaded {} new hgram_cdf rows".format(cdf_ct)
    print "  Skipped {} NaN CDFs".format(nan_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
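
gaussian_cdf() is not shown, but its call signature (value, mean, std) matches the standard normal CDF. A conventional implementation via math.erf (the original helper may differ):

import math

def gaussian_cdf(x, mu, sigma):
    """CDF of a normal distribution with mean mu and std dev sigma."""
    return 0.5 * (1.0 + math.erf((x - mu) / (sigma * math.sqrt(2.0))))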
Example 7
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as load()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Antibodypedia.com',
        'source': 'Web API at %s' % ABPC_API_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.antibodypedia.com'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Ab Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'MAb Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Antibodypedia.com URL'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile)
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nLoading Antibodypedia annotations for {} TCRD targets".format(
            tct)
    logger.info(
        "Loading Antibodypedia annotations for {} TCRD targets".format(tct))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    tiab_ct = 0
    timab_ct = 0
    tiurl_ct = 0
    dba_err_ct = 0
    net_err_ct = 0
    for target in dba.get_targets():
        ct += 1
        pbar.update(ct)
        tid = target['id']
        p = target['components']['protein'][0]
        pid = p['id']
        url = ABPC_API_URL + p['uniprot']
        r = None
        attempts = 1
        while attempts <= 5:
            try:
                logger.info("Getting {} [Target {}, attempt {}]".format(
                    url, tid, attempts))
                r = requests.get(url)
                break
            except Exception:
                attempts += 1
                time.sleep(1)
        if not r:
            net_err_ct += 1
            logger.error("No response for {} [Target {}, attempt {}]".format(
                url, tid, attempts))
            continue
        if r.status_code != 200:
            net_err_ct += 1
            logger.error(
                "Bad response: {} for {} [Target {}, attempt {}]".format(
                    r.status_code, url, tid, attempts))
            continue
        abpd = json.loads(r.text)
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'Ab Count',
            'integer_value': int(abpd['num_antibodies'])
        })
        if rv:
            tiab_ct += 1
        else:
            dba_err_ct += 1
        if 'ab_type_monoclonal' in abpd:
            mab_ct = int(abpd['ab_type_monoclonal'])
        else:
            mab_ct = 0
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'MAb Count',
            'integer_value': mab_ct
        })
        if rv:
            timab_ct += 1
        else:
            dba_err_ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'Antibodypedia.com URL',
            'string_value': abpd['url']
        })
        if rv:
            tiurl_ct += 1
        else:
            dba_err_ct += 1
        time.sleep(1)
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
    print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
    print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
    if net_err_ct > 0:
        print "WARNING: Network error for {} targets. See logfile {} for details.".format(
            net_err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example 8
def run_and_load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'TMHMM Predictions',
        'source': 'Results of running TMHMM on protein sequences.',
        'app': PROGRAM,
        'app_version': __version__,
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'TMHMM Prediction'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    tct = dba.get_target_count(idg=False)
    print "\nProcessing {} TCRD targets".format(tct)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()

    regex = re.compile(r'PredHel=(\d+)')
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for t in dba.get_targets(idg=False, include_annotations=False):
        ct += 1
        p = t['components']['protein'][0]
        fasta = ">%s|%s %s\n%s\n" % (t['id'], p['name'], p['description'],
                                     p['seq'])
        #print "[DEBUG] Fasta:\n%s" % fasta
        fasta_filename = "/tmp/%s.fa" % t['id']
        f = open(fasta_filename, 'w')
        f.write(fasta)
        f.close()
        cmd = '%s --short --noplot %s' % (TMHMM_BIN, fasta_filename)
        #print "[DEBUG] Cmd: %s" % cmd
        output = ''
        for line in runProcess(cmd.split()):
            output += line
        os.remove(fasta_filename)
        #print "[DEBUG] Output: %s" % output
        pred = regex.findall(output)[0]
        #print "[DEBUG] PredHel: %s" % pred
        if pred != '0':
            rv = dba.ins_tdl_info({
                'protein_id': p['id'],
                'itype': 'TMHMM Prediction',
                'string_value': output
            })
            if not rv:
                dba_err_ct += 1
                continue
            ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} targets processed.".format(ct)
    print "  Inserted {} new TMHMM Prediction tdl_info rows".format(ti_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
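
runProcess() is used above as a generator over TMHMM's stdout. A plausible Python 2 sketch with subprocess (the original helper may differ):

import subprocess

def runProcess(cmd):
    """Run cmd (a list of args) and yield its stdout line by line."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    for line in iter(proc.stdout.readline, ''):
        yield line
    proc.stdout.close()
    proc.wait()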
Example 9
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as main()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI Gene',
        'source': 'EUtils web API at %s' % EFETCH_GENE_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.ncbi.nlm.nih.gov/gene'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'NCBI Gene Summary'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'NCBI Gene PubMed Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'generif'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }, {
        'dataset_id': dataset_id,
        'table_name': 'alias',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
            sys.exit(1)

    s = shelve.open(SHELF_FILE, writeback=True)
    s['loaded'] = []
    s['retries'] = {}
    s['counts'] = defaultdict(int)

    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nLoading NCBI Gene annotations for %d TCRD targets" % tct
    logger.info("Loading NCBI Gene annotations for %d TCRD targets\n" % tct)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    skip_ct = 0
    for t in dba.get_targets(include_annotations=False):
        tid = t['id']
        ct += 1
        p = t['components']['protein'][0]
        pid = p['id']
        if p['geneid'] is None:
            skip_ct += 1
            continue
        geneid = str(p['geneid'])
        logger.info("Processing target %d: geneid %s" % (tid, geneid))
        (status, headers, xml) = get_ncbigene(geneid)
        if not status:
            logger.warn("Failed getting Gene ID %s" % geneid)
            s['retries'][tid] = True
            continue
        if status != 200:
            logger.warn("Bad API response for Gene ID %s: %s" %
                        (geneid, status))
            s['retries'][tid] = True
            continue
        gene_annotations = parse_genexml(xml)
        if not gene_annotations:
            s['counts']['xml_err'] += 1
            logger.error("XML Error for Gene ID %s" % geneid)
            s['retries'][tid] = True
            continue
        load_annotations(dba, t, dataset_id, gene_annotations, s)
        time.sleep(0.5)
        pbar.update(ct)
    pbar.finish()
    print "Processed %d targets." % ct
    if skip_ct > 0:
        print "Skipped %d targets with no geneid" % skip_ct
    print "Loaded NCBI annotations for %d targets" % len(s['loaded'])
    if len(s['retries']) > 0:
        print "Total targets remaining for retries: %d " % len(s['retries'])

    loop = 1
    while len(s['retries']) > 0:
        print "\nRetry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (
            loop, len(s['retries']))
        logger.info(
            "Retry loop %d: Loading NCBI Gene annotations for %d TCRD targets"
            % (loop, len(s['retries'])))
        pbar_widgets = [
            'Progress: ',
            Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ',
            ETA()
        ]
        pbar = ProgressBar(widgets=pbar_widgets,
                           maxval=len(s['retries'])).start()
        ct = 0
        act = 0
        for tid, _ in s['retries'].items():
            ct += 1
            t = dba.get_target(tid, include_annotations=False)
            geneid = str(t['components']['protein'][0]['geneid'])
            logger.info("Processing target %d: geneid %s" % (tid, geneid))
            (status, headers, xml) = get_ncbigene(geneid)
            if not status:
                logger.warn("Failed getting Gene ID %s" % geneid)
                continue
            if status != 200:
                logger.warn("Bad API response for Gene ID %s: %s" %
                            (geneid, status))
                continue
            gene_annotations = parse_genexml(xml)
            if not gene_annotations:
                s['counts']['xml_err'] += 1
                logger.error("XML Error for Gene ID %s" % geneid)
                continue
            load_annotations(dba, t, dataset_id, gene_annotations, s)
            act += 1
            del s['retries'][tid]
            time.sleep(0.5)
            pbar.update(ct)
        pbar.finish()
        print "Processed %d targets." % ct
        print "  Annotated %d additional targets" % act
        print "  Total annotated targets: %d" % len(s['loaded'])
        if len(s['retries']) > 0:
            print "Total targets remaining for retries: %d " % len(
                s['retries'])
        loop += 1
        if loop == 5:
            print "Completed 5 retry loops. Aborting."
            break

    print "\nInserted %d aliases" % s['counts']['alias']
    print "Inserted %d NCBI Gene Summary tdl_infos" % s['counts']['summary']
    print "Inserted %d NCBI Gene PubMed Count tdl_infos" % s['counts']['pmc']
    print "Inserted %d GeneRIFs" % s['counts']['generif']
    print "Inserted %d PubMed xrefs" % s['counts']['pmxr']
    #print "Inserted %d other xrefs" % s['counts']['xref']
    if s['counts']['xml_err'] > 0:
        print "WARNNING: %d XML parsing errors occurred. See logfile %s for details." % (
            s['counts']['xml_err'], logfile)
    if s['counts']['dba_err'] > 0:
        print "WARNNING: %d DB errors occurred. See logfile %s for details." % (
            s['counts']['dba_err'], logfile)
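
get_ncbigene() returns a (status, headers, xml) triple. A hypothetical sketch consistent with how it is called, assuming EFETCH_GENE_URL ends with the query-string prefix so the Gene ID can be appended directly:

import time
import requests

def get_ncbigene(geneid, attempts=3):
    url = EFETCH_GENE_URL + str(geneid)  # assumed URL shape
    for _ in range(attempts):
        try:
            r = requests.get(url)
            return (r.status_code, r.headers, r.text)
        except Exception:
            time.sleep(1)
    return (None, None, None)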
Example 10
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'PubMed',
        'source': 'NCBI E-Utils',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/pubmed'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pubmed'})
    if not rv:
        print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
        sys.exit(1)
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'protein2pubmed'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    s = shelve.open(SHELF_FILE, writeback=True)
    s['loaded'] = []  # list of target IDs that have been successfully processed
    s['pmids'] = []  # list of stored pubmed ids
    s['p2p_ct'] = 0
    s['errors'] = defaultdict(list)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if args['--pastid']:
        tct = dba.get_target_count(idg=False, past_id=args['--pastid'])
    else:
        tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nLoading pubmeds for {} TCRD targets".format(tct)
        logger.info("Loading pubmeds for {} TCRD targets".format(tct))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    dba_err_ct = 0
    if args['--pastid']:
        past_id = args['--pastid']
    else:
        past_id = 0
    for target in dba.get_targets(include_annotations=True, past_id=past_id):
        ct += 1
        logger.info("Processing target {}: {}".format(target['id'],
                                                      target['name']))
        p = target['components']['protein'][0]
        if 'PubMed' not in p['xrefs']: continue
        pmids = [d['value'] for d in p['xrefs']['PubMed']]
        chunk_ct = 0
        err_ct = 0
        for chunk in chunker(pmids, 200):
            chunk_ct += 1
            r = get_pubmed(chunk)
            if not r or r.status_code != 200:
                # try again...
                r = get_pubmed(chunk)
                if not r or r.status_code != 200:
                    logger.error(
                        "Bad E-Utils response for target {}, chunk {}".format(
                            target['id'], chunk_ct))
                    s['errors'][target['id']].append(chunk_ct)
                    err_ct += 1
                    continue
            soup = BeautifulSoup(r.text, "xml")
            pmas = soup.find('PubmedArticleSet')
            for pma in pmas.findAll('PubmedArticle'):
                pmid = pma.find('PMID').text
                if pmid not in s['pmids']:
                    # only store each pubmed once
                    logger.debug("  parsing XML for PMID: %s" % pmid)
                    init = parse_pubmed_article(pma)
                    rv = dba.ins_pubmed(init)
                    if not rv:
                        dba_err_ct += 1
                        continue
                    s['pmids'].append(
                        pmid)  # add pubmed id to list of saved ones
                rv = dba.ins_protein2pubmed({
                    'protein_id': p['id'],
                    'pubmed_id': pmid
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                s['p2p_ct'] += 1
            time.sleep(0.5)
        if err_ct == 0:
            s['loaded'].append(target['id'])
        pbar.update(ct)
    pbar.finish()
    print "Processed {} targets.".format(ct)
    print "  Successfully loaded all PubMeds for {} targets".format(
        len(s['loaded']))
    print "  Inserted {} new pubmed rows".format(len(s['pmids']))
    print "  Inserted {} new protein2pubmed rows".format(s['p2p_ct'])
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    if len(s['errors']) > 0:
        print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(
            len(s['errors']), logfile)

    loop = 1
    while len(s['errors']) > 0:
        print "\nRetry loop {}: Trying to load PubMeds for {} proteins".format(
            loop, len(s['errors']))
        logger.info(
            "Retry loop {}: Trying to load data for {} proteins".format(
                loop, len(s['errors'])))
        pbar_widgets = [
            'Progress: ',
            Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ',
            ETA()
        ]
        pbar = ProgressBar(widgets=pbar_widgets,
                           maxval=len(s['errors'])).start()
        ct = 0
        dba_err_ct = 0
        for tid, chunk_cts in s['errors'].items():
            ct += 1
            target = dba.get_target(tid, include_annotations=True)
            logger.info("Processing target {}: {}".format(
                target['id'], target['name']))
            p = target['components']['protein'][0]
            pmids = [d['value'] for d in p['xrefs']['PubMed']]
            chunk_ct = 0
            err_ct = 0
            for chunk in chunker(pmids, 200):
                chunk_ct += 1
                # only process chunks that are in the errors lists
                if chunk_ct not in chunk_cts:
                    continue
                r = get_pubmed(chunk)
                if not r or r.status_code != 200:
                    # try again...
                    r = get_pubmed(chunk)
                    if not r or r.status_code != 200:
                        logger.error(
                            "Bad E-Utils response for target {}, chunk {}".
                            format(target['id'], chunk_ct))
                        err_ct += 1
                        continue
                soup = BeautifulSoup(r.text, "xml")
                pmas = soup.find('PubmedArticleSet')
                for pma in pmas.findAll('PubmedArticle'):
                    pmid = pma.find('PMID').text
                    if pmid not in s['pmids']:
                        # only store each pubmed once
                        logger.debug("  parsing XML for PMID: %s" % pmid)
                        init = parse_pubmed_article(pma)
                        rv = dba.ins_pubmed(init)
                        if not rv:
                            dba_err_ct += 1
                            continue
                        s['pmids'].append(
                            pmid)  # add pubmed id to list of saved ones
                    rv = dba.ins_protein2pubmed({
                        'protein_id': p['id'],
                        'pubmed_id': pmid
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    s['p2p_ct'] += 1
                # remove chunk number from this target's error list
                s['errors'][tid].remove(chunk_ct)
                # if this target has no more errors, delete it from errors
                if len(s['errors'][tid]) == 0:
                    del (s['errors'][tid])
                time.sleep(0.5)
            if err_ct == 0:
                s['loaded'].append(target['id'])
            pbar.update(ct)
        pbar.finish()
        print "Processed {} targets.".format(ct)
        print "  Successfully loaded all PubMeds for a total {} targets".format(
            len(s['loaded']))
        print "  Inserted {} new pubmed rows".format(len(s['pmids']))
        print "  Inserted {} new protein2pubmed rows".format(s['p2p_ct'])
        if dba_err_ct > 0:
            print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
                dba_err_ct, logfile)
    if len(s['errors']) > 0:
        print "  {} targets remaining for next retry loop.".format(
            len(s['errors']))
    s.close()

    # Find the set of TIN-X PubMed IDs not already stored in TCRD
    tinx_pmids = [str(pmid) for pmid in dba.get_tinx_pmids()]
    tinx_pmid_ct = len(tinx_pmids)
    pmids = [str(pmid) for pmid in dba.get_pmids()]
    if not args['--quiet']:
        print "\nChecking for {} TIN-X PubMed IDs in TCRD".format(tinx_pmid_ct)
        logger.info(
            "Checking for {} TIN-X PubMed IDs in TCRD".format(tinx_pmid_ct))
    not_in_tcrd = list(set(tinx_pmids) - set(pmids))
    # for pmid in tinx_pmids:
    #   rv = dba.get_pubmed(pmid)
    #   if not rv:
    #     not_in_tcrd.add(pmid)
    not_in_tcrd_ct = len(not_in_tcrd)
    if not args['--quiet']:
        print "\nProcessing {} TIN-X PubMed IDs not in TCRD".format(
            not_in_tcrd_ct)
        logger.info("Processing {} TIN-X PubMed IDs".format(not_in_tcrd_ct))
    ct = 0
    pm_ct = 0
    net_err_ct = 0
    dba_err_ct = 0
    chunk_ct = 0
    for chunk in chunker(list(not_in_tcrd), 200):
        chunk_ct += 1
        logger.info("Processing TIN-X PubMed IDs chunk {}".format(chunk_ct))
        r = get_pubmed(chunk)
        if not r or r.status_code != 200:
            # try again...
            r = get_pubmed(chunk)
            if not r or r.status_code != 200:
                logger.error(
                    "Bad E-Utils response for chunk {}".format(chunk_ct))
                net_err_ct += 1
                continue
        soup = BeautifulSoup(r.text, "xml")
        pmas = soup.find('PubmedArticleSet')
        for pma in pmas.findAll('PubmedArticle'):
            ct += 1
            pmid = pma.find('PMID').text
            logger.debug("  parsing XML for PMID: {}".format(pmid))
            init = parse_pubmed_article(pma)
            rv = dba.ins_pubmed(init)
            if not rv:
                dba_err_ct += 1
                continue
            pm_ct += 1
        time.sleep(0.5)
    print "Processed {} TIN-X PubMed IDs.".format(ct)
    print "  Inserted {} new pubmed rows".format(pm_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    if net_err_ct > 0:
        print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(
            net_err_ct, logfile)
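
chunker() batches the PMID list into groups of 200 for E-Utils. A plausible implementation (the original helper may differ):

def chunker(lst, n):
    """Split lst into successive chunks of at most n items."""
    return [lst[i:i + n] for i in range(0, len(lst), n)]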
Example 11
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GO Experimental Leaf Term Flags',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'These values are calculated by the loader app and indicate that a protein is annotated with a GO leaf term in either the Molecular Function or Biological Process branch with an experimental evidence code.'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Experimental MF/BP Leaf Term GOA'"
    })
    if not rv:
        print "WARNING: Error inserting provenance. See logfile {} for details.".format(
            logfile)
        sys.exit(1)

    gofile = DOWNLOAD_DIR + FILENAME
    logger.info("Parsing GO OBO file: %s" % gofile)
    godag = GODag(gofile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ti_ct = 0
    notfnd = {}
    dba_err_ct = 0
    exp_codes = ['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP']
    for t in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        p = t['components']['protein'][0]
        if 'goas' in p:
            lfe_goa_strs = []
            for d in p['goas']:
                if d['go_term'].startswith('C'):
                    continue  # only want MF/BP terms
                ev = d['evidence']
                if ev not in exp_codes:
                    continue  # only want experimental evidence GOAs
                gt = godag.query_term(d['go_id'])
                if not gt:
                    k = "%s:%s" % (d['go_id'], d['go_term'])
                    notfnd[k] = True
                    logger.error("GO term %s not found in GODag" % k)
                    continue
                if len(gt.children) == 0:  # if it's a leaf node
                    lfe_goa_strs.append("%s|%s|%s" %
                                        (d['go_id'], d['go_term'], ev))
            if lfe_goa_strs:
                rv = dba.ins_tdl_info({
                    'protein_id': p['id'],
                    'itype': 'Experimental MF/BP Leaf Term GOA',
                    'string_value': "; ".join(lfe_goa_strs)
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} new  tdl_info rows".format(ti_ct)
    if len(notfnd.keys()) > 0:
        print "WARNING: {} GO terms not found in GODag. See logfile {} for details.".format(
            (len(notfnd.keys()), logfile))
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            (dba_err_ct, logfile))
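
The leaf-term test relies on goatools' GODag. A standalone illustration of the same check, with the OBO file name and GO ID purely illustrative:

from goatools.obo_parser import GODag

godag = GODag('go-basic.obo')  # illustrative file name
term = godag.query_term('GO:0004674', verbose=False)  # illustrative GO ID
if term and len(term.children) == 0:
    print "%s (%s) is a leaf term" % (term.id, term.name)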
Example 12
def calc_and_load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Consensus Expression Values',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'Consensus of GTEx, HPM and HPA expression values are calculated by the loader app.'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'expression',
        'where_clause': "etype = 'Consensus'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    tmap = {}  # tissue name to Tissue Type as per TIO
    line_ct = slmf.wcl(TISSUESTYPED_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print '\nProcessing {} lines in tissue mapping file: {}'.format(
            line_ct, TISSUESTYPED_FILE)
    with open(TISSUESTYPED_FILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct = 0
        for row in csvreader:
            ct += 1
            tissue = row[0].lower()
            tmap[tissue] = row[2]
    if not args['--quiet']:
        print '  Got {} tissue name mappings'.format(len(tmap))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nCalculating/Loading Consensus expressions for {} TCRD targets".format(
            tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    nouid = set()
    exp_ct = 0
    dba_err_ct = 0
    for t in dba.get_targets(include_annotations=True):
        ct += 1
        p = t['components']['protein'][0]
        if 'expressions' not in p and 'gtexs' not in p:
            continue
        want = ['HPA', 'HPM Gene', 'HPM Protein']
        exps = [e for e in p.get('expressions', []) if e['etype'] in want]
        gtexs = None
        if 'gtexs' in p:
            gtexs = p['gtexs']
        aggexps = aggregate_exps(exps, gtexs, tmap)
        for tissue, vals in aggexps.items():
            (cons, conf) = calculate_consensus(vals)
            init = {
                'protein_id': p['id'],
                'etype': 'Consensus',
                'tissue': tissue,
                'qual_value': cons,
                'confidence': conf
            }
            rv = dba.ins_expression(init)
            if rv:
                exp_ct += 1
            else:
                dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    if not args['--quiet']:
        print "Processed {} targets.".format(ct)
        print "  Inserted {} new Consensus expression rows.".format(exp_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)