Example #1
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/'%(os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  aliasmap = {}
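  # aliasmap: UniProt accession / name / alias -> (ENSP string ID, bit score or None)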
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ct = 0
  skip_ct = 0
  mult_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid   uniprot_ac|uniprot_id   string_id   identity   bit_score
      ct += 1
      pbar.update(ct)
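      # keep only rows where the UniProt and STRING sequences are 100% identical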
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ## string_protein_id ## alias ## source ##
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid fo protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp} )
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #2
def load(infile, args, logger):
    # logfile is used in error messages below but was never set in this
    # function; derive it from args as the other loaders do (assumes the
    # module-level LOGFILE default exists).
    logfile = args['--logfile'] or LOGFILE
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'PubChem CIDs',
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'cmpd_activity',
        'column_name': 'pubchem_cid',
        'comment': "Loaded from UniChem file mapping ChEMBL IDs to PubChem CIDs."
    }, {
        'dataset_id': dataset_id,
        'table_name': 'drug_activity',
        'column_name': 'pubchem_cid',
        'comment': "Loaded from UniChem file mapping ChEMBL IDs to PubChem CIDs."
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, infile)
    chembl2pc = {}
    with open(infile, 'rU') as tsv:
        tsv.readline()  # skip header line
        for line in tsv:
            data = line.split('\t')
            chembl2pc[data[0]] = int(data[1])
    if not args['--quiet']:
        print "Got {} ChEMBL to PubChem mappings".format(len(chembl2pc))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    chembl_activities = dba.get_cmpd_activities(catype='ChEMBL')
    if not args['--quiet']:
        print "\nLoading PubChem CIDs for {} ChEMBL activities".format(
            len(chembl_activities))
    logger.info("Loading PubChem CIDs for {} ChEMBL activities".format(
        len(chembl_activities)))
    pbar = ProgressBar(widgets=pbar_widgets,
                       maxval=len(chembl_activities)).start()
    ct = 0
    pcid_ct = 0
    notfnd = set()
    dba_err_ct = 0
    for ca in chembl_activities:
        ct += 1
        if ca['cmpd_id_in_src'] not in chembl2pc:
            notfnd.add(ca['cmpd_id_in_src'])
            logger.warn("{} not found".format(ca['cmpd_id_in_src']))
            continue
        pccid = chembl2pc[ca['cmpd_id_in_src']]
        rv = dba.do_update({
            'table': 'cmpd_activity',
            'id': ca['id'],
            'col': 'cmpd_pubchem_cid',
            'val': pccid
        })
        if rv:
            pcid_ct += 1
        else:
            dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} ChEMBL activities processed.".format(ct)
    print "  Inserted {} new PubChem CIDs".format(pcid_ct)
    if len(notfnd) > 0:
        print "  {} ChEMBL IDs not found. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNNING: %d DB errors occurred. See logfile %s for details." % (
            dba_err_ct, logfile)

    drug_activities = dba.get_drug_activities()
    if not args['--quiet']:
        print "\nLoading PubChem CIDs for {} drug activities".format(
            len(drug_activities))
    logger.info("Loading PubChem CIDs for {} drug activities".format(
        len(drug_activities)))
    pbar = ProgressBar(widgets=pbar_widgets,
                       maxval=len(drug_activities)).start()
    ct = 0
    pcid_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    for da in drug_activities:
        ct += 1
        if not da['cmpd_chemblid']:
            skip_ct += 1
            continue
        if da['cmpd_chemblid'] not in chembl2pc:
            notfnd.add(da['cmpd_chemblid'])
            logger.warn("{} not found".format(da['cmpd_chemblid']))
            continue
        pccid = chembl2pc[da['cmpd_chemblid']]
        rv = dba.do_update({
            'table': 'drug_activity',
            'id': da['id'],
            'col': 'cmpd_pubchem_cid',
            'val': pccid
        })
        if rv:
            pcid_ct += 1
        else:
            dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} drug activities processed.".format(ct)
    print "  Inserted {} new PubChem CIDs".format(pcid_ct)
    print "  Skipped {} drug activities with no ChEMBL ID".format(skip_ct)
    if len(notfnd) > 0:
        print "  {} ChEMBL IDs not found. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
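
Unlike the other loaders, this version takes the logger and the input file as parameters, so the caller configures logging first; a minimal sketch, assuming the same docopt-style args dict and a hypothetical local copy of the UniChem ChEMBL-to-PubChem mapping file:

if __name__ == '__main__':
    args = {'--dbhost': 'localhost', '--dbname': 'tcrd',
            '--logfile': None, '--quiet': False}
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    logger.addHandler(logging.FileHandler(args['--logfile'] or LOGFILE))
    load('./chembl_uc_pubchem.tsv', args, logger)  # hypothetical path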
Example #3
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GeneRIF Years',
        'source': 'PubMed records via NCBI E-Utils',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/pubmed'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'generif',
        'column_name': 'years'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    with open(PICKLE_FILE, 'rb') as pf:
        pubmed2date = pickle.load(pf)
    if not args['--quiet']:
        print "\nGot %d PubMed date mappings from file %s" % (len(pubmed2date),
                                                              PICKLE_FILE)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    generifs = dba.get_generifs()
    if not args['--quiet']:
        print "\nProcessing {} GeneRIFs".format(len(generifs))
    logger.info("Processing {} GeneRIFs".format(len(generifs)))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(generifs)).start()
    yrre = re.compile(r'^(\d{4})')
    ct = 0
    yr_ct = 0
    skip_ct = 0
    net_err_ct = 0
    dba_err_ct = 0
    for generif in generifs:
        ct += 1
        logger.debug("Processing GeneRIF: {}".format(generif))
        # GeneRIFs with multiple refs often have duplicates, so fix that
        if "|" in generif['pubmed_ids']:
            pmids = set(generif['pubmed_ids'].split("|"))
            pmids = list(pmids)
            rv = dba.do_update({
                'table': 'generif',
                'id': generif['id'],
                'col': 'pubmed_ids',
                'val': "|".join(pmids)
            })
            if not rv:
                dba_err_ct += 1
        else:
            pmids = [generif['pubmed_ids']]

        years = list()
        for pmid in pmids:
            if pmid in pubmed2date:
                m = yrre.match(pubmed2date[pmid])
                if m:
                    years.append(m.group(1))
                else:
                    years.append('')
            else:
                years.append('')
        # See if we got any years...
        if any(years):  # if so, so do the updates
            rv = dba.do_update({
                'table': 'generif',
                'id': generif['id'],
                'col': 'years',
                'val': "|".join(years)
            })
            if rv:
                yr_ct += 1
            else:
                dba_err_ct += 1
        else:  # if not, skip
            skip_ct += 1
        pbar.update(ct)
    pbar.finish()
    if not args['--quiet']:
        print "{} GeneRIFs processed.".format(ct)
    print "  Updated {} genefifs with years".format(yr_ct)
    print "  Skipped {} generifs with no years.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    if net_err_ct > 0:
        print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(
            net_err_ct, logfile)
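
The year extraction assumes pubmed2date values begin with a four-digit year (e.g. '2014-06-12'); a minimal standalone check of that regex, with hypothetical date strings:

import re
yrre = re.compile(r'^(\d{4})')
for datestr in ['2014-06-12', 'unknown']:
    m = yrre.match(datestr)
    print m.group(1) if m else ''  # prints 2014, then a blank line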
Example #4
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    with open(TISSUE2UBERON_FILE, 'r') as ifh:
        tiss2uid = ast.literal_eval(ifh.read())
    if not args['--quiet']:
        print "\nGot {} tissue to Uberon ID mappings from file {}".format(
            len(tiss2uid), TISSUE2UBERON_FILE)

    exp_ct = dba.get_expression_count(etype=ETYPE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} {} expression rows".format(exp_ct, ETYPE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=exp_ct).start()
    ct = 0
    nouid = set()
    upd_ct = 0
    dba_err_ct = 0
    for exp in dba.get_expressions(etype=ETYPE):
        ct += 1
        uberon_id = None
        if exp['oid']:
            uberon_id = dba.get_uberon_id({'oid': exp['oid']})
        if not uberon_id:
            uberon_id = dba.get_uberon_id({'name': exp['tissue']})
        if not uberon_id and exp['tissue'] in tiss2uid:
            uberon_id = tiss2uid[exp['tissue']]
        if not uberon_id:
            nouid.add(exp['tissue'])
            continue
        rv = dba.do_update({
            'table': 'expression',
            'id': exp['id'],
            'col': 'uberon_id',
            'val': uberon_id
        })
        if rv:
            upd_ct += 1
        else:
            dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    for t in nouid:
        logger.warn("No Uberon ID found for {}".format(t))
    print "{} {} expression rows processed.".format(ct, ETYPE)
    print "  Updated {} with Uberon IDs".format(upd_ct)
    if nouid:
        print "No Uberon ID found for {} tissues. See logfile {} for details.".format(
            len(nouid), logfile)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
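
The Uberon resolution tries three sources in order: the expression row's ontology xref (oid), the tissue name, and finally the manual TISSUE2UBERON_FILE map; a minimal sketch of that fallback pulled out as a helper, assuming the same DBAdaptor methods:

def resolve_uberon_id(dba, exp, tiss2uid):
    # 1) ontology xref, 2) tissue name, 3) manual tissue-to-Uberon map
    uid = dba.get_uberon_id({'oid': exp['oid']}) if exp['oid'] else None
    if not uid:
        uid = dba.get_uberon_id({'name': exp['tissue']})
    if not uid:
        uid = tiss2uid.get(exp['tissue'])
    return uid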