Code example #1
def do_tiga(dba, logger, logfile):
    tigas = dba.get_tigas()
    tigact = len(tigas)
    print(f"\nLoading {tigact} TIGA ExtLinks for TCRD proteins")
    ct = 0
    el_ct = 0
    pmark = {}
    dba_err_ct = 0
    for d in tigas:
        ct += 1
        slmf.update_progress(ct / tigact)
        rv = dba.ins_extlink({
            'source': 'TIGA',
            'protein_id': d['protein_id'],
            'url': TIGA_PAGE_URL.format(d['ensg'])
        })
        if not rv:
            dba_err_ct += 1
            continue
        el_ct += 1
        pmark[d['protein_id']] = True
    print("Inserted {} new TIGA extlink rows for {} TCRD proteins.".format(
        el_ct, len(pmark)))
    if dba_err_ct > 0:
        print(
            f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #2
File: load-TDLs.py  Project: stevemathias/TCRDpy3
def load_tdls(dba, logfile, logger):
  tids = dba.get_target_ids()
  tct = len(tids)
  print(f"\nCalculating/Loading TDLs for {tct} TCRD targets")
  ct = 0
  tdl_cts = {'Tclin': 0, 'Tchem': 0, 'Tbio': 0, 'Tdark': 0}
  bump_ct = 0
  dba_err_ct = 0
  upd_ct = 0
  for tid in tids:
    tinfo = dba.get_target4tdlcalc(tid)
    ct += 1
    slmf.update_progress(ct/tct)
    (tdl, bump_flag) = compute_tdl(tinfo)
    tdl_cts[tdl] += 1
    if bump_flag:
      bump_ct += 1
    rv = dba.do_update({'table': 'target', 'id': tid, 'col': 'tdl', 'val': tdl})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  print(f"{ct} TCRD targets processed.")
  print(f"Set TDL value for {upd_ct} targets:")
  print("  {} targets are Tclin".format(tdl_cts['Tclin']))
  print("  {} targets are Tchem".format(tdl_cts['Tchem']))
  print("  {} targets are Tbio - {} bumped from Tdark".format(tdl_cts['Tbio'], bump_ct))
  print("  {} targets are Tdark".format(tdl_cts['Tdark']))
  if dba_err_ct:
    print(f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
Code example #3
def load_mondo(args, dba, logger, logfile, mondod, cfgd):
    mondo_ct = len(mondod)
    if not args['--quiet']:
        print(f"Loading {mondo_ct} Mondo terms")
    ct = 0
    ins_ct = 0
    dba_err_ct = 0
    for mondoid, md in mondod.items():
        ct += 1
        md['mondoid'] = mondoid
        rv = dba.ins_mondo(md)
        if rv:
            ins_ct += 1
        else:
            dba_err_ct += 1
        slmf.update_progress(ct / mondo_ct)

    # Dataset
    # data-version field in the header of the OBO file has a release version:
    # data-version: releases/2016-03-25
    f = os.popen("head %s%s" % (cfgd['DOWNLOAD_DIR'], cfgd['FILENAME']))
    for line in f:
        if line.startswith("data-version:"):
            ver = line.replace('data-version: ', '')
            break
    f.close()
    dataset_id = dba.ins_dataset({
        'name': 'Mondo',
        'source': 'File %s, version %s' % (cfgd['BASE_URL'] + cfgd['FILENAME'], ver),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://github.com/monarch-initiative/mondo'
    })
    assert dataset_id, f"Error inserting dataset See logfile {logfile} for details."
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'mondo'},
             {'dataset_id': dataset_id, 'table_name': 'mondo_parent'},
             {'dataset_id': dataset_id, 'table_name': 'mondo_xref'}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, f"Error inserting provenance. See logfile {logfile} for details."

    print(f"{ct} terms processed.")
    print(f"  Inserted {ins_ct} new uberon rows")
    if dba_err_ct > 0:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #4
File: load-TDLs.py  Project: stevemathias/TCRDpy3
def export_uniprot_mapping(dba, ofn):
  uptdls = dba.get_uniprots_tdls()
  ct = len(uptdls)
  exp_ct = 0
  print(f"\nExporting UniProts/TDLs for {ct} TCRD targets")
  with open(ofn, 'w') as ofh:
    ofh.write(f"UniProt_accession\tPharos_target\tTDL\n")
    for d in uptdls:
      ofh.write(f"{d['uniprot']}\t{d['uniprot']}\t{d['tdl']}\n")
      exp_ct += 1
      slmf.update_progress(exp_ct/ct)
  print(f"Wrote {exp_ct} lines to file {ofn}")
Code example #5
def load_do(args, dba, logger, logfile, dod, cfgd):
    do_ct = len(dod)
    if not args['--quiet']:
        print(f"Loading {do_ct} Disease Ontology terms")
    ct = 0
    ins_ct = 0
    dba_err_ct = 0
    for doid, d in dod.items():
        ct += 1
        d['doid'] = doid
        rv = dba.ins_do(d)
        if rv:
            ins_ct += 1
        else:
            dba_err_ct += 1
        slmf.update_progress(ct / do_ct)

    # Dataset
    # data-version field in the header of the OBO file has a release version:
    # data-version: releases/2016-03-25
    for line in os.popen("head %s%s" % (cfgd['DOWNLOAD_DIR'], cfgd['FILENAME'])):
        if line.startswith("data-version:"):
            ver = line.replace('data-version: ', '')
            break
    dataset_id = dba.ins_dataset({
        'name': 'Disease Ontology',
        'source': 'File %s, version %s' % (cfgd['BASE_URL'] + cfgd['FILENAME'], ver),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://disease-ontology.org/'
    })
    assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'do'})
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'do_xref'})
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."

    print(f"{ct} terms processed.")
    print(f"  Inserted {ins_ct} new do rows")
    if dba_err_ct > 0:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #6
def load_mouse_rat(args, dba, dataset_id, logger, logfile):
    fn = UP_DOWNLOAD_DIR + UP_RODENT_FILE.replace('.gz', '')
    if not args['--quiet']:
        print(f"\nParsing file {fn}")
    root = objectify.parse(fn).getroot()
    up_ct = len(root.entry)
    if not args['--quiet']:
        print(f"Loading data for {up_ct} UniProt records")
    logger.info(f"Loading data for {up_ct} UniProt records in file {fn}")
    ct = 0
    load_ct = 0
    skip_ct = 0
    xml_err_ct = 0
    dba_err_ct = 0
    for i in range(len(root.entry)):
        ct += 1
        slmf.update_progress(ct / up_ct)
        entry = root.entry[i]
        # filter for mouse and rat records
        for orgname in entry.organism.find(NS + 'name'):
            if orgname.get('type') == 'scientific':
                break
        if orgname not in ['Mus musculus', 'Rattus norvegicus']:
            skip_ct += 1
            logger.debug("Skipping {} entry {}".format(orgname,
                                                       entry.accession))
            continue
        logger.info("Processing entry {}".format(entry.accession))
        nhpinit = entry2nhpinit(entry, dataset_id)
        if not nhpinit:
            xml_err_ct += 1
            logger.error("XML Error for {}".format(entry.accession))
            continue
        nhpid = dba.ins_nhprotein(nhpinit)
        if not nhpid:
            dba_err_ct += 1
            continue
        logger.debug("Nhprotein insert id: {}".format(nhpid))
        load_ct += 1
    print(f"Processed {ct} UniProt records.")
    print(f"  Loaded {load_ct} Mouse and Rat nhproteins")
    if skip_ct > 0:
        print(f"  Skipped {skip_ct} non-Mouse/Rat records")
    if xml_err_ct > 0:
        print(
            f"WARNING: {xml_err_ct} XML parsing errors occurred. See logfile {logfile} for details."
        )
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #7
def do_glygen(dba, logger, logfile):
    proteins = dba.get_proteins()
    pct = len(proteins)
    print(f"\nChecking/Loading GlyGen ExtLinks for {pct} TCRD proteins")
    ct = 0
    el_ct = 0
    notfnd = set()
    api_err_ct = 0
    dba_err_ct = 0
    for p in proteins:
        logger.info(f"Processing protein {p['id']}: {p['uniprot']}")
        ct += 1
        slmf.update_progress(ct / pct)
        ingg = chk_glygen(p['uniprot'])
        if ingg == True:
            rv = dba.ins_extlink({
                'source': 'GlyGen',
                'protein_id': p['id'],
                'url': GLYGEN_PROTEIN_PAGE_URL.format(p['uniprot'])
            })
            if not rv:
                dba_err_ct += 1
                continue
            el_ct += 1
        elif ingg == False:
            logger.warn(f"No GlyGen record for {p['uniprot']}")
            notfnd.add(p['uniprot'])
            continue
        else:
            logger.error(f"Unexpected GlyGen API result for {p['uniprot']}")
            api_err_ct += 1
            continue
    print(f"Processed {ct} TCRD proteins.")
    print(f"Inserted {el_ct} new GlyGen extlink rows.")
    if notfnd:
        print(
            "No GlyGen record found for {} TCRD UniProts. See logfile {} for details."
            .format(len(notfnd), logfile))
    if api_err_ct > 0:
        print(
            f"WARNING: {api_err_ct} unexpected API responses. See logfile {logfile} for details."
        )
    if dba_err_ct > 0:
        print(
            f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #8
def load_mondo(dba, logger, logfile, mondod, cfgd):
  mondo_ct = len(mondod)
  print(f"Loading {mondo_ct} MonDO terms")
  ct = 0
  ins_ct = 0
  dba_err_ct = 0
  for mondoid,md in mondod.items():
    ct += 1
    md['mondoid'] = mondoid
    if 'xrefs' in md:
      for xref in md['xrefs']:
        if 'source' in xref and 'source="MONDO:equivalentTo"' in xref['source']:
          xref['equiv_to'] = 1
        else:
          xref['equiv_to'] = 0
    rv = dba.ins_mondo(md)
    if rv:
      ins_ct += 1
    else:
      dba_err_ct += 1
    slmf.update_progress(ct/mondo_ct)

  # Dataset
  # data-version field in the header of the OBO file has a release version:
  # data-version: releases/2016-03-25
  f = os.popen("head %s%s" % (cfgd['DOWNLOAD_DIR'], cfgd['FILENAME']))
  for line in f:
    if line.startswith("data-version:"):
      ver = line.replace('data-version: ', '')
      break
  f.close()
  dataset_id = dba.ins_dataset( {'name': 'Mondo', 'source': 'Mondo file {}, version {}'.format(cfgd['BASE_URL']+cfgd['FILENAME'], ver), 'app': PROGRAM, 'app_version': __version__, 'url': 'https://mondo.monarchinitiative.org/'} )
  assert dataset_id, f"Error inserting dataset See logfile {logfile} for details."
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'mondo'} ,
            {'dataset_id': dataset_id, 'table_name': 'mondo_parent'},
            {'dataset_id': dataset_id, 'table_name': 'mondo_xref'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  
  print(f"{ct} terms processed.")
  print(f"  Inserted {ins_ct} new mondo rows (w/ associated parents and xrefs)")
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
Code example #9
def parse_mappings(fn):
    line_ct = slmf.wcl(fn)
    print(f"\nProcessing {line_ct} input lines in mapping file {fn}")
    up2chembl = {}
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if row[0].startswith('#'):
                continue
            if row[3] != 'SINGLE PROTEIN':
                continue
            if row[0] in up2chembl:
                up2chembl[row[0]].append(row[1])
            else:
                up2chembl[row[0]] = [row[1]]
    return up2chembl
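
parse_mappings sizes its progress reporting with slmf.wcl(fn), a line-count helper that is not shown on this page. A minimal sketch of such a helper (hypothetical; the real slmf.wcl may differ), together with the shape of the dictionary the function returns:

def wcl(fname):
    # Count the lines in a text file, i.e. the equivalent of "wc -l".
    with open(fname) as fh:
        return sum(1 for _ in fh)

# parse_mappings returns a dict like {'P00533': ['CHEMBL203', ...], ...}:
# each UniProt accession (column 1 of the mapping file) maps to the list of
# ChEMBL ids (column 2) whose target_type is 'SINGLE PROTEIN'.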
Code example #10
def load_human(args, dba, dataset_id, eco_map, logger, logfile):
    fn = UP_DOWNLOAD_DIR + UP_HUMAN_FILE.replace('.gz', '')
    if not args['--quiet']:
        print(f"\nParsing file {fn}")
    root = objectify.parse(fn).getroot()
    up_ct = len(root.entry)
    if not args['--quiet']:
        print(f"Loading data for {up_ct} UniProt records")
    logger.info(f"Loading data for {up_ct} UniProt records in file {fn}")
    ct = 0
    load_ct = 0
    xml_err_ct = 0
    dba_err_ct = 0
    for i in range(len(root.entry)):
        ct += 1
        slmf.update_progress(ct / up_ct)
        entry = root.entry[i]
        logger.info("Processing entry {}".format(entry.accession))
        tinit = entry2tinit(entry, dataset_id, eco_map)
        if not tinit:
            xml_err_ct += 1
            logger.error("XML Error for {}".format(entry.accession))
            continue
        tid = dba.ins_target(tinit)
        if not tid:
            dba_err_ct += 1
            continue
        logger.debug(f"Target insert id: {tid}")
        load_ct += 1
    print(f"Processed {ct} UniProt records.")
    print(f"  Loaded {load_ct} targets/proteins")
    if xml_err_ct > 0:
        print(
            f"WARNING: {xml_err_ct} XML parsing errors occurred. See logfile {logfile} for details."
        )
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #11
File: load-TIN-X.py  Project: stevemathias/TCRDpy3
def load_pubmed(curs, logger, logfile):
  st = time.time()
  fn = INFILES['pubmed']
  line_ct = slmf.wcl(fn)
  print(f'\nLoading TIN-X pubmeds from {fn}...')
  ct = 0
  pm_ct = 0
  dup_ct = 0
  err_ct = 0
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      if ct == 0: # skip header
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      try:
        curs.execute(INS_SQL['pubmed'], tuple(row))
        pm_ct += 1
      except Error as e:
        if f"Duplicate entry '{row[0]}'" in e.msg:
          # this should not happen under "production" runs, but it's here for testing/debugging
          dup_ct += 1
          continue
        else:
          err_ct += 1
          logger.error(f"``{e}`` for line {ct}. Data: {row}")
          continue
  ets = slmf.secs2str(time.time() - st)
  print(f"\n  Processed {ct} lines. Inserted {pm_ct} pubmed rows. Elapsed time: {ets}")
  if err_ct:
    print(f"  WARNING: {err_ct} errors occurred. See logfile {logfile} for details.")
  if dup_ct:
    print(f"  Skipped {dup_ct} existing pubmeds.")
  print("Done.")
Code example #12
def load_DISEASES(dba, logger, logfile):
    # Knowledge channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_K
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Knowledge file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        skip_ct = 0
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                    if not pids:
                        notfnd.add(k)
                        logger.warn(f"No protein found for {k}")
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Knowledge ' + row[4]
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    # Experiment channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_E
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Experiment file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            if row[2].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                    if not pids:
                        notfnd.add(k)
                        logger.warn(f"No protein found for {k}")
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Experiment ' + row[4]
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP or with ENSP did")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    # Text Mining channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_T
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Textmining file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            if float(row[5]) < 3.0:
                # skip rows with confidence < 3.0
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                    if not pids:
                        notfnd.add(k)
                        logger.warn(f"No protein found for {k}")
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Text Mining'
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'zscore': row[4],
                    'conf': row[5]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP or with confidence < 3")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
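
The three channel loops above repeat the same ENSP|sym to protein-id resolution with a k2pids cache and a notfnd set. A hedged refactoring sketch showing how that memoized lookup could be pulled into one helper (the helper name and signature are illustrative, not part of the original loader):

def find_pids(dba, logger, ensp, sym, cache, notfnd):
    # Resolve an ENSP|sym pair to TCRD protein ids, trying stringid first and
    # falling back to gene symbol. Hits and misses are cached so each pair is
    # only looked up once per file.
    k = f"{ensp}|{sym}"
    if k in cache:
        return cache[k]
    if k in notfnd:
        return None
    pids = dba.find_protein_ids({'stringid': ensp}) or dba.find_protein_ids({'sym': sym})
    if not pids:
        notfnd.add(k)
        logger.warn(f"No protein found for {k}")
        return None
    cache[k] = pids
    return pids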
Code example #13
File: load-IDGList.py  Project: stevemathias/TCRDpy3
def load(args, dba, logger, logfile):
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print(f"\nProcessing {line_ct} lines in file {IDG_LIST_FILE}")
    logger.info(f"Processing {line_ct} lines in list file {IDG_LIST_FILE}")
    ct = 0
    idg_ct = 0
    fam_ct = 0
    notfnd = []
    multfnd = []
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'r') as ifh:
        csvreader = csv.reader(ifh)
        for row in csvreader:
            if ct == 0:
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[0]
            fam = row[1]
            if fam == 'IonChannel':
                fam = 'IC'
            tids = dba.find_target_ids({'sym': sym})
            if not tids:
                notfnd.append(sym)
                continue
            if len(tids) > 1:
                multfnd.append(sym)
                continue
            rv = dba.do_update({
                'table': 'target',
                'col': 'idg',
                'id': tids[0],
                'val': 1
            })
            if rv:
                idg_ct += 1
            else:
                dba_err_ct += 1
            rv = dba.do_update({
                'table': 'target',
                'col': 'fam',
                'id': tids[0],
                'val': fam
            })
            if rv:
                fam_ct += 1
            else:
                dba_err_ct += 1
    print(f"{ct} lines processed")
    print(f"{idg_ct} target rows updated with IDG flags")
    print(f"{fam_ct} target rows updated with fams")
    if notfnd:
        print("WARNING: No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("WARNING: Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} database errors occured. See logfile {logfile} for details."
        )
Code example #14
def load(dba, logger, logfile):
    infile = DOWNLOAD_DIR + TIGA_FILE
    line_ct = slmf.wcl(infile)
    print(f"\nProcessing {line_ct} lines in TIGA file {infile}")
    ct = 0
    k2pids = defaultdict(list)  # maps sym|ENSG to TCRD protein_id(s)
    notfnd = set()
    pmark = {}
    tiga_ct = 0
    dba_err_ct = 0
    with open(infile, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            # 0: ensemblId
            # 1: efoId
            # 2: trait
            # 3: n_study
            # 4: n_snp
            # 5: n_snpw
            # 6: geneNtrait
            # 7: geneNstudy
            # 8: traitNgene
            # 9: traitNstudy
            # 10: pvalue_mlog_median
            # 11: pvalue_mlog_max
            # 12: or_median
            # 13: n_beta
            # 14: study_N_mean
            # 15: rcras
            # 16: geneSymbol
            # 17: TDL
            # 18: geneFamily
            # 19: geneIdgList
            # 20: geneName
            # 21: meanRank
            # 22: meanRankScore
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[16]
            ensg = row[0]
            k = sym + '|' + ensg
            pids = []
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                # look it up
                pids = dba.find_protein_ids({'sym': sym})
                if not pids:
                    pids = dba.find_protein_ids_by_xref({
                        'xtype': 'Ensembl',
                        'value': ensg
                    })
                    if not pids:
                        notfnd.add(k)
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each sym/ENSG once
            init = {
                'ensg': ensg,
                'efoid': row[1],
                'trait': row[2],
                'n_study': row[3],
                'n_snp': row[4],
                'n_snpw': row[5],
                'geneNtrait': row[6],
                'geneNstudy': row[7],
                'traitNgene': row[8],
                'traitNstudy': row[9],
                'pvalue_mlog_median': row[10],
                'pvalue_mlog_max': row[11],
                'n_beta': row[13],
                'study_N_mean': row[14],
                'rcras': row[15],
                'meanRank': row[21],
                'meanRankScore': row[22]
            }
            if row[12] != 'NA':
                init['or_median'] = row[12]
            #if row[] != 'NA':
            #  init[''] = row[]
            for pid in pids:
                init['protein_id'] = pid
                rv = dba.ins_tiga(init)
                if not rv:
                    dba_err_ct += 1
                    continue
                tiga_ct += 1
                pmark[pid] = True
    for k in notfnd:
        logger.warn(f"No protein found for {k}")
    print(f"Processed {ct} lines")
    print("  Inserted {} new tiga rows for {} proteins".format(
        tiga_ct, len(pmark)))
    if notfnd:
        print("No target found for {} sym/ENSGs. See logfile {} for details.".
              format(len(notfnd), logfile))
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    infile = DOWNLOAD_DIR + TIGA_PROV_FILE
    line_ct = slmf.wcl(infile)
    print(f"\nProcessing {line_ct} lines in TIGA provenance file {infile}")
    ct = 0
    tigaprov_ct = 0
    dba_err_ct = 0
    with open(infile, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            # 0: ensemblId
            # 1: TRAIT_URI
            # 2: STUDY_ACCESSION
            # 3: PUBMEDID
            # 4: efoId
            ct += 1
            slmf.update_progress(ct / line_ct)
            rv = dba.ins_tiga_provenance({
                'ensg': row[0],
                'efoid': row[4],
                'study_acc': row[2],
                'pubmedid': row[3]
            })
            if not rv:
                dba_err_ct += 1
                continue
            tigaprov_ct += 1
    print(f"Processed {ct} lines")
    print(f"  Inserted {tigaprov_ct} new tiga_provenance rows")
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
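
The TIGA loader above addresses columns by position, which is why it carries the long numbered comment block. A hedged alternative sketch using csv.DictReader keyed on the header row (column names taken from the comments above; the actual file layout should be confirmed against the download):

import csv

def read_tiga_rows(infile):
    # Yield TIGA association rows as dicts keyed by the header names, so that
    # fields can be read as row['geneSymbol'], row['ensemblId'], row['efoId'], etc.
    with open(infile) as ifh:
        for row in csv.DictReader(ifh, delimiter='\t'):
            yield row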
Code example #15
def load(args, dba, logger, logfile):
  fn = DOWNLOAD_DIR + GENO_PHENO_FILE.replace('.gz', '')
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in input file {fn}")
  ct = 0
  pt_ct = 0
  pmark = {}
  sym2nhpids = {}
  notfnd = set()
  skip_ct = 0
  dba_err_ct = 0
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      # 0: marker_accession_id
      # 1: marker_symbol
      # 2: phenotyping_center
      # 3: colony_id
      # 4: sex
      # 5: zygosity
      # 6: allele_accession_id
      # 7: allele_symbol
      # 8: allele_name
      # 9: strain_accession_id
      # 10: strain_name
      # 11: project_name
      # 12: project_fullname
      # 13: pipeline_name
      # 14: pipeline_stable_id
      # 15: procedure_stable_id
      # 16: procedure_name
      # 17: parameter_stable_id
      # 18: parameter_name
      # 19: top_level_mp_term_id
      # 20: top_level_mp_term_name
      # 21: mp_term_id
      # 22: mp_term_name
      # 23: p_value
      # 24: percentage_change
      # 25: effect_size
      # 26: statistical_method
      # 27: resource_name
      if ct == 0:
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[1]
      if not row[21] and not row[22]:
        # skip data with neither a term_id nor a term_name
        skip_ct += 1
        continue
      if sym in sym2nhpids:
        # we've already found it
        nhpids = sym2nhpids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhpids = dba.find_nhprotein_ids({'sym': sym}, species = 'Mus musculus')
        if not nhpids:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        sym2nhpids[sym] = nhpids # save this mapping so we only lookup each symbol once
      pval = None
      if row[23] and row[23] != '':
        try:
          pval = float(row[23])
        except:
          logger.warn("Problem converting p_value {} for row {}".format(row[23], ct))
      sex = None
      if row[4] and len(row[4]) <= 8:
        sex = row[4]
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC', 'top_level_term_id': row[19], 'top_level_term_name': row[20], 'term_id': row[21], 'term_name': row[22], 'p_value': pval, 'percentage_change': row[24], 'effect_size': row[25], 'procedure_name': row[16], 'parameter_name': row[18], 'statistical_method': row[26], 'sex': sex, 'gp_assoc': 1})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
  if notfnd:
    print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if skip_ct > 0:
    print(f"Skipped {skip_ct} lines with no term_id or term_name.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  fn = DOWNLOAD_DIR + STAT_RES_FILE.replace('.gz', '')
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines from input file {fn}")
  ct = 0
  pt_ct = 0
  pmark = {}
  sym2nhpids = {}
  notfnd = set()
  skip_ct = 0
  pv_ct = 0
  dba_err_ct = 0
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      # 0: phenotyping_center
      # 1: intercept_estimate
      # 2: procedure_id
      # 3: mutant_biological_model_id
      # 4: rotated_residuals_test
      # 5: weight_effect_p_value
      # 6: male_mutant_count
      # 7: pipeline_stable_key
      # 8: female_ko_effect_p_value
      # 9: pipeline_stable_id
      # 10: parameter_stable_key
      # 11: data_type
      # 12: parameter_stable_id
      # 13: interaction_significant
      # 14: strain_accession_id
      # 15: control_selection_method
      # 16: parameter_name
      # 17: allele_name
      # 18: phenotyping_center_id
      # 19: weight_effect_stderr_estimate
      # 20: weight_effect_parameter_estimate
      # 21: procedure_stable_id
      # 22: status
      # 23: sex_effect_parameter_estimate
      # 24: female_ko_effect_stderr_estimate
      # 25: female_percentage_change
      # 26: group_2_residuals_normality_test
      # 27: marker_accession_id
      # 28: mp_term_name
      # 29: group_1_residuals_normality_test
      # 30: genotype_effect_p_value
      # 31: dependent_variable
      # 32: resource_name
      # 33: project_id
      # 34: procedure_name
      # 35: doc_id
      # 36: top_level_mp_term_id
      # 37: allele_accession_id
      # 38: blups_test
      # 39: null_test_p_value
      # 40: p_value
      # 41: marker_symbol
      # 42: control_biological_model_id
      # 43: pipeline_name
      # 44: sex
      # 45: interaction_effect_p_value
      # 46: colony_id
      # 47: project_name
      # 48: female_ko_parameter_estimate
      # 49: female_mutant_count
      # 50: organisation_id
      # 51: external_db_id
      # 52: female_control_count
      # 53: intermediate_mp_term_id
      # 54: db_id
      # 55: male_ko_effect_p_value
      # 56: top_level_mp_term_name
      # 57: metadata_group
      # 58: sex_effect_stderr_estimate
      # 59: zygosity
      # 60: male_percentage_change
      # 61: sex_effect_p_value
      # 62: mp_term_id
      # 63: male_ko_effect_stderr_estimate
      # 64: additional_information
      # 65: statistical_method
      # 66: _version_
      # 67: intercept_estimate_stderr_estimate
      # 68: male_control_count
      # 69: intermediate_mp_term_name
      # 70: strain_name
      # 71: classification_tag
      # 72: effect_size
      # 73: procedure_stable_key
      # 74: allele_symbol
      # 75: resource_id
      # 76: group_2_genotype
      # 77: variance_significant
      # 78: pipeline_id
      # 79: group_1_genotype
      # 80: male_ko_parameter_estimate
      # 81: genotype_effect_parameter_estimate
      # 82: categories
      # 83: parameter_id
      # 84: batch_significant
      # 85: genotype_effect_stderr_estimate
      # 86: resource_fullname
      if ct == 0:
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[41]
      if not row[62] and not row[28]:
        # skip lines with neither a term_id nor a term_name
        skip_ct += 1
        continue
      if sym in sym2nhpids:
        # we've already found it
        nhpids = sym2nhpids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhpids = dba.find_nhprotein_ids({'sym': sym}, species = 'Mus musculus')
        if not nhpids:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        sym2nhpids[sym] = nhpids # save this mapping so we only lookup each symbol once
      pval = None
      if row[40] and row[40] != '':
        try:
          pval = float(row[40])
        except:
          logger.warn("Problem converting p_value {} for row {}".format(row[40], ct))
      sex = None
      if row[4] and len(row[4]) <= 8:
        sex = row[4]
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC', 'top_level_term_id': row[36], 'top_level_term_name': row[56], 'term_id': row[62], 'term_name': row[28], 'p_value': pval, 'effect_size': row[72], 'procedure_name': row[34], 'parameter_name': row[16], 'statistical_method': row[65], 'sex': sex, 'gp_assoc': 0})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
  if notfnd:
    print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if skip_ct > 0:
    print(f"Skipped {skip_ct} lines with no term_id or term_name.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
Code example #16
File: tin-x.py  Project: stevemathias/TCRDpy3
def tinx(args, dba, logger, logfile):
  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {} # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {} # PMID => count of diseases mentioned in a given paper 
  pmid_protein_ct = {} # PMID => count of proteins mentioned in a given paper 

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  fn = JL_DOWNLOAD_DIR+PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in protein file {fn}")
  with open(fn, 'r') as tsvf:
    #pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      tids = dba.find_target_ids({'stringid': ensp})
      if not tids:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        tids = dba.find_target_ids_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not tids:
        notfnd.add(ensp)
        continue
      for tid in tids:
        t = dba.get_target(tid, annot=False)
        p = t['components']['protein'][0]
        k = "{},{}".format(p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  for ensp in notfnd:
    logger.warn(f"No target found for {ensp}")
  print(f"\n{ct} lines processed")
  print(f"  Skipped {skip_ct} non-ENSP lines")
  print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
  print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
  if notfnd:
    print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

  fn = JL_DOWNLOAD_DIR+DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warn(f"{doid} not found in DO")
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  print(f"\n{ct} lines processed.")
  print(f"  Skipped {skip_ct} non-DOID lines")
  print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
  print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
  if notfnd:
    print("WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))

  if not args['--quiet']:
    print("\nComputing protein novely scores")
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write( "%s,%.8f\n" % (k, novelty) )
  print(f"  Wrote {ct} novelty scores to file {PROTEIN_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing disease novely scores")
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write( "%s,%.8f\n" % (doid, novelty) )
  print(f"  Wrote {ct} novelty scores to file {DISEASE_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing importance scores")
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'w') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k,ppmids in pid2pmids.items():
      for doid,dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / ( pmid_protein_ct[pmid] * pmid_disease_ct[pmid] )
        if fdt_score_sum > 0:
          ct += 1
          impf.write( "%s,%s,%.8f\n" % (doid, k, fdt_score_sum) )
  print(f"  Wrote {ct} importance scores to file {IMPORTANCE_FILE}")

  if not args['--quiet']:
    print("\nComputing PubMed rankings")
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'w') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k,ppmids in pid2pmids.items():
      for doid,dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = [] # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append( (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]) )
        if len(scores) > 0:
          scores.sort(key = cmp_to_key(cmp_pmids_scores))
          for i,t in enumerate(scores):
            ct += 1
            pmrf.write( "%s,%s,%d,%d\n" % (doid, k, t[0], i) )
  print(f"  Wrote {ct} PubMed rankings to file {PMID_RANKING_FILE}")
Code example #17
def load(args, dba, logger, logfile):
    if not args['--quiet']:
        print("\nGetting target resource data from RSS...")
    target_data = get_target_data()
    assert target_data, "Error getting target data: FATAL"
    rss_ct = len(target_data)
    ct = 0
    skip_ct = 0
    res_ct = 0
    tmark = set()
    notfnd = set()
    mulfnd = set()
    dba_err_ct = 0
    if not args['--quiet']:
        print(f"Processing {rss_ct} target resource records...")
    for td in target_data:
        logger.info("Processing target resource data: {}".format(td))
        ct += 1
        slmf.update_progress(ct / rss_ct)
        if not td['pharosReady']:
            skip_ct += 1
            continue
        sym = td['target']
        #rssid = td['id'].rsplit('/')[-1]
        rssid = td['id']
        resource_data = get_resource_data(td['id'])
        dbjson = json.dumps(resource_data['data'][0]['resource'])
        tids = dba.find_target_ids({'sym': sym})
        if not tids:
            tids = dba.find_target_ids({'sym': sym}, incl_alias=True)
            if not tids:
                notfnd.add(sym)
                logger.warn("No target found for {}".format(sym))
                continue
        if len(tids) > 1:
            mulfnd.add(sym)
            logger.warn("Multiple targets found for {}".format(sym))
        tid = tids[0]
        rv = dba.ins_drgc_resource({
            'rssid': rssid,
            'resource_type': td['resourceType'],
            'target_id': tid,
            'json': dbjson
        })
        if not rv:
            dba_err_ct += 1
            continue
        tmark.add(tid)
        res_ct += 1
    print(f"{ct} RSS target resource records processed.")
    print(f"  Skipped {skip_ct} non-pharosReady resources.")
    print("Inserted {} new drgc_resource rows for {} targets".format(
        res_ct, len(tmark)))
    if notfnd:
        print(
            "WARNING: No target found for {} symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if mulfnd:
        print(
            "WARNING: Multiple targets found for {} symbols. See logfile {} for details."
            .format(len(mulfnd), logfile))
    if dba_err_ct > 0:
        print(
            f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Code example #18
def load_pmscores(dba, logger, logfile):
    ensp2pids = {}  # ENSP => list of TCRD protein ids
    pmscores = {}  # protein.id => sum(all scores)
    pms_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    infile = JL_DOWNLOAD_DIR + PM_SCORES_FILE
    line_ct = slmf.wcl(infile)
    print(f"Processing {line_ct} lines in file {infile}")
    with open(infile, 'r') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            # sym  year  score
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            if ensp in ensp2pids:
                # we've already found it
                pids = ensp2pids[ensp]
            elif ensp in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                    if not pids:
                        notfnd.add(ensp)
                        logger.warn("No protein found for {}".format(ensp))
                        continue
                ensp2pids[ensp] = pids  # save this mapping so we only lookup each ENSP once
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                if pid in pmscores:
                    pmscores[pid] += float(row[2])
                else:
                    pmscores[pid] = float(row[2])
    print(f"{ct} input lines processed.")
    print("  Inserted {} new pmscore rows for {} proteins".format(
        pms_ct, len(pmscores)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP")
    if notfnd:
        print(
            "  No protein found for {} STRING IDs. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    print("Updating {} JensenLab PubMed Scores...".format(len(pmscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.upd_pms_tdlinfo(pid, score)
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print(f"  Updated {ti_ct} 'JensenLab PubMed Score' tdl_info rows")
    if dba_err_ct:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
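
A brief design note on the pmscores accumulation in the first loop above: the if/else around pmscores[pid] can be collapsed with collections.defaultdict, which other loaders on this page (e.g. the TIGA loader's k2pids) already use:

from collections import defaultdict

pmscores = defaultdict(float)   # protein.id => sum(all scores)
# inside the row loop, the if/else then collapses to:
# pmscores[pid] += float(row[2])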
Code example #19
def load_tinx(args, dba, do, logger, logfile):
  fn = f"{TINX_OUTDIR}ProteinNovelty.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # Protein ID,UniProt,Novelty
    ct = 1
    tn_ct = 0
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      pid = row[0]
      rv = dba.ins_tinx_novelty( {'protein_id': pid, 'score': float(row[2])} )
      if rv:
        tn_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {tnct} new tinx_novelty rows".)
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")
  
  dmap = {}
  fn = f"{TINX_OUTDIR}DiseaseNovelty.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Novelty
    ct = 1
    dct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      doid = row[0]
      if doid in do:
        if 'name' in do[doid]:
          dname = do[doid]['name'][0].value
        else:
          continue
        if 'def' in do[doid]:
          ddef = do[doid]['def'][0].value
        else:
          ddef = None
      else:
        logger.warn("{row[0]} not in DO map")
        notfnd.append(row[0])
        continue
      rv = dba.ins_tinx_disease( {'doid': doid, 'name': dname, 
                                  'summary': ddef, 'score': float(row[1])} )
      if rv:
        dct += 1
        dmap[doid] = rv # map DOID to tinx_disease.id
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {dct} new tinx_disease rows".)
  print("  Saved {} keys in dmap".format(len(dmap)))
  if notfnd:
    print("WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")

  imap = {}
  fn = f"{TINX_OUTDIR}Importance.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Protein ID,UniProt,Score
    ct = 1
    ti_ct = 0
    skips1 = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if row[0] not in dmap:
        logger.warn("{row[0]} not in dmap")
        skips1.add(row[0])
        continue
      did = dmap[row[0]]
      pid = row[1]
      rv = dba.ins_tinx_importance( {'protein_id': pid, 'disease_id': did,
                                     'score': float(row[3])} )
      if rv:
        ti_ct += 1
        # map DOID|PID to tinx_importance.id
        k = f"{row[0]}|{row[1]}"
        imap[k] = rv 
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {ti_ct} new tinx_importance rows".)
  print("  Saved {} keys in imap".format(len(imap)))
  if len(skips1) > 0:
    print("WARNNING: No disease found in dmap for {} DOIDs. See logfile {} for details.".format(len(skips1), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")

  fn = f"{TINX_OUTDIR}PMIDRanking.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  regex = re.compile(r"^DOID:0*")
  with open(fn, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Protein ID,UniProt,PubMed ID,Rank
    ct = 1
    tar_ct = 0
    skips = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      k = "%s|%s"%(row[0],row[1])
      if k not in imap:
        logger.warn("{k} not in imap")
        skips.add(k)
        continue
      iid = imap[k]
      rv = dba.ins_tinx_articlerank( {'importance_id': iid, 'pmid': row[3], 'rank': row[4]} )
      if rv:
        tar_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {tar_ct} new tinx_articlerank rows".)
  if len(skips) > 0:
    print("WARNNING: No importance found in imap for {} keys. See logfile {} for details.".format(len(skips), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")
Code example #20
def load(args, dba, up2chembl, chembldb, logfile, logger):
    upct = len(up2chembl)
    if not args['--quiet']:
        print(f"\nProcessing {upct} UniProt accessions in up2chembl")
    ct = 0
    dba_err_ct = 0
    notfnd = set()
    nic_ct = 0
    nga_ct = 0
    tdl_ct = 0
    ca_ct = 0
    cyti_ct = 0
    csti_ct = 0
    t2acts = {}
    c2acts = {}
    for up in up2chembl.keys():
        ct += 1
        slmf.update_progress(ct / upct)
        tids = dba.find_target_ids({'uniprot': up}, incl_alias=True)
        if not tids:
            notfnd.add(up)
            logger.warn(f"No TCRD target found for UniProt {up}")
            continue
        tid = tids[0]
        tp = dba.get_targetprotein(tid)
        logger.info(f"Processing ChEMBL data for UniProt {up}: target {tid}")
        chembl_acts = []
        for ctid in up2chembl[up]:
            # Query 1
            with closing(chembldb.cursor(dictionary=True)) as curs:
                curs.execute(SQLq1, (ctid, ))
                for d in curs:
                    if d['year']:
                        d['reference'] = "{}, ({}) {}:{}:{}".format(
                            d['journal'], d['year'], d['volume'], d['issue'],
                            d['first_page'])
                    else:
                        d['reference'] = "{}, {}:{}:{}".format(
                            d['journal'], d['volume'], d['issue'],
                            d['first_page'])
                    for k in ['journal', 'volume', 'issue', 'first_page']:
                        del (d[k])
                    chembl_acts.append(d)
            # Query 2
            with closing(chembldb.cursor(dictionary=True)) as curs:
                curs.execute(SQLq2, (ctid, ))
                for d in curs:
                    d['reference'] = None
                    chembl_acts.append(d)
        if tp['fam'] in CUTOFFS:
            cutoff = CUTOFFS[tp['fam']]
        else:
            cutoff = 6.0  # 1uM for other families
        logger.info(f"Filter cutoff for {up} (target id {tid}) is {cutoff}")
        filtered_acts = [
            a for a in chembl_acts if a['pchembl_value'] >= cutoff
        ]
        logger.info("{} ChEMBL acts => {} filtered acts".format(
            len(chembl_acts), len(filtered_acts)))
        if not filtered_acts:
            nga_ct += 1
            continue

        #
        # if we get here, the target has qualifying activites (and is thus Tchem)
        #
        # sort filtered activities by pchembl_value (descending), so that the
        # activity with the largest will be sorted_by_pchembl_value[0]
        sorted_by_pchembl_value = sorted(filtered_acts,
                                         key=itemgetter('pchembl_value'),
                                         reverse=True)

        # load TCRD cmpd_activities
        # The most potent activity value for a given target will be this one:
        # MIN(cmpd_activity.id) WHERE catype = 'ChEMBL' AND target_id = 3000
        for a in sorted_by_pchembl_value:
            if 'pubmed_id' in a:
                pmid = a['pubmed_id']
            else:
                pmid = None
            try:
                rv = dba.ins_cmpd_activity({
                    'target_id':
                    tid,
                    'catype':
                    'ChEMBL',
                    'cmpd_id_in_src':
                    a['chembl_id'],
                    'cmpd_name_in_src':
                    a['compound_name'],
                    'smiles':
                    a['canonical_smiles'],
                    'reference':
                    a['reference'],
                    'act_value':
                    a['pchembl_value'],
                    'act_type':
                    a['standard_type'],
                    'pubmed_ids':
                    pmid
                })
            except Exception:
                # some names have weird hex characters and cause errors, so replace w/ ?
                rv = dba.ins_cmpd_activity({
                    'target_id': tid,
                    'catype': 'ChEMBL',
                    'cmpd_id_in_src': a['chembl_id'],
                    'cmpd_name_in_src': '?',
                    'smiles': a['canonical_smiles'],
                    'reference': a['reference'],
                    'act_value': a['pchembl_value'],
                    'act_type': a['standard_type'],
                    'pubmed_ids': pmid
                })
            if rv:
                ca_ct += 1
            else:
                dba_err_ct += 1

        # Save First ChEMBL Reference Year tdl_info, if there is one
        yrs = [a['year'] for a in filtered_acts if 'year' in a and a['year']]
        if len(yrs) > 0:
            first_year = min(yrs)
            rv = dba.ins_tdl_info({
                'target_id': tid,
                'itype': 'ChEMBL First Reference Year',
                'integer_value': first_year
            })
            if rv:
                cyti_ct += 1
            else:
                dba_err_ct += 1

        # Save mappings for selective compound calculations
        t2acts[tid] = copy.copy(sorted_by_pchembl_value)
        for a in chembl_acts:
            ac = copy.copy(a)
            smi = ac['canonical_smiles']
            del (ac['canonical_smiles'])
            ac['tid'] = tid
            ac['tname'] = tp['name']
            if smi in c2acts:
                c2acts[smi].append(ac)
            else:
                c2acts[smi] = [ac]
    print(f"{ct} UniProt accessions processed.")
    if notfnd:
        print(
            "  No TCRD target found for {} UniProt accessions. See logfile {} for details."
            .format(len(notfnd), logfile))
    if nic_ct > 0:
        print(f"  {nic_ct} targets not found in ChEMBL")
    print(f"  {nga_ct} targets have no qualifying activities in ChEMBL")
    print(f"Inserted {ca_ct} new cmpd_activity rows")
    print(
        f"Inserted {cyti_ct} new 'ChEMBL First Reference Year' tdl_info rows")
    if dba_err_ct > 0:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    # Selective compound calculations
    if not args['--quiet']:
        print("\nRunning selective compound analysis...")
    c2macts = {}
    for c, acts in c2acts.items():
        if len(acts) > 1:
            c2macts[c] = list(acts)
    # then sort the activity lists by pchembl_value
    c2smacts = {}
    for c, acts in c2macts.items():
        c2smacts[c] = sorted(acts, key=itemgetter('pchembl_value'))
    selective = []
    for smi in c2smacts.keys():
        i = 1
        while i <= len(c2smacts[smi]) - 1:
            if c2smacts[smi][i]['tid'] == c2smacts[smi][i - 1]['tid']:
                i += 1
                continue
            diff = c2smacts[smi][i]['pchembl_value'] - c2smacts[smi][
                i - 1]['pchembl_value']
            if diff >= 2:
                selective.append(smi)
                break
            i += 1
    if not args['--quiet']:
        print("  Found {} selective compounds".format(len(selective)))
    cscti_ct = 0
    dba_err_ct = 0
    for tid, acts in t2acts.items():
        for a in acts:
            if a['canonical_smiles'] in selective:
                # Save ChEMBL Selective Compound tdl_info
                val = "{}|{}".format(a['chembl_id'], a['canonical_smiles'])
                rv = dba.ins_tdl_info({
                    'target_id': tid,
                    'itype': 'ChEMBL Selective Compound',
                    'string_value': val
                })
                if rv:
                    cscti_ct += 1
                else:
                    dba_err_ct += 1
                break
    if not args['--quiet']:
        print(
            f"Inserted {cscti_ct} new 'ChEMBL Selective Compound' tdl_info rows"
        )
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
コード例 #21
0
def load(args, dba, dataset_id, logger, logfile):
  line_ct = slmf.wcl(HGNC_TSV_FILE)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {HGNC_TSV_FILE}")
  ct = 0
  hgnc_ct = 0
  mgi_ct = 0
  chr_ct = 0
  sym_ct = 0
  symdiscr_ct = 0
  geneid_ct = 0
  geneiddiscr_ct = 0
  notfnd = set()
  pmark = {}
  db_err_ct = 0
  with open(HGNC_TSV_FILE, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      # 0: HGNC ID
      # 1: Approved symbol
      # 2: Approved name
      # 3: Status
      # 4: Chromosome
      # 5: Mouse genome database ID
      # 6: NCBI Gene ID
      # 7: UniProt ID
      if ct == 0:
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[1]
      if row[6] != '':
        geneid = int(row[6])
      else:
        geneid = None
      if row[7] != '':
        up = row[7]
      else:
        up = None
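      # Match proteins by approved symbol first, then NCBI Gene ID, then
      # UniProt accession; misses are only reported when a UniProt is present.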
      pids = dba.find_protein_ids({'sym': sym})
      if not pids and geneid:
        pids = dba.find_protein_ids({'geneid': geneid})
      if not pids and up:
        pids = dba.find_protein_ids({'uniprot': up})
      if up and not pids:
        notfnd.add(f"{sym}|{geneid}|{up}")
        logger.warn(f"No protein found for {sym}|{geneid}|{up}")
        continue
      for pid in pids:
        # HGNC xref
        hgncid = row[0].replace('HGNC:', '')
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'HGNC ID',
                           'dataset_id': dataset_id, 'value': hgncid})
        if rv:
          hgnc_ct += 1
        else:
          db_err_ct += 1
        # MGI xref
        if row[5] != '':
          mgiid = row[5].replace('MGI:', '')
          rv = dba.ins_xref({'protein_id': pid, 'xtype': 'MGI ID',
                             'dataset_id': dataset_id, 'value': mgiid})
          if rv:
            mgi_ct += 1
          else:
            db_err_ct += 1
        # Add protein.chr values
        rv = dba.do_update({'table': 'protein', 'col': 'chr', 'id': pid, 'val': row[4]})
        if rv:
          chr_ct += 1
        else:
          db_err_ct += 1
        p = dba.get_protein(pid)
        # Add missing syms
        if p['sym'] is None:
          rv = dba.do_update({'table': 'protein', 'col': 'sym', 'id': pid, 'val': sym})
          if rv:
            logger.info("Inserted new sym {} for protein {}|{}".format(sym, pid, p['uniprot']))
            sym_ct += 1
          else:
            db_err_ct += 1
        else:
          # Check for symbol discrepancies
          if p['sym'] != sym:
            logger.warn("Symbol discrepancy: UniProt's=%s, HGNC's=%s" % (p['sym'], sym))
            symdiscr_ct += 1
        if geneid:
          # Add missing geneids
          if p['geneid'] is None:
            rv = dba.do_update({'table': 'protein', 'col': 'geneid', 'id': pid, 'val': geneid})
            if rv:
              logger.info("Inserted new geneid {} for protein {}, {}".format(geneid, pid, p['uniprot']))
              geneid_ct += 1
            else:
              db_err_ct += 1
          else:
            # Check for geneid discrepancies
            if p['geneid'] != geneid:
              logger.warn("GeneID discrepancy: UniProt's={}, HGNC's={}".format(p['geneid'], geneid))
              geneiddiscr_ct += 1
        pmark[pid] = True
  print("Processed {} lines - {} proteins annotated.".format(ct, len(pmark)))
  if notfnd:
    print("No protein found for {} lines (with UniProts).".format(len(notfnd)))
  print(f"  Updated {chr_ct} protein.chr values.")
  print(f"  Inserted {hgnc_ct} HGNC ID xrefs")
  print(f"  Inserted {mgi_ct} MGI ID xrefs")
  if sym_ct > 0:
    print(f"  Inserted {sym_ct} new HGNC symbols")
  if symdiscr_ct > 0:
    print(f"WARNING: Found {symdiscr_ct} discrepant HGNC symbols. See logfile {logfile} for details")
  if geneid_ct > 0:
    print(f"  Inserted {geneid_ct} new NCBI Gene IDs")
  if geneiddiscr_ct > 0:
    print(f"WARNING: Found {geneiddiscr_ct} discrepant NCBI Gene IDs. See logfile {logfile} for details")
  if db_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")