예제 #1
0
def load(args):
    """Load NCBI GI xrefs from the UniProt ID Mapping file into TCRD.

    Configures file logging, connects to the database, registers the dataset
    and its provenance, then inserts one ``xref`` row (xtype ``NCBI GI``) for
    every GI listed against a UniProt accession that maps to a TCRD target.

    Args:
        args: Option mapping providing ``--loglevel``, ``--logfile``,
            ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.

    Exits the process with status 1 if the dataset or provenance row cannot
    be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI GI Numbers',
        'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.uniprot.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    # The loader reads the decompressed copy of the downloaded file.
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    # 1. UniProtKB-AC
    # 2. UniProtKB-ID
    # 3. GeneID (EntrezGene)
    # 4. RefSeq
    # 5. GI
    # 6. PDB
    # 7. GO
    # 8. UniRef100
    # 9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print("\nProcessing {} rows in file {}".format(line_ct, infile))
    ct = 0
    tmark = {}  # ids of targets that received at least one row
    xref_ct = 0
    nogi_ct = 0
    notfnd_ct = 0
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for line in tsv:
            ct += 1
            # Advance the bar up front so skipped rows still show as progress.
            pbar.update(ct)
            data = line.split('\t')
            up = data[0]
            if not data[4]:  # no gi
                nogi_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                # Counted separately: the row has GIs but no matching target.
                notfnd_ct += 1
                continue
            target = targets[0]
            tmark[target['id']] = True
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'NCBI GI',
                    'dataset_id': dataset_id,
                    'value': gi
                })
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("\n{} rows processed".format(ct))
    print("  Inserted {} new GI xref rows for {} targets".format(
        xref_ct, len(tmark)))
    print("  Skipped {} rows with no GI".format(nogi_ct))
    print("  Skipped {} rows with no TCRD target".format(notfnd_ct))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #2
0
def load(args):
    """Load AnimalTFDB transcription-factor flags into TCRD ``tdl_info``.

    Configures file logging, connects to the database, registers the dataset
    and its provenance, then inserts an ``Is Transcription Factor`` tdl_info
    row for the first target matched by each input row. Rows are matched by
    ``row[3]`` (symbol), then ``row[2]`` (gene id), then ``row[1]`` (Ensembl
    xref). A per-TDL tally of the matched targets is printed at the end.

    Args:
        args: Option mapping providing ``--loglevel``, ``--logfile``,
            ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.

    Raises:
        AssertionError: If the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    # The original tested a bare, undefined name `debug` here (NameError).
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'AnimalTFDB',
        'source':
        'http://www.bioguo.org/AnimalTFDB/BrowseAllTF.php?spe=Homo_sapiens',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.bioguo.org/AnimalTFDB/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Is Transcription Factor'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in input file {}\n".format(
            line_ct, INFILE))
    ct = 0
    ti_ct = 0
    notfnd = []
    dba_err_ct = 0
    with open(INFILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            # Advance the bar up front so unmatched rows still show progress.
            pbar.update(ct)
            # Try progressively weaker identifiers until one matches.
            targets = dba.find_targets({'sym': row[3]})
            if not targets:
                targets = dba.find_targets({'geneid': row[2]})
            if not targets:
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': row[1]
                })
            if not targets:
                notfnd.append(row)
                continue
            t = targets[0]
            TDLs[t['tdl']] += 1
            pid = t['components']['protein'][0]['id']
            rv = dba.ins_tdl_info({
                'protein_id': pid,
                'itype': 'Is Transcription Factor',
                'boolean_value': 1
            })
            if rv:
                ti_ct += 1
            else:
                dba_err_ct += 1
    pbar.finish()
    # Record unmatched rows in the logfile; previously they were only counted.
    for row in notfnd:
        logger.warning("No target found for row {}".format(row))
    print("\n{} lines processed.".format(ct))
    print("  Inserted {} new Is Transcription Factor tdl_infos".format(ti_ct))
    if notfnd:
        print("No target found for {} rows. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: %d DB errors occurred. See logfile %s for details." % (
            dba_err_ct, logfile))
    for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
        print("{}: {}".format(tdl, TDLs[tdl]))
예제 #3
0
def load(args):
    """Load PubTator text-mining scores into TCRD.

    Configures file logging, connects to the database, registers the dataset
    and its provenance, then:

    1. inserts one ``ptscore`` row per (protein, year, score) read from the
       input file, resolving each NCBI Gene ID to its protein id(s); and
    2. inserts one ``PubTator Score`` ``tdl_info`` row per protein holding the
       sum of all that protein's scores.

    Args:
        args: Option mapping providing ``--loglevel``, ``--logfile``,
            ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.

    Raises:
        AssertionError: If the dataset or a provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'PubTator Text-mining Scores',
        # Parenthesised so the intended grouping is explicit; the resulting
        # string is the same as before.
        'source':
        'File %s' % (BASE_URL + FILENAME),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/',
        'comments':
        'PubTator data was subjected to the same counting scheme used to generate JensenLab PubMed Scores.'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'ptscore'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        # Must name the itype actually written by ins_tdl_info() below; the
        # clause previously said 'PubTator PubMed Score', which matches no
        # row inserted by this loader.
        'where_clause': "itype = 'PubTator Score'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ptscores = {}  # protein.id => sum(all scores)
    pts_ct = 0
    dba_err_ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, infile))
    ct = 0
    geneid2pid = {}  # gene id => list of protein ids (lookup cache)
    notfnd = set()  # gene ids already known to have no target
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in csv.reader(tsv, delimiter='\t'):
            # NCBI Gene ID  year  score
            ct += 1
            pbar.update(ct)
            # The first column may hold several ids separated by ',' or ';'.
            for geneid in row[0].replace(',', ';').split(';'):
                if not geneid or '(tax:' in geneid:
                    continue
                if geneid in notfnd:
                    # we've already not found it
                    continue
                pids = geneid2pid.get(geneid)
                if pids is None:
                    targets = dba.find_targets({'geneid': geneid})
                    if not targets:
                        notfnd.add(geneid)
                        logger.warning("No target found for {}".format(geneid))
                        continue
                    pids = [
                        t['components']['protein'][0]['id'] for t in targets
                    ]
                    # save this mapping so we only lookup each target once
                    geneid2pid[geneid] = pids
                for pid in pids:
                    rv = dba.ins_ptscore({
                        'protein_id': pid,
                        'year': row[1],
                        'score': row[2]
                    })
                    if rv:
                        pts_ct += 1
                    else:
                        dba_err_ct += 1
                    ptscores[pid] = ptscores.get(pid, 0.0) + float(row[2])
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("  Inserted {} new ptscore rows for {} targets.".format(
        pts_ct, len(ptscores)))
    if notfnd:
        print("No target found for {} NCBI Gene IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    print("\nLoading {} PubTator Score tdl_infos".format(len(ptscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in ptscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'PubTator Score',
            'number_value': score
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print("{} processed".format(ct))
    print("Inserted {} new PubTator PubMed Score tdl_info rows".format(ti_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #4
0
def load(args):
  """Load MLP assay info for TCRD targets.

  Configures file logging, connects to the database, registers the dataset
  and its provenance, then works in three phases:

  1. Build ``t2aid`` (target id => list of assay ids). It is read from
     ``T2AID_PICKLE`` when that file exists; otherwise it is derived from
     ``AIDGI_FILE`` (matching each human row's GI by xref, falling back to a
     gene symbol fetched from ``EFETCH_PROTEIN_URL``) and then pickled.
  2. Read ``ASSAYS_FILE`` into ``assay_info`` (assay id => remaining columns).
  3. Insert one ``mlp_assay_info`` row per (target, assay) pair.

  Args:
    args: Option mapping providing ``--loglevel``, ``--logfile``,
      ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.

  Raises:
    AssertionError: If the dataset or provenance row cannot be inserted.
    KeyError: If an assay id in ``t2aid`` is missing from ``ASSAYS_FILE``.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] or LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
  logger.addHandler(fh)

  dba = DBAdaptor({'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__})
  dbi = dba.get_dbinfo()
  connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  logger.info(connected_msg)
  if not args['--quiet']:
    print("\n" + connected_msg)

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'MLP Assay Info', 'source': 'IDG-KMC generated data by Jeremy Yang at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': "This data is generated at UNM from PubChem and EUtils data. It contains details about targets studied in assays that were part of NIH's Molecular Libraries Program."} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance: use the id just returned instead of a hard-coded 3, which
  # would attach the provenance to whatever dataset happens to have that id.
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mlp_assay_info'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  if os.path.isfile(T2AID_PICKLE):
    # NOTE(review): unpickling can execute arbitrary code; this is only safe
    # while T2AID_PICKLE is a file written by this script itself.
    with open(T2AID_PICKLE, 'rb') as pf:
      t2aid = pickle.load(pf)
    act = sum(len(aids) for aids in t2aid.values())
    # Console output is gated on --quiet everywhere else in this loader; the
    # original tested --debug here.
    if not args['--quiet']:
      print("\n{} targets have link(s) to {} PubChem MLP assay(s)".format(len(t2aid), act))
  else:
    line_ct = slmf.wcl(AIDGI_FILE)
    t2aid = {}
    if not args['--quiet']:
      print("\nProcessing {} lines in file {}".format(line_ct, AIDGI_FILE))
    ct = 0
    skip_ct = 0
    fndgi_ct = 0
    fndpl_ct = 0
    notfnd = set()
    assay_ct = 0
    with open(AIDGI_FILE, 'rU') as csvfile:
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      for row in csv.reader(csvfile):
        # aid, tgt_gi, tgt_species, tgt_name
        ct += 1
        # Advance the bar up front so skipped rows still show as progress.
        pbar.update(ct)
        # The literal had been mangled to 'H**o sapiens', which no row can
        # match, so every row was skipped.
        if row[2] != 'Homo sapiens':
          skip_ct += 1
          continue
        gi = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'NCBI GI', 'value': gi})
        if targets:
          fndgi_ct += 1
        else:
          # Fall back to the gene symbol reported by the remote record.
          r = requests.get(EFETCH_PROTEIN_URL + gi)
          if r.status_code == 200:
            soup = BeautifulSoup(r.text, "xml")
            grl = soup.find('Gene-ref_locus')
            if grl:
              targets = dba.find_targets({'sym': grl.text})
          if targets:
            fndpl_ct += 1
          else:
            # notfnd is a set: .add(), not .append() (AttributeError before).
            notfnd.add(gi)
            logger.warning("No target found for GI {}".format(gi))
            continue
        tid = targets[0]['id']
        t2aid.setdefault(tid, []).append(row[0])
        assay_ct += 1
    pbar.finish()
    with open(T2AID_PICKLE, "wb") as pf:
      pickle.dump(t2aid, pf)
    print("\n{} rows processed.".format(ct))
    print("  {} assays linked to {} TCRD targets".format(assay_ct, len(t2aid)))
    print("  Skipped {} non-human assay rows".format(skip_ct))
    print("    {} linked by GI; {} linked via EUtils".format(fndgi_ct, fndpl_ct))
    print("  No target found for {} GIs. See logfile {} for details".format(len(notfnd), logfile))

  assay_info = {}
  line_ct = slmf.wcl(ASSAYS_FILE)
  if not args['--quiet']:
    print("\nProcessing {} rows in file {}".format(line_ct, ASSAYS_FILE))
  with open(ASSAYS_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    for row in csv.reader(csvfile):
      # ID,ActivityOutcomeMethod,AssayName,SourceName,ModifyDate,DepositDate,ActiveSidCount,InactiveSidCount,InconclusiveSidCount,TotalSidCount,ActiveCidCount,TotalCidCount,ProteinTargetList
      ct += 1  # previously never incremented, so the bar stayed at zero
      assay_info[row[0]] = row[1:]
      pbar.update(ct)
  pbar.finish()
  print("Got assay info for {} assays.".format(len(assay_info)))

  tct = len(t2aid)
  if not args['--quiet']:
    print("\nLoading MLP Assay Info for {} targets".format(tct))
  # Size the bar by the number of targets iterated below (it was sized by the
  # assay file's line count); `or 1` avoids a zero maximum when t2aid is empty.
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct or 1).start()
  ct = 0
  mai_ct = 0
  dba_err_ct = 0
  for tid, aids in t2aid.items():
    ct += 1
    for aid in aids:
      # ainfo is the assay row without its ID column, so index i here is
      # column i+1 of the header listed above.
      ainfo = assay_info[aid]
      # NOTE(review): the target id is passed as 'protein_id' - confirm that
      # target and protein ids coincide in this schema.
      rv = dba.ins_mlp_assay_info({'protein_id': tid, 'aid': aid, 'assay_name': ainfo[1], 'method': ainfo[0], 'active_sids': ainfo[5], 'inactive_sids': ainfo[6], 'iconclusive_sids': ainfo[7], 'total_sids': ainfo[8]})
      if rv:
        mai_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print("\n{} targets processed.".format(ct))
  print("  Inserted {} new mlp_assay_info rows".format(mai_ct))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
예제 #5
0
def load(args):
    """Load LocSigDB localization signals into the TCRD ``locsig`` table.

    Configures file logging, connects to the database, registers the dataset
    and its provenance, then inserts one ``locsig`` row for every UniProt
    accession on each human line of the input file that maps to a TCRD
    target. Accessions with no target are written to the logfile.

    Args:
        args: Option mapping providing ``--loglevel``, ``--logfile``,
            ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.

    Raises:
        AssertionError: If the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'LocSigDB',
        'source': 'File %s from %s' % (FILENAME, BASE_URL),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://genome.unmc.edu/LocSigDB/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'locsig'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    fn = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print("\nProcessing {} lines in input file {}".format(line_ct, fn))
    ct = 0
    up2pid = {}  # UniProt accession => protein id (lookup cache)
    notfnd = set()  # accessions already known to have no target
    ls_ct = 0
    skip_ct = 0
    pmark = set()  # protein ids that received at least one locsig row
    dba_err_ct = 0
    with open(fn, 'rU') as f:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for line in f:
            ct += 1
            # Advance the bar up front so skipped lines still show progress.
            pbar.update(ct)
            data = line.split(',')
            # The literal had been mangled to 'H**o sapiens', which never
            # occurs in the data, so every line was skipped.
            if 'Homo sapiens' not in data[5]:
                skip_ct += 1
                continue
            for up in data[4].split(';'):
                if up in notfnd:
                    # we've already not found it
                    continue
                pid = up2pid.get(up)
                if pid is None:
                    targets = dba.find_targets({'uniprot': up})
                    if not targets:
                        notfnd.add(up)
                        continue
                    pid = targets[0]['components']['protein'][0]['id']
                    up2pid[up] = pid
                rv = dba.ins_locsig({
                    'protein_id': pid,
                    'location': data[2],
                    'signal': data[0],
                    'pmids': data[3]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                ls_ct += 1
                pmark.add(pid)
    pbar.finish()
    for up in notfnd:
        logger.warning("No target found for {}".format(up))
    print("{} lines processed.".format(ct))
    print("  Inserted {} new locsig rows for {} proteins".format(
        ls_ct, len(pmark)))
    print("  Skipped {} non-human rows".format(skip_ct))
    if notfnd:
        print("No target found for {} UniProts. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #6
0
def load(args):
    """Load Reactome pathway memberships into the TCRD ``pathway`` table.

    Configures file logging, connects to the database, registers the dataset
    and its provenance, then reads the tab-separated pathways file (pathway
    name, pathway id, a third column, then gene symbols) and inserts one
    ``pathway`` row (pwtype ``Reactome``) per protein of every symbol that
    maps to a TCRD target. Symbols with no target are written to the logfile.

    Args:
        args: Option mapping providing ``--loglevel``, ``--logfile``,
            ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.

    Raises:
        AssertionError: If the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Reactome Pathways',
        # Parenthesised so the intended grouping is explicit; the resulting
        # string is the same as before.
        'source': 'File %s' % (BASE_URL + PATHWAYS_FILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.reactome.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        'where_clause': "pwtype = 'Reactome'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    # The loader reads the unzipped copy of the downloaded file.
    infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.zip', '')
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} input line from Reactome Pathways file {}".format(
            line_ct, infile))
    ct = 0
    sym2pids = {}  # symbol => list of protein ids (lookup cache)
    pmark = set()  # protein ids that received at least one pathway row
    notfnd = set()  # symbols already known to have no target
    pw_ct = 0
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        # Example line:
        # Apoptosis       R-HSA-109581    Reactome Pathway        ACIN1   ADD1    AKT1    AKT2   ...
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            pwname = row[0]
            pwid = row[1]
            url = 'http://www.reactome.org/content/detail/' + pwid
            for sym in row[3:]:
                if sym in notfnd:
                    continue
                pids = sym2pids.get(sym)
                if pids is None:
                    targets = dba.find_targets({'sym': sym})
                    if not targets:
                        notfnd.add(sym)
                        continue
                    pids = [
                        t['components']['protein'][0]['id'] for t in targets
                    ]
                    # save this mapping so we only lookup each target once
                    sym2pids[sym] = pids
                for pid in pids:
                    rv = dba.ins_pathway({
                        'protein_id': pid,
                        'pwtype': 'Reactome',
                        'name': pwname,
                        'id_in_source': pwid,
                        'url': url
                    })
                    if rv:
                        pw_ct += 1
                        pmark.add(pid)
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    # Log the symbols that were NOT found. The original iterated sym2pids,
    # i.e. it logged "No target found" for every symbol that was found.
    for sym in notfnd:
        logger.warning("No target found for {}".format(sym))
    print("Processed {} Reactome Pathways.".format(ct))
    print("  Inserted {} pathway rows for {} proteins.".format(
        pw_ct, len(pmark)))
    if notfnd:
        print("  No target found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #7
0
def _targets_for_matches(dba, matches, ensp2tid, ensps_notfnd):
    """Return the set of TCRD target ids for a list of tagger matches.

    Each match contributes the identifier at ``m[2][0][1]``. Identifiers are
    resolved by ``stringid`` first and by Ensembl xref second; the first
    target returned wins. ``ensp2tid`` (found) and ``ensps_notfnd`` (not
    found) are caches shared across calls and are updated in place.
    """
    tids = set()
    # the same tag can match multiple times, so work on a set of ENSPs
    for ensp in set(m[2][0][1] for m in matches):
        if ensp in ensp2tid:
            tids.add(ensp2tid[ensp])
            continue
        if ensp in ensps_notfnd:
            continue
        targets = dba.find_targets({'stringid': ensp}, idg=False)
        if not targets:
            targets = dba.find_targets_by_xref(
                {
                    'xtype': 'Ensembl',
                    'value': ensp
                }, idg=False)
        if not targets:
            ensps_notfnd.add(ensp)
            continue
        tid = targets[0]['id']
        ensp2tid[ensp] = tid  # save this so we don't look up the targets again
        tids.add(tid)
    return tids


def main(args):
    """Tag project titles and abstracts with TCRD targets, year by year.

    Configures file logging, connects to the database, loads the pickled
    project info and a ``Tagger``, then for each year 2000-2017 tags every
    project costing at least $10k, maps the tagged identifiers to TCRD
    targets, drops projects mapped to more than 10 targets, and pickles the
    ``AppID2Targets`` and ``Target2AppIDs`` mappings for that year into
    ``TAGGING_RESULTS_DIR``. Per-year statistics go to the logfile.

    Args:
        args: Option mapping providing ``--loglevel``, ``--logfile``,
            ``--debug``, ``--quiet``, ``--dbhost`` and ``--dbname``.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    # Honour --loglevel / --logfile; the original parsed them and then used
    # the module constants LOGLEVEL / LOGFILE instead.
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected_msg)
    if not args['--quiet']:
        print("\n" + connected_msg)

    if not args['--quiet']:
        print("\nLoading project info from pickle file {}".format(PROJECTS_P))
    # Load unconditionally: this used to sit inside the --quiet check, so a
    # quiet run failed with NameError on `projects`.
    # NOTE(review): unpickling can execute arbitrary code; PROJECTS_P must be
    # a trusted, locally produced file.
    with open(PROJECTS_P, 'rb') as pf:
        projects = pickle.load(pf)

    if not args['--quiet']:
        print("\nCreating Tagger...")
    tgr = Tagger()
    tgr.load_names(ENTITIES_FILE, NAMES_FILE)
    tgr.load_global(GLOBAL_FILE)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    for year in [str(yr) for yr in range(2000, 2018)]:  # 2000-2017
        pct = len(projects[year])
        print("\nTagging {} projects from {}".format(pct, year))
        logger.info("Tagging {} projects from {}".format(pct, year))
        pbar = ProgressBar(widgets=pbar_widgets, maxval=pct).start()
        ct = 0
        ttag_ct = 0
        abstag_ct = 0
        skip_ct = 0
        ttagsnotfnd = set()
        ttag2targetid = {}
        appid2targets = defaultdict(set)
        target2appids = defaultdict(set)
        for appid, ginfo in projects[year].items():
            ct += 1
            logger.debug("  Processing appid {}".format(appid))
            # if there's no $$, we're not interested
            if ginfo['TOTAL_COST']:
                gcost = int(ginfo['TOTAL_COST'])
            elif ginfo['TOTAL_COST_SUB_PROJECT']:
                gcost = int(ginfo['TOTAL_COST_SUB_PROJECT'])
            else:
                continue
            # also, if there's less than $10k we're not interested
            if gcost < 10000:
                skip_ct += 1
                continue
            tids = set()
            # tag titles
            matches = tgr.get_matches(ginfo['PROJECT_TITLE'], appid, [9606])
            if matches:
                ttag_ct += 1
                tids |= _targets_for_matches(dba, matches, ttag2targetid,
                                             ttagsnotfnd)
            # tag abstracts
            if 'ABSTRACT' in ginfo:
                matches = tgr.get_matches(ginfo['ABSTRACT'], appid, [9606])
                if matches:
                    abstag_ct += 1
                    tids |= _targets_for_matches(dba, matches, ttag2targetid,
                                                 ttagsnotfnd)
            # Only touch the defaultdicts when there is something to record,
            # so projects without targets get no entry.
            for tid in tids:
                appid2targets[appid].add(tid)
                target2appids[tid].add(appid)
            pbar.update(ct)
        pbar.finish()

        # Collect the keys first: deleting while iterating the live dict view
        # fails on Python 3.
        overtagged = [
            appid for appid, tidset in appid2targets.items()
            if len(tidset) > 10
        ]
        for appid in overtagged:
            del appid2targets[appid]
        del_ct = len(overtagged)

        logger.info("{} projects processed.".format(ct))
        # This message mixed '{}' with the % operator, which raises TypeError.
        logger.info("  Removed {} projects with > 10 targets".format(del_ct))
        logger.info(
            "  Skipped {} projects with funds less than $10k:".format(skip_ct))
        logger.info("  {} titles have tagging result(s)".format(ttag_ct))
        logger.info("  {} abstracts have tagging result(s)".format(abstag_ct))
        logger.info("{} total tags map to {}/{} distinct targets".format(
            len(ttag2targetid), len(set(ttag2targetid.values())),
            len(target2appids)))
        logger.info("{} project applications map to target(s)".format(
            len(appid2targets)))
        if ttagsnotfnd:
            logger.info("  No target found for {} tags".format(
                len(ttagsnotfnd)))
        pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year)
        with open(pfile, 'wb') as pf:
            pickle.dump(appid2targets, pf)
        logger.info("Tagging results saved to pickle {} for {}".format(
            pfile, year))
        pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year)
        with open(pfile, 'wb') as pf:
            pickle.dump(target2appids, pf)
        logger.info("Tagging results saved to pickle {} for {}".format(
            pfile, year))
        print("{} projects processed. See logfile {} for details.".format(
            ct, logfile))
예제 #8
0
def load(args):
    """Load WikiPathways pathway annotations into TCRD.

    Registers a 'WikiPathways' dataset and its provenance, then reads the
    tab-delimited file DOWNLOAD_DIR + PATHWAYS_FILE. For every Gene ID on
    each line the matching targets are looked up and one pathway row is
    inserted per protein.

    Args:
        args: Mapping of command-line options. The keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.

    Raises:
        AssertionError: If the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    # DBAdaptor is handed the same logger name so its messages land in logfile.
    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'WikiPathways',
        'source': 'File %s' % (BASE_URL + PATHWAYS_FILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.wikipathways.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        'where_clause': "pwtype = 'WikiPathways'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    fn = DOWNLOAD_DIR + PATHWAYS_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print("\nProcessing {} input lines from WikiPathways file {}".format(
            line_ct, fn))

    ct = 0
    gid2pids = {}  # Gene ID -> protein ids, so each gene is looked up once
    pmark = set()  # protein ids that received at least one pathway row
    notfnd = set()  # Gene IDs for which no target exists
    pw_ct = 0
    dba_err_ct = 0
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        # Example line:
        # Apoptosis Modulation and Signaling%WikiPathways_20160516%WP1772%Homo sapiens    http://www.wikipathways.org/instance/WP1772_r85184       843     3725    842 ...
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in tsvreader:
            ct += 1
            name = row[0].split('%')[0]
            wpid = row[1].split('/')[-1]
            for gid in row[2:]:
                if gid in notfnd:
                    continue
                if gid in gid2pids:
                    pids = gid2pids[gid]
                else:
                    targets = dba.find_targets({'geneid': gid})
                    if not targets:
                        notfnd.add(gid)
                        continue
                    # Build the protein list only for a fresh lookup; cached
                    # entries must not be extended with another gene's targets.
                    pids = [t['components']['protein'][0]['id']
                            for t in targets]
                    gid2pids[gid] = pids
                for pid in pids:
                    rv = dba.ins_pathway({
                        'protein_id': pid,
                        'pwtype': 'WikiPathways',
                        'name': name,
                        'id_in_source': wpid,
                        'url': row[1]
                    })
                    if rv:
                        pw_ct += 1
                        pmark.add(pid)
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()

    # Log the Gene IDs that had no target (not the ones that were found).
    for gid in notfnd:
        logger.warning("No target found for {}".format(gid))
    print("Processed {} WikiPathways.".format(ct))
    print("  Inserted {} pathway rows for {} proteins.".format(
        pw_ct, len(pmark)))
    if notfnd:
        print("  No target found for {} Gene IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #9
0
def load():
    """Load TechDev worklist spreadsheets into TCRD.

    Parses command-line options with docopt, registers the
    'TechDev Worklist Info' dataset and its provenance rows, then processes
    every CSV file in INPUTFILES. Each file starts with labelled header rows
    describing the submitting contact; the 'tcrd_target_id' row triggers the
    techdev_contact insert, and the rows after it are inserted as
    techdev_info rows for the target found by gene symbol.

    Exits with status 1 if the dataset or a provenance row cannot be
    inserted.
    """
    args = docopt(__doc__, version=__version__)
    debug = int(args['--debug'])
    if debug:
        print("\n[*DEBUG*] ARGS:\n%s\n" % repr(args))

    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = "%s.log" % PROGRAM
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not debug:
        logger.propagate = False  # turns off console logging when debug is 0
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("Connected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'TechDev Worklist Info',
        'source': 'Files from TechDev Groups',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'Loading app uses data from spreadsheets submitted by the TechDev groups listing targets being investigated.'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    for table_name in ('techdev_contact', 'techdev_info'):
        rv = dba.ins_provenance({
            'dataset_id': dataset_id,
            'table_name': table_name,
            'comment': ""
        })
        if not rv:
            print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    # Header-row label -> key of the contact dict it populates.
    contact_fields = {
        'Grant Number:': 'grant_number',
        'Submitter name:': 'contact_name',
        'Contact email:': 'contact_email',
        'Submission date:': 'date',
    }

    for filename in INPUTFILES.values():
        line_ct = wcl(filename)
        if not args['--quiet']:
            print('\nProcessing %d lines from input file: %s' % (line_ct,
                                                                 filename))
        with open(filename, 'rU') as csvfile:
            csvreader = csv.reader(csvfile)
            pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
            ct = 0
            contact = {}
            skip_ct = 0
            err_ct = 0
            info_ct = 0
            notfnd = []
            dba_err_ct = 0
            for row in csvreader:
                ct += 1
                # Update on every row, including header and skipped ones.
                pbar.update(ct)
                label = row[0]
                if label == 'TechDev ID:':
                    techdev_id = int(row[1])
                    contact['id'] = techdev_id
                    continue
                if label in contact_fields:
                    contact[contact_fields[label]] = row[1]
                    continue
                if label == 'tcrd_target_id':
                    # Column-header row: the contact block is complete here.
                    contact['pi'] = PIS[techdev_id]
                    contact_id = dba.ins_techdev_contact(contact)
                    if not contact_id:
                        logger.error("DBA error inserting techdev_contact.")
                        print("Exiting due to DBA error inserting techdev_contact. See logfile %s for details." % logfile)
                        break
                    continue
                if not row[6]:
                    # Counted as "not under investigation" in the summary.
                    skip_ct += 1
                    continue
                sym = row[1]
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    notfnd.append(sym)
                    continue
                pid = targets[0]['components']['protein'][0]['id']
                if not row[7]:
                    # A comment is required for every techdev_info row.
                    err_ct += 1
                    continue
                init = {
                    'contact_id': contact_id,
                    'protein_id': pid,
                    'comment': row[7]
                }
                if row[8]:
                    init['publication_pcmid'] = row[8]
                if row[9]:
                    init['publication_pmid'] = row[9]
                if row[11]:
                    init['resource_url'] = row[11]
                if row[10]:
                    init['data_url'] = row[10]
                if dba.ins_techdev_info(init):
                    info_ct += 1
                else:
                    dba_err_ct += 1
        pbar.finish()
        if not args['--quiet']:
            print("%d lines processed." % ct)
            print("  Skipped %d lines not under investigation" % skip_ct)
            if err_ct > 0:
                print("  WARNING: %d lines did not have a comment!" % err_ct)
            if notfnd:
                print("  WARNING: %d symbols did not find a target!" % len(notfnd))
                for sym in notfnd:
                    print("    %s" % sym)
            print("  Inserted 1 new techdev_contact row")
            print("  Inserted %d new techdev_info rows" % info_ct)
            if dba_err_ct > 0:
                print("WARNING: %d DB errors occurred. See logfile %s for details." % (
                    dba_err_ct, logfile))
예제 #10
0
def load(args):
    """Load Pathway Commons pathway annotations into TCRD.

    Registers a 'Pathway Commons' dataset and its provenance, then reads the
    tab-delimited file DOWNLOAD_DIR + PATHWAYS_FILE (with any '.gz' removed
    from the name). Records whose datasource is kegg, wikipathways or
    reactome are skipped; for the rest, one pathway row is inserted per
    UniProt accession that maps to a target.

    Args:
        args: Mapping of command-line options. The keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.

    Raises:
        AssertionError: If the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Every row inserted below gets pwtype = pwtype_prefix + datasource; the
    # provenance where_clause is derived from the same prefix so the two
    # cannot drift apart.
    pwtype_prefix = 'PathwayCommons: '

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Pathway Commons',
        'source': 'File %s' % (BASE_URL + PATHWAYS_FILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.pathwaycommons.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'pathway',
        # SQL LIKE wildcard; the original had a stray '%s' placeholder here.
        'where_clause': "pwtype LIKE '" + pwtype_prefix + "%'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} records from PathwayCommons file {}".format(
            line_ct, infile))

    # Compiled once; both are applied to the description column of each row.
    src_re = re.compile(r'datasource: (\w+)')
    name_re = re.compile(r'name: (.+?);')
    # These sources are skipped and only counted in the summary.
    skipped_sources = ('kegg', 'wikipathways', 'reactome')

    ct = 0
    skip_ct = 0
    up2pid = {}  # UniProt accession -> protein id, to look up each one once
    pmark = set()  # protein ids that received at least one pathway row
    notfnd = set()  # UniProt accessions for which no target exists
    pw_ct = 0
    dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        # Example line:
        # http://identifiers.org/kegg.pathway/hsa00010    name: Glycolysis / Gluconeogenesis; datasource: kegg; organism: 9606; idtype: uniprot  A8K7J7  B4DDQ8  B4DNK4 ...
        # However, note that pathway commons URLs in file give 404.
        # E.g. URL from this line:
        # http://pathwaycommons.org/pc2/Pathway_0136871cbdf9a3ecc09529f1878171df  name: VEGFR1 specific signals; datasource: pid; organism: 9606; idtype: uniprot    O14786  O15530  O60462 ...
        # needs to be converted to:
        # http://apps.pathwaycommons.org/pathways?uri=http%3A%2F%2Fpathwaycommons.org%2Fpc2%2FPathway_0136871cbdf9a3ecc09529f1878171df
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in tsvreader:
            ct += 1
            src = src_re.search(row[1]).group(1)
            if src in skipped_sources:
                skip_ct += 1
                continue
            pwtype = pwtype_prefix + src
            name = name_re.search(row[1]).group(1)
            url = PCAPP_BASE_URL + urllib.quote(row[0], safe='')
            for up in row[2:]:
                if up in up2pid:
                    pid = up2pid[up]
                elif up in notfnd:
                    continue
                else:
                    targets = dba.find_targets({'uniprot': up})
                    if not targets:
                        notfnd.add(up)
                        continue
                    pid = targets[0]['components']['protein'][0]['id']
                    up2pid[up] = pid
                rv = dba.ins_pathway({
                    'protein_id': pid,
                    'pwtype': pwtype,
                    'name': name,
                    'url': url
                })
                if rv:
                    pw_ct += 1
                    pmark.add(pid)
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()

    for up in notfnd:
        logger.warning("No target found for {}".format(up))
    print("Processed {} Pathway Commons records.".format(ct))
    print("  Inserted {} new pathway rows for {} proteins.".format(
        pw_ct, len(pmark)))
    print("  Skipped {} records from 'kegg', 'wikipathways', 'reactome'".format(
        skip_ct))
    if notfnd:
        print("  No target found for {} UniProt accessions. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
예제 #11
0
def load(args):
  """Load Drugable Epigenome domain classifications into TCRD.

  Registers the 'Drugable Epigenome Domains' dataset and its provenance,
  then walks FILE_LIST (class name -> {domain name -> CSV file name}). For
  each data row the target is looked up by symbol, then Gene ID, then
  UniProt accession, and a 'Drugable Epigenome Class' tdl_info row is
  inserted for its first protein.

  Args:
    args: Mapping of command-line options. The keys read here are
      '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname' and
      '--quiet'.

  Raises:
    AssertionError: If the dataset or provenance row cannot be inserted.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] if args['--logfile'] else LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Drugable Epigenome Domains', 'source': 'Files from http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Drugable Epigenome Class'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  total_ti_ct = 0
  notfnd = set()
  for k,d in FILE_LIST.items():
    if not args['--quiet']:
      print("\nProcessing Epigenetic {}s".format(k))
    for dom,f in d.items():
      f = INPUT_DIR + f
      line_ct = slmf.wcl(f)
      if not args['--quiet']:
        print('Processing {} lines from {} input file {}'.format(line_ct, dom, f))
      with open(f, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        next(csvreader) # skip header line
        ct = 0
        not_fnd_ct = 0
        tct = 0
        ti_ct = 0
        dba_err_ct = 0
        for row in csvreader:
          ct += 1
          # Try symbol first, then Gene ID, then UniProt accession.
          targets = dba.find_targets({'sym': row[0]})
          if not targets:
            targets = dba.find_targets({'geneid': row[3]})
          if not targets:
            targets = dba.find_targets({'uniprot': row[2]})
          if not targets:
            # Use a separate name: k is the class name of the outer loop and
            # is still needed to build string_value for the following rows.
            nf_key = "%s|%s|%s"%(row[0],row[3],row[2])
            notfnd.add(nf_key)
            not_fnd_ct += 1
            logger.warning("No target found for: {}".format(nf_key))
            continue
          tct += 1
          t = targets[0]
          p = t['components']['protein'][0]
          if len(row) == 5:
            val = "Epigenetic %s - %s" % (k, dom)
          else:
            # Longer rows carry two extra columns appended to the value.
            val = "Epigenetic %s - %s %s: %s" % (k, dom, row[4], row[5])
          rv = dba.ins_tdl_info({'protein_id': p['id'], 'itype': 'Drugable Epigenome Class', 'string_value': val})
          if not rv:
            dba_err_ct += 1
            continue
          ti_ct += 1
        if not args['--quiet']:
          print("  {} lines processed. Found {}, skipped {}".format(ct, tct, not_fnd_ct))
          print("  Inserted {} new tdl_info rows".format(ti_ct))
        if dba_err_ct > 0:
          print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
        total_ti_ct += ti_ct
  if not args['--quiet']:
    print("\nInserted a total of {} new Drugable Epigenome Class tdl_infos".format(total_ti_ct))
    if len(notfnd) > 0:
      print("  No target found for {} sym/geneid/uniprots. See logfile {} for details.".format(len(notfnd), logfile))