Example #1
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Human Proteome Map', 'source': 'IDG-KMC generated data by Oleg Ursu at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.humanproteomemap.org/'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Protein'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Gene'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Protein Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Gene Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)
  
  #
  # Protein Level Expressions
  #
  line_ct = slmf.wcl(PROTEIN_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, PROTEIN_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  rs2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(PROTEIN_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      if rs in rs2pids:
        # we've already found it
        pids = rs2pids[rs]
      elif rs in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'RefSeq', 'value': rs}, False)
        if not targets:
          notfnd.add(rs)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        rs2pids[rs] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      if row[3] == 'NA':
        init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4],}
      else:
        init = {'etype': 'HPM Protein','tissue': tissue, 
                'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} RefSeqs)".format(exp_ct, len(pmark), len(rs2pids))
  if notfnd:
    print "No target found for {} RefSeqs. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(PROTEIN_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, PROTEIN_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(PROTEIN_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      tau = row[1]
      if rs not in rs2pids:
        skip_ct += 1
        continue
      for pid in rs2pids[rs]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Protein Tissue Specificity Index',
                               'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Protein Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with RefSeqs not in map from expression file.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Gene Level Expressions
  #
  line_ct = slmf.wcl(GENE_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, GENE_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  sym2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GENE_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      if sym in sym2pids:
        pids = sym2pids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(sym)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        sym2pids[sym] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      
      if row[3] == 'NA':
        init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4],}
      else:
        init = {'etype': 'HPM Gene','tissue': tissue, 
                'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} Gene Symbols)".format(exp_ct, len(pmark), len(sym2pids))
  if notfnd:
    print "  No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(GENE_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, GENE_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(GENE_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      tau = row[1]
      if sym not in sym2pids:
        skip_ct += 1
        continue
      for pid in rs2pids[rs]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Gene Tissue Specificity Index',
                               'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Gene Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with symbols not in map from expression file".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #2
0
def tinx(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # The results of parsing the input mentions files will be the following dictionaries:
    pid2pmids = {
    }  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
    # Including the UniProt accession in the key is just for convenience when
    # checking the output. It is not used for anything.
    doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
    pmid_disease_ct = {
    }  # PMID => count of diseases mentioned in a given paper
    pmid_protein_ct = {
    }  # PMID => count of proteins mentioned in a given paper

    # First parse the Disease Ontology OBO file to get DO names and defs
    dofile = DO_DOWNLOAD_DIR + DO_OBO
    print "\nParsing Disease Ontology file {}".format(dofile)
    do_parser = obo.Parser(open(dofile))
    do = {}
    for stanza in do_parser:
        do[stanza.tags['id'][0].value] = stanza.tags
    print "  Got {} Disease Ontology terms".format(len(do))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in protein file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                # if we don't find a target by stringid, which is the more reliable and
                # prefered way, try by Ensembl xref
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensp
                })
            if not targets:
                notfnd.add(ensp)
                continue
            for t in targets:
                p = t['components']['protein'][0]
                k = "%s,%s" % (p['id'], p['uniprot'])
                if k in pid2pmids:
                    pid2pmids[k] = pid2pmids[k].union(pmids)
                else:
                    pid2pmids[k] = set(pmids)
                for pmid in pmids:
                    if pmid in pmid_protein_ct:
                        pmid_protein_ct[pmid] += 1.0
                    else:
                        pmid_protein_ct[pmid] = 1.0
    pbar.finish()
    for ensp in notfnd:
        logger.warn("No target found for {}".format(ensp))
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-ENSP lines".format(skip_ct)
    print "  Saved {} protein to PMIDs mappings".format(len(pid2pmids))
    print "  Saved {} PMID to protein count mappings".format(
        len(pmid_protein_ct))
    if notfnd:
        print "  No target found for {} ENSPs. See logfile {} for details.".format(
            len(notfnd), logfile)

    fn = JL_DOWNLOAD_DIR + DISEASE_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in do:
                logger.warn("%s not found in DO" % doid)
                notfnd.add(doid)
                continue
            if doid in doid2pmids:
                doid2pmids[doid] = doid2pmids[doid].union(pmids)
            else:
                doid2pmids[doid] = set(pmids)
            for pmid in pmids:
                if pmid in pmid_disease_ct:
                    pmid_disease_ct[pmid] += 1.0
                else:
                    pmid_disease_ct[pmid] = 1.0
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-DOID lines".format(skip_ct)
    print "  Saved {} DOID to PMIDs mappings".format(len(doid2pmids))
    print "  Saved {} PMID to disease count mappings".format(
        len(pmid_disease_ct))
    if notfnd:
        print "WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(
            len(notfnd), logfile)

    if not args['--quiet']:
        print "\nComputing protein novely scores"
    # To calculate novelty scores, each paper (PMID) is assigned a
    # fractional target (FT) score of one divided by the number of targets
    # mentioned in it. The novelty score of a given protein is one divided
    # by the sum of the FT scores for all the papers mentioning that
    # protein.
    ct = 0
    with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
        pnovf.write("Protein ID,UniProt,Novelty\n")
        for k in pid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in pid2pmids[k]:
                ft_score_sum += 1.0 / pmid_protein_ct[pmid]
            novelty = 1.0 / ft_score_sum
            pnovf.write("%s,%.8f\n" % (k, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, PROTEIN_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing disease novely scores"
    # Exactly as for proteins, but using disease mentions
    ct = 0
    with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
        dnovf.write("DOID,Novelty\n")
        for doid in doid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in doid2pmids[doid]:
                ft_score_sum += 1.0 / pmid_disease_ct[pmid]
            novelty = 1.0 / ft_score_sum
            dnovf.write("%s,%.8f\n" % (doid, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, DISEASE_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing importance scores"
    # To calculate importance scores, each paper is assigned a fractional
    # disease-target (FDT) score of one divided by the product of the
    # number of targets mentioned and the number of diseases
    # mentioned. The importance score for a given disease-target pair is
    # the sum of the FDT scores for all papers mentioning that disease and
    # protein.
    ct = 0
    with open(IMPORTANCE_FILE, 'wb') as impf:
        impf.write("DOID,Protein ID,UniProt,Score\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                fdt_score_sum = 0.0
                for pmid in pd_pmids:
                    fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] *
                                            pmid_disease_ct[pmid])
                if fdt_score_sum > 0:
                    ct += 1
                    impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
    print "  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE)

    if not args['--quiet']:
        print "\nComputing PubMed rankings"
    # PMIDs are ranked for a given disease-target pair based on a score
    # calculated by multiplying the number of targets mentioned and the
    # number of diseases mentioned in that paper. Lower scores have a lower
    # rank (higher priority). If the scores do not discriminate, PMIDs are
    # reverse sorted by value with the assumption that larger PMIDs are
    # newer and of higher priority.
    ct = 0
    with open(PMID_RANKING_FILE, 'wb') as pmrf:
        pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                scores = [
                ]  # scores are tuples of (PMID, protein_mentions*disease_mentions)
                for pmid in pd_pmids:
                    scores.append(
                        (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
                if len(scores) > 0:
                    scores.sort(cmp_pmids_scores)
                    for i, t in enumerate(scores):
                        ct += 1
                        pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
    print "  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
Example #3
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as load()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    start_time = time.time()

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'ChEMBL',
        'source':
        'ChEMBL MySQL database {}'.format(CHEMBL_DB),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'https://www.ebi.ac.uk/chembl/'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
    dataset_id2 = dba.ins_dataset({
        'name':
        'ChEMBL Info',
        'source':
        'IDG-KMC generated data by Steve Mathias at UNM.',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        'First reference year and selective compound info are generated by loader app.'
    })
    if not dataset_id2:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'chembl_acivity'
    }, {
        'dataset_id': dataset_id2,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'ChEMBL First Reference Year'",
        'comment': "Derived from filtered ChEMBL activities."
    }, {
        'dataset_id': dataset_id2,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'ChEMBL Selective Compound'",
        'comment': "Derived from filtered ChEMBL activities."
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
            sys.exit(1)

    # ChEMBL MySQL connection
    f = open('/home/smathias/.dbirc', 'r')
    pw = f.readline().strip()
    chembldb = mysql.connect(host='localhost',
                             port=3306,
                             db=CHEMBL_DB,
                             user='******',
                             passwd=pw)

    # First get mapping of UniProt accestions to ChEMBL IDs
    up2chembl = {}
    f = DOWNLOAD_DIR + UNIPROT2CHEMBL_FILE
    line_ct = slmf.wcl(f)
    if not args['--quiet']:
        print "\nProcessing %d input lines in file %s" % (line_ct, f)
    with open(f, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'): continue
            if row[0] in up2chembl:
                up2chembl[row[0]].append(row[1])
            else:
                up2chembl[row[0]] = [row[1]]
    if not args['--quiet']:
        print "%d input lines processed." % ct
    #print "Saved %d keys in up2chembl dict" % len(up2chembl.keys())

    upct = len(up2chembl)
    if not args['--quiet']:
        print "\nProcessing %d UniProt to ChEMBL ID(s) mappings" % upct
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=upct).start()
    ct = 0
    notfnd = []
    fnd_ct = 0
    err_ct = 0
    dba_err_ct = 0
    nic_ct = 0
    nga_ct = 0
    tdl_ct = 0
    ca_ct = 0
    csti_ct = 0
    ccti_ct = 0
    cyti_ct = 0
    t2acts = {}
    c2acts = {}
    for up in up2chembl.keys():
        ct += 1
        pbar.update(ct)
        targets = dba.find_targets({'uniprot': up}, include_annotations=True)
        if not targets:
            notfnd.append(up)
            continue
        t = targets[0]
        tid = t['id']
        logger.info("Loading ChEMBL data for target %d - %s/%s" %
                    (t['id'], t['components']['protein'][0]['sym'], up))
        chembl_acts = []
        for ctid in up2chembl[up]:
            with closing(chembldb.cursor(mysql.cursors.DictCursor)) as curs:
                # Query 1
                curs.execute(SQLq1, (ctid, ))
                for d in curs:
                    if d['year']:
                        d['reference'] = "%s, (%d) %s:%s:%s" % (
                            d['journal'], d['year'], d['volume'], d['issue'],
                            d['first_page'])
                    else:
                        d['reference'] = "%s, %s:%s:%s" % (
                            d['journal'], d['volume'], d['issue'],
                            d['first_page'])
                    for k in ['journal', 'volume', 'issue', 'first_page']:
                        del (d[k])
                    chembl_acts.append(d)
            # Query 2
            with closing(chembldb.cursor(mysql.cursors.DictCursor)) as curs:
                curs.execute(SQLq2, (ctid, ))
                for d in curs:
                    d['reference'] = None
                    chembl_acts.append(d)

        if t['fam'] == 'GPCR':
            cutoff = 7.0  # 100nM
        elif t['fam'] == 'IC':
            cutoff = 5.0  # 10uM
        elif t['fam'] == 'Kinase':
            cutoff = 7.52288  # 30nM
        elif t['fam'] == 'NR':
            cutoff = 7.0  # 100nM
        else:
            cutoff = 6.0  # 1uM for non-IDG Family targets
        logger.info("Target %d (%s) filter cutoff: %f " %
                    (tid, t['name'], len(chembl_acts)))
        filtered_acts = [
            a for a in chembl_acts if a['pchembl_value'] >= cutoff
        ]
        logger.info("%d ChEMBL acts => %d filtered acts" %
                    (len(chembl_acts), len(filtered_acts)))
        if not filtered_acts:
            nga_ct += 1
            continue
        logger.info("  Got %d filtered activities" % len(filtered_acts))

        #
        # if we get here, target is Tchem
        #
        # sort all activities by std_val, so best activity is in sorted_by_stdval[-1]
        decorated = [(a['pchembl_value'], a) for a in filtered_acts]
        decorated.sort()
        sorted_by_stdval = [a for (key, a) in decorated]
        # sort filtered activities by reference year, so oldest activity is in sorted_by_year[0]
        decorated = [(a['year'], a) for a in filtered_acts if 'year' in a]
        decorated.sort()
        sorted_by_year = [a for (key, a) in decorated]

        # Save chembl_activities
        # The best activity for a given target will be the one with MAX(chembl_activity.id)
        for a in sorted_by_stdval:
            if 'pubmed_id' in a:
                pmid = a['pubmed_id']
            else:
                pmid = None
            try:
                rv = dba.ins_cmpd_activity({
                    'target_id':
                    tid,
                    'catype':
                    'ChEMBL',
                    'cmpd_id_in_src':
                    a['chembl_id'],
                    'cmpd_name_in_src':
                    a['compound_name'],
                    'smiles':
                    a['canonical_smiles'],
                    'reference':
                    a['reference'],
                    'act_value':
                    a['pchembl_value'],
                    'act_type':
                    a['standard_type'],
                    'pubmed_ids':
                    pmid
                })
            except:
                # some names have weird hex characters and cause errors...
                rv = dba.ins_cmpd_activity({
                    'target_id': tid,
                    'catype': 'ChEMBL',
                    'cmpd_id_in_src': a['chembl_id'],
                    'cmpd_name_in_src': '?',
                    'smiles': a['canonical_smiles'],
                    'reference': a['reference'],
                    'act_value': a['pchembl_value'],
                    'act_type': a['standard_type'],
                    'pubmed_ids': pmid
                })
            if rv:
                ca_ct += 1
            else:
                dba_err_ct += 1

        # Save First ChEMBL Reference Year tdl_info, if there is one
        if len(sorted_by_year) > 0:
            oldest = sorted_by_year[0]
            rv = dba.ins_tdl_info({
                'target_id': tid,
                'itype': 'ChEMBL First Reference Year',
                'integer_value': sorted_by_year[0]['year']
            })
            if rv:
                cyti_ct += 1
            else:
                dba_err_ct += 1

        # Save mappings for selective compound calculations
        t2acts[tid] = copy.copy(sorted_by_stdval)
        for a in chembl_acts:
            ac = copy.copy(a)
            smi = ac['canonical_smiles']
            del (ac['canonical_smiles'])
            ac['tid'] = tid
            ac['tname'] = t['components']['protein'][0]['name']
            if smi in c2acts:
                c2acts[smi].append(ac)
            else:
                c2acts[smi] = [ac]
    pbar.finish()
    print "%d UniProt accessions processed." % ct
    if nic_ct > 0:
        print "  %d targets not found in ChEMBL" % nic_ct
    print "  %d targets have no qualifying TCRD activities in ChEMBL" % nga_ct
    print "Inserted %d new cmpd_activity rows" % ca_ct
    print "Inserted %d new ChEMBL First Reference Year tdl_infos" % cyti_ct
    if err_ct > 0:
        print "%d ERRORS" % err_ct
    if dba_err_ct > 0:
        print "WARNING: %d database errors occured. See logfile %s for details." % (
            dba_err_ct, logfile)

    # Selective compound calculations
    if not args['--quiet']:
        print "\nRunning selective compound analysis..."
    #pickle.dump(t2acts, open('T2ChEMBLActs.p', 'wb'))
    #print "%d target to activities mappings saved to pickle T2ChEMBLActs.p" % len(t2acts.keys())
    #pickle.dump(c2acts, open('C2AllChEMBLActs.p', 'wb'))
    #print "%d compound to activity mappings saved to pickle C2AllChEMBLActs.p" % len(c2acts.keys())
    # filter c2acts for compounds with multiple activities
    c2macts = {}
    for c, acts in c2acts.items():
        if len(acts) > 1:
            c2macts[c] = list(acts)
    # then sort the activity lists by pchembl_value
    c2smacts = {}
    for c, acts in c2macts.items():
        decorated = [(a['pchembl_value'], a) for a in acts]
        decorated.sort()
        c2smacts[c] = [a for (key, a) in decorated]
    #pickle.dump(c2smacts, open('C2ChEMBLActs.p', 'wb'))
    #print "%d compound to activities mappings saved to pickle C2ChEMBLActs.p" % len(c2smacts.keys())
    selective = []
    for smi in c2smacts.keys():
        i = 1
        while i <= len(c2smacts[smi]) - 1:
            if c2smacts[smi][i]['tid'] == c2smacts[smi][i - 1]['tid']:
                i += 1
                continue
            diff = c2smacts[smi][i]['pchembl_value'] - c2smacts[smi][
                i - 1]['pchembl_value']
            if diff >= 2:
                selective.append(smi)
                break
            i += 1
    #pickle.dump(selective, open(SC_PFILE, 'wb'))
    #print "%d selective compounds saved to %s" % (len(selective), SC_PFILE)
    if not args['--quiet']:
        print "  Found %d selective compounds" % len(selective)
    cscti_ct = 0
    for tid, acts in t2acts.items():
        for a in acts:
            if a['canonical_smiles'] in selective:
                # Save ChEMBL Selective Compound tdl_info
                val = "%s|%s" % (a['chembl_id'], a['canonical_smiles'])
                rv = dba.ins_tdl_info({
                    'target_id': tid,
                    'itype': 'ChEMBL Selective Compound',
                    'string_value': val
                })
                if rv:
                    cscti_ct += 1
                else:
                    dba_err_ct += 1
                break
    if not args['--quiet']:
        print "Inserted %d new ChEMBL Selective Compound tdl_infos" % cscti_ct
Example #4
0
def load(args):
    """Load eRAM disease associations from the eRAM shelf file into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    walks every name stored under the shelf's 'disease_names' key. For each
    curated gene of a disease, targets are looked up by symbol (falling back
    to gene id) and one disease row is inserted per target per DOID. A
    summary of the run is printed at the end.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Raises:
        AssertionError: if the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'eRAM Disease Associations',
        'source': 'Data scraped from eRAM web pages.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.unimd.org/eram/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'eRAM'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ct = 0
    pmark = {}  # target ids that had at least one insert attempted
    skip_ct = 0
    dnerr1_ct = 0  # names listed in 'disease_names' but missing from the shelf
    dnerr2_ct = 0  # names that could not be converted to str
    notfnd = set()  # "sym|geneid" keys already known to have no target
    dis_ct = 0  # disease rows successfully inserted
    dba_err_ct = 0
    s = shelve.open(ERAM_SHELF_FILE)
    try:
        # Read the name list once; every shelf access re-reads the stored value.
        disease_names = s['disease_names']
        name_ct = len(disease_names)
        if not args['--quiet']:
            print("\nProcessing {} disease names in shelf file {}".format(
                name_ct, ERAM_SHELF_FILE))
        pbar = ProgressBar(widgets=pbar_widgets, maxval=name_ct).start()
        for dname in disease_names:
            ct += 1
            try:
                dname = str(dname)
            except UnicodeError:
                dnerr2_ct += 1
                logger.warning("UnicodeEncodeError for disease name '{}'".format(
                    dname.encode('ascii', 'ignore')))
                continue
            if dname not in s:
                # The original incremented an undefined name here (NameError);
                # dnerr1_ct is the counter the summary below reports.
                dnerr1_ct += 1
                logger.warning("Disease name '{}' not in shelf".format(dname))
                continue
            entry = s[dname]
            if 'currated_genes' not in entry:
                skip_ct += 1
                continue
            for cg in entry['currated_genes']:
                sym = cg['sym']
                geneid = cg['geneid']
                k = "%s|%s" % (sym, geneid)
                if k in notfnd:
                    continue
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    targets = dba.find_targets({'geneid': geneid})
                if not targets:
                    notfnd.add(k)
                    logger.warning("No target found for {}".format(k))
                    continue
                for t in targets:
                    p = t['components']['protein'][0]
                    pmark[t['id']] = True
                    for doid in entry['doids']:
                        rv = dba.ins_disease({
                            'protein_id': p['id'],
                            'dtype': 'eRAM',
                            'name': dname,
                            'did': doid,
                            'source': cg['sources']
                        })
                        if not rv:
                            dba_err_ct += 1
                            continue
                        dis_ct += 1
            pbar.update(ct)
    finally:
        # Always release the shelf, even if a lookup or insert raises.
        s.close()
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct > 0:
        print("Skipped {} diseases with no currated genes. See logfile {} for details.".format(
            skip_ct, logfile))
    if dnerr1_ct > 0:
        print("{} disease names not found in shelf. See logfile {} for details.".format(
            dnerr1_ct, logfile))
    if dnerr2_ct > 0:
        print("{} disease names cannot be decoded to strs. See logfile {} for details.".format(
            dnerr2_ct, logfile))
    if notfnd:
        print("No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
def load(args):
    """Load JensenLab PubMed text-mining scores into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    reads the tab-separated input file. Every row whose first column starts
    with 'ENSP' is mapped to protein ids (by STRING id, falling back to a
    STRING xref lookup) and inserted as a pmscore row. The scores are also
    summed per protein and the totals are stored as
    'JensenLab PubMed Score' tdl_info rows.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Raises:
        SystemExit: if the dataset or a provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'JensenLab PubMed Text-mining Scores',
        'source': ('File %s' % BASE_URL) + FILENAME,
        'app': PROGRAM,
        'app_version': __version__,
        'url': BASE_URL
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        # Stop here, as the provenance check below does: continuing would
        # reference a dataset that does not exist.
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'pmscore'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'JensenLab PubMed Score'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ensp2pids = {}  # ENSP id => protein ids, so each id is looked up once
    pmscores = {}  # protein.id => sum(all scores)
    pms_ct = 0
    notfnd = set()  # ENSP ids already known to have no target
    dba_err_ct = 0
    ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} input lines in file {}".format(line_ct, infile))
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        for row in tsvreader:
            # columns used: 0 = ENSP id, 1 = year, 2 = score
            ct += 1
            pbar.update(ct)
            # Blank lines come back as empty rows; skip them instead of
            # failing on row[0].
            if not row or not row[0].startswith('ENSP'):
                continue
            ensp = row[0]
            if ensp in notfnd:
                # we've already not found it
                continue
            pids = ensp2pids.get(ensp)
            if pids is None:
                targets = dba.find_targets({'stringid': ensp})
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                if not targets:
                    notfnd.add(ensp)
                    logger.warning("No target found for {}".format(ensp))
                    continue
                pids = [t['components']['protein'][0]['id'] for t in targets]
                ensp2pids[ensp] = pids
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                # The total is accumulated whether or not the insert succeeded.
                pmscores[pid] = pmscores.get(pid, 0.0) + float(row[2])
    pbar.finish()
    print("{} input lines processed.".format(ct))
    print("  Inserted {} new pmscore rows for {} targets".format(
        pms_ct, len(pmscores)))
    if notfnd:
        print("No target found for {} STRING IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    print("\nLoading {} JensenLab PubMed Score tdl_infos".format(
        len(pmscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'JensenLab PubMed Score',
            'number_value': score
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print("{} processed".format(ct))
    print("  Inserted {} new JensenLab PubMed Score tdl_info rows".format(
        ti_ct))
    if dba_err_ct > 0:
        # Two positional arguments: passing them as one tuple leaves the
        # second '{}' unfilled and raises IndexError.
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #6
0
def load(args):
    """Load GWAS Catalog associations into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    reads the tab-separated catalog file (first line is a header). The
    MAPPED_GENE column of every row is split into symbols; for each symbol
    with matching targets one gwas row is inserted per target. A summary of
    the run is printed at the end.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Raises:
        AssertionError: if the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GWAS Catalog',
        'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' %
                  os.path.basename(INFILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ebi.ac.uk/gwas/home'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INFILE)
    line_ct -= 1  # the header line is not a data row
    if not args['--quiet']:
        print('\nProcessing {} lines from input file {}'.format(
            line_ct, INFILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    notfnd = set()  # symbols already known to have no target
    pmark = {}  # protein ids that received at least one gwas row
    gwas_ct = 0
    dba_err_ct = 0
    symregex = re.compile(r' ?[-,;] ?')
    with open(INFILE, 'rU') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter='\t')
        next(tsvreader)  # skip header line
        # 0: DATE ADDED TO CATALOG
        # 1: PUBMEDID
        # 2: FIRST AUTHOR
        # 3: DATE
        # 4: JOURNAL
        # 5: LINK
        # 6: STUDY
        # 7: DISEASE/TRAIT
        # 8: INITIAL SAMPLE SIZE
        # 9: REPLICATION SAMPLE SIZE
        # 10: REGION
        # 11: CHR_ID
        # 12: CHR_POS
        # 13: REPORTED GENE(S)
        # 14: MAPPED_GENE
        # 15: UPSTREAM_GENE_ID
        # 16: DOWNSTREAM_GENE_ID
        # 17: SNP_GENE_IDS
        # 18: UPSTREAM_GENE_DISTANCE
        # 19: DOWNSTREAM_GENE_DISTANCE
        # 20: STRONGEST SNP-RISK ALLELE
        # 21: SNPS
        # 22: MERGED
        # 23: SNP_ID_CURRENT
        # 24: CONTEXT
        # 25: INTERGENIC
        # 26: RISK ALLELE FREQUENCY
        # 27: P-VALUE
        # 28: PVALUE_MLOG
        # 29: P-VALUE (TEXT)
        # 30: OR or BETA
        # 31: 95% CI (TEXT)
        # 32: PLATFORM [SNPS PASSING QC]
        # 33: CNV
        # 34: MAPPED_TRAIT
        # 35: MAPPED_TRAIT_URI
        # 36: STUDY ACCESSION
        # 37: GENOTYPING TECHNOLOGY
        for row in tsvreader:
            ct += 1
            # row[14] is read next, so at least 15 columns are required; the
            # previous '< 14' guard let 14-column rows through to an
            # IndexError. Later columns (up to 35) are still assumed present.
            if len(row) < 15:
                continue
            symstr = row[14]
            if symstr == 'NR':
                continue
            for sym in symregex.split(symstr):
                if sym in notfnd:
                    continue
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    notfnd.add(sym)
                    logger.warning("No target found for symbol {}".format(sym))
                    continue
                # These values depend only on the row, so convert them once
                # rather than once per target. Unparseable or missing
                # numbers are stored as None.
                try:
                    pval = float(row[27])
                except (ValueError, IndexError):
                    pval = None
                try:
                    orbeta = float(row[30])
                except (ValueError, IndexError):
                    orbeta = None
                ig = int(row[25]) if row[25] else None
                for t in targets:
                    p = t['components']['protein'][0]
                    rv = dba.ins_gwas({
                        'protein_id': p['id'],
                        'disease_trait': row[7],
                        'snps': row[21],
                        'pmid': row[1],
                        'study': row[6],
                        'context': row[24],
                        'intergenic': ig,
                        'p_value': pval,
                        'or_beta': orbeta,
                        'cnv': row[33],
                        'mapped_trait': row[34],
                        'mapped_trait_uri': row[35]
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    pmark[p['id']] = True
                    gwas_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Inserted {} new gwas rows for {} proteins".format(
        gwas_ct, len(pmark)))
    if notfnd:
        print("No target found for {} symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #7
0
def _jensenlab_knowledge_fields(row):
    """Map a Knowledge channel row to disease fields (without protein_id)."""
    return {
        'dtype': 'JensenLab Knowledge ' + row[4],
        'name': row[3],
        'did': row[2],
        'evidence': row[5],
        'conf': row[6]
    }


def _jensenlab_experiment_fields(row):
    """Map an Experiment channel row to disease fields (without protein_id)."""
    return {
        'dtype': 'JensenLab Experiment ' + row[4],
        'name': row[3],
        'did': row[2],
        'evidence': row[5],
        'conf': row[6]
    }


def _jensenlab_textmining_fields(row):
    """Map a Text Mining channel row to disease fields (without protein_id)."""
    return {
        'dtype': 'JensenLab Text Mining',
        'name': row[3],
        'did': row[2],
        'zscore': row[4],
        'conf': row[5]
    }


def _load_jensenlab_disease_channel(dba, logger, logfile, quiet, fn,
                                    row_to_fields, skip_zero_conf=False):
    """Insert disease rows for one JensenLab DISEASES channel file.

    Args:
        dba: database adaptor providing find_targets() and ins_disease().
        logger: logger that receives a warning for every unmatched row key.
        logfile: log file path, only used in the printed summary.
        quiet: when true, the "Processing ..." banner is not printed.
        fn: path of the tab-separated channel file.
        row_to_fields: callable mapping a row to the disease fields other
            than 'protein_id'.
        skip_zero_conf: when true, rows whose column 6 equals '0' are
            counted and skipped.
    """
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not quiet:
        print("\nProcessing {} lines in file {}".format(line_ct, fn))
    ct = 0
    pmark = {}  # protein ids that had at least one insert attempted
    notfnd = set()  # "ensp|sym" keys already known to have no target
    dis_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        for row in tsvreader:
            ct += 1
            if skip_zero_conf and row[6] == '0':
                # skip zero confidence rows
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warning("No target found for {}".format(k))
                continue
            fields = row_to_fields(row)
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                rv = dba.ins_disease(dict(fields, protein_id=p['id']))
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct > 0:
        print("Skipped {} zero confidence rows".format(skip_ct))
    if notfnd:
        print("No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))


def load(args):
    """Load the three JensenLab DISEASES channels into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    processes the Knowledge, Experiment and Text Mining files in that order.
    Each row is mapped to targets by STRING id (falling back to symbol) and
    one disease row is inserted per target. Zero-confidence rows are skipped
    for the Experiment channel only. A summary is printed per channel.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Raises:
        AssertionError: if the dataset or provenance row cannot be inserted.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Jensen Lab DISEASES',
        'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://diseases.jensenlab.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype LIKE 'JensenLab %'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    quiet = args['--quiet']

    # Knowledge channel
    _load_jensenlab_disease_channel(
        dba, logger, logfile, quiet, DOWNLOAD_DIR + FILE_K,
        _jensenlab_knowledge_fields)

    # Experiment channel
    _load_jensenlab_disease_channel(
        dba, logger, logfile, quiet, DOWNLOAD_DIR + FILE_E,
        _jensenlab_experiment_fields, skip_zero_conf=True)

    # Text Mining channel
    _load_jensenlab_disease_channel(
        dba, logger, logfile, quiet, DOWNLOAD_DIR + FILE_T,
        _jensenlab_textmining_fields)
Example #8
0
def load(args):
  """Load LINCS L1000 ID xrefs into TCRD.

  Configures file logging, registers the dataset and its provenance, then
  reads the CSV file. Each row supplies an L1000 id, a gene symbol and a
  gene id; the first target matching the symbol (falling back to the gene
  id) gets an 'L1000 ID' xref on its first protein component. Rows with no
  matching target are recorded, logged and reported in the summary.

  Args:
    args: option mapping; the keys read here are '--loglevel', '--logfile',
      '--debug', '--quiet', '--dbhost' and '--dbname'.

  Raises:
    AssertionError: if the dataset or provenance row cannot be inserted.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] if args['--logfile'] else LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  logger.info(conn_msg)
  if not args['--quiet']:
    print("\n" + conn_msg)

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'LINCS L1000 XRefs', 'source': 'File %s'%os.path.basename(L1000_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://support.lincscloud.org/hc/en-us/articles/202092616-The-Landmark-Genes'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d"%dataset_id})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  line_ct = slmf.wcl(L1000_FILE)
  if not args['--quiet']:
    print("\nProcessing {} rows in file {}".format(line_ct, L1000_FILE))
  ct = 0
  pmark = {}  # protein ids that received an xref
  xref_ct = 0
  notfnd = set()  # "sym|geneid" keys for which no target exists
  dba_err_ct = 0
  with open(L1000_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      l1000 = row[0]
      sym = row[1]
      geneid = row[2]
      k = "%s|%s" % (sym, geneid)
      if k in notfnd:
        continue
      targets = dba.find_targets({'sym': sym})
      if not targets:
        targets = dba.find_targets({'geneid': geneid})
      if not targets:
        # Record the miss. Previously nothing was ever added to notfnd, so
        # the logging and summary below could never report unmatched rows.
        notfnd.add(k)
        continue
      # Only the first matching target is used.
      pid = targets[0]['components']['protein'][0]['id']
      rv = dba.ins_xref({'protein_id': pid, 'xtype': 'L1000 ID',
                         'dataset_id': dataset_id, 'value': l1000})
      if rv:
        xref_ct += 1
        pmark[pid] = True
      else:
        dba_err_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warning("No target found for {}".format(k))
  print("{} rows processed.".format(ct))
  print("  Inserted {} new L1000 ID xref rows for {} proteins.".format(xref_ct, len(pmark)))
  if notfnd:
    print("No target found for {} symbols/geneids. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
Example #9
0
def load(args):
    """Load HomoloGene group memberships into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    reads the tab-separated HomoloGene file. Rows whose tax id is not in
    TAXIDS are skipped. Rows with tax id 9606 are matched to targets by gene
    id; all other rows are matched to nhproteins by gene id. One homologene
    row is inserted per match, and a summary is printed at the end.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Raises:
        AssertionError: if the dataset or provenance row cannot be inserted.
    """
    quiet = args['--quiet']
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE

    # Logging goes to the log file; console output only when debugging.
    logger = logging.getLogger(__name__)
    logger.setLevel(int(args['--loglevel']))
    if not args['--debug']:
        logger.propagate = False
    handler = logging.FileHandler(logfile)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(handler)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    connected = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(connected)
    if not quiet:
        print("\n" + connected)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'HomoloGene',
        'source': 'File %s' % BASE_URL + FILENAME,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/homologene',
        'comments': 'Only Human, Mouse and Rat members of HomoloGene groups are loaded. These relate protein to nhprotein.'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'homology'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    infile = DOWNLOAD_DIR + FILENAME
    total_lines = slmf.wcl(infile)
    if not quiet:
        print("\nProcessing {} input lines in file {}".format(total_lines, infile))

    ct = skip_ct = hom_ct = nf_ct = dba_err_ct = 0
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=widgets, maxval=total_lines).start()
        # columns: homologene_group_id, tax_id, ncbi_gene_id, symbol,
        # protein_gi, ref_seq
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            pbar.update(ct)
            taxid = int(row[1])
            if taxid not in TAXIDS:
                skip_ct += 1
                continue
            gene_query = {'geneid': row[2]}
            if taxid == 9606:
                # Human rows link to the protein of each matching target.
                matches = dba.find_targets(gene_query)
                if not matches:
                    nf_ct += 1
                    logger.warn("No target found for {}".format(row))
                    continue
                id_key = 'protein_id'
                ids = (m['components']['protein'][0]['id'] for m in matches)
            else:
                # Other rows link to nhprotein records.
                matches = dba.find_nhproteins(gene_query)
                if not matches:
                    nf_ct += 1
                    logger.warn("No nhprotein found for {}".format(row))
                    continue
                id_key = 'nhprotein_id'
                ids = (m['id'] for m in matches)
            for member_id in ids:
                inserted = dba.ins_homologene({
                    id_key: member_id,
                    'groupid': row[0],
                    'taxid': taxid
                })
                if inserted:
                    hom_ct += 1
                else:
                    dba_err_ct += 1
    pbar.finish()
    print("Processed {} lines.".format(ct))
    print("Loaded {} new homologene rows".format(hom_ct))
    print("  Skipped {} non-Human/Mouse/Rat lines".format(skip_ct))
    if nf_ct > 0:
        print("WARNNING: No target/nhprotein found for {} lines. See logfile {} for details.".format(
            nf_ct, logfile))
    if dba_err_ct > 0:
        print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #10
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  
  # Dataset
  exp_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Expression', 'source': 'File Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full', 'comments': 'Qualitative expression values are generated by the loading app.'} )
  assert exp_dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  cpt_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Compartments', 'source': 'File Table S6 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full'} )
  assert cpt_dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': exp_dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HCA RNA'", 'comment': 'TPM and qualitative expression values are derived from file Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1'},
            {'dataset_id': cpt_dataset_id, 'table_name': 'compartment', 'where_clause': "ctype = 'Human Cell Atlas'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if not args['--quiet']:
    print "\nCalculating expression level percentiles"
  pctiles = calc_pctiles()
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  
  #
  # Expressions
  #
  line_ct = slmf.wcl(RNA_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines from HCA file {}".format(line_ct, RNA_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  k2pids = defaultdict(list)
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(RNA_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      sym = row[1]
      ensg = row[0]
      k = "%s|%s"%(sym,ensg)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg}, False)
        if not targets:
          notfnd.add(k)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        k2pids[k] = pids
      for pid in pids:
        cell_lines = [c.replace(' (TPM)', '') for c in header[2:]]
        for (i,cl) in enumerate(cell_lines):
          tpm_idx = i + 2 # add two because row has ENSG and Gene at beginning
          tpm = float(row[tpm_idx])
          qv = calc_qual_value( tpm, pctiles[cl] )
          rv = dba.ins_expression( {'protein_id': pid, 'etype': 'HCA RNA',
                                    'tissue': 'Cell Line '+cl, 
                                    'qual_value': qv, 'number_value': tpm} )
          if not rv:
            dba_err_ct += 1
            continue
          exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins.".format(exp_ct, len(pmark))
  if notfnd:
    print "  No target found for {} Symbols/ENSGs. See logfile {} for details".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Compartments
  #
  line_ct = slmf.wcl(LOC_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines from HCA file {}".format(line_ct, LOC_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  k2pids = defaultdict(list)
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  cpt_ct = 0
  with open(LOC_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      uniprot = row[2]
      sym = row[1]
      k = "%s|%s"%(uniprot,sym)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'uniprot': uniprot}, False)
        if not targets:
          targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(k)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        k2pids[k] = pids
      for pid in pids:
        compartments = [c for c in header[3:-5]]
        for (i,c) in enumerate(compartments):
          val_idx = i + 3 # add three because row has ENSG,Gene,Uniprot at beginning
          val = int(row[val_idx])
          if val == 0:
            continue
          rel = row[-5]
          if rel == 'Uncertain':
            continue
          rv = dba.ins_compartment( {'protein_id': pid, 'ctype': 'Human Cell Atlas',
                                     'go_id': COMPARTMENTS[c][1], 
                                     'go_term': COMPARTMENTS[c][0], 'reliability': rel} )
          if not rv:
            dba_err_ct += 1
            continue
          cpt_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new compartment rows for {} protein.s".format(cpt_ct, len(pmark))
  if notfnd:
    print "  No target found for {} UniProts/Symbols. See logfile {} for details".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #11
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'Drug Target Ontology IDs and Classifications',
        'source':
        'Files %s from Schurer Group' % (", ".join(SRC_FILES)),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://drugtargetontology.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'protein',
        'column_name': 'dtoid'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'protein',
        'column_name': 'dtoclass'
    }]
    #{'dataset_id': dataset_id, 'table_name': 'dto'} ]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    line_ct = slmf.wcl(MAPPING_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, MAPPING_FILE)
    logger.info("Processing {} input lines in file {}".format(
        line_ct, MAPPING_FILE))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    up2dto = {}
    up2pid = {}
    ct = 0
    with open(MAPPING_FILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct += 1
        upd_ct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in csvreader:
            ct += 1
            dtoid = row[0]
            up = row[1]
            logger.info("Searching for UniProt: {}".format(up))
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                notfnd.add(up)
                continue
            t = targets[0]
            pid = t['components']['protein'][0]['id']
            rv = dba.upd_protein(pid, 'dtoid', dtoid)
            if rv:
                upd_ct += 1
                up2dto[up] = dtoid
                up2pid[up] = pid
            else:
                dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for up in notfnd:
        logger.warn("No target found for UniProt: {}".format(up))
    print "{} lines processed.".format(ct)
    print "  Updated {} protein.dtoid values".format(upd_ct)
    print "Got {} UniProt to DTO mappings for TCRD targets".format(len(up2dto))
    print "Got {} UniProt to Protein ID mappings for TCRD targets".format(
        len(up2pid))
    if notfnd:
        print "WARNING: No target found for {} UniProts. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Classifications
    line_ct = slmf.wcl(CLASS_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, CLASS_FILE)
    logger.info("Processing {} input lines in file {}".format(
        line_ct, CLASS_FILE))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dto_mark = {}
    with open(CLASS_FILE) as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct += 1
        upd_ct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in csvreader:
            ct += 1
            up = row[0]
            dto_class = row[1]
            if up not in up2pid:
                notfnd.add(up)
                continue
            pid = up2pid[up]
            rv = dba.upd_protein(pid, 'dtoclass', dto_class)
            if rv:
                upd_ct += 1
            else:
                dba_err_ct += 1
            # if dto_class in dto_mark:
            #   # we've already loaded this term/tree
            #   continue
            # term_tree = extract_tree(row)

            # rv = dba.ins_dto({'id': dtoid, 'name': dtoname, 'parent': leaf_term_parent_id})
            # if rv:
            #   dto_mark[dtoid] = True
            # else:
            #   dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    for up in notfnd:
        logger.warn("UniProt {} not in map.".format(up))
    print "{} lines processed.".format(ct)
    print "  Updated {} protein.dtoclass values".format(upd_ct)
    if notfnd:
        print "WARNING: Got {} unmapped UniProts. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #12
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IMPC Mouse Clones',
        'source':
        "File %s obtained directly from Terry Meehan/Alba Gomez at EBI." %
        os.path.basename(IMPC_FILE),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.mousephenotype.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'IMPC Clones'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'IMPC Status'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    line_ct = slmf.wcl(IMPC_FILE)
    if not args['--quiet']:
        print "\nProcessing {} rows from input file {}".format(
            line_ct, IMPC_FILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    ti1_ct = 0
    ti2_ct = 0
    dba_err_ct = 0
    with open(IMPC_FILE, 'rU') as csvfile:
        csvreader = csv.DictReader(csvfile)
        for d in csvreader:
            # Gene,MGI Accession,Public IDG,Public CMG Tier1,Public CMG Tier 2,Number of notifications,Status,# Clones,Non-Assigned Plans,Assigned plans,Aborted MIs,MIs in Progress,GLT Mice,Private
            ct += 1
            sym = d['Gene'].upper()
            targets = dba.find_targets({'sym': sym})
            if not targets:
                targets = dba.find_targets_by_xref({
                    'xtype': 'MGI ID',
                    'value': d['MGI Accession']
                })
            if not targets:
                k = "%s,%s" % (d['Gene'], d['MGI Accession'])
                notfnd.add(k)
                continue
            if not d['Status'] and not d['# Clones']:
                skip_ct += 1
                continue
            tids = list()
            for t in targets:
                pid = t['components']['protein'][0]['id']
                if not d['Status']:
                    status = '?'
                else:
                    status = d['Status']
                rv = dba.ins_tdl_info({
                    'protein_id': pid,
                    'itype': 'IMPC Status',
                    'string_value': status
                })
                if rv:
                    ti1_ct += 1
                else:
                    dba_err_ct += 1
                if not d['# Clones']:
                    continue
                rv = dba.ins_tdl_info({
                    'protein_id': pid,
                    'itype': 'IMPC Clones',
                    'string_value': d['# Clones']
                })
                if rv:
                    ti2_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for: {}".format(k))
    if not args['--quiet']:
        print "{} rows processed.".format(ct)
    print "Inserted {} new 'IMPC Status' tdl_info rows".format(ti1_ct)
    print "Inserted {} new 'IMPC Clones' tdl_info rows".format(ti2_ct)
    print "Skipped {} rows with no relevant info".format(skip_ct)
    if notfnd:
        print "No target found for {} rows. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #13
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Transcription Factor Flags',
        'source': BASE_URL + FILENAME,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://humantfs.ccbr.utoronto.ca/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Is Transcription Factor'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}

    ifn = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(ifn)
    if not args['--quiet']:
        print "\nProcessing {} lines in input file {}".format(line_ct, ifn)
    with open(ifn, 'rU') as ifh:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(ifh)
        header = csvreader.next()  # skip header line
        ct = 0
        ti_ct = 0
        skip_ct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in csvreader:
            # 0 Ensembl ID
            # 1 HGNC symbol
            # 2 DBD
            # 3 Is TF?
            # 4 TF assessment
            # 5 Binding mode,Motif status
            # 6 Final Notes
            # 7 Final Comments
            # 8 Interpro ID(s)
            # 9 EntrezGene ID
            # 10 EntrezGene Description
            # 11 PDB ID
            # 12 TF tested by HT-SELEX?
            # 13 TF tested by PBM?
            # 14 Conditional Binding Requirements
            # 15 Original Comments
            # 16 Vaquerizas 2009 classification
            # 17 CisBP considers it a TF?
            # 18 TFCat classification
            # 19 Is a GO TF?
            # 20 Initial assessment
            # 21 Curator 1
            # 22 Curator 2
            # 23 TFclass considers
            ct += 1
            if row[3] != 'Yes':
                skip_ct += 1
                continue
            sym = row[1]
            targets = dba.find_targets({'sym': sym})
            if not targets:
                gid = row[9]
                if gid != 'None' and not gid.startswith('IPR'):
                    targets = dba.find_targets({'geneid': gid})
            if not targets:
                ensg = row[0]
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensg
                })
            if not targets:
                k = "%s|%s|%s" % (sym, gid, ensg)
                notfnd.add(k)
                continue
            t = targets[0]
            TDLs[t['tdl']] += 1
            pid = t['components']['protein'][0]['id']
            rv = dba.ins_tdl_info({
                'protein_id': pid,
                'itype': 'Is Transcription Factor',
                'boolean_value': 1
            })
            if rv:
                ti_ct += 1
            else:
                dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for {}".format(k))
    print "\n{} lines processed.".format(ct)
    print "  Inserted {} new 'Is Transcription Factor' tdl_infos".format(ti_ct)
    print "  Skipped {} non-TF lines".format(skip_ct)
    if notfnd:
        print "No target found for {} symbols/geneids/ENSGs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
        print "%s: %d" % (tdl, TDLs[tdl])
Example #14
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Cell Surface Protein Atlas',
        'source':
        'Worksheet B in S1_File.xlsx from http://wlab.ethz.ch/cspa/#downloads, converted to CSV',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://wlab.ethz.ch/cspa'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id':
        dataset_id,
        'table_name':
        'expression',
        'where_clause':
        "etype = 'Cell Surface Protein Atlas'",
        'comment':
        'Only high confidence values are loaded.'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print "\nProcessing {} lines from CSPA file {}".format(line_ct, INFILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    k2pids = defaultdict(list)
    notfnd = set()
    skip_ct = 0
    dba_err_ct = 0
    pmark = {}
    exp_ct = 0
    with open(INFILE, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()
        for row in csvreader:
            ct += 1
            pbar.update(ct)
            if row[2] != '1 - high confidence':
                skip_ct += 1
                continue
            uniprot = row[1]
            geneid = row[4]
            k = "%s|%s" % (uniprot, geneid)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                # look it up
                targets = dba.find_targets({'uniprot': uniprot}, False)
                if not targets:
                    targets = dba.find_targets({'geneid': geneid}, False)
                if not targets:
                    notfnd.add(k)
                    continue
                pids = []
                for t in targets:
                    pids.append(t['components']['protein'][0]['id'])
            for pid in pids:
                cell_lines = [
                    c for c in header[6:-1]
                ]  # there's a blank field at the end of the header line
                for (i, cl) in enumerate(cell_lines):
                    val_idx = i + 6  # add six because row has other values at beginning
                    if not row[val_idx]:
                        continue
                    rv = dba.ins_expression({
                        'protein_id': pid,
                        'etype': 'Cell Surface Protein Atlas',
                        'tissue': 'Cell Line ' + cl,
                        'boolean_value': True
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    exp_ct += 1
                pmark[pid] = True
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for {}".format(k))
    print "Processed {} CSPA lines.".format(ct)
    print "  Inserted {} new expression rows for {} proteins.".format(
        exp_ct, len(pmark))
    print "  Skipped {} non-high confidence rows".format(skip_ct)
    if notfnd:
        print "  No target found for {} UniProts/GeneIDs. See logfile {} for details".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #15
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'HGNC',
        'source':
        'Custom download file from https://www.genenames.org/download/custom/',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.genenames.org/',
        'comments':
        'File downloaded with the following column data: HGNC ID Approved symbol Approved name   Status  UniProt ID NCBI Gene ID    Mouse genome database ID'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id':
        dataset_id,
        'table_name':
        'protein',
        'column_name':
        'sym',
        'comment':
        "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id':
        dataset_id,
        'table_name':
        'protein',
        'column_name':
        'geneid',
        'comment':
        "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile)
            sys.exit(1)

    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}
    hgnc_ct = 0
    mgi_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    nf_ct = 0
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'rU') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: UniProt ID
            # 5: NCBI Gene ID
            # 6: Mouse genome database ID
            ct += 1
            pbar.update(ct)
            sym = row[1]
            geneid = row[5]
            up = row[4]
            targets = dba.find_targets({'sym': sym})
            if not targets:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                targets = dba.find_targets({'uniprot': up})
            if not targets:
                nf_ct += 1
                #logger.warn("No target found for {}|{}|{}".format(sym, geneid, up))
                continue
            for t in targets:
                p = t['components']['protein'][0]
                pid = p['id']
                tmark[pid] = True
                # HGNC xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'HGNC',
                    'dataset_id': dataset_id,
                    'value': row[0]
                })
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'MGI ID',
                    'dataset_id': dataset_id,
                    'value': row[6]
                })
                if rv:
                    mgi_ct += 1
                else:
                    db_err_ct += 1
                # Add missing syms
                if p['sym'] == None:
                    rv = dba.upd_protein(pid, 'sym', sym)
                    if rv:
                        logger.info(
                            "Inserted new sym {} for protein {}, {}".format(
                                sym, pid, p['uniprot']))
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                else:
                    # Check for symbol discrepancies
                    if p['sym'] != sym:
                        logger.warn("Symbol discrepancy: UniProt=%s, HGNC=%s" %
                                    (p['sym'], sym))
                        symdiscr_ct += 1
                if geneid:
                    # Add missing geneids
                    if p['geneid'] == None:
                        rv = dba.upd_protein(pid, 'geneid', geneid)
                        if rv:
                            logger.info(
                                "Inserted new geneid {} for protein {}, {}".
                                format(geneid, pid, p['uniprot']))
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    else:
                        # Check for geneid discrepancies
                        if p['geneid'] != int(geneid):
                            logger.warn(
                                "GeneID discrepancy: UniProt={}, HGNC={}".
                                format(p['geneid'], geneid))
                            geneiddiscr_ct += 1
    pbar.finish()
    print "Processed {} lines - {} targets annotated.".format(ct, len(tmark))
    print "No target found for {} lines.".format(nf_ct)
    print "  Inserted {} HGNC ID xrefs".format(hgnc_ct)
    print "  Inserted {} MGI ID xrefs".format(mgi_ct)
    if sym_ct > 0:
        print "  Added {} new HGNC symbols".format(sym_ct)
    if symdiscr_ct > 0:
        print "WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(
            symdiscr_ct, logfile)
    if geneid_ct > 0:
        print "  Added {} new NCBI Gene IDs".format(geneid_ct)
    if geneiddiscr_ct > 0:
        print "WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(
            geneiddiscr_ct, logfile)
    if db_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            db_err_ct, logfile)
Example #16
0
def load(args):
    """Load ortholog disease associations from the CSV file FILENAME into TCRD.

    For every input row the target is looked up by UniProt accession and,
    failing that, by HGNC symbol; the ortholog is looked up by symbol/taxid
    and, failing that, by geneid/taxid. One ortholog_disease row is inserted
    per matching target. A summary is printed when the file is exhausted.

    Args:
        args: Option mapping providing '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: If the dataset or provenance insert returns a falsy
            value.
    """
    # Logging goes to a file; records reach the console only with --debug.
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False
    file_handler = logging.FileHandler(logfile)
    file_handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbname = args['--dbname']
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            dbname, dbi['schema_ver'], dbi['data_ver']))
    verbose = not args['--quiet']
    if verbose:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".
              format(dbname, dbi['schema_ver'], dbi['data_ver']))

    # Mapping from ortholog database ids (e.g. 'MGI:1347010') to TCRD
    # ortholog.id values (e.g. 156650). Only its size is reported here.
    ortho2id = dba.get_orthologs_dbid2id()
    if verbose:
        print("\nGot {} orthologs from TCRD".format(len(ortho2id)))

    # Dataset record, followed by its single provenance record.
    dataset_id = dba.ins_dataset({
        'name': 'Monarch Ortholog Disease Associations',
        'source': 'UMiami Monarch MySQL database on AWS server.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': "Monarch database contact: John Turner <*****@*****.**>"
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'ortholog_disease',
        'comment': ""
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    # NOTE: records are read from FILENAME; an earlier code path that queried
    # the Monarch MySQL database directly was disabled at this point.

    line_ct = slmf.wcl(FILENAME)
    logger.info("Processing {} lines in file {}".format(line_ct, FILENAME))
    if verbose:
        print("\nProcessing {} lines in file {}".format(line_ct, FILENAME))
    pbar = ProgressBar(
        widgets=[
            'Progress: ',
            Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ',
            ETA()
        ],
        maxval=line_ct).start()

    ct = 0  # input rows read
    od_ct = 0  # ortholog_disease rows inserted
    dba_err_ct = 0  # failed inserts
    notfnd = set()  # "uniprot|symbol" keys with no target
    ortho_notfnd = set()  # "symbol|geneid|taxid" keys with no ortholog
    proteins_seen = set()  # protein ids that had an insert attempted
    with open(FILENAME, 'rU') as ifh:
        # Columns: HGNC Sym, UniProt, name, did, score, Ortholog TaxID,
        # Ortholog Species, Ortholog DBID, Ortholog GeneID, Ortholog Symbol
        for ct, row in enumerate(csv.reader(ifh), 1):
            sym, up = row[0], row[1]
            targets = (dba.find_targets({'uniprot': up})
                       or dba.find_targets({'sym': sym}))
            if not targets:
                missing = "%s|%s" % (up, sym)
                notfnd.add(missing)
                logger.warn("No target found for {}".format(missing))
                continue

            taxid, ortho_geneid, ortho_sym = row[5], row[8], row[9]
            ortholog = (dba.get_ortholog({'symbol': ortho_sym, 'taxid': taxid})
                        or dba.get_ortholog({'geneid': ortho_geneid,
                                             'taxid': taxid}))
            if not ortholog:
                missing = "%s|%s|%s" % (ortho_sym, ortho_geneid, taxid)
                ortho_notfnd.add(missing)
                logger.warn("No ortholog found for {}".format(missing))
                continue

            for target in targets:
                protein_id = target['components']['protein'][0]['id']
                proteins_seen.add(protein_id)
                inserted = dba.ins_ortholog_disease({
                    'protein_id': protein_id,
                    'dtype': 'Monarch',
                    'ortholog_id': ortholog['id'],
                    'name': row[2],
                    'did': row[3],
                    'score': row[4]
                })
                if inserted:
                    od_ct += 1
                else:
                    dba_err_ct += 1
            # Only reached for rows that resolved both target and ortholog.
            pbar.update(ct)
        # Account for trailing rows that were skipped via `continue`.
        pbar.update(ct)
    pbar.finish()

    print("{} lines processed.".format(ct))
    print("  Inserted {} new ortholog_disease rows for {} proteins.".format(
        od_ct, len(proteins_seen)))
    if notfnd:
        print("WARNING: No target found for {} UniProts/symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if ortho_notfnd:
        print("WARNING: No ortholog found for {} symbols/geneids. See logfile {} for details.".format(
            len(ortho_notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #17
0
def load(args):
    """Load JAX/MGI human-ortholog phenotypes from PT_FILE into TCRD.

    Each input row with a non-empty column 6 (whitespace-separated MP term
    ids) is matched to targets by gene symbol and, failing that, by gene id.
    One phenotype row is inserted per (protein, MP term) pair, with the term
    name taken from the parsed Mammalian Phenotype Ontology.

    Args:
        args: Option mapping providing '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: If the dataset or provenance insert returns a falsy
            value.
    """
    # Single source of truth for the phenotype type so that the provenance
    # where_clause always matches the rows inserted below. (The where_clause
    # previously carried a truncated value and matched nothing.)
    ptype = 'JAX/MGI Human Ortholog Phenotype'

    # Logging goes to a file; records reach the console only with --debug.
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    quiet = args['--quiet']
    if not quiet:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".
              format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'JAX/MGI Mouse/Human Orthology Phenotypes',
        'source': 'File %s from ftp.informatics.jax.org' % PT_FILE,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.informatics.jax.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = '%s'" % ptype
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    if not quiet:
        print("\nParsing Mammalian Phenotype Ontology file {}".format(
            DOWNLOAD_DIR + MPO_OWL_FILE))
    mpo = parse_mp_owl(MPO_OWL_FILE)
    if not quiet:
        print("Got {} MP terms".format(len(mpo)))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    fn = DOWNLOAD_DIR + PT_FILE
    line_ct = slmf.wcl(fn)
    if not quiet:
        print("\nProcessing {} lines from input file {}".format(line_ct, fn))

    ct = 0  # input rows read
    pt_ct = 0  # phenotype rows inserted
    skip_ct = 0  # rows without MP terms
    dba_err_ct = 0  # failed inserts
    pmark = set()  # protein ids that had an insert attempted
    notfnd = set()  # "symbol|geneid" keys with no target
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            mp_terms = row[6]
            if not mp_terms:
                skip_ct += 1
                continue
            sym = row[0]
            geneid = row[1]
            k = "%s|%s" % (sym, geneid)
            if k in notfnd:
                # Already known to be unmatched; avoid repeating the lookups.
                continue
            targets = (dba.find_targets({'sym': sym}, idg=False)
                       or dba.find_targets({'geneid': geneid}, idg=False))
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            for t in targets:
                pid = t['components']['protein'][0]['id']
                pmark.add(pid)
                for mpid in mp_terms.split():
                    # NOTE(review): mpo[mpid] fails if a term id in the input
                    # is absent from the parsed ontology - confirm whether
                    # that can happen with the files in use.
                    rv = dba.ins_phenotype({
                        'protein_id': pid,
                        'ptype': ptype,
                        'term_id': mpid,
                        'term_name': mpo[mpid]['name']
                    })
                    if rv:
                        pt_ct += 1
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()

    print("{} lines processed.".format(ct))
    print("Loaded {} new phenotype rows for {} proteins".format(
        pt_ct, len(pmark)))
    print("  Skipped {} lines with no MP terms".format(skip_ct))
    if notfnd:
        print("No target found for {} gene symbols/ids. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #18
0
def load(args):
    """Load PANTHER protein classes and protein-to-class links into TCRD.

    Runs three passes:
      1. RELN_FILE: collect the parent class ids of each class.
      2. CLASS_FILE: insert one panther_class row per distinct class id and
         remember the value returned by the insert for that class.
      3. P2PC_FILE: for each row with classes in column 8, find the target by
         UniProt accession (falling back to an HGNC xref) and insert one p2pc
         row per class id found in that column.

    Args:
        args: Option mapping providing '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: If the dataset or a provenance insert returns a falsy
            value.
    """
    # Logging goes to a file; records reach the console only with --debug.
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    quiet = args['--quiet']
    if not quiet:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".
              format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'PANTHER protein classes',
        'source':
        'File %s from ftp://ftp.pantherdb.org//sequence_classifications/current_release/PANTHER_Sequence_Classification_files/, and files %s and %s from http://data.pantherdb.org/PANTHER14.1/ontology/'
        % (os.path.basename(P2PC_FILE), os.path.basename(CLASS_FILE),
           os.path.basename(RELN_FILE)),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.pantherdb.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance: one record per table this loader writes.
    for table_name in ('panther_class', 'p2pc'):
        rv = dba.ins_provenance({
            'dataset_id': dataset_id,
            'table_name': table_name
        })
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    #
    # Pass 1: class -> list of parent class ids
    #
    relns = {}
    line_ct = slmf.wcl(RELN_FILE)
    if not quiet:
        print("\nProcessing {} lines in relationships file {}".format(
            line_ct, RELN_FILE))
    ct = 0
    with open(RELN_FILE, 'rU') as tsv:
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            # Column 0 is the class id, column 2 its parent.
            relns.setdefault(row[0], []).append(row[2])
    print("{} input lines processed.".format(ct))
    print("  Got {} PANTHER Class relationships".format(len(relns)))

    #
    # Pass 2: insert panther_class rows
    #
    pc2dbid = {}  # class id -> return value of ins_panther_class
    line_ct = slmf.wcl(CLASS_FILE)
    if not quiet:
        print("\nProcessing {} lines in class file {}".format(
            line_ct, CLASS_FILE))
    ct = 0
    pc_ct = 0
    dba_err_ct = 0
    with open(CLASS_FILE, 'rU') as tsv:
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            pc = row[0]
            # The class file contains duplicates; only the first occurrence
            # of each class id is inserted.
            if pc in pc2dbid:
                continue
            init = {'pcid': pc, 'name': row[2]}
            if row[3]:
                init['desc'] = row[3]
            if pc in relns:
                init['parent_pcids'] = "|".join(relns[pc])
            rv = dba.ins_panther_class(init)
            if rv:
                pc_ct += 1
            else:
                dba_err_ct += 1
            pc2dbid[pc] = rv
    print("{} lines processed.".format(ct))
    print("  Inserted {} new panther_class rows".format(pc_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    #
    # Pass 3: insert p2pc rows
    #
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(P2PC_FILE)
    regex = re.compile(r'#(PC\d{5})')
    if not quiet:
        print("\nProcessing {} lines in classification file {}".format(
            line_ct, P2PC_FILE))
    ct = 0
    # Start at zero. This was previously written as the octal literal `02`,
    # which inflated the reported skip count by two.
    skip_ct = 0
    p2pc_ct = 0
    dba_err_ct = 0
    pmark = set()  # protein ids that had an insert attempted
    notfnd = set()  # "uniprot|hgnc" keys with no target
    with open(P2PC_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            # Column 0 has three '|'-separated fields; the last two carry the
            # 'HGNC=' and 'UniProtKB=' prefixed identifiers.
            _sp, hgnc, up = row[0].split('|')
            up = up.replace('UniProtKB=', '')
            hgnc = hgnc.replace('HGNC=', '')
            if not row[8]:
                skip_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                # Fall back to the HGNC cross-reference.
                targets = dba.find_targets_by_xref({
                    'xtype': 'HGNC',
                    'value': hgnc
                })
            if not targets:
                notfnd.add("%s|%s" % (up, hgnc))
                continue
            # Only the first matching target is used.
            pid = targets[0]['components']['protein'][0]['id']
            pmark.add(pid)
            for pc in regex.findall(row[8]):
                rv = dba.ins_p2pc({
                    'protein_id': pid,
                    'panther_class_id': pc2dbid[pc]
                })
                if rv:
                    p2pc_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()

    for k in notfnd:
        logger.warn("No target found for {}".format(k))
    print("{} lines processed.".format(ct))
    print("  Inserted {} new p2pc rows for {} distinct proteins".format(
        p2pc_ct, len(pmark)))
    print("  Skipped {} rows without PCs".format(skip_ct))
    if notfnd:
        print("No target found for {} UniProt/HGNCs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #19
0
def load(args):
    """Load DrugCentral drug activities and indications into TCRD.

    Steps:
      1. Read NAME_ID_FILE (drug name -> DrugCentral id) and DRUGINFO_FILE
         (drug name -> drug info text).
      2. Insert drug_activity rows from TCLIN_FILE (has_moa=1) and TCHEM_FILE
         (has_moa=0), remembering which target ids each drug was linked to.
      3. Insert disease rows from DRUGIND_FILE for every target id linked to
         the row's drug in step 2.

    Args:
        args: Option mapping providing '--loglevel', '--logfile', '--debug',
            '--dbhost', '--dbname' and '--quiet'.

    Raises:
        SystemExit: With status 1 if the dataset or a provenance insert
            returns a falsy value.
    """
    # Logging goes to a file; records reach the console only with --debug.
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False
    fh = logging.FileHandler(logfile)
    fh.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(fh)

    # DBAdaptor is given this module's logger name.
    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    quiet = args['--quiet']
    if not quiet:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".
              format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Drug Central',
        'source':
        "Drug Central files download files: %s" % ", ".join(SRC_FILES),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://drugcentral.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset. See logfile {} for details.".format(
            logfile))
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'drug_activity'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'DrugCentral Indication'"
    }]
    for prov in provs:
        if not dba.ins_provenance(prov):
            print("WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile))
            sys.exit(1)

    # First get mapping of DrugCentral names to ids
    line_ct = slmf.wcl(NAME_ID_FILE)
    if not quiet:
        print("\nProcessing {} input lines in file {}".format(
            line_ct, NAME_ID_FILE))
    ct, name2id = _read_tsv_map(NAME_ID_FILE)
    print("{} input lines processed.".format(ct))
    # This message previously said "infos map" (copied from the block below).
    print("Saved {} keys in name2id map".format(len(name2id)))

    # Next get drug info fields
    line_ct = slmf.wcl(DRUGINFO_FILE)
    if not quiet:
        print("\nProcessing {} input lines in file {}".format(
            line_ct, DRUGINFO_FILE))
    ct, infos = _read_tsv_map(DRUGINFO_FILE)
    print("{} input lines processed.".format(ct))
    print("Saved {} keys in infos map".format(len(infos)))

    # drug name -> target ids it was linked to by either activity file
    drug2tids = defaultdict(list)

    #
    # MOA activities
    #
    line_ct = slmf.wcl(TCLIN_FILE) - 1  # header line is not a data line
    if not quiet:
        print("\nProcessing {} lines from DrugDB MOA activities file {}".format(
            line_ct, TCLIN_FILE))
    ct, da_ct, err_ct, notfnd, dba_err_ct = _load_drug_activities(
        dba, logger, TCLIN_FILE, 1, name2id, infos, drug2tids)
    print("{} DrugCentral Tclin rows processed.".format(ct))
    print("  Inserted {} new drug_activity rows".format(da_ct))
    if notfnd:
        print("WARNNING: {} Uniprot/Swissprot Accessions NOT FOUND in TCRD:".format(
            len(notfnd)))
        for up in notfnd:
            print(up)
    if err_ct > 0:
        print("WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(
            err_ct, logfile))
    if dba_err_ct > 0:
        print("WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    #
    # Non-MOA activities
    #
    line_ct = slmf.wcl(TCHEM_FILE) - 1  # header line is not a data line
    if not quiet:
        print("\nProcessing {} lines from Non-MOA activities file {}".format(
            line_ct, TCHEM_FILE))
    ct, da_ct, err_ct, notfnd, dba_err_ct = _load_drug_activities(
        dba, logger, TCHEM_FILE, 0, name2id, infos, drug2tids)
    print("{} DrugCentral Tchem rows processed.".format(ct))
    print("  Inserted {} new drug_activity rows".format(da_ct))
    if notfnd:
        print("WARNNING: {} DrugDB Uniprot Accessions NOT FOUND in TCRD:".format(
            len(notfnd)))
        for up in notfnd:
            print(up)
    if err_ct > 0:
        print("WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(
            err_ct, logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    #
    # Indications (diseases)
    #
    line_ct = slmf.wcl(DRUGIND_FILE) - 1  # header line is not a data line
    if not quiet:
        print("\nProcessing {} lines from indications file {}".format(
            line_ct, DRUGIND_FILE))
    ct = 0
    t2d_ct = 0
    dba_err_ct = 0
    notfnd = set()  # drug names that no activity row linked to a target
    with open(DRUGIND_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        next(tsvreader)  # skip header line
        # DRUG_ID DRUG_NAME       INDICATION_FDB  UMLS_CUI        SNOMEDCT_CUI    DOID
        for row in tsvreader:
            ct += 1
            drug = row[1]
            if drug not in drug2tids:
                notfnd.add(drug)
                continue
            # 'protein_id' is filled in per target below. It was previously
            # seeded here from a `tid` left over from an earlier loop.
            init = {
                'dtype': 'DrugCentral Indication',
                'name': row[2],
                'drug_name': drug
            }
            if row[5] != '':
                init['did'] = row[5]
            for tid in drug2tids[drug]:
                # NB> Using target_id as protein_id works for now, but will not if/when we have multiple protein targets
                init['protein_id'] = tid
                if dba.ins_disease(init):
                    t2d_ct += 1
                else:
                    dba_err_ct += 1
    print("{} DrugCentral indication rows processed.".format(ct))
    print("  Inserted {} new disease rows".format(t2d_ct))
    if notfnd:
        print("WARNNING: {} drugs NOT FOUND in activity files:".format(
            len(notfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))


def _read_tsv_map(fn):
    """Read a tab-separated file into a {column 0: column 1} dict.

    Rows whose first column starts with '#' are counted but not stored.
    Newlines embedded in the value column are removed.

    Returns:
        Tuple (rows read, mapping).
    """
    mapping = {}
    ct = 0
    with open(fn, 'rU') as tsv:
        for row in csv.reader(tsv, delimiter='\t'):
            ct += 1
            if row[0].startswith('#'):
                continue
            mapping[row[0]] = row[1].replace("\n", '')
    return ct, mapping


def _load_drug_activities(dba, logger, fn, has_moa, name2id, infos,
                          drug2tids):
    """Insert one drug_activity row per usable data row of an activity file.

    Expected columns (after a header line):
    uniprot, swissprot, drug_name, act_value, act_type, action_type,
    source_name, reference, smiles, ChEMBL_Id.

    Args:
        dba: Database adaptor providing find_targets and ins_drug_activity.
        logger: Logger used to record drug names with no DrugCentral id.
        fn: Path of the tab-separated activity file.
        has_moa: Value stored in each row's 'has_moa' field.
        name2id: Drug name -> DrugCentral id.
        infos: Drug name -> drug info text.
        drug2tids: Drug name -> list of target ids; appended to in place.

    Returns:
        Tuple (rows read, rows inserted, rows with unknown drug name,
        list of UniProt accessions with no target, failed inserts).
    """
    # (column index, drug_activity field) for values stored only when
    # non-empty.
    # NOTE(review): the non-MOA loop previously used the key 'chemblid'
    # where the MOA loop used 'cmpd_chemblid'; unified on the latter -
    # confirm against DBAdaptor.ins_drug_activity.
    optional_fields = ((3, 'act_value'), (4, 'act_type'),
                       (5, 'action_type'), (7, 'reference'), (8, 'smiles'),
                       (9, 'cmpd_chemblid'))
    ct = 0
    da_ct = 0
    err_ct = 0
    dba_err_ct = 0
    notfnd = []
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        next(tsvreader)  # skip header line
        for row in tsvreader:
            ct += 1
            up = row[0]
            sp = row[1]
            drug = row[2]
            if drug not in name2id:
                err_ct += 1
                logger.warn("No DrugCentral id found for {}".format(drug))
                continue
            targets = (dba.find_targets({'uniprot': up})
                       or dba.find_targets({'name': sp}))
            if not targets:
                notfnd.append(up)
                continue
            # Only the first matching target is used.
            tid = targets[0]['id']
            drug2tids[drug].append(tid)
            init = {
                'target_id': tid,
                'drug': drug,
                'dcid': name2id[drug],
                'has_moa': has_moa,
                # source_name column. This previously defaulted to column 5
                # (action_type) whenever source_name was empty.
                'source': row[6]
            }
            for idx, field in optional_fields:
                if row[idx]:
                    init[field] = row[idx]
            if drug in infos:
                init['nlm_drug_info'] = infos[drug]
            if dba.ins_drug_activity(init):
                da_ct += 1
            else:
                dba_err_ct += 1
    return ct, da_ct, err_ct, notfnd, dba_err_ct
Example #20
0
def load(args):
    """Load Guide to Pharmacology ligand activities into TCRD.

    Sets up file logging, registers the dataset and its provenance, reads the
    ligands file (DOWNLOAD_DIR + L_FILE) and then the interactions file
    (DOWNLOAD_DIR + I_FILE), and inserts one cmpd_activity row for every
    matching TCRD target whose median affinity meets the cutoff chosen for
    the target's family.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Exits the process with status 1 if the dataset or provenance insert
    fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Guide to Pharmacology',
        'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.guidetopharmacology.org/'
    })
    if not dataset_id:
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
        sys.exit(1)
    # Provenance
    # NOTE(review): this where_clause filters on "ctype" while the rows below
    # are inserted with the key 'catype' -- confirm the column name against
    # the cmpd_activity schema.
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'cmpd_activity',
        'where_clause': "ctype = 'Guide to Pharmacology'"
    })
    if not rv:
        print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    # Minimum activity value per target family; any other family uses
    # default_cutoff.
    family_cutoffs = {
        'GPCR': 7.0,  # 100nM
        'IC': 5.0,  # 10uM
        'Kinase': 7.52288,  # 30nM
        'NR': 7.0,  # 100nM
    }
    default_cutoff = 6.0  # 1uM for non-IDG Family targets

    #
    # Pass 1: ligands file -> ligand_id => name / PubChem CID / SMILES
    #
    fn = DOWNLOAD_DIR + L_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print("\nProcessing {} lines in input file {}".format(line_ct, fn))
    ligands = {}
    lig_skip_ct = 0
    with open(fn, 'rU') as ifh:
        csvreader = csv.reader(ifh)
        next(csvreader)  # skip header line
        ct = 1
        for row in csvreader:
            # These are the fields in version 2019.2
            # 0 Ligand id               The GtP ligand identifier
            # 1 Name                    The name of the ligand
            # 2 Species                 (Peptides) The species which endogenously express a particular peptide ligand sequence
            # 3 Type                    The type of chemical
            # 4 Approved                The drug is or has in the past been approved for human clinical use by a regulatory agency
            # 5 Withdrawn               The drug is no longer approved for its original clinical use in one or more countries
            # 6 Labelled                The ligand has been labelled with a chemical group such as a fluorscent tag or unstable isotope
            # 7 Radioactive             Ligand has been labelled with radioactive isotope
            # 8 PubChem SID             The PubChem Substance identifier assigned when we deposited the ligand in PubChem
            # 9 PubChem CID             Our curated PubChem Compound database link
            # 10 UniProt id              (Peptides) The UniProtKB/SwissProt Accession for peptide sequences
            # 11 IUPAC name              The IUPAC chemical name
            # 12 INN                 The International Non-proprietary Name assigned by the WHO
            # 13 Synonyms                Commonly used synonyms from the literature
            # 14 SMILES                  Specification of the chemical structure in canonical, isomeric SMILES format
            # 15 InChIKey                A hashed version of the full InChI designed for easy web searches of chemical compounds
            # 16 InChI                   A textual identifier for the chemical structure
            ct += 1
            ligand_id = int(row[0])
            if row[3] in ('Antibody', 'Peptide'):
                lig_skip_ct += 1
                continue
            ligands[ligand_id] = {
                'name': row[1],
                'pubchem_cid': row[9],
                'smiles': row[14]
            }
    if not args['--quiet']:
        print("  Got info for {} ligands".format(len(ligands)))
        print("  Skipped {} antibodies/peptides".format(lig_skip_ct))

    #
    # Pass 2: interactions file -> cmpd_activity rows
    #
    # this dict will map uniprot|sym from interactions file to TCRD target(s)
    # so we only have to find target(s) once for each pair.
    k2ts = {}
    fn = DOWNLOAD_DIR + I_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print("\nProcessing {} lines in input file {}".format(line_ct, fn))
    with open(fn, 'rU') as ifh:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(ifh)
        next(csvreader)  # skip header line
        ct = 1
        tmark = set()  # ids of targets that received at least one activity
        ca_ct = 0  # cmpd_activity rows inserted
        ap_ct = 0  # rows whose ligand was skipped in pass 1 (or is unknown)
        md_ct = 0  # rows with missing activity value or target identifiers
        ba_ct = 0  # activities below the family cutoff
        mt_ct = 0  # rows listing several UniProt accessions ('|'-separated)
        notfnd = set()
        dba_err_ct = 0
        for row in csvreader:
            # NB. these do NOT match the file descriptions on the site. This is directly from the header
            # 0 target
            # 1 target_id
            # 2 target_gene_symbol
            # 3 target_uniprot
            # 4 target_ensembl_gene_id
            # 5 target_ligand
            # 6 target_ligand_id
            # 7 target_ligand_gene_symbol
            # 8 target_ligand_ensembl_gene_id
            # 9 target_ligand_uniprot
            # 10 target_ligand_pubchem_sid
            # 11 target_species
            # 12 ligand
            # 13 ligand_id
            # 14 ligand_gene_symbol
            # 15 ligand_species
            # 16 ligand_pubchem_sid
            # 17 type
            # 18 action
            # 19 action_comment
            # 20 selectivity
            # 21 endogenous
            # 22 primary_target
            # 23 concentration_range
            # 24 affinity_units
            # 25 affinity_high
            # 26 affinity_median
            # 27 affinity_low
            # 28 original_affinity_units
            # 29 original_affinity_low_nm
            # 30 original_affinity_median_nm
            # 31 original_affinity_high_nm
            # 32 original_affinity_relation
            # 33 assay_description
            # 34 receptor_site
            # 35 ligand_context
            # 36 pubmed_id
            ct += 1
            pbar.update(ct)
            lid = int(row[13])
            if lid not in ligands:
                ap_ct += 1
                continue
            if row[26] == '':  # no activity value
                md_ct += 1
                continue
            if '|' in row[3]:
                mt_ct += 1
                continue
            # Keep the numeric value for the cutoff test; only the stored
            # value is formatted as a string. Comparing the formatted string
            # with a float never filters anything on Python 2.
            act_value = float(row[26])
            val = "%.8f" % act_value
            act_type = row[28]
            up = row[3]
            sym = row[2]
            k = "%s|%s" % (up, sym)
            if k == '|':
                md_ct += 1
                continue
            if k in k2ts:
                # already found target(s)
                ts = k2ts[k]
            elif k in notfnd:
                # already didn't find target(s)
                continue
            else:
                # lookup target(s): by UniProt first, then by symbol
                targets = dba.find_targets({'uniprot': up})
                if not targets:
                    targets = dba.find_targets({'sym': sym})
                    if not targets:
                        notfnd.add(k)
                        logger.warn("No target found for {}".format(k))
                        continue
                ts = [{'id': t['id'], 'fam': t['fam']} for t in targets]
                k2ts[k] = ts
            pmids = row[36] or None
            pccid = ligands[lid]['pubchem_cid'] or None
            for t in ts:
                cutoff = family_cutoffs.get(t['fam'], default_cutoff)
                if act_value < cutoff:
                    ba_ct += 1
                    continue
                # target is Tchem, save activity
                tmark.add(t['id'])
                rv = dba.ins_cmpd_activity({
                    'target_id': t['id'],
                    'catype': 'Guide to Pharmacology',
                    'cmpd_id_in_src': lid,
                    'cmpd_name_in_src': ligands[lid]['name'],
                    'smiles': ligands[lid]['smiles'],
                    'act_value': val,
                    'act_type': act_type,
                    'pubmed_ids': pmids,
                    'cmpd_pubchem_cid': pccid
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                ca_ct += 1
    pbar.finish()
    print("{} rows processed.".format(ct))
    print("  Inserted {} new cmpd_activity rows for {} targets".format(
        ca_ct, len(tmark)))
    print("  Skipped {} with below cutoff activity values".format(ba_ct))
    print("  Skipped {} activities with multiple targets".format(mt_ct))
    print("  Skipped {} antibody/peptide activities".format(ap_ct))
    print("  Skipped {} activities with missing data".format(md_ct))
    if notfnd:
        print("No target found for {} uniprots/symbols. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #21
0
def calc_and_load(args):
  """Compute KEGG shortest-path distances and load them into TCRD.

  Sets up file logging, registers the 'KEGG Distances' dataset and its
  provenance, turns every KGML file returned by get_kgmls(KGML_DIR) into a
  digraph, keeps the smallest shortest-path length seen for each ordered
  (source, target) node pair across all files, and inserts one kegg_distance
  row for every pair of proteins the two nodes resolve to.

  Args:
    args: option mapping; the keys read here are '--loglevel', '--logfile',
      '--debug', '--quiet', '--dbhost' and '--dbname'.

  Raises:
    AssertionError: if the dataset or provenance insert fails.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] if args['--logfile'] else LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'KEGG Distances', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Directed graphs are produced from KEGG pathway KGML files and all shortest path lengths are then calculated and stored.'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'kegg_distance'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  kgmls = get_kgmls(KGML_DIR)

  # Always write the log line; only the console echo depends on --quiet.
  logger.info("Processing {} KGML files in {}".format(len(kgmls), KGML_DIR))
  if not args['--quiet']:
    print("\nProcessing {} KGML files in {}".format(len(kgmls), KGML_DIR))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(kgmls)).start()
  # All pathways shortest path lengths
  # (node1, node2) => distance
  all_pws_spls = {}
  ct = 0
  err_ct = 0
  for kgml in kgmls:
    logger.info("  Working on {}".format(kgml))
    ct += 1
    pbar.update(ct)
    try:
      dig = kg.kgml_file_to_digraph(kgml)
    except Exception:
      # Count and log the failure (with traceback) but keep going with the
      # remaining files. KeyboardInterrupt/SystemExit are not swallowed.
      err_ct += 1
      logger.exception("Error parsing file: {}".format(kgml))
      continue
    # dict() accepts both a mapping (networkx 1.x) and an iterator of
    # (source, {target: length}) pairs (networkx 2.x and later).
    aspls = dict(nx.all_pairs_shortest_path_length(dig))
    dct = 0
    for source, lengths in aspls.items():
      for target, length in lengths.items():
        if source == target:
          continue
        st = (source, target)
        # keep the smallest length seen for this pair across all pathways
        if st not in all_pws_spls or length < all_pws_spls[st]:
          all_pws_spls[st] = length
          dct += 1
    logger.info("  {} has {} non-zero shortest path lengths".format(kgml, dct))
  pbar.finish()
  logger.info("Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls)))
  if not args['--quiet']:
    print("  Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls)))
  if err_ct > 0:
    print("WARNING: {} parsing errors occurred. See logfile {} for details.".format(err_ct, logfile))

  logger.info("Processing {} KEGG Distances".format(len(all_pws_spls)))
  if not args['--quiet']:
    print("\nProcessing {} KEGG Distances".format(len(all_pws_spls)))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(all_pws_spls)).start()
  gid2pids = {} # So we only find each target once,
                # save protein.geneid => protein.id(s)
  notfnd = set()

  def find_pids(geneid):
    """Return the protein ids for a gene id, or None when no target exists."""
    if geneid in gid2pids:
      return gid2pids[geneid]
    if geneid in notfnd:
      return None
    targets = dba.find_targets({'geneid': geneid})
    if not targets:
      notfnd.add(geneid) # add to notfnd so we don't try looking it up again
      logger.warn("No target found for KEGG Gene ID {}".format(geneid))
      return None
    pids = [t['components']['protein'][0]['id'] for t in targets]
    gid2pids[geneid] = pids
    return pids

  ct = 0
  skip_ct = 0
  kd_ct = 0
  dba_err_ct = 0
  for st,dist in all_pws_spls.items():
    ct += 1
    pbar.update(ct)
    geneid1 = re.sub(r'^hsa:', '', st[0])
    geneid2 = re.sub(r'^hsa:', '', st[1])
    pids1 = find_pids(geneid1)
    if pids1 is None:
      skip_ct += 1
      continue
    pids2 = find_pids(geneid2)
    if pids2 is None:
      skip_ct += 1
      continue
    for pid1 in pids1:
      for pid2 in pids2:
        rv = dba.ins_kegg_distance({'pid1': pid1, 'pid2': pid2, 'distance': dist})
        if rv:
          kd_ct += 1
        else:
          dba_err_ct += 1
  pbar.finish()
  print("{} KEGG Distances processed.".format(ct))
  print("  Inserted {} new kegg_distance rows".format(kd_ct))
  if skip_ct > 0:
    print("  {} KEGG IDs not found in TCRD - Skipped {} rows. See logfile {} for details.".format(len(notfnd), skip_ct, logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
Example #22
0
def load(args):
    """Load EBI patent counts into TCRD.

    Configures a file logger, records the 'EBI Patent Counts' dataset and its
    two provenance rows, inserts one patent_count row for each input line
    whose UniProt accession resolves to a target, and finally stores each
    protein's summed count as an 'EBI Total Patent Count' tdl_info row.

    args is the option mapping; the keys read here are '--loglevel',
    '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.
    """
    level = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(level)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    handler = logging.FileHandler(logfile)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(handler)

    # DBAdaptor uses same logger as main()
    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbinfo = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbinfo['schema_ver'], dbinfo['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbinfo['schema_ver'], dbinfo['data_ver']))

    # Register the dataset itself
    dataset_id = dba.ins_dataset({
        'name': 'EBI Patent Counts',
        'source': 'File %s' % BASE_URL + FILENAME,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.surechembl.org/search/',
        'comments': 'Patents from SureChEMBL were tagged using the JensenLab tagger.'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # ... and the tables / rows it is responsible for
    provenances = (
        {'dataset_id': dataset_id, 'table_name': 'patent_count'},
        {'dataset_id': dataset_id,
         'table_name': 'tdl_info',
         'where_clause': "itype = 'EBI Total Patent Count'"},
    )
    for provenance in provenances:
        ok = dba.ins_provenance(provenance)
        assert ok, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    totals = {}  # protein id => patent count summed over all of its lines
    missing = set()  # accessions with no target, even via the alias lookup
    inserted = 0
    db_errors = 0
    path = DOWNLOAD_DIR + FILENAME
    n_lines = slmf.wcl(path)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(n_lines, path))
    with open(path, 'rU') as csvfile:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=n_lines).start()
        reader = csv.reader(csvfile)
        next(reader)  # skip header line
        ct = 0
        for ct, row in enumerate(reader, 1):
            accession = row[0]
            # direct UniProt match first, alias table as the fallback
            targets = (dba.find_targets({'uniprot': accession}) or
                       dba.find_targets_by_alias({
                           'type': 'UniProt',
                           'value': accession
                       }))
            if not targets:
                missing.add(accession)
                continue
            pid = targets[0]['components']['protein'][0]['id']
            if dba.ins_patent_count({
                    'protein_id': pid,
                    'year': row[2],
                    'count': row[3]
            }):
                inserted += 1
            else:
                db_errors += 1
            # the running total grows whether or not the insert succeeded
            totals[pid] = totals.get(pid, 0) + int(row[3])
            pbar.update(ct)
    pbar.finish()
    for accession in missing:
        logger.warn("No target found for {}".format(accession))
    print("{} lines processed.".format(ct))
    print("Inserted {} new patent_count rows for {} proteins".format(
        inserted, len(totals)))
    if missing:
        print("No target found for {} UniProts. See logfile {} for details.".format(
            len(missing), logfile))
    if db_errors > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_errors, logfile))

    if not args['--quiet']:
        print("\nLoading {} Patent Count tdl_infos".format(len(totals)))
    processed = 0
    tdl_inserted = 0
    db_errors = 0
    for pid, total in totals.items():
        processed += 1
        stored = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'EBI Total Patent Count',
            'integer_value': total
        })
        if stored:
            tdl_inserted += 1
        else:
            db_errors += 1
    print("  {} processed".format(processed))
    print("  Inserted {} new EBI Total Patent Count tdl_info rows".format(
        tdl_inserted))
    if db_errors > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_errors, logfile))
Example #23
0
def load(args):
    """Load LINCS rows from INPUT_FILE into TCRD.

    Configures a file logger, records the 'LINCS' dataset and its provenance,
    then reads the tab-delimited INPUT_FILE and inserts one lincs row per
    (input line, protein) for every protein the line's gene id resolves to.

    args is the option mapping; the keys read here are '--loglevel',
    '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.
    """
    level = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(level)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    handler = logging.FileHandler(logfile)
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(handler)

    dba = DBAdaptor({
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    })
    dbinfo = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbinfo['schema_ver'], dbinfo['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbinfo['schema_ver'], dbinfo['data_ver']))

    # Register the dataset
    dataset_id = dba.ins_dataset({
        'name': 'LINCS',
        'source':
        "CSV file exported from Oleg Ursu's lincs PostgreSQL database on seaborgium. I do not know the origin of this database at this time.",
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://lincsproject.org/LINCS/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # ... and the table it populates
    ok = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'lincs'})
    assert ok, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    n_lines = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(n_lines, INPUT_FILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=n_lines).start()
    ct = 0
    resolved = {}  # gene id => protein ids, so each gene is looked up once
    missing = set()  # gene ids already known to have no target
    db_errors = 0
    proteins_seen = set()
    inserted = 0
    with open(INPUT_FILE, 'rU') as tsv:
        for ct, row in enumerate(csv.reader(tsv, delimiter='\t'), 1):
            # 0: level5_lm.pr_gene_id
            # 1: level5_lm.zscore
            # 2: perturbagen.dc_id
            # 3: perturbagen.canonical_smiles
            # 4: signature.cell_id
            gid = row[0]
            if gid in resolved:
                pids = resolved[gid]
            elif gid in missing:
                continue
            else:
                targets = dba.find_targets({'geneid': gid}, False)
                if not targets:
                    missing.add(gid)
                    continue
                pids = [t['components']['protein'][0]['id'] for t in targets]
                resolved[gid] = pids
            for pid in pids:
                stored = dba.ins_lincs({
                    'protein_id': pid,
                    'cellid': row[4],
                    'zscore': row[1],
                    'pert_dcid': row[2],
                    'pert_smiles': row[3]
                })
                if stored:
                    proteins_seen.add(pid)
                    inserted += 1
                else:
                    db_errors += 1
            pbar.update(ct)
    pbar.finish()
    for gid in missing:
        logger.warn("No target found for {}".format(gid))
    print("{} lines processed.".format(ct))
    print("Loaded {} new lincs rows for {} proteins.".format(
        inserted, len(proteins_seen)))
    if missing:
        print("No target found for {} geneids. See logfile {} for details.".format(
            len(missing), logfile))
    if db_errors > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            db_errors, logfile))
Example #24
0
def load(args):
    """Load CTD gene-disease associations with direct evidence into TCRD.

    Sets up file logging, builds MeSH->DOID and OMIM->DOID maps via
    get_db2do_map, registers the 'CTD Disease Associations' dataset and its
    provenance, then reads the tab-delimited input file and inserts one
    disease row per (protein, disease id) for every line that has direct
    evidence and whose gene resolves to a target.

    Args:
        args: option mapping; the keys read here are '--loglevel',
            '--logfile', '--debug', '--quiet', '--dbhost' and '--dbname'.

    Raises:
        AssertionError: if the dataset or provenance insert fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] if args['--logfile'] else LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # NOTE(review): conn is opened with an empty parameter dict and is never
    # explicitly closed here -- confirm whether conn_tcrd expects cleanup.
    conn = conn_tcrd({})
    mesh2doid = get_db2do_map(conn, 'MESH')
    omim2doid = get_db2do_map(conn, 'OMIM')

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'CTD Disease Associations',
        'source': 'File %s from %s.' % (INPUT_FILE, BASE_URL),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://ctdbase.org/',
        'comments': "Only disease associations with direct evidence are loaded into TCRD."
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'CTD'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    # the file is read uncompressed, i.e. with any '.gz' removed from its name
    infile = (DOWNLOAD_DIR + INPUT_FILE).replace('.gz', '')
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, infile))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        k2pids = {}  # "sym|geneid" => protein ids, looked up once per key
        pmark = {}
        notfnd = set()  # "sym|geneid" keys already known to have no target
        skip_ct = 0
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            # 0: GeneSymbol
            # 1: GeneID
            # 2: DiseaseName
            # 3: DiseaseID (MeSH or OMIM identifier)
            # 4: DirectEvidence ('|'-delimited list)
            # 5: InferenceChemicalName
            # 6: InferenceScore
            # 7: OmimIDs ('|'-delimited list)
            # 8: PubMedIDs ('|'-delimited list)
            ct += 1
            pbar.update(ct)
            if row[0].startswith('#'):
                continue
            if not row[4]:  # only load associations with direct evidence
                skip_ct += 1
                continue
            sym = row[0]
            geneid = row[1]
            k = "%s|%s" % (sym, geneid)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    targets = dba.find_targets({'geneid': geneid})
                if not targets:
                    # Store the same key that is tested above; storing the
                    # bare geneid meant the negative cache never hit, so each
                    # repeat redid both lookups and logged again.
                    notfnd.add(k)
                    logger.warn("No target found for {}".format(k))
                    continue
                pids = []
                for t in targets:
                    p = t['components']['protein'][0]
                    pmark[p['id']] = True
                    pids.append(p['id'])
                k2pids[k] = pids  # save this mapping so we only lookup each target once
            # Try to map MeSH and OMIM IDs to DOIDs; fall back to the
            # original identifier when no mapping exists.
            disease_id = row[3]
            dids = [disease_id]
            if disease_id.startswith('MESH:'):
                mesh = disease_id.replace('MESH:', '')
                if mesh in mesh2doid:
                    dids = mesh2doid[mesh]
            elif disease_id.startswith('OMIM:'):
                omim = disease_id.replace('OMIM:', '')
                if omim in omim2doid:
                    dids = omim2doid[omim]
            for pid in pids:
                for did in dids:
                    rv = dba.ins_disease({
                        'protein_id': pid,
                        'dtype': 'CTD',
                        'name': row[2],
                        'did': did,
                        'evidence': row[4]
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    dis_ct += 1
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Loaded {} new disease rows for {} proteins.".format(
        dis_ct, len(pmark)))
    if skip_ct > 0:
        print("Skipped {} with no direct evidence.".format(skip_ct))
    if notfnd:
        print("No target found for {} symbols/geneids. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #25
0
def load(args):
    """Load Expression Atlas disease associations into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    reads the tab-delimited INPUT_FILE and inserts one ``disease`` row with
    dtype 'Expression Atlas' per input row and matching protein.

    Targets are looked up by gene symbol first and, failing that, by an
    'ENSG' xref. Lookup results (hits and misses) are cached per
    "symbol|ensg" key so each gene is only resolved once.

    Args:
        args: Mapping of command-line options. Keys read here: '--loglevel',
            '--logfile', '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: If inserting the dataset or provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    # NOTE(review): the free-text note is passed as 'comments' (plural), the
    # key the other loaders in this file hand to ins_dataset(); the singular
    # 'comment' is what provenance rows use. Confirm against DBAdaptor.
    dataset_id = dba.ins_dataset({
        'name':
        'Expression Atlas',
        'source':
        'IDG-KMC generated data at UNM.',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'https://www.ebi.ac.uk/gxa/',
        'comments':
        'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'Expression Atlas'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    with open(INPUT_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        next(tsvreader)  # skip header line (not included in ct)
        ct = 0
        k2pids = {}  # "sym|ensg" -> protein ids already resolved
        pmark = {}  # protein ids that matched at least one input row
        notfnd = set()  # "sym|ensg" keys for which no target exists
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            # 0: "Gene ID"
            # 1: "DOID"
            # 2: "Gene Name"
            # 3: "log2foldchange"
            # 4: "p-value"
            # 5: "disease"
            # 6: "experiment_id"
            # 7: "contrast_id"
            ct += 1
            sym = row[2]
            ensg = row[0]
            k = "%s|%s" % (sym, ensg)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                targets = dba.find_targets({'sym': sym}, idg=False)
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'ENSG',
                        'value': ensg
                    })
                if not targets:
                    notfnd.add(k)
                    logger.warning("No target found for {}".format(k))
                    continue
                pids = []
                for t in targets:
                    p = t['components']['protein'][0]
                    pmark[p['id']] = True
                    pids.append(p['id'])
                # save this mapping so we only lookup each target once
                k2pids[k] = pids
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': 'Expression Atlas',
                    'name': row[5],
                    'did': row[1],
                    'log2foldchange': "%.3f" % float(row[3]),
                    'pvalue': row[4]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Loaded {} new disease rows for {} proteins.".format(
        dis_ct, len(pmark)))
    if notfnd:
        print("No target found for {} symbols/ensgs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #26
0
def load(args):
    """Flag IDG-eligible targets and set their families from a CSV list.

    Configures file logging, registers the dataset and its provenance, then
    reads IDG_LIST_FILE (no header row is skipped). For every target found
    for the symbol in column 0 it sets ``idg`` to 1, ``fam`` to column 2 and,
    when column 3 is non-empty, ``famext`` to column 3.

    Args:
        args: Mapping of command-line options. Keys read here: '--loglevel',
            '--logfile', '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: If inserting the dataset or a provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print("\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IDG Eligible Targets List',
        'source':
        'IDG generated data in file %s.' % IDG_LIST_FILE,
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        'IDG Flags and Families set from list of targets on GitHub.',
        'url':
        'https://github.com/druggablegenome/IDGTargets'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    # The 'fam' and 'famext' entries used to list 'where_clause' twice; a dict
    # literal keeps only the last value for a repeated key, so the discarded
    # first entries are dropped here and the effective value is unchanged.
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'idg',
        'where_clause': 'column_name == "idg"'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'fam',
        'where_clause': 'idg == 1'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'famext',
        'where_clause': 'idg == 1'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print('\nProcessing {} lines in list file {}'.format(
        line_ct, IDG_LIST_FILE))
    logger.info("Processing {} lines in list file {}".format(
        line_ct, IDG_LIST_FILE))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    notfnd = []  # symbols with no matching target
    multfnd = []  # symbols matching more than one target
    ct = 0
    idg_ct = 0
    fam_ct = 0
    famext_ct = 0
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'rU') as ifh:
        csvreader = csv.reader(ifh)
        for row in csvreader:
            ct += 1
            sym = row[0]
            fam = row[2]
            targets = dba.find_targets({'sym': sym},
                                       idg=False,
                                       include_annotations=False)
            if not targets:
                notfnd.append(sym)
                continue
            if len(targets) > 1:
                # Recorded for the summary; every match is still updated.
                multfnd.append(sym)
            for t in targets:
                rv = dba.upd_target(t['id'], 'idg', 1)
                if rv:
                    idg_ct += 1
                else:
                    dba_err_ct += 1
                rv = dba.upd_target(t['id'], 'fam', fam)
                if rv:
                    fam_ct += 1
                else:
                    dba_err_ct += 1
                if row[3]:
                    famext = row[3]
                    rv = dba.upd_target(t['id'], 'famext', famext)
                    if rv:
                        famext_ct += 1
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("{} targets updated with IDG flags".format(idg_ct))
    print("{} targets updated with fams".format(fam_ct))
    print("  {} targets updated with famexts".format(famext_ct))
    if notfnd:
        print("No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print("WARNING: {} database errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #27
0
def load(args):
    """Load OMIM titles, Phenotypic Series and phenotypes into TCRD.

    Configures file logging, registers the dataset and its provenance, then
    processes three tab-delimited files found under DOWNLOAD_DIR:

    1. TITLES_FILE  -> one ``omim`` row per MIM number.
    2. PS_FILE      -> one ``omim_ps`` row per Phenotypic Series line.
    3. GENEMAP_FILE -> one ``phenotype`` row (ptype 'OMIM') per target found
       for any of the gene symbols on a line.

    Lines starting with '#' are treated as comments in all three files.

    Args:
        args: Mapping of command-line options. Keys read here: '--loglevel',
            '--logfile', '--debug', '--dbhost', '--dbname' and '--quiet'.

    Raises:
        AssertionError: If inserting the dataset or a provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'OMIM',
        'source':
        'Files %s downloaded from omim.org' %
        ", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE]),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://omim.org/',
        'comments':
        'Confirmed OMIM phenotypes and OMIM Phenotype Series info'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'omim'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'omim_ps'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'OMIM'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    # OMIMs and Phenotypic Series
    fname = DOWNLOAD_DIR + TITLES_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print('\nProcessing %d lines from input file %s' % (line_ct, fname))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        omim_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Prefix ???
            # 1: Mim Number
            # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
            # 3: Included Title(s); symbols
            # Keep only the text before the first ';' of column 2.
            title = row[2].partition(';')[0]
            rv = dba.ins_omim({'mim': row[1], 'title': title})
            if not rv:
                dba_err_ct += 1
                continue
            omim_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("  Skipped {} commented lines.".format(skip_ct))
    print("Loaded {} new omim rows".format(omim_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    fname = DOWNLOAD_DIR + PS_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print('\nProcessing %d lines from input file %s' % (line_ct, fname))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        ps_ct = 0
        err_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Phenotypic Series Number
            # 1: Mim Number
            # 2: Phenotype
            # Two-column rows carry no Mim Number: series id and title only.
            if len(row) == 2:
                init = {'omim_ps_id': row[0], 'title': row[1]}
            elif len(row) == 3:
                init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
            else:
                err_ct += 1
                logger.warning("Parsing error for row {}".format(row))
                continue
            rv = dba.ins_omim_ps(init)
            if not rv:
                dba_err_ct += 1
                continue
            ps_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("  Skipped {} commented lines.".format(skip_ct))
    print("Loaded {} new omim_ps rows".format(ps_ct))
    if err_ct > 0:
        # Report err_ct; the previous code referenced an undefined name here
        # and raised NameError whenever a parsing error had been counted.
        print("WARNING: {} parsing errors occurred. See logfile {} for details.".format(
            err_ct, logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    # Phenotypes
    fname = DOWNLOAD_DIR + GENEMAP_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print('\nProcessing %d lines from input file %s' % (line_ct, fname))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        tmark = {}  # target ids that received at least one phenotype
        skip_ct = 0
        notfnd_ct = 0
        prov_ct = 0
        dds_ct = 0
        pt_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0 - Sort ???
            # 1 - Month
            # 2 - Day
            # 3 - Year
            # 4 - Cytogenetic location
            # 5 - Gene Symbol(s)
            # 6 - Confidence
            # 7 - Gene Name
            # 8 - MIM Number
            # 9 - Mapping Method
            # 10 - Comments
            # 11 - Phenotypes
            # 12 - Mouse Gene Symbol
            pts = row[11]
            if pts.startswith('?'):
                # Leading '?' rows are reported as provisional and skipped.
                prov_ct += 1
                continue
            if '(4)' in pts:
                # NOTE(review): these rows are only counted, not skipped,
                # although the summary reports them as "Skipped" -- confirm
                # which behaviour is intended.
                dds_ct += 1
            trait = "MIM Number: %s" % row[8]
            if pts:
                trait += "; Phenotype: %s" % pts
            found = False
            syms = row[5].split(', ')
            logger.info("Checking for OMIM syms: {}".format(syms))
            for sym in syms:
                targets = dba.find_targets({'sym': sym})
                if targets:
                    found = True
                    for t in targets:
                        p = t['components']['protein'][0]
                        logger.info(
                            "  Symbol {} found target {}: {}, {}".format(
                                sym, t['id'], p['name'], p['description']))
                        rv = dba.ins_phenotype({
                            'protein_id': p['id'],
                            'ptype': 'OMIM',
                            'trait': trait
                        })
                        if not rv:
                            dba_err_ct += 1
                            continue
                        tmark[t['id']] = True
                        pt_ct += 1
            if not found:
                notfnd_ct += 1
                logger.warning("No target found for row {}".format(row))
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("  Skipped {} commented lines.".format(skip_ct))
    print("  Skipped {} provisional phenotype rows.".format(prov_ct))
    print("  Skipped {} deletion/duplication syndrome rows.".format(dds_ct))
    print("Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark)))
    if notfnd_ct > 0:
        print("No target found for {} good lines. See logfile {} for details.".format(
            notfnd_ct, logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Example #28
0
def load(args):
  """Load DisGeNET disease associations into TCRD.

  Configures file logging, registers the dataset and its provenance, then
  reads the tab-delimited input file (INPUT_FILE under DOWNLOAD_DIR, with any
  '.gz' removed from the path) and inserts one ``disease`` row with dtype
  'DisGeNET' per data line and matching protein.

  Targets are looked up by gene symbol first and by gene id second; both
  hits and misses are remembered per "symbol|geneid" key.

  Args:
    args: Mapping of command-line options. Keys read here: '--loglevel',
      '--logfile', '--debug', '--dbhost', '--dbname' and '--quiet'.

  Raises:
    AssertionError: If inserting the dataset or provenance row fails.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] or LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  handler = logging.FileHandler(logfile)
  handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s',
                                         datefmt='%Y-%m-%d %H:%M:%S'))
  logger.addHandler(handler)

  dba = DBAdaptor({'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__})
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(
    args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
      args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_init = {'name': 'DisGeNET Disease Associations',
                  'source': 'File %s from %s.'%(INPUT_FILE, BASE_URL),
                  'app': PROGRAM,
                  'app_version': __version__,
                  'url': 'http://www.disgenet.org/web/DisGeNET/menu'}
  dataset_id = dba.ins_dataset(dataset_init)
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  prov_init = {'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'DisGeNET'"}
  rv = dba.ins_provenance(prov_init)
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  infile = (DOWNLOAD_DIR + INPUT_FILE).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print("\nProcessing {} lines in file {}".format(line_ct, infile))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  with open(infile, 'rU') as f:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    k2pids = {}      # "sym|geneid" -> protein ids already resolved
    pmark = {}       # protein ids that matched at least one line
    notfnd = set()   # "sym|geneid" keys known to have no target
    dis_ct = 0
    dba_err_ct = 0
    for line in f:
      # Columns of a data line:
      #  0: geneId        1: geneSymbol    2: DSI           3: DPI
      #  4: diseaseId     5: diseaseName   6: diseaseType   7: diseaseClass
      #  8: diseaseSemanticType            9: score        10: EI
      # 11: YearInitial  12: YearFinal    13: NofPmids     14: NofSnps
      # 15: source
      ct += 1
      if line.startswith(('#', 'geneId')):
        # comment line or header row
        continue
      fields = line.split('\t')
      geneid = fields[0].strip()
      sym = fields[1]
      key = "%s|%s"%(sym,geneid)
      pids = k2pids.get(key)
      if pids is None:
        if key in notfnd:
          # we've already not found it
          continue
        targets = dba.find_targets({'sym': sym}) or dba.find_targets({'geneid': geneid})
        if not targets:
          notfnd.add(key)
          logger.warn("No target found for {}".format(key))
          continue
        pids = [t['components']['protein'][0]['id'] for t in targets]
        for pid in pids:
          pmark[pid] = True
        k2pids[key] = pids # save this mapping so we only lookup each target once
      pmid_ct = fields[13].strip()
      snp_ct = fields[14].strip()
      if pmid_ct == '0':
        ev = "%s SNPs"%snp_ct
      elif snp_ct == '0':
        ev = "%s PubMed IDs"%pmid_ct
      else:
        ev = "%s PubMed IDs; %s SNPs"%(pmid_ct, snp_ct)
      source = fields[15].strip()
      for pid in pids:
        rv = dba.ins_disease( {'protein_id': pid, 'dtype': 'DisGeNET', 'name': fields[5],
                               'did': fields[4], 'score': fields[9], 'source': source,
                               'evidence': ev} )
        if rv:
          dis_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print("{} lines processed.".format(ct))
  print("Loaded {} new disease rows for {} proteins.".format(dis_ct, len(pmark)))
  if notfnd:
    print("No target found for {} symbols/geneids. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))