Example 1
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Parse the MP OWL file
  if not args['--quiet']:
    print "\nParsing Mammalian Phenotype Ontology file {}".format(DOWNLOAD_DIR + FILENAME)
  mp = parse_mp_owl(DOWNLOAD_DIR + FILENAME)
  print "Got {} MP terms".format(len(mp))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Mammalian Phenotype Ontology', 'source': 'OWL file downloaded from %s'%(BASE_URL+FILENAME), 'app': PROGRAM, 'app_version': __version__} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mpo'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nLoading {} Mammalian Phenotype Ontology terms".format(len(mp))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(mp)).start()
  ct = 0
  mpo_ct = 0
  dba_err_ct = 0
  for mpd in mp:
    ct += 1
    rv = dba.ins_mpo(mpd)
    if rv:
      mpo_ct += 1
    else:
      dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} terms processed.".format(ct)
  print "  Inserted {} new mpo rows".format(mpo_ct)
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example 2
def main():
  argparser = argparse.ArgumentParser(description="Export TCRD target data to a CSV file")
  argparser.add_argument("-o", "--outfile", help='Output file [path/]name', default=OUTFILE)
  argparser.add_argument('-db', '--dbname', help='MySQL database name', default=DBNAME)
  argparser.add_argument("-i", "--idg", help="Export only IDG-Eligible tagets", action="store_true", default=False)
  argparser.add_argument("-e", "--expand", help="Export expanded (a LOT of data) CSV version", action="store_true", default=False)
  args = argparser.parse_args()
  
  dba = DBAdaptor({'dbname': args.dbname})
  dbi = dba.get_dbinfo()
  print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
  print "\nConnected to TCRD database %s (schema ver %s, data ver %s)\n" % (dbi['dbname'], dbi['schema_ver'], dbi['data_ver'])

  if args.idg:
    tct = dba.get_target_count(idg=True)
    print "Exporting CSV for %d IDG-Eligible targets from TCRD to file %s" % (tct, args.outfile)
  else:
    tct = dba.get_target_count(idg=False)
    print "Exporting CSV for all %d targets from TCRD to file %s" % (tct, args.outfile)

  header = ['TCRD ID', 'Name', 'Description', 'HGNC Sym', 'NCBI Gene ID', 'UniProt', 'STRING ID', 'TDL', 'IDG Eligible', 'DTO ID', 'DTO Class']
  if args.expand:
    header = header + ['PANTHER Class(es)', 'GeneRIF Count', 'NCBI Gene PubMed Count', 'JensenLab PubMed Score', 'PubTator Score', 'Ab Count', 'Monoclonal Ab Count', 'Activity Count', 'ChEMBL Selective Compound', 'ChEMBL First Reference Year', 'DrugCentral Activity Count', 'PDB Count', 'PDBs', 'GO Annotation Count', 'Experimental MF/BP Leaf Term GOA(s)', 'OMIM Phenotype Count', 'OMIM Phenotype(s)', 'JAX/MGI Human Ortholog Phenotype Count', 'JAX/MGI Human Ortholog Phenotype(s)', 'IMPC Ortholog Phenotype Count', 'IMPC Ortholog Phenotype(s)', 'GWAS Count', 'GWAS Phenotype(s)', 'Pathway Count', 'Pathways', 'Total Disease Count', 'Top 5 Text-Mining DISEASES', 'eRAM Diseases', 'EBI Patent Count', 'Is Transcription Factor', 'TMHMM Prediction', 'HPA Tissue Specificity Index', 'HPM Gene Tissue Specificity Index', 'HPM Protein Tissue Specificity Index', 'TIN-X Novelty', 'Top 5 TIN-X Importance(s)']
    
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  with open(args.outfile, 'wb') as csvout:
    csvwriter = csv.writer(csvout, quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header)
    ct = 0
    if args.idg:
      for t in dba.get_targets(idg=True, include_annotations=args.expand):
        ct += 1
        if args.expand:
          csvwriter.writerow( target2csv_exp(t) )
        else:
          csvwriter.writerow( target2csv(t) )
        pbar.update(ct)
    else:
      for t in dba.get_targets(idg=False, include_annotations=args.expand):
      #for tid in [9]:
      #  t = dba.get_target(tid, True)
        ct += 1
        if args.expand:
          csvwriter.writerow(target2csv_exp(t))
        else:
          csvwriter.writerow(target2csv(t))
        pbar.update(ct)
  pbar.finish()

  print "%d CSV rows exported" % ct
  print "\n%s: Done." % PROGRAM
Example 3
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Human Proteome Map', 'source': 'IDG-KMC generated data by Oleg Ursu at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.humanproteomemap.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Protein'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Gene'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Protein Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Gene Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)
  
  #
  # Protein Level Expressions
  #
  line_ct = slmf.wcl(PROTEIN_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, PROTEIN_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  rs2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(PROTEIN_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      if rs in rs2pids:
        # we've already found it
        pids = rs2pids[rs]
      elif rs in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'RefSeq', 'value': rs}, False)
        if not targets:
          notfnd.add(rs)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        rs2pids[rs] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      if row[3] == 'NA':
        init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4],}
      else:
        init = {'etype': 'HPM Protein','tissue': tissue, 
                'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} RefSeqs)".format(exp_ct, len(pmark), len(rs2pids))
  if notfnd:
    print "No target found for {} RefSeqs. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(PROTEIN_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, PROTEIN_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(PROTEIN_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      tau = row[1]
      if rs not in rs2pids:
        skip_ct += 1
        continue
      for pid in rs2pids[rs]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Protein Tissue Specificity Index',
                               'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Protein Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with RefSeqs not in map from expression file.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Gene Level Expressions
  #
  line_ct = slmf.wcl(GENE_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, GENE_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  sym2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GENE_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      if sym in sym2pids:
        pids = sym2pids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(sym)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        sym2pids[sym] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      
      if row[3] == 'NA':
        init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4],}
      else:
        init = {'etype': 'HPM Gene','tissue': tissue, 
                'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} Gene Symbols)".format(exp_ct, len(pmark), len(sym2pids))
  if notfnd:
    print "  No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(GENE_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, GENE_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(GENE_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      tau = row[1]
      if sym not in sym2pids:
        skip_ct += 1
        continue
      for pid in sym2pids[sym]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Gene Tissue Specificity Index',
                               'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Gene Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with symbols not in map from expression file".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example 4
def main(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    generifs = dba.get_generifs()
    if not args['--quiet']:
        print "\nProcessing {} GeneRIFs".format(len(generifs))
    logger.info("Processing {} GeneRIFs".format(len(generifs)))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(generifs)).start()
    yrre = re.compile(r'^(\d{4})')
    ct = 0
    yr_ct = 0
    skip_ct = 0
    net_err_ct = 0
    pubmed2date = {}
    missing_pmids = set()
    for generif in generifs:
        ct += 1
        pbar.update(ct)
        for pmid in generif['pubmed_ids'].split("|"):
            if pmid in pubmed2date:
                continue
            # See if this PubMed is in TCRD...
            pm = dba.get_pubmed(pmid)
            if pm:
                # if so get date from there
                if pm['date']:
                    pubmed2date[pmid] = pm['date']
            else:
                # if not, will have to get it via EUtils
                missing_pmids.add(pmid)
    pbar.finish()
    if not args['--quiet']:
        print "{} GeneRIFs processed.".format(ct)
        in_tcrd_ct = len(pubmed2date)
        print "Got date mapping for {} PubMeds in TCRD".format(in_tcrd_ct)

    if not args['--quiet']:
        print "\nGetting {} missing PubMeds from E-Utils".format(
            len(missing_pmids))
    logger.debug("Getting {} missing PubMeds from E-Utils".format(
        len(missing_pmids)))
    chunk_ct = 0
    err_ct = 0
    no_date_ct = 0
    pmids = list(missing_pmids)
    for chunk in chunker(pmids, 200):
        chunk_ct += 1
        if not args['--quiet']:
            print "  Processing chunk {}".format(chunk_ct)
        logger.debug("Chunk {}: {}".format(chunk_ct, chunk))
        r = get_pubmed(chunk)
        if not r or r.status_code != 200:
            # try again...
            r = get_pubmed(chunk)
            if not r or r.status_code != 200:
                logger.error(
                    "Bad E-Utils response for chunk {}".format(chunk_ct))
                net_err_ct += 1
                continue
        soup = BeautifulSoup(r.text, "xml")
        pmas = soup.find('PubmedArticleSet')
        for pma in pmas.findAll('PubmedArticle'):
            pmid = pma.find('PMID').text
            date = get_pubmed_article_date(pma)
            if date:
                pubmed2date[pmid] = date
            else:
                no_date_ct += 1
    if not args['--quiet']:
        print "{} PubMed IDs processed.".format(ct)
        print "Got date mapping for {} PubMeds not in TCRD".format(
            len(pubmed2date) - in_tcrd_ct)
        print "No date for {} PubMeds".format(no_date_ct)
    if net_err_ct > 0:
        print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(
            net_err_ct, logfile)
    if not args['--quiet']:
        print "Dumping map to file: {}".format(PICKLE_FILE)
    pickle.dump(pubmed2date, open(PICKLE_FILE, 'wb'))
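chunker() and get_pubmed() are not shown. Minimal sketches: chunker is assumed to yield fixed-size slices of a list, and get_pubmed is assumed to hit the standard NCBI E-Utils efetch endpoint (the real helpers may differ):

import requests

EFETCH_URL = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

def chunker(l, size):
    # Yield successive size-element slices of list l.
    return (l[i:i + size] for i in xrange(0, len(l), size))

def get_pubmed(pmids):
    # Fetch PubMed XML for a list of PMIDs; return None on network error.
    params = {'db': 'pubmed', 'retmode': 'xml', 'id': ','.join(pmids)}
    try:
        return requests.post(EFETCH_URL, data=params)
    except requests.exceptions.RequestException:
        return None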
Example 5
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Human Protein Atlas',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM from HPA file http://www.proteinatlas.org/download/normal_tissue.tsv.zip.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.proteinatlas.org/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'expression',
        'where_clause': "etype = 'HPA'",
        'comment': 'Qualitative expression values are derived from files from http://www.proteinatlas.org/'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'HPA Tissue Specificity Index'",
        'comment': 'Tissue Specificity scores are derived from files from http://www.proteinatlas.org/. The score is the Tau value as described in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    with open(TISSUE2UBERON_FILE, 'r') as ifh:
        tiss2uid = ast.literal_eval(ifh.read())
    if not args['--quiet']:
        print "\nGot {} tissue to Uberon ID mappings from file {}".format(
            len(tiss2uid), TISSUE2UBERON_FILE)

    line_ct = slmf.wcl(HPA_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in HPA file {}".format(line_ct, HPA_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dba_err_ct = 0
    pmark = {}
    exp_ct = 0
    nouid = set()
    with open(HPA_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # "protein_id"    "Tissue"        "Gene"  "Gene name"     "Level" "Reliability"
            ct += 1
            tissue = row[1]
            init = {
                'protein_id': row[0],
                'etype': 'HPA',
                'tissue': tissue,
                'qual_value': row[4],
                'evidence': row[5]
            }
            # Add Uberon ID, if we can find one
            if tissue in tiss2uid:
                uberon_id = tiss2uid[tissue]
            else:
                uberon_id = dba.get_uberon_id({'name': tissue})
            if uberon_id:
                init['uberon_id'] = uberon_id
            else:
                nouid.add(tissue)
            rv = dba.ins_expression(init)
            if not rv:
                dba_err_ct += 1
                continue
            exp_ct += 1
            pmark[row[0]] = True
            pbar.update(ct)
    pbar.finish()
    print "Processed {} HPA lines.".format(ct)
    print "  Inserted {} new expression rows for {} proteins.".format(
        exp_ct, len(pmark))
    if nouid:
        print "No Uberon ID found for {} tissues. See logfile {} for details.".format(
            len(nouid), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    line_ct = slmf.wcl(HPA_TAU_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in HPA TAU file {}".format(
            line_ct, HPA_TAU_FILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dba_err_ct = 0
    pmark = {}
    skip_ct = 0
    ti_ct = 0
    with open(HPA_TAU_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # "Gene"  "TAU"   "protein_id"
            ct += 1
            pbar.update(ct)
            if row[1] == 'None':
                skip_ct += 1
                continue
            rv = dba.ins_tdl_info({
                'protein_id': int(row[2]),
                'itype': 'HPA Tissue Specificity Index',
                'number_value': row[1]
            })
            if not rv:
                dba_err_ct += 1
                continue
            pmark[row[2]] = True
            ti_ct += 1
    pbar.finish()
    print "Processed {} lines.".format(ct)
    print "  Inserted {} new HPA Tissue Specificity Index tdl_info rows for {} proteins.".format(
        ti_ct, len(pmark))
    if skip_ct:
        print "  Skipped {} rows with no tau.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example 6
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/'%(os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  # aliasmap: UniProt accession / name / alias -> (ENSP, bit score or None)
  aliasmap = {}
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ct = 0
  skip_ct = 0
  mult_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid   uniprot_ac|uniprot_id   string_id   identity   bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ## string_protein_id ## alias ## source ##
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid fo protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp} )
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example 7
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Jensen Lab TISSUES', 'source': 'Files %s from %s'%(", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://tissues.jensenlab.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "type LIKE 'JensenLab %'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  # this dict will map ENSP|sym from input files to TCRD protein_id(s)
  # so we only have to find target(s) once for each pair.
  # See find_pids() below
  pmap = {}

  # Knowledge channel
  fn = DOWNLOAD_DIR+FILE_K
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Knowledge ' + row[4]
      init = {'etype': etype, 'tissue': row[3],'boolean_value': 1, 
              'oid': row[2], 'evidence': row[5], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  
  # Experiment channel
  fn = DOWNLOAD_DIR+FILE_E
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      if row[6] == '0':
        # skip zero confidence rows
        skip_ct += 1
        continue
      sym = row[1]
      # some rows look like:
      # ['ENSP00000468389', 'PSENEN {ECO:0000313|Ensembl:ENSP00000468593}', 'BTO:0002860', 'Oral mucosa', 'HPA', 'High: 1 antibody', '1']
      if ' ' in sym:
        sym = sym.split()[0]
      k = "%s|%s" % (row[0], sym) # ENSP|sym
      if k in notfnd:
        continue
      try:
        pids = find_pids(dba, k, pmap)
      except ValueError:
        print "[ERROR] Row: %s; k: %s" % (str(row), k)
        continue
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Experiment ' + row[4]
      init = {'etype': etype, 'tissue': row[3],
              'string_value': row[5], 'oid': row[2], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  print "  Skipped {} zero confidence rows".format(skip_ct)
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Text Mining channel
  fn = DOWNLOAD_DIR+FILE_T
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      etype = 'JensenLab Text Mining'
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1,
              'oid': row[2], 'zscore': row[4], 'conf': row[5], 'url': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example 8
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'LINCS',
        'source': "CSV file exported from Oleg Ursu's lincs PostgreSQL database on seaborgium. I do not know the origin of this database at this time.",
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://lincsproject.org/LINCS/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'lincs'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    gid2pids = {}
    notfnd = set()
    dba_err_ct = 0
    pmark = {}
    lincs_ct = 0
    with open(INPUT_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        for row in tsvreader:
            # 0: level5_lm.pr_gene_id
            # 1: level5_lm.zscore
            # 2: perturbagen.dc_id
            # 3: perturbagen.canonical_smiles
            # 4: signature.cell_id
            ct += 1
            gid = row[0]
            if gid in gid2pids:
                # we've already found it
                pids = gid2pids[gid]
            elif gid in notfnd:
                # we've already not found it
                continue
            else:
                # look it up
                targets = dba.find_targets({'geneid': gid}, False)
                if not targets:
                    notfnd.add(gid)
                    continue
                pids = []
                for t in targets:
                    pid = t['components']['protein'][0]['id']
                    pids.append(pid)
                gid2pids[gid] = pids  # save this mapping so we only lookup each target once
            for pid in pids:
                rv = dba.ins_lincs({
                    'protein_id': pid,
                    'cellid': row[4],
                    'zscore': row[1],
                    'pert_dcid': row[2],
                    'pert_smiles': row[3]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                pmark[pid] = True
                lincs_ct += 1
            pbar.update(ct)
    pbar.finish()
    for gid in notfnd:
        logger.warn("No target found for {}".format(gid))
    print "{} lines processed.".format(ct)
    print "Loaded {} new lincs rows for {} proteins.".format(
        lincs_ct, len(pmark))
    if notfnd:
        print "No target found for {} geneids. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example 9
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as load()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    t2up = {}
    with open(INFILE) as ifh:
        for line in ifh:
            line = line.rstrip()
            [up, tid] = line.split(" ")
            t2up[tid] = up
    tct = len(t2up)
    if not args['--quiet']:
        print "\nGot {} UniProt accessions from file {}".format(tct, INFILE)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nLoading Antibodypedia annotations for {} targets".format(tct)
    logger.info("Loading Antibodypedia annotations for {} targets".format(tct))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    tiab_ct = 0
    timab_ct = 0
    tiurl_ct = 0
    dba_err_ct = 0
    net_err_ct = 0
    for tid, up in t2up.items():
        ct += 1
        pid = int(tid)
        pbar.update(ct)
        url = ABPC_API_URL + up
        r = None
        attempts = 1
        while attempts <= 5:
            try:
                logger.info("Getting {} [Target {}, attempt {}]".format(
                    url, tid, attempts))
                r = requests.get(url)
                break
            except Exception:
                attempts += 1
                time.sleep(1)
        if not r:
            net_err_ct += 1
            logger.error("No response for {} [Target {}, attempt {}]".format(
                url, tid, attempts))
            continue
        if r.status_code != 200:
            net_err_ct += 1
            logger.error(
                "Bad response: {} for {} [Target {}, attempt {}]".format(
                    r.status_code, url, tid, attempts))
            continue
        abpd = json.loads(r.text)
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'Ab Count',
            'integer_value': int(abpd['num_antibodies'])
        })
        if rv:
            tiab_ct += 1
        else:
            dba_err_ct += 1
        if 'ab_type_monoclonal' in abpd:
            mab_ct = int(abpd['ab_type_monoclonal'])
        else:
            mab_ct = 0
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'MAb Count',
            'integer_value': mab_ct
        })
        if rv:
            timab_ct += 1
        else:
            dba_err_ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'Antibodypedia.com URL',
            'string_value': abpd['url']
        })
        if rv:
            tiurl_ct += 1
        else:
            dba_err_ct += 1
        time.sleep(1)
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
    print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
    print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
    if net_err_ct > 0:
        print "WARNING: Network error for {} targets. See logfile {} for details.".format(
            net_err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example 10
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as load()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Antibodypedia.com',
        'source': 'Web API at %s' % ABPC_API_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.antibodypedia.com'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Ab Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'MAb Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Antibodypedia.com URL'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile)
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nLoading Antibodypedia annotations for {} TCRD targets".format(
            tct)
    logger.info(
        "Loading Antibodypedia annotations for {} TCRD targets".format(tct))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    tiab_ct = 0
    timab_ct = 0
    tiurl_ct = 0
    dba_err_ct = 0
    net_err_ct = 0
    for target in dba.get_targets():
        ct += 1
        pbar.update(ct)
        tid = target['id']
        p = target['components']['protein'][0]
        pid = p['id']
        url = ABPC_API_URL + p['uniprot']
        r = None
        attempts = 1
        while attempts <= 5:
            try:
                logger.info("Getting {} [Target {}, attempt {}]".format(
                    url, tid, attempts))
                r = requests.get(url)
                break
            except Exception:
                attempts += 1
                time.sleep(1)
        if not r:
            net_err_ct += 1
            logger.error("No response for {} [Target {}, attempt {}]".format(
                url, tid, attempts))
            continue
        if r.status_code != 200:
            net_err_ct += 1
            logger.error(
                "Bad response: {} for {} [Target {}, attempt {}]".format(
                    r.status_code, url, tid, attempts))
            continue
        abpd = json.loads(r.text)
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'Ab Count',
            'integer_value': int(abpd['num_antibodies'])
        })
        if rv:
            tiab_ct += 1
        else:
            dba_err_ct += 1
        if 'ab_type_monoclonal' in abpd:
            mab_ct = int(abpd['ab_type_monoclonal'])
        else:
            mab_ct = 0
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'MAb Count',
            'integer_value': mab_ct
        })
        if rv:
            timab_ct += 1
        else:
            dba_err_ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'Antibodypedia.com URL',
            'string_value': abpd['url']
        })
        if rv:
            tiurl_ct += 1
        else:
            dba_err_ct += 1
        time.sleep(1)
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
    print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
    print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
    if net_err_ct > 0:
        print "WARNING: Network error for {} targets. See logfile {} for details.".format(
            net_err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example 11
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GWAS Catalog',
        'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' %
        os.path.basename(INFILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ebi.ac.uk/gwas/home'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INFILE)
    line_ct -= 1
    if not args['--quiet']:
        print '\nProcessing {} lines from input file {}'.format(
            line_ct, INFILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    outlist = []
    with open(INFILE, 'rU') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct = 0
        notfnd = set()
        pmark = {}
        gwas_ct = 0
        dba_err_ct = 0
        # 0: DATE ADDED TO CATALOG
        # 1: PUBMEDID
        # 2: FIRST AUTHOR
        # 3: DATE
        # 4: JOURNAL
        # 5: LINK
        # 6: STUDY
        # 7: DISEASE/TRAIT
        # 8: INITIAL SAMPLE SIZE
        # 9: REPLICATION SAMPLE SIZE
        # 10: REGION
        # 11: CHR_ID
        # 12: CHR_POS
        # 13: REPORTED GENE(S)
        # 14: MAPPED_GENE
        # 15: UPSTREAM_GENE_ID
        # 16: DOWNSTREAM_GENE_ID
        # 17: SNP_GENE_IDS
        # 18: UPSTREAM_GENE_DISTANCE
        # 19: DOWNSTREAM_GENE_DISTANCE
        # 20: STRONGEST SNP-RISK ALLELE
        # 21: SNPS
        # 22: MERGED
        # 23: SNP_ID_CURRENT
        # 24: CONTEXT
        # 25: INTERGENIC
        # 26: RISK ALLELE FREQUENCY
        # 27: P-VALUE
        # 28: PVALUE_MLOG
        # 29: P-VALUE (TEXT)
        # 30: OR or BETA
        # 31: 95% CI (TEXT)
        # 32: PLATFORM [SNPS PASSING QC]
        # 33: CNV
        # 34: MAPPED_TRAIT
        # 35: MAPPED_TRAIT_URI
        # 36: STUDY ACCESSION
        # 37: GENOTYPING TECHNOLOGY
        symregex = re.compile(r' ?[-,;] ?')
        for row in tsvreader:
            ct += 1
            if len(row) < 15: continue  # need at least the MAPPED_GENE column (index 14)
            symstr = row[14]
            if symstr == 'NR': continue
            symlist = symregex.split(symstr)
            for sym in symlist:
                if sym in notfnd:
                    continue
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    notfnd.add(sym)
                    logger.warn("No target found for symbol {}".format(sym))
                    continue
                for t in targets:
                    p = t['components']['protein'][0]
                    try:
                        pval = float(row[27])
                    except ValueError:
                        pval = None
                    try:
                        orbeta = float(row[30])
                    except ValueError:
                        orbeta = None
                    if row[25]:
                        ig = int(row[25])
                    else:
                        ig = None
                    rv = dba.ins_gwas({
                        'protein_id': p['id'],
                        'disease_trait': row[7],
                        'snps': row[21],
                        'pmid': row[1],
                        'study': row[6],
                        'context': row[24],
                        'intergenic': ig,
                        'p_value': pval,
                        'or_beta': orbeta,
                        'cnv': row[33],
                        'mapped_trait': row[34],
                        'mapped_trait_uri': row[35]
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    pmark[p['id']] = True
                    gwas_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new gwas rows for {} proteins".format(
        gwas_ct, len(pmark.keys()))
    if notfnd:
        print "No target found for {} symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
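
A quick illustration of the MAPPED_GENE handling above: symregex splits multi-gene values on '-', ',' or ';' with optional surrounding spaces. A self-contained sketch with invented values:

import re

symregex = re.compile(r' ?[-,;] ?')
for symstr in ['CDK14', 'LOC102723407 - AGAP1', 'CDK14, FZD1']:
    print symregex.split(symstr)
# ['CDK14']
# ['LOC102723407', 'AGAP1']
# ['CDK14', 'FZD1']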
Esempio n. 12
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging when debug is 0
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'JAX/MGI Mouse/Human Orthology Phenotypes',
        'source':
        'File %s from ftp.informatics.jax.org' % PT_FILE,
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.informatics.jax.org/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'JAX/MGI Human Ortholog Phenotype'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    if not args['--quiet']:
        print "\nParsing Mammalian Phenotype Ontology file {}".format(
            DOWNLOAD_DIR + MPO_OWL_FILE)
    mpo = parse_mp_owl(DOWNLOAD_DIR + MPO_OWL_FILE)
    if not args['--quiet']:
        print "Got {} MP terms".format(len(mpo))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    fn = DOWNLOAD_DIR + PT_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines from input file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pt_ct = 0
        skip_ct = 0
        pmark = {}
        notfnd = set()
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if not row[6] or row[6] == '':
                skip_ct += 1
                continue
            sym = row[0]
            geneid = row[1]
            k = "%s|%s" % (sym, geneid)
            if k in notfnd:
                continue
            targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                targets = dba.find_targets({'geneid': geneid}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            for t in targets:
                pid = t['components']['protein'][0]['id']
                pmark[pid] = True
                for mpid in row[6].split():
                    rv = dba.ins_phenotype({
                        'protein_id': pid,
                        'ptype': 'JAX/MGI Human Ortholog Phenotype',
                        'term_id': mpid,
                        'term_name': mpo[mpid]['name']
                    })
                    if rv:
                        pt_ct += 1
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} new phenotype rows for {} proteins".format(
        pt_ct, len(pmark.keys()))
    print "  Skipped {} lines with no MP terms".format(skip_ct)
    if notfnd:
        print "No target found for {} gene symbols/ids. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
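
The per-row logic above assumes a tab-separated layout with the human symbol in column 0, the NCBI Gene ID in column 1, and space-separated MP term IDs in column 6. An illustrative row (values invented):

row = ['A1BG', '1', '-', '-', '-', '-', 'MP:0005370 MP:0005371']
if row[6]:
    for mpid in row[6].split():
        print mpid
# MP:0005370
# MP:0005371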
Esempio n. 13
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'HGNC',
        'source':
        'Custom download file from https://www.genenames.org/download/custom/',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.genenames.org/',
        'comments':
        'File downloaded with the following column data: HGNC ID, Approved symbol, Approved name, Status, UniProt ID, NCBI Gene ID, Mouse genome database ID'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id':
        dataset_id,
        'table_name':
        'protein',
        'column_name':
        'sym',
        'comment':
        "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id':
        dataset_id,
        'table_name':
        'protein',
        'column_name':
        'geneid',
        'comment':
        "This is only updated with HGNC data if data from UniProt is absent."
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile)
            sys.exit(1)

    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}
    hgnc_ct = 0
    mgi_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    nf_ct = 0
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'rU') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: UniProt ID
            # 5: NCBI Gene ID
            # 6: Mouse genome database ID
            ct += 1
            pbar.update(ct)
            sym = row[1]
            geneid = row[5]
            up = row[4]
            targets = dba.find_targets({'sym': sym})
            if not targets:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                targets = dba.find_targets({'uniprot': up})
            if not targets:
                nf_ct += 1
                #logger.warn("No target found for {}|{}|{}".format(sym, geneid, up))
                continue
            for t in targets:
                p = t['components']['protein'][0]
                pid = p['id']
                tmark[pid] = True
                # HGNC xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'HGNC',
                    'dataset_id': dataset_id,
                    'value': row[0]
                })
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'MGI ID',
                    'dataset_id': dataset_id,
                    'value': row[6]
                })
                if rv:
                    mgi_ct += 1
                else:
                    db_err_ct += 1
                # Add missing syms
                if p['sym'] is None:
                    rv = dba.upd_protein(pid, 'sym', sym)
                    if rv:
                        logger.info(
                            "Inserted new sym {} for protein {}, {}".format(
                                sym, pid, p['uniprot']))
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                else:
                    # Check for symbol discrepancies
                    if p['sym'] != sym:
                        logger.warn("Symbol discrepancy: UniProt=%s, HGNC=%s" %
                                    (p['sym'], sym))
                        symdiscr_ct += 1
                if geneid:
                    # Add missing geneids
                    if p['geneid'] is None:
                        rv = dba.upd_protein(pid, 'geneid', geneid)
                        if rv:
                            logger.info(
                                "Inserted new geneid {} for protein {}, {}".
                                format(geneid, pid, p['uniprot']))
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    else:
                        # Check for geneid discrepancies
                        if p['geneid'] != int(geneid):
                            logger.warn(
                                "GeneID discrepancy: UniProt={}, HGNC={}".
                                format(p['geneid'], geneid))
                            geneiddiscr_ct += 1
    pbar.finish()
    print "Processed {} lines - {} targets annotated.".format(ct, len(tmark))
    print "No target found for {} lines.".format(nf_ct)
    print "  Inserted {} HGNC ID xrefs".format(hgnc_ct)
    print "  Inserted {} MGI ID xrefs".format(mgi_ct)
    if sym_ct > 0:
        print "  Added {} new HGNC symbols".format(sym_ct)
    if symdiscr_ct > 0:
        print "WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(
            symdiscr_ct, logfile)
    if geneid_ct > 0:
        print "  Added {} new NCBI Gene IDs".format(geneid_ct)
    if geneiddiscr_ct > 0:
        print "WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(
            geneiddiscr_ct, logfile)
    if db_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            db_err_ct, logfile)
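
The sym -> geneid -> uniprot fallback above is a pattern these loaders repeat. A minimal sketch of it as a helper (find_targets is the DBAdaptor method used throughout; the helper itself is hypothetical):

def find_targets_fallback(dba, sym, geneid, up):
    # Try each identifier in turn, returning the first hit.
    for query in ({'sym': sym}, {'geneid': geneid}, {'uniprot': up}):
        targets = dba.find_targets(query)
        if targets:
            return targets
    return None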
Esempio n. 14
0
def run_and_load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'TMHMM Predictions',
        'source': 'Results of running TMHMM on protein sequences.',
        'app': PROGRAM,
        'app_version': __version__,
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'TMHMM Prediction'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    tct = dba.get_target_count(idg=False)
    print "\nProcessing {} TCRD targets".format(tct)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()

    regex = re.compile(r'PredHel=(\d+)')
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for t in dba.get_targets(idg=False, include_annotations=False):
        ct += 1
        p = t['components']['protein'][0]
        fasta = ">%s|%s %s\n%s\n" % (t['id'], p['name'], p['description'],
                                     p['seq'])
        #print "[DEBUG] Fasta:\n%s" % fasta
        fasta_filename = "/tmp/%s.fa" % t['id']
        with open(fasta_filename, 'w') as f:
            f.write(fasta)
        cmd = '%s --short --noplot %s' % (TMHMM_BIN, fasta_filename)
        #print "[DEBUG] Cmd: %s" % cmd
        output = ''
        for line in runProcess(cmd.split()):
            output += line
        os.remove(fasta_filename)
        #print "[DEBUG] Output: %s" % output
        pred = regex.findall(output)[0]
        #print "[DEBUG] PredHel: %s" % pred
        if pred != '0':
            rv = dba.ins_tdl_info({
                'protein_id': p['id'],
                'itype': 'TMHMM Prediction',
                'string_value': output
            })
            if not rv:
                dba_err_ct += 1
                continue
            ti_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} targets processed.".format(ct)
    print "  Inserted {} new TMHMM Prediction tdl_info rows".format(ti_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
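
The PredHel regex above pulls the predicted helix count out of TMHMM's one-line --short output. A self-contained sketch against an illustrative output line (field values invented; re.search with a guard avoids the IndexError that findall()[0] would raise on unexpected output):

import re

regex = re.compile(r'PredHel=(\d+)')
output = "1\tlen=393\tExpAA=0.00\tFirst60=0.00\tPredHel=0\tTopology=o"
m = regex.search(output)
if m:
    print m.group(1)  # 0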
Esempio n. 15
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'TDLs',
        'source':
        'IDG-KMC generated data by Steve Mathias at UNM.',
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'comments':
        'TDLs are generated by the loading app from data in TCRD.'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'target',
        'column_name': 'tdl'
    })
    if not rv:
        print "WARNING: Error inserting provenance. See logfile {} for details.".format(
            logfile)
        sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    tdl_cts = {'Tclin': 0, 'Tchem': 0, 'Tbio': 0, 'Tdark': 0}
    bump_ct = 0
    dba_err_ct = 0
    upd_ct = 0
    for target in dba.get_targets(idg=False, include_annotations=True):
        ct += 1
        pbar.update(ct)
        (tdl, bump_flag) = get_tdl(target)
        tdl_cts[tdl] += 1
        if bump_flag:
            bump_ct += 1
        rv = dba.upd_target(target['id'], 'tdl', tdl)
        if rv:
            upd_ct += 1
        else:
            dba_err_ct += 1
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "Set TDL values for {} targets:".format(upd_ct)
    print "  {} targets are Tclin".format(tdl_cts['Tclin'])
    print "  {} targets are Tchem".format(tdl_cts['Tchem'])
    print "  {} targets are Tbio - {} bumped from Tdark".format(
        tdl_cts['Tbio'], bump_ct)
    print "  {} targets are Tdark".format(tdl_cts['Tdark'])
    if dba_err_ct > 0:
        print "WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile)
Esempio n. 16
0
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(LOGFILE)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    for ver, fn in INPUTFILES:
        fn = DATA_DIR + fn
        load(args, dba, logfile, logger, ver, fn)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'IDG Eligible Lists',
Esempio n. 17
0
def load(ortho_df, args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'Orthologs',
        'source':
        'File %s' % (BASE_URL + FILENAME),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'https://www.genenames.org/cgi-bin/hcop'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id':
        dataset_id,
        'table_name':
        'ortholog',
        'comment':
        "Orthologs are majority vote from the OMA, EggNOG and InParanoid resources as per HGNC."
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nLoading ortholog data for {} TCRD targets".format(tct)
    logger.info("Loading ortholog data for {} TCRD targets".format(tct))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    ortho_ct = 0
    tskip_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    for target in dba.get_targets():
        ct += 1
        pbar.update(ct)
        logger.info("Processing target %d" % target['id'])
        p = target['components']['protein'][0]
        if p['sym']:  # try first by symbol
            to_df = ortho_df.loc[ortho_df['human_symbol'] == p['sym']]
        elif p['geneid']:  # then try by GeneID
            to_df = ortho_df.loc[ortho_df['human_entrez_gene'] == p['geneid']]
        else:
            tskip_ct += 1
            continue
        if len(to_df) == 0:
            continue
        for idx, row in to_df.iterrows():
            if row['ortholog_species_symbol'] == '-' and row[
                    'ortholog_species_name'] == '-':
                skip_ct += 1
                continue
            sp = TAXID2SP[row['ortholog_species']]
            init = {
                'protein_id': p['id'],
                'taxid': row['ortholog_species'],
                'species': sp,
                'sources': row['sources'],
                'symbol': row['ortholog_species_symbol'],
                'name': row['ortholog_species_name']
            }
            # Add MOD DB ID if it's there
            if row['ortholog_species_db_id'] != '-':
                init['db_id'] = row['ortholog_species_db_id']
            # Add NCBI Gene ID if it's there
            if row['ortholog_species_entrez_gene'] != '-':
                init['geneid'] = row['ortholog_species_entrez_gene']
            # Construct MOD URLs for mouse, rat, zebrafish, fly, worm and yeast
            if sp == 'Mouse':
                init[
                    'mod_url'] = 'http://www.informatics.jax.org/marker/' + row[
                        'ortholog_species_db_id']
            elif sp == 'Rat':
                rgdid = row['ortholog_species_db_id'].replace('RGD:', '')
                init[
                    'mod_url'] = 'http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=' + rgdid
            elif sp == 'Zebrafish':
                init['mod_url'] = 'http://zfin.org/' + row[
                    'ortholog_species_db_id']
            elif sp == 'Fruitfly':
                init['mod_url'] = "http://flybase.org/reports/%s.html" % row[
                    'ortholog_species_db_id']
            elif sp == 'C. elegans':
                init['mod_url'] = 'http://www.wormbase.org/search/gene/' + row[
                    'ortholog_species_symbol']
            elif sp == 'S.cerevisiae':
                init['mod_url'] = 'https://www.yeastgenome.org/locus/' + row[
                    'ortholog_species_db_id']
            rv = dba.ins_ortholog(init)
            if rv:
                ortho_ct += 1
            else:
                dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "Processed {} targets.".format(ct)
    print "Loaded {} new ortholog rows".format(ortho_ct)
    print "  Skipped {} empty ortholog entries".format(skip_ct)
    print "  Skipped {} targets with no sym/geneid".format(tskip_ct)
    if len(notfnd) > 0:
        print "  No orthologs found for {} targets.".format(len(notfnd))
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
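
The species-specific URL construction above could equally be table-driven. A sketch with the same templates (MOD_URL_TMPLS and mod_url are hypothetical names, not part of the loader):

MOD_URL_TMPLS = {
    'Mouse': 'http://www.informatics.jax.org/marker/%s',
    'Rat': 'http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=%s',
    'Zebrafish': 'http://zfin.org/%s',
    'Fruitfly': 'http://flybase.org/reports/%s.html',
    'C. elegans': 'http://www.wormbase.org/search/gene/%s',
    'S.cerevisiae': 'https://www.yeastgenome.org/locus/%s',
}

def mod_url(sp, db_id, symbol):
    if sp not in MOD_URL_TMPLS:
        return None
    if sp == 'Rat':
        db_id = db_id.replace('RGD:', '')  # RGD URLs want the bare id
    elif sp == 'C. elegans':
        db_id = symbol  # WormBase URLs use the symbol, as above
    return MOD_URL_TMPLS[sp] % db_id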
Esempio n. 18
0
def load(args, dod):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    # data-version field in the header of the OBO file has a release version:
    # data-version: releases/2016-03-25
    f = os.popen("head %s" % DOWNLOAD_DIR + FILENAME)
    for line in f:
        if line.startswith("data-version:"):
            ver = line.replace('data-version: ', '').strip()
            break
    f.close()
    dataset_id = dba.ins_dataset({
        'name':
        'Disease Ontology',
        'source':
        'File %s, version %s' % (BASE_URL + FILENAME, ver),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://disease-ontology.org/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'do'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'do_xref'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nLoading {} Disease Ontology terms".format(len(dod))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(dod)).start()
    ct = 0
    do_ct = 0
    dba_err_ct = 0
    for doid, d in dod.items():
        ct += 1
        d['doid'] = doid
        rv = dba.ins_do(d)
        if rv:
            do_ct += 1
        else:
            dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} terms processed.".format(ct)
    print "  Inserted {} new do rows".format(do_ct)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
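
The data-version header is read above by shelling out to `head`; the same thing can be done in pure Python. A minimal sketch (obo_data_version is a hypothetical helper):

def obo_data_version(path):
    # Scan the OBO header for a line like 'data-version: releases/2016-03-25'.
    with open(path) as f:
        for line in f:
            if line.startswith('data-version:'):
                return line.replace('data-version:', '').strip()
    return None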
Esempio n. 19
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'eRAM Disease Associations',
        'source': 'Data scraped from eRAM web pages.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.unimd.org/eram/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'eRAM'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    s = shelve.open(ERAM_SHELF_FILE)
    dname_ct = len(s['disease_names'])
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} disease names in shelf file {}".format(
            dname_ct, ERAM_SHELF_FILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=dname_ct).start()
    ct = 0
    pmark = {}
    skip_ct = 0
    dnerr1_ct = 0
    dnerr2_ct = 0
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for dname in s['disease_names']:
        ct += 1
        try:
            dname = str(dname)
        except UnicodeEncodeError:
            dnerr2_ct += 1
            logger.warn("UnicodeEncodeError for disease name '{}'".format(
                dname.encode('ascii', 'ignore')))
            continue
        if dname not in s:
            dnerr1_ct += 1
            logger.warn("Disease name '{}' not in shelf".format(dname))
            continue
        if 'currated_genes' not in s[dname]:
            skip_ct += 1
            continue
        for cg in s[dname]['currated_genes']:
            sym = cg['sym']
            geneid = cg['geneid']
            k = "%s|%s" % (sym, geneid)
            if k in notfnd:
                continue
            targets = dba.find_targets({'sym': sym})
            if not targets:
                targets = dba.find_targets({'geneid': geneid})
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                for doid in s[dname]['doids']:
                    rv = dba.ins_disease({
                        'protein_id': p['id'],
                        'dtype': 'eRAM',
                        'name': dname,
                        'did': doid,
                        'source': cg['sources']
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    dis_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if skip_ct > 0:
        print "Skipped {} diseases with no currated genes. See logfile {} for details.".format(
            skip_ct, logfile)
    if dnerr1_ct > 0:
        print "{} disease names not found in shelf. See logfile {} for details.".format(
            dnerr1_ct, logfile)
    if dnerr2_ct > 0:
        print "{} disease names cannot be decoded to strs. See logfile {} for details.".format(
            dnerr2_ct, logfile)
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Esempio n. 20
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as main()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'Drug Central',
        'source':
        "Drug Central files download files: %s" % ", ".join(SRC_FILES),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://drugcentral.org/'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'drug_activity'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'DrugCentral Indication'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile)
            sys.exit(1)

    # First get mapping of DrugCentral names to ids
    name2id = {}
    line_ct = slmf.wcl(NAME_ID_FILE)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(
            line_ct, NAME_ID_FILE)
    with open(NAME_ID_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'): continue
            name2id[row[0]] = row[1].replace("\n", '')
    print "{} input lines processed.".format(ct)
    print "Saved {} keys in infos map".format(len(name2id))

    # Next get drug info fields
    infos = {}
    line_ct = slmf.wcl(DRUGINFO_FILE)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(
            line_ct, DRUGINFO_FILE)
    with open(DRUGINFO_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'): continue
            infos[row[0]] = row[1].replace("\n", '')
    print "{} input lines processed.".format(ct)
    print "Saved {} keys in infos map".format(len(infos))

    #
    # MOA activities
    #
    drug2tids = defaultdict(list)
    line_ct = slmf.wcl(TCLIN_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from DrugDB MOA activities file {}".format(
            line_ct, TCLIN_FILE)
    with open(TCLIN_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # Columns: uniprot, swissprot, drug_name, act_value, act_type,
        #   action_type, source_name, reference, smiles, ChEMBL_Id
        ct = 0
        da_ct = 0
        err_ct = 0
        notfnd = []
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            up = row[0]
            sp = row[1]
            drug = row[2]
            if drug not in name2id:
                err_ct += 1
                logger.warn("No DrugCentral id found for {}".format(drug))
                continue
            dcid = name2id[drug]
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                targets = dba.find_targets({'name': sp})
                if not targets:
                    notfnd.append(up)
                    continue
            tid = targets[0]['id']
            drug2tids[drug].append(tid)
            init = {
                'target_id': tid,
                'drug': drug,
                'dcid': dcid,
                'has_moa': 1,
                'source': row[6]
            }
            if row[3]:
                init['act_value'] = row[3]
            if row[4]:
                init['act_type'] = row[4]
            if row[5]:
                init['action_type'] = row[5]
            if row[6]:
                init['source'] = row[6]
            if row[7]:
                init['reference'] = row[7]
            if row[8]:
                init['smiles'] = row[8]
            if row[9]:
                init['cmpd_chemblid'] = row[9]
            if drug in infos:
                init['nlm_drug_info'] = infos[drug]
            rv = dba.ins_drug_activity(init)
            if rv:
                da_ct += 1
            else:
                dba_err_ct += 1
    print "{} DrugCentral Tclin rows processed.".format(ct)
    print "  Inserted {} new drug_activity rows".format(da_ct)
    if len(notfnd) > 0:
        print "WARNNING: {} Uniprot/Swissprot Accessions NOT FOUND in TCRD:".format(
            len(notfnd))
        for up in notfnd:
            print up
    if err_ct > 0:
        print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(
            err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    #
    # Non-MOA activities
    #
    line_ct = slmf.wcl(TCHEM_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from Non-MOA activities file {}".format(
            line_ct, TCHEM_FILE)
    with open(TCHEM_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # Columns: uniprot, swissprot, drug_name, act_value, act_type,
        #   action_type, source_name, reference, smiles, ChEMBL_Id
        ct = 0
        da_ct = 0
        err_ct = 0
        notfnd = []
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            up = row[0]
            sp = row[1]
            drug = row[2]
            if drug not in name2id:
                err_ct += 1
                logger.warn("No DrugCentral id found for {}".format(drug))
                continue
            dcid = name2id[drug]
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                targets = dba.find_targets({'name': sp})
                if not targets:
                    notfnd.append(up)
                    continue
            tid = targets[0]['id']
            drug2tids[drug].append(tid)
            init = {
                'target_id': tid,
                'drug': drug,
                'dcid': dcid,
                'has_moa': 0,
                'source': row[6]
            }
            if row[3]:
                init['act_value'] = row[3]
            if row[4]:
                init['act_type'] = row[4]
            if row[5]:
                init['action_type'] = row[5]
            if row[6]:
                init['source'] = row[6]
            if row[7]:
                init['reference'] = row[7]
            if row[8]:
                init['smiles'] = row[8]
            if row[9]:
                init['cmpd_chemblid'] = row[9]
            if drug in infos:
                init['nlm_drug_info'] = infos[drug]
            rv = dba.ins_drug_activity(init)
            if rv:
                da_ct += 1
            else:
                dba_err_ct += 1
    print "{} DrugCentral Tchem rows processed.".format(ct)
    print "  Inserted {} new drug_activity rows".format(da_ct)
    if len(notfnd) > 0:
        print "WARNNING: {} DrugDB Uniprot Accessions NOT FOUND in TCRD:".format(
            len(notfnd))
        for up in notfnd:
            print up
    if err_ct > 0:
        print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(
            err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    #
    # Indications (diseases)
    #
    line_ct = slmf.wcl(DRUGIND_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from indications file {}".format(
            line_ct, DRUGIND_FILE)
    with open(DRUGIND_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # Columns: DRUG_ID, DRUG_NAME, INDICATION_FDB, UMLS_CUI, SNOMEDCT_CUI, DOID
        ct = 0
        t2d_ct = 0
        notfnd = {}
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            drug = row[1]
            if drug not in drug2tids:
                notfnd[drug] = True
                continue
            # protein_id is set per target id in the loop below
            init = {
                'dtype': 'DrugCentral Indication',
                'name': row[2],
                'drug_name': drug
            }
            if row[5] != '':
                init['did'] = row[5]
            for tid in drug2tids[drug]:
                # NB> Using target_id as protein_id works for now, but will not if/when we have multiple protein targets
                init['protein_id'] = tid
                rv = dba.ins_disease(init)
                if rv:
                    t2d_ct += 1
                else:
                    dba_err_ct += 1
    print "{} DrugCentral indication rows processed.".format(ct)
    print "  Inserted {} new disease rows".format(t2d_ct)
    if len(notfnd) > 0:
        print "WARNING: {} drugs NOT FOUND in activity files:".format(
            len(notfnd))
        for drug in notfnd:
            print drug
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
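
The MOA and non-MOA blocks above are identical except for has_moa. A sketch of a shared helper (row_to_drug_activity is hypothetical; column layout as in the header comments):

def row_to_drug_activity(row, tid, dcid, has_moa, infos):
    # Required fields first, then optional columns only when non-empty.
    init = {'target_id': tid, 'drug': row[2], 'dcid': dcid, 'has_moa': has_moa}
    optional = [(3, 'act_value'), (4, 'act_type'), (5, 'action_type'),
                (6, 'source'), (7, 'reference'), (8, 'smiles'),
                (9, 'cmpd_chemblid')]
    for idx, key in optional:
        if row[idx]:
            init[key] = row[idx]
    if row[2] in infos:
        init['nlm_drug_info'] = infos[row[2]]
    return init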
Esempio n. 21
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'JensenLab PubMed Text-mining Scores',
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': BASE_URL
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'pmscore'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'JensenLab PubMed Score'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ensp2pids = {}
    pmscores = {}  # protein.id => sum(all scores)
    pms_ct = 0
    upd_ct = 0
    notfnd = {}
    dba_err_ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(line_ct, infile)
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            # ensp  year  score
            ct += 1
            pbar.update(ct)
            if not row[0].startswith('ENSP'): continue
            ensp = row[0]
            if ensp in ensp2pids:
                # we've already found it
                pids = ensp2pids[ensp]
            elif ensp in notfnd:
                # we've already not found it
                continue
            else:
                targets = dba.find_targets({'stringid': ensp})
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                    if not targets:
                        notfnd[ensp] = True
                        logger.warn("No target found for {}".format(ensp))
                        continue
                pids = []
                for target in targets:
                    pids.append(target['components']['protein'][0]['id'])
                # save this mapping so we only look up each ENSP once
                ensp2pids[ensp] = pids
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                if pid in pmscores:
                    pmscores[pid] += float(row[2])
                else:
                    pmscores[pid] = float(row[2])
    pbar.finish()
    print "{} input lines processed.".format(ct)
    print "  Inserted {} new pmscore rows for {} targets".format(
        pms_ct, len(pmscores))
    if len(notfnd) > 0:
        print "No target found for {} STRING IDs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    print "\nLoading {} JensenLab PubMed Score tdl_infos".format(
        len(pmscores.keys()))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'JensenLab PubMed Score',
            'number_value': score
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print "{} processed".format(ct)
    print "  Inserted {} new JensenLab PubMed Score tdl_info rows".format(
        ti_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            (dba_err_ct, logfile))
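
The ensp2pids/notfnd bookkeeping above is a lookup-once cache: hits and misses are both remembered so each ENSP goes to the database at most once. A sketch of the same pattern as a helper (resolve_pids is hypothetical):

def resolve_pids(dba, ensp, cache, misses):
    if ensp in cache:
        return cache[ensp]
    if ensp in misses:
        return None
    targets = dba.find_targets({'stringid': ensp})
    if not targets:
        targets = dba.find_targets_by_xref({'xtype': 'STRING',
                                            'value': '9606.' + ensp})
    if not targets:
        misses.add(ensp)
        return None
    pids = [t['components']['protein'][0]['id'] for t in targets]
    cache[ensp] = pids
    return pids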
Esempio n. 22
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'Jensen Lab DISEASES',
        'source':
        'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://diseases.jensenlab.org/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype LIKE 'JensenLab %'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    # Knowledge channel
    fn = DOWNLOAD_DIR + FILE_K
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            dtype = 'JensenLab Knowledge ' + row[4]
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                init = {
                    'protein_id': p['id'],
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                }
                rv = dba.ins_disease(init)
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Experiment channel
    fn = DOWNLOAD_DIR + FILE_E
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[6] == '0':
                # skip zero confidence rows
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            dtype = 'JensenLab Experiment ' + row[4]
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                rv = dba.ins_disease({
                    'protein_id': p['id'],
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if skip_ct > 0:
        print "Skipped {} zero confidence rows".format(skip_ct)
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Text Mining channel
    fn = DOWNLOAD_DIR + FILE_T
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            dtype = 'JensenLab Text Mining'
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                rv = dba.ins_disease({
                    'protein_id': p['id'],
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'zscore': row[4],
                    'conf': row[5]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
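
The three JensenLab channel blocks above differ only in input file, dtype prefix, an optional zero-confidence filter, and the column-to-field mapping. As a minimal sketch (not the loader's actual code), the shared pattern can be factored into one helper; process_channel and row2init are hypothetical names, and dba/logger are assumed to behave exactly as above.

import csv

def process_channel(dba, logger, fn, row2init, skip_row=None):
  ct = dis_ct = dba_err_ct = 0
  pmark = {}
  notfnd = set()
  with open(fn, 'rU') as tsv:
    for row in csv.reader(tsv, delimiter='\t'):
      ct += 1
      if skip_row and skip_row(row):
        continue
      ensp, sym = row[0], row[1]
      k = "%s|%s" % (ensp, sym)
      if k in notfnd:
        continue
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        targets = dba.find_targets({'sym': sym}, idg=False)
      if not targets:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      for t in targets:
        p = t['components']['protein'][0]
        pmark[p['id']] = True
        init = row2init(row)  # channel-specific column mapping
        init['protein_id'] = p['id']
        if dba.ins_disease(init):
          dis_ct += 1
        else:
          dba_err_ct += 1
  return ct, dis_ct, len(pmark), len(notfnd), dba_err_ct

# Knowledge channel usage (columns as in the loop above):
# process_channel(dba, logger, DOWNLOAD_DIR + FILE_K,
#                 lambda row: {'dtype': 'JensenLab Knowledge ' + row[4],
#                              'name': row[3], 'did': row[2],
#                              'evidence': row[5], 'conf': row[6]})
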
Example No. 23
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Expression Atlas',
        'source': 'IDG-KMC generated data at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ebi.ac.uk/gxa/',
        'comment': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'Expression Atlas'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    with open(INPUT_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct = 0
        k2pids = {}
        pmark = {}
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            # 0: "Gene ID"
            # 1: "DOID"
            # 2: "Gene Name"
            # 3: "log2foldchange"
            # 4: "p-value"
            # 5: "disease"
            # 6: "experiment_id"
            # 7: "contrast_id"
            ct += 1
            sym = row[2]
            ensg = row[0]
            k = "%s|%s" % (sym, ensg)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                targets = dba.find_targets({'sym': sym}, idg=False)
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'ENSG',
                        'value': ensg
                    })
                if not targets:
                    notfnd.add(k)
                    logger.warn("No target found for {}".format(k))
                    continue
                pids = []
                for t in targets:
                    p = t['components']['protein'][0]
                    pmark[p['id']] = True
                    pids.append(p['id'])
                # save this mapping so we only look up each target once
                k2pids[k] = pids
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': 'Expression Atlas',
                    'name': row[5],
                    'did': row[1],
                    'log2foldchange': "%.3f" % float(row[3]),
                    'pvalue': row[4]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} new disease rows for {} proteins.".format(
        dis_ct, len(pmark))
    if notfnd:
        print "No target found for {} symbols/ensgs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
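
The loader above caches hits in k2pids and misses in notfnd, so each sym/ENSG pair costs at most one database round trip. Below is a minimal standalone sketch of that pattern; cached_pids is a hypothetical name, and the dba calls are assumed to behave as in the loader.

def cached_pids(dba, k2pids, notfnd, sym, ensg):
  k = "%s|%s" % (sym, ensg)
  if k in k2pids:    # we've already found it
    return k2pids[k]
  if k in notfnd:    # we've already not found it; don't hit the DB again
    return None
  targets = dba.find_targets({'sym': sym}, idg=False)
  if not targets:
    targets = dba.find_targets_by_xref({'xtype': 'ENSG', 'value': ensg})
  if not targets:
    notfnd.add(k)
    return None
  pids = [t['components']['protein'][0]['id'] for t in targets]
  k2pids[k] = pids   # save the mapping so each key is resolved only once
  return pids
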
Example No. 24
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GeneRIF Years',
        'source': 'PubMed records via NCBI E-Utils',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/pubmed'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'generif',
        'column_name': 'years'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pubmed2date = pickle.load(open(PICKLE_FILE, 'rb'))
    if not args['--quiet']:
        print "\nGot %d PubMed date mappings from file %s" % (len(pubmed2date),
                                                              PICKLE_FILE)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    generifs = dba.get_generifs()
    if not args['--quiet']:
        print "\nProcessing {} GeneRIFs".format(len(generifs))
    logger.info("Processing {} GeneRIFs".format(len(generifs)))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(generifs)).start()
    yrre = re.compile(r'^(\d{4})')
    ct = 0
    yr_ct = 0
    skip_ct = 0
    net_err_ct = 0
    dba_err_ct = 0
    for generif in generifs:
        ct += 1
        logger.debug("Processing GeneRIF: {}".format(generif))
        # GeneRIFs with multiple refs often have duplicates, so fix that
        if "|" in generif['pubmed_ids']:
            pmids = set(generif['pubmed_ids'].split("|"))
            pmids = list(pmids)
            rv = dba.do_update({
                'table': 'generif',
                'id': generif['id'],
                'col': 'pubmed_ids',
                'val': "|".join(pmids)
            })
            if not rv:
                dba_err_ct += 1
        else:
            pmids = [generif['pubmed_ids']]

        years = list()
        for pmid in pmids:
            if pmid in pubmed2date:
                m = yrre.match(pubmed2date[pmid])
                if m:
                    years.append(m.group(1))
                else:
                    years.append('')
            else:
                years.append('')
        # See if we got any years...
        if any(years):  # if so, do the updates
            rv = dba.do_update({
                'table': 'generif',
                'id': generif['id'],
                'col': 'years',
                'val': "|".join(years)
            })
            if rv:
                yr_ct += 1
            else:
                dba_err_ct += 1
        else:  # if not, skip
            skip_ct += 1
        pbar.update(ct)
    pbar.finish()
    if not args['--quiet']:
        print "{} GeneRIFs processed.".format(ct)
    print "  Updated {} genefifs with years".format(yr_ct)
    print "  Skipped {} generifs with no years.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    if net_err_ct > 0:
        print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(
            net_err_ct, logfile)
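
The per-GeneRIF work above boils down to two steps: de-duplicating the pipe-delimited PubMed ID list, and mapping each PMID to a four-digit year (or '') via the pubmed2date lookup. A self-contained sketch of both, assuming pubmed2date maps PMIDs to 'YYYY-MM-DD' strings as above; dedup_pmids and pmids_to_years are hypothetical names.

import re

yrre = re.compile(r'^(\d{4})')

def dedup_pmids(pubmed_ids):
  # "123|456|123" -> ['123', '456'] (set order, as in the loader above)
  if "|" in pubmed_ids:
    return list(set(pubmed_ids.split("|")))
  return [pubmed_ids]

def pmids_to_years(pmids, pubmed2date):
  years = []
  for pmid in pmids:
    m = yrre.match(pubmed2date.get(pmid, ''))
    years.append(m.group(1) if m else '')
  return years

# pmids_to_years(['111', '222'], {'111': '2009-03-01'}) -> ['2009', '']
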
Example No. 25
def tinx(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # The results of parsing the input mentions files will be the following dictionaries:
    pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
    # Including the UniProt accession in the key is just for convenience when
    # checking the output. It is not used for anything.
    doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
    pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
    pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

    # First parse the Disease Ontology OBO file to get DO names and defs
    dofile = DO_DOWNLOAD_DIR + DO_OBO
    print "\nParsing Disease Ontology file {}".format(dofile)
    do_parser = obo.Parser(open(dofile))
    do = {}
    for stanza in do_parser:
        do[stanza.tags['id'][0].value] = stanza.tags
    print "  Got {} Disease Ontology terms".format(len(do))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in protein file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                # if we don't find a target by stringid, which is the more reliable and
                # preferred way, try by Ensembl xref
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensp
                })
            if not targets:
                notfnd.add(ensp)
                continue
            for t in targets:
                p = t['components']['protein'][0]
                k = "%s,%s" % (p['id'], p['uniprot'])
                if k in pid2pmids:
                    pid2pmids[k] = pid2pmids[k].union(pmids)
                else:
                    pid2pmids[k] = set(pmids)
                for pmid in pmids:
                    if pmid in pmid_protein_ct:
                        pmid_protein_ct[pmid] += 1.0
                    else:
                        pmid_protein_ct[pmid] = 1.0
    pbar.finish()
    for ensp in notfnd:
        logger.warn("No target found for {}".format(ensp))
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-ENSP lines".format(skip_ct)
    print "  Saved {} protein to PMIDs mappings".format(len(pid2pmids))
    print "  Saved {} PMID to protein count mappings".format(
        len(pmid_protein_ct))
    if notfnd:
        print "  No target found for {} ENSPs. See logfile {} for details.".format(
            len(notfnd), logfile)

    fn = JL_DOWNLOAD_DIR + DISEASE_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in do:
                logger.warn("%s not found in DO" % doid)
                notfnd.add(doid)
                continue
            if doid in doid2pmids:
                doid2pmids[doid] = doid2pmids[doid].union(pmids)
            else:
                doid2pmids[doid] = set(pmids)
            for pmid in pmids:
                if pmid in pmid_disease_ct:
                    pmid_disease_ct[pmid] += 1.0
                else:
                    pmid_disease_ct[pmid] = 1.0
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-DOID lines".format(skip_ct)
    print "  Saved {} DOID to PMIDs mappings".format(len(doid2pmids))
    print "  Saved {} PMID to disease count mappings".format(
        len(pmid_disease_ct))
    if notfnd:
        print "WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(
            len(notfnd), logfile)

    if not args['--quiet']:
        print "\nComputing protein novely scores"
    # To calculate novelty scores, each paper (PMID) is assigned a
    # fractional target (FT) score of one divided by the number of targets
    # mentioned in it. The novelty score of a given protein is one divided
    # by the sum of the FT scores for all the papers mentioning that
    # protein.
    ct = 0
    with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
        pnovf.write("Protein ID,UniProt,Novelty\n")
        for k in pid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in pid2pmids[k]:
                ft_score_sum += 1.0 / pmid_protein_ct[pmid]
            novelty = 1.0 / ft_score_sum
            pnovf.write("%s,%.8f\n" % (k, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, PROTEIN_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing disease novely scores"
    # Exactly as for proteins, but using disease mentions
    ct = 0
    with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
        dnovf.write("DOID,Novelty\n")
        for doid in doid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in doid2pmids[doid]:
                ft_score_sum += 1.0 / pmid_disease_ct[pmid]
            novelty = 1.0 / ft_score_sum
            dnovf.write("%s,%.8f\n" % (doid, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, DISEASE_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing importance scores"
    # To calculate importance scores, each paper is assigned a fractional
    # disease-target (FDT) score of one divided by the product of the
    # number of targets mentioned and the number of diseases
    # mentioned. The importance score for a given disease-target pair is
    # the sum of the FDT scores for all papers mentioning that disease and
    # protein.
    ct = 0
    with open(IMPORTANCE_FILE, 'wb') as impf:
        impf.write("DOID,Protein ID,UniProt,Score\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                fdt_score_sum = 0.0
                for pmid in pd_pmids:
                    fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] *
                                            pmid_disease_ct[pmid])
                if fdt_score_sum > 0:
                    ct += 1
                    impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
    print "  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE)

    if not args['--quiet']:
        print "\nComputing PubMed rankings"
    # PMIDs are ranked for a given disease-target pair based on a score
    # calculated by multiplying the number of targets mentioned and the
    # number of diseases mentioned in that paper. Lower scores have a lower
    # rank (higher priority). If the scores do not discriminate, PMIDs are
    # reverse sorted by value with the assumption that larger PMIDs are
    # newer and of higher priority.
    ct = 0
    with open(PMID_RANKING_FILE, 'wb') as pmrf:
        pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                # scores are tuples of (PMID, protein_mentions*disease_mentions)
                scores = []
                for pmid in pd_pmids:
                    scores.append(
                        (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
                if len(scores) > 0:
                    scores.sort(cmp_pmids_scores)
                    for i, t in enumerate(scores):
                        ct += 1
                        pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
    print "  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
Example No. 26
def calc_and_load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'KEGG Distances', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Directed graphs are produced from KEGG pathway KGML files and all shortest path lengths are then calculated and stored.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'kegg_distance'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  kgmls = get_kgmls(KGML_DIR)

  if not args['--quiet']:
    print "\nProcessing {} KGML files in {}".format(len(kgmls), KGML_DIR)
    logger.info("Processing {} KGML files in {}".format(len(kgmls), KGML_DIR))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(kgmls)).start()
  # All pathways shortest path lengths
  # (node1, node2) => distance
  all_pws_spls = {}
  ct = 0
  err_ct = 0
  for kgml in kgmls:
    logger.info("  Working on {}".format(kgml))
    ct += 1
    try:
      dig = kg.kgml_file_to_digraph(kgml)
    except Exception:
      err_ct += 1
      logger.error("Error parsing file: {}".format(kgml))
      continue
    aspls = nx.all_pairs_shortest_path_length(dig)
    dct = 0
    for source in aspls:
      for target in aspls[source]:
        if source == target: continue
        st = (source, target)
        if st in all_pws_spls:
          if aspls[source][target] < all_pws_spls[st]:
            all_pws_spls[st] = aspls[source][target]
            dct += 1
        else:
          all_pws_spls[st] = aspls[source][target]
          dct += 1
    logger.info("  {} has {} non-zero shortest path lengths".format(kgml, dct))
    pbar.update(ct)
  pbar.finish()
  logger.info("Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls)))
  if not args['--quiet']:
    print "  Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls))
  if err_ct > 0:
    print "WARNNING: {} parsing errors occurred. See logfile {} for details.".format(err_ct, logfile)

  logger.info("Processing {} KEGG Distances".format(len(all_pws_spls)))
  if not args['--quiet']:
    print "\nProcessing {} KEGG Distances".format(len(all_pws_spls))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(all_pws_spls)).start()
  gid2pids = defaultdict(list) # So we only find each target once,
                               # save protein.geneid => protein.id(s)
  notfnd = set()
  ct = 0
  skip_ct = 0
  kd_ct = 0
  dba_err_ct = 0
  for st,dist in all_pws_spls.items():
    ct += 1
    geneid1 = re.sub(r'^hsa:', '', st[0])
    geneid2 = re.sub(r'^hsa:', '', st[1])
    if geneid1 in gid2pids:
      pids1 = gid2pids[geneid1]
    elif geneid1 in notfnd:
      skip_ct += 1
      continue
    else:
      targets = dba.find_targets({'geneid': geneid1})
      if not targets:
        skip_ct += 1
        notfnd.add(geneid1) # add to notfnd so we don't try looking it up again
        logger.warn("No target found for KEGG Gene ID {}".format(geneid1))
        continue
      pids1 = []
      for t in targets:
        pid = t['components']['protein'][0]['id']
        pids1.append(pid)
        gid2pids[geneid1].append(pid)
    if geneid2 in gid2pids:
      pids2 = gid2pids[geneid2]
    elif geneid2 in notfnd:
      skip_ct += 1
      continue
    else:
      targets = dba.find_targets({'geneid': geneid2})
      if not targets:
        skip_ct += 1
        notfnd.add(geneid2) # add to notfnd so we don't try looking it up again
        logger.warn("No target found for KEGG Gene ID {}".format(geneid2))
        continue
      pids2 = []
      for t in targets:
        pid = t['components']['protein'][0]['id']
        pids2.append(pid)
        gid2pids[geneid2].append(pid)
    for pid1 in pids1:
      for pid2 in pids2:
        rv = dba.ins_kegg_distance({'pid1': pid1, 'pid2': pid2, 'distance': dist})
        if rv:
          kd_ct += 1
        else:
          dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} KEGG Distances processed.".format(ct)
  print "  Inserted {} new kegg_distance rows".format(kd_ct)
  if skip_ct > 0:
    print "  {} KEGG IDs not found in TCRD - Skipped {} rows. See logfile {} for details.".format(len(notfnd), skip_ct, logfile)
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example No. 27
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'OMIM',
        'source': 'Files %s downloaded from omim.org' % ", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE]),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://omim.org/',
        'comments': 'Confirmed OMIM phenotypes and OMIM Phenotype Series info'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'omim'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'omim_ps'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'OMIM'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    # OMIMs and Phenotypic Series
    fname = DOWNLOAD_DIR + TITLES_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print '\nProcessing %d lines from input file %s' % (line_ct, fname)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        omim_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Prefix ???
            # 1: Mim Number
            # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
            # 3: Included Title(s); symbols
            title = row[2].partition(';')[0]
            rv = dba.ins_omim({'mim': row[1], 'title': title})
            if not rv:
                dba_err_ct += 1
                continue
            omim_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    print "Loaded {} new omim rows".format(omim_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    fname = DOWNLOAD_DIR + PS_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print '\nProcessing %d lines from input file %s' % (line_ct, fname)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        ps_ct = 0
        err_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Phenotypic Series Number
            # 1: Mim Number
            # 2: Phenotype
            if len(row) == 2:
                init = {'omim_ps_id': row[0], 'title': row[1]}
            elif len(row) == 3:
                init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
            else:
                err_ct += 1
                logger.warn("Parsing error for row {}".format(row))
                continue
            rv = dba.ins_omim_ps(init)
            if not rv:
                dba_err_ct += 1
                continue
            ps_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    print "Loaded {} new omim_ps rows".format(ps_ct)
    if err_ct > 0:
        print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(
            err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Phenotypes
    fname = DOWNLOAD_DIR + GENEMAP_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print '\nProcessing %d lines from input file %s' % (line_ct, fname)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        tmark = {}
        skip_ct = 0
        notfnd_ct = 0
        prov_ct = 0
        dds_ct = 0
        pt_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0 - Sort ???
            # 1 - Month
            # 2 - Day
            # 3 - Year
            # 4 - Cytogenetic location
            # 5 - Gene Symbol(s)
            # 6 - Confidence
            # 7 - Gene Name
            # 8 - MIM Number
            # 9 - Mapping Method
            # 10 - Comments
            # 11 - Phenotypes
            # 12 - Mouse Gene Symbol
            pts = row[11]
            if pts.startswith('?'):
                prov_ct += 1
                continue
            if '(4)' in pts:
                # deletion/duplication syndrome rows are counted and skipped
                dds_ct += 1
                continue
            trait = "MIM Number: %s" % row[8]
            if row[11]:
                trait += "; Phenotype: %s" % pts
            found = False
            syms = row[5].split(', ')
            logger.info("Checking for OMIM syms: {}".format(syms))
            for sym in syms:
                targets = dba.find_targets({'sym': sym})
                if targets:
                    found = True
                    for t in targets:
                        p = t['components']['protein'][0]
                        logger.info(
                            "  Symbol {} found target {}: {}, {}".format(
                                sym, t['id'], p['name'], p['description']))
                        rv = dba.ins_phenotype({
                            'protein_id': p['id'],
                            'ptype': 'OMIM',
                            'trait': trait
                        })
                        if not rv:
                            dba_err_ct += 1
                            continue
                        tmark[t['id']] = True
                        pt_ct += 1
            if not found:
                notfnd_ct += 1
                logger.warn("No target found for row {}".format(row))
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    print "  Skipped {} provisional phenotype rows.".format(prov_ct)
    print "  Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
    print "Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark))
    if notfnd_ct > 0:
        print "No target found for {} good lines. See logfile {} for details.".format(
            notfnd_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
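
The genemap phenotype column above is triaged three ways: provisional phenotypes (leading '?') are skipped, deletion/duplication syndromes (mapping key '(4)') are skipped, and everything else becomes a trait string. A small self-contained sketch of that triage; classify_pts is a hypothetical name for illustration only.

def classify_pts(pts, mim):
  if pts.startswith('?'):
    return ('provisional', None)
  if '(4)' in pts:
    return ('del_dup_syndrome', None)
  trait = "MIM Number: %s" % mim
  if pts:
    trait += "; Phenotype: %s" % pts
  return ('ok', trait)

# classify_pts('?Epilepsy (3)', '600131')  -> ('provisional', None)
# classify_pts('Epilepsy (3)', '600131')
#   -> ('ok', 'MIM Number: 600131; Phenotype: Epilepsy (3)')
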
Example No. 28
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Harmonogram CDFs',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'CDFs are calculated by the loader app based on gene_attribute data in TCRD.'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'hgram_cdf'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    # Create a dictionary of gene_attribute_type.name => [] pairs
    counts = {}
    # Create a dictionary of gene_attribute_type.name => {} pairs
    stats = {}
    gatypes = dba.get_gene_attribute_types()
    for ga in gatypes:
        counts[ga] = []
        stats[ga] = {}

    tct = dba.get_target_count(idg=False)
    if not args['--quiet']:
        print "\nCollecting counts for {} gene attribute types on {} TCRD targets".format(
            len(gatypes), tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    for t in dba.get_targets(idg=False,
                             include_annotations=True,
                             get_ga_counts=True):
        ct += 1
        pbar.update(ct)
        p = t['components']['protein'][0]
        pid = p['id']
        if 'gene_attribute_counts' not in p: continue
        for type, attr_count in p['gene_attribute_counts'].items():
            counts[type].append(attr_count)
    pbar.finish()

    print "\nCalculatig Gene Attribute stats. See logfile {}.".format(logfile)
    logger.info("Calculatig Gene Attribute stats:")
    for type, l in counts.items():
        if len(l) == 0:
            del (counts[type])
            continue
        npa = numpy.array(l)
        logger.info("  %s: %d counts; mean: %.2f; std: %.2f" %
                    (type, len(l), npa.mean(), npa.std()))
        stats[type]['mean'] = npa.mean()
        stats[type]['std'] = npa.std()

    print "\nLoading HGram CDFs for {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    nan_ct = 0
    cdf_ct = 0
    dba_err_ct = 0
    for t in dba.get_targets(idg=False,
                             include_annotations=True,
                             get_ga_counts=True):
        ct += 1
        p = t['components']['protein'][0]
        pid = p['id']
        if 'gene_attribute_counts' not in p: continue
        for type, attr_count in p['gene_attribute_counts'].items():
            attr_cdf = gaussian_cdf(attr_count, stats[type]['mean'],
                                    stats[type]['std'])
            if math.isnan(attr_cdf):
                attr_cdf = 1.0 / (1.0 + math.exp(-1.702 * (
                    (attr_count - stats[type]['mean']) / stats[type]['std'])))
            if math.isnan(attr_cdf):
                nan_ct += 1
                continue
            rv = dba.ins_hgram_cdf({
                'protein_id': p['id'],
                'type': type,
                'attr_count': attr_count,
                'attr_cdf': attr_cdf
            })
            if not rv:
                dba_err_ct += 1
                continue
            cdf_ct += 1
        pbar.update(ct)
    pbar.finish()
    print "Processed {} targets.".format(ct)
    print "  Loaded {} new hgram_cdf rows".format(cdf_ct)
    print "  Skipped {} NaN CDFs".format(nan_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
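
gaussian_cdf() is not shown in this snippet. A standard definition via the error function is sketched below, together with the logistic approximation Phi(z) ~ 1/(1 + exp(-1.702*z)) that the loader above falls back to when the CDF computation yields NaN. Both definitions here are assumptions, not the loader's actual code.

import math

def gaussian_cdf(x, mu, sigma):
  # standard normal CDF evaluated at the z-score of x (assumed definition)
  return 0.5 * (1.0 + math.erf((x - mu) / (sigma * math.sqrt(2.0))))

def logistic_cdf(x, mu, sigma):
  # logistic approximation used as the loader's fallback
  return 1.0 / (1.0 + math.exp(-1.702 * ((x - mu) / sigma)))

# The two agree closely, e.g. for a count of 7 against mean 5, std 2:
# gaussian_cdf(7, 5, 2) ~ 0.8413, logistic_cdf(7, 5, 2) ~ 0.8458
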
Example No. 29
def calc_and_load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'KEGG Nearest Tclins',
        'source': 'IDG-KMC generated data by Steve Mathias at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'comments': 'Nearest upstream and downstream Tclin targets are found and stored based on KEGG Distances.'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'kegg_nearest_tclin'
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nProcessing {} TCRD targets".format(tct)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    uct = 0
    umark = set()
    dct = 0
    dmark = set()
    dba_err_ct = 0
    for target in dba.get_targets():
        #tids = [1983, 7166]
        #for tid in tids:
        #  target = dba.get_target(tid)
        ct += 1
        if target['tdl'] == 'Tclin':
            continue
        pid = target['components']['protein'][0]['id']
        ups = dba.get_nearest_kegg_tclins(pid, 'upstream')
        if ups:
            umark.add(pid)
            for d in ups:
                d['tclin_id'] = d['protein_id']
                d['protein_id'] = pid
                d['direction'] = 'upstream'
                rv = dba.ins_kegg_nearest_tclin(d)
                if rv:
                    uct += 1
                else:
                    dba_err_ct += 1
        dns = dba.get_nearest_kegg_tclins(pid, 'downstream')
        if dns:
            dmark.add(pid)
            for d in dns:
                d['tclin_id'] = d['protein_id']
                d['protein_id'] = pid
                d['direction'] = 'downstream'
                rv = dba.ins_kegg_nearest_tclin(d)
                if rv:
                    dct += 1
                else:
                    dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()

    if not args['--quiet']:
        print "\n{} targets processed.".format(ct)
        print "  {} non-Tclin targets have upstream Tclin target(s)".format(
            len(umark))
        print "    Inserted {} upstream kegg_nearest_tclin rows".format(uct)
        print "  {} non-Tclin targets have downstream Tclin target(s)".format(
            len(dmark))
        print "    Inserted {} upstream kegg_nearest_tclin rows".format(dct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
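
The only data shaping above is re-labelling the rows returned by dba.get_nearest_kegg_tclins() (assumed to be keyed on the Tclin neighbor's protein_id) so the stored row is anchored on the query protein instead. A small sketch; shape_rows is a hypothetical name.

def shape_rows(rows, pid, direction):
  for d in rows:
    d['tclin_id'] = d['protein_id']  # the Tclin neighbor's protein id
    d['protein_id'] = pid            # the non-Tclin query protein
    d['direction'] = direction       # 'upstream' or 'downstream'
  return rows

# shape_rows([{'protein_id': 42, 'distance': 2}], 7, 'upstream')
# -> [{'protein_id': 7, 'tclin_id': 42, 'distance': 2, 'direction': 'upstream'}]
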
Example No. 30
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as main()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)",
                args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    if not args['--quiet']:
        print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI Gene',
        'source': 'EUtils web API at %s' % EFETCH_GENE_URL,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.ncbi.nlm.nih.gov/gene'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'NCBI Gene Summary'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'NCBI Gene PubMed Count'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'generif'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    }, {
        'dataset_id': dataset_id,
        'table_name': 'alias',
        'where_clause': "dataset_id = %d" % dataset_id
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
            sys.exit(1)

    s = shelve.open(SHELF_FILE, writeback=True)
    s['loaded'] = []
    s['retries'] = {}
    s['counts'] = defaultdict(int)

    tct = dba.get_target_count()
    if not args['--quiet']:
        print "\nLoading NCBI Gene annotations for %d TCRD targets" % tct
    logger.info("Loading NCBI Gene annotations for %d TCRD targets\n" % tct)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    skip_ct = 0
    for t in dba.get_targets(include_annotations=False):
        tid = t['id']
        ct += 1
        p = t['components']['protein'][0]
        pid = p['id']
        if p['geneid'] is None:
            skip_ct += 1
            continue
        geneid = str(p['geneid'])
        logger.info("Processing target %d: geneid %s" % (tid, geneid))
        (status, headers, xml) = get_ncbigene(geneid)
        if not status:
            logger.warn("Failed getting Gene ID %s" % geneid)
            s['retries'][tid] = True
            continue
        if status != 200:
            logger.warn("Bad API response for Gene ID %s: %s" %
                        (geneid, status))
            s['retries'][tid] = True
            continue
        gene_annotations = parse_genexml(xml)
        if not gene_annotations:
            s['counts']['xml_err'] += 1
            logger.error("XML Error for Gene ID %s" % geneid)
            s['retries'][tid] = True
            continue
        load_annotations(dba, t, dataset_id, gene_annotations, s)
        time.sleep(0.5)
        pbar.update(ct)
    pbar.finish()
    print "Processed %d targets." % ct
    if skip_ct > 0:
        print "Skipped %d targets with no geneid" % skip_ct
    print "Loaded NCBI annotations for %d targets" % len(s['loaded'])
    if len(s['retries']) > 0:
        print "Total targets remaining for retries: %d " % len(s['retries'])

    loop = 1
    while len(s['retries']) > 0:
        print "\nRetry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (
            loop, len(s['retries']))
        logger.info(
            "Retry loop %d: Loading NCBI Gene annotations for %d TCRD targets"
            % (loop, len(s['retries'])))
        pbar_widgets = [
            'Progress: ',
            Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ',
            ETA()
        ]
        pbar = ProgressBar(widgets=pbar_widgets,
                           maxval=len(s['retries'])).start()
        ct = 0
        act = 0
        for tid, _ in s['retries'].items():
            ct += 1
            t = dba.get_target(tid, include_annotations=False)
            geneid = str(t['components']['protein'][0]['geneid'])
            logger.info("Processing target %d: geneid %s" % (tid, geneid))
            (status, headers, xml) = get_ncbigene(geneid)
            if not status:
                logger.warn("Failed getting Gene ID %s" % geneid)
                continue
            if status != 200:
                logger.warn("Bad API response for Gene ID %s: %s" %
                            (geneid, status))
                continue
            gene_annotations = parse_genexml(xml)
            if not gene_annotations:
                s['counts']['xml_err'] += 1
                logger.error("XML Error for Gene ID %s" % geneid)
                continue
            load_annotations(dba, t, dataset_id, gene_annotations, s)
            act += 1
            del s['retries'][tid]
            time.sleep(0.5)
            pbar.update(ct)
        pbar.finish()
        loop += 1
        if loop == 5:
            print "Completed 5 retry loops. Aborting."
            break
        print "Processed %d targets." % ct
        print "  Annotated %d additional targets" % act
        print "  Total annotated targets: %d" % len(s['loaded'])
        if len(s['retries']) > 0:
            print "Total targets remaining for retries: %d " % len(
                s['retries'])

    print "\nInserted %d aliases" % s['counts']['alias']
    print "Inserted %d NCBI Gene Summary tdl_infos" % s['counts']['summary']
    print "Inserted %d NCBI Gene PubMed Count tdl_infos" % s['counts']['pmc']
    print "Inserted %d GeneRIFs" % s['counts']['generif']
    print "Inserted %d PubMed xrefs" % s['counts']['pmxr']
    #print "Inserted %d other xrefs" % s['counts']['xref']
    if s['counts']['xml_err'] > 0:
        print "WARNNING: %d XML parsing errors occurred. See logfile %s for details." % (
            s['counts']['xml_err'], logfile)
    if s['counts']['dba_err'] > 0:
        print "WARNNING: %d DB errors occurred. See logfile %s for details." % (
            s['counts']['dba_err'], logfile)
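
get_ncbigene() and EFETCH_GENE_URL are not shown in this snippet. Below is a plausible sketch against NCBI's public E-utilities efetch endpoint, returning the (status, headers, xml) triple the loader consumes; the URL constant and the requests-based implementation are assumptions, not the loader's actual code.

import requests

# Hypothetical constant; the real loader's EFETCH_GENE_URL is not shown above.
EFETCH_GENE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=gene&retmode=xml&id="

def get_ncbigene(geneid):
  try:
    r = requests.get(EFETCH_GENE_URL + str(geneid), timeout=30)
  except requests.exceptions.RequestException:
    return (False, None, None)  # network error; the caller queues a retry
  return (r.status_code, r.headers, r.text)

# The loader treats a falsy status as a network failure and any non-200 status
# as a bad response; the 0.5s sleep between calls stays within E-utilities'
# rate limit of 3 requests/second without an API key.
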