Example #1
0
def load_tinx(curs):
  """Bulk-load the four TIN-X tables from their TSV input files.

  Reads each table's input file (path taken from the module-level INFILES
  map) in chunks and inserts the rows with executemany using the matching
  INS_SQL statement.

  Args:
    curs: an open database cursor used for the INSERT statements.
  """
  chunk_size = 50000
  delim = '\t'
  print('\nLoading tinx tables...')
  tables = ('tinx_novelty', 'tinx_disease', 'tinx_importance', 'tinx_articlerank')
  for table in tables:
    print(f"  Loading {table}: ", end='')
    infile = INFILES[table]
    start = time.time()
    header_pending = True
    row_ct = 0
    for chunk in slmf.file_chunker(infile, chunk_size, delim):
      if header_pending:
        # The first row of the first chunk is the header line; drop it.
        chunk.pop(0)
        header_pending = False
      row_ct += len(chunk)
      curs.executemany(INS_SQL[table], [tuple(row) for row in chunk])
    ets = slmf.secs2str(time.time() - start)
    print(f"OK - ({row_ct} rows).  Elapsed time: {ets}")
  print("Done.")
Example #2
0
def load_pubmed(curs, logger, logfile):
  """Load TIN-X pubmed rows from the pubmed TSV input file, one row at a time.

  Duplicate-key errors are counted and skipped; any other database error is
  counted and logged to the logfile via `logger`.

  Args:
    curs: an open database cursor used for the INSERT statements.
    logger: logger that receives per-row error details.
    logfile: path of the log file, only used in the warning message.
  """
  start = time.time()
  infile = INFILES['pubmed']
  total_lines = slmf.wcl(infile)
  print(f'\nLoading TIN-X pubmeds from {infile}...')
  ct = 0
  pm_ct = 0
  dup_ct = 0
  err_ct = 0
  with open(infile, 'r') as ifh:
    for row in csv.reader(ifh, delimiter='\t'):
      ct += 1
      if ct == 1:
        # First line is the header; keep it around but don't insert it.
        header = row
        continue
      slmf.update_progress(ct/total_lines)
      try:
        curs.execute(INS_SQL['pubmed'], tuple(row))
      except Error as e:
        if f"Duplicate entry '{row[0]}'" in e.msg:
          # this should not happen under "production" runs, but it's here for testing/debugging
          dup_ct += 1
        else:
          err_ct += 1
          logger.error(f"``{e}`` for line {ct}. Data: {row}")
        continue
      pm_ct += 1
  ets = slmf.secs2str(time.time() - start)
  print(f"\n  Processed {ct} lines. Inserted {pm_ct} pubmed rows. Elapsed time: {ets}")
  if err_ct:
    print(f"  WARNING: {err_ct} errors occurred. See logfile {logfile} for details.")
  if dup_ct:
    print(f"  Skipped {dup_ct} existing pubmeds.")
  print("Done.")
Example #3
0
def tinx_pubmed(args, dba, tinx_pmids, logger):
  """Fetch pubmed records for TIN-X PMIDs not already in TCRD and write a TSV.

  Computes the set difference between `tinx_pmids` and the PMIDs already in
  the database, fetches the missing articles from E-Utils in chunks of 200,
  and writes one tab-separated line per article to TINX_OUTDIR/TINX_Pubmed.tsv.

  Args:
    args: docopt-style options dict; only '--quiet' is read here.
    dba: database adaptor providing get_pmids().
    tinx_pmids: set of PMIDs referenced by the TIN-X computation.
    logger: logger that receives progress and error messages.
  """
  st = time.time()
  tcrd_pmids = set(dba.get_pmids())
  # Only fetch articles TCRD does not already have.
  new_pmids = [str(pmid) for pmid in (tinx_pmids - tcrd_pmids)]
  new_pmid_ct = len(new_pmids)
  if not args['--quiet']:
    print(f"Fetching pubmed data for {new_pmid_ct} new TIN-X PMIDs")
  logger.info(f"Fetching pubmed data for {new_pmid_ct} new TIN-X PMIDs")
  ct = 0
  net_err_ct = 0
  chunk_ct = 0
  fn = f"{TINX_OUTDIR}TINX_Pubmed.tsv"
  with open(fn, 'w') as ofh:
    # Fixed: header column was misspelled "Autors".
    ofh.write("PubMedID\tTitle\tJournal\tDate\tAuthors\tAbstract\n")
    ct += 1  # count the header line in the "lines written" total
    for chunk in slmf.chunker(new_pmids, 200):
      chunk_ct += 1
      logger.info(f"Processing PMID chunk {chunk_ct}")
      pmas = tpm.fetch_pubmeds(chunk)
      if not pmas:
        # E-Utils failure for this chunk; note it and carry on with the rest.
        logger.error("Bad E-Utils response for PMID chunk {}: {}".format(chunk_ct, ','.join(chunk)))
        net_err_ct += 1
        continue
      for pma in pmas:
        pmid, title, journal, date, authors, abstract = tpm.parse_pubmed_article(pma)
        if abstract:
          ofh.write(f"{pmid}\t{title}\t{journal}\t{date}\t{authors}\t{abstract}\n")
        else:
          # Articles with no abstract get a literal '' placeholder field.
          ofh.write(f"{pmid}\t{title}\t{journal}\t{date}\t{authors}\t''\n")
        ct += 1
  ets = slmf.secs2str(time.time() - st)
  if not args['--quiet']:
    print(f"{ct} lines written to file {fn}. Elapsed time: {ets}")
  if net_err_ct > 0:
    print(f"WARNING: {net_err_ct} Network/E-Utils errors occurred.")
Example #4
0
def tinx(args, dba, do, logger, logfile):
  """Run the full TIN-X computation pipeline and return the TIN-X PMID set.

  Runs protein/disease mention parsing, novelty, importance and pubmed-ranking
  computations in order, timing each step and reporting its output counts
  unless '--quiet' is set.

  Args:
    args: docopt-style options dict; only '--quiet' is read here.
    dba: database adaptor passed through to the TINX object.
    do: disease-ontology object passed through to the TINX object.
    logger: unused here, kept for interface compatibility with callers.
    logfile: path handed to the TINX object for its own logging.

  Returns:
    The set of PMIDs produced by compute_pubmed_rankings().
  """
  # Renamed local (was `tinx`) so it no longer shadows this function's name.
  tx = TINX({'TINX_PROTEIN_FILE': JL_DOWNLOAD_DIR+TINX_PROTEIN_FILE,
             'TINX_DISEASE_FILE': JL_DOWNLOAD_DIR+TINX_DISEASE_FILE,
             'logfile': logfile, 'OUTDIR': TINX_OUTDIR}, dba, do)
  quiet = args['--quiet']

  def _timed(step):
    # Run step() and return (result, human-readable elapsed time).
    t0 = time.time()
    result = step()
    return result, slmf.secs2str(time.time() - t0)

  (ct1, ct2), ets = _timed(tx.parse_protein_mentions)
  if not quiet:
    print(f"Protein mappings: {ct1} protein to PMIDs ; {ct2} PMID to protein counts. Elapsed time: {ets}")
  (ct1, ct2), ets = _timed(tx.parse_disease_mentions)
  if not quiet:
    print(f"Disease mappings: {ct1} disease to PMIDs ; {ct2} PMID to disease counts. Elapsed time: {ets}")
  # These three steps share the same (count, filename) reporting shape.
  for step in (tx.compute_protein_novelty, tx.compute_disease_novelty,
               tx.compute_importances):
    (ct, fn), ets = _timed(step)
    if not quiet:
      print(f"Wrote {ct} lines to file {fn}. Elapsed time: {ets}")
  (ct, tinx_pmids, fn), ets = _timed(tx.compute_pubmed_rankings)
  tinx_pmid_ct = len(tinx_pmids)
  if not quiet:
    print(f"Wrote {ct} lines ({tinx_pmid_ct} total TIN-x PMIDs) to file {fn}. Elapsed time: {ets}")
  return tinx_pmids
Example #5
0
        'table_name': 'alias',
        'where_clause': f"dataset_id = {dataset_id}"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, f"Error inserting provenance. See logfile {logfile} for details."
    load_human(args, dba, dataset_id, eco_map, logger, logfile)

    # Mouse and Rat proteins
    # Dataset and Provenance
    # As for human, we need the dataset id for xrefs and aliases
    dataset_id = dba.ins_dataset({
        'name': 'UniProt Mouse and Rat Proteins',
        'source':
        f"Mouse and Rat  from UniProt XML file {UP_RODENT_FILE} from {UP_BASE_URL}",
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.uniprot.org'
    })
    assert dataset_id, f"Error inserting dataset See logfile {logfile} for details."
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'nhprotein'
    })
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."
    load_mouse_rat(args, dba, dataset_id, logger, logfile)

    elapsed = time.time() - start_time
    print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM,
                                                  slmf.secs2str(elapsed)))
Example #6
0
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  for cfgd in CONFIG:
    name = cfgd['name']
    #download(name)
    parsed_ont = cfgd['parse_function'](cfgd['DOWNLOAD_DIR']+cfgd['FILENAME'])
    cfgd['load_function'](dba, logger, logfile, parsed_ont, cfgd)
    
  elapsed = time.time() - start_time
  print("\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)))
Example #7
0
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  
  st = time.time()
  cnx = None
  try:
    cnx = mysql.connector.connect(host = args['--dbhost'],
                                  database = args['--dbname'],
                                  user = '******',
                                  password = slmf.get_pw('/home/smathias/.dbirc'),
                                  autocommit = True)
    if cnx.is_connected():
      if not args['--quiet']:
        print("Connected to TCRD database {}".format(args['--dbname']))
      curs = cnx.cursor()
      del_dataset(curs)
      drop_tables(curs)
      create_tables(curs)
      load_tinx(curs)
      load_pubmed(curs, logger, logfile)
      load_dataset(curs)
      curs.close()
  except Error as e:
    print(f"ERROR: {e}")
  finally:
    if cnx and cnx.is_connected():
      cnx.commit()
      cnx.close()
  ets = slmf.secs2str(time.time() - st)
  print(f"\n{PROGRAM}: Done. Total time: {ets}\n")