Example no. 1
def parse_mappings(fn):
    line_ct = slmf.wcl(fn)
    print(f"\nProcessing {line_ct} input lines in mapping file {fn}")
    up2chembl = {}
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if row[0].startswith('#'):
                continue
            if row[3] != 'SINGLE PROTEIN':
                continue
            if row[0] in up2chembl:
                up2chembl[row[0]].append(row[1])
            else:
                up2chembl[row[0]] = [row[1]]
    return up2chembl
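
Note: every loader below relies on two small helpers from the slmf module, wcl (line count, as in wc -l) and update_progress. Their real implementations are not shown in these excerpts; a minimal sketch consistent with how they are called (behavior inferred from usage, details assumed) might be:

import sys

def wcl(fname):
    # Count the lines in a file, like `wc -l` (assumed from usage).
    with open(fname) as f:
        return sum(1 for _ in f)

def update_progress(frac):
    # Draw an in-place progress indicator for a fraction in [0, 1] (assumed).
    sys.stdout.write(f"\rProgress: {frac:.1%}")
    sys.stdout.flush()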
Example no. 2
 def parse_protein_mentions(self):
     line_ct = slmf.wcl(self._protein_file)
     self._logger.info("Processing {} lines in protein file {}".format(
         line_ct, self._protein_file))
     with open(self._protein_file, 'r') as tsvf:
         ct = 0
         skip_ct = 0
         notfnd = set()
         for line in tsvf:
             ct += 1
             if not line.startswith('ENSP'):
                 skip_ct += 1
                 continue
             data = line.rstrip().split('\t')
             ensp = data[0]
             pmids = set([int(pmid) for pmid in data[1].split()])
             pids = self._dba.find_protein_ids({'stringid': ensp})
             if not pids:
                 # if we don't find a protein by stringid, which is the more reliable and
                 # preferred way, try by Ensembl xref
                 pids = self._dba.find_protein_ids_by_xref({
                     'xtype': 'Ensembl',
                     'value': ensp
                 })
                 if not pids:
                     notfnd.add(ensp)
                     continue
             for pid in pids:
                 self._pid2pmids[pid] = self._pid2pmids[pid].union(pmids)
                 for pmid in pmids:
                     self._pmid_protein_ct[pmid] += 1.0
     self._logger.info(f"{ct} lines processed")
     self._logger.info(f"  Skipped {skip_ct} non-ENSP lines")
     self._logger.info("  Saved {} protein to PMIDs mappings".format(
         len(self._pid2pmids)))
     self._logger.info("  Saved {} PMID to protein count mappings".format(
         len(self._pmid_protein_ct)))
     if notfnd:
         self._logger.info("  No protein found for {} ENSPs.".format(
             len(notfnd)))
         self._logger.debug("Here they are: {}".format(', '.join(notfnd)))
     return (len(self._pid2pmids), len(self._pmid_protein_ct))
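
Note: parse_protein_mentions only works if self._pid2pmids and self._pmid_protein_ct default missing keys (otherwise lines like self._pmid_protein_ct[pmid] += 1.0 raise KeyError). A plausible constructor fragment, assumed rather than taken from the original class:

from collections import defaultdict

class ProteinMentionParser:  # hypothetical host class for the method above
    def __init__(self, protein_file, dba, logger):
        self._protein_file = protein_file
        self._dba = dba
        self._logger = logger
        # Assumed initialization, inferred from how the attributes are used:
        self._pid2pmids = defaultdict(set)          # protein id => set of PMIDs
        self._pmid_protein_ct = defaultdict(float)  # PMID => protein-mention count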
Example no. 3
 def parse_disease_mentions(self):
     line_ct = slmf.wcl(self._disease_file)
     self._logger.info("Processing {} lines in disease file {}".format(
         line_ct, self._disease_file))
     with open(self._disease_file, 'r') as tsvf:
         ct = 0
         skip_ct = 0
         notfnd = set()
         for line in tsvf:
             ct += 1
             if not line.startswith('DOID:'):
                 skip_ct += 1
                 continue
             data = line.rstrip().split('\t')
             doid = data[0]
             pmids = set([int(pmid) for pmid in data[1].split()])
             if doid not in self._do:
                 self._logger.warn(f"{doid} not found in DO")
                 notfnd.add(doid)
                 continue
             if doid in self._doid2pmids:
                 self._doid2pmids[doid] = self._doid2pmids[doid].union(
                     pmids)
             else:
                 self._doid2pmids[doid] = set(pmids)
             for pmid in pmids:
                 if pmid in self._pmid_disease_ct:
                     self._pmid_disease_ct[pmid] += 1.0
                 else:
                     self._pmid_disease_ct[pmid] = 1.0
     self._logger.info(f"{ct} lines processed.")
     self._logger.info(f"  Skipped {skip_ct} non-DOID lines")
     self._logger.info("  Saved {} DOID to PMIDs mappings".format(
         len(self._doid2pmids)))
     self._logger.info("  Saved {} PMID to disease count mappings".format(
         len(self._pmid_disease_ct)))
     if notfnd:
         self._logger.warn(
             "No entry found in DO map for {} DOIDs: {}".format(
                 len(notfnd), ', '.join(notfnd)))
     return (len(self._doid2pmids), len(self._pmid_disease_ct))
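
Note: the if/else accumulation in parse_disease_mentions is correct but verbose. An equivalent, more idiomatic variant (a sketch, not the project's actual code) uses defaultdict and Counter; Counter counts with ints rather than the 1.0 floats above, which makes no difference to the later divisions:

from collections import Counter, defaultdict

doid2pmids = defaultdict(set)
pmid_disease_ct = Counter()

def record_mentions(doid, pmids):
    # Same effect as the if/else blocks above, in two lines.
    doid2pmids[doid] |= pmids      # merge this line's PMIDs into the set
    pmid_disease_ct.update(pmids)  # +1 for each PMID mentioning this disease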
Example no. 4
def load_pubmed(curs, logger, logfile):
  st = time.time()
  fn = INFILES['pubmed']
  line_ct = slmf.wcl(fn)
  print(f'\nLoading TIN-X pubmeds from {fn}...')
  ct = 0
  pm_ct = 0
  dup_ct = 0
  err_ct = 0
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      if ct == 0: # skip header
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      try:
        curs.execute(INS_SQL['pubmed'], tuple(row))
        pm_ct += 1
      except Error as e:
        if f"Duplicate entry '{row[0]}'" in e.msg:
          # this should not happen under "production" runs, but it's here for testing/debugging
          dup_ct += 1
          continue
        else:
          err_ct += 1
          logger.error(f"``{e}`` for line {ct}. Data: {row}")
          continue
  ets = slmf.secs2str(time.time() - st)
  print(f"\n  Processed {ct} lines. Inserted {pm_ct} pubmed rows. Elapsed time: {ets}")
  if err_ct:
    print(f"  WARNING: {err_ct} errors occurred. See logfile {logfile} for details.")
  if dup_ct:
    print(f"  Skipped {dup_ct} existing pubmeds.")
  print("Done.")
Example no. 5
def load(args, dba, logger, logfile):
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print(f"\nProcessing {line_ct} lines in file {IDG_LIST_FILE}")
    logger.info(f"Processing {line_ct} lines in list file {IDG_LIST_FILE}")
    ct = 0
    idg_ct = 0
    fam_ct = 0
    notfnd = []
    multfnd = []
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'r') as ifh:
        csvreader = csv.reader(ifh)
        for row in csvreader:
            if ct == 0:
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[0]
            fam = row[1]
            if fam == 'IonChannel':
                fam = 'IC'
            tids = dba.find_target_ids({'sym': sym})
            if not tids:
                notfnd.append(sym)
                continue
            if len(tids) > 1:
                multfnd.append(sym)
                continue
            rv = dba.do_update({
                'table': 'target',
                'col': 'idg',
                'id': tids[0],
                'val': 1
            })
            if rv:
                idg_ct += 1
            else:
                dba_err_ct += 1
            rv = dba.do_update({
                'table': 'target',
                'col': 'fam',
                'id': tids[0],
                'val': fam
            })
            if rv:
                fam_ct += 1
            else:
                dba_err_ct += 1
    print(f"{ct} lines processed")
    print(f"{idg_ct} target rows updated with IDG flags")
    print(f"{fam_ct} target rows updated with fams")
    if notfnd:
        print("WARNING: No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("WARNING: Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} database errors occured. See logfile {logfile} for details."
        )
Example no. 6
def tinx(args, dba, logger, logfile):
  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {} # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {} # PMID => count of diseases mentioned in a given paper 
  pmid_protein_ct = {} # PMID => count of proteins mentioned in a given paper 

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  fn = JL_DOWNLOAD_DIR+PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in protein file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      tids = dba.find_target_ids({'stringid': ensp})
      if not tids:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        tids = dba.find_target_ids_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not tids:
        notfnd.add(ensp)
        continue
      for tid in tids:
        t = dba.get_target(tid, annot=False)
        p = t['components']['protein'][0]
        k = "{},{}".format(p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  for ensp in notfnd:
    logger.warn(f"No target found for {ensp}")
  print(f"\n{ct} lines processed")
  print(f"  Skipped {skip_ct} non-ENSP lines")
  print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
  print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
  if notfnd:
    print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

  fn = JL_DOWNLOAD_DIR+DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warn(f"{doid} not found in DO")
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  print(f"\n{ct} lines processed.")
  print(f"  Skipped {skip_ct} non-DOID lines")
  print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
  print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
  if notfnd:
    print("WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))

  if not args['--quiet']:
    print("\nComputing protein novely scores")
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
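  # In symbols, if P(k) is the set of PMIDs mentioning protein k:
  #   novelty(k) = 1 / sum(1 / pmid_protein_ct[m] for m in P(k))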
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write( "%s,%.8f\n" % (k, novelty) )
  print(f"  Wrote {ct} novelty scores to file {PROTEIN_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing disease novely scores")
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write( "%s,%.8f\n" % (doid, novelty) )
  print(f"  Wrote {ct} novelty scores to file {DISEASE_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing importance scores")
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
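  # In symbols, with P(k) and P(d) the PMID sets for protein k and disease d:
  #   importance(d,k) = sum(1 / (pmid_protein_ct[m] * pmid_disease_ct[m])
  #                         for m in P(k) & P(d))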
  ct = 0
  with open(IMPORTANCE_FILE, 'w') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k,ppmids in pid2pmids.items():
      for doid,dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / ( pmid_protein_ct[pmid] * pmid_disease_ct[pmid] )
        if fdt_score_sum > 0:
          ct += 1
          impf.write( "%s,%s,%.8f\n" % (doid, k, fdt_score_sum) )
  print(f"  Wrote {ct} importance scores to file {IMPORTANCE_FILE}")

  if not args['--quiet']:
    print("\nComputing PubMed rankings")
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
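  # In symbols: score(m) = pmid_protein_ct[m] * pmid_disease_ct[m];
  # PMIDs are sorted by ascending score, ties broken by descending PMID.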
  ct = 0
  with open(PMID_RANKING_FILE, 'w') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k,ppmids in pid2pmids.items():
      for doid,dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = [] # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append( (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]) )
        if len(scores) > 0:
          scores.sort(key = cmp_to_key(cmp_pmids_scores))
          for i,t in enumerate(scores):
            ct += 1
            pmrf.write( "%s,%s,%d,%d\n" % (doid, k, t[0], i) )
  print(f"  Wrote {ct} PubMed rankings to file {PMID_RANKING_FILE}")
Example no. 7
def load_pmscores(dba, logger, logfile):
    ensp2pids = {}  # ENSP => list of TCRD protein ids
    pmscores = {}  # protein.id => sum(all scores)
    pms_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    infile = JL_DOWNLOAD_DIR + PM_SCORES_FILE
    line_ct = slmf.wcl(infile)
    print(f"Processing {line_ct} lines in file {infile}")
    with open(infile, 'r') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            # sym  year  score
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            if ensp in ensp2pids:
                # we've already found it
                pids = ensp2pids[ensp]
            elif ensp in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                    if not pids:
                        notfnd.add(ensp)
                        logger.warn("No protein found for {}".format(ensp))
                        continue
                ensp2pids[ensp] = pids  # save this mapping so we only lookup each ENSP once
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                if pid in pmscores:
                    pmscores[pid] += float(row[2])
                else:
                    pmscores[pid] = float(row[2])
    print(f"{ct} input lines processed.")
    print("  Inserted {} new pmscore rows for {} proteins".format(
        pms_ct, len(pmscores)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP")
    if notfnd:
        print(
            "  No protein found for {} STRING IDs. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    print("Updating {} JensenLab PubMed Scores...".format(len(pmscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.upd_pms_tdlinfo(pid, score)
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print(f"  Updated {ti_ct} 'JensenLab PubMed Score' tdl_info rows")
    if dba_err_ct:
        print(
            f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
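
Note: the ensp2pids dict plus the notfnd set in load_pmscores is a hand-rolled memo so each ENSP hits the database only once. If the dba handle were reachable from a helper, functools.lru_cache would give the same behavior (a sketch under that assumption):

from functools import lru_cache

@lru_cache(maxsize=None)
def lookup_pids(ensp):
    # Assumes a module-level dba handle, as in the snippet above.
    # Returns a tuple of protein ids, or () if not found; cached per ENSP.
    pids = dba.find_protein_ids({'stringid': ensp})
    if not pids:
        pids = dba.find_protein_ids_by_xref({'xtype': 'STRING',
                                             'value': '9606.' + ensp})
    return tuple(pids or ())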
Example no. 8
def load_DISEASES(dba, logger, logfile):
    # Knowledge channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_K
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Knowledge file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        skip_ct = 0
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                    if not pids:
                        notfnd.add(k)
                        logger.warn(f"No protein found for {k}")
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Knowledge ' + row[4]
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    # Experiment channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_E
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Experiment file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            if row[2].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                    if not pids:
                        notfnd.add(k)
                        logger.warn(f"No protein found for {k}")
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Experiment ' + row[4]
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP or with ENSP did")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    # Text Mining channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_T
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Textmining file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            if float(row[5]) < 3.0:
                # skip rows with confidence < 3.0
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                    if not pids:
                        notfnd.add(k)
                        logger.warn(f"No protein found for {k}")
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Text Mining'
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'zscore': row[4],
                    'conf': row[5]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP or with confidence < 3")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Example no. 9
def load(args, dba, dataset_id, logger, logfile):
  line_ct = slmf.wcl(HGNC_TSV_FILE)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {HGNC_TSV_FILE}")
  ct = 0
  hgnc_ct = 0
  mgi_ct = 0
  chr_ct = 0
  sym_ct = 0
  symdiscr_ct = 0
  geneid_ct = 0
  geneiddiscr_ct = 0
  notfnd = set()
  pmark = {}
  db_err_ct = 0
  with open(HGNC_TSV_FILE, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      # 0: HGNC ID
      # 1: Approved symbol
      # 2: Approved name
      # 3: Status
      # 4: Chromosome
      # 5: Mouse genome database ID
      # 6: NCBI Gene ID
      # 7: UniProt ID
      if ct == 0:
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[1]
      if row[6] != '':
        geneid = int(row[6])
      else:
        geneid = None
      if row[7] != '':
        up = row[7]
      else:
        up = None
      pids = dba.find_protein_ids({'sym': sym})
      if not pids and geneid:
        pids = dba.find_protein_ids({'geneid': geneid})
      if not pids and up:
        pids = dba.find_protein_ids({'uniprot': up})
      if up and not pids:
        notfnd.add(f"{sym}|{geneid}|{up}")
        logger.warn(f"No protein found for {sym}|{geneid}|{up}")
        continue
      for pid in pids:
        # HGNC xref
        hgncid = row[0].replace('HGNC:', '')
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'HGNC ID',
                           'dataset_id': dataset_id, 'value': hgncid})
        if rv:
          hgnc_ct += 1
        else:
          db_err_ct += 1
        # MGI xref
        if row[5] != '':
          mgiid = row[5].replace('MGI:', '')
          rv = dba.ins_xref({'protein_id': pid, 'xtype': 'MGI ID',
                             'dataset_id': dataset_id, 'value': mgiid})
          if rv:
            mgi_ct += 1
          else:
            db_err_ct += 1
        # Add protein.chr values
        rv = dba.do_update({'table': 'protein', 'col': 'chr', 'id': pid, 'val': row[4]})
        if rv:
          chr_ct += 1
        else:
          db_err_ct += 1
        p = dba.get_protein(pid)
        # Add missing syms
        if p['sym'] is None:
          rv = dba.do_update({'table': 'protein', 'col': 'sym', 'id': pid, 'val': sym})
          if rv:
            logger.info("Inserted new sym {} for protein {}|{}".format(sym, pid, p['uniprot']))
            sym_ct += 1
          else:
            db_err_ct += 1
        else:
          # Check for symbol discrepancies
          if p['sym'] != sym:
            logger.warn("Symbol discrepancy: UniProt's=%s, HGNC's=%s" % (p['sym'], sym))
            symdiscr_ct += 1
        if geneid:
          # Add missing geneids
          if p['geneid'] is None:
            rv = dba.do_update({'table': 'protein', 'col': 'geneid', 'id': pid, 'val': geneid})
            if rv:
              logger.info("Inserted new geneid {} for protein {}, {}".format(geneid, pid, p['uniprot']))
              geneid_ct += 1
            else:
              db_err_ct += 1
          else:
            # Check for geneid discrepancies
            if p['geneid'] != geneid:
              logger.warn("GeneID discrepancy: UniProt's={}, HGNC's={}".format(p['geneid'], geneid))
              geneiddiscr_ct += 1
        pmark[pid] = True
  print("Processed {} lines - {} proteins annotated.".format(ct, len(pmark)))
  if notfnd:
    print("No protein found for {} lines (with UniProts).".format(len(notfnd)))
  print(f"  Updated {chr_ct} protein.chr values.")
  print(f"  Inserted {hgnc_ct} HGNC ID xrefs")
  print(f"  Inserted {mgi_ct} MGI ID xrefs")
  if sym_ct > 0:
    print(f"  Inserted {sym_ct} new HGNC symbols")
  if symdiscr_ct > 0:
    print(f"WARNING: Found {symdiscr_ct} discrepant HGNC symbols. See logfile {logfile} for details")
  if geneid_ct > 0:
    print(f"  Inserted {geneid_ct} new NCBI Gene IDs")
  if geneiddiscr_ct > 0:
    print(f"WARNING: Found {geneiddiscr_ct} discrepant NCBI Gene IDs. See logfile {logfile} for details")
  if db_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")
Example no. 10
def load_tinx(args, dba, do, logger, logfile):
  fn = f"{TINX_OUTDIR}ProteinNovelty.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # Protein ID,UniProt,Novelty
    ct = 1
    tn_ct = 0
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      pid = row[0]
      rv = dba.ins_tinx_novelty( {'protein_id': pid, 'score': float(row[2])} )
      if rv:
        tn_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {tnct} new tinx_novelty rows".)
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")
  
  dmap = {}
  fn = f"{TINX_OUTDIR}DiseaseNovelty.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Novelty
    ct = 1
    dct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      doid = row[0]
      if doid in do:
        if 'name' in do[doid]:
          dname = do[doid]['name'][0].value
        else:
          continue
        if 'def' in do[doid]:
          ddef = do[doid]['def'][0].value
        else:
          ddef = None
      else:
        logger.warn("{row[0]} not in DO map")
        notfnd.append(row[0])
        continue
      rv = dba.ins_tinx_disease( {'doid': doid, 'name': dname, 
                                  'summary': ddef, 'score': float(row[1])} )
      if rv:
        dct += 1
        dmap[doid] = rv # map DOID to tinx_disease.id
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {dct} new tinx_disease rows".)
  print("  Saved {} keys in dmap".format(len(dmap)))
  if notfnd:
    print("WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")

  imap = {}
  fn = f"{TINX_OUTDIR}Importance.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Protein ID,UniProt,Score
    ct = 1
    ti_ct = 0
    skips1 = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if row[0] not in dmap:
        logger.warn("{row[0]} not in dmap")
        skips1.add(row[0])
        continue
      did = dmap[row[0]]
      pid = row[1]
      rv = dba.ins_tinx_importance( {'protein_id': pid, 'disease_id': did,
                                     'score': float(row[3])} )
      if rv:
        ti_ct += 1
        # map DOID|PID to tinx_importance.id
        k = f"{row[0]}|{row[1]}"
        imap[k] = rv 
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {ti_ct} new tinx_importance rows".)
  print("  Saved {} keys in imap".format(len(imap)))
  if len(skips1) > 0:
    print("WARNNING: No disease found in dmap for {} DOIDs. See logfile {} for details.".format(len(skips1), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")

  fn = f"{TINX_OUTDIR}PMIDRanking.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("f\nProcessing {line_ct} lines in file {fn}")
  regex = re.compile(r"^DOID:0*")
  with open(fn, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Protein ID,UniProt,PubMed ID,Rank
    ct = 1
    tar_ct = 0
    skips = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      k = "%s|%s"%(row[0],row[1])
      if k not in imap:
        logger.warn("{k} not in imap")
        skips.add(k)
        continue
      iid = imap[k]
      rv = dba.ins_tinx_articlerank( {'importance_id': iid, 'pmid': row[3], 'rank': row[4]} )
      if rv:
        tar_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print("  Inserted {tar_ct} new tinx_articlerank rows".)
  if len(skips) > 0:
    print("WARNNING: No importance found in imap for {} keys. See logfile {} for details.".format(len(skips), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")
Example no. 11
def load(args, dba, logger, logfile):
  fn = DOWNLOAD_DIR + GENO_PHENO_FILE.replace('.gz', '')
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in input file {fn}")
  ct = 0
  pt_ct = 0
  pmark = {}
  sym2nhpids = {}
  notfnd = set()
  skip_ct = 0
  dba_err_ct = 0
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      # 0: marker_accession_id
      # 1: marker_symbol
      # 2: phenotyping_center
      # 3: colony_id
      # 4: sex
      # 5: zygosity
      # 6: allele_accession_id
      # 7: allele_symbol
      # 8: allele_name
      # 9: strain_accession_id
      # 10: strain_name
      # 11: project_name
      # 12: project_fullname
      # 13: pipeline_name
      # 14: pipeline_stable_id
      # 15: procedure_stable_id
      # 16: procedure_name
      # 17: parameter_stable_id
      # 18: parameter_name
      # 19: top_level_mp_term_id
      # 20: top_level_mp_term_name
      # 21: mp_term_id
      # 22: mp_term_name
      # 23: p_value
      # 24: percentage_change
      # 25: effect_size
      # 26: statistical_method
      # 27: resource_name
      if ct == 0:
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[1]
      if not row[21] and not row[22]:
        # skip data with neither a term_id nor a term_name
        skip_ct += 1
        continue
      if sym in sym2nhpids:
        # we've already found it
        nhpids = sym2nhpids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhpids = dba.find_nhprotein_ids({'sym': sym}, species = 'Mus musculus')
        if not nhpids:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        sym2nhpids[sym] = nhpids # save this mapping so we only lookup each symbol once
      pval = None
      if row[23] and row[23] != '':
        try:
          pval = float(row[23])
        except ValueError:
          logger.warn("Problem converting p_value {} for row {}".format(row[23], ct))
      sex = None
      if row[4] and len(row[4]) <= 8:
        sex = row[4]
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC',
                                'top_level_term_id': row[19], 'top_level_term_name': row[20],
                                'term_id': row[21], 'term_name': row[22], 'p_value': pval,
                                'percentage_change': row[24], 'effect_size': row[25],
                                'procedure_name': row[16], 'parameter_name': row[18],
                                'statistical_method': row[26], 'sex': sex, 'gp_assoc': 1})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
  if notfnd:
    print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if skip_ct > 0:
    print(f"Skipped {skip_ct} lines with no term_id or term_name.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  fn = DOWNLOAD_DIR + STAT_RES_FILE.replace('.gz', '')
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines from input file {fn}")
  ct = 0
  pt_ct = 0
  pmark = {}
  sym2nhpids = {}
  notfnd = set()
  skip_ct = 0
  pv_ct = 0
  dba_err_ct = 0
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      # 0: phenotyping_center
      # 1: intercept_estimate
      # 2: procedure_id
      # 3: mutant_biological_model_id
      # 4: rotated_residuals_test
      # 5: weight_effect_p_value
      # 6: male_mutant_count
      # 7: pipeline_stable_key
      # 8: female_ko_effect_p_value
      # 9: pipeline_stable_id
      # 10: parameter_stable_key
      # 11: data_type
      # 12: parameter_stable_id
      # 13: interaction_significant
      # 14: strain_accession_id
      # 15: control_selection_method
      # 16: parameter_name
      # 17: allele_name
      # 18: phenotyping_center_id
      # 19: weight_effect_stderr_estimate
      # 20: weight_effect_parameter_estimate
      # 21: procedure_stable_id
      # 22: status
      # 23: sex_effect_parameter_estimate
      # 24: female_ko_effect_stderr_estimate
      # 25: female_percentage_change
      # 26: group_2_residuals_normality_test
      # 27: marker_accession_id
      # 28: mp_term_name
      # 29: group_1_residuals_normality_test
      # 30: genotype_effect_p_value
      # 31: dependent_variable
      # 32: resource_name
      # 33: project_id
      # 34: procedure_name
      # 35: doc_id
      # 36: top_level_mp_term_id
      # 37: allele_accession_id
      # 38: blups_test
      # 39: null_test_p_value
      # 40: p_value
      # 41: marker_symbol
      # 42: control_biological_model_id
      # 43: pipeline_name
      # 44: sex
      # 45: interaction_effect_p_value
      # 46: colony_id
      # 47: project_name
      # 48: female_ko_parameter_estimate
      # 49: female_mutant_count
      # 50: organisation_id
      # 51: external_db_id
      # 52: female_control_count
      # 53: intermediate_mp_term_id
      # 54: db_id
      # 55: male_ko_effect_p_value
      # 56: top_level_mp_term_name
      # 57: metadata_group
      # 58: sex_effect_stderr_estimate
      # 59: zygosity
      # 60: male_percentage_change
      # 61: sex_effect_p_value
      # 62: mp_term_id
      # 63: male_ko_effect_stderr_estimate
      # 64: additional_information
      # 65: statistical_method
      # 66: _version_
      # 67: intercept_estimate_stderr_estimate
      # 68: male_control_count
      # 69: intermediate_mp_term_name
      # 70: strain_name
      # 71: classification_tag
      # 72: effect_size
      # 73: procedure_stable_key
      # 74: allele_symbol
      # 75: resource_id
      # 76: group_2_genotype
      # 77: variance_significant
      # 78: pipeline_id
      # 79: group_1_genotype
      # 80: male_ko_parameter_estimate
      # 81: genotype_effect_parameter_estimate
      # 82: categories
      # 83: parameter_id
      # 84: batch_significant
      # 85: genotype_effect_stderr_estimate
      # 86: resource_fullname
      if ct == 0:
        header = row # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[41]
      if not row[62] and not row[28]:
        # skip lines with neither a term_id nor a term_name
        skip_ct += 1
        continue
      if sym in sym2nhpids:
        # we've already found it
        nhpids = sym2nhpids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhpids = dba.find_nhprotein_ids({'sym': sym}, species = 'Mus musculus')
        if not nhpids:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        sym2nhpids[sym] = nhpids # save this mapping so we only lookup each symbol once
      pval = None
      if row[40] and row[40] != '':
        try:
          pval = float(row[40])
        except ValueError:
          logger.warn("Problem converting p_value {} for row {}".format(row[40], ct))
      sex = None
      if row[44] and len(row[44]) <= 8:  # sex is column 44 in this file
        sex = row[44]
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC',
                                'top_level_term_id': row[36], 'top_level_term_name': row[56],
                                'term_id': row[62], 'term_name': row[28], 'p_value': pval,
                                'effect_size': row[72], 'procedure_name': row[34],
                                'parameter_name': row[16], 'statistical_method': row[65],
                                'sex': sex, 'gp_assoc': 0})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
  if notfnd:
    print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if skip_ct > 0:
    print(f"Skipped {skip_ct} lines with no term_id or term_name.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
Example no. 12
def load(dba, logger, logfile):
    infile = DOWNLOAD_DIR + TIGA_FILE
    line_ct = slmf.wcl(infile)
    print(f"\nProcessing {line_ct} lines in TIGA file {infile}")
    ct = 0
    k2pids = defaultdict(list)  # maps sym|ENSG to TCRD protein_id(s)
    notfnd = set()
    pmark = {}
    tiga_ct = 0
    dba_err_ct = 0
    with open(infile, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            # 0: ensemblId
            # 1: efoId
            # 2: trait
            # 3: n_study
            # 4: n_snp
            # 5: n_snpw
            # 6: geneNtrait
            # 7: geneNstudy
            # 8: traitNgene
            # 9: traitNstudy
            # 10: pvalue_mlog_median
            # 11: pvalue_mlog_max
            # 12: or_median
            # 13: n_beta
            # 14: study_N_mean
            # 15: rcras
            # 16: geneSymbol
            # 17: TDL
            # 18: geneFamily
            # 19: geneIdgList
            # 20: geneName
            # 21: meanRank
            # 22: meanRankScore
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[16]
            ensg = row[0]
            k = sym + '|' + ensg
            pids = []
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                # look it up
                pids = dba.find_protein_ids({'sym': sym})
                if not pids:
                    pids = dba.find_protein_ids_by_xref({
                        'xtype': 'Ensembl',
                        'value': ensg
                    })
                    if not pids:
                        notfnd.add(k)
                        continue
                k2pids[k] = pids  # save this mapping so we only lookup each sym/ENSG once
            init = {
                'ensg': ensg,
                'efoid': row[1],
                'trait': row[2],
                'n_study': row[3],
                'n_snp': row[4],
                'n_snpw': row[5],
                'geneNtrait': row[6],
                'geneNstudy': row[7],
                'traitNgene': row[8],
                'traitNstudy': row[9],
                'pvalue_mlog_median': row[10],
                'pvalue_mlog_max': row[11],
                'n_beta': row[13],
                'study_N_mean': row[14],
                'rcras': row[15],
                'meanRank': row[21],
                'meanRankScore': row[22]
            }
            if row[12] != 'NA':
                init['or_median'] = row[12]
            for pid in pids:
                init['protein_id'] = pid
                rv = dba.ins_tiga(init)
                if not rv:
                    dba_err_ct += 1
                    continue
                tiga_ct += 1
                pmark[pid] = True
    for k in notfnd:
        logger.warn(f"No protein found for {k}")
    print(f"Processed {ct} lines")
    print("  Inserted {} new tiga rows for {} proteins".format(
        tiga_ct, len(pmark)))
    if notfnd:
        print("No target found for {} sym/ENSGs. See logfile {} for details.".
              format(len(notfnd), logfile))
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    infile = DOWNLOAD_DIR + TIGA_PROV_FILE
    line_ct = slmf.wcl(infile)
    print(f"\nProcessing {line_ct} lines in TIGA provenance file {infile}")
    ct = 0
    tigaprov_ct = 0
    dba_err_ct = 0
    with open(infile, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            # 0: ensemblId
            # 1: TRAIT_URI
            # 2: STUDY_ACCESSION
            # 3: PUBMEDID
            # 4: efoId
            ct += 1
            slmf.update_progress(ct / line_ct)
            rv = dba.ins_tiga_provenance({
                'ensg': row[0],
                'efoid': row[4],
                'study_acc': row[2],
                'pubmedid': row[3]
            })
            if not rv:
                dba_err_ct += 1
                continue
            tigaprov_ct += 1
    print(f"Processed {ct} lines")
    print(f"  Inserted {tigaprov_ct} new tiga_provenance rows")
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
Example no. 13
def load(args, dba, logger, logfile):
  # OMIMs and Phenotypic Series
  fname = DOWNLOAD_DIR + TITLES_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines from input file {fname}")
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'r') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    omim_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0: Prefix ???
      # 1: Mim Number
      # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
      # 3: Included Title(s); symbols
      title = row[2].partition(';')[0]
      rv = dba.ins_omim({'mim': row[1], 'title': title})
      if not rv:
        dba_err_ct += 1
        continue
      omim_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "  Skipped {} commented lines.".format(skip_ct)
  print "Loaded {} new omim rows".format(omim_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  fname = DOWNLOAD_DIR + PS_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print('\nProcessing %d lines from input file %s' % (line_ct, fname))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'r') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    ps_ct = 0
    err_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0: Phenotypic Series Number
      # 1: Mim Number
      # 2: Phenotype
      if len(row) == 2:
        init = {'omim_ps_id': row[0], 'title': row[1]}
      elif len(row) == 3:
        init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
      else:
        err_ct += 1
        logger.warn("Parsing error for row {}".format(row))
        continue
      rv = dba.ins_omim_ps(init)
      if not rv:
        dba_err_ct += 1
        continue
      ps_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "  Skipped {} commented lines.".format(skip_ct)
  print "Loaded {} new omim_ps rows".format(ps_ct)
  if err_ct > 0:
    print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(er_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
    
  # Phenotypes
  fname = DOWNLOAD_DIR + GENEMAP_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print('\nProcessing %d lines from input file %s' % (line_ct, fname))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'r') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    tmark = {}
    skip_ct = 0
    notfnd_ct = 0
    prov_ct = 0
    dds_ct = 0
    pt_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0 - Sort ???
      # 1 - Month
      # 2 - Day
      # 3 - Year
      # 4 - Cytogenetic location
      # 5 - Gene Symbol(s)
      # 6 - Confidence
      # 7 - Gene Name
      # 8 - MIM Number
      # 9 - Mapping Method
      # 10 - Comments
      # 11 - Phenotypes
      # 12 - Mouse Gene Symbol
      pts = row[11]
      if pts.startswith('?'):
        prov_ct += 1
        continue
      if '(4)' in pts:
        dds_ct += 1
      trait = "MIM Number: %s" % row[8]
      if row[11]:
        trait += "; Phenotype: %s" % pts
      found = False
      syms = row[5].split(', ')
      logger.info("Checking for OMIM syms: {}".format(syms))
      for sym in syms:
        targets = dba.find_targets({'sym': sym})
        if targets:
          found = True
          for t in targets:
            p = t['components']['protein'][0]
            logger.info("  Symbol {} found target {}: {}, {}".format(sym, t['id'], p['name'], p['description']))
            rv = dba.ins_phenotype({'protein_id': p['id'], 'ptype': 'OMIM', 'trait': trait})
            if not rv:
              dba_err_ct += 1
              continue
            tmark[t['id']] = True
            pt_ct += 1
      if not found:
        notfnd_ct += 1
        logger.warn("No target found for row {}".format(row))
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "  Skipped {} commented lines.".format(skip_ct)
  print "  Skipped {} provisional phenotype rows.".format(prov_ct)
  print "  Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
  print "Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark))
  if notfnd_ct > 0:
    print "No target found for {} good lines. See logfile {} for details.".format(notfnd_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)