Ejemplo n.º 1
0
def load(args):
    """Load Expression Atlas disease associations into TCRD.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, then reads INPUT_FILE (tab-separated with
    one header line) and inserts one disease row per matching protein for
    every input row.

    Args:
        args: Mapping of command-line options. The keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.

    Raises:
        AssertionError: If inserting the dataset or the provenance row
            fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Expression Atlas',
        'source': 'IDG-KMC generated data at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ebi.ac.uk/gxa/',
        'comment': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'Expression Atlas'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE))
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    with open(INPUT_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        next(tsvreader)  # skip header line
        ct = 0
        k2pids = {}  # 'symbol|ENSG' => protein ids, so each key is looked up once
        pmark = set()  # ids of every protein matched by a lookup
        notfnd = set()  # 'symbol|ENSG' keys for which no target exists
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            # 0: "Gene ID"
            # 1: "DOID"
            # 2: "Gene Name"
            # 3: "log2foldchange"
            # 4: "p-value"
            # 5: "disease"
            # 6: "experiment_id"
            # 7: "contrast_id"
            ct += 1
            # Advance the bar for every row read, including the rows skipped
            # below; otherwise the display stalls on unmatched genes.
            pbar.update(ct)
            sym = row[2]
            ensg = row[0]
            k = "%s|%s" % (sym, ensg)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                # Try the gene symbol first, then fall back to the ENSG xref
                targets = dba.find_targets({'sym': sym}, idg=False)
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'ENSG',
                        'value': ensg
                    })
                if not targets:
                    notfnd.add(k)
                    logger.warning("No target found for {}".format(k))
                    continue
                pids = []
                for t in targets:
                    p = t['components']['protein'][0]
                    pmark.add(p['id'])
                    pids.append(p['id'])
                k2pids[k] = pids
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': 'Expression Atlas',
                    'name': row[5],
                    'did': row[1],
                    'log2foldchange': "%.3f" % float(row[3]),
                    'pvalue': row[4]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("Loaded {} new disease rows for {} proteins.".format(
        dis_ct, len(pmark)))
    if notfnd:
        print("No target found for {} symbols/ensgs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Ejemplo n.º 2
0
def tinx(args):
    """Compute novelty, importance and PubMed-ranking CSV files.

    Reads the Disease Ontology OBO file plus the protein and disease
    mentions files, then writes four CSV files: PROTEIN_NOVELTY_FILE,
    DISEASE_NOVELTY_FILE, IMPORTANCE_FILE and PMID_RANKING_FILE. The
    database is only used to map ENSP identifiers to proteins.

    Args:
        args: Mapping of command-line options. The keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # The results of parsing the input mentions files will be the following
    # dictionaries:
    # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein.
    # Including the UniProt accession in the key is just for convenience when
    # checking the output. It is not used for anything.
    pid2pmids = {}
    # DOID => set of all PMIDs that mention the disease
    doid2pmids = {}
    # PMID => count of diseases mentioned in a given paper
    pmid_disease_ct = {}
    # PMID => count of proteins mentioned in a given paper
    pmid_protein_ct = {}

    # First parse the Disease Ontology OBO file to get DO names and defs
    dofile = DO_DOWNLOAD_DIR + DO_OBO
    print("\nParsing Disease Ontology file {}".format(dofile))
    do = {}
    # Keep the handle in a with-block so it is closed once parsing is done
    with open(dofile) as obofh:
        for stanza in obo.Parser(obofh):
            do[stanza.tags['id'][0].value] = stanza.tags
    print("  Got {} Disease Ontology terms".format(len(do)))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print("\nProcessing {} lines in protein file {}".format(line_ct, fn))
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                # if we don't find a target by stringid, which is the more
                # reliable and preferred way, try by Ensembl xref
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensp
                })
            if not targets:
                notfnd.add(ensp)
                continue
            for t in targets:
                p = t['components']['protein'][0]
                k = "%s,%s" % (p['id'], p['uniprot'])
                pid2pmids.setdefault(k, set()).update(pmids)
                for pmid in pmids:
                    # Counts are floats so the later divisions are true
                    # divisions under Python 2 as well.
                    pmid_protein_ct[pmid] = pmid_protein_ct.get(pmid, 0.0) + 1.0
    pbar.finish()
    for ensp in notfnd:
        logger.warning("No target found for {}".format(ensp))
    print("{} lines processed.".format(ct))
    print("  Skipped {} non-ENSP lines".format(skip_ct))
    print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
    print("  Saved {} PMID to protein count mappings".format(
        len(pmid_protein_ct)))
    if notfnd:
        print("  No target found for {} ENSPs. See logfile {} for details.".format(
            len(notfnd), logfile))

    fn = JL_DOWNLOAD_DIR + DISEASE_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print("\nProcessing {} lines in file {}".format(line_ct, fn))
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in do:
                logger.warning("%s not found in DO" % doid)
                notfnd.add(doid)
                continue
            doid2pmids.setdefault(doid, set()).update(pmids)
            for pmid in pmids:
                pmid_disease_ct[pmid] = pmid_disease_ct.get(pmid, 0.0) + 1.0
    pbar.finish()
    print("{} lines processed.".format(ct))
    print("  Skipped {} non-DOID lines".format(skip_ct))
    print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
    print("  Saved {} PMID to disease count mappings".format(
        len(pmid_disease_ct)))
    if notfnd:
        print("WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(
            len(notfnd), logfile))

    if not args['--quiet']:
        print("\nComputing protein novely scores")
    # To calculate novelty scores, each paper (PMID) is assigned a
    # fractional target (FT) score of one divided by the number of targets
    # mentioned in it. The novelty score of a given protein is one divided
    # by the sum of the FT scores for all the papers mentioning that
    # protein.
    ct = 0
    with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
        pnovf.write("Protein ID,UniProt,Novelty\n")
        for k, ppmids in pid2pmids.items():
            ct += 1
            ft_score_sum = sum((1.0 / pmid_protein_ct[pmid] for pmid in ppmids),
                               0.0)
            novelty = 1.0 / ft_score_sum
            pnovf.write("%s,%.8f\n" % (k, novelty))
    print("  Wrote {} novelty scores to file {}".format(
        ct, PROTEIN_NOVELTY_FILE))

    if not args['--quiet']:
        print("\nComputing disease novely scores")
    # Exactly as for proteins, but using disease mentions
    ct = 0
    with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
        dnovf.write("DOID,Novelty\n")
        for doid, dpmids in doid2pmids.items():
            ct += 1
            ft_score_sum = sum((1.0 / pmid_disease_ct[pmid] for pmid in dpmids),
                               0.0)
            novelty = 1.0 / ft_score_sum
            dnovf.write("%s,%.8f\n" % (doid, novelty))
    print("  Wrote {} novelty scores to file {}".format(
        ct, DISEASE_NOVELTY_FILE))

    if not args['--quiet']:
        print("\nComputing importance scores")
    # To calculate importance scores, each paper is assigned a fractional
    # disease-target (FDT) score of one divided by the product of the
    # number of targets mentioned and the number of diseases
    # mentioned. The importance score for a given disease-target pair is
    # the sum of the FDT scores for all papers mentioning that disease and
    # protein.
    ct = 0
    with open(IMPORTANCE_FILE, 'wb') as impf:
        impf.write("DOID,Protein ID,UniProt,Score\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                fdt_score_sum = 0.0
                for pmid in pd_pmids:
                    fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] *
                                            pmid_disease_ct[pmid])
                # Pairs with no shared papers have a zero sum and are omitted
                if fdt_score_sum > 0:
                    ct += 1
                    impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
    print("  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE))

    if not args['--quiet']:
        print("\nComputing PubMed rankings")
    # PMIDs are ranked for a given disease-target pair based on a score
    # calculated by multiplying the number of targets mentioned and the
    # number of diseases mentioned in that paper. Lower scores have a lower
    # rank (higher priority). If the scores do not discriminate, PMIDs are
    # reverse sorted by value with the assumption that larger PMIDs are
    # newer and of higher priority.
    ct = 0
    with open(PMID_RANKING_FILE, 'wb') as pmrf:
        pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                # scores are tuples of (PMID, protein_mentions*disease_mentions)
                scores = [(pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
                          for pmid in pd_pmids]
                if scores:
                    scores.sort(cmp_pmids_scores)
                    # The written rank is the zero-based position after sorting
                    for i, t in enumerate(scores):
                        ct += 1
                        pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
    print("  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE))
Ejemplo n.º 3
0
def load(args):
  """Load Human Proteome Map expression data into TCRD.

  Configures file logging, connects to the database, registers the dataset
  and its provenance, and then processes four tab-separated files (each
  with one header line):

    * PROTEIN_QUAL_FILE: 'HPM Protein' expression rows, keyed by RefSeq
    * PROTEIN_TAU_FILE:  'HPM Protein Tissue Specificity Index' tdl_infos
    * GENE_QUAL_FILE:    'HPM Gene' expression rows, keyed by gene symbol
    * GENE_TAU_FILE:     'HPM Gene Tissue Specificity Index' tdl_infos

  The tau files are only loaded for identifiers that were mapped to
  proteins while processing the corresponding expression file.

  Args:
    args: Mapping of command-line options. The keys read here are
      '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname' and
      '--quiet'.

  Raises:
    AssertionError: If inserting the dataset or a provenance row fails.
  """
  loglevel = int(args['--loglevel'])
  logfile = args['--logfile'] or LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  logger.info(conn_msg)
  if not args['--quiet']:
    print("\n" + conn_msg)

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Human Proteome Map', 'source': 'IDG-KMC generated data by Oleg Ursu at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.humanproteomemap.org/'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  expr_comment = 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'
  tau_comment = 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Protein'", 'comment': expr_comment},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Gene'", 'comment': expr_comment},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Protein Tissue Specificity Index'", 'comment': tau_comment},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Gene Tissue Specificity Index'", 'comment': tau_comment}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  # Strips a trailing ".<digits>" version suffix from gene identifiers
  version_re = re.compile(r'\.\d+$')

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print("\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE))

  #
  # Protein Level Expressions
  #
  line_ct = slmf.wcl(PROTEIN_QUAL_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in HPM file {}".format(line_ct, PROTEIN_QUAL_FILE))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  rs2pids = defaultdict(list) # RefSeq => protein ids; reused by the tau file below
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(PROTEIN_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    next(tsvreader) # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      rs = row[0] # the RefSeq accession is used as-is, version suffix included
      if rs in rs2pids:
        # we've already found it
        pids = rs2pids[rs]
      elif rs in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'RefSeq', 'value': rs}, False)
        if not targets:
          notfnd.add(rs)
          # Logged so the "See logfile" summary below has details to point at
          logger.warning("No target found for {}".format(rs))
          continue
        pids = [t['components']['protein'][0]['id'] for t in targets]
        rs2pids[rs] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4]}
      if row[3] != 'NA':
        init['number_value'] = row[3]
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      elif tissue not in nouid:
        nouid.add(tissue)
        logger.warning("No Uberon ID found for {}".format(tissue))
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print("Processed {} lines.".format(ct))
  print("  Inserted {} new expression rows for {} proteins ({} RefSeqs)".format(exp_ct, len(pmark), len(rs2pids)))
  if notfnd:
    print("No target found for {} RefSeqs. See logfile {} for details.".format(len(notfnd), logfile))
  if nouid:
    print("No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

  line_ct = slmf.wcl(PROTEIN_TAU_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, PROTEIN_TAU_FILE))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(PROTEIN_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    next(tsvreader) # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      rs = row[0]
      tau = row[1]
      if rs not in rs2pids:
        skip_ct += 1
        continue
      for pid in rs2pids[rs]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Protein Tissue Specificity Index',
                               'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print("Processed {} lines.".format(ct))
  print("  Inserted {} new HPM Protein Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark)))
  if skip_ct > 0:
    print("  Skipped {} rows with RefSeqs not in map from expression file.".format(skip_ct))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

  #
  # Gene Level Expressions
  #
  line_ct = slmf.wcl(GENE_QUAL_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in HPM file {}".format(line_ct, GENE_QUAL_FILE))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  sym2pids = defaultdict(list) # gene symbol => protein ids; reused by the tau file below
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GENE_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    next(tsvreader) # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = version_re.sub('', row[0]) # get rid of version
      if sym in sym2pids:
        # we've already found it
        pids = sym2pids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(sym)
          logger.warning("No target found for {}".format(sym))
          continue
        pids = [t['components']['protein'][0]['id'] for t in targets]
        sym2pids[sym] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4]}
      if row[3] != 'NA':
        init['number_value'] = row[3]
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      elif tissue not in nouid:
        nouid.add(tissue)
        logger.warning("No Uberon ID found for {}".format(tissue))
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print("Processed {} lines.".format(ct))
  print("  Inserted {} new expression rows for {} proteins ({} Gene Symbols)".format(exp_ct, len(pmark), len(sym2pids)))
  if notfnd:
    print("  No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if nouid:
    print("No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

  line_ct = slmf.wcl(GENE_TAU_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, GENE_TAU_FILE))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(GENE_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    next(tsvreader) # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = version_re.sub('', row[0]) # get rid of version
      tau = row[1]
      if sym not in sym2pids:
        skip_ct += 1
        continue
      # Use the gene-symbol map here, not the RefSeq map from the protein
      # section, so each tau value lands on the proteins of its own gene.
      for pid in sym2pids[sym]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Gene Tissue Specificity Index',
                               'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print("Processed {} lines.".format(ct))
  print("  Inserted {} new HPM Gene Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark)))
  if skip_ct > 0:
    print("  Skipped {} rows with symbols not in map from expression file".format(skip_ct))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
Ejemplo n.º 4
0
def load(args):
    """Load JensenLab PubMed text-mining scores into TCRD.

    Configures file logging, connects to the database, registers the
    dataset and its provenance, then reads DOWNLOAD_DIR + FILENAME
    (tab-separated: ENSP, year, score). One pmscore row is inserted per
    (protein, input row), and afterwards one 'JensenLab PubMed Score'
    tdl_info row per protein holding the sum of all of its scores.

    Args:
        args: Mapping of command-line options. The keys read here are
            '--loglevel', '--logfile', '--debug', '--dbhost', '--dbname'
            and '--quiet'.

    Raises:
        SystemExit: With status 1 if inserting a provenance row fails.
    """
    loglevel = int(args['--loglevel'])
    logfile = args['--logfile'] or LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    conn_msg = "Connected to TCRD database {} (schema ver {}; data ver {})".format(
        args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    logger.info(conn_msg)
    if not args['--quiet']:
        print("\n" + conn_msg)

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'JensenLab PubMed Text-mining Scores',
        # Parenthesized to make the evaluation order explicit; the resulting
        # string is the same as with the bare `%` / `+` chain.
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': BASE_URL
    })
    if not dataset_id:
        # NOTE(review): unlike a provenance failure below, this only warns and
        # continues with a falsy dataset_id -- confirm that is intended.
        print("WARNING: Error inserting dataset See logfile %s for details." % logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'pmscore'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'JensenLab PubMed Score'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print("WARNING: Error inserting provenance. See logfile %s for details." % logfile)
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ensp2pids = {}  # ENSP => protein ids, so each ENSP is looked up once
    pmscores = {}  # protein.id => sum(all scores)
    pms_ct = 0
    notfnd = set()
    dba_err_ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print("\nProcessing {} input lines in file {}".format(line_ct, infile))
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            # ENSP  year  score
            ct += 1
            pbar.update(ct)
            if not row[0].startswith('ENSP'):
                continue
            ensp = row[0]
            if ensp in ensp2pids:
                # we've already found it
                pids = ensp2pids[ensp]
            elif ensp in notfnd:
                # we've already not found it
                continue
            else:
                # Try the stringid first, then fall back to the STRING xref
                targets = dba.find_targets({'stringid': ensp})
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                if not targets:
                    notfnd.add(ensp)
                    logger.warning("No target found for {}".format(ensp))
                    continue
                pids = [
                    target['components']['protein'][0]['id']
                    for target in targets
                ]
                ensp2pids[ensp] = pids
            score = float(row[2])
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                # The score is accumulated even when the insert failed
                pmscores[pid] = pmscores[pid] + score if pid in pmscores else score
    pbar.finish()
    print("{} input lines processed.".format(ct))
    print("  Inserted {} new pmscore rows for {} targets".format(
        pms_ct, len(pmscores)))
    if notfnd:
        print("No target found for {} STRING IDs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))

    print("\nLoading {} JensenLab PubMed Score tdl_infos".format(
        len(pmscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'JensenLab PubMed Score',
            'number_value': score
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print("{} processed".format(ct))
    print("  Inserted {} new JensenLab PubMed Score tdl_info rows".format(
        ti_ct))
    if dba_err_ct > 0:
        # Two separate arguments: passing one tuple here raised IndexError
        # instead of printing the warning.
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile))
Ejemplo n.º 5
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'PANTHER protein classes',
        'source':
        'File %s from ftp://ftp.pantherdb.org//sequence_classifications/current_release/PANTHER_Sequence_Classification_files/, and files %s and %s from http://data.pantherdb.org/PANTHER14.1/ontology/'
        % (os.path.basename(P2PC_FILE), os.path.basename(CLASS_FILE),
           os.path.basename(RELN_FILE)),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.pantherdb.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'panther_class'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'p2pc'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    relns = {}
    line_ct = slmf.wcl(RELN_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in relationships file {}".format(
            line_ct, RELN_FILE)
    with open(RELN_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            pcid = row[0]
            parentid = row[2]
            if pcid in relns:
                relns[pcid].append(parentid)
            else:
                relns[pcid] = [parentid]
    print "{} input lines processed.".format(ct)
    print "  Got {} PANTHER Class relationships".format(len(relns))

    pc2dbid = {}
    line_ct = slmf.wcl(CLASS_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in class file {}".format(
            line_ct, CLASS_FILE)
    with open(CLASS_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pc_ct = 0
        pcmark = {}
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            pc = row[0]
            init = {'pcid': pc, 'name': row[2]}
            if row[3]:
                init['desc'] = row[3]
            if pc in relns:
                init['parent_pcids'] = "|".join(relns[pc])
            # there are duplicates in this file too, so only insert if we haven't
            if pc not in pcmark:
                rv = dba.ins_panther_class(init)
                if rv:
                    pc_ct += 1
                else:
                    dba_err_ct += 1
                pc2dbid[pc] = rv
                pcmark[pc] = True
    print "{} lines processed.".format(ct)
    print "  Inserted {} new panther_class rows".format(pc_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(P2PC_FILE)
    regex = re.compile(r'#(PC\d{5})')
    if not args['--quiet']:
        print "\nProcessing {} lines in classification file {}".format(
            line_ct, P2PC_FILE)
    with open(P2PC_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 02
        pmark = {}
        p2pc_ct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            [sp, hgnc, up] = row[0].split('|')
            up = up.replace('UniProtKB=', '')
            hgnc = hgnc.replace('HGNC=', '')
            if not row[8]:
                skip_ct += 1
                continue
            #print "[DEBUG] searching by uniprot", up
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                #print "[DEBUG] searching by Ensembl xref", ensg
                targets = dba.find_targets_by_xref({
                    'xtype': 'HGNC',
                    'value': hgnc
                })
            if not targets:
                k = "%s|%s" % (up, hgnc)
                notfnd.add(k)
                continue
            t = targets[0]
            pid = t['components']['protein'][0]['id']
            pmark[pid] = True
            #print "[DEBUG] PCs:",  row[8]
            for pc in regex.findall(row[8]):
                #print "[DEBUG]    ", pc
                pcid = pc2dbid[pc]
                rv = dba.ins_p2pc({
                    'protein_id': pid,
                    'panther_class_id': pcid
                })
                if rv:
                    p2pc_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for {}".format(k))
    print "{} lines processed.".format(ct)
    print "  Inserted {} new p2pc rows for {} distinct proteins".format(
        p2pc_ct, len(pmark))
    print "  Skipped {} rows without PCs".format(skip_ct)
    if notfnd:
        print "No target found for {} UniProt/HGNCs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Ejemplo n.º 6
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'GTEx', 'source': 'IDG-KMC generated data by Jeremy Yang at UNM from GTEx files.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.gtexportal.org/home/'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'GTEx'", 'comment': 'Pre-processing code can be found here: https://github.com/unmtransinfo/expression-profiles'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'GTEx Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from GTEx files. The score is the Tau value as descibed in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)
  
  line_ct = slmf.wcl(GTEX_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in GTEx file {}".format(line_ct, GTEX_FILE)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  ensg2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GTEX_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # ENSG    SMTSD   SEX     TPM     TAU     TAU_BYSEX       TPM_RANK        TPM_RANK_BYSEX  TPM_LEVEL TPM_LEVEL_BYSEX TPM_F   TPM_M   log2foldchange
      ct += 1
      pbar.update(ct)
      ensg = re.sub('\.\d+$', '', row[0]) # get rid of version if present
      if ensg in ensg2pids:
        # we've already found it
        pids = ensg2pids[ensg]
      elif ensg in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg}, False)
        if not targets:
          notfnd.add(ensg)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        ensg2pids[ensg] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      init = {'tissue': tissue, 'gender': row[2], 'tpm': row[3], 'tpm_rank': row[6],
              'tpm_rank_bysex': row[7], 'tpm_level': row[8], 'tpm_level_bysex': row[9],
              'tau': row[4], 'tau_bysex': row[5]}
      if row[10]:
        init['tpm_f'] = row[10]
      if row[11]:
        init['tpm_m'] = row[11]
      if row[12]:
        init['log2foldchange'] = row[12]
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_gtex(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for ensg in notfnd:
    logger.warn("No target found for {}".format(ensg))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "Processed {} lines".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} ENSGs)".format(exp_ct, len(pmark), len(ensg2pids))
  if notfnd:
    print "  No target found for {} ENSGs. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Ejemplo n.º 7
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  
  # Dataset
  exp_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Expression', 'source': 'File Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full', 'comments': 'Qualitative expression values are generated by the loading app.'} )
  assert exp_dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  cpt_dataset_id = dba.ins_dataset( {'name': 'Human Cell Atlas Compartments', 'source': 'File Table S6 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://science.sciencemag.org/content/356/6340/eaal3321.full'} )
  assert cpt_dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': exp_dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HCA RNA'", 'comment': 'TPM and qualitative expression values are derived from file Table S1 from http://science.sciencemag.org/content/suppl/2017/05/10/science.aal3321.DC1'},
            {'dataset_id': cpt_dataset_id, 'table_name': 'compartment', 'where_clause': "ctype = 'Human Cell Atlas'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if not args['--quiet']:
    print "\nCalculating expression level percentiles"
  pctiles = calc_pctiles()
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  
  #
  # Expressions
  #
  line_ct = slmf.wcl(RNA_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines from HCA file {}".format(line_ct, RNA_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  k2pids = defaultdict(list)
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(RNA_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      sym = row[1]
      ensg = row[0]
      k = "%s|%s"%(sym,ensg)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg}, False)
        if not targets:
          notfnd.add(k)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        k2pids[k] = pids
      for pid in pids:
        cell_lines = [c.replace(' (TPM)', '') for c in header[2:]]
        for (i,cl) in enumerate(cell_lines):
          tpm_idx = i + 2 # add two because row has ENSG and Gene at beginning
          tpm = float(row[tpm_idx])
          qv = calc_qual_value( tpm, pctiles[cl] )
          rv = dba.ins_expression( {'protein_id': pid, 'etype': 'HCA RNA',
                                    'tissue': 'Cell Line '+cl, 
                                    'qual_value': qv, 'number_value': tpm} )
          if not rv:
            dba_err_ct += 1
            continue
          exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins.".format(exp_ct, len(pmark))
  if notfnd:
    print "  No target found for {} Symbols/ENSGs. See logfile {} for details".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Compartments
  #
  line_ct = slmf.wcl(LOC_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines from HCA file {}".format(line_ct, LOC_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  ct = 0
  k2pids = defaultdict(list)
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  cpt_ct = 0
  with open(LOC_FILE, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next()
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      uniprot = row[2]
      sym = row[1]
      k = "%s|%s"%(uniprot,sym)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'uniprot': uniprot}, False)
        if not targets:
          targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(k)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        k2pids[k] = pids
      for pid in pids:
        compartments = [c for c in header[3:-5]]
        for (i,c) in enumerate(compartments):
          val_idx = i + 3 # add three because row has ENSG,Gene,Uniprot at beginning
          val = int(row[val_idx])
          if val == 0:
            continue
          rel = row[-5]
          if rel == 'Uncertain':
            continue
          rv = dba.ins_compartment( {'protein_id': pid, 'ctype': 'Human Cell Atlas',
                                     'go_id': COMPARTMENTS[c][1], 
                                     'go_term': COMPARTMENTS[c][0], 'reliability': rel} )
          if not rv:
            dba_err_ct += 1
            continue
          cpt_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new compartment rows for {} protein.s".format(cpt_ct, len(pmark))
  if notfnd:
    print "  No target found for {} UniProts/Symbols. See logfile {} for details".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Ejemplo n.º 8
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IMPC Mouse Clones',
        'source':
        "File %s obtained directly from Terry Meehan/Alba Gomez at EBI." %
        os.path.basename(IMPC_FILE),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://www.mousephenotype.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'IMPC Clones'"
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'IMPC Status'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    line_ct = slmf.wcl(IMPC_FILE)
    if not args['--quiet']:
        print "\nProcessing {} rows from input file {}".format(
            line_ct, IMPC_FILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    ti1_ct = 0
    ti2_ct = 0
    dba_err_ct = 0
    with open(IMPC_FILE, 'rU') as csvfile:
        csvreader = csv.DictReader(csvfile)
        for d in csvreader:
            # Gene,MGI Accession,Public IDG,Public CMG Tier1,Public CMG Tier 2,Number of notifications,Status,# Clones,Non-Assigned Plans,Assigned plans,Aborted MIs,MIs in Progress,GLT Mice,Private
            ct += 1
            sym = d['Gene'].upper()
            targets = dba.find_targets({'sym': sym})
            if not targets:
                targets = dba.find_targets_by_xref({
                    'xtype': 'MGI ID',
                    'value': d['MGI Accession']
                })
            if not targets:
                k = "%s,%s" % (d['Gene'], d['MGI Accession'])
                notfnd.add(k)
                continue
            if not d['Status'] and not d['# Clones']:
                skip_ct += 1
                continue
            tids = list()
            for t in targets:
                pid = t['components']['protein'][0]['id']
                if not d['Status']:
                    status = '?'
                else:
                    status = d['Status']
                rv = dba.ins_tdl_info({
                    'protein_id': pid,
                    'itype': 'IMPC Status',
                    'string_value': status
                })
                if rv:
                    ti1_ct += 1
                else:
                    dba_err_ct += 1
                if not d['# Clones']:
                    continue
                rv = dba.ins_tdl_info({
                    'protein_id': pid,
                    'itype': 'IMPC Clones',
                    'string_value': d['# Clones']
                })
                if rv:
                    ti2_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for: {}".format(k))
    if not args['--quiet']:
        print "{} rows processed.".format(ct)
    print "Inserted {} new 'IMPC Status' tdl_info rows".format(ti1_ct)
    print "Inserted {} new 'IMPC Clones' tdl_info rows".format(ti2_ct)
    print "Skipped {} rows with no relevant info".format(skip_ct)
    if notfnd:
        print "No target found for {} rows. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Ejemplo n.º 9
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Transcription Factor Flags',
        'source': BASE_URL + FILENAME,
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://humantfs.ccbr.utoronto.ca/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Is Transcription Factor'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}

    ifn = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(ifn)
    if not args['--quiet']:
        print "\nProcessing {} lines in input file {}".format(line_ct, ifn)
    with open(ifn, 'rU') as ifh:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(ifh)
        header = csvreader.next()  # skip header line
        ct = 0
        ti_ct = 0
        skip_ct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in csvreader:
            # 0 Ensembl ID
            # 1 HGNC symbol
            # 2 DBD
            # 3 Is TF?
            # 4 TF assessment
            # 5 Binding mode,Motif status
            # 6 Final Notes
            # 7 Final Comments
            # 8 Interpro ID(s)
            # 9 EntrezGene ID
            # 10 EntrezGene Description
            # 11 PDB ID
            # 12 TF tested by HT-SELEX?
            # 13 TF tested by PBM?
            # 14 Conditional Binding Requirements
            # 15 Original Comments
            # 16 Vaquerizas 2009 classification
            # 17 CisBP considers it a TF?
            # 18 TFCat classification
            # 19 Is a GO TF?
            # 20 Initial assessment
            # 21 Curator 1
            # 22 Curator 2
            # 23 TFclass considers
            ct += 1
            if row[3] != 'Yes':
                skip_ct += 1
                continue
            sym = row[1]
            targets = dba.find_targets({'sym': sym})
            if not targets:
                gid = row[9]
                if gid != 'None' and not gid.startswith('IPR'):
                    targets = dba.find_targets({'geneid': gid})
            if not targets:
                ensg = row[0]
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensg
                })
            if not targets:
                k = "%s|%s|%s" % (sym, gid, ensg)
                notfnd.add(k)
                continue
            t = targets[0]
            TDLs[t['tdl']] += 1
            pid = t['components']['protein'][0]['id']
            rv = dba.ins_tdl_info({
                'protein_id': pid,
                'itype': 'Is Transcription Factor',
                'boolean_value': 1
            })
            if rv:
                ti_ct += 1
            else:
                dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    for k in notfnd:
        logger.warn("No target found for {}".format(k))
    print "\n{} lines processed.".format(ct)
    print "  Inserted {} new 'Is Transcription Factor' tdl_infos".format(ti_ct)
    print "  Skipped {} non-TF lines".format(skip_ct)
    if notfnd:
        print "No target found for {} symbols/geneids/ENSGs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
        print "%s: %d" % (tdl, TDLs[tdl])