Example #1
def cnv_gene_attribute_types(args, ifn, ofn):
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  line_ct = slmf.wcl(ifn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, ifn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  rct = 0
  wct = 0
  with open(ofn, 'w') as ofh:
    ofh.write("LOCK TABLES `gene_attribute_type` WRITE;\n")
    ofh.write("/*!40000 ALTER TABLE `gene_attribute_type` DISABLE KEYS */;\n")
    ofh.write("INSERT INTO `gene_attribute_type` VALUES ")
    with open(ifn, 'r') as ifh:
      csvreader = csv.reader(ifh)
      header = csvreader.next() # skip header line
      rct = 1
      for row in csvreader:
        # "id","name","association","description","resource_group","measurement","attribute_group","attribute_type","pubmed_ids","url"
        rct += 1
        ofh.write('("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],row[9]))
        if rct < line_ct:
          ofh.write(',')
        wct += 1
        pbar.update(rct)
      pbar.finish()
    ofh.write(";\n/*!40000 ALTER TABLE `gene_attribute_type` ENABLE KEYS */;\nUNLOCK TABLES;\n")
  print "Processed {} lines".format(rct)
  print "  Wrote inserts for {} new gene_attribute_type rows to file {}".format(wct, ofn)
  return
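Note: every example in this section calls slmf.wcl() to count input lines before starting a progress bar. The slmf module itself is not shown; a minimal sketch consistent with how it is used (a pure-Python "wc -l") could be:

def wcl(fname):
  # Hypothetical stand-in for slmf.wcl: return the number of lines in
  # a file, like `wc -l`. The real implementation is not shown here.
  with open(fname) as f:
    return sum(1 for _ in f)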
Example #2
def cnv_gene_attribute_types(args, ifn, ofn):
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(ifn)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, ifn)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    rct = 0
    wct = 0
    with open(ofn, 'w') as ofh:
        ofh.write(
            '"id","name","association","description","resource_group","measurement","attribute_group","attribute_type","pubmed_ids","url"\n'
        )
        with open(ifn, 'r') as ifh:
            csvreader = csv.reader(ifh)
            header = csvreader.next()  # skip header line
            rct = 1
            for row in csvreader:
                # "id","name","association","description","resource_group","measurement","attribute_group","attribute_type","pubmed_ids","url"
                rct += 1
                ofh.write(
                    '"{}","{}","{}","{}","{}","{}","{}","{}","{}","{}"\n'.format(
                        row[0], row[1], row[2], row[3], row[4], row[5],
                        row[6], row[7], row[8], row[9]))
                wct += 1
                pbar.update(rct)
            pbar.finish()
    print "Processed {} lines".format(rct)
    print "  Wrote {} new gene_attribute_type rows to file {}".format(wct, ofn)
    return
Example #3
def parse_ens_files(args):
  for sp in UP2ENSG.keys():
    fn = CONFIG[sp]['ensfile']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
      print "Processing {} lines in file {}".format(line_ct, fn)
    pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    with open(fn, 'rU') as tsv:
      tsvreader = csv.reader(tsv, delimiter='\t')
      header = tsvreader.next() # skip header line
      for row in tsvreader:
        ct += 1
        # 0: gene_stable_id
        # 1: transcript_stable_id
        # 2: protein_stable_id
        # 3: xref
        # 4: db_name
        # 5: info_type
        # 6: source_identity
        # 7: xref_identity
        # 8: linkage_type
        if row[7] != '100':
          continue
        UP2ENSG[sp][row[3]].add(row[0])
        pbar.update(ct)
    pbar.finish()
  if not args['--quiet']:
    mct = sum([len(UP2ENSG[sp]) for sp in UP2ENSG.keys()])
    print "Now have {} UniProt to ENSG mappings.\n".format(mct)
Example #4
def load(args, dba, logger, logfile):
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    hom_ct = 0
    nf_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      # homologene_group_id    tax_id    ncbi_gene_id    symbol    protein_gi    ref_seq
      taxid = int(row[1])
      if taxid not in TAXIDS:
        skip_ct += 1
        continue
      if taxid == 9606:
        targets = dba.find_targets({'geneid': row[2]})
        if not targets:
          nf_ct += 1
          logger.warn("No target found for {}".format(row))
          continue
        for t in targets:
          p = t['components']['protein'][0]
          rv = dba.ins_homologene({'protein_id': p['id'], 'groupid': row[0], 'taxid': taxid})
          if rv:
            hom_ct += 1
          else:
            dba_err_ct += 1
      else:
        nhproteins = dba.find_nhproteins({'geneid': row[2]})
        if not nhproteins:
          nf_ct += 1
          logger.warn("No nhprotein found for {}".format(row))
          continue
        for nhp in nhproteins:
          rv = dba.ins_homologene({'nhprotein_id': nhp['id'], 'groupid': row[0], 'taxid': taxid})
          if rv:
            hom_ct += 1
          else:
            dba_err_ct += 1
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "Loaded {} new homologene rows".format(hom_ct)
  print "  Skipped {} non-Human/Mouse/Rat lines".format(skip_ct)
  if nf_ct > 0:
    print "WARNING: No target/nhprotein found for {} lines. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #5
def load(args, dba, logfile, logger, ver, fn):
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    ct = 0
    ins_ct = 0
    dba_err_ct = 0
    with open(fn, 'rU') as ifh:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        csvreader = csv.reader(ifh)
        for row in csvreader:
            # 0: TCRD DB ID in version
            # 1: Name
            # 2: Description
            # 3: UniProt
            # 4: Symbol
            # 5: Gene ID
            # 6: TDL
            # 7: Family
            ct += 1
            geneid = None
            if row[5] != '\\N':  # '\N' is the NULL marker in MySQL data exports
                geneid = row[5]
            rv = dba.ins_idg_evol({
                'tcrd_ver': ver,
                'tcrd_dbid': row[0],
                'name': row[1],
                'description': row[2],
                'uniprot': row[3],
                'sym': row[4],
                'geneid': geneid,
                'tdl': row[6],
                'fam': row[7]
            })
            if not rv:
                dba_err_ct += 1
                continue
            ins_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "  Inserted {} new idg_evol rows".format(ins_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    return True
Example #6
def parse_hcop16(args):
    gzfn = DOWNLOAD_DIR + FILENAME
    fn = gzfn.replace('.gz', '')
    orthos = list()
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in input file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        tsvreader = csv.DictReader(tsv, delimiter='\t')
        for d in tsvreader:
            # ortholog_species
            # human_entrez_gene
            # human_ensembl_gene
            # hgnc_id
            # human_name
            # human_symbol
            # human_chr
            # human_assert_ids
            # ortholog_species_entrez_gene
            # ortholog_species_ensembl_gene
            # ortholog_species_db_id
            # ortholog_species_name
            # ortholog_species_symbol
            # ortholog_species_chr
            # ortholog_species_assert_ids
            # support
            src_ct = 0
            srcs = []
            if 'Inparanoid' in d['support']:
                src_ct += 1
                srcs.append('Inparanoid')
            if 'OMA' in d['support']:
                src_ct += 1
                srcs.append('OMA')
            if 'EggNOG' in d['support']:
                src_ct += 1
                srcs.append('EggNOG')
            if src_ct >= 2:  # only keep rows supported by at least two of the three sources
                d['sources'] = ', '.join(srcs)
                orthos.append(d)
    if not args['--quiet']:
        print "  Generated ortholog dataframe with {} entries".format(
            len(orthos))
    ortho_df = pd.DataFrame(orthos)
    return ortho_df
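The three repeated membership checks above implement a two-of-three support filter. As a sketch (not from the original source), the same filter can be written more compactly:

# Equivalent, more compact form of the two-of-three support filter:
srcs = [s for s in ('Inparanoid', 'OMA', 'EggNOG') if s in d['support']]
if len(srcs) >= 2:
    d['sources'] = ', '.join(srcs)
    orthos.append(d)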
Example #7
def cnv_gene_attributes(args, idmap, ifn, ofn):
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    line_ct = slmf.wcl(ifn)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, ifn)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    rct = 0
    wct = 0
    skip_ct = 0
    gaid = 1
    with open(ofn, 'w') as ofh:
        ofh.write('"id","protein_id","gat_id","name","value"\n')
        with open(ifn, 'r') as ifh:
            csvreader = csv.reader(ifh)
            header = csvreader.next()  # skip header line
            rct = 1
            for row in csvreader:
                # "id","protein_id","gat_id","name","value"
                rct += 1
                v5pid = int(row[1])
                if v5pid in idmap:
                    # v5 protein maps to v6 protein
                    v6pid = idmap[v5pid]
                else:
                    skip_ct += 1
                    continue
                ofh.write('"{}","{}","{}","{}","{}"\n'.format(
                    gaid, v6pid, row[2], row[3], row[4]))
                gaid += 1
                wct += 1
                pbar.update(rct)
            pbar.finish()
    print "Processed {} lines.".format(rct)
    print "  Wrote {} new gene_attribute rows to file {}".format(wct, ofn)
    print "  Skipped {} rows that do not map from v5 to v6.".format(skip_ct)
    return
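cnv_gene_attributes takes an idmap of v5 protein IDs (ints) to v6 protein IDs. A hypothetical way to build it from a two-column CSV; the file name and column layout are assumptions:

import csv

# Hypothetical: build the v5 => v6 protein id map consumed above.
idmap = {}
with open('v5_to_v6_protein_map.csv') as f:  # assumed file name
    reader = csv.reader(f)
    reader.next()  # assumed header line (Python 2, as in the examples)
    for v5, v6 in reader:
        idmap[int(v5)] = int(v6)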
Example #8
def main():
    for ver, fn in INFILES.items():
        line_ct = slmf.wcl(fn)
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
        ct = 0
        with open(fn, 'r') as ifh:
            csvreader = csv.reader(ifh)
            for row in csvreader:
                # name, uniprot, sym, geneid, tdl
                ct += 1
                up = row[1]
                sym = row[2]
                tdl = row[4]
                TDLEvol[up][ver] = tdl
                UP2Sym[up] = sym
        print "{} lines processed.".format(ct)
        print "{} entries now in TDLEvol.".format(len(TDLEvol))

        ct = 0
        header = [
            'UniProt', 'HGNC Symbol', 'v1 TDL', 'v2 TDL', 'v3 TDL', 'v4 TDL',
            'v5 TDL', 'v6 TDL'
        ]
        ct += 1  # counts the header row written below
        with open(OUTFILE, 'w') as csvout:
            csvwriter = csv.writer(csvout,
                                   quotechar='"',
                                   quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(header)
            for up, tdld in TDLEvol.items():
                outrow = [up, UP2Sym[up]]
                for ver in ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']:
                    if ver in tdld:
                        outrow.append(tdld[ver])
                    else:
                        outrow.append('')
                csvwriter.writerow(outrow)
                ct += 1
        print "\nWrote {} line to output file {}.".format(ct, OUTFILE)
    return True
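main() relies on module-level TDLEvol and UP2Sym maps that are not shown. A hypothetical initialization consistent with the TDLEvol[up][ver] = tdl usage above:

from collections import defaultdict

# Hypothetical module-level state assumed by main():
TDLEvol = defaultdict(dict)  # UniProt => {TCRD version: TDL}
UP2Sym = {}                  # UniProt => HGNC symbol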
Example #9
def load_RGD(args, dba, logger, logfile):
    fn = CONFIG['RGD']['DOWNLOAD_DIR'] + CONFIG['RGD']['QTL_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "Processing {} lines in processed RGD file {}".format(
            line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dba_err_ct = 0
    nhpmark = {}
    qtl_ct = 0
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            ct += 1
            # 0 "GENE_RGD_ID"
            # 1 "nhprotein_id"
            # 2 "QTL_RGD_ID"
            # 3 "QTL_SYMBOL"
            # 4 "QTL_NAME"
            # 5 "LOD"
            # 6 "P_VALUE"
            # 7 "TRAIT_NAME"
            # 8 "MEASUREMENT_TYPE"
            # 9 "ASSOCIATED_DISEASES"
            # 10 "PHENOTYPES"
            init = {
                'nhprotein_id': row[1],
                'rgdid': row[0],
                'qtl_rgdid': row[2],
                'qtl_symbol': row[3],
                'qtl_name': row[4]
            }
            if row[5] and row[5] != 'None':
                init['lod'] = row[5]
            if row[6] and row[6] != 'None':
                init['p_value'] = row[6]
            if row[7] and row[7] != 'None':
                init['trait_name'] = row[7]
            if row[8] and row[8] != 'None':
                init['measurement_type'] = row[8]
            if row[9] and row[9] != 'None':
                init['associated_disease'] = row[9]
            if row[10] and row[10] != 'None':
                init['phenotype'] = row[10]
            rv = dba.ins_rat_qtl(init)
            if not rv:
                dba_err_ct += 1
                continue
            qtl_ct += 1
            nhpmark[row[1]] = True
            pbar.update(ct)
    pbar.finish()
    print "Processed {} lines".format(ct)
    print "Inserted {} new rat_qtl rows for {} nhproteins.".format(
        qtl_ct, len(nhpmark))
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    fn = CONFIG['RGD']['DOWNLOAD_DIR'] + CONFIG['RGD']['TERMS_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "Processing {} lines in processed RGD file {}".format(
            line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dba_err_ct = 0
    term_ct = 0
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            ct += 1
            # 0 "RGD_ID"
            # 1 "OBJECT_SYMBOL"
            # 2 "TERM_ACC_ID"
            # 3 "TERM_NAME"
            # 4 "QUALIFIER"
            # 5 "EVIDENCE"
            # 6 "ONTOLOGY"
            init = {'rgdid': row[0], 'term_id': row[2]}
            if row[1] and row[1] != 'None':
                init['obj_symbol'] = row[1]
            if row[3] and row[3] != 'None':
                init['term_name'] = row[3]
            if row[4] and row[4] != 'None':
                init['qualifier'] = row[4]
            if row[5] and row[5] != 'None':
                init['evidence'] = row[5]
            if row[6] and row[6] != 'None':
                init['ontology'] = row[6]
            rv = dba.ins_rat_term(init)
            if not rv:
                dba_err_ct += 1
                continue
            term_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "Processed {} lines".format(ct)
    print "Inserted {} new rat_term rows.".format(term_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'RGD',
        'source': 'Files {} and {} produced by UNM KMC group from files from ftp://ftp.rgd.mcw.edu/pub/data_release/'.format(
            CONFIG['RGD']['QTL_FILE'], CONFIG['RGD']['TERMS_FILE']),
        'app': PROGRAM,
        'app_version': __version__
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'rat_term'},
             {'dataset_id': dataset_id, 'table_name': 'rat_qtl'}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)
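load_RGD reads its file locations from a CONFIG mapping that is not shown. A hypothetical shape consistent with the lookups above; the directory and file names are placeholders:

# Hypothetical CONFIG shape; all values here are assumptions.
CONFIG = {
    'RGD': {
        'DOWNLOAD_DIR': '../data/RGD/',
        'QTL_FILE': 'rat_qtls.tsv',
        'TERMS_FILE': 'rat_terms.tsv',
    },
}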
Example #10
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'GWAS Catalog',
        'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' %
                  os.path.basename(INFILE),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ebi.ac.uk/gwas/home'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INFILE)
    line_ct -= 1  # don't count the header line
    if not args['--quiet']:
        print '\nProcessing {} lines from input file {}'.format(
            line_ct, INFILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    outlist = []
    with open(INFILE, 'rU') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct = 0
        notfnd = set()
        pmark = {}
        gwas_ct = 0
        dba_err_ct = 0
        # 0: DATE ADDED TO CATALOG
        # 1: PUBMEDID
        # 2: FIRST AUTHOR
        # 3: DATE
        # 4: JOURNAL
        # 5: LINK
        # 6: STUDY
        # 7: DISEASE/TRAIT
        # 8: INITIAL SAMPLE SIZE
        # 9: REPLICATION SAMPLE SIZE
        # 10: REGION
        # 11: CHR_ID
        # 12: CHR_POS
        # 13: REPORTED GENE(S)
        # 14: MAPPED_GENE
        # 15: UPSTREAM_GENE_ID
        # 16: DOWNSTREAM_GENE_ID
        # 17: SNP_GENE_IDS
        # 18: UPSTREAM_GENE_DISTANCE
        # 19: DOWNSTREAM_GENE_DISTANCE
        # 20: STRONGEST SNP-RISK ALLELE
        # 21: SNPS
        # 22: MERGED
        # 23: SNP_ID_CURRENT
        # 24: CONTEXT
        # 25: INTERGENIC
        # 26: RISK ALLELE FREQUENCY
        # 27: P-VALUE
        # 28: PVALUE_MLOG
        # 29: P-VALUE (TEXT)
        # 30: OR or BETA
        # 31: 95% CI (TEXT)
        # 32: PLATFORM [SNPS PASSING QC]
        # 33: CNV
        # 34: MAPPED_TRAIT
        # 35: MAPPED_TRAIT_URI
        # 36: STUDY ACCESSION
        # 37: GENOTYPING TECHNOLOGY
        symregex = re.compile(r' ?[-,;] ?')
        for row in tsvreader:
            ct += 1
            if len(row) < 14: continue
            symstr = row[14]
            if symstr == 'NR': continue
            symlist = symregex.split(symstr)
            for sym in symlist:
                if sym in notfnd:
                    continue
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    notfnd.add(sym)
                    logger.warn("No target found for symbol {}".format(sym))
                    continue
                for t in targets:
                    p = t['components']['protein'][0]
                    try:
                        pval = float(row[27])
                    except ValueError:
                        pval = None
                    try:
                        orbeta = float(row[30])
                    except ValueError:
                        orbeta = None
                    if row[25]:
                        ig = int(row[25])
                    else:
                        ig = None
                    rv = dba.ins_gwas({
                        'protein_id': p['id'],
                        'disease_trait': row[7],
                        'snps': row[21],
                        'pmid': row[1],
                        'study': row[6],
                        'context': row[24],
                        'intergenic': ig,
                        'p_value': pval,
                        'or_beta': orbeta,
                        'cnv': row[33],
                        'mapped_trait': row[34],
                        'mapped_trait_uri': row[35]
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    pmark[p['id']] = True
                    gwas_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new gwas rows for {} proteins".format(
        gwas_ct, len(pmark.keys()))
    if notfnd:
        print "No target found for {} symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
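The args mapping used by these loaders (keys like '--dbhost' and '--quiet') looks like a docopt dictionary. A hypothetical usage string that would produce the expected keys; the program name and defaults are assumptions:

from docopt import docopt  # assumption: args come from docopt

usage = """
Usage: loader.py [--dbhost=HOST] [--dbname=NAME] [--logfile=FILE]
                 [--loglevel=N] [--debug] [--quiet]

Options:
  --dbhost=HOST  Database host [default: localhost]
  --dbname=NAME  Database name [default: tcrd]
  --loglevel=N   Logging level [default: 20]
"""
args = docopt(usage)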
Example #11
def load_JAX(args, dba, logger, logfile):
    fn = CONFIG['MPO_OWL_FILE']
    if not args['--quiet']:
        print "Parsing Mammalian Phenotype Ontology file {}".format(fn)
    mpo = parse_mp_owl(fn)
    if not args['--quiet']:
        print "Got {} MP terms".format(len(mpo))

    fn = CONFIG['JAX']['DOWNLOAD_DIR'] + CONFIG['JAX']['FILENAME']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "Processing {} lines from JAX file {}".format(line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pt_ct = 0
        skip_ct = 0
        pmark = {}
        notfnd = set()
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if not row[6]:  # no MP terms on this line
                skip_ct += 1
                continue
            sym = row[0]
            geneid = row[1]
            k = "%s|%s" % (sym, geneid)
            if k in notfnd:
                continue
            targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                targets = dba.find_targets({'geneid': geneid}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            for t in targets:
                pid = t['components']['protein'][0]['id']
                pmark[pid] = True
                for mpid in row[6].split():
                    rv = dba.ins_phenotype({
                        'protein_id': pid,
                        'ptype': 'JAX/MGI Human Ortholog Phenotype',
                        'term_id': mpid,
                        'term_name': mpo[mpid]['name']
                    })
                    if rv:
                        pt_ct += 1
                    else:
                        dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} new phenotype rows for {} proteins".format(
        pt_ct, len(pmark))
    print "  Skipped {} lines with no MP terms".format(skip_ct)
    if notfnd:
        print "  No target found for {} gene symbols/ids. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'JAX/MGI Mouse/Human Orthology Phenotypes',
        'source': 'File %s from ftp.informatics.jax.org' % CONFIG['JAX']['FILENAME'],
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.informatics.jax.org/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{'dataset_id': dataset_id,
              'table_name': 'phenotype',
              'where_clause': "ptype = 'JAX/MGI Human Ortholog Phenotype'"}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)
Example #12
def tinx(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # The results of parsing the input mentions files will be the following
    # dictionaries. Including the UniProt accession in the pid2pmids key is
    # just for convenience when checking the output; it is not used for
    # anything.
    pid2pmids = {}        # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
    doid2pmids = {}       # DOID => set of all PMIDs that mention the disease
    pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
    pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

    # First parse the Disease Ontology OBO file to get DO names and defs
    dofile = DO_DOWNLOAD_DIR + DO_OBO
    print "\nParsing Disease Ontology file {}".format(dofile)
    do_parser = obo.Parser(open(dofile))
    do = {}
    for stanza in do_parser:
        do[stanza.tags['id'][0].value] = stanza.tags
    print "  Got {} Disease Ontology terms".format(len(do))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in protein file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                # if we don't find a target by stringid, which is the more
                # reliable and preferred way, try by Ensembl xref
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensp
                })
            if not targets:
                notfnd.add(ensp)
                continue
            for t in targets:
                p = t['components']['protein'][0]
                k = "%s,%s" % (p['id'], p['uniprot'])
                if k in pid2pmids:
                    pid2pmids[k] = pid2pmids[k].union(pmids)
                else:
                    pid2pmids[k] = set(pmids)
                for pmid in pmids:
                    if pmid in pmid_protein_ct:
                        pmid_protein_ct[pmid] += 1.0
                    else:
                        pmid_protein_ct[pmid] = 1.0
    pbar.finish()
    for ensp in notfnd:
        logger.warn("No target found for {}".format(ensp))
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-ENSP lines".format(skip_ct)
    print "  Saved {} protein to PMIDs mappings".format(len(pid2pmids))
    print "  Saved {} PMID to protein count mappings".format(
        len(pmid_protein_ct))
    if notfnd:
        print "  No target found for {} ENSPs. See logfile {} for details.".format(
            len(notfnd), logfile)

    fn = JL_DOWNLOAD_DIR + DISEASE_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in do:
                logger.warn("%s not found in DO" % doid)
                notfnd.add(doid)
                continue
            if doid in doid2pmids:
                doid2pmids[doid] = doid2pmids[doid].union(pmids)
            else:
                doid2pmids[doid] = set(pmids)
            for pmid in pmids:
                if pmid in pmid_disease_ct:
                    pmid_disease_ct[pmid] += 1.0
                else:
                    pmid_disease_ct[pmid] = 1.0
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-DOID lines".format(skip_ct)
    print "  Saved {} DOID to PMIDs mappings".format(len(doid2pmids))
    print "  Saved {} PMID to disease count mappings".format(
        len(pmid_disease_ct))
    if notfnd:
        print "WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(
            len(notfnd), logfile)

    if not args['--quiet']:
        print "\nComputing protein novelty scores"
    # To calculate novelty scores, each paper (PMID) is assigned a
    # fractional target (FT) score of one divided by the number of targets
    # mentioned in it. The novelty score of a given protein is one divided
    # by the sum of the FT scores for all the papers mentioning that
    # protein.
    ct = 0
    with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
        pnovf.write("Protein ID,UniProt,Novelty\n")
        for k in pid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in pid2pmids[k]:
                ft_score_sum += 1.0 / pmid_protein_ct[pmid]
            novelty = 1.0 / ft_score_sum
            pnovf.write("%s,%.8f\n" % (k, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, PROTEIN_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing disease novelty scores"
    # Exactly as for proteins, but using disease mentions
    ct = 0
    with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
        dnovf.write("DOID,Novelty\n")
        for doid in doid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in doid2pmids[doid]:
                ft_score_sum += 1.0 / pmid_disease_ct[pmid]
            novelty = 1.0 / ft_score_sum
            dnovf.write("%s,%.8f\n" % (doid, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, DISEASE_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing importance scores"
    # To calculate importance scores, each paper is assigned a fractional
    # disease-target (FDT) score of one divided by the product of the
    # number of targets mentioned and the number of diseases
    # mentioned. The importance score for a given disease-target pair is
    # the sum of the FDT scores for all papers mentioning that disease and
    # protein.
    ct = 0
    with open(IMPORTANCE_FILE, 'wb') as impf:
        impf.write("DOID,Protein ID,UniProt,Score\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                fdt_score_sum = 0.0
                for pmid in pd_pmids:
                    fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] *
                                            pmid_disease_ct[pmid])
                if fdt_score_sum > 0:
                    ct += 1
                    impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
    print "  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE)

    if not args['--quiet']:
        print "\nComputing PubMed rankings"
    # PMIDs are ranked for a given disease-target pair based on a score
    # calculated by multiplying the number of targets mentioned and the
    # number of diseases mentioned in that paper. Lower scores have a lower
    # rank (higher priority). If the scores do not discriminate, PMIDs are
    # reverse sorted by value with the assumption that larger PMIDs are
    # newer and of higher priority.
    ct = 0
    with open(PMID_RANKING_FILE, 'wb') as pmrf:
        pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                # scores are tuples of (PMID, protein_mentions*disease_mentions)
                scores = []
                for pmid in pd_pmids:
                    scores.append(
                        (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
                if len(scores) > 0:
                    scores.sort(cmp_pmids_scores)
                    for i, t in enumerate(scores):
                        ct += 1
                        pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
    print "  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
Example #13
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'OMIM',
        'source': 'Files %s downloaded from omim.org' %
                  ", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE]),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://omim.org/',
        'comments': 'Confirmed OMIM phenotypes and OMIM Phenotype Series info'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'omim'},
             {'dataset_id': dataset_id, 'table_name': 'omim_ps'},
             {'dataset_id': dataset_id, 'table_name': 'phenotype',
              'where_clause': "ptype = 'OMIM'"}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    # OMIMs and Phenotypic Series
    fname = DOWNLOAD_DIR + TITLES_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print '\nProcessing %d lines from input file %s' % (line_ct, fname)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        omim_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Prefix ???
            # 1: Mim Number
            # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
            # 3: Included Title(s); symbols
            title = row[2].partition(';')[0]
            rv = dba.ins_omim({'mim': row[1], 'title': title})
            if not rv:
                dba_err_ct += 1
                continue
            omim_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    print "Loaded {} new omim rows".format(omim_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    fname = DOWNLOAD_DIR + PS_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print '\nProcessing %d lines from input file %s' % (line_ct, fname)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        ps_ct = 0
        err_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Phenotypic Series Number
            # 1: Mim Number
            # 2: Phenotype
            if len(row) == 2:
                init = {'omim_ps_id': row[0], 'title': row[1]}
            elif len(row) == 3:
                init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
            else:
                err_ct += 1
                logger.warn("Parsing error for row {}".format(row))
                continue
            rv = dba.ins_omim_ps(init)
            if not rv:
                dba_err_ct += 1
                continue
            ps_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    print "Loaded {} new omim_ps rows".format(ps_ct)
    if err_ct > 0:
        print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(
            er_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Phenotypes
    fname = DOWNLOAD_DIR + GENEMAP_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print '\nProcessing %d lines from input file %s' % (line_ct, fname)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        tmark = {}
        skip_ct = 0
        notfnd_ct = 0
        prov_ct = 0
        dds_ct = 0
        pt_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0 - Sort ???
            # 1 - Month
            # 2 - Day
            # 3 - Year
            # 4 - Cytogenetic location
            # 5 - Gene Symbol(s)
            # 6 - Confidence
            # 7 - Gene Name
            # 8 - MIM Number
            # 9 - Mapping Method
            # 10 - Comments
            # 11 - Phenotypes
            # 12 - Mouse Gene Symbol
            pts = row[11]
            if pts.startswith('?'):
                # a leading '?' marks a provisional phenotype mapping
                prov_ct += 1
                continue
            if '(4)' in pts:
                # mapping key (4) marks deletion/duplication syndrome rows
                dds_ct += 1
            trait = "MIM Number: %s" % row[8]
            if row[11]:
                trait += "; Phenotype: %s" % pts
            found = False
            syms = row[5].split(', ')
            logger.info("Checking for OMIM syms: {}".format(syms))
            for sym in syms:
                targets = dba.find_targets({'sym': sym})
                if targets:
                    found = True
                    for t in targets:
                        p = t['components']['protein'][0]
                        logger.info(
                            "  Symbol {} found target {}: {}, {}".format(
                                sym, t['id'], p['name'], p['description']))
                        rv = dba.ins_phenotype({
                            'protein_id': p['id'],
                            'ptype': 'OMIM',
                            'trait': trait
                        })
                        if not rv:
                            dba_err_ct += 1
                            continue
                        tmark[t['id']] = True
                        pt_ct += 1
            if not found:
                notfnd_ct += 1
                logger.warn("No target found for row {}".format(row))
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    print "  Skipped {} provisional phenotype rows.".format(prov_ct)
    print "  Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
    print "Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark))
    if notfnd_ct > 0:
        print "No target found for {} good lines. See logfile {} for details.".format(
            notfnd_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #14
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Expression Atlas',
        'source': 'IDG-KMC generated data at UNM.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ebi.ac.uk/gxa/',
        'comment': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'Expression Atlas'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    with open(INPUT_FILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct = 0
        k2pids = {}
        pmark = {}
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            # 0: "Gene ID"
            # 1: "DOID"
            # 2: "Gene Name"
            # 3: "log2foldchange"
            # 4: "p-value"
            # 5: "disease"
            # 6: "experiment_id"
            # 7: "contrast_id"
            ct += 1
            sym = row[2]
            ensg = row[0]
            k = "%s|%s" % (sym, ensg)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                targets = dba.find_targets({'sym': sym}, idg=False)
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'ENSG',
                        'value': ensg
                    })
                if not targets:
                    notfnd.add(k)
                    logger.warn("No target found for {}".format(k))
                    continue
                pids = []
                for t in targets:
                    p = t['components']['protein'][0]
                    pmark[p['id']] = True
                    pids.append(p['id'])
                # save this mapping so we only lookup each target once
                k2pids[k] = pids
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': 'Expression Atlas',
                    'name': row[5],
                    'did': row[1],
                    'log2foldchange': "%.3f" % float(row[3]),
                    'pvalue': row[4]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} new disease rows for {} proteins.".format(
        dis_ct, len(pmark))
    if notfnd:
        print "No target found for {} symbols/ensgs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Example #15
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'JensenLab PubMed Text-mining Scores',
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': BASE_URL
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    # Provenance
    provs = [{'dataset_id': dataset_id, 'table_name': 'pmscore'},
             {'dataset_id': dataset_id, 'table_name': 'tdl_info',
              'where_clause': "itype = 'JensenLab PubMed Score'"}]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
            sys.exit(1)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ensp2pids = {}
    pmscores = {}  # protein.id => sum(all scores)
    pms_ct = 0
    upd_ct = 0
    notfnd = {}
    dba_err_ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(line_ct, infile)
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            # ENSP  year  score
            ct += 1
            pbar.update(ct)
            if not row[0].startswith('ENSP'): continue
            ensp = row[0]
            if ensp in ensp2pids:
                # we've already found it
                pids = ensp2pids[ensp]
            elif ensp in notfnd:
                # we've already not found it
                continue
            else:
                targets = dba.find_targets({'stringid': ensp})
                if not targets:
                    targets = dba.find_targets_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                    if not targets:
                        notfnd[ensp] = True
                        logger.warn("No target found for {}".format(ensp))
                        continue
                pids = []
                for target in targets:
                    pids.append(target['components']['protein'][0]['id'])
                # save this mapping so we only lookup each target once
                ensp2pids[ensp] = pids
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                if pid in pmscores:
                    pmscores[pid] += float(row[2])
                else:
                    pmscores[pid] = float(row[2])
    pbar.finish()
    print "{} input lines processed.".format(ct)
    print "  Inserted {} new pmscore rows for {} targets".format(
        pms_ct, len(pmscores))
    if len(notfnd) > 0:
        print "No target found for {} STRING IDs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    print "\nLoading {} JensenLab PubMed Score tdl_infos".format(
        len(pmscores.keys()))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'JensenLab PubMed Score',
            'number_value': score
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print "{} processed".format(ct)
    print "  Inserted {} new JensenLab PubMed Score tdl_info rows".format(
        ti_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            (dba_err_ct, logfile))
Example #16
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'LINCS',
        'source': "CSV file exported from Oleg Ursu's lincs PostgreSQL database on seaborgium. I do not know the origin of this database at this time.",
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://lincsproject.org/LINCS/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'lincs'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    line_ct = slmf.wcl(INPUT_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    gid2pids = {}
    notfnd = set()
    dba_err_ct = 0
    pmark = {}
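    # pmark records each protein_id that gets at least one lincs row, so the
    # summary below can report distinct proteins as well as inserted rows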
    lincs_ct = 0
    with open(INPUT_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        for row in tsvreader:
            # 0: level5_lm.pr_gene_id
            # 1: level5_lm.zscore
            # 2: perturbagen.dc_id
            # 3: perturbagen.canonical_smiles
            # 4: signature.cell_id
            ct += 1
            gid = row[0]
            if gid in gid2pids:
                # we've already found it
                pids = gid2pids[gid]
            elif gid in notfnd:
                # we've already not found it
                continue
            else:
                # look it up
                targets = dba.find_targets({'geneid': gid}, False)
                if not targets:
                    notfnd.add(gid)
                    continue
                pids = []
                for t in targets:
                    pid = t['components']['protein'][0]['id']
                    pids.append(pid)
                gid2pids[gid] = pids  # save this mapping so we only look up each target once
            for pid in pids:
                rv = dba.ins_lincs({
                    'protein_id': pid,
                    'cellid': row[4],
                    'zscore': row[1],
                    'pert_dcid': row[2],
                    'pert_smiles': row[3]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                pmark[pid] = True
                lincs_ct += 1
            pbar.update(ct)
    pbar.finish()
    for gid in notfnd:
        logger.warn("No target found for {}".format(gid))
    print "{} lines processed.".format(ct)
    print "Loaded {} new lincs rows for {} proteins.".format(
        lincs_ct, len(pmark))
    if notfnd:
        print "No target found for {} geneids. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Beispiel #17
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'MLP Assay Info', 'source': 'IDG-KMC generated data by Jeremy Yang at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': "This data is generated at UNM from PubChem and EUtils data. It contains details about targets studied in assays that were part of NIH's Molecular Libraries Program."} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mlp_assay_info'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if os.path.isfile(T2AID_PICKLE):
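    # t2aid (TCRD target id -> list of PubChem AIDs) is expensive to build
    # because misses fall back to EUtils web lookups, so a pickled copy is
    # reused when present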
    t2aid = pickle.load( open(T2AID_PICKLE, 'rb'))
    act = 0
    for tid in t2aid.keys():
      for aid in t2aid[tid]:
        act += 1
    if not args['--quiet']:
      print "\n{} targets have link(s) to {} PubChem MLP assay(s)".format(len(t2aid), act)
  else:
    pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
    line_ct = slmf.wcl(AIDGI_FILE)
    t2aid = {}
    if not args['--quiet']:
      print "\nProcessing {} lines in file {}".format(line_ct, AIDGI_FILE)
    with open(AIDGI_FILE, 'rU') as csvfile:
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      csvreader = csv.reader(csvfile)
      ct = 0
      skip_ct = 0
      fndgi_ct = 0
      fndpl_ct = 0
      notfnd = set()
      assay_ct = 0
      dba_err_ct = 0
      for row in csvreader:
        # aid, tgt_gi, tgt_species, tgt_name
        #print "[DEBUG]", row
        ct += 1
        if row[2] != 'Homo sapiens':
          skip_ct += 1
          continue
        gi = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'NCBI GI', 'value': gi})
        if targets:
          fndgi_ct += 1
        else:
          url = EFETCH_PROTEIN_URL + gi
          r = requests.get(url)
          if r.status_code == 200:
            soup = BeautifulSoup(r.text, "xml")
            grl = soup.find('Gene-ref_locus')
            if grl:
              sym = grl.text
              targets = dba.find_targets({'sym': sym})
          if targets:
            fndpl_ct += 1
          else:
            notfnd.add(gi)
            logger.warn("No target found for GI {}".format(gi))
            continue
        t = targets[0]
        tid = t['id']
        if tid in t2aid:
          t2aid[tid].append(row[0])
          assay_ct += 1
        else:
          t2aid[tid] = [row[0]]
          assay_ct += 1
        pbar.update(ct)
    pbar.finish()
    pickle.dump(t2aid, open(T2AID_PICKLE, "wb"))
    print "\n{} rows processed.".format(ct)
    print "  {} assays linked to {} TCRD targets".format(assay_ct, len(t2aid))
    print "  Skipped {} non-huamn assay rows".format(skip_ct)
    print "    {} linked by GI; {} linked via EUtils".format(fndgi_ct, fndpl_ct)
    print "  No target found for {} GIs. See logfile {} for details".format(len(notfnd), logfile)

  assay_info = {}
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  line_ct = slmf.wcl(ASSAYS_FILE)
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, ASSAYS_FILE)
  with open(ASSAYS_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    ct = 0
    for row in csvreader:
      # ID,ActivityOutcomeMethod,AssayName,SourceName,ModifyDate,DepositDate,ActiveSidCount,InactiveSidCount,InconclusiveSidCount,TotalSidCount,ActiveCidCount,TotalCidCount,ProteinTargetList
      ct += 1
      aid = row[0]
      assay_info[aid] = row[1:]
      pbar.update(ct)
  pbar.finish()
  print "Got assay info for {} assays.".format(len(assay_info))

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  tct = len(t2aid.keys())
  if not args['--quiet']:
    print "\nLoading MLP Assay Info for {} targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  ti_ct = 0
  mai_ct = 0
  dba_err_ct = 0
  for tid, aids in t2aid.items():
    ct += 1
    for aid in aids:
      ainfo = assay_info[aid]
      rv = dba.ins_mlp_assay_info({'protein_id': tid, 'aid': aid, 'assay_name': ainfo[1], 'method': ainfo[0], 'active_sids': ainfo[5], 'inactive_sids': ainfo[6], 'iconclusive_sids': ainfo[7], 'total_sids': ainfo[8]})
      if rv:
        mai_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "\n{} targets processed.".format(ct)
  print "  Inserted {} new mlp_assay_info rows".format(mai_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Beispiel #18
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'PubTator Text-mining Scores',
        'source': 'File %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/',
        'comments': 'PubTator data was subjected to the same counting scheme used to generate JensenLab PubMed Scores.'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'ptscore'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'PubTator PubMed Score'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    ptscores = {}  # protein.id => sum(all scores)
    pts_ct = 0
    dba_err_ct = 0
    infile = DOWNLOAD_DIR + FILENAME
    line_ct = slmf.wcl(infile)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, infile)
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        geneid2pid = {}
        notfnd = set()
        for row in tsvreader:
            # NCBI Gene ID  year  score
            ct += 1
            pbar.update(ct)
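            # one input line can carry several NCBI Gene IDs, delimited by
            # ';' or ','; empty entries and entries containing '(tax:' are
            # skipped below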
            gidstr = row[0].replace(',', ';')
            geneids = gidstr.split(';')
            for geneid in geneids:
                if not geneid or '(tax:' in geneid:
                    continue
                if geneid in geneid2pid:
                    # we've already found it
                    pids = geneid2pid[geneid]
                elif geneid in notfnd:
                    # we've already not found it
                    continue
                else:
                    targets = dba.find_targets({'geneid': geneid})
                    if not targets:
                        notfnd.add(geneid)
                        logger.warn("No target found for {}".format(geneid))
                        continue
                    pids = []
                    for target in targets:
                        pids.append(target['components']['protein'][0]['id'])
                    geneid2pid[geneid] = pids  # save this mapping so we only look up each target once
                for pid in pids:
                    rv = dba.ins_ptscore({
                        'protein_id': pid,
                        'year': row[1],
                        'score': row[2]
                    })
                    if rv:
                        pts_ct += 1
                    else:
                        dba_err_ct += 1
                    if pid in ptscores:
                        ptscores[pid] += float(row[2])
                    else:
                        ptscores[pid] = float(row[2])
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "  Inserted {} new ptscore rows for {} targets.".format(
        pts_ct, len(ptscores))
    if notfnd:
        print "No target found for {} NCBI Gene IDs. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    print "\nLoading {} PubTator Score tdl_infos".format(len(ptscores))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in ptscores.items():
        ct += 1
        rv = dba.ins_tdl_info({
            'protein_id': pid,
            'itype': 'PubTator PubMed Score',
            'number_value': score
        })
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print "{} processed".format(ct)
    print "Inserted {} new PubTator PubMed Score tdl_info rows".format(ti_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Beispiel #19
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'NCBI GI Numbers',
        'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.uniprot.org/'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset See logfile %s for details." % logfile
        sys.exit(1)
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'xref',
        'where_clause': "dataset_id = %d" % dataset_id
    })
    if not rv:
        print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
        sys.exit(1)

    start_time = time.time()
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
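    # FILENAME names the gzipped download; the line count and reader below
    # operate on the uncompressed copy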
    line_ct = slmf.wcl(infile)
    # ID Mapping fields
    # 1. UniProtKB-AC
    # 2. UniProtKB-ID
    # 3. GeneID (EntrezGene)
    # 4. RefSeq
    # 5. GI
    # 6. PDB
    # 7. GO
    # 8. UniRef100
    # 9. UniRef90
    # 10. UniRef50
    # 11. UniParc
    # 12. PIR
    # 13. NCBI-taxon
    # 14. MIM
    # 15. UniGene
    # 16. PubMed
    # 17. EMBL
    # 18. EMBL-CDS
    # 19. Ensembl
    # 20. Ensembl_TRS
    # 21. Ensembl_PRO
    # 22. Additional PubMed
    if not args['--quiet']:
        print "\nProcessing {} rows in file {}".format(line_ct, infile)
    with open(infile, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        tmark = {}
        xref_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for line in tsv:
            data = line.split('\t')
            ct += 1
            up = data[0]
            if not data[4]:  # no gi
                skip_ct += 1
                continue
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                skip_ct += 1
                continue
            target = targets[0]
            tmark[target['id']] = True
            pid = target['components']['protein'][0]['id']
            for gi in data[4].split('; '):
                rv = dba.ins_xref({
                    'protein_id': pid,
                    'xtype': 'NCBI GI',
                    'dataset_id': dataset_id,
                    'value': gi
                })
                if rv:
                    xref_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "\n{} rows processed".format(ct)
    print "  Inserted {} new GI xref rows for {} targets".format(
        xref_ct, len(tmark))
    print "  Skipped {} rows with no GI".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} database errors occured. See logfile {} for details.".format(
            dba_err_ct, logfile)
Beispiel #20
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    # DBAdaptor uses same logger as main()
    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Drug Central',
        'source': "Drug Central download files: %s" % ", ".join(SRC_FILES),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://drugcentral.org/'
    })
    if not dataset_id:
        print "WARNING: Error inserting dataset. See logfile {} for details.".format(
            logfile)
        sys.exit(1)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'drug_activity'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype = 'DrugCentral Indication'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        if not rv:
            print "WARNING: Error inserting provenance. See logfile {} for details.".format(
                logfile)
            sys.exit(1)

    # First get mapping of DrugCentral names to ids
    name2id = {}
    line_ct = slmf.wcl(NAME_ID_FILE)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(
            line_ct, NAME_ID_FILE)
    with open(NAME_ID_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'): continue
            name2id[row[0]] = row[1].replace("\n", '')
    print "{} input lines processed.".format(ct)
    print "Saved {} keys in infos map".format(len(name2id))

    # Next get drug info fields
    infos = {}
    line_ct = slmf.wcl(DRUGINFO_FILE)
    if not args['--quiet']:
        print "\nProcessing {} input lines in file {}".format(
            line_ct, DRUGINFO_FILE)
    with open(DRUGINFO_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'): continue
            infos[row[0]] = row[1].replace("\n", '')
    print "{} input lines processed.".format(ct)
    print "Saved {} keys in infos map".format(len(infos))

    #
    # MOA activities
    #
    drug2tids = defaultdict(list)
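    # drug2tids: drug name -> TCRD target id(s) seen in the two activity
    # files; the indications section below relies on it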
    line_ct = slmf.wcl(TCLIN_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from DrugDB MOA activities file {}".format(
            line_ct, TCLIN_FILE)
    with open(TCLIN_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # uniprot swissprot       drug_name       act_value       act_type        action_type     source_name     reference       smiles  ChEMBL_Id
        ct = 0
        da_ct = 0
        err_ct = 0
        notfnd = []
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            up = row[0]
            sp = row[1]
            drug = row[2]
            if drug not in name2id:
                err_ct += 1
                logger.warn("No DrugCentral id found for {}".format(drug))
                continue
            dcid = name2id[drug]
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                targets = dba.find_targets({'name': sp})
                if not targets:
                    notfnd.append(up)
                    continue
            tid = targets[0]['id']
            drug2tids[drug].append(tid)
            init = {
                'target_id': tid,
                'drug': drug,
                'dcid': dcid,
                'has_moa': 1,
                'source': row[5]
            }
            if row[3]:
                init['act_value'] = row[3]
            if row[4]:
                init['act_type'] = row[4]
            if row[5]:
                init['action_type'] = row[5]
            if row[6]:
                init['source'] = row[6]
            if row[7]:
                init['reference'] = row[7]
            if row[8]:
                init['smiles'] = row[8]
            if row[9]:
                init['cmpd_chemblid'] = row[9]
            if drug in infos:
                init['nlm_drug_info'] = infos[drug]
            rv = dba.ins_drug_activity(init)
            if rv:
                da_ct += 1
            else:
                dba_err_ct += 1
    print "{} DrugCentral Tclin rows processed.".format(ct)
    print "  Inserted {} new drug_activity rows".format(da_ct)
    if len(notfnd) > 0:
        print "WARNNING: {} Uniprot/Swissprot Accessions NOT FOUND in TCRD:".format(
            len(notfnd))
        for up in notfnd:
            print up
    if err_ct > 0:
        print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(
            err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    #
    # Non-MOA activities
    #
    line_ct = slmf.wcl(TCHEM_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from Non-MOA activities file {}".format(
            line_ct, TCHEM_FILE)
    with open(TCHEM_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # uniprot swissprot       drug_name       act_value       act_type        action_type     source_name     reference       smiles  ChEMBL_Id
        ct = 0
        da_ct = 0
        err_ct = 0
        notfnd = []
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            up = row[0]
            sp = row[1]
            drug = row[2]
            if drug not in name2id:
                err_ct += 1
                logger.warn("No DrugCentral id found for {}".format(drug))
                continue
            dcid = name2id[drug]
            targets = dba.find_targets({'uniprot': up})
            if not targets:
                targets = dba.find_targets({'name': sp})
                if not targets:
                    notfnd.append(up)
                    continue
            tid = targets[0]['id']
            drug2tids[drug].append(tid)
            init = {
                'target_id': tid,
                'drug': drug,
                'dcid': dcid,
                'has_moa': 0,
                'source': row[5]
            }
            if row[3]:
                init['act_value'] = row[3]
            if row[4]:
                init['act_type'] = row[4]
            if row[5]:
                init['action_type'] = row[5]
            if row[6]:
                init['source'] = row[6]
            if row[7]:
                init['reference'] = row[7]
            if row[8]:
                init['smiles'] = row[8]
            if row[9]:
                init['chemblid'] = row[9]
            if drug in infos:
                init['nlm_drug_info'] = infos[drug]
            rv = dba.ins_drug_activity(init)
            if rv:
                da_ct += 1
            else:
                dba_err_ct += 1
    print "{} DrugCentral Tchem rows processed.".format(ct)
    print "  Inserted {} new drug_activity rows".format(da_ct)
    if len(notfnd) > 0:
        print "WARNNING: {} DrugDB Uniprot Accessions NOT FOUND in TCRD:".format(
            len(notfnd))
        for up in notfnd:
            print up
    if err_ct > 0:
        print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(
            err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    #
    # Indications (diseases)
    #
    line_ct = slmf.wcl(DRUGIND_FILE)
    line_ct -= 1
    if not args['--quiet']:
        print "\nProcessing {} lines from indications file {}".format(
            line_ct, DRUGIND_FILE)
    with open(DRUGIND_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        # DRUG_ID DRUG_NAME       INDICATION_FDB  UMLS_CUI        SNOMEDCT_CUI    DOID
        ct = 0
        t2d_ct = 0
        notfnd = {}
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            drug = row[1]
            if drug not in drug2tids:
                notfnd[drug] = True
                continue
            init = {
                'dtype': 'DrugCentral Indication',
                'name': row[2],
                'drug_name': drug
            }
            if row[5] != '':
                init['did'] = row[5]
            for tid in drug2tids[drug]:
                # NB> Using target_id as protein_id works for now, but will not if/when we have multiple protein targets
                init['protein_id'] = tid
                rv = dba.ins_disease(init)
                if rv:
                    t2d_ct += 1
                else:
                    dba_err_ct += 1
    print "{} DrugCentral indication rows processed.".format(ct)
    print "  Inserted {} new disease rows".format(t2d_ct)
    if notfnd:
        print "WARNING: {} drugs NOT FOUND in activity files:".format(
            len(notfnd))
        for drug in notfnd:
            print drug
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Beispiel #21
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'AnimalTFDB',
        'source': 'http://www.bioguo.org/AnimalTFDB/BrowseAllTF.php?spe=Homo_sapiens',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.bioguo.org/AnimalTFDB/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'tdl_info',
        'where_clause': "itype = 'Is Transcription Factor'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}
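    # per-TDL tally of the transcription factor targets found, printed at the
    # end of the run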

    line_ct = slmf.wcl(INFILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in input file {}\n".format(
            line_ct, INFILE)
    with open(INFILE, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        ti_ct = 0
        notfnd = []
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            sym = row[3]
            targets = dba.find_targets({'sym': sym})
            if not targets:
                gid = row[2]
                targets = dba.find_targets({'geneid': gid})
            if not targets:
                ensg = row[1]
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensg
                })
            if not targets:
                notfnd.append(row)
                continue
            t = targets[0]
            TDLs[t['tdl']] += 1
            pid = t['components']['protein'][0]['id']
            rv = dba.ins_tdl_info({
                'protein_id': pid,
                'itype': 'Is Transcription Factor',
                'boolean_value': 1
            })
            if rv:
                ti_ct += 1
            else:
                dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "\n{} lines processed.".format(ct)
    print "  Inserted {} new Is Transcription Factor tdl_infos".format(ti_ct)
    if notfnd:
        print "No target found for {} rows.".format(len(notfnd))
    if dba_err_ct > 0:
        print "WARNING: %d DB errors occurred. See logfile %s for details." % (
            dba_err_ct, logfile)
    for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
        print "{}: {}".format(tdl, TDLs[tdl])
Beispiel #22
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Jensen Lab DISEASES',
        'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL),
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://diseases.jensenlab.org/'
    })
    assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({
        'dataset_id': dataset_id,
        'table_name': 'disease',
        'where_clause': "dtype LIKE 'JensenLab %'"
    })
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)

    # Knowledge channel
    fn = DOWNLOAD_DIR + FILE_K
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            dtype = 'JensenLab Knowledge ' + row[4]
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                init = {
                    'protein_id': p['id'],
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                }
                rv = dba.ins_disease(init)
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Experiment channel
    fn = DOWNLOAD_DIR + FILE_E
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[6] == '0':
                # skip zero confidence rows
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            dtype = 'JensenLab Experiment ' + row[4]
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                rv = dba.ins_disease({
                    'protein_id': p['id'],
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if skip_ct > 0:
        print "Skipped {} zero confidence rows".format(skip_ct)
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Text Mining channel
    fn = DOWNLOAD_DIR + FILE_T
    line_ct = slmf.wcl(fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsv:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in notfnd:
                continue
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                targets = dba.find_targets({'sym': sym}, idg=False)
            if not targets:
                notfnd.add(k)
                logger.warn("No target found for {}".format(k))
                continue
            dtype = 'JensenLab Text Mining'
            for t in targets:
                p = t['components']['protein'][0]
                pmark[p['id']] = True
                rv = dba.ins_disease({
                    'protein_id': p['id'],
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'zscore': row[4],
                    'conf': row[5]
                })
                if not rv:
                    dba_err_ct += 1
                    continue
                dis_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark))
    if notfnd:
        print "No target found for {} stringids/symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
Beispiel #23
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Jensen Lab TISSUES', 'source': 'Files %s from %s'%(", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://tissues.jensenlab.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "type LIKE 'JensenLab %'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  # this dict will map ENSP|sym from input files to TCRD protein_id(s)
  # so we only have to find target(s) once for each pair.
  # See find_pids() below
  pmap = {}
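  # find_pids() is defined later in this script; judging from its usage here,
  # it resolves an 'ENSP|sym' key to TCRD protein id(s) via the adaptor and
  # memoizes the result in pmap (a sketch from usage, not the exact definition)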

  # Knowledge channel
  fn = DOWNLOAD_DIR+FILE_K
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Knowledge ' + row[4]
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1,
              'oid': row[2], 'evidence': row[5], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  
  # Experiment channel
  fn = DOWNLOAD_DIR+FILE_E
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      if row[6] == '0':
        # skip zero confidence rows
        skip_ct += 1
        continue
      sym = row[1]
      # some rows look like:
      # ['ENSP00000468389', 'PSENEN {ECO:0000313|Ensembl:ENSP00000468593}', 'BTO:0002860', 'Oral mucosa', 'HPA', 'High: 1 antibody', '1']
      if ' ' in sym:
        sym = sym.split()[0]
      k = "%s|%s" % (row[0], sym) # ENSP|sym
      if k in notfnd:
        continue
      try:
        pids = find_pids(dba, k, pmap)
      except ValueError:
        print "[ERROR] Row: %s; k: %s" % (str(row), k)
        continue
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Experiment ' + row[4]
      init = {'etype': etype, 'tissue': row[3],
              'string_value': row[5], 'oid': row[2], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  print "  Skipped {} zero confidence rows".format(skip_ct)
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Text Mining channel
  fn = DOWNLOAD_DIR+FILE_T
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      etype = 'JensenLab Text Mining'
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1,
              'oid': row[2], 'zscore': row[4], 'conf': row[5], 'url': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Beispiel #24
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
    
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Reactome Protein-Protein Interactions', 'source': "File %s" % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.reactome.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'Reactome'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines from Reactome PPI file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 1
    skip_ct = 0
    same12_ct = 0
    dup_ct = 0
    ppis = {}
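    # ppis keys are 'up1|up2|interaction_type' strings used to skip duplicate
    # interactions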
    ppi_ct = 0
    up2pid = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      # 0: Interactor 1 uniprot id
      # 1: Interactor 1 Ensembl gene id
      # 2: Interactor 1 Entrez Gene id
      # 3: Interactor 2 uniprot id
      # 4: Interactor 2 Ensembl gene id
      # 5: Interactor 2 Entrez Gene id
      # 6: Interaction type
      # 7: Interaction context Pubmed references
      ct += 1
      pbar.update(ct)
      if not row[0].startswith('uniprotkb:'):
        continue
      if not row[3].startswith('uniprotkb:'):
        continue
      up1 = row[0].replace('uniprotkb:', '')
      up2 = row[3].replace('uniprotkb:', '')      
      if not up1 or not up2:
        skip_ct += 1
        continue
      # protein1
      if up1 in up2pid:
        pid1 = up2pid[up1]
      elif up1 in notfnd:
        continue
      else:
        t1 = find_target(dba, up1)
        if not t1:
          notfnd.add(up1)
          continue
        pid1 = t1['components']['protein'][0]['id']
        up2pid[up1] = pid1
      # protein2
      if up2 in up2pid:
        pid2 = up2pid[up2]
      elif up2 in notfnd:
        continue
      else:
        t2 = find_target(dba, up2)
        if not t2:
          notfnd.add(up2)
          continue
        pid2 = t2['components']['protein'][0]['id']
        up2pid[up2] = pid2
      int_type = row[6]
      ppik = up1 + "|" + up2 + 'int_type'
      if ppik in ppis:
        dup_ct += 1
        continue
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi( {'ppitype': 'Reactome', 'interaction_type': int_type,
                         'protein1_id': pid1, 'protein1_str': up1,
                         'protein2_id': pid2, 'protein2_str': up2} )
      if rv:
        ppi_ct += 1
        ppis[ppik] = True
      else:
        dba_err_ct += 1
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for: {}".format(up))
  print "{} Reactome PPI rows processed.".format(ct)
  print "  Inserted {} ({}) new ppi rows".format(ppi_ct, len(ppis))
  if skip_ct:
    print "  Skipped {} rows without two UniProt interactors".format(skip_ct)
  if dup_ct:
    print "  Skipped {} duplicate PPIs".format(dup_ct)
  if same12_ct:
    print "  Skipped {} PPIs involving the same protein".format(same12_ct)
  if notfnd:
    print "  No target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile) 
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Beispiel #25
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/'%(os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  aliasmap = {}
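  # aliasmap: UniProt accession, UniProt entry name, or STRING alias ->
  # (ENSP, bit score); UniProt-derived mappings keep the highest-scoring ENSP
  # and are not overwritten by plain aliases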
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ct = 0
  skip_ct = 0
  mult_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid   uniprot_ac|uniprot_id   string_id   identity   bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ## string_protein_id ## alias ## source ##
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid fo protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp} )
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #26
0
def load_OMIM(args, dba, logger, logfile):
    # OMIMs and Phenotypic Series
    fn = CONFIG['OMIM']['DOWNLOAD_DIR'] + CONFIG['OMIM']['TITLES_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print 'Processing %d lines from input file %s' % (line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        omim_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Prefix ???
            # 1: Mim Number
            # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
            # 3: Included Title(s); symbols
            title = row[2].partition(';')[0]
            rv = dba.ins_omim({'mim': row[1], 'title': title})
            if not rv:
                dba_err_ct += 1
                continue
            omim_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "Loaded {} new omim rows".format(omim_ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    fn = CONFIG['OMIM']['DOWNLOAD_DIR'] + CONFIG['OMIM']['PS_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print 'Processing %d lines from input file %s' % (line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        ps_ct = 0
        err_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Phenotypic Series Number
            # 1: Mim Number
            # 2: Phenotype
            if len(row) == 2:
                init = {'omim_ps_id': row[0], 'title': row[1]}
            elif len(row) == 3:
                init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
            else:
                err_ct += 1
                logger.warn("Parsing error for row {}".format(row))
                continue
            rv = dba.ins_omim_ps(init)
            if not rv:
                dba_err_ct += 1
                continue
            ps_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "Loaded {} new omim_ps rows".format(ps_ct)
    print "  Skipped {} commented lines.".format(skip_ct)
    if err_ct > 0:
        print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(
            err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    # Phenotypes
    fn = CONFIG['OMIM']['DOWNLOAD_DIR'] + CONFIG['OMIM']['GENEMAP2_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print 'Processing %d lines from input file %s' % (line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fn, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        pmark = {}
        skip_ct = 0
        notfnd_ct = 0
        prov_ct = 0
        dds_ct = 0
        pt_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Chromosome
            # 1: Genomic Position Start
            # 2: Genomic Position End
            # 3: Cyto Location
            # 4: Computed Cyto Location
            # 5: MIM Number
            # 6: Gene Symbols
            # 7: Gene Name
            # 8: Approved Symbol
            # 9. Entrez Gene ID
            # 10: Ensembl Gene ID
            # 11: Comments
            # 12: Phenotypes
            # 13: Mouse Gene Symbol/ID
            pts = row[12]
            if pts.startswith('?'):
                prov_ct += 1
                continue
            if '(4)' in pts:
                dds_ct += 1
                continue
            trait = "MIM Number: %s" % row[5]
            if pts:
                trait += "; Phenotype: %s" % pts
            if row[8]:
                syms = [row[8]]
            else:
                syms = row[6].split(', ')
            logger.info("Checking for OMIM syms: {}".format(syms))
            for sym in syms:
                targets = dba.find_targets({'sym': sym})
                if not targets and row[9]:
                    targets = dba.find_targets({'geneid': int(row[9])})
                if not targets:
                    notfnd_ct += 1
                    logger.warn("No target found for row {}".format(row))
                    continue
                for t in targets:
                    p = t['components']['protein'][0]
                    rv = dba.ins_phenotype({
                        'protein_id': p['id'],
                        'ptype': 'OMIM',
                        'trait': trait
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    pmark[p['id']] = True
                    pt_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed".format(ct)
    print "Loaded {} OMIM phenotypes for {} proteins".format(pt_ct, len(pmark))
    print "  Skipped {} commented lines.".format(skip_ct)
    print "  Skipped {} provisional phenotype rows.".format(prov_ct)
    if dds_ct > 0:
        print "  Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
    if notfnd_ct > 0:
        print "  No target found for {} good lines. See logfile {} for details.".format(
            notfnd_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'OMIM',
        'source':
        'Files %s from http://data.omim.org' %
        (", ".join(CONFIG['OMIM']['SRC_FILES'])),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'http://omim.org/',
        'comments':
        'OMIM phenotype associations and Phenotype Series info. Neither provisional associations nor deletion/duplication syndromes are loaded.'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'omim'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'omim_ps'
    }, {
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'OMIM'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)
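For reference, the loader above triages the genemap2 Phenotypes field by OMIM's notation: a leading '?' marks a provisional association and mapping key '(4)' marks a contiguous-gene (deletion/duplication) syndrome; both are skipped. A small sketch of that triage, with hypothetical field values:

def classify_phenotypes(pts):
    # Mirror the skip logic above: '?' = provisional, '(4)' = del/dup syndrome.
    if pts.startswith('?'):
        return 'provisional'
    if '(4)' in pts:
        return 'deletion/duplication syndrome'
    return 'loadable'

assert classify_phenotypes('?Deafness, autosomal recessive (3)') == 'provisional'
assert classify_phenotypes('Chromosome 1p36 deletion syndrome (4)') == 'deletion/duplication syndrome'
assert classify_phenotypes('Alzheimer disease 1 (3)') == 'loadable'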
Example #27
0
def load(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # Dataset
    dataset_id = dba.ins_dataset({
        'name': 'Human Protein Atlas',
        'source':
        'IDG-KMC generated data by Steve Mathias at UNM from HPA file http://www.proteinatlas.org/download/normal_tissue.tsv.zip.',
        'app': PROGRAM,
        'app_version': __version__,
        'url': 'http://www.proteinatlas.org/'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id':
        dataset_id,
        'table_name':
        'expression',
        'where_clause':
        "etype = 'HPA'",
        'comment':
        'Qualitative expression values are derived from files from http://www.proteinatlas.org/'
    }, {
        'dataset_id':
        dataset_id,
        'table_name':
        'tdl_info',
        'where_clause':
        "itype = 'HPA Tissue Specificity Index'",
        'comment':
        'Tissue Specificity scores are derived from files from http://www.proteinatlas.org/. The score is the Tau value as described in Yanai, I. et al., Bioinformatics 21(5): 650-659 (2005)'
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)

    with open(TISSUE2UBERON_FILE, 'r') as ifh:
        tiss2uid = ast.literal_eval(ifh.read())
    if not args['--quiet']:
        print "\nGot {} tissue to Uberon ID mappings from file {}".format(
            len(tiss2uid), TISSUE2UBERON_FILE)

    line_ct = slmf.wcl(HPA_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in HPA file {}".format(line_ct, HPA_FILE)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dba_err_ct = 0
    pmark = {}
    exp_ct = 0
    nouid = set()
    with open(HPA_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # "protein_id"    "Tissue"        "Gene"  "Gene name"     "Level" "Reliability"
            ct += 1
            tissue = row[1]
            init = {
                'protein_id': row[0],
                'etype': 'HPA',
                'tissue': tissue,
                'qual_value': row[4],
                'evidence': row[5]
            }
            # Add Uberon ID, if we can find one
            if tissue in tiss2uid:
                uberon_id = tiss2uid[tissue]
            else:
                uberon_id = dba.get_uberon_id({'name': tissue})
            if uberon_id:
                init['uberon_id'] = uberon_id
            else:
                nouid.add(tissue)
            rv = dba.ins_expression(init)
            if not rv:
                dba_err_ct += 1
                continue
            exp_ct += 1
            pmark[row[0]] = True
            pbar.update(ct)
    pbar.finish()
    print "Processed {} HPA lines.".format(ct)
    print "  Inserted {} new expression rows for {} proteins.".format(
        exp_ct, len(pmark))
    if nouid:
        print "No Uberon ID found for {} tissues. See logfile {} for details.".format(
            len(nouid), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    line_ct = slmf.wcl(HPA_TAU_FILE)
    if not args['--quiet']:
        print "\nProcessing {} lines in HPA TAU file {}".format(
            line_ct, HPA_TAU_FILE)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    dba_err_ct = 0
    pmark = {}
    skip_ct = 0
    ti_ct = 0
    with open(HPA_TAU_FILE, 'rU') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct += 1
        for row in tsvreader:
            # "Gene"  "TAU"   "protein_id"
            ct += 1
            pbar.update(ct)
            if row[1] == 'None':
                skip_ct += 1
                continue
            rv = dba.ins_tdl_info({
                'protein_id': int(row[2]),
                'itype': 'HPA Tissue Specificity Index',
                'number_value': row[1]
            })
            if not rv:
                dba_err_ct += 1
                continue
            pmark[row[2]] = True
            ti_ct += 1
    pbar.finish()
    print "Processed {} lines.".format(ct)
    print "  Inserted {} new HPA Tissue Specificity Index tdl_info rows for {} proteins.".format(
        ti_ct, len(pmark))
    if skip_ct:
        print "  Skipped {} rows with no tau.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
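The 'HPA Tissue Specificity Index' loaded above is the Tau metric described in Yanai et al. (2005). A minimal sketch of how Tau is computed from a vector of per-tissue expression values (the numbers are made up):

def tau(values):
    # Tau = sum(1 - x_i/max(x)) / (N - 1); 0 = ubiquitous, 1 = tissue-specific.
    mx = float(max(values))
    if mx == 0:
        return None  # no expression anywhere; Tau is undefined
    return sum(1.0 - v / mx for v in values) / (len(values) - 1)

print tau([0, 0, 0, 8])  # 1.0: expressed in a single tissue
print tau([5, 5, 5, 5])  # 0.0: uniform expression across tissues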
Example #28
0
def load_GWASCatalog(args, dba, logger, logfile):
    fn = CONFIG['GWAS Catalog']['DOWNLOAD_DIR'] + CONFIG['GWAS Catalog'][
        'FILENAME']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print 'Processing {} lines from GWAS Catalog file {}'.format(line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    outlist = []
    with open(fn, 'rU') as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter='\t')
        header = tsvreader.next()  # skip header line
        ct = 1
        notfnd = set()
        pmark = {}
        gwas_ct = 0
        dba_err_ct = 0
        # 0: DATE ADDED TO CATALOG
        # 1: PUBMEDID
        # 2: FIRST AUTHOR
        # 3: DATE
        # 4: JOURNAL
        # 5: LINK
        # 6: STUDY
        # 7: DISEASE/TRAIT
        # 8: INITIAL SAMPLE SIZE
        # 9: REPLICATION SAMPLE SIZE
        # 10: REGION
        # 11: CHR_ID
        # 12: CHR_POS
        # 13: REPORTED GENE(S)
        # 14: MAPPED_GENE
        # 15: UPSTREAM_GENE_ID
        # 16: DOWNSTREAM_GENE_ID
        # 17: SNP_GENE_IDS
        # 18: UPSTREAM_GENE_DISTANCE
        # 19: DOWNSTREAM_GENE_DISTANCE
        # 20: STRONGEST SNP-RISK ALLELE
        # 21: SNPS
        # 22: MERGED
        # 23: SNP_ID_CURRENT
        # 24: CONTEXT
        # 25: INTERGENIC
        # 26: RISK ALLELE FREQUENCY
        # 27: P-VALUE
        # 28: PVALUE_MLOG
        # 29: P-VALUE (TEXT)
        # 30: OR or BETA
        # 31: 95% CI (TEXT)
        # 32: PLATFORM [SNPS PASSING QC]
        # 33: CNV
        # 34: MAPPED_TRAIT
        # 35: MAPPED_TRAIT_URI
        # 36: STUDY ACCESSION
        # 37: GENOTYPING TECHNOLOGY
        symregex = re.compile(r' ?[-,;] ?')
        for row in tsvreader:
            ct += 1
            if len(row) < 14: continue
            symstr = row[14]
            if symstr == 'NR': continue
            symlist = symregex.split(symstr)
            for sym in symlist:
                if sym in notfnd:
                    continue
                targets = dba.find_targets({'sym': sym})
                if not targets:
                    notfnd.add(sym)
                    logger.warn("No target found for symbol {}".format(sym))
                    continue
                for t in targets:
                    p = t['components']['protein'][0]
                    try:
                        pval = float(row[27])
                    except:
                        pval = None
                    try:
                        orbeta = float(row[30])
                    except:
                        orbeta = None
                    if row[25]:
                        ig = int(row[25])
                    else:
                        ig = None
                    rv = dba.ins_gwas({
                        'protein_id': p['id'],
                        'disease_trait': row[7],
                        'snps': row[21],
                        'pmid': row[1],
                        'study': row[6],
                        'context': row[24],
                        'intergenic': ig,
                        'p_value': pval,
                        'or_beta': orbeta,
                        'cnv': row[33],
                        'mapped_trait': row[34],
                        'mapped_trait_uri': row[35]
                    })
                    if not rv:
                        dba_err_ct += 1
                        continue
                    pmark[p['id']] = True
                    gwas_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Inserted {} new gwas rows for {} proteins".format(
        gwas_ct, len(pmark))
    if notfnd:
        print "  No target found for {} symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'GWAS Catalog',
        'source':
        'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' %
        os.path.basename(CONFIG['GWAS Catalog']['FILENAME']),
        'app':
        PROGRAM,
        'app_version':
        __version__,
        'url':
        'https://www.ebi.ac.uk/gwas/home'
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
    assert rv, "Error inserting provenance. See logfile {} for details.".format(
        logfile)
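The MAPPED_GENE field (column 14) can list several symbols joined by '-', ',' or ';' with optional surrounding spaces, which is what symregex handles above. A quick illustration with made-up field values (note the regex would also split genuinely hyphenated symbols such as HLA-B):

import re

symregex = re.compile(r' ?[-,;] ?')
print symregex.split('BDNF')               # ['BDNF']
print symregex.split('CACNA1C, TTN')       # ['CACNA1C', 'TTN']
print symregex.split('LINC00461 - MEF2C')  # ['LINC00461', 'MEF2C']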
Example #29
0
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'BioPlex Protein-Protein Interactions', 'source': "Files %s from http://wren.hms.harvard.edu/bioplex/downloadInteractions.php"%", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://wren.hms.harvard.edu/bioplex/index.php'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'BioPlex'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
    
  f = BIOPLEX_FILE
  line_ct = slmf.wcl(f)
  line_ct -= 1
  if not args['--quiet']:
    print "\nProcessing {} lines from BioPlex PPI file {}".format(line_ct, f)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
  with open(f, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    # GeneA   GeneB   UniprotA        UniprotB        SymbolA SymbolB pW      pNI     pInt
    ct = 0
    ppi_ct = 0
    same12_ct = 0
    k2pid = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      geneid1 = row[0]
      geneid2 = row[1]
      up1 = row[2]
      up2 = row[3]
      sym1 = row[4]
      sym2 = row[5]
      pw = row[6]
      pni = row[7]
      pint = row[8]
      # protein1
      k1 = "%s|%s|%s" % (up1, sym1, geneid1)
      if k1 in k2pid:
        pid1 = k2pid[k1]
      elif k1 in notfnd:
        continue
      else:
        t1 = find_target(dba, k1)
        if not t1:
          notfnd.add(k1)
          continue
        pid1 = t1['components']['protein'][0]['id']
      k2pid[k1] = pid1
      # protein2
      k2 = "%s|%s|%s" % (up2, sym2, geneid2)
      if k2 in k2pid:
        pid2 = k2pid[k2]
      elif k2 in notfnd:
        continue
      else:
        t2 = find_target(dba, k2)
        if not t2:
          notfnd.add(k2)
          continue
        pid2 = t2['components']['protein'][0]['id']
      k2pid[k2] = pid2
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi( {'ppitype': 'BioPlex','p_int': pint, 'p_ni': pni, 'p_wrong': pw,
                         'protein1_id': pid1, 'protein1_str': k1,
                         'protein2_id': pid2, 'protein2_str': k2} )
      if rv:
        ppi_ct += 1
      else:
        dba_err_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for: {}".format(k))
  print "{} BioPlex PPI rows processed.".format(ct)
  print "  Inserted {} new ppi rows".format(ppi_ct)
  if same12_ct:
    print "  Skipped {} PPIs involving the same protein".format(same12_ct)
  if notfnd:
    print "  No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile) 
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  for f in UPD_FILES[1:]:
    start_time = time.time()
    line_ct = slmf.wcl(f)
    line_ct -= 1
    if not args['--quiet']:
      print "\nProcessing {} lines from BioPlex PPI update file {}".format(line_ct, f)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    with open(f, 'rU') as tsv:
      tsvreader = csv.reader(tsv, delimiter='\t')
      header = tsvreader.next() # skip header line
      # plate_num       well_num        db_protein_id   symbol  gene_id bait_symbol     bait_geneid     pWrongID        pNoInt  pInt
      ct = 0
      ppi_ct = 0
      same12_ct = 0
      k2pid = {}
      notfnd = set()
      dba_err_ct = 0
      for row in tsvreader:
        ct += 1
        pbar.update(ct)
        geneid1 = row[6]
        geneid2 = row[4]
        sym1 = row[5]
        sym2 = row[3]
        pw = row[7]
        pni = row[8]
        pint = row[9]
        # protein1
        k1 = "|%s|%s" % (sym1, geneid1)
        if k1 in k2pid:
          pid1 = k2pid[k1]
        elif k1 in notfnd:
          continue
        else:
          t1 = find_target(dba, k1)
          if not t1:
            notfnd.add(k1)
            continue
          pid1 = t1['components']['protein'][0]['id']
          k2pid[k1] = pid1
        # protein2
        k2 = "|%s|%s" % (sym2, geneid2)
        if k2 in k2pid:
          pid2 = k2pid[k2]
        elif k2 in notfnd:
          continue
        else:
          t2 = find_target(dba, k2)
          if not t2:
            notfnd.add(k2)
            continue
          pid2 = t2['components']['protein'][0]['id']
          k2pid[k2] = pid2
        if pid1 == pid2:
          same12_ct += 1
          continue
        # Insert PPI
        rv = dba.ins_ppi( {'ppitype': 'BioPlex','p_int': pint, 'p_ni': pni, 'p_wrong': pw,
                           'protein1_id': pid1, 'protein1_str': k1,
                           'protein2_id': pid2, 'protein2_str': k2} )
        if rv:
          ppi_ct += 1
        else:
          dba_err_ct += 1
    pbar.finish()
    for k in notfnd:
      logger.warn("No target found for: {}".format(k))
    print "{} BioPlex PPI rows processed.".format(ct)
    print "  Inserted {} new ppi rows".format(ppi_ct)
    if same12_ct:
      print "  Skipped {} PPIs involving the same protein".format(same12_ct)
    if notfnd:
      print "  No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile) 
    if dba_err_ct > 0:
      print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #30
0
def load_IMPC(args, dba, logger, logfile):
    fn = CONFIG['IMPC']['DOWNLOAD_DIR'] + CONFIG['IMPC']['GENO_PHENO_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "Processing {} lines from input file {}".format(line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fn, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct = 1
        pt_ct = 0
        pmark = {}
        sym2nhps = {}
        notfnd = set()
        skip_ct = 0
        dba_err_ct = 0
        # 0: marker_accession_id
        # 1: marker_symbol
        # 2: phenotyping_center
        # 3: colony_id
        # 4: sex
        # 5: zygosity
        # 6: allele_accession_id
        # 7: allele_symbol
        # 8: allele_name
        # 9: strain_accession_id
        # 10: strain_name
        # 11: project_name
        # 12: project_fullname
        # 13: pipeline_name
        # 14: pipeline_stable_id
        # 15: procedure_stable_id
        # 16: procedure_name
        # 17: parameter_stable_id
        # 18: parameter_name
        # 19: top_level_mp_term_id
        # 20: top_level_mp_term_name
        # 21: mp_term_id
        # 22: mp_term_name
        # 23: p_value
        # 24: percentage_change
        # 25: effect_size
        # 26: statistical_method
        # 27: resource_name
        for row in csvreader:
            ct += 1
            sym = row[1]
            if not row[21] and not row[22]:
                # skip data with neither a term_id nor a term_name (IMPC has some of these)
                skip_ct += 1
                continue
            if sym in sym2nhps:
                # we've already found it
                nhpids = sym2nhps[sym]
            elif sym in notfnd:
                # we've already not found it
                continue
            else:
                nhps = dba.find_nhproteins({'sym': sym},
                                           species='Mus musculus')
                if not nhps:
                    notfnd.add(sym)
                    logger.warn("No nhprotein found for symbol {}".format(sym))
                    continue
                nhpids = []
                for nhp in nhps:
                    nhpids.append(nhp['id'])
                sym2nhps[sym] = nhpids  # save this mapping so we only lookup each nhprotein once
            pval = None
            if row[23] and row[23] != '':
                try:
                    pval = float(row[23])
                except:
                    logger.warn(
                        "Problem converting p_value {} for row {}".format(
                            row[23], ct))
            for nhpid in nhpids:
                rv = dba.ins_phenotype({
                    'nhprotein_id': nhpid,
                    'ptype': 'IMPC',
                    'top_level_term_id': row[19],
                    'top_level_term_name': row[20],
                    'term_id': row[21],
                    'term_name': row[22],
                    'p_value': pval,
                    'percentage_change': row[24],
                    'effect_size': row[25],
                    'procedure_name': row[16],
                    'parameter_name': row[18],
                    'statistical_method': row[26],
                    'sex': row[4],
                    'gp_assoc': 1
                })
                if rv:
                    pmark[nhpid] = True
                    pt_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} IMPC phenotypes for {} nhproteins".format(
        pt_ct, len(pmark.keys()))
    if notfnd:
        print "No nhprotein found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if skip_ct > 0:
        print "Skipped {} lines with no term_id or term_name.".format(skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)

    fn = CONFIG['IMPC']['DOWNLOAD_DIR'] + CONFIG['IMPC']['STAT_RES_FILE']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "Processing {} lines from input file {}".format(line_ct, fn)
    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fn, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next()  # skip header line
        ct = 1
        pt_ct = 0
        pmark = {}
        sym2nhps = {}
        notfnd = set()
        skip_ct = 0
        pv_ct = 0
        dba_err_ct = 0
        # 0: phenotyping_center
        # 1: intercept_estimate
        # 2: procedure_id
        # 3: mutant_biological_model_id
        # 4: rotated_residuals_test
        # 5: weight_effect_p_value
        # 6: male_mutant_count
        # 7: pipeline_stable_key
        # 8: female_ko_effect_p_value
        # 9: pipeline_stable_id
        # 10: parameter_stable_key
        # 11: data_type
        # 12: parameter_stable_id
        # 13: interaction_significant
        # 14: strain_accession_id
        # 15: control_selection_method
        # 16: parameter_name
        # 17: allele_name
        # 18: phenotyping_center_id
        # 19: weight_effect_stderr_estimate
        # 20: weight_effect_parameter_estimate
        # 21: procedure_stable_id
        # 22: status
        # 23: sex_effect_parameter_estimate
        # 24: female_ko_effect_stderr_estimate
        # 25: female_percentage_change
        # 26: group_2_residuals_normality_test
        # 27: marker_accession_id
        # 28: mp_term_name
        # 29: group_1_residuals_normality_test
        # 30: genotype_effect_p_value
        # 31: dependent_variable
        # 32: resource_name
        # 33: project_id
        # 34: procedure_name
        # 35: doc_id
        # 36: top_level_mp_term_id
        # 37: allele_accession_id
        # 38: blups_test
        # 39: null_test_p_value
        # 40: p_value
        # 41: marker_symbol
        # 42: control_biological_model_id
        # 43: pipeline_name
        # 44: sex
        # 45: interaction_effect_p_value
        # 46: colony_id
        # 47: project_name
        # 48: female_ko_parameter_estimate
        # 49: female_mutant_count
        # 50: organisation_id
        # 51: external_db_id
        # 52: female_control_count
        # 53: intermediate_mp_term_id
        # 54: db_id
        # 55: male_ko_effect_p_value
        # 56: top_level_mp_term_name
        # 57: metadata_group
        # 58: sex_effect_stderr_estimate
        # 59: zygosity
        # 60: male_percentage_change
        # 61: sex_effect_p_value
        # 62: mp_term_id
        # 63: male_ko_effect_stderr_estimate
        # 64: additional_information
        # 65: statistical_method
        # 66: _version_
        # 67: intercept_estimate_stderr_estimate
        # 68: male_control_count
        # 69: intermediate_mp_term_name
        # 70: strain_name
        # 71: classification_tag
        # 72: effect_size
        # 73: procedure_stable_key
        # 74: allele_symbol
        # 75: resource_id
        # 76: group_2_genotype
        # 77: variance_significant
        # 78: pipeline_id
        # 79: group_1_genotype
        # 80: male_ko_parameter_estimate
        # 81: genotype_effect_parameter_estimate
        # 82: categories
        # 83: parameter_id
        # 84: batch_significant
        # 85: genotype_effect_stderr_estimate
        # 86: resource_fullname
        for row in csvreader:
            ct += 1
            sym = row[41]
            if not row[62] and not row[28]:
                # skip lines with neither a term_id nor a term_name
                skip_ct += 1
                continue
            if sym in sym2nhps:
                # we've already found it
                nhpids = sym2nhps[sym]
            elif sym in notfnd:
                # we've already not found it
                continue
            else:
                nhps = dba.find_nhproteins({'sym': sym},
                                           species='Mus musculus')
                if not nhps:
                    notfnd.add(sym)
                    logger.warn("No nhprotein found for symbol {}".format(sym))
                    continue
                nhpids = []
                for nhp in nhps:
                    nhpids.append(nhp['id'])
                sym2nhps[sym] = nhpids  # save this mapping so we only lookup each nhprotein once
            pval = None
            if row[40] and row[40] != '':
                try:
                    pval = float(row[40])
                except:
                    logger.warn(
                        "Problem converting p_value {} for row {}".format(
                            row[40], ct))
            for nhpid in nhpids:
                rv = dba.ins_phenotype({
                    'nhprotein_id': nhpid,
                    'ptype': 'IMPC',
                    'top_level_term_id': row[36],
                    'top_level_term_name': row[56],
                    'term_id': row[62],
                    'term_name': row[28],
                    'p_value': pval,
                    'effect_size': row[72],
                    'procedure_name': row[34],
                    'parameter_name': row[16],
                    'statistical_method': row[65],
                    'sex': row[44],
                    'gp_assoc': 0
                })
                if rv:
                    pmark[nhpid] = True
                    pt_ct += 1
                else:
                    dba_err_ct += 1
            pbar.update(ct)
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "Loaded {} IMPC phenotypes for {} nhproteins".format(
        pt_ct, len(pmark))
    if notfnd:
        print "  No nhprotein found for {} gene symbols. See logfile {} for details.".format(
            len(notfnd), logfile)
    if skip_ct > 0:
        print "  Skipped {} lines with no term_id/term_name or no p-value.".format(
            skip_ct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)
    # Dataset
    dataset_id = dba.ins_dataset({
        'name':
        'IMPC Phenotypes',
        'source':
        "Files %s and %s from ftp://ftp.ebi.ac.uk/pub/databases/impc/release-11.0/csv/"
        % (CONFIG['IMPC']['GENO_PHENO_FILE'], CONFIG['IMPC']['STAT_RES_FILE']),
        'app':
        PROGRAM,
        'app_version':
        __version__
    })
    assert dataset_id, "Error inserting dataset See logfile {} for details.".format(
        logfile)
    # Provenance
    provs = [{
        'dataset_id': dataset_id,
        'table_name': 'phenotype',
        'where_clause': "ptype = 'IMPC'"
    }]
    for prov in provs:
        rv = dba.ins_provenance(prov)
        assert rv, "Error inserting provenance. See logfile {} for details.".format(
            logfile)
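Finally, both IMPC passes convert p_value inside a bare except, which would also mask unrelated errors. A slightly tighter sketch of the same defensive conversion (the function name is illustrative, not part of the loader):

def to_float_or_none(s, logger, ctx):
    # Convert a possibly empty or malformed string to float, logging failures.
    if not s:
        return None
    try:
        return float(s)
    except ValueError:
        logger.warn("Problem converting p_value {} for row {}".format(s, ctx))
        return None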