def cnv_gene_attribute_types(args, ifn, ofn):
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(ifn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, ifn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  rct = 0
  wct = 0
  with open(ofn, 'w') as ofh:
    ofh.write("LOCK TABLES `gene_attribute_type` WRITE;\n")
    ofh.write("/*!40000 ALTER TABLE `gene_attribute_type` DISABLE KEYS */;\n")
    ofh.write("INSERT INTO `gene_attribute_type` VALUES ")
    with open(ifn, 'r') as ifh:
      csvreader = csv.reader(ifh)
      header = csvreader.next() # skip header line
      rct = 1
      for row in csvreader:
        # "id","name","association","description","resource_group","measurement","attribute_group","attribute_type","pubmed_ids","url"
        rct += 1
        ofh.write('("{}","{}","{}","{}","{}","{}","{}","{}","{}","{}")'.format(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9]))
        if rct < line_ct: # comma-separate all but the last VALUES tuple
          ofh.write(',')
        wct += 1
        pbar.update(rct)
    pbar.finish()
    ofh.write(";\n/*!40000 ALTER TABLE `gene_attribute_type` ENABLE KEYS */;\nUNLOCK TABLES;\n")
  print "Processed {} lines".format(rct)
  print " Wrote inserts for {} new gene_attribute_type rows to file {}".format(wct, ofn)
  return

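# NB: slmf.wcl() above is a helper from the authors' shared slm_util_functions
# module and is not shown in this file. Judging by its use as a line count to
# size each ProgressBar, a minimal equivalent might be (an assumption, not the
# actual implementation):
def wcl(fname):
  # Count lines in a text file, like 'wc -l'.
  with open(fname) as f:
    return sum(1 for _ in f)
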
def cnv_gene_attribute_types(args, ifn, ofn):
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(ifn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, ifn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  rct = 0
  wct = 0
  with open(ofn, 'w') as ofh:
    ofh.write('"id","name","association","description","resource_group","measurement","attribute_group","attribute_type","pubmed_ids","url"\n')
    with open(ifn, 'r') as ifh:
      csvreader = csv.reader(ifh)
      header = csvreader.next() # skip header line
      rct = 1
      for row in csvreader:
        # "id","name","association","description","resource_group","measurement","attribute_group","attribute_type","pubmed_ids","url"
        rct += 1
        ofh.write('"{}","{}","{}","{}","{}","{}","{}","{}","{}","{}"\n'.format(row[0], row[1], row[2], row[3], row[4], row[5], row[6], row[7], row[8], row[9]))
        wct += 1
        pbar.update(rct)
  pbar.finish()
  print "Processed {} lines".format(rct)
  print " Wrote {} new gene_attribute_type rows to file {}".format(wct, ofn)
  return

def parse_ens_files(args):
  for sp in UP2ENSG.keys():
    fn = CONFIG[sp]['ensfile']
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
      print "Processing {} lines in file {}".format(line_ct, fn)
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    with open(fn, 'rU') as tsv:
      tsvreader = csv.reader(tsv, delimiter='\t')
      header = tsvreader.next() # skip header line
      for row in tsvreader:
        # 0: gene_stable_id
        # 1: transcript_stable_id
        # 2: protein_stable_id
        # 3: xref
        # 4: db_name
        # 5: info_type
        # 6: source_identity
        # 7: xref_identity
        # 8: linkage_type
        ct += 1
        pbar.update(ct)
        if row[7] != '100':
          continue
        UP2ENSG[sp][row[3]].add(row[0])
    pbar.finish()
  if not args['--quiet']:
    mct = sum([len(UP2ENSG[sp]) for sp in UP2ENSG.keys()])
    print "Now have {} UniProt to ENSG mappings.\n".format(mct)

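# NB: parse_ens_files() assumes a module-level UP2ENSG whose per-species values
# support UP2ENSG[sp][xref].add(ensg). A minimal sketch of that setup follows;
# the species keys here are hypothetical and must match the keys in CONFIG.
from collections import defaultdict
UP2ENSG = {'Mouse': defaultdict(set), 'Rat': defaultdict(set)}  # species => UniProt xref => set of ENSG IDs
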
def load(args, dba, logger, logfile):
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    hom_ct = 0
    nf_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      # homologene_group_id  tax_id  ncbi_gene_id  symbol  protein_gi  ref_seq
      taxid = int(row[1])
      if taxid not in TAXIDS:
        skip_ct += 1
        continue
      if taxid == 9606:
        targets = dba.find_targets({'geneid': row[2]})
        if not targets:
          nf_ct += 1
          logger.warn("No target found for {}".format(row))
          continue
        for t in targets:
          p = t['components']['protein'][0]
          rv = dba.ins_homologene({'protein_id': p['id'], 'groupid': row[0], 'taxid': taxid})
          if rv:
            hom_ct += 1
          else:
            dba_err_ct += 1
      else:
        nhproteins = dba.find_nhproteins({'geneid': row[2]})
        if not nhproteins:
          nf_ct += 1
          logger.warn("No nhprotein found for {}".format(row))
          continue
        for nhp in nhproteins:
          rv = dba.ins_homologene({'nhprotein_id': nhp['id'], 'groupid': row[0], 'taxid': taxid})
          if rv:
            hom_ct += 1
          else:
            dba_err_ct += 1
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "Loaded {} new homologene rows".format(hom_ct)
  print " Skipped {} non-Human/Mouse/Rat lines".format(skip_ct)
  if nf_ct > 0:
    print "WARNING: No target/nhprotein found for {} lines. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args, dba, logfile, logger, ver, fn):
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
  ct = 0
  ins_ct = 0
  dba_err_ct = 0
  with open(fn, 'rU') as ifh:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(ifh)
    for row in csvreader:
      # 0: TCRD DB ID in version
      # 1: Name
      # 2: Description
      # 3: UniProt
      # 4: Symbol
      # 5: Gene ID
      # 6: TDL
      # 7: Family
      ct += 1
      geneid = None
      if row[5] != '\\N':
        geneid = row[5]
      rv = dba.ins_idg_evol({'tcrd_ver': ver, 'tcrd_dbid': row[0], 'name': row[1],
                             'description': row[2], 'uniprot': row[3], 'sym': row[4],
                             'geneid': geneid, 'tdl': row[6], 'fam': row[7]})
      if not rv:
        dba_err_ct += 1
        continue
      ins_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print " Inserted {} new idg_evol rows".format(ins_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  return True

def parse_hcop16(args):
  gzfn = DOWNLOAD_DIR + FILENAME
  fn = gzfn.replace('.gz', '')
  orthos = list()
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    tsvreader = csv.DictReader(tsv, delimiter='\t')
    for d in tsvreader:
      # ortholog_species
      # human_entrez_gene
      # human_ensembl_gene
      # hgnc_id
      # human_name
      # human_symbol
      # human_chr
      # human_assert_ids
      # ortholog_species_entrez_gene
      # ortholog_species_ensembl_gene
      # ortholog_species_db_id
      # ortholog_species_name
      # ortholog_species_symbol
      # ortholog_species_chr
      # ortholog_species_assert_ids
      # support
      src_ct = 0
      srcs = []
      if 'Inparanoid' in d['support']:
        src_ct += 1
        srcs.append('Inparanoid')
      if 'OMA' in d['support']:
        src_ct += 1
        srcs.append('OMA')
      if 'EggNOG' in d['support']:
        src_ct += 1
        srcs.append('EggNOG')
      if src_ct >= 2: # only take rows supported by at least two of the three sources
        d['sources'] = ', '.join(srcs)
        orthos.append(d)
  if not args['--quiet']:
    print " Generated ortholog dataframe with {} entries".format(len(orthos))
  ortho_df = pd.DataFrame(orthos)
  return ortho_df

def cnv_gene_attributes(args, idmap, ifn, ofn):
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(ifn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, ifn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  rct = 0
  wct = 0
  skip_ct = 0
  gaid = 1
  with open(ofn, 'w') as ofh:
    ofh.write('"id","protein_id","gat_id","name","value"\n')
    with open(ifn, 'r') as ifh:
      csvreader = csv.reader(ifh)
      header = csvreader.next() # skip header line
      rct = 1
      for row in csvreader:
        # "id","protein_id","gat_id","name","value"
        rct += 1
        v5pid = int(row[1])
        if v5pid in idmap: # v5 protein maps to v6 protein
          v6pid = idmap[v5pid]
        else:
          skip_ct += 1
          continue
        ofh.write('"{}","{}","{}","{}","{}"\n'.format(gaid, v6pid, row[2], row[3], row[4]))
        gaid += 1
        wct += 1
        pbar.update(rct)
  pbar.finish()
  print "Processed {} lines.".format(rct)
  print " Wrote {} new gene_attribute rows to file {}".format(wct, ofn)
  print " Skipped {} rows that do not map from v5 to v6.".format(skip_ct)
  return

def main():
  for ver, fn in INFILES.items():
    line_ct = slmf.wcl(fn)
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
    ct = 0
    with open(fn, 'r') as ifh:
      csvreader = csv.reader(ifh)
      for row in csvreader:
        # name, uniprot, sym, geneid, tdl
        ct += 1
        up = row[1]
        sym = row[2]
        tdl = row[4]
        TDLEvol[up][ver] = tdl
        UP2Sym[up] = sym
    print "{} lines processed.".format(ct)
    print "{} entries now in TDLEvol.".format(len(TDLEvol))
  ct = 0
  header = ['UniProt', 'HGNC Symbol', 'v1 TDL', 'v2 TDL', 'v3 TDL', 'v4 TDL', 'v5 TDL', 'v6 TDL']
  with open(OUTFILE, 'w') as csvout:
    csvwriter = csv.writer(csvout, quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header)
    ct += 1 # count the header line
    for up, tdld in TDLEvol.items():
      outrow = [up, UP2Sym[up]]
      for ver in ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']:
        if ver in tdld:
          outrow.append(tdld[ver])
        else:
          outrow.append('')
      csvwriter.writerow(outrow)
      ct += 1
  print "\nWrote {} lines to output file {}.".format(ct, OUTFILE)
  return True

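# NB: main() relies on module-level accumulators that are not shown here.
# TDLEvol[up][ver] = tdl requires a dict-of-dicts with autovivification. A
# minimal sketch of the assumed globals; the INFILES/OUTFILE paths are
# hypothetical placeholders:
from collections import defaultdict
INFILES = {'v1': 'tcrd_v1_tdls.csv', 'v2': 'tcrd_v2_tdls.csv'}  # ...through 'v6'
OUTFILE = 'TCRD_TDL_Evolution.csv'
TDLEvol = defaultdict(dict)  # UniProt => {version: TDL}
UP2Sym = {}                  # UniProt => HGNC symbol
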
def load_RGD(args, dba, logger, logfile):
  fn = CONFIG['RGD']['DOWNLOAD_DIR'] + CONFIG['RGD']['QTL_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "Processing {} lines in processed RGD file {}".format(line_ct, fn)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  nhpmark = {}
  qtl_ct = 0
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      # 0 "GENE_RGD_ID"
      # 1 "nhprotein_id"
      # 2 "QTL_RGD_ID"
      # 3 "QTL_SYMBOL"
      # 4 "QTL_NAME"
      # 5 "LOD"
      # 6 "P_VALUE"
      # 7 "TRAIT_NAME"
      # 8 "MEASUREMENT_TYPE"
      # 9 "ASSOCIATED_DISEASES"
      # 10 "PHENOTYPES"
      init = {'nhprotein_id': row[1], 'rgdid': row[0], 'qtl_rgdid': row[2],
              'qtl_symbol': row[3], 'qtl_name': row[4]}
      if row[5] and row[5] != 'None':
        init['lod'] = row[5]
      if row[6] and row[6] != 'None':
        init['p_value'] = row[6]
      if row[7] and row[7] != 'None':
        init['trait_name'] = row[7]
      if row[8] and row[8] != 'None':
        init['measurement_type'] = row[8]
      if row[9] and row[9] != 'None':
        init['associated_disease'] = row[9]
      if row[10] and row[10] != 'None':
        init['phenotype'] = row[10]
      rv = dba.ins_rat_qtl(init)
      if not rv:
        dba_err_ct += 1
        continue
      qtl_ct += 1
      nhpmark[row[1]] = True
      pbar.update(ct)
  pbar.finish()
  print "Processed {} lines".format(ct)
  print "Inserted {} new rat_qtl rows for {} nhproteins.".format(qtl_ct, len(nhpmark))
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  fn = CONFIG['RGD']['DOWNLOAD_DIR'] + CONFIG['RGD']['TERMS_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "Processing {} lines in processed RGD file {}".format(line_ct, fn)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  term_ct = 0
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      # 0 "RGD_ID"
      # 1 "OBJECT_SYMBOL"
      # 2 "TERM_ACC_ID"
      # 3 "TERM_NAME"
      # 4 "QUALIFIER"
      # 5 "EVIDENCE"
      # 6 "ONTOLOGY"
      init = {'rgdid': row[0], 'term_id': row[2]}
      if row[1] and row[1] != 'None':
        init['obj_symbol'] = row[1]
      if row[3] and row[3] != 'None':
        init['term_name'] = row[3]
      if row[4] and row[4] != 'None':
        init['qualifier'] = row[4]
      if row[5] and row[5] != 'None':
        init['evidence'] = row[5]
      if row[6] and row[6] != 'None':
        init['ontology'] = row[6]
      rv = dba.ins_rat_term(init)
      if not rv:
        dba_err_ct += 1
        continue
      term_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "Processed {} lines".format(ct)
  print "Inserted {} new rat_term rows.".format(term_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'RGD',
                                'source': 'Files {} and {} produced by UNM KMC group from files from ftp://ftp.rgd.mcw.edu/pub/data_release/'.format(CONFIG['RGD']['QTL_FILE'], CONFIG['RGD']['TERMS_FILE']),
                                'app': PROGRAM, 'app_version': __version__})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'rat_term'},
           {'dataset_id': dataset_id, 'table_name': 'rat_qtl'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'GWAS Catalog',
                                'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' % os.path.basename(INFILE),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://www.ebi.ac.uk/gwas/home'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INFILE)
  line_ct -= 1
  if not args['--quiet']:
    print '\nProcessing {} lines from input file {}'.format(line_ct, INFILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  outlist = []
  with open(INFILE, 'rU') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 0
    notfnd = set()
    pmark = {}
    gwas_ct = 0
    dba_err_ct = 0
    # 0: DATE ADDED TO CATALOG
    # 1: PUBMEDID
    # 2: FIRST AUTHOR
    # 3: DATE
    # 4: JOURNAL
    # 5: LINK
    # 6: STUDY
    # 7: DISEASE/TRAIT
    # 8: INITIAL SAMPLE SIZE
    # 9: REPLICATION SAMPLE SIZE
    # 10: REGION
    # 11: CHR_ID
    # 12: CHR_POS
    # 13: REPORTED GENE(S)
    # 14: MAPPED_GENE
    # 15: UPSTREAM_GENE_ID
    # 16: DOWNSTREAM_GENE_ID
    # 17: SNP_GENE_IDS
    # 18: UPSTREAM_GENE_DISTANCE
    # 19: DOWNSTREAM_GENE_DISTANCE
    # 20: STRONGEST SNP-RISK ALLELE
    # 21: SNPS
    # 22: MERGED
    # 23: SNP_ID_CURRENT
    # 24: CONTEXT
    # 25: INTERGENIC
    # 26: RISK ALLELE FREQUENCY
    # 27: P-VALUE
    # 28: PVALUE_MLOG
    # 29: P-VALUE (TEXT)
    # 30: OR or BETA
    # 31: 95% CI (TEXT)
    # 32: PLATFORM [SNPS PASSING QC]
    # 33: CNV
    # 34: MAPPED_TRAIT
    # 35: MAPPED_TRAIT_URI
    # 36: STUDY ACCESSION
    # 37: GENOTYPING TECHNOLOGY
    symregex = re.compile(r' ?[-,;] ?')
    for row in tsvreader:
      ct += 1
      if len(row) < 15: # need at least a MAPPED_GENE field (index 14)
        continue
      symstr = row[14]
      if symstr == 'NR':
        continue
      symlist = symregex.split(symstr)
      for sym in symlist:
        if sym in notfnd:
          continue
        targets = dba.find_targets({'sym': sym})
        if not targets:
          notfnd.add(sym)
          logger.warn("No target found for symbol {}".format(sym))
          continue
        for t in targets:
          p = t['components']['protein'][0]
          try:
            pval = float(row[27])
          except:
            pval = None
          try:
            orbeta = float(row[30])
          except:
            orbeta = None
          if row[25]:
            ig = int(row[25])
          else:
            ig = None
          rv = dba.ins_gwas({'protein_id': p['id'], 'disease_trait': row[7], 'snps': row[21],
                             'pmid': row[1], 'study': row[6], 'context': row[24],
                             'intergenic': ig, 'p_value': pval, 'or_beta': orbeta,
                             'cnv': row[33], 'mapped_trait': row[34], 'mapped_trait_uri': row[35]})
          if not rv:
            dba_err_ct += 1
            continue
          pmark[p['id']] = True
          gwas_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Inserted {} new gwas rows for {} proteins".format(gwas_ct, len(pmark.keys()))
  if notfnd:
    print "No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load_JAX(args, dba, logger, logfile):
  fn = CONFIG['MPO_OWL_FILE']
  if not args['--quiet']:
    print "Parsing Mammalian Phenotype Ontology file {}".format(fn)
  mpo = parse_mp_owl(fn)
  if not args['--quiet']:
    print "Got {} MP terms".format(len(mpo))
  fn = CONFIG['JAX']['DOWNLOAD_DIR'] + CONFIG['JAX']['FILENAME']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "Processing {} lines from JAX file {}".format(line_ct, fn)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pt_ct = 0
    skip_ct = 0
    pmark = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if not row[6] or row[6] == '':
        skip_ct += 1
        continue
      sym = row[0]
      geneid = row[1]
      k = "%s|%s" % (sym, geneid)
      if k in notfnd:
        continue
      targets = dba.find_targets({'sym': sym}, idg=False)
      if not targets:
        targets = dba.find_targets({'geneid': geneid}, idg=False)
      if not targets:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      for t in targets:
        pid = t['components']['protein'][0]['id']
        pmark[pid] = True
        for mpid in row[6].split():
          rv = dba.ins_phenotype({'protein_id': pid, 'ptype': 'JAX/MGI Human Ortholog Phenotype',
                                  'term_id': mpid, 'term_name': mpo[mpid]['name']})
          if rv:
            pt_ct += 1
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new phenotype rows for {} proteins".format(pt_ct, len(pmark))
  print " Skipped {} lines with no MP terms".format(skip_ct)
  if notfnd:
    print " No target found for {} gene symbols/ids. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'JAX/MGI Mouse/Human Orthology Phenotypes',
                                'source': 'File %s from ftp.informatics.jax.org' % CONFIG['JAX']['FILENAME'],
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://www.informatics.jax.org/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'phenotype',
            'where_clause': "ptype = 'JAX/MGI Human Ortholog Phenotype'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

def tinx(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {} # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {} # PMID => count of diseases mentioned in a given paper
  pmid_protein_ct = {} # PMID => count of proteins mentioned in a given paper

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print "\nParsing Disease Ontology file {}".format(dofile)
  do_parser = obo.Parser(open(dofile))
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print " Got {} Disease Ontology terms".format(len(do))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in protein file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsvf:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      pbar.update(ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not targets:
        notfnd.add(ensp)
        continue
      for t in targets:
        p = t['components']['protein'][0]
        k = "%s,%s" % (p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  pbar.finish()
  for ensp in notfnd:
    logger.warn("No target found for {}".format(ensp))
  print "{} lines processed.".format(ct)
  print " Skipped {} non-ENSP lines".format(skip_ct)
  print " Saved {} protein to PMIDs mappings".format(len(pid2pmids))
  print " Saved {} PMID to protein count mappings".format(len(pmid_protein_ct))
  if notfnd:
    print " No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile)

  fn = JL_DOWNLOAD_DIR + DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsvf:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      pbar.update(ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warn("%s not found in DO" % doid)
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  pbar.finish()
  print "{} lines processed.".format(ct)
  print " Skipped {} non-DOID lines".format(skip_ct)
  print " Saved {} DOID to PMIDs mappings".format(len(doid2pmids))
  print " Saved {} PMID to disease count mappings".format(len(pmid_disease_ct))
  if notfnd:
    print "WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile)

  if not args['--quiet']:
    print "\nComputing protein novelty scores"
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write("%s,%.8f\n" % (k, novelty))
  print " Wrote {} novelty scores to file {}".format(ct, PROTEIN_NOVELTY_FILE)

  if not args['--quiet']:
    print "\nComputing disease novelty scores"
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write("%s,%.8f\n" % (doid, novelty))
  print " Wrote {} novelty scores to file {}".format(ct, DISEASE_NOVELTY_FILE)

  if not args['--quiet']:
    print "\nComputing importance scores"
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'wb') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
        if fdt_score_sum > 0:
          ct += 1
          impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
  print " Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE)

  if not args['--quiet']:
    print "\nComputing PubMed rankings"
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'wb') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = [] # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append((pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
        if len(scores) > 0:
          scores.sort(cmp_pmids_scores)
          for i, t in enumerate(scores):
            ct += 1
            pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
  print " Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)

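# NB: cmp_pmids_scores() used by tinx() above is not shown in this file. A
# minimal Python 2 comparator consistent with the ranking comment (ascending
# score; ties broken by putting larger, i.e. newer, PMIDs first) might be:
def cmp_pmids_scores(a, b):
  # a and b are (pmid, score) tuples
  if a[1] != b[1]:
    return cmp(a[1], b[1])  # lower score => higher priority
  return cmp(b[0], a[0])    # equal scores: larger PMID first
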
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'OMIM',
                                'source': 'Files %s downloaded from omim.org' % ", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE]),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://omim.org/',
                                'comments': 'Confirmed OMIM phenotypes and OMIM Phenotype Series info'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'omim'},
           {'dataset_id': dataset_id, 'table_name': 'omim_ps'},
           {'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'OMIM'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  # OMIMs and Phenotypic Series
  fname = DOWNLOAD_DIR + TITLES_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print '\nProcessing %d lines from input file %s' % (line_ct, fname)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    omim_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'): # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0: Prefix ???
      # 1: MIM Number
      # 2: Preferred Title; symbol  Alternative Title(s); symbol(s)
      # 3: Included Title(s); symbols
      title = row[2].partition(';')[0]
      rv = dba.ins_omim({'mim': row[1], 'title': title})
      if not rv:
        dba_err_ct += 1
        continue
      omim_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print " Skipped {} commented lines.".format(skip_ct)
  print "Loaded {} new omim rows".format(omim_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  fname = DOWNLOAD_DIR + PS_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print '\nProcessing %d lines from input file %s' % (line_ct, fname)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    ps_ct = 0
    err_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'): # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0: Phenotypic Series Number
      # 1: MIM Number
      # 2: Phenotype
      if len(row) == 2:
        init = {'omim_ps_id': row[0], 'title': row[1]}
      elif len(row) == 3:
        init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
      else:
        err_ct += 1
        logger.warn("Parsing error for row {}".format(row))
        continue
      rv = dba.ins_omim_ps(init)
      if not rv:
        dba_err_ct += 1
        continue
      ps_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print " Skipped {} commented lines.".format(skip_ct)
  print "Loaded {} new omim_ps rows".format(ps_ct)
  if err_ct > 0:
    print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Phenotypes
  fname = DOWNLOAD_DIR + GENEMAP_FILE
  line_ct = slmf.wcl(fname)
  if not args['--quiet']:
    print '\nProcessing %d lines from input file %s' % (line_ct, fname)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fname, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    tmark = {}
    skip_ct = 0
    notfnd_ct = 0
    prov_ct = 0
    dds_ct = 0
    pt_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'): # The file has commented lines
        skip_ct += 1
        continue
      # The fields are:
      # 0 - Sort ???
      # 1 - Month
      # 2 - Day
      # 3 - Year
      # 4 - Cytogenetic location
      # 5 - Gene Symbol(s)
      # 6 - Confidence
      # 7 - Gene Name
      # 8 - MIM Number
      # 9 - Mapping Method
      # 10 - Comments
      # 11 - Phenotypes
      # 12 - Mouse Gene Symbol
      pts = row[11]
      if pts.startswith('?'):
        prov_ct += 1
        continue
      if '(4)' in pts:
        dds_ct += 1
      trait = "MIM Number: %s" % row[8]
      if row[11]:
        trait += "; Phenotype: %s" % pts
      found = False
      syms = row[5].split(', ')
      logger.info("Checking for OMIM syms: {}".format(syms))
      for sym in syms:
        targets = dba.find_targets({'sym': sym})
        if targets:
          found = True
          for t in targets:
            p = t['components']['protein'][0]
            logger.info("  Symbol {} found target {}: {}, {}".format(sym, t['id'], p['name'], p['description']))
            rv = dba.ins_phenotype({'protein_id': p['id'], 'ptype': 'OMIM', 'trait': trait})
            if not rv:
              dba_err_ct += 1
              continue
            tmark[t['id']] = True
            pt_ct += 1
      if not found:
        notfnd_ct += 1
        logger.warn("No target found for row {}".format(row))
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print " Skipped {} commented lines.".format(skip_ct)
  print " Skipped {} provisional phenotype rows.".format(prov_ct)
  print " Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
  print "Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark))
  if notfnd_ct > 0:
    print "No target found for {} good lines. See logfile {} for details.".format(notfnd_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Expression Atlas',
                                'source': 'IDG-KMC generated data at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://www.ebi.ac.uk/gxa/',
                                'comments': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'disease',
                           'where_clause': "dtype = 'Expression Atlas'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INPUT_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  with open(INPUT_FILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 0
    k2pids = {}
    pmark = {}
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      # 0: "Gene ID"
      # 1: "DOID"
      # 2: "Gene Name"
      # 3: "log2foldchange"
      # 4: "p-value"
      # 5: "disease"
      # 6: "experiment_id"
      # 7: "contrast_id"
      ct += 1
      sym = row[2]
      ensg = row[0]
      k = "%s|%s" % (sym, ensg)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'sym': sym}, idg=False)
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'ENSG', 'value': ensg})
        if not targets:
          notfnd.add(k)
          logger.warn("No target found for {}".format(k))
          continue
        pids = []
        for t in targets:
          p = t['components']['protein'][0]
          pmark[p['id']] = True
          pids.append(p['id'])
        k2pids[k] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_disease({'protein_id': pid, 'dtype': 'Expression Atlas', 'name': row[5],
                              'did': row[1], 'log2foldchange': "%.3f" % float(row[3]),
                              'pvalue': row[4]})
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new disease rows for {} proteins.".format(dis_ct, len(pmark))
  if notfnd:
    print "No target found for {} symbols/ensgs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'JensenLab PubMed Text-mining Scores',
                                'source': 'File %s' % (BASE_URL + FILENAME),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': BASE_URL})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'pmscore'},
           {'dataset_id': dataset_id, 'table_name': 'tdl_info',
            'where_clause': "itype = 'JensenLab PubMed Score'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  ensp2pids = {}
  pmscores = {} # protein.id => sum(all scores)
  pms_ct = 0
  upd_ct = 0
  notfnd = {}
  dba_err_ct = 0
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    for row in tsvreader:
      # sym  year  score
      ct += 1
      pbar.update(ct)
      if not row[0].startswith('ENSP'):
        continue
      ensp = row[0]
      if ensp in ensp2pids:
        # we've already found it
        pids = ensp2pids[ensp]
      elif ensp in notfnd:
        # we've already not found it
        continue
      else:
        targets = dba.find_targets({'stringid': ensp})
        if not targets:
          targets = dba.find_targets_by_xref({'xtype': 'STRING', 'value': '9606.' + ensp})
        if not targets:
          notfnd[ensp] = True
          logger.warn("No target found for {}".format(ensp))
          continue
        pids = []
        for target in targets:
          pids.append(target['components']['protein'][0]['id'])
        ensp2pids[ensp] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_pmscore({'protein_id': pid, 'year': row[1], 'score': row[2]})
        if rv:
          pms_ct += 1
        else:
          dba_err_ct += 1
        if pid in pmscores:
          pmscores[pid] += float(row[2])
        else:
          pmscores[pid] = float(row[2])
  pbar.finish()
  print "{} input lines processed.".format(ct)
  print " Inserted {} new pmscore rows for {} targets".format(pms_ct, len(pmscores))
  if len(notfnd) > 0:
    print "No target found for {} STRING IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  print "\nLoading {} JensenLab PubMed Score tdl_infos".format(len(pmscores.keys()))
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for pid, score in pmscores.items():
    ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'JensenLab PubMed Score',
                           'number_value': score})
    if rv:
      ti_ct += 1
    else:
      dba_err_ct += 1
  print "{} processed".format(ct)
  print " Inserted {} new JensenLab PubMed Score tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'LINCS',
                                'source': "CSV file exported from Oleg Ursu's lincs PostgreSQL database on seaborgium. I do not know the origin of this database at this time.",
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://lincsproject.org/LINCS/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'lincs'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INPUT_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  gid2pids = {}
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  lincs_ct = 0
  with open(INPUT_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    for row in tsvreader:
      # 0: level5_lm.pr_gene_id
      # 1: level5_lm.zscore
      # 2: perturbagen.dc_id
      # 3: perturbagen.canonical_smiles
      # 4: signature.cell_id
      ct += 1
      gid = row[0]
      if gid in gid2pids:
        # we've already found it
        pids = gid2pids[gid]
      elif gid in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'geneid': gid}, False)
        if not targets:
          notfnd.add(gid)
          continue
        pids = []
        for t in targets:
          pid = t['components']['protein'][0]['id']
          pids.append(pid)
        gid2pids[gid] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_lincs({'protein_id': pid, 'cellid': row[4], 'zscore': row[1],
                            'pert_dcid': row[2], 'pert_smiles': row[3]})
        if not rv:
          dba_err_ct += 1
          continue
        pmark[pid] = True
        lincs_ct += 1
      pbar.update(ct)
  pbar.finish()
  for gid in notfnd:
    logger.warn("No target found for {}".format(gid))
  print "{} lines processed.".format(ct)
  print "Loaded {} new lincs rows for {} proteins.".format(lincs_ct, len(pmark))
  if notfnd:
    print "No target found for {} geneids. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'MLP Assay Info',
                                'source': 'IDG-KMC generated data by Jeremy Yang at UNM.',
                                'app': PROGRAM, 'app_version': __version__,
                                'comments': "This data is generated at UNM from PubChem and EUtils data. It contains details about targets studied in assays that were part of NIH's Molecular Libraries Program."})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mlp_assay_info'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if os.path.isfile(T2AID_PICKLE):
    t2aid = pickle.load(open(T2AID_PICKLE, 'rb'))
    act = 0
    for tid in t2aid.keys():
      for aid in t2aid[tid]:
        act += 1
    if not args['--debug']:
      print "\n{} targets have link(s) to {} PubChem MLP assay(s)".format(len(t2aid), act)
  else:
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    line_ct = slmf.wcl(AIDGI_FILE)
    t2aid = {}
    if not args['--quiet']:
      print "\nProcessing {} lines in file {}".format(line_ct, AIDGI_FILE)
    with open(AIDGI_FILE, 'rU') as csvfile:
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      csvreader = csv.reader(csvfile)
      ct = 0
      skip_ct = 0
      fndgi_ct = 0
      fndpl_ct = 0
      notfnd = set()
      assay_ct = 0
      dba_err_ct = 0
      for row in csvreader:
        # aid, tgt_gi, tgt_species, tgt_name
        #print "[DEBUG]", row
        ct += 1
        if row[2] != 'Homo sapiens':
          skip_ct += 1
          continue
        gi = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'NCBI GI', 'value': gi})
        if targets:
          fndgi_ct += 1
        else:
          url = EFETCH_PROTEIN_URL + gi
          r = requests.get(url)
          if r.status_code == 200:
            soup = BeautifulSoup(r.text, "xml")
            grl = soup.find('Gene-ref_locus')
            if grl:
              sym = grl.text
              targets = dba.find_targets({'sym': sym})
          if targets:
            fndpl_ct += 1
          else:
            notfnd.add(gi)
            logger.warn("No target found for GI {}".format(gi))
            continue
        t = targets[0]
        tid = t['id']
        if tid in t2aid:
          t2aid[tid].append(row[0])
          assay_ct += 1
        else:
          t2aid[tid] = [row[0]]
          assay_ct += 1
        pbar.update(ct)
    pbar.finish()
    pickle.dump(t2aid, open(T2AID_PICKLE, "wb"))
    print "\n{} rows processed.".format(ct)
    print " {} assays linked to {} TCRD targets".format(assay_ct, len(t2aid))
    print " Skipped {} non-human assay rows".format(skip_ct)
    print " {} linked by GI; {} linked via EUtils".format(fndgi_ct, fndpl_ct)
    print " No target found for {} GIs. See logfile {} for details".format(len(notfnd), logfile)

  assay_info = {}
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(ASSAYS_FILE)
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, ASSAYS_FILE)
  start_time = time.time()
  with open(ASSAYS_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    ct = 0
    for row in csvreader:
      # ID,ActivityOutcomeMethod,AssayName,SourceName,ModifyDate,DepositDate,ActiveSidCount,InactiveSidCount,InconclusiveSidCount,TotalSidCount,ActiveCidCount,TotalCidCount,ProteinTargetList
      ct += 1
      aid = row[0]
      assay_info[aid] = row[1:]
      pbar.update(ct)
    pbar.finish()
  elapsed = time.time() - start_time
  print "Got assay info for {} assays.".format(len(assay_info))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = len(t2aid.keys())
  if not args['--quiet']:
    print "\nLoading MLP Assay Info for {} targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  ti_ct = 0
  mai_ct = 0
  dba_err_ct = 0
  for tid, aids in t2aid.items():
    ct += 1
    for aid in aids:
      ainfo = assay_info[aid]
      rv = dba.ins_mlp_assay_info({'protein_id': tid, 'aid': aid, 'assay_name': ainfo[1],
                                   'method': ainfo[0], 'active_sids': ainfo[5],
                                   'inactive_sids': ainfo[6], 'iconclusive_sids': ainfo[7],
                                   'total_sids': ainfo[8]})
      if rv:
        mai_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "\n{} targets processed.".format(ct)
  print " Inserted {} new mlp_assay_info rows".format(mai_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'PubTator Text-mining Scores',
                                'source': 'File %s' % (BASE_URL + FILENAME),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/',
                                'comments': 'PubTator data was subjected to the same counting scheme used to generate JensenLab PubMed Scores.'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'ptscore'},
           {'dataset_id': dataset_id, 'table_name': 'tdl_info',
            'where_clause': "itype = 'PubTator Score'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  ptscores = {} # protein.id => sum(all scores)
  pts_ct = 0
  dba_err_ct = 0
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    geneid2pid = {}
    notfnd = set()
    for row in tsvreader:
      # NCBI Gene ID  year  score
      ct += 1
      pbar.update(ct)
      gidstr = row[0].replace(',', ';')
      geneids = gidstr.split(';')
      for geneid in geneids:
        if not geneid or '(tax:' in geneid:
          continue
        if geneid in geneid2pid:
          # we've already found it
          pids = geneid2pid[geneid]
        elif geneid in notfnd:
          # we've already not found it
          continue
        else:
          targets = dba.find_targets({'geneid': geneid})
          if not targets:
            notfnd.add(geneid)
            logger.warn("No target found for {}".format(geneid))
            continue
          pids = []
          for target in targets:
            pids.append(target['components']['protein'][0]['id'])
          geneid2pid[geneid] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_ptscore({'protein_id': pid, 'year': row[1], 'score': row[2]})
          if rv:
            pts_ct += 1
          else:
            dba_err_ct += 1
          if pid in ptscores:
            ptscores[pid] += float(row[2])
          else:
            ptscores[pid] = float(row[2])
  pbar.finish()
  print "{} lines processed.".format(ct)
  print " Inserted {} new ptscore rows for {} targets.".format(pts_ct, len(ptscores))
  if notfnd:
    print "No target found for {} NCBI Gene IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  print "\nLoading {} PubTator Score tdl_infos".format(len(ptscores))
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for pid, score in ptscores.items():
    ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'PubTator Score',
                           'number_value': score})
    if rv:
      ti_ct += 1
    else:
      dba_err_ct += 1
  print "{} processed".format(ct)
  print "Inserted {} new PubTator Score tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'NCBI GI Numbers',
                                'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://www.uniprot.org/'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'xref',
                           'where_clause': "dataset_id = %d" % dataset_id})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  start_time = time.time()
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  # ID Mapping fields
  # 1. UniProtKB-AC
  # 2. UniProtKB-ID
  # 3. GeneID (EntrezGene)
  # 4. RefSeq
  # 5. GI
  # 6. PDB
  # 7. GO
  # 8. UniRef100
  # 9. UniRef90
  # 10. UniRef50
  # 11. UniParc
  # 12. PIR
  # 13. NCBI-taxon
  # 14. MIM
  # 15. UniGene
  # 16. PubMed
  # 17. EMBL
  # 18. EMBL-CDS
  # 19. Ensembl
  # 20. Ensembl_TRS
  # 21. Ensembl_PRO
  # 22. Additional PubMed
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}
    xref_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    for line in tsv:
      data = line.split('\t')
      ct += 1
      up = data[0]
      if not data[4]: # no GI
        skip_ct += 1
        continue
      targets = dba.find_targets({'uniprot': up})
      if not targets:
        skip_ct += 1
        continue
      target = targets[0]
      tmark[target['id']] = True
      pid = target['components']['protein'][0]['id']
      for gi in data[4].split('; '):
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'NCBI GI',
                           'dataset_id': dataset_id, 'value': gi})
        if rv:
          xref_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "\n{} rows processed".format(ct)
  print " Inserted {} new GI xref rows for {} targets".format(xref_ct, len(tmark))
  print " Skipped {} rows with no GI".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} database errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Drug Central', 'source': "Drug Central download files: %s" % ", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://drugcentral.org/'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile {} for details.".format(logfile)
    sys.exit(1)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'drug_activity'},
           {'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'DrugCentral Indication'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
      sys.exit(1)

  # First get mapping of DrugCentral names to ids
  name2id = {}
  line_ct = slmf.wcl(NAME_ID_FILE)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, NAME_ID_FILE)
  with open(NAME_ID_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        continue
      name2id[row[0]] = row[1].replace("\n", '')
  print "{} input lines processed.".format(ct)
  print "Saved {} keys in name2id map".format(len(name2id))

  # Next get drug info fields
  infos = {}
  line_ct = slmf.wcl(DRUGINFO_FILE)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, DRUGINFO_FILE)
  with open(DRUGINFO_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        continue
      infos[row[0]] = row[1].replace("\n", '')
  print "{} input lines processed.".format(ct)
  print "Saved {} keys in infos map".format(len(infos))

  #
  # MOA activities
  #
  drug2tids = defaultdict(list)
  line_ct = slmf.wcl(TCLIN_FILE)
  line_ct -= 1 # for the header
  if not args['--quiet']:
    print "\nProcessing {} lines from DrugDB MOA activities file {}".format(line_ct, TCLIN_FILE)
  with open(TCLIN_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    # uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id
    ct = 0
    da_ct = 0
    err_ct = 0
    notfnd = []
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      up = row[0]
      sp = row[1]
      drug = row[2]
      if drug not in name2id:
        err_ct += 1
        logger.warn("No DrugCentral id found for {}".format(drug))
        continue
      dcid = name2id[drug]
      targets = dba.find_targets({'uniprot': up})
      if not targets:
        targets = dba.find_targets({'name': sp})
        if not targets:
          notfnd.append(up)
          continue
      tid = targets[0]['id']
      drug2tids[drug].append(tid)
      init = {'target_id': tid, 'drug': drug, 'dcid': dcid, 'has_moa': 1}
      if row[3]:
        init['act_value'] = row[3]
      if row[4]:
        init['act_type'] = row[4]
      if row[5]:
        init['action_type'] = row[5]
      if row[6]:
        init['source'] = row[6]
      if row[7]:
        init['reference'] = row[7]
      if row[8]:
        init['smiles'] = row[8]
      if row[9]:
        init['cmpd_chemblid'] = row[9]
      if drug in infos:
        init['nlm_drug_info'] = infos[drug]
      rv = dba.ins_drug_activity(init)
      if rv:
        da_ct += 1
      else:
        dba_err_ct += 1
  print "{} DrugCentral Tclin rows processed.".format(ct)
  print "  Inserted {} new drug_activity rows".format(da_ct)
  if notfnd:
    print "WARNING: {} UniProt/SwissProt accessions NOT FOUND in TCRD:".format(len(notfnd))
    for up in notfnd:
      print up
  if err_ct > 0:
    print "WARNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Non-MOA activities
  #
  line_ct = slmf.wcl(TCHEM_FILE)
  line_ct -= 1 # for the header
  if not args['--quiet']:
    print "\nProcessing {} lines from Non-MOA activities file {}".format(line_ct, TCHEM_FILE)
  with open(TCHEM_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    # uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id
    ct = 0
    da_ct = 0
    err_ct = 0
    notfnd = []
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      up = row[0]
      sp = row[1]
      drug = row[2]
      if drug not in name2id:
        err_ct += 1
        logger.warn("No DrugCentral id found for {}".format(drug))
        continue
      dcid = name2id[drug]
      targets = dba.find_targets({'uniprot': up})
      if not targets:
        targets = dba.find_targets({'name': sp})
        if not targets:
          notfnd.append(up)
          continue
      tid = targets[0]['id']
      drug2tids[drug].append(tid)
      init = {'target_id': tid, 'drug': drug, 'dcid': dcid, 'has_moa': 0}
      if row[3]:
        init['act_value'] = row[3]
      if row[4]:
        init['act_type'] = row[4]
      if row[5]:
        init['action_type'] = row[5]
      if row[6]:
        init['source'] = row[6]
      if row[7]:
        init['reference'] = row[7]
      if row[8]:
        init['smiles'] = row[8]
      if row[9]:
        init['cmpd_chemblid'] = row[9]
      if drug in infos:
        init['nlm_drug_info'] = infos[drug]
      rv = dba.ins_drug_activity(init)
      if rv:
        da_ct += 1
      else:
        dba_err_ct += 1
  print "{} DrugCentral Tchem rows processed.".format(ct)
  print "  Inserted {} new drug_activity rows".format(da_ct)
  if notfnd:
    print "WARNING: {} DrugDB UniProt accessions NOT FOUND in TCRD:".format(len(notfnd))
    for up in notfnd:
      print up
  if err_ct > 0:
    print "WARNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format(err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Indications (diseases)
  #
  line_ct = slmf.wcl(DRUGIND_FILE)
  line_ct -= 1 # for the header
  if not args['--quiet']:
    print "\nProcessing {} lines from indications file {}".format(line_ct, DRUGIND_FILE)
  with open(DRUGIND_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    # DRUG_ID DRUG_NAME INDICATION_FDB UMLS_CUI SNOMEDCT_CUI DOID
    ct = 0
    t2d_ct = 0
    notfnd = {}
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      drug = row[1]
      if drug not in drug2tids:
        notfnd[drug] = True
        continue
      init = {'dtype': 'DrugCentral Indication', 'name': row[2], 'drug_name': drug}
      if row[5] != '':
        init['did'] = row[5]
      for tid in drug2tids[drug]:
        # NB: Using target_id as protein_id works for now, but will not if/when we have multiple-protein targets
        init['protein_id'] = tid
        rv = dba.ins_disease(init)
        if rv:
          t2d_ct += 1
        else:
          dba_err_ct += 1
  print "{} DrugCentral indication rows processed.".format(ct)
  print "  Inserted {} new disease rows".format(t2d_ct)
  if notfnd:
    print "WARNING: {} drugs NOT FOUND in activity files:".format(len(notfnd))
    for drug in notfnd:
      print drug
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
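# All of these loaders size their progress bars with slmf.wcl(). The slmf
# module itself is not part of this listing; a minimal sketch of an
# equivalent helper, assuming wcl() simply counts lines the way `wc -l`
# does, would be:
def wcl(fname):
  # Stream the file so large inputs are not read into memory at once.
  ct = 0
  with open(fname) as f:
    for _ in f:
      ct += 1
  return ct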
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'AnimalTFDB', 'source': 'http://www.bioguo.org/AnimalTFDB/BrowseAllTF.php?spe=Homo_sapiens', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.bioguo.org/AnimalTFDB/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Is Transcription Factor'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}
  line_ct = slmf.wcl(INFILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}\n".format(line_ct, INFILE)
  with open(INFILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    ti_ct = 0
    notfnd = []
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      sym = row[3]
      targets = dba.find_targets({'sym': sym})
      if not targets:
        gid = row[2]
        targets = dba.find_targets({'geneid': gid})
      if not targets:
        ensg = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg})
      if not targets:
        notfnd.append(row)
        continue
      t = targets[0]
      TDLs[t['tdl']] += 1
      pid = t['components']['protein'][0]['id']
      rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Is Transcription Factor', 'boolean_value': 1})
      if rv:
        ti_ct += 1
      else:
        dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for row in notfnd:
    logger.warn("No target found for row {}".format(row))
  print "\n{} lines processed.".format(ct)
  print "  Inserted {} new Is Transcription Factor tdl_infos".format(ti_ct)
  if notfnd:
    print "No target found for {} rows. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
  for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
    print "{}: {}".format(tdl, TDLs[tdl])
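# Each load() in this listing expects a docopt-style args dict. The option
# names used above imply a usage pattern along these lines (a sketch; the
# real docopt strings are not part of this listing):
#
#   Usage:
#     loader.py [--debug | --quiet] [--dbhost=<str>] [--dbname=<str>] [--logfile=<file>] [--loglevel=<int>]
#
# so a typical entry point would look like:
#   if __name__ == '__main__':
#     from docopt import docopt
#     args = docopt(__doc__, version=__version__)
#     load(args)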
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Jensen Lab DISEASES', 'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://diseases.jensenlab.org/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype LIKE 'JensenLab %'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  # Knowledge channel
  fn = DOWNLOAD_DIR + FILE_K
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      ensp = row[0]
      sym = row[1]
      k = "%s|%s" % (ensp, sym)
      if k in notfnd:
        continue
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        targets = dba.find_targets({'sym': sym}, idg=False)
      if not targets:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      dtype = 'JensenLab Knowledge ' + row[4]
      for t in targets:
        p = t['components']['protein'][0]
        pmark[p['id']] = True
        init = {'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'evidence': row[5], 'conf': row[6]}
        rv = dba.ins_disease(init)
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Inserted {} new disease rows for {} proteins".format(dis_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Experiment channel
  fn = DOWNLOAD_DIR + FILE_E
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    notfnd = set()
    dis_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if row[6] == '0':
        # skip zero confidence rows
        skip_ct += 1
        continue
      ensp = row[0]
      sym = row[1]
      k = "%s|%s" % (ensp, sym)
      if k in notfnd:
        continue
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        targets = dba.find_targets({'sym': sym}, idg=False)
      if not targets:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      dtype = 'JensenLab Experiment ' + row[4]
      for t in targets:
        p = t['components']['protein'][0]
        pmark[p['id']] = True
        rv = dba.ins_disease({'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'evidence': row[5], 'conf': row[6]})
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Inserted {} new disease rows for {} proteins".format(dis_ct, len(pmark))
  if skip_ct > 0:
    print "Skipped {} zero confidence rows".format(skip_ct)
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Text Mining channel
  fn = DOWNLOAD_DIR + FILE_T
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      ensp = row[0]
      sym = row[1]
      k = "%s|%s" % (ensp, sym)
      if k in notfnd:
        continue
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        targets = dba.find_targets({'sym': sym}, idg=False)
      if not targets:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      dtype = 'JensenLab Text Mining'
      for t in targets:
        p = t['components']['protein'][0]
        pmark[p['id']] = True
        rv = dba.ins_disease({'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'zscore': row[4], 'conf': row[5]})
        if not rv:
          dba_err_ct += 1
          continue
        dis_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Inserted {} new disease rows for {} proteins".format(dis_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Jensen Lab TISSUES', 'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://tissues.jensenlab.org/'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "type LIKE 'JensenLab %'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  # This dict maps ENSP|sym keys from the input files to TCRD protein_id(s),
  # so we only have to find target(s) once for each pair. See find_pids() below.
  pmap = {}
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  # Knowledge channel
  fn = DOWNLOAD_DIR + FILE_K
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Knowledge ' + row[4]
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1, 'oid': row[2], 'evidence': row[5], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Experiment channel
  fn = DOWNLOAD_DIR + FILE_E
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      if row[6] == '0':
        # skip zero confidence rows
        skip_ct += 1
        continue
      sym = row[1]
      # Some rows look like:
      # ['ENSP00000468389', 'PSENEN {ECO:0000313|Ensembl:ENSP00000468593}', 'BTO:0002860', 'Oral mucosa', 'HPA', 'High: 1 antibody', '1']
      if ' ' in sym:
        sym = sym.split()[0]
      k = "%s|%s" % (row[0], sym) # ENSP|sym
      if k in notfnd:
        continue
      try:
        pids = find_pids(dba, k, pmap)
      except ValueError:
        print "[ERROR] Row: %s; k: %s" % (str(row), k)
        continue
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Experiment ' + row[4]
      init = {'etype': etype, 'tissue': row[3], 'string_value': row[5], 'oid': row[2], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  print "  Skipped {} zero confidence rows".format(skip_ct)
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Text Mining channel
  fn = DOWNLOAD_DIR + FILE_T
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Text Mining'
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1, 'oid': row[2], 'zscore': row[4], 'conf': row[5], 'url': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
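# find_pids() is called by the TISSUES loader above but defined elsewhere in
# that script. A minimal sketch consistent with its call sites, where k is an
# "ENSP|sym" key and the passed-in dict caches results so each pair is looked
# up only once:
def find_pids(dba, k, k2pids):
  if k in k2pids:
    return k2pids[k]
  (ensp, sym) = k.split('|')
  pids = []
  targets = dba.find_targets({'stringid': ensp})
  if not targets:
    targets = dba.find_targets({'sym': sym})
  if targets:
    for t in targets:
      pids.append(t['components']['protein'][0]['id'])
    k2pids[k] = pids # only cache hits; the caller tracks misses in notfnd
  return pids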
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Reactome Protein-Protein Interactions', 'source': "File %s" % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.reactome.org/'})
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'Reactome'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines from Reactome PPI file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 1
    skip_ct = 0
    same12_ct = 0
    dup_ct = 0
    ppis = {}
    ppi_ct = 0
    up2pid = {}
    notfnd = set()
    dba_err_ct = 0
    # 0: Interactor 1 uniprot id
    # 1: Interactor 1 Ensembl gene id
    # 2: Interactor 1 Entrez Gene id
    # 3: Interactor 2 uniprot id
    # 4: Interactor 2 Ensembl gene id
    # 5: Interactor 2 Entrez Gene id
    # 6: Interaction type
    # 7: Interaction context Pubmed references
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      if not row[0].startswith('uniprotkb:'):
        continue
      if not row[3].startswith('uniprotkb:'):
        continue
      up1 = row[0].replace('uniprotkb:', '')
      up2 = row[3].replace('uniprotkb:', '')
      if not up1 or not up2:
        skip_ct += 1
        continue
      # protein1
      if up1 in up2pid:
        pid1 = up2pid[up1]
      elif up1 in notfnd:
        continue
      else:
        t1 = find_target(dba, up1)
        if not t1:
          notfnd.add(up1)
          continue
        pid1 = t1['components']['protein'][0]['id']
        up2pid[up1] = pid1
      # protein2
      if up2 in up2pid:
        pid2 = up2pid[up2]
      elif up2 in notfnd:
        continue
      else:
        t2 = find_target(dba, up2)
        if not t2:
          notfnd.add(up2)
          continue
        pid2 = t2['components']['protein'][0]['id']
        up2pid[up2] = pid2
      int_type = row[6]
      ppik = up1 + "|" + up2 + "|" + int_type
      if ppik in ppis:
        dup_ct += 1
        continue
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi({'ppitype': 'Reactome', 'interaction_type': int_type, 'protein1_id': pid1, 'protein1_str': up1, 'protein2_id': pid2, 'protein2_str': up2})
      if rv:
        ppi_ct += 1
        ppis[ppik] = True
      else:
        dba_err_ct += 1
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for: {}".format(up))
  print "{} Reactome PPI rows processed.".format(ct)
  print "  Inserted {} ({}) new ppi rows".format(ppi_ct, len(ppis))
  if skip_ct:
    print "  Skipped {} rows without two UniProt interactors".format(skip_ct)
  if dup_ct:
    print "  Skipped {} duplicate PPIs".format(dup_ct)
  if same12_ct:
    print "  Skipped {} PPIs involving the same protein".format(same12_ct)
  if notfnd:
    print "  No target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
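# The Reactome loader's find_target() helper is likewise defined elsewhere in
# its script. Its call sites pass a bare UniProt accession, so a minimal
# sketch (assuming lookup by UniProt only) is:
def find_target(dba, up):
  targets = dba.find_targets({'uniprot': up})
  if targets:
    return targets[0]
  return None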
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/' % (os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  aliasmap = {}
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ct = 0
  skip_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid uniprot_ac|uniprot_id string_id identity bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      for alias in [uniprot, name]:
        if alias in aliasmap:
          # Save mapping with highest bit score
          if bitscore > aliasmap[alias][1]:
            aliasmap[alias] = (ensp, bitscore)
        else:
          aliasmap[alias] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # string_protein_id alias source
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid for protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
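# The STRING ID update above tries aliasmap keys in a fixed precedence order:
# UniProt accession, protein name, 'hsa:<geneid>', then HGNC ID. The same
# resolution, factored into a helper (a sketch; resolve_stringid is not in
# the original script):
def resolve_stringid(aliasmap, p, hgncid):
  for key in [p['uniprot'], p['name'], 'hsa:' + str(p['geneid']), hgncid]:
    if key and key in aliasmap:
      return aliasmap[key][0] # aliasmap values are (ensp, bitscore) tuples
  return None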
def load_OMIM(args, dba, logger, logfile):
  # OMIMs and Phenotypic Series
  fn = CONFIG['OMIM']['DOWNLOAD_DIR'] + CONFIG['OMIM']['TITLES_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print 'Processing %d lines from input file %s' % (line_ct, fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    omim_ct = 0
    dba_err_ct = 0
    # The fields are:
    # 0: Prefix ???
    # 1: Mim Number
    # 2: Preferred Title; symbol Alternative Title(s); symbol(s)
    # 3: Included Title(s); symbols
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      title = row[2].partition(';')[0]
      rv = dba.ins_omim({'mim': row[1], 'title': title})
      if not rv:
        dba_err_ct += 1
        continue
      omim_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "Loaded {} new omim rows".format(omim_ct)
  print "  Skipped {} commented lines.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  fn = CONFIG['OMIM']['DOWNLOAD_DIR'] + CONFIG['OMIM']['PS_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print 'Processing %d lines from input file %s' % (line_ct, fn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    skip_ct = 0
    ps_ct = 0
    err_ct = 0
    dba_err_ct = 0
    # The fields are:
    # 0: Phenotypic Series Number
    # 1: Mim Number
    # 2: Phenotype
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      if len(row) == 2:
        init = {'omim_ps_id': row[0], 'title': row[1]}
      elif len(row) == 3:
        init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
      else:
        err_ct += 1
        logger.warn("Parsing error for row {}".format(row))
        continue
      rv = dba.ins_omim_ps(init)
      if not rv:
        dba_err_ct += 1
        continue
      ps_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "Loaded {} new omim_ps rows".format(ps_ct)
  print "  Skipped {} commented lines.".format(skip_ct)
  if err_ct > 0:
    print "WARNING: {} parsing errors occurred. See logfile {} for details.".format(err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Phenotypes
  fn = CONFIG['OMIM']['DOWNLOAD_DIR'] + CONFIG['OMIM']['GENEMAP2_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print 'Processing %d lines from input file %s' % (line_ct, fn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    skip_ct = 0
    notfnd_ct = 0
    prov_ct = 0
    dds_ct = 0
    pt_ct = 0
    dba_err_ct = 0
    # The fields are:
    # 0: Chromosome
    # 1: Genomic Position Start
    # 2: Genomic Position End
    # 3: Cyto Location
    # 4: Computed Cyto Location
    # 5: MIM Number
    # 6: Gene Symbols
    # 7: Gene Name
    # 8: Approved Symbol
    # 9: Entrez Gene ID
    # 10: Ensembl Gene ID
    # 11: Comments
    # 12: Phenotypes
    # 13: Mouse Gene Symbol/ID
    for row in tsvreader:
      ct += 1
      if row[0].startswith('#'):
        # The file has commented lines
        skip_ct += 1
        continue
      pts = row[12] # Phenotypes field, per the layout above
      if pts.startswith('?'):
        # provisional association
        prov_ct += 1
        continue
      if '(4)' in pts:
        # deletion/duplication syndrome
        dds_ct += 1
        continue
      trait = "MIM Number: %s" % row[5]
      if pts:
        trait += "; Phenotype: %s" % pts
      if row[8]:
        syms = [row[8]]
      else:
        syms = row[6].split(', ') # fall back to the Gene Symbols field
      logger.info("Checking for OMIM syms: {}".format(syms))
      for sym in syms:
        targets = dba.find_targets({'sym': sym})
        if not targets and row[9]:
          targets = dba.find_targets({'geneid': int(row[9])})
        if not targets:
          notfnd_ct += 1
          logger.warn("No target found for row {}".format(row))
          continue
        for t in targets:
          p = t['components']['protein'][0]
          rv = dba.ins_phenotype({'protein_id': p['id'], 'ptype': 'OMIM', 'trait': trait})
          if not rv:
            dba_err_ct += 1
            continue
          pmark[p['id']] = True
          pt_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed".format(ct)
  print "Loaded {} OMIM phenotypes for {} proteins".format(pt_ct, len(pmark))
  print "  Skipped {} commented lines.".format(skip_ct)
  print "  Skipped {} provisional phenotype rows.".format(prov_ct)
  if dds_ct > 0:
    print "  Skipped {} deletion/duplication syndrome rows.".format(dds_ct)
  if notfnd_ct > 0:
    print "  No target found for {} good lines. See logfile {} for details.".format(notfnd_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Dataset
  dataset_id = dba.ins_dataset({'name': 'OMIM', 'source': 'Files %s from https://data.omim.org' % (", ".join(CONFIG['OMIM']['SRC_FILES'])), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://omim.org/', 'comments': 'OMIM phenotype associations and Phenotypic Series info. Neither provisional associations nor deletion/duplication syndromes are loaded.'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'omim'},
           {'dataset_id': dataset_id, 'table_name': 'omim_ps'},
           {'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'OMIM'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
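# For illustration (not a row taken from the actual file): a genemap2 line
# with MIM Number 143100 and a Phenotypes field of
# "Huntington disease, 143100 (3)" would produce
#   trait = "MIM Number: 143100; Phenotype: Huntington disease, 143100 (3)"
# while a Phenotypes field starting with '?' (provisional) or containing
# '(4)' (deletion/duplication syndrome) is skipped entirely.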
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'Human Protein Atlas', 'source': 'IDG-KMC generated data by Steve Mathias at UNM from HPA file http://www.proteinatlas.org/download/normal_tissue.tsv.zip.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.proteinatlas.org/'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPA'", 'comment': 'Qualitative expression values are derived from files from http://www.proteinatlas.org/'},
           {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPA Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.proteinatlas.org/. The score is the Tau value as described in Yanai, I. et al., Bioinformatics 21(5): 650-659 (2005)'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  line_ct = slmf.wcl(HPA_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPA file {}".format(line_ct, HPA_FILE)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  nouid = set()
  with open(HPA_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    # "protein_id" "Tissue" "Gene" "Gene name" "Level" "Reliability"
    for row in tsvreader:
      ct += 1
      tissue = row[1]
      init = {'protein_id': row[0], 'etype': 'HPA', 'tissue': tissue, 'qual_value': row[4], 'evidence': row[5]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      rv = dba.ins_expression(init)
      if not rv:
        dba_err_ct += 1
        continue
      exp_ct += 1
      pmark[row[0]] = True
      pbar.update(ct)
  pbar.finish()
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "Processed {} HPA lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins.".format(exp_ct, len(pmark))
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(HPA_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPA TAU file {}".format(line_ct, HPA_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(HPA_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    # "Gene" "TAU" "protein_id"
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      if row[1] == 'None':
        skip_ct += 1
        continue
      rv = dba.ins_tdl_info({'protein_id': int(row[2]), 'itype': 'HPA Tissue Specificity Index', 'number_value': row[1]})
      if not rv:
        dba_err_ct += 1
        continue
      pmark[row[2]] = True
      ti_ct += 1
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPA Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct:
    print "  Skipped {} rows with no tau.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
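# The HPA and JensenLab TISSUES loaders repeat the same tissue-to-Uberon
# resolution: ontology ID first, then tissue name, then the manual map read
# from TISSUE2UBERON_FILE. A sketch of that logic as one helper (the actual
# scripts inline it, with minor per-loader variations in ordering):
def resolve_uberon_id(dba, tiss2uid, tissue, oid=None):
  uberon_id = None
  if oid:
    uberon_id = dba.get_uberon_id({'oid': oid})
  if not uberon_id:
    uberon_id = dba.get_uberon_id({'name': tissue})
  if not uberon_id and tissue in tiss2uid:
    uberon_id = tiss2uid[tissue]
  return uberon_id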
def load_GWASCatalog(args, dba, logger, logfile):
  fn = CONFIG['GWAS Catalog']['DOWNLOAD_DIR'] + CONFIG['GWAS Catalog']['FILENAME']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print 'Processing {} lines from GWAS Catalog file {}'.format(line_ct, fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 1
    notfnd = set()
    pmark = {}
    gwas_ct = 0
    dba_err_ct = 0
    # 0: DATE ADDED TO CATALOG  1: PUBMEDID  2: FIRST AUTHOR  3: DATE  4: JOURNAL
    # 5: LINK  6: STUDY  7: DISEASE/TRAIT  8: INITIAL SAMPLE SIZE  9: REPLICATION SAMPLE SIZE
    # 10: REGION  11: CHR_ID  12: CHR_POS  13: REPORTED GENE(S)  14: MAPPED_GENE
    # 15: UPSTREAM_GENE_ID  16: DOWNSTREAM_GENE_ID  17: SNP_GENE_IDS
    # 18: UPSTREAM_GENE_DISTANCE  19: DOWNSTREAM_GENE_DISTANCE
    # 20: STRONGEST SNP-RISK ALLELE  21: SNPS  22: MERGED  23: SNP_ID_CURRENT  24: CONTEXT
    # 25: INTERGENIC  26: RISK ALLELE FREQUENCY  27: P-VALUE  28: PVALUE_MLOG  29: P-VALUE (TEXT)
    # 30: OR or BETA  31: 95% CI (TEXT)  32: PLATFORM [SNPS PASSING QC]  33: CNV  34: MAPPED_TRAIT
    # 35: MAPPED_TRAIT_URI  36: STUDY ACCESSION  37: GENOTYPING TECHNOLOGY
    symregex = re.compile(r' ?[-,;] ?')
    for row in tsvreader:
      ct += 1
      if len(row) < 36:
        # skip truncated lines; fields up to MAPPED_TRAIT_URI (35) are used below
        continue
      symstr = row[14]
      if symstr == 'NR':
        continue
      symlist = symregex.split(symstr)
      for sym in symlist:
        if sym in notfnd:
          continue
        targets = dba.find_targets({'sym': sym})
        if not targets:
          notfnd.add(sym)
          logger.warn("No target found for symbol {}".format(sym))
          continue
        for t in targets:
          p = t['components']['protein'][0]
          try:
            pval = float(row[27])
          except ValueError:
            pval = None
          try:
            orbeta = float(row[30])
          except ValueError:
            orbeta = None
          if row[25]:
            ig = int(row[25])
          else:
            ig = None
          rv = dba.ins_gwas({'protein_id': p['id'], 'disease_trait': row[7], 'snps': row[21], 'pmid': row[1], 'study': row[6], 'context': row[24], 'intergenic': ig, 'p_value': pval, 'or_beta': orbeta, 'cnv': row[33], 'mapped_trait': row[34], 'mapped_trait_uri': row[35]})
          if not rv:
            dba_err_ct += 1
            continue
          pmark[p['id']] = True
          gwas_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Inserted {} new gwas rows for {} proteins".format(gwas_ct, len(pmark))
  if notfnd:
    print "  No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'GWAS Catalog', 'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' % os.path.basename(CONFIG['GWAS Catalog']['FILENAME']), 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/gwas/home'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
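# The symregex above splits multi-gene MAPPED_GENE values on ' - ', ',' and
# ';' separators. Illustrative values (not rows from the catalog):
#   re.compile(r' ?[-,;] ?').split('CASZ1 - PEX14')  ->  ['CASZ1', 'PEX14']
#   re.compile(r' ?[-,;] ?').split('FCGR2A;FCGR2B')  ->  ['FCGR2A', 'FCGR2B']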
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset({'name': 'BioPlex Protein-Protein Interactions', 'source': "Files %s from http://wren.hms.harvard.edu/bioplex/downloadInteractions.php" % ", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://wren.hms.harvard.edu/bioplex/index.php'})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'BioPlex'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  f = BIOPLEX_FILE
  line_ct = slmf.wcl(f)
  line_ct -= 1 # for the header
  if not args['--quiet']:
    print "\nProcessing {} lines from BioPlex PPI file {}".format(line_ct, f)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(f, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    # GeneA GeneB UniprotA UniprotB SymbolA SymbolB pW pNI pInt
    ct = 0
    ppi_ct = 0
    same12_ct = 0
    k2pid = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      geneid1 = row[0]
      geneid2 = row[1]
      up1 = row[2]
      up2 = row[3]
      sym1 = row[4]
      sym2 = row[5]
      pw = row[6]
      pni = row[7]
      pint = row[8]
      # protein1
      k1 = "%s|%s|%s" % (up1, sym1, geneid1)
      if k1 in k2pid:
        pid1 = k2pid[k1]
      elif k1 in notfnd:
        continue
      else:
        t1 = find_target(dba, k1)
        if not t1:
          notfnd.add(k1)
          continue
        pid1 = t1['components']['protein'][0]['id']
        k2pid[k1] = pid1
      # protein2
      k2 = "%s|%s|%s" % (up2, sym2, geneid2)
      if k2 in k2pid:
        pid2 = k2pid[k2]
      elif k2 in notfnd:
        continue
      else:
        t2 = find_target(dba, k2)
        if not t2:
          notfnd.add(k2)
          continue
        pid2 = t2['components']['protein'][0]['id']
        k2pid[k2] = pid2
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi({'ppitype': 'BioPlex', 'p_int': pint, 'p_ni': pni, 'p_wrong': pw, 'protein1_id': pid1, 'protein1_str': k1, 'protein2_id': pid2, 'protein2_str': k2})
      if rv:
        ppi_ct += 1
      else:
        dba_err_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for: {}".format(k))
  print "{} BioPlex PPI rows processed.".format(ct)
  print "  Inserted {} new ppi rows".format(ppi_ct)
  if same12_ct:
    print "  Skipped {} PPIs involving the same protein".format(same12_ct)
  if notfnd:
    print "  No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  for f in UPD_FILES[1:]:
    line_ct = slmf.wcl(f)
    line_ct -= 1 # for the header
    if not args['--quiet']:
      print "\nProcessing {} lines from BioPlex PPI update file {}".format(line_ct, f)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(f, 'rU') as tsv:
      tsvreader = csv.reader(tsv, delimiter='\t')
      header = tsvreader.next() # skip header line
      # plate_num well_num db_protein_id symbol gene_id bait_symbol bait_geneid pWrongID pNoInt pInt
      ct = 0
      ppi_ct = 0
      same12_ct = 0
      k2pid = {}
      notfnd = set()
      dba_err_ct = 0
      for row in tsvreader:
        ct += 1
        pbar.update(ct)
        geneid1 = row[6]
        geneid2 = row[4]
        sym1 = row[5]
        sym2 = row[3]
        pw = row[7]
        pni = row[8]
        pint = row[9]
        # protein1 (the update files have no UniProt field)
        k1 = "|%s|%s" % (sym1, geneid1)
        if k1 in k2pid:
          pid1 = k2pid[k1]
        elif k1 in notfnd:
          continue
        else:
          t1 = find_target(dba, k1)
          if not t1:
            notfnd.add(k1)
            continue
          pid1 = t1['components']['protein'][0]['id']
          k2pid[k1] = pid1
        # protein2
        k2 = "|%s|%s" % (sym2, geneid2)
        if k2 in k2pid:
          pid2 = k2pid[k2]
        elif k2 in notfnd:
          continue
        else:
          t2 = find_target(dba, k2)
          if not t2:
            notfnd.add(k2)
            continue
          pid2 = t2['components']['protein'][0]['id']
          k2pid[k2] = pid2
        if pid1 == pid2:
          same12_ct += 1
          continue
        # Insert PPI
        rv = dba.ins_ppi({'ppitype': 'BioPlex', 'p_int': pint, 'p_ni': pni, 'p_wrong': pw, 'protein1_id': pid1, 'protein1_str': k1, 'protein2_id': pid2, 'protein2_str': k2})
        if rv:
          ppi_ct += 1
        else:
          dba_err_ct += 1
    pbar.finish()
    for k in notfnd:
      logger.warn("No target found for: {}".format(k))
    print "{} BioPlex PPI rows processed.".format(ct)
    print "  Inserted {} new ppi rows".format(ppi_ct)
    if same12_ct:
      print "  Skipped {} PPIs involving the same protein".format(same12_ct)
    if notfnd:
      print "  No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile)
    if dba_err_ct > 0:
      print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
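# BioPlex's find_target() differs from Reactome's: each loader script defines
# its own helper, and this one takes a composite "up|sym|geneid" key (the
# update files pass an empty UniProt field). A minimal sketch matching those
# call sites, trying each identifier in turn:
def find_target(dba, k):
  (up, sym, geneid) = k.split('|')
  targets = None
  if up:
    targets = dba.find_targets({'uniprot': up})
  if not targets and sym:
    targets = dba.find_targets({'sym': sym})
  if not targets and geneid:
    targets = dba.find_targets({'geneid': geneid})
  if targets:
    return targets[0]
  return None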
def load_IMPC(args, dba, logger, logfile):
  fn = CONFIG['IMPC']['DOWNLOAD_DIR'] + CONFIG['IMPC']['GENO_PHENO_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "Processing {} lines from input file {}".format(line_ct, fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    ct = 1
    pt_ct = 0
    pmark = {}
    sym2nhps = {}
    notfnd = set()
    skip_ct = 0
    dba_err_ct = 0
    # 0: marker_accession_id  1: marker_symbol  2: phenotyping_center  3: colony_id
    # 4: sex  5: zygosity  6: allele_accession_id  7: allele_symbol
    # 8: allele_name  9: strain_accession_id  10: strain_name  11: project_name
    # 12: project_fullname  13: pipeline_name  14: pipeline_stable_id  15: procedure_stable_id
    # 16: procedure_name  17: parameter_stable_id  18: parameter_name  19: top_level_mp_term_id
    # 20: top_level_mp_term_name  21: mp_term_id  22: mp_term_name  23: p_value
    # 24: percentage_change  25: effect_size  26: statistical_method  27: resource_name
    for row in csvreader:
      ct += 1
      sym = row[1]
      if not row[21] and not row[22]:
        # skip rows with neither an mp_term_id nor an mp_term_name (IMPC has some of these)
        skip_ct += 1
        continue
      if sym in sym2nhps:
        # we've already found it
        nhpids = sym2nhps[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhps = dba.find_nhproteins({'sym': sym}, species='Mus musculus')
        if not nhps:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        nhpids = []
        for nhp in nhps:
          nhpids.append(nhp['id'])
        sym2nhps[sym] = nhpids # save this mapping so we only look up each nhprotein once
      pval = None
      if row[23] and row[23] != '':
        try:
          pval = float(row[23])
        except ValueError:
          logger.warn("Problem converting p_value {} for row {}".format(row[23], ct))
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC', 'top_level_term_id': row[19], 'top_level_term_name': row[20], 'term_id': row[21], 'term_name': row[22], 'p_value': pval, 'percentage_change': row[24], 'effect_size': row[25], 'procedure_name': row[16], 'parameter_name': row[18], 'statistical_method': row[26], 'sex': row[4], 'gp_assoc': 1})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark))
  if notfnd:
    print "No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if skip_ct > 0:
    print "Skipped {} lines with no term_id or term_name.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  fn = CONFIG['IMPC']['DOWNLOAD_DIR'] + CONFIG['IMPC']['STAT_RES_FILE']
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "Processing {} lines from input file {}".format(line_ct, fn)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(fn, 'rU') as csvfile:
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    ct = 1
    pt_ct = 0
    pmark = {}
    sym2nhps = {}
    notfnd = set()
    skip_ct = 0
    dba_err_ct = 0
    # 0: phenotyping_center  1: intercept_estimate  2: procedure_id  3: mutant_biological_model_id  4: rotated_residuals_test
    # 5: weight_effect_p_value  6: male_mutant_count  7: pipeline_stable_key  8: female_ko_effect_p_value  9: pipeline_stable_id
    # 10: parameter_stable_key  11: data_type  12: parameter_stable_id  13: interaction_significant  14: strain_accession_id
    # 15: control_selection_method  16: parameter_name  17: allele_name  18: phenotyping_center_id  19: weight_effect_stderr_estimate
    # 20: weight_effect_parameter_estimate  21: procedure_stable_id  22: status  23: sex_effect_parameter_estimate  24: female_ko_effect_stderr_estimate
    # 25: female_percentage_change  26: group_2_residuals_normality_test  27: marker_accession_id  28: mp_term_name  29: group_1_residuals_normality_test
    # 30: genotype_effect_p_value  31: dependent_variable  32: resource_name  33: project_id  34: procedure_name
    # 35: doc_id  36: top_level_mp_term_id  37: allele_accession_id  38: blups_test  39: null_test_p_value
    # 40: p_value  41: marker_symbol  42: control_biological_model_id  43: pipeline_name  44: sex
    # 45: interaction_effect_p_value  46: colony_id  47: project_name  48: female_ko_parameter_estimate  49: female_mutant_count
    # 50: organisation_id  51: external_db_id  52: female_control_count  53: intermediate_mp_term_id  54: db_id
    # 55: male_ko_effect_p_value  56: top_level_mp_term_name  57: metadata_group  58: sex_effect_stderr_estimate  59: zygosity
    # 60: male_percentage_change  61: sex_effect_p_value  62: mp_term_id  63: male_ko_effect_stderr_estimate  64: additional_information
    # 65: statistical_method  66: _version_  67: intercept_estimate_stderr_estimate  68: male_control_count  69: intermediate_mp_term_name
    # 70: strain_name  71: classification_tag  72: effect_size  73: procedure_stable_key  74: allele_symbol
    # 75: resource_id  76: group_2_genotype  77: variance_significant  78: pipeline_id  79: group_1_genotype
    # 80: male_ko_parameter_estimate  81: genotype_effect_parameter_estimate  82: categories  83: parameter_id  84: batch_significant
    # 85: genotype_effect_stderr_estimate  86: resource_fullname
    for row in csvreader:
      ct += 1
      sym = row[41]
      if not row[62] and not row[28]:
        # skip rows with neither an mp_term_id nor an mp_term_name
        skip_ct += 1
        continue
      if sym in sym2nhps:
        # we've already found it
        nhpids = sym2nhps[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhps = dba.find_nhproteins({'sym': sym}, species='Mus musculus')
        if not nhps:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        nhpids = []
        for nhp in nhps:
          nhpids.append(nhp['id'])
        sym2nhps[sym] = nhpids # save this mapping so we only look up each nhprotein once
      pval = None
      if row[40] and row[40] != '':
        try:
          pval = float(row[40])
        except ValueError:
          logger.warn("Problem converting p_value {} for row {}".format(row[40], ct))
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC', 'top_level_term_id': row[36], 'top_level_term_name': row[56], 'term_id': row[62], 'term_name': row[28], 'p_value': pval, 'effect_size': row[72], 'procedure_name': row[34], 'parameter_name': row[16], 'statistical_method': row[65], 'sex': row[44], 'gp_assoc': 0})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark))
  if notfnd:
    print "  No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if skip_ct > 0:
    print "  Skipped {} lines with no term_id or term_name.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Dataset
  dataset_id = dba.ins_dataset({'name': 'IMPC Phenotypes', 'source': "Files %s and %s from ftp://ftp.ebi.ac.uk/pub/databases/impc/release-11.0/csv/" % (CONFIG['IMPC']['GENO_PHENO_FILE'], CONFIG['IMPC']['STAT_RES_FILE']), 'app': PROGRAM, 'app_version': __version__})
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'IMPC'"}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
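# The GWAS and IMPC loaders both wrap float() conversions in try/except
# because the source files contain blanks and non-numeric values. That
# pattern, pulled out into a small helper (a sketch; to_float_or_none is not
# in the original scripts):
def to_float_or_none(s):
  if not s:
    return None
  try:
    return float(s)
  except ValueError:
    return None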