def load_data():

    nex_session = get_session()

    bud_id_to_id = dict([(x.bud_id, x.updatelog_id)
                         for x in nex_session.query(Updatelog).all()])

    # debug only: dump the known bud_ids and bail out before loading anything.
    # Left active, the bare "return" below made the whole loader a no-op.
    # for bud_id in bud_id_to_id:
    #     print(bud_id)
    # return

    i = 0
    j = 0
    f = open(file)
    for line in f:
        pieces = line.strip().split("\t")
        if int(pieces[0]) in bud_id_to_id:
            continue
        insert_update_log(nex_session, pieces)
        i = i + 1
        j = j + 1
        if i == 500:
            # commit in batches of 500 rows
            nex_session.commit()
            i = 0
        if j == 200000:
            # recycle the session periodically to keep memory bounded
            nex_session.close()
            nex_session = get_session()
            j = 0
    f.close()

    # nex_session.rollback()
    nex_session.commit()

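# A minimal sketch of the insert_update_log helper the loader above calls;
# the real helper lives elsewhere in this repo. Only the bud_id column is
# known from load_data's duplicate check -- the other columns are assumed.
def insert_update_log_sketch(nex_session, pieces):
    bud_id = int(pieces[0])        # column 0 holds the BUD id (per load_data)
    x = Updatelog(bud_id=bud_id)   # remaining Updatelog columns omitted here
    nex_session.add(x)
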
def update_reference_table(log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")

    pmid_to_reference = dict([(x.pmid, x)
                              for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    journal_id_to_abbrev = dict([(x.journal_id, x.med_abbr)
                                 for x in nex_session.query(Journal).all()])

    #################################################################

    source_id = source_to_id[SRC]

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")
    print(datetime.now())
    print("Getting Pubmed records...")

    pmids = []
    j = 0
    for pmid in pmid_to_reference:
        if pmid is None or pmid in [26842620, 27823544, 11483584]:
            continue
        j = j + 1
        if j > MAX_4_CONNECTION:
            # recycle the connection so it does not go stale mid-run
            nex_session.close()
            nex_session = get_session()
            j = 0
        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  journal_id_to_abbrev, source_id)
            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              journal_id_to_abbrev, source_id)

    print("Done")

    fw.close()
    nex_session.commit()

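# A plausible sketch of the get_pubmed_record helper used by the updaters in
# this file. The real helper is defined elsewhere in this repo; this version,
# written as an assumption, fetches MEDLINE records for a comma-separated
# PMID string with Biopython's Entrez client.
from Bio import Entrez, Medline

def get_pubmed_record_sketch(pmid_string):
    Entrez.email = "curator@example.org"   # assumed contact address; NCBI requires one
    handle = Entrez.efetch(db="pubmed", id=pmid_string,
                           rettype="medline", retmode="text")
    records = list(Medline.parse(handle))  # each record is a dict keyed by MEDLINE tags
    handle.close()
    return records
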
def update_data():

    nex_session = get_session()

    all_conds = nex_session.query(PhenotypeannotationCond).filter_by(
        condition_class='chemical').all()

    nex_session.close()
    nex_session = get_session()

    i = 0
    for x in all_conds:
        if x.condition_value is None or x.condition_value == "":
            continue
        if x.condition_unit is not None and x.condition_unit != "":
            continue
        condition_value = x.condition_value
        if " " not in condition_value:
            # insert a space between the number and a known unit suffix
            condition_value = condition_value.replace("uM", " uM")
            condition_value = condition_value.replace("mM", " mM")
            condition_value = condition_value.replace("g/L", " g/L")
            condition_value = condition_value.replace("%", " %")  # should we do this??
        values = condition_value.split(' ')
        if len(values) >= 3 or "," in condition_value or len(values) == 1:
            # print("TO_FIX:", str(x.condition_id) + "\t" + str(x.annotation_id) + "\t" + x.condition_name + "\t" + x.condition_value + "\t" + str(x.condition_unit))
            continue
        elif values[1] in ['analog', 'derivative', 'B', 'C', 'X', 'CaCl2',
                           'Brix', 'B1', '2.7']:
            # second token is not a unit, so leave this row for manual curation
            # print("TO_FIX:", str(x.condition_id) + "\t" + str(x.annotation_id) + "\t" + x.condition_name + "\t" + x.condition_value + "\t" + str(x.condition_unit))
            continue
        else:
            print(values[0], ":", values[1])
            nex_session.query(PhenotypeannotationCond).filter_by(
                condition_id=x.condition_id).update({
                    "condition_value": values[0],
                    "condition_unit": values[1]})
            i = i + 1
            if i > 500:
                nex_session.commit()
                i = 0

    nex_session.commit()
    nex_session.close()

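# Worked example of the normalization above (illustrative values, not data
# from the database): a condition_value stored as "10mM" becomes "10 mM",
# which splits into condition_value "10" and condition_unit "mM"; a value
# like "0.5 mM NaCl" yields three tokens and is left for manual curation.
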
def load_go_refs(mapping_file):

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    sgdid_to_dbentity_id = dict([(x.sgdid, x.dbentity_id)
                                 for x in nex_session.query(Dbentity).all()])

    source_id = source_to_id[src]

    f = open(mapping_file)
    for line in f:
        pieces = line.strip().split("\t")
        print(pieces[0])
        print(pieces[1])
        x = ReferenceAlias(display_name=pieces[0],
                           source_id=source_id,
                           reference_id=sgdid_to_dbentity_id[pieces[1]],
                           alias_type=type,
                           created_by=CREATED_BY)
        nex_session.add(x)
    f.close()

    nex_session.commit()
    nex_session.close()

def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    ecid_to_ec = dict([(x.ecid, x) for x in nex_session.query(Ec).all()])

    ec_id_to_alias = {}
    for x in nex_session.query(EcAlia).all():
        aliases = []
        if x.ec_id in ec_id_to_alias:
            aliases = ec_id_to_alias[x.ec_id]
        aliases.append(x.display_name)
        ec_id_to_alias[x.ec_id] = aliases

    ####################################

    fw = open(log_file, "w")

    data = read_data_file(enzyme_file)

    [update_log, to_delete_list] = load_new_data(nex_session, data,
                                                 source_to_id,
                                                 ecid_to_ec,
                                                 ec_id_to_alias,
                                                 fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()
    fw.close()

def load_eco_urls():

    nex_session = get_session()

    f = open("data/eco_go_urls.txt")
    for line in f:
        if line.startswith('eco_id'):
            continue
        pieces = line.strip().split('\t')
        eco_id = int(pieces[0])
        display_name = pieces[3]
        obj_url = pieces[4]
        url_type = pieces[5]
        source_id = int(pieces[6])
        x = EcoUrl(eco_id=eco_id,
                   display_name=display_name,
                   obj_url=obj_url,
                   url_type=url_type,
                   source_id=source_id,
                   created_by=CREATED_BY)
        nex_session.add(x)
        nex_session.commit()
        print(eco_id, display_name, obj_url, url_type, source_id)
    f.close()

    nex_session.commit()
    nex_session.close()

def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    chebiid_to_chebi = dict([(x.chebiid, x) for x in nex_session.query(Chebi).all()])

    chebi_id_to_alias = {}
    for x in nex_session.query(ChebiAlia).all():
        aliases = []
        if x.chebi_id in chebi_id_to_alias:
            aliases = chebi_id_to_alias[x.chebi_id]
        aliases.append((x.display_name, x.alias_type))
        chebi_id_to_alias[x.chebi_id] = aliases

    ####################################

    fw = open(log_file, "w")

    is_3_star_term = {}
    data = read_owl(ontology_file, ontology, is_3_star_term)

    [update_log, to_delete_list, term_name_changed] = load_new_data(nex_session, data,
                                                                    source_to_id,
                                                                    chebiid_to_chebi,
                                                                    chebi_id_to_alias,
                                                                    is_3_star_term,
                                                                    fw)

    write_summary_and_send_email(fw, update_log, to_delete_list, term_name_changed)

    nex_session.close()
    fw.close()

def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    obiid_to_obi = dict([(x.obiid, x) for x in nex_session.query(Obi).all()])
    term_to_ro_id = dict([(x.display_name, x.ro_id)
                          for x in nex_session.query(Ro).all()])

    obi_id_to_parent = {}
    for x in nex_session.query(ObiRelation).all():
        parents = []
        if x.child_id in obi_id_to_parent:
            parents = obi_id_to_parent[x.child_id]
        parents.append(x.parent_id)
        obi_id_to_parent[x.child_id] = parents

    ####################################

    fw = open(log_file, "w")

    is_sgd_term = {}
    data = read_owl(ontology_file, ontology)

    [update_log, to_delete_list] = load_new_data(nex_session, data,
                                                 source_to_id,
                                                 obiid_to_obi,
                                                 term_to_ro_id['is a'],
                                                 obi_id_to_parent,
                                                 fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()
    fw.close()

def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    psimodid_to_psimod = dict([(x.psimodid, x) for x in nex_session.query(Psimod).all()])
    term_to_ro_id = dict([(x.display_name, x.ro_id)
                          for x in nex_session.query(Ro).all()])

    psimod_id_to_parent = {}
    for x in nex_session.query(PsimodRelation).all():
        parents = []
        if x.child_id in psimod_id_to_parent:
            parents = psimod_id_to_parent[x.child_id]
        parents.append(x.parent_id)
        psimod_id_to_parent[x.child_id] = parents

    ####################################

    fw = open(log_file, "w")

    data = read_obo(ontology_file)

    [update_log, to_delete_list] = load_new_data(nex_session, data,
                                                 source_to_id,
                                                 psimodid_to_psimod,
                                                 term_to_ro_id['is a'],
                                                 psimod_id_to_parent,
                                                 fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()
    fw.close()

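# A minimal sketch of the read_obo parser used above, written under the
# assumption that it yields one dict per [Term] stanza keyed by the usual
# OBO tags; the real parser in this repo may track more fields.
def read_obo_sketch(ontology_file):
    terms = []
    term = None
    for line in open(ontology_file):
        line = line.strip()
        if line == "[Term]":
            term = {"is_a": []}
            terms.append(term)
        elif line.startswith("["):            # e.g. [Typedef] -- stop collecting
            term = None
        elif term is not None and ": " in line:
            tag, value = line.split(": ", 1)
            if tag == "is_a":
                term["is_a"].append(value.split(" ! ")[0])  # keep the parent id only
            elif tag in ("id", "name", "def"):
                term[tag] = value
    return terms
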
def update_database():

    nex_session = get_session()

    dataset_id_to_dataset = dict([(x.dataset_id, x)
                                  for x in nex_session.query(Dataset).filter_by(channel_count=1).all()])
    datasetsample_id_to_dataset_id = dict([(x.datasetsample_id, x.dataset_id)
                                           for x in nex_session.query(Datasetsample).all()])

    all_expressions = nex_session.query(Expressionannotation).all()

    for x in all_expressions:
        if x.log_ratio_value is not None:
            continue
        dataset_id = datasetsample_id_to_dataset_id.get(x.datasetsample_id)
        if dataset_id is None:
            # print("BAD: The datasetsample_id:", x.datasetsample_id, "is not in the database.")
            continue
        if dataset_id not in dataset_id_to_dataset:
            # print("BAD: The datasetsample_id:", x.datasetsample_id, "is mapped to a dataset_id =", dataset_id, "that doesn't have channel_count=1")
            continue

        ## update data from here: compute log2 of the normalized expression
        ## value; non-positive values map to 0, and the result is rounded to
        ## 2 decimal places. NOTE: this pass only prints the computed values;
        ## no row is actually updated.
        value = x.normalized_expression_value
        new_value = 0
        if value > 0:
            new_value = math.log(value, 2)
        new_value = float("%.2f" % new_value)
        print(x.annotation_id, value, new_value)

    nex_session.close()

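# A sketch of the write-back that update_database stops short of: as written
# it only prints the computed log2 values. If the intent is to persist them,
# an update along these lines (an assumption, not code from this repo) would
# complete the pass before nex_session.close():
#
#     nex_session.query(Expressionannotation).filter_by(
#         annotation_id=x.annotation_id).update({"log_ratio_value": new_value})
#     nex_session.commit()
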
def load_summaries(summary_file):

    nex_session = get_session()

    biocyc_id_to_dbentity_id = dict([(x.biocyc_id, x.dbentity_id)
                                     for x in nex_session.query(Pathwaydbentity).all()])
    pmid_to_reference_id = dict([(x.pmid, x.dbentity_id)
                                 for x in nex_session.query(Referencedbentity).all()])

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    f = open(summary_file)
    fw = open(log_file, "w")

    for line in f:
        pieces = line.strip().split("\t")
        pathway_name = pieces[0].strip()
        summary_text = pieces[1].strip()
        pmids = pieces[2].strip().replace(" ", "").split("|")
        dbentity_id = biocyc_id_to_dbentity_id.get(pathway_name)
        if dbentity_id is None:
            print("TO CHECK: The biocyc_id:", pathway_name, " is not in the database.")
            print(line)
            continue

        # summary_id = insert_pathwaysummary(nex_session, fw, dbentity_id, summary_text, source_id)
        # if summary_id is None:
        #     print("TO CHECK: Can't insert summary for biocyc_id: ", pathway_name)
        #     print(line)
        #     continue

        reference_id_list = []
        bad = 0
        for pmid in pmids:
            if int(pmid) in pmid_to_reference_id:
                reference_id_list.append(pmid_to_reference_id[int(pmid)])
            else:
                print("TO CHECK: The pmid: ", pmid, " is not in the database.")
                bad = 1
        if bad == 1:
            print(line)
            continue

        # insert_summary_references(nex_session, fw, reference_id_list, summary_id, source_id)

    # nex_session.commit()
    f.close()
    fw.close()

def update_all_urls(log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")

    pmid_to_reference = dict([(x.pmid, x)
                              for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])

    reference_id_to_urls = {}
    for x in nex_session.query(ReferenceUrl).all():
        urls = []
        if x.reference_id in reference_id_to_urls:
            urls = reference_id_to_urls[x.reference_id]
        urls.append((x.url_type, x.obj_url))
        reference_id_to_urls[x.reference_id] = urls

    #################################################################

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")
    print(datetime.now())
    print("Getting Pubmed records...")

    source_id = source_to_id[SRC]

    pmids = []
    for pmid in pmid_to_reference:
        fw.write("Getting data for PMID:" + str(pmid) + "\n")
        if pmid is None or pmid in [26842620, 27823544]:
            continue
        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  reference_id_to_urls, source_id)
            pmids = []
            # time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              reference_id_to_urls, source_id)

    print("Done")

    fw.close()
    nex_session.commit()

def load_domains():

    nex_session = get_session()

    fw = open(log_file, "w")

    read_data_and_update_database(nex_session, fw)

    nex_session.close()
    fw.close()

def get_data():

    nex_session = get_session()

    format_name_to_datasetsample_id = dict([(x.format_name, x.datasetsample_id)
                                            for x in nex_session.query(Datasetsample).all()])
    systematic_name_to_dbentity_id = dict([(x.systematic_name, x.dbentity_id)
                                           for x in nex_session.query(Locusdbentity).all()])
    pmid_to_reference_id = dict([(x.pmid, x.dbentity_id)
                                 for x in nex_session.query(Referencedbentity).all()])

    tax = nex_session.query(Taxonomy).filter_by(taxid="TAX:4932").one_or_none()
    taxonomy_id = tax.taxonomy_id

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    fw = open(outfile, "w")
    fw.write("dbentity_id\tsource_id\ttaxonomy_id\treference_id\tdatasetsample_id\tnormalized_expression_value\tlog_ratio_value\n")

    f = open(infile)
    seen = {}
    for line in f:
        pieces = line.strip().split("\t")
        reference_id = pmid_to_reference_id.get(int(pieces[0]))
        if reference_id is None:
            print("The pmid: ", pieces[0], " is not in the database.")
            continue
        dbentity_id = systematic_name_to_dbentity_id.get(pieces[6])
        if dbentity_id is None:
            print("The feature_name: ", pieces[6], " is not in the database.")
            continue
        datasetsample_id = format_name_to_datasetsample_id.get(pieces[1])
        if datasetsample_id is None:
            print("The datasetsample format_name: ", pieces[1], " is not in the database.")
            continue
        key = (dbentity_id, datasetsample_id)
        if key in seen:
            continue
        seen[key] = 1
        fw.write(str(dbentity_id) + "\t" + str(source_id) + "\t" +
                 str(taxonomy_id) + "\t" + str(reference_id) + "\t" +
                 str(datasetsample_id) + "\t" + pieces[4] + "\t" + pieces[5] + "\n")
    f.close()
    fw.close()

def load_geo_urls():

    nex_session = get_session()

    all_samples = nex_session.query(Datasetsample).all()

    nex_session.close()
    nex_session = get_session()

    i = 0
    for x in all_samples:
        if x.dbxref_id and x.dbxref_url is None:
            print(x.dbxref_id)
            dbxref_url = geo_root_url + x.dbxref_id
            nex_session.query(Datasetsample).filter_by(
                datasetsample_id=x.datasetsample_id).update(
                    {"dbxref_url": dbxref_url})
            i = i + 1
            if i == 200:
                nex_session.commit()
                i = 0

    nex_session.commit()
    nex_session.close()

def load_data():

    nex_session = get_session()

    key_to_id = dict([((x.dbentity_id, x.datasetsample_id), x.annotation_id)
                      for x in nex_session.query(Expressionannotation).all()])

    nex_session.close()
    nex_session = get_session()

    fw = open(log_file, "w")

    i = 0
    j = 0
    for file in files_to_load:
        f = open(file)
        for line in f:
            if line.startswith('dbentity_id'):
                continue
            pieces = line.strip().split("\t")
            if (int(pieces[0]), int(pieces[4])) in key_to_id:
                continue
            insert_expressionannotation(nex_session, fw, pieces)
            i = i + 1
            j = j + 1
            if i == 500:
                nex_session.commit()
                i = 0
            if j == 200000:
                nex_session.close()
                nex_session = get_session()
                j = 0
        f.close()
    fw.close()

    # nex_session.rollback()
    nex_session.commit()

def load_data():

    nex_session = get_session()

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    key_to_id = {}
    for x in nex_session.query(ArchLocuschange).all():
        old_value = ""
        if x.old_value is not None:
            old_value = x.old_value
        key = (x.dbentity_id, x.change_type, old_value, x.new_value,
               x.date_added_to_database)
        key_to_id[key] = x.archive_id

    fw = open(log_file, "w")

    for file in files_to_load:
        f = open(file)
        for line in f:
            if line.startswith('dbentity_id'):
                continue
            pieces = line.strip().split("\t")
            if len(pieces) < 9:
                print("Unknown line: ", line)
                continue
            date_added_to_database = reformat_date(pieces[5].strip())
            date_standardized = reformat_date(pieces[7].strip())
            date_archived = reformat_date(pieces[8].strip())
            # key = (int(pieces[0].strip()), change_type, pieces[3].strip(), pieces[4].strip(), date_added_to_database)
            # if key in key_to_id:
            #     print("In database: ", key)
            #     continue
            insert_into_database(nex_session, fw,
                                 int(pieces[0].strip()),
                                 source_id,
                                 int(pieces[1].strip()),
                                 pieces[3].strip(),
                                 pieces[4].strip(),
                                 date_added_to_database,
                                 pieces[6].strip(),
                                 date_archived,
                                 date_standardized)
        f.close()

    fw.close()

def update_data():

    nex_session = get_session()

    fw = open(log_file, "w")

    source_id_to_source = dict([(x.source_id, x.display_name)
                                for x in nex_session.query(Source).all()])
    all_aliases = nex_session.query(LocusAlias).all()

    nex_session.close()
    nex_session = get_session()

    i = 0
    for x in all_aliases:
        if x.obj_url:
            # print("OLD:", x.obj_url)
            continue
        obj_url = get_url(x.alias_type, x.display_name,
                          source_id_to_source[x.source_id])
        if obj_url != "":
            print("OLD:", x.obj_url, "NEW:", obj_url)
            nex_session.query(LocusAlias).filter_by(
                locus_id=x.locus_id,
                alias_type=x.alias_type,
                display_name=x.display_name,
                source_id=x.source_id).update({"obj_url": obj_url})
            i = i + 1
            if i > 500:
                nex_session.commit()
                i = 0

    nex_session.commit()
    nex_session.close()

def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    doid_to_disease = dict([(x.doid, x) for x in nex_session.query(Disease).all()])
    term_to_ro_id = dict([(x.display_name, x.ro_id)
                          for x in nex_session.query(Ro).all()])

    disease_id_to_alias = {}
    for x in nex_session.query(DiseaseAlia).all():
        aliases = []
        if x.disease_id in disease_id_to_alias:
            aliases = disease_id_to_alias[x.disease_id]
        aliases.append((x.display_name, x.alias_type))
        disease_id_to_alias[x.disease_id] = aliases

    disease_id_to_parent = {}
    for x in nex_session.query(DiseaseRelation).all():
        parents = []
        if x.child_id in disease_id_to_parent:
            parents = disease_id_to_parent[x.child_id]
        parents.append(x.parent_id)
        disease_id_to_parent[x.child_id] = parents

    ####################################

    fw = open(log_file, "w")

    is_sgd_term = {}
    data = read_owl(ontology_file, ontology)

    [update_log, to_delete_list] = load_new_data(nex_session, data,
                                                 source_to_id,
                                                 doid_to_disease,
                                                 term_to_ro_id['is a'],
                                                 disease_id_to_alias,
                                                 disease_id_to_parent,
                                                 fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()
    fw.close()

def update_all_relations(log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")
    print(datetime.now())
    print("Getting PMID list...")

    pmid_to_reference = dict([(x.pmid, x)
                              for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    key_to_type = dict([((x.parent_id, x.child_id), x.relation_type)
                        for x in nex_session.query(ReferenceRelation).all()])

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")
    print(datetime.now())
    print("Getting Pubmed records...")

    pmids = []
    for pmid in pmid_to_reference:
        fw.write("Getting data for PMID=" + str(pmid) + "\n")
        if pmid is None or pmid in [26842620, 27823544]:
            continue
        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  key_to_type, source_to_id)
            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              key_to_type, source_to_id)

    print("Done")

    fw.close()
    nex_session.commit()

def add_display_name():

    f = open(ocelot_file)

    biocyc_id = None
    display_name = None
    biocyc_to_display_name = {}
    for line in f:
        line = line.strip()
        if len(line) == 0:
            # a blank line closes the current record
            if biocyc_id and display_name:
                biocyc_to_display_name[biocyc_id] = display_name
            biocyc_id = None
            display_name = None
            continue
        if line.startswith('(') and line.endswith(' NIL ('):
            # record header: the first token is the biocyc id
            biocyc_id = line.replace("(", "").split(" ")[0]
            continue
        if line.startswith("(COMMON-NAME ") and line != "(COMMON-NAME NIL)":
            display_name = line.replace('(COMMON-NAME "', '').replace('")', '')
            continue
    f.close()

    nex_session = get_session()

    all_pathways = nex_session.query(Pathwaydbentity).all()
    for x in all_pathways:
        if x.biocyc_id in biocyc_to_display_name:
            nex_session.query(Pathwaydbentity).filter_by(
                biocyc_id=x.biocyc_id).update(
                    {'display_name': biocyc_to_display_name[x.biocyc_id]})
            print(x.biocyc_id + "\t" + biocyc_to_display_name[x.biocyc_id])
        else:
            print("NOT FOUND:", x.biocyc_id)

    nex_session.rollback()   # dry run: discard the updates instead of committing

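# For reference, the ocelot records the parser above expects look roughly
# like this (an illustrative sketch, not a real excerpt from the file):
#
#   (PWY3O-355 NIL (
#     ...
#     (COMMON-NAME "some pathway name")
#     ...
#
# A blank line ends the record, at which point the id/name pair is kept.
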
def load_data():

    nex_session = get_session()

    i = 0
    for file in files_to_load:
        f = open(file)
        for line in f:
            if line.startswith('dbentity_id'):
                continue
            pieces = line.strip().split("\t")
            insert_expressionannotation(nex_session, pieces)
            i = i + 1
            if i == 500:
                nex_session.commit()
                i = 0
        f.close()

    # nex_session.rollback()
    nex_session.commit()

def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    roid_to_ro = dict([(x.roid, x) for x in nex_session.query(Ro).all()])

    fw = open(log_file, "w")

    data = read_owl(ontology_file, ontology)

    [update_log, to_delete_list] = load_new_data(nex_session, data,
                                                 source_to_id[src],
                                                 roid_to_ro,
                                                 fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()
    fw.close()

def load_geo_urls():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    dataset_id_to_url = dict([(x.dataset_id, x)
                              for x in nex_session.query(DatasetUrl).filter_by(url_type='GEO').all()])

    source_id = source_to_id[src]

    for x in nex_session.query(Dataset).all():
        if x.dbxref_id and x.dbxref_id.startswith('GSE') and x.dataset_id not in dataset_id_to_url:
            print(x.dbxref_id)
            y = DatasetUrl(display_name=type,
                           dataset_id=x.dataset_id,
                           source_id=source_id,
                           obj_url=geo_root_url + x.dbxref_id,
                           url_type=type,
                           created_by=CREATED_BY)
            nex_session.add(y)

    nex_session.commit()
    nex_session.close()

def get_data():

    nex_session = get_session()

    format_name_to_datasetsample_id = dict([(x.format_name, x.datasetsample_id)
                                            for x in nex_session.query(Datasetsample).all()])
    systematic_name_to_dbentity_id = dict([(x.systematic_name, x.dbentity_id)
                                           for x in nex_session.query(Locusdbentity).all()])

    fw = open(outfile, "w")
    fw.write("dbentity_id\tdatasetsample_id\tlog_ratio_value\n")

    f = open(infile)
    seen = {}
    for line in f:
        pieces = line.strip().split("\t")
        dbentity_id = systematic_name_to_dbentity_id.get(pieces[6])
        if dbentity_id is None:
            print("The feature_name: ", pieces[6], " is not in the database.")
            continue
        datasetsample_id = format_name_to_datasetsample_id.get(pieces[1])
        if datasetsample_id is None:
            print("The datasetsample format_name: ", pieces[1], " is not in the database.")
            continue
        key = (dbentity_id, datasetsample_id)
        if key in seen:
            continue
        seen[key] = 1
        fw.write(str(dbentity_id) + "\t" + str(datasetsample_id) + "\t" +
                 pieces[5] + "\n")
    f.close()
    fw.close()

def update_all_authors(log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")
    print(datetime.now())
    print("Getting PMID list...")

    pmid_to_reference = dict([(x.pmid, x)
                              for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])

    reference_id_to_authors = {}
    for x in nex_session.query(Referenceauthor).order_by(
            Referenceauthor.reference_id, Referenceauthor.author_order).all():
        authors = []
        if x.reference_id in reference_id_to_authors:
            authors = reference_id_to_authors[x.reference_id]
        authors.append(x.display_name)
        reference_id_to_authors[x.reference_id] = authors

    #################################################################

    f = open('./authors_updated_pmid.lst')
    updated_pmid = {}
    for pmid in f:
        pmid = int(pmid.strip())
        updated_pmid[pmid] = 1
    f.close()

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")
    print(datetime.now())
    print("Getting Pubmed records...")

    source_id = source_to_id[SRC]

    j = 0
    pmids = []
    for pmid in pmid_to_reference:
        fw.write("Getting data for PMID:" + str(pmid) + "\n")
        if pmid is None or pmid in [26842620, 27823544, 11483584]:
            continue
        if pmid in updated_pmid:
            continue
        j = j + 1
        if j > MAX_4_CONNECTION:
            nex_session.close()
            nex_session = get_session()
            j = 0
        if len(pmids) >= MAX:
            records = get_pubmed_record_from_xml(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  reference_id_to_authors, source_id)
            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record_from_xml(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              reference_id_to_authors, source_id)

    print("Done")

    fw.close()
    nex_session.commit()

def load_summaries(summary_type, summary_file, log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("reading data from summary_file...\n")

    data = read_summary_file(nex_session, fw, summary_type, summary_file, log_file)

    fw.write(str(datetime.now()) + "\n")
    fw.write("retrieving data from database and storing the data in dictionaries...\n")

    key_to_summary = dict([((x.locus_id, x.summary_type, x.summary_order), x)
                           for x in nex_session.query(Locussummary).all()])
    key_to_summaryref = dict([((x.summary_id, x.reference_id, x.reference_order), x)
                              for x in nex_session.query(LocussummaryReference).all()])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    source_id = source_to_id.get('SGD')

    summary_id_to_references = {}
    for x in nex_session.query(LocussummaryReference).all():
        references = []
        if x.summary_id in summary_id_to_references:
            references = summary_id_to_references[x.summary_id]
        references.append(x)
        summary_id_to_references[x.summary_id] = references

    load_summary_holder = {"summary_added": 0,
                           "summary_updated": 0,
                           "summary_reference_added": 0}

    fw.write(str(datetime.now()) + "\n")
    fw.write("updating the database...\n")

    for x in data:
        key = (x['locus_id'], x['summary_type'], x['summary_order'])
        summary_id = None
        if key in key_to_summary:
            if x['text'] != key_to_summary[key].text.strip():
                fw.write("OLD:" + key_to_summary[key].text + ":\n")
                fw.write("NEW:" + x['text'] + ":\n")
                nex_session.query(Locussummary).filter_by(
                    summary_id=key_to_summary[key].summary_id).update(
                        {'text': x['text'], 'html': x['html']})
                nex_session.commit()
                load_summary_holder['summary_updated'] = load_summary_holder['summary_updated'] + 1
            else:
                fw.write("SUMMARY is in DB\n")
            summary_id = key_to_summary[key].summary_id
            update_references(nex_session, fw, load_summary_holder, source_id,
                              summary_id, summary_id_to_references.get(summary_id),
                              x.get('references'))
        else:
            summary_id = insert_summary(nex_session, fw, load_summary_holder, source_id, x)
            if x.get('references'):
                for y in x['references']:
                    insert_summary_reference(nex_session, fw, load_summary_holder,
                                             source_id, summary_id, y)

    nex_session.commit()
    nex_session.close()

    fw.write(str(datetime.now()) + "\n")
    fw.write("writing summary and sending an email to curators...\n")

    write_summary_and_send_email(load_summary_holder, fw, summary_type)

    fw.close()

def update_all_abstracts(log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    print(datetime.now())

    pmid_to_reference = dict([(x.pmid, x)
                              for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    reference_id_to_abstract = dict([(x.reference_id, x.text)
                                     for x in nex_session.query(Referencedocument).filter_by(
                                         document_type="Abstract").all()])

    #################################################################

    fw.write("Getting Pubmed records...\n")
    print("Getting Pubmed records...")

    source_id = source_to_id[SRC]

    pmids = []
    j = 0
    for pmid in pmid_to_reference:
        fw.write("Getting data for PMID: " + str(pmid) + "\n")
        if pmid is None or pmid in [26842620, 27823544]:
            continue
        j = j + 1
        if j > MAX_4_CONNECTION:
            nex_session.close()
            nex_session = get_session()
            j = 0
        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  reference_id_to_abstract, source_id)
            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              reference_id_to_abstract, source_id)

    print("Done")

    fw.close()
    nex_session.commit()

def load_data():

    nex_session = get_session()

    format_name_to_dataset_id_src = dict([(x.format_name, (x.dataset_id, x.source_id))
                                          for x in nex_session.query(Dataset).all()])
    taxid_to_taxonomy_id = dict([(x.taxid, x.taxonomy_id)
                                 for x in nex_session.query(Taxonomy).all()])
    format_name_to_datasetsample_id = dict([(x.format_name, x.datasetsample_id)
                                            for x in nex_session.query(Datasetsample).all()])

    fw = open(log_file, "w")

    # map (GSE, GSM) -> assay name for datasets with multiple assays
    GSE_GSM_to_assay = {}
    f = open(ds_multiassays_to_sample_mapping_file)
    for line in f:
        pieces = line.strip().split("\t")
        GSE = pieces[0].strip()
        assay_name = pieces[1]
        GSM_list = pieces[2].strip().split('|')
        for GSM in GSM_list:
            GSE_GSM_to_assay[(GSE, GSM)] = assay_name
    f.close()

    # map (GSE, assay name) -> per-assay dataset format_name
    GSE_assay_to_dataset_format_name = {}
    f = open(ds_with_multiassays_file)
    for line in f:
        pieces = line.strip().split("\t")
        GSE = pieces[0].strip()
        dataset_format_name = pieces[1].strip()
        assay_name = pieces[2].strip()
        GSE_assay_to_dataset_format_name[(GSE, assay_name)] = dataset_format_name
    f.close()

    format_name2display_name = {}
    dataset2index = {}
    for file in files_to_load:
        print("Loading data from ", file)
        f = open(file)
        for line in f:
            if line.startswith('dataset'):
                continue
            line = line.strip()
            if not line:
                continue
            pieces = line.replace('"', '').split("\t")
            GSE = pieces[0].strip()
            GSM = pieces[3].strip()
            dataset_format_name = GSE
            if (GSE, GSM) in GSE_GSM_to_assay:
                assay = GSE_GSM_to_assay[(GSE, GSM)]
                # print("FOUND assay:", (GSE, GSM), assay)
                if (GSE, assay) in GSE_assay_to_dataset_format_name:
                    dataset_format_name = GSE_assay_to_dataset_format_name[(GSE, assay)]
                    # print("FOUND dataset format_name:", (GSE, assay), dataset_format_name)
                # else:
                #     print("NOT FOUND dataset format_name:", (GSE, assay))
            if dataset_format_name not in format_name_to_dataset_id_src:
                print("The dataset: ", dataset_format_name, " is not in DATASET table.")
                continue
            (dataset_id, source_id) = format_name_to_dataset_id_src[dataset_format_name]
            if len(pieces) < 9 or pieces[8] == '':
                print("SHORT LINE:", len(pieces), line)
                continue
            display_name = pieces[1]
            data = {"source_id": source_id,
                    "dataset_id": dataset_id,
                    "display_name": display_name,
                    "sample_order": int(pieces[8])}
            if pieces[2] != '':
                data['description'] = pieces[2]
                if len(pieces[2]) > 500:
                    # description column holds at most 500 chars; fall back
                    data['description'] = display_name
            if pieces[5] != '':
                data['biosample'] = pieces[5]
            if pieces[7] != '':
                data['strain_name'] = pieces[7]
            if len(pieces) > 9 and pieces[9]:
                taxonomy_id = taxid_to_taxonomy_id.get("TAX:" + pieces[9])
                if taxonomy_id is None:
                    print("The taxid = ", pieces[9], " for: ", dataset_format_name, GSM, " is not in TAXONOMY table.")
                else:
                    data['taxonomy_id'] = taxonomy_id
            if GSM == '':
                # samples without a GSM get a sequential per-dataset format_name
                index = dataset2index.get(dataset_format_name, 0) + 1
                data['format_name'] = dataset_format_name + "_sample_" + str(index)
                if data['format_name'] in format_name_to_datasetsample_id:
                    print("format_name for Non GSM row: ", data['format_name'], " is used.")
                    continue
                dataset2index[dataset_format_name] = index
                data['obj_url'] = "/datasetsample/" + data['format_name']
                insert_datasetsample(nex_session, fw, data)
            else:
                data['dbxref_type'] = pieces[4]
                if format_name2display_name.get(GSM):
                    print("The format_name: ", GSM, " has been used for other sample", format_name2display_name.get(GSM))
                    continue
                format_name2display_name[GSM] = display_name
                data['format_name'] = dataset_format_name + "_" + GSM
                if data['format_name'] in format_name_to_datasetsample_id:
                    print("format_name for GSM row: ", data['format_name'], " is used.")
                    continue
                data['obj_url'] = "/datasetsample/" + data['format_name']
                data['dbxref_id'] = GSM
                insert_datasetsample(nex_session, fw, data)
        f.close()

    fw.close()

    # nex_session.rollback()
    nex_session.commit()

import sys
sys.path.insert(0, '../../../src/')
from models import Dataset, Datasetsample, Referencedbentity, DatasetReference
sys.path.insert(0, '../')
from database_session import get_nex_session as get_session

__author__ = 'sweng66'

nex_session = get_session()

dataset_id_to_sample_count = dict([(x.dataset_id, x.sample_count)
                                   for x in nex_session.query(Dataset).filter_by(is_in_spell='true').all()])

# tally the actual number of samples per dataset
sample_count = {}
for x in nex_session.query(Datasetsample).all():
    if x.dataset_id in sample_count:
        sample_count[x.dataset_id] = sample_count[x.dataset_id] + 1
    else:
        sample_count[x.dataset_id] = 1

# compare the tallies against the sample_count stored on the DATASET rows
for dataset_id in sample_count:
    if dataset_id not in dataset_id_to_sample_count:
        continue
    if sample_count[dataset_id] != dataset_id_to_sample_count[dataset_id]:
        print("MISMATCH: ", dataset_id, sample_count[dataset_id], dataset_id_to_sample_count[dataset_id])
    else:
        print("MATCH: ", dataset_id, sample_count[dataset_id], dataset_id_to_sample_count[dataset_id])