def main():
    """Build a map from ontology terms to linked super-terms across the
    cell, CL/UBERON/DOID, cell-line, and Cellosaurus ontologies and write
    it to term_to_superterm_linked_terms.json."""
    efo_cell_og, x, y = load_ontology.load("11")
    cl_uberon_doid_og, x, y = load_ontology.load("0")
    efo_cellline_og, x, y = load_ontology.load("10")
    cvcl_og, x, y = load_ontology.load("4")

    term_to_linkedsup_terms = term_to_linked_superterms(cl_uberon_doid_og)
    term_to_linkedsup_terms.update(term_to_linked_superterms(efo_cell_og))
    term_to_linkedsup_terms.update(term_to_linked_superterms(efo_cellline_og))
    term_to_linkedsup_terms.update(term_to_linked_superterms(cvcl_og))

    # Remove known incorrect mappings. Guard the lookup: the original indexed
    # term_to_linkedsup_terms["EFO:0003045"] directly and would raise a
    # KeyError if that term were ever absent from the merged map.
    linked = term_to_linkedsup_terms.get("EFO:0003045")
    if linked is not None and "CVCL:1240" in linked:
        linked.remove("CVCL:1240")

    with open("term_to_superterm_linked_terms.json", "w") as f:
        f.write(json.dumps(term_to_linkedsup_terms, indent=4, sort_keys=True,
                           separators=(',', ': ')))
def main():
    """Generate cell-line -> implied disease term mappings from the EFO and
    DOID disease ontologies and write them to
    cellline_to_disease_implied_terms.json."""
    doid_disease_og, x, y = load_ontology.load("2")
    efo_disease_og, x, y = load_ontology.load("3")
    efo_cellline_og, x, y = load_ontology.load("10")

    print("Generating cell line to disease implications...")
    term_to_implications = generate_implications(efo_disease_og, efo_cellline_og)
    temp = generate_implications(doid_disease_og, efo_cellline_og)
    # .items() replaces the Python-2-only .iteritems(); setdefault guards
    # against terms that appear in the DOID result but not the EFO result
    # (the original "+=" would raise KeyError on such terms unless the
    # result happened to be a defaultdict).
    for term, implied_terms in temp.items():
        term_to_implications.setdefault(term, []).extend(implied_terms)

    with open("cellline_to_disease_implied_terms.json", "w") as f:
        f.write(json.dumps(term_to_implications, indent=4,
                           separators=(',', ': '), sort_keys=True))
def efo_cvcl_syns():
    """
    Use the Cellosaurus to generate extra cell-line synonyms for the EFO.

    Returns a dict mapping term id -> list of newly discovered synonym
    strings for that term.
    """
    og, x, y = load_ontology.load("10")
    with open("../map_sra_to_ontology/synonym_sets/cvcl_syn_sets.json", "r") as f:
        # Builtin set replaces the long-deprecated sets.Set.
        syn_sets = [set(s) for s in json.load(f)]

    # Add synonyms
    mappable_terms = og.get_mappable_terms()
    total_terms = len(mappable_terms)
    term_id_to_syns = defaultdict(lambda: [])
    for c, term in enumerate(mappable_terms, start=1):
        print("Adding synonyms to term %d/%d with id %s" % (c, total_terms, term.id))
        # The strings naming this term are invariant across syn_sets, so
        # build the set once per term (the original rebuilt it per syn_set).
        current_term_strs = set(s.syn_str for s in term.synonyms)
        current_term_strs.add(term.name)
        # Track synonyms already recorded for this term instead of
        # rebuilding a set from the list on every candidate (was quadratic).
        # .get avoids materializing empty defaultdict entries.
        added = set(term_id_to_syns.get(term.id, []))
        for syn_set in syn_sets:
            # One set intersection replaces the per-string membership loop.
            if current_term_strs & syn_set:
                for syn in syn_set:
                    if syn not in current_term_strs and syn not in added:
                        print("Added synonym %s to term with name %s" % (
                            syn, term.name))
                        term_id_to_syns[term.id].append(syn)
                        added.add(syn)
    return term_id_to_syns
def main():
    """Collect ontology terms whose names mention carcinoma (candidates for
    synonym removal) and dump their names and synonyms to
    candidate_term_to_remove_synonyms.json."""
    og, x, y = load_ontology.load("13")
    # Any name containing "adenocarcinoma" necessarily contains "carcinoma",
    # so the original's second substring test was redundant. Iterate
    # .values() directly since the dict keys were unused.
    problematic_terms = [
        term for term in og.id_to_term.values() if "carcinoma" in term.name
    ]
    term_to_syns = {
        term.id: {
            "name": term.name,
            "synonyms": [s.syn_str for s in term.synonyms],
        }
        for term in problematic_terms
    }
    with open("candidate_term_to_remove_synonyms.json", "w") as f:
        f.write(json.dumps(term_to_syns, indent=4, sort_keys=True,
                           separators=(',', ': ')))
def uncaps_EFO_syns(): """ For all synonyms in the EFO, check if the first character is upper case (and only the first character). If so, convert to lower case. """ def uncap_str(r_str): tokens = r_str.split() new_tokens = [] for tok in tokens: if len(tok) == 1: new_tokens.append(tok) else: first_upper = tok[0].isupper() rest_lower = True for t in tok[1:]: if t.isupper(): rest_lower = False if first_upper and rest_lower: new_tokens.append(tok[0].lower() + tok[1:]) else: new_tokens.append(tok) return " ".join(new_tokens) og, x, y = load_ontology.load("9") term_id_to_syns = defaultdict(lambda: []) for term in og.id_to_term.values(): print "Looking at term %s" % term.id # Names of term ref_strs = [term.name] ref_strs += [x.syn_str for x in term.synonyms] ref_strs = Set(ref_strs) for r_str in ref_strs: new_str = uncap_str(r_str) if new_str not in ref_strs: print "Derived '%s' from '%s'" % (new_str, r_str) term_id_to_syns[term.id].append(new_str) return term_id_to_syns
def main():
    """Map each sample's key-value pairs to ontology terms with pipeline
    p_53 and print the combined per-sample results as JSON."""
    parser = OptionParser()
    (options, args) = parser.parse_args()
    input_f = args[0]

    # Key-value pairs describing each sample.
    with open(input_f, "r") as f:
        tag_to_vals = json.load(f)

    # Load ontologies
    ont_name_to_ont_id = {
        "UBERON": "12",
        "CL": "1",
        "DOID": "2",
        "EFO": "16",
        "CVCL": "4"
    }
    ont_id_to_og = {
        x: load_ontology.load(x)[0]
        for x in ont_name_to_ont_id.values()
    }

    pipeline = p_53()
    all_mappings = []
    for tag_to_val in tag_to_vals:
        # (The original rebuilt an unused sample_acc_to_matches dict on
        # every iteration; it is removed here.)
        mapped_terms, real_props = pipeline.run(tag_to_val)
        all_mappings.append({
            "mapped_terms": [x.to_dict() for x in mapped_terms],
            "real_value_properties": [x.to_dict() for x in real_props]
        })

    outputs = [
        run_pipeline_on_key_vals(tag_to_val, ont_id_to_og, mappings)
        for tag_to_val, mappings in zip(tag_to_vals, all_mappings)
    ]
    print(json.dumps(outputs, indent=4, separators=(',', ': ')))
def main():
    """Link equivalent terms across paired ontologies (cell type, disease,
    anatomy, cell line) in both directions and dump the combined map to
    term_to_linked_terms.json."""
    efo_celltype_og, x, y = load_ontology.load("11")
    cl_celltype_og, x, y = load_ontology.load("1")
    doid_disease_og, x, y = load_ontology.load("2")
    efo_disease_og, x, y = load_ontology.load("3")
    efo_anatomy_og, x, y = load_ontology.load("14")
    uberon_anatomy_og, x, y = load_ontology.load("5")
    efo_cellline_og, x, y = load_ontology.load("10")
    cvcl_og, x, y = load_ontology.load("4")

    term_to_linked_terms = {}
    # Each pair is linked in both directions; later updates win on
    # colliding keys, so the order below is preserved from the original.
    default_pairs = [
        (efo_celltype_og, cl_celltype_og),
        (efo_disease_og, doid_disease_og),
        (efo_anatomy_og, uberon_anatomy_og),
        (cl_celltype_og, efo_celltype_og),
        (doid_disease_og, efo_disease_og),
        (uberon_anatomy_og, efo_anatomy_og),
    ]
    for source_og, target_og in default_pairs:
        term_to_linked_terms.update(linked_terms(source_og, target_og))

    # Cell-line links additionally accept RELATED synonyms.
    cellline_pairs = [
        (efo_cellline_og, cvcl_og),
        (cvcl_og, efo_cellline_og),
    ]
    for source_og, target_og in cellline_pairs:
        term_to_linked_terms.update(
            linked_terms(source_og, target_og,
                         link_syn_types=["EXACT", "RELATED"]))

    with open("term_to_linked_terms.json", "w") as f:
        f.write(json.dumps(term_to_linked_terms, indent=4, sort_keys=True,
                           separators=(',', ': ')))
def main():
    """Read one or more JSON files of sample key-value pairs, map each
    sample to ontology terms with pipeline p_48, and write one JSON output
    file per input file (also echoed to the console)."""
    # add main args to parse
    parser = argparse.ArgumentParser(
        description='Arguments for run_pipeline.py')
    # required=True makes a default meaningless, so default=None is dropped.
    parser.add_argument("--fnvread", type=str, required=True,
                        help='Paths of files to read, separated with ";".')
    parser.add_argument("--fnvwrite", type=str, required=True,
                        help='Paths of files to write, separated with ";".')
    args = parser.parse_args()

    # start timer
    t1 = default_timer()
    print(str(args))

    # Summarize detected files. (The original split and reported the read
    # list twice and constructed a stray, unused OptionParser here; both
    # duplications are removed.)
    input_fl = args.fnvread.split(";")
    print("detected " + str(len(input_fl)) + " files to read")
    write_fl = args.fnvwrite.split(";")
    print("detected " + str(len(write_fl)) + " files to write")

    tagsl = []
    for infile in input_fl:
        with open(infile, "r") as opf:
            tagsl.append(json.load(opf))

    # ** do main ontology loads outside of main loop
    ont_name_to_ont_id = {
        "UBERON": "12",
        "CL": "1",
        "DOID": "2",
        "EFO": "16",
        "CVCL": "4"
    }
    ont_id_to_og = {
        x: load_ontology.load(x)[0]
        for x in ont_name_to_ont_id.values()
    }
    # Build the pipeline once; the original needlessly re-instantiated
    # p_48() inside the per-file loop as well.
    pipeline = p_48()

    # do main loop on samples (works for up to 500, then string too long)
    for ii, tag_to_vals in enumerate(tagsl):
        write_f = write_fl[ii]  # Designate file to write output to
        # Map key-value pairs to ontologies
        all_mappings = []
        for tag_to_val in tag_to_vals:
            # (Removed an unused sample_acc_to_matches dict rebuilt here.)
            mapped_terms, real_props = pipeline.run(tag_to_val)
            all_mappings.append({
                "mapped_terms": [x.to_dict() for x in mapped_terms],
                "real_value_properties": [x.to_dict() for x in real_props]
            })
        outputs = []
        for tag_to_val, mappings in zip(tag_to_vals, all_mappings):
            outputs.append(
                run_pipeline_on_key_vals(tag_to_val, ont_id_to_og, mappings))
        # use json.dump to stream json to outfile, where fn is second arg
        with open(write_f, 'w') as outfile:
            json.dump(obj=outputs, fp=outfile, indent=4,
                      separators=(',', ': '))
        # use json.dumps to return json output as a string, to console
        print(json.dumps(outputs, indent=4, separators=(',', ': ')))

    print("time elapsed = " + str(default_timer() - t1))
import json
from collections import defaultdict
import sqlite3

import map_sra_to_ontology
from map_sra_to_ontology import load_ontology

# Ontology prefix -> internal ontology id accepted by load_ontology.load().
ONT_NAME_TO_ONT_ID = {
    "UBERON": "12",
    "CL": "1",
    "DOID": "2",
    "EFO": "16",
    "CVCL": "4"
}

# Loaded at import time: ontology id -> ontology-graph object (the first
# element of the tuple returned by load_ontology.load()).
ONT_ID_TO_OG = {
    x: load_ontology.load(x)[0]
    for x in ONT_NAME_TO_ONT_ID.values()
}


def main():
    # NOTE(review): OptionParser is not imported in this chunk; presumably
    # `from optparse import OptionParser` appears elsewhere in the file --
    # confirm.
    parser = OptionParser()
    #parser.add_option("-a", "--a_descrip", action="store_true", help="This is a flat")
    #parser.add_option("-b", "--b_descrip", help="This is an argument")
    (options, args) = parser.parse_args()
    # Positional arguments: input mapping and prediction files, then the
    # JSON and SQLite output paths. The body that consumes them appears to
    # continue beyond this chunk.
    mappings_f = args[0]
    sample_type_predictions_f = args[1]
    out_json = args[2]
    out_sql = args[3]
"CL": "1", "DOID": "2", "EFO": "16", "CVCL": "4", "ORDO": "19", "UBERON_all": "5", "UO": "7", "EFO_all": "9" } # ont_id_to_og = {x: load_ontology.load(x)[0] # for x in list(ont_name_to_ont_id.values())} ont_id_to_og = {} for ont_name in list(ont_name_to_ont_id.keys()): sys.stderr.write('[{}] {}\n'.format(datetime.datetime.now(), ont_name)) id = ont_name_to_ont_id[ont_name] ont_id_to_og[id] = load_ontology.load(id)[0] pipeline = p_48(ont_id_to_og) del ont_name_to_ont_id["UBERON_all"], ont_name_to_ont_id[ "UO"], ont_name_to_ont_id["EFO_all"] del ont_id_to_og["5"], ont_id_to_og["7"], ont_id_to_og["9"] sys.stderr.write('[{}] dill dump\n'.format(datetime.datetime.now())) #dill.dump_session('pipeline_init.dill') # with open("pipeline_init.dill", "wb") as f: # dill.dump((pipeline, ont_id_to_og), f) with open("pipeline_init.pickle", "wb") as f: pickle.dump((pipeline, ont_id_to_og), f) sys.stderr.write('[{}] Done.\n'.format(datetime.datetime.now()))