def script(path, out):
    """Rank OMIM diseases against each patient's HPO phenotype list.

    For every '*_hpo.txt' file in `path`, computes an un-normalized
    similarity score against every disease in the OMIM annotation table
    and writes the 20 best-scoring diseases to '<patient>.results' in
    `out` (tab-separated: constant '1.000', score, 'OMIM:<code>').

    Args:
        path: directory of per-patient '<name>_hpo.txt' files, each
            holding one comma-separated line of HPO terms.
        out: directory that receives one '<name>.results' file per patient.
    """
    # Close input files deterministically instead of leaking handles.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') as anno:
        omim_dict = get_hp_ic.parse_anno(anno)
    with open('ic/ic_parent.txt') as ic_f:
        ic_dict = get_hp_ic.load_ic(ic_f)
    patients = [f for f in os.listdir(path) if f.endswith('_hpo.txt')]
    t1 = time.time()
    for i, p in enumerate(patients):
        print(i, time.time() - t1)
        try:
            with open(os.path.join(path, p)) as pf:
                # First line is a comma-separated HPO term list; an empty
                # file makes readlines() return [] -> IndexError -> skip.
                phenotypes = pf.readlines()[0].split(',')
        except IndexError:
            continue
        # The patient Disease object is loop-invariant; build it once
        # instead of once per OMIM disease.
        patient_dis = omim.Disease(None, None, None, phenotypes)
        omim_scores = []
        for o, phe in omim_dict.items():
            dist = get_hp_ic.calc_dis_sim_no_norm(
                omim.Disease(None, None, None, phe), patient_dis, ic_dict)
            omim_scores.append((o, dist))
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join(['1.000', str(line[1]), 'OMIM:' + line[0]]) + '\n')
def script(path, ic, out):
    """Score every OMIM disease against each patient phenotype file.

    For every '*.txt' file in `path`, computes the simGIC similarity
    against every disease in the OMIM annotation table and writes the 20
    best-scoring diseases to '<patient>.results' in `out`.  Phenotypes
    missing from the IC table are counted and reported at the end.

    Args:
        path: directory of per-patient '.txt' files, each holding one
            comma-separated line of HPO terms.
        ic: path to the information-content table consumed by load_ic.
        out: directory receiving one '<patient>.results' file per patient.
    """
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') as anno:
        omim_dict = get_hp_ic.parse_anno(anno)
    with open(ic) as ic_f:
        ic_dict = get_hp_ic.load_ic(ic_f)
    patients = [f for f in os.listdir(path) if f.endswith('.txt')]
    fail_counter = 0
    failed_keys = set()
    for i, p in enumerate(patients):
        if i % 10 == 0:
            print(i)
        try:
            with open(os.path.join(path, p)) as pf:
                # First line is a comma-separated HPO term list; an empty
                # file raises IndexError and the patient is skipped.
                phenotypes = pf.readlines()[0].split(',')
        except IndexError:
            continue
        # Loop-invariant patient Disease: build once per patient.
        patient_dis = omim.Disease(None, None, None, phenotypes)
        omim_scores = []
        for o, phe in omim_dict.items():
            try:
                sim = get_hp_ic.calc_simgic(
                    omim.Disease(None, None, None, phe), patient_dis, ic_dict)
            except KeyError as e:
                fail_counter += 1
                failed_keys.add(e.args[0])
                # BUG FIX: the original fell through and appended a stale
                # `sim` from a previous disease (or raised NameError on
                # the very first failure).  Skip this disease instead.
                continue
            omim_scores.append((o, sim))
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join(['1.000', str(line[1]), line[0]]) + '\n')
    print("Number of occurences of missing phenotypes: " + str(fail_counter) + '\n')
    print("Number of unique phenos failing: " + str(len(failed_keys)) + '\n')
    print(failed_keys)
def script(path, ic, out, pvals, normalized):
    """Rank OMIM diseases per patient using simGIC plus empirical p-values.

    For every '*.txt' file in `path`, scores each disease present in the
    p-value table and writes the 20 best to '<patient>.results' in `out`
    (tab-separated: p-value, simGIC score, disease code).

    Args:
        path: directory of per-patient '.txt' phenotype files.
        ic: path to the information-content table.
        out: output directory for '<patient>.results' files.
        pvals: p-value table file consumed by load_pvals.
        normalized: command-line string 'True', 'False' or 'half',
            mapped onto calc_simgic's `norm` option.
    """
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') as anno:
        omim_dict = get_hp_ic.parse_anno(anno)
    # Map the command-line string onto the value calc_simgic expects;
    # 'half' passes through unchanged.
    norm_options = {'half': 'half', 'True': True, 'False': False}
    if normalized not in norm_options:
        print('Bad option {}'.format(normalized))
        sys.exit()
    normalized = norm_options[normalized]
    with open(ic) as ic_f:
        ic_dict = get_hp_ic.load_ic(ic_f)
    patients = [f for f in os.listdir(path) if f.endswith('.txt')]
    (pval_table, pval_dict) = load_pvals(pvals)
    fail_counter = 0
    failed_keys = set()
    t1 = time.time()
    for i, p in enumerate(patients):
        if i % 10 == 0:
            # BUG FIX: report elapsed time since t1 (as the sibling
            # variants do) rather than the absolute wall-clock timestamp.
            print(i, time.time() - t1)
        try:
            with open(os.path.join(path, p)) as pf:
                phenotypes = pf.readlines()[0].split(',')
        except IndexError:
            continue
        omim_scores = []
        for o, phe in omim_dict.items():
            # Only diseases with an empirical p-value distribution are scored.
            if o not in pval_dict:
                continue
            if len(phenotypes) == 0:
                (pval, sim, final_score) = (1.0, 0, 0)
            else:
                try:
                    # Distribution row is indexed by query size, capped at 10.
                    pval_list = pval_table[pval_dict[o]][min(len(phenotypes) - 1, 9)]
                    # norm=True gives intersection/union, norm=False a raw count.
                    sim = get_hp_ic.calc_simgic(
                        omim.Disease(None, None, None, phenotypes),
                        omim.Disease(None, None, None, phe),
                        ic_dict, ancestors=True, norm=normalized)
                    pval = get_hp_ic.get_p_value(sim, pval_list) * 0.5
                    # p-value dominates the ranking; sim only breaks ties.
                    final_score = (1 - pval) * 1000000000000 + sim
                except KeyError as e:
                    fail_counter += 1
                    failed_keys.add(e.args[0])
                    # BUG FIX: the original fell through and appended
                    # stale (or unbound) pval/sim/final_score from a
                    # previous iteration.  Skip this disease instead.
                    continue
            omim_scores.append((o, final_score, pval, sim))
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join([str(line[2]), str(line[3]), line[0]]) + '\n')
    print("Number of occurences of missing phenotypes: " + str(fail_counter) + '\n')
    print("Number of unique phenos failing: " + str(len(failed_keys)) + '\n')
    print(failed_keys)
def script(path, ic, out, pvals):
    """Rank OMIM diseases per patient using simGIC plus empirical p-values.

    Like the normalized variant, but with calc_simgic's default options
    and a 0.1 p-value scaling factor.  Writes the 20 best-scoring
    diseases per patient to '<patient>.results' in `out`
    (tab-separated: p-value, simGIC score, disease code).

    Args:
        path: directory of per-patient '.txt' phenotype files.
        ic: path to the information-content table.
        out: output directory for '<patient>.results' files.
        pvals: p-value table file consumed by load_pvals.
    """
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') as anno:
        omim_dict = get_hp_ic.parse_anno(anno)
    with open(ic) as ic_f:
        ic_dict = get_hp_ic.load_ic(ic_f)
    patients = [f for f in os.listdir(path) if f.endswith('.txt')]
    (pval_table, pval_dict) = load_pvals(pvals)
    fail_counter = 0
    failed_keys = set()
    t1 = time.time()
    for i, p in enumerate(patients):
        if i % 10 == 0:
            print(i, time.time() - t1)
        try:
            with open(os.path.join(path, p)) as pf:
                phenotypes = pf.readlines()[0].split(',')
        except IndexError:
            continue
        omim_scores = []
        for o, phe in omim_dict.items():
            # Only diseases with an empirical p-value distribution are scored.
            if o not in pval_dict:
                continue
            if len(phenotypes) == 0:
                (pval, sim, final_score) = (1.0, 0, 0)
            else:
                # Distribution row is indexed by query size, capped at 10.
                pval_list = pval_table[pval_dict[o]][min(len(phenotypes) - 1, 9)]
                try:
                    sim = get_hp_ic.calc_simgic(
                        omim.Disease(None, None, None, phe),
                        omim.Disease(None, None, None, phenotypes), ic_dict)
                    pval = get_hp_ic.get_p_value(sim, pval_list) * 0.1
                    # p-value dominates the ranking; sim only breaks ties.
                    final_score = (1 - pval) * 1000000000000 + sim
                except KeyError as e:
                    fail_counter += 1
                    failed_keys.add(e.args[0])
                    # BUG FIX: the original fell through and appended
                    # stale (or unbound) pval/sim/final_score from a
                    # previous iteration.  Skip this disease instead.
                    continue
            omim_scores.append((o, final_score, pval, sim))
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join([str(line[2]), str(line[3]), line[0]]) + '\n')
    print("Number of occurences of missing phenotypes: " + str(fail_counter) + '\n')
    print("Number of unique phenos failing: " + str(len(failed_keys)) + '\n')
    print(failed_keys)
def script(path, out):
    """Rank OMIM diseases against each patient's HPO phenotype list.

    Duplicate of the '_hpo.txt' variant above: for every '*_hpo.txt'
    file in `path`, scores every OMIM disease with the un-normalized
    similarity and writes the top 20 to '<patient>.results' in `out`.

    Args:
        path: directory of per-patient '<name>_hpo.txt' files, each
            holding one comma-separated line of HPO terms.
        out: directory that receives one '<name>.results' file per patient.
    """
    # Close input files deterministically instead of leaking handles.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') as anno:
        omim_dict = get_hp_ic.parse_anno(anno)
    with open('ic/ic_parent.txt') as ic_f:
        ic_dict = get_hp_ic.load_ic(ic_f)
    patients = [f for f in os.listdir(path) if f.endswith('_hpo.txt')]
    t1 = time.time()
    for i, p in enumerate(patients):
        print(i, time.time() - t1)
        try:
            with open(os.path.join(path, p)) as pf:
                # Empty file -> readlines() is [] -> IndexError -> skip.
                phenotypes = pf.readlines()[0].split(',')
        except IndexError:
            continue
        # Loop-invariant patient Disease: build once per patient.
        patient_dis = omim.Disease(None, None, None, phenotypes)
        omim_scores = []
        for o, phe in omim_dict.items():
            dist = get_hp_ic.calc_dis_sim_no_norm(
                omim.Disease(None, None, None, phe), patient_dis, ic_dict)
            omim_scores.append((o, dist))
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join(['1.000', str(line[1]), 'OMIM:' + line[0]]) + '\n')
# Empirical score-distribution sampler: draws random HPO phenotype queries
# of sizes 1..10 and scores them against a single target disease, to build
# a null distribution of simGIC scores.
import sys
import get_hp_ic
import random
import omim
import time
import hpo_lib

if __name__ == '__main__':
    dis_code = sys.argv[1]  # OMIM code of the target disease
    outdir = sys.argv[2]    # output directory (unused in the visible code)
    ic_file = sys.argv[3]   # information-content table path
    dis_dict = get_hp_ic.parse_anno(
        open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab', encoding='utf-8'))
    # Pool of phenotypes to sample from, restricted to HP codes that
    # resolve via hpo_lib (hp[3:] strips the leading 'HP:' prefix).
    pheno_dist = []
    for d in dis_dict:
        valid = filter(lambda hp: hpo_lib.search_code(hp[3:]) != None, dis_dict[d])
        pheno_dist += valid
    dis_phenos = dis_dict[dis_code]
    dis = omim.Disease(None, None, None, dis_phenos)
    # Bind hot-loop callables to local names.
    score_fn = get_hp_ic.calc_simgic
    random_fn = random.choice
    omim_cons = omim.Disease
    ic_dict = get_hp_ic.load_ic(open(ic_file))
    t1 = time.time()
    # 100k random queries per query size; progress printed every 1000 trials.
    for query_size in range(1, 11):
        scores = []
        for trial in range(100000):
            if trial % 1000 == 0:
                print(trial, time.time() - t1)
            # NOTE(review): this chunk appears truncated — `query` is built
            # but never scored or written out in the visible code (and
            # `scores`/`outdir`/`score_fn`/`omim_cons` go unused); confirm
            # against the full file before changing anything here.
            query = [random_fn(pheno_dist) for i in range(query_size)]
import sys import get_hp_ic import random import omim import time if __name__ == '__main__': dis_code = sys.argv[1] outdir = sys.argv[2] ic_file = sys.argv[3] dis_dict = get_hp_ic.parse_anno(open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab', encoding='utf-8')) pheno_dist = [] for d in dis_dict: pheno_dist += dis_dict[d] dis_phenos = dis_dict[dis_code] dis = omim.Disease(None, None, None, dis_phenos) score_fn = get_hp_ic.calc_simgic random_fn = random.choice omim_cons = omim.Disease ic_dict = get_hp_ic.load_ic(open(ic_file)) t1 = time.time() for query_size in range(1, 11): scores = [] for trial in range(100000): if trial % 1000 == 0: print(trial, time.time() - t1) query = [random_fn(pheno_dist) for i in range(query_size)] score = score_fn(omim_cons(None, None, None, query), dis, ic_dict) scores.append(score) scores.sort() fout = open('{}/{}.txt'.format(outdir, str(query_size)), 'w') for score in scores: