def script(path, out):
    """Rank diseases against every patient phenotype file found in ``path``.

    For each file in ``path`` whose name ends with ``_hpo.txt`` the first
    line is read as a comma-separated list of phenotype terms.  Every entry
    returned by ``get_hp_ic.parse_anno`` is scored against the patient with
    ``get_hp_ic.calc_dis_sim_no_norm``; the 20 highest-scoring entries are
    written to ``<out>/<patient file name>.results`` as tab-separated lines
    holding the constant ``1.000``, the score and ``OMIM:<disease key>``.

    Patient files that are empty, or whose first line contains no terms,
    are skipped and produce no results file.

    Args:
        path: Directory containing the patient ``*_hpo.txt`` files.
        out: Directory that receives the ``.results`` files.
    """
    # NOTE(review): closing the handles right after the call assumes
    # parse_anno/load_ic consume the file before returning -- confirm.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') \
            as anno_file:
        omim_dict = get_hp_ic.parse_anno(anno_file)
    with open('ic/ic_parent.txt') as ic_file:
        ic_dict = get_hp_ic.load_ic(ic_file)

    patients = [f for f in os.listdir(path) if f.endswith('_hpo.txt')]
    t1 = time.time()
    for i, p in enumerate(patients):
        # Progress: patient index and seconds elapsed since the start.
        print(i, time.time() - t1)

        # Only the first line is used, so read just that line.
        with open(os.path.join(path, p)) as patient_file:
            first_line = patient_file.readline()
        # Strip each term: the raw line ends with a newline that would
        # otherwise stay glued to the last phenotype term.
        phenotypes = [t.strip() for t in first_line.split(',') if t.strip()]
        if not phenotypes:
            # Empty file or blank first line: nothing to score.
            continue

        omim_scores = []
        for o, phe in omim_dict.items():
            dist = get_hp_ic.calc_dis_sim_no_norm(
                omim.Disease(None, None, None, phe),
                omim.Disease(None, None, None, phenotypes), ic_dict)
            omim_scores.append((o, dist))

        # Highest score first; keep the top 20.
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for disease, score in omim_scores[:20]:
                f.write('\t'.join(['1.000', str(score),
                                   'OMIM:' + disease]) + '\n')
Exemple #2
0
def script(path, ic, out):
    """Rank diseases against every patient phenotype file using simGIC.

    For each ``*.txt`` file in ``path`` the first line is read as a
    comma-separated list of phenotype terms.  Every entry returned by
    ``get_hp_ic.parse_anno`` is scored against the patient with
    ``get_hp_ic.calc_simgic``; the 20 highest-scoring entries are written to
    ``<out>/<patient file name>.results`` as tab-separated lines holding the
    constant ``1.000``, the score and the disease key.

    A ``KeyError`` raised while scoring a disease is counted, its key is
    remembered, and that disease is left out of the patient's ranking.
    A summary of these failures is printed at the end.

    Patient files that are empty, or whose first line contains no terms,
    are skipped and produce no results file.

    Args:
        path: Directory containing the patient ``*.txt`` files.
        ic: Path of the file handed to ``get_hp_ic.load_ic``.
        out: Directory that receives the ``.results`` files.
    """
    # NOTE(review): closing the handles right after the call assumes
    # parse_anno/load_ic consume the file before returning -- confirm.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') \
            as anno_file:
        omim_dict = get_hp_ic.parse_anno(anno_file)
    with open(ic) as ic_file:
        ic_dict = get_hp_ic.load_ic(ic_file)

    patients = [f for f in os.listdir(path) if f.endswith('.txt')]

    fail_counter = 0
    failed_keys = set()
    for i, p in enumerate(patients):
        if i % 10 == 0:
            print(i)

        # Only the first line is used, so read just that line.
        with open(os.path.join(path, p)) as patient_file:
            first_line = patient_file.readline()
        # Strip each term: the raw line ends with a newline that would
        # otherwise stay glued to the last phenotype term.
        phenotypes = [t.strip() for t in first_line.split(',') if t.strip()]
        if not phenotypes:
            # Empty file or blank first line: nothing to score.
            continue

        omim_scores = []
        for o, phe in omim_dict.items():
            try:
                sim = get_hp_ic.calc_simgic(
                    omim.Disease(None, None, None, phe),
                    omim.Disease(None, None, None, phenotypes), ic_dict)
            except KeyError as e:
                fail_counter += 1
                failed_keys.add(e.args[0])
                # No score exists for this disease.  Appending here would
                # reuse the previous disease's score (or raise NameError on
                # the very first one), so skip it instead.
                continue
            omim_scores.append((o, sim))

        # Highest score first; keep the top 20.
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for disease, score in omim_scores[:20]:
                f.write('\t'.join(['1.000', str(score), disease]) + '\n')

    print("Number of occurences of missing phenotypes: " + str(fail_counter) + '\n')
    print("Number of unique phenos failing: " + str(len(failed_keys)) + '\n')
    print(failed_keys)
def script(path, ic, out, pvals, normalized):
    """Rank diseases per patient by p-value, then by simGIC similarity.

    For each ``*.txt`` file in ``path`` the first line is read as a
    comma-separated list of phenotype terms.  Every disease that is present
    in both the annotation and the p-value index is scored with
    ``get_hp_ic.calc_simgic`` (``ancestors=True``, ``norm=normalized``) and
    converted to a p-value with ``get_hp_ic.get_p_value``.  The 20 entries
    with the highest combined score are written to
    ``<out>/<patient file name>.results`` as tab-separated lines holding
    the p-value, the similarity and the disease key.

    A ``KeyError`` raised while scoring a disease is counted, its key is
    remembered, and that disease is left out of the patient's ranking.
    A summary of these failures is printed at the end.

    Args:
        path: Directory containing the patient ``*.txt`` files.
        ic: Path of the file handed to ``get_hp_ic.load_ic``.
        out: Directory that receives the ``.results`` files.
        pvals: Argument forwarded to ``load_pvals``.
        normalized: One of the strings ``'half'``, ``'True'`` or
            ``'False'``; the latter two are converted to booleans before
            being passed as ``norm``.

    Raises:
        SystemExit: If ``normalized`` is not one of the accepted strings.
    """
    # Validate the option before doing any file work.
    if normalized == 'True':
        normalized = True
    elif normalized == 'False':
        normalized = False
    elif normalized != 'half':
        print('Bad option {}'.format(normalized))
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)

    # NOTE(review): closing the handles right after the call assumes
    # parse_anno/load_ic consume the file before returning -- confirm.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') \
            as anno_file:
        omim_dict = get_hp_ic.parse_anno(anno_file)
    with open(ic) as ic_file:
        ic_dict = get_hp_ic.load_ic(ic_file)

    patients = [f for f in os.listdir(path) if f.endswith('.txt')]

    (pval_table, pval_dict) = load_pvals(pvals)

    fail_counter = 0
    failed_keys = set()
    t1 = time.time()
    for i, p in enumerate(patients):
        if i % 10 == 0:
            # Progress: patient index and seconds elapsed since the start.
            print(i, time.time() - t1)

        # Only the first line is used, so read just that line.
        with open(os.path.join(path, p)) as patient_file:
            first_line = patient_file.readline()
        if not first_line:
            # Completely empty file: skip the patient.
            continue
        # Strip each term: the raw line ends with a newline that would
        # otherwise stay glued to the last phenotype term.  Dropping empty
        # terms is what makes the "no phenotypes" branch below reachable.
        phenotypes = [t.strip() for t in first_line.split(',') if t.strip()]

        omim_scores = []
        for o, phe in omim_dict.items():
            if o not in pval_dict:
                continue
            if not phenotypes:
                (pval, sim, final_score) = (1.0, 0, 0)
            else:
                try:
                    # Column index is (number of phenotypes - 1), capped at 9.
                    pval_list = pval_table[pval_dict[o]][min(len(phenotypes) - 1, 9)]
                    # norm True will give intersection/union, norm False
                    # just gives straight count
                    sim = get_hp_ic.calc_simgic(
                        omim.Disease(None, None, None, phenotypes),
                        omim.Disease(None, None, None, phe),
                        ic_dict, ancestors=True, norm=normalized)
                    # NOTE(review): the 0.5 factor is not explained here.
                    pval = get_hp_ic.get_p_value(sim, pval_list) * 0.5
                except KeyError as e:
                    fail_counter += 1
                    failed_keys.add(e.args[0])
                    # No score exists for this disease.  Appending here
                    # would reuse the previous disease's values (or raise
                    # NameError on the very first one), so skip it instead.
                    continue
                # The large multiplier makes the p-value term dominate the
                # sort key; sim is added on top of it.
                final_score = (1 - pval) * 1000000000000 + sim

            omim_scores.append((o, final_score, pval, sim))

        # Highest combined score first; keep the top 20.
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join([str(line[2]), str(line[3]), line[0]]) + '\n')

    print("Number of occurences of missing phenotypes: " + str(fail_counter) + '\n')
    print("Number of unique phenos failing: " + str(len(failed_keys)) + '\n')
    print(failed_keys)
Exemple #4
0
def script(path, ic, out, pvals):
    """Rank diseases per patient by p-value, then by simGIC similarity.

    For each ``*.txt`` file in ``path`` the first line is read as a
    comma-separated list of phenotype terms.  Every disease that is present
    in both the annotation and the p-value index is scored with
    ``get_hp_ic.calc_simgic`` and converted to a p-value with
    ``get_hp_ic.get_p_value``.  The 20 entries with the highest combined
    score are written to ``<out>/<patient file name>.results`` as
    tab-separated lines holding the p-value, the similarity and the
    disease key.

    A ``KeyError`` raised while scoring a disease is counted, its key is
    remembered, and that disease is left out of the patient's ranking.
    A summary of these failures is printed at the end.

    Args:
        path: Directory containing the patient ``*.txt`` files.
        ic: Path of the file handed to ``get_hp_ic.load_ic``.
        out: Directory that receives the ``.results`` files.
        pvals: Argument forwarded to ``load_pvals``.
    """
    # NOTE(review): closing the handles right after the call assumes
    # parse_anno/load_ic consume the file before returning -- confirm.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') \
            as anno_file:
        omim_dict = get_hp_ic.parse_anno(anno_file)
    with open(ic) as ic_file:
        ic_dict = get_hp_ic.load_ic(ic_file)

    patients = [f for f in os.listdir(path) if f.endswith('.txt')]

    (pval_table, pval_dict) = load_pvals(pvals)

    fail_counter = 0
    failed_keys = set()
    t1 = time.time()
    for i, p in enumerate(patients):
        if i % 10 == 0:
            # Progress: patient index and seconds elapsed since the start.
            print(i, time.time() - t1)

        # Only the first line is used, so read just that line.
        with open(os.path.join(path, p)) as patient_file:
            first_line = patient_file.readline()
        if not first_line:
            # Completely empty file: skip the patient.
            continue
        # Strip each term: the raw line ends with a newline that would
        # otherwise stay glued to the last phenotype term.  Dropping empty
        # terms is what makes the "no phenotypes" branch below reachable.
        phenotypes = [t.strip() for t in first_line.split(',') if t.strip()]

        omim_scores = []
        for o, phe in omim_dict.items():
            if o not in pval_dict:
                continue
            if not phenotypes:
                (pval, sim, final_score) = (1.0, 0, 0)
            else:
                # Column index is (number of phenotypes - 1), capped at 9.
                pval_list = pval_table[pval_dict[o]][min(len(phenotypes) - 1, 9)]
                try:
                    sim = get_hp_ic.calc_simgic(
                        omim.Disease(None, None, None, phe),
                        omim.Disease(None, None, None, phenotypes), ic_dict)
                    # NOTE(review): the 0.1 factor is not explained here.
                    pval = get_hp_ic.get_p_value(sim, pval_list) * 0.1
                except KeyError as e:
                    fail_counter += 1
                    failed_keys.add(e.args[0])
                    # No score exists for this disease.  Appending here
                    # would reuse the previous disease's values (or raise
                    # NameError on the very first one), so skip it instead.
                    continue
                # The large multiplier makes the p-value term dominate the
                # sort key; sim is added on top of it.
                final_score = (1 - pval) * 1000000000000 + sim

            omim_scores.append((o, final_score, pval, sim))

        # Highest combined score first; keep the top 20.
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for line in omim_scores[:20]:
                f.write('\t'.join([str(line[2]), str(line[3]), line[0]]) + '\n')

    print("Number of occurences of missing phenotypes: " + str(fail_counter) + '\n')
    print("Number of unique phenos failing: " + str(len(failed_keys)) + '\n')
    print(failed_keys)
def script(path, out):
    """Score each patient's phenotypes against every annotated disease.

    Patient files are the entries of ``path`` whose names end with
    ``_hpo.txt``; the first line of each is a comma-separated list of
    phenotype terms.  Each disease from ``get_hp_ic.parse_anno`` is compared
    with the patient through ``get_hp_ic.calc_dis_sim_no_norm``, and the 20
    best-scoring diseases are written to
    ``<out>/<patient file name>.results``, one per line, as
    ``1.000<TAB>score<TAB>OMIM:<disease key>``.

    Patient files that are empty, or whose first line contains no terms,
    are skipped and produce no results file.

    Args:
        path: Directory containing the patient ``*_hpo.txt`` files.
        out: Directory that receives the ``.results`` files.
    """
    # NOTE(review): closing the handles right after the call assumes
    # parse_anno/load_ic consume the file before returning -- confirm.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab') \
            as anno_file:
        omim_dict = get_hp_ic.parse_anno(anno_file)
    with open('ic/ic_parent.txt') as ic_file:
        ic_dict = get_hp_ic.load_ic(ic_file)

    patients = [f for f in os.listdir(path) if f.endswith('_hpo.txt')]
    t1 = time.time()
    for i, p in enumerate(patients):
        # Progress: patient index and seconds elapsed since the start.
        print(i, time.time() - t1)

        # Only the first line is used, so read just that line.
        with open(os.path.join(path, p)) as patient_file:
            first_line = patient_file.readline()
        # Strip each term: the raw line ends with a newline that would
        # otherwise stay glued to the last phenotype term.
        phenotypes = [t.strip() for t in first_line.split(',') if t.strip()]
        if not phenotypes:
            # Empty file or blank first line: nothing to score.
            continue

        omim_scores = [
            (o, get_hp_ic.calc_dis_sim_no_norm(
                omim.Disease(None, None, None, phe),
                omim.Disease(None, None, None, phenotypes), ic_dict))
            for o, phe in omim_dict.items()
        ]

        # Highest score first; keep the top 20.
        omim_scores.sort(key=lambda x: x[1], reverse=True)
        with open(os.path.join(out, p + '.results'), 'w') as f:
            for disease, score in omim_scores[:20]:
                f.write('\t'.join(['1.000', str(score),
                                   'OMIM:' + disease]) + '\n')
Exemple #6
0
import sys
import get_hp_ic
import random
import omim
import time
import hpo_lib

if __name__ == '__main__':
    # Command line: <disease code> <output directory> <IC file>.
    dis_code = sys.argv[1]
    outdir = sys.argv[2]
    ic_file = sys.argv[3]

    # NOTE(review): closing the handle right after the call assumes
    # parse_anno consumes the file before returning -- confirm.
    with open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab',
              encoding='utf-8') as anno_handle:
        dis_dict = get_hp_ic.parse_anno(anno_handle)

    # Pool of phenotype terms to sample random queries from.  A term is
    # added once per disease that lists it, and only if
    # hpo_lib.search_code finds its code (the term minus its first three
    # characters -- presumably the "HP:" prefix; verify).
    pheno_dist = []
    for d in dis_dict:
        pheno_dist.extend(
            hp for hp in dis_dict[d]
            if hpo_lib.search_code(hp[3:]) is not None)

    dis_phenos = dis_dict[dis_code]
    dis = omim.Disease(None, None, None, dis_phenos)
    # Local aliases for the callables used inside the sampling loop.
    score_fn = get_hp_ic.calc_simgic
    random_fn = random.choice
    omim_cons = omim.Disease
    # NOTE(review): same assumption as above for load_ic.
    with open(ic_file) as ic_handle:
        ic_dict = get_hp_ic.load_ic(ic_handle)

    t1 = time.time()
    # Query sizes 1..10, 100000 random queries for each size.
    for query_size in range(1, 11):
        scores = []
        for trial in range(100000):
            if trial % 1000 == 0:
                # Progress: trial index and seconds elapsed since the start.
                print(trial, time.time() - t1)
            # Sample with replacement from the phenotype pool.
            query = [random_fn(pheno_dist) for i in range(query_size)]
import sys
import get_hp_ic
import random
import omim
import time

if __name__ == '__main__':
    dis_code = sys.argv[1]
    outdir = sys.argv[2]
    ic_file = sys.argv[3]
    dis_dict = get_hp_ic.parse_anno(open('/dupa-filer/talf/diff-diagnosis/phenotype_annotation.tab', encoding='utf-8'))
    pheno_dist = []
    for d in dis_dict:
        pheno_dist += dis_dict[d]
    dis_phenos = dis_dict[dis_code]
    dis = omim.Disease(None, None, None, dis_phenos)
    score_fn = get_hp_ic.calc_simgic
    random_fn = random.choice
    omim_cons = omim.Disease
    ic_dict = get_hp_ic.load_ic(open(ic_file))
    t1 = time.time()
    for query_size in range(1, 11):
        scores = []
        for trial in range(100000):
            if trial % 1000 == 0: print(trial, time.time() - t1)
            query = [random_fn(pheno_dist) for i in range(query_size)]
            score = score_fn(omim_cons(None, None, None, query), dis, ic_dict)
            scores.append(score)
        scores.sort()
        fout = open('{}/{}.txt'.format(outdir, str(query_size)), 'w')
        for score in scores: