Esempio n. 1
0
def get_mica_ic(pheno_a: str, pheno_b: str, graph: Graph,
                ic_map: Dict[str, float], root) -> float:
    """
    Return the highest information content shared by two classes.

    The ``subClassOf`` closure of each class is fetched with
    ``owl_utils.get_closure``; the result is the largest ``ic_map`` value
    among the terms present in both closures (the IC of the most
    informative common ancestor, MICA).

    :param pheno_a: identifier of the first class
    :param pheno_b: identifier of the second class
    :param graph: graph handed to ``owl_utils.get_closure``
    :param ic_map: information content keyed by class identifier
    :param root: root term handed to ``owl_utils.get_closure``
    :return: float - maximum IC over the shared closure terms
    :raises ValueError: if the two closures have no term in common
    :raises KeyError: if a shared term is missing from ``ic_map``
    """
    sub_class_of = RDFS['subClassOf']
    closure_a = owl_utils.get_closure(graph, pheno_a, sub_class_of, root)
    closure_b = owl_utils.get_closure(graph, pheno_b, sub_class_of, root)
    shared_terms = closure_a.intersection(closure_b)
    return max(ic_map[term] for term in shared_terms)
def load_map_from_file(file_path: str) -> Dict[str, Set[str]]:
    """
    Read a tab-separated disease/phenotype file into a profile map.

    Only lines starting with ``MONDO`` are used; the first two columns are
    taken as the disease and the phenotype.  Each disease is mapped to the
    phenotype itself plus every term that ``get_closure`` returns for it
    on the module-level ``hpo`` graph (root ``HP:0000118``,
    ``reflexive=False``).

    :param file_path: path of the annotation file
    :return: Dict[str, Set[str]] - phenotype terms keyed by disease id
    """
    profiles: Dict[str, Set[str]] = {}
    with open(file_path, 'r') as handle:
        for record in handle:
            # A line starting with '#' can never start with 'MONDO', so
            # this single test also skips comment lines.
            if not record.startswith('MONDO'):
                continue
            columns = record.rstrip("\n").split("\t")
            disease, phenotype = columns[0:2]
            terms = profiles.setdefault(disease, set())
            terms.add(phenotype)
            terms.update(
                get_closure(hpo,
                            phenotype,
                            root='HP:0000118',
                            reflexive=False))
    return profiles
Esempio n. 3
0
def simulate_from_derived(
        pheno_profile: Set[str],
        pheno_subset: Set[str],
        graph: Graph,
        root: str,
        ic_values: Dict[str, float],
        filter_out: Set[str],
        ref_disease: Optional[str]=None) -> FrozenSet[str]:
    """
    Add imprecision and noise to profile
    20% omit phenotype - omissions
    10% use closest parent - imprecision
    30% add random phenotype - noise, min 1

    The input sets are not modified; all work happens on copies.

    :param pheno_profile: phenotype curies of the profile to perturb
    :param pheno_subset: terms allowed as imprecise replacements and as
                         noise
    :param graph: graph handed to ``get_closure``
    :param root: root term handed to ``get_closure``
    :param ic_values: information content keyed by phenotype curie
    :param filter_out: terms that must never be added as noise
    :param ref_disease: only used in log messages
    :return: FrozenSet[str] - set of phenotype curies
    """
    omission_rate = .2  # .4 for gold, .2 for derived
    imprecision_rate = .1  # .3 for gold, .1 for derived
    noise_rate = .3

    phenotypes = list(pheno_profile)
    profile_size = len(phenotypes)

    # Remove x percent of phenotypes
    count_to_remove = round(profile_size * omission_rate)
    phenotypes = random.sample(phenotypes, profile_size - count_to_remove)

    # mutate x percent to closest parent
    count_to_mutate = round(profile_size * imprecision_rate)
    random.shuffle(phenotypes)
    counter = 0
    for idx, pheno in enumerate(phenotypes):
        if counter == count_to_mutate:
            break
        # NOTE(review): the trailing ``False`` is presumably the
        # ``reflexive`` flag (exclude ``pheno`` itself) - confirm against
        # get_closure's signature.
        parents = get_closure(graph, pheno, RDFS['subClassOf'], root, False)
        # Candidate replacements: closure terms from the allowed subset
        # that are neither in the source profile nor already selected.
        lay_overlap = parents.intersection(pheno_subset).difference(
            pheno_profile, phenotypes)
        if not lay_overlap:
            continue
        max_ic = max(ic_values[parent] for parent in lay_overlap)
        mica = ''
        # On ties the last term in iteration order wins.
        for phen in lay_overlap:
            if ic_values[phen] == max_ic:
                mica = phen

        phenotypes[idx] = mica
        counter += 1

    if counter != count_to_mutate:
        logging.info("Could not mutate profile derived from {}".format(ref_disease))

    # add random phenotype(s)
    # Filter out phenotypes from filter_out set
    phenos_to_select = pheno_subset.difference(filter_out, phenotypes, pheno_profile)

    if not phenos_to_select:
        logging.warning("No phenotypes to select for "
                        "profile derived from {}".format(ref_disease))
    comissions = round(profile_size * noise_rate)
    noise_count = 1 if comissions == 0 else comissions

    # Never draw more noise terms than there are candidates: drawing from
    # an exhausted pool would raise IndexError in random.choice right
    # after the warning above was logged.
    for _ in range(min(noise_count, len(phenos_to_select))):
        random_pheno = random.choice(list(phenos_to_select))
        phenotypes.append(random_pheno)
        phenos_to_select.remove(random_pheno)

    return frozenset(phenotypes)
        pheno_profile, mondo_label = monarch.get_direct_phenotypes(mondo)
        gold_standard[mondo] = pheno_profile

for mondo in diseases:
    # ``diseases`` is expected to hold MONDO ids already.  If it does not,
    # resolve each entry through its clique leader first:
    # clique_leader = monarch.get_clique_leader(disease)
    # mondo = clique_leader['id']
    # mondo_label = clique_leader['label']

    # Split the gold-standard profile into the terms that are already lay
    # terms (kept as-is) and the remaining ones (mapped below).
    gold_profile = set(gold_standard[mondo])
    derived_profile = gold_profile.intersection(lay_terms)
    non_lay_terms = gold_profile.difference(derived_profile)

    for phenotype in non_lay_terms:
        parents = owl_utils.get_closure(
            hpo, phenotype, RDFS['subClassOf'], root)
        lay_overlap = parents.intersection(lay_terms)
        if not lay_overlap:
            # No lay term in the closure: the phenotype is dropped.
            continue

        # Replace the phenotype with the lay term of highest information
        # content; on ties the last term in iteration order wins.
        max_ic = max(ic_map[parent] for parent in lay_overlap)
        mica = ''
        for pheno in lay_overlap:
            if ic_map[pheno] == max_ic:
                mica = pheno

        derived_profile.add(mica)

    # One "<disease>\t<phenotype>" line per derived term.
    for phenotype in derived_profile:
        output.write("{}\t{}\n".format(mondo, phenotype))
Esempio n. 5
0
def main():
    """
    Write an information content (IC) value for every MONDO disease class.

    Every class returned by ``owl_utils.get_descendants`` for the MONDO
    root starts with an annotation count of zero.  Each leaf node then
    contributes one annotation, and each MONDO row of the ``--mondo_cache``
    file contributes one more; a contribution increments every class in
    the closure ``owl_utils.get_closure`` returns for the annotated class.

    For each class, its count divided by the running total (which starts
    at 1) is passed to ``math_utils.information_content`` and the result
    is written to ``--output`` as a tab-separated ``class, ic`` line.
    """
    parser = argparse.ArgumentParser(
        description='Generate information content for each HPO class using the '
        'HPO phenotype annotation file ')
    parser.add_argument('--mondo_cache',
                        '-m',
                        type=str,
                        required=True,
                        help='Cached 2 column disease phenotype tsv')
    parser.add_argument('--output',
                        '-o',
                        type=str,
                        required=False,
                        help='Location of output file',
                        default="./mondo-ic-cache.tsv")
    args = parser.parse_args()

    # i/o
    # Opened before the ontology is loaded so an unwritable path fails
    # immediately; the ``with`` block guarantees the file is flushed and
    # closed even if a later step raises.
    with open(args.output, 'w') as output_file:

        explicit_annotations = 1

        mondo_graph = Graph()

        # Previous cache made with 2018-08-03 version of mondo
        logger.info("Loading MONDO")
        mondo_graph.parse(
            "/path/to/git/mondo-2018-08-03/src/ontology/reasoned.owl",
            format='xml')
        root = "MONDO:0000001"

        logger.info("Getting classes")
        all_diseases = owl_utils.get_descendants(mondo_graph, root)
        disease_annotations: Dict[str, int] = {
            disease: 0 for disease in all_diseases
        }

        def _count_annotation(term):
            """Add one annotation to every class in the closure of term."""
            for disease in owl_utils.get_closure(mondo_graph, term,
                                                 root=root):
                try:
                    disease_annotations[disease] += 1
                except KeyError:
                    # Class was not among the descendants of the root;
                    # report it and start counting it anyway.
                    print(disease)
                    disease_annotations[disease] = 1

        logger.info("Seeding leaf nodes")
        # Seed leaf nodes with 1 annotation
        for leaf in owl_utils.get_leaf_nodes(mondo_graph, root):
            explicit_annotations += 1
            _count_annotation(leaf)

        logger.info("Fetching annotations")
        with open(args.mondo_cache, 'r') as cache_file:
            reader = csv.reader(cache_file, delimiter='\t', quotechar='\"')
            for row in reader:
                # csv.reader yields [] for blank lines, so guard before
                # indexing.  Comment rows ('#...') never start with
                # 'MONDO' and are skipped by the same test.
                if not row or not row[0].startswith('MONDO'):
                    continue
                # The phenotype column is not used, but unpacking two
                # values still rejects rows with a single column.
                mondo_id, _phenotype_id = row[0:2]
                explicit_annotations += 1
                _count_annotation(mondo_id)

        logger.info("Computing IC")
        for disease, annot_count in disease_annotations.items():
            output_file.write("{}\t{}\n".format(
                disease,
                math_utils.information_content(annot_count /
                                               explicit_annotations)))
Esempio n. 6
0
def pairwise_jaccard(pheno_a: str, pheno_b: str, graph: Graph,
                     root: str) -> float:
    """
    Return ``jaccard`` applied to the ``subClassOf`` closures of two classes.

    :param pheno_a: identifier of the first class
    :param pheno_b: identifier of the second class
    :param graph: graph handed to ``owl_utils.get_closure``
    :param root: root term handed to ``owl_utils.get_closure``
    :return: float - result of ``jaccard`` on the two closures
    """
    sub_class_of = RDFS['subClassOf']
    closure_a = owl_utils.get_closure(graph, pheno_a, sub_class_of, root)
    closure_b = owl_utils.get_closure(graph, pheno_b, sub_class_of, root)
    return jaccard(closure_a, closure_b)
    mondo = Graph()
    mondo.parse(gzip.open("../data/owl/mondo.owl.gz", 'rb'), format='xml')

    with open(args.mondo_assoc, 'r') as mondo_labels:
        for line in mondo_labels:
            if line.startswith('#'): continue
            if not line.startswith('MONDO'): continue
            if counter % 10000 == 0:
                print("Processed {} associations".format(counter))
            disease, phenotype = line.rstrip("\n").split("\t")[0:2]
            try:
                mondo_diseases[disease] = mondo_diseases_tmp[disease]
            except KeyError:
                mondo_diseases[disease] = "obsoleted class"

            disease_closure = get_closure(mondo, disease, root='MONDO:0000001')

            for dis in disease_closure:
                try:
                    mondo_diseases[dis] = mondo_diseases_tmp[dis]
                except KeyError:
                    mondo_diseases[dis] = "obsoleted class"

            if include_inferred:
                phenotype_closure = get_closure(hpo,
                                                phenotype,
                                                root='HP:0000118')
                associations.append((disease_closure, phenotype_closure))
            else:
                associations.append((disease_closure, {phenotype}))
            counter += 1
def main():
    """
    Cluster and iterate over each cluster to find the best
    disease group that subsumes the cluster

    The distance matrix read from ``--input`` is clustered with Ward
    linkage and cut into at most 525 flat clusters.  For every cluster the
    closures of its members in the MONDO graph are intersected, and the
    shared class with the highest value in the ``--ic_cache`` map is taken
    as the cluster's subsuming group.  One tab-separated line per cluster
    is written to ``--output``: cluster id, cluster size, subsuming class
    id, its label, its IC, the size of the closure fetched with
    ``negative=True, reflexive=False`` for that class, and the
    '|'-joined member ids.
    """
    parser = argparse.ArgumentParser(description='description')
    parser.add_argument('--input',
                        '-i',
                        type=str,
                        required=True,
                        help='Location of input file'
                        ' that contains the sim matrix as json')
    parser.add_argument('--label',
                        '-l',
                        type=str,
                        required=True,
                        help='Location of id-label mapping file')
    parser.add_argument('--ic_cache', '-ic', type=str, required=True)
    # The path is passed to open() unconditionally, so omitting it could
    # only ever end in a TypeError; let argparse report it up front.
    parser.add_argument('--output', '-o', required=True, help='output file')
    args = parser.parse_args()

    logger.info("loading matrix")
    matrix = np.loadtxt(args.input, delimiter=",")
    with open(args.label, 'r') as label_fh:
        labels = [line.rstrip('\n').split('\t')[0] for line in label_fh]

    # Context managers make sure both files are closed (and the output
    # flushed) even when a later step raises.
    with open(args.ic_cache, 'r') as ic_fh, \
            open(args.output, 'w') as output:
        # NOTE(review): this header lists 8 columns that do not match the
        # 7 fields written per cluster below; it is kept byte-identical
        # so existing consumers are not affected - confirm and correct.
        output.write("#distance\tlinkage\tmean_mica\tmedian_mica\t"
                     "num_clusters\tmean_mem\tmedian_mem\tsingletons\n")

        ic_map: Dict[str, float] = {}

        for line in ic_fh:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            hpo_id, ic = line.rstrip("\n").split("\t")
            ic_map[hpo_id] = float(ic)

        mondo_graph = Graph()

        logger.info("loading mondo")
        # Previous cache made with 2018-08-03 version of mondo
        mondo_graph.parse(
            "/path/to/git/mondo-2018-08-03/src/ontology/reasoned.owl",
            format='xml')
        root = "MONDO:0000001"

        mondo_skip = {
            'MONDO:0023807',
            'MONDO:0000559',
            'MONDO:0009117',
            'MONDO:0016961',
            'MONDO:0011750',
            'MONDO:0017180',
        }

        cluster_map = {}

        logger.info("clustering")
        Z = linkage(squareform(matrix), 'ward')
        # cosine weighted = 2631
        # resnik = 2453
        # euclidean = 525
        clusters = fcluster(Z, 525, 'maxclust')

        for disease_id, cluster_id in zip(labels, clusters):
            cluster_map.setdefault(cluster_id, []).append(disease_id)

        for cluster_id, members in cluster_map.items():
            diseases = set(members)
            skipped = diseases.intersection(mondo_skip)
            if skipped:
                if len(members) == len(skipped):
                    # Every member is on the skip list; evaluate the
                    # cluster with all of them rather than with nothing.
                    logger.warning("Cannot evaluate cluster")
                else:
                    diseases = diseases - mondo_skip

            # Intersect the closures of all evaluated members.
            common_ancestors = None
            for disease in diseases:
                closure = owl_utils.get_closure(mondo_graph,
                                                disease,
                                                root=root)
                if common_ancestors is None:
                    common_ancestors = closure
                else:
                    common_ancestors = common_ancestors.intersection(closure)

            if not common_ancestors:
                logger.warning(
                    "No common ancestor for cluster %s, skipping", cluster_id)
                continue

            # Single pass for the most informative common ancestor; '>='
            # keeps the last term in iteration order on ties, and the id
            # is always bound to the current cluster.
            mica_id = None
            mica = None
            for dis in common_ancestors:
                dis_ic = ic_map[dis]
                if mica is None or dis_ic >= mica:
                    mica_id, mica = dis, dis_ic

            label = monarch.get_label(mica_id)
            # Number of subclasses for dis
            subclass_count = len(
                owl_utils.get_closure(mondo_graph,
                                      mica_id,
                                      negative=True,
                                      reflexive=False))
            # The full member list (including skipped ids) is reported.
            output.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                cluster_id, len(members), mica_id, label, mica,
                subclass_count, "|".join(members)))