Esempio n. 1
0
 def test_encode_phenotypes_file(self):
     """Encode the term lists parsed from the short score file and check the first encoding."""
     score_path = os.path.join(self.parent_dir, "data/test.score-short.txt")

     # one list of HPO terms per parsed record
     term_lists = [
         entry["terms"]
         for entry in parse_input(score_path, self.hpo_network, self.alt2prim)
     ]

     encoded = encode_phenotypes(
         term_lists,
         self.phenotype_groups,
         self.hpo_network,
         self.alt2prim,
     )

     # the encoding of the first record is expected to sum to 4
     first_total = sum(encoded[0])
     self.assertEqual(first_total, 4)
Esempio n. 2
0
    def test_score_pairs_age(self):
        """
        Check pairwise BMWA scores after annotating the network with an ages distribution file.

        Two pairs from the short score file are scored: '118200' vs '118210'
        against a hand-computed weighted average, and '118210' vs '118211'
        (identical records, only one of which has an age) against 1.0.
        """
        # re-annotate the network, this time passing the ages distribution file
        self.hpo_network = annotate(
            self.hpo_network,
            self.phenotype_to_diseases,
            self.num_diseases_annotated,
            self.alt2prim,
            ages_distribution_file=self.ages_distribution_file,
        )

        score_file = os.path.join(self.parent_dir, 'data/test.score-short.txt')
        records = parse_input(score_file, self.hpo_network, self.alt2prim)

        scorer = Scorer(
            self.hpo_network,
            summarization_method='BMWA',
            min_score_mask=None,
        )

        def score_single_pair(wanted_ids):
            # keep only the requested records and score index 0 against index 1
            subset = [rec for rec in records if rec['record_id'] in wanted_ids]
            pair_results = scorer.score_records(subset, subset, [(0, 1)])
            self.assertEqual(len(pair_results), 1)
            return pair_results[0][2]

        # first pair: compare against the hand-computed weighted average
        observed = score_single_pair(['118200', '118210'])
        expected = np.average(
            [0.166, 1.0, 1.0, 0.125, 0.25, 1.0, 1.0],
            weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0],
        )
        self.assertAlmostEqual(float(observed), expected, 2)

        # second pair: identical records where one has an age and the other doesn't
        observed = score_single_pair(['118210', '118211'])
        self.assertAlmostEqual(float(observed), 1.0, 1)
Esempio n. 3
0
def likelihood_moldx(input_file, output_file=None, k_phenotype_groups=1000):
    """
    Predict the likelihood of a molecular diagnosis for every record in a file.

    The HPO OBO file and the disease-to-phenotype file are looked up in the
    "hpo" section of the configuration; the process exits with status 1 when
    either option is missing. One "<record id><TAB><probability>" line is
    written per input record.

    :param input_file: Path to a tab-separated file with three columns: ID, key=value, and a comma-separated list
     of HPO ids.
    :param output_file: Path of the file receiving the predicted probabilities.
     [default: None, meaning "phenopy.likelihood_moldx.txt"]
    :param k_phenotype_groups: Number of phenotype groups used for encoding phenotypes; forwarded to
     ``predict_likelihood_moldx``. [default: 1000]
    """
    # both file locations are mandatory configuration options
    try:
        obo_file = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.'
        )
        sys.exit(1)

    try:
        disease_to_phenotype_file = config.get(
            'hpo', 'disease_to_phenotype_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO annotated dataset file found in the configuration file.'
            ' See "hpo:disease_to_phenotype_file" parameter.')
        sys.exit(1)

    logger.info(f'Loading HPO OBO file: {obo_file}')
    # the third element of the returned tuple is not needed here
    network_bundle = generate_annotated_hpo_network(
        obo_file,
        disease_to_phenotype_file,
    )
    hpo_network, alt2prim, _ = network_bundle

    # split the parsed records into parallel lists of ids and term lists
    record_ids = []
    phenotypes = []
    for record in parse_input(input_file, hpo_network, alt2prim):
        record_ids.append(record["record_id"])
        phenotypes.append(record["terms"])

    # predict likelihood of molecular diagnosis
    positive_probabilities = predict_likelihood_moldx(
        phenotypes,
        phenotype_groups=None,
        hpo_network=hpo_network,
        alt2prim=alt2prim,
        k_phenotype_groups=k_phenotype_groups,
    )

    destination = "phenopy.likelihood_moldx.txt" if output_file is None else output_file
    try:
        with open(destination, "w") as f:
            for sample_id, probability in zip(record_ids, positive_probabilities):
                f.write(f"{sample_id}\t{probability}\n")
    except IOError:
        sys.exit("Something went wrong writing the probabilities to file")
Esempio n. 4
0
    def test_score_self(self):
        """Score two records from the long score file against themselves using half_product pairs."""
        long_file = os.path.join(self.parent_dir, 'data/test.score-long.txt')

        # restrict to records with HPO terms, since many test cases lack the
        # sub-graph terms available in tests/data/hp.obo
        wanted_ids = ['213200', '302801']
        subset = [
            rec
            for rec in parse_input(long_file, self.hpo_network, self.alt2prim)
            if rec['record_id'] in wanted_ids
        ]

        subset_size = len(subset)
        pair_indices = half_product(subset_size, subset_size)
        results = self.scorer.score_records(subset, subset, pair_indices)
        self.assertEqual(len(results), 3)

        # results[1] holds the '213200' - '302801' comparison
        cross_score = float(results[1][2])
        self.assertAlmostEqual(cross_score, 0.3758, 2)
Esempio n. 5
0
def score(input_file,
          output_file='-',
          records_file=None,
          annotations_file=None,
          custom_disease_file=None,
          ages_distribution_file=None,
          self=False,
          summarization_method='BMWA',
          scoring_method='HRSS',
          threads=1):
    """
    Score the similarity of HPO annotated entries and write the results as tab-separated rows.

    Entries from "input_file" are scored against, in order of precedence: themselves (when "self" is set),
    the entries of "records_file" (when given), or the disease records returned while building the annotated
    HPO network. The output starts with a "#query", "entity_id", "score" header line followed by one line per
    result.

    The HPO OBO file (and, unless "custom_disease_file" is given, the disease-to-phenotype file) is read from
    the "hpo" section of the configuration; the process exits with status 1 when a required option is missing
    or when the scorer rejects its arguments with a ValueError.

    :param input_file: File with HPO annotated entries to score.
    :param output_file: File path where to store the results. [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file parsed the same way as "input_file". If provided
     (and "self" is not set), entries in "input_file" are scored against the entries here. [default: None]
    :param annotations_file: Annotation file forwarded to the network builder. [default: None]
    :param custom_disease_file: Disease annotation file used instead of the configured
     "hpo:disease_to_phenotype_file". [default: None]
    :param ages_distribution_file: Phenotype ages distribution file forwarded to the network builder.
     [default: None]
    :param self: Score entries in the "input_file" against themselves. [default: False]
    :param summarization_method: Summarization method handed to the scorer. [default: BMWA]
    :param scoring_method: Scoring method handed to the scorer. [default: HRSS]
    :param threads: Number of parallel processes to use. [default: 1]
    """
    # the OBO file location is always taken from the configuration
    try:
        obo_file = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.'
        )
        sys.exit(1)

    # a custom disease file takes precedence over the configured one
    if custom_disease_file is not None:
        logger.info(
            f"using custom disease annotation file: {custom_disease_file}")
        disease_to_phenotype_file = custom_disease_file
    else:
        try:
            disease_to_phenotype_file = config.get(
                'hpo', 'disease_to_phenotype_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO annotated dataset file found in the configuration file.'
                ' See "hpo:disease_to_phenotype_file" parameter.')
            sys.exit(1)

    logger.info(f'Loading HPO OBO file: {obo_file}')
    network_bundle = generate_annotated_hpo_network(
        obo_file,
        disease_to_phenotype_file,
        annotations_file=annotations_file,
        ages_distribution_file=ages_distribution_file,
    )
    hpo_network, alt2prim, disease_records = network_bundle

    # parse input records
    input_records = parse_input(input_file, hpo_network, alt2prim)

    # build the scorer; invalid method names surface as ValueError
    try:
        scorer = Scorer(
            hpo_network,
            summarization_method=summarization_method,
            scoring_method=scoring_method,
        )
    except ValueError as e:
        logger.critical(f'Failed to initialize scoring class: {e}')
        sys.exit(1)

    if self:
        # input scored against itself: index pairs come from half_product
        score_records = input_records
        record_count = len(score_records)
        scoring_pairs = half_product(record_count, record_count)
    else:
        # input scored against another set: every (input, target) index pair
        if records_file:
            score_records = parse_input(records_file, hpo_network, alt2prim)
        else:
            score_records = disease_records
        input_indices = range(len(input_records))
        target_indices = range(len(score_records))
        scoring_pairs = itertools.product(input_indices, target_indices)

    results = scorer.score_records(
        input_records,
        score_records,
        scoring_pairs,
        threads,
    )

    header = '\t'.join(['#query', 'entity_id', 'score'])
    with open_or_stdout(output_file) as output_fh:
        output_fh.write(header)
        output_fh.write('\n')
        for row in results:
            output_fh.write('\t'.join(map(str, row)))
            output_fh.write('\n')
Esempio n. 6
0
def annotate(hpo_network, phenotype_to_diseases, num_diseases_annotated, alt2prim, annotations_file=None, ages_distribution_file=None,
            phenotype_disease_frequencies=None):
    """
    Annotate every node of the HPO network in place and return the network.

    For each node the following attributes are set:

    * ``ic``: the value returned by ``calculate_information_content``.
    * ``disease_weights``: reset to an empty dict.
    * ``age_dist``: only when an ages distribution was loaded and contains the node id.
    * ``weights['disease_frequency']``: one entry per disease listed for the node in
      ``phenotype_disease_frequencies``; the nested dicts are created when missing.
    * ``depth``: shortest path length from the node to the root term ``HP:0000001``.
    * ``synonyms``: the double-quoted strings found in the node's ``synonym`` tags, when it has any.

    :param hpo_network: `networkx.MultiDiGraph` to annotate.
    :param phenotype_to_diseases: Dictionary mapping HPO terms to diseases.
    :param num_diseases_annotated: Number of diseases with HPO annotations.
    :param alt2prim: The dict of alternate terms to canonical terms.
    :param annotations_file: Path to a custom annotation file, parsed with ``parse_input``; its records are passed
     to the information content calculation as a term -> record ids mapping. [default: None]
    :param ages_distribution_file: Path to phenotypes ages distribution file. The process exits with status 1
     when the file is missing or not readable. [default: None]
    :param phenotype_disease_frequencies: dictionary of phenotype to {disease: frequency}. [default: None]
    :return: `networkx.MultiDiGraph` (the same object that was passed in)
    """
    # Imported locally so the block does not depend on a module-level import.
    import sys

    # Before calculating information content, check for custom_annotations_file and load
    custom_annos = None
    if annotations_file is not None:
        custom_annos = {}
        for record in parse_input(annotations_file, hpo_network, alt2prim):
            for term_id in record['terms']:
                custom_annos.setdefault(term_id, []).append(record['record_id'])

    # make ages distributions
    ages = None
    if ages_distribution_file is not None:
        try:
            ages = make_age_distributions(ages_distribution_file)
            logger.info(
                f'Adding custom phenotype age distributions to HPO nodes from file: {ages_distribution_file}'
            )
        except (FileNotFoundError, PermissionError) as e:
            logger.critical(e)
            logger.critical(
                f'Specified phenotype ages file could not be loaded or does not exist: {e}'
            )
            # sys.exit instead of the site-injected ``exit`` builtin, which is
            # not guaranteed to exist (e.g. ``python -S`` or frozen builds).
            sys.exit(1)

    # hard-coding origin node for now
    origin = 'HP:0000001'

    for node_id, data in hpo_network.nodes(data=True):
        node = hpo_network.nodes[node_id]

        # annotate with information content value
        node['ic'] = calculate_information_content(
            node_id,
            hpo_network,
            phenotype_to_diseases,
            num_diseases_annotated,
            custom_annos,
        )
        node['disease_weights'] = {}

        # annotate with phenotype age distribution
        if ages is not None and node_id in ages.index:
            node['age_dist'] = ages.loc[node_id]['age_dist']

        # add the disease_frequency weights as attributes to the node
        if phenotype_disease_frequencies is not None and node_id in phenotype_disease_frequencies:
            # Nothing above creates 'weights', so build the nested dicts on
            # demand instead of raising KeyError; existing content is kept.
            disease_frequency = node.setdefault('weights', {}).setdefault('disease_frequency', {})
            for disease_id, frequency in phenotype_disease_frequencies[node_id].items():
                disease_frequency[disease_id] = frequency

        # annotate with depth value
        node['depth'] = nx.shortest_path_length(
            hpo_network,
            node_id,
            origin
        )

        # clean synonyms: keep only the text between double quotes;
        # nodes without synonym tags are left untouched
        if 'synonym' in data:
            node['synonyms'] = re.findall(r'"(.*?)"', ','.join(data['synonym']))

    return hpo_network