def test_encode_phenotypes_file(self):
    # Parse the short score fixture, encode each record's HPO terms against
    # the fixture phenotype groups, and check the first encoded vector.
    fixture_path = os.path.join(self.parent_dir, "data/test.score-short.txt")

    # only the term lists of the parsed records are needed for encoding
    term_lists = [
        entry["terms"]
        for entry in parse_input(fixture_path, self.hpo_network, self.alt2prim)
    ]

    encoded = encode_phenotypes(
        term_lists,
        self.phenotype_groups,
        self.hpo_network,
        self.alt2prim,
    )

    # the entries of the first record's encoding are expected to sum to 4
    self.assertEqual(sum(encoded[0]), 4)
def test_score_pairs_age(self):
    # Read a records file and calculate pairwise scores on a network that
    # was annotated with the phenotype ages distribution file.

    # re-annotate the shared network, this time passing the ages file
    self.hpo_network = annotate(
        self.hpo_network,
        self.phenotype_to_diseases,
        self.num_diseases_annotated,
        self.alt2prim,
        ages_distribution_file=self.ages_distribution_file,
    )

    # read in records
    records_path = os.path.join(self.parent_dir, 'data/test.score-short.txt')
    records = parse_input(records_path, self.hpo_network, self.alt2prim)

    # scorer summarizing with best_match_weighted_average
    scorer = Scorer(self.hpo_network, summarization_method='BMWA', min_score_mask=None)

    def score_first_pair(record_ids):
        # keep only the requested records and score index 0 against index 1
        selected = [rec for rec in records if rec['record_id'] in record_ids]
        return scorer.score_records(selected, selected, [(0, 1)])

    results = score_first_pair(['118200', '118210'])
    self.assertEqual(len(results), 1)

    # the right answer is this weighted average
    expected = np.average(
        [0.166, 1.0, 1.0, 0.125, 0.25, 1.0, 1.0],
        weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0],
    )
    self.assertAlmostEqual(float(results[0][2]), expected, 2)

    # identical records for which one age exists and one doesn't
    results = score_first_pair(['118210', '118211'])
    self.assertEqual(len(results), 1)
    self.assertAlmostEqual(float(results[0][2]), 1.0, 1)
def likelihood_moldx(input_file, output_file=None, k_phenotype_groups=1000):
    """
    Predict the likelihood of a molecular diagnosis for every input record
    and write one tab-separated "record id, probability" line per record.

    The HPO OBO file and the disease-to-phenotype file are read from the
    "hpo" section of the configuration; if either option is missing a
    critical message is logged and the process exits with status 1.

    :param input_file: The file path to a file containing three tab-separated
        columns: an ID, key=value fields, and comma-separated HPO ids.
    :param output_file: The file path to an output file containing the
        predicted probabilities. [default: phenopy.likelihood_moldx.txt]
    :param k_phenotype_groups: The number of phenotype groups to use for
        encoding phenotypes. The CLI version of phenopy allows for one of
        [1000, 1500].
    """
    try:
        obo_path = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.'
        )
        sys.exit(1)

    try:
        annotation_path = config.get('hpo', 'disease_to_phenotype_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO annotated dataset file found in the configuration file.'
            ' See "hpo:disease_to_phenotype_file" parameter.')
        sys.exit(1)

    logger.info(f'Loading HPO OBO file: {obo_path}')
    # the third element of the returned tuple is not needed here
    hpo_network, alt2prim, _ = generate_annotated_hpo_network(
        obo_path,
        annotation_path,
    )

    # parse input records and split them into ids and term lists
    parsed = parse_input(input_file, hpo_network, alt2prim)
    record_ids = [entry["record_id"] for entry in parsed]
    phenotypes = [entry["terms"] for entry in parsed]

    # predict likelihood of molecular diagnosis
    probabilities = predict_likelihood_moldx(
        phenotypes,
        phenotype_groups=None,
        hpo_network=hpo_network,
        alt2prim=alt2prim,
        k_phenotype_groups=k_phenotype_groups,
    )

    destination = "phenopy.likelihood_moldx.txt" if output_file is None else output_file

    try:
        with open(destination, "w") as handle:
            handle.writelines(
                f"{sample_id}\t{probability}\n"
                for sample_id, probability in zip(record_ids, probabilities)
            )
    except IOError:
        sys.exit("Something went wrong writing the probabilities to file")
def test_score_self(self):
    # Score a subset of the long fixture against itself using the
    # half-product of record indices, then check one pairwise score.
    fixture_path = os.path.join(self.parent_dir, 'data/test.score-long.txt')
    wanted_ids = ['213200', '302801']

    # limit to records with HPO terms since many test cases don't have the
    # sub-graph terms from tests/data/hp.obo
    input_records = [
        rec
        for rec in parse_input(fixture_path, self.hpo_network, self.alt2prim)
        if rec['record_id'] in wanted_ids
    ]

    record_count = len(input_records)
    results = self.scorer.score_records(
        input_records,
        input_records,
        half_product(record_count, record_count),
    )

    self.assertEqual(len(results), 3)

    # test the score of '213200' - '302801'
    self.assertAlmostEqual(float(results[1][2]), 0.3758, 2)
def score(input_file, output_file='-', records_file=None, annotations_file=None,
          custom_disease_file=None, ages_distribution_file=None, self=False,
          summarization_method='BMWA', scoring_method='HRSS', threads=1):
    """
    Score the similarity of HPO annotated entries against a reference set of
    HPO annotated records and write the results as tab-separated rows.

    By default the reference set is the disease records produced while
    building the annotated HPO network (diseases annotated by the HPO group,
    see https://hpo.jax.org/app/download/annotation). Alternatively the
    entries can be scored against themselves ("self") or against a custom
    records dataset ("records_file").

    A missing "hpo:obo_file" / "hpo:disease_to_phenotype_file" configuration
    option, or a ValueError raised while constructing the scorer, is logged
    as critical and the process exits with status 1.

    :param input_file: File with HPO annotated entries, one per line.
    :param output_file: File path where to store the results.
        [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file in the same
        format as "input_file". This file, if provided, is used to score
        entries in the "input_file" against entries here. [default: None]
    :param annotations_file: An entity-to-phenotype annotation file in the
        same format as "input_file". This file, if provided, is used to add
        information content to the network. [default: None]
    :param custom_disease_file: Entity annotation file for ranking
        diseases/genes; when given it replaces the configured
        disease-to-phenotype file. [default: None]
    :param ages_distribution_file: Phenotypes age summary stats file
        containing phenotype HPO id, mean_age, and std. [default: None]
    :param self: Score entries in the "input_file" against itself.
    :param summarization_method: The method used to summarize the HRSS
        matrix. Supported values are best match average (BMA), best match
        weighted average (BMWA), and maximum (maximum). [default: BMWA]
    :param scoring_method: Either HRSS or Resnik.
    :param threads: Number of parallel processes to use. [default: 1]
    """
    try:
        obo_path = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.'
        )
        sys.exit(1)

    # a custom disease file takes precedence over the configured one
    if custom_disease_file is not None:
        logger.info(
            f"using custom disease annotation file: {custom_disease_file}")
        annotation_path = custom_disease_file
    else:
        try:
            annotation_path = config.get('hpo', 'disease_to_phenotype_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO annotated dataset file found in the configuration file.'
                ' See "hpo:disease_to_phenotype_file" parameter.')
            sys.exit(1)

    logger.info(f'Loading HPO OBO file: {obo_path}')
    hpo_network, alt2prim, disease_records = generate_annotated_hpo_network(
        obo_path,
        annotation_path,
        annotations_file=annotations_file,
        ages_distribution_file=ages_distribution_file,
    )

    # parse input records
    input_records = parse_input(input_file, hpo_network, alt2prim)

    # create instance of the scorer class
    try:
        scorer = Scorer(
            hpo_network,
            summarization_method=summarization_method,
            scoring_method=scoring_method,
        )
    except ValueError as e:
        logger.critical(f'Failed to initialize scoring class: {e}')
        sys.exit(1)

    if self:
        # self-scoring: pairs come from the half product of the input indices
        reference_records = input_records
        pair_indices = half_product(len(reference_records), len(reference_records))
    else:
        # score every input record against every reference record
        if records_file:
            reference_records = parse_input(records_file, hpo_network, alt2prim)
        else:
            reference_records = disease_records
        pair_indices = itertools.product(
            range(len(input_records)),
            range(len(reference_records)),
        )

    results = scorer.score_records(input_records, reference_records, pair_indices, threads)

    with open_or_stdout(output_file) as output_fh:
        # header row followed by one tab-separated row per result
        print('\t'.join(['#query', 'entity_id', 'score']), file=output_fh)
        for row in results:
            print('\t'.join(str(column) for column in row), file=output_fh)
def annotate(hpo_network, phenotype_to_diseases, num_diseases_annotated, alt2prim,
             annotations_file=None, ages_distribution_file=None,
             phenotype_disease_frequencies=None):
    """
    Annotate every node of the HPO network in place and return the network.

    For each node the following attributes are written:

    * ``ic`` - result of ``calculate_information_content`` for the node.
    * ``disease_weights`` - reset to an empty dict.
    * ``age_dist`` - only when an ages distribution was loaded and the node
      id is in its index.
    * ``weights['disease_frequency'][disease_id]`` - only for nodes present
      in ``phenotype_disease_frequencies``.
    * ``depth`` - ``nx.shortest_path_length`` from the node to ``HP:0000001``.
    * ``synonyms`` - the double-quoted substrings of the node's ``synonym``
      entries, only when the node has a ``synonym`` attribute.

    :param hpo_network: `networkx.MultiDiGraph` to annotate.
    :param phenotype_to_diseases: Dictionary mapping HPO terms to diseases.
    :param num_diseases_annotated: Number of diseases with HPO annotations.
    :param alt2prim: The dict of alternate terms to canonical terms.
    :param annotations_file: Custom annotation file read with `parse_input`;
        its records are collected into a term id -> record ids mapping that
        is passed to the information content calculation. [default: None]
    :param ages_distribution_file: Path to phenotypes ages distribution file.
        [default: None]
    :param phenotype_disease_frequencies: dictionary of phenotype to disease
        frequencies (phenotype id -> {disease id: frequency}). [default: None]
    :return: `networkx.MultiDiGraph` (the same object that was passed in)
    :raises SystemExit: with status 1 when the ages distribution file cannot
        be loaded (FileNotFoundError or PermissionError).
    """
    # Before calculating information content, load the custom annotations
    # (term id -> list of record ids) if an annotations file was given.
    custom_annos = None
    if annotations_file is not None:
        custom_annos = {}
        for record in parse_input(annotations_file, hpo_network, alt2prim):
            for term_id in record['terms']:
                custom_annos.setdefault(term_id, []).append(record['record_id'])

    # make ages distributions
    ages = None
    if ages_distribution_file is not None:
        try:
            ages = make_age_distributions(ages_distribution_file)
            logger.info(
                f'Adding custom phenotype age distributions to HPO nodes from file: {ages_distribution_file}'
            )
        except (FileNotFoundError, PermissionError) as e:
            logger.critical(e)
            logger.critical(
                f'Specified phenotype ages file could not be loaded or does not exist: {e}'
            )
            # Raise SystemExit directly: the bare `exit` builtin is injected
            # by the `site` module and is not guaranteed to exist.
            raise SystemExit(1) from e

    # hard-coding origin node for now
    origin = 'HP:0000001'

    for node_id, data in hpo_network.nodes(data=True):
        # annotate with information content value
        information_content = calculate_information_content(
            node_id,
            hpo_network,
            phenotype_to_diseases,
            num_diseases_annotated,
            custom_annos,
        )
        node = hpo_network.nodes[node_id]
        node['ic'] = information_content

        # reset the per-disease weights container
        node['disease_weights'] = {}

        # annotate with phenotype age distribution
        if ages is not None and node_id in ages.index:
            node['age_dist'] = ages.loc[node_id]['age_dist']

        # add the disease_frequency weights as attributes to the node
        if phenotype_disease_frequencies is not None and node_id in phenotype_disease_frequencies:
            for disease_id, frequency in phenotype_disease_frequencies[node_id].items():
                # Create the nested dicts on demand; indexing
                # node['weights'] directly raised KeyError for nodes that
                # had no pre-existing 'weights' attribute.
                # NOTE(review): frequencies land under 'weights' while the
                # dict reset above is 'disease_weights' - confirm which key
                # downstream consumers read.
                weights = node.setdefault('weights', {})
                weights.setdefault('disease_frequency', {})[disease_id] = frequency

        # annotate with depth value
        node['depth'] = nx.shortest_path_length(hpo_network, node_id, origin)

        # clean synonyms: keep only the double-quoted text of each entry;
        # nodes without synonym tags are left untouched
        if 'synonym' in data:
            node['synonyms'] = re.findall(r'"(.*?)"', ','.join(data['synonym']))

    return hpo_network