def test_score_word2vec_empty(self):
    """Test word2vec scoring when one term set is empty."""
    scorer = Scorer(self.hpo_network, scoring_method='word2vec')
    terms_a = []
    terms_b = ['HP:0001290', 'HP:0011351']
    self.assertAlmostEqual(0.0, scorer.score_term_sets_basic(terms_a, terms_b), 2)
def test_score_word2vec_out_of_vocab(self):
    """Test word2vec scoring with out-of-vocabulary terms."""
    scorer = Scorer(self.hpo_network, scoring_method='word2vec')
    terms_a = ['HP:NOT_A_TERM', 'HP:0000118']
    terms_b = ['HP:0001290', 'NOT_A_TERM']
    self.assertAlmostEqual(0.06, scorer.score_term_sets_basic(terms_a, terms_b), 2)
def test_score_pairs_age(self):
    """Test reading in records files and calculating pairwise scores with age weighting."""
    # read in records
    self.hpo_network = annotate(self.hpo_network, self.phenotype_to_diseases,
                                self.num_diseases_annotated, self.alt2prim,
                                ages_distribution_file=self.ages_distribution_file)

    records = parse_input(os.path.join(self.parent_dir, 'data/test.score-short.txt'),
                          self.hpo_network, self.alt2prim)

    # create an instance of the scorer class
    scorer = Scorer(self.hpo_network, summarization_method='BMWA', min_score_mask=None)

    # select which patients to test in pairwise best_match_weighted_average
    input_records = [x for x in records if x['record_id'] in ['118200', '118210']]

    results = scorer.score_records(input_records, input_records, [(0, 1)])
    self.assertEqual(len(results), 1)

    # the expected answer is a weighted average of the pairwise term scores
    answer = np.average([0.166, 1.0, 1.0, 0.125, 0.25, 1.0, 1.0],
                        weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0])
    self.assertAlmostEqual(float(results[0][2]), answer, 2)

    # test identical records for which one age exists and one doesn't
    input_records = [x for x in records if x['record_id'] in ['118210', '118211']]

    results = scorer.score_records(input_records, input_records, [(0, 1)])
    self.assertEqual(len(results), 1)
    self.assertAlmostEqual(float(results[0][2]), 1.0, 1)
def initialize_hrss():
    """Initialize the Hybrid Relative Semantic Similarity (HRSS) scorer."""
    # data directory
    phenopy_data_directory = os.path.join(os.getenv('HOME'), '.phenopy/data')

    # files used in building the annotated HPO network
    obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
    disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

    # if you have a custom ages_distribution_file, you can set it here.
    # ages_distribution_file = os.path.join(phenopy_data_directory, 'xa_age_stats_oct052019.tsv')

    hpo_network, alt2prim, disease_records = generate_annotated_hpo_network(
        obo_file, disease_to_phenotype_file)

    scorer = Scorer(hpo_network)
    return scorer
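# A minimal usage sketch (not part of the library) showing how the scorer returned by
# initialize_hrss() might be used. score_term_sets_basic is the same entry point the
# word2vec tests above exercise; the HPO ids are real terms, the printed score is
# illustrative only.
if __name__ == '__main__':
    scorer = initialize_hrss()
    terms_a = ['HP:0001290', 'HP:0011351']
    terms_b = ['HP:0001290', 'HP:0000118']
    # a single similarity value summarizing the two term sets
    print(scorer.score_term_sets_basic(terms_a, terms_b))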
@classmethod
def setUpClass(cls):
    # parent dir
    cls.parent_dir = os.path.dirname(os.path.realpath(__file__))

    # load and process the network
    cls.obo_file = os.path.join(cls.parent_dir, 'data/hp.obo')
    cls.hpo_network = load_network(cls.obo_file)
    cls.alt2prim = generate_alternate_ids(cls.hpo_network)
    cls.ages_distribution_file = os.path.join(cls.parent_dir, 'data/phenotype_age.tsv')

    # load disease to phenotype associations
    cls.disease_to_phenotype_file = os.path.join(cls.parent_dir, 'data/phenotype.hpoa')
    cls.disease_records, cls.phenotype_to_diseases = load_d2p(
        cls.disease_to_phenotype_file, cls.hpo_network, cls.alt2prim)

    cls.num_diseases_annotated = len(cls.disease_records)
    cls.hpo_network = annotate(cls.hpo_network, cls.phenotype_to_diseases,
                               cls.num_diseases_annotated, cls.alt2prim)

    # create an instance of the scorer class
    cls.scorer = Scorer(cls.hpo_network, min_score_mask=None)
def score(input_file, output_file='-', records_file=None, annotations_file=None,
          custom_disease_file=None, ages_distribution_file=None, self=False,
          summarization_method='BMWA', scoring_method='HRSS', threads=1):
    """
    Scores the similarity of the provided HPO annotated entries (see format below) against an
    HPO annotated dataset. By default, scoring happens against diseases annotated by the HPO
    group. See https://hpo.jax.org/app/download/annotation.

    Phenopy also supports scoring the product of provided entries (see "--product") or scoring
    against a custom records dataset (see "--records-file").

    :param input_file: File with HPO annotated entries, one per line (see format below).
    :param output_file: File path where to store the results. [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file in the same format as
        "input_file". This file, if provided, is used to score entries in the "input_file"
        against entries here. [default: None]
    :param annotations_file: An entity-to-phenotype annotation file in the same format as
        "input_file". This file, if provided, is used to add information content to the
        network. [default: None]
    :param custom_disease_file: Entity annotation file used for ranking diseases/genes.
    :param ages_distribution_file: Phenotype age summary stats file containing phenotype HPO
        id, mean_age, and std. [default: None]
    :param self: Score entries in the "input_file" against themselves.
    :param summarization_method: The method used to summarize the HRSS matrix. Supported
        values are best match average (BMA), best match weighted average (BMWA), and maximum
        (maximum). [default: BMWA]
    :param scoring_method: Either HRSS or Resnik. [default: HRSS]
    :param threads: Number of parallel processes to use. [default: 1]
    """
    try:
        obo_file = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.')
        sys.exit(1)

    if custom_disease_file is None:
        try:
            disease_to_phenotype_file = config.get('hpo', 'disease_to_phenotype_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO annotated dataset file found in the configuration file.'
                ' See "hpo:disease_to_phenotype_file" parameter.')
            sys.exit(1)
    else:
        logger.info(f"using custom disease annotation file: {custom_disease_file}")
        disease_to_phenotype_file = custom_disease_file

    logger.info(f'Loading HPO OBO file: {obo_file}')
    hpo_network, alt2prim, disease_records = generate_annotated_hpo_network(
        obo_file,
        disease_to_phenotype_file,
        annotations_file=annotations_file,
        ages_distribution_file=ages_distribution_file,
    )

    # parse input records
    input_records = parse_input(input_file, hpo_network, alt2prim)

    # create an instance of the scorer class
    try:
        scorer = Scorer(hpo_network, summarization_method=summarization_method,
                        scoring_method=scoring_method)
    except ValueError as e:
        logger.critical(f'Failed to initialize scoring class: {e}')
        sys.exit(1)

    if self:
        score_records = input_records
        scoring_pairs = half_product(len(score_records), len(score_records))
    else:
        if records_file:
            score_records = parse_input(records_file, hpo_network, alt2prim)
        else:
            score_records = disease_records
        scoring_pairs = itertools.product(
            range(len(input_records)),
            range(len(score_records)),
        )

    results = scorer.score_records(input_records, score_records, scoring_pairs, threads)

    with open_or_stdout(output_file) as output_fh:
        output_fh.write('\t'.join(['#query', 'entity_id', 'score']))
        output_fh.write('\n')
        for result in results:
            output_fh.write('\t'.join(str(column) for column in result))
            output_fh.write('\n')
def run_phenoseries_experiment(outdir=None, phenotypic_series_filepath=None, min_hpos=2,
                               min_entities=4, phenoseries_fraction=1.0,
                               scoring_method="HRSS", threads=1,
                               omim_phenotypes_file=None, pairwise_mim_scores_file=None):

    if outdir is None:
        outdir = os.getcwd()

    # load HPO network
    # data directory
    phenopy_data_directory = os.path.join(os.getenv("HOME"), ".phenopy/data")

    # files used in building the annotated HPO network
    obo_file = os.path.join(phenopy_data_directory, "hp.obo")
    disease_to_phenotype_file = os.path.join(phenopy_data_directory, "phenotype.hpoa")

    hpo_network, alt2prim, _ = generate_annotated_hpo_network(
        obo_file, disease_to_phenotype_file, ages_distribution_file=None)

    # read the phenotypic series file as a DataFrame
    psdf = pd.read_csv(
        phenotypic_series_filepath,
        sep="\t",
        comment="#",
        names=["PS", "MIM", "Phenotype"],
    )
    # null phenotypes are actually null MIM id fields, so just drop these
    psdf = psdf.dropna().sample(frac=phenoseries_fraction, random_state=42)
    psdf.reset_index(inplace=True, drop=True)

    # create a dictionary mapping each phenotypic series to its list of omim ids
    ps2mimids = {}
    for ps, mim_ids in psdf.groupby("PS")["MIM"]:
        # keep only series with at least two MIMs
        if len(mim_ids) >= 2:
            ps2mimids[ps] = list(set([int(mid) for mid in mim_ids.tolist()]))

    # invert the ps2mimids dictionary for easy lookup of which ps a mim belongs to
    mim2psids = {}
    for mim_id, ps in psdf.groupby("MIM")["PS"]:
        mim2psids[int(mim_id)] = ps.tolist()

    fields_to_use = [
        "text",
        "description",
        "otherFeatures",
        "biochemicalFeatures",
        "diagnosis",
        "clinicalFeatures",
    ]

    if not omim_phenotypes_file:
        logger.info("Scraping OMIM Diseases text")
        mim_texts = {}
        for mim_id in mim2psids:
            mim_response = request_mimid_info(mim_id)
            try:
                mim_info = mim_response.json()
            except AttributeError:
                # the request likely failed and returned no response object; stop scraping
                break
            mim_text = mim_info["omim"]["entryList"][0]["entry"]["textSectionList"]

            all_mim_text = ""
            for text_section in mim_text:
                section_name = text_section["textSection"]["textSectionName"]
                if section_name in fields_to_use:
                    all_mim_text += " " + text_section["textSection"]["textSectionContent"]

            mim_texts[mim_id] = all_mim_text

        # instantiate txt2hpo's Extractor class to perform named entity recognition
        extractor = Extractor(remove_negated=True, max_neighbors=3, correct_spelling=False)

        # loop over the MIM ids and extract hpo ids from each MIM's text fields
        mim_hpos = {}
        for mim_id in mim2psids:
            mim_hpos[mim_id] = extractor.hpo(mim_texts[mim_id]).hpids

        mimdf = pd.DataFrame()
        mimdf["omim_id"] = list(mim2psids.keys())
        mimdf["hpo_terms"] = mimdf["omim_id"].apply(lambda mim_id: mim_hpos[mim_id])
        mimdf.to_csv(os.path.join(outdir, "omim_phenotypes.txt"), index=False, sep='\t')
    else:
        logger.info("You passed an OMIM disease to phenotype file")
        try:
            mimdf = pd.read_csv(omim_phenotypes_file, sep="\t")
            mimdf["omim_id"] = mimdf["omim_id"].astype(int)
            mimdf["hpo_terms"] = mimdf["hpo_terms"].apply(literal_eval)
            mim_hpos = dict(zip(mimdf["omim_id"], mimdf["hpo_terms"]))
        except FileNotFoundError:
            sys.exit("Please provide a valid file path")

    # do we need this?
    # mim_hpos = {mim_id: hpos for mim_id, hpos in mim_hpos.items()}

    # clean up HPO ids in lists
    for mim_id, hpo_ids in mim_hpos.items():
        mim_hpos[mim_id] = convert_and_filter_hpoids(hpo_ids, hpo_network, alt2prim)

    # collect entities (mims) that have min_hpos or fewer terms
    mims_to_remove = []
    for mim_id, hpo_ids in mim_hpos.copy().items():
        if len(hpo_ids) <= min_hpos:
            mims_to_remove.append(mim_id)

    # now remove those mim ids from each phenotypic series
    experiment_ps2mimids = {}
    for ps, mimids in ps2mimids.copy().items():
        experiment_ps2mimids[ps] = []
        for ps_mim_id in mimids:
            if ps_mim_id not in mims_to_remove:
                experiment_ps2mimids[ps].append(ps_mim_id)

    # after removing entities, make sure each series still has the minimum number of entities
    remove_these_ps = []
    for ps, mimids in experiment_ps2mimids.items():
        if len(mimids) < min_entities:
            remove_these_ps.append(ps)

    for psid in remove_these_ps:
        del experiment_ps2mimids[psid]

    # create a unique list of entity ids, for scoring later
    experiment_omims = set()
    for psid, mim_ids in experiment_ps2mimids.items():
        for mim in mim_ids:
            experiment_omims.add(mim)
    experiment_omims = list(experiment_omims)

    # make a DataFrame for entity ids
    mimdf = pd.DataFrame()
    mimdf["omim_id"] = experiment_omims
    mimdf["hpo_terms"] = mimdf["omim_id"].apply(lambda mim_id: mim_hpos[mim_id])

    if not pairwise_mim_scores_file:
        scorer = Scorer(hpo_network, scoring_method=scoring_method)
        records = [
            {
                "record_id": mim_id,
                "terms": convert_and_filter_hpoids(hpo_terms, hpo_network, alt2prim),
                "weights": {},
            }
            for mim_id, hpo_terms in dict(zip(mimdf["omim_id"], mimdf["hpo_terms"])).items()
        ]

        results = scorer.score_records(records, records,
                                       half_product(len(records), len(records)),
                                       threads=threads)

        pairwise_scores = pd.DataFrame(results, columns=["mimid1", "mimid2", "phenopy-score"])
        # convert to square form
        pairwise_scores = pairwise_scores.set_index(["mimid1", "mimid2"]).unstack()
        # this pandas method chain fills in the missing scores of the square matrix with the
        # values from the transpose of the DataFrame
        pairwise_scores = (
            pairwise_scores["phenopy-score"]
            .reset_index(drop=True)
            .fillna(pairwise_scores.T.droplevel(0).reset_index(drop=True))
            .set_index(pairwise_scores.index, drop=True)
        )
        # reindex rows and columns with the mimdf order
        pairwise_scores = pairwise_scores.reindex(mimdf["omim_id"].tolist())
        pairwise_scores = pairwise_scores[mimdf["omim_id"].tolist()]
        pd.DataFrame(pairwise_scores).to_csv(
            os.path.join(outdir, 'phenoseries.psim_matrix.txt'), sep='\t')
    else:
        pairwise_scores = pd.read_csv(pairwise_mim_scores_file, sep='\t')

    ranksdf = make_rank_dataframe(pairwise_scores.astype(float).values, mimdf,
                                  experiment_ps2mimids)
    ranksdf.to_csv(os.path.join(outdir, "phenoseries.rankdf.txt"), sep="\t")
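# Toy demonstration (made-up ids and scores, not part of the experiment) of the
# fillna-with-transpose chain used above: half_product yields only one triangle of the
# pairwise matrix, so after unstack() the missing half of the square form is filled in
# from the transpose.
def _square_fill_demo():
    import pandas as pd
    results = [("A", "A", 1.0), ("A", "B", 0.4), ("B", "B", 1.0)]
    df = pd.DataFrame(results, columns=["mimid1", "mimid2", "phenopy-score"])
    # unstack produces a square frame with NaNs in the unscored (lower) triangle
    square = df.set_index(["mimid1", "mimid2"]).unstack()
    square = (square["phenopy-score"]
              .reset_index(drop=True)
              .fillna(square.T.droplevel(0).reset_index(drop=True))
              .set_index(square.index, drop=True))
    assert square.loc["B", "A"] == 0.4  # the lower triangle was filled from the upper
    return square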
def score(query_hpo_file, records_file=None, query_name='SAMPLE', obo_file=None,
          pheno2genes_file=None, threads=1, agg_score='BMA', no_parents=False,
          custom_annotations_file=None, output_file=None):
    """
    Scores a case's HPO terms against all gene-associated HPO terms.

    :param query_hpo_file: File with case HPO terms, one per line.
    :param records_file: One record per line, tab delimited. First column record unique
        identifier, second column pipe separated list of HPO identifiers (HP:0000001).
    :param query_name: Unique identifier for the query file.
    :param obo_file: OBO file from https://hpo.jax.org/app/download/ontology.
    :param pheno2genes_file: Phenotypes to genes from
        https://hpo.jax.org/app/download/annotation.
    :param threads: Number of parallel processes to use.
    :param agg_score: The aggregation method to use for summarizing the similarity matrix
        between two term sets. Must be one of {'BMA', 'maximum'}.
    :param no_parents: If provided, scoring is done using only the most informative nodes.
        All parent nodes are removed.
    :param custom_annotations_file: A custom entity-to-phenotype annotation file in the same
        format as tests/data/test.score-product.txt
    :param output_file: File path where to store the results.
    """
    if agg_score not in {'BMA', 'maximum'}:
        logger.critical('agg_score must be one of {BMA, maximum}.')
        exit(1)

    if obo_file is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file provided and no "hpo:obo_file" found in the configuration'
                ' file.')
            exit(1)

    if pheno2genes_file is None:
        try:
            pheno2genes_file = config.get('hpo', 'pheno2genes_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO pheno2genes_file file provided and no "hpo:pheno2genes_file" found in'
                ' the configuration file.')
            exit(1)

    try:
        with open(query_hpo_file, 'r') as case_fh:
            case_hpo = case_fh.read().splitlines()
    except (FileNotFoundError, PermissionError) as e:
        logger.critical(e)
        exit(1)

    # load phenotypes to genes associations
    terms_to_genes, genes_to_terms, annotations_count = load_p2g(
        pheno2genes_file, logger=logger)

    # load hpo network
    hpo_network = _load_hpo_network(obo_file, terms_to_genes, annotations_count,
                                    custom_annotations_file)

    # create an instance of the scorer class
    scorer = Scorer(hpo_network)

    # multiprocessing objects
    manager = Manager()
    lock = manager.Lock()

    if no_parents is True:
        case_hpo = remove_parents(case_hpo, hpo_network)

    if records_file:
        # score and output case hpo terms against each record's set of hpo terms
        logger.info(f'Scoring HPO terms from file: {query_hpo_file} against entities in: '
                    f'{records_file}')

        records = read_records_file(records_file, no_parents, hpo_network, logger=logger)

        # include the case-to-itself pair
        records[query_name] = case_hpo
        if not output_file:
            sys.stdout.write('\t'.join(['#query', 'entity_id', 'score']))
            sys.stdout.write('\n')
            with Pool(threads) as p:
                p.starmap(scorer.score_pairs,
                          [(records, [(query_name, record) for record in records],
                            lock, agg_score, i, threads) for i in range(threads)])
        else:
            with Pool(threads) as p:
                scored_results = p.starmap(
                    scorer.score_pairs,
                    [(records, [(query_name, record) for record in records],
                      lock, agg_score, i, threads, False) for i in range(threads)])

            scored_results = [item for sublist in scored_results for item in sublist]
            scored_results_df = pd.DataFrame(data=scored_results,
                                             columns='#query,entity_id,score'.split(','))
            scored_results_df = scored_results_df.sort_values(by='score', ascending=False)
            scored_results_df.to_csv(output_file, sep='\t', index=False)
            logger.info('Scoring completed')
            logger.info(f'Writing results to file: {output_file}')
    else:
        # score and output case hpo terms against each gene's associated set of hpo terms
        logger.info(f'Scoring case HPO terms from file: {query_hpo_file}')

        # add the case terms to the genes_to_terms dict
        genes_to_terms[query_name] = case_hpo
        if not output_file:
            sys.stdout.write('\t'.join(['#query', 'gene', 'score']))
            sys.stdout.write('\n')
            # iterate over each cross-product and score the pair of records
            with Pool(threads) as p:
                p.starmap(scorer.score_pairs,
                          [(genes_to_terms, [(query_name, gene) for gene in genes_to_terms],
                            lock, agg_score, i, threads) for i in range(threads)])
        else:
            with Pool(threads) as p:
                scored_results = p.starmap(
                    scorer.score_pairs,
                    [(genes_to_terms, [(query_name, gene) for gene in genes_to_terms],
                      lock, agg_score, i, threads, False) for i in range(threads)])

            scored_results = [item for sublist in scored_results for item in sublist]
            scored_results_df = pd.DataFrame(data=scored_results,
                                             columns='#query,gene,score'.split(','))
            scored_results_df = scored_results_df.sort_values(by='score', ascending=False)
            scored_results_df.to_csv(output_file, sep='\t', index=False)
            logger.info('Scoring completed')
            logger.info(f'Writing results to file: {output_file}')
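# A hedged usage sketch for this score() entry point. 'case.txt' is a hypothetical file
# with one HPO id per line (per the docstring above); with records_file=None the case is
# scored against every gene's associated term set and written to 'case.scores.tsv'.
if __name__ == '__main__':
    score('case.txt', query_name='CASE1', agg_score='BMA', threads=2,
          output_file='case.scores.tsv')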
def score_product(records_file, obo_file=None, pheno2genes_file=None, threads=1,
                  agg_score='BMA', no_parents=False, custom_annotations_file=None):
    """
    Scores the Cartesian product of HPO terms from a list of unique records
    (cases, genes, diseases, etc.).

    :param records_file: One record per line, tab delimited. First column record unique
        identifier, second column pipe separated list of HPO identifiers (HP:0000001).
    :param obo_file: OBO file from https://hpo.jax.org/app/download/ontology.
    :param pheno2genes_file: Phenotypes to genes from
        https://hpo.jax.org/app/download/annotation.
    :param threads: Multiprocessing threads to use [default: 1].
    :param agg_score: The aggregation method to use for summarizing the similarity matrix
        between two term sets. Must be one of {'BMA', 'maximum'}.
    :param no_parents: If provided, scoring is done using only the most informative nodes.
        All parent nodes are removed.
    :param custom_annotations_file: A custom entity-to-phenotype annotation file in the same
        format as tests/data/test.score-product.txt
    """
    if agg_score not in {'BMA', 'maximum'}:
        logger.critical('agg_score must be one of {BMA, maximum}.')
        exit(1)

    if obo_file is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file provided and no "hpo:obo_file" found in the configuration'
                ' file.')
            exit(1)

    if pheno2genes_file is None:
        try:
            pheno2genes_file = config.get('hpo', 'pheno2genes_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO pheno2genes_file file provided and no "hpo:pheno2genes_file" found in'
                ' the configuration file.')
            exit(1)

    # load phenotypes to genes associations
    terms_to_genes, _, annotations_count = load_p2g(pheno2genes_file, logger=logger)

    # load hpo network
    hpo_network = _load_hpo_network(obo_file, terms_to_genes, annotations_count,
                                    custom_annotations_file)

    # TODO: wrap in try/except
    records = read_records_file(records_file, no_parents, hpo_network, logger=logger)

    logger.info(f'Scoring product of records from file: {records_file}')

    # create an instance of the scorer class
    scorer = Scorer(hpo_network)

    # create records product generator
    records_product = itertools.product(records.keys(), repeat=2)

    # iterate over each cross-product and score the pair of records
    manager = Manager()
    lock = manager.Lock()
    with Pool(threads) as p:
        p.starmap(scorer.score_pairs,
                  [(records, records_product, lock, agg_score, i, threads)
                   for i in range(threads)])
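# A hedged usage sketch for score_product(). 'records.txt' is a hypothetical records file
# (tab-delimited id, pipe-separated HPO ids, as described in the docstring); pairwise
# results are written to stdout by the worker processes.
if __name__ == '__main__':
    score_product('records.txt', agg_score='BMA', threads=4)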