Example #1
    def test_score_word2vec_empty(self):
        """Test the scoring functionality"""
        scorer = Scorer(self.hpo_network, scoring_method='word2vec')
        terms_a = []
        terms_b = ['HP:0001290', 'HP:0011351']

        self.assertEqual(0.0, scorer.score_term_sets_basic(terms_a, terms_b))
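A natural companion check, sketched here under the assumption that `score_term_sets_basic` accepts two empty lists (the method name and `Scorer` API are taken from the example above), is that two empty term sets also score 0.0:

    def test_score_word2vec_both_empty(self):
        """Hypothetical companion test: two empty term sets score 0.0."""
        scorer = Scorer(self.hpo_network, scoring_method='word2vec')
        self.assertEqual(0.0, scorer.score_term_sets_basic([], []))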
Example #2
    def test_score_word2vec_out_of_vocab(self):
        """Test the scoring functionality"""
        scorer = Scorer(self.hpo_network, scoring_method='word2vec')
        terms_a = ['HP:NOT_A_TERM', 'HP:0000118']
        terms_b = ['HP:0001290', 'NOT_A_TERM']

        self.assertAlmostEqual(0.06,
                               scorer.score_term_sets_basic(terms_a, terms_b),
                               2)
Example #3
    def test_score_pairs_age(self):
        # Test reading in records files and calculating pairwise scores
        # read in records
        self.hpo_network = annotate(
            self.hpo_network,
            self.phenotype_to_diseases,
            self.num_diseases_annotated,
            self.alt2prim,
            ages_distribution_file=self.ages_distribution_file)

        records = parse_input(
            os.path.join(self.parent_dir, 'data/test.score-short.txt'),
            self.hpo_network, self.alt2prim)

        # create an instance of the Scorer class
        scorer = Scorer(self.hpo_network,
                        summarization_method='BMWA',
                        min_score_mask=None)

        # select which patients to test in pairwise best_match_weighted_average
        input_records = [
            x for x in records if x['record_id'] in ['118200', '118210']
        ]

        results = scorer.score_records(
            input_records,
            input_records,
            [
                (0, 1),
            ],
        )
        self.assertEqual(len(results), 1)

        # the expected answer: BMWA is a weighted average of the best-match scores
        answer = np.average([0.166, 1.0, 1.0, 0.125, 0.25, 1.0, 1.0],
                            weights=[0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0])

        self.assertAlmostEqual(float(results[0][2]), answer, 2)

        # Test identical records for which one age exists and one doesn't
        input_records = [
            x for x in records if x['record_id'] in ['118210', '118211']
        ]

        results = scorer.score_records(
            input_records,
            input_records,
            [
                (0, 1),
            ],
        )
        self.assertEqual(len(results), 1)

        self.assertAlmostEqual(float(results[0][2]), 1.0, 1)
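For reference, the expected value in the first assertion is plain weighted-average arithmetic. A standalone sketch of that computation (the scores and weights are copied verbatim from the test above; numpy is the only dependency):

import numpy as np

# best-match scores and their age-derived weights, copied from the test above
scores = [0.166, 1.0, 1.0, 0.125, 0.25, 1.0, 1.0]
weights = [0.481, 1.0, 1.0, 0.0446, 1.0, 1.0, 1.0]

# BMWA summarizes the best-match scores with a weighted mean
print(np.average(scores, weights=weights))  # ~0.785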
Example #4
def initialize_hrss():
    """
    Initialize the Hybrid Relative Semantic Similarity Scorer.
    """
    # data directory
    phenopy_data_directory = os.path.join(os.path.expanduser('~'), '.phenopy', 'data')

    # files used in building the annotated HPO network
    obo_file = os.path.join(phenopy_data_directory, 'hp.obo')
    disease_to_phenotype_file = os.path.join(phenopy_data_directory, 'phenotype.hpoa')

    # if you have a custom ages_distribution_file, you can set it here.
    # ages_distribution_file = os.path.join(phenopy_data_directory, 'xa_age_stats_oct052019.tsv')

    hpo_network, alt2prim, disease_records = \
        generate_annotated_hpo_network(obo_file,
                                       disease_to_phenotype_file,
                                       )

    scorer = Scorer(hpo_network)
    return scorer
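A usage sketch for the initializer above (the term lists are hypothetical, and `score_term_sets_basic` is assumed to be the pairwise entry point, as in the test examples earlier):

scorer = initialize_hrss()
# compare two hypothetical phenotype profiles with the default HRSS method
similarity = scorer.score_term_sets_basic(['HP:0001290'], ['HP:0011351'])
print(similarity)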
Example #5
    @classmethod
    def setUpClass(cls):
        # parent dir
        cls.parent_dir = os.path.dirname(os.path.realpath(__file__))

        # load and process the network
        cls.obo_file = os.path.join(cls.parent_dir, 'data/hp.obo')
        cls.hpo_network = load_network(cls.obo_file)
        cls.alt2prim = generate_alternate_ids(cls.hpo_network)
        cls.ages_distribution_file = os.path.join(cls.parent_dir,
                                                  'data/phenotype_age.tsv')

        # load disease-to-phenotype associations
        cls.disease_to_phenotype_file = os.path.join(cls.parent_dir,
                                                     'data/phenotype.hpoa')
        cls.disease_records, cls.phenotype_to_diseases = load_d2p(
            cls.disease_to_phenotype_file, cls.hpo_network, cls.alt2prim)

        cls.num_diseases_annotated = len(cls.disease_records)
        cls.hpo_network = annotate(cls.hpo_network, cls.phenotype_to_diseases,
                                   cls.num_diseases_annotated, cls.alt2prim)

        # create an instance of the Scorer class
        cls.scorer = Scorer(cls.hpo_network, min_score_mask=None)
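A test method in the same class would then reuse these shared fixtures, for example (the term lists are hypothetical, and `score_term_sets_basic` is assumed from the earlier examples; only the fixtures built above are relied upon):

    def test_score_returns_nonnegative(self):
        """Hypothetical test using the shared scorer fixture."""
        score = self.scorer.score_term_sets_basic(['HP:0001290'],
                                                  ['HP:0011351'])
        self.assertGreaterEqual(score, 0.0)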
Example #6
def score(input_file,
          output_file='-',
          records_file=None,
          annotations_file=None,
          custom_disease_file=None,
          ages_distribution_file=None,
          self=False,
          summarization_method='BMWA',
          scoring_method='HRSS',
          threads=1):
    """
    Scores the similarity of provided HPO-annotated entries (see format below) against an HPO-annotated dataset. By
    default, scoring happens against diseases annotated by the HPO group. See https://hpo.jax.org/app/download/annotation.

    Phenopy also supports scoring the product of provided entries (see "--product") or scoring against a custom records
    dataset (see "--records-file").

    :param input_file: File with HPO annotated entries, one per line (see format below).
    :param output_file: File path where to store the results. [default: - (stdout)]
    :param records_file: An entity-to-phenotype annotation file in the same format as "input_file". This file, if
     provided, is used to score entries in the "input_file" against entries here. [default: None]
    :param annotations_file: An entity-to-phenotype annotation file in the same format as "input_file". This file, if
     provided, is used to add information content to the network. [default: None]
    :param custom_disease_file: Custom disease annotation file used to rank diseases/genes. [default: None]
    :param ages_distribution_file: Phenotypes age summary stats file containing phenotype HPO id, mean_age, and std.
     [default: None]
    :param self: If True, score entries in the "input_file" against each other. [default: False]
    :param summarization_method: The method used to summarize the HRSS matrix. Supported values are best match average
     (BMA), best match weighted average (BMWA), and maximum (maximum). [default: BMWA]
    :param scoring_method: Either HRSS or Resnik. [default: HRSS]
    :param threads: Number of parallel processes to use. [default: 1]
    """

    try:
        obo_file = config.get('hpo', 'obo_file')
    except (NoSectionError, NoOptionError):
        logger.critical(
            'No HPO OBO file found in the configuration file. See "hpo:obo_file" parameter.'
        )
        sys.exit(1)
    if custom_disease_file is None:
        try:
            disease_to_phenotype_file = config.get(
                'hpo', 'disease_to_phenotype_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO annotated dataset file found in the configuration file.'
                ' See "hpo:disease_to_phenotype_file" parameter.')
            sys.exit(1)
    else:
        logger.info(
            f"using custom disease annotation file: {custom_disease_file}")
        disease_to_phenotype_file = custom_disease_file

    logger.info(f'Loading HPO OBO file: {obo_file}')
    hpo_network, alt2prim, disease_records = \
        generate_annotated_hpo_network(obo_file,
                                       disease_to_phenotype_file,
                                       annotations_file=annotations_file,
                                       ages_distribution_file=ages_distribution_file
                                       )

    # parse input records
    input_records = parse_input(input_file, hpo_network, alt2prim)

    # create an instance of the Scorer class
    try:
        scorer = Scorer(hpo_network,
                        summarization_method=summarization_method,
                        scoring_method=scoring_method)
    except ValueError as e:
        logger.critical(f'Failed to initialize scoring class: {e}')
        sys.exit(1)

    if self:
        score_records = input_records

        scoring_pairs = half_product(len(score_records), len(score_records))
    else:
        if records_file:
            score_records = parse_input(records_file, hpo_network, alt2prim)
        else:
            score_records = disease_records

        scoring_pairs = itertools.product(
            range(len(input_records)),
            range(len(score_records)),
        )

    results = scorer.score_records(input_records, score_records, scoring_pairs,
                                   threads)

    with open_or_stdout(output_file) as output_fh:
        output_fh.write('\t'.join(['#query', 'entity_id', 'score']))
        output_fh.write('\n')
        for result in results:
            output_fh.write('\t'.join(str(column) for column in result))
            output_fh.write('\n')
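The "format below" the docstring refers to is not reproduced in this snippet. Based on the tab-delimited records format that parse_input consumes in the other examples (a record id, an info column, and a pipe-separated HPO term list), a plausible input_file line looks like this, with hypothetical values:

SAMPLE-1	age=4.0;sex=F	HP:0001263|HP:0004322|HP:0011968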
Example #7
def run_phenoseries_experiment(outdir=None,
                               phenotypic_series_filepath=None,
                               min_hpos=2,
                               min_entities=4,
                               phenoseries_fraction=1.0,
                               scoring_method="HRSS",
                               threads=1,
                               omim_phenotypes_file=None,
                               pairwise_mim_scores_file=None):

    if outdir is None:
        outdir = os.getcwd()

    # load HPO network
    # data directory
    phenopy_data_directory = os.path.join(os.getenv("HOME"), ".phenopy/data")

    # files used in building the annotated HPO network
    obo_file = os.path.join(phenopy_data_directory, "hp.obo")
    disease_to_phenotype_file = os.path.join(phenopy_data_directory,
                                             "phenotype.hpoa")

    hpo_network, alt2prim, _ = generate_annotated_hpo_network(
        obo_file, disease_to_phenotype_file, ages_distribution_file=None)

    # read the phenotypic series file as a DataFrame
    psdf = pd.read_csv(
        phenotypic_series_filepath,
        sep="\t",
        comment="#",
        names=["PS", "MIM", "Phenotype"],
    )
    # null phenotypes are actually null MIM id fields, so just drop these
    psdf = psdf.dropna().sample(frac=phenoseries_fraction, random_state=42)
    psdf.reset_index(inplace=True, drop=True)

    # create a dictionary for phenotypic series to list of omim ids mapping
    ps2mimids = {}
    for ps, mim_ids in psdf.groupby(["PS"])["MIM"]:
        # more than two mims in a ps
        if len(mim_ids) >= 2:
            ps2mimids[ps] = list(set([int(mid) for mid in mim_ids.tolist()]))

    # invert the ps2mimids dictionary for easy lookup of which PS a MIM belongs to
    mim2psids = {}
    for mim_id, ps in psdf.groupby("MIM")["PS"]:
        mim2psids[int(mim_id)] = ps.tolist()

    fields_to_use = [
        "text",
        "description",
        "otherFeatures",
        "biochemicalFeatures",
        "diagnosis",
        "clinicalFeatures",
    ]

    if omim_phenotypes_file == "":
        logger.info("Scraping OMIM Diseases text")
        mim_texts = {}
        for mim_id in mim2psids:
            mim_response = request_mimid_info(mim_id)
            try:
                mim_info = mim_response.json()
            except AttributeError:
                break
            mim_text = mim_info["omim"]["entryList"][0]["entry"][
                "textSectionList"]

            all_mim_text = ""
            for text_section in mim_text:
                section_name = text_section["textSection"]["textSectionName"]
                if section_name in fields_to_use:
                    # unique_section_names.add(section_name)
                    all_mim_text += " " + text_section["textSection"][
                        "textSectionContent"]

            mim_texts[mim_id] = all_mim_text
        # instantiate txt2hpo's Extractor class to perform named entity recognition
        extractor = Extractor(remove_negated=True,
                              max_neighbors=3,
                              correct_spelling=False)

        # loop over the MIM ids and extract hpo ids from each MIM's text fields
        mim_hpos = {}
        for mim_id in mim2psids:
            mim_hpos[mim_id] = extractor.hpo(mim_texts[mim_id]).hpids

        mimdf = pd.DataFrame()
        mimdf["omim_id"] = list(mim2psids.keys())
        mimdf["hpo_terms"] = mimdf["omim_id"].apply(
            lambda mim_id: mim_hpos[mim_id])
        mimdf.to_csv(os.path.join(outdir, "omim_phenotypes.txt"),
                     index=False,
                     sep='\t')

    else:
        logger.info("You passed an OMIM disease to phenotype file")
        try:
            mimdf = pd.read_csv(omim_phenotypes_file, sep="\t")
            mimdf["omim_id"] = mimdf["omim_id"].astype(int)
            mimdf["hpo_terms"] = mimdf["hpo_terms"].apply(literal_eval)
            mim_hpos = dict(zip(mimdf["omim_id"], mimdf["hpo_terms"]))
        except FileNotFoundError:
            sys.exit("Please provide a valid file path")

    # clean up HPO ids in lists
    for mim_id, hpo_ids in mim_hpos.items():
        mim_hpos[mim_id] = convert_and_filter_hpoids(hpo_ids, hpo_network,
                                                     alt2prim)

    # collect entities (MIM ids) that have min_hpos or fewer HPO terms
    mims_to_remove = []
    for mim_id, hpo_ids in mim_hpos.copy().items():
        if len(hpo_ids) <= min_hpos:
            mims_to_remove.append(mim_id)

    # now remove those MIM ids from each phenotypic series
    experiment_ps2mimids = {}
    for ps, mimids in ps2mimids.copy().items():
        experiment_ps2mimids[ps] = []
        for ps_mim_id in mimids:
            if ps_mim_id not in mims_to_remove:
                experiment_ps2mimids[ps].append(ps_mim_id)

    # After removing entities, make sure the series has min number of entities
    # get lists of mims and their PS
    remove_these_ps = []
    for ps, mimids in experiment_ps2mimids.items():
        if len(mimids) < min_entities:
            remove_these_ps.append(ps)

    for psid in remove_these_ps:
        del experiment_ps2mimids[psid]

    # Create a unique list of entity ids, for scoring later
    experiment_omims = set()
    for psid, mim_ids in experiment_ps2mimids.items():
        for mim in mim_ids:
            experiment_omims.add(mim)
    experiment_omims = list(experiment_omims)

    # make a DataFrame for entity ids
    mimdf = pd.DataFrame()
    mimdf["omim_id"] = experiment_omims
    mimdf["hpo_terms"] = mimdf["omim_id"].apply(
        lambda mim_id: mim_hpos[mim_id])

    if pairwise_mim_scores_file == "":
        scorer = Scorer(hpo_network, scoring_method=scoring_method)
        records = [{
            "record_id":
            mim_id,
            "terms":
            convert_and_filter_hpoids(hpo_terms, hpo_network, alt2prim),
            "weights": {},
        } for mim_id, hpo_terms in dict(
            zip(mimdf["omim_id"], mimdf["hpo_terms"])).items()]

        results = scorer.score_records(records,
                                       records,
                                       half_product(len(records),
                                                    len(records)),
                                       threads=threads)

        pairwise_scores = pd.DataFrame(
            results, columns=["mimid1", "mimid2", "phenopy-score"])
        # convert to square form
        pairwise_scores = pairwise_scores.set_index(
            ["mimid1", "mimid2"]).unstack()
        # fill in the missing half of the square matrix with values
        # from the transpose of the DataFrame
        pairwise_scores = (pairwise_scores["phenopy-score"]
                           .reset_index(drop=True)
                           .fillna(pairwise_scores.T.droplevel(0)
                                   .reset_index(drop=True))
                           .set_index(pairwise_scores.index, drop=True))
        # reindex with the mimdf index
        pairwise_scores = pairwise_scores.reindex(mimdf["omim_id"].tolist())
        pairwise_scores = pairwise_scores[mimdf["omim_id"].tolist()]
        pairwise_scores.to_csv(
            os.path.join(outdir, 'phenoseries.psim_matrix.txt'), sep='\t')
    else:
        pairwise_scores = pd.read_csv(pairwise_mim_scores_file, sep='\t')

    ranksdf = make_rank_dataframe(
        pairwise_scores.astype(float).values, mimdf, experiment_ps2mimids)
    ranksdf.to_csv(os.path.join(outdir, "phenoseries.rankdf.txt"), sep="\t")
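half_product is not defined in this snippet; from its use above it apparently yields each unordered index pair once (the upper triangle of the pair matrix, diagonal included), so every record pair is scored exactly once. A minimal sketch of a generator with that behavior, assuming that is what half_product does:

import itertools

def half_product_sketch(n, m):
    # yield (i, j) with j >= i: one ordered pair per unordered pair
    for i, j in itertools.product(range(n), range(m)):
        if j >= i:
            yield (i, j)

print(list(half_product_sketch(3, 3)))
# [(0, 0), (0, 1), (0, 2), (1, 1), (1, 2), (2, 2)]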
Example #8
def score(query_hpo_file, records_file=None, query_name='SAMPLE', obo_file=None, pheno2genes_file=None, threads=1,
          agg_score='BMA', no_parents=False, custom_annotations_file=None, output_file=None):
    """
    Scores a case's HPO terms against each gene's associated HPO terms.

    :param query_hpo_file: File with case HPO terms, one per line.
    :param records_file: One record per line, tab delimited. First column is a unique record identifier, second column
        is a pipe-separated list of HPO identifiers (e.g. HP:0000001).
    :param query_name: Unique identifier for the query file.
    :param obo_file: OBO file from https://hpo.jax.org/app/download/ontology.
    :param pheno2genes_file: Phenotypes to genes from https://hpo.jax.org/app/download/annotation.
    :param threads: Number of parallel processes to use.
    :param agg_score: The aggregation method to use for summarizing the similarity matrix between two term sets.
        Must be one of {'BMA', 'maximum'}.
    :param no_parents: If provided, scoring is done by only using the most informative nodes. All parent nodes are removed.
    :param custom_annotations_file: A custom entity-to-phenotype annotation file in the same format as tests/data/test.score-product.txt
    :param output_file: filepath where to store the results.
    """

    if agg_score not in {'BMA', 'maximum'}:
        logger.critical('agg_score must be one of {BMA, maximum}.')
        exit(1)

    if obo_file is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file provided and no "hpo:obo_file" found in the configuration file.')
            exit(1)

    if pheno2genes_file is None:
        try:
            pheno2genes_file = config.get('hpo', 'pheno2genes_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO pheno2genes_file file provided and no "hpo:pheno2genes_file" found in the configuration file.'
            )
            exit(1)

    try:
        with open(query_hpo_file, 'r') as case_fh:
            case_hpo = case_fh.read().splitlines()
    except (FileNotFoundError, PermissionError) as e:
        logger.critical(e)
        exit(1)

    # load phenotypes to genes associations
    terms_to_genes, genes_to_terms, annotations_count = load_p2g(
        pheno2genes_file, logger=logger)

    # load hpo network
    hpo_network = _load_hpo_network(
        obo_file, terms_to_genes, annotations_count, custom_annotations_file)

    # create an instance of the Scorer class
    scorer = Scorer(hpo_network)

    # multiprocessing objects
    manager = Manager()
    lock = manager.Lock()

    if no_parents is True:
        case_hpo = remove_parents(case_hpo, hpo_network)

    if records_file:
        # score the case HPO terms against each record's set of HPO terms and write the output
        logger.info(
            f'Scoring HPO terms from file: {query_hpo_file} against entities in: {records_file}')

        records = read_records_file(records_file, no_parents, hpo_network, logger=logger)

        # include the case itself in the records
        records[query_name] = case_hpo
        if not output_file:
            sys.stdout.write('\t'.join(['#query', 'entity_id', 'score']))
            sys.stdout.write('\n')
            with Pool(threads) as p:
                p.starmap(scorer.score_pairs,
                          [(records, [(query_name, record) for record in records],
                            lock, agg_score, i, threads) for i in range(threads)])
        else:
            with Pool(threads) as p:
                scored_results = p.starmap(
                    scorer.score_pairs,
                    [(records, [(query_name, record) for record in records],
                      lock, agg_score, i, threads, False) for i in range(threads)])
            scored_results = [item for sublist in scored_results for item in sublist]
            scored_results_df = pd.DataFrame(data=scored_results, columns='#query,entity_id,score'.split(','))
            scored_results_df = scored_results_df.sort_values(by='score', ascending=False)
            logger.info('Scoring completed')
            logger.info(f'Writing results to file: {output_file}')
            scored_results_df.to_csv(output_file, sep='\t', index=False)

    else:
        # score the case HPO terms against each gene's associated set of HPO terms
        logger.info(f'Scoring case HPO terms from file: {query_hpo_file}')

        # add the case terms to the genes_to_terms dict
        genes_to_terms[query_name] = case_hpo
        if not output_file:
            sys.stdout.write('\t'.join(['#query', 'gene', 'score']))
            sys.stdout.write('\n')
            # iterate over each cross-product and score the pair of records
            with Pool(threads) as p:
                p.starmap(scorer.score_pairs,
                          [(genes_to_terms, [(query_name, gene) for gene in genes_to_terms],
                            lock, agg_score, i, threads) for i in range(threads)])
        else:

            with Pool(threads) as p:
                scored_results = p.starmap(
                    scorer.score_pairs,
                    [(genes_to_terms, [(query_name, gene) for gene in genes_to_terms],
                      lock, agg_score, i, threads, False) for i in range(threads)])
            scored_results = [item for sublist in scored_results for item in sublist]
            scored_results_df = pd.DataFrame(data=scored_results, columns='#query,gene,score'.split(','))
            scored_results_df = scored_results_df.sort_values(by='score', ascending=False)
            logger.info('Scoring completed')
            logger.info(f'Writing results to file: {output_file}')
            scored_results_df.to_csv(output_file, sep='\t', index=False)
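Each worker in the Pool receives the full pair list plus its worker index i and the thread count; presumably score_pairs strides through the pairs so that worker i handles every threads-th pair. A minimal sketch of that partitioning scheme (the stride-based slicing is an assumption, not confirmed by this snippet):

pairs = [('QUERY', f'GENE{k}') for k in range(10)]
threads = 3

# worker i scores pairs[i::threads]; together the slices cover every pair exactly once
for i in range(threads):
    print(i, pairs[i::threads])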
Example #9
def score_product(records_file, obo_file=None, pheno2genes_file=None, threads=1, agg_score='BMA', no_parents=False,
                  custom_annotations_file=None):
    """
    Scores the Cartesian product of HPO terms from a list of unique records (cases, genes, diseases, etc.).

    :param records_file: One record per line, tab delimited. First column is a unique record identifier, second column
        is a pipe-separated list of HPO identifiers (e.g. HP:0000001).
    :param obo_file: OBO file from https://hpo.jax.org/app/download/ontology.
    :param pheno2genes_file: Phenotypes to genes from https://hpo.jax.org/app/download/annotation.
    :param threads: Multiprocessing threads to use [default: 1].
    :param agg_score: The aggregation method to use for summarizing the similarity matrix between two term sets.
        Must be one of {'BMA', 'maximum'}.
    :param no_parents: If provided, scoring is done by only using the most informative nodes. All parent nodes are removed.
    :param custom_annotations_file: A custom entity-to-phenotype annotation file in the same format as tests/data/test.score-product.txt
    """
    if agg_score not in {'BMA', 'maximum'}:
        logger.critical('agg_score must be one of {BMA, maximum}.')
        exit(1)

    if obo_file is None:
        try:
            obo_file = config.get('hpo', 'obo_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO OBO file provided and no "hpo:obo_file" found in the configuration file.')
            exit(1)

    if pheno2genes_file is None:
        try:
            pheno2genes_file = config.get('hpo', 'pheno2genes_file')
        except (NoSectionError, NoOptionError):
            logger.critical(
                'No HPO pheno2genes_file file provided and no "hpo:pheno2genes_file" found in the configuration file.'
            )
            exit(1)

    # load phenotypes to genes associations
    terms_to_genes, _, annotations_count = load_p2g(
        pheno2genes_file, logger=logger)

    # load hpo network
    hpo_network = _load_hpo_network(
        obo_file, terms_to_genes, annotations_count, custom_annotations_file)

    # TODO: wrap in try/except to handle missing or malformed records files
    records = read_records_file(records_file, no_parents, hpo_network, logger=logger)

    logger.info(f'Scoring product of records from file: {records_file}')

    # create an instance of the Scorer class
    scorer = Scorer(hpo_network)

    # materialize the records product as a list; a lazy itertools.product
    # generator cannot be pickled for the multiprocessing workers below
    records_product = list(itertools.product(records.keys(), repeat=2))

    # iterate over each cross-product and score the pair of records
    manager = Manager()
    lock = manager.Lock()
    with Pool(threads) as p:
        p.starmap(scorer.score_pairs, [(records, records_product,
                                        lock, agg_score, i, threads) for i in range(threads)])