Beispiel #1
0
    def test_age_weight(self):
        # Check the age-derived term weights and the resulting
        # best_match_weighted_average score.

        # annotate the network, passing the ages distribution file
        self.hpo_network = annotate(
            self.hpo_network,
            self.phenotype_to_diseases,
            self.num_diseases_annotated,
            self.alt2prim,
            ages_distribution_file=self.ages_distribution_file,
        )

        row_terms = [
            'HP:0001251',  # ATAX
            'HP:0001263',  # DD
            'HP:0001290',  # HYP
            'HP:0004322',  # SS
        ]
        col_terms = [
            'HP:0001263',  # DD
            'HP:0001249',  # ID
            'HP:0001290',  # HYP
        ]
        first_age, second_age = 9.0, 4.0

        # each term list is weighted using the age of the *other* record
        row_weights = {
            'age': calculate_age_weights(row_terms, second_age,
                                         self.hpo_network)
        }
        col_weights = {
            'age': calculate_age_weights(col_terms, first_age,
                                         self.hpo_network)
        }

        # fixed pairwise score matrix: one row per term in A, one column
        # per term in B (under a top-level 'score' column label)
        pairwise_values = [
            [4.22595743e-02, 3.92122308e-02, 3.04851573e-04],
            [1.07473687e-01, 5.05101655e-01, 3.78305515e-04],
            [3.69780479e-04, 3.78305515e-04, 4.64651944e-01],
            [4.17139800e-04, 4.12232546e-04, 3.67984322e-04],
        ]
        score_columns = pd.MultiIndex.from_arrays(
            [['score' for _ in col_terms], col_terms],
            names=[None, 'b'])
        pairwise_scores = pd.DataFrame(
            pairwise_values,
            index=pd.Index(row_terms, name='a'),
            columns=score_columns)

        # best match weighted average with the age weights
        age_weighted = self.scorer.best_match_weighted_average(
            pairwise_scores, row_weights, col_weights)
        self.assertAlmostEqual(age_weighted, 0.3741, places=4)

        # with every weight set to 1.0 the result should equal the
        # unweighted best match average
        unit_row_weights = {'disease_frequency': [1.] * len(row_terms)}
        unit_col_weights = {'disease_frequency': [1.] * len(col_terms)}
        unit_weighted = self.scorer.best_match_weighted_average(
            pairwise_scores, unit_row_weights, unit_col_weights)
        self.assertAlmostEqual(unit_weighted, 0.2985, places=4)

        # first a term that is not in the network, then a term that is in
        # the network but has no age; both are expected to get weight 1.0
        for lone_term in ('HP:Not_a_term', 'HP:0000001'):
            lone_weights = calculate_age_weights([lone_term], second_age,
                                                 self.hpo_network)
            self.assertEqual(lone_weights, [1.0])
Beispiel #2
0
    def score(self, record_a, record_b):
        """
        Scores the comparison of terms in record A to terms in record B.

        Every (term from A, term from B) pair is scored with
        ``self.score_hpo_pair_hrss`` and the pairwise scores are reduced to a
        single value according to ``self.summarization_method``.

        :param record_a: record A; read through its ``'terms'`` and
            ``'weights'`` keys and, when present, its ``'age'`` key.
        :param record_b: record B, read the same way as record A.
        :return: `float` (comparison score); 0.0 if either record has no terms.
        :raises ValueError: if ``self.summarization_method`` is not one of
            BMA, BMWA, or maximum.
        """
        if self.summarization_method not in ['BMA', 'BMWA', 'maximum']:
            raise ValueError(
                'Unsupported summarization method, please choose from BMA, BMWA, or maximum.'
            )

        # if either set is empty return 0.0
        terms_a = record_a['terms']
        terms_b = record_b['terms']
        if not terms_a or not terms_b:
            return 0.0

        # Work on copies so the records passed in are not modified. The
        # fallback must be an empty dict (not a list): the age weights below
        # are stored under the 'age' key, which a list cannot accept.
        weights_a = (record_a['weights'].copy()
                     if record_a['weights'] is not None else {})
        weights_b = (record_b['weights'].copy()
                     if record_b['weights'] is not None else {})

        # if we have age of record_a use it to set age weights for record_b
        if 'age' in record_a:
            weights_b['age'] = calculate_age_weights(terms_b,
                                                     record_a['age'],
                                                     self.hpo_network)

        # if we have age of record_b use it to set age weights for record_a
        if 'age' in record_b:
            weights_a['age'] = calculate_age_weights(terms_a,
                                                     record_b['age'],
                                                     self.hpo_network)

        # pairwise score matrix: rows are terms of A, columns are terms of B
        df = pd.DataFrame(
            [(term_a, term_b, self.score_hpo_pair_hrss(term_a, term_b))
             for term_a, term_b in itertools.product(terms_a, terms_b)],
            columns=['a', 'b', 'score']).set_index(['a', 'b']).unstack()

        if self.summarization_method == 'maximum':
            return self.maximum(df)
        # the weighted average is only used when at least one side has weights
        if self.summarization_method == 'BMWA' and (weights_a or weights_b):
            return self.best_match_weighted_average(df,
                                                    weights_a=weights_a,
                                                    weights_b=weights_b)
        return self.best_match_average(df)
Beispiel #3
0
    def score(self, record_a, record_b):
        """
        Scores the comparison of terms listed in record A to terms listed in record B.

        The score is chosen by ``self.scoring_method``:

        * ``'Jaccard'``: size of the intersection of the two term sets divided
          by the size of their union.
        * ``'word2vec'``: ``self.word_vectors.n_similarity`` over the terms of
          each record found in ``self.word_vectors.vocab``; 0.0 when either
          record has no such term.
        * anything else: every term pair is scored with
          ``self.score_hpo_pair_hrss`` and the pairwise scores are reduced
          according to ``self.summarization_method``.

        :param record_a: record A; read through its ``'record_id'``,
            ``'terms'`` and ``'weights'`` keys and, when present, ``'age'``.
        :param record_b: record B, read the same way as record A.
        :return: record_a record id, record_b record id, `float` (comparison
            score); the score is 0.0 if either record has no terms.
        :rtype: tuple
        :raises ValueError: if ``self.summarization_method`` is not one of
            BMA, BMWA, or maximum.
        """
        if self.summarization_method not in ['BMA', 'BMWA', 'maximum']:
            raise ValueError('Unsupported summarization method, please choose from BMA, BMWA, or maximum.')

        id_a = record_a['record_id']
        id_b = record_b['record_id']

        # if either set is empty return 0.0
        terms_a = record_a['terms']
        terms_b = record_b['terms']
        if not terms_a or not terms_b:
            return id_a, id_b, 0.0

        if self.scoring_method == 'Jaccard':
            # Compare as sets so a term repeated inside one record does not
            # inflate the union. Both records are non-empty here, so the
            # union cannot be empty.
            set_a = set(terms_a)
            set_b = set(terms_b)
            return id_a, id_b, float(len(set_a & set_b)) / len(set_a | set_b)

        if self.scoring_method == 'word2vec':
            in_vocab_terms_a = [x for x in terms_a if x in self.word_vectors.vocab]
            in_vocab_terms_b = [x for x in terms_b if x in self.word_vectors.vocab]

            if in_vocab_terms_a and in_vocab_terms_b:
                # return the same (id, id, score) tuple as every other branch
                return id_a, id_b, self.word_vectors.n_similarity(in_vocab_terms_a, in_vocab_terms_b)
            return id_a, id_b, 0.0

        # Work on copies so the records passed in are not modified. The
        # fallback must be an empty dict (not a list): the age weights below
        # are stored under the 'age' key, which a list cannot accept.
        weights_a = record_a['weights'].copy() if record_a['weights'] is not None else {}
        weights_b = record_b['weights'].copy() if record_b['weights'] is not None else {}

        # if we have age of record_a use it to set age weights for record_b
        if 'age' in record_a:
            weights_b['age'] = calculate_age_weights(terms_b, record_a['age'], self.hpo_network)

        # if we have age of record_b use it to set age weights for record_a
        if 'age' in record_b:
            weights_a['age'] = calculate_age_weights(terms_a, record_b['age'], self.hpo_network)

        # pairwise score matrix: rows are terms of A, columns are terms of B
        df = pd.DataFrame(
            [(term_a, term_b, self.score_hpo_pair_hrss(term_a, term_b))
             for term_a, term_b in itertools.product(terms_a, terms_b)],
            columns=['a', 'b', 'score']
        ).set_index(
            ['a', 'b']
        ).unstack()

        if self.summarization_method == 'maximum':
            return id_a, id_b, self.maximum(df)
        # the weighted average is only used when at least one side has weights
        if self.summarization_method == 'BMWA' and (weights_a or weights_b):
            return id_a, id_b, self.best_match_weighted_average(df, weights_a=weights_a, weights_b=weights_b)
        return id_a, id_b, self.best_match_average(df)