def test_kl_divergence():
    """Exercise pattern_matcher.kl_divergence on toy and file-backed input.

    First asserts that two proportional count dictionaries yield a
    divergence of exactly 0.0 when smoothing is disabled, then runs the
    smoothed variant on dictionaries with disjoint keys. Finally it prints
    the divergence between the filtered 'genre' entry of the word type
    distributions and the filtered target type distribution of one
    relation, followed by both filtered distributions.
    """
    import pattern_matcher
    counts = {'a': 10, 'b': 20}
    doubled = {'a': 20, 'b': 40}
    # Every count in `doubled` is twice the one in `counts`.
    divergence = pattern_matcher.kl_divergence(counts, doubled, smooth=False)
    assert divergence == 0.0
    # Disjoint keys with smoothing enabled; the result is not asserted, the
    # call only has to complete.
    disjoint = {'c': 990}
    divergence = pattern_matcher.kl_divergence(counts, disjoint, smooth=True)
    import data
    relation = 'user.jefft0.default_domain.virus_classification_rank.virus_classifications_at_this_rank'
    word_distributions = data.read_word_type_distributions(
        'data/word-entity-type-counts_filtered')
    word_types = word_distributions['genre']
    relation_distributions = data.read_relation_target_type_distributions(
        'data/relation-target-type-distributions')
    relation_types = relation_distributions[relation]
    # Apply the same filter settings to both distributions before comparing.
    word_types = pattern_matcher.filter_type_distribution(
        word_types, n_max=10, min_count=1)
    relation_types = pattern_matcher.filter_type_distribution(
        relation_types, n_max=10, min_count=1)
    print(pattern_matcher.kl_divergence(word_types, relation_types, alpha=0.1))
    print(relation_types)
    print(word_types)
 def init_from_config():
     """
     Build a QueryCandidateExtender from the global configuration.

     Options are read from the 'QueryCandidateExtender' and 'Alignment'
     sections of globals.config. The resources they name are loaded and
     passed to the constructor together with the SPARQL backend and a
     MediatorIndexFast instance.
     :return: the newly constructed QueryCandidateExtender
     """
     cfg = globals.config
     backend = globals.get_sparql_backend(cfg)

     def extender_option(key):
         # Look up one option of the 'QueryCandidateExtender' section.
         return cfg.get('QueryCandidateExtender', key)

     def alignment_option(key):
         # Look up one option of the 'Alignment' section.
         return cfg.get('Alignment', key)

     # Resolve all extender options before any resource is loaded.
     counts_path = extender_option('relation-counts')
     names_path = extender_option('mediator-names')
     reverse_path = extender_option('reverse-relations')
     expected_types_path = extender_option('relation-expected-types')
     target_types_path = extender_option(
         'relation-target-type-distributions')
     mediator_rel_path = extender_option('mediator-relations')
     lemmas_path = extender_option('relation-lemmas')
     words_path = extender_option('relation-words')
     mediated_words_path = extender_option('mediated-relation-words')
     type_counts_path = extender_option('word-type-counts')
     type_counts = data.read_word_type_distributions(type_counts_path)

     embeddings_path = alignment_option('word-embeddings')
     derivations_path = alignment_option('word-derivations')
     synonyms = WordembeddingSynonyms(embeddings_path)
     derivations = WordDerivations(derivations_path)

     # Load the remaining resources in the same order as before.
     mediator_rels = data.read_mediator_relations(mediator_rel_path)
     rel_counts = data.read_relation_counts(counts_path)
     med_names = data.read_mediator_names(names_path)
     med_index = MediatorIndexFast.init_from_config()
     reverse_rels = data.read_reverse_relations(reverse_path)
     expected_types = data.read_relation_expected_types(expected_types_path)
     top_words = 1000
     rel_words = data.read_relation_words(words_path,
                                          n_top_words=top_words)
     mediated_words = data.read_mediated_relation_words(
         mediated_words_path, n_top_words=top_words)
     target_type_dists = data.read_relation_target_type_distributions(
         target_types_path)
     lemmas = data.read_relation_lemmas(lemmas_path)

     return QueryCandidateExtender(
         med_index, rel_counts, med_names, mediator_rels, reverse_rels,
         expected_types, backend, rel_words, mediated_words,
         target_type_dists, synonyms, derivations, type_counts, lemmas)