def unit_test_uppercase_ratio():
    model = SELLightFeatureExtractor()

    body = "Iranian. Iranian. Iranian"
    title = ""
    spotter = WikipediaSpotter()

    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    uppercase_ratio_by_ent_id = model.calc_uppercase_ratio(body, entity_list, entity_id_set)
    ur = uppercase_ratio_by_ent_id[entity_list[0].entity_id]
    logger.info(ur)
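    # the ratio appears to be computed over the mention text: "Iranian" has 1 uppercase letter out of 7, roughly 0.143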
    assert (are_same(ur, 0.14))

    body = "the IRANIAN. bla bla bla. bla.  "
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    uppercase_ratio_by_ent_id = model.calc_uppercase_ratio(body, entity_list, entity_id_set)

    entity_id = get_iranian_entity_id(name_by_entity_id)

    logger.info('entity : %s' , name_by_entity_id[entity_id])
    ur = uppercase_ratio_by_ent_id[entity_id]
    logger.info(ur)
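    # "IRANIAN" is written entirely in uppercase, so the expected ratio is 1.0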
    assert (are_same(ur, 1.0))
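

# `are_same` is not defined in this excerpt. A minimal sketch of the assumed helper,
# inferred from how it is called in these tests (optionally with an `epsilon` tolerance);
# the default tolerance below is an assumption.
def are_same(a, b, epsilon=0.01):
    # approximate numeric comparison within a tolerance
    return abs(a - b) <= epsilon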


def unit_test_field_frequency():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian"
    title = "Iranian. Iranian."
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)

    field_frequency_by_ent_id = model.calc_field_frequency(body, entity_list, title_entity_list)
    v = field_frequency_by_ent_id[entity_list[0].entity_id]
    logger.info(v)
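    # the four counts per entity appear to cover positions within the body (v[0]..v[2]) plus the title (v[3])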
    assert (are_same(v[0], 3))
    assert (are_same(v[1], 0))
    assert (are_same(v[2], 0))
    assert (are_same(v[3], 2))

    body = "bla. bla. bla. Iranian. Iranian. Iranian"
    title = ""
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)

    field_frequency_by_ent_id = model.calc_field_frequency(body, entity_list, title_entity_list)

    entity_id = get_iranian_entity_id(name_by_entity_id)
    v = field_frequency_by_ent_id[entity_id]
    logger.info(v)
    assert (are_same(v[0], 0))
    assert (are_same(v[1], 0))
    assert (are_same(v[2], 3))
    assert (are_same(v[3], 0))
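

# `get_iranian_entity_id` is also assumed from elsewhere in the test module. A minimal
# sketch: return the id of the entity whose surface name mentions Iran (the exact
# matching rule used by the original helper is an assumption).
def get_iranian_entity_id(name_by_entity_id):
    for entity_id, name in name_by_entity_id.items():
        if 'iran' in name.lower():
            return entity_id
    return None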


def unit_test_document_length():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian, Cat Dog Australia. Bla. France, United States"
    title = "Free soap for all!"
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    document_length_by_ent_id = model.calc_document_length(body, entity_id_set)
    ub = document_length_by_ent_id[entity_list[0].entity_id]
    logger.info(ub)
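    # the body string is 72 characters long, which appears to be what calc_document_length returns per entity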
    assert (ub == 72)


def unit_test_average_term_length_in_words():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian"
    title = ""
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    avg_term_length_by_ent_id = model.calc_average_term_length_in_words(body, entity_list, entity_id_set)
    avg_len = avg_term_length_by_ent_id[entity_list[0].entity_id]
    logger.info(avg_len)
    # each "Iranian" mention is a single word, so the expected average term length is 1
    assert (are_same(avg_len, 1))


def unit_test_degree():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian, Cat Dog Australia. Bla. France, United States"
    title = "World cabbage Day"
    spotter = WikipediaSpotter()
    # TODO we need a real spotter
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        model.get_entity_saliency_list(body, title, spotter)
    degrees_by_ent_id = model.calc_degrees(entity_id_set)
    list_of_degrees = degrees_by_ent_id[entity_list[0].entity_id]
    logger.info(list_of_degrees)
    assert (list_of_degrees[0] == 0)


def unit_test_frequency():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian, Cat Dog Australia. Bla. France, United States"
    title = "Frequency is important"
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    entity_frequency_by_ent_id = model.calc_entity_frequency(body, entity_id_set, name_by_entity_id)
    ub = entity_frequency_by_ent_id[entity_list[0].entity_id]
    logger.info(ub)
    logger.info(entity_list)
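    # "Iranian" occurs three times in the body, which is the expected entity frequency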
    assert (ub == 3)


def unit_test_is_in_title():
    model = SELLightFeatureExtractor()
    title = "Iranian. Iranian. Iranian"
    body = "Cat Dog Australia. Bla. France, United States"
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)

    is_in_title_by_ent_id = model.calc_is_in_title(entity_list, title_entity_list)
    ut = is_in_title_by_ent_id[title_entity_list[0].entity_id]
    ub = is_in_title_by_ent_id[entity_list[0].entity_id]
    logger.info(ut)
    assert (ut == True)
    assert (ub != True)


def unit_test_capitalization_internal(body, title, expected):
    model = SELLightFeatureExtractor()
    spotter = WikipediaSpotter()

    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        model.get_entity_saliency_list(body, title, spotter)
    capitalization_by_ent_id = model.calc_capitalization(body, entity_list, entity_id_set)
    vb = capitalization_by_ent_id[entity_list[0].entity_id]

    capitalization_by_ent_id = model.calc_capitalization(title, title_entity_list, title_entity_id_set)
    # the title dictionary is keyed by title entities, so the body entity may be absent; default to False
    vt = capitalization_by_ent_id.get(entity_list[0].entity_id, False)
    v = vt or vb
    logger.info(v)
    assert (v == expected)


def unit_test_first_field_position_internal(body, title, first_pos, middle_pos, last_pos, title_pos):
    model = SELLightFeatureExtractor()
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        model.get_entity_saliency_list(body, title, spotter)

    first_field_positions_by_ent_id = model.calc_first_field_positions(body, title, entity_list, entity_id_set,
                                                                       title_entity_list)

    entity_id = get_iranian_entity_id(name_by_entity_id)
    logger.info('name_by_entity_id: %s ', name_by_entity_id)
    logger.info('entity_id: %s ',entity_id)
    logger.info('name: %s ',name_by_entity_id[entity_id])
    logger.info('first_field_positions_by_ent_id: %s ', first_field_positions_by_ent_id[entity_id])

    if len(entity_list) > 0:
        assert (are_same(first_field_positions_by_ent_id[entity_id][0], first_pos, epsilon=0.04))
        assert (are_same(first_field_positions_by_ent_id[entity_id][1], middle_pos, epsilon=0.04))
        assert (are_same(first_field_positions_by_ent_id[entity_id][2], last_pos, epsilon=0.04))
        assert (are_same(first_field_positions_by_ent_id[entity_id][3], title_pos, epsilon=0.04))


def unit_test_sentence_positions():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian"
    title = ""
    spotter = WikipediaSpotter()

    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    sentence_positions_by_ent_id = model.calc_sentence_positions(body, entity_list, entity_id_set)
    v = sentence_positions_by_ent_id[entity_list[0].entity_id]
    logger.info(v)
    assert (are_same(v, 0.0625))

    body = "bla bla bla bla bla bla bla bla Iranian. bla bla bla bla bla bla bla bla bla Iranian. " + \
           "bla bla bla bla bla bla bla bla Iranian"
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    sentence_positions_by_ent_id = model.calc_sentence_positions(body, entity_list, entity_id_set)

    entity_id = get_iranian_entity_id(name_by_entity_id)

    v = sentence_positions_by_ent_id[entity_id]
    logger.info(v)
    assert (are_same(v, 0.83))
    # ___________________________________________________________


    def get_entity_saliency_list(self, body, title, spotter, very_light=False, spotter_confidence=0.5):
        entity_list = spotter.get_entity_candidates(body, spotter_confidence)
        entity_id_set, name_by_entity_id = self.get_entity_set(entity_list)
        title_entity_list = spotter.get_entity_candidates(title, spotter_confidence)
        features_by_ent_id = self.calc_light_features(body, title, entity_list, entity_id_set, name_by_entity_id,
                                                      title_entity_list, very_light)
        title_entity_id_set, title_name_by_entity_id = self.get_entity_set(title_entity_list)
        return entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set

    def get_feature_list_by_ent(self, body, title, spotter, very_light=False, spotter_confidence=0.5):
        entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
            self.get_entity_saliency_list(body, title, spotter, very_light, spotter_confidence=spotter_confidence)
        return features_by_ent_id, name_by_entity_id
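

# A hypothetical usage sketch for get_feature_list_by_ent; the wrapper name and the
# confidence value are illustrative, not part of the original module.
def example_light_features(body, title):
    model = SELLightFeatureExtractor()
    spotter = WikipediaSpotter()
    features_by_ent_id, name_by_entity_id = model.get_feature_list_by_ent(
        body, title, spotter, very_light=True, spotter_confidence=0.5)
    return features_by_ent_id, name_by_entity_id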


if __name__ == "__main__":
    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""
    logger = logging.getLogger(__name__)
    model = SELLightFeatureExtractor()

    nltk_spotter = NLTKSpotter()
    wiki_spotter = WikipediaSpotter()

    entities, name_by_entity_id = model.get_saliency_by_ent(body, title, wiki_spotter)
    logger.info(entities)