# These unit tests exercise SELLightFeatureExtractor's per-entity feature
# calculators. SELLightFeatureExtractor and WikipediaSpotter come from the
# surrounding project (their import paths are not shown in this listing);
# hedged sketches of the small helpers are_same and get_iranian_entity_id
# are given inline below.
import logging

logger = logging.getLogger(__name__)


def unit_test_average_term_length_in_words():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian"
    title = ""
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    avg_term_length_by_ent_id = model.calc_average_term_length_in_words(body, entity_list, entity_id_set)
    avg_term_length = avg_term_length_by_ent_id[entity_list[0].entity_id]
    logger.info(avg_term_length)
    # "Iranian" is a single word, so the average term length is 1
    assert (are_same(avg_term_length, 1))
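
# are_same is used throughout these tests but is not defined in this listing.
# A minimal sketch, assuming it is a plain epsilon comparison; the default
# tolerance of 0.01 is an assumption based on the two-decimal expected values
# used below.
def are_same(a, b, epsilon=0.01):
    # true when a and b differ by no more than epsilon
    return abs(a - b) <= epsilon
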
def unit_test_document_length():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian, Cat Dog Australia. Bla. France, United States"
    title = "Free soap for all!"
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    document_length_by_ent_id = model.calc_document_length(body, entity_id_set)
    doc_length = document_length_by_ent_id[entity_list[0].entity_id]
    logger.info(doc_length)
    # the document length is the body's character count: len(body) == 72
    assert (doc_length == 72)
def unit_test_degree():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian, Cat Dog Australia. Bla. France, United States"
    title = "World cabbage Day"
    spotter = WikipediaSpotter()
    # TODO we need a real spotter
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        model.get_entity_saliency_list(body, title, spotter)
    degrees_by_ent_id = model.calc_degrees(entity_id_set)
    list_of_degrees = degrees_by_ent_id[entity_list[0].entity_id]
    logger.info(list_of_degrees)
    assert (list_of_degrees[0] == 0)
def unit_test_frequency():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian, Cat Dog Australia. Bla. France, United States"
    title = "Frequency is important"
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    entity_frequency_by_ent_id = model.calc_entity_frequency(body, entity_id_set, name_by_entity_id)
    frequency = entity_frequency_by_ent_id[entity_list[0].entity_id]
    logger.info(frequency)
    logger.info(entity_list)
    # "Iranian" occurs three times in the body
    assert (frequency == 3)
def unit_test_is_in_title():
    model = SELLightFeatureExtractor()
    title = "Iranian. Iranian. Iranian"
    body = "Cat Dog Australia. Bla. France, United States"
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)

    is_in_title_by_ent_id = model.calc_is_in_title(entity_list, title_entity_list)
    ut = is_in_title_by_ent_id[title_entity_list[0].entity_id]
    ub = is_in_title_by_ent_id[entity_list[0].entity_id]
    logger.info(ut)
    # the title entity is flagged as in-title; the body-only entity is not
    assert (ut is True)
    assert (ub is not True)
def unit_test_capitalization_internal(body, title, expected):
    model = SELLightFeatureExtractor()
    spotter = WikipediaSpotter()

    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        model.get_entity_saliency_list(body, title, spotter)
    capitalization_by_ent_id = model.calc_capitalization(body, entity_list, entity_id_set)
    vb = capitalization_by_ent_id[entity_list[0].entity_id]

    capitalization_by_ent_id = model.calc_capitalization(title, title_entity_list, title_entity_id_set)
    # the body entity's id is looked up in the title map: callers pass inputs
    # where the entity of interest occurs in both fields
    vt = capitalization_by_ent_id[entity_list[0].entity_id]
    v = vt or vb
    logger.info(v)
    assert (v == expected)
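
# unit_test_capitalization_internal is parameterized and its callers are not
# shown in this listing. A hypothetical invocation; the inputs and expected
# flags are illustrative assumptions, not taken from the original suite:
def unit_test_capitalization():
    unit_test_capitalization_internal("IRANIAN. bla bla bla.", "Iranian", True)
    unit_test_capitalization_internal("the Iranian. bla bla.", "Iranian", False)
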
def unit_test_first_field_position_internal(body, title, first_pos, middle_pos, last_pos, title_pos):
    model = SELLightFeatureExtractor()
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        model.get_entity_saliency_list(body, title, spotter)

    first_field_positions_by_ent_id = model.calc_first_field_positions(body, title, entity_list, entity_id_set,
                                                                       title_entity_list)

    entity_id = get_iranian_entity_id(name_by_entity_id)
    logger.info('name_by_entity_id: %s', name_by_entity_id)
    logger.info('entity_id: %s', entity_id)
    logger.info('name: %s', name_by_entity_id[entity_id])
    logger.info('first_field_positions_by_ent_id: %s', first_field_positions_by_ent_id[entity_id])

    if len(entity_list) > 0:
        assert (are_same(first_field_positions_by_ent_id[entity_id][0], first_pos, epsilon=0.04))
        assert (are_same(first_field_positions_by_ent_id[entity_id][1], middle_pos, epsilon=0.04))
        assert (are_same(first_field_positions_by_ent_id[entity_id][2], last_pos, epsilon=0.04))
        assert (are_same(first_field_positions_by_ent_id[entity_id][3], title_pos, epsilon=0.04))
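
# get_iranian_entity_id is not defined in this listing. A minimal sketch,
# assuming it scans the name map for the entity whose name mentions Iran
# (the lookup rule is inferred from the call sites and is an assumption):
def get_iranian_entity_id(name_by_entity_id):
    for entity_id, name in name_by_entity_id.items():
        if 'Iran' in name:
            return entity_id
    return None
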
def unit_test_uppercase_ratio():
    model = SELLightFeatureExtractor()

    body = "Iranian. Iranian. Iranian"
    title = ""
    spotter = WikipediaSpotter()

    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    uppercase_ratio_by_ent_id = model.calc_uppercase_ratio(body, entity_list, entity_id_set)
    ur = uppercase_ratio_by_ent_id[entity_list[0].entity_id]
    logger.info(ur)
    # one uppercase letter among the seven letters of "Iranian": 1/7 ~= 0.14
    assert (are_same(ur, 0.14))

    body = "the IRANIAN. bla bla bla. bla.  "
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    uppercase_ratio_by_ent_id = model.calc_uppercase_ratio(body, entity_list, entity_id_set)

    entity_id = get_iranian_entity_id(name_by_entity_id)

    logger.info('entity: %s', name_by_entity_id[entity_id])
    ur = uppercase_ratio_by_ent_id[entity_id]
    logger.info(ur)
    # "IRANIAN" is fully uppercase, so the ratio is 1.0
    assert (are_same(ur, 1.0))
def unit_test_field_frequency():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian"
    title = "Iranian. Iranian."
    spotter = WikipediaSpotter()
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)

    field_frequency_by_ent_id = model.calc_field_frequency(body, entity_list, title_entity_list)
    v = field_frequency_by_ent_id[entity_list[0].entity_id]
    logger.info(v)
    # per-field frequencies; from the two cases in this test the layout
    # appears to be [opening sentences, middle, closing sentences, title]
    assert (are_same(v[0], 3))
    assert (are_same(v[1], 0))
    assert (are_same(v[2], 0))
    assert (are_same(v[3], 2))

    body = "bla. bla. bla. Iranian. Iranian. Iranian"
    title = ""
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)

    field_frequency_by_ent_id = model.calc_field_frequency(body, entity_list, title_entity_list)

    entity_id = get_iranian_entity_id(name_by_entity_id)
    v = field_frequency_by_ent_id[entity_id]
    logger.info(v)
    assert (are_same(v[0], 0))
    assert (are_same(v[1], 0))
    assert (are_same(v[2], 3))
    assert (are_same(v[3], 0))
def unit_test_sentence_positions():
    model = SELLightFeatureExtractor()
    body = "Iranian. Iranian. Iranian"
    title = ""
    spotter = WikipediaSpotter()

    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    sentence_positions_by_ent_id = model.calc_sentence_positions(body, entity_list, entity_id_set)
    v = sentence_positions_by_ent_id[entity_list[0].entity_id]
    logger.info(v)
    assert (are_same(v, 0.0625))

    body = "bla bla bla bla bla bla bla bla Iranian. bla bla bla bla bla bla bla bla bla Iranian. " + \
           "bla bla bla bla bla bla bla bla Iranian"
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id,  title_entity_list, title_entity_id_set =\
        model.get_entity_saliency_list(body, title, spotter)
    sentence_positions_by_ent_id = model.calc_sentence_positions(body, entity_list, entity_id_set)

    entity_id = get_iranian_entity_id(name_by_entity_id)

    v = sentence_positions_by_ent_id[entity_id]
    logger.info(v)
    assert (are_same(v, 0.83))
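
# A hypothetical convenience runner for the tests above (not part of the
# original listing); running it assumes a working WikipediaSpotter. The
# parameterized unit_test_first_field_position_internal helper is exercised
# separately.
def run_all_unit_tests():
    unit_test_average_term_length_in_words()
    unit_test_document_length()
    unit_test_degree()
    unit_test_frequency()
    unit_test_is_in_title()
    unit_test_capitalization()
    unit_test_uppercase_ratio()
    unit_test_field_frequency()
    unit_test_sentence_positions()
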
Example #11
    # This snippet starts mid-function: the imports and the handler
    # construction were cut off above. A StreamHandler is assumed here so
    # the fragment is runnable.
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)

    INTERMEDIATE_FILE_PATH = FileLocations.get_dropbox_intermediate_path()
    heavy_model_filename = INTERMEDIATE_FILE_PATH + 'heavy_GradientBoostingRegressor.pickle'

    if os.path.isfile(heavy_model_filename):
        logger.info('loading model from %s', heavy_model_filename)
        with open(heavy_model_filename, 'rb') as handle:
            gbr_model = pickle.load(handle)
        logger.info('loaded')

        pipeline = Pipeline001(SpotlightCachingSpotter(),
                               SELLightFeatureExtractor(),
                               BinaryClassifierTrainer(),
                               HeavyFeatureExtractor(heavy_features_to_zero=[]),
                               gbr_model)

        body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages.Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision.Iran and the European Union's big three powers &mdash; Britain, Germany, and France &mdash; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions.U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs.Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States.\", 'body_sl_1': '\n<div>\nIranian representatives say negotiations with <a about="
        title = "Iran close to decision on nuclear program"

        ideal_salience_by_entity_id = {0: 0}

        calculated_saliency_by_entity_id = pipeline.process_document(-1, body, title, 'non-corpus-doc', break_early=False,
                                                                     golden_salience_by_entity_id=ideal_salience_by_entity_id)
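
        # Not in the original snippet: log the computed salience map so the
        # example produces a visible result (illustrative only).
        logger.info('calculated_saliency_by_entity_id = %s',
                    calculated_saliency_by_entity_id)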

Example #12
    def main(self, from_, to_, measurement, pipeline_portion):

        # load the data
        dd = DatasetDexter()
        document_list = dd.get_dexter_dataset()

        # process the data
        count = 0

        slcs = SpotlightCachingSpotter()
        light_features_to_zero = []
        lfe = SELLightFeatureExtractor(light_features_to_zero)
        gbrt = None  # GBRT('fred')
        ndcg = NDCG()

        min_candidates_to_pass_through = 3
        binary_classifier_threshold = 0.5
        spotter_confidence = 0.5
        corpus_name = 'dexter_fset_02_'
        break_early = False

        file_prefix = (corpus_name + '_' + str(from_) + '_to_' + str(to_) +
                       '_')
        salience_by_entity_by_doc_id = {}
        time_by_docid = {}

        light_feature_filename = (FileLocations.get_temp_path() +
                                  file_prefix + 'light_output_partial.txt')

        with open(light_feature_filename, "a") as file:
            file.write(
                '\ndocId, entity_id, golden_salience, estimated_salience, [light_features]'
            )

        for document in document_list:
            data = json.loads(document)
            docid = data['docId']

            if (count in range(from_, (to_ + 1)) and measurement == 'LINE') or \
                    (docid in range(from_, (to_ + 1)) and measurement == 'DOCID'):
                self.logger.info('_______________________________________')
                self.logger.info('Starting processing of docid = %d  line=%d ',
                                 docid, count)
                start_time = time.time()
                saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
                    data)
                body = self.extract_body(data)
                title = data['title']

                pipeline = Pipeline002(slcs, lfe, gbrt, ndcg,
                                       light_feature_filename)

                calculated_saliency_by_entity_id, golden_salience_by_entity_id, discount_sum, model_dcgs = \
                    pipeline.process_document(
                        docid,
                        body, title,
                        file_prefix, break_early=break_early,
                        golden_salience_by_entity_id=saliency_by_ent_id_golden,
                        min_candidates_to_pass_through=min_candidates_to_pass_through,
                        binary_classifier_threshold=binary_classifier_threshold,
                        spotter_confidence=spotter_confidence)

                salience_by_entity_by_doc_id[
                    docid] = calculated_saliency_by_entity_id
                self.logger.info('count = %d, docId = %d ', count, docid)
                self.logger.info('calculated_saliency_by_entity_id = %s ',
                                 str(calculated_saliency_by_entity_id))
                self.logger.info('discount_sum = %s ', str(discount_sum))
                self.logger.info('model_dcgs = %s ', str(model_dcgs))

                diff = time.time() - start_time

                time_by_docid[docid] = diff
                self.logger.info('Times taken %s', time_by_docid)
                self.logger.info('Time taken for docid=%d, time=%f', docid,
                                 diff)

            count += 1
        self.logger.info('Times taken by docid: %s', time_by_docid)

        trc = TrecReferenceCreator()
        trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
        report, ndcg, p_at = trc.get_report(
            FileLocations.get_dropbox_intermediate_path() +
            'trec_ground_truth.txt', 'x_temp')
        self.logger.info(' Trec Eval Results:\n %s', report)
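
    # extract_body and extract_saliency_by_ent_id_golden are called via self
    # above and below but are not shown. Minimal sketches, assuming the
    # Dexter JSON layout (a 'document' field list plus a 'saliency' list of
    # {'entityid', 'score'} records); the exact schema is an assumption.
    def extract_body(self, data):
        # concatenate the body paragraphs of one Dexter document record
        body = ''
        for field in data['document']:
            if field['name'].startswith('body_par'):
                body += field['value']
        return body

    def extract_saliency_by_ent_id_golden(self, data):
        # map each golden entity id to its annotated salience score
        saliency_by_ent_id = {}
        for record in data['saliency']:
            saliency_by_ent_id[record['entityid']] = record['score']
        return saliency_by_ent_id
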
    def main(self, line_number, graph_disjoint):

        # load the data
        dd = DatasetDexter()

        document_list = dd.get_dexter_dataset()
        graph_utils = GraphUtils()

        # process the data
        count = 0

        slcs = SpotlightCachingSpotter()
        lfe = SELLightFeatureExtractor()
        # bc = BinaryClassifierTrainer()
        # hfe = HeavyFeatureExtractor()
        # rt = RegressionTree()
        # ndcg = NDCG()

        document = document_list[line_number]
        data = json.loads(document)
        body = self.extract_body(data)
        title = data['title']
        docid = data['docId']

        self.logger.info('count %d', count)
        self.logger.info('docId %d', docid)
        self.logger.info('%s', title)
        self.logger.info('%s', body)

        saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
            data)

        light_features_by_ent_id, name_by_entity_id = lfe.get_feature_list_by_ent(
            body, title, slcs, False, spotter_confidence=0.5)
        v1_matrix = graph_utils.calc_v1_matrix(name_by_entity_id.keys())

        # calc various sets of entities
        golden_entity_ids = saliency_by_ent_id_golden.keys()
        spotter_entity_ids = light_features_by_ent_id.keys()
        orphan = []
        key_entity_ids = []
        key_entity_ids.extend(golden_entity_ids)
        key_entity_ids.extend(spotter_entity_ids)

        disjoint = []
        # add the linked entities the heavy stage finds
        if graph_disjoint:
            for i in range(len(v1_matrix.row)):
                from_entity_id = v1_matrix.row[i]
                to_entity_id = v1_matrix.col[i]

                if from_entity_id not in key_entity_ids:
                    disjoint.append(from_entity_id)

                if to_entity_id not in key_entity_ids:
                    disjoint.append(to_entity_id)

        all_entities = []
        all_entities.extend(disjoint)
        all_entities.extend(golden_entity_ids)
        all_entities.extend(spotter_entity_ids)

        from_list, to_list, value_list = graph_utils.get_links_totally_within(
            all_entities)
        # add self referencing links to ensure they are displayed on graph
        for entity_id in golden_entity_ids:
            if entity_id not in from_list and entity_id not in to_list:
                from_list.append(entity_id)
                to_list.append(entity_id)
                orphan.append(entity_id)

        for entity_id in spotter_entity_ids:
            if entity_id not in from_list and entity_id not in to_list:
                from_list.append(entity_id)
                to_list.append(entity_id)
                orphan.append(entity_id)

        # hard-coded display names for entities whose names were not returned
        # by the earlier stages (specific to the document used in this example)
        name_by_entity_id[31743] = 'Uranium'
        name_by_entity_id[21785] = 'Nuclear weapon'
        name_by_entity_id[9282173] = 'Israel'
        name_by_entity_id[31717] = 'United Kingdom'
        name_by_entity_id[5843419] = 'France'
        name_by_entity_id[6984] = 'Colin Powell'
        name_by_entity_id[57654] = 'Tehran'
        name_by_entity_id[31956] = 'United Nations Security Council'
        name_by_entity_id[1166971] = 'Ministry of Foreign Affairs (Iran)'
        name_by_entity_id[9239] = 'Europe'
        name_by_entity_id[11867] = 'Germany'
        name_by_entity_id[32293] = 'United States Secretary of State'
        name_by_entity_id[3434750] = 'United States'

        node_color_val_map = {}
        pos_map = {}

        # plt.get_cmap('Set1') colors:
        set1_red = 0 / 9.0
        set1_blue = 1.0 / 9.0
        set1_green = 2 / 9.0
        set1_purple = 3 / 9.0
        set1_orange = 4 / 9.0
        set1_golden = 5 / 9.0
        set1_brown = 6 / 9.0
        set1_pink = 7 / 9.0
        set1_grey = 8 / 9.0

        y = 0
        for entity_id in golden_entity_ids:
            if entity_id in spotter_entity_ids:
                node_color_val_map[entity_id] = set1_green
            else:
                node_color_val_map[entity_id] = set1_golden
            pos_map[entity_id] = (1, y)
            if entity_id in name_by_entity_id:
                pos_map[name_by_entity_id[entity_id]] = (1, y)
            y += 1

        y = 0
        for entity_id in spotter_entity_ids:
            if entity_id in golden_entity_ids:
                node_color_val_map[entity_id] = set1_green
            else:
                node_color_val_map[entity_id] = set1_blue
            if entity_id not in golden_entity_ids:
                pos_map[entity_id] = (2, y)
                if entity_id in name_by_entity_id:
                    pos_map[name_by_entity_id[entity_id]] = (2, y)
                y += 1

        c = 0
        row_count = 20
        for entity_id in disjoint:
            if entity_id not in golden_entity_ids and entity_id not in spotter_entity_ids:
                x = 3 + int(c / row_count)
                y = c % row_count
                node_color_val_map[entity_id] = set1_grey
                pos_map[entity_id] = (x, y)
                if entity_id in name_by_entity_id:
                    pos_map[name_by_entity_id[entity_id]] = (x, y)
                c += 1

        # copy the color to the names as well if present
        colored_ids = list(node_color_val_map.keys())
        # iterate over a snapshot because the loop adds name keys to the map
        for entity_id in colored_ids:
            if entity_id in name_by_entity_id:
                v = node_color_val_map[entity_id]
                node_color_val_map[name_by_entity_id[entity_id]] = v

        self.logger.info('converting to names if known')
        from_list = self.convert_using_map(from_list, name_by_entity_id)
        to_list = self.convert_using_map(to_list, name_by_entity_id)

        self.logger.info('from_list %s', from_list)
        self.logger.info('to_list %s', to_list)

        df = pd.DataFrame({'from': from_list, 'to': to_list})

        # Build your graph. nx.from_pandas_dataframe was renamed to
        # from_pandas_edgelist in NetworkX 2.x; use the old name on 1.x.
        G = nx.from_pandas_edgelist(df, 'from', 'to')

        self.make_plot_from_graph(G,
                                  node_color_val_map,
                                  pos_map,
                                  'c:/temp/out_disjoint_' +
                                  str(graph_disjoint) + '.png',
                                  graph_disjoint,
                                  cmap=plt.get_cmap('Set1'))
        count += 1
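
    # convert_using_map and make_plot_from_graph are used above but not shown
    # in this listing. Minimal sketches: the mapper swaps entity ids for names
    # where known, and the plotter is assumed to render the graph with the
    # precomputed colors and positions (signature inferred from the call site;
    # the drawing details are assumptions, not the original implementation).
    def convert_using_map(self, id_list, name_by_entity_id):
        return [name_by_entity_id.get(i, i) for i in id_list]

    def make_plot_from_graph(self, G, node_color_val_map, pos_map, filename,
                             graph_disjoint, cmap=None):
        colors = [node_color_val_map.get(node, 0.0) for node in G.nodes()]
        pos = {node: pos_map.get(node, (0, 0)) for node in G.nodes()}
        plt.figure(figsize=(16, 10))
        nx.draw(G, pos=pos, node_color=colors, cmap=cmap,
                with_labels=True, font_size=8)
        plt.savefig(filename)
        plt.close()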