Example no. 1
    def create_reference_file(self, zero_less_than_2):

        # load the data
        dd = DatasetDexter()
        document_list = dd.get_dexter_dataset(
            path=FileLocations.get_dropbox_dexter_path())

        results = ''
        # process the data
        result_count = 0
        doc_count = 0

        for document in document_list:
            data = json.loads(document)
            saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
                data)
            docid = data['docId']

            sorted_list = self.get_ordered_list_from_dictionary(
                saliency_by_ent_id_golden)

            for item in sorted_list:
                entity_id = item[0]
                salience = item[1]
                if zero_less_than_2:
                    if salience < 2.0:
                        salience = 0.0
                results = results + str(docid) + ' 0 ' + str(
                    entity_id) + ' ' + str(salience) + '\n'
                result_count += 1

            self.logger.info('Documents Processed %d Entities Processed %d ',
                             doc_count, result_count)
            doc_count += 1

        fn = FileLocations.get_dropbox_intermediate_path(
        ) + "trec_ground_truth.txt"
        self.logger.info('writing to %s ', fn)
        with open(fn, "w") as out_file:
            out_file.write(results)
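
The reference file produced above uses a TREC-qrels-style layout: one line per (document, entity) pair containing the docid, a literal 0, the entity id, and the (possibly zeroed) salience. A minimal sketch of that line format, with a hypothetical helper name not present in the original code:

# Hypothetical helper illustrating the qrels-style layout written above:
# "<docid> 0 <entity_id> <salience>", zeroing salience below 2.0 when requested.
def format_ground_truth_line(docid, entity_id, salience, zero_less_than_2=False):
    if zero_less_than_2 and salience < 2.0:
        salience = 0.0
    return '%s 0 %s %s\n' % (docid, entity_id, salience)

# format_ground_truth_line(12, 3434, 1.5, zero_less_than_2=True) -> '12 0 3434 0.0\n'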
Example no. 2
def train_model():
    X, y, docid_array, entity_id_array = load_feature_matrix(
        feature_filename=INTERMEDIATE_PATH +
        'dexter_all_heavy_catted_8_7_2018.txt',
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=40,
        tmp_filename='/tmp/temp_conversion_file.txt')

    # train only on records we have a golden salience for
    fg = FilterGolden()
    logger.info('X Shape = %s', X.shape)
    logger.info('y Shape = %s', y.shape)

    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()

    X2, y2, docid2, entityid2 = fg.get_only_golden_rows(
        X, y, docid_array, entity_id_array, dexter_dataset, wikipedia_dataset)

    logger.info('X2 Shape = %s', X2.shape)
    logger.info('y2 Shape = %s', y2.shape)

    wrapper = GBRTWrapper()
    gbrt = wrapper.train_model_no_split(X2, y2, n_estimators=40)
    logger.info('trained')
    # gbrt.save_model()

    # from https://shankarmsy.github.io/stories/gbrt-sklearn.html
    # One of the benefits of growing trees is that we can understand how important each of the features is
    print("Feature Importances")
    print(gbrt.feature_importances_)
    print()
    # Let's print the R-squared value for train/test. This explains how much of the variance in the data our model is
    # able to decipher.
    print("R-squared for Train: %.2f" % gbrt.score(X2, y2))
    # print ("R-squared for Test: %.2f" %gbrt.score(X_test, y_test) )
    # - See more at: https://shankarmsy.github.io/stories/gbrt-sklearn.html#sthash.JNZQbnph.dpuf
    return gbrt, X2, y2, docid2, entityid2
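
GBRTWrapper is not shown here; assuming it wraps scikit-learn's GradientBoostingRegressor, the feature_importances_ and R-squared values printed above come from the standard estimator attributes. A self-contained sketch on synthetic data, illustrative only:

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

# synthetic stand-in data: 200 rows, 5 features, target driven mainly by column 0
X_demo = np.random.rand(200, 5)
y_demo = 3.0 * X_demo[:, 0] + 0.1 * np.random.rand(200)

gbrt_demo = GradientBoostingRegressor(n_estimators=40)
gbrt_demo.fit(X_demo, y_demo)

print("Feature Importances")
print(gbrt_demo.feature_importances_)  # one importance value per feature column
print("R-squared for Train: %.2f" % gbrt_demo.score(X_demo, y_demo))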
Example no. 3
    def go(self, filename, feature_names, filter_only_golden):
        X, y, docid_array, entity_id_array = load_feature_matrix(feature_filename=filename,
                                                                 feature_names=feature_names,
                                                                 entity_id_index=1,
                                                                 y_feature_index=2,
                                                                 first_feature_index=4,
                                                                 number_features_per_line=len(feature_names) + 4,
                                                                 tmp_filename='/tmp/temp_conversion_file.txt'
                                                                 )

        # train only on records we have a golden salience for
        self.logger.info('__________________________',)
        self.logger.info('File %s', filename)
        self.logger.info('X Shape = %s', X.shape)
        self.logger.info('y Shape = %s', y.shape)

        if filter_only_golden:
            dexterDataset = DatasetDexter()
            wikipediaDataset = WikipediaDataset()
            fg = sellibrary.filter_only_golden.FilterGolden()
            X, y, docid_array, entity_id_array = fg.get_only_golden_rows(X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset)
            self.logger.info('After filtering only golden rows:')
            self.logger.info('X Shape = %s', X.shape)
            self.logger.info('y Shape = %s', y.shape)

        self.logger.info('y[1:10] %s', y[1:10])
        self.logger.info('y > 0 %s', y[y > 0.0])

        # binarise salience: scores of 2.0 or above count as salient
        y[y < 2.0] = 0
        y[y >= 2.0] = 1

        ig = self.information_gain_v2(X, y)
        self.logger.info('ig %s', ig)
        self.logger.info('ig shape %s', ig.shape)

        d = {}
        for i in range(len(feature_names)):
            d[feature_names[i]] = ig[i]

        self.sort_and_print(d)
        return d
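
information_gain_v2 itself is not shown in this example. As a hedged stand-in, scikit-learn's mutual_info_classif estimates the mutual information between each feature column and the binarised salience labels, which serves the same feature-ranking purpose:

import numpy as np
from sklearn.feature_selection import mutual_info_classif

def information_gain_sketch(X, y, salience_threshold=2.0):
    # binarise salience the same way as above: >= threshold counts as salient
    y_bin = (y >= salience_threshold).astype(int)
    return mutual_info_classif(X, y_bin)  # one score per feature column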
Example no. 4
    filename_B = dropbox_intermediate_path + 'wp_sentiment_simple.txt'  #'base_tf_simple_v2.txt'

    output_filename = dropbox_intermediate_path + 'wp_joined.txt'  #'joined_sel_sent_and_tf.txt'

    # Load File A
    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=filename_A,
        feature_names=file_A_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_A_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')

    print(y1.shape)
    dexter_dataset = DatasetDexter()
    wikipedia_dataset = WikipediaDataset()
    # fg = FilterGolden()
    # X1, y1, docid_array1, entity_id_array1 = fg.get_only_golden_rows(X1, y1, docid_array1, entity_id_array1, dexter_dataset,
    #                                                     wikipedia_dataset)

    document_list = dexter_dataset.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    golden_saliency_by_entid_by_docid = dexter_dataset.get_golden_saliency_by_entid_by_docid(
        document_list, wikipedia_dataset)

    print(y1.shape)

    # Load File B
    X2, y2, docid_array2, entity_id_array2 = load_feature_matrix(
        feature_filename=filename_B,
Example no. 5
class DexterFeeder():
    # set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)

    def __init__(self):
        self.dd = DatasetDexter()

    @staticmethod
    def extract_body(data):
        body = ''
        for d in data['document']:
            if d['name'].startswith('body_par_'):
                body = body + d['value']
        return body

    def dexter_dataset_sentiment(self, sentiment_processor, spotter,
                                 output_filename):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building list of n-grams')
        ngram_list = []

        sent_by_entity_id_by_docid = {}

        file_contents = ''
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            body = self.extract_body(data)
            title = data['title']
            docid = data['docId']

            sent_by_entity_id_by_docid[docid] = {}
            for n_gram_length in range(2, 10):
                title_entities = spotter.get_entity_candidates(title, 0.5)
                for e in title_entities:
                    n_gram = sentiment_processor.get_ngram(
                        title, n_gram_length, e.start_char, e.end_char)
                    sent = sentiment_processor.get_doc_sentiment(n_gram)
                    if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                        sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                    sent_by_entity_id_by_docid[docid][
                        e.entity_id] = sent_by_entity_id_by_docid[docid][
                            e.entity_id] + sent

                    ngram_list.append(n_gram)  # record each title n-gram
                body_entities = spotter.get_entity_candidates(body, 0.5)
                for e in body_entities:
                    n_gram = sentiment_processor.get_ngram(
                        body, n_gram_length, e.start_char, e.end_char)
                    sent = sentiment_processor.get_doc_sentiment(n_gram)
                    if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                        sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                    sent_by_entity_id_by_docid[docid][
                        e.entity_id] = sent_by_entity_id_by_docid[docid][
                            e.entity_id] + sent
            # log and record each entity's accumulated sentiment for this document
            for entity_id in sent_by_entity_id_by_docid[docid].keys():
                sent = sent_by_entity_id_by_docid[docid][entity_id]

                s = '%d %d 0 0 [ %f ]' % (docid, entity_id, sent)
                self.logger.info(s)
                file_contents = file_contents + s + '\n'

        with open(output_filename, "w") as out_file:
            out_file.write(file_contents)

        self.logger.info('processing complete')
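
The per-entity accumulation above initialises each missing key to 0 before adding the sentiment value. A collections.defaultdict expresses the same pattern more compactly; the entity ids and sentiment values below are illustrative only:

from collections import defaultdict

sent_by_entity_id = defaultdict(float)                      # missing entity ids start at 0.0
for entity_id, sent in [(42, 0.5), (42, -0.25), (7, 1.0)]:  # illustrative (entity_id, sentiment) pairs
    sent_by_entity_id[entity_id] += sent
print(dict(sent_by_entity_id))                              # {42: 0.25, 7: 1.0}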
Example no. 6
 def __init__(self):
     self.dd = DatasetDexter()
Example no. 7
            self.logger.debug('Appending heavy parameters to %s',
                              self.heavy_feature_filename)
            for entity_id in all_heavy_features_by_entity_id.keys():
                output = '{0},{1},{2},{3},{4}\n'.format(
                    str(optional_docid), str(entity_id), str('?'), str('?'),
                    str(all_heavy_features_by_entity_id[entity_id]))
                file.write(output)
            file.close()

        return features_by_entity_id


if __name__ == "__main__":

    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    sfe = SelFeatureExtractor(spotter,
                              binary_classifier_threshold=0.5,
                              min_candidates_to_pass_through=5,
                              binary_classifier=None,
                              light_feature_filename=None,
                              heavy_feature_filename=None,
                              num_light_features=23,
Example no. 8
class SelModelBuilder:
    # set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)

    def __init__(self):
        self.dd = DatasetDexter()

    @staticmethod
    def extract_body(data):
        body = ''
        for d in data['document']:
            if d['name'].startswith('body_par_'):
                body = body + d['value']
        return body

    def get_dexter_datset(self):
        return self.dd

    # noinspection PyShadowingNames
    def train_model_using_dexter_dataset(self, sentiment_processor, spotter, afinn_filename):
        dexter_json_doc_list = self.dd.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(),
                                                          'saliency-dataset.json')
        self.logger.info('building list of n-grams')
        ngram_list = []
        for n_gram_length in range(2, 10):
            for json_doc in dexter_json_doc_list:
                data = json.loads(json_doc)
                # pprint.pprint(data)
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, 0.5)
                for e in title_entities:
                    n_gram = sentiment_processor.get_ngram(title, n_gram_length, e.start_char, e.end_char)
                    ngram_list.append(n_gram)
                body_entities = spotter.get_entity_candidates(body, 0.5)
                for e in body_entities:
                    n_gram = sentiment_processor.get_ngram(body, n_gram_length, e.start_char, e.end_char)
                    ngram_list.append(n_gram)
        self.logger.info('processing list of n-grams')
        sentiment_processor.cal_term_weight_on_full_corpus(afinn_filename, ngram_list, debug_mode=1)
        self.logger.info('processing complete')

    def train_and_save_model(self, filename, spotter):
        afinn_filename = '../sellibrary/resources/AFINN-111.txt'
        sentiment_processor = SentimentProcessor()
        self.train_model_using_dexter_dataset(sentiment_processor, spotter, afinn_filename)
        sentiment_processor.save_model(filename)
        return sentiment_processor

    def get_feature_list(self, sentiment_processor, phrase):
        sent = sentiment_processor.get_doc_simple_sentiment(phrase)
        feature_list = [sent]
        feature_list.extend(sentiment_processor.get_doc_prop_pos_prob_neg(phrase))
        return feature_list

    def build_output_using_dexter_dataset(self, spotter, golden_saliency_by_entid_by_docid,
                                          output_filename, document_to_feature_converter,
                                          tosent_converter, dexter_json_doc_list,
                                          min_docid=1, max_docid=700):

        self.logger.info('building features')

        if output_filename is not None:
            file = open(output_filename, "w")
        else:
            file = None

        salience_by_entity_by_doc_id = {}
        doc_number = -1
        for json_doc in dexter_json_doc_list:
            doc_number += 1
            data = json.loads(json_doc)
            # pprint.pprint(data)
            docid = data['docId']

            s = str(docid)
            # process by docid unless it is not numeric, in which case process by doc number
            if (not s.isnumeric() and min_docid <= doc_number <= max_docid) \
                    or (s.isnumeric() and min_docid <= docid <= max_docid):

                t1 = time.time()

                salience_by_entity_by_doc_id[docid] = {}
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, docid)
                body_entities = spotter.get_entity_candidates(body, docid)

                features_by_entity_id = document_to_feature_converter.get_features(body, body_entities,
                                       title, title_entities, docid )

                for entity_id in features_by_entity_id.keys():
                    golden = 0
                    if docid in golden_saliency_by_entid_by_docid:
                        if entity_id in golden_saliency_by_entid_by_docid[docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][entity_id]

                    line = str(docid) + ',' + str(entity_id) + ',' + str(golden) + ',0,' + str(features_by_entity_id[entity_id])

                    if file is not None:
                        file.write(line)
                        file.write('\n')
                        self.logger.info('writing to %s',output_filename)
                        self.logger.info(line)

                    if tosent_converter is not None:
                        sentiment = tosent_converter.get_salient(features_by_entity_id[entity_id])
                    else:
                        sentiment = 0.0
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment

                    self.logger.debug('sent %f', sentiment)

                t2 = time.time()
                self.logger.debug('Time taken to process docid %d = %f sec', docid, (t2-t1))


        if file is not None:
            file.close()
            self.logger.info('written to %s',output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id
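
Each output line written above is comma-separated: docid, entity id, golden salience, a literal 0, and the stringified feature list. A hedged reader for that layout, assuming the feature values serialise as a Python list literal (the load_feature_matrix implementation used elsewhere is not shown):

import ast

def parse_feature_line(line):
    # split on the first four commas only, so commas inside the feature list are preserved
    docid, entity_id, golden, _zero, rest = line.rstrip('\n').split(',', 4)
    features = ast.literal_eval(rest)  # e.g. '[0.1, 0.2, 0.3]' -> [0.1, 0.2, 0.3]
    return docid, int(entity_id), float(golden), list(features)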
Example no. 9
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset
from sellibrary.util.first_model_value import FirstValueModel

if __name__ == "__main__":

    const = Const()

    x_sel_feature_names = const.get_sel_feature_names()
    print(len(x_sel_feature_names))

    INTERMEDIATE_PATH = FileLocations.get_dropbox_intermediate_path()
    per_document_ndcg = True
    docid_set = set(Const.TESTSET_DOCID_LIST)
    dd = DatasetDexter()
    wikipediaDataset = WikipediaDataset()

    # SEL GBRT
    # feature_filename = INTERMEDIATE_PATH + 'aws/all.txt'
    # feature_names = const.get_sel_feature_names()
    # model_filename = INTERMEDIATE_PATH + 'sel_golden_spotter_GradientBoostingRegressor.pickle'

    # SEL RFR
    # feature_filename = INTERMEDIATE_PATH + 'aws/all.txt'
    # feature_names = const.get_sel_feature_names() # this was different
    # model_filename = INTERMEDIATE_PATH + 'sel_golden_spotter_RF.pickle'
    # # per_document_ndcg = True

    # Sent RFR
    # feature_filename = INTERMEDIATE_PATH + 'sentiment_simple.txt' # OK
Example no. 10
        self.logger.info('processing complete')

    # def train_and_save_model(self, filename):
    #     spotter = SpotlightCachingSpotter(False)
    #     afinn_filename = '../sellibrary/resources/AFINN-111.txt'
    #     sentiment_processor = SentimentProcessor()
    #     self.train_model_using_dexter_dataset(sentiment_processor, spotter, afinn_filename)
    #     sentiment_processor.save_model(filename)
    #     return sentiment_processor


if __name__ == "__main__":
    fg = FilterGolden()

    dd = DatasetDexter()
    wd = WikipediaDataset()

    dexter_json_doc_list = dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    golden_saliency_by_entid_by_docid = dd.get_golden_saliency_by_entid_by_docid(
        dexter_json_doc_list, wd)

    # check which golden entity ids are still valid

    wikititle_by_id = wd.get_wikititle_by_id()
    not_found_count = 0
    count = 0
    multiple_wid_count = 0

    for docid in golden_saliency_by_entid_by_docid.keys():
Example no. 11
class PWModelBuilder:
    # set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)

    def __init__(self):
        self.dd = DatasetDexter()

    @staticmethod
    def extract_body(data):
        body = ''
        for d in data['document']:
            if d['name'].startswith('body_par_'):
                body = body + d['value']
        return body

    def get_dexter_datset(self):
        return self.dd

    def get_feature_list(self, sentiment_processor, phrase):
        sent = sentiment_processor.get_doc_simple_sentiment(phrase)
        feature_list = [sent]
        feature_list.extend(
            sentiment_processor.get_doc_prop_pos_prob_neg(phrase))
        return feature_list

    def build_output_using_dexter_dataset(self, spotter,
                                          golden_saliency_by_entid_by_docid,
                                          output_filename, docid_set,
                                          use_rand_values):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building features')

        if output_filename is not None:
            file = open(output_filename, "w")
        else:
            file = None

        salience_by_entity_by_doc_id = {}
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            # pprint.pprint(data)
            docid = data['docId']

            if docid_set is None or docid in docid_set:

                salience_by_entity_by_doc_id[docid] = {}
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, docid)
                body_entities = spotter.get_entity_candidates(body, docid)

                features_by_entity_id = {}

                for e in title_entities:
                    golden = 0  # default when no golden label exists for this entity
                    if docid in golden_saliency_by_entid_by_docid:
                        if e.entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                e.entity_id]
                    if use_rand_values:
                        features_by_entity_id[e.entity_id] = [random.random()]
                    else:
                        features_by_entity_id[e.entity_id] = [golden]
                for e in body_entities:
                    golden = 0  # default when no golden label exists for this entity
                    if docid in golden_saliency_by_entid_by_docid:
                        if e.entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                e.entity_id]
                    if use_rand_values:
                        features_by_entity_id[e.entity_id] = [random.random()]
                    else:
                        features_by_entity_id[e.entity_id] = [golden]

                for entity_id in features_by_entity_id.keys():
                    golden = 0
                    if docid in golden_saliency_by_entid_by_docid:
                        if entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                entity_id]

                    line = str(docid) + ',' + str(entity_id) + ',' + str(
                        golden) + ',0,' + str(features_by_entity_id[entity_id])

                    if file is not None:
                        file.write(line)
                        file.write('\n')

                    sentiment = features_by_entity_id[entity_id][0]
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment
                    self.logger.debug('sent %f', sentiment)

        if file is not None:
            file.close()
            self.logger.info('written to %s', output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id
Example no. 12
 def __init__(self):
     self.dd = DatasetDexter()
     self._model_runner = ModelRunner()
Example no. 13
class SentimentModelBuilder:
    # set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    logger = logging.getLogger(__name__)
    logger.addHandler(handler)
    logger.propagate = False
    logger.setLevel(logging.INFO)

    def __init__(self):
        self.dd = DatasetDexter()

    @staticmethod
    def extract_body(data):
        body = ''
        for d in data['document']:
            if d['name'].startswith('body_par_'):
                body = body + d['value']
        return body

    def get_dexter_datset(self):
        return self.dd

    # noinspection PyShadowingNames
    def train_model_using_dexter_dataset(self, sentiment_processor, spotter,
                                         afinn_filename):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building list of n-grams')
        ngram_list = []
        for n_gram_length in range(2, 10):
            for json_doc in dexter_json_doc_list:
                data = json.loads(json_doc)
                # pprint.pprint(data)
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, 0.5)
                for e in title_entities:
                    n_gram = sentiment_processor.get_ngram(
                        title, n_gram_length, e.start_char, e.end_char)
                    ngram_list.append(n_gram)
                body_entities = spotter.get_entity_candidates(body, 0.5)
                for e in body_entities:
                    n_gram = sentiment_processor.get_ngram(
                        body, n_gram_length, e.start_char, e.end_char)
                    ngram_list.append(n_gram)
        self.logger.info('processing list of n-grams')
        sentiment_processor.cal_term_weight_on_full_corpus(afinn_filename,
                                                           ngram_list,
                                                           debug_mode=1)
        self.logger.info('processing complete')

    def train_and_save_model(self, filename, spotter):
        afinn_filename = '../sellibrary/resources/AFINN-111.txt'
        sentiment_processor = SentimentProcessor()
        self.train_model_using_dexter_dataset(sentiment_processor, spotter,
                                              afinn_filename)
        sentiment_processor.save_model(filename)
        return sentiment_processor

    def get_feature_list(self, sentiment_processor, phrase):
        sent = sentiment_processor.get_doc_simple_sentiment(phrase)
        feature_list = [sent]
        feature_list.extend(
            sentiment_processor.get_doc_prop_pos_prob_neg(phrase))
        return feature_list

    def build_output_using_dexter_dataset(self, spotter,
                                          golden_saliency_by_entid_by_docid,
                                          output_filename,
                                          document_to_feature_converter,
                                          tosent_converter, test_docid_set,
                                          train_docid_set):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building features')

        if output_filename is not None:
            file = open(output_filename, "w")
        else:
            file = None

        line_num = 0
        salience_by_entity_by_doc_id = {}
        for json_doc in dexter_json_doc_list:
            line_num += 1
            if line_num % 100 == 0:
                self.logger.info('Processed %d lines.', line_num)
            data = json.loads(json_doc)
            # pprint.pprint(data)
            docid = data['docId']

            # if docid in test_docid_set or docid in train_docid_set:

            salience_by_entity_by_doc_id[docid] = {}
            body = self.extract_body(data)
            title = data['title']
            title_entities = spotter.get_entity_candidates(title, docid)
            body_entities = spotter.get_entity_candidates(body, docid)
            # self.logger.info('Location:A')
            features_by_entity_id = document_to_feature_converter.get_features(
                body, body_entities, title, title_entities)
            # self.logger.info('Location:B.1')
            data_matrix = None
            for entity_id in features_by_entity_id.keys():
                if data_matrix is None:
                    data_matrix = np.array(
                        features_by_entity_id[entity_id]).reshape(1, -1)
                else:
                    row = np.array(features_by_entity_id[entity_id]).reshape(
                        1, -1)
                    data_matrix = np.concatenate((data_matrix, row), axis=0)
            # self.logger.info('Location:B.2')
            sentiment_array = tosent_converter.get_salient_from_numpy_matrix(
                data_matrix)
            # self.logger.info('Location:B.3')
            i = 0
            for entity_id in features_by_entity_id.keys():
                sentiment = sentiment_array[i]
                i += 1
                golden = 0
                if docid in golden_saliency_by_entid_by_docid:
                    if entity_id in golden_saliency_by_entid_by_docid[docid]:
                        golden = golden_saliency_by_entid_by_docid[docid][
                            entity_id]
                line = str(docid) + ',' + str(entity_id) + ',' + str(
                    golden) + ',0,' + str(features_by_entity_id[entity_id])
                if file is not None:
                    file.write(line)
                    file.write('\n')

                if docid in test_docid_set:
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment
            # self.logger.info('Location:C')

        if file is not None:
            file.close()
            self.logger.info('written to %s', output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id
Example no. 14
    return worst_feature_number, reference_value, _contribution_by_feature_number_ordered, oob_test_score_by_feature_number


if __name__ == "__main__":
    trec_eval_feature_name = 'P_5'
    const = Const()
    base_set_to_supress = set()
    # 30,64,63,37,28,39,62,10,29,32,33,25,24,60,36,21,27,34,61,23,19,26,38,11,44,6, 59,45,46,35,54,42,53,55,48,41,50,49,47,43,51,56,52,57,58,40,13,31,17,14}

    list_of_feature_deltas = []
    list_of_feature_oob_scores = []
    list_of_everything_a = []
    list_of_everything_oob = []
    _wikipedia_dataset = WikipediaDataset()
    _dexter_dataset = DatasetDexter()

    for i in range(len(const.get_joined_feature_names()) - 1):
        worst_feature_num, ref_value, contribution_by_feature_number_ordered, oob_test_score_by_feature_number = find_worst_feature(
            base_set_to_supress, _wikipedia_dataset, _dexter_dataset, trec_eval_feature_name)
        if worst_feature_num != -1:
            list_of_feature_deltas.append([worst_feature_num, ref_value])
            list_of_feature_oob_scores.append([worst_feature_num, oob_test_score_by_feature_number[worst_feature_num]])
            list_of_everything_a.append(contribution_by_feature_number_ordered)
            list_of_everything_oob.append(oob_test_score_by_feature_number)
            logger.info('__________________________________________________________________________________________')
            logger.info('Results after round %d', i)
            logger.info('__________________________________________________________________________________________')
            logger.info('base_set_to_supress: %s', base_set_to_supress)
            logger.info('contribution_by_feature_number_ordered: %s', contribution_by_feature_number_ordered)
            logger.info('list of features removed and ' + trec_eval_feature_name + ' %s', list_of_feature_deltas)
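
The surrounding loop implements greedy backward feature elimination: each round it asks find_worst_feature which feature to suppress next and records the resulting scores. A generic sketch of that loop, where evaluate_without is a hypothetical stand-in for the scoring step and a higher metric value is assumed to be better:

def greedy_backward_elimination(feature_numbers, evaluate_without):
    suppressed = set()
    history = []
    for _ in range(len(feature_numbers) - 1):
        # score each remaining feature by the metric obtained when it is also suppressed
        candidates = {f: evaluate_without(suppressed | {f})
                      for f in feature_numbers if f not in suppressed}
        worst, score = max(candidates.items(), key=lambda kv: kv[1])
        suppressed.add(worst)  # dropping 'worst' hurts the metric least
        history.append((worst, score))
    return suppressed, history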