Example #1
    def build_file_train_model_produce_output(
            self, feature_names, n_gram_length, sentiment_processor, spotter,
            golden_saliency_by_entid_by_docid, dexter_dataset,
            wikipedia_dataset):
        feature_filename = FileLocations.get_dropbox_intermediate_path(
        ) + 'sentiment_simple_ngram_' + str(n_gram_length) + '.txt'
        document_to_feature_converter = SimpleSentiment(
            sentiment_processor, n_gram_length=n_gram_length)

        model_filename = FileLocations.get_dropbox_intermediate_path(
        ) + 'simple_sentiment_model_ngram_' + str(n_gram_length) + '.pickle'

        tosent_converter = SimpleGBRT(model_filename)
        test_docid_set = set(Const.TESTSET_DOCID_LIST)
        train_docid_set = set(Const.TRAINSET_DOCID_LIST)
        salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
            spotter, golden_saliency_by_entid_by_docid, feature_filename,
            document_to_feature_converter, tosent_converter, test_docid_set,
            train_docid_set)
        # if not os.path.isfile(model_filename):
        # build model
        self.train_model(feature_filename, feature_names, dexter_dataset,
                         wikipedia_dataset, model_filename)

        trc = TrecReferenceCreator()
        prefix = str(n_gram_length) + '_n_gram_x_temp'
        trc.create_results_file(salience_by_entity_by_doc_id, prefix)
        report, ndcg, trec_by_id = trc.get_report(
            FileLocations.get_dropbox_intermediate_path() +
            'trec_ground_truth.txt', prefix)
        trc.logger.info('\nTrec Eval Results:\n%s', report)

        return salience_by_entity_by_doc_id, ndcg, trec_by_id
Example #2
    def get_report(self, golden_source_filename, prefix):
        cmd = FileLocations.get_trec_eval_executable_location()
        param1 = golden_source_filename
        param2 = FileLocations.get_temp_path() + prefix + ".trec_results.txt"
        self.logger.info('%s %s %s  ', cmd, param1, param2)
        output = check_output([cmd, param1, param2])
        s = output.decode("utf-8").split('\n')

        trec_val_by_name = {}
        for line in s:
            s_list = line.split('\t')
            if len(s_list) >= 3:
                n = s_list[0].strip()
                v = s_list[2]
                if self.isfloat(v):
                    trec_val_by_name[n] = float(v)

        overall_ndcg, result1 = self.extract_single_measure(
            cmd, param1, param2, trec_val_by_name, "ndcg")
        p_1, result2 = self.extract_single_measure(cmd, param1, param2,
                                                   trec_val_by_name, "P.1")
        p_2, result3 = self.extract_single_measure(cmd, param1, param2,
                                                   trec_val_by_name, "P.2")
        p_3, result4 = self.extract_single_measure(cmd, param1, param2,
                                                   trec_val_by_name, "P.3")
        p_4, result5 = self.extract_single_measure(cmd, param1, param2,
                                                   trec_val_by_name, "P.4")

        result = output.decode("utf-8") + result1.decode(
            "utf-8") + result2.decode("utf-8") + result3.decode(
                "utf-8") + result4.decode("utf-8") + result5.decode("utf-8")
        self.logger.debug('%s', result)

        return result, overall_ndcg, trec_val_by_name
Example #3
 def train_model_using_dexter_dataset(self, sentiment_processor, spotter,
                                      afinn_filename):
     dexter_json_doc_list = self.dd.get_dexter_dataset(
         FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
     self.logger.info('building list of n-grams')
     ngram_list = []
     for n_gram_length in range(2, 10):
         for json_doc in dexter_json_doc_list:
             data = json.loads(json_doc)
             # pprint.pprint(data)
             body = self.extract_body(data)
             title = data['title']
             title_entities = spotter.get_entity_candidates(title, 0.5)
             for e in title_entities:
                 n_gram = sentiment_processor.get_ngram(
                     title, n_gram_length, e.start_char, e.end_char)
                 ngram_list.append(n_gram)
             body_entities = spotter.get_entity_candidates(body, 0.5)
             for e in body_entities:
                 n_gram = sentiment_processor.get_ngram(
                     body, n_gram_length, e.start_char, e.end_char)
                 ngram_list.append(n_gram)
     self.logger.info('processing list of n-grams')
     sentiment_processor.cal_term_weight_on_full_corpus(afinn_filename,
                                                        ngram_list,
                                                        debug_mode=1)
     self.logger.info('processing complete')
Example #4
 def get_wikipedia_link_graph_sparse_csc(self):
     filename = FileLocations.get_dropbox_wikipedia_path(
     ) + 'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle'
     self.logger.info('Loading %s', filename)
     with open(filename, 'rb') as handle:
         self.wikipedia_link_graph_sparse_csc = pickle.load(handle)
     self.logger.info('Loaded %s', filename)
     return self.wikipedia_link_graph_sparse_csc
Example #5
 def load_wikititle_id_by_id(self, filename=None):
     if self.wikititle_id_by_id is None:
         if filename is None:
             filename = FileLocations.get_dropbox_wikipedia_path(
             ) + 'wikititle_id_by_id.case_insensitive.15910478.pickle'
         self.logger.info('Loading %s', filename)
         with open(filename, 'rb') as handle:
             self.wikititle_id_by_id = pickle.load(handle)
         self.logger.info('Loaded %s', filename)
Example #6
 def load_wikipeadia_link_graph(self, link_graph_filename=None):
     if self.wikipedia_link_graph_sparse is None:
         if link_graph_filename is None:
             link_graph_filename = FileLocations.get_dropbox_wikipedia_path(
             ) + 'wikipedia_link_graph_sparse.deduped.15910478.pickle'
         if os.path.isfile(link_graph_filename):
             self.logger.info('loading wikipedia_link_graph_sparse from %s',
                              link_graph_filename)
             with open(link_graph_filename, 'rb') as handle:
                 self.wikipedia_link_graph_sparse = pickle.load(handle)
             self.logger.info('loaded')
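
Examples #4 to #6 repeat the same lazy-loading pattern: a large pickled structure is read from disk at most once and cached on the instance, so later callers reuse the in-memory copy. A minimal, generic sketch of that pattern (the class name and filename are illustrative, not part of the project):

import pickle

class CachedPickleLoader:
    def __init__(self, filename):
        self.filename = filename
        self._cached = None

    def get(self):
        # read from disk only on the first call, then serve the cached object
        if self._cached is None:
            with open(self.filename, 'rb') as handle:
                self._cached = pickle.load(handle)
        return self._cached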
Example #7
    def create_reference_file(self, zero_less_than_2):

        # load the data
        dd = DatasetDexter()
        document_list = dd.get_dexter_dataset(
            path=FileLocations.get_dropbox_dexter_path())

        results = ''
        # process the data
        result_count = 0
        doc_count = 0

        for document in document_list:
            data = json.loads(document)
            saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(
                data)
            docid = data['docId']

            sorted_list = self.get_ordered_list_from_dictionary(
                saliency_by_ent_id_golden)

            for item in sorted_list:
                entity_id = item[0]
                salience = item[1]
                if zero_less_than_2:
                    if salience < 2.0:
                        salience = 0.0
                results = results + str(docid) + ' 0 ' + str(
                    entity_id) + ' ' + str(salience) + '\n'
                result_count += 1

            self.logger.info('Documents Processed %d Entities Processed %d ',
                             doc_count, result_count)
            doc_count += 1

        fn = FileLocations.get_dropbox_intermediate_path(
        ) + "trec_ground_truth.txt"
        self.logger.info('writing to %s ', fn)
        with open(fn, "w") as file:
            file.write(results)
Example #8
    def convert_link_graph_to_csr_and_csc(self):

        self.load_wikipeadia_link_graph()
        self.logger.info('converting to csr')
        csr = self.wikipedia_link_graph_sparse.tocsr()
        self.logger.info('converting to csc')
        csc = self.wikipedia_link_graph_sparse.tocsc()

        output_filename = FileLocations.get_dropbox_wikipedia_path(
        ) + 'wikipedia_link_graph_sparse_csr.deduped.15910478.pickle'
        self.logger.info('About to write %s', output_filename)
        with open(output_filename, 'wb') as handle:
            pickle.dump(csr, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)

        output_filename = FileLocations.get_dropbox_wikipedia_path(
        ) + 'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle'
        self.logger.info('About to write %s', output_filename)
        with open(output_filename, 'wb') as handle:
            pickle.dump(csc, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)
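
The conversion above is useful because the two compressed formats favour different access patterns: CSR slices rows efficiently (the out-links of a page), CSC slices columns efficiently (the in-links). A toy illustration of the same conversion, unrelated to the project's pickled graphs:

import numpy as np
from scipy import sparse

# three-node toy graph with edges 0->1, 0->2 and 2->0
coo = sparse.coo_matrix((np.array([1, 1, 1]),
                         (np.array([0, 0, 2]), np.array([1, 2, 0]))),
                        shape=(3, 3))
csr = coo.tocsr()  # fast row slicing: out-links
csc = coo.tocsc()  # fast column slicing: in-links
print(csr.getrow(0).toarray())  # [[0 1 1]]
print(csc.getcol(0).toarray())  # column vector [0, 0, 1]: node 2 links to node 0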
Example #9
    def dexter_dataset_sentiment(self, sentiment_processor, spotter,
                                 output_filename):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building list of n-grams')
        ngram_list = []

        sent_by_entity_id_by_docid = {}

        file_contents = ''
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            body = self.extract_body(data)
            title = data['title']
            docid = data['docId']

            sent_by_entity_id_by_docid[docid] = {}
            for n_gram_length in range(2, 10):
                title_entities = spotter.get_entity_candidates(title, 0.5)
                for e in title_entities:
                    n_gram = sentiment_processor.get_ngram(
                        title, n_gram_length, e.start_char, e.end_char)
                    sent = sentiment_processor.get_doc_sentiment(n_gram)
                    if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                        sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                    sent_by_entity_id_by_docid[docid][
                        e.entity_id] = sent_by_entity_id_by_docid[docid][
                            e.entity_id] + sent

                ngram_list.append(n_gram)
                body_entities = spotter.get_entity_candidates(body, 0.5)
                for e in body_entities:
                    n_gram = sentiment_processor.get_ngram(
                        body, n_gram_length, e.start_char, e.end_char)
                    sent = sentiment_processor.get_doc_sentiment(n_gram)
                    if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                        sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                    sent_by_entity_id_by_docid[docid][
                        e.entity_id] = sent_by_entity_id_by_docid[docid][
                            e.entity_id] + sent
            #log progress
            for entity_id in sent_by_entity_id_by_docid[docid].keys():
                sent = sent_by_entity_id_by_docid[docid][entity_id]

                s = '%d %d 0 0 [ %f ]' % (docid, entity_id, sent)
                self.logger.info(s)
                file_contents = file_contents + s + '\n'

        with open(output_filename, "w") as file:
            file.write(file_contents)

        self.logger.info('processing complete')
Example #10
    def get_ndcg_and_trec_eval(self, feature_filename, model_filename,
                               feature_names, docid_set, wikipediaDataset,
                               dexterDataset, per_document_ndcg):
        self.logger.info('loading model %s', model_filename)

        with open(model_filename, 'rb') as handle:
            model = pickle.load(handle)

        salience_by_entity_by_doc_id = self.get_salience_by_entity_by_doc_id(
            feature_filename, model, docid_set, feature_names, dexterDataset,
            wikipediaDataset, filter_for_interesting=False)

        trc = TrecReferenceCreator()
        prefix = 'model_runner_x_temp'
        trc.create_results_file(salience_by_entity_by_doc_id, prefix)
        overall_report, overall_ndcg, overall_trec_val_by_name = trc.get_report(FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)


        ndcg_by_docid = {}
        trec_val_by_name_by_docid = {}
        if per_document_ndcg:
            skipped = []
            for docid in docid_set:
                salience_by_entity_by_doc_id_b = {}
                if docid in salience_by_entity_by_doc_id:
                    salience_by_entity_by_doc_id_b[docid] = salience_by_entity_by_doc_id[docid]
                    trc = TrecReferenceCreator()
                    prefix = 'model_runner_x_temp'
                    trc.create_results_file(salience_by_entity_by_doc_id_b, prefix)
                    report, ndcg, trec_val_by_name = trc.get_report(FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)
                    trc.logger.info('\nTrec Eval Results:\n%s', report)
                    ndcg_by_docid[docid] = ndcg
                    trec_val_by_name_by_docid[docid] = trec_val_by_name
                else:
                    self.logger.warning('No data for docid %d, skipping',docid)
                    skipped.append(docid)
            self.logger.info('per doc ndcg : %s ', ndcg_by_docid)
            self.logger.info('skipped in the per doc ndcg : %s ', skipped)

        trc.logger.info('\n_____________________________________\nTrec Eval Results Overall:\n%s', overall_report)

        return overall_ndcg, ndcg_by_docid, overall_trec_val_by_name, trec_val_by_name_by_docid
Example #11
 def grep_articles(self):
     for docid in self.docid_set:
         self.logger.info('%s', docid)
         cmd = '/usr/bin/grep'
         param1 = docid
         param2 = FileLocations.get_dropbox_datasets_path(
         ) + 'washingtonpost/WashingtonPost/data/*.txt'
         self.logger.info('%s %s %s  ', cmd, param1, param2)
         full_cmd = cmd + ' ' + param1 + ' ' + param2 + ' >> wp.txt'
         process = subprocess.Popen(full_cmd,
                                    shell=True,
                                    stdout=subprocess.PIPE)
         process.wait()
         self.logger.info('return code %d ', process.returncode)
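
The grep call above goes through the shell so that the *.txt glob is expanded. A sketch of an equivalent step without shell=True, assuming the same docid set and WashingtonPost directory as above; glob does the expansion and grep's exit code 1 (no match) is simply ignored. This is an alternative, not the project's implementation:

import glob
import subprocess

def grep_articles_no_shell(docid_set, data_dir, out_path='wp.txt'):
    # expand the *.txt pattern ourselves instead of relying on the shell
    txt_files = glob.glob(data_dir + '*.txt')
    with open(out_path, 'a') as out:
        for docid in docid_set:
            result = subprocess.run(['/usr/bin/grep', str(docid)] + txt_files,
                                    stdout=subprocess.PIPE)
            out.write(result.stdout.decode('utf-8'))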
Example #12
    def get_only_golden_rows(self, X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset):

        dexter_json_doc_list = dexterDataset.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        golden_saliency_by_entid_by_docid = dexterDataset.get_golden_saliency_by_entid_by_docid(dexter_json_doc_list, wikipediaDataset)

        rows_in_golden = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            docid = docid_array[i]
            entity_id = entity_id_array[i]
            if docid in golden_saliency_by_entid_by_docid:
                if entity_id in golden_saliency_by_entid_by_docid[docid]:
                    rows_in_golden[i] = 1

        X_filtered = X[rows_in_golden == 1]
        y_filtered = y[rows_in_golden == 1]
        docid_array_filtered = docid_array[rows_in_golden == 1]
        entity_id_array_filtered = entity_id_array[rows_in_golden == 1]

        return X_filtered, y_filtered, docid_array_filtered, entity_id_array_filtered
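
The filtering above is plain numpy boolean masking: a 0/1 vector aligned with the rows selects the same rows from X, y and both id arrays in one step, keeping them aligned. A self-contained toy illustration:

import numpy as np

X = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
y = np.array([0.0, 2.0, 3.0])
rows_in_golden = np.array([1, 0, 1])

X_filtered = X[rows_in_golden == 1]  # keeps rows 0 and 2
y_filtered = y[rows_in_golden == 1]
print(X_filtered.shape, y_filtered)  # (2, 2) [0. 3.]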
Example #13
    def create_results_file(self, salience_by_entity_by_doc_id, prefix):
        results = ''
        lines_written = 0
        for docid in salience_by_entity_by_doc_id.keys():
            if docid in salience_by_entity_by_doc_id:
                salience_by_entity_id = salience_by_entity_by_doc_id[docid]
                ordered_list = self.get_ordered_list_from_dictionary(
                    salience_by_entity_id)
                for item in ordered_list:
                    entity_id = item[0]
                    salience = item[1]
                    results = results + str(docid) + ' 0 ' + str(
                        entity_id) + ' 0 ' + str(salience) + ' STANDARD\n'
                    lines_written += 1

        fn = FileLocations.get_temp_path() + prefix + ".trec_results.txt"
        self.logger.info('writing to %s ', fn)
        with open(fn, "w") as file:
            file.write(results)
        return lines_written
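
For reference, the two line layouts written by TrecReferenceCreator sit side by side here: create_reference_file (Example #7) writes the ground-truth file as 'docid 0 entity_id salience', and create_results_file above writes the results file as 'docid 0 entity_id 0 salience STANDARD'. A tiny illustration with invented ids and scores:

docid, entity_id = 100, 4764461
golden_salience, predicted_salience = 2.0, 0.87

ground_truth_line = '%s 0 %s %s' % (docid, entity_id, golden_salience)
results_line = '%s 0 %s 0 %s STANDARD' % (docid, entity_id, predicted_salience)

print(ground_truth_line)  # 100 0 4764461 2.0
print(results_line)       # 100 0 4764461 0 0.87 STANDARD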
Example #14
import logging
from sklearn.ensemble import GradientBoostingRegressor
from sellibrary.gbrt import GBRTWrapper
from sellibrary.text_file_loader import load_feature_matrix
from sellibrary.filter_only_golden import FilterGolden
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset

from sellibrary.locations import FileLocations
INTERMEDIATE_PATH = FileLocations.get_dropbox_intermediate_path()

# setup logging

handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.propagate = False
logger.setLevel(logging.INFO)

feature_names = [
    'v1_graph_size', 'v1_graph_diameter', 'v1_node_degree',
    'v1_degree_mean_median_ratio', 'v1_out_degree_mean_median_ratio',
    'v1_degree_mean_median_ratio', 'v1_farness', 'v1_closeness',
    'v1_centrality', 'v1_minus_low_relatedness_graph_size',
    'v1_minus_low_relatedness_graph_diameter',
    'v1_minus_low_relatedness_node_degree',
    'v1_minus_low_relatedness_degree_mean_median_ratio',
    'v1_minus_low_relatedness_out_degree_mean_median_ratio',
    'v1_minus_low_relatedness_degree_mean_median_ratio',
Example #15
                           line_kws={
                               "color": "r",
                               "alpha": 0.7,
                               "lw": 5
                           })  #
    sns_plot.set_xlabel(xlabel)
    sns_plot.set_ylabel(ylabel)
    if log_y:
        sns_plot.set_yscale('log')
    fig = sns_plot.get_figure()
    fig.savefig(filename)


if __name__ == "__main__":

    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False

    #    smb = SelModelBuilder()

    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = DatasetDexter()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)
Example #16
            self.logger.info('%s %d', docid, new_id)
            cmd = '/usr/bin/sed'

            param1 = '-i'
            param2 = ".old"
            param3 = "'s/" + docid + "/" + str(new_id) + "/g'"
            param4 = filename

            self.logger.info('%s %s %s %s %s', cmd, param1, param2, param3,
                             param4)
            full_cmd = cmd + ' ' + param1 + ' ' + param2 + ' ' + param3 + ' ' + param4
            process = subprocess.Popen(full_cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE)
            process.wait()
            self.logger.info('return code %d ', process.returncode)


if __name__ == "__main__":

    # greps all the documents in the corpus against an ID list to build a file. This
    # file needs to be manually copied to the washingtonpost directory
    app = WashingtonPostSedder()
    app.replace_docid_articles(FileLocations.get_dropbox_intermediate_path() +
                               'wp/wp_minus_0.txt')
    app.replace_docid_articles(FileLocations.get_dropbox_intermediate_path() +
                               'wp_sentiment_simple.txt')
    app.replace_docid_articles(FileLocations.get_dropbox_intermediate_path() +
                               'wp_base_tf_simple_v2.txt')
Example #17
                                        verbose=0,
                                       warm_start=False)

        forest = forest.fit(X_train, y_train)

        print('oob score ' + str(forest.oob_score_))
        with open(model_filename, 'wb') as handle:
            pickle.dump(forest, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":

    use_dexter_dataset = False
    use_wahington_post_dataset = True
    output_filename = 'base_tf_simple_v2.txt'
    model_filename = FileLocations.get_dropbox_intermediate_path(
    ) + 'simple_tf.pickle'
    train_model = False
    filter_for_interesting = False
    train_docid_set = None  # == ALL - filtered later
    train_docid_set = set(Const.TRAINSET_DOCID_LIST).union(
        Const.TESTSET_DOCID_LIST)
    report_docid_set = None  #set(Const.TESTSET_DOCID_LIST).union(Const.TRAINSET_DOCID_LIST) # filters the outputfile - add Train data to get a full output files

    if (use_wahington_post_dataset):
        output_filename = 'wp_' + output_filename
    output_filename = FileLocations.get_dropbox_intermediate_path(
    ) + output_filename

    document_to_feature_converter = DocToTermFreqConverter()
    handler = logging.StreamHandler()
    handler.setFormatter(
Example #18
            for entity_id in all_heavy_features_by_entity_id.keys():
                output = '{0},{1},{2},{3},{4}\n'.format(
                    str(optional_docid), str(entity_id), str('?'), str('?'),
                    str(all_heavy_features_by_entity_id[entity_id]))
                file.write(output)
            file.close()

        return features_by_entity_id


if __name__ == "__main__":

    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    sfe = SelFeatureExtractor(spotter,
                              binary_classifier_threshold=0.5,
                              min_candidates_to_pass_through=5,
                              binary_classifier=None,
                              light_feature_filename=None,
                              heavy_feature_filename=None,
                              num_light_features=23,
                              break_early=False)
Example #19

def extract_body(data):
    body = ''
    for d in data['document']:
        if d['name'].startswith('body_par_'):
            body = body + d['value']
    return body


if __name__ == "__main__":

    dd = DatasetDexter()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(
        path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)
    golden_saliency_by_entid_by_docid = dd.get_golden_saliency_by_entid_by_docid(
        document_list, wikipediaDataset)

    entities_per_doc = []
    high_salience_per_doc = []
    salience_list = []
    for docid in golden_saliency_by_entid_by_docid.keys():
        entities_per_doc.append(len(golden_saliency_by_entid_by_docid[docid]))
        salience_list.extend(golden_saliency_by_entid_by_docid[docid].values())
        count = 0
        for x in golden_saliency_by_entid_by_docid[docid].values():
            if x >= 2:
                count += 1
        high_salience_per_doc.append(count)
Example #20
    def build_output_using_dexter_dataset(self, spotter,
                                          golden_saliency_by_entid_by_docid,
                                          output_filename, docid_set,
                                          use_rand_values):
        dexter_json_doc_list = self.dd.get_dexter_dataset(
            FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
        self.logger.info('building features')

        if output_filename is not None:
            file = open(output_filename, "w")
        else:
            file = None

        salience_by_entity_by_doc_id = {}
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            # pprint.pprint(data)
            docid = data['docId']

            if docid_set is None or docid in docid_set:

                salience_by_entity_by_doc_id[docid] = {}
                body = self.extract_body(data)
                title = data['title']
                title_entities = spotter.get_entity_candidates(title, docid)
                body_entities = spotter.get_entity_candidates(body, docid)

                features_by_entity_id = {}

                for e in title_entities:
                    golden = 0  # default when no golden label exists for this entity
                    if docid in golden_saliency_by_entid_by_docid:
                        if e.entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                e.entity_id]
                    if use_rand_values:
                        features_by_entity_id[e.entity_id] = [random.random()]
                    else:
                        features_by_entity_id[e.entity_id] = [golden]
                for e in body_entities:
                    golden = 0  # default when no golden label exists for this entity
                    if docid in golden_saliency_by_entid_by_docid:
                        if e.entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                e.entity_id]
                    if use_rand_values:
                        features_by_entity_id[e.entity_id] = [random.random()]
                    else:
                        features_by_entity_id[e.entity_id] = [golden]

                for entity_id in features_by_entity_id.keys():
                    golden = 0
                    if docid in golden_saliency_by_entid_by_docid:
                        if entity_id in golden_saliency_by_entid_by_docid[
                                docid]:
                            golden = golden_saliency_by_entid_by_docid[docid][
                                entity_id]

                    line = str(docid) + ',' + str(entity_id) + ',' + str(
                        golden) + ',0,' + str(features_by_entity_id[entity_id])

                    if file is not None:
                        file.write(line)
                        file.write('\n')

                    sentiment = features_by_entity_id[entity_id][0]
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment
                    self.logger.debug('sent %f', sentiment)

        if file is not None:
            file.close()
            self.logger.info('written to %s', output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id
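
The feature file written by build_output_using_dexter_dataset has one comma-separated line per (document, entity) pair: docid, entity_id, golden salience, a constant 0, then the feature list rendered as a Python list literal. A small illustration with made-up values:

docid, entity_id, golden = 100, 4764461, 2
features = [0.42]

line = str(docid) + ',' + str(entity_id) + ',' + str(golden) + ',0,' + str(features)
print(line)  # 100,4764461,2,0,[0.42]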
Example #21

    # ___________Entry Point To Class________________________________________________

    def get_feature_list_by_ent(self, body, title, spotter, very_light=False, docid = -1):
        entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
            self.get_entity_saliency_list(body, title, spotter, very_light, docid)
        return features_by_ent_id, name_by_entity_id

    # ___________________________________________________________

if __name__ == "__main__":

    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(),'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    light_feature_calculator = SELLightFeatureCalculator()

    combiner = SELLightFeatureCombiner(light_feature_calculator)
    features_by_ent_id, name_by_entity_id = combiner.get_feature_list_by_ent(body, title, spotter, very_light=False, docid = 2)

    logger = logging.getLogger(__name__)
    logger.info(features_by_ent_id)
    logger.info(name_by_entity_id)
Example #22
import sys

from sellibrary.locations import FileLocations
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.sentiment.sentiment import SentimentProcessor
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset
from sellibrary.converters.tosentiment.simple_gbrt import SimpleGBRT
from sellibrary.converters.tosentiment.sel_features_to_sentiment import SelFeatToSent
from sellibrary.trec.trec_util import TrecReferenceCreator
from sellibrary.util.s3_util import AWSUtil
from sellibrary.main.main_build_sel_model import SelModelBuilder


if __name__ == "__main__":
    min_number = int(sys.argv[1])
    max_number = int(sys.argv[2])

    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False
    break_early = False
    aws_util = AWSUtil()
    smb = SelModelBuilder()


    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = smb.get_dexter_datset()
    wikipediaDataset = WikipediaDataset()
Example #23
        'freq in last 3 sentences of body ',  # 4
        'freq in title ',  # 4
        'one occurrence capitalised',  # 5
        'maximum fraction of uppercase letters',  # 6
        'average spot length in words',  # 8.1 :
        'average spot length in characters',  # 8.2 :
        'is in title',  # 11 :
        'unambiguous entity frequency',  # 14 : 1 entity frequency feature
        'entity in_degree in wikipeada',  # 20 :
        'entity out_degree in wikipeada',  # 20 :
        'entity degree in wikipeada',  # 20 :
        'document length',  # 22 :
    ]

    X, y, docid_array, entity_id_array = load_feature_matrix(
        feature_filename=FileLocations.get_dropbox_intermediate_path() + 'dexter_fset_02__1_to_604_light_output_all.txt',
        feature_names=feature_names,
        entity_id_index=1,
        y_feature_index=2, first_feature_index=4, number_features_per_line=27,
        tmp_filename='/tmp/temp_conversion_file.txt'
    )

    # train only on records we have a golden salience for
    fg = FilterGolden()
    # dexterDataset and wikipediaDataset are assumed to be the DatasetDexter and
    # WikipediaDataset instances built earlier in this script (truncated above)
    X2, y2, docid2, entityid2 = fg.get_only_golden_rows(
        X, y, docid_array, entity_id_array,
        dexterDataset=dexterDataset, wikipediaDataset=wikipediaDataset)

    wrapper = GBRTWrapper()

    splitter = DataSplitter()

    wrapper.train_model(X2, y2, entityid2, 0, n_estimators=40)
Example #24
 def get_dexter_dataset(self, path=None, filename='short.json'):
     if path is None:
         path = FileLocations.get_dropbox_wikipedia_path()
     with open(path + filename) as f:
         content = f.readlines()
     return content
Example #25
                        file.write('\n')

                    sentiment = features_by_entity_id[entity_id][0]
                    salience_by_entity_by_doc_id[docid][entity_id] = sentiment
                    self.logger.debug('sent %f', sentiment)

        if file is not None:
            file.close()
            self.logger.info('written to %s', output_filename)
        self.logger.info('processing complete')

        return salience_by_entity_by_doc_id


if __name__ == "__main__":
    filename = FileLocations.get_dropbox_intermediate_path(
    ) + 'sentiment.pickle'
    smb = PWModelBuilder()

    sentiment_processor = SentimentProcessor()
    sentiment_processor.load_model(filename)

    phrase = ' one iraq three'
    # sent = sentiment_processor.get_doc_sentiment(phrase)
    # print(sent, phrase)

    smb.get_feature_list(sentiment_processor, ' one iraq three')
    smb.get_feature_list(sentiment_processor, 'abandon')
    smb.get_feature_list(sentiment_processor, 'outstanding')
    smb.get_feature_list(sentiment_processor, 'appeases')
    smb.get_feature_list(sentiment_processor, 'superb')
    smb.get_feature_list(sentiment_processor, 'prick')
Example #26
                s = '%d %d 0 0 [ %f ]' % (docid, entity_id, sent)
                self.logger.info(s)
                file_contents = file_contents + s + '\n'

        with open(output_filename, "w") as file:
            file.write(file_contents)

        self.logger.info('processing complete')

    # def train_and_save_model(self, filename):
    #     spotter = SpotlightCachingSpotter(False)
    #     afinn_filename = '../sellibrary/resources/AFINN-111.txt'
    #     sentiment_processor = SentimentProcessor()
    #     self.train_model_using_dexter_dataset(sentiment_processor, spotter, afinn_filename)
    #     sentiment_processor.save_model(filename)
    #     return sentiment_processor


if __name__ == "__main__":
    filename = FileLocations.get_dropbox_intermediate_path(
    ) + 'sentiment.pickle'

    dexter_feeder = DexterFeeder()
    spotter = SpotlightCachingSpotter(False)
    sentiment_processor = SentimentProcessor()
    sentiment_processor.load_model(filename)

    dexter_feeder.dexter_dataset_sentiment(sentiment_processor, spotter,
                                           '/tmp/sentiment_output.txt')
Example #27
    def extract_graph_from_compressed(self, wikititle_to_id_filename=None):
        self.logger.warning('running extract_graph_from_compressed().')
        self.logger.warning(
            '[this takes about 2hr 20 min on Dwane\'s home machine]')
        input_file = gzip.open(FileLocations.get_dropbox_wikipedia_path() +
                               'wikipedia-dump.json.gz',
                               'rt',
                               encoding='utf-8')
        if wikititle_to_id_filename is not None:
            fn = wikititle_to_id_filename
        else:
            fn = FileLocations.get_dropbox_wikipedia_path(
            ) + 'wikititle_marisa_trie.case_insensitive.15910478.pickle'
        self.logger.warning(
            ' %s needs to be complete for these results to make most sense',
            fn)
        self.get_wikititle_case_insensitive_marisa_trie()

        count = 0
        line = '{}'
        from_list = []
        to_list = []
        value_list = []
        max_id = 0

        while count < 25000000 and line is not None and line != '':
            count += 1
            early_log = count <= 50000 and count % 10000 == 0
            late_log = count > 50000 and count % 1000000 == 0
            if early_log or late_log:
                self.logger.info('%d lines processed', count)
                output_filename = FileLocations.get_temp_path(
                ) + 'wikipedia_link_graph_sparse.deduped.' + str(
                    count) + '.pickle'
                self.logger.info('saving file %s', output_filename)
                row = np.array(from_list)
                col = np.array(to_list)
                data = np.array(value_list)
                mtx = sparse.coo_matrix((data, (row, col)),
                                        shape=(max_id + 1, max_id + 1))
                self.logger.info('About to write %s', output_filename)
                with open(output_filename, 'wb') as handle:
                    pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
                self.logger.info('file written = %s', output_filename)

            line = input_file.readline()
            if line != '':
                try:
                    data = json.loads(line)
                except json.decoder.JSONDecodeError as e:
                    self.logger.warning(
                        "type error decoding json: json = %s, error = %s",
                        line, str(e))
                    break
                # pprint.pprint(data)
                if 'links' in data:
                    fid = data['wid']
                    if self.get_wikititle_id_from_id(fid)[0][0] != fid:
                        self.logger.info(
                            '%s -> %s ', fid,
                            self.get_wikititle_id_from_id(fid)[0][0])
                        fid = self.get_wikititle_id_from_id(fid)[0][0]

                    if fid > max_id:
                        max_id = fid
                    for link in data['links']:
                        link_name = link[
                            'id']  # this is not numeric, has underscores, matches WikiTitle

                        if link_name in self.wikititle_marisa_trie:
                            link_list = self.wikititle_marisa_trie[link_name]
                            link_cid = link_list[0][0]

                            if link_cid > max_id:
                                max_id = link_cid

                            # d['type'] = link['type'] # do we care about link type? assuming no
                            from_list.append(fid)
                            to_list.append(link_cid)
                            value_list.append(1)

        self.logger.info('%d lines processed', count)
        output_filename = FileLocations.get_temp_path(
        ) + 'wikipedia_link_graph_sparse.deduped.' + str(count) + '.pickle'
        self.logger.info('saving file %s', output_filename)
        row = np.array(from_list)
        col = np.array(to_list)
        data = np.array(value_list)
        mtx = sparse.coo_matrix((data, (row, col)),
                                shape=(max_id + 1, max_id + 1))
        self.logger.info('About to write %s', output_filename)
        with open(output_filename, 'wb') as handle:
            pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
        self.logger.info('file written = %s', output_filename)
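
The link graph above is accumulated as three parallel lists (source id, target id, weight) and periodically materialised as a scipy COO matrix. A toy version of that construction; because the ids are used directly as matrix indices, the shape must be max_id + 1 in each dimension:

import numpy as np
from scipy import sparse

from_list = [0, 2, 2]
to_list = [2, 1, 3]
value_list = [1, 1, 1]

max_id = max(max(from_list), max(to_list))
mtx = sparse.coo_matrix((np.array(value_list),
                         (np.array(from_list), np.array(to_list))),
                        shape=(max_id + 1, max_id + 1))
print(mtx.toarray())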
Example #28
        return salience_by_entity_by_doc_id



if __name__ == "__main__":

    min_docid = int(sys.argv[1])
    max_docid = int(sys.argv[2])
    break_early = False
    if len(sys.argv) >= 4:
        break_early = sys.argv[3].lower() == 'break_early'
        if break_early:
            SelModelBuilder.logger.warning('Break early is set to True')


    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False
    aws_util = AWSUtil()
    smb = SelModelBuilder()


    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = smb.get_dexter_datset()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)
Example #29
from sellibrary.text_file_loader import join_feature_matrix

from sellibrary.text_file_loader import load_feature_matrix
from sellibrary.locations import FileLocations
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset
from sellibrary.filter_only_golden import FilterGolden
from sellibrary.util.const import Const

if __name__ == "__main__":

    const = Const()
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()

    file_A_feature_names = const.get_sel_feature_names()
    filename_A = dropbox_intermediate_path + 'wp/wp.txt'

    file_B_feature_names = const.sent_feature_names
    filename_B = dropbox_intermediate_path + 'wp_sentiment_simple.txt'  #'base_tf_simple_v2.txt'

    output_filename = dropbox_intermediate_path + 'wp_joined.txt'  #'joined_sel_sent_and_tf.txt'

    # Load File A
    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=filename_A,
        feature_names=file_A_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_A_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')
Example #30
import boto3
import botocore
from sellibrary.locations import FileLocations

s3_client = boto3.client('s3')

# Call S3 to list current buckets
response = s3_client.list_buckets()

# Get a list of all bucket names from the response
buckets = [bucket['Name'] for bucket in response['Buckets']]
# Print out the bucket list
print("Bucket List: %s" % buckets)

BUCKET_NAME = 'entity-salience.rnd.signal'  # replace with your bucket name
KEY = 'my_image_in_s3.jpg'  # replace with your object key
path = FileLocations.get_dropbox_intermediate_path() + 'wp'

s3 = boto3.resource('s3')
try:
    bucket = s3.Bucket(BUCKET_NAME)
    for object in bucket.objects.all():
        print(object.key)
        # d = object.get()
        if object.key.find(
                'sel_all_features_golden_spotter.washington_post.docnum') > -1:
            i = object.key.rfind('/')
            name = path + object.key[i:]
            print(name)
            bucket.download_file(object.key, name)

except botocore.exceptions.ClientError as e: