def build_file_train_model_produce_output(self, feature_names, n_gram_length, sentiment_processor, spotter,
                                           golden_saliency_by_entid_by_docid, dexter_dataset, wikipedia_dataset):
    feature_filename = FileLocations.get_dropbox_intermediate_path() + 'sentiment_simple_ngram_' + str(n_gram_length) + '.txt'
    document_to_feature_converter = SimpleSentiment(sentiment_processor, n_gram_length=n_gram_length)
    model_filename = FileLocations.get_dropbox_intermediate_path() + 'simple_sentiment_model_ngram_' + str(n_gram_length) + '.pickle'
    tosent_converter = SimpleGBRT(model_filename)

    test_docid_set = set(Const.TESTSET_DOCID_LIST)
    train_docid_set = set(Const.TRAINSET_DOCID_LIST)

    salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
        spotter, golden_saliency_by_entid_by_docid, feature_filename,
        document_to_feature_converter, tosent_converter, test_docid_set, train_docid_set)

    # if not os.path.isfile(model_filename):
    # build model
    self.train_model(feature_filename, feature_names, dexter_dataset, wikipedia_dataset, model_filename)

    trc = TrecReferenceCreator()
    prefix = str(n_gram_length) + '_n_gram_x_temp'
    trc.create_results_file(salience_by_entity_by_doc_id, prefix)
    report, ndcg, trec_by_id = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)
    trc.logger.info('\nTrec Eval Results:\n%s', report)

    return salience_by_entity_by_doc_id, ndcg, trec_by_id
def get_report(self, golden_source_filename, prefix):
    cmd = FileLocations.get_trec_eval_executable_location()
    param1 = golden_source_filename
    param2 = FileLocations.get_temp_path() + prefix + ".trec_results.txt"
    self.logger.info('%s %s %s ', cmd, param1, param2)
    output = check_output([cmd, param1, param2])

    s = output.decode("utf-8").split('\n')
    trec_val_by_name = {}
    for i in range(len(s)):
        s_list = s[i].split('\t')
        if len(s_list) >= 3:  # trec_eval lines are: measure, query id, value
            n = s_list[0].strip()
            v = s_list[2]
            if self.isfloat(v):
                trec_val_by_name[n] = float(v)

    overall_ndcg, result1 = self.extract_single_measure(cmd, param1, param2, trec_val_by_name, "ndcg")
    p_1, result2 = self.extract_single_measure(cmd, param1, param2, trec_val_by_name, "P.1")
    p_2, result3 = self.extract_single_measure(cmd, param1, param2, trec_val_by_name, "P.2")
    p_3, result4 = self.extract_single_measure(cmd, param1, param2, trec_val_by_name, "P.3")
    p_4, result5 = self.extract_single_measure(cmd, param1, param2, trec_val_by_name, "P.4")

    result = (output.decode("utf-8") + result1.decode("utf-8") + result2.decode("utf-8")
              + result3.decode("utf-8") + result4.decode("utf-8") + result5.decode("utf-8"))
    self.logger.debug('%s', result)
    return result, overall_ndcg, trec_val_by_name
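# Illustration (not part of the class above): the tab-split parsing in get_report assumes
# trec_eval's "measure<TAB>query<TAB>value" output format; the sample line below is made up.
sample_line = "ndcg\tall\t0.4713"
fields = sample_line.split('\t')
if len(fields) >= 3:
    measure = fields[0].strip()
    value = float(fields[2])
    print(measure, value)  # -> ndcg 0.4713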
def train_model_using_dexter_dataset(self, sentiment_processor, spotter, afinn_filename):
    dexter_json_doc_list = self.dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    self.logger.info('building list of n-grams')
    ngram_list = []
    for n_gram_length in range(2, 10):
        for json_doc in dexter_json_doc_list:
            data = json.loads(json_doc)
            # pprint.pprint(data)
            body = self.extract_body(data)
            title = data['title']

            title_entities = spotter.get_entity_candidates(title, 0.5)
            for e in title_entities:
                n_gram = sentiment_processor.get_ngram(title, n_gram_length, e.start_char, e.end_char)
                ngram_list.append(n_gram)

            body_entities = spotter.get_entity_candidates(body, 0.5)
            for e in body_entities:
                n_gram = sentiment_processor.get_ngram(body, n_gram_length, e.start_char, e.end_char)
                ngram_list.append(n_gram)

    self.logger.info('processing list of n-grams')
    sentiment_processor.cal_term_weight_on_full_corpus(afinn_filename, ngram_list, debug_mode=1)
    self.logger.info('processing complete')
def get_wikipedia_link_graph_sparse_csc(self):
    filename = FileLocations.get_dropbox_wikipedia_path() + 'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle'
    self.logger.info('Loading %s', filename)
    with open(filename, 'rb') as handle:
        self.wikipedia_link_graph_sparse_csc = pickle.load(handle)
    self.logger.info('Loaded %s', filename)
    return self.wikipedia_link_graph_sparse_csc
def load_wikititle_id_by_id(self, filename=None):
    if self.wikititle_id_by_id is None:
        if filename is None:
            filename = FileLocations.get_dropbox_wikipedia_path() + 'wikititle_id_by_id.case_insensitive.15910478.pickle'
        self.logger.info('Loading %s', filename)
        with open(filename, 'rb') as handle:
            self.wikititle_id_by_id = pickle.load(handle)
        self.logger.info('Loaded %s', filename)
def load_wikipeadia_link_graph(self, link_graph_filename=None):
    if self.wikipedia_link_graph_sparse is None:
        if link_graph_filename is None:
            link_graph_filename = FileLocations.get_dropbox_wikipedia_path() + 'wikipedia_link_graph_sparse.deduped.15910478.pickle'
        if os.path.isfile(link_graph_filename):
            self.logger.info('loading wikipedia_link_graph_sparse from %s', link_graph_filename)
            with open(link_graph_filename, 'rb') as handle:
                self.wikipedia_link_graph_sparse = pickle.load(handle)
            self.logger.info('loaded')
def create_reference_file(self, zero_less_than_2):
    # load the data
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    results = ''

    # process the data
    result_count = 0
    doc_count = 0
    for document in document_list:
        data = json.loads(document)
        saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(data)
        docid = data['docId']

        sorted_list = self.get_ordered_list_from_dictionary(saliency_by_ent_id_golden)
        for item in sorted_list:
            entity_id = item[0]
            salience = item[1]
            if zero_less_than_2:
                if salience < 2.0:
                    salience = 0.0
            results = results + str(docid) + ' 0 ' + str(entity_id) + ' ' + str(salience) + '\n'
            result_count += 1

        self.logger.info('Documents Processed %d Entities Processed %d ', doc_count, result_count)
        doc_count += 1

    fn = FileLocations.get_dropbox_intermediate_path() + "trec_ground_truth.txt"
    self.logger.info('writing to %s ', fn)
    file = open(fn, "w")
    file.write(results)
    file.close()
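# Illustration only: create_reference_file writes qrels-style lines (query, iteration,
# document, relevance), using the Dexter docid as the TREC query and the entity id as the
# document. The ids below are made up.
docid, entity_id, salience = 1, 1830, 3.0
print(str(docid) + ' 0 ' + str(entity_id) + ' ' + str(salience))  # -> "1 0 1830 3.0"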
def convert_link_graph_to_csr_and_csc(self):
    self.load_wikipeadia_link_graph()
    self.logger.info('converting to csr')
    csr = self.wikipedia_link_graph_sparse.tocsr()
    self.logger.info('converting to csc')
    csc = self.wikipedia_link_graph_sparse.tocsc()

    output_filename = FileLocations.get_dropbox_wikipedia_path() + 'wikipedia_link_graph_sparse_csr.deduped.15910478.pickle'
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(csr, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)

    output_filename = FileLocations.get_dropbox_wikipedia_path() + 'wikipedia_link_graph_sparse_csc.deduped.15910478.pickle'
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(csc, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)
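# Standalone sketch of why both formats are saved (standard scipy behaviour, toy data):
# CSR slices rows cheaply (out-links of a page id), CSC slices columns cheaply (in-links).
import numpy as np
from scipy import sparse

toy = sparse.coo_matrix((np.ones(3), ([0, 0, 2], [1, 2, 0])), shape=(3, 3))
print(toy.tocsr().getrow(0).indices)  # out-links of node 0 -> [1 2]
print(toy.tocsc().getcol(0).indices)  # in-links of node 0  -> [2]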
def dexter_dataset_sentiment(self, sentiment_processor, spotter, output_filename):
    dexter_json_doc_list = self.dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    self.logger.info('building list of n-grams')
    ngram_list = []
    sent_by_entity_id_by_docid = {}
    file_contents = ''
    for json_doc in dexter_json_doc_list:
        data = json.loads(json_doc)
        body = self.extract_body(data)
        title = data['title']
        docid = data['docId']
        sent_by_entity_id_by_docid[docid] = {}

        for n_gram_length in range(2, 10):
            title_entities = spotter.get_entity_candidates(title, 0.5)
            for e in title_entities:
                n_gram = sentiment_processor.get_ngram(title, n_gram_length, e.start_char, e.end_char)
                sent = sentiment_processor.get_doc_sentiment(n_gram)
                if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                    sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                sent_by_entity_id_by_docid[docid][e.entity_id] = sent_by_entity_id_by_docid[docid][e.entity_id] + sent
                ngram_list.append(n_gram)

            body_entities = spotter.get_entity_candidates(body, 0.5)
            for e in body_entities:
                n_gram = sentiment_processor.get_ngram(body, n_gram_length, e.start_char, e.end_char)
                sent = sentiment_processor.get_doc_sentiment(n_gram)
                if e.entity_id not in sent_by_entity_id_by_docid[docid]:
                    sent_by_entity_id_by_docid[docid][e.entity_id] = 0
                sent_by_entity_id_by_docid[docid][e.entity_id] = sent_by_entity_id_by_docid[docid][e.entity_id] + sent

        # log progress
        for entity_id in sent_by_entity_id_by_docid[docid].keys():
            sent = sent_by_entity_id_by_docid[docid][entity_id]
            s = '%d %d 0 0 [ %f ]' % (docid, entity_id, sent)
            self.logger.info(s)
            file_contents = file_contents + s + '\n'

    file = open(output_filename, "w")
    file.write(file_contents)
    file.close()
    self.logger.info('processing complete')
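# Optional design note (sketch, not the code above): the repeated
# "if key not in dict: dict[key] = 0" accumulation can be expressed with a nested
# defaultdict; the ids below are made up.
from collections import defaultdict

sent_by_entity_id_by_docid = defaultdict(lambda: defaultdict(float))
sent_by_entity_id_by_docid[42][1830] += 0.5
print(sent_by_entity_id_by_docid[42][1830])  # -> 0.5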
def get_ndcg_and_trec_eval(self, feature_filename, model_filename, feature_names, docid_set,
                           wikipediaDataset, dexterDataset, per_document_ndcg):
    self.logger.info('loading model %s', model_filename)
    with open(model_filename, 'rb') as handle:
        model = pickle.load(handle)

    salience_by_entity_by_doc_id = self.get_salience_by_entity_by_doc_id(
        feature_filename, model, docid_set, feature_names, dexterDataset, wikipediaDataset,
        filter_for_interesting=False)

    trc = TrecReferenceCreator()
    prefix = 'model_runner_x_temp'
    trc.create_results_file(salience_by_entity_by_doc_id, prefix)
    overall_report, overall_ndcg, overall_trec_val_by_name = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)

    ndcg_by_docid = {}
    trec_val_by_name_by_docid = {}
    if per_document_ndcg:
        skipped = []
        for docid in docid_set:
            salience_by_entity_by_doc_id_b = {}
            if docid in salience_by_entity_by_doc_id:
                salience_by_entity_by_doc_id_b[docid] = salience_by_entity_by_doc_id[docid]
                trc = TrecReferenceCreator()
                prefix = 'model_runner_x_temp'
                trc.create_results_file(salience_by_entity_by_doc_id_b, prefix)
                report, ndcg, trec_val_by_name = trc.get_report(
                    FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)
                trc.logger.info('\nTrec Eval Results:\n%s', report)
                ndcg_by_docid[docid] = ndcg
                trec_val_by_name_by_docid[docid] = trec_val_by_name
            else:
                self.logger.warning('No data for docid %d, skipping', docid)
                skipped.append(docid)
        self.logger.info('per doc ndcg : %s ', ndcg_by_docid)
        self.logger.info('skipped in the per doc ndcg : %s ', skipped)

    trc.logger.info('\n_____________________________________\nTrec Eval Results Overall:\n%s', overall_report)
    return overall_ndcg, ndcg_by_docid, overall_trec_val_by_name, trec_val_by_name_by_docid
def grep_articles(self):
    for docid in self.docid_set:
        self.logger.info('%s', docid)
        cmd = '/usr/bin/grep'
        param1 = docid
        param2 = FileLocations.get_dropbox_datasets_path() + 'washingtonpost/WashingtonPost/data/*.txt'
        self.logger.info('%s %s %s ', cmd, param1, param2)
        full_cmd = cmd + ' ' + param1 + ' ' + param2 + ' >> wp.txt'
        process = subprocess.Popen(full_cmd, shell=True, stdout=subprocess.PIPE)
        process.wait()
        self.logger.info('return code %d ', process.returncode)
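# A safer variant (sketch, not the author's code): passing an argument list avoids
# shell string concatenation; glob expansion and the >> redirection are then done in Python.
import glob
import subprocess

def grep_docid_to_file(docid, glob_pattern, out_path='wp.txt'):
    files = glob.glob(glob_pattern)
    with open(out_path, 'a') as out:
        completed = subprocess.run(['/usr/bin/grep', docid] + files, stdout=out)
    return completed.returncode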
def get_only_golden_rows(self, X, y, docid_array, entity_id_array, dexterDataset, wikipediaDataset):
    dexter_json_doc_list = dexterDataset.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    golden_saliency_by_entid_by_docid = dexterDataset.get_golden_saliency_by_entid_by_docid(
        dexter_json_doc_list, wikipediaDataset)

    rows_in_golden = np.zeros(X.shape[0])
    for i in range(X.shape[0]):
        docid = docid_array[i]
        entity_id = entity_id_array[i]
        if docid in golden_saliency_by_entid_by_docid:
            if entity_id in golden_saliency_by_entid_by_docid[docid]:
                rows_in_golden[i] = 1

    X_filtered = X[rows_in_golden == 1]
    y_filtered = y[rows_in_golden == 1]
    docid_array_filtered = docid_array[rows_in_golden == 1]
    entity_id_array_filtered = entity_id_array[rows_in_golden == 1]
    return X_filtered, y_filtered, docid_array_filtered, entity_id_array_filtered
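# Minimal numpy illustration of the masking used above (toy values, not the real features):
# rows whose mask entry is 1 are kept, everything else is dropped.
import numpy as np

X_toy = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
mask = np.array([1, 0, 1])
print(X_toy[mask == 1])  # keeps rows 0 and 2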
def create_results_file(self, salience_by_entity_by_doc_id, prefix):
    results = ''
    lines_written = 0
    for docid in salience_by_entity_by_doc_id.keys():
        if docid in salience_by_entity_by_doc_id:
            salience_by_entity_id = salience_by_entity_by_doc_id[docid]
            ordered_list = self.get_ordered_list_from_dictionary(salience_by_entity_id)
            for item in ordered_list:
                entity_id = item[0]
                salience = item[1]
                results = results + str(docid) + ' 0 ' + str(entity_id) + ' 0 ' + str(salience) + ' STANDARD\n'
                lines_written += 1

    fn = FileLocations.get_temp_path() + prefix + ".trec_results.txt"
    self.logger.info('writing to %s ', fn)
    file = open(fn, "w")
    file.write(results)
    file.close()
    return lines_written
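# Illustration only: create_results_file emits standard TREC run lines
# (query, iteration, document, rank, score, run tag), again with docid as the query and
# entity_id as the document. The dict below uses made-up ids and scores.
example = {1: {1830: 2.5, 9007: 0.0}}
for docid, salience_by_entity_id in example.items():
    for entity_id, salience in salience_by_entity_id.items():
        print(str(docid) + ' 0 ' + str(entity_id) + ' 0 ' + str(salience) + ' STANDARD')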
import logging

from sklearn.ensemble import GradientBoostingRegressor
from sellibrary.gbrt import GBRTWrapper
from sellibrary.text_file_loader import load_feature_matrix
from sellibrary.filter_only_golden import FilterGolden
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset
from sellibrary.locations import FileLocations

INTERMEDIATE_PATH = FileLocations.get_dropbox_intermediate_path()

# setup logging
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.propagate = False
logger.setLevel(logging.INFO)

feature_names = [
    'v1_graph_size',
    'v1_graph_diameter',
    'v1_node_degree',
    'v1_degree_mean_median_ratio',
    'v1_out_degree_mean_median_ratio',
    'v1_degree_mean_median_ratio',
    'v1_farness',
    'v1_closeness',
    'v1_centrality',
    'v1_minus_low_relatedness_graph_size',
    'v1_minus_low_relatedness_graph_diameter',
    'v1_minus_low_relatedness_node_degree',
    'v1_minus_low_relatedness_degree_mean_median_ratio',
    'v1_minus_low_relatedness_out_degree_mean_median_ratio',
    'v1_minus_low_relatedness_degree_mean_median_ratio',
                        line_kws={"color": "r", "alpha": 0.7, "lw": 5})
    # sns_plot.set_xlabel(xlabel)
    sns_plot.set_ylabel(ylabel)
    if log_y:
        sns_plot.set_yscale('log')
    fig = sns_plot.get_figure()
    fig.savefig(filename)


if __name__ == "__main__":
    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False

    # smb = SelModelBuilder()
    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = DatasetDexter()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)
        self.logger.info('%s %d', docid, new_id)
        cmd = '/usr/bin/sed'
        param1 = '-i'
        param2 = ".old"
        param3 = "'s/" + docid + "/" + str(new_id) + "/g'"
        param4 = filename
        self.logger.info('%s %s %s %s %s', cmd, param1, param2, param3, param4)
        full_cmd = cmd + ' ' + param1 + ' ' + param2 + ' ' + param3 + ' ' + param4
        process = subprocess.Popen(full_cmd, shell=True, stdout=subprocess.PIPE)
        process.wait()
        self.logger.info('return code %d ', process.returncode)


if __name__ == "__main__":
    # greps all the documents in the corpus against an ID list to build a file. This
    # file needs to be manually copied to the washingtonpost directory
    app = WashingtonPostSedder()
    app.replace_docid_articles(FileLocations.get_dropbox_intermediate_path() + 'wp/wp_minus_0.txt')
    app.replace_docid_articles(FileLocations.get_dropbox_intermediate_path() + 'wp_sentiment_simple.txt')
    app.replace_docid_articles(FileLocations.get_dropbox_intermediate_path() + 'wp_base_tf_simple_v2.txt')
                                   verbose=0, warm_start=False)
    forest = forest.fit(X_train, y_train)
    print('oob score ' + str(forest.oob_score_))
    with open(model_filename, 'wb') as handle:
        pickle.dump(forest, handle, protocol=pickle.HIGHEST_PROTOCOL)


if __name__ == "__main__":
    use_dexter_dataset = False
    use_wahington_post_dataset = True
    output_filename = 'base_tf_simple_v2.txt'
    model_filename = FileLocations.get_dropbox_intermediate_path() + 'simple_tf.pickle'
    train_model = False
    filter_for_interesting = False

    train_docid_set = None  # == ALL - filtered later
    train_docid_set = set(Const.TRAINSET_DOCID_LIST).union(Const.TESTSET_DOCID_LIST)
    # filters the output file - add Train data to get a full output file
    report_docid_set = None  # set(Const.TESTSET_DOCID_LIST).union(Const.TRAINSET_DOCID_LIST)

    if use_wahington_post_dataset:
        output_filename = 'wp_' + output_filename
    output_filename = FileLocations.get_dropbox_intermediate_path() + output_filename

    document_to_feature_converter = DocToTermFreqConverter()

    handler = logging.StreamHandler()
    handler.setFormatter(
    for entity_id in all_heavy_features_by_entity_id.keys():
        output = '{0},{1},{2},{3},{4}\n'.format(
            str(optional_docid), str(entity_id), str('?'), str('?'),
            str(all_heavy_features_by_entity_id[entity_id]))
        file.write(output)
    file.close()
    return features_by_entity_id


if __name__ == "__main__":
    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    sfe = SelFeatureExtractor(spotter, binary_classifier_threshold=0.5, min_candidates_to_pass_through=5,
                              binary_classifier=None, light_feature_filename=None, heavy_feature_filename=None,
                              num_light_features=23, break_early=False)
def extract_body(data):
    body = ''
    for d in data['document']:
        if d['name'].startswith('body_par_'):
            body = body + d['value']
    return body


if __name__ == "__main__":
    dd = DatasetDexter()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)
    golden_saliency_by_entid_by_docid = dd.get_golden_saliency_by_entid_by_docid(document_list, wikipediaDataset)

    entities_per_doc = []
    high_salience_per_doc = []
    salience_list = []
    for docid in golden_saliency_by_entid_by_docid.keys():
        entities_per_doc.append(len(golden_saliency_by_entid_by_docid[docid]))
        salience_list.extend(golden_saliency_by_entid_by_docid[docid].values())
        count = 0
        for x in golden_saliency_by_entid_by_docid[docid].values():
            if x >= 2:
                count += 1
        high_salience_per_doc.append(count)
def build_output_using_dexter_dataset(self, spotter, golden_saliency_by_entid_by_docid, output_filename,
                                      docid_set, use_rand_values):
    dexter_json_doc_list = self.dd.get_dexter_dataset(
        FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    self.logger.info('building features')

    if output_filename is not None:
        file = open(output_filename, "w")
    else:
        file = None

    salience_by_entity_by_doc_id = {}
    for json_doc in dexter_json_doc_list:
        data = json.loads(json_doc)
        # pprint.pprint(data)
        docid = data['docId']
        if docid_set is None or docid in docid_set:
            salience_by_entity_by_doc_id[docid] = {}
            body = self.extract_body(data)
            title = data['title']

            title_entities = spotter.get_entity_candidates(title, docid)
            body_entities = spotter.get_entity_candidates(body, docid)

            features_by_entity_id = {}
            for e in title_entities:
                if docid in golden_saliency_by_entid_by_docid:
                    if e.entity_id in golden_saliency_by_entid_by_docid[docid]:
                        golden = golden_saliency_by_entid_by_docid[docid][e.entity_id]
                        if use_rand_values:
                            features_by_entity_id[e.entity_id] = [random.random()]
                        else:
                            features_by_entity_id[e.entity_id] = [golden]
            for e in body_entities:
                if docid in golden_saliency_by_entid_by_docid:
                    if e.entity_id in golden_saliency_by_entid_by_docid[docid]:
                        golden = golden_saliency_by_entid_by_docid[docid][e.entity_id]
                        if use_rand_values:
                            features_by_entity_id[e.entity_id] = [random.random()]
                        else:
                            features_by_entity_id[e.entity_id] = [golden]

            for entity_id in features_by_entity_id.keys():
                golden = 0
                if docid in golden_saliency_by_entid_by_docid:
                    if entity_id in golden_saliency_by_entid_by_docid[docid]:
                        golden = golden_saliency_by_entid_by_docid[docid][entity_id]
                line = str(docid) + ',' + str(entity_id) + ',' + str(golden) + ',0,' + str(features_by_entity_id[entity_id])
                if file is not None:
                    file.write(line)
                    file.write('\n')
                sentiment = features_by_entity_id[entity_id][0]
                salience_by_entity_by_doc_id[docid][entity_id] = sentiment
                self.logger.debug('sent %f', sentiment)

    if file is not None:
        file.close()
        self.logger.info('written to %s', output_filename)
    self.logger.info('processing complete')
    return salience_by_entity_by_doc_id
# ___________Entry Point To Class________________________________________________

def get_feature_list_by_ent(self, body, title, spotter, very_light=False, docid=-1):
    entity_list, entity_id_set, features_by_ent_id, name_by_entity_id, title_entity_list, title_entity_id_set = \
        self.get_entity_saliency_list(body, title, spotter, very_light, docid)
    return features_by_ent_id, name_by_entity_id

# ___________________________________________________________


if __name__ == "__main__":
    # build the golden spotter
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset(FileLocations.get_dropbox_dexter_path(), 'saliency-dataset.json')
    wikipedia_dataset = WikipediaDataset()
    spotter = GoldenSpotter(document_list, wikipedia_dataset)

    body = "Iranian representatives say negotiations with Europe on its nuclear program are in the final stages. Iran's foreign minister, Kamal Kharazi, told state television Saturday Iranian negotiators have given their final response to a European Union proposal to suspend Iran's uranium enrichment program. He said it is now up to the Europeans to decide whether or not to accept their decision. Iran and the European Union's big three powers; Britain, Germany, and France; have been negotiating a deal under which Tehran would agree to freeze sensitive nuclear work to avoid possible U.N. Security Council sanctions. U.S. Secretary of State Colin Powell, says that Iran's nuclear program is intended to make nuclear weapons. Iran authorities have insisted that their nuclear ambitions are limited to generating electricity from atomic energy plants, not making bombs. Critics of the position of the United States point to Israel's nuclear program. Israel maintains a policy of nuclear ambiguity, but is widely believed to possess at least 82 nuclear weapons. The program has not been condemned by the United States."
    title = ""

    light_feature_calculator = SELLightFeatureCalculator()
    combiner = SELLightFeatureCombiner(light_feature_calculator)
    features_by_ent_id, name_by_entity_id = combiner.get_feature_list_by_ent(body, title, spotter, very_light=False, docid=2)

    logger = logging.getLogger(__name__)
    logger.info(features_by_ent_id)
    logger.info(name_by_entity_id)
from sellibrary.locations import FileLocations
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.sentiment.sentiment import SentimentProcessor
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset
from sellibrary.converters.tosentiment.simple_gbrt import SimpleGBRT
from sellibrary.converters.tosentiment.sel_features_to_sentiment import SelFeatToSent
from sellibrary.trec.trec_util import TrecReferenceCreator
from sellibrary.util.s3_util import AWSUtil
from sellibrary.main.main_build_sel_model import SelModelBuilder

if __name__ == "__main__":
    min_number = int(sys.argv[1])
    max_number = int(sys.argv[2])

    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False
    break_early = False

    aws_util = AWSUtil()
    smb = SelModelBuilder()

    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = smb.get_dexter_datset()
    wikipediaDataset = WikipediaDataset()
    'freq in last 3 sentences of body ',  # 4
    'freq in title ',  # 4
    'one occurrence capitalised',  # 5
    'maximum fraction of uppercase letters',  # 6
    'average spot length in words',  # 8.1
    'average spot length in characters',  # 8.2
    'is in title',  # 11
    'unambiguous entity frequency',  # 14 : 1 entity frequency feature
    'entity in_degree in wikipeada',  # 20
    'entity out_degree in wikipeada',  # 20
    'entity degree in wikipeada',  # 20
    'document length',  # 22
]

X, y, docid_array, entity_id_array = load_feature_matrix(
    feature_filename=FileLocations.get_dropbox_intermediate_path() + 'dexter_fset_02__1_to_604_light_output_all.txt',
    feature_names=feature_names,
    entity_id_index=1,
    y_feature_index=2,
    first_feature_index=4,
    number_features_per_line=27,
    tmp_filename='/tmp/temp_conversion_file.txt'
)

# train only on records we have a golden salience for
fg = FilterGolden()
X2, y2, docid2, entityid2 = fg.get_only_golden_rows(
    X, y, docid_array, entity_id_array,
    dexterDataset=DatasetDexter(),        # assumption: these keyword values were left blank in the original;
    wikipediaDataset=WikipediaDataset())  # plain dataset instances match the signature used elsewhere

wrapper = GBRTWrapper()
splitter = DataSplitter()

wrapper.train_model(X2, y2, entityid2, 0, n_estimators=40)
def get_dexter_dataset(self, path=None, filename='short.json'):
    if path is None:
        path = FileLocations.get_dropbox_wikipedia_path()
    with open(path + filename) as f:
        content = f.readlines()
    return content
                file.write('\n')
            sentiment = features_by_entity_id[entity_id][0]
            salience_by_entity_by_doc_id[docid][entity_id] = sentiment
            self.logger.debug('sent %f', sentiment)

    if file is not None:
        file.close()
        self.logger.info('written to %s', output_filename)
    self.logger.info('processing complete')
    return salience_by_entity_by_doc_id


if __name__ == "__main__":
    filename = FileLocations.get_dropbox_intermediate_path() + 'sentiment.pickle'

    smb = PWModelBuilder()
    sentiment_processor = SentimentProcessor()
    sentiment_processor.load_model(filename)

    phrase = ' one iraq three'
    # sent = sentiment_processor.get_doc_sentiment(phrase)
    # print(sent, phrase)

    smb.get_feature_list(sentiment_processor, ' one iraq three')
    smb.get_feature_list(sentiment_processor, 'abandon')
    smb.get_feature_list(sentiment_processor, 'outstanding')
    smb.get_feature_list(sentiment_processor, 'appeases')
    smb.get_feature_list(sentiment_processor, 'superb')
    smb.get_feature_list(sentiment_processor, 'prick')
            s = '%d %d 0 0 [ %f ]' % (docid, entity_id, sent)
            self.logger.info(s)
            file_contents = file_contents + s + '\n'

    file = open(output_filename, "w")
    file.write(file_contents)
    file.close()
    self.logger.info('processing complete')

# def train_and_save_model(self, filename):
#     spotter = SpotlightCachingSpotter(False)
#     afinn_filename = '../sellibrary/resources/AFINN-111.txt'
#     sentiment_processor = SentimentProcessor()
#     self.train_model_using_dexter_dataset(sentiment_processor, spotter, afinn_filename)
#     sentiment_processor.save_model(filename)
#     return sentiment_processor


if __name__ == "__main__":
    filename = FileLocations.get_dropbox_intermediate_path() + 'sentiment.pickle'

    dexter_feeder = DexterFeeder()
    spotter = SpotlightCachingSpotter(False)
    sentiment_processor = SentimentProcessor()
    sentiment_processor.load_model(filename)

    dexter_feeder.dexter_dataset_sentiment(sentiment_processor, spotter, '/tmp/sentiment_output.txt')
def extract_graph_from_compressed(self, wikititle_to_id_filename=None):
    self.logger.warning('running extract_graph_from_compressed().')
    self.logger.warning('[this takes about 2hr 20 min on Dwane\'s home machine]')
    input_file = gzip.open(FileLocations.get_dropbox_wikipedia_path() + 'wikipedia-dump.json.gz',
                           'rt', encoding='utf-8')

    if wikititle_to_id_filename is not None:
        fn = wikititle_to_id_filename
    else:
        fn = FileLocations.get_dropbox_wikipedia_path() + 'wikititle_marisa_trie.case_insensitive.15910478.pickle'
    self.logger.warning(' %s needs to be complete for these results to make most sense', fn)
    self.get_wikititle_case_insensitive_marisa_trie()

    count = 0
    line = '{}'
    from_list = []
    to_list = []
    value_list = []
    max_id = 0
    while count < 25000000 and line is not None and line != '':
        count += 1
        early_log = count <= 50000 and count % 10000 == 0
        late_log = count > 50000 and count % 1000000 == 0
        if early_log or late_log:
            self.logger.info('%d lines processed', count)
            output_filename = FileLocations.get_temp_path() + 'wikipedia_link_graph_sparse.deduped.' + str(count) + '.pickle'
            self.logger.info('saving file %s', output_filename)
            row = np.array(from_list)
            col = np.array(to_list)
            data = np.array(value_list)
            mtx = sparse.coo_matrix((data, (row, col)), shape=(max_id + 1, max_id + 1))
            self.logger.info('About to write %s', output_filename)
            with open(output_filename, 'wb') as handle:
                pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
            self.logger.info('file written = %s', output_filename)

        line = input_file.readline()
        if line != '':
            try:
                data = json.loads(line)
            except json.decoder.JSONDecodeError as e:
                self.logger.warning("type error decoding json: json = %s, error = %s", line, str(e))
                break
            # pprint.pprint(data)
            if 'links' in data:
                fid = data['wid']
                if self.get_wikititle_id_from_id(fid)[0][0] != fid:
                    self.logger.info('%s -> %s ', fid, self.get_wikititle_id_from_id(fid)[0][0])
                    fid = self.get_wikititle_id_from_id(fid)[0][0]
                if fid > max_id:
                    max_id = fid
                for link in data['links']:
                    link_name = link['id']  # this is not numeric, has underscores, matches WikiTitle
                    if link_name in self.wikititle_marisa_trie:
                        link_list = self.wikititle_marisa_trie[link_name]
                        link_cid = link_list[0][0]
                        if link_cid > max_id:
                            max_id = link_cid
                        # d['type'] = link['type']  # do we care about link type? assuming no
                        from_list.append(fid)
                        to_list.append(link_cid)
                        value_list.append(1)

    self.logger.info('%d lines processed', count)
    output_filename = FileLocations.get_temp_path() + 'wikipedia_link_graph_sparse.deduped.' + str(count) + '.pickle'
    self.logger.info('saving file %s', output_filename)
    row = np.array(from_list)
    col = np.array(to_list)
    data = np.array(value_list)
    # +1 so the largest id is a valid index, matching the periodic saves above
    mtx = sparse.coo_matrix((data, (row, col)), shape=(max_id + 1, max_id + 1))
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(mtx, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)
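# Standalone note (toy data): scipy sums duplicate (row, col) pairs when a COO matrix is
# converted, so a page linking to the same target twice shows up as weight 2 unless the
# triplet lists are deduplicated beforehand.
import numpy as np
from scipy import sparse

dup = sparse.coo_matrix((np.array([1, 1, 1]), (np.array([0, 0, 1]), np.array([2, 2, 0]))), shape=(3, 3))
print(dup.tocsr()[0, 2])  # -> 2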
    return salience_by_entity_by_doc_id


if __name__ == "__main__":
    min_docid = int(sys.argv[1])
    max_docid = int(sys.argv[2])

    break_early = False
    if len(sys.argv) >= 4:
        break_early = sys.argv[3].lower() == 'break_early'
        if break_early:
            SelModelBuilder.logger.warning('Break early is set to True')

    filename = FileLocations.get_dropbox_intermediate_path() + 'sel.pickle'
    build_model = False

    aws_util = AWSUtil()
    smb = SelModelBuilder()

    # if build_model:
    #     sentiment_processor = smb.train_and_save_model(filename)
    # else:
    #     sentiment_processor = SentimentProcessor()
    #     sentiment_processor.load_model(filename)

    dd = smb.get_dexter_datset()
    wikipediaDataset = WikipediaDataset()
    document_list = dd.get_dexter_dataset(path=FileLocations.get_dropbox_dexter_path())
    spotter = GoldenSpotter(document_list, wikipediaDataset)
from sellibrary.text_file_loader import join_feature_matrix
from sellibrary.text_file_loader import load_feature_matrix
from sellibrary.locations import FileLocations
from sellibrary.sel.dexter_dataset import DatasetDexter
from sellibrary.wiki.wikipedia_datasets import WikipediaDataset
from sellibrary.filter_only_golden import FilterGolden
from sellibrary.util.const import Const

if __name__ == "__main__":
    const = Const()
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()

    file_A_feature_names = const.get_sel_feature_names()
    filename_A = dropbox_intermediate_path + 'wp/wp.txt'

    file_B_feature_names = const.sent_feature_names
    filename_B = dropbox_intermediate_path + 'wp_sentiment_simple.txt'  # 'base_tf_simple_v2.txt'

    output_filename = dropbox_intermediate_path + 'wp_joined.txt'  # 'joined_sel_sent_and_tf.txt'

    # Load File A
    X1, y1, docid_array1, entity_id_array1 = load_feature_matrix(
        feature_filename=filename_A,
        feature_names=file_A_feature_names,
        entity_id_index=1,
        y_feature_index=2,
        first_feature_index=4,
        number_features_per_line=len(file_A_feature_names) + 4,
        tmp_filename='/tmp/temp_conversion_file.txt')
import botocore

from sellibrary.locations import FileLocations

s3_client = boto3.client('s3')

# Call S3 to list current buckets
response = s3_client.list_buckets()

# Get a list of all bucket names from the response
buckets = [bucket['Name'] for bucket in response['Buckets']]

# Print out the bucket list
print("Bucket List: %s" % buckets)

BUCKET_NAME = 'entity-salience.rnd.signal'  # replace with your bucket name
KEY = 'my_image_in_s3.jpg'  # replace with your object key

path = FileLocations.get_dropbox_intermediate_path() + 'wp'

s3 = boto3.resource('s3')
try:
    bucket = s3.Bucket(BUCKET_NAME)
    for object in bucket.objects.all():
        print(object.key)
        # d = object.get()
        if object.key.find('sel_all_features_golden_spotter.washington_post.docnum') > -1:
            i = object.key.rfind('/')
            name = path + object.key[i:]
            print(name)
            bucket.download_file(object.key, name)
except botocore.exceptions.ClientError as e: