def build_file_train_model_produce_output(self, feature_names, n_gram_length, sentiment_processor, spotter,
                                          golden_saliency_by_entid_by_docid, dexter_dataset, wikipedia_dataset):
    """Build the n-gram sentiment feature file, train a model on it, and report TREC / NDCG results."""
    feature_filename = FileLocations.get_dropbox_intermediate_path() + \
        'sentiment_simple_ngram_' + str(n_gram_length) + '.txt'
    document_to_feature_converter = SimpleSentiment(sentiment_processor, n_gram_length=n_gram_length)

    model_filename = FileLocations.get_dropbox_intermediate_path() + \
        'simple_sentiment_model_ngram_' + str(n_gram_length) + '.pickle'
    tosent_converter = SimpleGBRT(model_filename)

    test_docid_set = set(Const.TESTSET_DOCID_LIST)
    train_docid_set = set(Const.TRAINSET_DOCID_LIST)

    salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
        spotter, golden_saliency_by_entid_by_docid, feature_filename,
        document_to_feature_converter, tosent_converter,
        test_docid_set, train_docid_set)

    # if not os.path.isfile(model_filename):
    # build model
    self.train_model(feature_filename, feature_names, dexter_dataset, wikipedia_dataset, model_filename)

    trc = TrecReferenceCreator()
    prefix = str(n_gram_length) + '_n_gram_x_temp'
    trc.create_results_file(salience_by_entity_by_doc_id, prefix)
    report, ndcg, trec_by_id = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)
    trc.logger.info('\nTrec Eval Results:\n%s', report)
    return salience_by_entity_by_doc_id, ndcg, trec_by_id
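# The nested salience_by_entity_by_doc_id structure returned above (and consumed by
# TrecReferenceCreator.create_results_file) appears, from its name and from the per-document
# loop in get_ndcg_and_trec_eval below, to be a dict keyed by document id and then by entity id.
# A minimal sketch with hypothetical ids and scores, for illustration only:
example_salience_by_entity_by_doc_id = {
    1: {304: 0.83, 9232: 0.11},   # docid 1: predicted salience per entity id
    2: {5843: 0.47},              # docid 2
}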
def get_ndcg_and_trec_eval(self, feature_filename, model_filename, feature_names, docid_set,
                           wikipediaDataset, dexterDataset, per_document_ndcg):
    self.logger.info('loading model %s', model_filename)
    with open(model_filename, 'rb') as handle:
        model = pickle.load(handle)

    salience_by_entity_by_doc_id = self.get_salience_by_entity_by_doc_id(
        feature_filename, model, docid_set, feature_names, dexterDataset, wikipediaDataset,
        filter_for_interesting=False)

    trc = TrecReferenceCreator()
    prefix = 'model_runner_x_temp'
    trc.create_results_file(salience_by_entity_by_doc_id, prefix)
    overall_report, overall_ndcg, overall_trec_val_by_name = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)

    ndcg_by_docid = {}
    trec_val_by_name_by_docid = {}
    if per_document_ndcg:
        skipped = []
        for docid in docid_set:
            salience_by_entity_by_doc_id_b = {}
            if docid in salience_by_entity_by_doc_id:
                salience_by_entity_by_doc_id_b[docid] = salience_by_entity_by_doc_id[docid]
                trc = TrecReferenceCreator()
                prefix = 'model_runner_x_temp'
                trc.create_results_file(salience_by_entity_by_doc_id_b, prefix)
                report, ndcg, trec_val_by_name = trc.get_report(
                    FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', prefix)
                trc.logger.info('\nTrec Eval Results:\n%s', report)
                ndcg_by_docid[docid] = ndcg
                trec_val_by_name_by_docid[docid] = trec_val_by_name
            else:
                self.logger.warning('No data for docid %d, skipping', docid)
                skipped.append(docid)
        self.logger.info('per doc ndcg : %s ', ndcg_by_docid)
        self.logger.info('skipped in the per doc ndcg : %s ', skipped)

    trc.logger.info('\n_____________________________________\nTrec Eval Results Overall:\n%s', overall_report)
    return overall_ndcg, ndcg_by_docid, overall_trec_val_by_name, trec_val_by_name_by_docid
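# get_ndcg_and_trec_eval reloads a pickled model that train_model wrote earlier. A minimal,
# self-contained sketch of that pickle round trip, assuming (hypothetically) that SimpleGBRT
# wraps an sklearn GradientBoostingRegressor; the real feature matrix, target values, and
# wrapper internals are not shown in this excerpt.
import pickle

import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

X_train = np.random.rand(100, 5)                # placeholder feature matrix
y_train = np.random.rand(100)                   # placeholder salience targets
model = GradientBoostingRegressor().fit(X_train, y_train)

with open('simple_sentiment_model_ngram_1.pickle', 'wb') as handle:
    pickle.dump(model, handle)                  # how a trained model would be persisted

with open('simple_sentiment_model_ngram_1.pickle', 'rb') as handle:
    reloaded = pickle.load(handle)              # mirrors the load in get_ndcg_and_trec_eval

print(reloaded.predict(X_train[:3]))            # reloaded model predicts salience scores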
        None, train_docid_set, wikipediaDataset, filter_for_interesting=filter_for_interesting)

    builder = SalienceBasedOnTFModelBuilder()
    builder.train_model(output_filename, document_to_feature_converter.tf_feature_names,
                        datasetDexter, wikipediaDataset, model_filename)
    tosent_converter = SimpleGBRT(model_filename)

    salience_by_entity_by_doc_id = smb.build_output_using_dexter_dataset(
        spotter, golden_saliency_by_entid_by_docid, output_filename, document_to_feature_converter,
        tosent_converter, report_docid_set, wikipediaDataset,
        filter_for_interesting=filter_for_interesting, json_doc_list=document_list)

    if use_dexter_dataset:
        trc = TrecReferenceCreator()
        lines_written = trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
        if lines_written > 0:
            report, ndcg, p_at = trc.get_report(
                FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', 'x_temp')
            trc.logger.info(' Trec Eval Results:\n %s', report)
def main(self, from_, to_, measurement, pipeline_portion):
    # load the data
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset()

    # process the data
    count = 0
    slcs = SpotlightCachingSpotter()
    light_features_to_zero = []
    lfe = SELLightFeatureExtractor(light_features_to_zero)
    gbrt = None  # GBRT('fred')
    ndcg = NDCG()
    min_candidates_to_pass_through = 3
    binary_classifier_threshold = 0.5
    spotter_confidence = 0.5
    corpus_name = 'dexter_fset_02_'
    break_early = False
    file_prefix = (corpus_name + '_' + str(from_) + '_to_' + str(to_) + '_')

    salience_by_entity_by_doc_id = {}
    time_by_docid = {}

    light_feature_filename = FileLocations.get_temp_path() + file_prefix + 'light_output_partial.txt'
    file = open(light_feature_filename, "a")
    file.write('\ndocId, entity_id, golden_salience, estimated_salience, [light_features]')
    file.close()

    for document in document_list:
        data = json.loads(document)
        docid = data['docId']

        if (count in range(from_, (to_ + 1)) and measurement == 'LINE') or \
                (docid in range(from_, (to_ + 1)) and measurement == 'DOCID'):
            self.logger.info('_______________________________________')
            self.logger.info('Starting processing of docid = %d line=%d ', docid, count)

            start_time = time.time()
            saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(data)
            body = self.extract_body(data)
            title = data['title']

            pipeline = Pipeline002(slcs, lfe, gbrt, ndcg, light_feature_filename)
            calculated_saliency_by_entity_id, golden_salience_by_entity_id, discount_sum, model_dcgs = \
                pipeline.process_document(
                    docid, body, title, file_prefix, break_early=break_early,
                    golden_salience_by_entity_id=saliency_by_ent_id_golden,
                    min_candidates_to_pass_through=min_candidates_to_pass_through,
                    binary_classifier_threshold=binary_classifier_threshold,
                    spotter_confidence=spotter_confidence)

            salience_by_entity_by_doc_id[docid] = calculated_saliency_by_entity_id

            self.logger.info('count = %d, docId = %d ', count, docid)
            self.logger.info('calculated_saliency_by_entity_id = %s ', str(calculated_saliency_by_entity_id))
            self.logger.info('discount_sum = %s ', str(discount_sum))
            self.logger.info('model_dcgs = %s ', str(model_dcgs))

            diff = time.time() - start_time
            time_by_docid[docid] = diff
            self.logger.info('Times taken %s', time_by_docid)
            self.logger.info('Time taken for docid=%d, time=%f', docid, diff)

        count += 1

    self.logger.info('Times taken by docid: %s', time_by_docid)

    trc = TrecReferenceCreator()
    trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
    report, ndcg, p_at = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', 'x_temp')
    self.logger.info(' Trec Eval Results:\n %s', report)
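# A hypothetical command-line driver for main() above. 'ModelRunner' is a placeholder for
# whichever class actually defines main(); the argument meanings follow the checks in the loop:
# 'LINE' filters by position in the dataset file, 'DOCID' filters by document id.
import sys

if __name__ == "__main__":
    runner = ModelRunner()           # hypothetical class name; substitute the real defining class
    from_ = int(sys.argv[1])
    to_ = int(sys.argv[2])
    measurement = sys.argv[3]        # 'LINE' or 'DOCID'
    pipeline_portion = None          # not used by main() in this excerpt
    runner.main(from_, to_, measurement, pipeline_portion)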
from sellibrary.trec.trec_util import TrecReferenceCreator

if __name__ == "__main__":
    df = TrecReferenceCreator()
    df.create_reference_file(True)
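# create_reference_file(True) above writes the ground-truth file that get_report() later scores
# results against. A minimal sketch of the two standard trec_eval file layouts, on the assumption
# (not confirmed by this excerpt) that TrecReferenceCreator follows the usual qrels/run conventions,
# with document ids acting as queries and entity ids acting as retrieved documents:
with open('trec_ground_truth_example.txt', 'w') as qrels:
    # qrels line: <query_id> <iteration> <doc_id> <relevance>
    qrels.write('1 0 304 2\n')
    qrels.write('1 0 9232 0\n')

with open('trec_results_example.txt', 'w') as run:
    # run line: <query_id> Q0 <doc_id> <rank> <score> <run_name>
    run.write('1 Q0 304 1 0.83 x_temp\n')
    run.write('1 Q0 9232 2 0.11 x_temp\n')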