def save_cache(self):
    output_filename = FileLocations.get_dropbox_intermediate_path() + 'spotlight_docs_list.pickle'
    temp_filename = FileLocations.get_dropbox_intermediate_path() + 'spotlight_docs_list.tmp.2.pickle'

    # Acquire a crude file "lock": move the cache file aside so no other process can
    # grab it. If the move fails (e.g. another process has already moved it aside),
    # wait and retry.
    lock_acquired = False
    while not lock_acquired:
        try:
            if os.path.isfile(output_filename) or os.path.isfile(temp_filename):
                move(output_filename, temp_filename)
        except OSError:
            time.sleep(3)
        else:
            lock_acquired = True
            self.logger.info('lock acquired')

    # Do the writing while holding the lock.
    self.logger.info('About to write %s', temp_filename)
    with open(temp_filename, 'wb') as handle:
        pickle.dump(self.cache, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)

    # Release the lock by moving the file back to its public name.
    move(temp_filename, output_filename)
    self.logger.info('lock released')
def load_light_parameter_data_fmt_03():
    # Format:
    #   First row: text headers.
    #   Second row onwards, comma separated:
    #     DocId
    #     Entity_id as per Wikipedia
    #     Golden salience as per Trani et al. (salience as marked by expert annotators, 0.0 < value <= 3.0)
    #     Estimated salience (will be rubbish if the binary classifier has not been trained)
    #     Light features written as a list [ 1, 2, 3, None, 7, ...]
    light_param_filename = FileLocations.get_dropbox_intermediate_path() \
        + 'dexter_light_features_fmt_v03.run.01.txt'
    BinaryClassifierTrainer.logger.info('loading data from : %s', light_param_filename)
    with open(light_param_filename) as f:
        lines = f.readlines()

    # Strip list brackets and None values so genfromtxt can parse every field as a float.
    transformed_contents = ''
    for line in lines:
        if len(line.split(sep=',')) != 27:
            # skip line - it is a header
            BinaryClassifierTrainer.logger.info('skipping line: %s', line)
        else:
            line = line.replace('[', '')
            line = line.replace(']', '')
            line = line.replace('None', '0.0')
            transformed_contents = transformed_contents + '\n' + line

    fn = FileLocations.get_temp_path() + 'light_output_intermediate.txt'
    with open(fn, 'w') as file:
        file.write(transformed_contents)

    data = genfromtxt(fn, delimiter=',')
    entity_id_array = data[:, 1]
    y = data[:, 2]
    X = data[:, 4:]  # skip column 3, the previously estimated salience
    y = y > 0.0  # convert y to salient / not salient

    # Overwrite NaNs with 0s
    BinaryClassifierTrainer.logger.debug('Are there NaNs?')
    nan_locations = np.argwhere(np.isnan(X))
    for loc in nan_locations:
        X[loc[0], loc[1]] = 0
    BinaryClassifierTrainer.logger.debug('Are there still any NaNs?')
    nan_locations = np.argwhere(np.isnan(X))
    BinaryClassifierTrainer.logger.debug(nan_locations)

    BinaryClassifierTrainer.logger.info('X Shape %s', X.shape)
    return X, y, entity_id_array
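# Illustrative sketch only: the row below is a made-up example of the fmt_v03 layout
# described above (DocId, entity_id, golden salience, estimated salience, then the
# remaining feature columns), showing why the bracket/None stripping is needed before
# genfromtxt can parse the file.
#
#   123,5843419,2.0,0.0,[1.0, 0.0, None, 7.0, ...]
#
# After the replace() calls the line becomes plain CSV:
#
#   123,5843419,2.0,0.0,1.0, 0.0, 0.0, 7.0, ...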
def load_light_parameter_data_fmt_02():
    # First column is whether the entity was marked as salient by expert annotators in
    # the dexter corpus (value > 0.0).
    # Second column is the salience as predicted when this was passed through the pipeline.
    # Third column onwards are the light feature values.
    light_param_filename = FileLocations.get_dropbox_intermediate_path() + \
        'dexter_light_features_fmt_v02_partial.03.docs-1-195.txt'
    BinaryClassifierTrainer.logger.info('loading data from : %s', light_param_filename)
    data = genfromtxt(light_param_filename, delimiter=',')
    y = data[:, 0]
    X = data[:, 2:]
    y = y > 0.0

    # Overwrite NaNs with 0s
    BinaryClassifierTrainer.logger.debug('Are there NaNs?')
    nan_locations = np.argwhere(np.isnan(X))
    for loc in nan_locations:
        X[loc[0], loc[1]] = 0
    BinaryClassifierTrainer.logger.debug('Are there still any NaNs?')
    nan_locations = np.argwhere(np.isnan(X))
    BinaryClassifierTrainer.logger.debug(nan_locations)

    BinaryClassifierTrainer.logger.info('X Shape %s', X.shape)
    return X, y
def save_model(self):
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()
    output_filename = dropbox_intermediate_path + 'binary_classifier.pickle'
    self.logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(self.model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    self.logger.info('file written = %s', output_filename)
def save_partial_results(self, prefix, obj):
    output_filename = FileLocations.get_temp_path() + prefix + '.partial.pickle'
    self.logger.info('About to write %s', output_filename)
    try:
        with open(output_filename, 'wb') as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
    except OSError as e:
        self.logger.warning('Could not save file %s. err=%s', output_filename, str(e))
    else:
        self.logger.info('file written = %s', output_filename)
def load_model(self):
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()
    input_filename = dropbox_intermediate_path + 'binary_classifier.pickle'
    if os.path.isfile(input_filename):
        self.logger.info('loading binary classifier from %s', input_filename)
        with open(input_filename, 'rb') as handle:
            self.model = pickle.load(handle)
        self.logger.info('loaded')
    else:
        self.logger.info('Could not load model %s', input_filename)
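# Illustrative usage sketch, not part of the class: it assumes save_model()/load_model()
# and the fmt_03 loader above all live on BinaryClassifierTrainer (which owns the logger
# the loaders use), and that self.model has been fitted by some training step before saving.
#
#   trainer = BinaryClassifierTrainer()
#   X, y, entity_ids = trainer.load_light_parameter_data_fmt_03()
#   # ... fit trainer.model on X, y ...
#   trainer.save_model()   # writes <dropbox intermediate path>/binary_classifier.pickle
#
#   # In a later run:
#   trainer = BinaryClassifierTrainer()
#   trainer.load_model()   # restores trainer.model, or logs if the pickle is missing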
def __init__(self):
    # set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
    self.logger = logging.getLogger(__name__)
    self.logger.addHandler(handler)
    self.logger.propagate = False
    self.logger.setLevel(logging.INFO)

    # set up instance variables
    wds = WikipediaDataset()  # note: instantiated but not retained on self
    self.intermediate_path = FileLocations.get_temp_path()
    self.spotlight_util = SpotlightUtil()
def load_cache(self):
    input_filename = FileLocations.get_dropbox_intermediate_path() + 'spotlight_docs_list.pickle'

    # The file may be being written (moved aside) by another process: wait briefly and retry.
    count = 0
    while count < 3 and not os.path.isfile(input_filename):
        time.sleep(1)
        count += 1

    if os.path.isfile(input_filename):
        self.logger.info('loading spotlight cache from %s', input_filename)
        with open(input_filename, 'rb') as handle:
            self.cache = pickle.load(handle)
        self.logger.info('%d items loaded', len(self.cache))
    else:
        self.logger.info('file does not exist %s', input_filename)
        self.cache = {}
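# Illustrative usage sketch, not part of the class: assuming load_cache()/save_cache()
# belong to the SpotlightCachingSpotter used by the pipeline below, the typical pattern
# is to load the cache once, let spotting calls populate self.cache, then persist it.
#
#   spotter = SpotlightCachingSpotter()
#   spotter.load_cache()    # self.cache starts empty if the pickle is absent
#   # ... annotate documents; results accumulate in spotter.cache ...
#   spotter.save_cache()    # move-aside "lock", write pickle, move back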
def load_light_parameter_data_fmt_01():
    # First column is whether the entity was marked as salient by expert annotators
    # in the dexter corpus.
    dropbox_intermediate_path = FileLocations.get_dropbox_intermediate_path()
    data = genfromtxt(dropbox_intermediate_path + 'dexter_light_features_v0001.txt', delimiter=',')
    y = data[:, 0]
    X = data[:, 1:-1]

    # Overwrite NaNs with 0s
    BinaryClassifierTrainer.logger.debug('Are there NaNs?')
    nan_locations = np.argwhere(np.isnan(X))
    for loc in nan_locations:
        X[loc[0], loc[1]] = 0
    BinaryClassifierTrainer.logger.debug('Are there still any NaNs?')
    nan_locations = np.argwhere(np.isnan(X))
    BinaryClassifierTrainer.logger.debug(nan_locations)

    BinaryClassifierTrainer.logger.info('X Shape %s', X.shape)
    return X, y
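# Side note (a sketch, not a change to the loaders above): the element-wise NaN overwrite
# used in all three loaders is equivalent to a single vectorised NumPy assignment,
#
#   X[np.isnan(X)] = 0.0
#
# which avoids the Python-level loop over np.argwhere(np.isnan(X)).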
def process_document(self, optional_docid, body, title, file_prefix, break_early=False,
                     golden_salience_by_entity_id={}, min_candidates_to_pass_through=0,
                     binary_classifier_threshold=0.5, spotter_confidence=0.5):
    if golden_salience_by_entity_id is None:
        golden_salience_by_entity_id = {}

    light_features_by_ent_id, name_by_entity_id = \
        self.light_feature_extractor.get_feature_list_by_ent(body, title, self.spotter, False,
                                                             spotter_confidence=spotter_confidence)
    self.logger.info('Light features have been calculated')

    # First stage: run the binary classifier over the light features and keep only the
    # candidates whose predicted salience clears the threshold.
    survivor_candidates = []
    predictions_by_entity_id = {}
    light_results = ''
    for entity_id in light_features_by_ent_id.keys():
        light_features = light_features_by_ent_id[entity_id]
        prediction = None
        try:
            prediction = self.binary_classifier.predict(light_features)
        except ValueError as e:
            self.logger.warning(
                'An exception occurred, could not predict, assuming 0.0. light_features = %s, err=%s',
                str(light_features), str(e))
        if prediction is None:
            prediction = 0.0
        predictions_by_entity_id[entity_id] = prediction
        self.logger.info('entity_id %d prediction %f binary_classifier_threshold=%f',
                         entity_id, prediction, binary_classifier_threshold)
        if prediction > binary_classifier_threshold:  # binary_classifier_threshold appears to be a tuple... why?
            survivor_candidates.append(entity_id)

        golden_salience = 0
        if entity_id in golden_salience_by_entity_id:
            golden_salience = golden_salience_by_entity_id[entity_id]
        light_results = '{0}\n{1},{2},{3},{4},{5}'.format(
            light_results, str(optional_docid), str(entity_id), str(golden_salience),
            str(prediction), str(light_features))

    with open(FileLocations.get_temp_path() + file_prefix + 'light_output_partial.txt', 'a') as file:
        file.write(light_results)

    self.logger.info('Predictions %s', predictions_by_entity_id)
    self.logger.info('Survivor candidate entity_id are: %s', survivor_candidates)

    if len(survivor_candidates) < 1:
        self.logger.warning('No candidates survived, passing first %d through',
                            min_candidates_to_pass_through)
        for entity_id in light_features_by_ent_id.keys():
            if len(survivor_candidates) < min_candidates_to_pass_through:
                survivor_candidates.append(entity_id)

    if self.heavy_feature_extractor is None:
        self.logger.warning('Heavy extractor is None, not performing further processing.')
        return {}, {}, 0, []

    # Second stage: extract heavy features for the surviving candidates.
    all_heavy_features_by_entity_id = self.heavy_feature_extractor.process(
        survivor_candidates, break_early=break_early, optional_docId=optional_docid)

    fname = title.replace(' ', '_')
    fname = fname.replace('.', '_')
    fname = fname.replace('"', '_')
    fname = fname.replace('\'', '_')
    fname = fname[0:50].lower()
    self.save_partial_results('all_heavy_features_by_entity_id_' + fname,
                              all_heavy_features_by_entity_id)

    # Third stage: run the heavy feature regressor to estimate a salience per entity.
    calculated_saliency_by_entity_id = {}
    output = ''
    for entity_id in all_heavy_features_by_entity_id.keys():
        if entity_id in golden_salience_by_entity_id:
            target_saliency = golden_salience_by_entity_id[entity_id]
        else:
            target_saliency = 0.0
        list_of_heavy_features = all_heavy_features_by_entity_id[entity_id]
        self.logger.info('Number of heavy features %d for entity_id %d',
                         len(list_of_heavy_features), entity_id)
        calculated_saliency = 0.0
        try:
            pred_array = self.heavy_feature_regressor.predict(
                np.array(list_of_heavy_features).reshape(1, -1))
            calculated_saliency = pred_array[0]
        except ValueError as e:
            self.logger.warning('could not calc gbrt, returning 0. entity_id=%d, x=%s, err=%s',
                                entity_id, list_of_heavy_features, e)
        except IndexError as e:
            self.logger.warning('could not calc gbrt, returning 0. entity_id=%d, x=%s, err=%s',
                                entity_id, list_of_heavy_features, e)

        self.logger.info('calculated saliency for docid = %d entity_id = %d saliency = %f',
                         optional_docid, entity_id, calculated_saliency)
        calculated_saliency_by_entity_id[entity_id] = calculated_saliency
        output = '{0}{1},{2},{3},{4},{5}\n'.format(
            output, str(optional_docid), str(entity_id), str(target_saliency),
            str(calculated_saliency), str(all_heavy_features_by_entity_id[entity_id]))

    fn = FileLocations.get_temp_path() + file_prefix + 'heavy_output_partial.txt'
    self.logger.debug('Appending heavy parameters to %s', fn)
    with open(fn, 'a') as file:
        file.write(output)
    self.logger.info('\n%s', output)

    # discount_sum and model_dcgs are not computed in this version of the pipeline;
    # return placeholders (matching the early exit above) so callers can unpack four values.
    return calculated_saliency_by_entity_id, golden_salience_by_entity_id, 0, []
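# Illustrative single-document call, mirroring how main() below drives the pipeline.
# The spotter, light feature extractor, regressor, NDCG object and light_feature_filename
# are assumed to be constructed as in main(); the docid/body/title values are made up.
#
#   pipeline = Pipeline002(slcs, lfe, gbrt, ndcg, light_feature_filename)
#   calculated, golden, discount_sum, model_dcgs = pipeline.process_document(
#       1, 'document body text', 'Document title', 'example_prefix_',
#       golden_salience_by_entity_id={}, min_candidates_to_pass_through=3,
#       binary_classifier_threshold=0.5, spotter_confidence=0.5)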
def main(self, from_, to_, measurement, pipeline_portion):
    # load the data
    dd = DatasetDexter()
    document_list = dd.get_dexter_dataset()

    # process the data (pipeline_portion is accepted but not used in this version)
    count = 0
    slcs = SpotlightCachingSpotter()
    light_features_to_zero = []
    lfe = SELLightFeatureExtractor(light_features_to_zero)
    gbrt = None  # GBRT('fred')
    ndcg = NDCG()
    min_candidates_to_pass_through = 3
    binary_classifier_threshold = 0.5
    spotter_confidence = 0.5
    corpus_name = 'dexter_fset_02_'
    break_early = False
    file_prefix = corpus_name + '_' + str(from_) + '_to_' + str(to_) + '_'

    salience_by_entity_by_doc_id = {}
    time_by_docid = {}

    light_feature_filename = FileLocations.get_temp_path() + file_prefix + 'light_output_partial.txt'
    with open(light_feature_filename, 'a') as file:
        file.write('\ndocId, entity_id, golden_salience, estimated_salience, [light_features]')

    for document in document_list:
        data = json.loads(document)
        docid = data['docId']
        if (count in range(from_, (to_ + 1)) and measurement == 'LINE') or \
                (docid in range(from_, (to_ + 1)) and measurement == 'DOCID'):
            self.logger.info('_______________________________________')
            self.logger.info('Starting processing of docid = %d line=%d ', docid, count)
            start_time = time.time()

            saliency_by_ent_id_golden = self.extract_saliency_by_ent_id_golden(data)
            body = self.extract_body(data)
            title = data['title']

            pipeline = Pipeline002(slcs, lfe, gbrt, ndcg, light_feature_filename)
            calculated_saliency_by_entity_id, golden_salience_by_entity_id, discount_sum, model_dcgs = \
                pipeline.process_document(
                    docid, body, title, file_prefix, break_early=break_early,
                    golden_salience_by_entity_id=saliency_by_ent_id_golden,
                    min_candidates_to_pass_through=min_candidates_to_pass_through,
                    binary_classifier_threshold=binary_classifier_threshold,
                    spotter_confidence=spotter_confidence)

            salience_by_entity_by_doc_id[docid] = calculated_saliency_by_entity_id
            self.logger.info('count = %d, docId = %d ', count, docid)
            self.logger.info('calculated_saliency_by_entity_id = %s ', str(calculated_saliency_by_entity_id))
            self.logger.info('discount_sum = %s ', str(discount_sum))
            self.logger.info('model_dcgs = %s ', str(model_dcgs))

            diff = time.time() - start_time
            time_by_docid[docid] = diff
            self.logger.info('Times taken %s', time_by_docid)
            self.logger.info('Time taken for docid=%d, time=%f', docid, diff)
        count += 1

    self.logger.info('Times taken by docid: %s', time_by_docid)

    trc = TrecReferenceCreator()
    trc.create_results_file(salience_by_entity_by_doc_id, 'x_temp')
    report, ndcg, p_at = trc.get_report(
        FileLocations.get_dropbox_intermediate_path() + 'trec_ground_truth.txt', 'x_temp')
    self.logger.info(' Trec Eval Results:\n %s', report)
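# Illustrative invocation sketch (the name of the enclosing runner class is assumed).
# 'DOCID' and 'LINE' are the two measurement modes used above; pipeline_portion is
# accepted but unused in this version.
#
#   runner.main(from_=1, to_=20, measurement='DOCID', pipeline_portion=None)
#   # processes every dexter document whose docId falls in [1, 20]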
import logging
import pickle

from sel.file_locations import FileLocations

# dense to sparse

# set up logging
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s'))
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.propagate = False
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    ds = WikipediaDataset()
    wikititle_marisa_trie = ds.get_wikititle_case_insensitive_marisa_trie()

    # Invert the title -> id trie into an id -> title dictionary.
    logger.info('Creating dictionary')
    wikititle_by_id = {}
    for k in wikititle_marisa_trie.keys():
        wid = wikititle_marisa_trie.get(k)[0][0]
        wikititle_by_id[wid] = k
    logger.info('complete')

    output_filename = FileLocations.get_dropbox_wikipedia_path() + 'wikititle_by_id.pickle'
    logger.info('About to write %s', output_filename)
    with open(output_filename, 'wb') as handle:
        pickle.dump(wikititle_by_id, handle, protocol=pickle.HIGHEST_PROTOCOL)
    logger.info('file written = %s', output_filename)
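# Illustrative read-back sketch: a later stage can restore the id -> title mapping that
# this script writes (path and filename taken from the code above).
#
#   with open(FileLocations.get_dropbox_wikipedia_path() + 'wikititle_by_id.pickle', 'rb') as handle:
#       wikititle_by_id = pickle.load(handle)
#   # wikititle_by_id maps a wikipedia id to its title key from the case-insensitive trie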