def get_feature_avg(svm_in):
    l_svm_data = load_svm_feature(svm_in)
    h_rel_feature = {}
    h_irrel_feature = {}
    rel_cnt = 0
    irrel_cnt = 0
    for data in l_svm_data:
        label = data['score']
        h_feature = data['feature']
        # Map log-domain feature scores back via exp(), flooring very small
        # values (< -20) to zero to avoid underflow noise.
        for key, score in h_feature.items():
            if score < -20:
                score = 0
            else:
                score = math.exp(score)
            h_feature[key] = score
        # Accumulate features separately for relevant / irrelevant examples.
        if label > 0:
            h_rel_feature = add_svm_feature(h_rel_feature, h_feature)
            rel_cnt += 1
        else:
            h_irrel_feature = add_svm_feature(h_irrel_feature, h_feature)
            irrel_cnt += 1

    # Average the accumulated features per class.
    rel_cnt = float(rel_cnt)
    irrel_cnt = float(irrel_cnt)
    for key in h_rel_feature:
        h_rel_feature[key] /= rel_cnt
    for key in h_irrel_feature:
        h_irrel_feature[key] /= irrel_cnt
    return h_rel_feature, h_irrel_feature
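# add_svm_feature and mutiply_svm_feature are helpers defined elsewhere in
# this codebase. Based only on how they are used here (accumulating metric or
# feature dictionaries and then scaling them by 1/count), a minimal sketch of
# the assumed behavior looks like the following; the real implementations may
# differ.
def add_svm_feature(h_a, h_b):
    """Element-wise sum of two {name: value} dictionaries (assumed)."""
    h_sum = dict(h_a)
    for key, value in h_b.items():
        h_sum[key] = h_sum.get(key, 0) + value
    return h_sum


def mutiply_svm_feature(h_feature, multiplier):
    """Scale every value of a {name: value} dictionary (assumed)."""
    return dict((key, value * multiplier) for key, value in h_feature.items())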
def evaluate_normal(self, docs, f_predict):
    print("Evaluating predictions [%s] from [%s]." % (f_predict, docs))
    evaluator = SalienceEva()  # evaluator with default values.

    h_e_total_eva = dict()
    e_p = 0
    p = 0
    skip = 0
    for res in self.load_pairs(docs, f_predict):
        p += 1
        if not res:
            skip += 1
            continue
        predictions, s_e_label, s_evm_label = res

        l_e_pack = self.get_e_labels(predictions, s_e_label)
        if l_e_pack:
            e_p += 1
            h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
            h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            sys.stdout.write('\rEvaluated %d files, %d with entities,'
                             ' %d lines skipped. P@1: %s.' % (
                                 p, e_p, skip, h_e_mean_eva['p@01']))
    print('')

    h_e_mean_eva = {}
    if not e_p == 0:
        h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
    logging.info('finished predicted [%d] docs on entity, eva %s',
                 e_p, json.dumps(h_e_mean_eva))

    res = {'entity': h_e_mean_eva}
    with open(f_predict + '.entity.eval', 'w') as out:
        json.dump(res, out, indent=1)
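# Usage illustration only: SalienceEva.evaluate is called throughout with a
# list of predicted salience scores and a parallel list of gold labels, and
# returns a metric dictionary; the keys referenced in this file include
# 'p@01', 'p@05', 'p@10', 'p@20', 'r@01'..'r@20' and 'auc'. The scores and
# labels below are made-up values for a single document.
def _salience_eva_example():
    evaluator = SalienceEva()  # default configuration, as above.
    l_score = [0.9, 0.1, 0.4]  # predicted salience scores.
    l_label = [1, 0, 0]        # gold salience labels, same order.
    h_eva = evaluator.evaluate(l_score, l_label)
    return h_eva['p@01']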
def predict(self, test_in_name, label_out_name, debug=False):
    """
    Predict the data in test_in_name, and dump the predicted labels to
    label_out_name.
    :param test_in_name: path of the test input file.
    :param label_out_name: path to write the predicted labels.
    :param debug: whether to run the model in debug mode.
    :return:
    """
    res_dir = os.path.dirname(label_out_name)
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    self.model.debug_mode(debug)
    self.model.eval()

    out = open(label_out_name, 'w')
    logging.info('start predicting for [%s]', test_in_name)

    p = 0
    h_total_eva = dict()
    for line in open(test_in_name):
        if self.io_parser.is_empty_line(line):
            continue
        h_out, h_this_eva = self._per_doc_predict(line)
        if h_out is None:
            continue
        h_total_eva = add_svm_feature(h_total_eva, h_this_eva)
        print >> out, json.dumps(h_out)
        p += 1
        h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
        if not p % 1000:
            logging.info('predicted [%d] docs, eva %s', p,
                         json.dumps(h_mean_eva))

    h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / max(p, 1.0))
    l_mean_eva = h_mean_eva.items()
    l_mean_eva.sort(key=lambda item: item[0])
    logging.info('finished predicted [%d] docs, eva %s', p,
                 json.dumps(l_mean_eva))

    json.dump(l_mean_eva, open(label_out_name + '.eval', 'w'), indent=1)
    out.close()
    return
def process(self):
    h_total_eva = {}
    with gzip.open(self.corpus_in) as test_in, \
            open(self.test_out, 'w') as out:
        p = 0
        for line in test_in:
            data = json.loads(line)
            if self.is_empty(data):
                continue
            p += 1

            # Map each event surface word to the event indices it covers,
            # and collect gold salience labels and event ids in order.
            word2eid = defaultdict(list)
            labels = []
            l_e = []
            for index, event in enumerate(data[self.spot_field][body_field]):
                word2eid[event['surface']].append(index)
                labels.append(event['salience'])
                event_id = self.h_event_id.get(self.get_event_head(event), 0)
                l_e.append(event_id)

            text = data[body_field]
            parser = PlaintextParser.from_string(text, Tokenizer('english'))

            # Score events by the rank of the first summary sentence that
            # mentions their surface word.
            predicted = {}
            rank = 1
            for sentence in self.summarizer(parser.document, 10):
                for word in sentence.words:
                    if word in word2eid:
                        eids = word2eid[word]
                        if word not in predicted:
                            predicted[word] = (eids, rank)
                rank += 1

            prediction = [0] * len(labels)
            for w, (eids, rank) in predicted.items():
                for eid in eids:
                    prediction[eid] = 1.0 / rank

            eva = self.evaluator.evaluate(prediction, labels)
            h_out = {
                'docno': data['docno'],
                body_field: {
                    'predict': zip(l_e, prediction),
                },
                'eval': eva,
            }

            h_total_eva = add_svm_feature(h_total_eva, eva)
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            print >> out, json.dumps(h_out)
            if not p % 1000:
                logging.info('predicted [%d] docs, eva %s', p,
                             json.dumps(h_mean_eva))
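# The PlaintextParser / Tokenizer calls above match the sumy library's API;
# assuming that is the library in use, self.summarizer would be any sumy
# summarizer callable. Which summarizer the experiment actually used is not
# shown here, so LexRank below is an illustrative assumption, not the
# original configuration.
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.lex_rank import LexRankSummarizer


def build_summarizer():
    """Return a sumy summarizer object usable as self.summarizer above."""
    return LexRankSummarizer()


def top_sentences(text, num_sentences=10):
    """Parse raw text and return its top summary sentences."""
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    return build_summarizer()(parser.document, num_sentences)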
def evaluate_json_joint(self, docs, f_predict):
    print("Evaluating joint predictions [%s] from [%s]." % (f_predict, docs))
    evaluator = SalienceEva()  # evaluator with default values.

    h_e_total_eva = dict()
    h_e_mean_eva = dict()
    h_evm_total_eva = dict()
    h_evm_mean_eva = dict()
    h_all_total_eva = dict()
    h_all_mean_eva = dict()

    e_p = 0
    evm_p = 0
    all_p = 0
    p = 0

    for res in self.load_pairs(docs, f_predict):
        p += 1
        if not res:
            continue
        predictions, s_e_label, s_evm_label = res

        l_e_pack = self.get_e_labels(predictions, s_e_label)
        l_evm_pack = self.get_evm_labels(predictions, s_evm_label)
        # Concatenate the entity and event packs column-wise, so scores and
        # labels of both kinds can be evaluated together.
        all_pack = zip(*zip(*l_e_pack) + zip(*l_evm_pack))

        if l_e_pack:
            h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
            e_p += 1
            h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)
        if l_evm_pack:
            h_evm = evaluator.evaluate(l_evm_pack[0], l_evm_pack[1])
            evm_p += 1
            h_evm_total_eva = add_svm_feature(h_evm_total_eva, h_evm)
        if all_pack:
            h_all = evaluator.evaluate(all_pack[0], all_pack[1])
            all_p += 1
            h_all_total_eva = add_svm_feature(h_all_total_eva, h_all)

        if not e_p == 0:
            h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
        if not evm_p == 0:
            h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
        if not all_p == 0:
            h_all_mean_eva = mutiply_svm_feature(h_all_total_eva, 1.0 / all_p)

        ep1 = ('%.4f' % h_e_mean_eva['p@01']
               if 'p@01' in h_e_mean_eva else 'N/A')
        evmp1 = ('%.4f' % h_evm_mean_eva['p@01']
                 if 'p@01' in h_evm_mean_eva else 'N/A')
        all1 = ('%.4f' % h_all_mean_eva['p@01']
                if 'p@01' in h_all_mean_eva else 'N/A')

        sys.stdout.write('\rEvaluated %d files, %d with entities and %d '
                         'with events, En P@1: %s, Evm P@1: %s, '
                         'All P@1: %s.' % (p, e_p, evm_p, ep1, evmp1, all1))
    print('')

    h_e_mean_eva = {}
    if not e_p == 0:
        h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
    logging.info('finished predicted [%d] docs on entity, eva %s',
                 e_p, json.dumps(h_e_mean_eva))

    h_evm_mean_eva = {}
    if not evm_p == 0:
        h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
    logging.info('finished predicted [%d] docs on event, eva %s',
                 evm_p, json.dumps(h_evm_mean_eva))

    logging.info("Results to copy:")
    line1 = ["p@01", "p@05", "p@10", "p@20", "auc"]
    line2 = ["r@01", "r@05", "r@10", "r@20"]

    line1_evm_scores = ["%.4f" % h_evm_mean_eva[k] for k in line1]
    line1_ent_scores = ["%.4f" % h_e_mean_eva[k] for k in line1]
    line1_all_scores = ["%.4f" % h_all_mean_eva[k] for k in line1]
    line2_evm_scores = ["%.4f" % h_evm_mean_eva[k] for k in line2]
    line2_ent_scores = ["%.4f" % h_e_mean_eva[k] for k in line2]
    line2_all_scores = ["%.4f" % h_all_mean_eva[k] for k in line2]

    print "\t-\t".join(line1_evm_scores) + "\t-\t-\t" + \
          "\t".join(line1_all_scores) + "\t-\t" + \
          "\t".join(line1_ent_scores)
    print "\t-\t".join(line2_evm_scores) + "\t-\t-\t-\t-\t" + \
          "\t".join(line2_all_scores) + "\t-\t-\t" + \
          "\t".join(line2_ent_scores)

    res = {'entity': h_e_mean_eva, 'event': h_evm_mean_eva}
    with open(f_predict + '.joint.eval', 'w') as out:
        json.dump(res, out, indent=1)
def process(self):
    open_func = gzip.open if self.corpus_in.endswith("gz") else open

    outs = []
    for name in self.feature_names_split:
        out_path = self.test_out + "_" + name.replace(" ", "_") + '.json'
        outs.append(open(out_path, 'w'))
        logging.info("Feature output will be stored at [%s]" % out_path)

    with open_func(self.corpus_in) as in_f:
        l_h_total_eva = [{} for _ in range(self.feature_dim)]
        p = 0
        for line in in_f:
            if self.io.is_empty_line(line):
                continue
            # Instead of providing batch, we just give one by one.
            h_packed_data, m_label = self.io.parse_data([line])

            h_info = json.loads(line)
            key_name = 'docno'
            docno = h_info[key_name]

            p += 1
            l_h_out = self.eval_per_dim(h_packed_data, m_label,
                                        self.reverse_dim, key_name, docno)
            for (dim, h_out), out in zip(enumerate(l_h_out), outs):
                h_this_eva = h_out['eval']
                l_h_total_eva[dim] = add_svm_feature(l_h_total_eva[dim],
                                                     h_this_eva)
                h_mean_eva = mutiply_svm_feature(l_h_total_eva[dim], 1.0 / p)
                print >> out, json.dumps(h_out)
                if not p % 1000:
                    logging.info('predicted [%d] docs, eva %s for [%s]', p,
                                 json.dumps(h_mean_eva),
                                 self.feature_names_split[dim])

    for dim, h_total_eva in enumerate(l_h_total_eva):
        h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
        logging.info('finished predicted [%d] docs, eva %s for [%s]', p,
                     json.dumps(h_mean_eva), self.feature_names_split[dim])

    for (dim, h_total_eva), name in zip(enumerate(l_h_total_eva),
                                        self.feature_names_split):
        h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
        l_mean_eva = sorted(h_mean_eva.items(), key=lambda item: item[0])
        logging.info('finished predicted [%d] docs, eva %s', p,
                     json.dumps(l_mean_eva))
        with open(self.test_out + "_" + name.replace(" ", "_") + '.eval',
                  'w') as o:
            json.dump(l_mean_eva, o, indent=1)

    for out in outs:
        out.close()
def predict(self, test_in_name, label_out_name, debug=False, timestamp=True):
    """
    Predict the data in test_in_name, and dump the predicted labels to
    label_out_name, split into an entity output and an event output.
    :param test_in_name: path of the test input file.
    :param label_out_name: base path for the predicted label output.
    :param debug: whether to run the model in debug mode.
    :param timestamp: whether to append the init time to the output names.
    :return:
    """
    res_dir = os.path.dirname(label_out_name)
    if not os.path.exists(res_dir):
        os.makedirs(res_dir)

    self.model.debug_mode(debug)
    self.model.eval()

    name, ext = os.path.splitext(label_out_name)
    if timestamp:
        ent_label_out_name = name + "_entity_" + self.init_time + ext
        evm_label_out_name = name + "_event_" + self.init_time + ext
    else:
        ent_label_out_name = name + "_entity" + ext
        evm_label_out_name = name + "_event" + ext

    ent_out = open(ent_label_out_name, 'w')
    evm_out = open(evm_label_out_name, 'w')
    outs = [ent_out, evm_out]

    logging.info('start predicting for [%s]', test_in_name)
    logging.info('Test output will be at [%s] and [%s]',
                 ent_label_out_name, evm_label_out_name)

    p = 0
    ent_p = 0
    evm_p = 0
    h_total_ent_eva = dict()
    h_total_evm_eva = dict()

    for line in open(test_in_name):
        if self.io_parser.is_empty_line(line):
            continue
        l_h_out = self._per_doc_predict(line)
        if not l_h_out:
            continue

        for h_out, name, out in zip(l_h_out, self.output_names, outs):
            if not h_out:
                continue
            out.write(json.dumps(h_out) + '\n')
            eva = h_out['eval']
            if name == 'entity':
                ent_p += 1
                h_total_ent_eva = add_svm_feature(h_total_ent_eva, eva)
            if name == 'event':
                evm_p += 1
                h_total_evm_eva = add_svm_feature(h_total_evm_eva, eva)

        p += 1
        if not p % 1000:
            h_mean_ent_eva = mutiply_svm_feature(h_total_ent_eva,
                                                 1.0 / max(ent_p, 1.0))
            h_mean_evm_eva = mutiply_svm_feature(h_total_evm_eva,
                                                 1.0 / max(evm_p, 1.0))
            logging.info(
                'predicted [%d] docs: [%d] with entities, eva %s;'
                '[%d] with events, eva %s',
                p, ent_p, json.dumps(h_mean_ent_eva),
                evm_p, json.dumps(h_mean_evm_eva),
            )

    h_mean_ent_eva = mutiply_svm_feature(h_total_ent_eva,
                                         1.0 / max(ent_p, 1.0))
    h_mean_evm_eva = mutiply_svm_feature(h_total_evm_eva,
                                         1.0 / max(evm_p, 1.0))

    l_mean_ent_eva = sorted(h_mean_ent_eva.items(), key=lambda item: item[0])
    l_mean_evm_eva = sorted(h_mean_evm_eva.items(), key=lambda item: item[0])

    logging.info(
        'finished predicted [%d] docs, [%d] with entities, eva %s'
        '[%d] with events, eva %s',
        p, ent_p, json.dumps(l_mean_ent_eva),
        evm_p, json.dumps(l_mean_evm_eva))

    self.tab_scores(h_mean_ent_eva, h_mean_evm_eva)

    json.dump(l_mean_ent_eva, open(ent_label_out_name + '.eval', 'w'),
              indent=1)
    json.dump(l_mean_evm_eva, open(evm_label_out_name + '.eval', 'w'),
              indent=1)

    ent_out.close()
    evm_out.close()
    return
def split_and_eval(self, docs, f_predict):
    print("Split and evaluating joint predictions [%s]." % f_predict)
    evaluator = SalienceEva()  # evaluator with default values.

    h_e_total_eva = dict()
    h_e_mean_eva = dict()
    h_evm_total_eva = dict()
    h_evm_mean_eva = dict()

    e_p = 0
    evm_p = 0
    p = 0

    with open(f_predict + '.entity.json', 'w') as entity_out, \
            open(f_predict + '.event.json', 'w') as event_out:
        for res in self.load_pairs(docs, f_predict):
            p += 1
            if not res:
                continue
            doc, predictions, s_e_label, s_evm_label = res

            l_e_pack = self.get_e_labels(predictions, s_e_label)
            l_evm_pack = self.get_evm_labels(predictions, s_evm_label)

            pred_event = {'bodyText': {}}
            pred_entity = {'bodyText': {}}

            if l_e_pack:
                h_e = evaluator.evaluate(l_e_pack[0], l_e_pack[1])
                e_p += 1
                h_e_total_eva = add_svm_feature(h_e_total_eva, h_e)

                pred_entity['bodyText']['predict'] = [
                    [eid, score]
                    for eid, score in zip(l_e_pack[2], l_e_pack[0])
                ]
                pred_entity['docno'] = doc['docno']
                pred_entity['eval'] = h_e

                entity_out.write(json.dumps(pred_entity))
                entity_out.write('\n')

            if l_evm_pack:
                h_evm = evaluator.evaluate(l_evm_pack[0], l_evm_pack[1])
                evm_p += 1
                h_evm_total_eva = add_svm_feature(h_evm_total_eva, h_evm)

                pred_event['bodyText']['predict'] = [
                    [eid, score]
                    for eid, score in zip(l_evm_pack[2], l_evm_pack[0])
                ]
                pred_event['docno'] = doc['docno']
                pred_event['eval'] = h_evm

                event_out.write(json.dumps(pred_event))
                event_out.write('\n')

            if not e_p == 0:
                h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
            if not evm_p == 0:
                h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva,
                                                     1.0 / evm_p)

            ep1 = ('%.4f' % h_e_mean_eva['p@01']
                   if 'p@01' in h_e_mean_eva else 'N/A')
            evmp1 = ('%.4f' % h_evm_mean_eva['p@01']
                     if 'p@01' in h_evm_mean_eva else 'N/A')

            sys.stdout.write(
                '\rEvaluated %d files, %d with entities and %d '
                'with events, En P@1: %s, Evm P@1: %s, ' % (
                    p, e_p, evm_p, ep1, evmp1))
    print('')

    h_e_mean_eva = {}
    if not e_p == 0:
        h_e_mean_eva = mutiply_svm_feature(h_e_total_eva, 1.0 / e_p)
    logging.info('finished predicted [%d] docs on entity, eva %s',
                 e_p, json.dumps(h_e_mean_eva))

    h_evm_mean_eva = {}
    if not evm_p == 0:
        h_evm_mean_eva = mutiply_svm_feature(h_evm_total_eva, 1.0 / evm_p)
    logging.info('finished predicted [%d] docs on event, eva %s',
                 evm_p, json.dumps(h_evm_mean_eva))

    with open(f_predict + '.entity.eval', 'w') as out:
        json.dump([[k, v] for k, v in h_e_mean_eva.items()], out, indent=1)

    with open(f_predict + '.event.eval', 'w') as out:
        json.dump([[k, v] for k, v in h_evm_mean_eva.items()], out, indent=1)