def main(args):
    input_dataset = args[1]
    input_run = args[2]
    output_dir = args[3]
    gold_relations = [json.loads(x) for x in open('%s/relations.json' % input_dataset)]
    predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]

    language = identify_language(gold_relations)
    all_correct = validate_relation_list(predicted_relations, language)
    if not all_correct:
        exit(1)

    output_file = open('%s/evaluation.prototext' % output_dir, 'w')
    print 'Evaluation for all discourse relations'
    write_results('All', evaluate(gold_relations, predicted_relations), output_file)

    print 'Evaluation for explicit discourse relations only'
    explicit_gold_relations = [x for x in gold_relations if x['Type'] == 'Explicit']
    explicit_predicted_relations = [x for x in predicted_relations if x['Type'] == 'Explicit']
    write_results('Explicit only',
                  evaluate(explicit_gold_relations, explicit_predicted_relations), output_file)

    print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    non_explicit_gold_relations = [x for x in gold_relations if x['Type'] != 'Explicit']
    non_explicit_predicted_relations = [x for x in predicted_relations if x['Type'] != 'Explicit']
    write_results('Non-explicit only',
                  evaluate(non_explicit_gold_relations, non_explicit_predicted_relations), output_file)
    output_file.close()
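
# For reference, a minimal sketch of one line of relations.json / output.json as the
# scorer above reads it. Field names are taken from the accesses in this code
# ('ID', 'Type', 'Sense', and the 'TokenList' of Connective/Arg1/Arg2); the values
# and token-list layout shown are illustrative assumptions, not the full schema:
#
# {"ID": 14905,
#  "Type": "Explicit",
#  "Sense": ["Comparison.Contrast"],
#  "Connective": {"TokenList": [5]},
#  "Arg1": {"TokenList": [0, 1, 2, 3, 4]},
#  "Arg2": {"TokenList": [6, 7, 8, 9]}}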
def evaluate_and_visualize(config_name, model_path, output_path, gene_variant=None):
    # Prepare tokenizer, dataset, and model
    configs = get_configs(config_name, verbose=False)
    if configs['use_gene_features']:
        assert gene_variant is not None
        configs['gene_variant'] = gene_variant
    tokenizer = BertTokenizer.from_pretrained(configs['transformer'], do_basic_tokenize=False)
    train_set, dev_set, test_set = load_oneie_dataset(configs['base_dataset_path'], tokenizer)
    model = BasicCorefModel(configs)

    # Reload the model and evaluate
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['model_state_dict'])
    print('Evaluation on the dev set', flush=True)
    evaluate(model, dev_set, configs)
    print('Evaluation on the test set', flush=True)
    evaluate(model, test_set, configs)

    # Generate visualizations (for the test set)
    generate_coref_preds(model, test_set, '_predictions.json')
    generate_visualizations('_predictions.json', output_path)
    os.remove('_predictions.json')
def evaluate(scorer_dataset_file, feature_file, **kwargs):
    scorer_dataset = joblib.load(scorer_dataset_file)
    if feature_file:
        feature_list = json.load(feature_file)['features']
    else:
        feature_list = None
    scorer.evaluate(scorer_dataset, feature_list, **kwargs)
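
# A minimal usage sketch for the wrapper above. The feature JSON layout
# ({"features": [...]}) follows the code; both file names are hypothetical:
#
# with open('features.json') as feature_file:
#     evaluate('scorer_dataset.joblib', feature_file)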
def main(args):
    input_dataset = args[1]
    input_run = args[2]
    output_dir = args[3]
    gold_relations = [json.loads(x) for x in open('%s/relations.json' % input_dataset)]
    predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]

    if len(gold_relations) != len(predicted_relations):
        err_message = 'Gold standard has %s instances; predicted %s instances' % \
            (len(gold_relations), len(predicted_relations))
        print >> sys.stderr, err_message
        exit(1)

    language = identify_language(gold_relations)
    all_correct = validate_relation_list(predicted_relations, language)
    if not all_correct:
        print >> sys.stderr, 'Invalid format'
        exit(1)

    gold_relations = sorted(gold_relations, key=lambda x: x['ID'])
    predicted_relations = sorted(predicted_relations, key=lambda x: x['ID'])
    use_gold_standard_types(gold_relations, predicted_relations)

    output_file = open('%s/evaluation.prototext' % output_dir, 'w')
    print 'Evaluation for all discourse relations'
    write_results('All', evaluate(gold_relations, predicted_relations), output_file)

    print 'Evaluation for explicit discourse relations only'
    explicit_gold_relations = [x for x in gold_relations if x['Type'] == 'Explicit']
    explicit_predicted_relations = [x for x in predicted_relations if x['Type'] == 'Explicit']
    write_results('Explicit only',
                  evaluate(explicit_gold_relations, explicit_predicted_relations), output_file)

    print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    non_explicit_gold_relations = [x for x in gold_relations if x['Type'] != 'Explicit']
    non_explicit_predicted_relations = [x for x in predicted_relations if x['Type'] != 'Explicit']
    write_results('Non-explicit only',
                  evaluate(non_explicit_gold_relations, non_explicit_predicted_relations), output_file)
    output_file.close()
def main(args):
    input_dataset = args[1]
    input_run = args[2]
    output_dir = args[3]
    gold_relations = [json.loads(x) for x in open('%s/relations.json' % input_dataset)]
    predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]

    language = identify_language(gold_relations)
    all_correct = validate_relation_list(predicted_relations, language)
    if not all_correct:
        exit(1)

    output_file = open('%s/evaluation.prototext' % output_dir, 'w')
    print 'Evaluation for all discourse relations'
    write_results('All', evaluate(gold_relations, predicted_relations), output_file)

    print 'Evaluation for explicit discourse relations only'
    explicit_gold_relations = [x for x in gold_relations if x['Type'] == 'Explicit']
    explicit_predicted_relations = [x for x in predicted_relations if x['Type'] == 'Explicit']
    write_results('Explicit only',
                  evaluate(explicit_gold_relations, explicit_predicted_relations), output_file)

    print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    non_explicit_gold_relations = [x for x in gold_relations if x['Type'] != 'Explicit']
    non_explicit_predicted_relations = [x for x in predicted_relations if x['Type'] != 'Explicit']
    write_results('Non-explicit only',
                  evaluate(non_explicit_gold_relations, non_explicit_predicted_relations), output_file)

    print '\nPartial Evaluation for all discourse relations'
    write_partial_match_results('All (partial match)',
                                partial_evaluate(gold_relations, predicted_relations, 0.7), output_file)

    print '\nPartial Evaluation for explicit discourse relations'
    write_partial_match_results('Explicit only (partial match)',
                                partial_evaluate(explicit_gold_relations, explicit_predicted_relations, 0.7),
                                output_file)

    print '\nPartial Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    write_partial_match_results('Non-explicit only (partial match)',
                                partial_evaluate(non_explicit_gold_relations, non_explicit_predicted_relations, 0.7),
                                output_file)
    output_file.close()
def main(args):
    input_dataset = args[1]
    input_run = args[2]
    output_dir = args[3]

    relation_file = '%s/relations.json' % input_dataset
    gold_relations = []
    file_line = 0
    for x in open(relation_file):
        try:
            # Some lines carry a prefix before the JSON object; skip to the first '{'.
            gold_relations.append(json.loads(x[x.index('{'):]))
        except:
            print "Error reading json file on line %s" % file_line
            print x
        file_line = file_line + 1
    # gold_relations = [json.loads(x) for x in open('%s/relations.json' % input_dataset)]
    predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]

    if len(gold_relations) != len(predicted_relations):
        err_message = 'Gold standard has %s instances; predicted %s instances' % \
            (len(gold_relations), len(predicted_relations))
        print >> sys.stderr, err_message
        exit(1)

    language = identify_language(gold_relations)
    all_correct = validate_relation_list(predicted_relations, language)
    if not all_correct:
        print >> sys.stderr, 'Invalid format'
        exit(1)

    gold_relations = sorted(gold_relations, key=lambda x: x['ID'])
    predicted_relations = sorted(predicted_relations, key=lambda x: x['ID'])
    use_gold_standard_types(gold_relations, predicted_relations)

    output_file = open('%s/evaluation.prototext' % output_dir, 'w')
    print 'Evaluation for all discourse relations'
    write_results('All', evaluate(gold_relations, predicted_relations), output_file)

    print 'Evaluation for explicit discourse relations only'
    explicit_gold_relations = [x for x in gold_relations if x['Type'] == 'Explicit']
    explicit_predicted_relations = [x for x in predicted_relations if x['Type'] == 'Explicit']
    write_results('Explicit only',
                  evaluate(explicit_gold_relations, explicit_predicted_relations), output_file)

    print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    non_explicit_gold_relations = [x for x in gold_relations if x['Type'] != 'Explicit']
    non_explicit_predicted_relations = [x for x in predicted_relations if x['Type'] != 'Explicit']
    write_results('Non-explicit only',
                  evaluate(non_explicit_gold_relations, non_explicit_predicted_relations), output_file)
    output_file.close()
def main():
    """Test the scorer

    There are 29 gold relations. We corrupt 5 relations and remove 1.

    Precision = (29 - 6) / 28 = 0.8214
    Recall = (29 - 6) / 29 = 0.7931
    F1 = 2 * (0.8214 * 0.7931) / (0.8214 + 0.7931) = 0.8070
    """
    relations = [json.loads(x) for x in open('tutorial/pdtb_trial_data.json')]
    output_relations = [convert_to_output(x) for x in relations]
    output_relations[1]['Connective']['TokenList'] = [0]
    output_relations[3]['Arg1']['TokenList'].pop(4)
    output_relations[4]['Arg2']['TokenList'].pop(4)
    output_relations[5]['Arg2']['TokenList'].pop(4)
    output_relations[6]['Sense'] = [u'Contingency.Condition']  # This will hurt sense recall
    output_relations.pop(0)  # This will hurt all precision
    scorer.evaluate(relations, output_relations)
    return output_relations
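
# Sanity check for the expected figures in the docstring above. This is plain
# arithmetic on the counts stated there (23 correct of 28 predicted and 29 gold),
# independent of the scorer itself; the function name is ours, for illustration.
def check_expected_scores():
    correct = 29 - 6            # 5 corrupted relations + 1 removed
    precision = correct / 28.0  # 23/28
    recall = correct / 29.0     # 23/29
    f1 = 2 * precision * recall / (precision + recall)
    assert round(precision, 4) == 0.8214
    assert round(recall, 4) == 0.7931
    assert round(f1, 4) == 0.8070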
def tester(world_name, punter_names, fout):
    global simple_gui
    global FPS_FACT
    if MAC_MUSIC:
        music_proc = subprocess.Popen([MUSIC_CMD, MUSIC[music_id], '-I', 'rc'])
    N = len(punter_names)

    ### START OF PROGRAM
    world_test = world.World(world_name)
    pod_list = []
    zombies = []
    cnt = 0
    default_dir = os.getcwd()
    for name in punter_names:
        pod = pods.CarPod(world_test)
        pod_list.append(pod)
        pod.score = 0.0
        pod.stat = "-"
        pod.name = name
        pod.mess = "Uninitialized"
        try:
            # set the current path to the punters directory
            punters_path = 'punters_test/' + name
            os.chdir(punters_path)
            plug = importlib.import_module('punters_test.' + name + '.plugin')
            # call the plugin to equip the car
            plug.equip_car(pod)
            os.chdir(default_dir)
            pod.controller = plug.controller
            hue = (360.0 * cnt) / N
            col = pygame.Color(0)
            col.hsla = (hue, 100, 50, 0)
            pod.col = (col.r, col.g, col.b)
            cnt += 1
        except:
            print name
            print "Unexpected error:", sys.exc_info()
            # fout.write(name+" Error "+ str(sys.exc_info()[0]))
            traceback.print_tb(sys.exc_info()[2])
            pod.mess = "Loading Error: " + str(sys.exc_info()[0])
            pod.score = 0.0
            pod.stat = "E"
            zombies.append(pod)
            os.chdir(default_dir)

    runners = copy.copy(pod_list)
    # remove zombies
    for pod in zombies:
        runners.remove(pod)

    if GUI:
        simple_gui = gui.SimpleGui(frames_per_sec=int(FPS_FACT / world_test.dt),
                                   world=world_test, pods=runners, back_ground=(5, 5, 5))

    # use a control to activate the car.
    control = pods.Control()
    while runners:
        zombies = []
        for pod in runners:
            try:
                pod.controller(pod)
                pod.step()
                score, kill, mess = scorer.evaluate(pod)
                pod.score = max(score, 0)
                pod.mess = mess
            except:
                print pod.name + ": Unexpected error:", sys.exc_info()
                traceback.print_tb(sys.exc_info()[2])
                pod.score = 0
                pod.mess = "RunError ->" + str(sys.exc_info())
                kill = True
                pod.stat = "e"
            if kill:
                zombies.append(pod)
        # remove crashed
        for pod in zombies:
            runners.remove(pod)
        ranked = sorted(pod_list, key=lambda x: x.score, reverse=True)
        if GUI:
            disp = ""
            pos = [0, 10]
            simple_gui.clear()
            for pod in ranked:
                col = pod.col
                gui_base.draw_string(simple_gui.screen, pod.stat + ":" + pod.name,
                                     pos, col, FONT_SIZE, 'Courier New')
                pos[1] += FONT_SIZE
            simple_gui.display(clear=False, fps=int(FPS_FACT / world_test.dt))
            if simple_gui.check_for_quit():
                sys.exit(0)
            if simple_gui.get_pressed()[gui.keys.K_p]:
                pause = True
            if simple_gui.get_pressed()[gui.keys.K_EQUALS]:
                FPS_FACT = min(FPS_FACT * 2, 200)
                print FPS_FACT
            if simple_gui.get_pressed()[gui.keys.K_MINUS]:
                FPS_FACT = max(int(FPS_FACT / 2), 1)
                print FPS_FACT
            if simple_gui.get_pressed()[gui.keys.K_s]:
                pause = False

    ranked = sorted(pod_list, key=lambda x: x.score, reverse=True)
    for pod in ranked:
        buff = "%15s %6.3f %s" % (pod.name + ":", pod.score, ":" + pod.mess + "\n")
        fout.write(buff)
    if MAC_MUSIC:
        music_proc.terminate()
    return pod_list
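
# For context, a minimal sketch of the plugin interface the loader above expects:
# each punter package provides punters_test/<name>/plugin.py exposing equip_car(pod)
# and controller(pod) (names taken from the code above; the bodies here are
# placeholder assumptions about what a punter would do).
#
# def equip_car(pod):
#     # called once at load time, with the cwd set to this punter's directory
#     pass
#
# def controller(pod):
#     # called once per simulation step; inspect pod state and steer the car here
#     pass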
def train(config_name, gene_variant=None):
    # Prepare tokenizer, dataset, and model
    configs = get_configs(config_name, verbose=False)
    if configs['use_gene_features']:
        assert gene_variant is not None
        configs['gene_variant'] = gene_variant
    tokenizer = BertTokenizer.from_pretrained(configs['transformer'], do_basic_tokenize=False)
    train_set, dev_set, test_set = load_oneie_dataset(configs['base_dataset_path'], tokenizer)
    model = BasicCorefModel(configs)

    # Initialize the optimizer
    num_train_docs = len(train_set)
    epoch_steps = int(math.ceil(num_train_docs / configs['batch_size']))
    num_train_steps = int(epoch_steps * configs['epochs'])
    num_warmup_steps = int(num_train_steps * 0.1)
    optimizer = model.get_optimizer(num_warmup_steps, num_train_steps)
    print('Initialized optimizer')

    # Main training loop
    best_dev_score, iters, batch_loss = 0.0, 0, 0
    for epoch in range(configs['epochs']):
        # print('Epoch: {}'.format(epoch))
        print('\n')
        progress = tqdm.tqdm(total=epoch_steps, ncols=80, desc='Train {}'.format(epoch))
        accumulated_loss = RunningAverage()
        train_indices = list(range(num_train_docs))
        random.shuffle(train_indices)
        for train_idx in train_indices:
            iters += 1
            inst = train_set[train_idx]
            iter_loss = model(inst, is_training=True)[0]
            iter_loss /= configs['batch_size']
            iter_loss.backward()
            batch_loss += iter_loss.data.item()
            if iters % configs['batch_size'] == 0:
                accumulated_loss.update(batch_loss)
                torch.nn.utils.clip_grad_norm_(model.parameters(), configs['max_grad_norm'])
                optimizer.step()
                optimizer.zero_grad()
                batch_loss = 0
                # Update progress bar
                progress.update(1)
                progress.set_postfix_str('Average Train Loss: {}'.format(accumulated_loss()))
        progress.close()

        # Evaluation after each epoch
        print('Evaluation on the dev set', flush=True)
        dev_score = evaluate(model, dev_set, configs)['avg']

        # Save model if it has better dev score
        if dev_score > best_dev_score:
            best_dev_score = dev_score
            # Evaluation on the test set
            print('Evaluation on the test set', flush=True)
            evaluate(model, test_set, configs)
            # Save the model
            save_path = os.path.join(configs['saved_path'], 'model.pt')
            torch.save({'model_state_dict': model.state_dict()}, save_path)
            print('Saved the model', flush=True)
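
# The loop above accumulates gradients over batch_size single-document forward
# passes before each optimizer step. A self-contained sketch of that pattern in
# generic PyTorch (the function name, toy model, and data below are stand-ins,
# not the coref model above):

import torch

def accumulate_steps(model, optimizer, instances, batch_size, max_grad_norm=1.0):
    for i, (x, y) in enumerate(instances, start=1):
        loss = torch.nn.functional.mse_loss(model(x), y)
        # Scale each micro-step so the summed gradients match a mean over the batch
        (loss / batch_size).backward()
        if i % batch_size == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()

# e.g. with a toy linear model:
# model = torch.nn.Linear(4, 1)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# data = [(torch.randn(4), torch.randn(1)) for _ in range(8)]
# accumulate_steps(model, optimizer, data, batch_size=4)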
print "Unexpected error:", sys.exc_info() traceback.print_tb(sys.exc_info()[2]) pod.mess = "Loading Error: " + str(sys.exc_info()[0]) os.chdir(default_dir) sys.exit(0) os.chdir(default_dir) if GUI: simple_gui = gui.SimpleGui(frames_per_sec=int(FPS_FACT / track.dt), world=track, pods=[pod], back_ground=(5, 5, 5)) while True: pod.controller(pod) pod.step() if GUI: simple_gui.set_message(str(pod.state)) simple_gui.display() if simple_gui.check_for_quit(): sys.exit(0) score, kill, mess = scorer.evaluate(pod) if kill: print " mess=", mess print " score=", score break
"""CONLL Shared Task 2015 Scorer """ import argparse import json from scorer import evaluate if __name__ == '__main__': parser = argparse.ArgumentParser(description="Evaluate system's output against the gold standard") parser.add_argument('gold', help='Gold standard file') parser.add_argument('predicted', help='System output file') args = parser.parse_args() gold_list = [json.loads(x) for x in open(args.gold)] predicted_list = [json.loads(x) for x in open(args.predicted)] print 'Evaluation for all discourse relations' evaluate(gold_list, predicted_list) print 'Evaluation for explicit discourse relations only' explicit_gold_list = [x for x in gold_list if x['Type'] == 'Explicit'] explicit_predicted_list = [x for x in predicted_list if x['Type'] == 'Explicit'] evaluate(explicit_gold_list, explicit_predicted_list) print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)' non_explicit_gold_list = [x for x in gold_list if x['Type'] != 'Explicit'] non_explicit_predicted_list = [x for x in predicted_list if x['Type'] != 'Explicit'] evaluate(non_explicit_gold_list, non_explicit_predicted_list)
def run_test(images, base_dir):
    zoom_prefix = str(zoom_level) + 'x/' if zoom_level > 1 else ''
    for image in images:
        # Set the current image for the evaluation scorer
        scorer.set_current_image(image)
        # if not image.startswith('009-NW'):
        #     continue
        print('Processing: ' + image)

        # Get OCR data from the oxford API
        data = oxford_api.get_json_data(image, base_dir, zoom_level, img_pref)
        ai2_zoom_level = 3
        # ai2_data = ai2_api.get_json_data(image, base_dir, ai2_zoom_level, img_pref)
        # ai2_boxes = ai2_api.convert_to_boxes(ai2_data, ai2_zoom_level)

        # Extract lines from the image
        lines = liner.get_lines(image, base_dir)

        # Extract hierarchical contours
        h_boxes, hierarchy = hallucinator.get_contours(
            image, base_dir, img_pref + 'box_hallucinations/' + image)
        # Here we could filter out top level boxes to get rid
        # of legends, etc.
        root_boxes = hallucinator.get_root_contours(h_boxes, hierarchy)
        # import pdb; pdb.set_trace()
        # best_root = hallucinator.get_most_nested(root_boxes, hierarchy, h_boxes)
        # if best_root is None:
        best_rects = h_boxes
        base_box = get_full_box(image, base_dir)
        # else:
        #     best_rects = hallucinator.get_rects(best_root[1], h_boxes)
        #     base_box = hallucinator.contour_to_box(best_root[0][1])
        child_boxes = hallucinator.contours_to_boxes(
            hallucinator.get_child_contours(best_rects, hierarchy))
        # gt_boxes = get_gt_boxes(image, img_pref)
        margins = spacer.get_whitespace(image, base_dir)
        ocr_boxes, raw_boxes = boxer.get_boxes(
            data, zoom_level, lines, img_pref + 'combos/' + image + '.txt',
            child_boxes, margins, img_pref + 'google_cache/' + zoom_prefix,
            base_dir + '/' + zoom_prefix, image)
        # box_points = get_v_points(raw_boxes)
        # voronoi.process_image_points(image, base_dir, img_pref + 'voronoi/', box_points)
        # ocr_boxes, raw_boxes = boxer.get_boxes(ai2_data, zoom_level, lines,
        #     img_pref + 'combos/' + image + '.txt', child_boxes)

        # Merge the oxford ocr boxes with the ai2 boxes
        # boxer.merge_ocr_boxes(ocr_boxes, ai2_boxes)
        merged_boxes = boxer.merge_box_groups(child_boxes, ocr_boxes, 0.9, base_box)
        # merged_boxes = gt_boxes
        merged_labels = boxer.merge_ocr_boxes(raw_boxes, [])  # ai2_boxes)

        # TODO: Ensure that this is sorted right
        # boxes = boxer.add_labels(merged_boxes, merged_labels, 0.9)
        # boxes = cloud_api.add_labels(merged_boxes, base_dir + '/', image,
        #     img_pref + 'google_cache/', 1)
        boxes = cloud_api.add_labels(merged_boxes, base_dir + '/' + zoom_prefix, image,
                                     img_pref + 'google_cache/' + zoom_prefix, zoom_level)
        scores = liner.rate_lines(lines, boxes)
        filtered_lines = liner.filter_lines(lines, boxes, scores)
        new_lines = liner.remove_lines(lines, filtered_lines, scores)
        rows, cols = score_rows.get_structure(boxes, new_lines)
        # predicted_boxes = boxer.predict_missing_boxes(rows, cols, boxes)
        scorer.evaluate_cells(image, img_pref, boxes)  # + predicted_boxes)
        # import pdb; pdb.set_trace()
        if verbose:
            print_structure(rows, 'Rows')
            print_structure(cols, 'Cols')
        # draw_lines(base_dir + '/' + image, lines, img_pref + 'table_labeling/' + image + '_orig.jpg')
        # draw_lines(base_dir + '/' + image, new_lines, img_pref + 'table_labeling/' + image)
        # draw_structure(translate_box_paradigm(raw_boxes), base_dir + '/' + image,
        #     img_pref + 'table_structure/' + image + '_oxford_ocr.jpg')
        # draw_structure(translate_box_paradigm(boxes), base_dir + '/' + image,
        #     img_pref + 'table_structure/' + image + '_merged_boxes.jpg')
        # draw_structure(translate_box_paradigm(merged_labels), base_dir + '/' + image,
        #     img_pref + 'table_structure/' + image + '_merged_ocr.jpg')
        # draw_structure(translate_box_paradigm(ai2_boxes), base_dir + '/' + image,
        #     img_pref + 'table_structure/' + image + '_ai2_ocr.jpg')
        # draw_structure(rows, base_dir + '/' + image, img_pref + 'table_structure/' + image + '_rows.jpg')
        # draw_structure(cols, base_dir + '/' + image, img_pref + 'table_structure/' + image + '_cols.jpg')
        spreadsheeter.output(
            rows, cols, boxes,
            img_pref + xlsx_path + '/' + zoom_prefix + image + '.xlsx',
            img_pref + json_out_path + '/' + zoom_prefix + image + '.json')
        if verbose:
            print('Estimating (' + str(len(new_lines[0]) - 1) + ' x ' +
                  str(len(new_lines[1]) - 1) + ')')
            print()
        if sleep_delay > 0:
            time.sleep(sleep_delay)

    scorer.score_cells_overall()
    scorer.evaluate()
def scores_compute(gold_json, systems):
    """Verify and compute scores of all system outputs."""

    def to_percent(vals):
        return [v * 100.0 for v in vals]

    gold_list = [json.loads(x) for x in open(gold_json)]

    scores = {}
    for system_name, system_json in systems:
        log.debug("- validating system '{}' ('{}')...".format(system_name, system_json))
        if system_json != gold_json and not validator.validate_file(system_json):
            log.error("Invalid system output format in '{}' ('{}')!".format(system_name, system_json))
            exit(-1)

        log.debug("- scoring system '{}' ('{}')...".format(system_name, system_json))
        if system_json != gold_json:
            predicted_list = [json.loads(x) for x in open(system_json)]
        else:  # gold standard as system output
            import copy
            predicted_list = conv_gold_to_output(copy.deepcopy(gold_list))

        connective_cm, arg1_cm, arg2_cm, rel_arg_cm, sense_cm, precision, recall, f1 = \
            scorer.evaluate(gold_list, predicted_list)
        scores[system_name] = {
            'conn': to_percent(connective_cm.get_prf('yes')),
            'arg1': to_percent(arg1_cm.get_prf('yes')),
            'arg2': to_percent(arg2_cm.get_prf('yes')),
            'comb': to_percent(rel_arg_cm.get_prf('yes')),
            'sense': to_percent(cm_avg_prf(sense_cm)),
            'overall': to_percent((precision, recall, f1)),
        }
    return scores
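
# A small sketch of consuming the returned dict: each entry maps a metric name to
# a [precision, recall, F1] triple in percent. The file and system names below
# are hypothetical.
#
# scores = scores_compute('relations.json', [('my-parser', 'output.json')])
# for system_name, metrics in scores.items():
#     for metric in ('conn', 'arg1', 'arg2', 'comb', 'sense', 'overall'):
#         p, r, f = metrics[metric]
#         print('{} {}: P={:.2f} R={:.2f} F1={:.2f}'.format(system_name, metric, p, r, f))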
input_dataset = sys.argv[1]
input_run = sys.argv[2]
output_dir = sys.argv[3]
gold_relations = [json.loads(x) for x in open('%s/pdtb-data.json' % input_dataset)]
predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]
all_correct = validate_relation_list(predicted_relations)
if not all_correct:
    exit(1)

output_file = open('%s/evaluation.prototext' % output_dir, 'w')
print 'Evaluation for all discourse relations'
write_results('All', evaluate(gold_relations, predicted_relations), output_file)

print 'Evaluation for explicit discourse relations only'
explicit_gold_relations = [x for x in gold_relations if x['Type'] == 'Explicit']
explicit_predicted_relations = [x for x in predicted_relations if x['Type'] == 'Explicit']
write_results('Explicit only',
              evaluate(explicit_gold_relations, explicit_predicted_relations), output_file)

print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    write_proto_text('%s Sense recall' % prefix, r, output_file)
    write_proto_text('%s Sense f1' % prefix, f, output_file)


if __name__ == '__main__':
    input_dataset = sys.argv[1]
    input_run = sys.argv[2]
    output_dir = sys.argv[3]
    gold_relations = [json.loads(x) for x in open('%s/pdtb-data.json' % input_dataset)]
    predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]
    all_correct = validate_relation_list(predicted_relations)
    if not all_correct:
        exit(1)

    output_file = open('%s/evaluation.prototext' % output_dir, 'w')
    print 'Evaluation for all discourse relations'
    write_results('All', evaluate(gold_relations, predicted_relations), output_file)

    print 'Evaluation for explicit discourse relations only'
    explicit_gold_relations = [x for x in gold_relations if x['Type'] == 'Explicit']
    explicit_predicted_relations = [x for x in predicted_relations if x['Type'] == 'Explicit']
    write_results('Explicit only',
                  evaluate(explicit_gold_relations, explicit_predicted_relations), output_file)

    print 'Evaluation for non-explicit discourse relations only (Implicit, EntRel, AltLex)'
    non_explicit_gold_relations = [x for x in gold_relations if x['Type'] != 'Explicit']
    non_explicit_predicted_relations = [x for x in predicted_relations if x['Type'] != 'Explicit']
    write_results('Non-explicit only',
                  evaluate(non_explicit_gold_relations, non_explicit_predicted_relations), output_file)
    output_file.close()
def write_proto_text(key, value, f):
    f.write('measure {\n key: "%s" \n value: "%s"\n}\n' % (key, round(value, 4)))


if __name__ == '__main__':
    input_dataset = sys.argv[1]
    input_run = sys.argv[2]
    output_dir = sys.argv[3]
    gold_relations = [json.loads(x) for x in open('%s/pdtb-data.json' % input_dataset)]
    predicted_relations = [json.loads(x) for x in open('%s/output.json' % input_run)]
    all_correct = validate_relation_list(predicted_relations)
    if not all_correct:
        exit(1)

    connective_cm, arg1_cm, arg2_cm, rel_arg_cm, sense_cm, precision, recall, f1 = \
        evaluate(gold_relations, predicted_relations)
    output_file = open('%s/evaluation.prototext' % output_dir, 'w')
    write_proto_text('Parser precision', precision, output_file)
    write_proto_text('Parser recall', recall, output_file)
    write_proto_text('Parser f1', f1, output_file)
    p, r, f = connective_cm.get_prf('yes')
    write_proto_text('Explicit connective precision', p, output_file)
    write_proto_text('Explicit connective recall', r, output_file)
    write_proto_text('Explicit connective f1', f, output_file)
    p, r, f = arg1_cm.get_prf('yes')
    write_proto_text('Arg1 extraction precision', p, output_file)
    write_proto_text('Arg1 extraction recall', r, output_file)
    write_proto_text('Arg1 extraction f1', f, output_file)
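
# For reference, each write_proto_text call above appends one block like the
# following to evaluation.prototext (rendered from the format string; the value
# shown is illustrative):
#
# measure {
#  key: "Parser precision"
#  value: "0.8214"
# }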