def __init__(self, training_properties, train_iter, dev_iter, test_iter, device):
    self.optimizer_type = training_properties["optimizer"]
    self.learning_rate = training_properties["learning_rate"]
    self.weight_decay = training_properties["weight_decay"]
    self.momentum = training_properties["momentum"]
    self.norm_ratio = training_properties["norm_ratio"]
    self.epoch = training_properties["epoch"]
    self.topk = training_properties["topk"]
    self.print_every = training_properties["print_every_batch_step"]
    self.save_every = training_properties["save_every_epoch"]
    self.eval_every = training_properties["eval_every"]
    self.save_path = training_properties["save_path"]
    self.openAIAdamSchedulerType = training_properties["scheduler_type"]
    self.amsgrad = training_properties["amsgrad"]
    self.partial_adam = training_properties["partial_adam"]

    self.train_iter = train_iter
    self.dev_iter = dev_iter
    self.test_iter = test_iter

    self.device = device

    self.dev_evaluator, self.test_evaluator = Evaluator().evaluator_factory("single_model_evaluator", self.device)
def test_evaluate(self):
    evaluator = Evaluator(model_dir=self.model_dir, input_shape=self.input_shape)
    BAR, EAR = evaluator.evaluate(basic_model=self.basic_model,
                                  evaluate_model=self.evaluate_model,
                                  valid_stocks=self.valid_stocks,
                                  rounds=3)
    print(BAR, EAR)
    self.assertNotEqual(BAR, EAR)
def prec_recall(data, gt):
    search_engine = SearchEngine(data)
    print('\n> Running Evaluation...\n', end='')
    evaluator = Evaluator(search_engine, gt)
    prec, avg_prec_recall = evaluator.evaluate()

    mkdir(EVALUATION_PATH)
    save_to_csv(prec, os.path.join(EVALUATION_PATH, 'precision.csv'))
    save_to_csv(avg_prec_recall, os.path.join(EVALUATION_PATH, 'avg_prec_recall.csv'), index=True)
    print('\n Results of evaluation saved to directory "%s"' % os.path.relpath(EVALUATION_PATH, PROJ_ROOT))
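# Hedged usage sketch for prec_recall above. The snippet does not show how `data`
# and `gt` are produced; `load_corpus` and `load_ground_truth` below are hypothetical
# helpers standing in for whatever loaders this repository actually uses.
# if __name__ == '__main__':
#     data = load_corpus('data/corpus.json')            # hypothetical helper and path
#     gt = load_ground_truth('data/ground_truth.csv')   # hypothetical helper and path
#     prec_recall(data, gt)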
def eval_run_func(params):
    from evaluation.evaluator import Evaluator

    # get input parameters
    model_dir = params['model_dir']
    basic_model = params['basic_model']
    evaluate_model = params['evaluate_model']
    input_shape = params['input_shape']
    rounds = params['rounds']
    valid_stocks = params['valid_stocks']

    _evaluator = Evaluator(model_dir=model_dir, input_shape=input_shape)
    BAR, EAR = _evaluator.evaluate(basic_model, evaluate_model, valid_stocks, rounds)
    return BAR, EAR
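# Minimal usage sketch for eval_run_func above, assuming it is dispatched with a plain
# dict of parameters (for example from a multiprocessing pool). Every value below is an
# illustrative placeholder, not a path, model name, or ticker taken from this repository.
example_params = {
    'model_dir': 'checkpoints/run_0',       # hypothetical directory
    'basic_model': 'baseline',              # hypothetical model name
    'evaluate_model': 'candidate',          # hypothetical model name
    'input_shape': (30, 5),                 # hypothetical (window, feature) shape
    'rounds': 3,
    'valid_stocks': ['AAPL', 'MSFT'],       # hypothetical tickers
}
# BAR, EAR = eval_run_func(example_params)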
def __init__(self, output_dir):
    if cfg.TRAIN.FLAG:
        self.model_dir = os.path.join(output_dir, 'Model')
        self.image_dir = os.path.join(output_dir, 'Image')
        self.log_dir = os.path.join(output_dir, 'Log')
        mkdir_p(self.model_dir)
        mkdir_p(self.image_dir)
        mkdir_p(self.log_dir)
        self.summary_writer = FileWriter(self.log_dir)

    self.max_epoch = cfg.TRAIN.MAX_EPOCH
    self.snapshot_interval = cfg.TRAIN.SNAPSHOT_INTERVAL

    s_gpus = cfg.GPU_ID.split(',')
    self.gpus = [int(ix) for ix in s_gpus]
    self.num_gpus = len(self.gpus)
    self.batch_size = cfg.TRAIN.BATCH_SIZE * self.num_gpus
    torch.cuda.set_device(self.gpus[0])
    cudnn.benchmark = True

    # load fasttext embeddings (e.g., birds.en.vec)
    path = os.path.join(cfg.DATA_DIR, cfg.DATASET_NAME + ".en.vec")
    txt_dico, _txt_emb = load_external_embeddings(path)
    txt_emb = nn.Embedding(len(txt_dico), 300, sparse=False)
    txt_emb.weight.data.copy_(_txt_emb)
    txt_emb.weight.requires_grad = False
    self.txt_dico = txt_dico
    self.txt_emb = txt_emb

    # load networks and evaluator
    self.networks = self.load_network()
    self.evaluator = Evaluator(self.networks, self.txt_emb)

    # visualizer to visdom server
    self.vis = Visualizer(cfg.VISDOM_HOST, cfg.VISDOM_PORT, output_dir)
    self.vis.make_img_window("real_im")
    self.vis.make_img_window("fake_im")
    self.vis.make_txt_window("real_captions")
    self.vis.make_txt_window("genr_captions")
    self.vis.make_plot_window("G_loss", num=7,
                              legend=["errG", "uncond", "cond", "latent", "cycltxt", "autoimg", "autotxt"])
    self.vis.make_plot_window("D_loss", num=4, legend=["errD", "uncond", "cond", "latent"])
    self.vis.make_plot_window("KL_loss", num=4, legend=["kl", "img", "txt", "fakeimg"])
    self.vis.make_plot_window("inception_score", num=2, legend=["real", "fake"])
    self.vis.make_plot_window("r_precision", num=1)
def evaluate_multiple_experiments(name_pattern, config_base_dir, user_dir_override):
    for config_path in glob(path.join(config_base_dir, name_pattern)):
        print('Evaluating experiment ', path.basename(config_path).replace('.json', ''))
        params = get_params_from_config(config_path, user_dir_override)
        model = Model(checkpoint_interval=10, model_params=params)
        evaluator = Evaluator(model)
        all_flows = evaluator.flows_over_epochs(every_nth=10)
        result_path = path.join(user_dir_override, 'training', 'results',
                                path.basename(config_path).replace('.json', '.pkl'))
        with open(result_path, 'wb') as f:
            pkl.dump(all_flows, f)
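# Hedged invocation sketch for evaluate_multiple_experiments above. The function expects
# JSON configs under config_base_dir and writes pickled flow results under
# <user_dir_override>/training/results. The pattern and directories below are placeholders.
# if __name__ == '__main__':
#     evaluate_multiple_experiments(
#         name_pattern='experiment_*.json',            # hypothetical pattern
#         config_base_dir='configs',                   # hypothetical directory
#         user_dir_override='/data/experiments/user')  # hypothetical directory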
def __init__(self, use_old_model, use_time, port, python_port, train, evaluate):
    self._use_old_model = use_old_model
    self._use_time = use_time
    self._port = port
    self._python_port = python_port
    self._train = train
    self._evaluate = evaluate
    if use_old_model:
        self._models_folder = 'old_model'
    else:
        self._models_folder = 'new_model'
    self._evaluator = Evaluator(self._port, self._python_port)
    print(self._models_folder)
def run_nrt_experiment(self):
    self.history_logs = edict()
    self.history_logs['Train'] = []
    self.history_logs['Val'] = []

    for dataName in sorted(os.listdir(Path(self.cfg.nrt_data_folder) / self.cfg.input_folder)):
        self.dataName = dataName
        print(f"\n==> {self.dataName}")

        self.dataloaders = self.train_val_loader(num=self.cfg.size_of_train)
        self.optimizer = torch.optim.Adam([dict(params=self.model.parameters(),
                                                lr=self.cfg.learning_rate,
                                                weight_decay=self.cfg.weight_decay)])

        # self.history_logs = {'Train': np.zeros((len(metrics)+1, self.cfg.max_epoch)),
        #                      'Val': np.zeros((len(metrics)+1, self.cfg.max_epoch))}

        # --------------------------------- Train -------------------------------------------
        for epoch in range(0, self.cfg.max_epoch):
            print(f"\n==> train epoch: {epoch}/{self.cfg.max_epoch}")
            valid_logs = self.train_one_epoch(epoch)

            # do something (save model, change lr, etc.)
            if self.cfg.max_score < valid_logs['iou_score']:
                # update the running best score so the model is only saved on improvement
                self.cfg.max_score = valid_logs['iou_score']
                torch.save(self.model, self.model_url)
                print('Model saved!')

            if epoch == 10:
                self.optimizer.param_groups[0]['lr'] = self.cfg.learning_rate * 0.1
                print(f"Decrease decoder learning rate to {self.optimizer.param_groups[0]['lr']}!")

        # save learning history
        self.plot_and_save_learnHistory()

        self.cfg.data_folder = self.cfg.nrt_data_folder
        self.cfg.modelPath = self.savePath
        self.evaluator = Evaluator(self.cfg)

        url = Path(self.cfg.nrt_data_folder) / self.cfg.input_folder / self.dataName
        print(url)
        predMap = self.evaluator.inference(url, self.savePath)
def test(cls, X_train, y_train, X_test, y_test, X_syn, y_syn, train_on, title, repeated, save_path=None):
    """
    Train and test classification model for inner-corpus evaluation

    @param cls: Initialized classifier given hyperparameters
    @param X_train: Real training data
    @param y_train: Labels for real training data
    @param X_test: Test data
    @param y_test: Labels for test data
    @param X_syn: Synthetic training data
    @param y_syn: Labels for synthetic training data
    @param train_on: Which training data will be used ('real', 'syn', or 'real+syn')
    @param title: Title for generated image of confusion matrix
    @param repeated: Number of repeated times for each classification
    @param save_path: File path to save the generated image
    @return: Mean and standard deviation of test recall
    """
    start = time.time()
    eva = Evaluator(X_train, y_train, X_test, y_test, X_syn, y_syn, cls=cls, repeated=repeated)
    if train_on == 'real':
        mean_recall, std_recall, mean_cm = eva.real()
    elif train_on == 'syn':
        mean_recall, std_recall, mean_cm = eva.syn()
    elif train_on == 'real+syn':
        mean_recall, std_recall, mean_cm = eva.real_plus_syn()
    end = time.time()

    print("time used: {} s".format(time_converter(start, end)))
    print("inner-corpus test - mean: {}, std: {}".format(mean_recall, std_recall))

    plot_confusion_matrix(mean_cm, title=title)
    if save_path:
        plt.savefig(save_path, bbox_inches='tight')
        print("Successfully generated {}".format(save_path))
    else:
        plt.show()
    plt.close()
    return mean_recall, std_recall
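# Illustrative call to the inner-corpus test() routine above, assuming numpy feature
# matrices and an sklearn-style classifier (the docstring says `cls` is an initialized
# classifier). The SVC settings, array shapes, and class count are placeholders, not
# values from this project.
from sklearn.svm import SVC
import numpy as np

example_cls = SVC(kernel='rbf', C=1.0)
X_train, y_train = np.random.rand(100, 40), np.random.randint(0, 4, 100)
X_test, y_test = np.random.rand(30, 40), np.random.randint(0, 4, 30)
X_syn, y_syn = np.random.rand(100, 40), np.random.randint(0, 4, 100)
# mean_recall, std_recall = test(example_cls, X_train, y_train, X_test, y_test,
#                                X_syn, y_syn, train_on='real+syn',
#                                title='Confusion matrix', repeated=5)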
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file_name', type=str,
                        help='File name in manual_src to run.', default=None)
    parser.add_argument('-g', '--gen_src_count', type=int,
                        help='Number of automatically generated source to run.', default=0)
    parser.add_argument('-n', '--num_seeds', type=int,
                        help='Number of seeds that will run per source file.', default=2)
    args = parser.parse_args()

    seeds = [10 * s for s in range(args.num_seeds)]
    evaluator = Evaluator(seeds, 'evaluation/manual_src', 'evaluation/__genned')

    if args.file_name is not None:
        evaluator.eval_with_manual_src_file(args.file_name)
    elif args.gen_src_count > 0:
        for _ in range(args.gen_src_count):
            evaluator.eval_with_gen_src()
    else:
        evaluator.eval_with_manual_src_all()

    dnn_approx_result, dnn_result, vanila_result = evaluator.get_all_results()
    print('All results.')
    print('DNN + Approx')
    print_result(dnn_approx_result)
    print('DNN')
    print_result(dnn_result)
    print('Vanila')
    print_result(vanila_result)

    dnn_approx_avg, dnn_avg, vanila_avg = evaluator.get_all_avg_results()
    print(print_bar_double)
    print('Overall avg results.')
    print('DNN + Approx')
    print(dnn_approx_avg)
    print('DNN')
    print(dnn_avg)
    print('Vanila')
    print(vanila_avg)
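# print_result is called in main() above but not shown in this snippet. A minimal sketch,
# assuming each result is a mapping from a source-file name to a dict of metric values;
# the real helper in this repository may format its output differently.
def print_result(result):
    for src_name, metrics in result.items():
        metric_str = ', '.join('%s=%.4f' % (name, value) for name, value in metrics.items())
        print('%s: %s' % (src_name, metric_str))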
def evaluate_labeling(dir_path, labeling: Dict[str, Dict[str, int]], key_path: str = None,
                      maxLabels=2) -> Dict[str, Dict[str, float]]:  # RL: maxLabels added
    """
    labeling example: {'become.v.3': {'become.sense.1': 3, 'become.sense.5': 17} ... }
    means instance 'become.v.3' is 17/20 in sense 'become.sense.5' and 3/20 in sense 'become.sense.1'
    :param key_path: write produced key to this file
    :param dir_path: SemEval dir
    :param labeling: instance id labeling
    :param maxLabels: maximum number of sense labels kept per instance
    :return: FNMI, FBC as calculated by SemEval provided code
    """
    logging.info('starting evaluation key_path: %s' % key_path)

    def get_scores(gold_key, eval_key):
        ret = {}
        # The original SemEval-2013 Task 13 scoring (jaccard-index, pos-tau, WNDC, and the
        # fuzzy-nmi/fuzzy-bcubed jars under SemEval-2013-Task-13-test-data/scoring/) is
        # replaced below by the Senseval-2 Spanish lexical-sample scorer. (+RL)
        script = [
            "python2.7", "./spanish-lex-sample/score/score", eval_key, gold_key,
            './spanish-lex-sample/test/emptysensemap'
        ]
        res = subprocess.Popen(" ".join(script), shell=True, env={"PYTHONPATH": "."},
                               stdout=subprocess.PIPE).stdout.readlines()
        ret['all'] = {}
        splitted = res[2].strip().split()
        ret['all']['precision'] = float(splitted[1])
        ret['all']['correct'] = float(str(splitted[2].decode()).replace('(', ''))
        ret['all']['attempted'] = float(splitted[5])
        splitted = res[3].strip().split()
        ret['all']['recall'] = float(splitted[1])
        ret['all']['total'] = float(splitted[5])
        splitted = res[4].strip().split()
        ret['all']['attemptedPct'] = float(splitted[1])
        return ret

    def getGoldKeySENSEVAL2(goldPath):  # +RL
        with open(os.path.join(dir_path, goldPath), 'r') as fgold:
            goldKey = dict()
            for line in fgold.readlines():
                splitted = line.strip().split()
                instance = dict()
                graded = dict()
                rest = splitted[2:]
                for index in rest:
                    graded[splitted[0] + '.' + index] = 1.0 / len(rest)
                instance[splitted[1]] = graded
                if not splitted[0] in goldKey:
                    goldKey[splitted[0]] = instance
                else:
                    goldKey[splitted[0]].update(instance)
        return goldKey

    def dictToJ(dictionary):  # +RL
        HashMap = autoclass('java.util.HashMap')
        String = autoclass('java.lang.String')
        Double = autoclass('java.lang.Double')
        map = HashMap()
        for token, instances in dictionary.items():
            jToken = String(token)
            instanceMap = HashMap()
            for instance, labels in instances.items():
                jInstance = String(instance)
                labelMap = HashMap()
                sum_applicabilities = sum([a for _, a in labels.items()])
                for label, applicability in labels.items():
                    if sum_applicabilities > 1:
                        applicability /= sum_applicabilities
                    jLabel = String(label)
                    jApplicability = Double(applicability)
                    labelMap.put(jLabel, jApplicability)
                instanceMap.put(jInstance, labelMap)
            map.put(jToken, instanceMap)
        return map

    def getTrainingInstances(trainingSets):  # +RL
        HashSet = autoclass('java.util.HashSet')
        String = autoclass('java.lang.String')
        listJTrainingSets = []
        for trainingSet in trainingSets:
            jTrainingSet = HashSet()
            for instance in trainingSet:
                jInstance = String(instance)
                jTrainingSet.add(jInstance)
            listJTrainingSets.append(jTrainingSet)
        return listJTrainingSets

    def printTrainingSets(listJTrainingSets):  # +RL
        trainingSet = 1
        for trainingInstances in listJTrainingSets:
            print('---------------------------------------------Training set %d \n' % trainingSet)
            entrySetIterator = trainingInstances.iterator()
            string = ''
            while entrySetIterator.hasNext():
                e = entrySetIterator.next()
                string += e + ', '
            print(string)
            trainingSet += 1

    def mapSenses(trainingInstances, goldMap, labelingMap, maxLabels):  # +RL
        GradedReweightedKeyMapper = autoclass('edu.ucla.clustercomparison.GradedReweightedKeyMapper')
        mapper = GradedReweightedKeyMapper()
        allRemappedTestKey = {}
        remappedTestKey = mapper.convert(goldMap, labelingMap, trainingInstances)
        convertedSet = remappedTestKey.entrySet()
        convertedIterator = convertedSet.iterator()
        while convertedIterator.hasNext():
            e = convertedIterator.next()
            doc = e.getKey()
            instRatings = e.getValue()
            instanceIterator = instRatings.entrySet().iterator()
            while instanceIterator.hasNext():
                i = instanceIterator.next()
                instance = i.getKey()
                labelIterator = i.getValue().entrySet().iterator()
                labelList = []
                while labelIterator.hasNext():
                    l = labelIterator.next()
                    label = l.getKey()
                    applicability = l.getValue()
                    labelList.append((label, applicability))
                labelList.sort(key=lambda x: x[1], reverse=True)
                allRemappedTestKey[instance] = labelList[0:maxLabels]
        return allRemappedTestKey

    with tempfile.NamedTemporaryFile('wt') as fout:
        lines = []
        # +RL
        goldPath = 'key'
        goldKey = getGoldKeySENSEVAL2(goldPath)

        allInstances = []
        for _, v in goldKey.items():
            for k1, _ in v.items():
                allInstances.append(k1)

        indices = list(range(0, len(allInstances)))
        random.seed(18)
        random.shuffle(indices)

        # build five training sets, each leaving out a different fifth of the instances
        trainingSets = [set() for _ in range(0, 5)]
        for i in range(0, len(allInstances)):
            instance = allInstances[i]
            toExclude = i % len(trainingSets)
            for j in range(0, len(trainingSets)):
                if j != toExclude:
                    trainingSets[j].add(instance)

        listJTrainingInstances = getTrainingInstances(trainingSets)
        # printTrainingSets(listJTrainingInstances)
        goldMap = dictToJ(goldKey)

        # group the labeling by lemma
        lemmaLabeling = {}
        for k, v in labeling.items():
            lemma = k.split('.')[0]
            if not lemma in lemmaLabeling:
                lemmaLabeling[lemma] = {k: v}
            else:
                lemmaLabeling[lemma][k] = v
        labelingMap = dictToJ(lemmaLabeling)

        lines = []
        global_test_key = {}
        for jTrainingInstances in listJTrainingInstances:
            testKey = mapSenses(jTrainingInstances, goldMap, labelingMap, maxLabels)
            global_test_key.update(testKey)
            for instance, label in testKey.items():
                clusters_str = ' '.join(x[0].split('.')[1] for x in label[0:maxLabels])
                lines.append('%s %s %s' % (instance.split('.')[0], instance, clusters_str))

        evaluator = Evaluator(goldKey, global_test_key)
        evals = evaluator.semeval_2013_task_13_metrics()

        evalKey = key_path
        logging.info('writing key to file %s' % evalKey)
        with open(evalKey, 'w', encoding="utf-8") as fout2:
            lines = sorted(lines)
            fout2.write('\n'.join(lines))

        scores = get_scores(os.path.join(dir_path, goldPath), evalKey)  # RL: goldPath added
        scores['all'].update(evals)
        print(scores)
        # The earlier implementation wrote the labeling directly to the temporary key file
        # (fout) and scored it against 'keys/gold/all.key'; that path is superseded by the
        # gold/test key remapping above. (RL)

    return scores
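# Hedged usage sketch for evaluate_labeling above. The labeling dict follows the format
# documented in its docstring; dir_path must point at a SemEval/Senseval data directory
# containing the 'key' gold file, and key_path is where the produced key is written.
# The paths below are placeholders, not files from this repository.
example_labeling = {
    'become.v.3': {'become.sense.1': 3, 'become.sense.5': 17},
}
# scores = evaluate_labeling('SemEval-2013-Task-13-test-data',   # hypothetical dir_path
#                            example_labeling,
#                            key_path='output/produced.key',     # hypothetical output path
#                            maxLabels=2)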
def __init__(self, training_properties, datasetloader, device):
    super(SingleModelNerTrainer, self).__init__(training_properties, datasetloader, device)
    self.scorer = NerScorer(datasetloader.ner_vocab)
    self.dev_evaluator, self.test_evaluator = Evaluator().evaluator_factory("single_model_ner_evaluator", self.device)
import json

from evaluation.evaluator import Evaluator

if __name__ == '__main__':
    with open('dataset/dev-predictions-final-it4.json', 'r') as f:
        bad_format_predictions = json.loads(f.read())

    predictions = {}
    for question_id, predictions_list in bad_format_predictions.items():
        predictions[question_id] = predictions_list[0]

    evaluator = Evaluator('dataset/dev.json')
    print(evaluator.ExactMatch(predictions))
    print(evaluator.F1(predictions))
datasets = {}
dataset_splits = DatasetSplitter.generate_splits(config)
transformations = TransformsGenerator.get_final_transforms(config)

for key in dataset_splits:
    path, batching_config, split = dataset_splits[key]
    transform = transformations[key]
    datasets[key] = VideoDataset(path, batching_config, transform, split)

trainer = Trainer(config, model, datasets["train"], logger)
evaluator = Evaluator(config, datasets["validation"], logger, action_sampler=None, logger_prefix="validation")

# Resume training
try:
    trainer.load_checkpoint(model)
except Exception as e:
    logger.print(e)
    logger.print("Cannot play without loading checkpoint")
    exit(1)

model.eval()

dataloader = evaluator.dataloader  # Uses validation dataloader

# dataset_index = int(input(f"- Insert start sample index in [0, {len(dataloader)}): "))
dataset_index = 0
# Used to compute the number of weights to use.
feature_counter = FeatureCounter()

training_titles = set()
training_examples = ReadExamples(FLAGS.input_train_features, feature_counter,
                                 FLAGS.max_train_articles, training_titles)
random.shuffle(training_examples)

dev_titles = set()
dev_examples = ReadExamples(FLAGS.input_dev_features, feature_counter,
                            FLAGS.max_dev_articles, dev_titles)
dev_question_annotations = ReadQuestionAnnotations(FLAGS.input_dev_articles)
dev_evaluator = Evaluator(path=FLAGS.input_dev, restrict_to_titles=dev_titles)

# Use a small set of articles for computing the metrics on the training set.
training_metric_titles = set(
    random.sample(training_titles, len(dev_titles))
) if len(training_titles) > len(dev_titles) else training_titles
training_metric_examples = [
    example for example in training_examples
    if example.article_title in training_metric_titles
]
training_question_annotations = ReadQuestionAnnotations(FLAGS.input_train_articles)
training_evaluator = Evaluator(path=FLAGS.input_train, restrict_to_titles=training_metric_titles)

logger.info('Using %d features.', feature_counter.NumFeatures())
# raw_input("done") editDistGroup = dict() for dist, qaIdByDist in groupby(sorted(editDist.iteritems(), key=lambda x: x[1]), key=lambda x: x[1]): editDistGroup[dist] = list(qaIdByDist) predFile = "./output/dev-predictions-it3.json" jsonDataFile = "./dataset/json/dev.json" # predFile = "./dev-predictions-it3.json" # jsonDataFile = "./dev.json" with open(predFile, "r") as fp: predDict = json.load(fp) evaluator = Evaluator(jsonDataFile) exactMatchRateList = list() F1List = list() for dist in sorted(editDistGroup.keys()): predSubDict = dict() for qaId, _ in editDistGroup[dist]: predSubDict[qaId] = predDict[qaId] exactMatchRate = evaluator.ExactMatch(predSubDict) F1 = evaluator.F1(predSubDict) exactMatchRateList.append(exactMatchRate) F1List.append(F1) print "edit dist ", dist print "number of sample ", len(editDistGroup[dist]) print "exact match ", exactMatchRate print "F1 ", F1 print
# raw_input("done") editDistGroup = dict() for dist, qaIdByDist in groupby(sorted(editDist.iteritems(), key=lambda x: x[1]), key=lambda x: x[1]): editDistGroup[dist] = list(qaIdByDist) predFile = "./output/dev-predictions-it3.json" jsonDataFile = "./dataset/json/dev.json" # predFile = "./dev-predictions-it3.json" # jsonDataFile = "./dev.json" with open(predFile, "r") as fp: predDict = json.load(fp) evaluator = Evaluator(jsonDataFile) exactMatchRateList = list() F1List = list() HumanF1List = list() human_predictions = {} with open(jsonDataFile, "r") as fp: human_articles = json.load(fp)['data'] for article in human_articles: for paragraph in article['paragraphs']: for qa in paragraph['qas']: if len(qa['answers']) > 1: human_predictions[qa['id']] = qa['answers'].pop(1)['text'] human_evaluator = Evaluator(articles=human_articles) for dist in sorted(editDistGroup.keys()):
dictionary = Dictionary(
    FLAGS.input_featuredict,
    FLAGS.ablate_features.split(',') if FLAGS.ablate_features else [])
logger.info('Using %d features.', dictionary.NumFeatures())

training_titles = set()
training_examples = ReadExamples(FLAGS.input_train_features, dictionary,
                                 FLAGS.max_train_articles, training_titles)
random.shuffle(training_examples)

dev_titles = set()
dev_examples = ReadExamples(FLAGS.input_dev_features, dictionary,
                            FLAGS.max_dev_articles, dev_titles)
dev_question_annotations = ReadQuestionAnnotations(FLAGS.input_dev_articles)
dev_evaluator = Evaluator(FLAGS.input_dev, dev_titles)

# Use a small set of articles for computing the metrics on the training set.
training_metric_titles = set(
    random.sample(training_titles, len(dev_titles))
) if len(training_titles) > len(dev_titles) else training_titles
training_metric_examples = [
    example for example in training_examples
    if example.article_title in training_metric_titles
]
training_question_annotations = ReadQuestionAnnotations(FLAGS.input_train_articles)
training_evaluator = Evaluator(FLAGS.input_train, training_metric_titles)

# Filter the training questions for the learning curve.
num_training_questions = 0
                    '')
flags.DEFINE_string('input-features', 'dataset/test-featuresbucketized.proto', '')
flags.DEFINE_integer('num-features', 186194776, '')
flags.DEFINE_string('input-model', 'dataset/model', '')
flags.DEFINE_integer('min-articles', None, '')

if __name__ == '__main__':
    feature_counter = FeatureCounter(num_features=FLAGS.num_features)

    titles = set()
    examples = ReadExamples(FLAGS.input_features, feature_counter, FLAGS.min_articles, titles)
    random.shuffle(examples)
    question_annotations = ReadQuestionAnnotations(FLAGS.input_articles)
    evaluator = Evaluator(path=FLAGS.input, restrict_to_titles=titles)

    inputs = GetInputPlaceholders()
    variables = GetVariables(feature_counter)
    logits = GetLogits(inputs, variables)
    _, predict_op = tf.nn.top_k(logits, 1)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, FLAGS.input_model)
        ComputeAndDisplayMetrics(sess, inputs, None, predict_op, examples,
                                 question_annotations, evaluator, '')
        # if FLAGS.print_errors:
        #     for example in examples:
default=None, help="Number of sequences to evaluate (default: all).") parser.add_argument("-mp", action="store_true", help="Use multiprocessing.") parser.add_argument( "--out", required=False, type=str, help="Evaluation output file (file endings will be attached).") args = parser.parse_args() words = set(read_lines(args.words)) print(len(words)) evaluator = Evaluator(words) start = time.monotonic() correct_sequences = read_file(args.correct)[:args.n] corrupt_sequences = read_file(args.misspelled)[:args.n] predicted_sequences = read_file(args.predictions)[:args.n] n_cpus = mp.cpu_count() if args.mp else 1 with mp.Pool(n_cpus) as pool: results = pool.starmap( evaluator.evaluate_sample, list(zip(correct_sequences, corrupt_sequences, predicted_sequences)))
                    '')
flags.DEFINE_string('input-featuredict', 'dataset/featuredictbucketized.proto', '')
flags.DEFINE_string('input-model', 'dataset/model13-it3', '')
flags.DEFINE_integer('min-articles', None, '')
flags.DEFINE_boolean('print-errors', False, '')

if __name__ == '__main__':
    dictionary = Dictionary(FLAGS.input_featuredict, [])

    titles = set()
    examples = ReadExamples(FLAGS.input_features, dictionary, FLAGS.min_articles, titles)
    random.shuffle(examples)
    question_annotations = ReadQuestionAnnotations(FLAGS.input_articles)
    evaluator = Evaluator(FLAGS.input, titles)

    inputs = GetInputPlaceholders()
    variables = GetVariables(dictionary)
    logits = GetLogits(inputs, variables)
    _, predict_op_top_1 = tf.nn.top_k(logits, 1)
    _, predict_op_top_3 = tf.nn.top_k(logits, 3)

    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, FLAGS.input_model)
        ComputeAndDisplayMetrics(sess, inputs, None, predict_op_top_3, examples,
                                 question_annotations, evaluator, '')
        if FLAGS.print_errors:
predictions = {}
num_same_counts = Counter()
for article in articles:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            if len(qa['answers']) >= 3:
                num_same_counts[3 - len(
                    set([
                        Evaluator.CleanAnswer(answer['text'])
                        for answer in qa['answers'][0:3]
                    ])) + 1] += 1
            if len(qa['answers']) > 1:
                predictions[qa['id']] = qa['answers'].pop(1)['text']

evaluator = Evaluator(articles=articles)
print('Exact match:', round(evaluator.ExactMatch(predictions), 1))
print('F1:', round(evaluator.F1(predictions), 1))

total_num_same_count = sum(num_same_counts.values())
for num_same, count in sorted(num_same_counts.items()):
    print(num_same, 'same:', round(100.0 * count / total_num_same_count, 1))

with open('dataset/dev-answertypetags.json') as fileobj:
    tags = json.loads(fileobj.read())
print(len(tags), 'tagged questions')

for tag, _ in Counter(tags.values()).most_common():
    num_correct = 0
    total_f1 = 0
    num_total = 0
    for question_id, _ in filter(lambda x: x[1] == tag, tags.items()):
    if extractions is not None:
        for i, extraction in enumerate(extractions):
            points = str(extraction.points)
            correct_points = str(extraction.correct_points)
            file.write('Segment ' + str(i + 1) + '\n')
            file.write('Points: ' + points + '\n')
            file.write('Given points: ' + correct_points + '\n')
            file.write('------------\n')
    file.close()

stop = False
roads = get_roads_from_xml_file(path_roads)
evaluator = Evaluator()
total_statistics = {}
total_len_extracted_previous = 0
total_len_reference_previous = 0
evaluation_count = 0

for image_name in os.listdir(path_images):
    if not stop and image_name[-3:] == 'png':
        image_name_list = image_name.split('_')
        if len(image_name_list) < 3:
            continue
        road_name = image_name_list[0]
        segment_number = int(image_name_list[1])
        zoom_level = int(image_name_list[-2][1:])