def main():
    """Entry point: train a perceptron tagger or tag input with a saved model.

    Reads sentences from ``args.infile``; in training mode fits a model and
    pickles it to ``args.par``, otherwise loads ``args.par`` and prints
    tab-separated predictions (optionally n-best), plus an accuracy summary
    when gold tags are available.
    """
    Logger.log("Reading input data...")
    (sentences, gold_tags, token_count, tag_count) = \
        extract_sentences(args.infile, encoding=args.enc)
    Logger.log("Parsed {0} token(s) with {1} tags in {2} sentence(s)."
               .format(token_count, tag_count, len(sentences)))
    check_counts_for_mode(token_count, tag_count, args.train)

    if args.preprocessing:
        sentences = preprocess_sentences(sentences)

    if args.train:
        # --- training mode: fit a model and pickle it ---
        Logger.log("Training...")
        model_cls = get_perceptron_model(args.implementation, args.structured)
        model = model_cls(
            averaged=args.averaging,
            iterations=args.iterations,
            learning_rate=1,
            sequenced=True,
            feature_extractor=get_feature_extractor(args.feature,
                                                    args.context_size)
        )
        bar_widgets = [pb.Percentage(), ' ',
                       pb.Bar(marker='#', left='[', right=']'), ' ',
                       pb.ETA(), ' ', pb.AbsoluteETA()]
        # One bar tick per token per iteration; stderr is redirected so log
        # output does not break the bar rendering.
        with pb.ProgressBar(max_value=(token_count * args.iterations),
                            redirect_stderr=True,
                            widgets=bar_widgets) as bar:
            model.log_to = Logger
            model.progress_func = bar.update
            model.train(sentences, gold_tags)
        Logger.log("Saving...")
        with gzip.open(args.par, 'wb') as f:
            pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        # --- tagging mode: load a pickled model and annotate the input ---
        Logger.log("Loading model...")
        with gzip.open(args.par, 'rb') as f:
            model = pickle.load(f)
        model.log_to = Logger
        Logger.log("Tagging...")
        num_correct = 0
        use_nbest = (args.nbest > 1)
        if use_nbest:
            predictions = model.predict_all_nbest(sentences, n=args.nbest)
        else:
            predictions = model.predict_all(sentences)
        for sent_words, sent_preds, sent_gold in it.izip(sentences,
                                                         predictions,
                                                         gold_tags):
            for word, guess, truth in it.izip(sent_words, sent_preds,
                                              sent_gold):
                if use_nbest:
                    # In n-best mode a prediction is a sequence of tags;
                    # print them all, then score against the top candidate.
                    print(u"{0}\t{1}".format(
                        word, u"\t".join(guess)).encode("utf-8"))
                    guess = guess[0]
                else:
                    print(u"{0}\t{1}".format(word, guess).encode("utf-8"))
                if truth is not None and truth == guess:
                    num_correct += 1
            print('')  # line break between sentences
        if tag_count > 0:
            # print evaluation
            Logger.log("Accuracy: {0:7}/{1:7} correct ({2:.2f}%)"
                       .format(num_correct, tag_count,
                               (float(num_correct) / tag_count) * 100))
    Logger.log("Done.")
def main():
    """Entry point: benchmark tagger models with k-fold cross-validation.

    Reads and preprocesses the input corpus, builds ``args.folds`` CV splits,
    then for every model trains/tags each fold while timing both phases, and
    logs per-fold results plus mean/std accuracy and timings.
    """
    Logger.log("Reading input data...")
    (sentences, gold_tags, token_count, tag_count) = extract_sentences(
        args.infile, encoding=args.enc)
    Logger.log("Parsed {0} token(s) with {1} tags in {2} sentence(s)."
               .format(token_count, tag_count, len(sentences)))
    check_counts_for_mode(token_count, tag_count, True)

    sentences = preprocess_sentences(sentences)
    models = make_models(args.models)
    # Materialize the splits once so every model sees identical folds.
    all_splits = list(make_cv_splits(sentences, gold_tags, args.folds))

    for model in models:
        Logger.log("Benchmarking model " + model.__name__)
        accuracies = []
        times_train = []
        times_tag = []
        for fold_idx, (training_data, eval_data) in enumerate(all_splits):
            Logger.log("Processing fold {0}...".format(fold_idx + 1))
            tagger = model(
                averaged=args.averaging,
                iterations=args.iterations,
                learning_rate=1,
                sequenced=True,
                feature_extractor=get_feature_extractor(args.feature,
                                                        args.context_size),
                log_to=None,
            )
            # training
            t_start = timeit.default_timer()
            tagger.train(*training_data)
            t_trained = timeit.default_timer()
            # tagging
            (eval_sentences, eval_tags) = eval_data
            predictions = tagger.predict_all(eval_sentences)
            t_tagged = timeit.default_timer()
            # evaluating
            num_correct = 0
            num_eval = 0
            for pred_seq, gold_seq in it.izip(predictions, eval_tags):
                num_correct += sum(1 for g, t in it.izip(pred_seq, gold_seq)
                                   if g == t)
                num_eval += len(pred_seq)
            accuracy = float(num_correct) / num_eval
            delta_train = t_trained - t_start
            delta_tag = t_tagged - t_trained
            Logger.log(
                "  fold {0}: accuracy {1:.4f}, training time {2:.4f}, tagging time {3:.4f}".format(
                    fold_idx + 1, accuracy, delta_train, delta_tag
                )
            )
            accuracies.append(accuracy)
            times_train.append(delta_train)
            times_tag.append(delta_tag)

        # Aggregate fold statistics for this model.
        accuracies = np.array(accuracies)
        times_train = np.array(times_train)
        times_tag = np.array(times_tag)
        Logger.log("Evaluation results of model " + model.__name__,
                   type="info!")
        Logger.log(
            "  avg accuracy: {0:2.4f} std: {1:.4f}".format(
                np.mean(accuracies), np.std(accuracies)),
            type="info!"
        )
        Logger.log(
            "  avg training time: {0:2.2f} std: {1:2.2f}".format(
                np.mean(times_train), np.std(times_train)),
            type="info!",
        )
        Logger.log(
            "  avg tagging time: {0:2.2f} std: {1:2.2f}".format(
                np.mean(times_tag), np.std(times_tag)),
            type="info!",
        )
    Logger.log("Done.")