data = read_twitter(trainfile=args.train, evalfiles=args.eval)

import tagger
if args.tagger == "logreg":
    tagger = tagger.LogisticRegressionTagger()
elif args.tagger == "crf":
    tagger = tagger.CRFPerceptron()
else:
    sys.stderr.write("Did not properly select tagger!\n")
    sys.exit(1)

# Train the tagger
tagger.fit_data(data.train_sents, data.train_labels)

# Evaluation (also writes out predictions)
trainoutfile = "{}/{}.pred".format(args.outdir, os.path.basename(args.train))
print("### Train evaluation; writing to {}".format(trainoutfile))
data.train_preds = tagger.evaluate_data(data.train_sents, data.train_labels)
write_preds(trainoutfile, data.train_sents, data.train_labels, data.train_preds)

for evalstr, evalset in zip(args.eval, data.eval):
    evaloutfile = "{}/{}.pred".format(args.outdir, os.path.basename(evalstr))
    print("### evaluation of {}; writing to {}".format(evalstr, evaloutfile))
    preds = tagger.evaluate_data(evalset["sents"], evalset["labels"])
    write_preds(evaloutfile, evalset["sents"], evalset["labels"], preds)
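# The driver above relies on a write_preds helper defined elsewhere in the repo.
# Below is a minimal sketch of what such a helper could look like, assuming a
# CoNLL-style "token<TAB>gold<TAB>predicted" line per token with a blank line
# between sentences; the repo's actual column order and separator may differ,
# and the name write_preds_sketch is used here to mark it as hypothetical.
def write_preds_sketch(outfile, sents, labels, preds):
    """Write tokens with gold and predicted tags, one token per line."""
    with open(outfile, "w") as f:
        for sent, gold_seq, pred_seq in zip(sents, labels, preds):
            for token, gold, pred in zip(sent, gold_seq, pred_seq):
                f.write("{}\t{}\t{}\n".format(token, gold, pred))
            f.write("\n")  # blank line marks the sentence boundary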
use_test = args.test
data = read_twitter(test=use_test)

if model == 'crf':
    tagger = tagger.CRFPerceptron()
else:
    tagger = tagger.LogisticRegressionTagger()

# Train the tagger
tagger.fit_data(data.train_sents, data.train_labels)

# Evaluation (also writes out predictions)
print("### Train evaluation")
data.train_preds = tagger.evaluate_data(data.train_sents, data.train_labels)
write_preds("%s/twitter_train.%s.pred" % (base_path_predictions, model),
            data.train_sents, data.train_labels, data.train_preds)

print("### Dev evaluation")
data.dev_preds = tagger.evaluate_data(data.dev_sents, data.dev_labels)
write_preds("%s/twitter_dev.%s.pred" % (base_path_predictions, model),
            data.dev_sents, data.dev_labels, data.dev_preds)

# Only useful once test data is available.
if use_test:
    print("### Generating Test predictions")
    data.test_preds = tagger.evaluate_data(data.test_sents, data.test_labels, quite=True)
    write_preds("%s/twitter_test.%s.pred" % (base_path_predictions, model),
                data.test_sents, data.test_labels, data.test_preds)
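# read_twitter is defined elsewhere in the repo; the drivers only assume it
# returns an object exposing train/dev (and optionally test) sentence and label
# lists, and that prediction attributes (train_preds, dev_preds, ...) can be
# assigned to it later. A minimal sketch under those assumptions follows, using
# types.SimpleNamespace and a two-column "token<TAB>tag" file format with blank
# lines between sentences; the real reader, file names, and separator may differ.
from types import SimpleNamespace

def read_tagged_file_sketch(path):
    """Read 'token<TAB>tag' lines, blank-line separated sentences."""
    sents, labels, toks, tags = [], [], [], []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                if toks:
                    sents.append(toks)
                    labels.append(tags)
                    toks, tags = [], []
                continue
            token, tag = line.split("\t")
            toks.append(token)
            tags.append(tag)
    if toks:
        sents.append(toks)
        labels.append(tags)
    return sents, labels

def read_twitter_sketch(dname="pos"):
    train_sents, train_labels = read_tagged_file_sketch("data/twitter_train.%s" % dname)
    dev_sents, dev_labels = read_tagged_file_sketch("data/twitter_dev.%s" % dname)
    return SimpleNamespace(train_sents=train_sents, train_labels=train_labels,
                           dev_sents=dev_sents, dev_labels=dev_labels)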
# Do not run: the following call was used once to generate the train/dev splits.
# file_splitter("data/twitter_train_all.pos", "data/twitter_train.pos", "data/twitter_dev.pos")

dname = "pos"  # or "ner"
data = read_twitter(dname)
# data = synthetic_data()

import tagger
tagger = tagger.LogisticRegressionTagger()
# tagger = tagger.CRFPerceptron()

# Train the tagger
tagger.fit_data(data.train_sents, data.train_labels)

# Evaluation (also writes out predictions)
print("### Train evaluation")
data.train_preds = tagger.evaluate_data(data.train_sents, data.train_labels)
write_preds("data/twitter_train.%s.pred" % dname,
            data.train_sents, data.train_labels, data.train_preds)

print("### Dev evaluation")
data.dev_preds = tagger.evaluate_data(data.dev_sents, data.dev_labels)
write_preds("data/twitter_dev.%s.pred" % dname,
            data.dev_sents, data.dev_labels, data.dev_preds)

# The following is commented out; it is only useful once test data is available.
# print("### Test evaluation")
# data.test_preds = tagger.evaluate_data(data.test_sents, data.test_labels)
# write_preds("data/twitter_test.%s.pred" % dname,
#             data.test_sents, data.test_labels, data.test_preds)
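# file_splitter is mentioned above as the one-off utility that produced the
# train/dev split from twitter_train_all.pos. A minimal sketch of such a
# splitter, assuming sentences are separated by blank lines and a fixed final
# fraction of sentences goes to the dev file; the fraction and any shuffling
# used originally are assumptions, and the _sketch suffix marks it as such.
def file_splitter_sketch(allfile, trainfile, devfile, dev_fraction=0.1):
    """Split a blank-line-delimited corpus into train and dev files."""
    with open(allfile) as f:
        sentences = [block for block in f.read().split("\n\n") if block.strip()]
    n_dev = max(1, int(len(sentences) * dev_fraction))
    with open(trainfile, "w") as f:
        f.write("\n\n".join(sentences[:-n_dev]) + "\n")
    with open(devfile, "w") as f:
        f.write("\n\n".join(sentences[-n_dev:]) + "\n")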
                    default=4, help="batch size")
try:
    args = parser.parse_args()
except IOError as msg:
    parser.error(str(msg))

# print(os.getcwd())
data = read_twitter(trainfile=args.train, evalfiles=args.eval)

import tagger
if args.tagger == "logreg":
    tagger = tagger.LogisticRegressionTagger()
elif args.tagger == "crf":
    tagger = tagger.CRFPerceptron(args.epochs, args.batch_size)
else:
    sys.stderr.write("Did not properly select tagger!\n")
    sys.exit(1)

# Train the tagger
tagger.fit_data(data.train_sents, data.train_labels)

# Evaluation (also writes out predictions)
for evalstr, evalset in zip(args.eval, data.eval):
    evaloutfile = "{}/{}.pred".format(args.outdir, os.path.basename(evalstr))
    print("### evaluation of {}; writing to {}".format(evalstr, evaloutfile))
    preds = tagger.evaluate_data(evalset["sents"], evalset["labels"])
    write_preds(evaloutfile, evalset["sents"], evalset["labels"], preds)
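# Example invocation of this argparse-driven variant, assuming the flag names
# mirror the attribute names used above (--train, --eval, --tagger, --outdir,
# --epochs, --batch_size) and that the driver lives in main.py; the exact
# script name, flag spellings, defaults, and data file paths are assumptions.
#
#   python main.py --train data/twitter_train.pos \
#                  --eval data/twitter_dev.pos \
#                  --tagger crf --epochs 10 --batch_size 4 \
#                  --outdir predictions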