def re_train(model, model_loc, train, dev, corpus_loc, num, num_runs, classes):
    """Run the back-feed (self-training) mechanism.

    :param model: classifier; treated as untrained if it has no
        ``grid_scores_`` attribute (i.e. not a fitted grid search)
    :param model_loc: if *model* is untrained, it is fitted and pickled
        to this location
    :param train: file with annotated training tweets
    :param dev: file with annotated development tweets
    :param corpus_loc: directory of the Sentiment140 corpus
    :param num: amount of added corpus data per iteration
    :param num_runs: number of iterations
    :param classes: bit flags corresponding to class labels --
        definition is in preprocessing variable 'ltd'
    """
    # load training data, needed either way
    train_labels, train_tweets, train_pos = preprocessing.parse(train, classes)
    # each training instance is "<tweet>\t<pos-tags>"
    x_train = [tweet + "\t" + pos for tweet, pos in zip(train_tweets, train_pos)]
    y_train = train_labels
    # x_train, y_train = uniformify(x_train, y_train)

    # if model is raw, train it and persist it to disk
    if not hasattr(model, "grid_scores_"):
        logging.info("No trained model given, building training features for "
                     "binary class codes: %s" % bin(classes))
        model.fit(x_train, y_train)
        logging.info("writing new model to disk at %s.." % model_loc)
        # BUG FIX: pickle streams are binary -- the file must be opened
        # "wb", not "w" (text mode raises TypeError on Python 3 and
        # corrupts the stream on Windows under Python 2).
        with open(model_loc, "wb") as sink:
            cPickle.dump(model, sink)
        logging.info("done.")

    # get development (test) data
    dev_labels, dev_tweets, dev_pos = preprocessing.parse(dev, classes)
    x_dev = [tweet + "\t" + pos for tweet, pos in zip(dev_tweets, dev_pos)]
    y_dev = dev_labels
    # x_dev, y_dev = uniformify(x_dev, y_dev)

    # initial eval, before any corpus data is fed back
    logging.info("Initial evaluation..")
    print_scores(model, x_dev, y_dev, classes)

    # NOTE(review): a debugging block printing the gold/predicted label
    # distribution used to live here; it confirmed the pos/neg ratio is
    # balanced (len(t_gold_1) == len(t_gold_2)), so it was removed.

    # feedback loop: each round pulls the best *num* corpus instances
    # into the training set and re-evaluates on the dev set
    logging.info("Initializing backfeed instance..")
    feed = Feeder(corpus_loc)
    logging.info("done. Now starting backfeed loop")
    for count in range(1, num_runs + 1):
        feed.add_best_n(model, num, x_train, y_train)
        logging.info("Retrain run %i" % count)
        print_scores(model, x_dev, y_dev, classes)
def run(model, x_train, y_train, x_test, y_test, mode, retrain=30, amount=300,
        token=''):
    """Fit *model*, then run the feedback/retrain loop with one optional
    external mutator source.

    :param model: grid-search wrapper around a pipeline containing an
        'svm' step (``model.best_estimator_.named_steps['svm']``)
    :param x_train: training instances
    :param y_train: training labels
    :param x_test: evaluation instances
    :param y_test: evaluation labels
    :param mode: passed through to ``Feeder.add_best_n``
    :param retrain: number of feedback iterations
    :param amount: number of instances added per iteration
    :param token: which external source to enable: 'km', 'af' or 'cl';
        any other value enables none
    """
    # initial step: fit on the seed data and report a baseline score
    model.fit(x_train, y_train)
    logging.info('initial evaluation')
    get_score(model, x_test, y_test)

    # external sources set-up -- load the pre-trained mutator models.
    # SECURITY NOTE: pickle.load executes arbitrary code from the file;
    # these .model files must come from a trusted location.
    # BUG FIX: use context managers so the file handles are closed
    # instead of leaking (the originals were opened inline and never
    # closed).
    with open('cl.model', 'rb') as source:
        cl = pickle.load(source)
    with open('af.model', 'rb') as source:
        af = pickle.load(source)
    with open('km.model', 'rb') as source:
        km = pickle.load(source)
    classes = model.best_estimator_.named_steps['svm'].classes_

    # per-source confidence filter ranges (POS / NEG / NEU score bands)
    # and voting weights -- thresholds differ per source
    cl.add_filter_ranges(**{str(POS): (1.5, float('inf')),
                            str(NEG): (float('-inf'), -1.5),
                            str(NEU): (-1.5, 1.5)})
    cl.add_weight(5, classes)
    af.add_filter_ranges(**{str(POS): (0.4, float('inf')),
                            str(NEG): (float('-inf'), -0.4),
                            str(NEU): (-0.4, 0.4)})
    af.add_weight(2.2, classes)
    km.add_filter_ranges(**{str(POS): (2.5, float('inf')),
                            str(NEG): (float('-inf'), -2.5),
                            str(NEU): (-2.5, 2.5)})
    km.add_weight(40, classes)

    # source inclusion: enable at most the one mutator selected by *token*
    feed = Feeder()
    mutators = {'km': km, 'af': af, 'cl': cl}
    if token in mutators:
        feed.add_mutator(mutators[token])

    # retrain loop, feedback, and evaluation
    for i in range(retrain):
        logging.debug('count nr. %i' % i)
        feed.add_best_n(model, amount, x_train, y_train, False, mode)
        get_score(model, x_test, y_test)