def following_maxent(): maxent.set_verbose(1) dataset = getFollowingData() print 'finished loading dataset' doCrossValidation(dataset)
def baseline(sentences, labels): maxent.set_verbose(1) m = MaxentModel() m.begin_add_event() with open(sentences) as file_content: sentences = file_content.readlines() with open(labels) as file_content: labels = file_content.readlines() for i in xrange(0, 3000): m.add_event(sentences[i].split(" "), labels[i].strip()) m.end_add_event() m.train() correct = 0 false = 0 for i in xrange(3000, len(sentences)): result = m.eval(sentences[i].split(" "), "1") result = int(round(result)) label = int(labels[i]) if result == label: correct = correct + 1 else: false = false + 1 print "correct :", correct print "false :", false print("accuracy : {:.2f}%".format(correct * 100.0 / (correct + false)))
def singleTest(train_file, test_file): maxent.set_verbose(1) model = getModel(train_file) print 'Finished learning a model.' confusion_matrix = np.zeros([4,4]) for line in open(test_file): user_id, target = line.rstrip('\n').split('\t') context = getContext(user_id) if context is None: continue weight = 1.0 predictions = model.eval_all(context) predicted_target = predictions[0][0] target = int(target) predicted_target = int(predicted_target) confusion_matrix[target][predicted_target] += 1 accuracy = (np.trace(confusion_matrix) / float(confusion_matrix.sum())) print 'Confusion Matrix:\n%s' % str(confusion_matrix) print 'Accuracy: %f\n' % accuracy return (confusion_matrix, accuracy)
def getModel(X, y): model_file = os.path.join(cachedir, __file__ + '.eng0.model2') #if os.path.exists(model_file): if False: m = cmaxent.MaxentModel() m.load(model_file) return m else: maxent.set_verbose(1) m = cmaxent.MaxentModel() m.begin_add_event() c_line = 0 for i in range(X.shape[0]): row = X.getrow(i).tocoo() context = [] for f, v in zip(row.col, row.data): context.append((str(f), v)) weight = 1 / float(4000) label = str(int(y[i])) m.add_event(context, label, weight) c_line += 1 if c_line%1000 == 0: print '%d' % c_line elif c_line%100 == 0: sys.stdout.write('.') sys.stdout.flush() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE) m.save(model_file) return m
def trainModelFromFile(filename):
    """Train a maxent model from SVMlight-style lines:
    "<label> <feature>:<value> <feature>:<value> ...".

    Fixes: the training file is now closed deterministically via `with`
    (previously the handle from `open()` was left to the GC).
    """
    assert os.path.exists(filename)
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    tick = Tick()
    with open(filename) as handle:
        for line in handle:
            tokens = line.rstrip("\n").split(" ")
            label = tokens[0]
            context = []
            for pair in tokens[1:]:
                f, v = pair.split(":")
                context.append((str(f), float(v)))
            m.add_event(context, label, 1.0)
            tick.tick()
    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE)
    return m
def getModel():
    """Train a maxent classifier with Twitter self-reveal data.

    Reads "<user_id>\t<target>" lines; users without a Twitter context are
    skipped.

    Fixes: the label file is now closed via `with`; the event weight is a
    float (1.0) for consistency with the sibling training helpers.
    """
    train_label_file = os.path.join(DATA, 'twitter/self_reveal/user_pool8.csv')
    assert(os.path.exists(train_label_file))
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    # add event reading the file one by one
    with open(train_label_file) as handle:
        for line in handle:
            user_id, target = line.rstrip('\n').split('\t')
            context = getTrTWContext(user_id)
            if context is None:
                continue
            m.add_event(context, target, 1.0)
    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
    return m
def doCrossValidation(self, dataset): maxent.set_verbose(1) tester = Tester() random.shuffle(dataset) for train, test in dp_dataset.kFolds(dataset): # training m = self.trainedModelOn(train) print 'train size', len(train) # prediction trials = [] for datum in test: context, target, weight = datum pre_target = m.predict(context) trials.append((target, pre_target)) trials = zip(*trials) tester.record(trials[0], trials[1]) print 'accuracy:', tester.accuracy() print 'confusion matrix:' print tester.confusionMatrix()
def maximize(dataset, weights, verbose = 1): ''' Returns a learned model from the given training set (dataset), and instance weights. ''' X, y = dataset maxent.set_verbose(verbose) m = cmaxent.MaxentModel() m.begin_add_event() c_line = 0 for i in range(X.shape[0]): row = X.getrow(i).tocoo() context = [] for f, v in zip(row.col, row.data): context.append((str(f), v)) weight = weights[i] label = str(y[i]) m.add_event(context, label, weight) if verbose: c_line += 1 if c_line%1000 == 0: print '%d' % c_line elif c_line%100 == 0: sys.stdout.write('.') sys.stdout.flush() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE) return m
def trainModelFromFile(filename, model_file = None): assert(os.path.exists(filename)) maxent.set_verbose(1) m = cmaxent.MaxentModel() m.begin_add_event() tick = Tick() for line in open(filename): tokens = line.rstrip('\n').split(' ') label = tokens[0] context = [] for pair in tokens[1:]: f, v = pair.split(':') context.append((str(f), float(v))) weight = 1.0 m.add_event(context, label, weight) tick.tick() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE) if model_file: m.save(model_file) print "Model saved to %s" % model_file return m
def train_model(self, corpus_dir, ambiguity_dir ): self.me.begin_add_event() #self.B = train_B_corpus(corpus_dir = corpus_dir,N_filter_func = N_filter_func) sentence = [] corpus_files = get_corpus_files(corpus_dir) for corpus_file in corpus_files: morph_analys_file = os.path.join( ambiguity_dir, os.path.basename( corpus_file ) ) morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func = self.filter_func ) for corpus_token in get_tokens_from_file(corpus_file, N_filter_func = self.filter_func ): morph_analys_token = morph_analys_tokens.next() if corpus_token[0] == EOS_TOKEN: words = [token[0].word for token in sentence] labels = [token[0].gram for token in sentence] for i,token_info in enumerate( sentence ): gold_token = token_info[0] morph_analysises = [token.gram for token in token_info[1]] if gold_token.word != token_info[1][0].word: print >>sys.stderr, u"Cannot match gold token and morph analysis token\n gold token : {0} morph analysis token : {1}".format( gold_token.word, token_info[1][0].word ) morph_analysises = None word_features = list( self.compute_features( sentence = words, i = i , prev_label= labels[ i - 1 ] if i >0 else None, analysises = morph_analysises, labels = labels) ) gold_token_gram = gold_token.gram.encode('utf-8') self.me.add_event(word_features, gold_token_gram ) sentence = [] else: sentence.append( (corpus_token[0], morph_analys_token) ) self.me.end_add_event() maxent.set_verbose(1) self.me.train( 50, 'lbfgs', 0.0 ) maxent.set_verbose(0)
def baselineText(): maxent.set_verbose(1) train_file = '../../data/semi/train_test_hardlabel/train4' model = getLearner(train_file) print 'Finished learning a model.' test_file = '../../data/semi/train_test_hardlabel/test4' confusion_matrix = np.zeros([4,4]) for line in open(test_file): user_id, target = line.rstrip('\n').split('\t') context = getContext(user_id) if context is None: continue weight = 1.0 predictions = model.eval_all(context) predicted_target = predictions[0][0] target = int(target) predicted_target = int(predicted_target) confusion_matrix[target][predicted_target] += 1 print confusion_matrix print 'Accuracy: %f' % (np.trace(confusion_matrix) / float(confusion_matrix.sum()))
def getModel(train_file, tweets_dir):
    """Train a maxent model from "<user_id>\t<label>" lines, building each
    user's context from his tweet file under tweets_dir.  Users whose tweet
    file yields no context are skipped.

    Fixes: the training file is now closed deterministically via `with`.
    """
    assert(os.path.exists(train_file))
    assert(os.path.exists(tweets_dir))
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    tick = Tick()
    with open(train_file) as handle:
        for line in handle:
            user_id, label = line.rstrip('\n').split('\t')
            tweet_file = os.path.join(tweets_dir, user_id)
            context = contextFromTweetFile(tweet_file)
            if context is None:
                continue
            m.add_event(context, label, 1.0)
            tick.tick()
    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
    return m
def trainedMaxentModel(X, y): maxent.set_verbose(1) m = cmaxent.MaxentModel() m.begin_add_event() c_line = 0 for i in range(X.shape[0]): row = X.getrow(i).tocoo() context = [] for f, v in zip(row.col, row.data): context.append((str(f), v)) weight = 1.0 label = str(int(y[i])) m.add_event(context, label, weight) c_line += 1 if c_line%1000 == 0: print '%d' % c_line elif c_line%100 == 0: sys.stdout.write('.') sys.stdout.flush() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE) return m
def backup(): test_file = os.path.join(DATA, 'twitter/annotated/ver2.8-econLabel.csv') assert(os.path.exists(test_file)) maxent.set_verbose(1) distant_model = getModel() doTest(test_file, distant_model) print MARK print "Prior Weight: %e" % PRIOR_WEIGHT
def load_models(self, model_name="name"):
    """Function that loads previously trained models saved under the
    workspace dir.

    Restores the POS and lemma maxent models plus the pickled lemmatizer
    and token/lemma frequency tables.

    Fixes: the pickle files are now opened with `with`, so the handles are
    closed instead of leaking.
    """
    maxent.set_verbose(1)
    self.M_pos = maxent.MaxentModel()
    self.M_pos.load(self.WS+model_name+".POS")
    self.M_lem = maxent.MaxentModel()
    self.M_lem.load(self.WS+model_name+".LEM")
    with open(self.WS+"lemmatizer_"+model_name+".p", "rb") as handle:
        self.lemmatizer = pickle.load(handle)
    with open(self.WS+"token_freqs"+model_name+".p", "rb") as handle:
        self.token_freqs = pickle.load(handle)
    with open(self.WS+"lemma_freqs"+model_name+".p", "rb") as handle:
        self.lemma_freqs = pickle.load(handle)
    return
def train_ne_binary_model(options, iterable):
    # Train a binary named-entity maxent model in two passes over the corpus:
    # pass 1 gathers word statistics used by the feature extractor, pass 2
    # collects features and events, collapsing B-/I- tags to "NE" and
    # everything else to "O".  The model and its statistics are saved to disk.
    # NOTE(review): 'iterable' is traversed twice -- callers must pass a
    # re-iterable sequence, not a one-shot generator; confirm at call sites.
    model = MaxentModel()
    data = {}
    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()
    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, "  ", options
    print >>sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "  {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            # Track per-label counts for words observed inside NE chunks.
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1
    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "  {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            # "^" stands in for the label preceding the first token.
            features = compute_ne_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            # Collapse the B-/I- tag set to a binary NE/O decision.
            if labels[i].startswith("B-") or labels[i].startswith("I-"):
                model.add_event(features, "NE")
            else:
                model.add_event(features, "O")
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))
    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)
    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".ne.binary.maxent")
    # NOTE(review): text mode "w" for cPickle output -- "wb" would be safer
    # on non-Unix platforms; confirm before changing.
    with open(options.model + ".ne.binary.data", "w") as handle:
        cPickle.dump(data, handle)
def train_model(options, iterable):
    """Train a multi-class maxent sequence-labelling model in two passes.

    Pass 1 collects word statistics for the feature extractor, pass 2
    collects features and one event per token with its full label.  The
    model and the pickled statistics are saved next to options.model.

    Bug fix: the first labelled occurrence of each word used to be dropped
    -- the count was only incremented when the word was already present in
    labelled_words.  Now the defaultdict is created and the count is always
    incremented, matching train_ne_binary_model.

    NOTE(review): 'iterable' is traversed twice -- callers must pass a
    re-iterable sequence, not a one-shot generator.
    """
    model = MaxentModel()
    data = {}
    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()
    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "  ", options
    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "  {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1
    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "  {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            # "^" stands in for the label preceding the first token.
            features = compute_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))
    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)
    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
def test_indomain_model(): model_file = os.path.join(DATA, 'livejournal/models/indomain0') test_file = os.path.join(DATA, 'ver2.8-hardLabel.csv') assert(os.path.exists(model_file)) assert(os.path.exists(test_file)) maxent.set_verbose(1) m = cmaxent.MaxentModel() m.load(model_file) print 'model loaded' singleTest(test_file, m)
def multiTest(): maxent.set_verbose(1) distant_model = getModel() data_dir = '../data/semi/train_test_hardlabel' accuracies = [] for i in range(5): test_file = os.path.join(data_dir, 'test' + str(i)) confusion_matrix, accuracy = singleTest(test_file, distant_model) accuracies.append(accuracy) print 'Five-fold CV Accuracy: %f' % np.array(accuracies).mean()
def test1(): maxent.set_verbose(1) m = cmaxent.MaxentModel() weights = 10 m.begin_add_event() m.add_event([('a', 1), ('b', 1)], '0', 0.1) m.add_event([('a', 1), ('b', 1)], '0', 0.1) m.add_event([('c', 1), ('b', 1)], '1', 0.1) m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE) print m.eval_all([('c', 1), ('a', 1)])
def test(): maxent.set_verbose(1) m = MaxentModel() m.begin_add_event() m.add_event(['1'], '1') m.add_event(['2'], '2') m.add_event(['3'], '3') m.end_add_event() m.train(30, 'lbfgs', 2, 1e-03) for x in map(str, range(1,4)): print "tested on:", x, "predicted:", m.eval_all([x])
def trainOn(self, train_groups):
    '''
    Train on the train set and return the trained model
    '''
    maxent.set_verbose(1)
    model = MaxentModel()
    model.begin_add_event()
    # Each training pair is (context, label).
    for context, label in train_groups:
        model.add_event(context, label)
    model.end_add_event()
    model.train(20, 'lbfgs', 1e-04, 1e-03)
    return model
def outputOnlyMatched():
    # Co-training-style evaluation: train one model on text features and one
    # on following features per fold, and score only the test instances on
    # which the two models agree.  Also reports the fraction of instances
    # that were emitted (i.e. agreed upon).
    maxent.set_verbose(1)
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    # Pair the two views instance-by-instance; presumably both datasets are
    # aligned (same users, same order) -- TODO confirm.
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'
    tester = tests.tester(4)
    n_total = 0
    n_emit = 0
    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)
        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)
        # prediction
        trials = []
        for datum in test:
            text_datum, following_datum = datum
            # 'target' is unpacked from both views; both carry the same label.
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum
            t_pre = t_model.predict(text_context)
            f_pre = f_model.predict(following_context)
            # Emit only when the two views agree on the prediction.
            if t_pre == f_pre:
                trials.append((target, t_pre))
                n_emit += 1
            n_total += 1
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
def getTrainedModel(dataset, verbose = 1):
    '''
    @param dataset A list of (text, label)
    @return A trained maxent model
    '''
    maxent.set_verbose(verbose)
    model = cmaxent.MaxentModel()
    model.begin_add_event()
    # One uniformly-weighted event per (text, label) pair.
    for text, label in dataset:
        model.add_event(extractFeatures(text), label, 1.0)
    model.end_add_event()
    model.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
    return model
def getLearner(clean_train_file='../../data/semi/train_test_hardlabel/train0',
               weak_train_file='../../data/public_stream/weak_labeled_users/train0'):
    """Train a semi-supervised maxent learner from a hand-labeled ("clean")
    training file plus a weakly labeled one.

    The defaults reproduce the original hard-coded fold-0 paths, so
    existing zero-argument callers are unaffected; other folds can now be
    trained by passing different paths.
    """
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    # Add instances from the clean training data
    addInstancesFromFile(m, clean_train_file)
    # Add instances from weakly labeled data
    addInstancesFromFile(m, weak_train_file)
    # Finish adding events (feature-count cutoff of 1), then train.
    m.end_add_event(1)
    m.train(100, 'lbfgs', 1e1, 1e-4)
    return m
def scoreDistribution(): maxent.set_verbose(1) text_dataset = getTextData() for train, test in data.kFolds(text_dataset): model = trainedModelOn(train) for datum in test: context, target, weight = datum pred = model.predict(context) model.eval_all(context) if pred != target: prob = map(itemgetter(1), sorted(model.eval_all(context), key = itemgetter(0))) print prob, target break
def trainOn(self, train_groups, n_itr = 15, var = 1, tol = 1e-5): ''' Train on the train set and return the trained model ''' print "training set:", Counter(zip(*train_groups)[1]).most_common() maxent.set_verbose(1) m = MaxentModel() m.begin_add_event() for pair in train_groups: m.add_event(pair[0], pair[1]) n_cutoff = 1 m.end_add_event(n_cutoff) m.train(n_itr, 'lbfgs', var, tol) return m
def seePredictionOnTrainingData(): maxent.set_verbose(1) dataset = getTextData() print 'finished loading dataset' for train, test in data.kFolds(dataset): m = trainedModelOn(train) print "Accuracy on Training Set" for datum in train: context, target, weight = datum print m.eval_all(context) print "Accuracy on Test Set" for datum in test: context, target, weight = datum print m.eval_all(context) break
def getModel(labeled_set):
    '''
    @param labeled_set a list of (user_id, label)
    '''
    maxent.set_verbose(1)
    model = cmaxent.MaxentModel()
    model.begin_add_event()
    # add event reading the file one by one
    for user_id, label in labeled_set:
        context = getTWContext(user_id)
        # Users without a Twitter context are skipped.
        if context is not None:
            model.add_event(context, label, 1.0)
    model.end_add_event(PRUNE_COUNT)
    model.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
    return model
def simpleEnsemble(pickup):
    # Two-view ensemble evaluation: per fold, train a text model and a
    # following model, then let the caller-supplied 'pickup' function combine
    # the two confidence distributions into a single predicted label.
    maxent.set_verbose(1)
    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    # Pair the two views instance-by-instance; presumably both datasets are
    # aligned (same users, same order) -- TODO confirm.
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'
    tester = tests.tester(4)
    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)
        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)
        # prediction
        trials = []
        for datum in test:
            text_datum, following_datum = datum
            # 'target' is unpacked from both views; both carry the same label.
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum
            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)
            # 'pickup' arbitrates between the two confidence lists; its
            # result is stringified to match the label type.
            pre_target = str(pickup(t_conf, f_conf))
            trials.append((target, pre_target))
        trials = zip(*trials)
        tester.record(trials[0], trials[1])
    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
def get_model(users_label, model_file=None): maxent.set_verbose(1) m = cmaxent.MaxentModel() m.begin_add_event() tick = Tick() for (user_id, label) in users_label: context = readFollowingContext(user_id) weight = 1.0 m.add_event(context, label, weight) tick.tick() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE) if model_file: m.save(model_file) print "Model saved to %s" % model_file return m