def make_guess_tweet(model, user_ids, tweets_data_dir, output_file):
    '''Predict a label for each user from their retrieved tweet file.

    Unlike make_guess(), which reads preprocessed text files, this builds
    the feature context directly from the raw tweet file for each user.

    Writes one "user_id<TAB>predicted_label<TAB>confidence" line per user
    to output_file.  Users with no tweet file, or whose context cannot be
    built, are skipped silently.
    '''
    assert os.path.exists(tweets_data_dir)
    assert not os.path.exists(output_file)  # refuse to clobber an existing result
    tick = Tick()
    # 'with' guarantees the output file is flushed/closed even if
    # eval_all() or the write raises mid-loop (original leaked the handle)
    with open(output_file, 'w') as fout:
        for user_id in user_ids:
            fname = os.path.join(tweets_data_dir, user_id)
            if not os.path.exists(fname):
                continue
            context = contextFromTweetFile(fname)
            if context is None:
                continue
            # eval_all() returns (label, confidence) pairs; [0] is the top guess
            p_label, conf = model.eval_all(context)[0]
            fout.write(user_id + '\t' + p_label + '\t' + str(conf) + '\n')
            tick.tick()
def trainModelFromFile(filename):
    '''Train a maxent model from a file of pre-vectorized events.

    Each input line has the form "<label> <feat>:<val> <feat>:<val> ...".
    Returns the trained cmaxent.MaxentModel.

    NOTE(review): a second trainModelFromFile() defined later in this
    file shadows this one at import time — consider removing one of them.
    '''
    assert os.path.exists(filename)
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    tick = Tick()
    # 'with' closes the training file deterministically (original left
    # the handle open for the lifetime of the process)
    with open(filename) as fin:
        for line in fin:
            tokens = line.rstrip("\n").split(" ")
            label = tokens[0]
            context = []
            for pair in tokens[1:]:
                f, v = pair.split(":")
                context.append((str(f), float(v)))
            weight = 1.0  # all events weighted equally
            m.add_event(context, label, weight)
            tick.tick()
    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE)
    return m
def trainModelFromFile(filename, model_file = None): assert(os.path.exists(filename)) maxent.set_verbose(1) m = cmaxent.MaxentModel() m.begin_add_event() tick = Tick() for line in open(filename): tokens = line.rstrip('\n').split(' ') label = tokens[0] context = [] for pair in tokens[1:]: f, v = pair.split(':') context.append((str(f), float(v))) weight = 1.0 m.add_event(context, label, weight) tick.tick() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE) if model_file: m.save(model_file) print "Model saved to %s" % model_file return m
def getModel(train_file, tweets_dir):
    '''Train a maxent model from a labelled user list plus raw tweets.

    train_file has one "user_id<TAB>label" per line; the feature context
    for each user is built from <tweets_dir>/<user_id> via
    contextFromTweetFile().  Users whose context cannot be built are
    skipped.  Returns the trained cmaxent.MaxentModel.
    '''
    assert os.path.exists(train_file)
    assert os.path.exists(tweets_dir)
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    tick = Tick()
    # 'with' closes the label file deterministically (original left the
    # handle open for the lifetime of the process)
    with open(train_file) as fin:
        for line in fin:
            user_id, label = line.rstrip('\n').split('\t')
            tweet_file = os.path.join(tweets_dir, user_id)
            context = contextFromTweetFile(tweet_file)
            if context is None:
                continue
            weight = 1.0  # all events weighted equally
            m.add_event(context, label, weight)
            tick.tick()
    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
    return m
def get_model(users_label, model_file=None): maxent.set_verbose(1) m = cmaxent.MaxentModel() m.begin_add_event() tick = Tick() for (user_id, label) in users_label: context = readFollowingContext(user_id) weight = 1.0 m.add_event(context, label, weight) tick.tick() m.end_add_event(PRUNE_COUNT) m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE) if model_file: m.save(model_file) print "Model saved to %s" % model_file return m
def make_guess(model, user_ids, text_data_dir, output_file):
    '''Predict a label for each user from a preprocessed text file.

    Reads <text_data_dir>/<user_id>, builds a context with
    contextFromText(), and writes one
    "user_id<TAB>predicted_label<TAB>confidence" line per user to
    output_file.  Users with no text file are skipped silently.
    '''
    assert os.path.exists(text_data_dir)
    assert not os.path.exists(output_file)  # refuse to clobber an existing result
    tick = Tick()
    # 'with' closes both handles even on error; the original leaked the
    # per-user input handle (open(fname).read()) and left fout open on
    # an exception mid-loop
    with open(output_file, 'w') as fout:
        for user_id in user_ids:
            fname = os.path.join(text_data_dir, user_id)
            if not os.path.exists(fname):
                continue
            with open(fname) as fin:
                text = fin.read()
            context = contextFromText(text)
            # eval_all() returns (label, confidence) pairs; [0] is the top guess
            p_label, conf = model.eval_all(context)[0]
            fout.write(user_id + '\t' + p_label + '\t' + str(conf) + '\n')
            tick.tick()
tweets_dir = os.path.join(DATA, 'twitter/tweets_data_3200') #model_file = os.path.join(DATA, 'twitter/models/Dec17_text_heuristic.model') model_file = os.path.join(DATA, 'twitter/models/Dec17_text_heuristic_1e4.model') test_file = os.path.join(DATA, 'twitter/annotated/Dec16_g6Label_split2.csv') # load the pre-trained model, the features (or context) in the model are parsed # words, not integer mapped values model = cmaxent.MaxentModel() model.load(model_file) print 'model loaded from %s' % model_file # Collect the prediction and real class label tick = Tick() prediction_real_pairs = [] for line in open(test_file): user_id, r_label = line.rstrip('\n').split('\t') tweet_file = os.path.join(tweets_dir, user_id) context = contextFromTweetFile(tweet_file) if context is None: continue p_label = model.predict(context) prediction_real_pairs.append((int(p_label), int(r_label))) tick.tick() print 'prediction finished on the file %s' % test_file