Example #1
def following_maxent():
    maxent.set_verbose(1)

    dataset = getFollowingData()
    print 'finished loading dataset'

    doCrossValidation(dataset)
Example #2
def baseline(sentences_file, labels_file):

    maxent.set_verbose(1)
    m = MaxentModel()
    m.begin_add_event()

    with open(sentences_file) as file_content:
        sentences = file_content.readlines()
    with open(labels_file) as file_content:
        labels = file_content.readlines()

    # Train on the first 3000 examples; the rest are held out for evaluation.
    for i in xrange(0, 3000):
        m.add_event(sentences[i].split(" "), labels[i].strip())

    m.end_add_event()

    m.train()

    correct = 0
    false = 0

    for i in xrange(3000, len(sentences)):
        result = m.eval(sentences[i].split(" "), "1")
        result = int(round(result))
        label = int(labels[i])
        if result == label:
            correct = correct + 1
        else:
            false = false + 1

    print "correct   :", correct
    print "false     :", false

    print "accuracy  : {:.2f}%".format(correct * 100.0 / (correct + false))
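The baseline above assumes two parallel files: one whitespace-tokenized sentence per line and one label per line, with the first 3000 aligned lines used for training and the remainder for testing. A minimal sketch of a compatible call, with hypothetical file names:

# sentences.txt holds one whitespace-tokenized sentence per line, e.g.
#   the movie was great
# labels.txt holds one label per line, aligned with sentences.txt, e.g.
#   1
# Both files need more than 3000 lines for the hard-coded split to work.
baseline("sentences.txt", "labels.txt")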
Example #3
def singleTest(train_file, test_file):
    maxent.set_verbose(1)

    model = getModel(train_file)

    print 'Finished learning a model.'

    confusion_matrix = np.zeros([4,4])

    for line in open(test_file):
        user_id, target = line.rstrip('\n').split('\t')
        context = getContext(user_id)

        if context is None:
            continue

        weight = 1.0
        predictions = model.eval_all(context)
        predicted_target = predictions[0][0]

        target = int(target)
        predicted_target = int(predicted_target)

        confusion_matrix[target][predicted_target] += 1


    accuracy = (np.trace(confusion_matrix) /
                        float(confusion_matrix.sum()))

    print 'Confusion Matrix:\n%s' % str(confusion_matrix)
    print 'Accuracy: %f\n' % accuracy

    return (confusion_matrix, accuracy)
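singleTest relies on eval_all returning (label, probability) pairs ordered by decreasing probability, which is why predictions[0][0] is taken as the predicted label. The same argmax step written out explicitly, reusing model and context from the function above:

predictions = model.eval_all(context)   # e.g. [('2', 0.61), ('0', 0.25), ...]
best_label, best_prob = predictions[0]  # first pair is the most probable label
assert best_prob == max(p for _, p in predictions)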
Example #4
def getModel(X, y):
    model_file = os.path.join(cachedir, __file__ + '.eng0.model2')

    # Loading the cached model is disabled; the model is always retrained.
    #if os.path.exists(model_file):
    if False:
        m = cmaxent.MaxentModel()
        m.load(model_file)
        return m

    else:
        maxent.set_verbose(1)
        m = cmaxent.MaxentModel()
        m.begin_add_event()
        c_line = 0
        for i in range(X.shape[0]):
            row = X.getrow(i).tocoo()
            context = []
            for f, v in zip(row.col, row.data):
                context.append((str(f), v))
            weight = 1 / float(4000)
            label = str(int(y[i]))
            m.add_event(context, label, weight)

            c_line += 1
            if c_line%1000 == 0:
                print '%d' % c_line
            elif c_line%100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

        m.end_add_event(PRUNE_COUNT)
        m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)
        m.save(model_file)

        return m
Example #5
def trainModelFromFile(filename):
    assert os.path.exists(filename)

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    for line in open(filename):
        tokens = line.rstrip("\n").split(" ")
        label = tokens[0]
        context = []
        for pair in tokens[1:]:
            f, v = pair.split(":")
            context.append((str(f), float(v)))
        weight = 1.0
        m.add_event(context, label, weight)

        tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE)

    return m
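trainModelFromFile expects one training event per line: a label token followed by space-separated feature:value pairs. A minimal sketch with hypothetical feature names:

with open("toy.train", "w") as f:    # hypothetical training file
    f.write("1 f12:1.0 f44:0.5\n")   # label, then feature:value pairs
    f.write("0 f12:1.0 f7:2.0\n")
m = trainModelFromFile("toy.train")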
Example #6
def getModel():
    '''
    Train a maxent classifier with Twitter data
    '''
    train_label_file = os.path.join(DATA,
            'twitter/self_reveal/user_pool8.csv')
    assert(os.path.exists(train_label_file))

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    # add event reading the file one by one
    for line in open(train_label_file):
        user_id, target = line.rstrip('\n').split('\t')
        context = getTrTWContext(user_id)

        if context is None:
            continue

        weight = 1
        m.add_event(context, target, weight)

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
Example #7
    def doCrossValidation(self, dataset):
        maxent.set_verbose(1)
        tester = Tester()
        random.shuffle(dataset)

        for train, test in dp_dataset.kFolds(dataset):
            # training
            m = self.trainedModelOn(train)
            print 'train size', len(train)

            # prediction
            trials = []

            for datum in test:
                context, target, weight = datum
                pre_target = m.predict(context)
                trials.append((target, pre_target))

            trials = zip(*trials)

            tester.record(trials[0], trials[1])

        print 'accuracy:', tester.accuracy()
        print 'confusion matrix:'
        print tester.confusionMatrix()
Example #8
def maximize(dataset, weights, verbose = 1):
    '''
    Returns a learned model from the given training set (dataset), and instance
    weights.
    '''
    X, y = dataset

    maxent.set_verbose(verbose)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    c_line = 0
    for i in range(X.shape[0]):
        row = X.getrow(i).tocoo()
        context = []
        for f, v in zip(row.col, row.data):
            context.append((str(f), v))
        weight = weights[i]
        label = str(y[i])
        m.add_event(context, label, weight)

        if verbose:
            c_line += 1
            if c_line%1000 == 0:
                print '%d' % c_line
            elif c_line%100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

    m.end_add_event(PRUNE_COUNT)

    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
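maximize is the only variant on this page that threads per-instance weights into add_event, which is what boosting-style reweighting schemes need; with uniform weights it reduces to ordinary training. A minimal sketch, assuming X is a scipy.sparse matrix and y an array of labels as in the surrounding examples:

import numpy as np

uniform = np.ones(X.shape[0]) / X.shape[0]   # equal weight for every instance
m = maximize((X, y), uniform, verbose=0)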
Example #9
def trainModelFromFile(filename, model_file = None):
    assert(os.path.exists(filename))

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    for line in open(filename):
        tokens = line.rstrip('\n').split(' ')
        label = tokens[0]
        context = []
        for pair in tokens[1:]:
            f, v = pair.split(':')
            context.append((str(f), float(v)))
        weight = 1.0
        m.add_event(context, label, weight)

        tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    if model_file:
        m.save(model_file)
        print "Model saved to %s" % model_file

    return m
Example #10
    def train_model(self, corpus_dir, ambiguity_dir):
        self.me.begin_add_event()
        #self.B = train_B_corpus(corpus_dir = corpus_dir,N_filter_func = N_filter_func)
        sentence = []

        corpus_files = get_corpus_files(corpus_dir)
        for corpus_file in corpus_files:

            morph_analys_file = os.path.join(ambiguity_dir, os.path.basename(corpus_file))
            morph_analys_tokens = get_tokens_from_file(morph_analys_file, N_filter_func=self.filter_func)

            for corpus_token in get_tokens_from_file(corpus_file, N_filter_func=self.filter_func):

                morph_analys_token = morph_analys_tokens.next()
                if corpus_token[0] == EOS_TOKEN:
                    words = [token[0].word for token in sentence]
                    labels = [token[0].gram for token in sentence]
                    for i, token_info in enumerate(sentence):
                        gold_token = token_info[0]
                        morph_analysises = [token.gram for token in token_info[1]]
                        if gold_token.word != token_info[1][0].word:
                            print >>sys.stderr, (
                                u"Cannot match gold token and morph analysis token\n"
                                u" gold token : {0}     morph analysis token : {1}"
                                .format(gold_token.word, token_info[1][0].word))
                            morph_analysises = None
                        word_features = list(self.compute_features(
                            sentence=words, i=i,
                            prev_label=labels[i - 1] if i > 0 else None,
                            analysises=morph_analysises, labels=labels))
                        gold_token_gram = gold_token.gram.encode('utf-8')
                        self.me.add_event(word_features, gold_token_gram)
                    sentence = []
                else:
                    sentence.append((corpus_token[0], morph_analys_token))

        self.me.end_add_event()
        maxent.set_verbose(1)
        self.me.train( 50, 'lbfgs', 0.0 )
        maxent.set_verbose(0)
Example #11
def baselineText():
    maxent.set_verbose(1)

    train_file = '../../data/semi/train_test_hardlabel/train4'
    model = getLearner(train_file)

    print 'Finished learning a model.'

    test_file = '../../data/semi/train_test_hardlabel/test4'

    confusion_matrix = np.zeros([4,4])

    for line in open(test_file):
        user_id, target = line.rstrip('\n').split('\t')
        context = getContext(user_id)

        if context is None:
            continue

        weight = 1.0
        predictions = model.eval_all(context)
        predicted_target = predictions[0][0]

        target = int(target)
        predicted_target = int(predicted_target)

        confusion_matrix[target][predicted_target] += 1

    print confusion_matrix
    print 'Accuracy: %f' % (np.trace(confusion_matrix) /
                            float(confusion_matrix.sum()))
Example #12
def getModel(train_file, tweets_dir):
    assert(os.path.exists(train_file))
    assert(os.path.exists(tweets_dir))

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    for line in open(train_file):
        user_id, label = line.rstrip('\n').split('\t')
        tweet_file = os.path.join(tweets_dir, user_id)
        context = contextFromTweetFile(tweet_file)

        if context is None:
            continue

        weight = 1.0
        m.add_event(context, label, weight)

        tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
Example #13
def trainedMaxentModel(X, y):
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()
    c_line = 0
    for i in range(X.shape[0]):
        row = X.getrow(i).tocoo()
        context = []
        for f, v in zip(row.col, row.data):
            context.append((str(f), v))
        weight = 1.0
        label = str(int(y[i]))
        m.add_event(context, label, weight)

        c_line += 1
        if c_line%1000 == 0:
            print '%d' % c_line
        elif c_line%100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
Example #14
def backup():
    test_file = os.path.join(DATA, 'twitter/annotated/ver2.8-econLabel.csv')
    assert(os.path.exists(test_file))
    maxent.set_verbose(1)
    distant_model = getModel()
    doTest(test_file, distant_model)
    print MARK
    print "Prior Weight: %e" % PRIOR_WEIGHT
Example #15
    def load_models(self, model_name="name"):
        """Function that loads previously trained models saved under the workspace dir."""
        maxent.set_verbose(1)
        self.M_pos = maxent.MaxentModel()
        self.M_pos.load(self.WS + model_name + ".POS")
        self.M_lem = maxent.MaxentModel()
        self.M_lem.load(self.WS + model_name + ".LEM")
        self.lemmatizer = pickle.load(open(self.WS + "lemmatizer_" + model_name + ".p", "rb"))
        self.token_freqs = pickle.load(open(self.WS + "token_freqs" + model_name + ".p", "rb"))
        self.lemma_freqs = pickle.load(open(self.WS + "lemma_freqs" + model_name + ".p", "rb"))
Example #16
def train_ne_binary_model(options, iterable):
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >>sys.stderr, "*** Training options are:"
    print >>sys.stderr, "   ", options

    print >>sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >>sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >>sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_ne_features(data, words, poses, i, labels[i - 1] if i >= 1 else "^")
            features = list(features)
            if labels[i].startswith("B-") or labels[i].startswith("I-"):
                model.add_event(features, "NE")
            else:
                model.add_event(features, "O")

            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >>sys.stderr, "*** Collected {0} features.".format(len(data["feature_set"]))

    print >>sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >>sys.stderr, "*** Saving..."
    model.save(options.model + ".ne.binary.maxent")
    with open(options.model + ".ne.binary.data", "w") as handle:
        cPickle.dump(data, handle)
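The binary variant above collapses every B-/I- label into a single NE class, so the model only separates entity tokens from non-entities; train_model in the next example keeps the full label set instead. The collapsing rule, isolated as a sketch:

def collapse_label(label):
    # "B-PER", "I-ORG", ... -> "NE"; everything else -> "O"
    if label.startswith("B-") or label.startswith("I-"):
        return "NE"
    return "O"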
Example #17
def train_model(options, iterable):
    model = MaxentModel()
    data = {}

    data["feature_set"] = set()
    data["word_frequencies"] = defaultdict(long)
    # XXX(sandello): defaultdict(lambda: defaultdict(long)) would be
    # a better choice here (for |labelled_words|) but it could not be pickled.
    # C'est la vie.
    data["labelled_words"] = dict()

    print >> sys.stderr, "*** Training options are:"
    print >> sys.stderr, "   ", options

    print >> sys.stderr, "*** First pass: Computing statistics..."
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        for word, pos, label in sentence:
            data["word_frequencies"][word] += 1
            if label.startswith("B-") or label.startswith("I-"):
                if word not in data["labelled_words"]:
                    data["labelled_words"][word] = defaultdict(long)
                data["labelled_words"][word][label] += 1

    print >> sys.stderr, "*** Second pass: Collecting features..."
    model.begin_add_event()
    for n, sentence in enumerate(iterable):
        if (n % 1000) == 0:
            print >> sys.stderr, "   {0:6d} sentences...".format(n)
        words, poses, labels = map(list, zip(*sentence))
        for i in xrange(len(labels)):
            features = compute_features(data, words, poses, i,
                                        labels[i - 1] if i >= 1 else "^")
            features = list(features)
            model.add_event(features, labels[i])
            for feature in features:
                data["feature_set"].add(feature)
    model.end_add_event(options.cutoff)
    print >> sys.stderr, "*** Collected {0} features.".format(
        len(data["feature_set"]))

    print >> sys.stderr, "*** Training..."
    maxent.set_verbose(1)
    model.train(options.iterations, options.technique, options.gaussian)
    maxent.set_verbose(0)

    print >> sys.stderr, "*** Saving..."
    model.save(options.model + ".maxent")
    with open(options.model + ".data", "w") as handle:
        cPickle.dump(data, handle)
Example #18
def test_indomain_model():
    model_file = os.path.join(DATA, 'livejournal/models/indomain0')
    test_file = os.path.join(DATA, 'ver2.8-hardLabel.csv')
    assert(os.path.exists(model_file))
    assert(os.path.exists(test_file))

    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.load(model_file)

    print 'model loaded'

    singleTest(test_file, m)
Example #19
def multiTest():
    maxent.set_verbose(1)
    distant_model = getModel()

    data_dir = '../data/semi/train_test_hardlabel'

    accuracies = []
    for i in range(5):
        test_file = os.path.join(data_dir, 'test' + str(i))

        confusion_matrix, accuracy = singleTest(test_file, distant_model)
        accuracies.append(accuracy)
    print 'Five-fold CV Accuracy: %f' % np.array(accuracies).mean()
Example #20
def test1():
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()

    m.begin_add_event()
    m.add_event([('a', 1), ('b', 1)], '0', 0.1)
    m.add_event([('a', 1), ('b', 1)], '0', 0.1)
    m.add_event([('c', 1), ('b', 1)], '1', 0.1)
    m.end_add_event(PRUNE_COUNT)

    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    print m.eval_all([('c', 1), ('a', 1)])
Example #21
def test():
    maxent.set_verbose(1)

    m = MaxentModel()

    m.begin_add_event()
    m.add_event(['1'], '1')
    m.add_event(['2'], '2')
    m.add_event(['3'], '3')
    m.end_add_event()

    m.train(30, 'lbfgs', 2, 1e-03)

    for x in map(str, range(1,4)):
        print "tested on:", x, "predicted:", m.eval_all([x])
Example #22
    def trainOn(self, train_groups):
        ''' Train on the train set and return the trained model '''
        maxent.set_verbose(1)

        m = MaxentModel()

        m.begin_add_event()

        for pair in train_groups:
            m.add_event(pair[0], pair[1])

        m.end_add_event()

        m.train(20, 'lbfgs', 1e-04, 1e-03)

        return m
Example #23
def outputOnlyMatched():
    maxent.set_verbose(1)

    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    n_total = 0
    n_emit = 0

    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_pre = t_model.predict(text_context)
            f_pre = f_model.predict(following_context)

            if t_pre == f_pre:
                trials.append((target, t_pre))
                n_emit += 1

            n_total += 1

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
    print 'emitted portion:', float(n_emit) / float(n_total)
Example #24
def getTrainedModel(dataset, verbose = 1):
    '''
    @param dataset  A list of (text, label)
    @return         A trained maxent model
    '''
    maxent.set_verbose(verbose)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    for text, label in dataset:
        context = extractFeatures(text)
        weight = 1.0
        m.add_event(context, label, weight)

    m.end_add_event()
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
Example #25
def getLearner():
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    # Add instances from the clean training data
    train_label_file = '../../data/semi/train_test_hardlabel/train0'
    addInstancesFromFile(m, train_label_file)

    # Add instances from weakly labeled data
    train_label_file = '../../data/public_stream/weak_labeled_users/train0'
    addInstancesFromFile(m, train_label_file)

    # Finish adding events
    m.end_add_event(1)
    m.train(100, 'lbfgs', 1e1, 1e-4)

    return m
Example #26
def scoreDistribution():
    maxent.set_verbose(1)
    text_dataset = getTextData()

    for train, test in data.kFolds(text_dataset):
        model = trainedModelOn(train)

        for datum in test:
            context, target, weight = datum
            pred = model.predict(context)

            if pred != target:
                prob = map(itemgetter(1),
                        sorted(model.eval_all(context), key = itemgetter(0)))
                print prob, target

        break
Example #27
    def trainOn(self, train_groups, n_itr = 15, var = 1, tol = 1e-5):
        ''' Train on the train set and return the trained model '''

        print "training set:", Counter(zip(*train_groups)[1]).most_common()

        maxent.set_verbose(1)

        m = MaxentModel()

        m.begin_add_event()

        for pair in train_groups:
            m.add_event(pair[0], pair[1])

        n_cutoff = 1
        m.end_add_event(n_cutoff)

        m.train(n_itr, 'lbfgs', var, tol)

        return m
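This signature is the most explicit on the page about what the positional train() arguments mean: iteration count, technique, Gaussian prior (options.gaussian in Examples #16 and #17), and convergence tolerance. A self-contained sketch with the parameters annotated, assuming the same maxent module used above:

from maxent import MaxentModel

m = MaxentModel()
m.begin_add_event()
m.add_event(['f1'], '0')
m.add_event(['f2'], '1')
m.end_add_event(1)  # n_cutoff, as in trainOn above
m.train(
    15,        # n_itr: maximum L-BFGS iterations
    'lbfgs',   # optimization technique
    1,         # var: Gaussian prior, used for smoothing
    1e-5,      # tol: convergence tolerance
)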
Example #28
def seePredictionOnTrainingData():
    maxent.set_verbose(1)

    dataset = getTextData()
    print 'finished loading dataset'

    for train, test in data.kFolds(dataset):
        m = trainedModelOn(train)

        print "Accuracy on Training Set"
        for datum in train:
            context, target, weight = datum
            print m.eval_all(context)

        print "Accuracy on Test Set"
        for datum in test:
            context, target, weight = datum
            print m.eval_all(context)

        break
Example #29
def getModel(labeled_set):
    '''
    @param  labeled_set     a list of (user_id, label)
    '''
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    # add event reading the file one by one
    for user_id, label in labeled_set:
        context = getTWContext(user_id)
        if context is None:
            continue
        weight = 1.0
        m.add_event(context, label, weight)

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, 'lbfgs', PRIOR_WEIGHT, TOLERANCE)

    return m
Example #30
def simpleEnsemble(pickup):
    maxent.set_verbose(1)

    text_dataset = getTextData(False)
    following_dataset = getFollowingData(False)
    dataset = zip(text_dataset, following_dataset)
    random.shuffle(dataset)
    print 'finished loading dataset'

    tester = tests.tester(4)

    for train, test in data.kFolds(dataset):
        text_train, following_train = zip(*train)

        # training
        t_model = trainedModelOn(text_train)
        f_model = trainedModelOn(following_train)

        # prediction
        trials = []

        for datum in test:
            text_datum, following_datum = datum
            text_context, target, weight = text_datum
            following_context, target, weight = following_datum

            t_conf = t_model.eval_all(text_context)
            f_conf = f_model.eval_all(following_context)

            pre_target = str(pickup(t_conf, f_conf))

            trials.append((target, pre_target))

        trials = zip(*trials)
        tester.record(trials[0], trials[1])

    print 'accuracy:', tester.accuracy()
    print 'confusion matrix:'
    print tester.confusionMatrix()
Example #31
def get_model(users_label, model_file=None):
    maxent.set_verbose(1)
    m = cmaxent.MaxentModel()
    m.begin_add_event()

    tick = Tick()

    for (user_id, label) in users_label:
        context = readFollowingContext(user_id)
        weight = 1.0
        m.add_event(context, label, weight)

        tick.tick()

    m.end_add_event(PRUNE_COUNT)
    m.train(LBFGS_ITERATION, "lbfgs", PRIOR_WEIGHT, TOLERANCE)

    if model_file:
        m.save(model_file)
        print "Model saved to %s" % model_file

    return m
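Several examples above (#4, #15, #18) pair this training pattern with save() and load() for persistence. A minimal round trip, with a hypothetical model path and user id, sketching how get_model would typically be reused:

m = get_model(users_label, model_file="following.model")   # train and save

m2 = cmaxent.MaxentModel()
m2.load("following.model")                      # restore the saved model later
print m2.eval_all(readFollowingContext(some_user_id))   # hypothetical user id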