Example 1
    def doc2vec_embedding(self, min_count=2, epochs=200):
        """
        embed using doc2vec
        :return:
        """
        # fit embedding by doc2vec
        print('{}: starting fitting doc2vec model'.format(
            time.asctime(time.localtime(time.time()))))
        self.doc2vec_model = Doc2Vec(fname='',
                                     linux=False,
                                     use_file=False,
                                     data=self.data,
                                     vector_size=self.embedding_size,
                                     min_count=min_count,
                                     epochs=epochs)
        print('{}: finish fitting doc2vec model'.format(
            time.asctime(time.localtime(time.time()))))

        # save trained model
        self.doc2vec_fitted_model_file_path = os.path.join(
            self.data_directory, 'doc2vec_submission_titles.pkl')
        print('{}: starting saving doc2vec model'.format(
            time.asctime(time.localtime(time.time()))))
        joblib.dump(self.doc2vec_model, self.doc2vec_fitted_model_file_path)
        print('{}: finishing saving doc2vec model'.format(
            time.asctime(time.localtime(time.time()))))

        return
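A minimal reload sketch for the artifact saved above, assuming the script runs from the data directory; the wrapper's inference API is not shown in this example, so the infer_vector() call below is a gensim-style assumption and purely illustrative.

import joblib

# Reload the Doc2Vec wrapper written by doc2vec_embedding().
model = joblib.load('doc2vec_submission_titles.pkl')

# Hypothetical call: assumes the wrapper exposes a gensim-style infer_vector().
vector = model.infer_vector(['new', 'submission', 'title'])
print(vector[:5])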
Example 2
def main():
    numProcs = 3
    taskID = process.fork_processes(numProcs, max_restarts=0)
    port = BASE_PORT + taskID
    if taskID == 0:
        app = httpserver.HTTPServer(tornado.web.Application([
            (r"/submit", Web)], **SETTINGS))
        logging.info("webapp listening on %d" % port)
    else:
        # load the trained DM or DBOW model: worker 1 serves DM, the other workers serve DBOW
        if os.path.isfile(dmLabeled) and os.path.isfile(dbowLabeled):
            fname = dmLabeled if taskID == 1 else dbowLabeled
            model = Doc2Vec.load(fname)   
        else:
            raise RuntimeError("Must first train doc2vec model")

        app = httpserver.HTTPServer(web.Application([(r"/doc2vec", Doc2vecServer, dict(model = model))]))
        logging.info("Doc2vec server %d listening on %d" % (taskID, port))
    
    app.add_sockets(netutil.bind_sockets(port))
    IOLoop.current().start()
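A hedged client-side sketch for the worker processes above; the Doc2vecServer handler's query parameters and response format are not shown in this example, so the parameter name and endpoint behaviour below are hypothetical.

import requests

# Hypothetical request: the worker with taskID 1 listens on BASE_PORT + 1.
resp = requests.get('http://localhost:%d/doc2vec' % (BASE_PORT + 1),
                    params={'text': 'a document to embed'})
print(resp.status_code, resp.text[:80])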
Example 3
def get_res(iter, baseline):
    train_vec = pd.read_csv('../content_vec_withoutD.csv', header=None)
    test_file = pd.read_csv('../data/test_public.csv')
    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)
    data = pd.read_csv('../data/train.csv')
    subject_vocab = list(
        ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观'])
    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break

    value_list = list()
    for i in data['sentiment_value']:
        value_list.append(i)

    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for i in range(train_vec.shape[0]):
        for j in range(train_vec.shape[1]):
            if train_vec[i][j] > 0:
                train_vec[i][j] = bdc[j]

    print(train_vec)
    test_vec = Doc2Vec.test2vec()
    for i in range(test_vec.shape[0]):
        for j in range(test_vec.shape[1]):
            if test_vec[i][j] > 0:
                test_vec[i][j] = bdc[j]

    print(test_vec)
    test_id = list(test_file['content_id'])

    res_id, res_subject, value_list = Lgb.cal_subject_mul(
        train_vec, subject_list, test_id, test_vec, iter, baseline)

    GetResult.res2doc_mul(res_id, res_subject, value_list)
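The nested loops above simply replace every nonzero entry in column j with the weight bdc[j]; assuming bdc is an array-like of length train_vec.shape[1], the same re-weighting can be written as one vectorized NumPy step (sketch):

import numpy as np

# Broadcast the per-feature bdc weights across rows, leaving zero entries untouched.
train_vec = np.where(train_vec > 0, bdc, train_vec)
test_vec = np.where(test_vec > 0, bdc, test_vec)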
Example 4
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# aba : just some tests to see that it is working

import logging
import sys
import os
from word2vec import Word2Vec
from doc2vec import Doc2Vec, LineSentence

logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.info("running %s" % " ".join(sys.argv))

input_file = 'test.txt'
model = Word2Vec(LineSentence(input_file), size=100, window=5, sg=0, min_count=5, workers=8)
model.save(input_file + '.model')
model.save_word2vec_format(input_file + '.vec')

# aba : initialize Doc2Vec with the already-learned word vectors via model_file
sent_file = 'sent.txt'
model = Doc2Vec(LineSentence(sent_file), model_file=input_file + '.model')
model.save_doc2vec_format(sent_file + '.vec')

program = os.path.basename(sys.argv[0])
logging.info("finished running %s" % program)
Example 5
words = words[:10000]
print('Data size', len(words))

# fake some docs
doc_length = 100
docs = [words[i:i+doc_length] for i in range(0, len(words), doc_length)]

vocabulary_size = 500

d2v = Doc2Vec(vocabulary_size=vocabulary_size, 
	document_size=len(docs),
	n_steps=2001)

# print w2v.get_params()
d2v.fit(docs)
print(d2v.word_embeddings.shape)
print(d2v.doc_embeddings.shape)

save_path = d2v.save('models/test_d2v_model')
print(d2v.word_embeddings[0,0])
print(d2v.doc_embeddings[0,0])

print(save_path)

# restore a saved model
d2v_restored = Doc2Vec.restore(save_path)
print(d2v_restored.word_embeddings[0,0])
print(d2v_restored.doc_embeddings[0,0])
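For clarity, the doc-faking comprehension above just slices the token stream into fixed-length pseudo-documents; on a toy list it behaves like this (illustrative only):

words_demo = list(range(10))
doc_length_demo = 4
# Produces [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]].
print([words_demo[i:i + doc_length_demo]
       for i in range(0, len(words_demo), doc_length_demo)])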


Example 6
filename = 'text8.zip'
words = read_data(filename)
words = words[:10000]
print('Data size', len(words))

# fake some docs
doc_length = 100
docs = [words[i:i + doc_length] for i in range(0, len(words), doc_length)]

vocabulary_size = 500

d2v = Doc2Vec(vocabulary_size=vocabulary_size,
              document_size=len(docs),
              n_steps=2001)

# print w2v.get_params()
d2v.fit(docs)
print(d2v.word_embeddings.shape)
print(d2v.doc_embeddings.shape)

save_path = d2v.save('models/test_d2v_model')
print(d2v.word_embeddings[0, 0])
print(d2v.doc_embeddings[0, 0])

print(save_path)

# restore a saved model
d2v_restored = Doc2Vec.restore(save_path)
Example 7
# -*- coding: utf-8 -*-
import os, sys
from settings import size, dmUnlabeled, dbowUnlabeled,\
 dmLabeled, dbowLabeled, useModifiedModule, testRes, trainedClassifer
import logging
import numpy as np
import pickle
from sklearn.linear_model import SGDClassifier
from doc2vec import Doc2Vec
from doc2vec import LabeledSentence
from docVecTrain import flushLoggerInfo, cleanUpText

if __name__ == "__main__":
    
    flushLoggerInfo()
    if os.path.isfile(dmLabeled) and os.path.isfile(dbowLabeled)\
       and os.path.isfile(trainedClassifer):
        model_dm = Doc2Vec.load(dmLabeled)   
        model_dbow = Doc2Vec.load(dbowLabeled)
        neg = "This movie is an absolute disaster within a disaster film. It is full of great action scenes, which are only meaningful if you throw away all sense of reality. Let's see, word to the wise, lava burns you; steam burns you. You can't stand next to lava. Diverting a minor lava flow is difficult, let alone a significant one. Scares me to think that some might actually believe what they saw in this movie.<br /><br />Even worse is the significant amount of talent that went into making this film. I mean the acting is actually very good. The effects are above average. Hard to believe somebody read the scripts for this and allowed all this talent to be wasted. I guess my suggestion would be that if this movie is about to start on TV ... look away! It is like a train wreck: it is so awful that once you know what is coming, you just have to watch. Look away and spend your time on more meaningful content."
        neg_test_vecs = np.hstack((model_dm.train_online(cleanUpText(neg)), model_dbow.train_online(cleanUpText(neg))))
        pos = "Naturally in a film who's main themes are of mortality, nostalgia, and loss of innocence it is perhaps not surprising that it is rated more highly by older viewers than younger ones. However there is a craftsmanship and completeness to the film which anyone can enjoy. The pace is steady and constant, the characters full and engaging, the relationships and interactions natural showing that you do not need floods of tears to show emotion, screams to show fear, shouting to show dispute or violence to show anger. Naturally Joyce's short story lends the film a ready made structure as perfect as a polished diamond, but the small changes Huston makes such as the inclusion of the poem fit in neatly. It is truly a masterpiece of tact, subtlety and overwhelming beauty."
        pos_test_vecs = np.hstack((model_dm.train_online(cleanUpText(pos)), model_dbow.train_online(cleanUpText(pos))))
        with open(trainedClassifer, 'rb') as f:
            lr = pickle.load(f)
            # we expect the negative review to be classified as 0, but there is no guarantee
            print(lr.predict([neg_test_vecs, pos_test_vecs]))
    else:
        print('missing trained doc2vec models or classifier; train them first')
        sys.exit(1)
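The pickled classifier loaded above is expected to have been fit on the same concatenated dm+dbow vectors; a hedged sketch of that earlier training step, where train_vecs and labels are hypothetical arrays of hstacked review vectors and 0/1 sentiment targets:

# Hypothetical training step: train_vecs holds the hstacked dm+dbow vectors for
# the labelled reviews, labels holds their 0/1 sentiment targets.
lr = SGDClassifier()
lr.fit(train_vecs, labels)
with open(trainedClassifer, 'wb') as f:
    pickle.dump(lr, f)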
Example 8
def cv_test_mul():
    train_vec = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/content_vec_withoutD.csv',
        header=None)
    test_file = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/test_public.csv')

    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)
    data = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/train.csv')
    subject_vocab = list(
        ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观'])

    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    subject_list = np.array(subject_list)

    value_list = list()
    for i in data['sentiment_value']:
        value_list.append(i)
    value_list = np.array(value_list)

    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for i in range(train_vec.shape[0]):
        for j in range(train_vec.shape[1]):
            if train_vec[i][j] > 0:
                train_vec[i][j] = bdc[j]

    print(train_vec)
    test_vec = Doc2Vec.test2vec()
    for i in range(test_vec.shape[0]):
        for j in range(test_vec.shape[1]):
            if test_vec[i][j] > 0:
                test_vec[i][j] = bdc[j]
    test_id = list(test_file['content_id'])
    X, test, y, test_id, y1 = train_vec, test_vec, value_list, test_id, subject_list
    N = 10
    res = open('res2.txt', 'w')
    # kf = StratifiedKFold(n_splits=N, random_state=2018).split(X, y)
    for i in range(10):
        subject_oh = y1.copy()
        for l in range(len(subject_oh)):
            if subject_oh[l] != i:
                subject_oh[l] = 0
            else:
                subject_oh[l] = 1
        params = {
            'boosting_type': 'gbdt',
            'num_leaves': 55,
            'reg_alpha': 0.1,
            'reg_lambda': 1,
            'max_depth': 15,
            'objective': 'binary',
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'subsample_freq': 1,
            'learning_rate': 0.06,
            'min_child_weight': 1,
            'random_state': 20,
            'n_jobs': 4
        }

        data_train = lgb.Dataset(X, subject_oh)
        clf = lgb.cv(params,
                     data_train,
                     num_boost_round=10000,
                     nfold=5,
                     stratified=False,
                     shuffle=True,
                     metrics='rmse',
                     early_stopping_rounds=50,
                     verbose_eval=50,
                     show_stdv=True,
                     seed=0)
        res.write(str(len(clf['rmse-mean'])))
        res.write(' ')
        res.write(str(clf['rmse-mean'][-1]))
        res.write('\n')
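lgb.cv returns a dict of per-round metric lists, so len(clf['rmse-mean']) is the early-stopped round count written to res2.txt; a typical follow-up (sketch) refits on all of the data with that round count:

# Refit a final model with the round count found by cross-validation.
best_rounds = len(clf['rmse-mean'])
final_model = lgb.train(params, data_train, num_boost_round=best_rounds)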
Example 9
def run_base_bdc():
    train_vec = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/content_vec_withoutD.csv',
        header=None)
    test_file = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/test_public.csv')

    # train_vec_sentiment = pd.read_csv('../content_vec_sentiment.csv', header=None)
    train_vec = np.array(train_vec)
    # train_vec_sentiment = np.array(train_vec_sentiment)
    data = pd.read_csv(
        '/home/hujoe/PycharmProjects/df-2018-NLP/data/train.csv')
    subject_vocab = list(
        ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观'])

    subject_list = list()
    for i in data['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break
    subject_list = np.array(subject_list)

    value_list = list()
    for i in data['sentiment_value']:
        value_list.append(i)
    value_list = np.array(value_list)

    bdc = Bdc.cal_bdc(train_vec, subject_list, 10)
    for i in range(train_vec.shape[0]):
        for j in range(train_vec.shape[1]):
            if train_vec[i][j] > 0:
                train_vec[i][j] = bdc[j]

    print(train_vec)
    test_vec = Doc2Vec.test2vec()
    for i in range(test_vec.shape[0]):
        for j in range(test_vec.shape[1]):
            if test_vec[i][j] > 0:
                test_vec[i][j] = bdc[j]

    print(test_vec)
    test_id = list(test_file['content_id'])

    N = 10
    kf = StratifiedKFold(n_splits=N,
                         random_state=2018).split(train_vec, subject_list)

    clf = lgb.LGBMClassifier(boosting_type='gbdt',
                             num_leaves=80,
                             reg_alpha=0.1,
                             reg_lambda=1,
                             max_depth=8,
                             n_estimators=500,
                             objective='binary',
                             subsample=0.8,
                             colsample_bytree=0.8,
                             subsample_freq=1,
                             learning_rate=0.06,
                             min_child_weight=1,
                             random_state=20,
                             n_jobs=4)
    clf_1 = lgb.LGBMClassifier(boosting_type='gbdt',
                               num_leaves=80,
                               reg_alpha=0.1,
                               reg_lambda=1,
                               max_depth=8,
                               n_estimators=10,
                               objective='binary',
                               subsample=0.8,
                               colsample_bytree=0.8,
                               subsample_freq=1,
                               learning_rate=0.06,
                               min_child_weight=1,
                               random_state=20,
                               n_jobs=4)
    y_train_oofp = np.zeros_like(subject_list, dtype='float64')
    y_train_oofp1 = np.zeros_like(subject_list, dtype='float64')
    '''
    y_train_oofp: out-of-fold predictions for sentiment_value on the training set
    y_train_oofp1: out-of-fold predictions for subject on the training set
    '''

    y_test_oofp = np.zeros((test_vec.shape[0], N))
    y_test_oofp_1 = np.zeros((test_vec.shape[0], N))

    acc = 0
    vcc = 0

    l = 0
    ll = 0
    for i, (train_fold, test_fold) in enumerate(kf):
        X_train, X_validate, label_train, label_validate, label_1_train, label_1_validate, = \
        train_vec[train_fold, :], train_vec[test_fold,:], value_list[train_fold], value_list[test_fold], subject_list[train_fold], subject_list[test_fold]
        clf.fit(X_train, label_train)

        val_ = clf.predict(X_validate)
        y_train_oofp[test_fold] = val_
        if micro_avg_f1(label_validate, val_) > 0.7:
            l += 1
            print('sentiment_value_f1:%f' % micro_avg_f1(label_validate, val_))
            acc += micro_avg_f1(label_validate, val_)
            result = clf.predict(test_vec)
            y_test_oofp[:, i] = result

        # clf = svm.LinearSVC(loss='hinge', tol=1e-4, C=0.6)

        clf_1.fit(X_train, label_1_train)
        val_1 = clf_1.predict(X_validate)
        y_train_oofp1[test_fold] = val_1

        if micro_avg_f1(label_1_validate, val_1) > 0.6:
            ll += 1
            vcc += micro_avg_f1(label_1_validate, val_1)
            result = clf_1.predict(test_vec)
            y_test_oofp_1[:, i] = result

    print(acc / l)
    print(vcc / ll)

    lbl = pk.load(open('../tmp/label_encoder.sav', 'rb'))
    res_2 = []
    for i in range(y_test_oofp_1.shape[0]):
        tmp = []
        for j in range(N):
            tmp.append(int(y_test_oofp_1[i][j]))
        word_counts = Counter(tmp)
        yes = word_counts.most_common(1)
        res_2.append(lbl.inverse_transform([yes[0][0]])[0])

    res = []
    for i in range(y_test_oofp.shape[0]):
        tmp = []
        for j in range(N):
            tmp.append(y_test_oofp[i][j])
        res.append(max(set(tmp), key=tmp.count))

    result = pd.DataFrame()
    result['content_id'] = list(test_id)

    result['subject'] = list(res_2)
    result['subject'] = result['subject']

    result['sentiment_value'] = list(res)
    result['sentiment_value'] = result['sentiment_value'].astype(int)

    result['sentiment_word'] = ''
    result.to_csv('../submit_bdc.csv', index=False)
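The per-row loops before writing submit_bdc.csv implement simple majority voting over the N fold predictions; the same idea in a compact illustration with Counter:

from collections import Counter

fold_preds = [3, 3, 7, 3, 1]   # predictions for one test row across the folds
majority = Counter(fold_preds).most_common(1)[0][0]
print(majority)                # -> 3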
Example 10
print('done reading data')
# In[4]:


#prepare models
#for Doc2Vec
wind_size = 15
embedding_dim = 300
min_count = 5

models = {}

models["TF-IDF"]     = {"model": TfIdfRetrieval(docs), "results": {}, "metrics": {}}
# models["word2vec"]   = {"model": ..., "results": {}, "metrics": {}}
models["doc2vec"]    = {"model": Doc2Vec(docs, wind_size, embedding_dim, min_count=min_count), "results": {}, "metrics": {}}
# models["LSI-BoW"]    = {"model": ..., "results": {}, "metrics": {}}
# models["LSI-TF-IDF"] = {"model": ..., "results": {}, "metrics": {}}
# models["LDA"]        = {"model": ..., "results": {}, "metrics": {}}


# In[5]:


#run each model for each query

for qid in qrels: 
    query_text = queries[qid]

    #this might be slightly different for each model
    models["TF-IDF"]["results"][qid] = dict(models["TF-IDF"]["model"].search(query_text))
Example 11
tfidf_vectorizer = generate_tfidf_vectorizer(corpus)

logger.info("Codificando y...")
X = np.array(corpus.data.acordao)
l_enc = preprocessing.LabelEncoder()
y = l_enc.fit_transform(np.array(corpus.data.relator.tolist()))

logger.info("Dividindo dados para treinamento e teste...")
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

embedders = [
    tfidf_vectorizer,
    Doc2Vec(model_pretrained),
    Doc2Vec(model_trained)
]

embedders_map = {0: 'TFIDF', 1: 'PRETRAINED', 2: 'TRAINED'}

logger.info("Pré-codificando dados em cada modelo de vetorização...")
try:
    train_encoded_data = pickle.load(open("train_enc.pickle", "rb"))
    test_encoded_data = pickle.load(open("test_enc.pickle", "rb"))
except Exception:
    train_encoded_data = []
    test_encoded_data = []
    for embedder in tqdm(embedders):
        X_train_enc = embedder.transform(X_train)
        train_encoded_data.append(X_train_enc)
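This excerpt ends inside the re-encoding loop; a natural continuation (sketch, matching the cache filenames read in the try block above) would also encode the test split and write both caches so the next run takes the pickle.load fast path:

        # Encode the test split with the same embedder.
        X_test_enc = embedder.transform(X_test)
        test_encoded_data.append(X_test_enc)
    # Cache both encoded splits for subsequent runs.
    pickle.dump(train_encoded_data, open("train_enc.pickle", "wb"))
    pickle.dump(test_encoded_data, open("test_enc.pickle", "wb"))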
Example 12
    d2v.get_doc_vecs(docs_by_id)
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = d2v.search(query_text)
        overall_ser[qid] = dict(results)
    with open("d2v_windsize_"+str(wind_size)+".json", "w") as writer:
        json.dump(overall_ser, writer, indent=1)

for vocab_size in vocab_sizes:
    overall_ser = {}
    d2v = Doc2Vec(docs_by_id, wind_size_def, vec_dim_def, vocab_size)
    d2v.get_doc_vecs(docs_by_id)
    for qid in tqdm(qrels):
        query_text = queries[qid]
        results = d2v.search(query_text)
        overall_ser[qid] = dict(results)
    with open("d2v_vocabsize_"+str(vocab_size)+".json", "w") as writer:
        json.dump(overall_ser, writer, indent=1)
"""
wind_size = 15
vec_dim = 200
vocab = 50
overall_ser = {}
d2v = Doc2Vec(docs_by_id, wind_size, vec_dim, vocab)
d2v.get_doc_vecs(docs_by_id)
for qid in tqdm(qrels):
    query_text = queries[qid]
    results = d2v.search(query_text)
    overall_ser[qid] = dict(results)
with open("d2v_vecdim_" + str(vec_dim) + ".json", "w") as writer:
    json.dump(overall_ser, writer, indent=1)
Example 13
from sklearn import svm
import lightgbm as lgb
import numpy as np
import pandas as pd

from bdc import Bdc
from doc2vec import Doc2Vec
from getResult import GetResult

if __name__ == '__main__':
    res = pd.read_csv('../tmp/baseline.csv')
    train = pd.read_csv('../data/train.csv')
    train_vec = pd.read_csv('../content_vec_withoutD.csv', header=None)
    train_vec = np.array(train_vec)
    test_vec = Doc2Vec.test2vec()

    value_list = list(train['sentiment_value'])

    subject_vocab = list(
        ['价格', '配置', '操控', '舒适性', '油耗', '动力', '内饰', '安全性', '空间', '外观'])

    subject_list = list()
    for i in train['subject']:
        for k in range(10):
            if subject_vocab[k] == i:
                subject_list.append(k)
                break

    predict_subject = list()
    for i in res['subject']:
        for k in range(10):
Example 14
    save_path = 'models/docs_model'
    data_files = 'trabajos/*.txt'

    saved_model = glob.glob(save_path+'/checkpoint')
    restore_model = continue_training = False
    if len(saved_model)>0:
        answer = input('Do you want to (t)rain, (r)estore, or (c)ontinue training a model?')
        c = answer[0].lower()
        restore_model = (c == 'r')
        continue_training = (c == 'c')

    docs = docs_from_path(data_files)

    if restore_model or continue_training:
        print('Restoring a saved model...')
        d2v = Doc2Vec.restore(save_path + '/model.ckpt')
    else:  # train a new model from scratch
        d2v = Doc2Vec(vocabulary_size=vocabulary_size,
                      document_size=len(docs),
                      embedding_size_d=64,
                      embedding_size_w=64,
                      learning_rate=0.1,
                      n_steps=100001)

    if not restore_model:
        if continue_training:
            steps = input('How many steps? (%d)'%d2v.n_steps)
            if len(steps.strip()) != 0:
                d2v.n_steps = int(steps)
        d2v.fit(docs, continue_training=continue_training)
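After fitting, the model would normally be written back to the checkpoint location that the restore branch reads; a sketch mirroring the save() call used in the earlier examples, with the exact path argument being an assumption:

        # Hypothetical: persist the (re)trained model; the path argument should
        # match whatever Doc2Vec.restore() expects above.
        d2v.save(save_path)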