Example n. 1

import pickle

import pandas as pd

from encoder import Model


def main(args):

    articles = pd.read_csv(args.input)
    print(articles.head())
    if args.n == 'test':
        articles = articles.loc[:3]

    elif args.n == 'all':
        # Initialize an instance of the model
        model = Model(root_path=args.path)

        results = []
        for i, text in articles["text_en"].items():  # .iteritems() was removed in pandas 2.0
            print("start transforming text")
            # Run LSTM model to predict final hidden units' values
            text_features = model.transform(text)
            print("text transformed")
            # Extract content from sentiment hidden unit 2388
            results.append(text_features[:, 2388])
            print(f"text {i} analyzed")
            # Checkpoint the partial results after every article
            with open("../data/sentiment_analysis_scores_test.pkl", "wb") as f:
                pickle.dump(results, f)

        with open("../data/sentiment_analysis_scores.pkl", "wb") as f:
            pickle.dump(results, f)

    elif args.n == 'text':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        with open(args.input, "r") as myfile:
            text = myfile.readlines()
        text_features = model.transform(text)
        with open("../data/sentiment_analysis_scores_text.pkl", "wb") as f:
            pickle.dump(text_features[:, 2388], f)
Example n. 2
def prepare_model():
    global model, encoder
    if encoder is None:
        from encoder import Model
        encoder = Model()
    if model is None:
        from ResearchNLP import Constants as cn
        import config
        model_path = config.CODE_DIR + 'prediction_models/SOTA_sentiment_library/model/' + cn.data_name + '.pkl'

        if os.path.exists(model_path):
            model = joblib.load(model_path)
        else:  # load model and save it
            from ResearchNLP.prediction_models.SOTA_sentiment_library.utils import train_model, test_model
            import pandas as pd
            train_df = pd.concat([cn.base_training_df, cn.pool_df])
            trX = train_df[cn.col_names.text].values
            trY = train_df[cn.col_names.tag].values
            tstX = cn.validation_data_df[cn.col_names.text].values
            tstY = cn.validation_data_df[cn.col_names.tag].values
            trXt = encoder.transform(trX)
            tstXt = encoder.transform(tstX)
            model = train_model(trXt, trY, tstXt, tstY)  # train on all data
            print(test_model(model, tstXt, tstY))
            joblib.dump(model, model_path)
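# Once prepare_model() has run, scoring new text takes two steps; a minimal
# sketch, assuming train_model returned a scikit-learn-style estimator:
def predict_sentiment(texts):
    prepare_model()
    features = encoder.transform(texts)  # mLSTM feature vectors
    return model.predict(features)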
Example n. 3

from encoder import Model
import numpy as np
import pickle
# from sklearn.decomposition import IncrementalPCA

with open("../data/twitter-datasets/test_data.txt", "r", encoding="utf8") as f:
    test_data = f.readlines()
print(np.shape(test_data))
model = Model()
# ipca = IncrementalPCA(n_components=500)

x_text = [s.strip() for s in test_data]  # one tweet per line
print(np.shape(x_text))
x = model.transform(x_text)
np.save("/mnt/ds3lab/tifreaa/openai_features/test_X.npy", x)
print(np.shape(x))
#     ipca.partial_fit(x)

# pickle.dump(ipca, open("pca", 'wb'))
Example n. 4
va_num = 2000  # default validation-set size

if languageType == 'c':
    max_length = 100
    load_path = 'data/chinese'
    language = 'chinese'
    tr_num = 17000
    va_num = 2000
elif languageType == 'e':
    max_length = 40
    load_path = 'data/english'
    language = 'english'
    tr_num = 8000
    va_num = 600

model = Model(max_length)

all_data = sst_binary(load_path)  # load all sentences and their labels
print('=> Succeeded in loading the <' + language +
      '> file; translating words into embeddings...')

x, y, wi = model.transform(all_data)  # map each word in every sentence to its vocabulary index
print('=> Succeeded in translating words into embeddings; starting to train the model...')

accuracy = model_train(x, y, wi, language, max_length, tr_num,
                       va_num)  # train the model (comment this line out if a trained model already exists)
print('=> accuracy: ', accuracy * 100, '%')

# model_load(language)  # if the model is already trained, call this to load it directly instead of retraining
Example n. 5
from encoder import Model
from matplotlib import pyplot as plt
from utils import sst_binary, train_with_reg_cv
import numpy as np
import os
from sklearn import svm, metrics
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout

model = Model('./model/0/model.npy')

trX, vaX, teX, trY, vaY, teY = sst_binary()

if not os.path.exists('features/labelleddata'):
    os.makedirs('features/labelleddata')

    trXt = model.transform(trX)
    vaXt = model.transform(vaX)
    teXt = model.transform(teX)
    print(trXt.shape)

    np.save('features/labelleddata/trXt', trXt)
    np.save('features/labelleddata/vaXt', vaXt)
    np.save('features/labelleddata/teXt', teXt)

else:
    print('load features')
    trXt = np.load('features/labelleddata/trXt.npy')
    vaXt = np.load('features/labelleddata/vaXt.npy')
    teXt = np.load('features/labelleddata/teXt.npy')
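# The directory-existence check above recomputes features only on the very
# first run, and raises if the .npy files were later deleted; a sketch of a
# variant keyed off the files themselves (same layout assumed):
def cached_transform(name, data):
    path = os.path.join('features/labelleddata', name + '.npy')
    if os.path.isfile(path):
        return np.load(path)
    feats = model.transform(data)
    np.save(path, feats)
    return feats

trXt = cached_transform('trXt', trX)
vaXt = cached_transform('vaXt', vaX)
teXt = cached_transform('teXt', teX)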
Example n. 6

from encoder import Model

mdl = Model()

text = [
    'it was a nice day', 'it was a great day', 'it was a bad day',
    'It was a wonderful day', 'It was an excellent day',
    'It was a super excellent day', 'It was such a bad bad day ',
    'It was such a bad bad bad day'
]
text_features = mdl.transform(text)
for i in range(len(text)):
    sentiment = text_features[i, 2388]
    print(text[i], sentiment)
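# Unit 2388's activation behaves as a signed sentiment score. A thresholding
# sketch (the 0.0 cutoff is an assumption, not from the source):
for sentence, score in zip(text, text_features[:, 2388]):
    label = 'positive' if score > 0 else 'negative'
    print(f'{label:>8}  {score:+.3f}  {sentence}')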
Example n. 7

import pandas as pd
from encoder import Model
#from utils import sst_binary

sentiment_model = Model()

data_token = pd.read_csv(
    "C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/raw.csv"
)

#samples1 = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#s = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#samples1 = sst_binary()
sent = []
for message in data_token['message']:
    # Encode one message at a time and keep the sentiment unit's activation.
    text_features = sentiment_model.transform([message])
    sent.append(text_features[0, 2388])
result = pd.DataFrame(data={
    "sentiment": sent,
    "message": data_token['message']
})

#data_token['sentiment_scores'] = sentiment_scores

#data_token.to_csv('C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/openaiscore.csv')
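# Transforming one message per call is slow; Model.transform accepts a whole
# list (see Examples 3 and 6), so the column can be encoded in a single pass.
# A sketch:
text_features = sentiment_model.transform(list(data_token['message']))
result = pd.DataFrame(data={
    "sentiment": text_features[:, 2388],
    "message": data_token['message'],
})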
Example n. 8

from encoder import Model
from matplotlib import pyplot as plt
from utils import sst_binary, train_with_reg_cv
import numpy as np
import os

model = Model('./model/994/model.npy')

trX, vaX, teX, trY, vaY, teY = sst_binary()

if not os.path.exists('features'):
    os.makedirs('features')

    trXt = model.transform(trX)
    vaXt = model.transform(vaX)
    teXt = model.transform(teX)

    np.save('features/trXt', trXt)
    np.save('features/vaXt', vaXt)
    np.save('features/teXt', teXt)

else:
    trXt = np.load('features/trXt.npy')
    vaXt = np.load('features/vaXt.npy')
    teXt = np.load('features/teXt.npy')

full_rep_acc, c, nnotzero, coef, lg_model = train_with_reg_cv(
    trXt, trY, vaXt, vaY, teXt, teY)
print('%05.2f test accuracy' % full_rep_acc)
print('%05.2f regularization coef' % c)
print('%05d features used' % nnotzero)
Example n. 9
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

from encoder import Model


base_dir = '/var/data/mlstm/'

model = Model(base_dir + 'models/model.npy')

def load_sst(path):
    data = pd.read_csv(path, encoding='utf-8')
    X = data['sentence'].values.tolist()
    Y = data['label'].values
    return X, Y


trX, trY = load_sst('./data/train.csv')
teX, teY = load_sst('./data/test.csv')
print(trX[0])
print(trY[0])

print('loading features...')

if not os.path.exists(base_dir + 'features'):
    os.makedirs(base_dir + 'features')   
    trXt = model.transform(trX)
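    # The listing cuts off here; following the caching pattern of Examples 5
    # and 8, the continuation would presumably look like this sketch:
    teXt = model.transform(teX)
    np.save(base_dir + 'features/trXt', trXt)
    np.save(base_dir + 'features/teXt', teXt)
else:
    trXt = np.load(base_dir + 'features/trXt.npy')
    teXt = np.load(base_dir + 'features/teXt.npy')

# Fit a simple classifier on the cached features (LogisticRegression is
# already imported above); this completion is an assumption, not the source.
clf = LogisticRegression()
clf.fit(trXt, trY)
print('test accuracy:', clf.score(teXt, teY))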
Example n. 10

import random
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from keras import backend as K
import os
from keras.models import model_from_json
import sys
from dataSource import DataSource
from encoder import Model

#x=np.load('X.npy')
#y=np.load('Y.npy' )
#y = to_categorical(y)
encoder = Model()

def log(*s):
    print("TRAIN_OPENAI:", *s)

def baseline_model():
    # Two-layer softmax classifier over the 4096-d mLSTM feature vector
    model = Sequential()
    model.add(Dense(4096, input_shape=(4096,), kernel_initializer='lecun_uniform'))
    model.add(Activation('relu'))
    model.add(Dense(2, kernel_initializer='lecun_uniform'))
    model.add(Activation('softmax'))

    return model

def new_baseline_model():
    ...
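# A minimal sketch (not from the source) of wiring baseline_model() to the
# encoder features; the sample texts, labels, and fit settings are assumptions.
clf = baseline_model()
clf.compile(optimizer=Adam(lr=1e-4), loss='categorical_crossentropy',
            metrics=['accuracy'])
X = encoder.transform(['a great movie', 'a terrible movie'])  # 4096-d features
Y = to_categorical([1, 0], num_classes=2)
clf.fit(X, Y, epochs=10, batch_size=4)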
Example n. 11
def load_model():
    # Hop into the package directory so Model() can resolve its weight files
    # via relative paths, then restore the previous working directory.
    os.chdir('src/grds')
    model = Model()
    os.chdir('../..')
    return model
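# The chdir dance above leaves the process in src/grds if Model() raises; on
# Python 3.11+, contextlib.chdir restores the directory automatically. A sketch:
import contextlib

def load_model_safely():
    with contextlib.chdir('src/grds'):
        return Model()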
Example n. 12
def __init__(self):
    print(self.NAME, "Loading Model")
    self.model = Model()