def _dump_pickle(obj, path):
    """Pickle *obj* to *path*, closing the file handle deterministically."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)


def main(args):
    """Score text with the encoder model and pickle the sentiment values.

    Args:
        args: Namespace-like object. Reads ``args.input`` (path passed to
            ``pd.read_csv`` and, in ``'text'`` mode, also opened as a plain
            text file), ``args.n`` (run mode: ``'test'``, ``'all'`` or
            ``'text'``) and ``args.path`` (forwarded to ``Model`` as
            ``root_path``).

    Any other value of ``args.n`` only loads and previews the CSV.
    """
    articles = pd.read_csv(args.input)
    print(articles.head())

    if args.n == 'test':
        # Label-based slice: keeps index labels up to and including 3.
        # NOTE(review): nothing is scored in this mode -- confirm intended.
        articles = articles.loc[:3]
    elif args.n == 'all':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        results = []
        # Series.items() replaces iteritems(), which pandas 2.0 removed.
        for i, text in articles["text_en"].items():
            print("start transforming text")
            # Run LSTM model to predict final hidden units' values
            text_features = model.transform(text)
            print("text transformed")
            # Extract content from sentiment hidden unit 2388
            results.append(text_features[:, 2388])
            print(f"text {i} analyzed")
            # NOTE(review): indentation was reconstructed from a
            # whitespace-collapsed source; this dump is assumed to be a
            # per-article progress checkpoint -- confirm.
            _dump_pickle(
                results, "../data/sentiment_analysis_scores_test.pkl")
        _dump_pickle(results, "../data/sentiment_analysis_scores.pkl")
    elif args.n == 'text':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        with open(args.input, "r") as myfile:
            text = myfile.readlines()
        text_features = model.transform(text)
        _dump_pickle(text_features[:, 2388],
                     "../data/sentiment_analysis_scores_text.pkl")
def prepare_model():
    """Lazily populate the module-level ``encoder`` and ``model`` globals.

    ``encoder`` is built from ``encoder.Model`` when it is still ``None``.
    ``model`` is loaded with joblib from a path derived from
    ``config.CODE_DIR`` and ``cn.data_name`` when that file exists;
    otherwise it is trained on the concatenation of the base training and
    pool frames, evaluated on the validation frame, and saved to that path.
    """
    global model, encoder

    if encoder is None:
        from encoder import Model
        encoder = Model()

    if model is None:
        from ResearchNLP import Constants as cn
        import config

        model_path = (config.CODE_DIR
                      + 'prediction_models/SOTA_sentiment_library/model/'
                      + cn.data_name + '.pkl')
        if os.path.exists(model_path):
            model = joblib.load(model_path)
        else:
            # load model and save it
            from ResearchNLP.prediction_models.SOTA_sentiment_library.utils import train_model, test_model
            import pandas as pd

            train_df = pd.concat([cn.base_training_df, cn.pool_df])
            trX = train_df[cn.col_names.text].values
            trY = train_df[cn.col_names.tag].values
            tstX = cn.validation_data_df[cn.col_names.text].values
            tstY = cn.validation_data_df[cn.col_names.tag].values

            trXt = encoder.transform(trX)
            tstXt = encoder.transform(tstX)

            model = train_model(trXt, trY, tstXt, tstY)  # train on all data
            # Call form of print: valid on Python 3 and, with a single
            # argument, prints the same thing on Python 2.
            print(test_model(model, tstXt, tstY))
            joblib.dump(model, model_path)
from encoder import Model
import numpy as np
import pickle
# from sklearn.decomposition import IncrementalPCA

# Encode every line of the test set with the encoder model and store the
# resulting feature matrix as a .npy file.

# Read through a context manager so the file handle is closed promptly.
with open("../data/twitter-datasets/test_data.txt", "r",
          encoding='utf8') as test_file:
    test_data = test_file.readlines()
print(np.shape(test_data))

model = Model()
# ipca = IncrementalPCA(n_components=500)

# Strip surrounding whitespace (including the trailing newline) per line.
x = [s.strip() for s in test_data]
x_text = list(x)
print(np.shape(x_text))

x = model.transform(x_text)
np.save("/mnt/ds3lab/tifreaa/openai_features/test_X.npy", x)
print(np.shape(x))
# ipca.partial_fit(x)
# pickle.dump(ipca, open("pca", 'wb'))
va_num = 2000

# Training set
# Per-language presets, in the order
# (max_length, load_path, language, tr_num, va_num).
_language_presets = {
    'c': (100, 'data/chinese', 'chinese', 17000, 2000),
    'e': (40, 'data/english', 'english', 8000, 600),
}
if languageType in _language_presets:
    (max_length, load_path, language,
     tr_num, va_num) = _language_presets[languageType]

model = Model(max_length)
# Fetch all sentences and their labels separately.
all_data = sst_binary(load_path)
print('=> Succeeds in loading <' + language +
      '> file and starting to translate words into Embeddedness······')

# Convert the words of each sentence into word-frequency index values.
x, y, wi = model.transform(all_data)
print(
    '=> Succeeds in translating swords into word Embeddedness and starting to train the model process······'
)

# Train the model (comment this line out if a trained model already exists).
accuracy = model_train(x, y, wi, language, max_length, tr_num, va_num)
print('=> accuracy: ', accuracy * 100, '%')

# If the model is already trained, call this to load it directly instead of
# training again.
# model_load(language)
from encoder import Model
from matplotlib import pyplot as plt
from utils import sst_binary, train_with_reg_cv
import numpy as np
import os
from sklearn import svm, metrics
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout

# Encode the train / validation / test splits with the encoder model and
# cache the encoded features on disk so later runs can skip the transform.
model = Model('./model/0/model.npy')
trX, vaX, teX, trY, vaY, teY = sst_binary()

feature_dir = 'features/labelleddata'
feature_files = [
    'features/labelleddata/trXt.npy',
    'features/labelleddata/vaXt.npy',
    'features/labelleddata/teXt.npy',
]

# Use the cache only when all three files are present. Checking just the
# directory would make every run after an interrupted one fail in np.load,
# because the directory is created before the transforms run.
if all(os.path.exists(path) for path in feature_files):
    print('load features')
    trXt = np.load('features/labelleddata/trXt.npy')
    vaXt = np.load('features/labelleddata/vaXt.npy')
    teXt = np.load('features/labelleddata/teXt.npy')
else:
    if not os.path.isdir(feature_dir):
        os.makedirs(feature_dir)
    trXt = model.transform(trX)
    vaXt = model.transform(vaX)
    teXt = model.transform(teX)
    print(trXt.shape)
    # np.save appends the ".npy" suffix to these names.
    np.save('features/labelleddata/trXt', trXt)
    np.save('features/labelleddata/vaXt', vaXt)
    np.save('features/labelleddata/teXt', teXt)
from encoder import Model

# Demo: encode a handful of sentences and print each one next to the value
# found in column 2388 of its feature row.
mdl = Model()

text = [
    'it was a nice day',
    'it was a great day',
    'it was a bad day',
    'It was a wonderful day',
    'It was an excellent day',
    'It was a super excellent day',
    'It was such a bad bad day ',
    'It was such a bad bad bad day',
]

text_features = mdl.transform(text)

for i, sentence in enumerate(text):
    sentiment = text_features[i, 2388]
    print(sentence, sentiment)
import pandas as pd
from encoder import Model
#from utils import sst_binary

# Score every message of the raw CSV with the encoder model and collect the
# scores next to the messages in a DataFrame.
sentiment_model = Model()
data_token = pd.read_csv(
    "C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/raw.csv"
)
samples = list(data_token['message'])

sent = []
for sublist in samples:
    # The model is called with a one-element list per message.
    subsample = [sublist]
    text_features = sentiment_model.transform(subsample)
    # Column 2388 of the feature matrix is used as the sentiment score.
    sentiment_scores = text_features[:, 2388]
    sent.append(sentiment_scores)

# Use the whole message column rather than a hard-coded 0:3847 slice: the
# score list always has one entry per message, so the two columns stay the
# same length for any input size.
result = pd.DataFrame(data={
    "sentiment": sent,
    "message": data_token['message']
})
#data_token['sentiment_scores'] = sentiment_scores
#data_token.to_csv('C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/openaiscore.csv')
from encoder import Model
from matplotlib import pyplot as plt
from utils import sst_binary, train_with_reg_cv
import numpy as np
import os

# Encode the three data splits (cached under features/), then fit and
# report the regularised classifier from utils.train_with_reg_cv.
model = Model('./model/994/model.npy')
trX, vaX, teX, trY, vaY, teY = sst_binary()

if not os.path.exists('features'):
    os.makedirs('features')
    trXt = model.transform(trX)
    vaXt = model.transform(vaX)
    teXt = model.transform(teX)
    # Save each split under its own name. Previously trXt was written to all
    # three files, so cached runs evaluated on training features.
    # NOTE(review): delete any features/ directory produced by the old code.
    np.save('features/trXt', trXt)
    np.save('features/vaXt', vaXt)
    np.save('features/teXt', teXt)
else:
    trXt = np.load('features/trXt.npy')
    vaXt = np.load('features/vaXt.npy')
    teXt = np.load('features/teXt.npy')

full_rep_acc, c, nnotzero, coef, lg_model = train_with_reg_cv(
    trXt, trY, vaXt, vaY, teXt, teY)
print('%05.2f test accuracy' % full_rep_acc)
print('%05.2f regularization coef' % c)
print('%05d features used' % nnotzero)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from encoder import Model

base_dir = '/var/data/mlstm/'
model = Model(base_dir + 'models/model.npy')


def load_sst(path):
    """Read a CSV and return its sentences and labels.

    Args:
        path: location of a UTF-8 CSV file with ``sentence`` and ``label``
            columns.

    Returns:
        A ``(X, Y)`` pair: ``X`` is the ``sentence`` column as a Python list,
        ``Y`` is the ``label`` column as an array.
    """
    data = pd.read_csv(path, encoding='utf-8')
    X = data['sentence'].values.tolist()
    Y = data['label'].values
    return X, Y


trX, trY = load_sst('./data/train.csv')
teX, teY = load_sst('./data/test.csv')

# Call form of print: valid on Python 3 and, with a single argument, prints
# the same thing on Python 2.
print(trX[0])
print(trY[0])
print('loading features...')
if not os.path.exists(base_dir + 'features'):
    os.makedirs(base_dir + 'features')
    trXt = model.transform(trX)
import random
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.utils.np_utils import to_categorical
from keras import backend as K
import os
from keras.models import model_from_json
import sys
from dataSource import DataSource
from encoder import Model

#x=np.load('X.npy')
#y=np.load('Y.npy' )
#y = to_categorical(y)

# Encoder instance created once, when this module is executed.
encoder = Model()


def log(*s):
    """Print the "TRAIN_OPENAI:" tag followed by the tuple of arguments.

    The arguments are passed to ``print`` as one tuple, not unpacked.
    """
    print("TRAIN_OPENAI:", s)


def baseline_model():
    """Build and return the baseline network.

    The network is a ``Sequential`` stack: a 4096-unit dense layer on
    4096-wide input with ReLU, followed by a 2-unit dense layer with softmax.
    Both dense layers use the ``lecun_uniform`` initialiser. The model is
    returned without being compiled here.
    """
    # create model
    model = Sequential()
    model.add(Dense(4096, input_shape=(4096,), init='lecun_uniform'))
    model.add(Activation('relu'))
    model.add(Dense(2, init='lecun_uniform'))
    model.add(Activation('softmax'))
    return model


# NOTE(review): the body of this definition lies beyond the visible excerpt.
def new_baseline_model():
def load_model():
    """Construct a ``Model`` while ``src/grds`` is the working directory.

    The working directory is switched only for the duration of the
    constructor call and is always put back, even when ``Model()`` raises.
    Presumably ``Model`` resolves its files relative to the working
    directory; verify against its implementation.

    Returns:
        The newly constructed ``Model`` instance.

    Raises:
        OSError: if ``src/grds`` cannot be entered from the current
            directory.
    """
    # Remember the absolute starting point instead of relying on '../..',
    # which lands elsewhere if 'src/grds' is reached through a symlink.
    previous_dir = os.getcwd()
    os.chdir('src/grds')
    try:
        return Model()
    finally:
        os.chdir(previous_dir)
def __init__(self):
    """Announce the load on stdout, then build a Model and keep it on self."""
    label = self.NAME
    print(label, "Loading Model")
    loaded = Model()
    self.model = loaded