import pickle

import pandas as pd

from encoder import Model  # assumed import; the snippet does not show where Model comes from


def main(args):
    articles = pd.read_csv(args.input)
    print(articles.head())
    if args.n == 'test':
        # Keep only the first few articles for a quick test run
        articles = articles.loc[:3]
    elif args.n == 'all':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        results = []
        for i, text in articles["text_en"].items():  # iteritems() was removed in pandas 2.0
            print("start transforming text")
            # Run LSTM model to predict final hidden units' values
            text_features = model.transform(text)
            print("text transformed")
            # Extract content from sentiment hidden unit 2388
            results.append(text_features[:, 2388])
            print(f"text {i} analyzed")
        pickle.dump(results,
                    open("../data/sentiment_analysis_scores_test.pkl", "wb"))
        pickle.dump(results,
                    open("../data/sentiment_analysis_scores.pkl", "wb"))
    elif args.n == 'text':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        with open(args.input, "r") as myfile:
            text = myfile.readlines()
        text_features = model.transform(text)
        pickle.dump(text_features[:, 2388],
                    open("../data/sentiment_analysis_scores_text.pkl", "wb"))
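# `main` expects an argparse namespace with `input`, `n`, and `path`
# attributes, but the snippet omits the entry point. A minimal sketch,
# assuming those flag names:
from argparse import ArgumentParser

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--input", type=str, help="CSV of articles (or a text file when --n is 'text')")
    parser.add_argument("--n", type=str, default='test', help="one of 'test', 'all', 'text'")
    parser.add_argument("--path", type=str, help="root path handed to Model")
    main(parser.parse_args())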
import os

import joblib

# Module-level state filled lazily by prepare_model()
model = None
encoder = None


def prepare_model():
    global model, encoder
    if encoder is None:
        from encoder import Model
        encoder = Model()
    if model is None:
        from ResearchNLP import Constants as cn
        import config
        model_path = config.CODE_DIR + 'prediction_models/SOTA_sentiment_library/model/' + cn.data_name + '.pkl'
        if os.path.exists(model_path):
            model = joblib.load(model_path)
        else:
            # Train the model, then save it for reuse
            from ResearchNLP.prediction_models.SOTA_sentiment_library.utils import train_model, test_model
            import pandas as pd
            train_df = pd.concat([cn.base_training_df, cn.pool_df])
            trX = train_df[cn.col_names.text].values
            trY = train_df[cn.col_names.tag].values
            tstX = cn.validation_data_df[cn.col_names.text].values
            tstY = cn.validation_data_df[cn.col_names.tag].values
            # Encode the texts into feature vectors before training
            trXt = encoder.transform(trX)
            tstXt = encoder.transform(tstX)
            model = train_model(trXt, trY, tstXt, tstY)  # train on all data
            print(test_model(model, tstXt, tstY))
            joblib.dump(model, model_path)
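# Hedged usage sketch: after prepare_model() runs, `encoder` embeds raw
# texts and `model` is whatever train_model returned, assumed here to
# expose a scikit-learn-style predict():
prepare_model()
features = encoder.transform(["the screenplay and the directing were horrendous"])
print(model.predict(features))  # assumption: the trained model is sklearn-like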
from decimal import Decimal

from encoder import Model  # assumed import; the snippet does not show where Model comes from


class ModelInterface:
    NAME = "VG16." + __name__
    _testData = [
        'too bad!',
        'it was so cool, beautiful',
        'the screenplay and the directing were horrendous',
        'best books'
    ]

    def __init__(self):
        print(self.NAME, "Loading Model")
        self.model = Model()

    # input: a dict with a "text" key holding the string to score
    def prediction(self, input):
        text_features = self.model.transform([input["text"]])
        # Activation of the sentiment neuron (hidden unit 2388)
        sentiment = text_features[:, 2388][0]
        return {"score": Decimal(str(sentiment))}
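# The _testData list suggests how the interface is exercised, e.g.:
iface = ModelInterface()
for sample in ModelInterface._testData:
    print(sample, iface.prediction({"text": sample})["score"])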
import json
import os

import tensorflow as tf


class ReviewSentimentWrapper(object):
    def __init__(self):
        logger.info('Loading Review sentiment')
        self.graph = tf.Graph()
        with self.graph.as_default():
            current_directory = os.getcwd()
            # Necessary as the model is imported with relative paths
            os.chdir(reviews_path)
            self.model = SentimentModel()
            os.chdir(current_directory)

    def predict(self, text):
        """
        # Arguments
            text: a string to process

        # Returns
            A JSON string containing the prediction
        """
        text_features = self.model.transform([text])
        # For more info: https://github.com/openai/generating-reviews-discovering-sentiment/issues/2
        sentiment = text_features[0, 2388]
        return json.dumps({'sentiment': str(sentiment)})
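# The wrapper references `logger`, `reviews_path`, and `SentimentModel`
# without defining them; a plausible setup sketch (the checkout path and
# the import are illustrative assumptions, resolved when __init__ runs):
import logging
import sys

logger = logging.getLogger(__name__)
reviews_path = './generating-reviews-discovering-sentiment/'  # hypothetical checkout location
sys.path.append(reviews_path)
from encoder import Model as SentimentModel

wrapper = ReviewSentimentWrapper()
print(wrapper.predict('it was so cool, beautiful'))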
import io
from argparse import ArgumentParser

from encoder import Model


def readReviews(path):
    reviewBuf = []
    # Open the file
    with io.open(path, 'r') as raw:
        for line in raw:
            reviewBuf.append(line)
    return reviewBuf


if __name__ == '__main__':
    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("-i", "--input-path", help="Path to reviews", type=str)
    parser.add_argument("-o", "--output-path", help="Destination of inferred topics", type=str)
    args = parser.parse_args()

    # Take reviews and split into sentences
    sentences = readReviews(args.input_path)
    tok, sent = preprocess(sentences)
    model = Model()
    for i in range(len(sentences)):
        # Sum the sentiment-neuron activation over the fragments of review i
        vec = 0
        for k in range(len(tok[i])):
            text_features = model.transform(tok[i][k])
            vec = vec + text_features[0][2388]
        print(vec, sentences[i])
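# preprocess() is not defined in the snippet. Judging from its usage
# (tok[i][k] is passed to model.transform, so each entry must itself be a
# list of strings), one plausible sketch using NLTK; this is an assumption,
# not the original helper, and would need to sit above the __main__ block:
from nltk.tokenize import sent_tokenize


def preprocess(sentences):
    # tok[i] holds the sentence fragments of review i, each wrapped in a
    # one-element list ready for model.transform
    tok = [[[frag] for frag in sent_tokenize(review)] for review in sentences]
    sent = [sent_tokenize(review) for review in sentences]
    return tok, sent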
import pandas as pd

from encoder import Model
#from utils import sst_binary

sentiment_model = Model()
data_token = pd.read_csv(
    "C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/raw.csv"
)
samples = list(data_token['message'])
subsample = [samples[1]]
#samples1 = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#samples1 = sst_binary()

# Score every message with the sentiment neuron (hidden unit 2388)
sent = []
for sublist in data_token['message']:
    subsample = [sublist]
    text_features = sentiment_model.transform(subsample)
    sentiment_scores = text_features[:, 2388]
    sent.append(sentiment_scores)

result = pd.DataFrame(data={
    "sentiment": sent,
    "message": data_token['message'][0:3847]
})
#data_token['sentiment_scores'] = sentiment_scores
#data_token.to_csv('C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/openaiscore.csv')
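# Each entry of `sent` is a length-1 NumPy array; flattening to plain
# floats keeps the DataFrame column scalar (assumes sent and the message
# slice have matching lengths):
result["sentiment"] = [float(s[0]) for s in sent]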
import random

import numpy as np

from encoder import Model

model = Model()

# Load and strip the positive and negative training examples
positive_examples = list(open("train_pos_full.txt", "r").readlines())
positive_examples = [s.strip() for s in positive_examples]  # -1000
negative_examples = list(open("train_neg_full.txt", "r").readlines())
negative_examples = [s.strip() for s in negative_examples]
x = positive_examples + negative_examples
x_text = [sent for sent in x]

# Build the label vector: 1 for positive, 0 for negative
positive_labels = [1 for _ in positive_examples]
negative_labels = [0 for _ in negative_examples]
y = np.concatenate([positive_labels, negative_labels], 0)

# Encode every example with the sentiment model
x = model.transform(x_text)

# Shuffle and persist the encoded dataset
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
np.save('X.npy', x_shuffled)
np.save('Y.npy', y_shuffled)

# Hold out 10% of the data for cross-validation
cross_validation_indices = np.array(random.sample(list(np.arange(len(y))), int(len(y) * 0.1)))
train_indices = np.array(list(set(np.arange(len(y))) - set(cross_validation_indices)))
x_train, x_test = x_shuffled[train_indices], x_shuffled[cross_validation_indices]
y_train, y_test = y_shuffled[train_indices], y_shuffled[cross_validation_indices]
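# With the split in hand, a natural next step (not shown in the snippet)
# is fitting a simple classifier on the encoder features, for example:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(x_train, y_train)
print("held-out accuracy:", clf.score(x_test, y_test))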
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

from encoder import Model

base_dir = './'  # assumption: base_dir is not defined in the snippet
model = Model()  # assumption: the encoder instance is created earlier in the original


def load_sst(path):
    data = pd.read_csv(path, encoding='utf-8')
    X = data['sentence'].values.tolist()
    Y = data['label'].values
    return X, Y


trX, trY = load_sst('./data/train.csv')
teX, teY = load_sst('./data/test.csv')
print(trX[0])
print(trY[0])

print('loading features...')
if not os.path.exists(base_dir + 'features'):
    os.makedirs(base_dir + 'features')
    # Encode the texts once and cache the features on disk
    trXt = model.transform(trX)
    teXt = model.transform(teX)
    np.save(base_dir + 'features/trXt', trXt)
    np.save(base_dir + 'features/teXt', teXt)
else:
    trXt = np.load(base_dir + 'features/trXt.npy')
    teXt = np.load(base_dir + 'features/teXt.npy')

print('training...')
clf = LogisticRegression()
clf.fit(trXt, trY)
score = clf.score(teXt, teY)
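# Optional probe: unit 2388 is the sentiment neuron, so its weight in the
# fitted classifier is worth a glance alongside the accuracy:
print('test accuracy:', score)
print('weight on unit 2388:', clf.coef_[0][2388])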
import time

import numpy as np

from encoder import Model

model = Model()

myJDs = np.load('../data/myJDs.npy')
print(myJDs.shape)
myDim = 4096  # width of the encoder's hidden state
average_vector = 1

t0 = time.time()
X = []
for ck in range(len(myJDs)):
    t1 = time.time()
    tempX = model.transform(myJDs[ck])
    print(tempX.shape)
    # Average the per-sentence features into one 4096-dim vector
    tempX = np.mean(tempX, axis=0)
    if (average_vector):  # switch this conditional to save the full vector
        if (ck == 0):
            X = np.reshape(tempX, (1, myDim))
        else:
            X = np.append(X, np.reshape(tempX, (1, myDim)), axis=0)
    else:
        X.append(np.array(tempX))
    elapsed = time.time() - t1
    print('job description number %i parsed in %.3f s' % (ck, elapsed))

if (average_vector):
    # The snippet is truncated here; saving the stacked matrix is an assumed completion
    np.save('../data/X.npy', X)
import re

# Harvard Business Review
tag = "@HarvardBiz"
tweets = api.search(q=tag, lang="en", count=10)
# tweets_text = [tweet.text for tweet in tweets]


# Remove URLs from a tweet's text
def remove_url(txt):
    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())


# List of cleaned tweet strings
clean_tweets = [remove_url(tweet.text) for tweet in tweets]

# Encode the tweets, then use clf to predict Boolean sentiment labels
tw_transform = model.transform(clean_tweets)
predictions = clf.predict(tw_transform)

# Print the final results (the empty line is for formatting)
print()
print("Search tag:", tag)
for i in range(0, 10):
    tw1 = clean_tweets[i]
    sa = predictions[i]
    print(sa, tw1)
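# `api`, `model`, and `clf` come from earlier in the original script; a
# hedged sketch of that setup, with placeholder credentials and the
# classifier assumed to have been fit on encoder features beforehand:
import tweepy
from sklearn.linear_model import LogisticRegression

from encoder import Model

auth = tweepy.OAuthHandler("CONSUMER_KEY", "CONSUMER_SECRET")  # placeholder keys
auth.set_access_token("ACCESS_TOKEN", "ACCESS_TOKEN_SECRET")
api = tweepy.API(auth)

model = Model()
clf = LogisticRegression()  # must be trained before predict() is called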
"""
text = ['... of Support. \
Sources report there will not be a vote regarding the authorization for the \
bond issuance/bridge loan by the May 8th deadline. Any possibility for a \
deal has reportedly fallen apart. According to sources, both the Republicans \
and Democratic caucuses are turning against Davis. The Democratic caucus \
is reportedly "unwilling to fight" for Davis. Many legislative Republicans \
and Democrats reportedly do not trust Davis and express concern that, once \
the bonds are issued to replenish the General Fund, Davis would "double \
dip" into the fund. Clearly there is a lack of good faith between the legislature \
and the governor. However, it is believed once Davis discloses the \
details of the power contracts negotiated, a bond issuance will take place. \
Additionally, some generator sources have reported that some of the long \
-term power contracts (as opposed to those still in development) require that \
the bond issuance happen by July 1, 2001. If not, the state may be in \
breach of contract. Sources state that if the legislature does not pass the \
bridge loan legislation by May 8th, having a bond issuance by July 1st will \
be very difficult.']
"""
"""
text = ['Bahah I can see your whole history \
Including the parts where you debated separating the chat you invited me to because I would make off with your technical papers \
Real nice']
"""

# `text` is the example list defined earlier in the original file (truncated above)
text_features = model.transform(text)
print(text_features.shape)
print(text_features)  # 17.660 seconds to transform 8 examples

for i in range(len(text)):
    # Activation of the sentiment neuron (unit 2388) for example i
    sentiment = text_features[i, 2388]
    print(text[i], sentiment)
import numpy as np
import pandas as pd

from encoder import Model

df = pd.read_csv('questions.csv')
question1 = np.array(df['question1'])
question2 = np.array(df['question2'])
labels = np.array(df['is_duplicate'])
del df

model = Model()

# Encode the question pairs in chunks of 10,000 rows
for i in range(21, 40):
    ques1_features = model.transform(question1[10000 * i:10000 * (i + 1)])
    ques2_features = model.transform(question2[10000 * i:10000 * (i + 1)])
    label = labels[10000 * i:10000 * (i + 1)].reshape([10000, 1])
    # Stack [question1 features | question2 features | label] per chunk
    data = np.concatenate((ques1_features, ques2_features, label), axis=1)
    np.save("quora_data/quora_features{}".format(i), data)
    del ques1_features, ques2_features, label, data
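# The saved chunks (np.save appends the .npy suffix) can later be
# reassembled into a single matrix:
chunks = [np.load("quora_data/quora_features{}.npy".format(i)) for i in range(21, 40)]
dataset = np.concatenate(chunks, axis=0)  # 4096 + 4096 feature columns, then the label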