# Assumed imports for this snippet; the encoder is the byte-mLSTM from
# openai/generating-reviews-discovering-sentiment (linked in a later example)
import pickle

import pandas as pd

from encoder import Model


def main(args):

    articles = pd.read_csv(args.input)
    print(articles.head())
    if args.n == 'test':
        # keep only the first few rows for a quick smoke test
        articles = articles.loc[:3]

    elif args.n == 'all':
        # Initialize an instance of the model
        model = Model(root_path=args.path)

        results = []
        for i, text in articles["text_en"].items():  # .iteritems() was removed in pandas 2.0
            print("start transforming text")
            # Run LSTM model to predict final hidden units' values
            text_features = model.transform(text)
            print("text transformed")
            # Extract content from sentiment hidden unit 2388
            results.append(text_features[:, 2388])
            print(f"text {i} analyzed")
            # checkpoint intermediate results after every article
            with open("../data/sentiment_analysis_scores_test.pkl", "wb") as f:
                pickle.dump(results, f)

        with open("../data/sentiment_analysis_scores.pkl", "wb") as f:
            pickle.dump(results, f)

    elif args.n == 'text':
        # Initialize an instance of the model
        model = Model(root_path=args.path)
        with open(args.input, "r") as myfile:
            text = myfile.readlines()
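        # transform() expects a list of strings; readlines() supplies one per line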
        text_features = model.transform(text)
        with open("../data/sentiment_analysis_scores_text.pkl", "wb") as f:
            pickle.dump(text_features[:, 2388], f)
Example #2
# Assumed imports for this snippet (older code may use sklearn.externals.joblib)
import os

import joblib


def prepare_model():
    global model, encoder
    if encoder is None:
        from encoder import Model
        encoder = Model()
    if model is None:
        from ResearchNLP import Constants as cn
        import config
        model_path = config.CODE_DIR + 'prediction_models/SOTA_sentiment_library/model/' + cn.data_name + '.pkl'

        if os.path.exists(model_path):
            model = joblib.load(model_path)
        else:  # load model and save it
            from ResearchNLP.prediction_models.SOTA_sentiment_library.utils import train_model, test_model
            import pandas as pd
            train_df = pd.concat([cn.base_training_df, cn.pool_df])
            trX = train_df[cn.col_names.text].values
            trY = train_df[cn.col_names.tag].values
            tstX = cn.validation_data_df[cn.col_names.text].values
            tstY = cn.validation_data_df[cn.col_names.tag].values
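            # encode train/validation text into fixed-length mLSTM feature vectors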
            trXt = encoder.transform(trX)
            tstXt = encoder.transform(tstX)
            model = train_model(trXt, trY, tstXt, tstY)  # train on all data
            print(test_model(model, tstXt, tstY))
            joblib.dump(model, model_path)
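
# A minimal usage sketch (assumptions: `model` is the classifier returned by
# train_model and exposes a scikit-learn-style predict()):
# prepare_model()
# features = encoder.transform(["best books"])
# print(model.predict(features))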
Example #3
# Assumed imports for this snippet
from decimal import Decimal

from encoder import Model


class ModelInterface:

    NAME = ("VG16." + __name__)
    _testData = [
        'too bad!', 'it was so cool, beautiful',
        'the screenplay and the directing were horrendous', 'best books'
    ]

    def __init__(self):
        print(self.NAME, "Loading Model")
        self.model = Model()

    # Input: a dict with a "text" key
    def prediction(self, input):
        text_features = self.model.transform([input["text"]])
        sentiment = text_features[:, 2388][0]
        return {"score": Decimal(str(sentiment))}
Example #4
# Assumed imports for this snippet; `logger`, `reviews_path`, `SentimentModel`,
# `Model`, and `preprocess` are defined elsewhere in the original project.
import io
import json
import os
from argparse import ArgumentParser

import tensorflow as tf


class ReviewSentimentWrapper(object):
    def __init__(self):
        logger.info('Loading Review sentiment')
        self.graph = tf.Graph()
        with self.graph.as_default():
            current_directory = os.getcwd()

            # Necessary as the model is imported with relative path
            os.chdir(reviews_path)
            self.model = SentimentModel()
            os.chdir(current_directory)

    def predict(self, text):
        """ # Arguments
                text: a string to process

        # Returns
            A dict containing predictions
        """
        text_features = self.model.transform([text])
        # For more info https://github.com/openai/generating-reviews-discovering-sentiment/issues/2
        sentiment = text_features[0, 2388]

        return json.dumps({'sentiment': str(sentiment)})
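
# A minimal usage sketch (assumes `reviews_path` points at a checkout of
# openai/generating-reviews-discovering-sentiment):
# wrapper = ReviewSentimentWrapper()
# print(wrapper.predict("it was so cool, beautiful"))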

def readReviews(path):
    # def line reconstructed from the readReviews(args.input_path) call below
    reviewBuf = []
    # Open the file
    with io.open(path, 'r') as raw:
        for line in raw:
            reviewBuf.append(line)

    return reviewBuf


if __name__ == '__main__':

    # Parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("-i", "--input-path", help="Path to reviews", type=str)
    parser.add_argument("-o",
                        "--output-path",
                        help="Destination of inferred topics",
                        type=str)
    args = parser.parse_args()

    # Take reviews and split into sentences
    sentences = readReviews(args.input_path)
    tok, sent = preprocess(sentences)
    model = Model()

    for i in range(len(sentences)):
        vec = 0
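        # accumulate the sentiment unit's activation over this review's tokens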
        for k in range(len(tok[i])):
            text_features = model.transform(tok[i][k])
            vec = vec + text_features[0][2388]
        print(vec, sentences[i])
import pandas as pd
from encoder import Model
#from utils import sst_binary

sentiment_model = Model()

data_token = pd.read_csv(
    "C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/raw.csv"
)

samples = list(data_token['message'])
subsample = [samples[1]]
#samples1 = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#s = "I want to transition from support to midlane. Any tips on the lane and good tutorials? Also, i dont know how to manager waves, so i need some help here"
#samples1 = sst_binary()
sent = []
for sublist in data_token['message']:
    subsample = [sublist]
    text_features = sentiment_model.transform(subsample)
    # take the scalar activation of the sentiment unit, not a 1-element array
    sent.append(float(text_features[0, 2388]))
result = pd.DataFrame(data={
    "sentiment": sent,
    "message": data_token['message'][0:3847]  # hard-coded row count from the original
})

#data_token['sentiment_scores'] = sentiment_scores

#data_token.to_csv('C:/Users/matthew li yuen fong/Desktop/sentiment-analysis-master/data/raw/openaiscore.csv')
# Assumed imports for this snippet
import random

import numpy as np

from encoder import Model

model = Model()

with open("train_pos_full.txt", "r") as f:
    positive_examples = [s.strip() for s in f]   # -1000
with open("train_neg_full.txt", "r") as f:
    negative_examples = [s.strip() for s in f]

x = positive_examples + negative_examples

x_text = [sent for sent in x]

positive_labels = [1 for _ in positive_examples]
negative_labels = [0 for _ in negative_examples]

y = np.concatenate([positive_labels, negative_labels], 0)
x = model.transform(x_text)

shuffle_indices = np.random.permutation(np.arange(len(y)))

x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

np.save('X.npy', x_shuffled )
np.save('Y.npy', y_shuffled )

cross_validation_indices = np.array(random.sample(list(np.arange(len(y))), int(len(y) * 0.1) ))
train_indices = np.array(list(set(np.arange(len(y))) - set(cross_validation_indices)))

x_train, x_test= x_shuffled[train_indices], x_shuffled[cross_validation_indices]
y_train, y_test = y_shuffled[train_indices], y_shuffled[cross_validation_indices]
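
# x_train / x_test now hold mLSTM features and can feed a linear classifier,
# as the LogisticRegression example below does.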
Example #8
# Assumed imports for this snippet; `base_dir` and `model` (the encoder) are
# defined elsewhere in the original script.
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def load_sst(path):
    # def line reconstructed from the load_sst() calls below
    data = pd.read_csv(path, encoding='utf-8')
    X = data['sentence'].values.tolist()
    Y = data['label'].values
    return X, Y


trX, trY = load_sst('./data/train.csv')
teX, teY = load_sst('./data/test.csv')
print(trX[0])
print(trY[0])

print('loading features...')

if not os.path.exists(base_dir + 'features'):
    os.makedirs(base_dir + 'features')   
    trXt = model.transform(trX)
    teXt = model.transform(teX)

    np.save(base_dir + 'features/trXt', trXt)
    np.save(base_dir + 'features/teXt', teXt)

else:
    trXt = np.load(base_dir + 'features/trXt.npy')
    teXt = np.load(base_dir + 'features/teXt.npy')


print('training...')

clf = LogisticRegression()
clf.fit(trXt, trY)
score = clf.score(teXt, teY)
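# (added) print the held-out accuracy as a quick sanity check
print('test accuracy:', score)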
import time

import numpy as np

from encoder import Model

model = Model()

myJDs = np.load('../data/myJDs.npy')

print(myJDs.shape)
myDim = 4096
average_vector = 1
t0 = time.time()
X = []
for ck in range(len(myJDs)):
    t1 = time.time()
    tempX = model.transform(myJDs[ck])
    print(tempX.shape)
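    # average per-token features into a single myDim-length vector per document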
    tempX = np.mean(tempX, axis=0)
    if average_vector:  # switch this flag off to keep the full per-token matrix
        if ck == 0:
            X = np.reshape(tempX, (1, myDim))
        else:
            X = np.append(X, np.reshape(tempX, (1, myDim)), axis=0)
    else:
        X.append(np.array(tempX))

    elapsed = time.time() - t1

    print('job description number %i parsed in %.3f s' % (ck, elapsed))

if average_vector:
    # assumed final step (the original snippet breaks off here): persist the
    # stacked feature matrix under a hypothetical filename
    np.save('jd_features.npy', X)
# Assumed context for this snippet: `api` is an authenticated tweepy handle,
# and `model`/`clf` are the encoder and classifier trained earlier.
import re

# Harvard Business Review
tag = "@HarvardBiz"

tweets = api.search(q=tag, lang="en", count=10)

# tweets_text = [tweet.text for tweet in tweets]


# remove url in tweets
def remove_url(txt):
    # raw string avoids invalid-escape warnings in the regex
    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())


# list of tweets strings
clean_tweets = [remove_url(tweet.text) for tweet in tweets]

# transform the tweets list
tw_transform = model.transform(clean_tweets)
# use clf to predict binary labels
predictions = clf.predict(tw_transform)

# print the final results
# an empty line formatting
print()
print("Search tag:", tag)
for i in range(len(clean_tweets)):  # the search may return fewer than 10 tweets
    tw1 = clean_tweets[i]
    sa = predictions[i]
    print(sa, tw1)
Example #11
# `text = [` reconstructed: the opening of this example string was cut off
text = [' of Support.\
Sources report there will not be a vote regarding the authorization for the\
 bond issuance/bridge loan by the May 8th deadline.  Any possibility for a\
 deal has reportedly fallen apart.  According to sources, both the Republicans\
 and Democratic caucuses are turning against Davis.  The Democratic caucus\
 is reportedly "unwilling to fight" for Davis.  Many legislative Republicans\
 and Democrats reportedly do not trust Davis and express concern that, once\
 the bonds are issued to replenish the General Fund, Davis would "double\
 dip" into the fund.  Clearly there is a lack of good faith between the legislature\
 and the governor.  However, it is believed once Davis discloses the\
 details of the power contracts negotiated, a bond issuance will take place.\
  Additionally, some generator sources have reported that some of the long\
-term power contracts (as opposed to those still in development) require\
 that the bond issuance happen by July 1, 2001.  If not, the state may be in \
breach of contract.  Sources state that if the legislature does not pass the\
 bridge loan legislation by May 8th, having a bond issuance by July 1st will\
 be very difficult.']
"""
"""
text = ['Bahah I can see your whole history \
Including the parts where you debated separating the chat you invited me to because I would make off with your technical papers \
Real nice']
"""
text_features = model.transform(text)
print(text_features.shape)
print(text_features)

# 17.660 seconds to transform 8 examples
for i in range(len(text)):
    sentiment = text_features[i, 2388]
    print(text[i], sentiment)
import pandas as pd
import numpy as np

from encoder import Model

df = pd.read_csv('questions.csv')
question1 = np.array(df['question1'])
question2 = np.array(df['question2'])
labels = np.array(df['is_duplicate'])

del df

model = Model()
for i in range(21, 40):  # batches 21-39, 10,000 question pairs each
    ques1_features = model.transform(question1[10000 * i:10000 * (i + 1)])
    ques2_features = model.transform(question2[10000 * i:10000 * (i + 1)])
    label = labels[10000 * i:10000 * (i + 1)].reshape([-1, 1])  # -1 tolerates a short final batch

    data = np.concatenate((ques1_features, ques2_features, label), axis=1)
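    # each row: [question1 features | question2 features | is_duplicate label]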
    np.save("quora_data/quora_features{}".format(i), data)

    del ques1_features, ques2_features, label, data