def mine():
    print("Starting")
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
    print('Create/Load Classifier')
    doc_stream = stream_docs(path='./movie_data.csv')
    print('Fitting data')
    classes = np.array([0,1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    print('Finished Fitting')

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))

    print('create pickle objects')
    dest = os.path.join('.', 'pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)

    # 'stop' is the stop-word list prepared earlier in the project; persist it alongside the model
    pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
    pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
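
# NOTE: stream_docs, get_minibatch, vect and stop are assumed to be defined
# elsewhere in this project and are not shown in the snippet above. The sketch
# below is one possible implementation of the two helpers, consistent with how
# they are called here, assuming movie_data.csv stores one '"review text",label'
# row per line with a trailing 0/1 sentiment label:

def stream_docs(path):
    # yield one (text, label) pair at a time without loading the whole CSV into memory
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip the header line
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label


def get_minibatch(doc_stream, size):
    # collect up to `size` documents and labels from the stream
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y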
Example no. 2
def classify(document):
    label = {0: 'computer', 1: 'science', 2: 'sports', 3: 'religion', 4: 'politics', 5: 'automobiles'}
    # first convert the text into a vector of numerical values (tf-idf features) using the shared vectorizer
    X = vect.transform([document])
    y = clf.predict(X)[0]
    return label[y]
Example no. 3
def update_model(db_path, model, batch_size=10000):
    # fetch stored reviews from SQLite in batches and update the model incrementally
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * FROM review_db')

    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        X = data[:, 0]
        y = data[:, 1].astype(int)
        classes = np.array([0, 1])
        X_train = vect.transform(X)
        model.partial_fit(X_train, y, classes=classes)
        results = c.fetchmany(batch_size)

    conn.close()
    return model
Example no. 4
def train(document, y):
    X = vect.transform([document])
    clf.partial_fit(X, [y])
Example no. 5
def classify(document):
    label = {0: 'negative', 1: 'positive'}
    X = vect.transform([document])
    y = clf.predict(X)[0]
    proba = np.max(clf.predict_proba(X))
    return label[y], proba
import os
import pickle

import numpy as np

from vectorizer import vect

clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

label = {0: 'Negative', 1: 'Positive'}

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

Hardcoded reviews. Shouldn't be too difficult to prompt user for input.

"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""

# Bad review: 
example = ['I would not recommend coming here. I will never eat here again. Terrible food, slow service. Our waiter Mark was very rude']

# Average review: 
#example = ['The food was amazing, but the service was...meh. Our waiter Mark was ok']

# Great review: 
#example = ['Everything about this place was awesome--The food...amazing! The atmosphere...nice! The staff...Our waiter Mark was great!']

X = vect.transform(example)

prediction = label[clf.predict(X)[0]]
proba = np.max(clf.predict_proba(X))

if prediction == 'Negative':
    print('Prediction: %s\nRecommended Rating: %.1f' % (prediction, proba * 5 - 3))
else:
    print('Prediction: %s\nRecommended Rating: %.1f' % (prediction, proba * 5))
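
# As noted above, the hardcoded review could just as well come from the user at
# run time. A minimal sketch of that variant (reusing the vect, clf and label
# objects already loaded above):

user_review = input('Enter a review to classify: ')
X_user = vect.transform([user_review])
prediction = label[clf.predict(X_user)[0]]
proba = np.max(clf.predict_proba(X_user))
print('Prediction: %s (probability %.2f)' % (prediction, proba))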
Example no. 7
def train(document, y):
    X = vect.transform([document])
    clf.partial_fit(X, [y])
Example no. 8
def classify(document):
    label = {0: 'negative', 1: 'positive'}
    X = vect.transform([document])
    y = clf.predict(X)[0]
    proba = clf.predict_proba(X).max()
    return label[y], proba
Example no. 9
import os
import pickle

import numpy as np

from vectorizer import vect

clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

label = {0: 'negative', 1: 'positive'}

example = ['I love this movie']

X = vect.transform(example)

print('Prediction: %s\nProbability: %.2f%%' %
      (label[clf.predict(X)[0]], np.max(clf.predict_proba(X)) * 100))
def classify(document):
    label = {-1: 'No sentiment', 0: 'Neutral', 1: 'Positive', 2: 'Negative'}
    X = vect.transform([document])
    y = clf.predict(X)[0]
    proba = np.max(clf.predict_proba(X))
    return label[y], proba
Example no. 11
def classify(document):
    label = {0: 'computer', 1: 'science', 2: 'sports', 3: 'religion', 4: 'politics', 5: 'automobiles'}
    X = vect.transform([document])
    y = clf.predict(X)[0]
    return label[y]
import numpy as np
import pandas as pd
import pyprind

df = pd.read_csv(
    "/content/drive/My Drive/Analisis_sentimientos_Twitter/TWEETS_LGGG_15Abr_clean.csv",
    encoding='utf-8')
# add a 'sentiment' column to hold the predicted class
df['sentiment'] = ''
# add a 'probability' column to hold the classifier's confidence
df['probability'] = 0.0
# sentiment mapping (numbers to words): NONE -> -1 | NEU -> 0 | P -> 1 | N -> 2
label = {-1: 'No sentiment', 0: 'Neutral', 1: 'Positive', 2: 'Negative'}
pbar = pyprind.ProgBar(len(df.index))  # progress bar over all tweets
for rowid in range(len(df.index)):
    text = df['text'][rowid]
    textConvert = vect.transform([text])
    df.at[rowid, 'sentiment'] = label[clf.predict(textConvert)[0]]
    df.at[rowid, 'probability'] = np.max(clf.predict_proba(textConvert)) * 100
    pbar.update()
# df.head(20)
df.to_csv(
    '/content/drive/My Drive/Analisis_sentimientos_Twitter/TWEETS_LGGG_15Abr_analysis.csv',
    index=False,
    encoding='utf-8')
"""# Generar gráficos estadísticos"""

import matplotlib.pyplot as plt

# sentimientos = df["sentiment"].unique()
df.groupby('sentiment')['location'].nunique().plot(kind='bar')
print(df.groupby(['sentiment']).size())
Example no. 13
import os
import pickle
import sys

sys.path.append("..")

import numpy as np
from pyprind import ProgBar
from sklearn.linear_model import SGDClassifier

from tokenizer import tokenizer
from vectorizer import vect

clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

classes = np.array([0, 1])
doc_stream = stream_docs('movie_data.csv')  # stream_docs/get_minibatch assumed to come from the project's helpers
pbar = ProgBar(45)

for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, 1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

X_test, y_test = get_minibatch(doc_stream, 5000)
X_test = vect.transform(X_test)

print("Accuracy: %.3f" % clf.score(X_test, y_test))
clf.partial_fit(X_test, y_test, classes=classes)  # also learn from the held-out batch before persisting

pickle.dump(clf,
            open(os.path.join('..', 'pkl_objects', 'classifier.pkl'), 'wb'),
            protocol=4)
Example no. 14
def train(opinion, y):
    X = vect.transform([opinion])
    clf.partial_fit(X, [y])
Example no. 15
import os
import pickle
import sqlite3

import numpy as np

# import the HashingVectorizer from local dir
from vectorizer import vect


def update_model(db_path, model, batch_size=10000):
    # fetch stored reviews from SQLite in batches and update the model incrementally
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * FROM review_db')

    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        x = data[:, 0]
        y = data[:, 1].astype(int)

        classes = np.array([0, 1])
        x_train = vect.transform(x)
        model.partial_fit(x_train, y, classes=classes)
        results = c.fetchmany(batch_size)
    conn.close()
    return model


cur_dir = os.path.dirname(__file__)
clf = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'), 'rb'))

db = os.path.join(cur_dir, 'reviews.sqlite')
update_model(db_path=db, model=clf, batch_size=10000)

# update the persisted classifier
pickle.dump(clf, open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'), 'wb'), protocol=4)
Example no. 16
def train(document, y):
    # the shared vectorizer handles stop-word removal and tokenization
    X = vect.transform([document])
    clf.partial_fit(X, [y])