def mine():
    """Train an SGDClassifier on streamed movie reviews and pickle the artifacts.

    Streams minibatches from ./movie_data.csv via stream_docs/get_minibatch,
    fits incrementally with partial_fit, reports accuracy on a held-out
    batch, then serializes the stopword list and the classifier under
    ./pkl_objects.
    """
    print("Starting")
    # BUG FIX: `n_iter` was removed from scikit-learn; `max_iter` is the
    # supported name (see the other training script in this file, which
    # already uses max_iter=1).
    clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
    print('Create/Load Classifier')
    doc_stream = stream_docs(path='./movie_data.csv')
    print('Fitting data')
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    print('Finished Fitting')
    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    print('create pickle objects')
    dest = os.path.join('', 'pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)
    # Context managers guarantee the pickle files are flushed and closed.
    with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
        pickle.dump(stop, f, protocol=4)
    with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
        pickle.dump(clf, f, protocol=4)
def classify(document):
    """Map a raw text document to its predicted topic name."""
    topic_names = {0: 'computer', 1: 'science', 2: 'sports',
                   3: 'religion', 4: 'politics', 5: 'automobiles'}
    # Vectorize first (numeric feature vector), then predict a class id.
    features = vect.transform([document])
    predicted_id = clf.predict(features)[0]
    return topic_names[predicted_id]
def update_model(db_path, model, batch_size=10000):
    """Incrementally refit *model* on every review stored in the SQLite db.

    Streams rows from the review_db table in chunks of *batch_size* and
    calls partial_fit once per chunk. Returns the updated model.
    """
    connection = sqlite3.connect(db_path)
    cursor = connection.cursor()
    cursor.execute('SELECT * FROM review_db')
    batch = cursor.fetchmany(batch_size)
    while batch:
        rows = np.array(batch)
        texts = rows[:, 0]
        targets = rows[:, 1].astype(int)
        model.partial_fit(vect.transform(texts), targets,
                          classes=np.array([0, 1]))
        batch = cursor.fetchmany(batch_size)
    connection.close()
    return model
def train(document, y):
    """Online-update the global classifier with one labelled document."""
    features = vect.transform([document])
    clf.partial_fit(features, [y])
def classify(document):
    """Return (sentiment label, confidence) for a single text document."""
    sentiment = {0: 'negative', 1: 'positive'}
    features = vect.transform([document])
    prediction = clf.predict(features)[0]
    confidence = np.max(clf.predict_proba(features))
    return sentiment[prediction], confidence
import re
import os
import pickle  # BUG FIX: pickle was used below without being imported
import numpy as np
from vectorizer import vect

# Load the persisted classifier trained elsewhere.
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

label = {0: 'Negative', 1: 'Positive'}

# Hardcoded reviews. Shouldn't be too difficult to prompt user for input.
# Bad review:
example = ['I would not recommend coming here. I will never eat here again. Terrible food, slow service. Our waiter Mark was very rude']
# Average review:
#example = ['The food was amazing, but the service was...meh. Our waiter Mark was ok']
# Great review:
#example = ['Everything about this place was awesome--The food...amazing! The atmosphere...nice! The staff...Our waiter Mark was great!']

X = vect.transform(example)
# Predict once and reuse (the original re-ran predict/predict_proba per use).
prediction = label[clf.predict(X)[0]]
confidence = np.max(clf.predict_proba(X))
# Negative reviews map to a low star rating (confidence*5 - 3); positive
# reviews map to confidence*5.
rating = confidence * 5 - 3 if prediction == 'Negative' else confidence * 5
# BUG FIX: converted Python 2 `print` statements to Python 3 calls.
print('Prediction: %s\nRecommended Rating: %.1f' % (prediction, rating))
def classify(document):
    """Return the sentiment name and its top class probability."""
    names = {0: 'negative', 1: 'positive'}
    vec = vect.transform([document])
    best = clf.predict(vec)[0]
    return names[best], clf.predict_proba(vec).max()
import pickle
import re
import os
import numpy as np
from vectorizer import vect

# Restore the trained classifier from disk.
clf = pickle.load(open(os.path.join('pkl_objects', 'classifier.pkl'), 'rb'))

label = {0: '음성', 1: '양성'}
example = ['I love this movie']

# Vectorize the sample review, then report prediction and probability.
vec = vect.transform(example)
predicted = label[clf.predict(vec)[0]]
confidence = np.max(clf.predict_proba(vec)) * 100
print('예측 : %s\n 확률: %.2f%%' % (predicted, confidence))
def classify(document):
    """Classify tweet sentiment; return (label name, max probability)."""
    names = {-1: 'Sin sentimiento', 0: 'Neutro',
             1: 'Positivo', 2: 'Negativo'}
    vec = vect.transform([document])
    winner = clf.predict(vec)[0]
    return names[winner], np.max(clf.predict_proba(vec))
def classify(document):
    """Return the topic name predicted for *document*."""
    topics = {0: 'computer', 1: 'science', 2: 'sports',
              3: 'religion', 4: 'politics', 5: 'automobiles'}
    return topics[clf.predict(vect.transform([document]))[0]]
import numpy as np
import pandas as pd
import pyprind

df = pd.read_csv(
    "/content/drive/My Drive/Analisis_sentimientos_Twitter/TWEETS_LGGG_15Abr_clean.csv",
    encoding='utf-8')

# Column to hold the predicted sentiment for each tweet.
df['sentiment'] = ''
# Column to hold the classifier's confidence for that prediction.
df['probability'] = 0

# Sentiment code -> name: NONE->-1 | NEU->0 | P->1 | N->2
label = {-1: 'Sin sentimiento', 0: 'Neutro', 1: 'Positivo', 2: 'Negativo'}

# BUG FIX: pbar.update() was called but no progress bar was ever created.
pbar = pyprind.ProgBar(len(df.index))
for rowid in range(len(df.index)):
    text = df['text'][rowid]
    textConvert = vect.transform([text])
    # BUG FIX: use .loc instead of chained indexing (df[col][row] = ...),
    # which raises SettingWithCopyWarning and may silently fail to write.
    df.loc[rowid, 'sentiment'] = label[clf.predict(textConvert)[0]]
    df.loc[rowid, 'probability'] = np.max(clf.predict_proba(textConvert)) * 100
    pbar.update()

# df.head(20)
df.to_csv(
    '/content/drive/My Drive/Analisis_sentimientos_Twitter/TWEETS_LGGG_15Abr_analysis.csv',
    index=False,
    encoding='utf-8')

"""# Generar gráficos estadísticos"""

import matplotlib.pyplot as plt

# sentimientos = df["sentiment"].unique()
df.groupby('sentiment')['location'].nunique().plot(kind='bar')
print(df.groupby(['sentiment']).size())
sys.path.append("..")
from tokenizer import tokenizer
from vectorizer import vect
from sklearn.linear_model import SGDClassifier
from pyprind import ProgBar
import os
import pickle

# Incremental logistic-regression classifier over hashed text features.
clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
classes = np.array([0, 1])
doc_stream = stream_docs('movie_data.csv')

# 45 minibatches of 1000 documents each for the training phase.
progress = ProgBar(45)
for _ in range(45):
    docs, labels = get_minibatch(doc_stream, 1000)
    if not docs:
        break
    clf.partial_fit(vect.transform(docs), labels, classes=classes)
    progress.update()

# Score on a final held-out batch of 5000 documents...
docs_test, labels_test = get_minibatch(doc_stream, 5000)
features_test = vect.transform(docs_test)
print("Accuracy: %.3f" % clf.score(features_test, labels_test))
# ...then fold that batch into the model before persisting it.
clf.partial_fit(features_test, labels_test, classes=classes)

with open(os.path.join('..', 'pkl_objects', 'classifier.pkl'), 'wb') as out:
    pickle.dump(clf, out, protocol=4)
def train(opinion, y):
    """Update the live classifier with one labelled opinion string."""
    clf.partial_fit(vect.transform([opinion]), [y])
# import the HashingVectorizer from local dir
from vectorizer import vect


# BUG FIX: the def line was missing its trailing colon (SyntaxError).
def update_model(db_path, model, batch_size=10000):
    """Refit *model* on all reviews stored in the SQLite db at *db_path*.

    Rows are streamed in chunks of *batch_size*; each chunk's text column
    is vectorized with `vect` and fed to partial_fit. Returns the model.
    """
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute('SELECT * from review_db')
    results = c.fetchmany(batch_size)
    while results:
        data = np.array(results)
        x = data[:, 0]
        y = data[:, 1].astype(int)
        classes = np.array([0, 1])
        x_train = vect.transform(x)
        model.partial_fit(x_train, y, classes=classes)
        results = c.fetchmany(batch_size)
    conn.close()
    return model


cur_dir = os.path.dirname(__file__)
# NOTE(review): the load path has no '.pkl' suffix but the save path does —
# confirm the on-disk filename ('classifier' vs 'classifier.pkl').
clf = pickle.load(open(os.path.join(cur_dir, 'pkl_objects', 'classifier'), 'rb'))
db = os.path.join(cur_dir, 'reviews.sqlite')

# partial_fit mutates clf in place, but capture the return explicitly so
# the updated model is unambiguous.
clf = update_model(db_path=db, model=clf, batch_size=10000)
# updating persistence
pickle.dump(clf, open(os.path.join(cur_dir, 'pkl_objects', 'classifier.pkl'), 'wb'), protocol=4)
def train(document, y):
    """Incrementally fit the classifier on one labelled example.

    Stop-word removal and tokenization are presumably handled inside
    vect.transform — TODO confirm against the vectorizer module.
    """
    clf.partial_fit(vect.transform([document]), [y])