"""Train a linear SVM (SGD + hinge loss) classifier on tagged, cleaned tweets.

Loads the pre-cleaned tagged tweets, shuffles them, splits into train/test
sets, and builds a bag-of-words -> tf-idf -> SGDClassifier pipeline.
"""
import random

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import Pipeline

from data.store_data import unpickle_data

tweets = unpickle_data('resources/tagged_data/data_tagged_cleaned.pckl')
# NOTE(review): no random.seed() is set, so the train/test split differs on
# every run — confirm whether reproducibility is wanted here.
random.shuffle(tweets)

# Train on the first 16000 tweets, evaluate on the last 4000.
# NOTE(review): this assumes len(tweets) >= 20000; with fewer tweets the two
# slices overlap and the test set leaks into training — verify dataset size.
train_tweets = tweets[:16000]
test_tweets = tweets[-4000:]
X_train = [tweet.text for tweet in train_tweets]
y_train = [tweet.feelings[0] for tweet in train_tweets]
X_test = [tweet.text for tweet in test_tweets]
y_test = [tweet.feelings[0] for tweet in test_tweets]

text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # FIX: SGDClassifier's `n_iter` parameter was deprecated in scikit-learn
    # 0.19 and removed in 0.21; the modern equivalent is `max_iter`. Passing
    # n_iter raises TypeError on any recent scikit-learn.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, random_state=42)),
])
"""Clean the text of every tagged tweet and persist the cleaned dataset.

Reads the raw tagged tweets, runs each tweet's text through the cleaning
pipeline, then saves the result both as a pickle and as JSON.
"""
from data.store_data import pickle_data, save_json, unpickle_data
from text_proccessing.text_proccessing import clean

RAW_PICKLE = 'resources/tagged_data/data_tagged.pckl'
CLEANED_PICKLE = 'resources/tagged_data/data_tagged_cleaned.pckl'
CLEANED_JSON = 'resources/tagged_data/data_tagged_cleaned.json'

all_data = unpickle_data(RAW_PICKLE)

# Normalize each tweet's text in place before persisting.
for item in all_data:
    item.text = clean(item.text)

pickle_data(all_data, CLEANED_PICKLE)
save_json(all_data, CLEANED_JSON)