Example 1
import random

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline

from data.store_data import unpickle_data

# Load the cleaned, tagged tweets produced by the preprocessing script (Example 2).
tweets = unpickle_data('resources/tagged_data/data_tagged_cleaned.pckl')

# Shuffle in place so the train/test split does not depend on collection order.
random.shuffle(tweets)

# Train on the first 16,000 tweets and hold out the last 4,000 for testing;
# the first entry of each tweet's feelings list is used as the class label.
X_train = [tweet.text for tweet in tweets[:16000]]
y_train = [tweet.feelings[0] for tweet in tweets[:16000]]
X_test = [tweet.text for tweet in tweets[-4000:]]
y_test = [tweet.feelings[0] for tweet in tweets[-4000:]]

# Bag-of-words counts -> tf-idf weighting -> linear SVM trained with SGD.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm',
     SGDClassifier(loss='hinge',
                   penalty='l2',
                   alpha=1e-3,
                   max_iter=5,  # 'n_iter' was removed from scikit-learn; 'max_iter' bounds the training epochs
                   random_state=42)),
])
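
The snippet stops at the pipeline definition, although precision_score and recall_score are already imported. A minimal continuation, assuming the pipeline is meant to be fitted on the training split and evaluated on the held-out tweets, could look like this:

# Fit the vectorize -> tf-idf -> SVM pipeline on the training tweets.
text_clf_svm.fit(X_train, y_train)

# Predict on the held-out tweets and report macro-averaged scores
# (the averaging strategy is an assumption, not taken from the original).
predicted = text_clf_svm.predict(X_test)
print('precision:', precision_score(y_test, predicted, average='macro'))
print('recall:', recall_score(y_test, predicted, average='macro'))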
Example 2
from data.store_data import pickle_data
from data.store_data import save_json
from data.store_data import unpickle_data
from text_proccessing.text_proccessing import clean

# Load the raw tagged tweets, normalise each tweet's text, and persist the
# cleaned data set both as a pickle (consumed in Example 1) and as JSON.
all_data = unpickle_data('resources/tagged_data/data_tagged.pckl')

for tweet in all_data:
    tweet.text = clean(tweet.text)

pickle_data(all_data, 'resources/tagged_data/data_tagged_cleaned.pckl')
save_json(all_data, 'resources/tagged_data/data_tagged_cleaned.json')
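
Both examples rely on the pickle_data / unpickle_data / save_json helpers from data.store_data, whose implementation is not shown here. A minimal sketch of what such helpers might look like, assuming each tweet object keeps its fields (text, feelings, ...) in __dict__, is:

import json
import pickle


def pickle_data(data, path):
    # Serialize the tweet objects to a binary pickle file.
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def unpickle_data(path):
    # Load previously pickled tweet objects.
    with open(path, 'rb') as f:
        return pickle.load(f)


def save_json(data, path):
    # Human-readable copy; assumes each tweet's fields live in its __dict__.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump([tweet.__dict__ for tweet in data], f,
                  ensure_ascii=False, indent=2)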