def train_nn():
    """Train a feed-forward NN on tweet training data and print per-fold metrics.

    Uses an 80/20 train/validation split from td.TrainingData.
    NOTE(review): this function is shadowed by the later module-level
    ``train_nn`` definition in this file.
    """
    training = td.TrainingData()
    training.set_tweet_training_data()
    trainingReviews, trainingRatings, validationReviews, validationRatings, train_raw, val_raw = training.get_training_validation_data(0.8)

    trainingReviews = training.matrix_to_dense(trainingReviews)
    validationReviews = training.matrix_to_dense(validationReviews)

    # Collapse the dense (n, 1, features) matrices to 2-D (n, features).
    trainingReviews = np.reshape(trainingReviews, (len(trainingReviews), trainingReviews.shape[2]))
    # BUG FIX: validation features must be reshaped the same way before predict().
    validationReviews = np.reshape(validationReviews, (len(validationReviews), validationReviews.shape[2]))

    y_train = training.tweets_to_amazon_ratings(trainingRatings)
    y_val = training.tweets_to_amazon_ratings(validationRatings)

    # Class indices (argmax of one-hot rows) for classification_report.
    y_val_max = np.array([np.array(i).argmax(axis=0) for i in y_val])

    y_train = np.array(y_train)

    model = build_ffnn_model(len(trainingReviews[0]), len(y_train[0]))
    for i in range(10):
        print("N fold {}".format(i))
        model.fit(trainingReviews, y_train, verbose=2,
                  epochs=10)

        # BUG FIX: predict on the validation *features*, not on the label vector.
        yp = model.predict(validationReviews)
        yp_max = np.array([i.argmax(axis=0) for i in yp])
        print(classification_report(y_val_max, yp_max))
def classify_trump_tweets_one_at_a_time(model, path_to_tweets, results_to_file=True, results_file_name="trump_results.csv"):
    """
    Classifies tweets about Donald Trump, one predict() call per tweet.
    :param model: Keras model
    :param path_to_tweets: Path to a CSV file of tweets (id, text) with a header row
    :param results_to_file: Boolean. Should the results be written to file
    :param results_file_name: Filename for if they should be written
    :return: None; results are only written to file when results_to_file is True
    """
    training = td.TrainingData()
    training.set_tweet_training_data()
    count = 0
    tweet_ids = []
    with open(path_to_tweets, 'r', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        first_tweet = True
        for row in csv_reader:
            if first_tweet:
                # Skip the header row.
                first_tweet = False
                continue
            if count % 10000 == 0:
                print("Done {} tweets".format(count))
            count += 1
            t_id = row[0]
            t = row[1]
            # BUG FIX: str.replace returns a new string; the result was discarded.
            t = t.replace('\n', '')
            # NOTE(review): reaches into the name-mangled private preprocessor
            # of TrainingData — consider exposing a public transform method.
            test_data = training._TrainingData__preprocessor.transform([t])
            test_data = training.matrix_to_dense(test_data)
            test_data = np.reshape(test_data, (len(test_data), test_data.shape[2]))

            yp = model.predict(test_data)
            if results_to_file:
                # newline='' per csv module docs to avoid blank rows on Windows.
                with open(results_file_name, 'a', encoding='utf-8', newline='') as f:
                    w = csv.writer(f, delimiter=',')
                    w.writerow([t_id, yp[0][0]])
def classify_trump_tweets(model, path_to_tweets, results_to_file=True, results_file_name="trump_results.csv"):
    """
    Classifies tweets about Donald Trump in one batched predict() call.
    :param model: Keras model
    :param path_to_tweets: Path to a CSV file of tweets (id, text) with a header row
    :param results_to_file: Boolean. Should the results be written to file
    :param results_file_name: Filename for if they should be written
    :return: The predictions for each trump tweet, as zip(tweet_text, prediction)
    """
    training = td.TrainingData()
    training.set_tweet_training_data()

    tweet_texts = []
    with open(path_to_tweets, 'r', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        # ROBUSTNESS: skip the header here instead of `del tweet_texts[0]`,
        # which raised IndexError on an empty file.
        next(csv_reader, None)
        for row in csv_reader:
            tweet_texts.append(row[1])
    print("Tweets read from file")
    tweet_texts = [t.replace('\n', '') for t in tweet_texts]
    # NOTE(review): reaches into the name-mangled private preprocessor.
    test_data = training._TrainingData__preprocessor.transform(tweet_texts)
    test_data = training.matrix_to_dense(test_data)

    # Collapse (n, 1, features) to 2-D (n, features).
    test_data = np.reshape(test_data, (len(test_data), test_data.shape[2]))
    print("Tweet preprocessing done!")
    yp = model.predict(test_data)
    if results_to_file:
        # newline='' per csv module docs to avoid blank rows on Windows.
        with open(results_file_name, 'w', encoding='utf-8', newline='') as f:
            w = csv.writer(f, delimiter=',')
            for i, row in enumerate(zip(tweet_texts, yp)):
                if i % 10000 == 0:
                    print("Wrote {} tweets to file".format(i))
                w.writerow([row[0], row[1][0]])
    return zip(tweet_texts, yp)
def train_nn():
    """Train a feed-forward NN on discretized Amazon review ratings.

    Runs 10 "folds" of 2 epochs each on an 80/20 split, checkpointing the
    weights after every epoch and printing validation metrics per fold.
    NOTE(review): this definition shadows the earlier ``train_nn`` in this file.
    """
    training = td.TrainingData()
    training.set_amazon_training_data(category_size=50000)
    trainingReviews, trainingRatings, validationReviews, validationRatings, train_raw, val_raw = training.get_training_validation_data(
        0.8)
    # Continuous ratings -> discrete class labels -> one-hot vectors.
    y_train = cont_to_disc(trainingRatings)
    y_val = cont_to_disc(validationRatings)

    y_train = to_categorical(y_train)
    y_val = to_categorical(y_val)
    trainingReviews = training.matrix_to_dense(trainingReviews)
    validationReviews = training.matrix_to_dense(validationReviews)

    # Collapse (n, 1, features) to 2-D (n, features) for the dense network.
    trainingReviews = np.reshape(
        trainingReviews, (len(trainingReviews), trainingReviews.shape[2]))
    validationReviews = np.reshape(
        validationReviews,
        (len(validationReviews), validationReviews.shape[2]))
    model = build_ffnn_model(len(trainingReviews[0]), len(y_train[0]))
    # Class indices (argmax of one-hot rows) for the sklearn metric functions.
    val_max = [np.array(i).argmax(axis=0) for i in y_val]
    # BUG FIX: filename typo "imporement" -> "improvement".
    filepath = "weights-improvement-{epoch:02d}--{val_acc: .2f}.hdf5"
    checkpoint = ModelCheckpoint(filepath,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=False,
                                 mode='max')
    callback_list = [checkpoint]
    for i in range(10):
        print("N fold {}".format(i))
        model.fit(trainingReviews,
                  np.array(y_train),
                  epochs=2,
                  validation_data=(validationReviews, np.array(y_val)),
                  callbacks=callback_list)
        yp = model.predict(validationReviews, verbose=1)
        yp_max = np.array([i.argmax(axis=0) for i in yp])
        print(classification_report(val_max, yp_max))
        print(confusion_matrix(val_max, yp_max))
        print(f1_score(val_max, yp_max, average='weighted'))

    save_keras_model(model, 'FFNN-Regression.h5')
# Ejemplo n.º 5
# 0
def train_and_predict(tweets):
    """Fit a TF-IDF + LinearSVC pipeline on all tweet training data and
    classify *tweets*.

    :param tweets: iterable of raw tweet strings to classify
    :return: array of predicted discrete labels, one per tweet
    """
    training = td.TrainingData()
    training.set_tweet_training_data()
    # Split ratio 1 -> all data goes to training; validation outputs are unused.
    trainingReviews, trainingRatings, validationReviews, validationRatings, train_raw, val_raw = training.get_training_validation_data(
        1)

    y_train = cont_to_disc(trainingRatings)
    # DEAD CODE REMOVED: y_val = cont_to_disc(validationRatings) was never used.

    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # ('clf', NuSVC(kernel='sigmoid', nu=0.15, coef0=0.7)),
        ('clf', LinearSVC()),
        # ('clf', NuSVC(kernel='rbf', nu=0.15, coef0=0.7)),
        # ('clf', SVC()),
    ])

    # Fit on the raw (untransformed) training texts; the pipeline vectorizes.
    text_clf.fit(train_raw, y_train)
    predicted = text_clf.predict(tweets)
    return predicted
# Ejemplo n.º 6
# 0
# Make the project root importable. Assumes `foo_dir`, `sys`, and `os` are
# defined/imported earlier in this file — TODO confirm.
sys.path.append(
    os.path.normpath(os.path.join(foo_dir, '../TextCleaning', '..')))
import pandas as pd
import DataGathering.trainingdata as td
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn import linear_model
from sklearn import datasets
from sklearn import metrics

# Module-level script: builds the tweet training/validation data (80/20 split)
# as a side effect of import.
training = td.TrainingData()
training.set_tweet_training_data()
trainingReviews, trainingRatings, validationReviews, validationRatings, train_raw, val_raw = training.get_training_validation_data(
    0.8)
# Map tweet sentiment labels onto the Amazon rating scale used elsewhere.
trainingRatings = training.tweets_to_amazon_ratings(trainingRatings)
validationRatings = training.tweets_to_amazon_ratings(validationRatings)
#importing the dataset
#dataset = pd.read_csv('/Users/yunuskocyigit/Desktop/KTH/Big Data in Media Technology/projectfinal/tset1_20k_Trump.csv', names=['liked','txt'])
#x = dataset.iloc[:, 1].values
#y = dataset.iloc[:, 1].values
# NOTE(review): hardcoded absolute path to one developer's machine — this will
# fail anywhere else; consider a config/env variable instead.
df = pd.read_csv(
    '/Users/yunuskocyigit/Desktop/KTH/Big Data in Media Technology/Big-Data-Final/DataGathering/tweets_GroundTruth.txt',
    sep='\t',
    names=['liked', 'txt'])
# NOTE(review): df.head() at module level has no visible effect outside a
# REPL/notebook — its return value is discarded.
df.head()