def test_LinearSVM():
    # test SVM with tfidf-vectorized data
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    data = Data_loader().get_data()
    vectorizer = vec.get_Vectorizer(vectorizer='tfidf')
    #vectorizer = vec.get_Vectorizer(vectorizer='word2vec')
    clf = LinearSVM()

    vectorized_data = vectorizer.vectorize(data=data)
    clf.classify(vectorized_data)
    clf.predict(vectorized_data)
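
# LinearSVM and NaiveBayes_sklearn come from the thesis package and are not
# shown in this snippet; the tests above only rely on their classify()/predict()
# interface. A minimal sketch of such a wrapper -- purely illustrative, with a
# hypothetical name, assuming the vectorized-data dict carries 'x_train_v',
# 'y_train', 'x_test_v' and 'y_test' as the code further below does:

from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC as _SkLinearSVC


class SketchSVMWrapper:
    def __init__(self):
        self.model = _SkLinearSVC()

    def classify(self, vectorized_data):
        # fit the SVM on the training split of the vectorized data
        self.model.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])

    def predict(self, vectorized_data):
        # score the fitted SVM on the test split and return the accuracy
        pred = self.model.predict(vectorized_data['x_test_v'])
        return accuracy_score(vectorized_data['y_test'], pred)
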
def test_NaiveBayes_sklearn():
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    # load data
    data = Data_loader().get_data()
    # create a vectorizer
    tfidf_vec = vec.get_Vectorizer(vectorizer='tfidf')
    # create a classifier
    clf = NaiveBayes_sklearn()
    # vectorize the data
    vectorized_data = tfidf_vec.vectorize(data=data)
    # train classifier
    clf.classify(vectorized_data)
    # inference with the classifier
    clf.predict(vectorized_data)
Example #3
    def run(self):
        self.vectorizer = v.get_Vectorizer(vectorizer=self.vectorizer,
                                           num_of_samples=self.num_of_samples,
                                           reduction_methode=self.red_method,
                                           w2v_dimension=self.w2v_dim)

        # dependency injection for the provided data
        data_vectorized = self.vectorizer.vectorize(
            self.data_loader.get_data())

        # reduce the dimensionality of the training and testing data with tsne
        # hardly worth it: accuracy only 50-60 %
        # data_vectorized['x_train_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_train_v'])
        # data_vectorized['x_test_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_test_v'])
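        # v.reduce_with_TSNE_single is not shown in this snippet; a minimal
        # sketch, assuming it simply wraps scikit-learn's TSNE with two
        # components (illustrative only):
        #
        #   from sklearn.manifold import TSNE
        #
        #   def reduce_with_TSNE_single(unreduced_data, perplexity=30, learning_rate=200):
        #       # embed the vectors into 2-D with t-SNE
        #       return TSNE(n_components=2, perplexity=perplexity,
        #                   learning_rate=learning_rate).fit_transform(unreduced_data)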

        self.classifier.classify(data_vectorized)

        self.classifier.predict(data_vectorized)
def use_word2vec_with_movie_reviews():
    clf = cls.LinearSVM()

    # samples per sentiment for cluster plotting
    samples = 10000

    # tsne related params
    perplexity = 80
    # filter the most significant dimensions

    #learning_rates = np.logspace(2, 3, 5)
    learning_rates = [1000]
    # how to reduce the dimensionality of the word vectors / document vectors
    reduction_methode = 'tsne'

    extract_dim = True
    normalize = True
    truncate_by_svd = True

    # bias for the difference of the averaged document vectors:
    # how large should the gap between the negative and positive features be?
    # biases = np.array([0.1,0.09,0.08,0.07,0.06,0.05,0.04,0.03,0.02, 0.01, 0.009, 0.008, 0.007,0.006])
    biases = np.array([0.09])
    accuracies = np.zeros(len(biases))
    extracted_dim = np.zeros(len(biases))

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)

    # cache the vectorized features for faster parameter research
    import thesis.IO_Organizer as saver
    feature_filename = 'w2v_google'
    try:
        logging.info('Try to load vectorized features')
        vectorized_data_full = saver.load_features('dict_' + feature_filename)
        logging.info('Features loaded from files')
    except Exception:
        logging.info('Feature-file not found, vectorize reviews')
        data = Data_loader().get_data()
        word2vec = vec.get_Vectorizer(vectorizer='word2vec')
        vectorized_data_full = word2vec.vectorize(data=data)
        saver.save_features(vectorized_data_full, feature_filename)
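
    # IO_Organizer's save_features/load_features are not shown in this snippet;
    # a pickle-based sketch of such helpers (illustrative only, the file-naming
    # scheme here is hypothetical):
    #
    #   import pickle
    #
    #   def save_features(features, name):
    #       with open('features_' + name + '.pkl', 'wb') as f:
    #           pickle.dump(features, f)
    #
    #   def load_features(name):
    #       with open('features_' + name + '.pkl', 'rb') as f:
    #           return pickle.load(f)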

    for learning_rate in learning_rates:
        for i, bias in enumerate(biases):
            logging.info(bias)
            # create a working copy
            vectorized_data = dict(vectorized_data_full)

            ############## plot most informative dimensions ##############
            #plot_sentiment_distribution(vectorized_data['train_neg_v'], vectorized_data['train_pos_v'], source='feats')

            # reduce the dim of our document vectors
            #vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            # plotting
            plot_each_review_dimension(vectorized_data=vectorized_data,
                                       bias=bias)

            # extract the most significant dim of our document vectors
            if extract_dim:
                vectorized_data = vec.transform_data(vectorized_data,
                                                     bias=bias)

            #### for testing purposes: shrink the whole data set to 2-D
            # process it in batches to avoid memory overflow
            batchsize = 4000
            reduced_to_2d = []
            for x in batch(vectorized_data['x_train_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['x_train_v'] = reduced_to_2d
            reduced_to_2d = []

            for x in batch(vectorized_data['x_test_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['x_test_v'] = reduced_to_2d
            reduced_to_2d = []

            for x in batch(vectorized_data['train_neg_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['train_neg_v'] = reduced_to_2d
            reduced_to_2d = []

            for x in batch(vectorized_data['train_pos_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['train_pos_v'] = reduced_to_2d
            reduced_to_2d = []

            ####

            shrink_dim_and_plot_2d_clusters(
                neg_v=vectorized_data['train_neg_v'],
                pos_v=vectorized_data['train_pos_v'],
                reduction_methode=reduction_methode,
                bias=bias,
                perplexity=perplexity,
                learning_rate=learning_rate,
                normalize=normalize,
                extract_dim=extract_dim,
                truncate_by_svd=truncate_by_svd,
                source='feat')

            # select num_of_samples randomly
            # we need to limit the number of samples, or we get a memory error
            # neg_samples_v = random.sample(vectorized_data['train_neg_v'], k=samples)
            # pos_samples_v = random.sample(vectorized_data['train_pos_v'], k=samples)

            # shrink_dim_and_plot_2d_clusters(neg_v= neg_samples_v,
            #                                            pos_v= pos_samples_v,
            #                                            reduction_methode= reduction_methode,
            #                                            bias= bias,
            #                                            perplexity= perplexity,
            #                                            learning_rate= learning_rate,
            #                                            normalize= normalize,
            #                                            extract_dim= extract_dim,
            #                                            truncate_by_svd= truncate_by_svd,
            #                                            source= 'feat')

            extr_dim = len(vectorized_data['x_train_v'][0])
            extracted_dim[i] = extr_dim

            #vectorized_data = vec.delete_relevant_dimensions(vectorized_data)

            ######## linear svm ################
            cl = cls.LinearSVM()
            cl.classify(vectorized_data)
            cl.predict(vectorized_data)

            cl = LinearSVC()
            cl.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            pred = cl.predict(vectorized_data['x_test_v'])
            acc = accuracy_score(y_true=vectorized_data['y_test'], y_pred=pred)
            logging.info('acc: ' + str(acc))
            accuracies[i] = acc
            del vectorized_data
            #
            #vis.plot_hyperplane(clf=cl, X=vectorized_data['x_train_v'], Y=vectorized_data['y_train'])

    #         ######### RandomForestClassifier #########
    #         target_names = ['negative', 'positive']
    #
    #         clf = RandomForestClassifier(n_jobs=2)
    #         clf.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
    #         prediction = clf.predict(vectorized_data['x_test_v'])
    #         logging.info(classification_report(vectorized_data['y_test'], prediction,
    #                                            target_names=target_names))
    #         ######## Logisticregression #############
    #         from sklearn.linear_model import LogisticRegression
    #         import pandas as pd
    #
    #         lr = LogisticRegression()
    #         lr.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
    #         prediction = lr.predict_proba(vectorized_data['x_test_v'])
    #
    #         logging.info('LR acc: ' + str(lr.score(vectorized_data['x_test_v'], vectorized_data['y_test'])))
    #
    #         metrics.accuracy_score(vectorized_data['y_test'], prediction)
    #
    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)
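
# The helpers batch() and shrink_dim_to_2d() used above are not shown in this
# snippet. A minimal sketch of what they might look like -- batch() as a plain
# slicing generator and shrink_dim_to_2d() projecting onto two principal
# components with PCA (the original may well use t-SNE instead); illustrative
# only:

from sklearn.decomposition import PCA


def batch(iterable, batchsize):
    # yield consecutive slices of at most `batchsize` elements
    for start in range(0, len(iterable), batchsize):
        yield iterable[start:start + batchsize]


def shrink_dim_to_2d(vectors):
    # reduce the document vectors to their first two principal components
    return PCA(n_components=2).fit_transform(vectors)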
Example #5
import numpy as np

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

import thesis.Data as d
import thesis.Vectorizer as vec
import thesis.my_logger
import thesis.Visualization as plotter

# tfidf
# data = d.Data_loader().get_data()
# tfidf_vec = vec.get_Vectorizer('tfidf')
# vectorized_data = tfidf_vec.vectorize(data=data)

# word2vec
data = d.Data_loader().get_data()
word2vec_vec = vec.get_Vectorizer('word2vec')
vectorized_data = word2vec_vec.vectorize(data=data)

X = vectorized_data['x_train_v']
y = vectorized_data['y_train']

print('running grid search over C and tol')
C_range = np.logspace(-2, 2, 5)
tol_range = np.logspace(-4, 2, 5)

param_grid = dict(tol=tol_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
grid = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=cv, verbose=1)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f" %