def plot_bias():

    # values obtained by raising the bias threshold
    # biases = np.array([0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0.009, 0.008, 0.007, 0.006, 0.005, 0.004, 0.003, 0.002])
    # extracted_dim = np.array([2., 3., 5., 10., 22., 40., 69., 106., 165., 242., 250., 255., 257., 268., 273., 278., 286., 291.])
    # acc = np.array([0.6745, 0.69683333, 0.69625, 0.73083333, 0.77433333, 0.79225, 0.79941667, 0.81966667, 0.83083333, 0.84666667,0.84733333, 0.84858333, 0.8475, 0.85083333, 0.8515, 0.8523333, 0.85308333, 0.85591667])
    biases = np.array([
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01,
        0.011, 0.012, 0.013
    ])
    extracted_dim = np.array(
        [255., 205., 167., 131., 108., 80., 60., 45., 28., 21., 12., 9., 6.])
    acc = np.array([
        0.8435, 0.83783333, 0.835, 0.82833333, 0.82116667, 0.81658333,
        0.80641667, 0.79791667, 0.78483333, 0.769, 0.73633333, 0.72758333,
        0.6945
    ])
    # convert the accuracy fractions to percentages
    acc = [elem * 100 for elem in acc]

    vis.plot_acc_for_bias(biases=biases, dimensions=extracted_dim, accs=acc)
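

# `vis.plot_acc_for_bias` is defined in the project's visualization module and
# is not shown in this listing. As a hedged illustration only, the sketch below
# shows what such a plot could look like: accuracy on the left axis, number of
# extracted dimensions on a twin right axis, both over the bias threshold. The
# function name `_sketch_plot_acc_for_bias` is hypothetical.
def _sketch_plot_acc_for_bias(biases, dimensions, accs):
    import matplotlib.pyplot as plt

    fig, ax_acc = plt.subplots()
    ax_acc.plot(biases, accs, 'b-o')
    ax_acc.set_xlabel('bias threshold')
    ax_acc.set_ylabel('accuracy [%]', color='b')

    # twin axis: how many dimensions survive each threshold
    ax_dim = ax_acc.twinx()
    ax_dim.plot(biases, dimensions, 'r-s')
    ax_dim.set_ylabel('extracted dimensions', color='r')

    fig.tight_layout()
    plt.show()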
    def predict(self, data_vectorized):
        target_names = ['negative', 'positive']
        #x_test_v_scaled = self.scaler.fit_transform(data_vectorized['x_test_v'])
        x_test_v_scaled = data_vectorized['x_test_v']
        start_time = time.time()
        self.prediction_liblinear = self.Classifier_liblinear.predict(
            x_test_v_scaled)
        self.time_prediction = (time.time() - start_time)
        logging.info("prediction finished - %6.2f seconds " %
                     self.time_prediction)

        # cross validation
        # logging.info("cross validation ... ")
        # start_time = time.time()
        # scores = cross_val_score(self.Classifier_liblinear,
        #                          data_vectorized['x_train_v'],
        #                          data_vectorized['y_train'],
        #                          cv=3, n_jobs=-1)
        #
        # logging.info("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        # logging.info("Cross-Validation finished- %6.2f seconds " % (time.time() - start_time))

        # print the results in a nice table for LinearSVC
        logging.info("Results for LinearSVC()")
        logging.info("Training time: %fs; Prediction time: %fs" %
                     (self.time_training, self.time_prediction))
        logging.info(
            classification_report(data_vectorized['y_test'],
                                  self.prediction_liblinear,
                                  target_names=target_names))

        # plot top features - only possible for a linear classifier with tf-idf
        try:
            plotter.plot_coefficients(
                self.Classifier_liblinear,
                data_vectorized['vectorizer'].get_feature_names(),
                fname=self.name)
        except Exception:
            logging.info('feature-plotting not possible')

        io.save_classifier(self.Classifier_liblinear)
def plot_each_review_dimension(vectorized_data, bias=0.1):
    logging.info('negative vectors in vectorized[train_neg_v]: ' +
                 str(len(vectorized_data['train_neg_v'])))
    logging.info('positive vectors in vectorized[train_pos_v]: ' +
                 str(len(vectorized_data['train_pos_v'])))

    ############# plot each dimension to find the significant dimensions #########
    avg_v_neg = vec.avg_vectors(vectorized_data['train_neg_v'])
    avg_v_pos = vec.avg_vectors(vectorized_data['train_pos_v'])

    # calculate a difference vector from the averaged neg and pos vectors
    diff_v = vec.diff(avg_v_neg, avg_v_pos, bias=bias)

    # diff_v = normalize(diff_v)
    avg = [avg_v_neg, avg_v_pos]
    vis.plot_each_dim(neg_v=vectorized_data['train_neg_v'],
                      pos_v=vectorized_data['train_pos_v'],
                      avgs=avg,
                      used_bias=bias,
                      diff=diff_v,
                      filename='feats')
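

# `vec.avg_vectors` and `vec.diff` come from the project's vector utilities,
# which are not part of this listing. Based purely on how they are used above,
# a plausible minimal sketch follows (an assumption, not the project's actual
# code): the average is the dimension-wise mean, and the difference keeps only
# the dimensions whose class averages are at least `bias` apart.
def _sketch_avg_vectors(vectors):
    import numpy as np
    return np.mean(np.asarray(vectors), axis=0)  # dimension-wise mean


def _sketch_diff(avg_neg, avg_pos, bias=0.1):
    import numpy as np
    delta = np.asarray(avg_neg) - np.asarray(avg_pos)
    delta[np.abs(delta) < bias] = 0.0  # drop dimensions below the bias
    return delta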
def plot_sentiment_distribution(neg_v, pos_v, source=None):
    pos_index_21 = []
    pos_index_119 = []
    neg_index_21 = []
    neg_index_119 = []
    # iterate with fresh names so the parameter lists are not shadowed
    for n_vec, p_vec in zip(neg_v, pos_v):
        pos_index_21.append(p_vec[21])
        pos_index_119.append(p_vec[119])
        neg_index_21.append(n_vec[21])
        neg_index_119.append(n_vec[119])

    # build 2-d points from the two hand-picked dimensions
    negative_reduced = [[v21, v119]
                        for v21, v119 in zip(neg_index_21, neg_index_119)]
    positive_reduced = [[v21, v119]
                        for v21, v119 in zip(pos_index_21, pos_index_119)]

    vis.plot_relevant_indexes(neg_index_21, neg_index_119, pos_index_21,
                              pos_index_119, source)
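

# `vis.plot_relevant_indexes` is another project-internal plotting helper. A
# minimal matplotlib scatter over the two hand-picked dimensions (21 and 119)
# might look like the sketch below; the function name is hypothetical.
def _sketch_plot_relevant_indexes(neg_21, neg_119, pos_21, pos_119, source=None):
    import matplotlib.pyplot as plt

    plt.scatter(neg_21, neg_119, c='red', marker='x', label='negative')
    plt.scatter(pos_21, pos_119, c='green', marker='o', label='positive')
    plt.xlabel('dimension 21')
    plt.ylabel('dimension 119')
    plt.legend()
    plt.title(source or 'sentiment distribution on two dimensions')
    plt.show()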
def shrink_dim_and_plot_2d_clusters(neg_v,
                                    pos_v,
                                    reduction_methode,
                                    bias=None,
                                    perplexity=None,
                                    learning_rate=None,
                                    normalize=True,
                                    extract_dim=None,
                                    truncate_by_svd=True,
                                    source='word or feat'):

    # take only the first n feats; they are shuffled, so taking the first 2000 avoids memory errors

    input_dimension = len(neg_v[0])
    logging.info('input dimensions before reduction: ' + str(input_dimension))
    if input_dimension == 2:
        calc_acc(neg_v, pos_v)
        # plot the 2-d clusters directly
        vis.plot_2d_clusters(
            v_neg_reduced=neg_v,
            v_pos_reduced=pos_v,
            filename='%s_%s_b_%s_len_%d_perpl_%s_learn_%s_filter_%s_norm_%s' %
            (source, reduction_methode, bias, len(neg_v) + len(pos_v),
             perplexity, learning_rate, extract_dim, normalize))

    else:

        # first reduce the dimensions to 50, then perform t-SNE or PCA
        if truncate_by_svd:
            try:
                start_time = time.time()

                # stack both classes, truncate, then split back by class size
                # (splitting at the half would be wrong for unequal classes)
                n_neg = len(neg_v)
                truncated = TruncatedSVD(
                    n_components=50,
                    random_state=0).fit_transform(np.vstack((neg_v, pos_v)))
                neg_v = truncated[:n_neg]
                pos_v = truncated[n_neg:]

                logging.info("dimension truncated with SVD - %6.2f seconds " %
                             (time.time() - start_time))
            except ValueError:
                logging.info('truncating not possible, input dimension <= 50')

        # reduce the dimension to 2 with t-SNE or PCA
        if reduction_methode == 'tsne':
            # data mixed before dimension reduction
            neg_v, pos_v = vec.reduce_with_TSNE_mixed(
                neg_v=neg_v,
                pos_v=pos_v,
                goal_dimensions=2,
                perplexity=perplexity,
                learning_rate=learning_rate)

            # negative and positive vectors reduced separately
            # neg_v_reduced, pos_v_reduced = reduce_with_TSNE(neg_v=neg_v, pos_v=pos_v, goal_dimensions=2)
        elif reduction_methode == 'pca':
            neg_v, pos_v = vec.reduce_with_PCA_mixed(neg_v=neg_v,
                                                     pos_v=pos_v,
                                                     goal_dimensions=2)

        # normalize the data; np.vstack is used because after the reduction
        # step neg_v and pos_v are numpy arrays, so `+` would add element-wise
        # instead of concatenating
        if normalize:
            scaler = preprocessing.StandardScaler().fit(np.vstack((neg_v, pos_v)))
            neg_v = scaler.transform(neg_v)
            pos_v = scaler.transform(pos_v)

        calc_acc(neg_v, pos_v)

        # plot the 2-d clusters after reduction
        vis.plot_2d_clusters(
            v_neg_reduced=neg_v,
            v_pos_reduced=pos_v,
            filename='%s_%s_b_%s_len_%d_perpl_%s_learn_%s_filter_%s_norm_%s' %
            (source, reduction_methode, bias, len(neg_v) + len(pos_v),
             perplexity, learning_rate, extract_dim, normalize))
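

# The truncate-then-embed pattern above (TruncatedSVD to ~50 components, then
# t-SNE to 2) follows the scikit-learn recommendation for t-SNE on sparse or
# high-dimensional data. A self-contained sketch on random data, with all
# parameter values illustrative:
def _sketch_svd_then_tsne():
    import numpy as np
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    rng = np.random.RandomState(0)
    X = rng.rand(200, 300)  # 200 samples, 300 dimensions

    X_50 = TruncatedSVD(n_components=50, random_state=0).fit_transform(X)
    X_2d = TSNE(n_components=2, perplexity=30, learning_rate=200,
                random_state=0).fit_transform(X_50)
    return X_2d  # shape (200, 2)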
def use_word2vec_with_wordlists():
    # define general testing parameters for word2vec plotting
    words_to_load = 2000
    # define the minimum difference between the averaged neg and pos word vectors
    bias = 0.4
    # t-SNE related params
    perplexity = 150
    learning_rate = 1000
    # reduce with t-SNE or PCA
    reduction_methode = 'pca'
    # filter the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    neg_v = []
    pos_v = []
    extracted_neg_wordvectors = []
    extracted_pos_wordvectors = []

    model = Word2Vec.load('./w2v_model/300_dimensions/word_tokenized/own.d2v')
    mod = model.wv
    del model

    #mod = gensim.models.KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin',binary=True )

    test_words = {}
    test_words['neg'], test_words['pos'] = data.load_neg_pos_wordlist(
        num_of_words=words_to_load)

    for word in test_words['neg']:
        try:
            neg_v.append(mod[word])
        except KeyError:
            # skip words that are not in the word2vec vocabulary
            continue

    for word in test_words['pos']:
        try:
            pos_v.append(mod[word])
        except KeyError:
            # skip words that are not in the word2vec vocabulary
            continue

    # average all neg and pos word vectors for each dimension
    avg_neg = vec.avg_vectors(neg_v)
    avg_pos = vec.avg_vectors(pos_v)
    avgs = [avg_neg, avg_pos]
    difference = vec.diff(avg_neg, avg_pos, bias=bias)

    # plot each dimensions of our words, the average and the difference
    vis.plot_each_dim(neg_v=neg_v,
                      pos_v=pos_v,
                      avgs=avgs,
                      used_bias=bias,
                      diff=difference,
                      filename='words')

    ############## plot most informative dimensions ##############
    #plot_sentiment_distribution(neg_v=neg_v, pos_v=pos_v, source='words')

    # extract the significant dimensions of our word vectors according to a defined bias
    if extract_dim:
        relevant_indexes = vec.extraxt_rel_indexes(difference)
        extracted_neg_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in neg_v
        ]
        extracted_pos_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in pos_v
        ]
    else:
        extracted_neg_wordvectors = neg_v
        extracted_pos_wordvectors = pos_v

    # try to classify the words,
    # first with all dimensions, later with only the most significant ones
    neg_labels = [c.NEGATIVE for _ in neg_v]
    pos_labels = [c.POSITIVE for _ in pos_v]

    # split data into testing and training set + shuffle
    x_train, x_test, y_train, y_test = train_test_split(neg_v + pos_v,
                                                        neg_labels +
                                                        pos_labels,
                                                        test_size=0.25,
                                                        random_state=42)

    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with all dimensions: ' + str(acc))

    # split data into testing and training set + shuffle
    x_train, x_test, y_train, y_test = train_test_split(
        extracted_neg_wordvectors + extracted_pos_wordvectors,
        neg_labels + pos_labels,
        test_size=0.25,
        random_state=42)

    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with extracted dimensions: ' + str(acc))

    shrink_dim_and_plot_2d_clusters(neg_v=extracted_neg_wordvectors,
                                    pos_v=extracted_pos_wordvectors,
                                    reduction_methode=reduction_methode,
                                    bias=bias,
                                    perplexity=perplexity,
                                    learning_rate=learning_rate,
                                    normalize=normalize,
                                    extract_dim=extract_dim,
                                    truncate_by_svd=truncate_by_svd,
                                    source='word')
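

# `vec.extraxt_rel_indexes` and `vec.extract_rel_dim_vec` are project helpers
# that are not shown in this listing. Judging only from their use above, a
# plausible sketch (an assumption, not the actual implementation): collect the
# indexes of the non-zero entries of the difference vector, then project every
# word vector onto those indexes.
def _sketch_extract_rel_indexes(difference):
    import numpy as np
    return np.nonzero(np.asarray(difference))[0]  # dims that survived the bias


def _sketch_extract_rel_dim_vec(vector, relevant_indexes):
    import numpy as np
    return np.asarray(vector)[relevant_indexes]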
    # for ind, i in enumerate(Cs):
    #     plt.plot(Tol, scores[ind], label='C: ' + str(i))
    # plt.legend()
    # plt.xlabel('Tol')
    # plt.ylabel('Mean score')
    # plt.show()

    import thesis.Visualization as plotter

    # plot the most informative features of the best pipeline
    features = grid_search.best_estimator_.named_steps['vect'].get_feature_names()
    logging.info(features[0])
    logging.info(len(features))
    clf = grid_search.best_estimator_.named_steps['clf']
    plotter.plot_coefficients(clf, features, fname='test')

    # show the best accuracy from the 4-fold cross-validation on the validation data
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    # print classification_report with the unseen testing data
    clf = grid_search.best_estimator_
    prediction = clf.predict(data['x_test'])
    target_names = ['negative', 'positive']
    print(classification_report(data['y_test'], prediction, target_names=target_names))

X = vectorized_data['x_train_v']  # assumed key; mirrors data_vectorized['x_train_v'] used above
y = vectorized_data['y_train']

print('grid')
C_range = np.logspace(-2, 2, 5)
gamma_range = np.logspace(-4, 2, 5)

# note: LinearSVC has no gamma parameter; the gamma_range grid is reused for tol
param_grid = dict(tol=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
grid = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=cv, verbose=1)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f" %
      (grid.best_params_, grid.best_score_))
print('grid done')

scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),
                                                     len(gamma_range))

# Draw a heatmap of the validation accuracy as a function of tol and C.
#
# The scores are encoded as colors with the hot colormap, which varies from
# dark red to bright yellow. As the most interesting scores are all located in
# the 0.82 to 0.85 range, we use a custom normalizer to set the mid-point to
# 0.82, making the small variations of score values in the interesting range
# easier to see without collapsing all the low scores to the same color.
plotter.plot_heatmap(scores=scores,
                     gamma_range=gamma_range,
                     C_range=C_range,
                     filename='linear_SVM')
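

# `plotter.plot_heatmap` is project-internal. The comment above describes a
# mid-point-shifted color scale; a minimal sketch using matplotlib's
# TwoSlopeNorm (matplotlib >= 3.2) follows. All names and figure details are
# illustrative, and the 0.82 mid-point assumes the scores straddle that value.
def _sketch_plot_heatmap(scores, gamma_range, C_range, filename='linear_SVM'):
    import matplotlib.pyplot as plt
    from matplotlib.colors import TwoSlopeNorm

    plt.figure(figsize=(8, 6))
    # mid-point at 0.82 so the 0.82-0.85 band gets most of the color range
    norm = TwoSlopeNorm(vmin=scores.min(), vcenter=0.82, vmax=scores.max())
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot, norm=norm)
    plt.xlabel('tol')
    plt.ylabel('C')
    plt.xticks(range(len(gamma_range)), ['%.0e' % g for g in gamma_range],
               rotation=45)
    plt.yticks(range(len(C_range)), C_range)
    plt.colorbar()
    plt.title('Validation accuracy')
    plt.savefig(filename + '.png')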