def plot_each_review_dimension(vectorized_data, bias=0.1):
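    """Average the negative and positive review vectors per dimension,
    compute their difference vector (with the given bias) and plot every
    dimension, to spot the ones that separate the two classes."""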
    logging.info('negative vectors in vectorized[train_neg_v]: ' +
                 str(len(vectorized_data['train_neg_v'])))
    logging.info('positive vectors in vectorized[train_pos_v]: ' +
                 str(len(vectorized_data['train_pos_v'])))

    ############# plot each dimension to find the significant dimensions #########
    avg_v_neg = vec.avg_vectors(vectorized_data['train_neg_v'])
    avg_v_pos = vec.avg_vectors(vectorized_data['train_pos_v'])

    # calculate a difference vector for all averaged neg and pos vectors
    diff_v = vec.diff(avg_v_neg, avg_v_pos, bias=bias)
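    # assumption: vec.diff yields a per-dimension difference of the two
    # averaged vectors, and bias is the minimum distance for a dimension
    # to count as significant (e.g. with bias=0.1 a dimension whose
    # averages differ by only 0.05 would be ignored)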

    # diff_v = normalize(diff_v)
    avg = [avg_v_neg, avg_v_pos]
    vis.plot_each_dim(neg_v=vectorized_data['train_neg_v'],
                      pos_v=vectorized_data['train_pos_v'],
                      avgs=avg,
                      used_bias=bias,
                      diff=diff_v,
                      filename='feats')


def use_word2vec_with_wordlists():
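    """Vectorize sentiment word lists with a trained word2vec model, plot the
    per-dimension statistics, train a LinearSVC on all dimensions and on the
    extracted significant ones, and plot the 2D clusters."""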
    # general testing parameters for the word2vec plotting
    words_to_load = 2000
    # minimum difference between the averaged neg and pos word vectors
    bias = 0.4
    # t-SNE related parameters
    perplexity = 150
    learning_rate = 1000
    # dimensionality reduction method: 'tsne' or 'pca'
    reduction_methode = 'pca'
    # filter only the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    neg_v = []
    pos_v = []

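    # load the trained word2vec model and keep only its KeyedVectors;
    # deleting the full model frees the training-specific state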
    model = Word2Vec.load('./w2v_model/300_dimensions/word_tokenized/own.d2v')
    mod = model.wv
    del model

    # alternatively, load the pretrained GoogleNews vectors:
    # mod = gensim.models.KeyedVectors.load_word2vec_format(
    #     './w2v_model/GoogleNews-vectors-negative300.bin', binary=True)

    test_words = {}
    test_words['neg'], test_words['pos'] = data.load_neg_pos_wordlist(
        num_of_words=words_to_load)

    # look up each word's vector; skip out-of-vocabulary words
    for word in test_words['neg']:
        try:
            neg_v.append(mod[word])
        except KeyError:
            continue

    for word in test_words['pos']:
        try:
            pos_v.append(mod[word])
        except KeyError:
            continue

    # average all neg and pos word vectors per dimension
    avg_neg = vec.avg_vectors(neg_v)
    avg_pos = vec.avg_vectors(pos_v)
    avgs = [avg_neg, avg_pos]
    difference = vec.diff(avg_neg, avg_pos, bias=bias)

    # plot each dimension of our words, the averages and the difference
    vis.plot_each_dim(neg_v=neg_v,
                      pos_v=pos_v,
                      avgs=avgs,
                      used_bias=bias,
                      diff=difference,
                      filename='words')

    ############## plot most informative dimensions ##############
    #plot_sentiment_distribution(neg_v=neg_v, pos_v=pos_v, source='words')

    # extract the significant dimensions of our word vectors according to the
    # defined bias
    if extract_dim:
        relevant_indexes = vec.extraxt_rel_indexes(difference)
        extracted_neg_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in neg_v
        ]
        extracted_pos_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in pos_v
        ]
    else:
        extracted_neg_wordvectors = neg_v
        extracted_pos_wordvectors = pos_v

    # try to classify the words:
    # first with all dimensions, later with only the most significant ones
    neg_labels = [c.NEGATIVE] * len(neg_v)
    pos_labels = [c.POSITIVE] * len(pos_v)

    # split the data into training and test sets (shuffled)
    x_train, x_test, y_train, y_test = train_test_split(neg_v + pos_v,
                                                        neg_labels +
                                                        pos_labels,
                                                        test_size=0.25,
                                                        random_state=42)

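    # baseline: linear SVM on the full 300-dimensional word vectors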
    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with all dimensions: ' + str(acc))

    # split the reduced data into training and test sets (shuffled)
    x_train, x_test, y_train, y_test = train_test_split(
        extracted_neg_wordvectors + extracted_pos_wordvectors,
        neg_labels + pos_labels,
        test_size=0.25,
        random_state=42)

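    # same classifier on the reduced vectors: if the accuracy stays
    # comparable, the extracted dimensions carry most of the sentiment signal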
    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with extracted dimensions: ' + str(acc))

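    # finally, reduce the (optionally extracted) vectors to 2D and plot the
    # neg/pos clusters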
    shrink_dim_and_plot_2d_clusters(neg_v=extracted_neg_wordvectors,
                                    pos_v=extracted_pos_wordvectors,
                                    reduction_methode=reduction_methode,
                                    bias=bias,
                                    perplexity=perplexity,
                                    learning_rate=learning_rate,
                                    normalize=normalize,
                                    extract_dim=extract_dim,
                                    truncate_by_svd=truncate_by_svd,
                                    source='word')
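

# minimal usage sketch (assumption: the helper modules vec, vis, data, the
# constants module c and the model file exist as referenced above)
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s : %(levelname)s : %(message)s')
    use_word2vec_with_wordlists()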