def train_clustering(n_topics=100, method='gmm'):
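    """Train or reload a clustering model over word2vec vectors of the extra status data."""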

    ### Load extra data
    dataloader = csv_dataloader(datafile='data/extra_statuses.csv')
    CACHE_FILE = 'output/extra_cache.pk'
    if not os.path.exists(CACHE_FILE):
        dataloader.read_csv(applyfun=preprocess, verbose=True)
        dataloader.save(CACHE_FILE)
    else:
        dataloader.load(CACHE_FILE)
    dataloader.summary()
    tokens = sum(dataloader.data.viewvalues(), [])
    print '#Tokens from training data: ' + str(len(tokens))
    print 'Read in finished'

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'
    ### Convert words to vectors
    train_vectors = word2vec.batch_convert(tokens)
    print '#Vectors from training data: ' + str(len(train_vectors))
    save(train_vectors, 'output/extra_vectors.pk')

    ### Train clustering model
    model_file = 'output/clustering_' + method + '_' + str(n_topics) + '.pk'
    clusters = get_clusters(method=method, n_topics=n_topics)
    if not os.path.exists(model_file):
        print 'Training Clusters...'
        clusters.fit(train_vectors)
        clusters.save(model_file)
        clusters.summary()
    else:
        clusters.load(model_file)
        print 'Cluster Model Loaded...'
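
# Example usage (hypothetical): train_clustering(n_topics=100, method='gmm')
# writes the fitted model to output/clustering_gmm_100.pk, which draw_gmm()
# below then reloads; main() expects a 25-topic model trained the same way.
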
def draw_word2vec():
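    """Project word2vec vectors of the most frequent tokens to 2-D with MDS and plot them."""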
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load("output/data_cache.pk")
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model="data/GoogleNews-vectors-negative300.bin", binary=True, size=300)
    print "Pretrained word2vec loaded"

    all_tokens = sum(dataloader.data.viewvalues(), [])
    print "#Tokens: " + str(len(all_tokens))
    fdist = FreqDist(all_tokens)
    # most_common() is frequency-sorted on every NLTK version (fdist.keys()
    # lost its ordering in NLTK 3); skip the single most frequent token.
    tokens = [w for w, _ in fdist.most_common(500)[1:]]
    print tokens
    tokens_has_vectors = []
    for token in tokens:
        if word2vec[token] is not None:  # wrapper returns None for OOV tokens
            tokens_has_vectors.append(token)

    print "#Unique Tokens \w Vectors: " + str(len(tokens_has_vectors))
    vectors = word2vec.encode(tokens_has_vectors)
    print "#Unique Vectors: " + str(len(vectors))

    print ("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=2000)
    # clf = manifold.Isomap(n_components=2, max_iter=100)
    vectors_mds = clf.fit_transform(vectors)
    print ("Done. Stress: %f" % clf.stress_)
    plot_embedding(vectors_mds, tokens_has_vectors, "MDS embedding of the words")
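
# A minimal stand-in for the plot_embedding helper used above, in case it is
# not available locally (hypothetical sketch; the project's own plot_embedding
# may render differently). Assumes matplotlib and a 2-D coordinate array.
def plot_embedding_sketch(X, labels, title):
    import matplotlib.pyplot as plt
    X = (X - X.min(0)) / (X.max(0) - X.min(0))  # rescale each axis to [0, 1]
    plt.figure(figsize=(12, 12))
    for i, label in enumerate(labels):
        plt.text(X[i, 0], X[i, 1], label, fontsize=8)
    plt.title(title)
    plt.show()
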
def draw_gmm():
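    """Print the ten vocabulary words nearest to each component mean of a trained GMM."""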
    ### Load data
    dataloader = csv_dataloader()
    dataloader.load('output/data_cache.pk')
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'

    ### Reverse engineering: build a vector-to-word mapping
    # vec2word = []
    # words = []
    # for voc, obj in word2vec.model.vocab.iteritems():
    #     words.append(voc)
    #     vec2word.append(word2vec.model.syn0[obj.index])

    all_vectors = word2vec.model.syn0

    ### Load the trained clustering model (must match train_clustering's output)
    n_topics = 100
    model_file = 'output/clustering_gmm_' + str(n_topics) + '.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)

    clusters.load(model_file)
    print clusters.clusters.means_

    knn = NearestNeighbors(n_neighbors=10)
    knn.fit(all_vectors)
    save(knn, 'output/draw_gmm_knn.pk')
    nns = knn.kneighbors(clusters.clusters.means_, return_distance=False)
    for i in range(np.shape(nns)[0]):
        print 'Topic ' + str(i+1) + ': ',
        for j in range(np.shape(nns)[1]):
            print str(word2vec.model.index2word[nns[i,j]]) + ' ',
        print ''
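
# Running draw_gmm() prints, for each GMM component, the ten vocabulary words
# whose vectors lie nearest that component's mean: a rough label for the topic.
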
def main(n_fold=10):
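    """Run n-fold cross-validation of a linear SVM over word2vec-cluster and LIWC features."""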
    ### Load data
    dataloader = csv_dataloader(extrafile='data/fixed_train_gender_class.csv', extra=True)
    if not os.path.exists('output/data_cache.pk'):
        dataloader.read_csv(applyfun=preprocess)
        dataloader.save('output/data_cache.pk')
    else:
        dataloader.load('output/data_cache.pk')
    dataloader.summary()
    print "Read in finished"

    ### Load pre-trained word2vec model
    word2vec = get_word2vec(model='data/GoogleNews-vectors-negative300.bin', binary=True, size=300)
    print 'Pretrained word2vec loaded'

    ### Train or load clustering model for bag-of-topics features
    n_topics = 25
    model_file = 'output/clustering_gmm_' + str(n_topics) + '.pk'
    clusters = get_clusters(method='gmm', n_topics=n_topics)
    if not os.path.exists(model_file):
        ### Convert word to vector
        tokens = sum(dataloader.data.viewvalues(), [])
        print '#Tokens from training data: ' + str(len(tokens))
        train_vectors = word2vec.encode(tokens)
        print '#Vectors from training data: ' + str(len(train_vectors))
        print 'Training Clusters...'
        clusters.fit(train_vectors)
        clusters.save(model_file)
        clusters.summary()
    else:
        clusters.load(model_file)
        print 'Cluster Model Loaded...'

    ### Load LIWC lexicon (passed to encode_feature below)
    LIWC = get_LIWC()
    #print LIWC.calculate_hist(tokens, normalize=False)

    ### ============================================================
    ###                         n fold
    ### ============================================================

    nfolds = dataloader.nfold(n_fold)
    fscores = []
    models = []

    for fold_ind in range(n_fold):

        print '======================== FOLD ' + str(fold_ind+1) + ' ========================'

        test_id = nfolds[fold_ind]
        train_id = []
        for i in range(n_fold):
            if i != fold_ind:
                train_id += nfolds[i]


        ### ============================================================
        ###                         Train Part
        ### ============================================================
        print 'Training>>>>>>>>>>>>>>>>>>>>>>>>>'

        train_data = dataloader.data_retrieve(train_id)

        ### Balance Train Data
        _, train_pos_id, train_neg_id = dataloader.balance(train_id, K=2)

        encode_pos = encode_feature(train_data, train_pos_id, [word2vec, clusters, LIWC])
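        # SMOTE here is assumed to take (samples, oversampling percent, k nearest
        # neighbours), as in the classic SMOTE formulation: 200 adds roughly two
        # synthetic samples per original positive sample.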
        encode_pos = SMOTE(encode_pos, 200, len(train_pos_id)/4)
        label_pos = np.ones(len(encode_pos))

        encode_neg = encode_feature(train_data, train_neg_id, [word2vec, clusters, LIWC])
        label_neg = np.zeros(len(encode_neg))

        encode = np.concatenate((encode_pos, encode_neg), axis=0)
        label = np.concatenate((label_pos, label_neg), axis=0)
        print encode.shape
        print label.shape

        encode = preprocessing.scale(encode)
        classifier = svm.LinearSVC(verbose=True)
        #weight = label+1 # pos:neg = 2:1 for imbalanced training
        classifier.fit(encode, label)
        print 'Train F1 score: ' + str(f1_score(label, classifier.predict(encode)))


        ### ============================================================
        ###                         Test Part
        ### ============================================================
        print 'Testing>>>>>>>>>>>>>>>>>>>>>>>>>'

        test_data = dataloader.data_retrieve(test_id)
        encode = encode_feature(test_data, test_id, [word2vec, clusters, LIWC])
        label = dataloader.label_retrieve(test_id)

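        # NOTE: scale() standardises the test fold independently of the training
        # fold; fitting a scaler on the training fold and reusing it here would
        # avoid this mild train/test inconsistency.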
        encode = preprocessing.scale(encode)
        pred = classifier.predict(encode)
        fscore = f1_score(label, pred)
        print 'Test F1 score: ' + str(fscore)
        fscores.append(fscore)
        models.append(classifier)

    print 'MEAN F1 score: ' + str(np.mean(fscores))
    print 'BEST F1 score: ' + str(np.max(fscores)) + ' by Model ' + str(np.argmax(fscores)+1)
    print 'VAR F1 score: ' + str(np.var(fscores))

    save(models[np.argmax(fscores)], 'output/model_clustering_' + str(np.max(fscores)) + '.pk')
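

# Hypothetical entry point (not in the original script): run the full n-fold
# pipeline by default; uncomment the other calls to retrain the clustering
# model or to inspect the embeddings and topics first.
if __name__ == '__main__':
    # train_clustering(n_topics=25, method='gmm')
    # draw_word2vec()
    # draw_gmm()
    main(n_fold=10)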