def TextToCodeByList(data, modelName, lenght, maxQLen=20, featuresLen = 50):
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName +  '.mdl')
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')

    zero_word = np.zeros(50)
    samples = []

    for j in range(lenght):
        words = []
        words1 = data.iloc[j, 0].split(' ')
        words2 = data.iloc[j, 1].split(' ')

        l1 = len(words1)
        if(l1>maxQLen):
            l1 = maxQLen
        l2 = len(words2)
        if(l2>maxQLen):
            l2 = maxQLen

        # print words1
        # print words2

        # lenght - 1 because last word is empty ''. i don't know why
        for time in xrange(l1 - 1):
            w = words1[time]
            if (w in word2vec_model.wv.vocab):
                temp = (word2vec_model.wv[w] + abs(cmin)) / module
                words.append(temp)
            else:
                words.append(zero_word)

        for time in xrange(l2 - 1):
            w = words2[time]
            if (w in word2vec_model.wv.vocab):
                temp = (word2vec_model.wv[w] + abs(cmin)) / module
                words.append(temp)
            else:
                words.append(zero_word)

        samples.append(words)

        if (j % 10000 == 0):
            print j

    # res_data = np.asarray(samples, dtype=np.float16)
    res_data = np.reshape(samples, (lenght, 40, 50))
    print res_data.shape
    exit()

    return res_data
def Test(byWeights):
    """Predict one slice of the test set and persist the partial labels.

    Loads the running partial-prediction vector from disk, restores the
    model (from raw weights when `byWeights` is truthy, otherwise from a
    full serialized model), predicts the slice [step*part, step*(part+1))
    of the test data, writes those predictions back into the vector and
    saves it again.
    """
    total_rows = 2345796

    labels_path = experimentName + '/part_y_test' + '.dat'
    y_test = pickle_file_manager.LoadFromObject(labels_path)

    # Small warm-up slice: PrepareData output is needed to build the model.
    head_data = Load_and_Prefiltred_Data('test.csv', 0, 100)
    X, Y = PrepareData(head_data, 'test.csv')

    if (byWeights):
        model = GetModel(mode='load_W', filename=weightsName, X=X, Y=Y)
    else:
        model = GetModel(mode='load_model', filename=modelName, X=X, Y=Y)

    full_sourceData = Load_and_Prefiltred_Data('test.csv', 0, total_rows)

    # step = 195483
    step = 390966
    num_parts = 6
    part_index = 1  # only this part is processed; the loop is disabled
    # for part_index in range(num_parts):
    print("")
    start = step * part_index
    end = start + step

    X, Y = PrepareData(full_sourceData[start:end], 'test.csv')
    predictions = model.predict(X, verbose=1, batch_size=64)

    y_test[start:end] = predictions[:, 0]
    pickle_file_manager.SaveToObject(y_test, labels_path)

    print((part_index + 1), "/", num_parts, "  part predicted...")
def TextToSCodeNorm(data, modelName, lenght, maxQLen=5, featuresLen=50):
    res_data = np.zeros((lenght, 2 * maxQLen, featuresLen), dtype=np.float16)
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName +
                                                '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName + '.mdl')
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')

    for j in range(lenght):
        res_row = res_data[j]
        words1 = data.iloc[j, 0]
        words2 = data.iloc[j, 1]

        # keys_1 = keywords(words1, ratio=0.9)
        # keys_2 = keywords(words1, ratio=0.9)
        words1 = words1 + ' ' + words2
        words2 = words2 + ' ' + words2 + ' ' + words2

        print words1
        print words2

        summary_1 = summarize(words1, split=True)
        summary_2 = summarize(words2, split=True)

        print summary_1
        print summary_2
        exit()

        for w in summary_1:
            if (w in word2vec_model.wv.vocab):
                temp = (word2vec_model.wv[w] + abs(cmin)) / module
                res_row[time, :] = temp

        for w in summary_2:
            if (w in word2vec_model.wv.vocab):
                temp = (word2vec_model.wv[w] + abs(cmin)) / module
                res_row[time + maxQLen, :] = temp

        if (j % 10000 == 0):
            print j

    # pickle_file_manager.SaveToObject()
    return res_data
def TextToCodeNorm(data, modelName, lenght, maxQLen=20, featuresLen = 50):

    res_data = np.zeros((lenght, 2 * maxQLen, featuresLen), dtype=np.float16)
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName +  '.mdl')
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')


    for j in range(lenght):
        res_row = res_data[j]
        words1 = data.iloc[j, 0].split(' ')
        words2 = data.iloc[j, 1].split(' ')

        l1 = len(words1)
        if(l1>maxQLen):
            l1 = maxQLen
        l2 = len(words2)
        if(l2>maxQLen):
            l2 = maxQLen

        # print words1
        # print words2

        # lenght - 1 because last word is empty ''. i don't know why
        already_uses = []
        for time in xrange(l1 - 1):
            w = words1[time]
            if (w in word2vec_model.wv.vocab):
                temp = (word2vec_model.wv[w] + abs(cmin)) / module
                res_row[time, :] = temp

        for time in xrange(l2 - 1):
            w = words2[time]
            if (w in word2vec_model.wv.vocab):
                temp = (word2vec_model.wv[w] + abs(cmin)) / module
                res_row[time + maxQLen, :] = temp

        if (j % 10000 == 0):
            print j

    return res_data
def Get_word_statistics():
    sentences = pickle_file_manager.LoadFromObject('Quora_sentences.dat')
    print 'Data load...'
    lenght = len(sentences)
    print lenght
    sizes = np.zeros(lenght, dtype=np.int32)
    for ii in range(lenght):
        sizes[ii] = len(sentences[ii])

    mean = np.mean(sizes)
    min = np.min(sizes)
    max = np.max(sizes)

    print 'MEAN words count: ', mean
    print 'MIN words count: ', min
    print 'MAX words count: ', max

    unique_sizes = np.bincount(sizes)
    ui = np.nonzero(unique_sizes)[0]
    res = zip(ui, unique_sizes[ui])
    print res
    exit()
def TextToCode(data, modelName, lenght, groupLen=6, codeLen=50, maxQLen=60):
    """Encode each question pair as aligned high/low-similarity word groups.

    For every row of `data` (two question strings in columns 0 and 1), greedily
    picks up to `groupLen` word pairs with word2vec similarity > 0.8 and up to
    `groupLen` pairs with similarity in (0, 0.2), each word used at most once.
    The paired words' normalized vectors ((vec + |cmin|) / module) are written
    into the result; unfilled slots stay zero (array is zero-initialized).

    Returns a float16 array of shape (lenght, 2, 2 * groupLen, codeLen):
    axis 1 selects question 1 vs question 2; along axis 2, slots
    [0, groupLen) hold high-similarity pairs and [groupLen, 2*groupLen)
    hold low-similarity pairs.
    """

    res_data = np.zeros((lenght, 2, 2 * groupLen, codeLen), dtype=np.float16)

    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')
    # Normalization constants saved alongside the model by Normalization().
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName +  '.mdl')

    for j in range(lenght):
        res_row = res_data[j]
        words1 = data.iloc[j, 0].split(' ')
        words2 = data.iloc[j, 1].split(' ')

        # Clamp both questions to at most maxQLen words.
        l1 = len(words1)
        if(l1>maxQLen):
            l1 = maxQLen
        l2 = len(words2)
        if(l2>maxQLen):
            l2 = maxQLen

        # print words1
        # print words2

        # NOTE: 'hight' (sic) — kept as in the rest of this function.
        hight_similarity_count = 0
        low_similarity_count = 0

        # `- 1` because the last split item is an empty string ''
        # (questions appear to end with a separator).
        already_uses = []  # words already consumed by some pair
        for index_w1 in xrange(l1 - 1):
            w1 = words1[index_w1]
            for index_w2 in xrange(l2 - 1):
                w2 = words2[index_w2]
                # print 'merge: ', w1, " + ", w2
                # Similarity defaults to 0 when either word is OOV, which
                # also excludes the pair from the (0, 0.2) low band below.
                local_similarity = 0.
                if (w1 in word2vec_model.wv.vocab and w2 in word2vec_model.wv.vocab):
                    local_similarity = word2vec_model.wv.similarity(w1, w2)

                # High-similarity pair: store in slots [0, groupLen).
                if (hight_similarity_count < groupLen and local_similarity > 0.8):
                    if (w1 not in already_uses and w2 not in already_uses):
                        s1 = (word2vec_model.wv[w1] + abs(cmin)) / module
                        s2 = (word2vec_model.wv[w2] + abs(cmin)) / module
                        res_row[0, hight_similarity_count, :] = s1
                        res_row[1, hight_similarity_count, :] = s2

                        already_uses.append(w1)
                        already_uses.append(w2)

                        # print 'H  ', w1
                        # print 'H  ', w2
                        # print res_row[0:100, hight_similarity_count]
                        # print res_row[100:200, hight_similarity_count]

                        hight_similarity_count += 1
                        # Advance to the next w1 after a high match; a
                        # low match (below) keeps scanning instead.
                        break

                # Low-similarity pair: store in slots [groupLen, 2*groupLen).
                if (low_similarity_count < groupLen and local_similarity > 0 and local_similarity < 0.2):
                    if (w1 not in already_uses and w2 not in already_uses):
                        s1 = (word2vec_model.wv[w1] + abs(cmin)) / module
                        s2 = (word2vec_model.wv[w2] + abs(cmin)) / module

                        res_row[0, groupLen + low_similarity_count, :] = s1
                        res_row[1, groupLen + low_similarity_count, :] = s2

                        already_uses.append(w1)
                        already_uses.append(w2)

                        # print 'L  ', w1
                        # print 'L  ', w2
                        low_similarity_count += 1

            # Stop scanning this row once both groups are full.
            if (low_similarity_count >= groupLen and hight_similarity_count >= groupLen):
                break

        if (j % 10000 == 0):
            print j

    return res_data
# sLength = 404290
# sLength = 10000
# source = pandas.read_csv('train.csv').head(n=sLength)
# source = textColumnsToLowcase(source)
# source = specSymbolReplacer(source)
# source = source[['question1', 'question2']].dropna()
# q1_list = source['question1'].str.split().values.tolist()
# q2_list = source['question2'].str.split().values.tolist()
#
# sentences = q1_list + q2_list
# print 'Data prepeared...'

# pickle_file_manager.SaveToObject(sentences, 'Quora_sentences.dat')
# print 'Data saved...'

# Script entry: train a 50-dimensional word2vec model on the pickled Quora
# sentences, normalize it, and save it to disk.
sentences = pickle_file_manager.LoadFromObject('Quora_sentences.dat')
print 'Data load...'
# Get_Statisctic(sentences)

# NOTE(review): window=14 is unusually wide for word2vec — confirm intended.
model = word2vec.Word2Vec(sentences,
                          size=50,
                          window=14,
                          min_count=5,
                          workers=4)
# Presumably produces the 'NModule_*'/'NMin_*' scaling constants the
# TextToCode* functions load — TODO confirm against Normalization().
Normalization(model)

# model = word2vec.Word2Vec.load('word2vec_quora_set_model.mdl')
# print 'word2vec_model loaded'

model.save('W2V_Model_Quora_50.mdl')
# Sanity check: dump the raw vector for a common word.
print '<computer> code: ', model.wv['why']