def TextToCodeByList(data, modelName, lenght, maxQLen=20, featuresLen=50):
    """Encode question pairs into a (lenght, 2*maxQLen, featuresLen) float16 array.

    Columns 0 and 1 of *data* (assumes a pandas DataFrame -- TODO confirm) hold
    two space-separated questions.  Every in-vocabulary word is mapped to its
    normalized word2vec vector, (vec + |cmin|) / module; question 1 fills slots
    [0, maxQLen) and question 2 fills slots [maxQLen, 2*maxQLen).  Missing or
    out-of-vocabulary words leave their slot as zeros.

    NOTE(review): the original collected ragged per-row lists and then called
    np.reshape(samples, (lenght, 40, 50)), which fails whenever a row does not
    contain exactly 40 vectors, and it called exit() before the return
    (leftover debugging).  Both defects are fixed; the output shape now honours
    maxQLen/featuresLen instead of the hard-coded 40/50, matching the sibling
    TextToCodeNorm layout.
    """
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName + '.mdl')
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')

    # Pre-allocated output: rows are zero-padded by construction, so no
    # per-row reshape is needed.
    res_data = np.zeros((lenght, 2 * maxQLen, featuresLen), dtype=np.float16)
    offset = abs(cmin)  # hoisted: loop-invariant

    for j in range(lenght):
        words1 = data.iloc[j, 0].split(' ')
        words2 = data.iloc[j, 1].split(' ')
        # The questions end with a space, so the last split element is the
        # empty string '' -- hence the -1.
        l1 = min(len(words1) - 1, maxQLen)
        l2 = min(len(words2) - 1, maxQLen)
        for t in range(l1):
            w = words1[t]
            if w in word2vec_model.wv.vocab:
                res_data[j, t, :] = (word2vec_model.wv[w] + offset) / module
        for t in range(l2):
            w = words2[t]
            if w in word2vec_model.wv.vocab:
                res_data[j, maxQLen + t, :] = (word2vec_model.wv[w] + offset) / module
        if j % 10000 == 0:
            print(j)
    return res_data
def Test(byWeights):
    """Predict one slice of the full test set and merge it into the cached
    y_test vector on disk.

    byWeights -- when truthy, the model is restored from its weights file;
    otherwise the fully serialized model is loaded.
    """
    total_rows = 2345796
    cache_path = experimentName + '/part_y_test' + '.dat'
    y_test = pickle_file_manager.LoadFromObject(cache_path)

    # A small warm-up sample is enough to build X/Y with the shapes the
    # model loader needs.
    warmup = Load_and_Prefiltred_Data('test.csv', 0, 100)
    X, Y = PrepareData(warmup, 'test.csv')
    if byWeights:
        model = GetModel(mode='load_W', filename=weightsName, X=X, Y=Y)
    else:
        model = GetModel(mode='load_model', filename=modelName, X=X, Y=Y)

    full_sourceData = Load_and_Prefiltred_Data('test.csv', 0, total_rows)
    step = 390966
    ss = 6
    m = 1  # only part index 1 of the ss parts is processed in this run
    print("")
    start = m * step
    end = start + step
    X, Y = PrepareData(full_sourceData[start:end], 'test.csv')
    predictions = model.predict(X, verbose=1, batch_size=64)
    y_test[start:end] = predictions[:, 0]
    pickle_file_manager.SaveToObject(y_test, cache_path)
    print((m + 1), "/", ss, " part predicted...")
def TextToSCodeNorm(data, modelName, lenght, maxQLen=5, featuresLen=50):
    """Encode question pairs via gensim extractive summaries into a
    (lenght, 2*maxQLen, featuresLen) float16 array.

    Each summary entry found in the word2vec vocabulary is mapped to its
    normalized vector, (vec + |cmin|) / module; question 1 fills slots
    [0, maxQLen) and question 2 fills slots [maxQLen, 2*maxQLen), truncated
    to maxQLen entries each.  Out-of-vocabulary entries leave zeros.

    NOTE(review): the original printed its inputs and called exit()
    (leftover debugging) and indexed res_row with an undefined name
    ``time``, so it could never complete -- both fixed here.  Note that
    summarize() returns sentences, not single words, so the vocabulary
    check may rarely match -- presumably experimental; verify intent.
    """
    res_data = np.zeros((lenght, 2 * maxQLen, featuresLen), dtype=np.float16)
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName + '.mdl')
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')
    offset = abs(cmin)  # hoisted: loop-invariant
    for j in range(lenght):
        res_row = res_data[j]
        words1 = data.iloc[j, 0]
        words2 = data.iloc[j, 1]
        # summarize() needs multiple sentences of material; the questions
        # are concatenated/repeated to give it enough input -- presumably a
        # workaround, verify against gensim's minimum-input requirement.
        words1 = words1 + ' ' + words2
        words2 = words2 + ' ' + words2 + ' ' + words2
        summary_1 = summarize(words1, split=True)
        summary_2 = summarize(words2, split=True)
        for idx, w in enumerate(summary_1[:maxQLen]):
            if w in word2vec_model.wv.vocab:
                res_row[idx, :] = (word2vec_model.wv[w] + offset) / module
        for idx, w in enumerate(summary_2[:maxQLen]):
            if w in word2vec_model.wv.vocab:
                res_row[maxQLen + idx, :] = (word2vec_model.wv[w] + offset) / module
        if j % 10000 == 0:
            print(j)
    return res_data
def TextToCodeNorm(data, modelName, lenght, maxQLen=20, featuresLen=50):
    """Encode question pairs into a (lenght, 2*maxQLen, featuresLen) float16 array.

    Columns 0 and 1 of *data* (assumes a pandas DataFrame -- TODO confirm)
    hold two space-separated questions.  Each in-vocabulary word is mapped
    to its normalized word2vec vector, (vec + |cmin|) / module; question 1
    fills slots [0, maxQLen) and question 2 fills slots [maxQLen, 2*maxQLen).
    Out-of-vocabulary words leave their slot as zeros.

    NOTE(review): removed the unused local ``already_uses`` and hoisted the
    loop-invariant abs(cmin); behavior is otherwise unchanged.
    """
    res_data = np.zeros((lenght, 2 * maxQLen, featuresLen), dtype=np.float16)
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName + '.mdl')
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')
    offset = abs(cmin)
    for j in range(lenght):
        res_row = res_data[j]
        words1 = data.iloc[j, 0].split(' ')
        words2 = data.iloc[j, 1].split(' ')
        l1 = min(len(words1), maxQLen)
        l2 = min(len(words2), maxQLen)
        # The trailing space makes the last split element '', hence the -1
        # bounds below (matches the original's "last word is empty" note).
        for t in range(l1 - 1):
            w = words1[t]
            if w in word2vec_model.wv.vocab:
                res_row[t, :] = (word2vec_model.wv[w] + offset) / module
        for t in range(l2 - 1):
            w = words2[t]
            if w in word2vec_model.wv.vocab:
                res_row[maxQLen + t, :] = (word2vec_model.wv[w] + offset) / module
        if j % 10000 == 0:
            print(j)
    return res_data
def Get_word_statistics():
    """Print length statistics for the cached Quora sentence list and return
    (mean, min, max) word counts.

    Loads 'Quora_sentences.dat' (a list of tokenized sentences), prints the
    mean/min/max sentence length and a histogram of (word_count,
    num_sentences) pairs.

    NOTE(review): the original shadowed the builtins ``min``/``max`` and
    ended with exit(), which killed the interpreter instead of returning --
    both fixed; the function now returns its statistics.
    """
    sentences = pickle_file_manager.LoadFromObject('Quora_sentences.dat')
    print('Data load...')
    lenght = len(sentences)
    print(lenght)
    sizes = np.zeros(lenght, dtype=np.int32)
    for ii in range(lenght):
        sizes[ii] = len(sentences[ii])
    mean_size = np.mean(sizes)
    min_size = np.min(sizes)
    max_size = np.max(sizes)
    print('MEAN words count: ', mean_size)
    print('MIN words count: ', min_size)
    print('MAX words count: ', max_size)
    # Histogram of sentence lengths: only lengths that actually occur.
    unique_sizes = np.bincount(sizes)
    ui = np.nonzero(unique_sizes)[0]
    res = list(zip(ui, unique_sizes[ui]))  # list() so it prints its contents
    print(res)
    return mean_size, min_size, max_size
def TextToCode(data, modelName, lenght, groupLen=6, codeLen=50, maxQLen=60):
    # Encode question pairs as two groups of word-pair vectors:
    # res_data[j, q, 0:groupLen]          -> up to groupLen HIGH-similarity pairs (>0.8)
    # res_data[j, q, groupLen:2*groupLen] -> up to groupLen LOW-similarity pairs (0 < s < 0.2)
    # where q is 0 for question 1's word and 1 for question 2's word.
    # Vectors are normalized as (vec + |cmin|) / module; unused slots stay zero.
    # NOTE(review): reconstructed from a collapsed one-line source; the exact
    # nesting of the early-exit check below is my best reading -- verify.
    res_data = np.zeros((lenght, 2, 2 * groupLen, codeLen), dtype=np.float16)
    word2vec_model = word2vec.Word2Vec.load(modelName + '.mdl')
    print('word2vec_model loaded')
    # Normalization constants produced alongside the word2vec model.
    module = pickle_file_manager.LoadFromObject('NModule_' + modelName + '.mdl')
    cmin = pickle_file_manager.LoadFromObject('NMin_' + modelName + '.mdl')
    for j in range(lenght):
        res_row = res_data[j]
        words1 = data.iloc[j, 0].split(' ')
        words2 = data.iloc[j, 1].split(' ')
        # Cap each question at maxQLen words.
        l1 = len(words1)
        if(l1>maxQLen): l1 = maxQLen
        l2 = len(words2)
        if(l2>maxQLen): l2 = maxQLen
        # print words1
        # print words2
        hight_similarity_count = 0
        low_similarity_count = 0
        # lenght - 1 because last word is empty ''. i don't know why
        # (presumably a trailing space in the source text -- verify)
        already_uses = []  # words already placed in a pair; each word used at most once
        for index_w1 in xrange(l1 - 1):
            w1 = words1[index_w1]
            for index_w2 in xrange(l2 - 1):
                w2 = words2[index_w2]
                # print 'merge: ', w1, " + ", w2
                local_similarity = 0.
                if (w1 in word2vec_model.wv.vocab and w2 in word2vec_model.wv.vocab):
                    local_similarity = word2vec_model.wv.similarity(w1, w2)
                # High-similarity pair: record both vectors and stop scanning w2
                # for this w1 (break).
                if (hight_similarity_count < groupLen and local_similarity > 0.8):
                    if (w1 not in already_uses and w2 not in already_uses):
                        s1 = (word2vec_model.wv[w1] + abs(cmin)) / module
                        s2 = (word2vec_model.wv[w2] + abs(cmin)) / module
                        res_row[0, hight_similarity_count, :] = s1
                        res_row[1, hight_similarity_count, :] = s2
                        already_uses.append(w1)
                        already_uses.append(w2)
                        # print 'H ', w1
                        # print 'H ', w2
                        # print res_row[0:100, hight_similarity_count]
                        # print res_row[100:200, hight_similarity_count]
                        hight_similarity_count += 1
                        break
                # Low-similarity pair: recorded in the second half of the
                # group axis; no break, so scanning continues (w1 is now in
                # already_uses and cannot be paired again).
                if (low_similarity_count < groupLen and local_similarity > 0 and local_similarity < 0.2):
                    if (w1 not in already_uses and w2 not in already_uses):
                        s1 = (word2vec_model.wv[w1] + abs(cmin)) / module
                        s2 = (word2vec_model.wv[w2] + abs(cmin)) / module
                        res_row[0, groupLen + low_similarity_count, :] = s1
                        res_row[1, groupLen + low_similarity_count, :] = s2
                        already_uses.append(w1)
                        already_uses.append(w2)
                        # print 'L ', w1
                        # print 'L ', w2
                        low_similarity_count += 1
            # Early exit once both groups are full for this sample.
            if (low_similarity_count >= groupLen and hight_similarity_count >= groupLen):
                break
        if (j % 10000 == 0):
            print j
    return res_data
# sLength = 404290 # sLength = 10000 # source = pandas.read_csv('train.csv').head(n=sLength) # source = textColumnsToLowcase(source) # source = specSymbolReplacer(source) # source = source[['question1', 'question2']].dropna() # q1_list = source['question1'].str.split().values.tolist() # q2_list = source['question2'].str.split().values.tolist() # # sentences = q1_list + q2_list # print 'Data prepeared...' # pickle_file_manager.SaveToObject(sentences, 'Quora_sentences.dat') # print 'Data saved...' sentences = pickle_file_manager.LoadFromObject('Quora_sentences.dat') print 'Data load...' # Get_Statisctic(sentences) model = word2vec.Word2Vec(sentences, size=50, window=14, min_count=5, workers=4) Normalization(model) # model = word2vec.Word2Vec.load('word2vec_quora_set_model.mdl') # print 'word2vec_model loaded' model.save('W2V_Model_Quora_50.mdl') print '<computer> code: ', model.wv['why']