def tokenize(sentence):
    """Lower-case *sentence*, split it on word/punctuation boundaries with
    NLTK's WordPunctTokenizer, and re-join the tokens with single spaces."""
    tokens = WordPunctTokenizer().tokenize(sentence.lower())
    return ' '.join(tokens)
# NOTE(review): the enclosing `def` line and the `sim12` computation were lost
# when this file's line structure was mangled; both are reconstructed below by
# symmetry with the surviving `sim21` line -- confirm against version control.
def sim_doc_pair(matrix1, matrix2, idf1, idf2):
    """Symmetric word-embedding similarity of two documents.

    matrix1 / matrix2 are per-word embedding matrices (one row per token);
    idf1 / idf2 are the matching idf weight vectors.  Each direction scores
    every word against its best-matching word in the other document
    (max over the cosine/dot rows), idf-weighted and normalized; the two
    directional scores are combined as their harmonic mean (F1-style).
    """
    sim12 = (idf1 * (matrix1.dot(matrix2.T).max(axis=1))).sum() / idf1.sum()
    sim21 = (idf2 * (matrix2.dot(matrix1.T).max(axis=1))).sum() / idf2.sum()
    return 2 * sim12 * sim21 / (sim12 + sim21)
    # Unreachable dead code removed (it sat after the return above).  It was
    # an abandoned alternative that length-weighted the directional scores
    # instead of taking their harmonic mean:
    #   total_len = matrix1.shape[0] + matrix2.shape[0]
    #   return (sim12 * matrix2.shape[0] / total_len
    #           + sim21 * matrix1.shape[0] / total_len)


if __name__ == "__main__":
    # Demo driver: score two paraphrased programming questions against each
    # other using the pre-trained stemmed word2vec model and idf table.
    w2v = gensim.models.Word2Vec.load('../data/w2v_model_stemmed')
    # Fix: open the idf pickle in binary mode and close the handle
    # deterministically -- the original `pickle.load(open('../data/idf'))`
    # leaked the file object and used text mode.
    with open('../data/idf', 'rb') as f:
        idf = pickle.load(f)

    question1 = 'intialize all elements in an ArrayList as a specific integer'
    question1 = WordPunctTokenizer().tokenize(question1.lower())
    question1 = [SnowballStemmer('english').stem(word) for word in question1]

    question2 = 'set every element of a list to the same constant value'
    question2 = WordPunctTokenizer().tokenize(question2.lower())
    question2 = [SnowballStemmer('english').stem(word) for word in question2]

    # init_doc_matrix / init_doc_idf_vector are defined elsewhere in this
    # project; presumably they build the per-token embedding matrix and idf
    # vector consumed by sim_doc_pair -- TODO confirm.
    matrix1 = init_doc_matrix(question1, w2v)
    matrix2 = init_doc_matrix(question2, w2v)
    matrix1_trans = matrix1.T
    matrix2_trans = matrix2.T
    idf1 = init_doc_idf_vector(question1, idf)
    idf2 = init_doc_idf_vector(question2, idf)
    #print sim_question_api(question1, question2, idf, w2v)