def createW2VPredictionFile(filePath, model, withStops=True):
    # Write a tab-separated .pred file scoring each related question against its
    # original question by cosine similarity of word2vec question vectors.
    testQuestions = originalQuestionParser(filePath)
    head, tail = os.path.split(filePath)
    tail = tail.split('.')[0]
    if(withStops):
        predFile = tail + '-w2v-with-stops.pred'
    else:
        predFile = tail + '-w2v.pred'
    modelPath = prepModelFolder()
    predFile = modelPath + predFile
    with open(predFile, "w") as tsvfile:
        writer = csv.writer(tsvfile, delimiter="\t")
        for t_question in testQuestions:
            # Strip punctuation once; it applies to both the with-stops and no-stops branches
            t_question['origQuestion'] = filterPunctuation(t_question['origQuestion'])
            if(withStops):
                t_question['W2V_OVec1'] = generateQuestionVector(model, t_question['origQuestion'], DIM)
            else:
                t_question['origQNoStops'] = " ".join([i for i in t_question['origQuestion'].lower().split() if i not in stops])
                t_question['W2V_OVec1'] = generateQuestionVector(model, t_question['origQNoStops'], DIM)
            vecList = []
            for rel_quest in t_question['rel_questions']:
                rel_quest['question'] = filterPunctuation(rel_quest['question'])
                if(withStops):
                    rel_quest['W2V_qVec1'] = generateQuestionVector(model, rel_quest['question'], DIM)
                else:
                    rel_quest['relQNoStops'] = " ".join([i for i in rel_quest['question'].lower().split() if i not in stops])
                    rel_quest['W2V_qVec1'] = generateQuestionVector(model, rel_quest['relQNoStops'], DIM)
                vecList.append(rel_quest['W2V_qVec1'])
            # Similarity of the original question vector against all related-question vectors
            simMatrix = cosineSimilarity(t_question['W2V_OVec1'], vecList)
            for idx, row in enumerate(t_question['rel_questions']):
                row['simVal'] = simMatrix[idx]
                writer.writerow([t_question['quest_ID'], row['rel_quest_ID'], 0, row['simVal'], row['relevant']])
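# Example usage (a sketch, not from the original file): `model` is expected to be a
# gensim word2vec model/KeyedVectors with DIM-sized vectors; the paths below are
# hypothetical placeholders.
#
#   model = gensim.models.Word2Vec.load('models/semeval-w2v.model')  # hypothetical path
#   createW2VPredictionFile('data/dev-questions.xml', model, withStops=False)
#
# The resulting .pred file is tab-separated: original question ID, related question ID,
# a rank placeholder (0), the cosine similarity score, and the gold relevance label.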
def createLDAPredictionFile(filePath, dictionary, numFeatures=200, withStops=True, fileTag=''):
    # Write a tab-separated .pred file scoring each related question against its
    # original question with an LDA model built per original question.
    # word_tokenize is assumed to be nltk.tokenize.word_tokenize, imported at module level.
    testQuestions = originalQuestionParser(filePath)
    head, tail = os.path.split(filePath)
    tail = tail.split('.')[0]
    if(len(fileTag) > 0):
        fileTag = '-' + fileTag
    if(withStops):
        predFile = tail + fileTag + '-lda' + str(numFeatures) + '-with-stops.pred'
    else:
        predFile = tail + fileTag + '-lda' + str(numFeatures) + '.pred'
    modelPath = prepModelFolder()
    predFile = modelPath + predFile
    with open(predFile, 'w') as tsvfile:
        writer = csv.writer(tsvfile, delimiter='\t')
        for t_question in testQuestions:
            t_question['origQuestion'] = filterPunctuation(t_question['origQuestion'])
            # Build a bag-of-words corpus from this question's related questions only
            corpus = []
            for rel_quest in t_question['rel_questions']:
                rel_quest['question'] = filterPunctuation(rel_quest['question'])
                corpus.append(dictionary.doc2bow(word_tokenize(rel_quest['question'].lower())))
            lda, index = generateLDAModel(corpus, dictionary, numFeatures)
            if(withStops):
                doc = t_question['origQuestion']
            else:
                t_question['origQNoStops'] = " ".join([i for i in word_tokenize(t_question['origQuestion'].lower()) if i not in stops])
                doc = t_question['origQNoStops']
            vec_bow = dictionary.doc2bow(word_tokenize(doc.lower()))
            vec_lda = lda[vec_bow]
            sims = index[vec_lda]
            for idx, quest in enumerate(t_question['rel_questions']):
                quest['simVal'] = sims[idx]
                writer.writerow([t_question['quest_ID'], quest['rel_quest_ID'], idx, quest['simVal'], quest['relevant']])
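# Example usage (a sketch, not from the original file): `dictionary` is expected to be a
# gensim corpora.Dictionary built over the question corpus; the paths and fileTag are
# hypothetical placeholders.
#
#   dictionary = gensim.corpora.Dictionary.load('models/questions.dict')  # hypothetical path
#   createLDAPredictionFile('data/dev-questions.xml', dictionary, numFeatures=100,
#                           withStops=True, fileTag='run1')
#
# Unlike the word2vec version, a fresh LDA model and similarity index are built for each
# original question over just its related questions.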
    return featureVec

# def getAvgFeatureVecs(questions, model, num_features):
#     counter = 0
#     # preallocation of numpy array for speed purposes
#     reviewFeatureVecs = np.zeros((len(questions), num_features), dtype="float32")
#     for question in questions:
#         reviewFeatureVecs[counter] = makeFeatureVec(question, model, num_features)
#         counter += 1
#     return reviewFeatureVecs

# TODO: Figure out how to implement tfidf weighting against
# the word2vec vectors
def buildQuestionFeatureVecs(origQfilePath, model, num_features):
    # Draft helper (placeholder name and signature): attaches averaged word2vec
    # feature vectors to each original and related question; not yet wired into
    # the prediction pipeline above.
    testQuestions = originalQuestionParser(origQfilePath)
    for t_quest in testQuestions:
        t_quest['wordList'] = t_quest['origQuestion'].lower().split()
        t_quest['w2vectors'] = makeFeatureVec(t_quest['wordList'], model, num_features)
        for rel_quest in t_quest['rel_questions']:
            rel_quest['wordList'] = rel_quest['question'].lower().split()
            rel_quest['w2vectors'] = makeFeatureVec(rel_quest['wordList'], model, num_features)
    return testQuestions
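# One possible way to tackle the TODO above (a sketch, not the original implementation):
# weight each word vector by its IDF before averaging, using scikit-learn's TfidfVectorizer.
# The function name, the `questionCorpus` argument (a list of raw question strings), and the
# assumption that `model` supports `word in model` / `model[word]` (gensim KeyedVectors-style
# lookup, with vectors of size num_features) are all illustrative.
def makeTfidfWeightedVec(question, questionCorpus, model, num_features):
    # Imports kept local so the sketch is self-contained; numpy may already be
    # imported at module level as np.
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer

    # Fit IDF weights over the question corpus
    vectorizer = TfidfVectorizer(lowercase=True)
    vectorizer.fit(questionCorpus)
    idf = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

    # Sum idf-weighted word vectors; iterating over every token means a word that
    # occurs k times contributes k * idf(word) * vec(word), i.e. tf-idf weighting.
    featureVec = np.zeros((num_features,), dtype="float32")
    totalWeight = 0.0
    for word in question.lower().split():
        if word in idf and word in model:
            featureVec += idf[word] * model[word]
            totalWeight += idf[word]
    if totalWeight > 0:
        featureVec /= totalWeight
    return featureVec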