Esempio n. 1
0
def _bowTokens(text, withStops):
	"""Lower-case and whitespace-split *text*; drop ``stops`` entries unless withStops."""
	tokens = text.lower().split()
	if withStops:
		return tokens
	return [token for token in tokens if token not in stops]


def createLSIPredictionFileSubTaskA(filePath, dictionary, numFeatures=200, withStops=True, fileTag=''):
	"""Write a tab-separated SubTask A prediction file scoring each comment against its question.

	For every question returned by ``elementParser(filePath)`` the comments are
	turned into bag-of-words vectors, ``generateLDAModel`` builds a model and a
	similarity index from them, and the question is scored against that index.
	One row is written per comment:
	``threadId, comment_id, 0, simVal, comment_rel``.

	Args:
		filePath: input path handed to ``elementParser``; its base name (up to
			the first '.') prefixes the output file name.
		dictionary: object providing ``doc2bow(tokens)`` (presumably a gensim
			Dictionary -- confirm against the caller).
		numFeatures: forwarded to ``generateLDAModel`` and embedded in the
			output file name.
		withStops: when False, tokens present in the module-level ``stops`` are
			removed from the question and from the comments before vectorising.
		fileTag: optional tag, wrapped in '-' and placed before '.pred'.

	Side effects:
		Creates/overwrites the prediction file inside the folder returned by
		``prepModelFolder()``. Rewrites ``question`` / ``comment`` of the parsed
		elements with their punctuation-filtered text and stores ``simVal`` on
		every comment.
	"""
	testQuestions = elementParser(filePath)
	tail = os.path.basename(filePath).split('.')[0]
	if fileTag:
		fileTag = '-' + fileTag + '-'
	stopsPart = '-with-stops' if withStops else ''
	predFile = tail + '-lda' + str(numFeatures) + stopsPart + fileTag + '.pred'
	# Write into the model folder, as createW2VPredictionFileSubTaskA does; the
	# original computed this folder but never used it.
	predFile = prepModelFolder() + predFile
	# newline='' is what the csv module expects so it controls line endings itself.
	with open(predFile, 'w', newline='') as tsvfile:
		writer = csv.writer(tsvfile, delimiter='\t')
		for t_question in testQuestions:
			t_question['question'] = filterPunctuation(t_question['question'])
			corpus = []
			for rel_comment in t_question['comments']:
				rel_comment['comment'] = filterPunctuation(rel_comment['comment'])
				# NOTE(review): the original called a non-existent str.word_tokenize();
				# whitespace splitting matches the W2V sibling. If `dictionary` was
				# built with a different tokenizer, use that tokenizer here instead.
				corpus.append(dictionary.doc2bow(_bowTokens(rel_comment['comment'], withStops)))
			if not corpus:
				# No comments means no rows to write and nothing to build a model from.
				continue
			lda, index = generateLDAModel(corpus, dictionary, numFeatures)
			questionTokens = _bowTokens(t_question['question'], withStops)
			if not withStops:
				# Keep the stop-word-free question on the element, as the original intended.
				t_question['question'] = ' '.join(questionTokens)
			vec_bow = dictionary.doc2bow(questionTokens)
			vec_lda = lda[vec_bow]
			sims = index[vec_lda]
			for idx, rel_comment in enumerate(t_question['comments']):
				rel_comment['simVal'] = sims[idx]
				writer.writerow([t_question['threadId'], rel_comment['comment_id'], 0, rel_comment['simVal'], rel_comment['comment_rel']])
Esempio n. 2
0
def createW2VPredictionFileSubTaskA(filePath, model, withStops=True):
	"""Write a tab-separated SubTask A prediction file scoring comments by vector similarity.

	Each question parsed from ``filePath`` and each of its comments is turned
	into a vector with ``generateQuestionVector(model, text, DIM)``;
	``cosineSimilarity`` then scores the question vector against the list of
	comment vectors. One row is written per comment:
	``threadId, comment_id, 0, simVal, comment_rel``.

	Args:
		filePath: input path handed to ``elementParser``; its base name (up to
			the first '.') prefixes the output file name.
		model: passed through unchanged to ``generateQuestionVector``.
		withStops: when False, the text is lower-cased, split on whitespace and
			stripped of entries found in the module-level ``stops`` first.

	Side effects:
		Creates/overwrites the prediction file under the path prefix returned
		by ``prepModelFolder()``. Stores the filtered text, the intermediate
		stop-word-free text, the vectors and ``simVal`` on the parsed elements.
	"""
	parsedQuestions = elementParser(filePath)
	baseName = os.path.basename(filePath).partition('.')[0]
	suffix = '-w2v-with-stops.pred' if withStops else '-w2v.pred'
	outPath = prepModelFolder() + (baseName + suffix)
	with open(outPath, "w") as outFile:
		tsvWriter = csv.writer(outFile, delimiter="\t")
		for thread in parsedQuestions:
			thread['question'] = filterPunctuation(thread['question'])
			questionText = thread['question']
			if not withStops:
				thread['relQNoStops'] = " ".join(
					word for word in questionText.lower().split() if word not in stops
				)
				questionText = thread['relQNoStops']
			thread['W2V_qVec1'] = generateQuestionVector(model, questionText, DIM)

			commentVectors = []
			for comment in thread['comments']:
				comment['comment'] = filterPunctuation(comment['comment'])
				commentText = comment['comment']
				if not withStops:
					comment['relCNoStops'] = " ".join(
						word for word in commentText.lower().split() if word not in stops
					)
					commentText = comment['relCNoStops']
				comment['W2V_cVec1'] = generateQuestionVector(model, commentText, DIM)
				commentVectors.append(comment['W2V_cVec1'])

			similarities = cosineSimilarity(thread['W2V_qVec1'], commentVectors)
			for position, comment in enumerate(thread['comments']):
				comment['simVal'] = similarities[position]
				tsvWriter.writerow([
					thread['threadId'],
					comment['comment_id'],
					0,
					comment['simVal'],
					comment['comment_rel'],
				])
def QuestionCreator(filePaths=None):
    """Return the 'question' value of every element parsed from the given files.

    Args:
        filePaths: iterable of paths, each handed to ``elementParser``.
            ``None`` (the default) is treated as "no files".

    Returns:
        A list with ``row['question']`` for each parsed row, in file order
        and then in the order ``elementParser`` yields the rows.
    """
    # None sentinel instead of a shared mutable ``[]`` default.
    if filePaths is None:
        filePaths = ()
    questions = []
    for filePath in filePaths:
        questions.extend(row['question'] for row in elementParser(filePath))
    return questions
def _loadOrBuildCache(cachePath, sourcePaths, parser):
	"""Return the list pickled at *cachePath*, building and pickling it first if absent.

	When the cache file is missing, ``parser(path)`` is called for every entry
	of *sourcePaths*, the results are concatenated into one list, the cache
	directory is created if needed, and the list is pickled to *cachePath*.
	"""
	if Path(cachePath).is_file():
		# NOTE: unpickling executes arbitrary code; this file must only ever be
		# the cache written by this script itself.
		with open(cachePath, "rb") as cacheFile:
			return pickle.load(cacheFile)
	items = []
	for sourcePath in sourcePaths:
		items += parser(sourcePath)
	cacheDir = os.path.dirname(cachePath)
	if not os.path.isdir(cacheDir):
		os.makedirs(cacheDir)
	with open(cachePath, "wb") as cacheFile:
		pickle.dump(items, cacheFile)
	return items


# The original existence check used "../tmp/QTL_list.p" (lower-case "l") while
# reading and writing "../tmp/QTL_List.p", so on case-sensitive filesystems the
# cache was rebuilt on every run. One path is now used for check, load and dump.
QTL_List = _loadOrBuildCache("../tmp/QTL_List.p", QTLfilePaths, createObjectListFromJson)

thisList = _loadOrBuildCache("../tmp/thisList.p", filePaths, elementParser)

subTaskAList = _loadOrBuildCache("../tmp/subTaskAList.p", filePathsSubTaskA, elementParser)