def generateTask3QuestionData(hashList):
    """Fetch the questions for *hashList*, tokenize them in place, and return them.

    Relies on the sibling helpers ``getQuestions`` and ``generateTokens``;
    ``generateTokens`` mutates the question records rather than returning a value.
    """
    fetched = getQuestions(hashList)
    generateTokens(fetched)
    return fetched
from nltk.corpus import stopwords
import logging
import os
import sys

sys.path.insert(0, os.path.abspath('..'))
from utils.QuestionFileCreator import CreateFilePath, getQuestions, QuestionCleaner, initializeLog
from utils.sourceFiles import thisList
# Fix: `corpora` was used below but never imported.
from gensim import corpora

initializeLog()
new_dest = CreateFilePath('LsiModel')
stops = set(stopwords.words('english'))

# Build a gensim dictionary over the lower-cased, whitespace-tokenized questions.
questions = QuestionCleaner(getQuestions(thisList))
dictionary = corpora.Dictionary(
    line['question'].lower().split() for line in questions
)

# remove stopwords
stop_ids = [
    dictionary.token2id[stopword]
    for stopword in stops
    if stopword in dictionary.token2id
]
# remove words only appearing once
# Fix: `iteritems` was an un-imported Python-2/six leftover; use dict.items().
once_ids = [
    tokenid
    for tokenid, docfreq in dictionary.dfs.items()
    if docfreq == 1
]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()
dictionary.save(new_dest + '.dict')
def prepLabeledSentList(questions=None, withStops=False):
    """Convert question records into gensim ``TaggedDocument``s.

    Parameters
    ----------
    questions : list of dict, optional
        Records with a token sequence under ``'question'`` and an identifier
        under ``'id'``. Defaults to an empty list.
    withStops : bool
        If True, keep stop-words; otherwise filter them out using the
        module-level ``stops`` set. (Fix: this flag was previously ignored.)

    Returns
    -------
    list of TaggedDocument
    """
    # Fix: a mutable default argument ([]) is shared across calls; use None.
    if questions is None:
        questions = []
    mod_questions = []
    for q in questions:
        if withStops:
            tokens = list(q['question'])
        else:
            tokens = [tok for tok in q['question'] if tok not in stops]
        # Fix: tags must be a list — a bare string id would be treated as
        # one single-character tag per character.
        mod_questions.append(TaggedDocument(tokens, [q['id']]))
    return mod_questions


def altDoc2Vec(questions):
    """Train a Doc2Vec model over *questions* for 10 epochs and return it.

    The documents are reshuffled before every epoch so each pass sees a
    different ordering.
    """
    mod_questions = prepLabeledSentList(questions)
    model = Doc2Vec(min_count=1, window=10, size=100, sample=1e-4,
                    negative=5, workers=8)
    model.build_vocab(mod_questions)
    for epoch in range(10):
        # Fix: shuffle each epoch (was shuffled once, so every epoch saw the
        # same order), and pass total_examples/epochs — required by
        # gensim >= 1.0, where a bare train() call raises.
        shuffle(mod_questions)
        model.train(mod_questions, total_examples=model.corpus_count, epochs=1)
    return model


questions = getQuestions(thisList)
model = altDoc2Vec(questions)
# NOTE(review): `origQfilePath` and `createPredictionFile` are not defined or
# imported anywhere in this file — confirm they come from a missing import.
createPredictionFile(origQfilePath, model)