import gc
import json
import os
import pickle
import time

import numpy as np
from scipy.sparse import csr_matrix

import util


def getWordCounts(sentences, numSentences, fileName, corpusVocabulary,
                  inverseDictionary, wordCount, wordCountSentence,
                  lastReadSentenceInd):
    """Count total and per-sentence occurrences of every word, extend
    corpusVocabulary/inverseDictionary with new words, and pickle the result
    to fileName so the pass can resume from lastReadSentenceInd."""
    start_time_all = time.time()
    start_time = start_time_all
    for indSentence, sentence in enumerate(sentences):
        start_time = util.printRemainingTime(start_time, numSentences,
                                             indSentence, 10000)
        if indSentence <= lastReadSentenceInd:
            continue  # already processed in a previous (pickled) run
        wordsInSentence = util.splitSentence(sentence)
        wordsAlreadyRead = set()
        for word in wordsInSentence:
            if word in wordCount:
                wordCount[word] += 1
                corpusVocabulary[word].count += 1
                if word not in wordsAlreadyRead:
                    # count each word at most once per sentence
                    wordCountSentence[word] += 1
                    corpusVocabulary[word].sentenceCount += 1
                    wordsAlreadyRead.add(word)
            else:
                wordCount[word] = 1
                wordCountSentence[word] = 1
                inverseDictionary[len(corpusVocabulary)] = word
                corpusVocabulary[word] = util.Word(word, len(corpusVocabulary))
                # mark the new word as seen in this sentence; without this,
                # a repeat within the same sentence was double-counted in
                # wordCountSentence
                wordsAlreadyRead.add(word)
        lastReadSentenceInd = indSentence

    with open(fileName, "wb") as filehandler:
        pickle.dump([wordCount, wordCountSentence, corpusVocabulary,
                     inverseDictionary, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for word count = ' + str(elapsed_time / 60) + ' minutes')
    return (wordCount, wordCountSentence, corpusVocabulary, lastReadSentenceInd)
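
# A minimal usage sketch for getWordCounts on a fresh run (no pickle yet),
# assuming util.splitSentence and util.Word behave as used above; the
# sentence list and pickle file name here are illustrative only.
def _exampleGetWordCountsUsage():
    sentences = ['the cat sat', 'the dog sat']
    wordCount, wordCountSentence, corpusVocabulary, lastInd = getWordCounts(
        sentences, len(sentences), 'wordCounts.pkl',
        corpusVocabulary=dict(), inverseDictionary=dict(),
        wordCount=dict(), wordCountSentence=dict(),
        lastReadSentenceInd=-1)
    # 'the' and 'sat' appear in both sentences, 'cat'/'dog' in one each:
    print(wordCountSentence)  # {'the': 2, 'cat': 1, 'sat': 2, 'dog': 1}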
def buildSentenceMatrix(sentences, rows, cols, weights, lastReadSentenceInd,
                        corpusVocabulary, fileName):
    """Build a sparse sentence-by-word matrix whose entry (s, w) is
    1 / sentenceCount(w) when word w occurs in sentence s, and pickle it."""
    start_time_all = time.time()
    start_time = start_time_all
    numSentences = len(sentences)
    numberWordsPerSentence = np.zeros(numSentences)
    for indSentence, sentence in enumerate(sentences):
        start_time = util.printRemainingTime(start_time, numSentences,
                                             indSentence, 10000)
        if indSentence <= lastReadSentenceInd:
            continue
        wordsAlreadyRead = set()
        wordsInSentence = util.splitSentence(sentence)
        for word in wordsInSentence:
            if word not in wordsAlreadyRead:
                indWord = corpusVocabulary[word].index
                rows.append(indSentence)
                cols.append(indWord)
                # rarer words (small sentenceCount) get larger weights
                weights.append(1 / corpusVocabulary[word].sentenceCount)
                numberWordsPerSentence[indSentence] += 1
                wordsAlreadyRead.add(word)
        lastReadSentenceInd = indSentence

    sentenceMatrix = csr_matrix((weights, (rows, cols)),
                                shape=(numSentences, len(corpusVocabulary)))
    with open(fileName, "wb") as filehandler:
        pickle.dump([sentenceMatrix, numberWordsPerSentence,
                     lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for building sentence matrix = '
          + str(elapsed_time / 60) + ' minutes')
    return (sentenceMatrix, numberWordsPerSentence)
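
# A small worked example of the weighting scheme buildSentenceMatrix uses:
# each (sentence, word) entry is 1/sentenceCount(word), so a word occurring
# in many sentences contributes little. Plain scipy, no repo code assumed.
def _exampleSentenceWeighting():
    # vocabulary: 0='the' (in 2 sentences), 1='cat' (in 1), 2='dog' (in 1)
    rows = [0, 0, 1, 1]   # sentence indices
    cols = [0, 1, 0, 2]   # word indices
    weights = [1 / 2, 1 / 1, 1 / 2, 1 / 1]
    m = csr_matrix((weights, (rows, cols)), shape=(2, 3))
    print(m.toarray())    # [[0.5 1.  0. ]
                          #  [0.5 0.  1. ]]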
def buildSentenceDictionaries(sentences, sentenceDictList, lastReadSentenceInd,
                              corpusVocabulary, fileName):
    """Dictionary-based equivalent of buildSentenceMatrix: one
    {word: 1 / sentenceCount} dict per sentence, pickled for resumption."""
    start_time_all = time.time()
    start_time = start_time_all
    numSentences = len(sentences)
    for indSentence, sentence in enumerate(sentences):
        start_time = util.printRemainingTime(start_time, numSentences,
                                             indSentence, 10000)
        if indSentence <= lastReadSentenceInd:
            continue
        sentenceDict = dict()
        wordsAlreadyRead = set()
        wordsInSentence = util.splitSentence(sentence)
        for word in wordsInSentence:
            if word not in wordsAlreadyRead:
                sentenceDict[word] = 1 / corpusVocabulary[word].sentenceCount
                wordsAlreadyRead.add(word)
        sentenceDictList.append(sentenceDict)
        lastReadSentenceInd = indSentence

    with open(fileName, "wb") as filehandler:
        pickle.dump([sentenceDictList, lastReadSentenceInd], filehandler)

    elapsed_time = time.time() - start_time_all
    print('total time for building sentence list = '
          + str(elapsed_time / 60) + ' minutes')
    return sentenceDictList
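
# The dict list and the sparse matrix encode the same weights; this sketch
# checks one entry of each against the other (it assumes both builders were
# run over the same corpus with the same corpusVocabulary).
def _checkDictMatrixAgreement(sentenceDictList, sentenceMatrix,
                              corpusVocabulary, indSentence, word):
    fromDict = sentenceDictList[indSentence][word]
    fromMatrix = sentenceMatrix[indSentence, corpusVocabulary[word].index]
    assert abs(fromDict - fromMatrix) < 1e-12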
def findSentences(originalQuestions, originalSentences, numberWordsPerSentence,
                  questionMatrix, choiceMatrix, relatedMatrix, numQuestions,
                  sentences, maxSentences, useConceptNet, saveDir,
                  relatedWordsWeight=0.01):
    """For every question, find the most relevant corpus sentences and save
    them under saveDir (a human-readable .txt plus a .pkl); questions whose
    output file already exists are skipped, so the run is restartable."""
    start_time_all = time.time()
    start_time = start_time_all
    for indQ, questionVector in enumerate(questionMatrix):
        choice = choiceMatrix[indQ]
        related = relatedMatrix[indQ]
        savePath = saveDir + str(indQ) + '.txt'
        picklePath = saveDir + str(indQ) + '.pkl'
        if not os.path.exists(savePath):
            bestSentences = findSentencesForQuestion_SparseMatrices(
                originalSentences, questionVector, choice, related, sentences,
                numberWordsPerSentence, maxSentences, useConceptNet,
                relatedWordsWeight)
            with open(savePath, 'w') as f:
                f.write('Question:\n')
                j = json.loads(originalQuestions[indQ])
                question = j['question']
                f.write(question['stem'] + '\n')
                for c in question['choices']:
                    f.write(c['label'] + ') ' + c['text'] + '\n')
                f.write('\n')
                f.write('Relevant Sentences:' + '\n')
                for indSentence in bestSentences.indices:
                    f.write(originalSentences[indSentence] + '\n')

            bestSentencesPickleArray = [originalSentences[ind]
                                        for ind in bestSentences.indices]
            with open(picklePath, "wb") as filehandler:
                pickle.dump([bestSentencesPickleArray, bestSentences.indices],
                            filehandler)
            gc.collect()
        start_time = util.printRemainingTime(start_time, numQuestions, indQ, 1)

    # use start_time_all here: start_time is reset by printRemainingTime, so
    # the original 'time.time() - start_time' under-reported the total
    elapsed_time = time.time() - start_time_all
    print('elapsed time to find relevant sentences = ' + str(elapsed_time))
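
# For reference, the .txt file written above for one question looks like:
#
#   Question:
#   <stem>
#   A) <choice text>
#   B) <choice text>
#
#   Relevant Sentences:
#   <best sentence 1>
#   <best sentence 2>
#   ...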
def buildQuestionMatrix(questions, numQuestions, fileName, lastReadQuestionInd,
                        rows, cols, occurrences, corpusVocabulary,
                        wordCountThreshold):
    """Build a sparse question-by-word occurrence matrix over the corpus
    vocabulary, expanding rare or out-of-vocabulary words with ConceptNet
    related words, and pickle it for resumption."""
    start_time = time.time()
    start_time_print = start_time
    for indQ, q in enumerate(questions):
        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ, 100)
        if indQ <= lastReadQuestionInd:
            continue
        j = json.loads(q)
        question = j['question']
        wordsInQuestion = util.splitSentence(question['stem'])
        for c in question['choices']:
            wordsInQuestion.extend(util.splitSentence(c['text']))
        wordsAlreadyRead = set()
        for word in wordsInQuestion:
            if word not in wordsAlreadyRead:
                if word not in corpusVocabulary:
                    # word not in the vocabulary, so it's a rare word:
                    # look for similar words
                    (rows, cols, occurrences) = util.updateSparseWithRelatedWords(
                        word, corpusVocabulary, rows, cols, occurrences, indQ)
                else:
                    indWord = corpusVocabulary[word].index
                    rows.append(indQ)
                    cols.append(indWord)
                    occurrences.append(1)
                    if corpusVocabulary[word].sentenceCount < wordCountThreshold:
                        # only use ConceptNet for rare words
                        (rows, cols, occurrences) = util.updateSparseWithRelatedWords(
                            word, corpusVocabulary, rows, cols, occurrences, indQ)
                wordsAlreadyRead.add(word)
        lastReadQuestionInd = indQ

    questionMatrix = csr_matrix((occurrences, (rows, cols)),
                                shape=(numQuestions, len(corpusVocabulary)))
    with open(fileName, "wb") as filehandler:
        pickle.dump([questionMatrix, lastReadQuestionInd], filehandler)

    elapsed_time = time.time() - start_time
    print('total time for building question matrix = '
          + str(elapsed_time / 60) + ' minutes')
    return questionMatrix
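
# The actual scoring lives in findSentencesForQuestion_SparseMatrices,
# defined elsewhere; as an illustration of how these matrices can be
# combined, a plain sparse dot product scores every sentence against a
# question in one call. This is a sketch of the general technique, not the
# repo's scoring code.
def _exampleScoreSentences(questionMatrix, sentenceMatrix, maxSentences):
    scores = questionMatrix.dot(sentenceMatrix.T)  # questions x sentences
    scoresRow0 = scores.getrow(0).toarray().ravel()
    # indices of the maxSentences highest-scoring sentences for question 0
    return np.argsort(scoresRow0)[::-1][:maxSentences]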
def buildQuestionMatrix2(testQuestionIndices, questionMatrix, choiceMatrix,
                         relatedMatrix, questions, numQuestions, fileName,
                         lastReadQuestionInd, corpusVocabulary,
                         inverseDictionary, wordCountThreshold):
    """Build per-question sparse vectors for the question stem, the answer
    choices, and the ConceptNet-related words, and pickle all three lists."""
    start_time = time.time()
    start_time_print = start_time
    if len(testQuestionIndices) > 0:
        # assuming testQuestionIndices is a list of question indices: each
        # element is already an index into questions, so index directly (the
        # original questions[testQuestionIndices[q]] re-indexed the list by
        # its own elements)
        usedQuestions = [questions[q] for q in testQuestionIndices]
    else:
        usedQuestions = questions
    for indQ, q in enumerate(usedQuestions):
        start_time_print = util.printRemainingTime(start_time_print,
                                                   numQuestions, indQ, 100)
        if indQ <= lastReadQuestionInd:
            continue
        j = json.loads(q)
        question = j['question']
        wordsInQuestion = util.splitSentence(question['stem'])
        wordsInChoices = []
        for c in question['choices']:
            wordsInChoices.extend(util.splitSentence(c['text']))

        questionVector = getSentenceVector(wordsInQuestion, corpusVocabulary)
        choicesVector = getSentenceVector(wordsInChoices, corpusVocabulary)
        relatedWords = getRelatedWordsBothways(wordsInQuestion, wordsInChoices,
                                               questionVector, choicesVector,
                                               wordCountThreshold,
                                               corpusVocabulary,
                                               inverseDictionary)
        questionMatrix.append(questionVector)
        choiceMatrix.append(choicesVector)
        relatedMatrix.append(relatedWords)
        lastReadQuestionInd = indQ

    with open(fileName, "wb") as filehandler:
        pickle.dump([questionMatrix, choiceMatrix, relatedMatrix,
                     lastReadQuestionInd], filehandler)

    elapsed_time = time.time() - start_time
    print('total time for building question matrix = '
          + str(elapsed_time / 60) + ' minutes')
    return (questionMatrix, choiceMatrix, relatedMatrix)
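
# End-to-end sketch of how these builders fit together on a fresh run. The
# pickle names and wordCountThreshold are placeholders; it assumes
# getSentenceVector, getRelatedWordsBothways, and
# findSentencesForQuestion_SparseMatrices are defined elsewhere in this
# module, that findSentences's 'sentences' argument is the sparse sentence
# matrix, and that saveDir ends with a path separator (the code concatenates
# paths directly).
def _examplePipeline(sentences, questions, saveDir):
    inverseDictionary = dict()
    wc, wcs, vocab, lastInd = getWordCounts(
        sentences, len(sentences), 'counts.pkl', dict(), inverseDictionary,
        dict(), dict(), -1)
    sentenceMatrix, wordsPerSentence = buildSentenceMatrix(
        sentences, [], [], [], -1, vocab, 'sentenceMatrix.pkl')
    qM, cM, rM = buildQuestionMatrix2(
        [], [], [], [], questions, len(questions), 'questions.pkl',
        -1, vocab, inverseDictionary, wordCountThreshold=10)
    findSentences(questions, sentences, wordsPerSentence, qM, cM, rM,
                  len(questions), sentenceMatrix, maxSentences=50,
                  useConceptNet=False, saveDir=saveDir)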