def loadFiles(fileName):
    """Build an inverted index from the bundled corpus and save it.

    Scans ``<cwd>/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/``
    for every ``S*/data/set*/*.clean`` file, reads each one as UTF-8, stems
    it with ``stemText`` and passes the result, together with the file's
    full path, to ``indexDocument`` on a fresh ``InvertedIndex``. Progress is
    printed for every directory and file visited. Finally the index is
    written with ``invertedIndex.save(fileName)``.

    The process working directory is changed while scanning. It is always
    restored to its starting value, even when reading or indexing raises,
    so a failure half-way through does not leave the caller stranded in a
    corpus subdirectory.

    Args:
        fileName: Destination handed unchanged to ``InvertedIndex.save``.
    """
    invertedIndex = InvertedIndex.InvertedIndex()
    startDir = os.getcwd()
    questionsDir = startDir + "/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/"
    try:
        os.chdir(questionsDir)
        for sDir in glob.glob("S*"):
            dataDir = questionsDir + sDir + "/data/"
            os.chdir(dataDir)
            print(sDir)
            # Named setDir (not `set`) so the builtin is not shadowed.
            for setDir in glob.glob("set*"):
                os.chdir(dataDir + setDir)
                print(setDir)
                for cleanFile in glob.glob("*.clean"):
                    fullFileName = dataDir + setDir + "/" + cleanFile
                    print(fullFileName)
                    # Relative read works because cwd is the set directory.
                    stemmedFile = stemText(path(cleanFile).text(encoding="utf8"))
                    print("File stemmed")
                    invertedIndex.indexDocument(stemmedFile, fullFileName)
                    print("File added to index")
    finally:
        os.chdir(startDir)
    # Saved after the cwd is restored so a relative fileName resolves
    # against the caller's original directory.
    invertedIndex.save(fileName)
def loadIncludedCorpusFiles(self, fileName):
    """Load the included corpus files into the current Inverted Index.

    Scans ``<cwd>/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/``
    for every ``S*/data/set*/*.clean`` file, reads each one as UTF-8, stems
    it with ``stemText`` and passes the result, together with the file's
    full path, to ``self.indexDocument``. Progress is printed for every
    directory and file visited. Finally ``self.save(fileName)`` is called.

    The process working directory is changed while scanning. It is always
    restored to its starting value, even when reading or indexing raises.

    Args:
        fileName: Destination handed unchanged to ``self.save``.
    """
    startDir = os.getcwd()
    questionsDir = startDir + "/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/"
    try:
        os.chdir(questionsDir)
        for sDir in glob.glob("S*"):
            dataDir = questionsDir + sDir + "/data/"
            os.chdir(dataDir)
            print(sDir)
            # Named setDir (not `set`) so the builtin is not shadowed.
            for setDir in glob.glob("set*"):
                os.chdir(dataDir + setDir)
                print(setDir)
                for cleanFile in glob.glob("*.clean"):
                    fullFileName = dataDir + setDir + "/" + cleanFile
                    print(fullFileName)
                    # Relative read works because cwd is the set directory.
                    stemmedFile = stemText(Path(cleanFile).text(encoding="utf8"))
                    print("File stemmed")
                    self.indexDocument(stemmedFile, fullFileName)
                    print("File added to index")
    finally:
        os.chdir(startDir)
    # Saved after the cwd is restored so a relative fileName resolves
    # against the caller's original directory.
    self.save(fileName)
def runQuery(self, query):
    """Stem ``query``, find term-proximity matches and display them.

    The stemmed query goes through ``mergeEachPostingPair`` and then
    ``consolidateDocProximityList`` (both called with 5 as their second
    argument); the consolidated result is handed to ``showDocumentText``
    with 10 as its second argument.

    Args:
        query: Raw query text, passed to ``stemText``.
    """
    queryTerms = stemText(query)
    proximityMatches = self.mergeEachPostingPair(queryTerms, 5)
    consolidated = self.consolidateDocProximityList(proximityMatches, 5)
    self.showDocumentText(consolidated, 10)
def showDocumentText(self, documentsByTermProximity, distanceFromTerm):
    """Print the text passages most relevant to the current query.

    For every entry in ``documentsByTermProximity`` the source file is
    re-read and split into lower-cased words. The entry's position list is
    consumed two items at a time as (start, end) word positions; each pair
    is widened by ``distanceFromTerm`` words on both sides and the
    resulting passage ("blurb") is collected and indexed in a temporary
    ``InvertedIndex``. The three blurbs chosen by
    ``self.getKNearestDocs(self.query, blurbMatrix, 3)`` are then printed
    (the original docstring describes that ranking as cosine similarity to
    the query).

    Args:
        documentsByTermProximity: Sequence of entries where ``entry[0]`` is
            a document id and ``entry[1]`` is a flat list of word
            positions, read in consecutive pairs.
        distanceFromTerm: Number of extra words of context to include
            before the first and after the second position of each pair.
    """
    totalSections = 0
    print("Documents Retrived: " + str(len(documentsByTermProximity)))
    blurbInvertedIndex = InvertedIndex.InvertedIndex()
    blurbList = []
    for doc in documentsByTermProximity:
        docId, positions = doc[0], doc[1]
        # presumably document ids are 1-based indexes into listOfFiles;
        # verify against InvertedIndex.
        documentFile = self.invertedIndex.listOfFiles[docId - 1]
        rawDocumentText = Path(documentFile).text(encoding='utf8')
        documentText = [
            word.lower().replace('\n', '')
            for word in rawDocumentText.split(' ')
            if word.strip() != ''
        ]
        # Walk the positions as non-overlapping pairs; a trailing unpaired
        # position is ignored.
        for i in range(0, len(positions) - 1, 2):
            # Clamp so the context window never starts before the document.
            start = max(positions[i] - distanceFromTerm, 0)
            end = positions[i + 1] + distanceFromTerm
            termsInDoc = ' '.join(documentText[start:end])
            blurbList.append(termsInDoc)
            blurbInvertedIndex.indexDocument(stemText(termsInDoc), '')
        # Floor division keeps the count an integer under Python 3 and
        # equal to the number of pairs actually processed above.
        totalSections += len(positions) // 2
    print("Sections Retrieved: " + str(totalSections))
    blurbMatrix = blurbInvertedIndex.createTermDocMatrix()
    bestBlurbs = self.getKNearestDocs(self.query, blurbMatrix, 3)
    print("Showing Best 3 Results:")
    for blurbNum in bestBlurbs:
        print(blurbNum)
        # presumably getKNearestDocs returns 1-based ids; verify.
        print(blurbList[blurbNum - 1])
from Parser import stemText
from Query import Query
import pandas as pd
import InvertedIndex
from cosineSim import cosineSim

# Scratch script: index each paragraph of one corpus article as its own
# document and build the term-document matrix.

# Raw string: in a normal literal "\U" starts a unicode escape, which is a
# SyntaxError on Python 3. The resulting path characters are unchanged.
# Read as decoded text (as the corpus loaders do) so that splitting on the
# str '\n' below also works on Python 3, where .bytes() returns bytes.
fulldoc = path(
    r'C:\Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean'
).text(encoding='utf8')

# One entry per non-blank line of the article.
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']
print(len(docArray))
#print(docArray[4].split(' '))
print(stemText(docArray[4]))

tester = InvertedIndex.InvertedIndex()
for step in docArray:
    stemmed = stemText(step)
    tester.indexDocument(stemmed)

docTermMatrix = tester.createTermDocMatrix()
pd.set_option('display.max_columns', 150)
#print(termDocMatrix.head())
from Parser import stemText
from Query import Query
import itertools
import InvertedIndex

# Scratch script: index each paragraph of one corpus article as its own
# document, then run a fixed two-term proximity query against it.

# Raw string: in a normal literal "\U" starts a unicode escape, which is a
# SyntaxError on Python 3. The resulting path characters are unchanged.
# Read as decoded text (as the corpus loaders do) so that splitting on the
# str '\n' below also works on Python 3, where .bytes() returns bytes.
fulldoc = path(
    r'C:\Users/admin/Documents/575/parser/Question_Answer_Dataset_v1.2/Question_Answer_Dataset_v1.2/S08/data/set1/a1.txt.clean'
).text(encoding='utf8')

# One entry per non-blank line of the article.
docArray = [para for para in fulldoc.split('\n') if para.strip() != '']

tester = InvertedIndex.InvertedIndex()
for step in docArray:
    stemmed = stemText(step)
    tester.indexDocument(stemmed)

print('running query')
queryObj = Query(tester)
queryText = "kangaroo marsupial"
stemmedQuery = stemText(queryText)
docsWithTermProximity = queryObj.mergeEachPostingPair(stemmedQuery, 5)
print(docsWithTermProximity)
shortList = queryObj.consolidateDocProximityList(docsWithTermProximity, 5)
# NOTE(review): the showDocumentText definition visible in this source takes
# (documentsByTermProximity, distanceFromTerm); this call passes docArray as
# an extra argument. Confirm which signature the Query class actually has.
queryObj.showDocumentText(shortList, docArray, 10)