def createPostingsFile():
    """Write the postings file ``Postings.txt``.

    For every (document, term) pair whose entry in ``_TermsFrequences`` is
    non-zero, one line is emitted containing the document index, a tab, the
    value stored at the same position in ``_TermsWeights`` and a newline.
    Lines are produced document by document and, within a document, in the
    order of ``_TermsFrequences``.

    Reads the module-level ``_TermsFrequences``, ``_TermsWeights`` and
    ``_DocumentsPaths``; returns nothing.
    """
    global _TermsWeights, _DocumentsPaths, _TermsFrequences

    # Written with empty content first (presumably to create/truncate the
    # file before the real content is written -- verify against FileHandler).
    FileHandler.writeFile("Postings.txt", "")

    documentCount = len(_DocumentsPaths)
    postingLines = []
    for iDocument in range(documentCount):
        for iTerm, termEntry in enumerate(_TermsFrequences):
            # termEntry[1] is indexed by document; zero means "skip this pair".
            if termEntry[1][iDocument] != 0:
                weight = _TermsWeights[iTerm][1][iDocument]
                postingLines.append(str(iDocument) + "\t" + str(weight) + "\n")

    # Join once instead of growing a string inside the nested loops.
    FileHandler.writeFile("Postings.txt", "".join(postingLines))
    print("Postings file created")
def createDocumentsFile():
    """Write ``Documentos.txt`` and record every document path.

    Asks ``FileHandler`` to read the "Geografia" directory, then pulls paths
    from ``FileHandler.nextFile()`` until it returns ``None``. For each path a
    line made of ``str(FileHandler.getActualFile())``, two tabs, the path and
    a newline is appended to ``Documentos.txt``, and the path is appended to
    the module-level ``_DocumentsPaths`` list.
    """
    global _DocumentsPaths

    FileHandler.readDirectories("Geografia")
    # Written with empty content first; the entries below are appended to it.
    FileHandler.writeFile("Documentos.txt", "")

    path = FileHandler.nextFile()
    # Identity check: None is the end-of-listing marker returned by nextFile().
    while path is not None:
        entry = str(FileHandler.getActualFile()) + "\t\t" + path + "\n"
        FileHandler.writeFileAppend("Documentos.txt", entry)
        _DocumentsPaths.append(path)
        path = FileHandler.nextFile()

    print("Documents file created")
def createDictionaryFile():
    """Write the dictionary file ``Diccionario.txt``.

    One line is written per entry of ``_TermsFrequences``: the entry's first
    element, a tab, a running start offset, a tab, the entry's length and a
    newline. The length is the sum of the entry's per-document values over
    the first ``len(_DocumentsPaths)`` positions; the start offset is the sum
    of the lengths of all preceding entries.

    Reads the module-level ``_TermsFrequences`` and ``_DocumentsPaths``;
    returns nothing.
    """
    global _TermsFrequences, _DocumentsPaths

    # Written with empty content first (presumably to create/truncate the
    # file before the real content is written -- verify against FileHandler).
    FileHandler.writeFile("Diccionario.txt", "")

    documentCount = len(_DocumentsPaths)
    init = 0
    dictionaryLines = []
    for termEntry in _TermsFrequences:
        frequencies = termEntry[1]
        # Index explicitly so only the known documents are counted.
        length = sum(frequencies[iDocument] for iDocument in range(documentCount))
        dictionaryLines.append(
            termEntry[0] + "\t" + str(init) + "\t" + str(length) + "\n"
        )
        init += length

    # Join once instead of growing a string inside the loop.
    FileHandler.writeFile("Diccionario.txt", "".join(dictionaryLines))
    print("Dictionary file created")
def getStopWords():
    """Return the stop words listed in ``stopwords.txt`` as a list of strings.

    The file content is obtained through ``FileHandler.readFile`` and is
    expected to hold one word per line. Surrounding whitespace (including a
    stray carriage return) is stripped from each line and blank lines are
    skipped.

    The earlier implementation always removed the last character of every
    line and always discarded the last list element. That only gives the
    right result for a CRLF file that ends with a newline: with LF line
    endings it truncated every word, and without a trailing newline it lost
    the final word. Splitting with ``splitlines`` and stripping handles all
    of these layouts.
    """
    content = FileHandler.readFile("stopwords.txt")
    stopWords = []
    for line in content.splitlines():
        word = line.strip()
        if word:
            stopWords.append(word)
    return stopWords
def analizeDocuments(pDocumentsPaths):
    """Process every document and return the accumulated term frequencies.

    Stores ``pDocumentsPaths`` in the module-level ``_DocumentsPaths`` and
    then, for each path in order: reads the file through ``FileHandler``,
    passes the text through ``removeTags`` and ``removeStopWords``, feeds the
    result to ``steamS`` and hands its output to ``addWords`` together with
    ``_TermsFrequences`` and the document's index. A progress line is printed
    after each document.

    Returns the module-level ``_TermsFrequences`` object.
    """
    global _TermsFrequences, _DocumentsPaths
    _DocumentsPaths = pDocumentsPaths

    for docIndex, docPath in enumerate(_DocumentsPaths):
        rawText = FileHandler.readFile(docPath)
        cleanedText = removeStopWords(removeTags(rawText))
        addWords(steamS(cleanedText), _TermsFrequences, docIndex)
        print("Document " + str(docIndex) + " analized")

    return _TermsFrequences