def processText(pathIn, dirOut):
    """Read words from pathIn, clean them and write the result under dirOut.

    The output path comes from sysHandle.getRawPath(pathIn, dirOut); the
    words read by sysHandle.getWordFromTextFile are passed through
    trim_and_sort before being written. Afterwards dirOut is opened via
    sysHandle.openDir and the interpreter exits through sys.exit(), so this
    function never returns to its caller.
    """
    destination = sysHandle.getRawPath(pathIn, dirOut)
    rawWords = sysHandle.getWordFromTextFile(pathIn)
    sysHandle.writeListToFile(trim_and_sort(rawWords), destination)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(pathIn, dirOut, dirLog):
    """Run extractSentences on pathIn and record start/finish stamps in a log.

    Parameters:
        pathIn: input file handed to extractSentences.
        dirOut: directory used to derive the output path (getRawPath) and
            opened via sysHandle.openDir when done.
        dirLog: directory used to derive the dated log file path.

    The function ends with sys.exit(), so it never returns to its caller.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    # First log line is stamped before the extraction starts.
    logLines = ["Starting to extract sentences at " + sysHandle.getDateStamp()]

    extractSentences(pathIn, pathOut)

    # Second log line is stamped once the extraction has returned.
    logLines.append(
        "Sentence extracting completed at " + sysHandle.getDateStamp()
    )

    sysHandle.writeListToFile(logLines, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(path1, path2):
    """Parse the file at path1 and write its text content to a UTF-8 file.

    Parameters:
        path1: path of the document handed to parser.from_file.
        path2: output directory; used with getNormalPath to build the
            output file path and opened via openDir when done.

    The function ends with sys.exit(), so it never returns to its caller.
    """
    output_path = getNormalPath(path1, path2)
    parsed = parser.from_file(path1)
    # NOTE(review): `parser` looks like tika's parser, whose "content" entry
    # can be None when no text is extracted -- confirm. Writing None would
    # raise TypeError, so fall back to an empty string.
    textout = parsed["content"] or ""
    with open(output_path, 'w', encoding="utf-8") as file:
        file.write(textout)
    openDir(path2)
    sys.exit()
def processText(bookID, outDir):
    """Match the words of a book to its sentences and write the matches out.

    Parameters:
        bookID: identifier passed to sqlData.getWordList,
            sqlData.getSentences, matchWordToSent and sysHand.getNormalPath.
        outDir: directory used to derive the output path and opened via
            sysHand.openDir when done.

    The function ends with sys.exit(), so it never returns to its caller.
    """
    destination = sysHand.getNormalPath(bookID, outDir)

    words = sqlData.getWordList(bookID)
    words.sort()  # in-place sort before matching
    sentences = sqlData.getSentences(bookID)

    matches = matchWordToSent(words, sentences, bookID)
    sysHand.writeTupleToFile(matches, destination)

    sysHand.openDir(outDir)
    sys.exit()
def processText(inFile, outDir):
    """Read inFile, apply cleanLine1 then cleanLine2, and write the result.

    Parameters:
        inFile: path of the text file read through sH.readTextFile.
        outDir: directory used to derive the output path (sH.getNormalPath)
            and opened via sH.openDir when done.

    The function ends with sys.exit(), so it never returns to its caller.
    """
    destination = sH.getNormalPath(inFile, outDir)
    original = sH.readTextFile(inFile)

    # The two cleaning passes run in a fixed order: cleanLine1 first.
    cleaned = cleanLine2(cleanLine1(original))

    writeTextFile(cleaned, destination)
    sH.openDir(outDir)
    sys.exit()
def processText(inFile, outDir, dbDir):
    """Write the words of inFile that are in neither reference word list.

    Parameters:
        inFile: path of the word file to filter.
        outDir: directory used to derive the output path and opened via
            sysHandle.openDir when done.
        dbDir: directory used (with inFile) to derive the path of the
            first reference word file.

    A word is kept when it appears neither in the file derived from dbDir
    nor in the list returned by getWordList(). Input order is preserved.
    The function ends with sys.exit(), so it never returns to its caller.
    """
    pathRecycleOut = sysHandle.getRawPath(inFile, outDir)
    pathDatabaseIn = sysHandle.getRawPath(inFile, dbDir)

    trashListIn = sysHandle.getWordFromTextFile(inFile)

    # Build the lookup sets once so each membership test is O(1) instead of
    # a linear scan per word.
    # NOTE(review): assumes the entries are hashable (e.g. str) -- confirm
    # against getWordFromTextFile / getWordList.
    databaseWords = set(sysHandle.getWordFromTextFile(pathDatabaseIn))
    standardWords = set(getWordList())

    newRecycle = [
        item
        for item in trashListIn
        if item not in databaseWords and item not in standardWords
    ]

    sysHandle.writeListToFile(newRecycle, pathRecycleOut)
    sysHandle.openDir(outDir)
    sys.exit()
def processText(inFile, outDir):
    """Drop lines with an empty first comma field, de-duplicate, and write.

    Parameters:
        inFile: path of the text file read through sH.readTextFile.
        outDir: directory used to derive the output path (getOutPath) and
            opened via sH.openDir when done.

    A line is kept only when the text before its first comma is non-blank
    after stripping. Duplicate lines are removed while keeping the first
    occurrence and the original order. The function ends with sys.exit(),
    so it never returns to its caller.
    """
    destination = getOutPath(inFile, outDir)
    rawText = sH.readTextFile(inFile)

    # dict.fromkeys keeps first-seen order, giving an ordered de-duplication.
    uniqueLines = list(
        dict.fromkeys(
            line
            for line in rawText.split("\n")
            if line.split(",")[0].strip()
        )
    )

    sH.writeListToFile(uniqueLines, destination)
    sH.openDir(outDir)
    sys.exit()
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract a cleaned word list from pathIn, logging each step.

    Parameters:
        pathIn: path of the text file read via sysHandle.getWordFromTextFile.
        dirOut: directory used to derive the output path and opened via
            sysHandle.openDir when done.
        dirExclusion: directory passed to sysHandle.loadDictionaries to
            obtain the exclusion collection.
        dirLog: directory used to derive the dated log file path.

    Each progress message is both printed and collected; the collected
    messages are written to the log file at the end. The function ends with
    sys.exit(), so it never returns to its caller.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)
    logData = []

    def logStep(prefix):
        # Stamp the message, remember it for the log file and echo it.
        entry = prefix + sysHandle.getDateStamp()
        logData.append(entry)
        print(entry)

    logStep("Starting to extract words at ")

    # STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    logStep("Reading word list completed at ")

    # STEP 2: hand the raw words to cleanWordList
    wordList = cleanWordList(words)
    logStep("Trimming word list completed at ")

    # STEP 3: remove items found in exclusion list, remove empty items.
    # The exclusion test uses the lower-cased word; it is evaluated before
    # the emptiness test, exactly as two successive filters would do.
    exclusionList = sysHandle.loadDictionaries(dirExclusion)
    cleanList = [
        w for w in wordList if w.lower() not in exclusionList and w
    ]
    logStep("Removing exluded items completed at ")

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
def processTab3(pathClean, dirRaw, dirRecycle):
    """Write the entries of the raw pair file that are missing from the clean one.

    Parameters:
        pathClean: path of the clean pair file; also used to derive the
            raw and recycle file paths through getRawPath.
        dirRaw: directory holding the raw counterpart of pathClean.
        dirRecycle: directory receiving the output; opened via openDir
            when done.

    Both files are loaded with loadFormatPairFile; entries present in the
    raw data but not in the clean data are passed to unpackPairs and
    written out. The function ends with sys.exit(), so it never returns to
    its caller.
    """
    rawFile = getRawPath(pathClean, dirRaw)
    recycleFile = getRawPath(pathClean, dirRecycle)

    rawPairs = loadFormatPairFile(rawFile)
    cleanPairs = loadFormatPairFile(pathClean)

    # Keep raw entries that have no equal entry in the clean data.
    leftovers = [pair for pair in rawPairs if pair not in cleanPairs]

    writeListToFile(unpackPairs(leftovers), recycleFile)
    openDir(dirRecycle)
    sys.exit()
objectList.append(objUsage) elif (sectionName == '[secorigin]'): objOrigin = sw.processOriginLines(sectLines) objectList.append(objOrigin) elif (sectionName == '[secpronun]'): objPhonetic = sw.processPhoneticLines(sectLines) objectList.append(objPhonetic) #STEP 4: MERGE OBJECTS masterObject = {} for obj in objectList: for key in obj: masterObject[key] = obj[key] #@pprint(masterObject) #STEP 5: WRITE OUT JSON FILE with open(pathOut, 'w', encoding ="utf-8") as outfile: json.dump(masterObject, outfile) sh.openDir(dirOut)
itemData.append(wordDiv) elif (item.name == 'strong'): wordStrong = processStrong(item) if (wordStrong): itemData.append(wordStrong) return itemData #entryList = soup.find_all('div', {'class' : 'entryWrapper'}) #print(len(entryList)) #for item in entryList: # print(item) if __name__ == "__main__": WORD = "a" dirOut = "E:/FULLTEXT/LEXICO/TEXT" pathIn = "E:/FULLTEXT/LEXICO/HTML/" + WORD + ".html" pathOut = getFilePath(pathIn, dirOut) #print(pathOut) wordData = [] with open(pathIn, "r", encoding="utf-8") as file: contents = file.read() wordData = processLexico(contents) writeListToFile(wordData, pathOut) openDir(dirOut)
def prepareMongoWrite(inPath, logDir='E:/FULLTEXT/LEXICO/LOG'):
    """Run processJSONDirectory on inPath and open the log directory.

    Parameters:
        inPath: path passed as the first argument to processJSONDirectory.
        logDir: directory passed to processJSONDirectory and then opened
            via openDir. Defaults to the previously hard-coded location so
            existing callers are unaffected.
    """
    processJSONDirectory(inPath, logDir)
    openDir(logDir)
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Split the words of inFile into clean and trash lists and write both.

    Parameters:
        inFile: path of the text file read via sysHandle.readTextFile.
        outDir: directory for the clean output; opened via
            sysHandle.openDir when done.
        dictDir: directory passed to sysHandle.loadDictionaries for the
            dictionary used by filterList.
        trashDir: directory for the trash output.
        logDir: directory used to derive the dated log file path.
        recycleDir: directory passed to sysHandle.loadDictionaries for the
            recycle collection used to filter the trash list.

    Each progress message is both printed and collected; the collected
    messages are written to the log file at the end. The function ends with
    sys.exit(), so it never returns to its caller.
    """
    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    logData = []

    def logStep(prefix):
        # Stamp the message, remember it for the log file and echo it.
        entry = prefix + sysHandle.getDateStamp()
        logData.append(entry)
        print(entry)

    logStep("Starting to directionary-check at ")

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)

    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    # split clean and trash based on dictionary
    listClean, listTrash = filterList(rawList, dicList)
    # split the clean list with splitDictByCase (lower part first)
    lowerClean, upperClean = splitDictByCase(listClean)
    # same split for the list returned by getWordList(); only the lower
    # part is used below
    lowerDic, _ = splitDictByCase(getWordList())
    logStep("Loading dictionary completed at ")

    # Keep an upper-case entry only when its lower-cased form is not in the
    # lower-case part of the word list; these go in front of lowerClean.
    newClean = [
        item for item in upperClean if item.lower() not in lowerDic
    ] + lowerClean
    logStep("Completed dictionary checking at ")

    recycleList = sysHandle.loadDictionaries(recycleDir)
    newListTrash = [item for item in listTrash if item not in recycleList]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)
    logStep("Finished directionary checking at ")

    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(outDir)
    sys.exit()
def processTab1(sDictPath, cDictPath, outputDir):
    """Run extractWordPairs, open the output directory, then exit.

    Parameters:
        sDictPath: first path forwarded to extractWordPairs.
        cDictPath: second path forwarded to extractWordPairs.
        outputDir: directory forwarded to extractWordPairs and opened via
            openDir afterwards.

    The function ends with sys.exit(), so it never returns to its caller.
    """
    extractWordPairs(sDictPath, cDictPath, outputDir)
    openDir(outputDir)
    sys.exit()
def processTab2(dirIn, dirOut, dirRecycle):
    """Run mergePairs, open the output directory, then exit.

    Parameters:
        dirIn: input directory forwarded to mergePairs.
        dirOut: output directory forwarded to mergePairs and opened via
            openDir afterwards.
        dirRecycle: recycle directory forwarded to mergePairs.

    The function ends with sys.exit(), so it never returns to its caller.
    """
    mergePairs(dirIn, dirOut, dirRecycle)
    openDir(dirOut)
    sys.exit()