def processText(pathIn, dirOut):
	"""Clean the word list stored in ``pathIn`` and save it under ``dirOut``.

	The destination path comes from ``sysHandle.getRawPath``; the words read
	by ``sysHandle.getWordFromTextFile`` are passed through ``trim_and_sort``
	before being written. Afterwards ``dirOut`` is opened and ``sys.exit()``
	is called, so this function does not return to its caller.
	"""
	destination = sysHandle.getRawPath(pathIn, dirOut)
	tidyWords = trim_and_sort(sysHandle.getWordFromTextFile(pathIn))
	sysHandle.writeListToFile(tidyWords, destination)
	sysHandle.openDir(dirOut)
	sys.exit()
def processText(pathIn, dirOut, dirLog):
    """Run sentence extraction on ``pathIn`` and leave a two-line log.

    ``extractSentences`` receives the input path and an output path built by
    ``sysHandle.getRawPath``. A start line and a completion line, each ending
    in the value of ``sysHandle.getDateStamp()``, are written to a dated log
    file inside ``dirLog``. Finally ``dirOut`` is opened and ``sys.exit()``
    is called, so this function does not return to its caller.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    # Stamp taken before the work starts.
    startLine = "Starting to extract sentences at " + sysHandle.getDateStamp()

    extractSentences(pathIn, pathOut)

    # Stamp taken once extraction has returned.
    endLine = "Sentence extracting completed at " + sysHandle.getDateStamp()

    sysHandle.writeListToFile([startLine, endLine], pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(path1, path2):
    """Extract the text of ``path1`` and write it to a file under ``path2``.

    The output location is computed by ``getNormalPath(path1, path2)``; the
    text is whatever ``parser.from_file(path1)`` stores under ``"content"``.
    After writing, ``path2`` is opened with ``openDir`` and ``sys.exit()`` is
    called, so this function does not return to its caller.
    """
    output_path = getNormalPath(path1, path2)
    parsed = parser.from_file(path1)
    # NOTE(review): ``parser`` looks like tika's parser, which reports
    # ``None`` as content when nothing could be extracted - confirm.
    # Writing None would raise TypeError after the output file has already
    # been truncated, so fall back to an empty string instead.
    content = parsed["content"] or ""
    with open(output_path, 'w', encoding="utf-8") as file:
        file.write(content)
    openDir(path2)
    sys.exit()
Esempio n. 4
0
def processText(bookID, outDir):
	"""Match the words of book ``bookID`` to its sentences and save the result.

	Words and sentences are fetched through ``sqlData``; the word list is
	sorted in place before ``matchWordToSent`` is called. The matches are
	written with ``sysHand.writeTupleToFile`` to a path derived from
	``bookID`` and ``outDir``. ``outDir`` is then opened and ``sys.exit()``
	is called, so this function does not return to its caller.
	"""
	target = sysHand.getNormalPath(bookID, outDir)

	vocabulary = sqlData.getWordList(bookID)
	# Sorted in place, exactly as the list object returned by sqlData.
	vocabulary.sort()
	sentences = sqlData.getSentences(bookID)

	pairs = matchWordToSent(vocabulary, sentences, bookID)
	sysHand.writeTupleToFile(pairs, target)
	sysHand.openDir(outDir)
	sys.exit()
def processText(inFile, outDir):
    """Run the two line-cleaning passes over ``inFile`` and save the result.

    The text read by ``sH.readTextFile`` goes through ``cleanLine1`` and then
    ``cleanLine2``; the outcome is written with ``writeTextFile`` to the path
    returned by ``sH.getNormalPath(inFile, outDir)``. ``outDir`` is opened
    afterwards and ``sys.exit()`` is called, so this function does not
    return to its caller.
    """
    destination = sH.getNormalPath(inFile, outDir)
    rawText = sH.readTextFile(inFile)
    # Order matters: cleanLine2 operates on the output of cleanLine1.
    cleaned = cleanLine2(cleanLine1(rawText))

    writeTextFile(cleaned, destination)
    sH.openDir(outDir)
    sys.exit()
Esempio n. 6
0
def processText(inFile, outDir, dbDir):
    """Write the entries of ``inFile`` that are known nowhere else.

    An entry is kept when it appears neither in the same-named file under
    ``dbDir`` nor in the list returned by ``getWordList()``. Survivors are
    written, in their original order, to the same-named file under
    ``outDir``. ``outDir`` is then opened and ``sys.exit()`` is called, so
    this function does not return to its caller.
    """
    recyclePath = sysHandle.getRawPath(inFile, outDir)
    databasePath = sysHandle.getRawPath(inFile, dbDir)

    candidates = sysHandle.getWordFromTextFile(inFile)
    knownInDatabase = sysHandle.getWordFromTextFile(databasePath)
    knownInStandard = getWordList()

    # One pass applying both exclusions; equivalent to filtering twice.
    survivors = [
        entry
        for entry in candidates
        if entry not in knownInDatabase and entry not in knownInStandard
    ]

    sysHandle.writeListToFile(survivors, recyclePath)
    sysHandle.openDir(outDir)
    sys.exit()
Esempio n. 7
0
def processText(inFile, outDir):
    """De-duplicate the comma-separated lines of ``inFile``.

    Lines whose first comma-separated field is blank after stripping are
    dropped; of the remaining lines only the first occurrence of each is
    kept, in original order. The result is written to the path returned by
    ``getOutPath(inFile, outDir)``. ``outDir`` is then opened and
    ``sys.exit()`` is called, so this function does not return to its caller.
    """
    outFilePath = getOutPath(inFile, outDir)

    rows = sH.readTextFile(inFile).split("\n")
    # dict keys keep first-seen order, so this removes repeats without
    # reordering the surviving rows.
    uniqueRows = list(
        dict.fromkeys(row for row in rows if row.split(",")[0].strip())
    )

    sH.writeListToFile(uniqueRows, outFilePath)
    sH.openDir(outDir)
    sys.exit()
Esempio n. 8
0
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract a filtered word list from ``pathIn`` and log each stage.

    Stages: read the words, pass them through ``cleanWordList``, then drop
    every word whose lower-cased form is in the collection loaded from
    ``dirExclusion`` as well as every empty entry. The word list goes to a
    path derived from ``pathIn`` and ``dirOut``; the progress lines (also
    printed as they occur) go to a dated log file in ``dirLog``. ``dirOut``
    is then opened and ``sys.exit()`` is called, so this function does not
    return to its caller.
    """
    logData = []

    def report(prefix):
        # Stamp one progress line, remember it for the log file, echo it.
        line = prefix + sysHandle.getDateStamp()
        logData.append(line)
        print(line)

    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)
    report("Starting to extract words at ")

    # Stage 1: read the data file and split it into words.
    words = sysHandle.getWordFromTextFile(pathIn)
    report("Reading word list completed at ")

    # Stage 2: trimming / de-duplication / sorting is delegated to
    # cleanWordList.
    wordList = cleanWordList(words)
    report("Trimming word list completed at ")

    # Stage 3: drop excluded words and empty strings in a single pass.
    exclusionList = sysHandle.loadDictionaries(dirExclusion)
    cleanList = [
        word
        for word in wordList
        if word.lower() not in exclusionList and word
    ]
    report("Removing exluded items completed at ")

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
Esempio n. 9
0
def processTab3(pathClean, dirRaw, dirRecycle):
    """Save the entries of the raw file that are missing from the clean file.

    The raw and recycle paths are both derived from ``pathClean`` with
    ``getRawPath``. Both files are loaded with ``loadFormatPairFile``; items
    present in the raw content but not in the clean content are passed to
    ``unpackPairs`` and written to the recycle path. ``dirRecycle`` is then
    opened and ``sys.exit()`` is called, so this function does not return to
    its caller.
    """
    rawPath = getRawPath(pathClean, dirRaw)
    recyclePath = getRawPath(pathClean, dirRecycle)

    rawItems = loadFormatPairFile(rawPath)
    cleanItems = loadFormatPairFile(pathClean)

    # Keep raw-file order; membership is tested against the clean content.
    discarded = [entry for entry in rawItems if entry not in cleanItems]

    writeListToFile(unpackPairs(discarded), recyclePath)
    openDir(dirRecycle)
    sys.exit()
Esempio n. 10
0
		objectList.append(objUsage)
	
	elif (sectionName == '[secorigin]'):
		objOrigin = sw.processOriginLines(sectLines)
		objectList.append(objOrigin)
	
	elif (sectionName == '[secpronun]'):
		objPhonetic = sw.processPhoneticLines(sectLines)
		objectList.append(objPhonetic)





#STEP 4: MERGE OBJECTS
# Fold every section object into one dictionary; when two objects share a
# key, the one that comes later in objectList wins.
masterObject = {}
for obj in objectList:
	masterObject.update((key, obj[key]) for key in obj)


#STEP 5: WRITE OUT JSON FILE
# Serialise the merged dictionary, then open the output directory.
with open(pathOut, 'w', encoding="utf-8") as outfile:
	json.dump(masterObject, outfile)
sh.openDir(dirOut)
	

Esempio n. 11
0
                    itemData.append(wordDiv)
        elif (item.name == 'strong'):
            wordStrong = processStrong(item)
            if (wordStrong):
                itemData.append(wordStrong)
    return itemData

    #entryList = soup.find_all('div', {'class' : 'entryWrapper'})
    #print(len(entryList))
    #for item in entryList:
    #	print(item)


if __name__ == "__main__":
    # Script entry point: convert one saved HTML page into a text list.
    # The module-level names below are kept as-is because other code in
    # the file may read them.
    WORD = "a"
    dirOut = "E:/FULLTEXT/LEXICO/TEXT"
    pathIn = "E:/FULLTEXT/LEXICO/HTML/" + WORD + ".html"
    pathOut = getFilePath(pathIn, dirOut)

    # Read the whole page first; parsing does not need the file handle.
    with open(pathIn, "r", encoding="utf-8") as file:
        contents = file.read()
    wordData = processLexico(contents)

    writeListToFile(wordData, pathOut)
    openDir(dirOut)
Esempio n. 12
0
def prepareMongoWrite(inPath, logDir='E:/FULLTEXT/LEXICO/LOG'):
    """Run ``processJSONDirectory`` on ``inPath`` and open the log directory.

    Args:
        inPath: Passed unchanged as the first argument of
            ``processJSONDirectory``.
        logDir: Directory handed to ``processJSONDirectory`` as its second
            argument and opened afterwards with ``openDir``. Defaults to
            the location that used to be hard-coded, so existing
            single-argument calls behave as before.
    """
    processJSONDirectory(inPath, logDir)
    openDir(logDir)
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Split the entries of ``inFile`` into a clean list and a trash list.

    ``filterList`` separates the converted input against the dictionaries
    loaded from ``dictDir``. The clean part is split with
    ``splitDictByCase``; entries from its second ("upper") part are kept
    only if their lower-cased form is absent from the first part of
    ``splitDictByCase(getWordList())``. Trash entries already present in the
    dictionaries loaded from ``recycleDir`` are dropped. Both lists are
    written to same-named files under ``outDir`` and ``trashDir``, and the
    progress lines (also printed as they occur) go to a dated log file in
    ``logDir``. ``outDir`` is then opened and ``sys.exit()`` is called, so
    this function does not return to its caller.
    """
    logData = []

    def announce(prefix):
        # Stamp one progress line, remember it for the log file, echo it.
        # NOTE(review): the "directionary" spelling in two of the messages
        # is reproduced unchanged from the existing log format.
        line = prefix + sysHandle.getDateStamp()
        logData.append(line)
        print(line)

    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    announce("Starting to directionary-check at ")

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)
    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    # Dictionary hits go to the clean list, everything else to trash.
    listClean, listTrash = filterList(rawList, dicList)
    lowerClean, upperClean = splitDictByCase(listClean)

    # Only the first part of the reference split is consulted below.
    lowerDic, _upperDic = splitDictByCase(getWordList())
    announce("Loading dictionary completed at ")

    keptUpper = [
        entry for entry in upperClean if entry.lower() not in lowerDic
    ]
    newClean = keptUpper + lowerClean
    announce("Completed dictionary checking at ")

    recycleList = sysHandle.loadDictionaries(recycleDir)
    newListTrash = [entry for entry in listTrash if entry not in recycleList]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)

    announce("Finished directionary checking at ")
    sysHandle.writeListToFile(logData, pathLog)

    sysHandle.openDir(outDir)
    sys.exit()
Esempio n. 14
0
def processTab1(sDictPath, cDictPath, outputDir):
    """Run ``extractWordPairs`` and open the output directory.

    All three arguments are forwarded unchanged to ``extractWordPairs``.
    ``outputDir`` is then opened with ``openDir`` and ``sys.exit()`` is
    called, so this function does not return to its caller.
    """
    extractWordPairs(sDictPath, cDictPath, outputDir)
    openDir(outputDir)
    sys.exit()
Esempio n. 15
0
def processTab2(dirIn, dirOut, dirRecycle):
    """Run ``mergePairs`` and open the output directory.

    All three arguments are forwarded unchanged to ``mergePairs``.
    ``dirOut`` is then opened with ``openDir`` and ``sys.exit()`` is called,
    so this function does not return to its caller.
    """
    mergePairs(dirIn, dirOut, dirRecycle)
    openDir(dirOut)
    sys.exit()