Ejemplo n.º 1
0
def processJSONDirectory(dataDir, logDir):
    """Run ``processSingleFile`` on every entry of ``dataDir`` and log the run.

    The log starts with a "Started" line, collects whatever each
    ``processSingleFile(name, dataDir)`` call returns, ends with a
    "Finished" line, and is written through ``writeListToFile`` to the
    path produced by ``getDatedFilePath('JSON_To_Mongo_Log', logDir)``.

    Args:
        dataDir: Directory whose entries (as listed by ``os.listdir``)
            are handed to ``processSingleFile`` one by one.
        logDir: Directory passed to ``getDatedFilePath`` to build the
            log file path.
    """
    log_file = getDatedFilePath('JSON_To_Mongo_Log', logDir)
    print('log path', log_file)

    # Opening entry: recorded in the log and echoed to the console.
    start_note = 'Started processing JSON at ' + getDateStamp()
    entries = [start_note]
    print(start_note)

    # Each call contributes its own sequence of log lines.
    for name in os.listdir(dataDir):
        entries.extend(processSingleFile(name, dataDir))

    # Closing entry, stamped after all entries have been handled.
    end_note = 'Finished processing JSON at ' + getDateStamp()
    entries.append(end_note)
    print(end_note)

    writeListToFile(entries, log_file)
def processText(pathIn, dirOut, dirLog):
    """Extract sentences from ``pathIn`` and record start/end times in a log.

    The output path comes from ``sysHandle.getRawPath(pathIn, dirOut)``
    and the log path from ``sysHandle.getDatedFilePath`` with the prefix
    ``"Extract_Sentence_on_"``. After ``extractSentences`` has run, the
    two-line log is written, ``dirOut`` is handed to
    ``sysHandle.openDir``, and the interpreter is asked to exit.

    Args:
        pathIn: Path of the input file given to ``extractSentences``.
        dirOut: Directory used to derive the output path; also passed to
            ``sysHandle.openDir`` at the end.
        dirLog: Directory used to derive the log file path.

    Raises:
        SystemExit: Always, via the final ``sys.exit()`` call.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    # The log only brackets the extraction with a start and an end stamp.
    entries = [
        "Starting to extract sentences at " + sysHandle.getDateStamp(),
    ]

    extractSentences(pathIn, pathOut)

    entries.append(
        "Sentence extracting completed at " + sysHandle.getDateStamp()
    )

    sysHandle.writeListToFile(entries, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
Ejemplo n.º 3
0
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract a cleaned word list from ``pathIn`` and log each stage.

    Stages, each followed by a time-stamped log line that is also
    printed:

    1. read the words with ``sysHandle.getWordFromTextFile``;
    2. pass them through ``cleanWordList``;
    3. drop every word whose lower-cased form is found in the value
       returned by ``sysHandle.loadDictionaries(dirExclusion)``, and
       drop empty items.

    The surviving words and the log are written with
    ``sysHandle.writeListToFile``, ``dirOut`` is handed to
    ``sysHandle.openDir``, and the interpreter is asked to exit.

    Args:
        pathIn: Path of the text file to read words from.
        dirOut: Directory used to derive the output path; also passed to
            ``sysHandle.openDir`` at the end.
        dirExclusion: Passed to ``sysHandle.loadDictionaries`` to obtain
            the exclusion collection.
        dirLog: Directory used to derive the log file path.

    Raises:
        SystemExit: Always, via the final ``sys.exit()`` call.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)
    journal = []

    def record(prefix):
        # Stamp the message, keep it for the log file and echo it.
        line = prefix + sysHandle.getDateStamp()
        journal.append(line)
        print(line)

    record("Starting to extract words at ")

    # Stage 1: read the data file and split it into words.
    rawWords = sysHandle.getWordFromTextFile(pathIn)
    record("Reading word list completed at ")

    # Stage 2: clean-up delegated to cleanWordList.
    trimmed = cleanWordList(rawWords)
    record("Trimming word list completed at ")

    # Stage 3: discard excluded words, then discard empty items.
    excluded = sysHandle.loadDictionaries(dirExclusion)
    kept = []
    for word in trimmed:
        if word.lower() not in excluded and word:
            kept.append(word)
    record("Removing exluded items completed at ")

    sysHandle.writeListToFile(kept, pathOut)
    sysHandle.writeListToFile(journal, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
Ejemplo n.º 4
0
def uploadData(pathIn, bookID, dirLog):
    """Upload the sentences of ``pathIn`` via ``upload_data`` and log the run.

    The log records the start time, the book id, the value returned by
    ``upload_data(pathIn, bookID)`` (reported as the total number of
    sentences written) and the completion time. It is written to the
    path built by ``sysHandle.getDatedFilePath`` with the prefix
    ``"Upload_Sentences_To_MySQL_on_"``; afterwards the interpreter is
    asked to exit.

    Args:
        pathIn: Path forwarded to ``upload_data``.
        bookID: Identifier forwarded to ``upload_data``; converted with
            ``str`` for the log.
        dirLog: Directory used to derive the log file path.

    Raises:
        SystemExit: Always, via the final ``sys.exit()`` call.
    """
    pathLog = sysHandle.getDatedFilePath(
        "Upload_Sentences_To_MySQL_on_", dirLog
    )

    # Header lines are produced before the upload starts.
    entries = [
        "Starting to upload sentences at " + sysHandle.getDateStamp(),
        "BookID being uploaded: " + str(bookID),
    ]

    uploaded = upload_data(pathIn, bookID)
    entries.append("Total sentences written to MySQL " + str(uploaded))
    entries.append(
        "Sentence uploading completed at " + sysHandle.getDateStamp()
    )

    sysHandle.writeListToFile(entries, pathLog)
    sys.exit()
Ejemplo n.º 5
0
            print('Connection Error with IP ' + str(item))
            statList.append('Connection Error with IP ' + str(item))
        except urllib3.exceptions.MaxRetryError:
            print('Max Retry Error with IP ' + str(item))
            statList.append('Max Retry Error with IP ' + str(item))

        except Exception as e:
            statList.append('Error verifying IP ' + str(item))
            print(e)

    file.close()

    dateStamp = sysHand.getDateStamp()

    statList.append('Finish verifying proxy at ' + dateStamp)

    return statList


if __name__ == '__main__':
    # Script entry point: load proxy lines, verify them and write a status log.
    # All locations below are hard-coded absolute paths.
    proxyDir = 'D:/Proxy/List'
    outPath = 'D:/Proxy/Filter/good_proxy_list.txt'
    logDir = 'D:/Proxy/Log'
    initialString = "Proxy_Verification_Log_"
    # Log file path is built from the prefix above and the log directory.
    logPath = sysHand.getDatedFilePath(initialString, logDir)
    # Opening in "w" mode and closing immediately truncates the output file
    # (creating it if missing) before verification starts.
    open(outPath, "w").close()
    proxyList = sysHand.loadProxyLines(proxyDir)
    # The returned list of status messages becomes the log content.
    # NOTE(review): presumably verifyProxy writes the usable proxies to
    # outPath -- confirm against its full definition.
    statusList = verifyProxy(proxyList, outPath)
    sysHand.writeListToFile(statusList, logPath)
Ejemplo n.º 6
0
    return message


if __name__ == "__main__":

    # Script entry point: run processRawText on every entry of dirIn whose
    # name compares greater than the configured "recent file" value, logging
    # each step to logData and to the console.
    # All locations below are hard-coded absolute paths.
    dirIn = 'E:/FULLTEXT/LEXICO/TEXT'
    dirOut = 'E:/FULLTEXT/LEXICO/TEXT2'
    dirLog = 'E:/FULLTEXT/LEXICO/LOG'
    cf = config_handler.ConfigHandler()
    # Marker read from the configuration; used below as the lower bound
    # for which entries get processed.
    recentFile = cf.get_config_value(cf.RECENT_OPEN_FILE2)
    #print(recentFile)
    fileList = os.listdir(dirIn)
    # NOTE(review): lastFile is assigned in the loop but never read in the
    # lines visible here -- presumably it is saved back to the config
    # further down; confirm.
    lastFile = ''
    prefix = 'Lexicon_Second_Run_Log_'
    logData = []
    logPath = getDatedFilePath(prefix, dirLog)
    #print('log path:', logPath)
    timeStamp = getDateStamp()
    message = 'Starting processing at ' + timeStamp
    logData.append(message)
    print(message)

    for item in fileList:
        # Only entries that compare greater than the stored marker are
        # handled (assumes both are strings, i.e. lexicographic order --
        # TODO confirm what get_config_value returns).
        # NOTE(review): os.listdir returns names in arbitrary order, so
        # lastFile ends up as the last matching entry visited, which is not
        # necessarily the greatest name.
        if (item > recentFile):
            lastFile = item
            message = 'Processsing item ' + item
            logData.append(message)
            print(message)
            # The value returned by processRawText is logged and printed
            # as the status line for this item.
            message = processRawText(item, dirIn, dirOut)
            logData.append(message)
            print(message)
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Split the items of ``inFile`` into a clean list and a trash list.

    Steps:

    1. Read ``inFile`` with ``sysHandle.readTextFile`` and pass the
       result through ``convertList``.
    2. Split it with ``filterList`` against the value returned by
       ``sysHandle.loadDictionaries(dictDir)`` into a clean part and a
       trash part.
    3. Split the clean part with ``splitDictByCase``; from the second
       group (named "upper" here) keep only items whose lower-cased form
       is not in the first group that ``splitDictByCase`` returns for
       ``getWordList()``. The first ("lower") clean group is kept whole.
    4. Remove from the trash part every item found in the value returned
       by ``sysHandle.loadDictionaries(recycleDir)``.
    5. Write both lists and the log, hand ``outDir`` to
       ``sysHandle.openDir`` and ask the interpreter to exit.

    Time-stamped progress lines are appended to the log and printed
    along the way.

    Args:
        inFile: Path of the input file.
        outDir: Directory used to derive the clean output path; also
            passed to ``sysHandle.openDir`` at the end.
        dictDir: Passed to ``sysHandle.loadDictionaries`` for step 2.
        trashDir: Directory used to derive the trash output path.
        logDir: Directory used to derive the log file path.
        recycleDir: Passed to ``sysHandle.loadDictionaries`` for step 4.

    Raises:
        SystemExit: Always, via the final ``sys.exit()`` call.
    """
    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    journal = []

    def record(prefix):
        # Stamp the message, keep it for the log file and echo it.
        line = prefix + sysHandle.getDateStamp()
        journal.append(line)
        print(line)

    record("Starting to directionary-check at ")

    cleanTarget = sysHandle.getRawPath(inFile, outDir)
    trashTarget = sysHandle.getRawPath(inFile, trashDir)
    candidates = convertList(sysHandle.readTextFile(inFile))
    dictionary = sysHandle.loadDictionaries(dictDir)

    # Dictionary decides what is clean and what is trash.
    accepted, rejected = filterList(candidates, dictionary)

    # Separate the accepted items by case.
    lowerAccepted, upperAccepted = splitDictByCase(accepted)

    # Only the first group of the getWordList() split is consulted below.
    knownLower, _ = splitDictByCase(getWordList())
    record("Loading dictionary completed at ")

    # Keep "upper" items only when their lower-cased form is not known.
    keptUpper = []
    for word in upperAccepted:
        if word.lower() not in knownLower:
            keptUpper.append(word)
    finalClean = keptUpper + lowerAccepted
    record("Completed dictionary checking at ")

    # Items present in the recycle collection are removed from the trash.
    recycled = sysHandle.loadDictionaries(recycleDir)
    finalTrash = []
    for word in rejected:
        if word not in recycled:
            finalTrash.append(word)

    sysHandle.writeListToFile(finalClean, cleanTarget)
    sysHandle.writeListToFile(finalTrash, trashTarget)

    record("Finished directionary checking at ")
    sysHandle.writeListToFile(journal, pathLog)

    sysHandle.openDir(outDir)
    sys.exit()