def processText(pathIn, dirOut, dirLog):

    #print('pathIn', pathIn, '\ndirOut', dirOut, '\ndirLog', dirLog)
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    filePrefix = "Extract_Sentence_on_"
    pathLog = sysHandle.getDatedFilePath(filePrefix, dirLog)
    #print('pathLog:', pathLog)

    #print(pathIn)
    #print(pathOut)

    logData = []
    dateStamp = sysHandle.getDateStamp()
    logData.append("Starting to extract sentences at " + dateStamp)

    #print('dateStamp:', dateStamp)

    extractSentences(pathIn, pathOut)
    #print(cleanList)

    dateStamp = sysHandle.getDateStamp()
    logData.append("Sentence extracting completed at " + dateStamp)

    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
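
# extractSentences() is not shown in this example. A minimal sketch of what such a
# helper might do, assuming NLTK's sent_tokenize is acceptable for sentence splitting
# (the real implementation may differ; nltk and its 'punkt' data would be required):
def extractSentencesSketch(pathIn, pathOut):
    from nltk.tokenize import sent_tokenize

    with open(pathIn, encoding='utf-8') as fIn:
        text = fIn.read()

    sentences = sent_tokenize(text)          # split the raw text into sentences

    with open(pathOut, 'w', encoding='utf-8') as fOut:
        fOut.write('\n'.join(sentences))
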
def processJSONDirectory(dataDir, logDir):

    logPath = getDatedFilePath('JSON_To_Mongo_Log', logDir)

    print('log path', logPath)

    logData = []

    dateStamp = getDateStamp()
    message = 'Started processing JSON at ' + dateStamp
    logData.append(message)
    print(message)

    dataFileList = os.listdir(dataDir)

    #print(dataFileList)

    for dataFile in dataFileList:
        logData += processSingleFile(dataFile, dataDir)

    dateStamp = getDateStamp()
    message = 'Finished processing JSON at ' + dateStamp
    logData.append(message)
    print(message)

    writeListToFile(logData, logPath)
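
# A minimal usage sketch for processJSONDirectory(); the JSON directory below is a
# placeholder, while the LOG directory matches the one used elsewhere in these examples:
processJSONDirectory('E:/FULLTEXT/LEXICO/JSON', 'E:/FULLTEXT/LEXICO/LOG')
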
Example #3
def runDownLoad(START_NUMBER, proxies, headers, mode, location):

    PATH_IN = "E:/FULLTEXT/DICTIONARY/NORMALCASE/Combined Lexico Oxford.txt"

    DIR_DATA_OUT = ''
    DIR_LOG_OUT = ''

    print('Path In:', PATH_IN)
    # choose output directories based on mode (local/remote) and location

    if (mode == "local"):
        DIR_DATA_OUT = "E:/FULLTEXT/LEXICO/HTML"
        DIR_LOG_OUT = "E:/FULLTEXT/LEXICO/LOG"

    elif (mode == "remote"):
        if (location == "home"):
            DIR_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
        elif (location == "office"):
            DIR_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"

    print('\nData Path:', DIR_DATA_OUT, '\nLog Path:', DIR_LOG_OUT)

    STOP_NUMBER = START_NUMBER + 10

    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])

    #NOTE: LOG IS FOR EVERY BATCH
    #pathDataOut, pathStatusOut = sysHand.getIncrementPath(START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)
    pathStatusOut = sysHand.getIncrementLogPath(START_NUMBER, DIR_LOG_OUT)

    wordList = sysHand.getLineFromTextFile(PATH_IN)

    #results = []
    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Lexico at ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + proxies['http'])
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for i in range(START_NUMBER, STOP_NUMBER):
        pathDataOut = sysHand.getIncrementDataPath(i, DIR_DATA_OUT)
        word = wordList[i]
        (htmlData, message) = getSingleWord(word, proxies, headers)
        if (htmlData):
            with open(pathDataOut, "w", encoding='utf-8') as file:
                file.write(htmlData)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        time.sleep(7)

    #sysHand.writeDataToJSON(results, pathDataOut)
    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Lexico at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
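
# getSingleWord() is not shown in this example. A rough sketch of the kind of request
# it might make for a Lexico page; the URL pattern and error handling below are
# assumptions for illustration, not the original implementation:
import requests

def getSingleWordSketch(word, proxies, headers):
    url = 'https://www.lexico.com/definition/' + word   # assumed URL pattern
    try:
        res = requests.get(url, proxies=proxies, headers=headers, timeout=15)
        if res.status_code == 200:
            return (res.text, 'OK ' + word)
        return (None, 'HTTP ' + str(res.status_code) + ' for ' + word)
    except requests.exceptions.RequestException as e:
        return (None, 'request failed for ' + word + ': ' + str(e))
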
Example #4
def RunCode(START_NUMBER, proxies, headers, mode, location):

    PATH_IN = "E:/FULLTEXT/DICTIONARY/SPECIALTY/NLTK_Words_List.txt"
    PATH_DATA_OUT = ''
    PATH_LOG_OUT = ''

    print('Path In:', PATH_IN)
    # choose output directories based on mode (local/remote) and location

    if (mode == "local"):
        PATH_DATA_OUT = "E:/FULLTEXT/GOOGLE/RAW"
        PATH_LOG_OUT = "E:/FULLTEXT/GOOGLE/LOG"

    elif (mode == "remote"):
        if (location == "home"):
            PATH_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW"
            PATH_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"
        elif (location == "office"):
            PATH_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW"
            PATH_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"

    print('\nData Path:', PATH_DATA_OUT, '\nLog Path:', PATH_LOG_OUT)

    STOP_NUMBER = START_NUMBER + 100

    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])
    urlTest = "http://icanhazip.com"
    # a timeout keeps a bad proxy from hanging the run before scraping starts
    resTest = requests.get(urlTest, proxies=proxies, headers=headers, timeout=10)
    print('using IP:', resTest.text)

    pathDataOut, pathStatusOut = sysHand.getIncrementPath(
        START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)

    wordList = sysHand.getWordFromTextFile(PATH_IN)

    results = []
    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Google at ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + resTest.text)
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for i in range(START_NUMBER, STOP_NUMBER):
        word = wordList[i]
        (data, message) = getSingleWord(word, proxies, headers)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        if (data):
            results.append(data)
        time.sleep(3)

    sysHand.writeDataToJSON(results, pathDataOut)
    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Google at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
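
# A minimal usage sketch for RunCode(): the proxy address (a TEST-NET placeholder) and
# the User-Agent string are illustrative only; mode/location select the output
# directories defined above:
proxies = {
    'http': 'http://203.0.113.10:8080',
    'https': 'http://203.0.113.10:8080',
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
RunCode(0, proxies, headers, mode='local', location='home')
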
Example #5
def verifyProxy(proxyList, outPath):

    statList = []

    dateStamp = sysHand.getDateStamp()

    statList.append('Start verifying proxy at ' + dateStamp)

    file = open(outPath, 'a', encoding='utf-8')

    for item in proxyList:
        user_agent = getRamdomUserAgent()
        headers = {'User-Agent': user_agent}
        proxies = {}

        proxies['http'] = item
        proxies['https'] = item

        print('Verifying:', item)
        print('using agent:', headers['User-Agent'])
        print('using proxy:', proxies)
        try:
            urlTest = "http://icanhazip.com"
            # a timeout keeps a dead proxy from blocking the loop indefinitely
            resTest = requests.get(urlTest, proxies=proxies, headers=headers, timeout=10)
            print('using IP:', resTest.text)
            file.write(str(item) + '\n')
            statList.append('Successfully verified IP ' + str(item))
            time.sleep(2)

        # requests wraps urllib3 errors (including MaxRetryError) in its own
        # exception classes, so catch requests.exceptions rather than urllib3
        except requests.exceptions.ConnectTimeout:
            statList.append('Connection Time Out Error with IP ' + str(item))
            print('Connection Time Out Error with IP ' + str(item))
        except requests.exceptions.ConnectionError:
            print('Connection Error with IP ' + str(item))
            statList.append('Connection Error with IP ' + str(item))

        except Exception as e:
            statList.append('Error verifying IP ' + str(item))
            print(e)

    file.close()

    dateStamp = sysHand.getDateStamp()

    statList.append('Finish verifying proxy at ' + dateStamp)

    return statList
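
# A minimal usage sketch for verifyProxy(): load candidate proxies with the same
# sysHand helpers the surrounding examples use, verify them, and save the status log
# (the three file paths below are placeholders, not the original locations):
proxyList = sysHand.getLineFromTextFile('E:/FULLTEXT/PROXY/proxy_candidates.txt')
statList = verifyProxy(proxyList, 'E:/FULLTEXT/PROXY/verified_proxies.txt')
sysHand.writeListToFile(statList, 'E:/FULLTEXT/PROXY/verify_log.txt')
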
Example #6
def processText(pathIn, dirOut, dirExclusion, dirLog):
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    #print('dirLog', dirLog)
    initialString = "Word_Extract_Log_"
    pathLog = sysHandle.getDatedFilePath(initialString, dirLog)
    logData = []
    dateStamp = sysHandle.getDateStamp()
    message = "Starting to extract words at " + dateStamp
    logData.append(message)
    print(message)

    #STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    dateStamp = sysHandle.getDateStamp()
    message = "Reading word list completed at " + dateStamp
    logData.append(message)
    print(message)

    #STEP 2: trim left and right, remove duplicates, and sort
    wordList = cleanWordList(words)
    dateStamp = sysHandle.getDateStamp()
    message = "Trimming word list completed at " + dateStamp
    logData.append(message)
    print(message)
    #print(wordList)

    #STEP 3: remove items found in exclusion list, remove empty string
    exclusionList = sysHandle.loadDictionaries(dirExclusion)
    #print(exclusionList)
    cleanList = [w for w in wordList if w.lower() not in exclusionList]
    #remove empty items
    cleanList = [w for w in cleanList if w]

    #log activity
    dateStamp = sysHandle.getDateStamp()
    message = "Removing exluded items completed at " + dateStamp
    logData.append(message)
    print(message)

    #print(cleanList)
    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
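
# cleanWordList() is not shown in this example. Based on the STEP 2 comment above
# (trim, de-duplicate, sort), a minimal sketch might look like this:
def cleanWordListSketch(words):
    trimmed = [w.strip() for w in words]   # trim leading/trailing whitespace
    unique = set(trimmed)                  # remove duplicates
    return sorted(unique)                  # sort alphabetically
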
Example #7
def uploadData(pathIn, bookID, dirLog):

    filePrefix = "Upload_Sentences_To_MySQL_on_"
    pathLog = sysHandle.getDatedFilePath(filePrefix, dirLog)

    logData = []
    dateStamp = sysHandle.getDateStamp()
    logData.append("Starting to upload sentences at " + dateStamp)
    logData.append("BookID being uploaded: " + str(bookID))
    #print(pathIn, bookID)
    sentence_total = upload_data(pathIn, bookID)

    logData.append("Total sentences written to MySQL " + str(sentence_total))
    #sysHandle.openDir(outDir)

    dateStamp = sysHandle.getDateStamp()
    logData.append("Sentence uploading completed at " + dateStamp)

    sysHandle.writeListToFile(logData, pathLog)
    sys.exit()
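
# upload_data() is not shown in this example. A rough sketch of what a sentence upload
# to MySQL could look like, using mysql-connector-python; the connection settings,
# table name, and columns below are hypothetical, not the original schema:
def upload_data_sketch(pathIn, bookID):
    import mysql.connector

    with open(pathIn, encoding='utf-8') as f:
        sentences = [line.strip() for line in f if line.strip()]

    conn = mysql.connector.connect(host='localhost', user='root',
                                   password='secret', database='fulltext')
    cursor = conn.cursor()
    cursor.executemany(
        'INSERT INTO sentences (book_id, sentence) VALUES (%s, %s)',
        [(bookID, s) for s in sentences])
    conn.commit()
    total = cursor.rowcount          # matches the sentence_total the caller logs
    cursor.close()
    conn.close()
    return total
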
def processSingleFile(dataFile, dataDir):

    pathJSON = os.path.join(dataDir, dataFile)
    pathWordList = 'E:/FULLTEXT/LEXICO/LOG/Lexico_Word_List.txt'

    logData = []

    dateStamp = getDateStamp()
    message = 'Started processing ' + dataFile + ' at ' + dateStamp
    logData.append(message)
    print(message)

    message = 'Processing ' + dataFile
    logData.append(message)
    print(message)

    client = MongoClient('localhost', 27017)
    DB_NAME = 'dictionary'
    db = client[DB_NAME]

    jsonData = []

    with open(pathJSON) as json_file:
        jsonData = json.load(json_file)

    fWordList = open(pathWordList, 'a', encoding='utf-8')

    #pprint(jsonData)
    try:
        headWord = jsonData['head-word']
        if headWord:
            strStatus = insertDBOne(jsonData, db)
            message = 'Inserted ' + headWord + ' at ' + strStatus
            logData.append(message)
            print(message)
            fWordList.write(headWord + '\n')
    except Exception as e:
        # record the failure in the log instead of silently swallowing it
        message = 'Error processing ' + dataFile + ': ' + str(e)
        logData.append(message)
        print(message)

    fWordList.close()

    return logData
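
# insertDBOne() is not shown in this example. A minimal sketch of what it might do with
# pymongo, returning the date stamp string that the caller logs; the collection name
# 'entries' is an assumption:
def insertDBOneSketch(jsonData, db):
    from datetime import datetime

    db['entries'].insert_one(jsonData)   # insert the parsed JSON document
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
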
Example #9
if __name__ == "__main__":

    dirIn = 'E:/FULLTEXT/LEXICO/TEXT'
    dirOut = 'E:/FULLTEXT/LEXICO/TEXT2'
    dirLog = 'E:/FULLTEXT/LEXICO/LOG'
    cf = config_handler.ConfigHandler()
    recentFile = cf.get_config_value(cf.RECENT_OPEN_FILE2)
    #print(recentFile)
    fileList = os.listdir(dirIn)
    lastFile = ''
    prefix = 'Lexicon_Second_Run_Log_'
    logData = []
    logPath = getDatedFilePath(prefix, dirLog)
    #print('log path:', logPath)
    timeStamp = getDateStamp()
    message = 'Starting processing at ' + timeStamp
    logData.append(message)
    print(message)

    for item in fileList:
        # file names are compared lexicographically, so this assumes sortable names
        if (item > recentFile):
            lastFile = item
            message = 'Processing item ' + item
            logData.append(message)
            print(message)
            message = processRawText(item, dirIn, dirOut)
            logData.append(message)
            print(message)

    #WRITE INI
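    # The example stops at the "#WRITE INI" marker above: logData and logPath are
    # prepared but never written, and the recent-file marker is not saved back.
    # A minimal completion sketch, assuming ConfigHandler has a setter (the
    # set_config_value name here is an assumption, not the original API):
    timeStamp = getDateStamp()
    message = 'Finished processing at ' + timeStamp
    logData.append(message)
    print(message)
    writeListToFile(logData, logPath)
    if lastFile:
        cf.set_config_value(cf.RECENT_OPEN_FILE2, lastFile)  # hypothetical setter
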
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    #print ('logDir:', logDir, 'recyle Dir:', recycleDir)

    #print ('recycleList:', recycleList)

    initialString = "Dictionary_Check_Log_"
    pathLog = sysHandle.getDatedFilePath(initialString, logDir)
    logData = []
    dateStamp = sysHandle.getDateStamp()
    message = "Starting to directionary-check at " + dateStamp
    logData.append(message)
    print(message)

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)
    #print ('path clean:', pathOutClean, 'path trash:', pathOutTrash)
    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    #split clean and trash based on dictionary
    listClean, listTrash = filterList(rawList, dicList)

    #split into lower case and upper case parts
    lowerClean, upperClean = splitDictByCase(listClean)

    #get a list of words from mysql database
    lowerDic, upperDic = splitDictByCase(getWordList())

    #logging activity
    dateStamp = sysHandle.getDateStamp()
    message = "Loading dictionary completed at " + dateStamp
    logData.append(message)
    print(message)

    newUpperClean = [
        item for item in upperClean if item.lower() not in lowerDic
    ]

    newClean = newUpperClean + lowerClean

    #logging activity
    dateStamp = sysHandle.getDateStamp()
    message = "Completed dictionary checking at " + dateStamp
    logData.append(message)
    print(message)

    recycleList = sysHandle.loadDictionaries(recycleDir)
    newListTrash = [item for item in listTrash if item not in recycleList]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)

    #logging activity
    dateStamp = sysHandle.getDateStamp()
    message = "Finished directionary checking at " + dateStamp
    logData.append(message)
    print(message)
    sysHandle.writeListToFile(logData, pathLog)

    sysHandle.openDir(outDir)
    sys.exit()
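
# filterList() and splitDictByCase() are not shown in this example. Minimal sketches
# based on the comments above (split by dictionary membership, then by initial-letter
# case); the real implementations may differ:
def filterListSketch(rawList, dicList):
    dicSet = set(w.lower() for w in dicList)
    listClean = [w for w in rawList if w.lower() in dicSet]      # found in dictionary
    listTrash = [w for w in rawList if w.lower() not in dicSet]  # not in dictionary
    return listClean, listTrash


def splitDictByCaseSketch(wordList):
    lower = [w for w in wordList if w and w[0].islower()]   # lower-case initial
    upper = [w for w in wordList if w and w[0].isupper()]   # upper-case initial
    return lower, upper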