def processText(pathIn, dirOut, dirLog):
    """Extract sentences from the text file at *pathIn* into *dirOut*.

    A dated activity log is written into *dirLog*, the output directory is
    opened for the user, and the interpreter is then terminated.

    Improvements over the original: commented-out debug prints removed.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    filePrefix = "Extract_Sentence_on_"
    pathLog = sysHandle.getDatedFilePath(filePrefix, dirLog)

    logData = []
    dateStamp = sysHandle.getDateStamp()
    logData.append("Starting to extract sentences at " + dateStamp)

    # Core work: delegate the actual extraction to the project helper.
    extractSentences(pathIn, pathOut)

    dateStamp = sysHandle.getDateStamp()
    logData.append("Sentence extracting completed at " + dateStamp)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    # NOTE(review): exits the whole interpreter — no code after this call runs.
    sys.exit()
def processJSONDirectory(dataDir, logDir):
    """Insert every JSON file found in *dataDir* into Mongo.

    Per-file results are collected from processSingleFile() and the combined
    activity log is written into *logDir*.
    """
    logPath = getDatedFilePath('JSON_To_Mongo_Log', logDir)
    print('log path', logPath)

    startMsg = 'Started processing JSON at ' + getDateStamp()
    logData = [startMsg]
    print(startMsg)

    # Process each entry in the data directory, accumulating its log lines.
    for entry in os.listdir(dataDir):
        logData.extend(processSingleFile(entry, dataDir))

    endMsg = 'Finished processing JSON at ' + getDateStamp()
    logData.append(endMsg)
    print(endMsg)

    writeListToFile(logData, logPath)
def runDownLoad(START_NUMBER, proxies, headers, mode, location):
    """Scrape one batch of 10 Lexico pages starting at index START_NUMBER.

    Each fetched page is saved as its own HTML file; a per-batch status log
    is written at the end.  *mode*/*location* select the output directories
    for the machine the script runs on.
    """
    PATH_IN = "E:/FULLTEXT/DICTIONARY/NORMALCASE/Combined Lexico Oxford.txt"
    DIR_DATA_OUT = ''
    DIR_LOG_OUT = ''
    print('Path In:', PATH_IN)

    # Resolve output directories from run mode / machine location.
    # Unknown mode/location combinations leave both paths empty, as before.
    if mode == "local":
        DIR_DATA_OUT = "E:/FULLTEXT/LEXICO/HTML"
        DIR_LOG_OUT = "E:/FULLTEXT/LEXICO/LOG"
    elif mode == "remote":
        remoteDirs = {
            "home": ("C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML",
                     "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"),
            "office": ("C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML",
                       "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"),
        }
        if location in remoteDirs:
            DIR_DATA_OUT, DIR_LOG_OUT = remoteDirs[location]
    print('\nData Path:', DIR_DATA_OUT, '\nLog Path:', DIR_LOG_OUT)

    STOP_NUMBER = START_NUMBER + 10  # batch size is 10 words
    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])

    # NOTE: the status log is written once per batch.
    pathStatusOut = sysHand.getIncrementLogPath(START_NUMBER, DIR_LOG_OUT)
    wordList = sysHand.getLineFromTextFile(PATH_IN)

    status = []
    status.append('Starting scraping Lexico at ' + sysHand.getDateStamp())
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + proxies['http'])
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for idx in range(START_NUMBER, STOP_NUMBER):
        pathDataOut = sysHand.getIncrementDataPath(idx, DIR_DATA_OUT)
        (htmlData, message) = getSingleWord(wordList[idx], proxies, headers)
        # Only persist pages that actually came back with content.
        if htmlData:
            with open(pathDataOut, "w", encoding='utf-8') as outFile:
                outFile.write(htmlData)
        print(idx, ':', message)
        status.append(str(idx) + ' ' + message)
        time.sleep(7)  # throttle between requests

    status.append('Ending scraping Lexico at ' + sysHand.getDateStamp())
    sysHand.writeListToFile(status, pathStatusOut)
def RunCode(START_NUMBER, proxies, headers, mode, location):
    """Scrape one batch of 100 Google results starting at index START_NUMBER.

    Verifies the outbound proxy IP first, collects per-word results into a
    single JSON file, and writes a per-batch status log.  *mode*/*location*
    select the output directories for the machine the script runs on.
    """
    PATH_IN = "E:/FULLTEXT/DICTIONARY/SPECIALTY/NLTK_Words_List.txt"
    PATH_DATA_OUT = ''
    PATH_LOG_OUT = ''
    print('Path In:', PATH_IN)

    # Resolve output directories from run mode / machine location.
    # Unknown mode/location combinations leave both paths empty, as before.
    if mode == "local":
        PATH_DATA_OUT = "E:/FULLTEXT/GOOGLE/RAW"
        PATH_LOG_OUT = "E:/FULLTEXT/GOOGLE/LOG"
    elif mode == "remote":
        remoteDirs = {
            "home": ("C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW",
                     "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"),
            "office": ("C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW",
                       "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"),
        }
        if location in remoteDirs:
            PATH_DATA_OUT, PATH_LOG_OUT = remoteDirs[location]
    print('\nData Path:', PATH_DATA_OUT, '\nLog Path:', PATH_LOG_OUT)

    STOP_NUMBER = START_NUMBER + 100  # batch size is 100 words
    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])

    # Confirm which IP the proxy actually presents before scraping.
    urlTest = "http://icanhazip.com"
    resTest = requests.get(urlTest, proxies=proxies, headers=headers)
    print('using IP:', resTest.text)

    pathDataOut, pathStatusOut = sysHand.getIncrementPath(
        START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)
    wordList = sysHand.getWordFromTextFile(PATH_IN)

    results = []
    status = []
    status.append('Starting scraping Google at ' + sysHand.getDateStamp())
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + resTest.text)
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for idx in range(START_NUMBER, STOP_NUMBER):
        (data, message) = getSingleWord(wordList[idx], proxies, headers)
        print(idx, ':', message)
        status.append(str(idx) + ' ' + message)
        # Only keep entries that returned data.
        if data:
            results.append(data)
        time.sleep(3)  # throttle between requests

    sysHand.writeDataToJSON(results, pathDataOut)
    status.append('Ending scraping Google at ' + sysHand.getDateStamp())
    sysHand.writeListToFile(status, pathStatusOut)
def verifyProxy(proxyList, outPath):
    """Probe each proxy in *proxyList* against icanhazip.com.

    Working proxies are appended (one per line) to the file at *outPath*.
    Returns a list of status strings describing every attempt.

    BUG FIX: requests wraps transport failures in ``requests.exceptions.*``;
    the original handlers caught ``urllib3.exceptions.*`` and could never
    match, so every failure fell through to the generic handler.  The file
    is now also managed with ``with`` so it is closed even if an unexpected
    exception escapes, and the "Sucessfully" typo in the log message is fixed.
    """
    statList = []
    statList.append('Start verifying proxy at ' + sysHand.getDateStamp())
    urlTest = "http://icanhazip.com"
    with open(outPath, 'a', encoding='utf-8') as file:
        for item in proxyList:
            headers = {'User-Agent': getRamdomUserAgent()}
            proxies = {'http': item, 'https': item}
            print('Verifying:', item)
            print('using agent:', headers['User-Agent'])
            print('using proxy:', proxies)
            try:
                resTest = requests.get(urlTest, proxies=proxies, headers=headers)
                print('using IP:', resTest.text)
                file.write(str(item) + '\n')
                statList.append('Successfully verified IP ' + str(item))
                time.sleep(2)  # pause between probes
            # ConnectTimeout subclasses ConnectionError, so it must come first.
            except requests.exceptions.ConnectTimeout:
                statList.append('Connection Time Out Error with IP ' + str(item))
                print('Connection Time Out Error with IP ' + str(item))
            except requests.exceptions.RetryError:
                print('Max Retry Error with IP ' + str(item))
                statList.append('Max Retry Error with IP ' + str(item))
            except requests.exceptions.ConnectionError:
                print('Connection Error with IP ' + str(item))
                statList.append('Connection Error with IP ' + str(item))
            except Exception as e:
                # Last-resort catch-all: record the failure but keep probing.
                statList.append('Error verifying IP ' + str(item))
                print(e)
    statList.append('Finish verifying proxy at ' + sysHand.getDateStamp())
    return statList
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract a cleaned word list from *pathIn* into *dirOut*.

    Pipeline: read words -> trim/deduplicate/sort -> drop words present in
    the exclusion dictionaries under *dirExclusion* -> write the result and
    a dated activity log, open the output directory, then exit.

    Fixes: "exluded" typo in the step-3 log message; commented-out debug
    prints removed.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    initialString = "Word_Extract_Log_"
    pathLog = sysHandle.getDatedFilePath(initialString, dirLog)

    logData = []
    message = "Starting to extract words at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    # STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    message = "Reading word list completed at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    # STEP 2: trim left/right, remove overlappings and sort
    wordList = cleanWordList(words)
    message = "Trimming word list completed at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    # STEP 3: remove items found in the exclusion list, then drop empties
    exclusionList = sysHandle.loadDictionaries(dirExclusion)
    cleanList = [w for w in wordList if w.lower() not in exclusionList]
    cleanList = [w for w in cleanList if w]

    message = "Removing excluded items completed at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    # NOTE(review): exits the whole interpreter — no code after this call runs.
    sys.exit()
def uploadData(pathIn, bookID, dirLog):
    """Upload the sentences in *pathIn* to MySQL under *bookID*.

    Delegates the actual upload to upload_data(), writes a dated activity
    log into *dirLog*, and terminates the interpreter.
    """
    pathLog = sysHandle.getDatedFilePath("Upload_Sentences_To_MySQL_on_", dirLog)

    logData = [
        "Starting to upload sentences at " + sysHandle.getDateStamp(),
        "BookID being uploaded: " + str(bookID),
    ]

    # Core work: insert all sentences; helper reports how many were written.
    sentence_total = upload_data(pathIn, bookID)
    logData.append("Total sentences written to MySQL " + str(sentence_total))

    logData.append("Sentence uploading completed at " + sysHandle.getDateStamp())
    sysHandle.writeListToFile(logData, pathLog)
    sys.exit()
def processSingleFile(dataFile, dataDir):
    """Load one JSON file and insert its entry into the local Mongo DB.

    The inserted head-word is also appended to the running word-list file.
    Returns the list of log lines produced while processing this file.

    Fixes: the word-list file is now managed with ``with`` (always closed);
    the broad exception handler now records the actual error in the returned
    log instead of printing a generic message and discarding it; explicit
    UTF-8 encoding on file reads/writes.
    """
    pathJSON = os.path.join(dataDir, dataFile)
    pathWordList = 'E:/FULLTEXT/LEXICO/LOG/Lexico_Word_List.txt'

    logData = []
    message = 'Started processing ' + dataFile + ' at ' + getDateStamp()
    logData.append(message)
    print(message)
    message = 'Processing ' + dataFile
    logData.append(message)
    print(message)

    client = MongoClient('localhost', 27017)
    DB_NAME = 'dictionary'
    db = client[DB_NAME]

    with open(pathJSON, encoding='utf-8') as json_file:
        jsonData = json.load(json_file)

    with open(pathWordList, 'a', encoding='utf-8') as fWordList:
        try:
            headWord = jsonData['head-word']
            # Only insert entries that actually carry a head word.
            if headWord:
                strStatus = insertDBOne(jsonData, db)
                message = 'Inserted ' + headWord + ' at ' + strStatus
                logData.append(message)
                print(message)
                fWordList.write(headWord + '\n')
        except Exception as e:
            # Best-effort per-file processing: record the failure and move on.
            message = 'error encountered processing ' + dataFile + ': ' + str(e)
            logData.append(message)
            print(message)
    return logData
if __name__ == "__main__":
    # Second-run processor for Lexico text files: processes every file in
    # dirIn whose name sorts after the most recently handled file (tracked
    # in the config), writing cleaned output to dirOut.
    dirIn = 'E:/FULLTEXT/LEXICO/TEXT'
    dirOut = 'E:/FULLTEXT/LEXICO/TEXT2'
    dirLog = 'E:/FULLTEXT/LEXICO/LOG'
    cf = config_handler.ConfigHandler()
    # Name of the last file processed in a previous run; used as a resume point.
    recentFile = cf.get_config_value(cf.RECENT_OPEN_FILE2)
    #print(recentFile)
    fileList = os.listdir(dirIn)
    lastFile = ''
    prefix = 'Lexicon_Second_Run_Log_'
    logData = []
    logPath = getDatedFilePath(prefix, dirLog)
    #print('log path:', logPath)
    timeStamp = getDateStamp()
    message = 'Starting processing at ' + timeStamp
    logData.append(message)
    print(message)
    for item in fileList:
        # Resume logic: only handle files whose names sort after the last
        # processed one (relies on lexicographic filename ordering).
        if (item > recentFile):
            lastFile = item
            message = 'Processsing item ' + item
            logData.append(message)
            print(message)
            message = processRawText(item, dirIn, dirOut)
            logData.append(message)
            print(message)
    # NOTE(review): script looks unfinished — logData is never written to
    # logPath, and lastFile is tracked but never saved back to the config
    # (presumably the intent of the marker below). TODO confirm.
    #WRITE INI
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Split the word list in *inFile* into dictionary-verified and trash parts.

    Words found in the dictionaries under *dictDir* (plus lower-case matches
    against the MySQL word list) go to *outDir*; the rest, minus anything in
    the recycle dictionaries under *recycleDir*, go to *trashDir*.  Writes a
    dated activity log, opens the output directory, then exits.

    Fixes: "directionary" typo in two log messages; commented-out debug
    prints removed.
    """
    initialString = "Dictionary_Check_Log_"
    pathLog = sysHandle.getDatedFilePath(initialString, logDir)

    logData = []
    message = "Starting to dictionary-check at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)

    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    # Split clean and trash based on dictionary membership.
    listClean, listTrash = filterList(rawList, dicList)
    # Split each set into lower-case and upper-case parts.
    lowerClean, upperClean = splitDictByCase(listClean)
    # Word list from the MySQL database, split the same way.
    lowerDic, upperDic = splitDictByCase(getWordList())

    message = "Loading dictionary completed at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    # Keep only upper-case words whose lower-case form is not already known.
    newUpperClean = [
        item for item in upperClean if item.lower() not in lowerDic
    ]
    newClean = newUpperClean + lowerClean

    message = "Completed dictionary checking at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    # Drop trash items that appear in the recycle dictionaries.
    recycleList = sysHandle.loadDictionaries(recycleDir)
    newListTrash = [item for item in listTrash if item not in recycleList]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)

    message = "Finished dictionary checking at " + sysHandle.getDateStamp()
    logData.append(message)
    print(message)

    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(outDir)
    # NOTE(review): exits the whole interpreter — no code after this call runs.
    sys.exit()