def ProcessSingleHeadword(word, lines, idx):
    if idx > 0:
        pathOut = 'E:/FULLTEXT/LEXICO/OUTPUT/' + word + '_' + str(idx) + '.txt'
    else:
        pathOut = 'E:/FULLTEXT/LEXICO/OUTPUT/' + word + '.txt'
    writeListToFile(lines, pathOut)
def processJSONDirectory(dataDir, logDir):
    logPath = getDatedFilePath('JSON_To_Mongo_Log', logDir)
    print('log path', logPath)
    logData = []
    dateStamp = getDateStamp()
    message = 'Started processing JSON at ' + dateStamp
    logData.append(message)
    print(message)
    dataFileList = os.listdir(dataDir)
    #print(dataFileList)
    for dataFile in dataFileList:
        logData += processSingleFile(dataFile, dataDir)
    dateStamp = getDateStamp()
    message = 'Finished processing JSON at ' + dateStamp
    logData.append(message)
    print(message)
    writeListToFile(logData, logPath)
def RunCode(START_NUMBER, session, headers):
    PATH_IN = "E:/FULLTEXT/SPECIALTY/NLTK_Words_List.txt"
    PATH_DATA_OUT = "E:/FULLTEXT/GOOGLE/RAW"
    PATH_LOG_OUT = "E:/FULLTEXT/GOOGLE/LOG"
    STOP_NUMBER = START_NUMBER + 100
    pathDataOut, pathStatusOut = sysHand.getIncrementPath(
        START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)
    wordList = sysHand.getWordFromTextFile(PATH_IN)
    results = []
    status = []
    for i in range(START_NUMBER, STOP_NUMBER):
        word = wordList[i]
        (data, message) = getSingleWord(word, session, headers)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        if data:
            results.append(data)
        time.sleep(3)
    sysHand.writeDataToJSON(results, pathDataOut)
    sysHand.writeListToFile(status, pathStatusOut)
def ProcessSingleHeadword(word, lines, idx, dirOut):
    if idx > 0:
        pathOut = dirOut + '/' + word + '_' + str(idx) + '.txt'
    else:
        pathOut = dirOut + '/' + word + '.txt'
    writeListToFile(lines, pathOut)
def processText(pathIn, dirOut):
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    words = sysHandle.getWordFromTextFile(pathIn)
    cleanList = trim_and_sort(words)
    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(pathIn, dirOut, dirLog):
    #print('pathIn', pathIn, '\ndirOut', dirOut, '\ndirLog', dirLog)
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    filePrefix = "Extract_Sentence_on_"
    pathLog = sysHandle.getDatedFilePath(filePrefix, dirLog)
    #print('pathLog:', pathLog)
    #print(pathIn)
    #print(pathOut)
    logData = []
    dateStamp = sysHandle.getDateStamp()
    logData.append("Starting to extract sentences at " + dateStamp)
    #print('dateStamp:', dateStamp)
    extractSentences(pathIn, pathOut)
    dateStamp = sysHandle.getDateStamp()
    logData.append("Sentence extraction completed at " + dateStamp)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
def runDownLoad(START_NUMBER, proxies, headers, mode, location):
    PATH_IN = "E:/FULLTEXT/DICTIONARY/NORMALCASE/Combined Lexico Oxford.txt"
    DIR_DATA_OUT = ''
    DIR_LOG_OUT = ''
    print('Path In:', PATH_IN)
    #For Home only
    if mode == "local":
        DIR_DATA_OUT = "E:/FULLTEXT/LEXICO/HTML"
        DIR_LOG_OUT = "E:/FULLTEXT/LEXICO/LOG"
    elif mode == "remote":
        if location == "home":
            DIR_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
        elif location == "office":
            DIR_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
    print('\nData Path:', DIR_DATA_OUT, '\nLog Path:', DIR_LOG_OUT)
    STOP_NUMBER = START_NUMBER + 10
    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])
    #NOTE: LOG IS FOR EVERY BATCH
    #pathDataOut, pathStatusOut = sysHand.getIncrementPath(START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)
    pathStatusOut = sysHand.getIncrementLogPath(START_NUMBER, DIR_LOG_OUT)
    wordList = sysHand.getLineFromTextFile(PATH_IN)
    #results = []
    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Lexico at ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + proxies['http'])
    status.append('Starting scraping using agent ' + headers['User-Agent'])
    for i in range(START_NUMBER, STOP_NUMBER):
        pathDataOut = sysHand.getIncrementDataPath(i, DIR_DATA_OUT)
        word = wordList[i]
        (htmlData, message) = getSingleWord(word, proxies, headers)
        if htmlData:
            with open(pathDataOut, "w", encoding='utf-8') as file:
                file.write(htmlData)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        time.sleep(7)
    #sysHand.writeDataToJSON(results, pathDataOut)
    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Lexico at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
def RunCode(START_NUMBER, proxies, headers, mode, location):
    PATH_IN = "E:/FULLTEXT/DICTIONARY/SPECIALTY/NLTK_Words_List.txt"
    PATH_DATA_OUT = ''
    PATH_LOG_OUT = ''
    print('Path In:', PATH_IN)
    #For Home only
    if mode == "local":
        PATH_DATA_OUT = "E:/FULLTEXT/GOOGLE/RAW"
        PATH_LOG_OUT = "E:/FULLTEXT/GOOGLE/LOG"
    elif mode == "remote":
        if location == "home":
            PATH_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW"
            PATH_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"
        elif location == "office":
            PATH_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/RAW"
            PATH_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/GOOGLE/LOG"
    print('\nData Path:', PATH_DATA_OUT, '\nLog Path:', PATH_LOG_OUT)
    STOP_NUMBER = START_NUMBER + 100
    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])
    urlTest = "http://icanhazip.com"
    resTest = requests.get(urlTest, proxies=proxies, headers=headers)
    print('using IP:', resTest.text)
    pathDataOut, pathStatusOut = sysHand.getIncrementPath(
        START_NUMBER, PATH_DATA_OUT, PATH_LOG_OUT)
    wordList = sysHand.getWordFromTextFile(PATH_IN)
    results = []
    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Google at ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + resTest.text)
    status.append('Starting scraping using agent ' + headers['User-Agent'])
    for i in range(START_NUMBER, STOP_NUMBER):
        word = wordList[i]
        (data, message) = getSingleWord(word, proxies, headers)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        if data:
            results.append(data)
        time.sleep(3)
    sysHand.writeDataToJSON(results, pathDataOut)
    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Google at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
def processHTML(fileName, dirIn, dirOut):
    fileOut = fileName.replace(".html", ".txt")
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileOut)
    #print('\npathIn:', pathIn, '\npathOut:', pathOut)
    wordData = []
    with open(pathIn, "r", encoding="utf-8") as file:
        contents = file.read()
        wordData = processLexico(contents)
    writeListToFile(wordData, pathOut)
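# Hedged usage sketch (not part of the original source): processHTML above converts one
# saved Lexico HTML page to text, so a small driver can apply it to every .html file in
# a folder. The function name processAllHTML is hypothetical, and os, processLexico and
# writeListToFile are assumed to be available in this module as in the function above.
def processAllHTML(dirIn, dirOut):
    for fileName in os.listdir(dirIn):
        if fileName.endswith('.html'):
            processHTML(fileName, dirIn, dirOut)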
def getProxyList(targetURL, pathOut):
    http = urllib3.PoolManager()
    try:
        response = http.request('GET', targetURL)
        print('Getting data from', targetURL, '...')
        if response.status == 200:
            data = response.data.decode('utf-8')
            proxList = str(data).split('\n')
            writeListToFile(proxList, pathOut)
    except Exception as e:
        print(e)
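# Hedged usage sketch (not part of the original source): getProxyList above expects a URL
# that serves a newline-separated proxy list. The URL and output path below are
# placeholders; writeListToFile is assumed to come from the project's system_handler
# module, as in the other scripts in this repository.
if __name__ == '__main__':
    targetURL = 'https://example.com/proxy-list.txt'   # placeholder URL
    pathOut = 'D:/Proxy/List/raw_proxy_list.txt'       # placeholder output path
    getProxyList(targetURL, pathOut)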
def getZERO_to_ING_forms(wordForms, dictForms, outputDir):
    #get pairs like WALK => WALKING
    pathOut = outputDir + "/ZERO_to_ING_Match_Pairs.txt"
    wordPairs = []
    for dictForm in dictForms:
        inflection = dictForm + 'ing'
        if inflection in wordForms:
            wordPairs.append(dictForm + ', ' + inflection)
            print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
def getZ_to_ES_forms(wordForms, dictForms, outputDir):
    #get pairs like BUZZ => BUZZES
    pathOut = outputDir + "/Z_to_ES_Match_Pairs.txt"
    wordPairs = []
    for dictForm in dictForms:
        inflection = dictForm + 'es'
        if dictForm.endswith('z') and inflection in wordForms:
            wordPairs.append(dictForm + ', ' + inflection)
            print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
def get_T_to_TTING_forms(wordForms, dictForms, outputDir):
    #get pairs like SIT => SITTING
    pathOut = outputDir + "/T_To_TTING_Match_Pairs.txt"
    wordPairs = []
    for dictForm in dictForms:
        inflection = dictForm + 'ting'
        if dictForm.endswith('t') and inflection in wordForms:
            wordPairs.append(dictForm + ', ' + inflection)
            print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
def get_IE_to_YNG_forms(wordForms, dictForms, outputDir):
    #get pairs where a trailing -IE is replaced by -YNG
    pathOut = outputDir + "/IE_To_YNG_Match_Pairs.txt"
    wordPairs = []
    for dictForm in dictForms:
        if len(dictForm) > 2:
            inflection = dictForm[:-2] + 'yng'
            if dictForm.endswith('ie') and inflection in wordForms:
                wordPairs.append(dictForm + ', ' + inflection)
                print('finding...', dictForm, ', ', inflection)
    writeListToFile(wordPairs, pathOut)
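# Hedged driver sketch (not part of the original source): the four pair-finder functions
# above share one signature, so they can be run from a single loop. The word lists below
# are tiny illustrative samples, the output directory is a placeholder, and each function
# still relies on writeListToFile being importable in this module.
if __name__ == '__main__':
    sampleDictForms = ['walk', 'buzz', 'sit']
    sampleWordForms = ['walking', 'buzzes', 'sitting']
    sampleOutputDir = 'E:/FULLTEXT/TEMPOUT'   # placeholder directory
    for finder in (getZERO_to_ING_forms, getZ_to_ES_forms,
                   get_T_to_TTING_forms, get_IE_to_YNG_forms):
        finder(sampleWordForms, sampleDictForms, sampleOutputDir)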
def processText(inFile, outDir, dbDir):
    pathRecycleOut = sysHandle.getRawPath(inFile, outDir)
    pathDatabaseIn = sysHandle.getRawPath(inFile, dbDir)
    trashListIn = sysHandle.getWordFromTextFile(inFile)
    databaseListIn = sysHandle.getWordFromTextFile(pathDatabaseIn)
    recycleListOut = [item for item in trashListIn if item not in databaseListIn]
    standardList = getWordList()
    newRecycle = [item for item in recycleListOut if item not in standardList]
    #print(newRecycle)
    sysHandle.writeListToFile(newRecycle, pathRecycleOut)
    sysHandle.openDir(outDir)
    sys.exit()
def processText(inFile, outDir):
    outFilePath = getOutPath(inFile, outDir)
    matches = sH.readTextFile(inFile)
    listMatch = matches.split("\n")
    cleanList = []
    for item in listMatch:
        parts = item.split(",")
        if parts[0].strip():
            cleanList.append(item)
    listMatch = list(dict.fromkeys(cleanList))
    sH.writeListToFile(listMatch, outFilePath)
    #print(listMatch)
    sH.openDir(outDir)
    sys.exit()
def main(dirOut):
    WORD = "z"
    START_NUMBER = 2
    STOP_NUMBER = 3
    STEP_NUMBER = 1
    proxies = startPrivateProxy()
    for i in range(START_NUMBER, STOP_NUMBER, STEP_NUMBER):
        print('processing list ', WORD, i)
        user_agent = getRamdomUserAgent()
        headers = {'User-Agent': user_agent}
        INDEX = str(i)
        pathOut = dirOut + '/list' + WORD + INDEX + ".txt"
        htmlContent = getLexico(WORD, INDEX, proxies, headers)
        wordData = processLexicoList(htmlContent, WORD)
        writeListToFile(wordData, pathOut)
        time.sleep(10)
def processText(pathIn, dirOut, dirExclusion, dirLog):
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    #print('dirLog', dirLog)
    initialString = "Word_Extract_Log_"
    pathLog = sysHandle.getDatedFilePath(initialString, dirLog)
    logData = []
    dateStamp = sysHandle.getDateStamp()
    message = "Starting to extract words at " + dateStamp
    logData.append(message)
    print(message)

    #STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    dateStamp = sysHandle.getDateStamp()
    message = "Reading word list completed at " + dateStamp
    logData.append(message)
    print(message)

    #STEP 2: trim left and right, remove overlaps and sort
    wordList = cleanWordList(words)
    dateStamp = sysHandle.getDateStamp()
    message = "Trimming word list completed at " + dateStamp
    logData.append(message)
    print(message)
    #print(wordList)

    #STEP 3: remove items found in exclusion list, remove empty strings
    exclusionList = sysHandle.loadDictionaries(dirExclusion)
    #print(exclusionList)
    cleanList = [w for w in wordList if w.lower() not in exclusionList]
    #remove empty items
    cleanList = [w for w in cleanList if w]

    #log activity
    dateStamp = sysHandle.getDateStamp()
    message = "Removing excluded items completed at " + dateStamp
    logData.append(message)
    print(message)
    #print(cleanList)

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
def uploadData(pathIn, bookID, dirLog):
    filePrefix = "Upload_Sentences_To_MySQL_on_"
    pathLog = sysHandle.getDatedFilePath(filePrefix, dirLog)
    logData = []
    dateStamp = sysHandle.getDateStamp()
    logData.append("Starting to upload sentences at " + dateStamp)
    logData.append("BookID being uploaded: " + str(bookID))
    #print(pathIn, bookID)
    sentence_total = upload_data(pathIn, bookID)
    logData.append("Total sentences written to MySQL " + str(sentence_total))
    #sysHandle.openDir(outDir)
    dateStamp = sysHandle.getDateStamp()
    logData.append("Sentence uploading completed at " + dateStamp)
    sysHandle.writeListToFile(logData, pathLog)
    sys.exit()
def processTab3(pathClean, dirRaw, dirRecycle):
    #print('pathClean:', pathClean, '\ndirRaw:', dirRaw, '\ndirRecycle:', dirRecycle)
    pathRaw = getRawPath(pathClean, dirRaw)
    pathRecycle = getRawPath(pathClean, dirRecycle)
    #print('pathRaw', pathRaw)
    #print('\npathRecycle', pathRecycle)
    contentRaw = loadFormatPairFile(pathRaw)
    #print(contentRaw)
    contentClean = loadFormatPairFile(pathClean)
    #print(contentClean)
    recycleData = [item for item in contentRaw if item not in contentClean]
    dataOut = unpackPairs(recycleData)
    #print(dataOut)
    writeListToFile(dataOut, pathRecycle)
    openDir(dirRecycle)
    sys.exit()
def mergePairs(dirIn, dirOut, dirRecycle):
    pathOut = sysHandle.getOutPath(dirOut)
    #print(pathOut)
    #print('dirIn:', dirIn)
    bigDict = sysHandle.loadWordPairs(dirIn)
    #print(bigDict)
    recycleList = sysHandle.loadWordPairs(dirRecycle)
    #print(recycleList)
    msqlList = getPairList()
    newDict = [item for item in bigDict if item not in msqlList]
    outDict = [item for item in newDict if item not in recycleList]
    newPairList = unpackPairs(outDict)
    sysHandle.writeListToFile(newPairList, pathOut)
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    #print('logDir:', logDir, 'recycleDir:', recycleDir)
    #print('recycleList:', recycleList)
    initialString = "Dictionary_Check_Log_"
    pathLog = sysHandle.getDatedFilePath(initialString, logDir)
    logData = []
    dateStamp = sysHandle.getDateStamp()
    message = "Starting to dictionary-check at " + dateStamp
    logData.append(message)
    print(message)

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)
    #print('path clean:', pathOutClean, 'path trash:', pathOutTrash)
    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    #split clean and trash based on dictionary
    listClean, listTrash = filterList(rawList, dicList)
    #split into lower case and upper case parts
    lowerClean, upperClean = splitDictByCase(listClean)
    #get a list of words from mysql database
    lowerDic, upperDic = splitDictByCase(getWordList())

    #logging activity
    dateStamp = sysHandle.getDateStamp()
    message = "Loading dictionary completed at " + dateStamp
    logData.append(message)
    print(message)

    newUpperClean = [item for item in upperClean if item.lower() not in lowerDic]
    newClean = newUpperClean + lowerClean

    #logging activity
    dateStamp = sysHandle.getDateStamp()
    message = "Completed dictionary checking at " + dateStamp
    logData.append(message)
    print(message)

    recycleList = sysHandle.loadDictionaries(recycleDir)
    newListTrash = [item for item in listTrash if item not in recycleList]
    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)

    #logging activity
    dateStamp = sysHandle.getDateStamp()
    message = "Finished dictionary checking at " + dateStamp
    logData.append(message)
    print(message)

    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(outDir)
    sys.exit()
            print('Connection Error with IP ' + str(item))
            statList.append('Connection Error with IP ' + str(item))
        except urllib3.exceptions.MaxRetryError:
            print('Max Retry Error with IP ' + str(item))
            statList.append('Max Retry Error with IP ' + str(item))
        except Exception as e:
            statList.append('Error verifying IP ' + str(item))
            print(e)
    file.close()
    dateStamp = sysHand.getDateStamp()
    statList.append('Finish verifying proxy at ' + dateStamp)
    return statList


if __name__ == '__main__':
    proxyDir = 'D:/Proxy/List'
    outPath = 'D:/Proxy/Filter/good_proxy_list.txt'
    logDir = 'D:/Proxy/Log'
    initialString = "Proxy_Verification_Log_"
    logPath = sysHand.getDatedFilePath(initialString, logDir)
    #clear file contents
    open(outPath, "w").close()
    proxyList = sysHand.loadProxyLines(proxyDir)
    statusList = verifyProxy(proxyList, outPath)
    sysHand.writeListToFile(statusList, logPath)
def secondRun(fileName, dirIn, dirOut):
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileName)
    lines = getLineFromTextFile(pathIn)

    lineTuple = []
    for line in lines:
        if line:
            key, text = splitLine(line)
            lineTuple.append((key, text))
    #print(lineTuple)

    span = []
    lineMap = []
    for i in range(len(lineTuple) - 1):
        #print(lineTuple[i])
        #key = lineTuple[i][0]
        #value = lineTuple[i][1]
        #print(key, value)
        if lineTuple[i][0] == lineTuple[i + 1][0]:
            #match
            span.append(i)
        else:
            span.append(i)
            lineMap.append(span)
            span = []
        #last item
        if i == len(lineTuple) - 2:
            #print('i+1:', i+1)
            #print(lineTuple[i], lineTuple[i+1])
            if lineTuple[i][0] != lineTuple[i + 1][0]:
                span = []
            span.append(i + 1)
            lineMap.append(span)

    dataOut = []
    #print(lineMap)
    for items in lineMap:
        if len(items) == 1:
            #print('single', items)
            idx = items[0]
            #print(lineTuple[idx])
            line = lineTuple[idx][0] + lineTuple[idx][1]
            #print(line)
            dataOut.append(line)
        else:
            #print('series', items)
            header = ''
            text = ''
            for idx in items:
                if lineTuple[idx]:
                    #print(lineTuple[idx][0], lineTuple[idx][1])
                    if not header:
                        header = lineTuple[idx][0]
                    text += lineTuple[idx][1] + '|'
            line = header + text
            dataOut.append(line)

    writeListToFile(dataOut, pathOut)
import sys
from mysql_data import getWordList
from system_handler import writeListToFile, getWordFromTextFile

pathIn = "E:/FULLTEXT/TEMPOUT/GENERAL_MATCHING_PAIRS.txt"
pathOut = "E:/FULLTEXT/SPECIALTY/GENERAL_MATCHING_PAIRS.txt"

wordPairs = getWordFromTextFile(pathIn)
newPairs = []
for pair in wordPairs:
    if pair:
        items = pair.split(',')
        newPairs.append(items[0].strip() + ', ' + items[1].strip())

results = list(dict.fromkeys(newPairs))
results.sort()
#print(results)
writeListToFile(results, pathOut)
    #print(recentFile)
    fileList = os.listdir(dirIn)
    lastFile = ''
    prefix = 'Lexicon_Second_Run_Log_'
    logData = []
    logPath = getDatedFilePath(prefix, dirLog)
    #print('log path:', logPath)
    timeStamp = getDateStamp()
    message = 'Starting processing at ' + timeStamp
    logData.append(message)
    print(message)
    for item in fileList:
        if item > recentFile:
            lastFile = item
            message = 'Processing item ' + item
            logData.append(message)
            print(message)
            message = processRawText(item, dirIn, dirOut)
            logData.append(message)
            print(message)
    #WRITE INI
    cf.set_config_value(cf.RECENT_OPEN_FILE2, lastFile)
    timeStamp = getDateStamp()
    message = 'Finished processing at ' + timeStamp
    logData.append(message)
    print(message)
    writeListToFile(logData, logPath)
    openDir(dirOut)
from nltk.corpus import words, stopwords
import system_handler as sysHand

#english_stopwords = stopwords.words('english')
#filePath = "E:/FULLTEXT/SPECIALTY/English_Stop_Words.txt"
#sysHand.writeListToFile(english_stopwords, filePath)

filePath = "E:/FULLTEXT/SPECIALTY/Full_Words_List.txt"
wordlist = words.words()
sysHand.writeListToFile(wordlist, filePath)
#print(english_stopwords)
def formatWordList(records):
    wordList = []
    for word in records:
        wordList.append(word[0])
    return wordList


def getWordList():
    DB_NAME = "lexicon"
    db = get_connection(DB_NAME)
    cursor = db.cursor()
    select_sql = "select distinct word from google_defs"
    try:
        cursor.execute(select_sql)
        records = cursor.fetchall()
        return formatWordList(records)
    except Exception as e:
        print("Error encountered:", e)
    finally:
        cursor.close()
        db.close()


if __name__ == "__main__":
    dirOut = "E:/FULLTEXT/SPECIALTY/"
    pathOut = dirOut + "Dictionary_Headword_List.txt"
    wordList = getWordList()
    writeListToFile(wordList, pathOut)
    openDir(dirOut)
            itemData.append(wordDiv)
        elif item.name == 'strong':
            wordStrong = processStrong(item)
            if wordStrong:
                itemData.append(wordStrong)
    return itemData


#entryList = soup.find_all('div', {'class' : 'entryWrapper'})
#print(len(entryList))
#for item in entryList:
#    print(item)

if __name__ == "__main__":
    WORD = "a"
    dirOut = "E:/FULLTEXT/LEXICO/TEXT"
    pathIn = "E:/FULLTEXT/LEXICO/HTML/" + WORD + ".html"
    pathOut = getFilePath(pathIn, dirOut)
    #print(pathOut)
    wordData = []
    with open(pathIn, "r", encoding="utf-8") as file:
        contents = file.read()
        wordData = processLexico(contents)
    writeListToFile(wordData, pathOut)
    openDir(dirOut)
import re
import system_handler as sysHand

inPath = "E:/FULLTEXT/SPECIALTY/Oxford_Word_List.txt"
outPath = "E:/FULLTEXT/SPECIALTY/Oxford_Two_Word_Compound.txt"

inputText = """
Aberdeen Angus
Aberdonian
"""

regPat = r'^\w+\s\w+$'
pattern = re.compile(regPat, re.M)
#finds = re.findall(pattern, inputText)
#print(finds)

data = sysHand.readTextFile(inPath)
wordList = data.split('\n')
twoWords = []
for word in wordList:
    matchObj = re.search(pattern, word)
    if matchObj:
        twoWords.append(word)

sysHand.writeListToFile(twoWords, outPath)