def runDownLoad(START_NUMBER, proxies, headers, mode, location):
    PATH_IN = "E:/FULLTEXT/DICTIONARY/NORMALCASE/Combined Lexico Oxford.txt"
    DIR_DATA_OUT = ''
    DIR_LOG_OUT = ''
    print('Path In:', PATH_IN)

    # Pick output directories for the current machine.
    if mode == "local":
        DIR_DATA_OUT = "E:/FULLTEXT/LEXICO/HTML"
        DIR_LOG_OUT = "E:/FULLTEXT/LEXICO/LOG"
    elif mode == "remote":
        if location == "home":
            DIR_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
        elif location == "office":
            DIR_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
    print('\nData Path:', DIR_DATA_OUT, '\nLog Path:', DIR_LOG_OUT)

    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])

    # NOTE: one log file is written per batch.
    pathStatusOut = sysHand.getIncrementLogPath(START_NUMBER, DIR_LOG_OUT)
    wordList = sysHand.getLineFromTextFile(PATH_IN)
    # Clamp the batch of 10 so the index never runs past the word list.
    STOP_NUMBER = min(START_NUMBER + 10, len(wordList))

    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Lexico at ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + proxies['http'])
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for i in range(START_NUMBER, STOP_NUMBER):
        pathDataOut = sysHand.getIncrementDataPath(i, DIR_DATA_OUT)
        word = wordList[i]
        htmlData, message = getSingleWord(word, proxies, headers)
        if htmlData:
            with open(pathDataOut, "w", encoding='utf-8') as file:
                file.write(htmlData)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        time.sleep(7)  # throttle between requests

    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Lexico at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
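# --- Usage sketch (not part of the module): how runDownLoad might be driven.
# The proxy URL and User-Agent below are placeholder assumptions; in this repo
# they would come from the proxy/agent loaders shown further down.
if __name__ == '__main__':
    proxies = {'http': 'http://127.0.0.1:8080',
               'https': 'http://127.0.0.1:8080'}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    # Scrape one batch of 10 words starting at index 0, writing HTML locally.
    runDownLoad(0, proxies, headers, 'local', 'home')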
def processRawText(item, dirIn, dirOut):
    word = item.replace('.txt', '')
    pathIn = os.path.join(dirIn, item)
    lines = getLineFromTextFile(pathIn)
    # Treat a missing line list or a single blank line as an empty file.
    if not lines or (len(lines) == 1 and not lines[0].strip()):
        message = 'File ' + word + ' is empty'
    else:
        message = splitHeadWord(lines, word, dirOut)
    return message
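# --- Usage sketch: processRawText expects one file name from a directory
# listing; the RAW/SPLIT directory names here are assumptions for illustration.
dirRaw = 'E:/FULLTEXT/LEXICO/RAW'
dirSplit = 'E:/FULLTEXT/LEXICO/SPLIT'
for item in os.listdir(dirRaw):
    print(processRawText(item, dirRaw, dirSplit))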
def firstRun(word, filePath):
    """First pass: count the [headword] blocks and split the file per headword."""
    lines = sh.getLineFromTextFile(filePath)
    # Collect the line index of every [headword] marker.
    hwIndexes = []
    for idx, line in enumerate(lines):
        if '[headword]' in line:
            hwIndexes.append(idx)
    hwTotal = len(hwIndexes)

    if hwTotal > 1:
        # Pair each headword index with the next to get (start, end) ranges.
        # The last range runs to len(lines), not len(lines) - 1, since slicing
        # excludes the upper bound; otherwise the file's last line is dropped.
        idxList = []
        for i in range(hwTotal - 1):
            idxList.append((hwIndexes[i], hwIndexes[i + 1]))
        idxList.append((hwIndexes[hwTotal - 1], len(lines)))

        wordNum = 0
        for lowRange, highRange in idxList:
            wordNum += 1
            newLines = lines[lowRange:highRange]
            ProcessSingleHeadword(word, newLines, wordNum)
    else:
        # Single headword: process the whole file as one block.
        ProcessSingleHeadword(word, lines, 0)
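# --- Illustration only: the (start, end) pairing in firstRun can be expressed
# with zip() over the marker indexes plus an end-of-file sentinel. This helper
# (headwordRanges) is hypothetical, not part of the module.
def headwordRanges(hwIndexes, totalLines):
    bounds = hwIndexes + [totalLines]
    # zip(bounds, bounds[1:]) pairs each start index with the next one.
    return list(zip(bounds, bounds[1:]))

# headwordRanges([0, 40, 90], 120) -> [(0, 40), (40, 90), (90, 120)]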
import os, json
import system_handler as sh
from share_function import splitLine
import single_word as sw
from pprint import pprint

fileName = 'A-star.txt'
fileNameJSON = fileName.replace(".txt", ".json")
dirIn = 'E:/FULLTEXT/LEXICO/COMPACT'
dirOut = 'E:/FULLTEXT/LEXICO/JSON'
pathIn = os.path.join(dirIn, fileName)
pathOut = os.path.join(dirOut, fileNameJSON)

lines = sh.getLineFromTextFile(pathIn)
sLines = []
#STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
if lines:
    for line in lines:
        if line:
            key, text = splitLine(line)
            sLines.append((key, text))

# Record (section marker, line index) for every significant line.
lineMap = []
idx = 0
sectionList = ['[headword]', '[category]', '[secphrases]', '[secphrasal]',
               '[secusage]', '[secpronun]', '[secorigin]']
for sLine in sLines:
    for section in sectionList:
        if sLine[0] == section:
            lineMap.append((section, idx))
    idx += 1
def loadData(self):
    # Load the dictionary word list, dropping blank lines.
    temp = getLineFromTextFile(self.pathDict)
    self.DictList = [item for item in temp if item]
def loadData(self):
    # Load the proxy list, dropping blank lines.
    temp = getLineFromTextFile(self.pathProxy)
    self.proxyList = [item for item in temp if item]
def runFourthProcess(fileName, dirIn, dirOut):
    fileNameJSON = fileName.replace(".txt", ".json")
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)

    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
    sLines = []
    if lines:
        for line in lines:
            if line:
                key, text = splitLine(line)
                sLines.append((key, text))

    lineMap = []
    idx = 0
    sectionList = ['[headword]', '[category]', '[secphrases]', '[secphrasal]',
                   '[secusage]', '[secpronun]', '[secorigin]']
    for sLine in sLines:
        for section in sectionList:
            if sLine[0] == section:
                lineMap.append((section, idx))
        idx += 1

    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    idxMap = []
    for i in range(len(lineMap) - 1):
        idxMap.append((lineMap[i][0], lineMap[i][1], lineMap[i + 1][1]))
        if i == len(lineMap) - 2:
            lastIdx = i + 1
            # The last section runs to the end of the file (slicing excludes
            # the upper bound, so len(lines) keeps the final line).
            idxMap.append((lineMap[lastIdx][0], lineMap[lastIdx][1], len(lines)))

    #STEP 3: HANDLE EACH SECTION
    objectList = []
    for sectionName, firstIdx, lastIndex in idxMap:
        sectLines = lines[firstIdx:lastIndex]
        if sectionName == '[headword]':
            objectList.append(runlib.processHeadWordLines(sectLines))
        elif sectionName == '[category]':
            objectList.append(runlib.processCategoryLines(sectLines))
        elif sectionName == '[secphrases]':
            objectList.append(runlib.processPhraseLines(sectLines))
        elif sectionName == '[secphrasal]':
            objectList.append(runlib.processPhraseVerbLines(sectLines))
        elif sectionName == '[secusage]':
            objectList.append(runlib.processUsageLines(sectLines))
        elif sectionName == '[secorigin]':
            objectList.append(runlib.processOriginLines(sectLines))
        elif sectionName == '[secpronun]':
            objectList.append(runlib.processPhoneticLines(sectLines))

    pprint(objectList)
def secondRun(fileName, dirIn, dirOut):
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileName)
    lines = getLineFromTextFile(pathIn)

    lineTuple = []
    for line in lines:
        if line:
            key, text = splitLine(line)
            lineTuple.append((key, text))

    # Group consecutive lines that share the same key into spans of indexes.
    span = []
    lineMap = []
    for i in range(len(lineTuple) - 1):
        span.append(i)
        if lineTuple[i][0] != lineTuple[i + 1][0]:
            # Key changes at the next line: close the current span.
            lineMap.append(span)
            span = []
    # Close the trailing span; the last line always belongs to it. (The old
    # version dropped this span when the file ended in a run of equal keys.)
    if lineTuple:
        span.append(len(lineTuple) - 1)
        lineMap.append(span)

    dataOut = []
    for items in lineMap:
        if len(items) == 1:
            # Single line: emit key + text unchanged.
            idx = items[0]
            dataOut.append(lineTuple[idx][0] + lineTuple[idx][1])
        else:
            # Run of lines with the same key: merge values with '|' separators.
            header = lineTuple[items[0]][0]
            text = ''
            for idx in items:
                text += lineTuple[idx][1] + '|'
            dataOut.append(header + text)
    writeListToFile(dataOut, pathOut)
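# --- Alternative sketch: the run grouping in secondRun can also be written
# with itertools.groupby, which merges consecutive lines sharing a key and
# sidesteps the trailing-span bookkeeping. Output shape matches secondRun:
# single lines pass through, runs are '|'-joined and '|'-terminated.
from itertools import groupby

def mergeRuns(lineTuple):
    dataOut = []
    for key, group in groupby(lineTuple, key=lambda pair: pair[0]):
        texts = [text for _, text in group]
        if len(texts) == 1:
            dataOut.append(key + texts[0])
        else:
            dataOut.append(key + '|'.join(texts) + '|')
    return dataOut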
def runFourthProcess(fileName, dirIn, dirOut):
    fileNameJSON = fileName.replace(".txt", ".json")
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)

    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
    sLines = []
    if lines:
        for line in lines:
            if line:
                key, text = splitLine(line)
                sLines.append((key, text))

    lineMap = []
    idx = 0
    sectionList = ['[headword]', '[category]', '[secphrases]', '[secphrasal]',
                   '[secusage]', '[secpronun]', '[secorigin]']
    for sLine in sLines:
        for section in sectionList:
            if sLine[0] == section:
                lineMap.append((section, idx))
        idx += 1

    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    idxMap = []
    if len(lineMap) == 1:
        # Single section: it spans the whole file.
        idxMap.append((lineMap[0][0], lineMap[0][1], len(lines)))
    elif len(lineMap) > 1:
        for i in range(len(lineMap) - 1):
            idxMap.append((lineMap[i][0], lineMap[i][1], lineMap[i + 1][1]))
            if i == len(lineMap) - 2:
                lastIdx = i + 1
                # len(lines), not len(lines) - 1: slicing excludes the upper
                # bound, so this keeps the file's final line.
                idxMap.append((lineMap[lastIdx][0], lineMap[lastIdx][1], len(lines)))

    #STEP 3: HANDLE EACH SECTION
    objectList = []
    for sectionName, firstIdx, lastIndex in idxMap:
        sectLines = lines[firstIdx:lastIndex]
        if sectionName == '[headword]':
            objectList.append(runlib.processHeadWordLines(sectLines))
        elif sectionName == '[category]':
            objectList.append(runlib.processCategoryLines(sectLines))
        elif sectionName == '[secphrases]':
            objectList.append(runlib.processPhraseLines(sectLines))
        elif sectionName == '[secphrasal]':
            objectList.append(runlib.processPhraseVerbLines(sectLines))
        elif sectionName == '[secusage]':
            objectList.append(runlib.processUsageLines(sectLines))
        elif sectionName == '[secorigin]':
            objectList.append(runlib.processOriginLines(sectLines))
        elif sectionName == '[secpronun]':
            objectList.append(runlib.processPhoneticLines(sectLines))

    #STEP 4: MERGE OBJECTS into one dictionary per word
    masterObject = {}
    for obj in objectList:
        for key in obj:
            masterObject[key] = obj[key]

    #STEP 5: WRITE OUT JSON FILE
    with open(pathOut, 'w', encoding="utf-8") as outfile:
        json.dump(masterObject, outfile)
    message = 'Finished converting ' + fileName + ' to JSON'
    return message
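# --- Batch driver sketch: run the fourth pass over every compact file. The
# directory paths match the test script above; error handling is omitted.
if __name__ == '__main__':
    dirIn = 'E:/FULLTEXT/LEXICO/COMPACT'
    dirOut = 'E:/FULLTEXT/LEXICO/JSON'
    for fileName in os.listdir(dirIn):
        if fileName.endswith('.txt'):
            print(runFourthProcess(fileName, dirIn, dirOut))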