Beispiel #1
0
def runDownLoad(START_NUMBER, proxies, headers, mode, location, batchSize=10):
    """Scrape one batch of Lexico pages starting at START_NUMBER.

    Downloads up to ``batchSize`` words from the master word list, writes
    each page's HTML to its own file, and writes one status log for the
    whole batch.

    Args:
        START_NUMBER: index into the word list where this batch begins.
        proxies: requests-style proxy dict; its 'http' entry is logged.
        headers: request headers dict; its 'User-Agent' entry is logged.
        mode: 'local' or 'remote' -- selects the output directory set.
        location: 'home' or 'office' -- only used when mode == 'remote'.
        batchSize: number of words to fetch in this run. Defaults to 10,
            matching the previously hard-coded batch size.
    """
    PATH_IN = "E:/FULLTEXT/DICTIONARY/NORMALCASE/Combined Lexico Oxford.txt"

    DIR_DATA_OUT = ''
    DIR_LOG_OUT = ''

    print('Path In:', PATH_IN)

    # Output directories depend on which machine this runs on.
    if (mode == "local"):
        DIR_DATA_OUT = "E:/FULLTEXT/LEXICO/HTML"
        DIR_LOG_OUT = "E:/FULLTEXT/LEXICO/LOG"

    elif (mode == "remote"):
        if (location == "home"):
            DIR_DATA_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Andy Anh/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"
        elif (location == "office"):
            DIR_DATA_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/HTML"
            DIR_LOG_OUT = "C:/Users/Administrator/Dropbox/PROGRAMMING/FULLTEXT/LEXICO/LOG"

    print('\nData Path:', DIR_DATA_OUT, '\nLog Path:', DIR_LOG_OUT)

    print('starting at:', START_NUMBER)
    print('using agent:', headers['User-Agent'])

    #NOTE: LOG IS FOR EVERY BATCH
    pathStatusOut = sysHand.getIncrementLogPath(START_NUMBER, DIR_LOG_OUT)

    wordList = sysHand.getLineFromTextFile(PATH_IN)

    # Clamp the stop index so a batch that starts near the end of the
    # word list cannot raise IndexError.
    STOP_NUMBER = min(START_NUMBER + batchSize, len(wordList))

    status = []
    dateStamp = sysHand.getDateStamp()
    status.append('Starting scraping Lexico at  ' + dateStamp)
    status.append('Starting scraping at index ' + str(START_NUMBER))
    status.append('Starting scraping using IP ' + proxies['http'])
    status.append('Starting scraping using agent ' + headers['User-Agent'])

    for i in range(START_NUMBER, STOP_NUMBER):
        pathDataOut = sysHand.getIncrementDataPath(i, DIR_DATA_OUT)
        word = wordList[i]
        (htmlData, message) = getSingleWord(word, proxies, headers)
        if (htmlData):
            with open(pathDataOut, "w", encoding='utf-8') as file:
                file.write(htmlData)
        print(i, ':', message)
        status.append(str(i) + ' ' + message)
        time.sleep(7)  # throttle requests between words

    dateStamp = sysHand.getDateStamp()
    status.append('Ending scraping Lexico at ' + dateStamp)
    sysHand.writeListToFile(status, pathStatusOut)
Beispiel #2
0
def processRawText(item, dirIn, dirOut):
    """Process one raw text file into per-headword output.

    Args:
        item: file name such as 'word.txt'; the word is the name minus '.txt'.
        dirIn: directory containing the raw text file.
        dirOut: directory that splitHeadWord writes its results into.

    Returns:
        A status message string (either the empty-file notice or whatever
        splitHeadWord reports).
    """
    word = item.replace('.txt', '')
    pathIn = os.path.join(dirIn, item)

    lines = getLineFromTextFile(pathIn)
    # Treat a file with no lines at all, or a single whitespace-only line,
    # as empty -- previously an empty list fell through to splitHeadWord.
    if not lines or (len(lines) == 1 and not lines[0].strip()):
        message = 'File ' + word + ' is empty'
    else:
        message = splitHeadWord(lines, word, dirOut)

    return message
def firstRun(word, filePath):
	"""Split a raw dictionary file into one chunk per headword.

	Scans the file for lines containing '[headword]'; each headword's
	span of lines is handed to ProcessSingleHeadword, numbered from 1
	when there are several headwords and 0 when there is only one.
	"""
	lines = sh.getLineFromTextFile(filePath)

	# FIRST RUN: locate every '[headword]' marker line.
	hwIndexes = [n for n, line in enumerate(lines) if '[headword]' in line]
	hwTotal = len(hwIndexes)

	if (hwTotal > 1):
		# Pair each headword start with the start of the next one; the
		# final headword is capped at len(lines) - 1.
		# NOTE(review): that cap excludes the file's very last line --
		# the same pattern appears elsewhere in this code, so it looks
		# deliberate, but worth confirming against the input format.
		bounds = list(zip(hwIndexes, hwIndexes[1:] + [len(lines) - 1]))
		for wordNum, (lowRange, highRange) in enumerate(bounds, start=1):
			ProcessSingleHeadword(word, lines[lowRange:highRange], wordNum)
	else:
		# Zero or one headword: the whole file is a single chunk (index 0).
		ProcessSingleHeadword(word, lines, 0)
Beispiel #4
0
import os, json
import system_handler as sh
from share_function import splitLine
import single_word as sw
from pprint import pprint


# Script fragment: begin converting one compacted Lexico text file to JSON.
# NOTE(review): this excerpt is truncated -- the final `for sLine in sLines:`
# loop has no body here, so the fragment is not runnable as-is.
fileName = 'A-star.txt'
fileNameJSON = fileName.replace(".txt", ".json")
dirIn = 'E:/FULLTEXT/LEXICO/COMPACT'
dirOut = 'E:/FULLTEXT/LEXICO/JSON'
pathIn = os.path.join(dirIn, fileName)
pathOut = os.path.join(dirOut, fileNameJSON)
lines = sh.getLineFromTextFile(pathIn)

sLines = []

#STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START

if(lines):
	for line in lines:
		if(line):
			# splitLine separates each line into its leading [tag] and text.
			key, text = splitLine(line)
			sLines.append((key,text))

	#run for significant index
	lineMap =[]
	idx = 0
	# Section markers that delimit the blocks of a dictionary entry.
	sectionList = ['[headword]','[category]', '[secphrases]', '[secphrasal]','[secusage]','[secpronun]', '[secorigin]']

	for sLine in sLines:
Beispiel #5
0
	def loadData(self):
		"""Read the dictionary file and keep only the non-empty lines."""
		rawLines = getLineFromTextFile(self.pathDict)
		self.DictList = list(filter(None, rawLines))
Beispiel #6
0
	def loadData(self):
		"""Read the proxy file and keep only the non-empty lines."""
		rawLines = getLineFromTextFile(self.pathProxy)
		self.proxyList = list(filter(None, rawLines))
Beispiel #7
0
def runFourthProcess(fileName, dirIn, dirOut):
    """Parse a compacted Lexico text file into section objects and pprint them.

    Early version of the pipeline step: splits the file into tagged
    sections, converts each with its runlib handler, and pretty-prints
    the resulting list (the JSON output path is computed but not yet
    written by this version).

    Args:
        fileName: input file name ending in '.txt'.
        dirIn: directory containing the input file.
        dirOut: directory intended for the eventual JSON output.
    """
    fileNameJSON = fileName.replace(".txt", ".json")

    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)  # kept for parity with the later version
    lines = getLineFromTextFile(pathIn)

    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
    sectionList = [
        '[headword]', '[category]', '[secphrases]', '[secphrasal]',
        '[secusage]', '[secpronun]', '[secorigin]'
    ]
    sLines = []
    # BUGFIX: lineMap is initialized unconditionally; previously an empty
    # input file left it undefined and STEP 2 raised NameError.
    lineMap = []
    if (lines):
        for line in lines:
            if (line):
                key, text = splitLine(line)
                sLines.append((key, text))

        # Record (sectionName, lineIndex) for every section marker line.
        for idx, (key, text) in enumerate(sLines):
            if key in sectionList:
                lineMap.append((key, idx))

    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    # Each section runs from its marker to the next marker; the final
    # section is capped at len(lines) - 1. A single-section file is now
    # handled too (previously idxMap stayed empty), matching the later
    # version of this function.
    idxMap = []
    if lineMap:
        stops = [entry[1] for entry in lineMap[1:]] + [len(lines) - 1]
        for (name, start), stop in zip(lineMap, stops):
            idxMap.append((name, start, stop))

    #STEP 3: HANDLE EACH SECTION via a dispatch table instead of an
    # if/elif chain; unknown section names are skipped.
    handlers = {
        '[headword]': runlib.processHeadWordLines,
        '[category]': runlib.processCategoryLines,
        '[secphrases]': runlib.processPhraseLines,
        '[secphrasal]': runlib.processPhraseVerbLines,
        '[secusage]': runlib.processUsageLines,
        '[secorigin]': runlib.processOriginLines,
        '[secpronun]': runlib.processPhoneticLines,
    }
    objectList = []
    for sectionName, firstIdx, lastIdx in idxMap:
        handler = handlers.get(sectionName)
        if handler:
            objectList.append(handler(lines[firstIdx:lastIdx]))

    pprint(objectList)
    """
def secondRun(fileName, dirIn, dirOut):
    """Merge consecutive lines that share the same [tag] into single lines.

    Reads dirIn/fileName, groups consecutive lines whose leading tag is
    identical, joins each group's texts with '|' (keeping a trailing '|'),
    and writes the merged lines to dirOut/fileName.

    Args:
        fileName: name of the file to process (same name used for output).
        dirIn: input directory.
        dirOut: output directory.
    """
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileName)

    lines = getLineFromTextFile(pathIn)

    lineTuple = []
    for line in lines:
        if (line):
            key, text = splitLine(line)
            lineTuple.append((key, text))

    # Group indexes of consecutive lines that share the same tag.
    span = []
    lineMap = []
    for i in range(len(lineTuple) - 1):
        span.append(i)
        if (lineTuple[i][0] != lineTuple[i + 1][0]):
            # tag changes after i: close the current group
            lineMap.append(span)
            span = []

    # Close out the final line and group.
    # BUGFIX: the original only handled the case where the last two tags
    # differed; when they matched, the whole trailing group was silently
    # dropped. It also lost a single-line file entirely. The last line is
    # now always recorded.
    if lineTuple:
        span.append(len(lineTuple) - 1)
        lineMap.append(span)

    dataOut = []
    for items in lineMap:
        if len(items) == 1:
            # Lone line: emit tag + text unchanged.
            idx = items[0]
            dataOut.append(lineTuple[idx][0] + lineTuple[idx][1])
        else:
            # Run of same-tag lines: one header, texts joined with '|'
            # (trailing '|' preserved from the original format).
            header = lineTuple[items[0]][0]
            text = ''
            for idx in items:
                text += lineTuple[idx][1] + '|'
            dataOut.append(header + text)

    writeListToFile(dataOut, pathOut)
def runFourthProcess(fileName, dirIn, dirOut):
    """Convert a compacted Lexico text file into a single JSON object.

    Splits the file into tagged sections, converts each section with its
    runlib handler, merges the resulting dicts into one object, and writes
    it as JSON to dirOut/<fileName with a .json extension>.

    Args:
        fileName: input file name ending in '.txt'.
        dirIn: directory containing the input file.
        dirOut: directory receiving the JSON output.

    Returns:
        A status message string.
    """
    fileNameJSON = fileName.replace(".txt", ".json")

    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)

    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
    sectionList = [
        '[headword]', '[category]', '[secphrases]', '[secphrasal]',
        '[secusage]', '[secpronun]', '[secorigin]'
    ]
    sLines = []
    # BUGFIX: lineMap is initialized unconditionally; previously an empty
    # input file left it undefined and STEP 2 raised NameError.
    lineMap = []
    if (lines):
        for line in lines:
            if (line):
                key, text = splitLine(line)
                sLines.append((key, text))

        # Record (sectionName, lineIndex) for every section marker line.
        for idx, (key, text) in enumerate(sLines):
            if key in sectionList:
                lineMap.append((key, idx))

    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    # Each section runs from its marker to the next marker; the final
    # section is capped at len(lines) - 1, exactly as before. The single-
    # and multi-section cases now share one code path.
    idxMap = []
    if lineMap:
        stops = [entry[1] for entry in lineMap[1:]] + [len(lines) - 1]
        for (name, start), stop in zip(lineMap, stops):
            idxMap.append((name, start, stop))

    #STEP 3: HANDLE EACH SECTION via a dispatch table instead of an
    # if/elif chain; unknown section names are skipped.
    handlers = {
        '[headword]': runlib.processHeadWordLines,
        '[category]': runlib.processCategoryLines,
        '[secphrases]': runlib.processPhraseLines,
        '[secphrasal]': runlib.processPhraseVerbLines,
        '[secusage]': runlib.processUsageLines,
        '[secorigin]': runlib.processOriginLines,
        '[secpronun]': runlib.processPhoneticLines,
    }
    objectList = []
    for sectionName, firstIdx, lastIdx in idxMap:
        handler = handlers.get(sectionName)
        if handler:
            objectList.append(handler(lines[firstIdx:lastIdx]))

    #STEP 4: MERGE OBJECTS -- later sections overwrite duplicate keys,
    # same as the original key-by-key copy.
    masterObject = {}
    for obj in objectList:
        masterObject.update(obj)

    #STEP 5: WRITE OUT JSON FILE (arguments unchanged so the on-disk
    # format stays identical).
    with open(pathOut, 'w', encoding="utf-8") as outfile:
        json.dump(masterObject, outfile)

    message = 'Finished converting ' + fileName + ' to JSON'
    return message