def processSimplePhrase(lines):
    """Convert a list of tagged phrase lines into a flat dict.

    Each line is split into a bracket tag and its text; the tag is then
    mapped to a readable field name. '[examples]' text is kept as a list,
    every other value is flattened to a string with ', ,' collapsed to ','.
    Raises KeyError for tags missing from the mapping.
    """
    TAG_TO_FIELD = {
        '[phrshead]': 'phrase',
        '[notegram]': 'grammar-notes',
        '[meanings]': 'meaning',
        '[examples]': 'examples',
        '[synonyms]': 'synonyms',
        '[notebold]': 'highlight',
        '[notetone]': 'register',
        '[notearea]': 'region-domain',
        '[sensenum]': 'sense-number',
        '[crossref]': 'cross-reference'
    }

    result = {}
    for rawLine in lines:
        tag, body = splitLine(rawLine)
        fieldName = TAG_TO_FIELD[tag]
        if tag == '[examples]':
            fieldValue = listToList(body)
        else:
            fieldValue = listToString(body).replace(', ,', ',')
        result[fieldName] = fieldValue
    return result
# Esempio n. 2
# 0
def processSimpleCategory(sectLines):
    """Parse a single-sense category section into a nested dict.

    Returns {category-name: {'senses': [{...}], 'inflections': ...}}.
    '[category]' starts a new entry, '[inflects]' attaches inflections,
    and any other tag becomes a field of the single sense dict.
    '[examples]' text is kept as a list; other text is flattened with
    ', ,' collapsed to ','. Unknown tags raise KeyError.
    """
    TAG_TO_FIELD = {
        '[notegram]': 'grammar-notes',
        '[meanings]': 'meaning',
        '[examples]': 'examples',
        '[synonyms]': 'synonyms',
        '[notebold]': 'highlight',
        '[notetone]': 'register',
        '[notearea]': 'region-domain',
        '[crossref]': 'cross-reference'
    }

    result = {}
    catName = ''
    for rawLine in sectLines:
        tag, body = splitLine(rawLine)
        if tag == '[category]':
            catName = body
            result[catName] = {'senses': [{}]}
        elif tag == '[inflects]':
            result[catName]['inflections'] = listToString(body)
        else:
            fieldName = TAG_TO_FIELD[tag]
            if tag == '[examples]':
                fieldValue = listToList(body)
            else:
                fieldValue = listToString(body).replace(', ,', ',')
            result[catName]['senses'][0][fieldName] = fieldValue

    return result
# Esempio n. 3
# 0
def processMeanings(senseLines):
    """Convert the lines of one numbered sense into a flat dict.

    Each line is split into a bracket tag and its text; the tag is mapped
    to a readable field name. '[examples]' text is kept as a list; every
    other value is flattened to a string with ', ,' collapsed to ','.
    Raises KeyError for tags missing from the mapping.

    Fixed: '[notegram]' previously mapped to the misspelled key
    'grammer-notes'; it now emits 'grammar-notes', consistent with the
    mappings used by processSimplePhrase and processSimpleCategory.
    """
    termDict = {
        '[sensenum]': 'sense-number',
        '[notegram]': 'grammar-notes',  # was 'grammer-notes' (typo)
        '[meanings]': 'meaning',
        '[examples]': 'examples',
        '[synonyms]': 'synonyms',
        '[notebold]': 'highlight',
        '[notetone]': 'register',
        '[notearea]': 'region-domain',
        '[crossref]': 'cross-reference',
        '[variants]': 'spelling-variants'
    }

    objectOut = {}
    for line in senseLines:
        key, text = splitLine(line)
        newKey = termDict[key]
        if key == '[examples]':
            newText = listToList(text)
        else:
            newText = listToString(text).replace(', ,', ',')
        objectOut[newKey] = newText

    return objectOut
# Esempio n. 4
# 0
def processUsageLines(sectLines):
    """Extract the word-usage text from a usage section.

    Returns {'usage': text} taken from the '[wrdusage]' line (the last
    one wins if several appear), or {} when no such line exists.
    """
    usage = {}
    for rawLine in sectLines:
        tag, body = splitLine(rawLine)
        if tag != '[wrdusage]':
            continue
        usage['usage'] = body
    return usage
# Esempio n. 5
# 0
def processOriginLines(sectLines):
    """Extract the etymology text from an origin section.

    Returns {'word-origin': text} taken from the '[wordroot]' line (the
    last one wins if several appear), or {} when no such line exists.
    """
    origin = {}
    for rawLine in sectLines:
        tag, body = splitLine(rawLine)
        if tag != '[wordroot]':
            continue
        origin['word-origin'] = body
    return origin
# Esempio n. 6
# 0
def processPhoneticLines(sectLines):
    """Extract the phonetic transcription from a pronunciation section.

    Returns {'phonetic': text} where the '[phonetic]' line's value is
    flattened via listToString (the last such line wins), or {} when no
    '[phonetic]' line exists.
    """
    phonetic = {}
    for rawLine in sectLines:
        tag, body = splitLine(rawLine)
        if tag != '[phonetic]':
            continue
        phonetic['phonetic'] = listToString(body)
    return phonetic
# Esempio n. 7
# 0
def processComplexCategory(sectLines):
    """Assemble a part-of-speech object for a section with numbered senses.

    Locates the '[category]' header line and every '[sensenum]' line,
    slices the section into (header, sense, sense, ...) parts, parses the
    header with processCategoryHeader and each sense with processMeanings,
    and returns {category-name: {'senses': [sense-dict, ...], ...}}.
    """
    #PROCESSING A PART OF SPEECH
    num = 0
    lineMap = []  # (marker tag, line index) for each part boundary

    #STEP 1: EXTRACT START INDEXES OF PART OF SPEECH HEADER AND MEANINGS
    for line in sectLines:
        key, text = splitLine(line)
        #print('\nkey:', key, 'text:', text)
        if (key == '[category]'):
            lineMap.append((key, num))
        elif (key == '[sensenum]'):
            lineMap.append((key, num))
        num += 1

    #STEP 2: EXTRACT START AND END INDEXES
    # Each part runs from its own marker to the next marker; the final
    # part ends at len(sectLines) - 1.
    # NOTE(review): combined with range(firstIdex, lastIndex) in STEP 3,
    # the very last line of the section is never included in any part --
    # confirm it is a terminator line that is safe to drop.
    idxMap = []
    for i in range(len(lineMap) - 1):
        tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
        idxMap.append(tup)
        if (i == len(lineMap) - 2):
            lastIdx = i + 1
            #print(lineMap[lastIdx][1])
            tup = (lineMap[lastIdx][0], lineMap[lastIdx][1],
                   len(sectLines) - 1)
            idxMap.append(tup)

    #STEP 3: PROCESS EACH PART
    objectSenses = []
    objCatHeader = {}
    categoryKey = ''

    for item in idxMap:
        partName, firstIdex, lastIndex = item
        partLines = []
        for i in range(firstIdex, lastIndex):
            partLines.append(sectLines[i])

        if (partName == '[category]'):
            objCatHeader, categoryKey = processCategoryHeader(partLines)

            #print(objHW)
        if (partName == '[sensenum]'):
            objMeaning = processMeanings(partLines)
            objectSenses.append(objMeaning)

    #ASSEMBLY
    # Attach all collected sense dicts to the header object built above.
    objCatHeader[categoryKey]['senses'] = objectSenses
    #pprint(objCatHeader)

    return objCatHeader
# Esempio n. 8
# 0
def processCategoryHeader(headLines):
    """Parse the header lines of a category part.

    Returns a (header-dict, category-name) pair where the header dict is
    {category-name: {'senses': [], 'inflections': ...}}; 'senses' starts
    empty so the caller can fill it, and 'inflections' is present only if
    an '[inflects]' line was seen.
    """
    header = {}
    catName = ''

    for rawLine in headLines:
        tag, body = splitLine(rawLine)
        if tag == '[category]':
            catName = body
            header[catName] = {'senses': []}
        elif tag == '[inflects]':
            header[catName]['inflections'] = listToString(body)

    return (header, catName)
# Esempio n. 9
# 0
def processPhraseVerbLines(sectLines):
    """Parse a phrases/phrasal-verbs section into {'phrases-verbs': [...]}.

    Splits the section at every '[phrshead]' marker, processes each slice
    with processPhrase, and drops empty results from the final list.
    """

    objectOut = {}
    #STEP 1: EXTRACT START INDEXES OF PART OF SPEECH HEADER AND MEANINGS
    lineMap = []  # (marker tag, line index) for each phrase start
    num = 0
    for line in sectLines:
        key, text = splitLine(line)
        #print('\nkey:', key, 'text:', text)
        if (key == '[phrshead]'):
            lineMap.append((key, num))

        num += 1

    #print(lineMap)
    #STEP 2: EXTRACT START AND END INDEXES
    # Each phrase runs from its marker to the next; the final one ends at
    # len(sectLines) - 1.
    # NOTE(review): range(firstIdex, lastIndex) in STEP 3 means the last
    # line of the section is never processed -- confirm it is safe to drop.
    idxMap = []
    for i in range(len(lineMap) - 1):
        tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
        idxMap.append(tup)
        if (i == len(lineMap) - 2):
            lastIdx = i + 1
            #print(lineMap[lastIdx][1])
            tup = (lineMap[lastIdx][0], lineMap[lastIdx][1],
                   len(sectLines) - 1)
            idxMap.append(tup)

    #print(idxMap)

    #STEP 3: PROCESS EACH PART
    objectPhrases = []

    for item in idxMap:
        partName, firstIdex, lastIndex = item
        partLines = []
        for i in range(firstIdex, lastIndex):
            partLines.append(sectLines[i])

        if (partName == '[phrshead]'):
            objPhrase = processPhrase(partLines)
            objectPhrases.append(objPhrase)

    #remove empty phrase
    newObjectList = [item for item in objectPhrases if item]

    objectOut['phrases-verbs'] = newObjectList

    return objectOut
# Esempio n. 10
# 0
def processComplexPhrase(phraseLines):
    """Parse a phrase that has multiple numbered senses.

    Records the '[phrshead]' text, slices the remaining lines at each
    '[sensenum]' marker, parses every slice with processSimplePhrase, and
    stamps each result with the shared phrase head. Returns a list of
    phrase-sense dicts.
    """
    objPhraseList = []
    phraseHead = ''
    lineMap = []  # (marker tag, line index) for each sense start
    num = 0
    for line in phraseLines:
        key, text = splitLine(line)
        #print('\nkey:', key, 'text:', text)
        if (key == '[phrshead]'):
            phraseHead = text
        elif (key == '[sensenum]'):
            lineMap.append((key, num))
        num += 1

    #pprint(lineMap)
    #STEP 2: EXTRACT START AND END INDEXES
    # Each sense runs from its marker to the next; the final one ends at
    # len(phraseLines) - 1.
    # NOTE(review): range(firstIdex, lastIndex) in STEP 3 means the last
    # line of the phrase is never processed -- confirm it is safe to drop.
    idxMap = []
    for i in range(len(lineMap) - 1):
        tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
        idxMap.append(tup)
        if (i == len(lineMap) - 2):
            lastIdx = i + 1
            #print(lineMap[lastIdx][1])
            tup = (lineMap[lastIdx][0], lineMap[lastIdx][1],
                   len(phraseLines) - 1)
            idxMap.append(tup)
    #pprint(idxMap)
    #STEP 3: PROCESS EACH PART

    for item in idxMap:
        partName, firstIdex, lastIndex = item
        partLines = []
        for i in range(firstIdex, lastIndex):
            partLines.append(phraseLines[i])

        if (partName == '[sensenum]'):
            objPhrasePart = processSimplePhrase(partLines)
            # Every sense carries the phrase head it belongs to.
            objPhrasePart['phrase'] = phraseHead
            objPhraseList.append(objPhrasePart)

    #pprint(objPhraseList)
    return objPhraseList
# Esempio n. 11
# 0
def processHeadWordLines(sectLines):
    """Process a headword section, choosing simple vs. complex handling.

    If any sense-related marker appears, the section is split at the
    first such marker: the leading lines become the headword object
    (processSimpleHeadword) and the rest is parsed as meanings
    (processMeanings), attached under the 'symbol' key.

    NOTE(review): a later definition with this same name appears further
    down in this file and shadows this one at import time -- confirm
    which version is intended.
    """
    # DETERMINE IF THIS IS SIMPLE OR COMPLEX HEADWORD
    #pprint(sectLines)
    senseSigns = [
        '[sensenum]', '[notegram]', '[meanings]', '[examples]', '[synonyms]',
        '[notebold]', '[notetone]', '[notearea]', '[crossref]'
    ]

    complexHeadWord = False
    hwLastIndex = 0  # index of the first sense-related line
    num = 0
    for line in sectLines:
        key, text = splitLine(line)
        if (key in senseSigns):
            complexHeadWord = True
            hwLastIndex = num
            #print('last index:', hwLastIndex)
            break
        num += 1

    if complexHeadWord:

        # Split the section: headword lines before the marker, sense
        # lines from the marker onwards.
        newHeadWordLines = []
        newSenseLines = []
        for i in range(hwLastIndex):
            newHeadWordLines.append(sectLines[i])
        for k in range(hwLastIndex, len(sectLines)):
            newSenseLines.append(sectLines[k])

        complexObject = processSimpleHeadword(newHeadWordLines)
        senseObject = processMeanings(newSenseLines)

        # NOTE(review): key name 'symbol' looks odd for sense data --
        # confirm against whatever consumes this object.
        complexObject['symbol'] = senseObject
        #merge to objext
        #for key in senseObject:
        #	complexObject[key] = senseObject[key]
        return complexObject

    else:
        simpleObject = processSimpleHeadword(sectLines)
        return simpleObject
# Esempio n. 12
# 0
def processPhrase(phraseLines):
    """Process one phrase block and return a list of phrase objects.

    Counts '[sensenum]' markers to decide whether the phrase has multiple
    numbered senses (complex, handled by processComplexPhrase) or a single
    sense (simple, handled by processSimplePhrase).

    Bug fixed: the complex branch previously assigned its result to a
    dead local ('objList') while the function returned the still-empty
    'objectList', so complex phrases always came back as [].
    """
    #COUNT NUMBER OF SENSES TO DECIDE SIMPLE OR COMPLEX PHRASE
    senseTotal = 0
    for line in phraseLines:
        key, text = splitLine(line)
        if (key == '[sensenum]'):
            senseTotal += 1

    if (senseTotal > 0):
        # complex phrase: one object per numbered sense
        objectList = processComplexPhrase(phraseLines)
    else:
        # simple phrase: a single object wrapped in a list
        objectList = [processSimplePhrase(phraseLines)]

    return objectList
# Esempio n. 13
# 0
def processCategoryLines(sectLines):
    """Route a category section to the right processor.

    A section containing at least one '[sensenum]' line is treated as a
    complex (multi-sense) category; anything else is simple. Returns
    whatever the chosen processor returns.
    """
    senseCount = 0
    for rawLine in sectLines:
        tag, _ = splitLine(rawLine)
        if tag == '[sensenum]':
            senseCount += 1

    if senseCount:
        return processComplexCategory(sectLines)
    return processSimpleCategory(sectLines)
# Esempio n. 14
# 0
def processHeadWordLines(sectLines):
    """Create a headword object from the headword section lines.

    Each line is split into a bracket tag and its text; the tag is mapped
    to a readable field name and the text flattened via listToString.
    Raises KeyError for tags missing from the mapping.

    Fixed: the mapping previously listed '[graphnum]' twice with the same
    value; in a dict literal the later entry silently overwrites the
    earlier one, so the duplicate was dead weight and has been removed.
    """
    headwordObject = {}

    headWordDict = {
        '[headword]': 'head-word',
        '[graphnum]': 'homograph-index',
        '[variants]': 'spelling-variants',
        '[phonetic]': 'phonetic-transcripts',
        '[crossref]': 'cross-reference'
    }

    for line in sectLines:
        key, text = splitLine(line)
        newKey = headWordDict[key]
        newText = listToString(text)
        headwordObject[newKey] = newText

    return headwordObject
# Esempio n. 15
# 0
def secondRun(fileName, dirIn, dirOut):
    """Second compaction pass: merge runs of consecutive same-tag lines.

    Reads fileName from dirIn, splits each non-empty line into a
    (tag, text) pair, groups adjacent lines sharing the same tag, joins
    each group's texts with '|' separators behind a single tag, and
    writes the compacted lines to dirOut under the same file name.
    """

    pathIn = os.path.join(dirIn, fileName)

    pathOut = os.path.join(dirOut, fileName)

    lines = getLineFromTextFile(pathIn)

    lineTuple = []  # (tag, text) for every non-empty input line

    for line in lines:
        if (line):
            key, text = splitLine(line)
            lineTuple.append((key, text))
    #print(lineTuple)

    # span collects the indices of one run of equal-tag lines;
    # lineMap collects the finished runs.
    span = []
    lineMap = []

    # NOTE(review): when the final two lines share the same tag, the
    # trailing span is appended to but never pushed into lineMap, so the
    # last run of the file is dropped -- confirm whether that is intended.
    for i in range(len(lineTuple) - 1):
        #print(lineTuple[i])
        #key = lineTuple[i][0]
        #value = lineTuple[i][1]
        #print(key, value)
        if (lineTuple[i][0] == lineTuple[i + 1][0]):
            #match
            span.append(i)
        else:
            span.append(i)
            lineMap.append(span)
            span = []

        #last item
        if (i == len(lineTuple) - 2):
            #print('i+1:', i+1)
            #print(lineTuple[i], lineTuple[i+1])
            if (lineTuple[i][0] != lineTuple[i + 1][0]):
                span = []
                span.append(i + 1)
                lineMap.append(span)

    dataOut = []
    #print(lineMap)
    for items in lineMap:
        if len(items) == 1:
            # Single-line run: re-emit tag + text unchanged.
            #print('single', items)
            #print()
            idx = items[0]
            #print(lineTuple[idx])
            line = lineTuple[idx][0] + lineTuple[idx][1]
            #print(line)
            dataOut.append(line)
        else:
            # Multi-line run: one tag followed by '|'-joined texts
            # (note the join leaves a trailing '|').
            #print('series', items)
            header = ''
            text = ''
            for idx in items:
                if lineTuple[idx]:
                    #print(lineTuple[idx][0],  lineTuple[idx][1])
                    if not header:
                        header = lineTuple[idx][0]
                    text += lineTuple[idx][1] + '|'
            line = header + text
            dataOut.append(line)

    writeListToFile(dataOut, pathOut)
# Esempio n. 16
# 0
def runFourthProcess(fileName, dirIn, dirOut):
    """Fourth pass (early draft): split a compacted entry into sections.

    Reads fileName from dirIn, finds the start index of every known
    section marker, slices the file into sections, dispatches each slice
    to the matching runlib processor, and pretty-prints the collected
    objects. Unlike the later version of this function in this file, it
    does not merge the objects or write a JSON file (pathOut is unused).

    NOTE(review): if the input file yields no lines, lineMap is never
    assigned and STEP 2 raises NameError -- confirm inputs are never
    empty or guard the later steps.
    """

    fileNameJSON = fileName.replace(".txt", ".json")

    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)

    sLines = []  # (tag, text) for every non-empty line

    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START

    if (lines):
        for line in lines:
            if (line):
                key, text = splitLine(line)
                sLines.append((key, text))

        #run for significant index
        lineMap = []  # (section tag, line index) per section start
        idx = 0
        sectionList = [
            '[headword]', '[category]', '[secphrases]', '[secphrasal]',
            '[secusage]', '[secpronun]', '[secorigin]'
        ]

        for sLine in sLines:
            #print(sLine[0], sLine[1])
            #if sLine[0] == '[headword]'
            for section in sectionList:
                if (sLine[0]) == section:
                    lineMap.append((section, idx))

            idx += 1

    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    # Each section runs from its marker to the next; the final one ends
    # at len(lines) - 1.
    # NOTE(review): range(firstIdex, lastIndex) in STEP 3 means the last
    # line of the file is never processed -- confirm it is safe to drop.
    #print('lineMap:', lineMap)

    idxMap = []
    for i in range(len(lineMap) - 1):
        tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
        idxMap.append(tup)
        if (i == len(lineMap) - 2):
            lastIdx = i + 1
            #print(lineMap[lastIdx][1])
            tup = (lineMap[lastIdx][0], lineMap[lastIdx][1], len(lines) - 1)
            idxMap.append(tup)

    #STEP 3: HANDLE EACH SECTION
    # Note: sections are sliced from the raw 'lines', not from sLines.
    #print('\nindex map:', idxMap)

    objectList = []
    for item in idxMap:
        sectionName, firstIdex, lastIndex = item
        sectLines = []
        for i in range(firstIdex, lastIndex):
            sectLines.append(lines[i])

        if (sectionName == '[headword]'):
            objHW = runlib.processHeadWordLines(sectLines)
            objectList.append(objHW)
            #print(objHW)
        elif (sectionName == '[category]'):
            objCategory = runlib.processCategoryLines(sectLines)
            objectList.append(objCategory)

        elif (sectionName == '[secphrases]'):
            objPhrases = runlib.processPhraseLines(sectLines)
            objectList.append(objPhrases)

        elif (sectionName == '[secphrasal]'):
            objPhrases = runlib.processPhraseVerbLines(sectLines)
            objectList.append(objPhrases)

        elif (sectionName == '[secusage]'):
            objUsage = runlib.processUsageLines(sectLines)
            objectList.append(objUsage)

        elif (sectionName == '[secorigin]'):
            objOrigin = runlib.processOriginLines(sectLines)
            objectList.append(objOrigin)

        elif (sectionName == '[secpronun]'):
            objPhonetic = runlib.processPhoneticLines(sectLines)
            objectList.append(objPhonetic)

    pprint(objectList)
# (stray unmatched triple-quote from the scraped source removed)
# Esempio n. 17
# 0
# --- Module-level script fragment (scraped example) ---
# Duplicates STEP 1 of runFourthProcess for one hard-coded file: reads
# 'A-star.txt', splits each non-empty line into a (tag, text) pair, and
# records in lineMap the index of every section-start marker.
# NOTE(review): relies on `sh` and `splitLine` defined elsewhere; the
# Windows paths below are machine-specific, and pathOut/lineMap are
# never consumed within this fragment.
fileName = 'A-star.txt'
fileNameJSON = fileName.replace(".txt", ".json")
dirIn = 'E:/FULLTEXT/LEXICO/COMPACT'
dirOut = 'E:/FULLTEXT/LEXICO/JSON'
pathIn = os.path.join(dirIn, fileName)
pathOut = os.path.join(dirOut, fileNameJSON)
lines = sh.getLineFromTextFile(pathIn)

sLines = []

#STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START

if(lines):
	for line in lines:
		if(line):
			key, text = splitLine(line)
			sLines.append((key,text))

	#run for significant index
	lineMap =[]
	idx = 0
	sectionList = ['[headword]','[category]', '[secphrases]', '[secphrasal]','[secusage]','[secpronun]', '[secorigin]']

	for sLine in sLines:
		#print(sLine[0], sLine[1])
		#if sLine[0] == '[headword]'
		for section in sectionList:
			if (sLine[0]) == section:
				lineMap.append((section, idx))

		idx += 1		
def runFourthProcess(fileName, dirIn, dirOut):
    """Fourth pass: convert a compacted dictionary entry to a JSON file.

    Reads fileName from dirIn, finds the start index of every known
    section marker, slices the file into sections, dispatches each slice
    to the matching runlib processor, merges the resulting dicts into one
    object (later keys overwrite earlier ones), and writes it as JSON to
    dirOut. Returns a human-readable completion message.

    NOTE(review): if the input file yields no lines, lineMap is never
    assigned and STEP 2 raises NameError -- confirm inputs are never
    empty or guard the later steps.
    """

    fileNameJSON = fileName.replace(".txt", ".json")

    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)

    #pprint(lines)
    sLines = []  # (tag, text) for every non-empty line

    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START

    if (lines):
        for line in lines:
            if (line):
                key, text = splitLine(line)
                sLines.append((key, text))

        #run for significant index
        lineMap = []  # (section tag, line index) per section start
        idx = 0
        sectionList = [
            '[headword]', '[category]', '[secphrases]', '[secphrasal]',
            '[secusage]', '[secpronun]', '[secorigin]'
        ]

        for sLine in sLines:
            #print(sLine[0], sLine[1])
            #if sLine[0] == '[headword]'
            for section in sectionList:
                if (sLine[0]) == section:
                    lineMap.append((section, idx))

            idx += 1

    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    # Each section runs from its marker to the next; the final one ends
    # at len(lines) - 1. A single-section file is handled separately.
    # NOTE(review): range(firstIdex, lastIndex) in STEP 3 means the last
    # line of the file is never processed -- confirm it is safe to drop.
    #print('lineMap:', lineMap)
    #print('len -1', len(lineMap)-1)
    idxMap = []

    if len(lineMap) == 1:
        tup = (lineMap[0][0], lineMap[0][1], len(lines) - 1)
        idxMap.append(tup)
    elif len(lineMap) > 1:
        for i in range(len(lineMap) - 1):
            tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
            idxMap.append(tup)
            if (i == len(lineMap) - 2):
                lastIdx = i + 1
                #print(lineMap[lastIdx][1])
                tup = (lineMap[lastIdx][0], lineMap[lastIdx][1],
                       len(lines) - 1)
                idxMap.append(tup)

    #STEP 3: HANDLE EACH SECTION
    # Note: sections are sliced from the raw 'lines', not from sLines.
    #print('\nindex map:', idxMap)

    objectList = []
    for item in idxMap:
        sectionName, firstIdex, lastIndex = item
        sectLines = []
        for i in range(firstIdex, lastIndex):
            sectLines.append(lines[i])

        if (sectionName == '[headword]'):
            objHW = runlib.processHeadWordLines(sectLines)
            objectList.append(objHW)
            #print(objHW)
        elif (sectionName == '[category]'):
            objCategory = runlib.processCategoryLines(sectLines)
            objectList.append(objCategory)

        elif (sectionName == '[secphrases]'):
            objPhrases = runlib.processPhraseLines(sectLines)
            objectList.append(objPhrases)

        elif (sectionName == '[secphrasal]'):
            objPhrases = runlib.processPhraseVerbLines(sectLines)
            objectList.append(objPhrases)

        elif (sectionName == '[secusage]'):
            objUsage = runlib.processUsageLines(sectLines)
            objectList.append(objUsage)

        elif (sectionName == '[secorigin]'):
            objOrigin = runlib.processOriginLines(sectLines)
            objectList.append(objOrigin)

        elif (sectionName == '[secpronun]'):
            objPhonetic = runlib.processPhoneticLines(sectLines)
            objectList.append(objPhonetic)

    #STEP 4: MERGE OBJECTS
    # Flatten the per-section dicts into one object; duplicate keys are
    # overwritten by later sections.
    masterObject = {}
    for obj in objectList:
        for key in obj:
            masterObject[key] = obj[key]

    #@pprint(masterObject)

    #STEP 5: WRITE OUT JSON FILE
    with open(pathOut, 'w', encoding="utf-8") as outfile:
        json.dump(masterObject, outfile)

    message = 'Finished converting ' + fileName + ' to JSON'
    return message