def processSimplePhrase(lines):
    """Convert the tagged lines of a single-sense phrase into a dict.

    Each line splits into a bracket tag plus text; the tag is translated
    to a readable field name.  '[examples]' keeps its list form, every
    other field is flattened to a string with the ', ,' join artifact
    collapsed to a single comma.
    """
    TAG_TO_FIELD = {
        '[phrshead]': 'phrase',
        '[notegram]': 'grammar-notes',
        '[meanings]': 'meaning',
        '[examples]': 'examples',
        '[synonyms]': 'synonyms',
        '[notebold]': 'highlight',
        '[notetone]': 'register',
        '[notearea]': 'region-domain',
        '[sensenum]': 'sense-number',
        '[crossref]': 'cross-reference'
    }
    phraseObj = {}
    for rawLine in lines:
        tag, payload = splitLine(rawLine)
        fieldName = TAG_TO_FIELD[tag]
        if tag == '[examples]':
            value = listToList(payload)
        else:
            value = listToString(payload).replace(', ,', ',')
        phraseObj[fieldName] = value
    return phraseObj
def processSimpleCategory(sectLines):
    """Build a single-sense category object from one category section.

    The '[category]' line opens the object (keyed by the category text)
    with one empty sense; '[inflects]' fills the inflections field; every
    remaining tag populates a field of that single sense.
    """
    FIELD_NAMES = {
        '[notegram]': 'grammar-notes',
        '[meanings]': 'meaning',
        '[examples]': 'examples',
        '[synonyms]': 'synonyms',
        '[notebold]': 'highlight',
        '[notetone]': 'register',
        '[notearea]': 'region-domain',
        '[crossref]': 'cross-reference'
    }
    result = {}
    currentKey = ''
    for rawLine in sectLines:
        tag, payload = splitLine(rawLine)
        if tag == '[category]':
            currentKey = payload
            result[currentKey] = {'senses': [{}]}
        elif tag == '[inflects]':
            result[currentKey]['inflections'] = listToString(payload)
        else:
            fieldName = FIELD_NAMES[tag]
            if tag == '[examples]':
                value = listToList(payload)
            else:
                value = listToString(payload).replace(', ,', ',')
            result[currentKey]['senses'][0][fieldName] = value
    return result
def processMeanings(senseLines):
    """Convert the lines of one numbered sense into a sense object.

    Tags are mapped to readable field names; '[examples]' keeps its list
    form while every other field is flattened to a string with the ', ,'
    join artifact collapsed to a single comma.
    """
    objectOut = {}
    # BUG FIX: '[notegram]' previously mapped to the misspelled key
    # 'grammer-notes'; the sibling processors (processSimplePhrase,
    # processSimpleCategory) emit 'grammar-notes', so the output JSON
    # carried two spellings of the same field.
    termDict = {
        '[sensenum]': 'sense-number',
        '[notegram]': 'grammar-notes',
        '[meanings]': 'meaning',
        '[examples]': 'examples',
        '[synonyms]': 'synonyms',
        '[notebold]': 'highlight',
        '[notetone]': 'register',
        '[notearea]': 'region-domain',
        '[crossref]': 'cross-reference',
        '[variants]': 'spelling-variants'
    }
    for line in senseLines:
        key, text = splitLine(line)
        newKey = termDict[key]
        if (key == '[examples]'):
            newText = listToList(text)
        else:
            temp = listToString(text)
            newText = temp.replace(', ,', ',')
        objectOut[newKey] = newText
    return objectOut
def processUsageLines(sectLines):
    """Pull the usage note out of a '[secusage]' section.

    Returns {'usage': <text>} when a '[wrdusage]' line is present,
    otherwise an empty dict.
    """
    result = {}
    for rawLine in sectLines:
        tag, payload = splitLine(rawLine)
        if tag == '[wrdusage]':
            result['usage'] = payload
    return result
def processOriginLines(sectLines):
    """Pull the etymology out of a '[secorigin]' section.

    Returns {'word-origin': <text>} when a '[wordroot]' line is present,
    otherwise an empty dict.
    """
    result = {}
    for rawLine in sectLines:
        tag, payload = splitLine(rawLine)
        if tag == '[wordroot]':
            result['word-origin'] = payload
    return result
def processPhoneticLines(sectLines):
    """Pull the phonetic transcription out of a '[secpronun]' section.

    Returns {'phonetic': <string>} when a '[phonetic]' line is present,
    otherwise an empty dict.
    """
    result = {}
    for rawLine in sectLines:
        tag, payload = splitLine(rawLine)
        if tag == '[phonetic]':
            result['phonetic'] = listToString(payload)
    return result
def processComplexCategory(sectLines):
    """Build one part-of-speech object from a section holding a category
    header followed by one or more numbered senses.

    The header chunk goes through processCategoryHeader; each sense chunk
    goes through processMeanings; the collected senses are then attached
    to the header object under its category key.
    """
    # STEP 1: find where the header and every sense begin.
    markers = []
    for position, rawLine in enumerate(sectLines):
        tag, _payload = splitLine(rawLine)
        if tag == '[category]' or tag == '[sensenum]':
            markers.append((tag, position))
    # STEP 2: turn start positions into (tag, start, stop) spans.
    spans = []
    for i in range(len(markers) - 1):
        spans.append((markers[i][0], markers[i][1], markers[i + 1][1]))
        if i == len(markers) - 2:
            # The final span stops at len(sectLines) - 1, mirroring the
            # other section splitters in this module.
            finalTag, finalStart = markers[i + 1]
            spans.append((finalTag, finalStart, len(sectLines) - 1))
    # STEP 3: process each span.
    senses = []
    headerObj = {}
    headerKey = ''
    for tag, start, stop in spans:
        chunk = sectLines[start:stop]
        if tag == '[category]':
            headerObj, headerKey = processCategoryHeader(chunk)
        if tag == '[sensenum]':
            senses.append(processMeanings(chunk))
    # ASSEMBLY: hang the senses off the header object.
    headerObj[headerKey]['senses'] = senses
    return headerObj
def processCategoryHeader(headLines):
    """Parse the header lines of a complex category.

    Returns (headerObj, headKey): headerObj maps the category text to a
    dict with an empty 'senses' list (and 'inflections' when present),
    and headKey is that category text so the caller can attach senses.
    """
    headerObj = {}
    headKey = ''
    for rawLine in headLines:
        tag, payload = splitLine(rawLine)
        if tag == '[category]':
            headKey = payload
            headerObj[headKey] = {'senses': []}
        elif tag == '[inflects]':
            headerObj[headKey]['inflections'] = listToString(payload)
    return (headerObj, headKey)
def processPhraseVerbLines(sectLines):
    """Split a '[secphrasal]' section into phrase blocks and process each.

    Returns {'phrases-verbs': [...]} with empty phrase objects dropped.
    """
    objectOut = {}
    #STEP 1: EXTRACT START INDEXES OF EACH PHRASE HEADER
    lineMap = []
    num = 0
    for line in sectLines:
        key, text = splitLine(line)
        if (key == '[phrshead]'):
            lineMap.append((key, num))
        num += 1
    #STEP 2: EXTRACT START AND END INDEXES
    # BUG FIX: a section with exactly one phrase used to yield an empty
    # idxMap (the pairwise loop below never ran), so the phrase was
    # silently dropped; handle the single-entry case explicitly, matching
    # the pattern used in runFourthProcess.
    idxMap = []
    if len(lineMap) == 1:
        idxMap.append((lineMap[0][0], lineMap[0][1], len(sectLines) - 1))
    elif len(lineMap) > 1:
        for i in range(len(lineMap) - 1):
            tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
            idxMap.append(tup)
            if (i == len(lineMap) - 2):
                lastIdx = i + 1
                # NOTE(review): the final span stops at len(sectLines) - 1,
                # excluding the very last line — mirrors the sibling
                # splitters; confirm sections carry a trailing marker line.
                tup = (lineMap[lastIdx][0], lineMap[lastIdx][1], len(sectLines) - 1)
                idxMap.append(tup)
    #STEP 3: PROCESS EACH PART
    objectPhrases = []
    for item in idxMap:
        partName, firstIdex, lastIndex = item
        partLines = []
        for i in range(firstIdex, lastIndex):
            partLines.append(sectLines[i])
        if (partName == '[phrshead]'):
            objPhrase = processPhrase(partLines)
            objectPhrases.append(objPhrase)
    #remove empty phrase objects
    newObjectList = [item for item in objectPhrases if item]
    objectOut['phrases-verbs'] = newObjectList
    return objectOut
def processComplexPhrase(phraseLines):
    """Process a phrase block that contains one or more numbered senses.

    Returns a list of sense objects, each carrying the shared phrase head
    under the 'phrase' key.
    """
    objPhraseList = []
    phraseHead = ''
    #STEP 1: record the phrase head and where each sense starts
    lineMap = []
    num = 0
    for line in phraseLines:
        key, text = splitLine(line)
        if (key == '[phrshead]'):
            phraseHead = text
        elif (key == '[sensenum]'):
            lineMap.append((key, num))
        num += 1
    #STEP 2: EXTRACT START AND END INDEXES
    # BUG FIX: a phrase with exactly one '[sensenum]' (reachable, since
    # processPhrase dispatches here whenever senseTotal > 0) used to
    # produce an empty idxMap and silently lose the sense; handle the
    # single-entry case explicitly, matching runFourthProcess.
    idxMap = []
    if len(lineMap) == 1:
        idxMap.append((lineMap[0][0], lineMap[0][1], len(phraseLines) - 1))
    elif len(lineMap) > 1:
        for i in range(len(lineMap) - 1):
            tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
            idxMap.append(tup)
            if (i == len(lineMap) - 2):
                lastIdx = i + 1
                tup = (lineMap[lastIdx][0], lineMap[lastIdx][1], len(phraseLines) - 1)
                idxMap.append(tup)
    #STEP 3: PROCESS EACH PART
    for item in idxMap:
        partName, firstIdex, lastIndex = item
        partLines = []
        for i in range(firstIdex, lastIndex):
            partLines.append(phraseLines[i])
        if (partName == '[sensenum]'):
            objPhrasePart = processSimplePhrase(partLines)
            objPhrasePart['phrase'] = phraseHead
            objPhraseList.append(objPhrasePart)
    return objPhraseList
def processHeadWordLines(sectLines):
    # NOTE(review): this definition is shadowed by a later function of the
    # same name in this module, so at import time this version is dead
    # code; decide which of the two implementations is intended.
    # DETERMINE IF THIS IS SIMPLE OR COMPLEX HEADWORD
    #pprint(sectLines)
    # Any of these tags appearing in the section means sense data follows
    # the headword lines, i.e. a "complex" headword.
    senseSigns = [
        '[sensenum]', '[notegram]', '[meanings]', '[examples]', '[synonyms]',
        '[notebold]', '[notetone]', '[notearea]', '[crossref]'
    ]
    complexHeadWord = False
    hwLastIndex = 0
    num = 0
    # Find the index of the first sense-ish line; everything before it is
    # headword material, everything from it onward is sense material.
    for line in sectLines:
        key, text = splitLine(line)
        if (key in senseSigns):
            complexHeadWord = True
            hwLastIndex = num
            #print('last index:', hwLastIndex)
            break
        num += 1
    if complexHeadWord:
        newHeadWordLines = []
        newSenseLines = []
        for i in range(hwLastIndex):
            newHeadWordLines.append(sectLines[i])
        for k in range(hwLastIndex, len(sectLines)):
            newSenseLines.append(sectLines[k])
        # processSimpleHeadword is defined elsewhere in this project.
        complexObject = processSimpleHeadword(newHeadWordLines)
        senseObject = processMeanings(newSenseLines)
        # NOTE(review): the sense object is stored under the literal key
        # 'symbol' — looks accidental given the commented-out merge just
        # below; confirm what downstream consumers expect.
        complexObject['symbol'] = senseObject
        #merge to objext
        #for key in senseObject:
        #    complexObject[key] = senseObject[key]
        return complexObject
    else:
        simpleObject = processSimpleHeadword(sectLines)
        return simpleObject
def processPhrase(phraseLines):
    """Process one phrase block, choosing the simple or complex path.

    A block containing any '[sensenum]' tag is a multi-sense ("complex")
    phrase; otherwise it is a single simple phrase.  Returns a list of
    phrase objects in both cases.
    """
    #COUNT NUMBER OF SENSES TO DECIDE SIMPLE OR COMPLEX PHRASE
    objectList = []
    senseTotal = 0
    for line in phraseLines:
        key, text = splitLine(line)
        if (key == '[sensenum]'):
            senseTotal += 1
    if (senseTotal > 0):
        # BUG FIX: the complex result was assigned to a dead local
        # ('objList') while the still-empty 'objectList' was returned,
        # so every complex phrase came back empty; capture it instead.
        objectList = processComplexPhrase(phraseLines)
    else:
        objectPhrase = processSimplePhrase(phraseLines)
        objectList.append(objectPhrase)
    return objectList
def processCategoryLines(sectLines):
    """Dispatch a category section to the simple or complex processor.

    A section carrying one or more '[sensenum]' lines is "complex"
    (header plus numbered senses); otherwise it is a single-sense
    "simple" category.
    """
    senseCount = 0
    for rawLine in sectLines:
        tag, _payload = splitLine(rawLine)
        if tag == '[sensenum]':
            senseCount += 1
    if senseCount > 0:
        return processComplexCategory(sectLines)
    return processSimpleCategory(sectLines)
def processHeadWordLines(sectLines):
    """Build the headword object from '[headword]' section lines.

    Each tag is mapped to a readable field name and the text is
    flattened to a string.

    NOTE(review): this redefines (and therefore replaces) an earlier
    function of the same name in this module.
    """
    # CREATE AND RETURN A HEADWORD OBJECT
    headwordObject = {}
    # BUG FIX: '[graphnum]' appeared twice in this literal; duplicate
    # dict keys silently collapse, so the redundant entry is removed.
    headWordDict = {
        '[headword]': 'head-word',
        '[graphnum]': 'homograph-index',
        '[variants]': 'spelling-variants',
        '[phonetic]': 'phonetic-transcripts',
        '[crossref]': 'cross-reference'
    }
    for line in sectLines:
        key, text = splitLine(line)
        newKey = headWordDict[key]
        newText = listToString(text)
        headwordObject[newKey] = newText
    return headwordObject
def secondRun(fileName, dirIn, dirOut):
    """Second pipeline pass: merge consecutive lines that share the same
    bracket tag into one line, joining their texts with '|'.

    Reads dirIn/fileName, writes the merged lines to dirOut/fileName.
    """
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileName)
    lines = getLineFromTextFile(pathIn)
    # Parse every non-empty line into a (tag, text) tuple.
    lineTuple = []
    for line in lines:
        if (line):
            key, text = splitLine(line)
            lineTuple.append((key, text))
    #print(lineTuple)
    # Group consecutive indexes whose tags match: 'span' accumulates one
    # run of equal-tag lines, 'lineMap' collects the finished runs.
    span = []
    lineMap = []
    for i in range(len(lineTuple) - 1):
        #print(lineTuple[i])
        #key = lineTuple[i][0]
        #value = lineTuple[i][1]
        #print(key, value)
        if (lineTuple[i][0] == lineTuple[i + 1][0]):
            #match — keep extending the current run
            span.append(i)
        else:
            # run ends here: close it and start a fresh one
            span.append(i)
            lineMap.append(span)
            span = []
        #last item: the loop stops one short, so index i+1 must be placed
        #into either the still-open run (tags matched) or a run of its own.
        if (i == len(lineTuple) - 2):
            #print('i+1:', i+1)
            #print(lineTuple[i], lineTuple[i+1])
            if (lineTuple[i][0] != lineTuple[i + 1][0]):
                span = []
            span.append(i + 1)
            lineMap.append(span)
    # Re-emit: single-index runs pass through unchanged; multi-index runs
    # are fused into '<tag>text1|text2|...|'.
    # NOTE(review): a one-line input (len(lineTuple) == 1) never enters
    # the loop, so that line is dropped — confirm whether that can occur.
    dataOut = []
    #print(lineMap)
    for items in lineMap:
        if len(items) == 1:
            #print('single', items)
            #print()
            idx = items[0]
            #print(lineTuple[idx])
            line = lineTuple[idx][0] + lineTuple[idx][1]
            #print(line)
            dataOut.append(line)
        else:
            #print('series', items)
            header = ''
            text = ''
            for idx in items:
                if lineTuple[idx]:
                    #print(lineTuple[idx][0], lineTuple[idx][1])
                    if not header:
                        header = lineTuple[idx][0]
                    # NOTE(review): every merged text gets a trailing '|',
                    # including the last — presumably consumed downstream.
                    text += lineTuple[idx][1] + '|'
            line = header + text
            dataOut.append(line)
    writeListToFile(dataOut, pathOut)
def runFourthProcess(fileName, dirIn, dirOut):
    """First draft of the fourth stage: split a compacted entry file into
    sections and process each into an object.

    NOTE(review): superseded by the later redefinition of
    runFourthProcess in this module, which also handles single-section
    files, merges the section objects and writes the JSON file; this
    version stops at pretty-printing the object list.
    """
    fileNameJSON = fileName.replace(".txt", ".json")
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)
    sLines = []
    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
    if (lines):
        for line in lines:
            if (line):
                key, text = splitLine(line)
                sLines.append((key, text))
    #run for significant index
    lineMap = []
    idx = 0
    sectionList = [
        '[headword]', '[category]', '[secphrases]', '[secphrasal]',
        '[secusage]', '[secpronun]', '[secorigin]'
    ]
    for sLine in sLines:
        for section in sectionList:
            if (sLine[0]) == section:
                lineMap.append((section, idx))
        idx += 1
    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    idxMap = []
    for i in range(len(lineMap) - 1):
        tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
        idxMap.append(tup)
        if (i == len(lineMap) - 2):
            lastIdx = i + 1
            tup = (lineMap[lastIdx][0], lineMap[lastIdx][1], len(lines) - 1)
            idxMap.append(tup)
    #STEP 3: HANDLE EACH SECTION
    objectList = []
    for item in idxMap:
        sectionName, firstIdex, lastIndex = item
        sectLines = []
        for i in range(firstIdex, lastIndex):
            sectLines.append(lines[i])
        if (sectionName == '[headword]'):
            objHW = runlib.processHeadWordLines(sectLines)
            objectList.append(objHW)
        elif (sectionName == '[category]'):
            objCategory = runlib.processCategoryLines(sectLines)
            objectList.append(objCategory)
        elif (sectionName == '[secphrases]'):
            objPhrases = runlib.processPhraseLines(sectLines)
            objectList.append(objPhrases)
        elif (sectionName == '[secphrasal]'):
            objPhrases = runlib.processPhraseVerbLines(sectLines)
            objectList.append(objPhrases)
        elif (sectionName == '[secusage]'):
            objUsage = runlib.processUsageLines(sectLines)
            objectList.append(objUsage)
        elif (sectionName == '[secorigin]'):
            objOrigin = runlib.processOriginLines(sectLines)
            objectList.append(objOrigin)
        elif (sectionName == '[secpronun]'):
            objPhonetic = runlib.processPhoneticLines(sectLines)
            objectList.append(objPhonetic)
    pprint(objectList)


# BUG FIX: the triple-quoted block below was left unterminated, making
# the module unparsable from this point on; it is now properly closed.
# It preserves an old script-style prototype of this stage, disabled by
# wrapping it in a string literal.
"""
fileName = 'A-star.txt'
fileNameJSON = fileName.replace(".txt", ".json")
dirIn = 'E:/FULLTEXT/LEXICO/COMPACT'
dirOut = 'E:/FULLTEXT/LEXICO/JSON'
pathIn = os.path.join(dirIn, fileName)
pathOut = os.path.join(dirOut, fileNameJSON)
lines = sh.getLineFromTextFile(pathIn)
sLines = []
#STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
if(lines):
    for line in lines:
        if(line):
            key, text = splitLine(line)
            sLines.append((key,text))
#run for significant index
lineMap =[]
idx = 0
sectionList = ['[headword]','[category]', '[secphrases]', '[secphrasal]','[secusage]','[secpronun]', '[secorigin]']
for sLine in sLines:
    #print(sLine[0], sLine[1])
    #if sLine[0] == '[headword]'
    for section in sectionList:
        if (sLine[0]) == section:
            lineMap.append((section, idx))
    idx += 1
"""
def runFourthProcess(fileName, dirIn, dirOut):
    """Fourth pipeline stage: split a compacted entry file into sections,
    process each with the matching runlib handler, merge the resulting
    objects, and write the entry as dirOut/<name>.json.

    Returns a human-readable completion message.
    NOTE(review): this redefines (and replaces) an earlier function of
    the same name in this module.
    """
    fileNameJSON = fileName.replace(".txt", ".json")
    pathIn = os.path.join(dirIn, fileName)
    pathOut = os.path.join(dirOut, fileNameJSON)
    lines = getLineFromTextFile(pathIn)
    #pprint(lines)
    sLines = []
    #STEP 1: CREATE A LINE MAP TO MARK WHERE SECTIONS START
    if (lines):
        for line in lines:
            if (line):
                key, text = splitLine(line)
                sLines.append((key, text))
    #run for significant index
    lineMap = []
    idx = 0
    sectionList = [
        '[headword]', '[category]', '[secphrases]', '[secphrasal]',
        '[secusage]', '[secpronun]', '[secorigin]'
    ]
    # Record (section-tag, index) for every line that opens a section.
    # NOTE(review): idx counts positions in sLines (blank lines skipped)
    # but STEP 3 indexes into the raw 'lines' list — these only line up
    # if the file has no interior blank lines; confirm upstream passes
    # guarantee that.
    for sLine in sLines:
        #print(sLine[0], sLine[1])
        #if sLine[0] == '[headword]'
        for section in sectionList:
            if (sLine[0]) == section:
                lineMap.append((section, idx))
        idx += 1
    #STEP 2: EXTRACT START AND END INDEX FOR EACH SECTION
    #print('lineMap:', lineMap)
    #print('len -1', len(lineMap)-1)
    idxMap = []
    # Single-section files get their one span directly; multi-section
    # files pair each start with the next section's start, and the last
    # section runs to len(lines) - 1.
    # NOTE(review): the final span excludes the very last line of the
    # file — presumably a terminator line; confirm.
    if len(lineMap) == 1:
        tup = (lineMap[0][0], lineMap[0][1], len(lines) - 1)
        idxMap.append(tup)
    elif len(lineMap) > 1:
        for i in range(len(lineMap) - 1):
            tup = (lineMap[i][0], lineMap[i][1], lineMap[i + 1][1])
            idxMap.append(tup)
            if (i == len(lineMap) - 2):
                lastIdx = i + 1
                #print(lineMap[lastIdx][1])
                tup = (lineMap[lastIdx][0], lineMap[lastIdx][1], len(lines) - 1)
                idxMap.append(tup)
    #STEP 3: HANDLE EACH SECTION
    #print('\nindex map:', idxMap)
    objectList = []
    for item in idxMap:
        sectionName, firstIdex, lastIndex = item
        sectLines = []
        for i in range(firstIdex, lastIndex):
            sectLines.append(lines[i])
        # Dispatch the section's raw lines to the matching runlib handler.
        if (sectionName == '[headword]'):
            objHW = runlib.processHeadWordLines(sectLines)
            objectList.append(objHW)
            #print(objHW)
        elif (sectionName == '[category]'):
            objCategory = runlib.processCategoryLines(sectLines)
            objectList.append(objCategory)
        elif (sectionName == '[secphrases]'):
            objPhrases = runlib.processPhraseLines(sectLines)
            objectList.append(objPhrases)
        elif (sectionName == '[secphrasal]'):
            objPhrases = runlib.processPhraseVerbLines(sectLines)
            objectList.append(objPhrases)
        elif (sectionName == '[secusage]'):
            objUsage = runlib.processUsageLines(sectLines)
            objectList.append(objUsage)
        elif (sectionName == '[secorigin]'):
            objOrigin = runlib.processOriginLines(sectLines)
            objectList.append(objOrigin)
        elif (sectionName == '[secpronun]'):
            objPhonetic = runlib.processPhoneticLines(sectLines)
            objectList.append(objPhonetic)
    #STEP 4: MERGE OBJECTS
    # Later sections overwrite earlier ones on key collisions.
    masterObject = {}
    for obj in objectList:
        for key in obj:
            masterObject[key] = obj[key]
    #@pprint(masterObject)
    #STEP 5: WRITE OUT JSON FILE
    with open(pathOut, 'w', encoding="utf-8") as outfile:
        json.dump(masterObject, outfile)
    message = 'Finished converting ' + fileName + ' to JSON'
    return message