def createJsonRepresentation(app):
    """Build a CollateX-style JSON witness structure from one <app> element.

    Each element child of ``app`` (a <lem>/<rdg>-like reading) becomes one
    witness entry: ``{'id': @wit, 'tokens': [{'t', 'n', 'u'}, ...]}``.

    :param app: a minidom Element whose element children hold <w> tokens
    :return: plain dict of the form ``{'witnesses': [...]}``

    Fix vs. original: removed dead code — ``previousWord`` (and the
    ``enumerate`` index needed only to compute it) was assigned on every
    iteration but never read.
    """
    unit = getUnit(app)
    root = {}
    allWits = []
    # element children only (nodeType 1 == ELEMENT_NODE)
    rdgs = [el for el in app.childNodes if el.nodeType == 1]
    for rdg in rdgs:
        appLevel = {}
        appLevel['id'] = rdg.getAttribute('wit')
        tokenList = []
        ws = rdg.getElementsByTagName('w')
        for w in ws:
            # skip <w> elements carrying no text node (nodeType 3 == TEXT_NODE)
            if 3 not in [child.nodeType for child in w.childNodes]:
                continue
            token = {}
            # strip the '<w n="...">' prefix (8 + len(@n) chars) and the
            # '</w>' suffix (4 chars) from the serialized element,
            # leaving only the raw word text
            token['t'] = w.toxml()[8 + len(w.getAttribute('n')):-4]
            token['n'] = Preprocessing.conflate(w)
            token['u'] = unit
            tokenList.append(token)
        appLevel['tokens'] = tokenList
        allWits.append(appLevel)
    root['witnesses'] = allWits
    # round-trip through JSON to normalize the structure to plain JSON types
    # (behavior kept from the original; effectively a deep copy here)
    return json.loads(json.dumps(root))
# NOTE(review): this is a whitespace-mangled paste — an entire per-file loop
# body collapsed onto one physical line (the mid-line '#' comments prove it
# was originally multi-line; as written, everything after the first '#' is
# dead).  The enclosing loop header (presumably `for afile in ...:`) and the
# initialization of `docLevel` are outside this view, so the original nesting
# of the trailing statements (root['witnesses'], the json write, the print)
# cannot be reconstructed with confidence; the code is therefore left
# byte-identical.  Logic, as far as it can be read:
#   - per XML file `afile` under `path`: collect every <w> element that
#     contains a text node (nodeType 3), slice its serialization to get the
#     raw word text ('t'), conflate it ('n'), and tag the unit ('u');
#   - if a conflated word equals the conflation of the previous word, append
#     '1' to mark a suspected manuscript repetition;
#   - accumulate per-file token lists into `alldocs` and dump the whole
#     structure to `jsonFileName` as UTF-8 JSON (Python 2: bytes into a
#     text-mode file), then print the elapsed time (Python 2 print statement).
# BUG(review): `previousWord = ws[w-1]` with w == 0 yields ws[-1] — Python's
# negative indexing returns the LAST word instead of raising IndexError, so
# the `except IndexError` guard never fires and the first word is compared
# against the last one.  Intended guard: `if w > 0: previousWord = ws[w-1]`.
# TODO confirm against the enclosing loop before fixing.
docLevel['id'] = afile tokenList = [] if debug: html.write('<h2>' + afile + '</h2><table border = "1"><th>Original<th>Conflated</th>') ws = minidom.parse(os.path.join(path, afile)).getElementsByTagName('w') words = [] for w in range(len(ws)): if not 3 in [child.nodeType for child in ws[w].childNodes]: #checking presence of text nodes inside the w continue currentWord = ws[w] previousWord = '' try: previousWord = ws[w-1] except IndexError: pass token = {} token['t'] = currentWord.toxml()[8+len(currentWord.getAttribute('n')):-4] c = Preprocessing.conflate(currentWord) if c == Preprocessing.conflate(previousWord): c += '1' # tag '1' to the end of a wod that we suspect is repeated in the manuscript. token['n'] = c token['u'] = unit words.append(c) tokenList.append(token) docLevel['tokens'] = tokenList alldocs.append(docLevel) root['witnesses'] = alldocs with open(os.path.join(path, jsonFileName), 'w') as Json: Json.write(json.dumps(root, ensure_ascii=False).encode('utf-8')) print 'Took', datetime.datetime.now()-startTimeX2J, 'to execute XMLsToJSON.py'
# Build a CollateX-style JSON document from the <lem>/<rdg> readings of one
# XML file and write it next to the source file as '<name>.json'.
# NOTE(review): relies on names bound by the enclosing (unseen) context:
# `path`, `afile`, `unit`, plus module-level `minidom`, `os`, `json`,
# `Preprocessing`.  Reconstructed from a whitespace-mangled one-liner; the
# `for rdg ...` loop structure is fully visible so the nesting is reliable.
# Fix vs. original: removed the dead `previousWord` try/except — its value
# was never read in this fragment (and the IndexError guard was inert anyway,
# since ws[-1] wraps to the last element instead of raising).
root = {}
alldocs = []
# every <lem> or <rdg> element anywhere in the document
rdgs = [el for el in minidom.parse(os.path.join(path, afile)).getElementsByTagName('*')
        if el.localName in ['lem', 'rdg']]
for rdg in rdgs:
    docLevel = {}
    docLevel['id'] = rdg.getAttribute('wit')
    tokenList = []
    ws = rdg.getElementsByTagName('w')
    words = []  # conflated forms; kept in case later (unseen) code reads it
    for w in range(len(ws)):
        # skip <w> elements carrying no text node (nodeType 3 == TEXT_NODE)
        if 3 not in [child.nodeType for child in ws[w].childNodes]:
            continue
        currentWord = ws[w]
        token = {}
        # strip the '<w n="...">' prefix (8 + len(@n) chars) and the
        # '</w>' suffix (4 chars), leaving the raw word text
        token['t'] = currentWord.toxml()[8 + len(ws[w].getAttribute('n')):-4]
        c = Preprocessing.conflate(currentWord)
        token['n'] = c
        token['u'] = unit
        words.append(c)
        tokenList.append(token)
    docLevel['tokens'] = tokenList
    alldocs.append(docLevel)
root['witnesses'] = alldocs
# afile[:-3] drops the 'xml' of '.xml' but keeps the dot: 'name.xml' -> 'name.json'
with open(os.path.join(path, afile[:-3] + 'json'), 'w') as Json:
    # Python 2: UTF-8 bytes written into a text-mode file
    Json.write(json.dumps(root, ensure_ascii=False).encode('utf-8'))