# XMLtoJSON.py (name taken from the progress-bar label below): reads every
# *.xml file in the directory given after "-i" and walks its <lem>/<rdg>
# readings word by word.
import datetime
import json
import os
import sys
import xml.dom.minidom as minidom

import Preprocessing

# From here on the working directory is the script's own directory, so a
# relative input path is resolved against it.
os.chdir(os.path.abspath(os.path.dirname(__file__)))

args = sys.argv
# argv is the script name plus "-i <directory>", i.e. exactly two user arguments.
assert len(args) == 3, "Expected exactly 2 arguments! -i followed by input directory path"
# "-i" has to come directly before the path; looking it up with index() + 1
# would raise IndexError instead of this message when "-i" is given last.
assert args[1] == '-i' and os.path.exists(args[2]), "Invalid input directory"
path = args[2]

# Keep the directory entries whose last dot-separated component is "xml".
xmls = [name for name in os.listdir(path) if name.split('.')[-1] == 'xml']
l = len(xmls)
count = 0
print  # Python 2 print statement: writes an empty line
for count, afile in enumerate(xmls, 1):
    Preprocessing.updateProgressBar('XMLtoJSON.py', float(100)*count/l)
    unit = Preprocessing.parseName(afile)
    root = {}
    alldocs = []
    # Every element of the document whose local name is 'lem' or 'rdg'.
    rdgs = [el for el in minidom.parse(os.path.join(path, afile)).getElementsByTagName('*')
            if el.localName in ('lem', 'rdg')]
    for rdg in rdgs:
        docLevel = {'id': rdg.getAttribute('wit')}
        tokenList = []
        ws = rdg.getElementsByTagName('w')
        words = []
        for w in range(len(ws)):
            # Skip <w> elements that have no text node among their children.
            if not any(child.nodeType == minidom.Node.TEXT_NODE for child in ws[w].childNodes):
                continue
            currentWord = ws[w]
            previousWord = ''
            # NOTE(review): this chunk ends here; the loop body may continue.
# argv is the script name plus "-i <directory>", i.e. exactly two user arguments.
assert len(args) == 3, "Expected exactly 2 arguments!\n\n-i followed by input directory path"
# "-i" has to come directly before the path; looking it up with index() + 1
# would raise IndexError instead of this message when "-i" is given last.
assert args[1] == '-i' and os.path.exists(args[2]), "Invalid input directory"


def normalChars(l):
    """Return *l* with the entities &lt;, &gt; and &quot; turned back into <, > and ".

    NOTE(review): as found, each replace() swapped a character for itself,
    which does nothing. The entity forms are restored on the assumption that
    they were decoded somewhere on the way here -- confirm against the
    original file.
    """
    return l.replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')


path = args[2]
# Keep the directory entries whose last dot-separated component is "json".
jsons = [name for name in os.listdir(path) if name.split('.')[-1] == 'json']
# The bare file names below are opened relative to the input directory.
os.chdir(path)
c = 0
l = len(jsons)
couldnt = []
print  # Python 2 print statement: writes an empty line
for afile in jsons:
    c += 1
    Preprocessing.updateProgressBar('JSONtoXML.py', float(100)*c/l)
    # Close the input file as soon as it has been parsed.
    with open(afile, 'r') as source:
        data = json.load(source)
    # Despite its name, this maps a position in data['witnesses'] to the
    # witness stored at that position.
    nameToNumber = {number: name for number, name in enumerate(data['witnesses'])}
    # 'x.json'[:-4] is 'x.', so appending 'xml' yields 'x.xml'.
    with codecs.open(afile[:-4] + 'xml', 'w') as out:
        doc = minidom.Document()
        witnessElement = doc.createElement('witnesses')
        doc.appendChild(witnessElement)
        blockc = 0
        for block in data['table']:
            blockc += 1
            blockElement = doc.createElement('block')
            # 'n' is the zero-based position of the block in data['table'].
            blockElement.setAttribute('n', str(blockc-1))
            number = 0
            for token in block:
                tokenElement = doc.createElement('token')
                # NOTE(review): this chunk ends here; the loop body may continue.
return previous_row[-1] def isBlank(node): return node.getAttribute('n') == '' os.chdir(path) if os.path.exists('Postprocessed'): shutil.rmtree('Postprocessed') os.mkdir('Postprocessed') print for afile in xmls: c += 1 Preprocessing.updateProgressBar('Postprocessing.py', float(100)*c/x) doc = minidom.parse(os.path.join(path, afile)) blocks = doc.getElementsByTagName('block') tokens = doc.getElementsByTagName('token') blanks = [token for token in tokens if token.getAttribute('n') == ''] if blanks: #generate dictionary of witness to its token nodes for each row column1Toks = blocks[0].getElementsByTagName('token') wit2toks = {} for token in column1Toks: wit = token.getAttribute('witness') row = [token for token in doc.getElementsByTagName('token') if token.nodeType == 1 and token.getAttribute('witness') == wit] wit2toks[wit] = row for (wit, row) in wit2toks.items(): #generate list of lists of sequences of empty tokens fin = []
if textNodeValue != '-': normalizedAttrValue = token[0]['n'] else: textNodeValue = '' normalizedAttrValue = '' tokenElement.appendChild(doc.createTextNode(textNodeValue)) tokenElement.setAttribute('n', normalizedAttrValue) tokenElement.setAttribute('u', unitValue) tokenElement.setAttribute('witness', nameToNumber[number]) blockElement.appendChild(tokenElement) number += 1 line.appendChild(blockElement) return pseudoPrettyPrint(normalChars(line.toprettyxml().encode('utf-8'))) if os.path.exists('output.xml'): os.remove('output.xml') with codecs.open('output.xml', 'a') as out: out.write('<collationOutput>\n') for app in apps: c += 1 Preprocessing.updateProgressBar('Collation', float(100)*c/l) collationResults = collate_pretokenized_json(createJsonRepresentation(app), 'json') out.write(processColumn(collationResults, getUnit(app))) if c % FLUSH == 0: Preprocessing.updateProgressBar('Collation', float(100)*c/l, True) gc.collect() out.write('</collationOutput>') print '\nTook', datetime.datetime.now() - startTime, 'to execute.'