assert len(args) == 5, "Expected 4 arguments! \n\n-i followed by input directory path\n-o followed by output file path" assert '-i' in args and os.path.exists(args[args.index('-i')+1]), "Invalid input directory" assert '-o' in args and not args.index('-o') == len(args)-1, "No output file path provided" path = args[args.index('-i')+1] jsonFileName = os.path.join(os.getcwd(), args[args.index('-o')+1]) xmls = filter(lambda x: str(x.split('.')[len(x.split('.'))-1]) == 'xml' , os.listdir(path)) root = {} alldocs = [] l = len(xmls) count = 0 for afile in xmls: count += 1 print 'XMLsToJSON.py: Processing', afile, 'file', count, 'out of', l unit = Preprocessing.parseName(afile) docLevel = {} docLevel['id'] = afile tokenList = [] if debug: html.write('<h2>' + afile + '</h2><table border = "1"><th>Original<th>Conflated</th>') ws = minidom.parse(os.path.join(path, afile)).getElementsByTagName('w') words = [] for w in range(len(ws)): if not 3 in [child.nodeType for child in ws[w].childNodes]: #checking presence of text nodes inside the w continue currentWord = ws[w] previousWord = '' try: previousWord = ws[w-1] except IndexError:
# Build the <witnesses> XML document from the collation table and write it,
# one pretty-printed line at a time, to ``out``.
#
# Every entry of data['table'] becomes a <block n="..."> element (numbered
# from 0) that holds one <token> element per entry of that block. Each token
# carries:
#   n        - the token's 'n' value ('' when the entry is empty)
#   witness  - nameToNumber[<position of the token inside the block>]
#   u        - the value returned by Preprocessing.parseName(afile)
# and its text content is the token's 't' value ('' when the entry is empty).
doc = minidom.Document()
witnessElement = doc.createElement('witnesses')
doc.appendChild(witnessElement)

for blockIndex, block in enumerate(data['table']):
    blockElement = doc.createElement('block')
    # setAttribute() creates the attribute when it does not exist yet, so no
    # separate createAttribute()/setAttributeNode() step is required.
    blockElement.setAttribute('n', str(blockIndex))

    for witnessIndex, token in enumerate(block):
        # NOTE(review): this call does not depend on the loop variables; it is
        # kept inside the loop so it is still never invoked for an empty
        # table. Hoist it once parseName is confirmed to be side-effect free.
        unitValue = Preprocessing.parseName(afile)

        if token:
            # Only the first reading of the entry is exported.
            textNodeValue = token[0]['t']
            normalizedAttrValue = token[0]['n']
        else:
            # Empty entry: emit an empty token so positions stay aligned.
            textNodeValue = ''
            normalizedAttrValue = ''

        tokenElement = doc.createElement('token')
        tokenElement.appendChild(doc.createTextNode(textNodeValue))
        # Attributes are added in the same order as before (n, witness, u).
        tokenElement.setAttribute('n', normalizedAttrValue)
        tokenElement.setAttribute('witness', nameToNumber[witnessIndex])
        tokenElement.setAttribute('u', unitValue)
        blockElement.appendChild(tokenElement)

    witnessElement.appendChild(blockElement)

# Each pretty-printed line is passed through normalChars() and written as
# UTF-8 followed by a newline.
for ln in doc.toprettyxml().split('\n'):
    out.write(normalChars(ln).encode('utf-8') + '\n')