Esempio n. 1
0
# Validate the command line held in `args`: it must contain "-i <input dir>"
# and "-o <output file>".
# NOTE(review): the check is for five entries while the message says four --
# presumably args[0] is the script name (e.g. sys.argv); confirm at the place
# where `args` is built.
assert len(args) == 5, "Expected 4 arguments! \n\n-i followed by input directory path\n-o followed by output file path"
# Check that a value follows -i before indexing it; without this guard a
# trailing "-i" would raise IndexError rather than fail with the message below.
assert ('-i' in args
        and args.index('-i') != len(args) - 1
        and os.path.exists(args[args.index('-i') + 1])), "Invalid input directory"
assert '-o' in args and args.index('-o') != len(args) - 1, "No output file path provided"

path = args[args.index('-i') + 1]
# A relative output path is resolved against the current working directory;
# os.path.join returns an absolute second argument unchanged.
jsonFileName = os.path.join(os.getcwd(), args[args.index('-o') + 1])

# Keep the directory entries whose text after the last '.' is exactly 'xml'.
xmls = [entry for entry in os.listdir(path) if entry.rsplit('.', 1)[-1] == 'xml']
root = {}
alldocs = []
l = len(xmls)  # total number of files, shown in the progress message
count = 0
# Process every selected XML file in turn, printing a progress line of the
# form "... <name> file <count> out of <l>" for each one.
for afile in xmls:
    count += 1
    print 'XMLsToJSON.py: Processing', afile, 'file', count, 'out of', l
    unit = Preprocessing.parseName(afile) 
    # Per-file record; its 'id' is the file name.
    docLevel = {}
    docLevel['id'] = afile
    tokenList = []
    if debug:
        # Debug output: write a heading for this file and open an HTML table
        # with the column headers "Original" and "Conflated".
        html.write('<h2>' + afile + '</h2><table border = "1"><th>Original<th>Conflated</th>')
    # Parse the file and collect all of its <w> elements.
    ws = minidom.parse(os.path.join(path, afile)).getElementsByTagName('w')
    words = []
    for w in range(len(ws)):
        # nodeType 3 is a DOM text node; <w> elements without one are skipped.
        if not 3 in [child.nodeType for child in ws[w].childNodes]: #checking presence of text nodes inside the w
            continue
        currentWord = ws[w]
        previousWord = ''
        # NOTE(review): for w == 0 the index w-1 is -1, which selects the LAST
        # <w> element instead of raising IndexError, so the except branch
        # below looks unreachable -- confirm whether a "w == 0" guard was
        # intended here.
        try:
            previousWord = ws[w-1]
        except IndexError:
Esempio n. 2
0
 # Build a <witnesses> document from data['table'] and write it to `out`.
 # Every entry of data['table'] becomes a <block n="..."> element and every
 # item inside that entry becomes a <token> child carrying the attributes
 # n, witness and u.
 doc = minidom.Document()
 witnessElement = doc.createElement('witnesses')
 doc.appendChild(witnessElement)
 # The unit depends only on afile, so compute it once instead of per token.
 unitValue = Preprocessing.parseName(afile)
 # blockc counts the blocks from 1 (it stays 0 for an empty table); the n
 # attribute written below is the 0-based index.
 blockc = 0
 for blockc, block in enumerate(data['table'], 1):
     blockElement = doc.createElement('block')
     # setAttribute() adds the attribute when the element lacks it, so no
     # separate createAttribute()/setAttributeNode() call is needed.
     blockElement.setAttribute('n', str(blockc - 1))
     for number, token in enumerate(block):
         if token:
             # Only the first entry of the item is used.
             firstReading = token[0]
             textNodeValue = firstReading['t']
             normalizedAttrValue = firstReading['n']
         else:
             # Falsy item: still emit a token, with empty text and empty n.
             textNodeValue = normalizedAttrValue = ''
         tokenElement = doc.createElement('token')
         tokenElement.appendChild(doc.createTextNode(textNodeValue))
         tokenElement.setAttribute('n', normalizedAttrValue)
         # The item's position within the block selects its witness label.
         tokenElement.setAttribute('witness', nameToNumber[number])
         tokenElement.setAttribute('u', unitValue)
         blockElement.appendChild(tokenElement)
     witnessElement.appendChild(blockElement)
 # Emit the pretty-printed XML one line at a time: each line goes through
 # normalChars() and is UTF-8 encoded before being written.
 for ln in doc.toprettyxml().split('\n'):
     out.write(normalChars(ln).encode('utf-8') + '\n')