コード例 #1
0
ファイル: XMLtoJSON.py プロジェクト: obdurodon/CollateOS
import datetime, json, os, Preprocessing, sys, xml.dom.minidom as minidom
# Work from the script's own directory; a relative input path is therefore
# resolved against the script location, not the caller's working directory.
os.chdir(os.path.abspath(os.path.dirname(__file__)))
args = sys.argv
# sys.argv holds the script name plus exactly two arguments: "-i" and the path.
assert len(args) == 3, "Expected exactly 2 arguments! -i followed by input directory path"
# Verify that a value really follows "-i" before indexing it, so that a call
# such as "XMLtoJSON.py foo -i" fails with the message below rather than with
# an IndexError.
has_input_dir = ('-i' in args
                 and args.index('-i') + 1 < len(args)
                 and os.path.exists(args[args.index('-i') + 1]))
assert has_input_dir, "Invalid input directory"

path = args[args.index('-i') + 1]

# Keep the directory entries whose last dot-separated component is "xml".
# A real list is built because its length is needed for the progress bar.
xmls = [name for name in os.listdir(path) if name.split('.')[-1] == 'xml']
l = len(xmls)   # total number of files, used to scale the progress percentage
count = 0       # number of files reached so far
# Bare print statement: emits an empty line under Python 2.
print
# Process each selected XML file in turn. Only the beginning of the loop body
# is visible in this excerpt.
for afile in xmls:
    count += 1
    # Hand the share of files reached so far, as a percentage, to the progress helper.
    Preprocessing.updateProgressBar('XMLtoJSON.py', float(100)*count/l)
    unit = Preprocessing.parseName(afile) 
    root = {}
    alldocs = []
    # Parse the file and keep every element whose local name is 'lem' or 'rdg',
    # whatever its namespace prefix.
    rdgs = [el for el in minidom.parse(os.path.join(path, afile)).getElementsByTagName('*') if el.localName in ['lem', 'rdg']]
    for rdg in rdgs:
        docLevel = {}
        # The element's 'wit' attribute is stored as this entry's id.
        docLevel['id'] = rdg.getAttribute('wit')
        tokenList = []
        # All <w> descendants of this lem/rdg element.
        ws = rdg.getElementsByTagName('w')
        words = []
        for w in range(len(ws)):
            # nodeType 3 is the DOM TEXT_NODE constant: skip any <w> that has
            # no text node among its direct children.
            if not 3 in [child.nodeType for child in ws[w].childNodes]: #checking presence of text nodes inside the w
                continue
            currentWord = ws[w]
            previousWord = ''
コード例 #2
0
ファイル: JSONtoXML.py プロジェクト: obdurodon/CollateOS
# `args` is defined above this excerpt (presumably sys.argv - confirm): the
# script name followed by "-i" and the input directory path.
assert len(args) == 3, "Expected exactly 2 arguments!\n\n-i followed by input directory path"
# Verify that a value really follows "-i" before indexing it, so that "-i"
# given as the last argument fails with the message below rather than with an
# IndexError.
has_input_dir = ('-i' in args
                 and args.index('-i') + 1 < len(args)
                 and os.path.exists(args[args.index('-i') + 1]))
assert has_input_dir, "Invalid input directory"

def normalChars(l):
    """Turn the escaped forms of <, > and " in ``l`` back into literal characters.

    Only ``&lt;``, ``&gt;`` and ``&quot;`` are rewritten, in that order; any
    other entity (``&amp;`` included) is returned untouched.
    """
    restored = l
    for entity, literal in (('&lt;', '<'), ('&gt;', '>'), ('&quot;', '"')):
        restored = restored.replace(entity, literal)
    return restored

# The input directory is whatever follows the "-i" flag.
path = args[args.index('-i') + 1]
# Select the directory entries whose final dot-separated piece is "json".
jsons = [entry for entry in os.listdir(path) if str(entry.split('.')[-1]) == 'json']
# From here on, bare file names resolve inside the input directory.
os.chdir(path)
l = len(jsons)   # total number of files, used to scale the progress percentage
c = 0            # number of files reached so far
couldnt = []
# Bare print statement: emits an empty line under Python 2.
print
# Build an XML document from each selected JSON file. Only the beginning of
# the loop body is visible in this excerpt.
for afile in jsons:
    c += 1
    # Hand the share of files reached so far, as a percentage, to the progress helper.
    Preprocessing.updateProgressBar('JSONtoXML.py', float(100)*c/l)
    # NOTE(review): the handle returned by open() is never closed explicitly.
    data = json.loads(open(afile, 'r').read())
    # Despite its name, this maps a position (0, 1, ...) to the entry found at
    # that position in data['witnesses'].
    nameToNumber = {number:name for number, name in enumerate(data['witnesses'])}
    # afile[:-4] drops the last four characters (the "json" of the selected
    # files) and keeps the dot, so the output sits beside the input as *.xml.
    with codecs.open(afile[:-4] + 'xml','w') as out:
        doc = minidom.Document()
        # <witnesses> is the document's root element.
        witnessElement = doc.createElement('witnesses')
        doc.appendChild(witnessElement)
        blockc = 0
        for block in data['table']:
            blockc += 1
            blockElement = doc.createElement('block')
            blockElement.setAttributeNode(doc.createAttribute('n'))
            # 'n' is the zero-based position of the block within data['table'].
            blockElement.setAttribute('n', str(blockc-1))
            number = 0
            for token in block:
                tokenElement = doc.createElement('token')
コード例 #3
0
ファイル: Postprocessing.py プロジェクト: obdurodon/CollateOS
    return previous_row[-1]

def isBlank(node):
    """Report whether ``node`` has an empty ``n`` attribute.

    ``node`` only needs a ``getAttribute`` method; the result is True exactly
    when ``getAttribute('n')`` equals the empty string.
    """
    normalized = node.getAttribute('n')
    return normalized == ''

   
# `path` is defined above this excerpt; make it the working directory so the
# relative folder name below is created inside it.
os.chdir(path)
# Always start from an empty 'Postprocessed' folder: wipe an earlier one first.
stale_output = os.path.exists('Postprocessed')
if stale_output:
    shutil.rmtree('Postprocessed')
os.mkdir('Postprocessed')

# Bare print statement: emits an empty line under Python 2.
print

# Examine each XML file for tokens with an empty 'n' attribute. Only the
# beginning of the loop body is visible in this excerpt.
for afile in xmls:
    c += 1
    # Hand the share of files reached so far, as a percentage, to the progress helper.
    Preprocessing.updateProgressBar('Postprocessing.py', float(100)*c/x)
    # NOTE(review): the script has already changed directory into `path`; if
    # `path` is relative, this join is resolved against the new cwd - confirm.
    doc = minidom.parse(os.path.join(path, afile))
    blocks = doc.getElementsByTagName('block')
    tokens = doc.getElementsByTagName('token')
    # Tokens whose 'n' attribute is empty (the same test isBlank() performs).
    blanks = [token for token in tokens if token.getAttribute('n') == '']
    # Files without any blank token need none of the work below.
    if blanks:
        #generate dictionary of witness to its token nodes for each row
        # The witness names are read from the tokens of the first block.
        column1Toks = blocks[0].getElementsByTagName('token')
        wit2toks = {}
        for token in column1Toks:
            wit = token.getAttribute('witness')
            # Every <token> element (nodeType 1 is ELEMENT_NODE) in the whole
            # document that belongs to this witness. Under Python 2 the
            # comprehension rebinds the loop variable `token`; `wit` has
            # already been read at that point.
            row = [token for token in doc.getElementsByTagName('token') if token.nodeType == 1 and token.getAttribute('witness') == wit]
            wit2toks[wit] = row
        for (wit, row) in wit2toks.items():
            #generate list of lists of sequences of empty tokens
            fin = []
コード例 #4
0
ファイル: collateOS_1.0.py プロジェクト: obdurodon/CollateOS
            if textNodeValue != '-':
                normalizedAttrValue = token[0]['n']
            else:
                textNodeValue = ''
                normalizedAttrValue = ''
            tokenElement.appendChild(doc.createTextNode(textNodeValue))
            tokenElement.setAttribute('n', normalizedAttrValue)
            tokenElement.setAttribute('u', unitValue)
            tokenElement.setAttribute('witness', nameToNumber[number])
            blockElement.appendChild(tokenElement)
        number += 1
        line.appendChild(blockElement)
    return pseudoPrettyPrint(normalChars(line.toprettyxml().encode('utf-8')))

if os.path.exists('output.xml'):
    os.remove('output.xml')
with codecs.open('output.xml', 'a') as out:
    out.write('<collationOutput>\n')
    for app in apps:
        c += 1
        Preprocessing.updateProgressBar('Collation', float(100)*c/l)
        collationResults = collate_pretokenized_json(createJsonRepresentation(app), 'json')
        out.write(processColumn(collationResults, getUnit(app)))
        if c % FLUSH == 0:
            Preprocessing.updateProgressBar('Collation', float(100)*c/l, True)
            gc.collect()

    out.write('</collationOutput>')

print '\nTook', datetime.datetime.now() - startTime, 'to execute.'