# xml2bar_extractor.py
import argparse
import json
import os
import sys
import time
import multiprocessing
import re
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from lxml import etree
def _str_to_bool(value):
    """Convert a command-line token into a real boolean.

    ``type=bool`` cannot be used with argparse for this purpose: it calls
    ``bool()`` on the raw string, so every non-empty value (including
    "False" and "0") would be parsed as True.

    :param value: the raw command-line string (or an already parsed bool).
    :return: True or False.
    :raises argparse.ArgumentTypeError: when the token is not a recognised
        boolean spelling.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False because bool('') was False before.
    if normalized in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value (true/false), got '%s'" % value)


# Command-line interface. All arguments are positional and are consumed in
# the order in which they are declared below; each one is declared once.
parser = argparse.ArgumentParser(
    description='This script is used for converting a xml file into character delimited file like csv.')
parser.add_argument(
    'input_file', help='The input xml file that needed to parsed. Ex: data.xml', type=str)
parser.add_argument(
    'template_file', help='This file have to contains all the extraction logic in a JSON file. Ex: abc.json', type=str)
parser.add_argument(
    'output_file', help='The output file location. Ex: abc.csv', type=str)
# NOTE: this positional is required (another required positional follows
# it), so argparse never falls back to the default given here.
parser.add_argument('delimiter',
                    type=str,
                    help='Delimiter character that will be used to delimit values. (Default = |) Ex: | or ,',
                    default='|')
parser.add_argument('wrapper_tag', help='This tag will used to wrap the whole xml output file.', type=str)
# The two trailing flags may be omitted (nargs='?'), which is what makes the
# defaults advertised in their help texts actually take effect.
parser.add_argument('is_multiprocessing',
                    nargs='?',
                    help='Parallel processing of the xml elements. Will result in unordered result. (Default=True)',
                    type=_str_to_bool,
                    default=True)
parser.add_argument('whole_element',
                    nargs='?',
                    help='If True, the whole element tag will be outputted, instead of just the tag that is defined in the template file. ',
                    type=_str_to_bool,
                    default=False)
args = parser.parse_args()
def validate_arguments():
    """Resolve and validate the parsed command-line arguments.

    Reads the module-level ``args`` namespace. Relative paths are anchored at
    the current working directory. The input and template files must exist;
    the directory that will hold the output file is created when missing.

    :return: a tuple ``(input_file, template_file, output_file, delimiter,
        is_multiprocessing, wrapper_tag, is_whole_element)`` in which the
        three paths are absolute.
    :raises SystemExit: with status 1 when the input file or the template
        file does not exist.
    """
    def _absolute(path):
        # Anchor relative paths at the current working directory.
        if os.path.isabs(path):
            return path
        return os.path.realpath('.') + os.path.sep + path

    input_file = _absolute(args.input_file)
    if not os.path.isfile(input_file):
        sys.stderr.write("ERROR: The input file '" +
                         input_file + "' does not exist!\n")
        sys.exit(1)

    template_file = _absolute(args.template_file)
    if not os.path.isfile(template_file):
        sys.stderr.write("ERROR: The template file '" +
                         template_file + "' does not exist!\n")
        sys.exit(1)

    output_file = _absolute(args.output_file)
    output_dir = os.path.dirname(output_file)
    if not os.path.exists(output_dir):
        sys.stdout.write("WARNING: The output directory '" +
                         output_dir + "' does not exist!\nCreating directories...")
        # Create the parent directory only. Creating the output path itself
        # would turn it into a directory and make opening it as a file fail.
        os.makedirs(output_dir)

    return (input_file, template_file, output_file, args.delimiter,
            args.is_multiprocessing, args.wrapper_tag, args.whole_element)
def write_line(line_no, element, values_to_write):
    """Serialize the nodes selected by a list of XPath expressions.

    :param line_no: the template key this list belongs to. It is not used by
        the function body and is kept so existing callers keep working.
    :param element: object whose ``xpath()`` method is called with each
        expression (callers in this file pass lxml elements).
    :param values_to_write: iterable of XPath expressions; falsy entries
        contribute an empty string.
    :return: the concatenated ``etree.tostring`` serializations of every
        selected node, as text.
    """
    fragments = []
    for expression in values_to_write:
        if not expression:
            fragments.append("")
            continue
        for node in element.xpath(expression):
            serialized = etree.tostring(node)
            # On Python 3 tostring() returns bytes, which cannot be joined
            # with text. On Python 2 ``bytes`` is ``str`` and nothing changes.
            if isinstance(serialized, bytes) and not isinstance(serialized, str):
                serialized = serialized.decode("utf-8")
            fragments.append(serialized)
    return "".join(fragments)
def process_element(elem, path, is_whole_element):
    """Walk the template ``path`` over ``elem`` and collect the output text.

    :param elem: a single element, or a list of elements (as returned by an
        ``xpath()`` call); each member of a list is processed with the same
        ``path``.
    :param path: template mapping. A key whose value is a list is handed,
        together with the list, to ``write_line``. A key whose value is a
        dict is evaluated with ``elem.xpath(key)`` and the matches are
        processed recursively with that dict.
    :param is_whole_element: when true, stop scanning the template keys as
        soon as any output has been produced.
    :return: the concatenated output text ("" when nothing matched).
    """
    output_line = ""
    if isinstance(elem, list):
        for sub_elem in elem:
            output_line += process_element(sub_elem, path, is_whole_element)
        return output_line

    for key, val in path.items():
        if isinstance(val, list):
            output_line += write_line(key, elem, val)
        elif isinstance(val, dict):
            output_line += process_element(elem.xpath(key), val, is_whole_element)
        # In whole-element mode the caller only needs to know that something
        # matched, so the remaining template keys can be skipped.
        if is_whole_element and output_line:
            break
    return output_line
def start_processing_element(params):
    """Process one serialized element; usable as a worker-pool task.

    :param params: tuple ``(elem_string, d, is_whole_element)``: the
        serialized element (bytes or text), the template mapping to apply to
        it, and the whole-element flag.
    :return: in whole-element mode, the serialized element (with its
        ``xmlns="..."`` declarations removed) when the template produced any
        output; otherwise the text produced by ``process_element``.
    """
    elem_string, d, is_whole_element = params
    # etree.tostring() yields bytes on Python 3, but the str regex below and
    # the text-mode output need text. On Python 2 ``bytes`` is ``str``.
    if isinstance(elem_string, bytes) and not isinstance(elem_string, str):
        elem_string = elem_string.decode("utf-8")
    # Drop default-namespace declarations so the template's XPath
    # expressions can be written without namespace prefixes.
    elem_string = re.sub(r"xmlns=\".*?\"", "", elem_string)
    tree = etree.fromstring(elem_string)
    result = process_element(tree, d, is_whole_element)
    if is_whole_element and result:
        return elem_string
    return result
def element_generator(input_file, template, root_tag, is_whole_element):
    """Stream the ``root_tag`` elements of ``input_file`` as work items.

    :param input_file: path of the XML file to parse incrementally.
    :param template: template mapping; the entry stored under ``root_tag``
        is attached to every work item.
    :param root_tag: local name of the elements to extract; it is matched in
        any namespace.
    :param is_whole_element: flag passed through unchanged in every item.
    :return: generator of ``(serialized_element, sub_template,
        is_whole_element)`` tuples.
    """
    root_tag_with_namespace = "{*}" + root_tag
    sub_template = template.get(root_tag)
    for event, elem in etree.iterparse(input_file, tag=root_tag_with_namespace):
        yield (etree.tostring(elem), sub_template, is_whole_element)
        # Free the element's content, then detach the already-processed
        # siblings before it. clear() alone leaves an empty shell per record
        # attached to the parent, so memory would grow with record count.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
if __name__ == '__main__':
    # Script entry point: load the template, stream the input elements
    # through the extractor (optionally in a worker pool) and write the
    # wrapped result to the output file.
    start_time = time.time()
    (input_file, template_file, output_file, delimiter, is_multiprocessing,
     wrapper_tag, is_whole_element) = validate_arguments()
    counter = 0

    # Key order is preserved; the first key is used as the root tag below.
    try:
        with open(template_file) as template_handle:
            template = json.load(template_handle, object_pairs_hook=OrderedDict)
    except Exception as e:
        # str(e) works on Python 2 and 3; exceptions have no ``message``
        # attribute on Python 3.
        sys.stderr.write("ERROR: Invalid JSON template file! " + str(e) + "\n")
        sys.exit(1)

    if not isinstance(template, dict) or not template:
        sys.stderr.write("ERROR: Invalid JSON template file! "
                         "Expected a non-empty JSON object.\n")
        sys.exit(1)

    root_tag = next(iter(template))
    elements = element_generator(input_file, template, root_tag, is_whole_element)

    # Only start worker processes when parallel processing was requested.
    pool = multiprocessing.Pool() if is_multiprocessing else None
    try:
        if pool is not None:
            # Results arrive in completion order, not document order.
            results = pool.imap_unordered(start_processing_element,
                                          elements,
                                          chunksize=100)
        else:
            results = (start_processing_element(params) for params in elements)

        with open(output_file, 'w') as f:
            f.write("<" + wrapper_tag + ">\n")
            for result in results:
                if result and not is_whole_element:
                    f.write("<" + root_tag + ">\n")
                    f.write(result + "\n")
                    f.write("</" + root_tag + ">\n")
                else:
                    f.write(result)
                counter += 1
            f.write("</" + wrapper_tag + ">\n")
    finally:
        if pool is not None:
            pool.close()
            pool.join()

    sys.stdout.write("Total Account Extracted: " + str(counter))
    print("\n--- %s seconds ---" % (time.time() - start_time))