def test__parse(query, target_output_filename, hints_for_manual=None):
    """Run ``query`` through the CLI parser and compare it with a stored file.

    The parser output and the content of ``target_output_filename`` (resolved
    against ``constants.base_filepath``) are both passed through
    ``XMLUtil.compress_xml`` before being compared, so for now the XML
    formatting is not looked at.

    A mismatch does not propagate: the ``AssertionError`` is caught and both
    sides of the comparison are printed for manual inspection.

    Args:
        query: Query string handed to ``CommandLineInterface.parse``.
        target_output_filename: Name of the file holding the expected output,
            relative to ``constants.base_filepath``.
        hints_for_manual: Optional iterable of hints printed before the
            comparison runs. Defaults to no hints.
    """
    # ``None`` replaces the former ``[]`` default: a mutable default object is
    # created once and shared by every call.
    if hints_for_manual is None:
        hints_for_manual = ()
    for hint in hints_for_manual:
        print(hint)

    out = CommandLineInterface.parse(query)
    with open(constants.base_filepath + target_output_filename) as file:
        file_content = file.read()

    try:
        # for now we wont look at xml format
        assert (XMLUtil.compress_xml(out) == XMLUtil.compress_xml(file_content))
    except AssertionError as ae:
        # NOTE(review): the failure is only printed, so this check can never
        # fail a test run - confirm that this is intended.
        print(ae)
        print("content of " + target_output_filename + " was:")
        print(file_content)
        print("output of query " + query + " was:")
        print(out)
def extract_template_data_from_xml(template, xml):
    """Extract the data of every repeating structure of ``template`` from ``xml``.

    Args:
        template: Iterable of repeating structures; each one exposes a ``tag``.
        xml: XML element the data is extracted from.

    Returns:
        The single extraction result when the template yields exactly one,
        otherwise a new element that copies ``tag``, ``text`` and ``tail``
        from the top-level common parent and has every extraction result
        appended to it.

    Raises:
        Whatever ``get_top_level_common_parent`` raises when the template's
        structures do not share one common parent.
    """
    shared_parent = get_top_level_common_parent(template, xml)

    # A template can contain multiple repeating structures; each one is
    # extracted relative to its own first common parent inside ``xml``.
    # NOTE (kept from the original author): only in debugging mode, a weird
    # exception was observed when the extraction result was stored in a
    # variable; printing it, for example, raised nothing.
    extracted_parts = [
        extract_from_xml(
            structure,
            XMLUtil.find_first_common_parent(xml, structure.tag))
        for structure in template
    ]

    # Shell element mirroring the shared parent; only used when several
    # parts have to be returned together.
    wrapper = ET.Element(shared_parent.tag)
    wrapper.text = shared_parent.text
    wrapper.tail = shared_parent.tail

    # With a single part the shared parent is that part's common parent,
    # therefore the only extracted content can be returned as it is.
    if len(extracted_parts) == 1:
        return extracted_parts[0]

    for part in extracted_parts:
        wrapper.append(part)
    return wrapper
def call_filter(args):
    """Filter an XML file by one condition and optionally save the result.

    Args:
        args: Sequence of exactly six items, in this order: input file name,
            output file name, candidate, field, comparison and value. File
            names are resolved against ``constants.base_filepath``. The
            output file name ``'None'`` means nothing is written.

    Returns:
        The filtered XML as produced by ``XMLUtil.xml_to_string``.

    Raises:
        IncorrectArgumentNumberException: If ``args`` does not hold six items.
    """
    if len(args) != 6:
        raise IncorrectArgumentNumberException(6, args)

    # If you want to add comparison functions, take a look at
    # src/Structures.py, under the 'get_comp_function_from_string' function.
    source_name, target_name, candidate, field, comp, value = args
    condition = makeConditionalTuple(candidate, field, comp, value)

    tree = ET.ElementTree()
    tree.parse(constants.base_filepath + source_name)

    filtered_root = XMLFilter.filter_xml_tree([condition], tree.getroot())
    serialized = XMLUtil.xml_to_string(filtered_root)

    # The literal string 'None' is the "do not write a file" marker.
    if target_name != 'None':
        with open(constants.base_filepath + target_name, 'w') as handle:
            handle.write(serialized)

    return serialized
def get_top_level_common_parent(template, xml):
    """Return the one parent shared by all top-level entries of ``template``.

    For every entry of ``template`` the first common parent of its ``tag`` in
    ``xml`` is looked up with ``XMLUtil.find_first_common_parent``; the
    lookups must all yield the same result.

    Args:
        template: Iterable of entries exposing a ``tag``.
        xml: XML element that is searched.

    Returns:
        The single distinct parent found.

    Raises:
        NoCommonParentInTemplateTopLevelException: If the lookups yield zero
            or more than one distinct parent. The exception receives the list
            of template entries.
    """
    distinct_parents = {
        XMLUtil.find_first_common_parent(xml, entry.tag)
        for entry in template
    }

    if len(distinct_parents) != 1:
        raise NoCommonParentInTemplateTopLevelException(list(template))

    (only_parent,) = distinct_parents
    return only_parent
def test__filter_xml_tree(in_file, out_file, list_condition_tuples):
    """Filter the XML stored in ``in_file`` and write the result to ``out_file``.

    Both file names are resolved against ``constants.base_filepath``. Nothing
    is asserted here; the filtered XML is only written out.

    Args:
        in_file: Name of the XML file to parse.
        out_file: Name of the file that receives the serialized result.
        list_condition_tuples: Conditions handed to ``filter_xml_tree``.
    """
    root = ET.parse(constants.base_filepath + in_file).getroot()
    serialized = XMLUtil.xml_to_string(
        filter_xml_tree(list_condition_tuples, root))

    with open(constants.base_filepath + out_file, 'w') as handle:
        handle.write(serialized)
def apply(xml, post_process):
    """Apply a single post-processing step to ``xml``.

    The step is selected by ``post_process.tag``:

    * ``'filter'``: ``post_process.text`` is split with
      ``post_processing_string_splitter`` and passed to ``apply_filter``.
    * ``'text_formatting'``: ``'compress'`` and ``'indent'`` delegate to the
      matching ``XMLUtil`` helper; any other text yields ``None``.
    * ``'html_entitize'``: the first element matching ``post_process.text``
      (any namespace) has its only child serialized, passed through
      ``html_entitize`` and stored as the element's text; the child is then
      removed.

    Args:
        xml: The XML to post-process.
        post_process: Object exposing ``tag`` and ``text``.

    Returns:
        The result of the selected step (``xml`` itself for
        ``'html_entitize'``).

    Raises:
        XMLAnalyzerException.TooManyChildrenException: If the element to
            entitize has more than one child.
        XMLAnalyzerException.InvalidPostProcessTagException: If
            ``post_process.tag`` is none of the supported steps.
    """
    step = post_process.tag

    if step == 'filter':
        filter_args = post_processing_string_splitter(post_process.text)
        return apply_filter(xml, filter_args)

    if step == 'text_formatting':
        mode = post_process.text
        if mode == 'compress':
            return XMLUtil.compress_xml(xml)
        if mode == 'indent':
            return XMLUtil.indent_xml(xml)
        # An unrecognised formatting mode produces no result at all.
        return None

    if step == 'html_entitize':
        target = xml.find('.//{*}' + post_process.text)
        if len(target) > 1:
            raise XMLAnalyzerException.TooManyChildrenException(
                target.tag, [child.tag for child in target], 1)
        only_child = target[0]
        target.text = html_entitize(ET.tostring(only_child).decode())
        target.remove(only_child)  # the child now lives on as escaped text
        return xml

    raise XMLAnalyzerException.InvalidPostProcessTagException(step)
def call_extraction(args):
    """Extract template data from an XML file and optionally save the result.

    The input file is read, parsed, run through the template's
    pre-processing queue, extracted with the template, run through the
    template's post-processing queue and finally serialized.

    Args:
        args: Sequence of exactly three items, in this order: input file
            name, template name and output file name. File names are
            resolved against ``constants.base_filepath``. The output file
            name ``'None'`` means nothing is written.

    Returns:
        The extracted XML as produced by ``XMLUtil.xml_to_string``.

    Raises:
        IncorrectArgumentNumberException: If ``args`` does not hold three
            items.
    """
    if len(args) != 3:
        raise IncorrectArgumentNumberException(3, args)
    input_file, template_name, output_file = args

    with open(constants.base_filepath + input_file) as file:
        data = file.read()

    try:
        xml_tree = ET.fromstring(data)
    except (ET.XMLSyntaxError, ValueError):
        # Retry once with every '<?...?>' construct stripped from the text.
        # NOTE(review): presumably this targets the XML declaration, which
        # the parser can reject for already-decoded text - confirm. Other
        # processing instructions on a single line are removed as well.
        data = re.sub(r'<\?.*?\?>', '', data)
        xml_tree = ET.fromstring(data)

    template = XMLUtil.Template(template_name)
    pre_processed_xml_tree = PreProcessing.apply_all(
        xml_tree, template.pre_process_queue)
    extracted_xml = XMLExtractor.extract_template_data_from_xml(
        template.get_template(), pre_processed_xml_tree)
    extracted_xml = PostProcessing.apply_all(
        extracted_xml, template.post_process_queue)
    out = XMLUtil.xml_to_string(extracted_xml)

    # The literal string 'None' is the "do not write a file" marker.
    if output_file != 'None':
        with open(constants.base_filepath + output_file, 'w') as file:
            file.write(out)
    return out
def filter_xml_tree(conditions, xml):
    """Apply every condition in ``conditions`` to ``xml`` and return ``xml``.

    For each condition, the first common parent of ``cond.candidate`` is
    looked up in ``xml`` and ``filter_xml`` is called for each of that
    parent's children.

    Args:
        conditions: A list of ``ConditionalTuple``. A single
            ``ConditionalTuple`` is accepted too and is wrapped in a list.
        xml: XML element to filter.

    Returns:
        The same ``xml`` object that was passed in.
    """
    # Expect a list of ConditionalTuple; if a lone ConditionalTuple arrives
    # it is wrapped in a list. isinstance (instead of an exact type
    # comparison) also wraps instances of ConditionalTuple subclasses.
    if isinstance(conditions, ConditionalTuple):
        conditions = [conditions]

    for cond in conditions:
        top_level = XMLUtil.find_first_common_parent(xml, cond.candidate)
        comp_func = make_cond(cond)
        # Iterate over a snapshot of the children.
        # NOTE(review): presumably filter_xml may remove children from
        # top_level while we loop - confirm.
        for child in list(top_level):
            filter_xml(cond, comp_func, child, top_level)
    return xml