Ejemplo n.º 1
0
def test__parse(query, target_output_filename, hints_for_manual=None):
    """Run ``query`` through the CLI parser and compare it with a stored file.

    The parser output and the content of ``target_output_filename`` (resolved
    against ``constants.base_filepath``) are both passed through
    ``XMLUtil.compress_xml`` before being compared. A mismatch does not raise:
    the assertion error, the file content and the parser output are printed
    instead.

    Args:
        query: Query handed to ``CommandLineInterface.parse``; it is also
            concatenated into the mismatch report.
        target_output_filename: Name of the file holding the expected output,
            appended to ``constants.base_filepath``.
        hints_for_manual: Optional iterable of hints printed before the query
            is parsed. ``None`` (the default) means no hints.
    """
    # ``None`` replaces the former ``[]`` default: a mutable default object is
    # created once and shared by every call.
    if hints_for_manual is None:
        hints_for_manual = []

    for hint in hints_for_manual:
        print(hint)
    out = CommandLineInterface.parse(query)
    with open(constants.base_filepath + target_output_filename) as file:
        file_content = file.read()

    try:
        # Both sides are compressed first, so XML formatting is not compared
        # for now.
        assert (XMLUtil.compress_xml(out) == XMLUtil.compress_xml(file_content))
    except AssertionError as ae:
        # The mismatch is reported on stdout and deliberately not re-raised.
        print(ae)
        print("content of " + target_output_filename + " was:")
        print(file_content)
        print("output of query " + query + " was:")
        print(out)
Ejemplo n.º 2
0
def extract_template_data_from_xml(template, xml):
    """Extract the data described by ``template`` from ``xml``.

    Every item obtained by iterating ``template`` is treated as a repeating
    structure: its tag is used to look up a common parent in ``xml`` via
    ``XMLUtil.find_first_common_parent`` and the pair is handed to
    ``extract_from_xml``.

    Returns:
        The single extraction result when the template yields exactly one
        structure. Otherwise a new element that copies the tag, text and tail
        of the top-level common parent and has every extraction result
        appended to it.

    Raises:
        Whatever ``get_top_level_common_parent`` raises; it is called before
        any extraction takes place.
    """
    top_parent = get_top_level_common_parent(template, xml)

    # A template can contain multiple repeating structures.
    # (only in debugging mode): the original author observed a weird exception
    # when the output of the extraction call was first stored in a variable,
    # so the result is fed straight into the list here as well.
    extracted_parts = [
        extract_from_xml(
            structure,
            XMLUtil.find_first_common_parent(xml, structure.tag),
        )
        for structure in template
    ]

    wrapper = ET.Element(top_parent.tag)
    wrapper.text = top_parent.text
    wrapper.tail = top_parent.tail

    # With a single structure the top-level parent is that structure's common
    # parent, so the only extraction result can be returned as is.
    if len(extracted_parts) == 1:
        return extracted_parts[0]

    wrapper.extend(extracted_parts)
    return wrapper
Ejemplo n.º 3
0
def call_filter(args):
    """Filter an XML file with one condition and return the result as text.

    Args:
        args: Exactly six items, in order: input file, output file,
            candidate, field, comparison and value. Both file names are
            appended to ``constants.base_filepath``. Passing the string
            ``'None'`` as output file skips writing.

    Returns:
        The filtered XML as produced by ``XMLUtil.xml_to_string``.

    Raises:
        IncorrectArgumentNumberException: If ``args`` does not hold six items.
    """
    if len(args) != 6:
        raise IncorrectArgumentNumberException(6, args)

    # If you want to add comparison functions, take a look at
    # src/Structures.py, under the 'get_comp_function_from_string' function.
    input_file, output_file, candidate, field, comp, value = args
    condition = makeConditionalTuple(candidate, field, comp, value)

    tree = ET.ElementTree()
    tree.parse(constants.base_filepath + input_file)

    filtered_root = XMLFilter.filter_xml_tree([condition], tree.getroot())
    out = XMLUtil.xml_to_string(filtered_root)

    write_requested = output_file != 'None'
    if write_requested:
        with open(constants.base_filepath + output_file, 'w') as handle:
            handle.write(out)

    return out
Ejemplo n.º 4
0
def get_top_level_common_parent(template, xml):
    """Return the one common parent shared by all top-level template items.

    For every item obtained by iterating ``template`` the result of
    ``XMLUtil.find_first_common_parent(xml, item.tag)`` is collected in a set.

    Returns:
        The only member of that set.

    Raises:
        NoCommonParentInTemplateTopLevelException: If the set does not hold
            exactly one parent; the list of template items is passed along.
    """
    parents = {
        XMLUtil.find_first_common_parent(xml, entry.tag)
        for entry in template
    }

    if len(parents) != 1:
        raise NoCommonParentInTemplateTopLevelException(list(template))

    (sole_parent,) = parents
    return sole_parent
Ejemplo n.º 5
0
def test__filter_xml_tree(in_file, out_file, list_condition_tuples):
    """Filter the XML stored in ``in_file`` and write the result to ``out_file``.

    Both file names are appended to ``constants.base_filepath``. The root of
    the parsed input is passed to ``filter_xml_tree`` together with
    ``list_condition_tuples``; the result is serialised with
    ``XMLUtil.xml_to_string`` and written out. Nothing is returned.
    """
    root = ET.parse(constants.base_filepath + in_file).getroot()

    filtered = filter_xml_tree(list_condition_tuples, root)
    serialized = XMLUtil.xml_to_string(filtered)

    with open(constants.base_filepath + out_file, 'w') as handle:
        handle.write(serialized)
Ejemplo n.º 6
0
def apply(xml, post_process):
    """Apply one post-processing step to ``xml``.

    The step is selected by ``post_process.tag``:

    * ``'filter'`` - ``post_process.text`` is split with
      ``post_processing_string_splitter`` and passed to ``apply_filter``.
    * ``'text_formatting'`` - ``'compress'`` and ``'indent'`` dispatch to the
      matching ``XMLUtil`` helper; any other text yields ``None``.
    * ``'html_entitize'`` - the first element matching
      ``'.//{*}' + post_process.text`` has its single child serialised, passed
      through ``html_entitize`` and stored as the element's text; the child is
      then removed and ``xml`` is returned.

    Raises:
        XMLAnalyzerException.TooManyChildrenException: If the element to
            entitize has more than one child.
        XMLAnalyzerException.InvalidPostProcessTagException: If the tag is
            none of the three above.
    """
    step = post_process.tag

    if step == 'filter':
        return apply_filter(
            xml, post_processing_string_splitter(post_process.text))

    if step == 'text_formatting':
        mode = post_process.text
        if mode == 'compress':
            return XMLUtil.compress_xml(xml)
        if mode == 'indent':
            return XMLUtil.indent_xml(xml)
        # Unrecognised formatting modes fall through without a result.
        return None

    if step == 'html_entitize':
        target = xml.find('.//{*}' + post_process.text)
        if len(target) > 1:
            child_tags = [child.tag for child in target]
            raise XMLAnalyzerException.TooManyChildrenException(
                target.tag, child_tags, 1)

        only_child = target[0]
        serialized_child = ET.tostring(only_child).decode()
        target.text = html_entitize(serialized_child)
        # The child now lives on as entitized text, so drop the element.
        target.remove(only_child)
        return xml

    raise XMLAnalyzerException.InvalidPostProcessTagException(step)
Ejemplo n.º 7
0
def call_extraction(args):
    """Extract template data from an XML file and return it as text.

    The input is parsed, run through the template's pre-processing queue,
    extracted with ``XMLExtractor.extract_template_data_from_xml``, run
    through the template's post-processing queue and serialised with
    ``XMLUtil.xml_to_string``.

    Args:
        args: Exactly three items, in order: input file, template name and
            output file. Both file names are appended to
            ``constants.base_filepath``. Passing the string ``'None'`` as
            output file skips writing.

    Returns:
        The serialised extraction result.

    Raises:
        IncorrectArgumentNumberException: If ``args`` does not hold three
            items.
    """
    if len(args) != 3:
        raise IncorrectArgumentNumberException(3, args)
    input_file, template_name, output_file = args

    with open(constants.base_filepath + input_file) as file:
        data = file.read()

    try:
        xml_tree = ET.fromstring(data)
    except (ET.XMLSyntaxError, ValueError):
        # Retry once with every single-line ``<?...?>`` construct removed.
        # NOTE(review): presumably this gets past an XML declaration that the
        # parser rejects for str input - confirm.
        data = re.sub(r'<\?.*?\?>', '', data)
        xml_tree = ET.fromstring(data)

    template = XMLUtil.Template(template_name)

    pre_processed_xml_tree = PreProcessing.apply_all(
        xml_tree, template.pre_process_queue)

    extracted_xml = XMLExtractor.extract_template_data_from_xml(
        template.get_template(), pre_processed_xml_tree)

    extracted_xml = PostProcessing.apply_all(extracted_xml,
                                             template.post_process_queue)

    out = XMLUtil.xml_to_string(extracted_xml)

    if output_file != 'None':
        with open(constants.base_filepath + output_file, 'w') as file:
            file.write(out)

    return out
Ejemplo n.º 8
0
def filter_xml_tree(conditions, xml):
    """Apply every condition to ``xml`` and return ``xml``.

    For each condition the first common parent of ``cond.candidate`` is looked
    up with ``XMLUtil.find_first_common_parent`` and ``filter_xml`` is called
    once per direct child of that parent.

    Args:
        conditions: A list of ``ConditionalTuple`` objects, or a single
            ``ConditionalTuple`` (which is wrapped in a list).
        xml: The element to filter.

    Returns:
        The same ``xml`` object that was passed in.
    """
    # A list of ConditionalTuple is expected; a lone one is wrapped so the
    # loop below treats it as a single condition. isinstance (rather than an
    # exact type comparison) also covers subclasses of ConditionalTuple.
    if isinstance(conditions, ConditionalTuple):
        conditions = [conditions]

    for cond in conditions:
        top_level = XMLUtil.find_first_common_parent(xml, cond.candidate)
        comp_func = make_cond(cond)

        # Iterate over a snapshot of the children.
        # NOTE(review): presumably filter_xml may remove children from
        # top_level - confirm.
        for child in list(top_level):
            filter_xml(cond, comp_func, child, top_level)

    return xml