def extract_children(parent, children_xpaths, children_concat, children_range, unmatched_parent_text): concat_values_list, concat_used_nodes = extract_children_concat( parent, children_concat) range_values_list, standalone_values, range_used_nodes = ( extract_children_range(parent, children_range)) used_nodes = concat_used_nodes | range_used_nodes other_child_nodes = [ node for node in match_xpaths(parent, children_xpaths) if node not in used_nodes ] other_child_nodes_excl_parents = exclude_parents(other_child_nodes) text_content_list = filter_truthy( strip_all( get_stripped_text_content_list(other_child_nodes_excl_parents) + concat_values_list + range_values_list)) if len(other_child_nodes_excl_parents) != len(other_child_nodes): other_child_nodes_excl_parents_set = set( other_child_nodes_excl_parents) for child in other_child_nodes: if child not in other_child_nodes_excl_parents_set: text_values = filter_truthy( strip_all(get_immediate_text(child))) text_content_list.extend(text_values) if unmatched_parent_text: value = get_stripped_text_content(parent, exclude=set(other_child_nodes) | used_nodes).strip() if value and value not in text_content_list: text_content_list.append(value) return text_content_list, standalone_values
def extract_children_range(parent, children_range): used_nodes = set() values = [] standalone_values = [] get_logger().debug('children_range: %s', children_range) for range_item in children_range: temp_values, temp_used_nodes = extract_children_source_list( parent, [range_item.get('min'), range_item.get('max')] ) if len(temp_values) == 2: temp_values = strip_all(temp_values) if all(s.isdigit() for s in temp_values): num_values = [int(s) for s in temp_values] range_values = [str(x) for x in range(num_values[0], num_values[1] + 1)] if range_item.get('standalone'): standalone_values.extend(range_values) else: values.extend(range_values) used_nodes |= temp_used_nodes else: get_logger().info('values not integers: %s', temp_values) return values, standalone_values, used_nodes
def parse_includes(includes): return includes and set(strip_all(includes.split(',')))
def xml_root_to_target_annotations(xml_root, xml_mapping): if xml_root.tag not in xml_mapping: raise Exception("unrecognised tag: {} (available: {})".format( xml_root.tag, xml_mapping.sections())) mapping = xml_mapping[xml_root.tag] field_names = [k for k in mapping.keys() if '.' not in k] def get_mapping_flag(k, suffix): return mapping.get(k + suffix) == 'true' def get_match_multiple(k): return get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE) def get_bonding_flag(k): return get_mapping_flag(k, XmlMappingSuffix.BONDING) def get_require_next_flag(k): return get_mapping_flag(k, XmlMappingSuffix.REQUIRE_NEXT) get_unmatched_parent_text_flag = ( lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT)) get_logger().debug('fields: %s', field_names) target_annotations_with_pos = [] xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())} for k in field_names: match_multiple = get_match_multiple(k) bonding = get_bonding_flag(k) require_next = get_require_next_flag(k) unmatched_parent_text = get_unmatched_parent_text_flag(k) children_xpaths = parse_xpaths( mapping.get(k + XmlMappingSuffix.CHILDREN)) children_concat = parse_json_with_default( mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), []) children_range = parse_json_with_default( mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), []) re_compiled_pattern = re_compile_or_none( mapping.get(k + XmlMappingSuffix.REGEX)) extract_re_compiled_pattern = re_compile_or_none( mapping.get(k + XmlMappingSuffix.EXTRACT_REGEX)) get_logger().debug('extract_re_compiled_pattern (%s): %s', k, extract_re_compiled_pattern) priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0')) sub_xpaths = get_sub_mapping(mapping, k) get_logger().debug('sub_xpaths (%s): %s', k, sub_xpaths) xpaths = parse_xpaths(mapping[k]) get_logger().debug('xpaths(%s): %s', k, xpaths) for e in match_xpaths(xml_root, xpaths): e_pos = xml_pos_by_node.get(e) sub_annotations = extract_sub_annotations(e, sub_xpaths, mapping, k) get_logger().debug('sub_annotations (%s): %s', k, sub_annotations) if children_xpaths: text_content_list, standalone_values = extract_children( e, children_xpaths, children_concat, children_range, unmatched_parent_text) else: text_content_list = filter_truthy( strip_all([get_stripped_text_content(e)])) standalone_values = [] if re_compiled_pattern: text_content_list = filter_truthy([ apply_pattern(s, re_compiled_pattern) for s in text_content_list ]) if extract_re_compiled_pattern: text_content_list = filter_truthy([ extract_using_regex(s, extract_re_compiled_pattern) for s in text_content_list ]) text_content_list = flatten_if_nested(text_content_list) if text_content_list: value = (text_content_list[0] if len(text_content_list) == 1 else sorted(text_content_list, key=lambda s: -len(s))) target_annotations_with_pos.append( ((-priority, e_pos), TargetAnnotation(value, k, match_multiple=match_multiple, bonding=bonding, require_next=require_next, sub_annotations=sub_annotations))) if standalone_values: for i, standalone_value in enumerate(standalone_values): target_annotations_with_pos.append( ((-priority, e_pos, i), TargetAnnotation(standalone_value, k, match_multiple=match_multiple, bonding=bonding, sub_annotations=sub_annotations))) target_annotations_with_pos = sorted(target_annotations_with_pos, key=lambda x: x[0]) get_logger().debug('target_annotations_with_pos:\n%s', target_annotations_with_pos) target_annotations = [x[1] for x in target_annotations_with_pos] get_logger().debug('target_annotations:\n%s', '\n'.join([' ' + str(a) for a in target_annotations])) return target_annotations
def parse_xpaths(s): return strip_all(s.strip().split('\n')) if s else None
def xml_root_to_target_annotations(xml_root, xml_mapping): if xml_root.tag not in xml_mapping: raise Exception("unrecognised tag: {} (available: {})".format( xml_root.tag, xml_mapping.sections())) mapping = xml_mapping[xml_root.tag] field_names = [k for k in mapping.keys() if '.' not in k] def get_mapping_flag(k, suffix): return mapping.get(k + suffix) == 'true' def get_match_multiple(k): return get_mapping_flag(k, XmlMappingSuffix.MATCH_MULTIPLE) def get_bonding_flag(k): return get_mapping_flag(k, XmlMappingSuffix.BONDING) def get_require_next_flag(k): return get_mapping_flag(k, XmlMappingSuffix.REQUIRE_NEXT) get_unmatched_parent_text_flag = ( lambda k: get_mapping_flag(k, XmlMappingSuffix.UNMATCHED_PARENT_TEXT)) LOGGER.debug('fields: %s', field_names) target_annotations_with_pos = [] xml_pos_by_node = {node: i for i, node in enumerate(xml_root.iter())} for k in field_names: match_multiple = get_match_multiple(k) bonding = get_bonding_flag(k) require_next = get_require_next_flag(k) unmatched_parent_text = get_unmatched_parent_text_flag(k) exclude_children_xpaths = parse_xpaths( mapping.get(k + XmlMappingSuffix.IGNORE)) LOGGER.debug('exclude_children_xpaths (%s): %s', k, exclude_children_xpaths) children_xpaths = parse_xpaths( mapping.get(k + XmlMappingSuffix.CHILDREN)) children_concat = parse_json_with_default( mapping.get(k + XmlMappingSuffix.CHILDREN_CONCAT), []) children_range = parse_json_with_default( mapping.get(k + XmlMappingSuffix.CHILDREN_RANGE), []) re_compiled_pattern = re_compile_or_none( mapping.get(k + XmlMappingSuffix.REGEX)) extract_re_compiled_pattern = re_compile_or_none( mapping.get(k + XmlMappingSuffix.EXTRACT_REGEX)) LOGGER.debug('extract_re_compiled_pattern (%s): %s', k, extract_re_compiled_pattern) priority = int(mapping.get(k + XmlMappingSuffix.PRIORITY, '0')) sub_xpaths = get_sub_mapping(mapping, k) LOGGER.debug('sub_xpaths (%s): %s', k, sub_xpaths) use_raw_text_value = mapping.get(k + XmlMappingSuffix.USE_RAW_TEXT) use_raw_text_config = strtobool( use_raw_text_value) if use_raw_text_value else None xpaths = parse_xpaths(mapping[k]) LOGGER.debug('xpaths(%s): %s', k, xpaths) for e in match_xpaths(xml_root, xpaths): e_pos = xml_pos_by_node.get(e) exclude_childrens = list(match_xpaths(e, exclude_children_xpaths)) LOGGER.debug('exclude_childrens (%s, %s): %s', k, e, exclude_childrens) sub_annotations = extract_sub_annotations(e, sub_xpaths, mapping, k) LOGGER.debug('sub_annotations (%s): %s', k, sub_annotations) use_raw_text = (use_raw_text_config if use_raw_text_config is not None else contains_raw_text(e)) should_use_children_xpaths = ( children_xpaths and (not is_wildcard_children_xpaths(children_xpaths) or not use_raw_text)) if should_use_children_xpaths: text_content_list, standalone_values = extract_children( e, children_xpaths, children_concat, children_range, unmatched_parent_text) else: text_content_list = filter_truthy( strip_all([ get_raw_text_content( e, exclude_childrens=exclude_childrens) ])) standalone_values = [] LOGGER.debug( 'text_content_list: %s, standalone_values: %s,' ' children_xpaths: %s, use_raw_text: %s', text_content_list, standalone_values, children_xpaths, use_raw_text) if re_compiled_pattern: text_content_list = filter_truthy([ apply_pattern(s, re_compiled_pattern) for s in text_content_list ]) if extract_re_compiled_pattern: text_content_list = filter_truthy([ extract_using_regex(s, extract_re_compiled_pattern) for s in text_content_list ]) text_content_list = flatten_if_nested(text_content_list) if text_content_list: value = (text_content_list[0] if len(text_content_list) == 1 else sorted(text_content_list, key=lambda s: -len(s))) target_annotations_with_pos.append( ((-priority, e_pos), TargetAnnotation(value, k, match_multiple=match_multiple, bonding=bonding, require_next=require_next, sub_annotations=sub_annotations))) if standalone_values: for i, standalone_value in enumerate(standalone_values): target_annotations_with_pos.append( ((-priority, e_pos, i), TargetAnnotation(standalone_value, k, match_multiple=match_multiple, bonding=bonding, sub_annotations=sub_annotations))) target_annotations_with_pos = sorted(target_annotations_with_pos, key=lambda x: x[0]) LOGGER.debug('target_annotations_with_pos:\n%s', target_annotations_with_pos) target_annotations = [x[1] for x in target_annotations_with_pos] LOGGER.debug('target_annotations:\n%s', '\n'.join([' ' + str(a) for a in target_annotations])) if not target_annotations and LOGGER.isEnabledFor(logging.DEBUG): LOGGER.debug( 'no target_annotations found for\nxml_mapping: %s\nxml: %s', xml_mapping, etree.tostring(xml_root, encoding='unicode')) return target_annotations