def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Extract data from ``page`` using ``extractors`` over the page-token
    window bounded by ``start_index`` and ``end_index``.

    Returns a ``(pindex, sindex, extracted_data)`` tuple where ``pindex``
    and ``sindex`` bound the page region matched for the first extractor
    and ``extracted_data`` is the (possibly empty) list of extracted items.

    Raises ``MissingRequiredError`` when nothing was extracted but the
    first extractor's annotation marks some attribute as required.
    """
    # NOTE(review): the comment below describes a reordering step
    # (separating nested vs. ignored regions) that this version of the
    # function no longer performs -- the lists are only defaulted and
    # passed through to the recursive calls. Confirm the reorder loop
    # was removed intentionally.
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_extractor, following_extractors = extractors[0], extractors[1:]
    lelem = labelled_element  # short local alias
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    labelled = lelem(first_extractor)
    # Find the page region most similar to the labelled template region.
    score, pindex, sindex = \
        similar_region(page.page_tokens, self.template_tokens,
                       labelled, start_index, end_region, self.best_match,
                       **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Re-locate each ignored region inside the matched span so
            # the extractor can skip over them.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens,
                                         self.template_tokens,
                                         i, start, sindex,
                                         self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # advance past this region; keep `start` if it is open-ended
                    start = e or start
            extracted_data = first_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
        if following_extractors:
            # Continue with the remaining extractors after the matched span.
            _, _, following_data = self._doextract(
                page, following_extractors, sindex or start_index,
                end_index, **kwargs)
            extracted_data += following_data
    elif following_extractors:
        # First extractor did not match: run the rest of the extractors
        # first, then retry the first one in the narrowed window ending
        # just before where they matched.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_extractor], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs)
        extracted_data += following_data
    if (not extracted_data and hasattr(first_extractor, 'annotation') and
            first_extractor.annotation):
        # Nothing was extracted: fail loudly if any annotated attribute
        # is flagged as required.
        annotation = first_extractor.annotation or []
        content = annotation.surrounds_attribute or []
        attributes = annotation.tag_attributes
        attrs = chain(content, *(a for _, a in attributes))
        if (any(isinstance(k, dict) and k.get('required') for k in attrs)):
            raise MissingRequiredError()
    return pindex, sindex, extracted_data
def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Carry out extraction of records using the given annotations
    in the page tokens bounded by start_index and end_index

    Returns a (pindex, sindex, extracted_data) tuple.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    current_extractor, following_extractors = extractors[0], extractors[1:]
    # Peel off extractors that start before the current one ends: an
    # AnnotationTag (or a region contained in the last nested region) is
    # treated as nested; anything else is a region to ignore.
    while (following_extractors and
           _int_cmp(labelled_element(following_extractors[0]).start_index,
                    'lt',
                    labelled_element(current_extractor).end_index)):
        ex = following_extractors.pop(0)
        labelled = labelled_element(ex)
        if (isinstance(labelled, AnnotationTag) or
                (nested_regions and
                 _int_cmp(labelled_element(nested_regions[-1]).start_index,
                          'lt', labelled.start_index) and
                 _int_cmp(labelled.start_index, 'lt',
                          labelled_element(nested_regions[-1]).end_index))):
            nested_regions.append(ex)
        else:
            ignored_regions.append(ex)
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_index_exclusive = None if end_index is None else end_index + 1
    labelled = labelled_element(current_extractor)
    # Find the page region most similar to the labelled template region.
    score, pindex, sindex = \
        similar_region(page.page_tokens, self.template_tokens,
                       labelled, start_index, end_index_exclusive,
                       self.best_match, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Re-locate each ignored region inside the matched span so
            # the extractor can skip over them.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens,
                                         self.template_tokens, i, start,
                                         sindex, self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # advance past this region; keep `start` if open-ended
                    start = e or start
            extracted_data = current_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
            if extracted_data:
                # Wrap variant annotations as (variant_id, data) pairs.
                if current_extractor.annotation.variant_id:
                    extracted_data = [
                        (current_extractor.annotation.variant_id,
                         extracted_data)]
        if nested_regions:
            # Extract the nested regions inside the span just matched.
            _, _, nested_data = self._doextract(page, nested_regions,
                                                pindex, sindex, **kwargs)
            extracted_data += nested_data
        if following_extractors:
            # Continue with the remaining extractors after the matched span.
            _, _, following_data = self._doextract(
                page, following_extractors, sindex or start_index,
                end_index, **kwargs)
            extracted_data += following_data
    elif following_extractors:
        # Current extractor did not match: run the rest of the extractors
        # first, then retry it in the narrowed window ending just before
        # where they matched.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [current_extractor], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs)
        extracted_data += following_data
    elif nested_regions:
        # No match and nothing else to try: still attempt the nested
        # regions over the original window.
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index,
                                            **kwargs)
        extracted_data += nested_data
    return pindex, sindex, extracted_data
def _doextract(self, page, region_elements, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Carry out extraction of records using the given annotations
    in the page tokens bounded by start_index and end_index

    Returns a (pindex, sindex, extracted_data) tuple.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_region, following_regions = region_elements[0], region_elements[1:]
    # Peel off regions that start before the first one ends: an
    # AnnotationTag (or a region contained in the last nested region) is
    # treated as nested; anything else is a region to ignore.
    while following_regions and _labelled(following_regions[0]).start_index \
            < _labelled(first_region).end_index:
        region = following_regions.pop(0)
        labelled = _labelled(region)
        if isinstance(labelled, AnnotationTag) or (nested_regions and \
                _labelled(nested_regions[-1]).start_index < labelled.start_index \
                < _labelled(nested_regions[-1]).end_index):
            nested_regions.append(region)
        else:
            ignored_regions.append(region)
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    labelled = _labelled(first_region)
    # Find the page region most similar to the labelled template region.
    score, pindex, sindex = \
        similar_region(page.page_tokens, self.template_tokens,
                       labelled, start_index, end_region, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Re-locate each ignored region inside the matched span so
            # the extractor can skip over them.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens, self.template_tokens, \
                                         i, start, sindex, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # advance past this region; keep `start` if open-ended
                    start = e or start
            extracted_data = first_region.extract(page, pindex, sindex,
                                                  similar_ignored_regions,
                                                  **kwargs)
            if extracted_data:
                # Wrap variant annotations as (variant_id, data) pairs.
                if first_region.annotation.variant_id:
                    extracted_data = [(first_region.annotation.variant_id,
                                       extracted_data)]
        if nested_regions:
            # Extract the nested regions inside the span just matched.
            _, _, nested_data = self._doextract(page, nested_regions,
                                                pindex, sindex, **kwargs)
            extracted_data += nested_data
        if following_regions:
            # Continue with the remaining regions after the matched span.
            _, _, following_data = self._doextract(page, following_regions,
                                                   sindex or start_index,
                                                   end_index, **kwargs)
            extracted_data += following_data
    elif following_regions:
        # First region did not match: run the remaining regions first,
        # then retry it in the narrowed window ending just before where
        # they matched.
        end_index, _, following_data = self._doextract(page, following_regions,
                                                       start_index, end_index,
                                                       **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_region], start_index, end_index - 1,
                nested_regions, ignored_regions, **kwargs)
        extracted_data += following_data
    elif nested_regions:
        # No match and nothing else to try: still attempt the nested
        # regions over the original window.
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index, **kwargs)
        extracted_data += nested_data
    return pindex, sindex, extracted_data
def _doextract(self, page, extractors, start_index, end_index,
               nested_regions=None, ignored_regions=None, **kwargs):
    """Extract data from ``page`` using ``extractors`` over the page-token
    window bounded by ``start_index`` and ``end_index``.

    Returns a ``(pindex, sindex, extracted_data)`` tuple.

    Raises ``MissingRequiredError`` when the first extractor's annotation
    marks a required attribute whose id is absent from the extracted data.
    """
    # reorder extractors leaving nested ones for the end and separating
    # ignore regions
    nested_regions = nested_regions or []
    ignored_regions = ignored_regions or []
    first_extractor, following_extractors = extractors[0], extractors[1:]
    # Peel off extractors that start before the first one ends: an
    # AnnotationTag (or a region contained in the last nested region) is
    # treated as nested; anything else is a region to ignore.
    while (following_extractors and _int_cmp(
            labelled_element(following_extractors[0]).start_index, 'lt',
            labelled_element(first_extractor).end_index)):
        ex = following_extractors.pop(0)
        labelled = labelled_element(ex)
        if (isinstance(labelled, AnnotationTag) or (nested_regions and
                _int_cmp(labelled_element(nested_regions[-1]).start_index,
                         'lt', labelled.start_index) and
                _int_cmp(labelled.start_index, 'lt',
                         labelled_element(nested_regions[-1]).end_index))):
            nested_regions.append(ex)
        else:
            ignored_regions.append(ex)
    lelem = labelled_element  # short local alias
    extracted_data = []
    # end_index is inclusive, but similar_region treats it as exclusive
    end_region = None if end_index is None else end_index + 1
    # widen the start of the window by one token as well
    start_region = None if start_index is None else start_index - 1
    labelled = lelem(first_extractor)
    try:
        score, pindex, sindex = similar_region(
            page.page_tokens, self.template_tokens, labelled,
            start_region, end_region, self.best_match, **kwargs)
    except IndexError:
        # NOTE(review): the widened bounds can apparently fall outside the
        # token range; retry with the original, unadjusted indices --
        # confirm against similar_region's bounds handling.
        start_region, end_region = start_index, end_index
        score, pindex, sindex = similar_region(
            page.page_tokens, self.template_tokens, labelled,
            start_region, end_region, self.best_match, **kwargs)
    if score > 0:
        if isinstance(labelled, AnnotationTag):
            # Re-locate each ignored region inside the matched span so
            # the extractor can skip over them.
            similar_ignored_regions = []
            start = pindex
            for i in ignored_regions:
                s, p, e = similar_region(page.page_tokens,
                                         self.template_tokens, i, start,
                                         sindex, self.best_match, **kwargs)
                if s > 0:
                    similar_ignored_regions.append(PageRegion(p, e))
                    # advance past this region; keep `start` if open-ended
                    start = e or start
            extracted_data = first_extractor.extract(
                page, pindex, sindex, similar_ignored_regions, **kwargs)
        if following_extractors:
            # presumably `previous` tells similar_region where the last
            # extraction ended -- TODO confirm against similar_region.
            previous_extraction = start_region or sindex
            if previous_extraction:
                kwargs['previous'] = previous_extraction + 1
            _, _, following_data = self._doextract(
                page, following_extractors, sindex or start_region,
                end_index, **kwargs)
            extracted_data += following_data
        if nested_regions:
            # Extract the nested regions inside the span just matched.
            _, _, nested_data = self._doextract(
                page, nested_regions, pindex, sindex, **kwargs)
            extracted_data += nested_data
    elif following_extractors:
        # First extractor did not match: run the rest of the extractors
        # first, then retry it in the window ending where they matched.
        end_index, _, following_data = self._doextract(
            page, following_extractors, start_index, end_index, **kwargs)
        if end_index is not None:
            pindex, sindex, extracted_data = self._doextract(
                page, [first_extractor], start_region, end_index,
                nested_regions, ignored_regions, **kwargs)
            if extracted_data and sindex:
                kwargs['previous'] = sindex + 1
        extracted_data += following_data
    elif nested_regions:
        # No match and nothing else to try: still attempt the nested
        # regions over the original window.
        _, _, nested_data = self._doextract(page, nested_regions,
                                            start_index, end_index, **kwargs)
        extracted_data += nested_data
    if (hasattr(first_extractor, 'annotation') and
            first_extractor.annotation):
        # Collect the ids already extracted, then fail loudly if any
        # required annotated attribute's id is missing from them.
        annotation = first_extractor.annotation or []
        content = annotation.surrounds_attribute or []
        attributes = annotation.tag_attributes
        attrs = chain(content, *(a for _, a in attributes))
        extracted_ids = {
            a['id'] for annos, _ in extracted_data
            for a in annos if isinstance(a, dict) and 'id' in a
        }
        if (any(
                isinstance(k, dict) and k.get('required') and
                k.get('id') not in extracted_ids for k in attrs)):
            raise MissingRequiredError()
    return pindex, sindex, extracted_data