Exemple #1
0
 def extract(self, page, start_index=0, end_index=None):
     """Run every extractor over ``page`` and collect the truthy results.

     Each extractor in ``self.extractors`` is called with ``page``,
     ``start_index``, ``end_index`` and the template's ignored regions.
     Truthy results that are dicts or ``ItemProcessor`` instances are
     tagged with the template id under the ``_template`` key.

     Returns a list holding every truthy result, in extraction order.
     """
     items = []
     for extractor in self.extractors:
         extracted = extractor.extract(page, start_index, end_index,
                                       self.template.ignored_regions)
         # Walk the result exactly once: if it is a one-shot iterator a
         # second pass would find it exhausted and drop every item.
         for item in arg_to_iter(extracted):
             if not item:
                 continue
             if isinstance(item, (ItemProcessor, dict)):
                 item[u'_template'] = self.template.id
             items.append(item)
     return items
Exemple #2
0
 def __init__(self, data, extractor, regions, parent_region=None,
              htmlpage=None):
     """Set up the instance from an extractor and the regions it matched.

     ``regions`` and a truthy ``parent_region`` are normalised with
     ``arg_to_iter``; a falsy ``parent_region`` becomes an empty list.
     When ``htmlpage`` has an ``htmlpage`` attribute of its own, that inner
     object is stored instead.  ``data`` is handed to ``_process_fields``
     last, after every other attribute has been assigned.
     """
     self.annotation = extractor.annotation
     self.id = self.annotation.metadata.get(u'id')
     self.regions = arg_to_iter(regions)
     if parent_region:
         self.parent_region = arg_to_iter(parent_region)
     else:
         self.parent_region = []
     self.modifiers = extractor.modifiers or {}
     self.schema = extractor.schema or {}
     # Unwrap objects that carry the page as their `.htmlpage` attribute
     if hasattr(htmlpage, u'htmlpage'):
         self.htmlpage = htmlpage.htmlpage
     else:
         self.htmlpage = htmlpage
     sub_extractors = getattr(extractor, 'extractors', [])
     self.annotations = list(load_annotations(sub_extractors))
     self.fields = self._process_fields(data)
Exemple #3
0
 def _process_args(self, values):
     """Look up each of ``values`` in ``self.spider_args`` and flatten.

     Returns an empty list when ``self.spider_args`` is ``None``.  Keys
     that are missing contribute nothing; every argument found is passed
     through ``arg_to_iter`` and its elements are appended in order.
     """
     if self.spider_args is None:
         return []
     return [arg
             for value in values
             for arg in arg_to_iter(self.spider_args.get(value, []))]
Exemple #4
0
 def _normalize_data(self, data):
     """Normalize extracted data for conversion into ItemFields.

     Generator yielding ``(field, value)`` pairs.  ``data`` may be a dict
     (its items are used), a sequence whose first element is a tuple or a
     dict, or any other non-empty sequence, which is treated as a single
     entry.  String field names are yielded as legacy
     ``{'field': name, 'attribute': 'content'}`` dicts.
     """
     if isinstance(data, dict):
         data = data.items()
     elif data and not isinstance(data[0], (tuple, dict)):
         # Not a collection of entries: wrap it so the loop sees one entry
         data = [data]
     for i in data:
         if hasattr(i, u'items'):
             i = i.items()
         else:
             i = (i,)
         # Entries appended here while the chain below is being consumed
         # are picked up once `i` itself has been exhausted.
         other_fields = []
         for fields in chain(arg_to_iter(i), other_fields):
             try:
                 fields, value = fields
             except ValueError:
                 # Not a 2-element pair: inspect each contained element
                 for field in fields:
                     if hasattr(field, 'fields'):
                         # Objects exposing `fields` are yielded without
                         # a value
                         yield field, None
                     elif len(field) == 2:
                         # Queue repeated fields for normalization
                         other_fields.append(field)
                 continue
             if isinstance(fields, list):
                 # More than one attribute for a single annotation
                 for field in fields:
                     yield field, value
             elif isinstance(fields, six.string_types):
                 # Legacy field support
                 yield {u'field': fields, u'attribute': u'content'}, value
             else:
                 yield fields, value
Exemple #5
0
 def _process_css_and_xpath(self, annotations, selector):
     """Fill ``self.fields`` from selector-based annotations.

     For each annotation the query is run through the ``selector`` method
     named by its ``selection_mode``, the matches are narrowed with
     ``_pick_elems`` and the values selected by ``attribute_query`` are
     stripped.  A non-empty result is stored in ``self.fields`` as an
     ``ItemField``; an empty one removes any existing entry for that id.
     """
     schema, modifiers, page = self.schema, self.modifiers, self.htmlpage
     # Root nodes of the elements tagged with one of this item's region ids
     # NOTE(review): this uses the bare `region_id` helper while the parent
     # lookup below uses `self.get_region_id` -- confirm both are intended.
     region_ids = list(filter(bool, (region_id(r) for r in self.regions)))
     query = ','.join(('[data-tagid="%s"]' % rid for rid in region_ids))
     parents = {e._root for e in selector.css(query)}
     containers = ()
     if self.parent_region:
         if isinstance(self.parent_region, list):
             pquery = ', '.join(
                 '[data-tagid="{}"]'.format(self.get_region_id(r))
                 for r in self.parent_region)
         else:
             pquery = '[data-tagid="{}"]'.format(
                 self.get_region_id(self.parent_region))
         containers = {e._root for e in selector.css(pquery)}
     # Positional fallback ids continue after the fields already present
     for i, a in enumerate(annotations, start=len(self.fields)):
         mode = a.get(u'selection_mode')
         # In css mode the query is stored under the `selector` key; in
         # any other mode it is stored under the key named after the mode
         query = a.get(mode if mode != 'css' else u'selector')
         try:
             elems = self._pick_elems(
                 getattr(selector, mode)(query), parents, containers)
         except ValueError:
             # Skip this annotation when element picking fails
             continue
         # Drop the tag id attribute from matches before extracting values
         for elem in elems:
             elem._root.attrib.pop('data-tagid', None)
         extracted = elems.xpath(self.attribute_query(a)).extract()
         value = list(map(six.text_type.strip, extracted))
         aid = a.get(u'id') or i
         if value:
             value = [htmlregion(v) for v in arg_to_iter(value)]
             self.fields[aid] = ItemField(value, a, schema, modifiers, page)
         else:
             self.fields.pop(aid, None)
Exemple #6
0
def _gen_annotation_info(annotations):
    """Build the ``data-scrapy-*`` attributes describing ``annotations``.

    Every annotation that has an ``annotations`` key contributes one entry
    to a JSON list stored under ``data-scrapy-annotate``.  Annotations with
    a truthy ``ignore_beneath`` or ``ignore`` value set the matching
    ``data-scrapy-ignore-beneath`` / ``data-scrapy-ignore`` flag, with
    ``ignore_beneath`` taking precedence.

    Returns a dict mapping attribute names to string values; it is empty
    when nothing applies.
    """
    data = {}
    annotation_data = []
    for annotation in arg_to_iter(annotations):
        if 'annotations' in annotation:
            annotation_data.append({
                'id': annotation.get('id', short_guid()),
                'annotations': annotation.get('annotations', {}),
                'required': annotation.get('required', []),
                # Same source key as `required`, emitted under both names
                'required_fields': annotation.get('required', []),
                'variant': int(annotation.get('variant', 0)),
                'generated': annotation.get('generated', False),
                'text-content': annotation.get('text-content', 'content'),
                'item_container': annotation.get('item_container', False),
                'container_id': annotation.get('container_id'),
                'schema_id': annotation.get('schema_id'),
                'repeated': annotation.get('repeated'),
                'siblings': annotation.get('siblings'),
                'field': annotation.get('field'),
                'selector': annotation.get('selector'),
                'selection_mode': annotation.get('selection_mode'),
                'min_jump': annotation.get('min_jump', -1),
                'max_separator': annotation.get('max_separator', -1),
                'xpath': annotation.get('xpath')
            })
        if 'ignore' in annotation or 'ignore_beneath' in annotation:
            if annotation.get('ignore_beneath'):
                data['data-scrapy-ignore-beneath'] = 'true'
            elif annotation.get('ignore'):
                data['data-scrapy-ignore'] = 'true'
    if annotation_data:
        # The JSON is used as an HTML attribute value, so its double quotes
        # must be written as character references.
        serialized = json.dumps(annotation_data).replace('"', '&quot;')
        data['data-scrapy-annotate'] = serialized
    return data
Exemple #7
0
 def _create_start_urls(self, spec):
     """Wrap the start URLs found in ``spec`` in a ``StartUrlCollection``.

     The key holding the URLs is named by ``spec['start_urls_type']`` and
     defaults to ``'start_urls'``.
     """
     key = spec.get('start_urls_type', 'start_urls')
     urls = arg_to_iter(spec[key])
     return StartUrlCollection(urls, self.start_url_generators, key)
Exemple #8
0
 def _item_with_names(self, item, attribute=u'description'):
     """Return a dict built from ``item`` keyed by each field's ``attribute``.

     Pairs with a falsy field or value are dropped.  Fields that lack
     ``attribute`` keep their original key.  For the others the key is the
     attribute's value: a field whose ``should_overwrite`` is truthy
     replaces whatever was stored, otherwise the values already stored
     under that key and the new ones are merged into one list.
     """
     named = {}
     for field, value in item.items():
         if not field or not value:
             continue
         if not hasattr(field, attribute):
             named[field] = value
             continue
         key = getattr(field, attribute)
         if getattr(field, 'should_overwrite', False):
             named[key] = value
             continue
         previous = arg_to_iter(named.get(key, []))
         named[key] = list(chain(previous, arg_to_iter(value)))
     return named
Exemple #9
0
 def _selector_annotations(self):
     """Yield one merged dict per attribute of each selector annotation.

     Only annotations whose ``selection_mode`` metadata is listed in
     ``self.selector_modes`` are considered.  Every surrounding attribute
     and every tag attribute is yielded as a copy of the annotation
     metadata updated with that attribute's own entries.
     """
     for annotation in self.annotations:
         meta = annotation.metadata
         if meta.get(u'selection_mode') in self.selector_modes:
             surrounds = arg_to_iter(annotation.surrounds_attribute) or []
             tag_attrs = chain(
                 *(attrs for _, attrs in annotation.tag_attributes))
             for attribute in chain(surrounds, tag_attrs):
                 merged = dict(meta.items())
                 merged.update(attribute)
                 yield merged
Exemple #10
0
 def _handle_unpaired_tag(self, html_tag):
     """Record the annotations read from an unpaired tag.

     For every annotation on ``html_tag`` any pending unpaired tag is
     closed, the annotation's ``required`` entries are moved into
     ``self.extra_required_attrs`` and the built annotation is appended to
     ``self.annotations``.  The tag index always advances by one.
     """
     self.handle_ignore(html_tag, is_open=False)
     for raw in arg_to_iter(self.read_jannotations(html_tag)):
         if self.unpairedtag_stack:
             self._close_unpaired_tag()
         self.extra_required_attrs.extend(raw.pop('required', []))
         built = self.build_annotation(raw)
         self.handle_variant(built, is_open=False)
         self.annotations.append(built)
     self.next_tag_index += 1
Exemple #11
0
 def _handle_unpaired_tag(self, html_tag):
     """Record the annotations read from an unpaired tag.

     For every annotation on ``html_tag`` any pending unpaired tag is
     closed, the annotation's ``required`` entries are moved into
     ``self.extra_required_attrs`` and the built annotation is appended to
     ``self.annotations``.  The tag index always advances by one.
     """
     self.handle_ignore(html_tag, is_open=False)
     for raw in arg_to_iter(self.read_jannotations(html_tag)):
         if self.unpairedtag_stack:
             self._close_unpaired_tag()
         self.extra_required_attrs.extend(raw.pop('required', []))
         built = self.build_annotation(raw)
         self.handle_variant(built, is_open=False)
         self.annotations.append(built)
     self.next_tag_index += 1
Exemple #12
0
    def extract(self, page, start_index=0, end_index=None):
        """Collect the truthy results of every extractor run over ``page``.

        Each extractor receives ``page``, ``start_index``, ``end_index``
        and the template's ignored regions.  Results that are dicts or
        ``ItemProcessor`` instances get the template id stored under
        ``_template`` before being added to the returned list.
        """
        collected = []
        for extractor in self.extractors:
            results = extractor.extract(
                page, start_index, end_index, self.template.ignored_regions)
            for result in arg_to_iter(results):
                if not result:
                    continue
                if isinstance(result, (ItemProcessor, dict)):
                    result[u'_template'] = self.template.id
                collected.append(result)
        return collected
Exemple #13
0
 def _process(self):
     """Return the processed, truthy values held in ``self.value``.

     Page regions are first run through ``self.extractor.extractor`` when
     the extractor provides one.  If the extractor has an ``adapt`` method
     it is applied, together with ``self.htmlpage``, to every remaining
     value that is neither a dict nor an ``ItemProcessor``; those two
     kinds are left out of the result in that case.
     """
     found = []
     for candidate in arg_to_iter(self.value):
         is_region = isinstance(
             candidate, (HtmlPageParsedRegion, HtmlPageRegion))
         if is_region and hasattr(self.extractor, u'extractor'):
             candidate = self.extractor.extractor(candidate)
         if candidate:
             found.append(candidate)
     if not hasattr(self.extractor, u'adapt'):
         return [v for v in found if v]
     return [self.extractor.adapt(v, self.htmlpage) for v in found
             if v and not isinstance(v, (dict, ItemProcessor))]
Exemple #14
0
 def _process_values(self, regions, htmlpage, extraction_func):
     """Return the truthy values extracted from ``regions``.

     Page regions are first run through ``extraction_func.extractor`` when
     that attribute exists.  If ``extraction_func`` has an ``adapt``
     method it is applied to every remaining non-dict value, using the
     inner ``htmlpage.htmlpage`` object when ``htmlpage`` has one; dicts
     are left out of the result in that case.
     """
     found = []
     for region in arg_to_iter(regions):
         is_region = isinstance(
             region, (HtmlPageParsedRegion, HtmlPageRegion))
         if is_region and hasattr(extraction_func, 'extractor'):
             region = extraction_func.extractor(region)
         if region:
             found.append(region)
     if not hasattr(extraction_func, 'adapt'):
         return [v for v in found if v]
     if hasattr(htmlpage, 'htmlpage'):
         htmlpage = htmlpage.htmlpage
     return [extraction_func.adapt(v, htmlpage) for v in found
             if v and not isinstance(v, dict)]
Exemple #15
0
 def _process_values(self, regions, htmlpage, extraction_func):
     """Return the truthy values extracted from ``regions``.

     Page regions are first run through ``extraction_func.extractor`` when
     that attribute exists.  If ``extraction_func`` has an ``adapt``
     method it is applied to every remaining non-dict value, using the
     inner ``htmlpage.htmlpage`` object when ``htmlpage`` has one; dicts
     are left out of the result in that case.
     """
     found = []
     for region in arg_to_iter(regions):
         is_region = isinstance(
             region, (HtmlPageParsedRegion, HtmlPageRegion))
         if is_region and hasattr(extraction_func, 'extractor'):
             region = extraction_func.extractor(region)
         if region:
             found.append(region)
     if not hasattr(extraction_func, 'adapt'):
         return [v for v in found if v]
     if hasattr(htmlpage, 'htmlpage'):
         htmlpage = htmlpage.htmlpage
     return [extraction_func.adapt(v, htmlpage) for v in found
             if v and not isinstance(v, dict)]
Exemple #16
0
 def _process_fields(self, annotations, regions, htmlpage):
     """Yield a ``(descriptor, values)`` pair for every annotation.

     Dict annotations are resolved through their ``field`` key, may wrap
     the descriptor's extractor with a pre/post text extractor, and have
     the modifiers named in ``extractors`` applied to each value.  Any
     other annotation is treated as a field name; on legacy spiders the
     name ``'variants'`` yields ``('variants', <processed variants>)``.

     Raises ``MissingRequiredError`` when a dict annotation marked
     ``required`` produces no values.
     """
     for annotation in arg_to_iter(annotations):
         if not isinstance(annotation, dict):
             # Legacy spiders have per attribute pipeline extractors
             if self.legacy and annotation == 'variants':
                 variants = self._process_variants(regions, htmlpage)
                 yield (annotation, variants)
                 continue
             try:
                 descriptor = self.schema.attribute_map.get(annotation)
             except AttributeError:
                 descriptor = None
             if descriptor is None:
                 descriptor = SlybotFieldDescriptor(
                     annotation, annotation, _DEFAULT_EXTRACTOR)
             yield (descriptor,
                    self._process_values(regions, htmlpage, descriptor))
             continue

         name = annotation['field']
         try:
             descriptor = self.schema.attribute_map.get(name)
         except AttributeError:
             descriptor = None
         if descriptor is None:
             descriptor = SlybotFieldDescriptor(
                 name, name, _DEFAULT_EXTRACTOR)
         if annotation.get('pre_text') or annotation.get('post_text'):
             text_extractor = TextRegionDataExtractor(
                 annotation.get('pre_text', ''),
                 annotation.get('post_text', ''))
             # Work on a copy so the descriptor looked up above is untouched
             descriptor = copy.deepcopy(descriptor)
             descriptor.extractor = _compose(
                 descriptor.extractor, text_extractor.extract)
         extracted = self._process_values(regions, htmlpage, descriptor)
         for modifier_id in annotation.get('extractors', []):
             modifier = self.modifiers.get(modifier_id)
             if modifier and extracted:
                 extracted = [modifier(s, htmlpage) for s in extracted]
         if annotation.get('required') and not extracted:
             raise MissingRequiredError()
         yield (descriptor, extracted)
Exemple #17
0
 def _process_fields(self, annotations, regions, htmlpage):
     """Yield a ``(descriptor, values)`` pair for every annotation.

     Dict annotations are resolved through their ``field`` key, may wrap
     the descriptor's extractor with a pre/post text extractor, and have
     the modifiers named in ``extractors`` applied to each value.  Any
     other annotation is treated as a field name; on legacy spiders the
     name ``'variants'`` yields ``('variants', <processed variants>)``.

     Raises ``MissingRequiredError`` when a dict annotation marked
     ``required`` produces no values.
     """
     for annotation in arg_to_iter(annotations):
         if not isinstance(annotation, dict):
             # Legacy spiders have per attribute pipeline extractors
             if self.legacy and annotation == 'variants':
                 variants = self._process_variants(regions, htmlpage)
                 yield (annotation, variants)
                 continue
             try:
                 descriptor = self.schema.attribute_map.get(annotation)
             except AttributeError:
                 descriptor = None
             if descriptor is None:
                 descriptor = SlybotFieldDescriptor(
                     annotation, annotation, _DEFAULT_EXTRACTOR)
             yield (descriptor,
                    self._process_values(regions, htmlpage, descriptor))
             continue

         name = annotation['field']
         try:
             descriptor = self.schema.attribute_map.get(name)
         except AttributeError:
             descriptor = None
         if descriptor is None:
             descriptor = SlybotFieldDescriptor(
                 name, name, _DEFAULT_EXTRACTOR)
         if annotation.get('pre_text') or annotation.get('post_text'):
             text_extractor = TextRegionDataExtractor(
                 annotation.get('pre_text', ''),
                 annotation.get('post_text', ''))
             # Work on a copy so the descriptor looked up above is untouched
             descriptor = copy.deepcopy(descriptor)
             descriptor.extractor = _compose(
                 descriptor.extractor, text_extractor.extract)
         extracted = self._process_values(regions, htmlpage, descriptor)
         for modifier_id in annotation.get('extractors', []):
             modifier = self.modifiers.get(modifier_id)
             if modifier and extracted:
                 extracted = [modifier(s, htmlpage) for s in extracted]
         if annotation.get('required') and not extracted:
             raise MissingRequiredError()
         yield (descriptor, extracted)
Exemple #18
0
 def _process_fields(self, annotations, regions, htmlpage):
     """Yield a ``(field, values)`` pair for every annotation.

     Dict annotations are resolved through their ``field`` key, may have
     the descriptor's extractor composed with a pre/post text extractor,
     and have the modifiers named in ``extractors`` applied to the whole
     list of values.  Any other annotation is treated as a field name.
     In both cases the yielded name is the descriptor's ``description``
     when it differs from the descriptor's ``name``.

     Raises ``MissingRequiredError`` when a dict annotation marked
     ``required`` produces no values.
     """
     # Imported here so the block does not rely on a module-level import
     import copy

     for annotation in arg_to_iter(annotations):
         if isinstance(annotation, dict):
             field = annotation['field']
             try:
                 field_extraction = self.schema.attribute_map.get(field)
             except AttributeError:
                 field_extraction = None
             if field_extraction is None:
                 field_extraction = SlybotFieldDescriptor(
                     '', '', _DEFAULT_EXTRACTOR)
             if annotation.get('pre_text') or annotation.get('post_text'):
                 text_extractor = TextRegionDataExtractor(
                     annotation.get('pre_text', ''),
                     annotation.get('post_text', ''))
                 # The descriptor may be the one stored in the schema.
                 # Compose onto a copy so the schema's extractor does not
                 # gain another text extractor on every call.
                 field_extraction = copy.deepcopy(field_extraction)
                 field_extraction.extractor = _compose(
                     field_extraction.extractor, text_extractor)
             extracted = self._process_values(
                 regions, htmlpage, field_extraction
             )
             for extractor in annotation.get('extractors', []):
                 custom_extractor_func = self.modifiers.get(extractor)
                 if custom_extractor_func and extracted:
                     extracted = custom_extractor_func(extracted, htmlpage)
             if annotation.get('required') and not extracted:
                 raise MissingRequiredError()
             if field_extraction.name != field_extraction.description:
                 field = field_extraction.description
             yield (field, extracted)
         else:
             # Legacy spiders have per attribute pipeline extractors
             try:
                 extraction_func = self.schema.attribute_map.get(annotation)
             except AttributeError:
                 extraction_func = None
             if extraction_func is None:
                 extraction_func = SlybotFieldDescriptor(
                     '', '', _DEFAULT_EXTRACTOR)
             values = self._process_values(regions, htmlpage,
                                           extraction_func)
             if extraction_func.name != extraction_func.description:
                 annotation = extraction_func.description
             yield (annotation, values)
Exemple #19
0
 def _handle_open_tag(self, html_tag):
     """Process the annotations attached to an opening tag.

     A tag without annotations whose name already has a labelled stack
     gets a ``None`` placeholder pushed so the matching end tag pairs up.
     Annotations with a surrounding attribute are pushed on the labelled
     stack for their tag name; the others are closed right away (end index
     is start index plus one) and appended to ``self.annotations``.  The
     tag index advances unless every annotation on the tag has annotation
     text.
     """
     ignored = self.handle_ignore(html_tag)
     tagname = self.handle_replacement(html_tag)
     jannotations = self.read_jannotations(html_tag)
     if not jannotations and tagname in self.labelled_tag_stacks:
         # add this tag to the stack to match correct end tag
         self.labelled_tag_stacks[tagname].append(None)
     increment = not jannotations
     for raw in arg_to_iter(jannotations):
         self.extra_required_attrs.extend(raw.pop('required', []))
         annotation = self.build_annotation(raw)
         self.handle_generated(annotation, ignored)
         self.handle_variant(annotation)
         # Don't increment generated/text annotation
         if annotation.annotation_text is None:
             increment = True
         if not annotation.surrounds_attribute:
             annotation.end_index = annotation.start_index + 1
             self.annotations.append(annotation)
         else:
             # look for a closing tag if the content is important
             self.labelled_tag_stacks[tagname].append(annotation)
     self.next_tag_index += increment
Exemple #20
0
 def _handle_open_tag(self, html_tag):
     """Process the annotations attached to an opening tag.

     A tag without annotations whose name already has a labelled stack
     gets a ``None`` placeholder pushed so the matching end tag pairs up.
     Annotations with a surrounding attribute are pushed on the labelled
     stack for their tag name; the others are closed right away (end index
     is start index plus one) and appended to ``self.annotations``.  The
     tag index advances unless every annotation on the tag has annotation
     text.
     """
     ignored = self.handle_ignore(html_tag)
     tagname = self.handle_replacement(html_tag)
     jannotations = self.read_jannotations(html_tag)
     if not jannotations and tagname in self.labelled_tag_stacks:
         # add this tag to the stack to match correct end tag
         self.labelled_tag_stacks[tagname].append(None)
     increment = not jannotations
     for raw in arg_to_iter(jannotations):
         self.extra_required_attrs.extend(raw.pop('required', []))
         annotation = self.build_annotation(raw)
         self.handle_generated(annotation, ignored)
         self.handle_variant(annotation)
         # Don't increment generated/text annotation
         if annotation.annotation_text is None:
             increment = True
         if not annotation.surrounds_attribute:
             annotation.end_index = annotation.start_index + 1
             self.annotations.append(annotation)
         else:
             # look for a closing tag if the content is important
             self.labelled_tag_stacks[tagname].append(annotation)
     self.next_tag_index += increment
Exemple #21
0
 def generate(self, annotations):
     """Build the ``data-scrapy-*`` attributes describing ``annotations``.

     Every annotation that has an ``annotations`` key contributes one
     entry to the JSON stored under ``data-scrapy-annotate``; when
     ``self.legacy`` is set only the first entry is serialised, as a
     single object rather than a list.  Annotations with a truthy
     ``ignore_beneath`` or ``ignore`` value set the matching
     ``data-scrapy-ignore-beneath`` / ``data-scrapy-ignore`` flag, with
     ``ignore_beneath`` taking precedence.

     Returns a dict mapping attribute names to string values; it is empty
     when nothing applies.
     """
     data = {}
     annotation_data = []
     for annotation in arg_to_iter(annotations):
         if 'annotations' in annotation:
             annotation_data.append({
                 'id': annotation.get('id', short_guid()),
                 'annotations': annotation.get('annotations', {}),
                 'required': annotation.get('required', []),
                 # Same source key as `required`, emitted under both names
                 'required_fields': annotation.get('required', []),
                 'variant': int(annotation.get('variant', 0)),
                 'generated': annotation.get('generated', False),
                 'text-content': annotation.get('text-content', 'content'),
                 'item_container': annotation.get('item_container', False),
                 'container_id': annotation.get('container_id'),
                 'schema_id': annotation.get('schema_id'),
                 'repeated': annotation.get('repeated'),
                 'siblings': annotation.get('siblings'),
                 'field': annotation.get('field'),
                 'selector': annotation.get('selector'),
                 'selection_mode': annotation.get('selection_mode'),
                 'min_jump': annotation.get('min_jump', -1),
                 'max_separator': annotation.get('max_separator', -1),
                 'xpath': annotation.get('xpath')
             })
         if 'ignore' in annotation or 'ignore_beneath' in annotation:
             if annotation.get('ignore_beneath'):
                 data['data-scrapy-ignore-beneath'] = 'true'
             elif annotation.get('ignore'):
                 data['data-scrapy-ignore'] = 'true'
     if annotation_data:
         if self.legacy:
             annotation_data = annotation_data[0]
         # The value is named as a `data-` attribute, so the JSON's double
         # quotes are written as character references to stay valid there.
         serialized = json.dumps(annotation_data).replace('"', '&quot;')
         data['data-scrapy-annotate'] = serialized
     return data
Exemple #22
0
def load_annotations(extractor):
    """Yield the annotation of every leaf extractor under ``extractor``.

    An extractor that has an ``annotation`` but no ``extractors``
    attribute is a leaf.  Its annotation metadata is completed in place
    before being yielded: a missing ``attribute`` is taken from the first
    tag attribute, falling back to the ``text-content`` metadata value
    (default ``content``), and a missing ``field`` is taken from the tag
    attribute or from the surrounding attribute.  Extractors that have
    ``extractors`` but no ``schema`` are descended into recursively.
    """
    for node in arg_to_iter(extractor):
        if hasattr(node, 'annotation') and not hasattr(node, u'extractors'):
            annotation = node.annotation
            meta = annotation.metadata
            if u'attribute' not in meta:
                tag_attributes = list(annotation.tag_attributes)
                default_name = meta.get(u'text-content', u'content')
                if tag_attributes:
                    attribute, field = tag_attributes[0]
                else:
                    attribute, field = default_name, None
                if not annotation.surrounds_attribute:
                    meta['text-content'] = '#portia-content'
                meta[u'attribute'] = attribute
                if field is not None:
                    if isinstance(field, list):
                        field = field[0].get(u'field')
                    meta[u'field'] = field
            if not meta.get(u'field'):
                surrounds = annotation.surrounds_attribute
                if isinstance(surrounds, list):
                    surrounds = surrounds[0].get(u'field')
                meta[u'field'] = surrounds
            yield annotation
        if hasattr(node, u'extractors') and not hasattr(node, u'schema'):
            for nested in load_annotations(node.extractors):
                yield nested
Exemple #23
0
 def _create_start_urls(self, spec):
     """Yield every start URL produced for ``spec``.

     The spec key holding the URL data is named by
     ``spec['start_urls_type']`` (default ``'start_urls'``); the generator
     registered under that same name in ``self.start_url_generators`` is
     called once per entry and the results are flattened.
     """
     _type = spec.get('start_urls_type', 'start_urls')
     make_urls = self.start_url_generators[_type]
     # Every entry is expanded before the first URL is handed out
     batches = [arg_to_iter(make_urls(entry))
                for entry in arg_to_iter(spec[_type])]
     for url in itertools.chain(*batches):
         yield url
Exemple #24
0
def apply_annotations(annotations, target_page, legacy=False):
    """Write ``annotations`` into ``target_page`` and return the new HTML.

    The page is first given tag ids; selector annotations are converted
    and added to the tag-id annotations.  The parsed page is then walked
    once, in tag-id order, copying untouched nodes to the output and
    attaching annotation attributes (or generated-annotation inserts) to
    the annotated tags.  Tag ids are removed from the result before it is
    returned.

    ``legacy`` is forwarded to ``_gen_annotation_info`` and
    ``_get_generated_annotation``.
    """
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    #      generated it will be added to the output
    # Group the annotations by the tag id they are attached to
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    # Presumably 1e9 is larger than any real tag id, so this sorts last
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()] +
                                dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                # Copy through everything that is not a tag, and <ins> tags
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    # Emit any insert queued for the tag being closed
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    # Not at the annotated tag yet: copy and move on
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break

            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = _gen_annotation_info(regular_annotations,
                                                       legacy)
                for key, val in annotation_info.items():
                    element.attributes[key] = val
            next_text_section = ''
            if generated:
                # tee() lets the inner nodes be inspected without consuming
                # them from the main iterator
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts,
                    legacy)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts,
                    legacy)

            # The tag's attributes may have changed, so it is re-serialised
            # instead of being sliced out of the source
            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        # All annotations placed: copy whatever is left of the page
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
Exemple #25
0
def make_spider(start_urls=None, sample=None):
    """Build a spider specification dict.

    Each argument that is not ``None`` is passed through ``arg_to_iter``;
    a ``None`` argument becomes an empty list.  The result maps
    ``'start_urls'`` to the start URLs and ``'templates'`` to the samples.
    """
    templates = arg_to_iter(sample) if sample is not None else []
    urls = arg_to_iter(start_urls) if start_urls is not None else []
    return {'start_urls': urls, 'templates': templates}
Exemple #26
0
 def _process_items(self, items, page):
     if not items:
         return []
     return arg_to_iter(self._validate_and_adapt_item(items, page))
Exemple #27
0
 def _process_items(self, items, page):
     if not items:
         return []
     return arg_to_iter(self._validate_and_adapt_item(items, page))
Exemple #28
0
 def __iter__(self):
     generated = (self._generate_urls(url) for url in self.start_urls)
     for url in chain(*(arg_to_iter(g) for g in generated)):
         yield url
Exemple #29
0
 def __iter__(self):
     generated = (self._generate_urls(url) for url in self.start_urls)
     for url in chain(*(arg_to_iter(g) for g in generated)):
         yield url
 def _process_items(self, items, page, region, surrounding_region):
     if not items:
         return []
     items = self._validate_and_adapt_item(items, page, region,
                                           surrounding_region)
     return arg_to_iter(items)
Exemple #31
0
    def apply(self):
        """Write this object's annotations into ``self.numbered_html``.

        The annotations returned by ``self.split()`` are grouped by their
        ``'tagid'`` and processed in ascending tag id order while walking
        the parsed HTML.  Regular annotations are written as attributes on
        the matching tag; annotations flagged ``'generated'`` are handled by
        ``self._get_generated``.  Every other fragment is copied through
        unchanged.

        Returns the result of ``remove_tagids`` applied to the joined output.
        """
        selector_annotations, tagid_annotations = self.split()
        inserts, numbered_html = defaultdict(list), self.numbered_html
        # Selector based annotations are converted and handled together with
        # the tag id based ones.
        if selector_annotations:
            converted_annotations = self.apply_selector(selector_annotations)
            tagid_annotations += converted_annotations
        if not self.legacy:
            tagid_annotations = self.verify(
                [arg_to_iter(a) for a in tagid_annotations])
        target = iter(parse_html(numbered_html))
        # ``output`` collects the pieces of the final document; ``stack``
        # holds the tag ids of the tags that are currently open.
        output, stack = [], []
        elem = next(target)
        last_id = 0
        # XXX: A dummy element is added to the end so if the last annotation is
        #      generated it will be added to the output
        # Group the annotations by the tag id they refer to.
        filtered = defaultdict(list)
        for grouped in tagid_annotations:
            for ann in arg_to_iter(grouped):
                filtered[ann['tagid']].append(ann)
        dummy = [(1e9, [{}])]
        # Annotations without a tag id are dropped; the rest are ordered by
        # numeric tag id.
        sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                     if k is not None])
        try:
            for aid, annotation_data in chain(sorted_annotations, dummy):
                # Move target until replacement/insertion point
                while True:
                    # Copy anything that is not a tag (and any <ins> tag)
                    # straight to the output.
                    while not isinstance(elem, HtmlTag) or elem.tag == 'ins':
                        output.append(numbered_html[elem.start:elem.end])
                        elem = next(target)
                    # Remember the id of each tag that is opened
                    if elem.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                        last_id = elem.attributes.get(TAGID)
                        stack.append(last_id)
                    if elem.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and stack:
                        # '__added' marks a tag that was already written to
                        # the output so it is not written twice.
                        if ('__added' not in elem.attributes and
                                last_id is not None and aid is not None and
                                int(last_id) < int(aid)):
                            output.append(numbered_html[elem.start:elem.end])
                            elem.attributes['__added'] = True
                        # Emit any content queued in ``inserts`` for the tag
                        # that has just been closed.
                        last_inserted = stack.pop()
                        to_insert = inserts.pop(last_inserted, None)
                        if to_insert:
                            output.extend(to_insert)
                            # Skip all nodes up to the next HtmlTag as these
                            # have already been added
                            while True:
                                elem = next(target)
                                try:
                                    last_id = elem.attributes.get(TAGID,
                                                                  last_id)
                                except AttributeError:
                                    # Element has no ``attributes``
                                    pass
                                if isinstance(elem, HtmlTag):
                                    break
                            continue
                    # Keep copying tags until the tag that the current
                    # annotation refers to is reached.
                    if (last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        if '__added' not in elem.attributes:
                            output.append(numbered_html[elem.start:elem.end])
                            elem.attributes['__added'] = True
                        elem = next(target)
                    else:
                        break

                generated = []
                next_generated = []
                regular_annotations = []
                # Place generated annotations at the end and sort by slice
                for annotation in sorted(annotation_data, key=_annotation_key):
                    if annotation.get('generated'):
                        if annotation.get('insert_after'):
                            next_generated.append(annotation)
                        else:
                            generated.append(annotation)
                    else:
                        regular_annotations.append(annotation)
                # Add annotations data as required
                if regular_annotations:
                    annotation_info = self.generate(regular_annotations)
                    for key, val in annotation_info.items():
                        elem.attributes[key] = val
                next_text_section = ''
                if generated:
                    # tee() gives the helper its own copy of the upcoming
                    # nodes so that ``target`` is not advanced by it.
                    inner_data, target = tee(target)
                    nodes = _get_inner_nodes(inner_data)
                    next_text_section = self._get_generated(
                        elem, generated, nodes, inserts)
                if next_generated:
                    inner_data, target = tee(target)
                    open_tags = 0 if elem.tag_type == UNPAIRED_TAG else 1
                    nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                             insert_after=True)
                    next_text_section = self._get_generated(
                        elem, next_generated, nodes, inserts)

                # Write the tag, including any attributes added above, unless
                # it has been written already.
                if '__added' not in elem.attributes:
                    output.append(serialize_tag(elem))
                    elem.attributes['__added'] = True
                # If an <ins> tag has been inserted we need to move forward
                if next_text_section:
                    # Copy nodes up to the next text fragment, then write the
                    # generated section in place of that fragment.
                    while True:
                        elem = next(target)
                        if (isinstance(elem, HtmlDataFragment) and
                                elem.is_text_content):
                            break
                        output.append(numbered_html[elem.start:elem.end])
                    output.append(next_text_section)
        # Reached the end of the document
        except StopIteration:
            output.append(numbered_html[elem.start:elem.end])
        else:
            # All annotations were processed: copy the remaining elements
            for element in target:
                output.append(numbered_html[element.start:element.end])
        return remove_tagids(''.join(output))
Exemple #32
0
def make_spider(start_urls=None, sample=None):
    """Return a spider specification with start URLs and templates.

    ``None`` arguments are replaced by empty lists; any other value is
    passed through ``arg_to_iter`` before being stored under
    ``'templates'`` (for ``sample``) or ``'start_urls'``.
    """
    if sample is None:
        templates = []
    else:
        templates = arg_to_iter(sample)
    if start_urls is None:
        urls = []
    else:
        urls = arg_to_iter(start_urls)
    return {'start_urls': urls, 'templates': templates}
Exemple #33
0
 def _process_items(self, items, page, region, surrounding_region):
     if not items:
         return []
     items = self._validate_and_adapt_item(items, page, region,
                                           surrounding_region)
     return arg_to_iter(items)
Exemple #34
0
 def _create_start_urls(self, spec):
     """Yield the start URLs described by ``spec``.

     ``spec['start_urls_type']`` (default ``'start_urls'``) names both the
     generator to pick from ``self.start_url_generators`` and the key of
     ``spec`` holding the data fed to it.  Each piece of data is passed to
     the generator and every result is chained into a single stream.
     """
     key = spec.get('start_urls_type', 'start_urls')
     build_urls = self.start_url_generators[key]
     results = (build_urls(entry) for entry in arg_to_iter(spec[key]))
     iterables = (arg_to_iter(result) for result in results)
     # Argument unpacking runs every generator call before the first URL
     # is yielded.
     for start_url in itertools.chain(*iterables):
         yield start_url
Exemple #35
0
 def _create_start_urls(self, spec):
     """Build a ``StartUrlCollection`` from ``spec``.

     The key read from ``spec`` is ``spec['start_urls_type']``, defaulting
     to ``'start_urls'``.  Its value is passed through ``arg_to_iter`` and
     handed to the collection with ``self.start_url_generators``.
     """
     key = spec.get('start_urls_type', 'start_urls')
     return StartUrlCollection(arg_to_iter(spec[key]),
                               self.start_url_generators)
Exemple #36
0
 def serializer(cls, output):
     """Convert ``output`` into a list of strings.

     ``output`` is passed through ``arg_to_iter``.  ``datetime`` values are
     formatted with ``cls.DATETIME_FMT``; everything else is converted
     with ``str``.
     """
     serialized = []
     for value in arg_to_iter(output):
         if isinstance(value, datetime):
             serialized.append(value.strftime(cls.DATETIME_FMT))
         else:
             serialized.append(str(value))
     return serialized