def extract(self, page, start_index=0, end_index=None):
    items = []
    for extractor in self.extractors:
        extracted = extractor.extract(page, start_index, end_index,
                                      self.template.ignored_regions)
        for item in arg_to_iter(extracted):
            if item and isinstance(item, (ItemProcessor, dict)):
                item[u'_template'] = self.template.id
        items.extend(filter(bool, arg_to_iter(extracted)))
    return items
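# Every snippet in this file leans on scrapy.utils.misc.arg_to_iter, which
# normalizes "one value or many" into something safely iterable. A quick
# illustrative check of its behaviour (sample values invented here):
from scrapy.utils.misc import arg_to_iter

assert list(arg_to_iter(None)) == []              # missing value -> empty
assert list(arg_to_iter(u'a')) == [u'a']          # strings stay atomic
assert list(arg_to_iter({'k': 1})) == [{'k': 1}]  # dicts stay atomic
assert list(arg_to_iter([1, 2])) == [1, 2]        # iterables pass through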
def __init__(self, data, extractor, regions, parent_region=None,
             htmlpage=None):
    self.annotation = extractor.annotation
    self.id = self.annotation.metadata.get(u'id')
    self.regions = arg_to_iter(regions)
    parent_region = arg_to_iter(parent_region) if parent_region else []
    self.parent_region = parent_region
    self.modifiers = extractor.modifiers or {}
    self.schema = extractor.schema or {}
    if hasattr(htmlpage, u'htmlpage'):
        htmlpage = htmlpage.htmlpage
    self.htmlpage = htmlpage
    self.annotations = list(
        load_annotations(getattr(extractor, 'extractors', [])))
    self.fields = self._process_fields(data)
def _process_args(self, values):
    if self.spider_args is None:
        return []
    results = []
    for value in values:
        results.extend(arg_to_iter(self.spider_args.get(value, [])))
    return results
def _normalize_data(self, data):
    """Normalize extracted data for conversion into ItemFields."""
    if isinstance(data, dict):
        data = data.items()
    elif data and not isinstance(data[0], (tuple, dict)):
        data = [data]
    for i in data:
        if hasattr(i, u'items'):
            i = i.items()
        else:
            i = (i,)
        other_fields = []
        for fields in chain(arg_to_iter(i), other_fields):
            try:
                fields, value = fields
            except ValueError:
                for field in fields:
                    if hasattr(field, 'fields'):
                        yield field, None
                    elif len(field) == 2:
                        # Queue repeated fields for normalization
                        other_fields.append(field)
                continue
            if isinstance(fields, list):
                # More than one attribute for a single annotation
                for field in fields:
                    yield field, value
            elif isinstance(fields, six.string_types):
                # Legacy field support
                yield {u'field': fields, u'attribute': u'content'}, value
            else:
                yield fields, value
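# A minimal sketch of the legacy-string branch above: a bare field name is
# promoted to an annotation dict before being paired with its value.
# _promote_legacy_field is a hypothetical helper written for this example,
# not part of the original code.
import six

def _promote_legacy_field(fields, value):
    if isinstance(fields, six.string_types):
        return {u'field': fields, u'attribute': u'content'}, value
    return fields, value

assert _promote_legacy_field(u'title', u'Foo') == (
    {u'field': u'title', u'attribute': u'content'}, u'Foo')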
def _process_css_and_xpath(self, annotations, selector):
    schema, modifiers, page = self.schema, self.modifiers, self.htmlpage
    region_ids = list(filter(bool, (region_id(r) for r in self.regions)))
    query = ','.join('[data-tagid="%s"]' % rid for rid in region_ids)
    parents = {e._root for e in selector.css(query)}
    containers = ()
    if self.parent_region:
        if isinstance(self.parent_region, list):
            pquery = ', '.join(
                '[data-tagid="{}"]'.format(self.get_region_id(r))
                for r in self.parent_region)
        else:
            pquery = '[data-tagid="{}"]'.format(
                self.get_region_id(self.parent_region))
        containers = {e._root for e in selector.css(pquery)}
    for i, a in enumerate(annotations, start=len(self.fields)):
        mode = a.get(u'selection_mode')
        query = a.get(mode if mode != 'css' else u'selector')
        try:
            elems = self._pick_elems(
                getattr(selector, mode)(query), parents, containers)
        except ValueError:
            continue
        for elem in elems:
            elem._root.attrib.pop('data-tagid', None)
        extracted = elems.xpath(self.attribute_query(a)).extract()
        value = list(map(six.text_type.strip, extracted))
        aid = a.get(u'id') or i
        if value:
            value = [htmlregion(v) for v in arg_to_iter(value)]
            self.fields[aid] = ItemField(value, a, schema, modifiers, page)
        else:
            self.fields.pop(aid, None)
def _gen_annotation_info(annotations):
    data = {}
    annotation_data = []
    for annotation in arg_to_iter(annotations):
        if 'annotations' in annotation:
            annotation_data.append({
                'id': annotation.get('id', short_guid()),
                'annotations': annotation.get('annotations', {}),
                'required': annotation.get('required', []),
                'required_fields': annotation.get('required', []),
                'variant': int(annotation.get('variant', 0)),
                'generated': annotation.get('generated', False),
                'text-content': annotation.get('text-content', 'content'),
                'item_container': annotation.get('item_container', False),
                'container_id': annotation.get('container_id'),
                'schema_id': annotation.get('schema_id'),
                'repeated': annotation.get('repeated'),
                'siblings': annotation.get('siblings'),
                'field': annotation.get('field'),
                'selector': annotation.get('selector'),
                'selection_mode': annotation.get('selection_mode'),
                'min_jump': annotation.get('min_jump', -1),
                'max_separator': annotation.get('max_separator', -1),
                'xpath': annotation.get('xpath')
            })
        if 'ignore' in annotation or 'ignore_beneath' in annotation:
            if annotation.get('ignore_beneath'):
                data['data-scrapy-ignore-beneath'] = 'true'
            elif annotation.get('ignore'):
                data['data-scrapy-ignore'] = 'true'
    if annotation_data:
        serialized = json.dumps(annotation_data).replace('"', '&quot;')
        data['data-scrapy-annotate'] = serialized
    return data
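# Why the &quot; replacement above: the serialized JSON ends up inside an
# HTML attribute value, so literal double quotes must be entity-escaped to
# keep the markup well-formed. A minimal sketch (annotation invented):
import json

serialized = json.dumps([{'id': 'a1', 'field': 'title'}]).replace('"', '&quot;')
attr = 'data-scrapy-annotate="%s"' % serialized
# -> data-scrapy-annotate="[{&quot;id&quot;: &quot;a1&quot;, ...}]"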
def _create_start_urls(self, spec):
    url_type = spec.get('start_urls_type', 'start_urls')
    return StartUrlCollection(
        arg_to_iter(spec[url_type]),
        self.start_url_generators,
        url_type
    )
def _item_with_names(self, item, attribute=u'description'):
    item_dict = {}
    for field, value in item.items():
        if not (field and value):
            continue
        if hasattr(field, attribute):
            key = getattr(field, attribute)
            if getattr(field, 'should_overwrite', False):
                item_dict[key] = value
            else:
                item_dict[key] = [
                    v for v in chain(arg_to_iter(item_dict.get(key, [])),
                                     arg_to_iter(value))
                ]
        else:
            item_dict[field] = value
    return item_dict
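# A hedged sketch of the merging rule in _item_with_names: values whose field
# objects share a description accumulate into one list unless should_overwrite
# is set. _FakeField is a stand-in invented for this example; the method never
# reads self, so any host instance would behave the same.
class _FakeField(object):
    def __init__(self, description, should_overwrite=False):
        self.description = description
        self.should_overwrite = should_overwrite

n1, n2 = _FakeField('name'), _FakeField('name')
item = {n1: ['Acme'], n2: ['Acme Corp']}
# _item_with_names(item) -> {'name': ['Acme', 'Acme Corp']}
# (with should_overwrite=True on n2, the later value would win instead)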
def _selector_annotations(self):
    for annotation in self.annotations:
        meta = annotation.metadata
        if meta.get(u'selection_mode') not in self.selector_modes:
            continue
        surrounds = arg_to_iter(annotation.surrounds_attribute) or []
        tags = chain(*(a for _, a in annotation.tag_attributes))
        for attribute in chain(surrounds, tags):
            new_attribute = {k: v for k, v in meta.items()}
            new_attribute.update(attribute)
            yield new_attribute
def _handle_unpaired_tag(self, html_tag):
    self.handle_ignore(html_tag, is_open=False)
    jannotations = self.read_jannotations(html_tag)
    for jannotation in arg_to_iter(jannotations):
        if self.unpairedtag_stack:
            self._close_unpaired_tag()
        self.extra_required_attrs.extend(jannotation.pop('required', []))
        annotation = self.build_annotation(jannotation)
        self.handle_variant(annotation, is_open=False)
        self.annotations.append(annotation)
    self.next_tag_index += 1
def extract(self, page, start_index=0, end_index=None):
    items = []
    for extractor in self.extractors:
        extracted = extractor.extract(page, start_index, end_index,
                                      self.template.ignored_regions)
        for item in arg_to_iter(extracted):
            if item:
                if isinstance(item, (ItemProcessor, dict)):
                    item[u'_template'] = self.template.id
                items.append(item)
    return items
def _process(self):
    values = []
    for value in arg_to_iter(self.value):
        if (isinstance(value, (HtmlPageParsedRegion, HtmlPageRegion)) and
                hasattr(self.extractor, u'extractor')):
            value = self.extractor.extractor(value)
        if value:
            values.append(value)
    if hasattr(self.extractor, u'adapt'):
        values = [self.extractor.adapt(x, self.htmlpage) for x in values
                  if x and not isinstance(x, (dict, ItemProcessor))]
    else:
        values = list(filter(bool, values))
    return values
def _process_values(self, regions, htmlpage, extraction_func):
    values = []
    for value in arg_to_iter(regions):
        if (isinstance(value, (HtmlPageParsedRegion, HtmlPageRegion)) and
                hasattr(extraction_func, 'extractor')):
            value = extraction_func.extractor(value)
        if value:
            values.append(value)
    if hasattr(extraction_func, 'adapt'):
        if hasattr(htmlpage, 'htmlpage'):
            htmlpage = htmlpage.htmlpage
        values = [extraction_func.adapt(x, htmlpage) for x in values
                  if x and not isinstance(x, dict)]
    else:
        values = list(filter(bool, values))
    return values
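# A hedged sketch of the extract-then-adapt pipeline _process_values drives.
# _FakeDescriptor stands in for a SlybotFieldDescriptor-like object; the class
# and its behaviour are illustrative assumptions, not the real descriptor.
class _FakeDescriptor(object):
    def extractor(self, region):
        # first stage: pull a value out of an HTML region
        return region.strip() or None

    def adapt(self, value, htmlpage):
        # second stage: post-process every surviving value
        return value.upper()

# Plain strings skip the extractor stage (they are not HtmlPageRegions), but
# still pass through adapt, while falsy values are dropped:
# _process_values(['  a  ', '', 'b'], page, _FakeDescriptor())
# -> ['  A  ', 'B']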
def _process_fields(self, annotations, regions, htmlpage):
    for annotation in arg_to_iter(annotations):
        if isinstance(annotation, dict):
            field = annotation['field']
            try:
                field_extraction = self.schema.attribute_map.get(field)
            except AttributeError:
                field_extraction = None
            if field_extraction is None:
                field_extraction = SlybotFieldDescriptor(
                    field, field, _DEFAULT_EXTRACTOR)
            if annotation.get('pre_text') or annotation.get('post_text'):
                text_extractor = TextRegionDataExtractor(
                    annotation.get('pre_text', ''),
                    annotation.get('post_text', ''))
                field_extraction = copy.deepcopy(field_extraction)
                field_extraction.extractor = _compose(
                    field_extraction.extractor, text_extractor.extract)
            extracted = self._process_values(regions, htmlpage,
                                             field_extraction)
            for extractor in annotation.get('extractors', []):
                custom_extractor_func = self.modifiers.get(extractor)
                if custom_extractor_func and extracted:
                    extracted = [custom_extractor_func(s, htmlpage)
                                 for s in extracted]
            if annotation.get('required') and not extracted:
                raise MissingRequiredError()
            yield (field_extraction, extracted)
        else:
            # Legacy spiders have per attribute pipeline extractors
            if self.legacy and annotation == 'variants':
                yield (annotation, self._process_variants(regions, htmlpage))
                continue
            try:
                extraction_func = self.schema.attribute_map.get(annotation)
            except AttributeError:
                extraction_func = None
            if extraction_func is None:
                extraction_func = SlybotFieldDescriptor(
                    annotation, annotation, _DEFAULT_EXTRACTOR)
            values = self._process_values(regions, htmlpage, extraction_func)
            yield (extraction_func, values)
def _process_fields(self, annotations, regions, htmlpage):
    for annotation in arg_to_iter(annotations):
        if isinstance(annotation, dict):
            field = annotation['field']
            try:
                field_extraction = self.schema.attribute_map.get(field)
            except AttributeError:
                field_extraction = None
            if field_extraction is None:
                field_extraction = SlybotFieldDescriptor(
                    '', '', _DEFAULT_EXTRACTOR)
            if annotation.get('pre_text') or annotation.get('post_text'):
                text_extractor = TextRegionDataExtractor(
                    annotation.get('pre_text', ''),
                    annotation.get('post_text', ''))
                field_extraction.extractor = _compose(
                    field_extraction.extractor, text_extractor.extract)
            extracted = self._process_values(regions, htmlpage,
                                             field_extraction)
            for extractor in annotation.get('extractors', []):
                custom_extractor_func = self.modifiers.get(extractor)
                if custom_extractor_func and extracted:
                    extracted = custom_extractor_func(extracted, htmlpage)
            if annotation.get('required') and not extracted:
                raise MissingRequiredError()
            if field_extraction.name != field_extraction.description:
                field = field_extraction.description
            yield (field, extracted)
        else:
            # Legacy spiders have per attribute pipeline extractors
            try:
                extraction_func = self.schema.attribute_map.get(annotation)
            except AttributeError:
                extraction_func = None
            if extraction_func is None:
                extraction_func = SlybotFieldDescriptor(
                    '', '', _DEFAULT_EXTRACTOR)
            values = self._process_values(regions, htmlpage, extraction_func)
            if extraction_func.name != extraction_func.description:
                annotation = extraction_func.description
            yield (annotation, values)
def _handle_open_tag(self, html_tag):
    ignored = self.handle_ignore(html_tag)
    tagname = self.handle_replacement(html_tag)
    jannotations = self.read_jannotations(html_tag)
    if not jannotations and tagname in self.labelled_tag_stacks:
        # add this tag to the stack to match correct end tag
        self.labelled_tag_stacks[tagname].append(None)
    increment = not jannotations
    for jannotation in arg_to_iter(jannotations):
        self.extra_required_attrs.extend(jannotation.pop('required', []))
        annotation = self.build_annotation(jannotation)
        self.handle_generated(annotation, ignored)
        self.handle_variant(annotation)
        # Don't increment generated/text annotation
        if annotation.annotation_text is None and not increment:
            increment = True
        # look for a closing tag if the content is important
        if annotation.surrounds_attribute:
            self.labelled_tag_stacks[tagname].append(annotation)
        else:
            annotation.end_index = annotation.start_index + 1
            self.annotations.append(annotation)
    self.next_tag_index += increment
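# The labelled_tag_stacks bookkeeping above pairs annotated open tags with
# their closing tags; unannotated tags of the same name push None so nesting
# stays balanced. A stripped-down sketch of the idea (names invented):
stacks = {'div': []}
stacks['div'].append('annotation-1')  # open <div> carrying an annotation
stacks['div'].append(None)            # nested plain <div>
assert stacks['div'].pop() is None    # first </div> closes the plain one
assert stacks['div'].pop() == 'annotation-1'  # second </div> ends the region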
def generate(self, annotations):
    data = {}
    annotation_data = []
    for annotation in arg_to_iter(annotations):
        if 'annotations' in annotation:
            annotation_data.append({
                'id': annotation.get('id', short_guid()),
                'annotations': annotation.get('annotations', {}),
                'required': annotation.get('required', []),
                'required_fields': annotation.get('required', []),
                'variant': int(annotation.get('variant', 0)),
                'generated': annotation.get('generated', False),
                'text-content': annotation.get('text-content', 'content'),
                'item_container': annotation.get('item_container', False),
                'container_id': annotation.get('container_id'),
                'schema_id': annotation.get('schema_id'),
                'repeated': annotation.get('repeated'),
                'siblings': annotation.get('siblings'),
                'field': annotation.get('field'),
                'selector': annotation.get('selector'),
                'selection_mode': annotation.get('selection_mode'),
                'min_jump': annotation.get('min_jump', -1),
                'max_separator': annotation.get('max_separator', -1),
                'xpath': annotation.get('xpath')
            })
        if 'ignore' in annotation or 'ignore_beneath' in annotation:
            if annotation.get('ignore_beneath'):
                data['data-scrapy-ignore-beneath'] = 'true'
            elif annotation.get('ignore'):
                data['data-scrapy-ignore'] = 'true'
    if annotation_data:
        if self.legacy:
            annotation_data = annotation_data[0]
        serialized = json.dumps(annotation_data).replace('"', '&quot;')
        data['data-scrapy-annotate'] = serialized
    return data
def load_annotations(extractor):
    for e in arg_to_iter(extractor):
        if hasattr(e, 'annotation') and not hasattr(e, u'extractors'):
            meta = e.annotation.metadata
            if u'attribute' not in meta:
                attribute = [a for a in e.annotation.tag_attributes]
                content = meta.get(u'text-content', u'content')
                attribute, ann = (attribute or [(content, None)])[0]
                if not e.annotation.surrounds_attribute:
                    meta['text-content'] = '#portia-content'
                meta[u'attribute'] = attribute
                if ann is not None:
                    if isinstance(ann, list):
                        ann = ann[0].get(u'field')
                    meta[u'field'] = ann
            if not meta.get(u'field'):
                attr = e.annotation.surrounds_attribute
                if isinstance(attr, list):
                    attr = attr[0].get(u'field')
                meta[u'field'] = attr
            yield e.annotation
        if hasattr(e, u'extractors') and not hasattr(e, u'schema'):
            for sub_e in load_annotations(e.extractors):
                yield sub_e
def _create_start_urls(self, spec):
    _type = spec.get('start_urls_type', 'start_urls')
    generator = self.start_url_generators[_type]
    generated = (generator(data) for data in arg_to_iter(spec[_type]))
    for url in itertools.chain(*(arg_to_iter(g) for g in generated)):
        yield url
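# The chain(*(arg_to_iter(g) for g in ...)) idiom above flattens per-entry
# results that may be a single URL, a list of URLs, or nothing at all.
# A minimal sketch with made-up data:
from itertools import chain
from scrapy.utils.misc import arg_to_iter

results = ['http://a.example', ['http://b.example', 'http://c.example'], None]
flat = list(chain(*(arg_to_iter(g) for g in results)))
assert flat == ['http://a.example', 'http://b.example', 'http://c.example']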
def apply_annotations(annotations, target_page, legacy=False):
    selector_annotations, tagid_annotations = _filter_annotations(annotations)
    inserts = defaultdict(list)
    numbered_html = add_tagids(target_page)
    if selector_annotations:
        converted_annotations = apply_selector_annotations(
            selector_annotations, numbered_html)
        tagid_annotations += converted_annotations
    target = iter(parse_html(numbered_html))
    output, tag_stack = [], []
    element = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    # generated it will be added to the output
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted(
        [(int(k), v) for k, v in filtered.items()] + dummy)
    try:
        for aid, annotation_data in sorted_annotations:
            # Move target until replacement/insertion point
            while True:
                while not isinstance(element, HtmlTag) or element.tag == 'ins':
                    output.append(numbered_html[element.start:element.end])
                    element = next(target)
                if element.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = element.attributes.get(TAGID)
                    tag_stack.append(last_id)
                if element.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and tag_stack:
                    if ('__added' not in element.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    last_inserted = tag_stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            element = next(target)
                            try:
                                last_id = element.attributes.get(TAGID,
                                                                 last_id)
                            except AttributeError:
                                pass
                            if isinstance(element, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in element.attributes:
                        output.append(numbered_html[element.start:element.end])
                        element.attributes['__added'] = True
                    element = next(target)
                else:
                    break
            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = _gen_annotation_info(regular_annotations,
                                                       legacy)
                for key, val in annotation_info.items():
                    element.attributes[key] = val
            next_text_section = ''
            if generated:
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = _get_generated_annotation(
                    element, generated, nodes, numbered_html, inserts, legacy)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if element.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = _get_generated_annotation(
                    element, next_generated, nodes, numbered_html, inserts,
                    legacy)
            if '__added' not in element.attributes:
                output.append(serialize_tag(element))
                element.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[element.start:element.end])
    else:
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def make_spider(start_urls=None, sample=None):
    sample = [] if sample is None else arg_to_iter(sample)
    start_urls = [] if start_urls is None else arg_to_iter(start_urls)
    return {'start_urls': start_urls, 'templates': sample}
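# Usage sketch for make_spider: single values are accepted wherever lists are
# expected (the URL and sample below are invented for illustration):
spec = make_spider(start_urls='http://example.com', sample={'id': 't1'})
assert spec == {
    'start_urls': ['http://example.com'],
    'templates': [{'id': 't1'}],
}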
def _process_items(self, items, page):
    if not items:
        return []
    return arg_to_iter(self._validate_and_adapt_item(items, page))
def __iter__(self):
    generated = (self._generate_urls(url) for url in self.start_urls)
    for url in chain(*(arg_to_iter(g) for g in generated)):
        yield url
def _process_items(self, items, page, region, surrounding_region):
    if not items:
        return []
    items = self._validate_and_adapt_item(items, page, region,
                                          surrounding_region)
    return arg_to_iter(items)
def apply(self):
    selector_annotations, tagid_annotations = self.split()
    inserts, numbered_html = defaultdict(list), self.numbered_html
    if selector_annotations:
        converted_annotations = self.apply_selector(selector_annotations)
        tagid_annotations += converted_annotations
    if not self.legacy:
        tagid_annotations = self.verify(
            [arg_to_iter(a) for a in tagid_annotations])
    target = iter(parse_html(numbered_html))
    output, stack = [], []
    elem = next(target)
    last_id = 0
    # XXX: A dummy element is added to the end so if the last annotation is
    # generated it will be added to the output
    filtered = defaultdict(list)
    for grouped in tagid_annotations:
        for ann in arg_to_iter(grouped):
            filtered[ann['tagid']].append(ann)
    dummy = [(1e9, [{}])]
    sorted_annotations = sorted([(int(k), v) for k, v in filtered.items()
                                 if k is not None])
    try:
        for aid, annotation_data in chain(sorted_annotations, dummy):
            # Move target until replacement/insertion point
            while True:
                while not isinstance(elem, HtmlTag) or elem.tag == 'ins':
                    output.append(numbered_html[elem.start:elem.end])
                    elem = next(target)
                if elem.tag_type in {OPEN_TAG, UNPAIRED_TAG}:
                    last_id = elem.attributes.get(TAGID)
                    stack.append(last_id)
                if elem.tag_type in {CLOSE_TAG, UNPAIRED_TAG} and stack:
                    if ('__added' not in elem.attributes and
                            last_id is not None and aid is not None and
                            int(last_id) < int(aid)):
                        output.append(numbered_html[elem.start:elem.end])
                        elem.attributes['__added'] = True
                    last_inserted = stack.pop()
                    to_insert = inserts.pop(last_inserted, None)
                    if to_insert:
                        output.extend(to_insert)
                        # Skip all nodes up to the next HtmlTag as these
                        # have already been added
                        while True:
                            elem = next(target)
                            try:
                                last_id = elem.attributes.get(TAGID, last_id)
                            except AttributeError:
                                pass
                            if isinstance(elem, HtmlTag):
                                break
                        continue
                if (last_id is not None and aid is not None and
                        int(last_id) < int(aid)):
                    if '__added' not in elem.attributes:
                        output.append(numbered_html[elem.start:elem.end])
                        elem.attributes['__added'] = True
                    elem = next(target)
                else:
                    break
            generated = []
            next_generated = []
            regular_annotations = []
            # Place generated annotations at the end and sort by slice
            for annotation in sorted(annotation_data, key=_annotation_key):
                if annotation.get('generated'):
                    if annotation.get('insert_after'):
                        next_generated.append(annotation)
                    else:
                        generated.append(annotation)
                else:
                    regular_annotations.append(annotation)
            # Add annotations data as required
            if regular_annotations:
                annotation_info = self.generate(regular_annotations)
                for key, val in annotation_info.items():
                    elem.attributes[key] = val
            next_text_section = ''
            if generated:
                inner_data, target = tee(target)
                nodes = _get_inner_nodes(inner_data)
                next_text_section = self._get_generated(
                    elem, generated, nodes, inserts)
            if next_generated:
                inner_data, target = tee(target)
                open_tags = 0 if elem.tag_type == UNPAIRED_TAG else 1
                nodes = _get_inner_nodes(inner_data, open_tags=open_tags,
                                         insert_after=True)
                next_text_section = self._get_generated(
                    elem, next_generated, nodes, inserts)
            if '__added' not in elem.attributes:
                output.append(serialize_tag(elem))
                elem.attributes['__added'] = True
            # If an <ins> tag has been inserted we need to move forward
            if next_text_section:
                while True:
                    elem = next(target)
                    if (isinstance(elem, HtmlDataFragment) and
                            elem.is_text_content):
                        break
                    output.append(numbered_html[elem.start:elem.end])
                output.append(next_text_section)
    # Reached the end of the document
    except StopIteration:
        output.append(numbered_html[elem.start:elem.end])
    else:
        for element in target:
            output.append(numbered_html[element.start:element.end])
    return remove_tagids(''.join(output))
def _create_start_urls(self, spec):
    url_type = spec.get('start_urls_type', 'start_urls')
    return StartUrlCollection(
        arg_to_iter(spec[url_type]),
        self.start_url_generators,
    )
def serializer(cls, output):
    return [
        o.strftime(cls.DATETIME_FMT) if isinstance(o, datetime) else str(o)
        for o in arg_to_iter(output)
    ]
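# Usage sketch, assuming serializer is bound as a classmethod on an exporter
# that defines DATETIME_FMT (the class and format below are assumptions made
# for illustration):
from datetime import datetime

class _Exporter(object):
    DATETIME_FMT = '%Y-%m-%dT%H:%M:%S'

# With cls bound to _Exporter, mixed output is normalized to strings:
# serializer(_Exporter, [datetime(2016, 1, 2, 3, 4, 5), 7])
# -> ['2016-01-02T03:04:05', '7']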