def apply_extractors(descriptor, template_extractors, extractors):
    """Attach the template's per-field extractors to *descriptor*.

    :param descriptor: item descriptor whose ``attribute_map`` is updated
        in place.
    :param template_extractors: mapping of field name -> list of extractor
        ids used by the template.
    :param extractors: mapping of extractor id -> extractor definition dict.
    """
    field_type_manager = FieldTypeManager()
    for field_name, field_extractors in template_extractors.items():
        equeue = []
        for eid in field_extractors:
            extractor_doc = extractors[eid]
            if "regular_expression" in extractor_doc:
                equeue.append(
                    create_regex_extractor(
                        extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:
                # A type extractor overrides the field's default descriptor.
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # No type extractor defined: use the "text" type by default,
            # as it is by far the most commonly used.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())
        if equeue:
            # Run the field's base extractor first, then the regexes.
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = \
                PipelineExtractor(*equeue)
def create_slybot_item_descriptor(schema):
    """Build an ``ItemDescriptor`` from the ``'fields'`` mapping of *schema*.

    Each field entry supplies its type (instantiated through
    ``FieldTypeManager``) and a required flag.
    """
    types = FieldTypeManager()
    descriptors = [
        SlybotFieldDescriptor(
            name, name,
            types.type_processor_class(spec['type'])(),
            spec['required'])
        for name, spec in schema['fields'].items()
    ]
    return ItemDescriptor("", "", descriptors)
def apply_extractors(descriptor, template_extractors_ids, extractors):
    """Attach the template's extractors to *descriptor*, grouped by field.

    :param descriptor: item descriptor whose ``attribute_map`` is updated
        in place.
    :param template_extractors_ids: iterable of extractor ids used by the
        template.
    :param extractors: mapping of extractor id -> extractor definition dict
        carrying at least a ``"field_name"`` key.
    """
    field_type_manager = FieldTypeManager()
    template_extractors = [extractors[eid] for eid in template_extractors_ids]
    # NOTE(review): groupby only merges *adjacent* entries — this assumes the
    # extractor ids arrive already ordered by field_name; confirm with callers.
    for field_name, field_extractors in groupby(
            template_extractors or (), lambda doc: doc["field_name"]):
        equeue = []
        for extractor_doc in field_extractors:
            if "regular_expression" in extractor_doc:
                equeue.append(create_regex_extractor(
                    extractor_doc["regular_expression"]))
            elif "type_extractor" in extractor_doc:
                # A type extractor overrides the field's default descriptor.
                descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                    field_name, field_name,
                    field_type_manager.type_processor_class(
                        extractor_doc["type_extractor"])())
        if field_name not in descriptor.attribute_map:
            # No type extractor defined: use the "text" type by default,
            # as it is by far the most commonly used.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")())
        if equeue:
            # Run the field's base extractor first, then the regexes.
            equeue.insert(0, descriptor.attribute_map[field_name].extractor)
            descriptor.attribute_map[field_name].extractor = \
                PipelineExtractor(*equeue)
def create_type_extractor(_type):
    """Return a callable that extracts values of *_type* from text.

    The callable runs the type processor's extraction step and, when it
    yields data, adapts the result against *htmlpage*.
    """
    processor = FieldTypeManager().type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        data = processor.extractor(txt)
        if data:
            return processor.adapt(data, htmlpage)

    # NOTE(review): a bytes __name__ is only valid on Python 2 — confirm
    # the interpreter this variant targets.
    _extractor.__name__ = ("Type Extractor: %s" % _type).encode('utf-8')
    return _extractor
def create_slybot_item_descriptor(schema, schema_name=""):
    """Build a ``SlybotItemDescriptor`` from *schema*'s ``'fields'`` mapping.

    Falls back to the field id when a display name is absent, and to
    *schema_name* when the schema carries no ``'name'``.
    """
    types = FieldTypeManager()
    field_descriptors = []
    for field_id, spec in schema.get('fields', {}).items():
        processor = types.type_processor_class(spec['type'])()
        field_descriptors.append(SlybotFieldDescriptor(
            field_id,
            spec.get('name', field_id),
            processor,
            spec['required']))
    return SlybotItemDescriptor(
        schema_name, schema.get('name', schema_name), field_descriptors)
def create_slybot_item_descriptor(schema):
    """Build an ``ItemDescriptor`` from a schema's ``'properties'`` pairs.

    A ``None`` schema is replaced by an empty placeholder. Fields default
    to required unless marked ``'optional'``.
    """
    if schema is None:
        schema = {'id': 'item', 'properties': ()}
    types = FieldTypeManager()
    descriptors = []
    # 'properties' is iterated as (name, spec) pairs.
    for name, spec in schema.get('properties', ()):
        processor = types.type_processor_class(spec.get('type'))()
        descriptors.append(SlybotFieldDescriptor(
            name,
            spec.get('description'),
            processor,
            not spec.get('optional', True)))
    return ItemDescriptor(schema['id'], schema.get('description'), descriptors)
def apply_extractors(descriptor, template_extractors, all_extractors):
    """Attach the template's extractors to *descriptor*, one id at a time.

    :param descriptor: item descriptor whose ``attribute_map`` is updated
        in place.
    :param template_extractors: iterable of extractor ids (may be ``None``).
    :param all_extractors: mapping of extractor id -> extractor definition
        dict carrying a ``"field_name"`` key.
    """
    field_type_manager = FieldTypeManager()
    for eid in template_extractors or ():
        extractor_doc = all_extractors[eid]
        field_name = extractor_doc["field_name"]
        if field_name not in descriptor.attribute_map:
            # NOTE(review): the "text" processor *class* (not an instance) is
            # stored here, unlike the sibling variants that instantiate it —
            # confirm SlybotFieldDescriptor accepts a class in this version.
            descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
                field_name, field_name,
                field_type_manager.type_processor_class("text")
            )
        if "regular_expression" in extractor_doc:
            descriptor.attribute_map[field_name].extractor = \
                create_regex_extractor(extractor_doc["regular_expression"])
        else:
            # Otherwise look up one of the builtin extractors by name.
            descriptor.attribute_map[field_name].extractor = getattr(
                ExtractorTypes, extractor_doc["builtin_extractor"])
def create_slybot_item_descriptor(schema):
    """Build an ``ItemDescriptor`` from a schema's ``"properties"`` pairs.

    A ``None`` schema is replaced by an empty placeholder. Fields default
    to required unless marked ``"optional"``.
    """
    if schema is None:
        schema = {"id": "item", "properties": ()}
    types = FieldTypeManager()
    descriptors = []
    # "properties" is iterated as (name, spec) pairs.
    for name, spec in schema.get("properties", ()):
        processor = types.type_processor_class(spec.get("type"))()
        descriptors.append(SlybotFieldDescriptor(
            name,
            spec.get("description"),
            processor,
            not spec.get("optional", True)))
    return ItemDescriptor(schema["id"], schema.get("description"), descriptors)
def create_slybot_item_descriptor(schema, schema_name=""):
    """Build a ``SlybotItemDescriptor`` from *schema*'s ``'fields'`` mapping.

    Falls back to the field id when a display name is absent, and to
    *schema_name* when the schema carries no ``'name'``.
    """
    types = FieldTypeManager()
    field_descriptors = []
    for field_id, spec in schema['fields'].items():
        processor = types.type_processor_class(spec['type'])()
        field_descriptors.append(SlybotFieldDescriptor(
            field_id,
            spec.get('name', field_id),
            processor,
            spec['required']))
    return SlybotItemDescriptor(
        schema_name, schema.get('name', schema_name), field_descriptors)
def create_type_extractor(_type):
    """Return a callable that extracts values of *_type* from a region.

    The callable tolerates ``None`` input, wraps plain strings into an
    ``HtmlPageRegion`` and adapts extracted data against the page.
    """
    processor = FieldTypeManager().type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        # Accept either a page wrapper or the page itself.
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        # Wrap plain strings so the processor sees a page region.
        if not hasattr(txt, 'text_content'):
            txt = HtmlPageRegion(page, txt)
        extracted = processor.extract(txt)
        if extracted:
            return processor.adapt(extracted, page)

    _extractor.__name__ = ("Type Extractor: %s" % _type)
    return _extractor
def create_type_extractor(_type):
    """Return a callable that extracts values of *_type* from a region.

    The callable tolerates ``None`` input, wraps plain strings into an
    ``HtmlPageRegion`` and adapts extracted data against the page.
    """
    types = FieldTypeManager()
    extractor = types.type_processor_class(_type)()

    def _extractor(txt, htmlpage=None):
        if txt is None:
            return
        # Accept either a page wrapper or the page itself.
        page = getattr(htmlpage, 'htmlpage', htmlpage)
        # Wrap plain strings so the processor sees a page region.
        if not hasattr(txt, 'text_content'):
            txt = HtmlPageRegion(page, txt)
        data = extractor.extract(txt)
        if data:
            return extractor.adapt(data, page)

    # Fix: __name__ must be a str — assigning bytes (the former
    # .encode('utf-8')) raises TypeError on Python 3.
    _extractor.__name__ = "Type Extractor: %s" % _type
    return _extractor