def __init__(self, name, xpath=None, quant='*', value=None, callback=None,
             group=None, children=None, css=None, filter=None):
    '''Set up one selection rule.

    name - key under which the parsed items are stored in the result
        dictionary (prefix the name with '_' if the item should not be
        stored).
    xpath - xpath selecting the items.
    quant - expected number of selected items; used for validation.
    value - xpath evaluated on every selected item to extract its value.
        When omitted, the selected nodes themselves are returned.
    callback - function called on every found item. If it declares an
        argument named "context", the context dictionary is passed to it.
    group - if not None, the results of all the children are stored
        together under one dictionary entry of this name.
    children - list of nested S objects; each of them is evaluated with
        every found item as its selector.
    css - css selector, used instead of `xpath`.
    filter - one-argument function deciding which of the selected nodes
        are kept. `quant` is checked AFTER the filter is applied.

    Raises TypeError unless exactly one of `xpath` / `css` is given, or
    when `css` is given but css selectors are not supported.
    '''
    xpath_given = xpath is not None
    css_given = css is not None
    if xpath_given == css_given:
        raise TypeError('Exactly one of `xpath` or `css` arguments must be specified.')

    self.name = name
    if xpath_given:
        self.raw_xpath = xpath
    elif not css_supported:
        raise TypeError('Css selectors not supported, install cssselect library.')
    else:
        # css selectors are translated to xpath once, up front
        self.raw_xpath = GenericTranslator().css_to_xpath(css)

    # compiled forms start empty and are filled in elsewhere
    self.hashed_namespaces = None
    self.compiled_xpath = None
    self.compiled_value = None

    self.quant = Quantity(quant)
    self.raw_value = value

    self.callback = callback
    wants_context = callback and 'context' in get_func_args(callback)
    self.context_callback = wants_context

    self.group = group
    if children is None:
        children = []
    self.children = children
    self.filter = filter
def __init__(self, name, xpath, quant='*', value=None, callback=None,
             group=None, children=None):
    '''Set up one selection rule.

    name - key under which the parsed items are stored in the result
        dictionary (prefix the name with '_' if the item should not be
        stored).
    xpath - xpath selecting the items.
    quant - expected number of selected items; used for validation.
    value - specify it, if the items should be extracted. Otherwise the
        selector objects are returned.
    callback - function called on every found item. It can take a named
        argument "context", a dictionary containing additional values.
    group - if not None, the results of all the children are stored
        together under one dictionary entry of this name.
    children - list of nested S objects; each of them is evaluated with
        every found item as its selector.
    '''
    if children is None:
        # every instance gets its own fresh list
        children = []

    self.name = name
    self.xpath = xpath
    self.value = value
    self.quant = Quantity(quant)
    self.callback = callback
    self.group = group
    self.children = children
class S(object):
    '''Class S is an efficient way to select multiple, possibly nested, parts
    of the website, automatically validate their existence/quantity and parse
    them out.

    Parsed values are stored in a dictionary as
        result[S.name] = [parsed_items]
    Parsed values are either xpath nodes or extracted values if S.value is
    used.
    '''

    def __init__(self, name, xpath=None, quant='*', value=None, callback=None,
                 group=None, children=None, css=None, filter=None):
        '''
        name - the parsed item will be stored in a dictionary on this index
            (not to store the item, use '_' as a prefix of the name).
        xpath - xpath to the item.
        quant - used for validation. The expected number of items to be
            parsed.
        value - xpath evaluated on each found item to extract its value.
            If not given, selector objects are returned.
        callback - callback function to be called on each found item.
            It can take a named argument "context", which is a dictionary
            containing additional values.
        group - if not None, all the child nodes will be stored under one
            dictionary entry of group's name.
        children - list of nested S objects. For each item found, each child
            will be called with the item as the selector.
        css - css selector, in a case xpath is not defined.
        filter - one-argument function. Given a node selected by the xpath,
            return True to keep the node, False to drop it.
            `quant` is checked AFTER the filter is applied.

        Raises TypeError unless exactly one of `xpath` / `css` is given, or
        when `css` is used but css selectors are not supported.
        '''
        if (xpath is None) == (css is None):
            raise TypeError('Exactly one of `xpath` or `css` arguments must be specified.')

        self.name = name
        if xpath is not None:
            self.raw_xpath = xpath
        elif css_supported:
            self.raw_xpath = GenericTranslator().css_to_xpath(css)
        else:
            raise TypeError('Css selectors not supported, install cssselect library.')
        # Compiled xpaths are created lazily in _parse() and re-created
        # whenever the selector's namespaces change.
        self.hashed_namespaces = None
        self.compiled_xpath = None
        self.quant = Quantity(quant)
        self.raw_value = value
        self.compiled_value = None
        self.callback = callback
        # truthy only if the callback declares an argument named "context"
        self.context_callback = callback and 'context' in get_func_args(callback)
        self.group = group
        self.children = children if children is not None else []
        self.filter = filter

    def get_nodes(self, name):
        '''Return the list of S nodes (self and all descendants) with the
        given name.'''
        result = []
        if self.name == name:
            result.append(self)
        for c in self.children:
            result += c.get_nodes(name)
        return result

    @property
    def visible(self):
        '''True if the node has a non-empty name not starting with '_'.'''
        return bool(self.name) and not self.name.startswith('_')

    def _hash_namespaces(self, namespaces):
        '''Return a hash of the namespaces mapping, or 0 if it is empty.'''
        # .items() instead of .iteritems(): works on both Python 2 and 3
        return hash(frozenset(namespaces.items())) if namespaces else 0

    def parse(self, response_or_selector, context=None):
        '''Parse the given response or selector and return the result
        dictionary.

        If an HtmlResponse/XmlResponse is given, it is stored in the context
        under the key 'response' (a non-empty `context` dictionary passed by
        the caller is updated in place) and its selector is parsed.
        Anything else is treated as a selector.
        '''
        from crawlmi.http import HtmlResponse, XmlResponse
        if isinstance(response_or_selector, (HtmlResponse, XmlResponse)):
            context = context or {}
            context['response'] = response_or_selector
            selector = response_or_selector.selector
        else:
            selector = response_or_selector
        return self._parse(selector, context)

    def _run_callback(self, item, context):
        '''Call the callback on one item, passing `context` if the callback
        accepts it. Any exception is re-raised as SValidationError.'''
        try:
            if self.context_callback:
                return self.callback(item, context=context)
            return self.callback(item)
        except Exception as e:
            raise SValidationError(
                'Callback function returned an error on node `%s`: %s' %
                (self.name, e))

    def _parse(self, selector, context):
        '''Select the nodes from `selector`, validate their quantity and
        return a defaultdict(list) with the parsed values of this node and
        of all its children.

        Raises SValidationError if the number of (filtered) nodes doesn't
        match `quant` or if the callback raises.
        '''
        # (re)compile the xpaths on the first use or on namespaces change
        hashed_namespaces = self._hash_namespaces(selector.namespaces)
        if self.compiled_xpath is None or self.hashed_namespaces != hashed_namespaces:
            self.hashed_namespaces = hashed_namespaces
            self.compiled_xpath = selector.compile_xpath(self.raw_xpath)
            if self.raw_value is not None:
                self.compiled_value = selector.compile_xpath(self.raw_value)

        result = defaultdict(list)
        nodes = selector.select(self.compiled_xpath)
        original_num_nodes = len(nodes)
        if self.filter:
            # a list (not a lazy filter object), so that len() works on
            # Python 3 as well
            nodes = [n for n in nodes if self.filter(n)]
        filtered_num_nodes = len(nodes)

        if not self.quant.check_quantity(filtered_num_nodes):
            if self.filter:
                raise SValidationError(
                    'Number of `%s` nodes %s (%s before filtering) doesn\'t match the expected quant %s.' %
                    (self.name, filtered_num_nodes, original_num_nodes, self.quant.raw_quant))
            else:
                raise SValidationError(
                    'Number of selected `%s` nodes %s doesn\'t match the expected quant %s.' %
                    (self.name, filtered_num_nodes, self.quant.raw_quant))

        for node in nodes:
            if self.visible:
                if self.raw_value is not None:
                    extracted = node.select(self.compiled_value).extract()
                    if self.callback:
                        extracted = [self._run_callback(v, context)
                                     for v in extracted]
                    result[self.name].extend(extracted)
                else:
                    if self.callback:
                        # NOTE(review): the callback's return value replaces
                        # `node`, so the children below are parsed against
                        # it - confirm this is intended.
                        node = self._run_callback(node, context)
                    result[self.name].append(node)

            if self.group is not None:
                # results of all the children go under a single entry
                groupd = defaultdict(list)
                for c in self.children:
                    for k, v in c._parse(node, context).items():
                        groupd[k].extend(v)
                result[self.group].append(groupd)
            else:
                for c in self.children:
                    for k, v in c._parse(node, context).items():
                        result[k].extend(v)
        return result

    def add_child(self, child):
        '''Append a nested S object to the children.'''
        self.children.append(child)

    @property
    def all_fields(self):
        '''Dictionary of the field names produced by this node and its
        descendants. Plain fields map to None, a group maps to the
        dictionary of its children's fields.'''
        result = {}
        if self.visible:
            result[self.name] = None
        children_fields = {}
        for c in self.children:
            children_fields.update(c.all_fields)
        if self.group is not None:
            result[self.group] = children_fields
        else:
            result.update(children_fields)
        return result

    @classmethod
    def absolute_url(cls, value, context):
        '''Useful callback method when parsing out urls: joins `value` with
        the base url of context['response'].'''
        response = context['response']
        return urlparse.urljoin(response.base_url, value)
def test_dig_2d(self):
    # 'min, max' quantity: counts 5..10 are expected to pass, the rest to fail
    quantity = Quantity('5, 10')
    accepted = [5, 6, 7, 8, 9, 10]
    rejected = [0, 1, 2, 3, 4, 11, 12, 13, -5, -10]
    self._test_good(quantity, accepted)
    self._test_bad(quantity, rejected)
def test_dig_1d(self):
    # single-number quantity: only that exact count is expected to pass
    quantity = Quantity('47')
    accepted = [47]
    rejected = [0, 1, -1, -47, 100]
    self._test_good(quantity, accepted)
    self._test_bad(quantity, rejected)
def test_ques(self):
    # '?' quantity: zero or one item is expected to pass
    quantity = Quantity('?')
    accepted = [0, 1]
    rejected = [-2, -1, 2, 3, 10, 100]
    self._test_good(quantity, accepted)
    self._test_bad(quantity, rejected)
def test_plus(self):
    # '+' quantity: at least one item is expected to pass
    quantity = Quantity('+')
    accepted = [1, 2, 5, 10, 1000, 2**30]
    rejected = [0, -1, -2]
    self._test_good(quantity, accepted)
    self._test_bad(quantity, rejected)
def test_star(self):
    # '*' quantity: any non-negative count is expected to pass
    quantity = Quantity('*')
    accepted = [0, 1, 2, 5, 10, 1000, 2**30]
    rejected = [-1, -2]
    self._test_good(quantity, accepted)
    self._test_bad(quantity, rejected)
def test_err(self):
    # non-integer counts are expected to be rejected with ValueError
    quantity = Quantity('*')
    for bad_count in ['0', [0], None, 'help']:
        self.assertRaises(ValueError, quantity.check_quantity, bad_count)
class S(object):
    '''Class S is an efficient way to select multiple, possibly nested, parts
    of the website, automatically validate their existence/quantity and parse
    them out.

    Parsed values are stored in a dictionary as
        result[S.name] = [parsed_items]
    Parsed values are either xpath nodes or extracted values if S.value is
    used.
    '''

    def __init__(self, name, xpath, quant='*', value=None, callback=None,
                 group=None, children=None):
        '''
        name - the parsed item will be stored in a dictionary on this index
            (not to store the item, use '_' as a prefix of the name).
        xpath - xpath to the item.
        quant - used for validation. The expected number of items to be
            parsed.
        value - xpath evaluated on each found item to extract its value.
            If not given, selector objects are returned.
        callback - callback function to be called on each found item.
            It can take a named argument "context", which is a dictionary
            containing additional values.
        group - if not None, all the child nodes will be stored under one
            dictionary entry of group's name.
        children - list of nested S objects. For each item found, each child
            will be called with the item as the selector.
        '''
        self.name = name
        self.xpath = xpath
        self.quant = Quantity(quant)
        self.value = value
        self.callback = callback
        self.group = group
        self.children = children if children is not None else []

    def get_nodes(self, name):
        '''Return the list of S nodes (self and all descendants) with the
        given name.'''
        result = []
        if self.name == name:
            result.append(self)
        for c in self.children:
            result += c.get_nodes(name)
        return result

    @property
    def visible(self):
        '''True if the node has a non-empty name not starting with '_'.'''
        return bool(self.name) and not self.name.startswith('_')

    def parse(self, selector, context=None):
        '''Select the items from `selector`, validate their quantity and
        return a defaultdict(list) with the parsed values of this node and
        of all its children.

        Raises SValidationError if the number of items doesn't match `quant`
        or if the callback raises.
        '''
        result = defaultdict(list)
        items = selector.select(self.xpath)
        num_items = len(items)
        if not self.quant.check_quantity(num_items):
            raise SValidationError(
                'Number of selected `%s` items %s doesn\'t match the expected quant %s.' %
                (self.name, num_items, self.quant.raw_quant))

        for item in items:
            if self.visible:
                if self.value is not None:
                    extracted = item.select(self.value).extract()
                    if self.callback is not None:
                        context_callback = wrap_context(self.callback, context)
                        try:
                            # a list comprehension (not map): the callbacks
                            # must run inside this try on Python 3 as well,
                            # where map() is lazy
                            extracted = [context_callback(v) for v in extracted]
                        except Exception as e:
                            raise SValidationError(
                                'Callback function returned an error on item `%s`: %s' %
                                (self.name, e))
                    result[self.name].extend(extracted)
                else:
                    if self.callback is not None:
                        context_callback = wrap_context(self.callback, context)
                        try:
                            # NOTE(review): the callback's return value
                            # replaces `item`, so the children below are
                            # parsed against it - confirm this is intended.
                            item = context_callback(item)
                        except Exception as e:
                            raise SValidationError(
                                'Callback function returned an error on item `%s`: %s' %
                                (self.name, e))
                    result[self.name].append(item)

            if self.group is not None:
                # results of all the children go under a single entry
                groupd = defaultdict(list)
                for c in self.children:
                    # .items() works on both Python 2 and 3
                    for k, v in c.parse(item, context).items():
                        groupd[k].extend(v)
                result[self.group].append(groupd)
            else:
                for c in self.children:
                    for k, v in c.parse(item, context).items():
                        result[k].extend(v)
        return result

    def xpath_exists(self, selector):
        '''Return True if the xpath selects at least one item.'''
        return len(selector.select(self.xpath)) >= 1

    def add_child(self, child):
        '''Append a nested S object to the children.'''
        self.children.append(child)

    @property
    def all_fields(self):
        '''Dictionary of the field names produced by this node and its
        descendants. Plain fields map to None, a group maps to the
        dictionary of its children's fields.'''
        result = {}
        if self.visible:
            result[self.name] = None
        children_fields = {}
        for c in self.children:
            children_fields.update(c.all_fields)
        if self.group is not None:
            result[self.group] = children_fields
        else:
            result.update(children_fields)
        return result