def parse_item(self, response, loop, fields):
    """Yield one loaded item per node selected by ``loop`` in an HTML response.

    ``response`` is wrapped in an ``HtmlXPathSelector`` and its URL is stored
    in ``self.macro`` under the ``'URL'`` key.  ``loop`` is the XPath whose
    matches are iterated; when it is falsy the single node ``(//*)[1]`` is
    used instead.  ``fields`` maps a field name to a spec dict that reads:

    * ``value`` or ``xpath`` -- the source expression (``value`` is checked
      first); it is passed through ``self.macro.expand`` before extraction.
    * ``parse`` -- handed to ``utils.convert_type``; the result is passed to
      the loader as a positional argument (presumably a processor -- confirm
      against ``utils``).
    * ``regex`` -- forwarded to the loader as ``re``.
    * ``default`` -- macro-expanded and used when the extracted value is falsy.
    * ``filter`` -- query given to ``utils.filter_data`` with the value.

    A spec with neither ``value`` nor ``xpath`` is logged at WARNING level and
    skipped.  As soon as one field fails ``utils.filter_data`` the remaining
    fields are not processed and nothing is yielded for that node.
    """
    selector = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})

    for node in selector.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=node)
        rejected = False

        for name, spec in fields.iteritems():
            if 'value' in spec:
                extract, source = loader.get_value, spec.get('value')
            elif 'xpath' in spec:
                extract, source = loader.get_xpath, spec.get('xpath')
            else:
                message = u'field [{}] should contains "value" or "xpath"'.format(name)
                log.msg(message, level=log.WARNING)
                continue

            value = extract(
                self.macro.expand(source),
                utils.convert_type(spec.get('parse', {})),
                re=spec.get('regex'),
            )
            if not value and 'default' in spec:
                value = self.macro.expand(spec.get('default'))

            if not utils.filter_data(spec.get('filter', {}), value):
                # One failing field discards the whole node.
                rejected = True
                break
            loader.add_value(name, value)

        if not rejected:
            yield loader.load_item()
def parse_item(self, response, loop, fields):
    """Extract items from an HTML response, one per node matched by ``loop``.

    The response is wrapped in an ``HtmlXPathSelector`` and ``response.url``
    is recorded in ``self.macro`` as ``'URL'``.  Nodes are selected with the
    ``loop`` XPath, or with ``(//*)[1]`` when ``loop`` is falsy.

    Each entry of ``fields`` is ``name -> spec`` where ``spec`` is a dict.
    The source expression comes from ``spec['value']`` (checked first) or
    ``spec['xpath']``; a spec with neither is logged at WARNING level and
    skipped.  The expression is macro-expanded and extracted through the
    loader together with ``utils.convert_type(spec.get('parse', {}))`` and
    ``re=spec.get('regex')``.  A falsy result is replaced by the
    macro-expanded ``spec['default']`` when that key exists.

    The value is added to the loader only if
    ``utils.filter_data(spec.get('filter', {}), value)`` is truthy; otherwise
    the node is abandoned.  The loaded item is yielded only when the field
    loop finishes without being abandoned.
    """
    root = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})

    for row in root.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=row)

        for field, conf in fields.iteritems():
            if 'value' in conf:
                fetch = loader.get_value
                expr = conf.get('value')
            elif 'xpath' in conf:
                fetch = loader.get_xpath
                expr = conf.get('xpath')
            else:
                log.msg(
                    u'field [{}] should contains "value" or "xpath"'.format(field),
                    level=log.WARNING,
                )
                continue

            # Evaluate the arguments in the same order as a single call would.
            expanded = self.macro.expand(expr)
            processor = utils.convert_type(conf.get('parse', {}))
            pattern = conf.get('regex')
            result = fetch(expanded, processor, re=pattern)

            if not result and 'default' in conf:
                result = self.macro.expand(conf.get('default'))

            criteria = conf.get('filter', {})
            if not utils.filter_data(criteria, result):
                break
            loader.add_value(field, result)
        else:
            # Reached only when no field triggered the ``break`` above.
            yield loader.load_item()
def parse_item(self, response, loop, fields):
    """Yield one ``Item`` per element matched by ``loop`` in a JSON response.

    The body is converted with ``utils.to_unicode`` and the JSON payload is
    taken as the span from the first opening bracket to the last closing
    bracket -- ``[``/``]`` when ``self.json_type`` is ``'list'``, ``{``/``}``
    otherwise -- so text around the payload is ignored.  ``response.url`` is
    stored in ``self.macro`` as ``'URL'``.

    ``loop`` is the JSONPath whose matches are iterated (``'$[]'`` when
    falsy); no match yields nothing.  ``fields`` maps a field name to a spec
    dict that reads:

    * ``value`` (checked first) or ``jpath`` -- a literal, or a JSONPath that
      is macro-expanded and evaluated against the current element.
    * ``parse`` -- handed to ``utils.convert_type``, whose result is called
      on the macro-expanded source.
    * ``default`` -- macro-expanded and used when the converted value is
      falsy.
    * ``filter`` -- query given to ``utils.filter_data`` with the value.

    A spec with neither ``value`` nor ``jpath`` is logged at WARNING level and
    skipped.  The first field failing ``utils.filter_data`` discards the
    element.  Accepted values are stored as ``arg_to_iter(value)``.

    Any ``Exception`` raised along the way is not propagated: it is logged at
    ERROR level with the response URL and traceback, and the generator ends.
    """
    try:
        text = utils.to_unicode(response.body)
        if getattr(self, 'json_type', None) == 'list':
            start, end = text.find('['), text.rfind(']')
        else:
            start, end = text.find('{'), text.rfind('}')
        obj = json.loads(text[start:end + 1])

        self.macro.update({'URL': response.url})

        for element in jsonpath.jsonpath(obj, loop or '$[]') or []:
            item = Item()
            for name, spec in fields.iteritems():
                if 'value' in spec:
                    source = spec.get('value')
                elif 'jpath' in spec:
                    source = jsonpath.jsonpath(
                        element, self.macro.expand(spec.get('jpath')))
                else:
                    log.msg(
                        u'field [{}] should contains "value" or "jpath"'.format(name),
                        level=log.WARNING)
                    continue

                convert = utils.convert_type(spec.get('parse', {}))
                value = convert(self.macro.expand(source))
                if not value and 'default' in spec:
                    value = self.macro.expand(spec.get('default'))

                if not utils.filter_data(spec.get('filter', {}), value):
                    break
                item[name] = arg_to_iter(value)
            else:
                # Only emitted when no field was rejected by its filter.
                yield item
    except Exception:
        # Deliberate best-effort: a bad response must not kill the crawl, but
        # the failure is reported as an error rather than at the default level.
        log.msg(u'{}\n{}'.format(response.url, traceback.format_exc()),
                level=log.ERROR)
def parse_item(self, response, loop, fields):
    """Extract items from a JSON response, one per element matched by ``loop``.

    Steps, all inside a single best-effort ``try``:

    1. Convert ``response.body`` with ``utils.to_unicode`` and decode the
       substring running from the first ``[`` to the last ``]`` (when
       ``self.json_type == 'list'``) or from the first ``{`` to the last
       ``}`` (otherwise) with ``json.loads``.
    2. Record ``response.url`` in ``self.macro`` under ``'URL'``.
    3. Iterate the matches of the ``loop`` JSONPath (``'$[]'`` when ``loop``
       is falsy; a falsy JSONPath result is treated as no matches).
    4. For every ``name -> spec`` in ``fields``: take ``spec['value']`` or,
       failing that, the result of the macro-expanded ``spec['jpath']``
       evaluated on the element; macro-expand it and pass it to the callable
       returned by ``utils.convert_type(spec.get('parse', {}))``; fall back
       to the macro-expanded ``spec['default']`` when the result is falsy.
       Specs with neither key are logged at WARNING level and skipped.
    5. Keep the value as ``arg_to_iter(value)`` if
       ``utils.filter_data(spec.get('filter', {}), value)`` is truthy;
       otherwise drop the whole element.

    An element's ``Item`` is yielded only when none of its fields was
    dropped.  If any ``Exception`` is raised, the URL and traceback are
    logged at ERROR level and iteration stops without re-raising.
    """
    try:
        body = utils.to_unicode(response.body)
        wants_list = getattr(self, 'json_type', None) == 'list'
        if wants_list:
            first, last = body.find('['), body.rfind(']')
        else:
            first, last = body.find('{'), body.rfind('}')
        data = json.loads(body[first:last + 1])

        self.macro.update({'URL': response.url})

        matches = jsonpath.jsonpath(data, loop or '$[]') or []
        for entry in matches:
            item = Item()
            for key, conf in fields.iteritems():
                if 'value' in conf:
                    raw = conf.get('value')
                elif 'jpath' in conf:
                    path = self.macro.expand(conf.get('jpath'))
                    raw = jsonpath.jsonpath(entry, path)
                else:
                    warning = u'field [{}] should contains "value" or "jpath"'.format(key)
                    log.msg(warning, level=log.WARNING)
                    continue

                converter = utils.convert_type(conf.get('parse', {}))
                val = converter(self.macro.expand(raw))
                if not val and 'default' in conf:
                    val = self.macro.expand(conf.get('default'))

                query = conf.get('filter', {})
                if utils.filter_data(query, val):
                    item[key] = arg_to_iter(val)
                else:
                    break
            else:
                # No field hit ``break``: the element passed every filter.
                yield item
    except Exception:
        # Keep the crawl alive on malformed responses, but surface the
        # traceback at ERROR level instead of the logger's default level.
        log.msg(u'{}\n{}'.format(response.url, traceback.format_exc()),
                level=log.ERROR)