Esempio n. 1
0
    def parse_item(self, response, loop, fields):
        """Yield one loaded item per node selected by ``loop`` in ``response``.

        ``fields`` maps an item field name to a spec mapping.  A spec names
        its source through a ``'value'`` or an ``'xpath'`` entry and may also
        carry ``'parse'``, ``'regex'``, ``'default'`` and ``'filter'``
        entries.  A spec with neither source is reported with a warning and
        skipped.  The item for a node is yielded only when no processed field
        was rejected by ``utils.filter_data``.
        """
        selector = HtmlXPathSelector(response)
        self.macro.update({'URL': response.url})

        # With a falsy ``loop`` the fallback expression '(//*)[1]' is used.
        for node in selector.select(loop or '(//*)[1]'):
            loader = XPathItemLoader(item=Item(), selector=node)
            rejected = False

            for name, spec in fields.iteritems():
                if 'value' in spec:
                    extract, source = loader.get_value, spec.get('value')
                elif 'xpath' in spec:
                    extract, source = loader.get_xpath, spec.get('xpath')
                else:
                    message = u'field [{}] should contains "value" or "xpath"'
                    log.msg(message.format(name), level=log.WARNING)
                    continue

                expanded = self.macro.expand(source)
                converter = utils.convert_type(spec.get('parse', {}))
                value = extract(expanded, converter, re=spec.get('regex'))

                # A falsy extraction result is replaced by the expanded
                # default, when the spec provides one.
                if not value:
                    if 'default' in spec:
                        value = self.macro.expand(spec.get('default'))

                if not utils.filter_data(spec.get('filter', {}), value):
                    # One rejected field discards the whole item.
                    rejected = True
                    break
                loader.add_value(name, value)

            if not rejected:
                yield loader.load_item()
Esempio n. 2
0
    def parse_item(self, response, loop, fields):
        """Extract items from an HTML response, one per node matched by ``loop``.

        For every entry of ``fields`` (field name -> spec mapping) the raw
        data comes from the spec's ``'value'`` or ``'xpath'`` entry; specs
        with neither are logged at warning level and ignored.  The optional
        ``'parse'``, ``'regex'``, ``'default'`` and ``'filter'`` entries
        drive conversion, regex extraction, fallback and acceptance.  A node
        produces an item only if ``utils.filter_data`` accepted every
        processed field.
        """
        root = HtmlXPathSelector(response)
        self.macro.update({'URL': response.url})
        nodes = root.select(loop or '(//*)[1]')

        for element in nodes:
            loader = XPathItemLoader(item=Item(), selector=element)

            for field, conf in fields.iteritems():
                # Pick the loader method that matches the spec's source kind.
                if 'value' in conf:
                    fetch = loader.get_value
                    source_key = 'value'
                elif 'xpath' in conf:
                    fetch = loader.get_xpath
                    source_key = 'xpath'
                else:
                    log.msg(
                        u'field [{}] should contains "value" or "xpath"'.format(field),
                        level=log.WARNING
                    )
                    continue

                target = self.macro.expand(conf.get(source_key))
                processor = utils.convert_type(conf.get('parse', {}))
                pattern = conf.get('regex')
                result = fetch(target, processor, re=pattern)

                if not result and 'default' in conf:
                    result = self.macro.expand(conf.get('default'))

                criteria = conf.get('filter', {})
                accepted = utils.filter_data(criteria, result)
                if not accepted:
                    # Leaving the loop early skips the ``else`` clause below,
                    # so the item is dropped.
                    break
                loader.add_value(field, result)
            else:
                yield loader.load_item()
Esempio n. 3
0
    def parse_item(self, response, loop, fields):
        """Yield items built from the JSON payload found in ``response.body``.

        The body is converted with ``utils.to_unicode`` and sliced from the
        first opening delimiter to the last closing one (``[``/``]`` when
        ``self.json_type`` is ``'list'``, ``{``/``}`` otherwise) before being
        decoded.  ``loop`` is a JSONPath expression selecting the elements to
        turn into items; ``'$[]'`` is used when it is falsy.

        ``fields`` maps an item field name to a spec mapping.  A spec names
        its source through a ``'value'`` or a ``'jpath'`` entry and may also
        carry ``'parse'``, ``'default'`` and ``'filter'`` entries.  A spec
        with neither source is reported with a warning and skipped.  An item
        is yielded only when no processed field was rejected by
        ``utils.filter_data``.

        Any exception raised while decoding or building items is caught and
        logged at error level with the response URL and the traceback; the
        generator then ends without raising.
        """
        try:
            txt = utils.to_unicode(response.body)
            if hasattr(self, 'json_type') and self.json_type == 'list':
                start, end = txt.find('['), txt.rfind(']')
            else:
                start, end = txt.find('{'), txt.rfind('}')
            # Anything before the first opening delimiter or after the last
            # closing one is discarded before decoding.
            obj = json.loads(txt[start:end + 1])
            self.macro.update({'URL': response.url})

            # ``or []`` turns a falsy jsonpath result into an empty loop.
            for e in jsonpath.jsonpath(obj, loop or '$[]') or []:
                item = Item()

                for k, v in fields.iteritems():
                    if 'value' in v:
                        v_x = v.get('value')
                    elif 'jpath' in v:
                        v_x = jsonpath.jsonpath(
                            e, self.macro.expand(v.get('jpath')))
                    else:
                        log.msg(
                            u'field [{}] should contains "value" or "jpath"'.
                            format(k),
                            level=log.WARNING)
                        continue

                    convert = utils.convert_type(v.get('parse', {}))
                    val = convert(self.macro.expand(v_x))

                    if not val and 'default' in v:
                        val = self.macro.expand(v.get('default'))

                    qry = v.get('filter', {})
                    if utils.filter_data(qry, val):
                        item[k] = arg_to_iter(val)
                    else:
                        # One rejected field discards the whole item: the
                        # break skips the ``else`` clause below.
                        break
                else:
                    yield item

        except Exception:
            # Best-effort parsing: the failure is reported rather than
            # propagated, at error level so that it is not mistaken for
            # routine output.
            log.msg(u'{}\n{}'.format(response.url, traceback.format_exc()),
                    level=log.ERROR)
Esempio n. 4
0
    def parse_item(self, response, loop, fields):
        """Build and yield items from a JSON response.

        The text returned by ``utils.to_unicode(response.body)`` is cut down
        to the span between the first opening and the last closing delimiter
        (square brackets when ``self.json_type`` equals ``'list'``, curly
        braces otherwise) and decoded with ``json.loads``.  Each element
        matched by the JSONPath expression ``loop`` (``'$[]'`` when ``loop``
        is falsy) becomes a candidate item.

        ``fields`` maps field names to spec mappings.  Each spec provides its
        data via ``'value'`` or ``'jpath'``; specs with neither are logged at
        warning level and ignored.  ``'parse'``, ``'default'`` and
        ``'filter'`` are optional.  A candidate item is yielded only if
        ``utils.filter_data`` accepted every processed field.

        Exceptions raised anywhere in the process are caught and logged at
        error level together with the response URL; nothing is re-raised.
        """
        try:
            txt = utils.to_unicode(response.body)
            if hasattr(self, 'json_type') and self.json_type == 'list':
                first, last = txt.find('['), txt.rfind(']')
            else:
                first, last = txt.find('{'), txt.rfind('}')
            # Text surrounding the outermost delimiters is dropped.
            obj = json.loads(txt[first:last + 1])
            self.macro.update({'URL': response.url})

            # A falsy jsonpath result is replaced by an empty list.
            for e in jsonpath.jsonpath(obj, loop or '$[]') or []:
                item = Item()

                for k, v in fields.iteritems():
                    if 'value' in v:
                        v_x = v.get('value')
                    elif 'jpath' in v:
                        v_x = jsonpath.jsonpath(e, self.macro.expand(v.get('jpath')))
                    else:
                        log.msg(u'field [{}] should contains "value" or "jpath"'.format(k), level=log.WARNING)
                        continue

                    val = utils.convert_type(v.get('parse', {}))(self.macro.expand(v_x))

                    if not val and 'default' in v:
                        val = self.macro.expand(v.get('default'))

                    qry = v.get('filter', {})
                    if utils.filter_data(qry, val):
                        item[k] = arg_to_iter(val)
                    else:
                        # A rejected field drops the item: the break bypasses
                        # the ``else`` clause that yields it.
                        break
                else:
                    yield item

        except Exception:
            # Parsing is best-effort, so the error is logged instead of
            # raised; error level keeps the traceback visible.
            log.msg(u'{}\n{}'.format(response.url, traceback.format_exc()), level=log.ERROR)