コード例 #1
0
    def parse_html_item(self, response, loop, fields):
        """Build and yield items from an HTML response.

        Every node selected by the ``loop`` XPath (``'(//*)[1]'`` when ``loop``
        is falsy) gets its own ``ItemLoader``.  ``fields`` maps a field name to
        a spec dict.  The spec names where the data comes from with one of the
        keys ``value``, ``css`` or ``xpath`` (looked up in that order) and may
        also carry ``parse``, ``regex``, ``default``, ``multi`` and ``opt``.

        A spec without a source key is logged and skipped.  If a field is
        still empty after the optional ``default`` was applied, and the spec
        sets neither ``multi`` nor ``opt``, a warning is logged and no item is
        yielded for that node.
        """
        meta = response.meta
        root = Selector(response)
        self.macro.update({'URL':response.url, 'keyword':meta.get('keyword', '')})

        for node in root.xpath(loop or '(//*)[1]'):

            loader = ItemLoader(item=Item(), selector=node)
            complete = True

            for name, spec in fields.iteritems():

                # First source key present in the spec wins.
                source = next((key for key in ('value', 'css', 'xpath') if key in spec), None)
                if source is None:
                    log.msg(u'field [{}] should contains "value", "xpath" or "css"'.format(name), level=log.WARNING)
                    continue

                # The loader getter shares its suffix with the source key.
                fetch = getattr(loader, 'get_' + source)
                expanded = self.macro.expand(spec.get(source), meta)
                processor = parser.make_parser(spec.get('parse', {}))
                val = fetch(expanded, processor, re=spec.get('regex'))

                if not val and 'default' in spec:
                    val = arg_to_iter(self.macro.expand(spec.get('default'), meta))

                # An empty mandatory field invalidates the whole item.
                if not val and not spec.get('multi') and not spec.get('opt'):
                    log.msg(u'field [{}] is empty:\n{}'.format(name, loader.load_item()), level=log.WARNING)
                    complete = False
                    break

                loader.add_value(name, val)

            if complete:
                yield loader.load_item()
コード例 #2
0
    def parse_json_item(self, response, loop, fields):
        """Build and yield items from a response that carries a JSON payload.

        The body is decoded with ``self.json_enc`` (``utf-8`` when the
        attribute is absent); bytes that cannot be decoded are dropped.  Only
        the text between the first opening and the last closing delimiter is
        parsed: square brackets when ``self.json_type`` is ``'list'``, curly
        braces otherwise.  Anything around that span is ignored.

        Every element selected by the ``loop`` JSONPath (``'$[]'`` when
        ``loop`` is falsy) yields at most one item.  ``fields`` maps a field
        name to a spec dict holding either ``value`` (a literal) or ``jpath``
        (a JSONPath evaluated against the element), plus optional ``parse``,
        ``default``, ``multi`` and ``opt``.

        A spec without ``value``/``jpath`` is logged and skipped.  If a field
        is still empty after the optional ``default`` was applied, and the
        spec sets neither ``multi`` nor ``opt``, a warning is logged and no
        item is yielded for that element.

        Raises:
            ValueError: if the body contains no delimited JSON payload, or the
                payload is not valid JSON.
        """
        meta = response.meta
        enc = getattr(self, 'json_enc', 'utf-8')
        txt = unicode(response.body, encoding=enc, errors='ignore')

        if getattr(self, 'json_type', None) == 'list':
            opener, closer = '[', ']'
        else:
            opener, closer = '{', '}'

        l, r = txt.find(opener), txt.rfind(closer)
        if l < 0 or r < l:
            # Fail with a meaningful message instead of handing an empty or
            # garbage slice to json.loads (which raises ValueError as well).
            raise ValueError(
                'no {}...{} JSON payload found in response from {}'.format(
                    opener, closer, response.url))
        obj = json.loads(txt[l:r+1])
        self.macro.update({'URL':response.url, 'keyword':meta.get('keyword', '')})

        # jsonpath reports "no match" as False, hence the `or []`.
        for e in jsonpath.jsonpath(obj, loop or '$[]') or []:

            item = Item()

            # NOTE(review): parse_html_item passes response.meta to
            # macro.expand, this method does not -- confirm that is intended.
            for k, v in fields.iteritems():
                if 'value' in v:
                    v_x = self.macro.expand(v.get('value'))
                elif 'jpath' in v:
                    v_x = jsonpath.jsonpath(e, self.macro.expand(v.get('jpath')))
                    # Identity check: only the False sentinel means "no match".
                    if v_x is False:
                        v_x = None
                else:
                    log.msg(u'field [{}] should contains "value" or "jpath"'.format(k), level=log.WARNING)
                    continue

                val = parser.make_parser(v.get('parse', {}))(v_x)

                if not val and 'default' in v:
                    val = self.macro.expand(v.get('default'))

                # An empty mandatory field invalidates the whole item.
                if not (val or v.get('multi') or v.get('opt')):
                    log.msg(u'field [{}] is empty:\n{}'.format(k, item), level=log.WARNING)
                    break

                item[k] = arg_to_iter(val)

            else:

                # Reached only when no field triggered the `break` above.
                yield item