Example #1
0
        def filter_link(link):
            """Run ``link`` through the Redis-backed duplicate filter.

            The URL is added to the dupefilter set (named by the
            ``DUPEFILTER_LINK_KEY`` setting, falling back to
            ``self.DUPEFILTER_LINK_KEY``). A URL that was already in that set
            is still accepted when it is a member of the set named by the
            ``FORBIDDEN_KEY`` setting; otherwise the duplicate is logged.

            :param link: object exposing the target URL as ``link.url``.
            :return: the truthy ``sadd`` result when the URL is new, ``True``
                when it is a duplicate that is listed in the forbidden set,
                and the falsy ``sadd`` result for any other duplicate.
            """
            # Default to None so an absent setting falls through to the
            # class-level key instead of raising AttributeError.
            key = getattr(settings, 'DUPEFILTER_LINK_KEY', None) or self.DUPEFILTER_LINK_KEY
            added = r.sadd(key, link.url)
            if added:
                return added

            # NOTE(review): presumably the forbidden set holds URLs that must
            # be let through again despite being duplicates - confirm.
            forbidden_key = getattr(settings, 'FORBIDDEN_KEY', None)
            # SISMEMBER tests membership on the server: no need to transfer
            # the whole set, and no bytes-vs-str mismatch on the client side.
            if forbidden_key and self.server.sismember(forbidden_key, link.url):
                return True

            log.msg(message='(spider:dupefilter:link) - %s' % link.url,
                    level=log.DEBUG)

            return added
Example #2
0
    def parse(self, response):
        """Yield group and membership items found on a listing page.

        For every ``<li class="">`` node, the title link's ``href`` and
        ``title`` plus the text of the ``span.num`` node are read. A
        ``GroupItem`` is yielded only when the group page is newly added to
        the ``<spider name>_group`` set; a ``JoinsItem`` tying the people
        URL to the group page is yielded for every node.

        :param response: page response exposing ``url`` and ``xpath()``.
        :raises IndexError: if a node lacks the title link or the num span.
        """
        # The people URL is the response URL minus its last len('joins')
        # characters (the slice is applied unconditionally).
        suffix_length = len('joins')
        people = response.url[:-suffix_length]

        for entry in response.xpath('//li[@class=""]'):
            title_anchor = './/div[@class="title"]/a'
            hrefs = entry.xpath(title_anchor + '/@href').extract()
            page = hrefs[0]
            titles = entry.xpath(title_anchor + '/@title').extract()
            name = titles[0]
            # Drop the first and last characters of the counter text
            # (presumably surrounding brackets - confirm against the page).
            counter_texts = entry.xpath('.//span[@class="num"]/text()').extract()
            num = counter_texts[0][1:-1]

            # sadd is truthy only the first time this page is recorded.
            if r.sadd(self.name + '_group', page):
                group_fields = {
                    'page': page,
                    'name': name,
                    'num': int(num),
                }
                yield GroupItem(group_fields)

            membership_fields = {
                'people': people,
                'group': page
            }
            yield JoinsItem(membership_fields)