Example #1
0
    def parse_art(self, response):
        """
        parse_art
            Extract the data for a single art work.
            ('url', 'title', 'image', 'height', 'width', 'description') are
            single-valued; ('artist', 'path') may hold multiple values.
        """
        art = Item()
        loader = ItemLoader(item=art, response=response)

        # Single-valued fields keep only the first extracted value.
        for field_name in ('url', 'title', 'image', 'height', 'width',
                           'description'):
            art.fields[field_name] = Field(output_processor=TakeFirst())
        # Multi-valued fields: plain Field, no output processor.
        art.fields['artist'] = Field()
        art.fields['path'] = Field()

        loader.add_value('url', response.meta['url'])
        loader.add_xpath('artist', '//*[@id="content"]/h2/text()')
        loader.add_xpath('title', '//*[@id="content"]/h1/text()')
        loader.add_xpath('image', '//*[@id="body"]/img/@src',
                         MapCompose(lambda x: urljoin(self.base_url, x)))
        # Height and width both live in the same <dd>; the helper splits them.
        for dimension in ('height', 'width'):
            loader.add_xpath(
                dimension, '//*[@id="content"]/dl/dd[3]/text()',
                MapCompose(lambda x, d=dimension:
                           self.extract_physical_dimension(x, type=d)))
        loader.add_xpath('description', '//*[@id="content"]/div/p/text()')
        loader.add_value('path', response.meta['browse_path'])
        return loader.load_item()
def test_hs_ext_item_scraped(hs_ext):
    """A scraped Item must be written exactly once, serialized with its type name."""
    hs_ext._write_item = mock.Mock()
    hs_ext.item_scraped(Item(), Spider('test'))
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'}, )
    def test_process_spider_output_stats_legacy(self):
        # testing the subclass not handling stats works at runtime
        # (i.e. that trying to update stats does not trigger exception)
        class LegacyDeltaFetchSubClass(self.mwcls):
            # Mimics middleware written before stats support existed: its
            # __init__ swallows extra args and never forwards a stats object.
            def __init__(self, dir, reset=False, *args, **kwargs):
                super(LegacyDeltaFetchSubClass, self).__init__(dir=dir,
                                                               reset=reset)
                self.something = True

        self._create_test_db()
        mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
        mw.spider_opened(self.spider)
        response = mock.Mock()
        response.request = Request('http://url',
                                   meta={'deltafetch_key': 'key'})
        # Empty result passes through unchanged and no stats appear.
        result = []
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)), [])
        self.assertEqual(self.stats.get_stats(), {})
        # NOTE(review): only the second request is dropped below, so
        # 'test_key_1' is presumably pre-seeded by _create_test_db — confirm
        # against that fixture.
        result = [
            Request('http://url', meta={'deltafetch_key': 'key'}),
            Request('http://url1', meta={'deltafetch_key': 'test_key_1'})
        ]

        # stats should not be updated
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            [result[0]])
        self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)

        # Items and arbitrary objects pass through untouched, again without
        # any 'stored' stat being recorded by the legacy subclass.
        result = [Item(), "not a base item"]
        self.assertEqual(
            list(mw.process_spider_output(response, result, self.spider)),
            result)
        self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
Example #4
0
    def test_missmatched_wildcard(self):
        """Feeding with a mismatched wildcard path still nests the outputs."""
        first_val, second_val = 1, 2
        item = Item()
        args = ([(self.makeitem({'t': first_val}), ),
                 (self.makeitem({'t': second_val}), )], )

        # Single-segment path: item['a'] keeps the raw one-element tuples.
        feed(item, 'a', args, 'a__')
        self.assertEqual(isinstance(item['a'], list), True)
        self.assertEqual(len(item['a']), 2)
        for entry in item['a']:
            self.assertEqual(isinstance(entry, tuple), True)
            self.assertEqual(isinstance(entry[0], Item), True)
        self.assertEqual(item['a'][0][0]['t'], first_val)
        self.assertEqual(item['a'][1][0]['t'], second_val)

        # Two-segment path: each element becomes an Item wrapping a sub-Item.
        feed(item, 'b__c', args, 'a__')
        self.assertEqual(isinstance(item['b'], list), True)
        self.assertEqual(len(item['b']), 2)
        for entry in item['b']:
            self.assertEqual(isinstance(entry, Item), True)
            self.assertEqual(isinstance(entry['c'], Item), True)
        self.assertEqual(item['b'][0]['c']['t'], first_val)
        self.assertEqual(item['b'][1]['c']['t'], second_val)
Example #5
0
    def parse_item(self, response):
        """Build an Item whose fields are declared by the spider's config."""
        item = Item()
        loader = ItemLoader(item=item, response=response)

        for attribute in self.config["attributes"]:
            field_name = attribute["name"]
            item.fields[field_name] = scrapy.Field()

            # Optional value processors named in the config entry.
            processors = []
            for processor_name in attribute.get("processors", []):
                if processor_name == "join":
                    processors.append(Join())
                elif processor_name == "strip":
                    processors.append(MapCompose(str.strip))

            extra = {}
            if "regex" in attribute:
                extra["re"] = attribute["regex"]

            loader.add_css(field_name, attribute["selector"],
                           *processors, **extra)

        # The page URL is always recorded alongside the configured fields.
        item.fields["url"] = scrapy.Field()
        loader.add_value("url", response.url)

        return loader.load_item()
Example #6
0
 def generate_item(self, data, refer_id):
     """Convert the dict *data* into an Item, then pad it via padding_item."""
     item = Item()
     for field_name, field_value in data.items():
         # Declare each field on the fly so arbitrary keys are accepted.
         item.fields[field_name] = Field()
         item[field_name] = field_value
     return self.padding_item(item, refer_id)
Example #7
0
    def check_buzzwords(self, response):
        """Count occurrences of the buzzword list in the response body.

        Each occurrence increments the class-wide ``words_found`` counter and
        is printed as ``word;url;``.  Returns an empty Item so the pipeline
        receives something iterable.

        Fixes: removed the dead ``ok = False`` / ``if not ok`` guard (always
        true) and the unused ``crawl_count`` / ``contenttype`` locals.
        """
        self.__class__.crawl_count += 1

        ##### Change the words  ##########
        wordlist = [
            "phone",
            "hotel",
            "reservation",
            "booked",
        ]

        url = response.url
        data = response.body.decode('utf-8')

        for word in wordlist:
            # One hit is recorded per occurrence, not per distinct word.
            for _pos in find_all_substrings(data, word):
                self.__class__.words_found += 1
                print(word + ";" + url + ";")

        return Item()
    def check_buzzwords(self, response):
        """Count occurrences of the buzzword list in the response body.

        Each occurrence increments the class-wide ``words_found`` counter and
        is printed as ``word;url``.  Returns an empty Item so the pipeline
        receives something iterable.

        Fixes: removed the dead ``ok = False`` / ``if not ok`` guard (always
        true), the unused ``crawl_count`` / ``contenttype`` locals, and the
        no-op ``+ ""`` string concatenation.
        """
        self.__class__.crawl_count += 1

        wordlist = [
            "Creating",
            "Deploy",
            "COZMO",
        ]

        url = response.url
        data = response.body.decode('utf-8')

        for word in wordlist:
            # One hit is recorded per occurrence, not per distinct word.
            for _pos in find_all_substrings(data, word):
                self.__class__.words_found += 1
                print(word + ";" + url)
        return Item()
Example #9
0
    def parse_item(self, response, loop, fields):
        """Yield one loaded item per node matched by *loop*, extraction
        driven by the *fields* config mapping.

        Each field spec must provide either a literal ``value`` or an
        ``xpath``.  Optional keys: ``parse`` (type conversion spec),
        ``regex``, ``default`` (fallback when extraction is empty) and
        ``filter`` (a query that, when unsatisfied, drops the whole item).
        """
        hxs = HtmlXPathSelector(response)
        # Expose the current URL to macro expansion inside field specs.
        self.macro.update({'URL': response.url})

        # '(//*)[1]' selects just the document root when no loop is given.
        for e in hxs.select(loop or '(//*)[1]'):
            loader = XPathItemLoader(item=Item(), selector=e)

            for k, v in fields.iteritems():  # Python 2 dict iteration
                if 'value' in v:
                    get_v_x = loader.get_value
                    v_x = v.get('value')
                elif 'xpath' in v:
                    get_v_x = loader.get_xpath
                    v_x = v.get('xpath')
                else:
                    # Misconfigured spec: warn and skip just this field.
                    log.msg(u'field [{}] should contains "value" or "xpath"'.
                            format(k),
                            level=log.WARNING)
                    continue

                val = get_v_x(self.macro.expand(v_x),
                              utils.convert_type(v.get('parse', {})),
                              re=v.get('regex'))

                # Fall back to the configured default when nothing matched.
                if not val and 'default' in v:
                    val = self.macro.expand(v.get('default'))

                qry = v.get('filter', {})
                if utils.filter_data(qry, val):
                    loader.add_value(k, val)
                else:
                    # Filter failed: abandon this node entirely.
                    break
            else:
                # Reached only when no field broke out, i.e. all filters passed.
                yield loader.load_item()
Example #10
0
 def parse(self, response):
     """Populate an Item from the field->xpath mapping carried in meta."""
     item = Item()
     loader = ItemLoader(item=item, response=response)
     for field_name, xpath_expr in response.meta['fields'].items():
         # Entries with an empty xpath are skipped entirely.
         if xpath_expr:
             item.fields[field_name] = Field()
             loader.add_xpath(field_name, xpath_expr)
     return loader.load_item()
Example #11
0
    def test_isinstance_check(self):
        """Every Item flavour satisfies both the public BaseItem check and
        the private _BaseItem check used internally."""
        class SubclassedBaseItem(BaseItem):
            pass

        class SubclassedItem(Item):
            pass

        for item_cls in (BaseItem, SubclassedBaseItem, Item, SubclassedItem):
            self.assertTrue(isinstance(item_cls(), BaseItem))
            # make sure internal checks using private _BaseItem class succeed
            self.assertTrue(isinstance(item_cls(), _BaseItem))
Example #12
0
 def parse(self, response):
     """Build an Item from the field->xpath mapping passed in response.meta,
     title-casing and stripping each extracted value."""
     item = Item()
     l = ItemLoader(item=item, response=response)
     for name, xpath in response.meta['fields'].iteritems():  # Python 2 dict iteration
         if xpath:
             # Dynamically declare the new field before loading it
             item.fields[name] = Field()
             l.add_xpath(name, xpath, MapCompose(unicode.strip, unicode.title))
     return l.load_item()
Example #13
0
    def parse_node(self, response, node):
        """Turn one <itertag> XML node into an Item with id/name/description."""
        log.msg('Hi, this is a <%s> node!: %s' %
                (self.itertag, ''.join(node.extract())))

        record = Item()
        # Every field is selected relative to the current node.
        for field_name, selector in (('id', '@id'),
                                     ('name', 'name'),
                                     ('description', 'description')):
            record[field_name] = node.select(selector).extract()
        return record
Example #14
0
    def test_no_deprecation_warning(self):
        """
        Make sure deprecation warnings are NOT logged whenever BaseItem subclasses are used.
        """
        class SubclassedItem(Item):
            pass

        with catch_warnings(record=True) as captured:
            # Instantiation alone must stay warning-free.
            Item()
            SubclassedItem()
            _BaseItem()
            # Negative and positive isinstance checks are equally silent.
            for cls in (_BaseItem, Item, SubclassedItem):
                self.assertFalse(isinstance("foo", cls))
            self.assertTrue(isinstance(_BaseItem(), _BaseItem))
            self.assertTrue(isinstance(Item(), Item))
            self.assertTrue(isinstance(SubclassedItem(), SubclassedItem))
            self.assertEqual(len(captured), 0)
Example #15
0
 def parse_item(self, response):
   """Collect one Item per <tt class="i-emp"> element on the page."""
   selector = Selector(response)
   collected = []
   for snippet in selector.xpath('//tt[@class="i-emp"]').extract():
     entry = Item()
     entry['title'] = snippet
     collected.append(entry)
   return collected
    def test_iterate_spider_output(self):
        """iterate_spider_output wraps single results, passes lists through."""
        item = Item()
        request = Request('http://scrapytest.org')
        other = object()

        # A lone value of any kind becomes a one-element iterable.
        for value in (item, request, other):
            self.assertEqual(list(iterate_spider_output(value)), [value])
        # A list is yielded element by element, order preserved.
        self.assertEqual(list(iterate_spider_output([request, item, other])),
                         [request, item, other])
 def find_words(self, response):
     """Report and retire each watched word found in the response body.

     Prints ``word:url`` for every watched word present in the page and
     removes it from ``self.words`` so it is reported at most once.

     Bug fixed: the original iterated ``self.words`` directly while calling
     ``self.words.remove(w)`` inside the loop, which skips the element
     following every match.  Iterating over a snapshot scans every word.
     """
     url = response.url
     data = response.body.decode("utf-8")
     # Iterate a copy: self.words is mutated inside the loop.
     for word in list(self.words):
         if word in data:
             print(word + ":" + url)
             self.words.remove(word)
     return Item()
Example #18
0
    def test_drop_item_as_single_value(self):
        """A DropItem fed onto an existing path resets that entry to None."""
        value = 1
        item = Item()
        output_pack = ((value, ), )
        feed(item, 'a__b', output_pack, 'e__f')
        # Fixed: the original asserted `isinstance(...) == value`, which only
        # passed because value happened to equal 1 (1 == True).  Compare the
        # type check against True explicitly.
        self.assertEqual(isinstance(item['a'], Item), True)
        self.assertEqual(item['a']['b'], value)

        # Feeding DropItem into a sibling key wipes the parent entry.
        feed(item, 'a__c', ((DropItem(), ), ), 'e__g')
        self.assertEqual(item['a'], None)
Example #19
0
    def test_isinstance_check(self):
        """Every Item flavour satisfies both the public BaseItem check and
        the private _BaseItem check used internally."""
        class SubclassedBaseItem(BaseItem):
            pass

        class SubclassedItem(Item):
            pass

        with catch_warnings():
            # Instantiating BaseItem variants emits deprecation warnings
            # in this Scrapy version; silence them for the duration.
            filterwarnings("ignore", category=ScrapyDeprecationWarning)
            for item_cls in (BaseItem, SubclassedBaseItem, Item,
                             SubclassedItem):
                self.assertTrue(isinstance(item_cls(), BaseItem))
                # make sure internal checks using private _BaseItem succeed
                self.assertTrue(isinstance(item_cls(), _BaseItem))
Example #20
0
    def parse(self, response):
        """Parse the page and return a single Item recording the page URL.

        Fixes: ``lxml.html`` has no ``fromhtml`` function — the document is
        parsed with ``lxml.html.fromstring`` and fed the response *body*
        rather than the response object; and the ``url`` field now stores
        the URL string instead of the whole response object.
        """
        # NOTE(review): the parsed tree is unused beyond validating that the
        # body is parseable HTML — confirm whether extraction was intended.
        root = lxml.html.fromstring(response.body)

        items = []

        item = Item()
        item['url'] = response.url
        items.append(item)

        return items
Example #21
0
def copy_item(from_item, to_item=None):
    """Copy every key/value of *from_item* onto *to_item* and return it.

    When *to_item* is omitted a fresh Item is created.  If the source has a
    ``fields`` attribute the destination adopts that same mapping (note:
    this aliases the dict — later field declarations mutate both items).
    Keys not yet present on the destination get a placeholder field
    declaration before assignment.
    """
    target = Item() if to_item is None else to_item
    if hasattr(from_item, 'fields'):
        target.fields = from_item.fields
    for key, value in from_item.items():
        if key not in target:
            # Declare the field so item-style assignment is accepted.
            target.fields[key] = {}
        target[key] = value
    return target
Example #22
0
 def test_dictitem_deprecation_warning(self):
     """Make sure the DictItem deprecation warning is not issued for
     Item or for Item subclasses."""
     with catch_warnings(record=True) as captured:
         # Plain Item first...
         _ = Item()
         self.assertEqual(len(captured), 0)

         # ...then a subclass defined and instantiated inside the guard.
         class SubclassedItem(Item):
             pass

         _ = SubclassedItem()
         self.assertEqual(len(captured), 0)
Example #23
0
    def parse_item(self, response):
        """Scrape id, name and description from a single item page."""
        self.log('Hi, this is an item page! %s' % response.url)

        selector = HtmlXPathSelector(response)
        item = Item()
        # The numeric id is pulled out of the "ID: <n>" label text.
        item['id'] = selector.select(
            '//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = selector.select(
            '//td[@id="item_name"]/text()').extract()
        item['description'] = selector.select(
            '//td[@id="item_description"]/text()').extract()
        return item
Example #24
0
    def parse(self, response):
        '''
            I'm highjacking this, so I can do my own thing with it.
        '''
        # Smoke-test log lines only; they carry no scraping logic.
        self.logger.info("Bacon")
        self.logger.info("Bannanas")
        self.logger.info("Spinach")

        # NOTE(review): credentials are masked in this snippet; the real
        # connection string should come from configuration, not source code.
        self.mongoClient = MongoClient(
            "mongodb://*****:*****@ds147975.mlab.com:47975/craigslist")

        # Probe the connection by inserting a trivial document.
        self.db = self.mongoClient.get_default_database()
        self.db["test"].insert_one({"a": 0})

        # Scrapy expects an iterable of items; return one empty Item.
        return [Item()]
Example #25
0
    def parse(self, response, node):
        """
        Parse the current response object, and return any Item and/or Request objects
        """
        log.msg("SCRAPING '%s'" % response.url)

        # Emit the scraped data as an Item (or DjangoItem under django-celery).
        item = Item()
        item['some_value'] = "important value"
        yield item

        # Queue up the next URL from the spider's URL generator.
        yield Request(self.url.next())
Example #26
0
    def parse_item(self, response):
        """Populate an item from the name->xpath pairs carried in meta.

        The first row of the CSV file supplied the field names and later
        rows the xpaths; both arrive here via response.meta['fields'].
        """
        item = Item()
        loader = ItemLoader(item=item, response=response)

        for field_name, xpath_expr in response.meta['fields'].items():
            if not xpath_expr:
                continue
            # Declare the field on the fly, then let the loader extract it.
            item.fields[field_name] = Field()
            loader.add_xpath(field_name, xpath_expr)

        # Populate and return the item holding the final response data.
        return loader.load_item()
Example #27
0
    def test_drop_item_in_lists(self):
        """A DropItem inside a fed list removes that element from the list."""
        item = Item()
        feed(item, 'c__d', self.output_pack, 'e__f__g__h')
        self.assertEqual(isinstance(item['c'], list), True)
        self.assertEqual(len(item['c']), 6)
        # All six entries are Items whose 'd' values run 1..6.
        for index, entry in enumerate(item['c']):
            self.assertEqual(isinstance(entry, Item), True)
            self.assertEqual(entry['d'], index + 1)

        # Dropping the first element leaves five entries valued 2..6.
        feed(item, 'c__e', ([(DropItem(), ), (2, ), (3, ), (4, ), (5, ),
                             (6, )], ), 'g__t')
        self.assertEqual(len(item['c']), 5)
        for index, entry in enumerate(item['c']):
            self.assertEqual(entry['d'], index + 2)
Example #28
0
    def parse(self, response):
        """
        Parse the returned response into an item via the model's xpath map.
        :param response: the response delivered by the scrapy framework
        """
        item = Item()
        loader = ItemLoader(item=item, selector=response)
        for field_name in self.model_xpath:
            item.fields[field_name] = Field()
            if 'model_url' in field_name:
                # URL fields are filled from the response itself, not xpath.
                loader.add_value(field_name, response.url)
            else:
                loader.add_xpath(field_name, self.model_xpath[field_name])

        yield self.format_item(loader.load_item())
Example #29
0
    def test_complex_feed(self):
        """Feeding the same output pack at different path depths nests the
        leaf values 1..6 into progressively flatter structures."""
        item = Item()
        output_pack = self.output_pack
        # Four-segment path: 2 top-level items, nested b/c lists, d = 1..6.
        feed(item, 'a__b__c__d', output_pack, 'e__f__g__h')
        self.assertEqual(isinstance(item['a'], list), True)
        self.assertEqual(len(item['a']), 2)
        i = 1
        for sub_item in item['a']:
            self.assertEqual(isinstance(sub_item, Item), True)
            self.assertEqual(isinstance(sub_item['b'], list), True)
            for sub_sub_item in sub_item['b']:
                self.assertEqual(isinstance(sub_sub_item, Item), True)
                self.assertEqual(isinstance(sub_sub_item['c'], list), True)
                for sub_sub_sub_item in sub_sub_item['c']:
                    self.assertEqual(isinstance(sub_sub_sub_item, Item), True)
                    # Leaf values appear in document order across the nesting.
                    self.assertEqual(sub_sub_sub_item['d'], i)
                    i = i + 1

        # Three-segment path: one nesting level fewer, 4 items at the top.
        feed(item, 'b__c__d', output_pack, 'e__f__g__h')
        self.assertEqual(isinstance(item['b'], list), True)
        self.assertEqual(len(item['b']), 4)
        i = 1
        for sub_item in item['b']:
            self.assertEqual(isinstance(sub_item, Item), True)
            self.assertEqual(isinstance(sub_item['c'], list), True)
            for sub_sub_item in sub_item['c']:
                self.assertEqual(isinstance(sub_sub_item, Item), True)
                self.assertEqual(sub_sub_item['d'], i)
                i = i + 1

        # Two-segment path: a flat list of 6 Items.
        feed(item, 'c__d', output_pack, 'e__f__g__h')
        self.assertEqual(isinstance(item['c'], list), True)
        self.assertEqual(len(item['c']), 6)
        for i, sub_item in enumerate(item['c']):
            self.assertEqual(isinstance(sub_item, Item), True)
            self.assertEqual(sub_item['d'], i + 1)

        # Single segment: the bare leaf values.
        feed(item, 'd', output_pack, 'e__f__g__h')
        self.assertEqual(item['d'], [1, 2, 3, 4, 5, 6])

        # Matching wildcard path, then a shortened one: values are grouped
        # three per sub-item (i * 3 + k + 1 reconstructs 1..6).
        feed(item, 'e__f__g__h', output_pack, 'e__f__g__h')
        feed(item, 'e__h', output_pack, 'e__f__g__h')
        for i, sub_item in enumerate(item['e']):
            self.assertEqual(isinstance(sub_item, Item), True)
            for k, sub_sub_item in enumerate(sub_item['h']):
                self.assertEqual(sub_sub_item, i * 3 + k + 1)
Example #30
0
 def maybe_continue(self, item, response):
     """Merge *item* with any partially built item carried in meta, then
     either follow a 'continue' link to the next mapping level or return
     the finished item.
     """
     meta = response.meta
     # Fold the freshly scraped item into the one accumulated so far.
     item = self.update_item(meta.get('item', Item()), item)
     lvl = meta.get('level', 0)
     mapping = self.mappings[lvl]
     fields = mapping['fields']
     for k, v in fields.iteritems():  # Python 2 dict iteration
         ps = v.get('parse', [{}])
         if not isinstance(ps, list):
             ps = [ps]
         # A trailing 'continue' parse step means this field holds the URL
         # of the next page to crawl, one mapping level deeper.
         if ps[-1].get('type') == 'continue':
             url = item[k][0]
             # Carry the partial item forward so the next level can extend it.
             meta = {'level': lvl + 1, 'item': item}
             return Request(url,
                            meta=meta,
                            callback=self.parse_page,
                            dont_filter=True)
     return item