def parse_art(self, response):
    """parse_art

    Extract the data relevant to an artwork. ('url', 'title', 'image',
    'height', 'width', 'description') will be single-valued; ('artist',
    'path') can be lists.
    """
    item = Item()
    item_loader = ItemLoader(item=item, response=response)
    # Single-valued fields take only the first extracted value.
    items_list = ('url', 'title', 'image', 'height', 'width', 'description')
    for name in items_list:
        item.fields[name] = Field(output_processor=TakeFirst())
    # Multi-valued fields keep every extracted value.
    item.fields['artist'] = Field()
    item.fields['path'] = Field()
    item_loader.add_value('url', response.meta['url'])
    item_loader.add_xpath('artist', '//*[@id="content"]/h2/text()')
    item_loader.add_xpath('title', '//*[@id="content"]/h1/text()')
    item_loader.add_xpath('image', '//*[@id="body"]/img/@src',
                          MapCompose(lambda x: urljoin(self.base_url, x)))
    item_loader.add_xpath(
        'height', '//*[@id="content"]/dl/dd[3]/text()',
        MapCompose(lambda x: self.extract_physical_dimension(x, type='height')))
    item_loader.add_xpath(
        'width', '//*[@id="content"]/dl/dd[3]/text()',
        MapCompose(lambda x: self.extract_physical_dimension(x, type='width')))
    item_loader.add_xpath('description', '//*[@id="content"]/div/p/text()')
    item_loader.add_value('path', response.meta['browse_path'])
    return item_loader.load_item()
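# The single- vs. multi-valued split above comes entirely from the output
# processor attached to each Field. A minimal standalone sketch, assuming a
# recent Scrapy where the loader processors live in the itemloaders package:
from scrapy import Field, Item
from scrapy.loader import ItemLoader
from itemloaders.processors import TakeFirst

item = Item()
item.fields['title'] = Field(output_processor=TakeFirst())  # single-valued
item.fields['artist'] = Field()                             # list-valued

loader = ItemLoader(item=item)
loader.add_value('title', ['First title', 'Second title'])
loader.add_value('artist', ['Artist A', 'Artist B'])
print(loader.load_item())
# -> {'artist': ['Artist A', 'Artist B'], 'title': 'First title'}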
def test_hs_ext_item_scraped(hs_ext):
    hs_ext._write_item = mock.Mock()
    item = Item()
    spider = Spider('test')
    hs_ext.item_scraped(item, spider)
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'}, )
def test_process_spider_output_stats_legacy(self):
    # Test that a subclass which does not handle stats still works at
    # runtime, i.e. that trying to update stats does not raise an exception.
    class LegacyDeltaFetchSubClass(self.mwcls):
        def __init__(self, dir, reset=False, *args, **kwargs):
            super(LegacyDeltaFetchSubClass, self).__init__(dir=dir, reset=reset)
            self.something = True

    self._create_test_db()
    mw = LegacyDeltaFetchSubClass(self.temp_dir, reset=False)
    mw.spider_opened(self.spider)
    response = mock.Mock()
    response.request = Request('http://url', meta={'deltafetch_key': 'key'})
    result = []
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)), [])
    self.assertEqual(self.stats.get_stats(), {})
    result = [
        Request('http://url', meta={'deltafetch_key': 'key'}),
        Request('http://url1', meta={'deltafetch_key': 'test_key_1'}),
    ]
    # Stats should not be updated.
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)),
        [result[0]])
    self.assertEqual(self.stats.get_value('deltafetch/skipped'), None)
    result = [Item(), "not a base item"]
    self.assertEqual(
        list(mw.process_spider_output(response, result, self.spider)),
        result)
    self.assertEqual(self.stats.get_value('deltafetch/stored'), None)
def test_mismatched_wildcard(self):
    v1 = 1
    v2 = 2
    item = Item()
    args = ([(self.makeitem({'t': v1}), ), (self.makeitem({'t': v2}), )], )
    feed(item, 'a', args, 'a__')
    self.assertEqual(isinstance(item['a'], list), True)
    self.assertEqual(len(item['a']), 2)
    self.assertEqual(
        isinstance(item['a'][0], tuple) and isinstance(item['a'][1], tuple),
        True)
    self.assertEqual(
        isinstance(item['a'][0][0], Item) and isinstance(item['a'][1][0], Item),
        True)
    self.assertEqual(item['a'][0][0]['t'], v1)
    self.assertEqual(item['a'][1][0]['t'], v2)
    feed(item, 'b__c', args, 'a__')
    self.assertEqual(isinstance(item['b'], list), True)
    self.assertEqual(len(item['b']), 2)
    self.assertEqual(
        isinstance(item['b'][0], Item) and isinstance(item['b'][1], Item),
        True)
    self.assertEqual(
        isinstance(item['b'][0]['c'], Item) and isinstance(item['b'][1]['c'], Item),
        True)
    self.assertEqual(item['b'][0]['c']['t'], v1)
    self.assertEqual(item['b'][1]['c']['t'], v2)
def parse_item(self, response):
    item = Item()
    loader = ItemLoader(item=item, response=response)
    for a in self.config["attributes"]:
        # Declare each configured attribute as a field at runtime.
        item.fields[a["name"]] = scrapy.Field()
        processors = []
        if "processors" in a:
            for p in a["processors"]:
                if p == "join":
                    processors.append(Join())
                elif p == "strip":
                    processors.append(MapCompose(str.strip))
        kwargs = {}
        if "regex" in a:
            kwargs["re"] = a["regex"]
        loader.add_css(a["name"], a["selector"], *processors, **kwargs)
    item.fields["url"] = scrapy.Field()
    loader.add_value("url", response.url)
    return loader.load_item()
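# A hypothetical self.config that the loop above can consume. The keys
# "attributes", "name", "selector", "processors" and "regex" are the ones the
# code reads; the values are illustrative:
config = {
    "attributes": [
        {
            "name": "title",
            "selector": "h1.product-title::text",
            "processors": ["strip", "join"],
        },
        {
            "name": "price",
            "selector": "span.price::text",
            "regex": r"[\d.]+",
        },
    ]
}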
def generate_item(self, data, refer_id):
    """Convert the given dict ``data`` into an item."""
    item = Item()
    for key, value in data.items():
        # Declare each key as a field before assigning: a plain Item
        # has no predefined fields and rejects undeclared keys.
        item.fields[key] = Field()
        item[key] = value
    return self.padding_item(item, refer_id)
def check_buzzwords(self, response):
    self.__class__.crawl_count += 1
    # Change this word list to match your use case.
    wordlist = [
        "phone",
        "hotel",
        "reservation",
        "booked",
    ]
    url = response.url
    # Default to b"" so .decode() works when the header is missing.
    contenttype = response.headers.get("content-type", b"").decode('utf-8').lower()
    data = response.body.decode('utf-8')
    for word in wordlist:
        # Count every occurrence of the word in the page body.
        for pos in find_all_substrings(data, word):
            self.__class__.words_found += 1
            print(word + ";" + url + ";")
    return Item()
def check_buzzwords(self, response):
    self.__class__.crawl_count += 1
    wordlist = [
        "Creating",
        "Deploy",
        "COZMO",
    ]
    url = response.url
    # Default to b"" so .decode() works when the header is missing.
    contenttype = response.headers.get("content-type", b"").decode('utf-8').lower()
    data = response.body.decode('utf-8')
    for word in wordlist:
        # Count every occurrence of the word in the page body.
        for pos in find_all_substrings(data, word):
            self.__class__.words_found += 1
            print(word + ";" + url)
    return Item()
def parse_item(self, response, loop, fields):
    hxs = HtmlXPathSelector(response)
    self.macro.update({'URL': response.url})
    for e in hxs.select(loop or '(//*)[1]'):
        loader = XPathItemLoader(item=Item(), selector=e)
        for k, v in fields.iteritems():
            if 'value' in v:
                get_v_x = loader.get_value
                v_x = v.get('value')
            elif 'xpath' in v:
                get_v_x = loader.get_xpath
                v_x = v.get('xpath')
            else:
                log.msg(u'field [{}] should contain "value" or "xpath"'.format(k),
                        level=log.WARNING)
                continue
            val = get_v_x(self.macro.expand(v_x),
                          utils.convert_type(v.get('parse', {})),
                          re=v.get('regex'))
            if not val and 'default' in v:
                val = self.macro.expand(v.get('default'))
            qry = v.get('filter', {})
            if utils.filter_data(qry, val):
                loader.add_value(k, val)
            else:
                break
        else:
            # for-else: only yield when no field failed its filter.
            yield loader.load_item()
def parse(self, response):
    item = Item()
    loader = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].items():
        if xpath:
            # Declare the field dynamically before loading it.
            item.fields[name] = Field()
            loader.add_xpath(name, xpath)
    return loader.load_item()
def test_isinstance_check(self):
    class SubclassedBaseItem(BaseItem):
        pass

    class SubclassedItem(Item):
        pass

    self.assertTrue(isinstance(BaseItem(), BaseItem))
    self.assertTrue(isinstance(SubclassedBaseItem(), BaseItem))
    self.assertTrue(isinstance(Item(), BaseItem))
    self.assertTrue(isinstance(SubclassedItem(), BaseItem))
    # Make sure internal checks using the private _BaseItem class succeed.
    self.assertTrue(isinstance(BaseItem(), _BaseItem))
    self.assertTrue(isinstance(SubclassedBaseItem(), _BaseItem))
    self.assertTrue(isinstance(Item(), _BaseItem))
    self.assertTrue(isinstance(SubclassedItem(), _BaseItem))
def parse(self, response):
    item = Item()
    loader = ItemLoader(item=item, response=response)
    for name, xpath in response.meta['fields'].iteritems():
        if xpath:
            # Dynamically declare the new field.
            item.fields[name] = Field()
            loader.add_xpath(name, xpath, MapCompose(unicode.strip, unicode.title))
    return loader.load_item()
def parse_node(self, response, node):
    log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))
    item = Item()
    # Declare the fields first; a plain Item rejects undeclared keys.
    for name in ('id', 'name', 'description'):
        item.fields[name] = Field()
    item['id'] = node.select('@id').extract()
    item['name'] = node.select('name').extract()
    item['description'] = node.select('description').extract()
    return item
def test_no_deprecation_warning(self):
    """Make sure deprecation warnings are NOT logged whenever BaseItem
    subclasses are used."""
    class SubclassedItem(Item):
        pass

    with catch_warnings(record=True) as warnings:
        Item()
        SubclassedItem()
        _BaseItem()
        self.assertFalse(isinstance("foo", _BaseItem))
        self.assertFalse(isinstance("foo", Item))
        self.assertFalse(isinstance("foo", SubclassedItem))
        self.assertTrue(isinstance(_BaseItem(), _BaseItem))
        self.assertTrue(isinstance(Item(), Item))
        self.assertTrue(isinstance(SubclassedItem(), SubclassedItem))
        self.assertEqual(len(warnings), 0)
def parse_item(self, response):
    sel = Selector(response)
    items = []
    lists = sel.xpath('//tt[@class="i-emp"]').extract()
    for com in lists:
        item = Item()
        item.fields['title'] = Field()  # declare the field on a plain Item
        item['title'] = com
        items.append(item)
    return items
def test_iterate_spider_output(self):
    i = Item()
    r = Request('http://scrapytest.org')
    o = object()
    self.assertEqual(list(iterate_spider_output(i)), [i])
    self.assertEqual(list(iterate_spider_output(r)), [r])
    self.assertEqual(list(iterate_spider_output(o)), [o])
    self.assertEqual(list(iterate_spider_output([r, i, o])), [r, i, o])
def find_words(self, response):
    url = response.url
    # Default to b"" so .decode() works when the header is missing.
    contenttype = response.headers.get("content-type", b"").decode("utf-8").lower()
    data = response.body.decode("utf-8")
    # Iterate over a copy: removing from the list while iterating it
    # directly would skip the element after each match.
    for w in list(self.words):
        if w in data:
            print(w + ":" + url)
            self.words.remove(w)
    return Item()
def test_drop_item_as_single_value(self):
    value = 1
    item = Item()
    output_pack = ((value, ), )
    feed(item, 'a__b', output_pack, 'e__f')
    self.assertEqual(isinstance(item['a'], Item), True)
    self.assertEqual(item['a']['b'], value)
    feed(item, 'a__c', ((DropItem(), ), ), 'e__g')
    self.assertEqual(item['a'], None)
def test_isinstance_check(self):
    class SubclassedBaseItem(BaseItem):
        pass

    class SubclassedItem(Item):
        pass

    with catch_warnings():
        filterwarnings("ignore", category=ScrapyDeprecationWarning)
        self.assertTrue(isinstance(BaseItem(), BaseItem))
        self.assertTrue(isinstance(SubclassedBaseItem(), BaseItem))
        self.assertTrue(isinstance(Item(), BaseItem))
        self.assertTrue(isinstance(SubclassedItem(), BaseItem))
        # Make sure internal checks using the private _BaseItem class succeed.
        self.assertTrue(isinstance(BaseItem(), _BaseItem))
        self.assertTrue(isinstance(SubclassedBaseItem(), _BaseItem))
        self.assertTrue(isinstance(Item(), _BaseItem))
        self.assertTrue(isinstance(SubclassedItem(), _BaseItem))
def parse(self, response):
    # lxml.html has no fromhtml(); fromstring() parses the raw body.
    root = lxml.html.fromstring(response.body)
    items = []
    item = Item()
    item.fields['url'] = Field()  # declare the field on a plain Item
    item['url'] = response.url
    items.append(item)
    return items
def copy_item(from_item, to_item=None):
    if to_item is None:
        to_item = Item()
    # Share the source item's field definitions if it has any.
    if hasattr(from_item, 'fields'):
        to_item.fields = from_item.fields
    for key, value in from_item.items():
        if key not in to_item:
            # Declare the field so a plain Item accepts this key.
            to_item.fields[key] = Field()
        to_item[key] = value
    return to_item
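# Quick usage sketch for copy_item(). SourceItem is a hypothetical item
# class, not from the original code; the point is that each key is declared
# on the fresh target Item so the assignments do not raise KeyError.
from scrapy import Field, Item

class SourceItem(Item):
    name = Field()
    price = Field()

src = SourceItem(name='widget', price=10)
dst = copy_item(src)
print(dst)  # {'name': 'widget', 'price': 10}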
def test_dictitem_deprecation_warning(self):
    """Make sure the DictItem deprecation warning is not issued for Item."""
    with catch_warnings(record=True) as warnings:
        item = Item()
        self.assertEqual(len(warnings), 0)

        class SubclassedItem(Item):
            pass

        subclassed_item = SubclassedItem()
        self.assertEqual(len(warnings), 0)
def parse_item(self, response):
    self.log('Hi, this is an item page! %s' % response.url)
    hxs = HtmlXPathSelector(response)
    item = Item()
    # Declare the fields first; a plain Item rejects undeclared keys.
    for name in ('id', 'name', 'description'):
        item.fields[name] = Field()
    item['id'] = hxs.select('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
    item['name'] = hxs.select('//td[@id="item_name"]/text()').extract()
    item['description'] = hxs.select('//td[@id="item_description"]/text()').extract()
    return item
def parse(self, response):
    """I'm hijacking this so I can do my own thing with it."""
    self.logger.info("Bacon")
    self.logger.info("Bananas")
    self.logger.info("Spinach")
    self.mongoClient = MongoClient(
        "mongodb://*****:*****@ds147975.mlab.com:47975/craigslist")
    self.db = self.mongoClient.get_default_database()
    self.db["test"].insert_one({"a": 0})
    return [Item()]
def parse(self, response, node):
    """Parse the current response object, and return any Item and/or
    Request objects."""
    log.msg("SCRAPING '%s'" % response.url)
    # Extract your data and yield it as an Item (or DjangoItem if
    # you're using django-celery).
    scraped_item = Item()
    scraped_item.fields['some_value'] = Field()  # declare the field on a plain Item
    scraped_item['some_value'] = "important value"
    yield scraped_item
    # Get the next URL to crawl.
    next_url = self.url.next()
    yield Request(next_url)
def parse_item(self, response):
    # Receives the response for the job-description URL.
    item = Item()  # manually declared item (ordered-dict format)
    item_container = ItemLoader(item=item, response=response)
    # The first row of the CSV file holds the field names; the
    # following rows hold the matching XPaths.
    for name, xpath in response.meta['fields'].items():
        if xpath:
            # Initialise the item field manually.
            item.fields[name] = Field()
            # Add the XPath to the item loader to obtain the final result.
            item_container.add_xpath(name, xpath)
    # Populate the item from the final response.
    return item_container.load_item()
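# Hypothetical driver for the CSV-driven pattern above (the file name,
# start_url attribute, and layout are illustrative, not from the original
# code): the first CSV row supplies field names, the second the matching
# XPaths, and the mapping travels to parse_item() through request meta.
import csv

def start_requests(self):
    with open('fields.csv') as f:
        rows = list(csv.reader(f))
    fields = dict(zip(rows[0], rows[1]))
    yield scrapy.Request(self.start_url, callback=self.parse_item,
                         meta={'fields': fields})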
def test_drop_item_in_lists(self):
    item = Item()
    output_pack = self.output_pack
    feed(item, 'c__d', output_pack, 'e__f__g__h')
    self.assertEqual(isinstance(item['c'], list), True)
    self.assertEqual(len(item['c']), 6)
    for sub_item in item['c']:
        self.assertEqual(isinstance(sub_item, Item), True)
    for i, sub_item in enumerate(item['c']):
        self.assertEqual(sub_item['d'], i + 1)
    feed(item, 'c__e',
         ([(DropItem(), ), (2, ), (3, ), (4, ), (5, ), (6, )], ),
         'g__t')
    self.assertEqual(len(item['c']), 5)
    for i, sub_item in enumerate(item['c']):
        self.assertEqual(sub_item['d'], i + 2)
def parse(self, response):
    """Parse the data out of the returned response.

    :param response: the response returned by the Scrapy framework
    """
    item = Item()
    item_loader = ItemLoader(item=item, selector=response)
    for field in self.model_xpath:
        item.fields[field] = Field()
        if 'model_url' in field:
            item_loader.add_value(field, response.url)
        else:
            item_loader.add_xpath(field, self.model_xpath[field])
    item = self.format_item(item_loader.load_item())
    yield item
def test_complex_feed(self):
    item = Item()
    output_pack = self.output_pack
    feed(item, 'a__b__c__d', output_pack, 'e__f__g__h')
    self.assertEqual(isinstance(item['a'], list), True)
    self.assertEqual(len(item['a']), 2)
    i = 1
    for sub_item in item['a']:
        self.assertEqual(isinstance(sub_item, Item), True)
        self.assertEqual(isinstance(sub_item['b'], list), True)
        for sub_sub_item in sub_item['b']:
            self.assertEqual(isinstance(sub_sub_item, Item), True)
            self.assertEqual(isinstance(sub_sub_item['c'], list), True)
            for sub_sub_sub_item in sub_sub_item['c']:
                self.assertEqual(isinstance(sub_sub_sub_item, Item), True)
                self.assertEqual(sub_sub_sub_item['d'], i)
                i = i + 1
    feed(item, 'b__c__d', output_pack, 'e__f__g__h')
    self.assertEqual(isinstance(item['b'], list), True)
    self.assertEqual(len(item['b']), 4)
    i = 1
    for sub_item in item['b']:
        self.assertEqual(isinstance(sub_item, Item), True)
        self.assertEqual(isinstance(sub_item['c'], list), True)
        for sub_sub_item in sub_item['c']:
            self.assertEqual(isinstance(sub_sub_item, Item), True)
            self.assertEqual(sub_sub_item['d'], i)
            i = i + 1
    feed(item, 'c__d', output_pack, 'e__f__g__h')
    self.assertEqual(isinstance(item['c'], list), True)
    self.assertEqual(len(item['c']), 6)
    for i, sub_item in enumerate(item['c']):
        self.assertEqual(isinstance(sub_item, Item), True)
        self.assertEqual(sub_item['d'], i + 1)
    feed(item, 'd', output_pack, 'e__f__g__h')
    self.assertEqual(item['d'], [1, 2, 3, 4, 5, 6])
    feed(item, 'e__f__g__h', output_pack, 'e__f__g__h')
    feed(item, 'e__h', output_pack, 'e__f__g__h')
    for i, sub_item in enumerate(item['e']):
        self.assertEqual(isinstance(sub_item, Item), True)
        for k, sub_sub_item in enumerate(sub_item['h']):
            self.assertEqual(sub_sub_item, i * 3 + k + 1)
def maybe_continue(self, item, response):
    meta = response.meta
    item = self.update_item(meta.get('item', Item()), item)
    lvl = meta.get('level', 0)
    mapping = self.mappings[lvl]
    fields = mapping['fields']
    for k, v in fields.iteritems():
        ps = v.get('parse', [{}])
        if not isinstance(ps, list):
            ps = [ps]
        # A trailing 'continue' parse step means this field holds the
        # URL of the next page to follow.
        if ps[-1].get('type') == 'continue':
            url = item[k][0]
            meta = {'level': lvl + 1, 'item': item}
            return Request(url, meta=meta, callback=self.parse_page,
                           dont_filter=True)
    return item
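# For context, a hypothetical self.mappings entry matching the convention
# maybe_continue() checks for (field names and XPaths are illustrative): a
# field whose last 'parse' step has type 'continue' carries the URL that
# triggers the follow-up Request at the next level.
mappings = [
    {
        'fields': {
            'title': {'xpath': '//h1/text()'},
            'next_page': {
                'xpath': '//a[@rel="next"]/@href',
                'parse': [{'type': 'continue'}],
            },
        },
    },
]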