class YoutubeItem(scrapy.Item):
    """Scraped YouTube record: id, name, videos, partners, subscribers, is_verified.

    The processors stored in the field metadata are only applied when the
    item is populated through an ``ItemLoader``.
    """

    # Scalar text fields: each value goes through ``remove_tags`` and the
    # first usable value is kept.
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    name = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

    # Collection fields: every collected value is kept, unchanged.
    videos = scrapy.Field(output_processor=Identity())
    partners = scrapy.Field(output_processor=Identity())

    # Scalar text fields again (same processing as ``id`` / ``name``).
    subscribers = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    is_verified = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
class BcDailyPostLoader(ItemLoader):
    """Item loader that produces ``BcDailyPost`` items.

    Unless a field overrides it, every input value is passed through
    ``strip_html5_whitespace`` and the output is the first collected value.
    """

    default_item_class = BcDailyPost

    # Defaults applied to every field without a dedicated processor.
    default_input_processor = MapCompose(strip_html5_whitespace)
    default_output_processor = Compose(TakeFirst())

    # Multi-valued fields: keep the full list of collected values.
    tags_out = Identity()
    to_dl_out = Identity()

    # The title is the first collected value, then run through ``normalize``.
    title_out = Compose(TakeFirst(), normalize)
class CompanyLoader(ItemLoader):
    """Item loader for company records.

    By default each input value is stripped of surrounding whitespace and
    has its newline characters removed; the output is the first collected
    value.
    """

    default_input_processor = MapCompose(
        lambda text: text.strip().replace('\n', '')
    )
    default_output_processor = TakeFirst()

    # Phone values use their own input processor instead of the default one:
    # plain spaces and non-breaking spaces (U+00A0) are removed.
    phone_in = MapCompose(lambda raw: re.sub(u' |\xa0', '', raw))

    # The postal code is the first collected value converted with ``int``.
    postal_code_out = Compose(TakeFirst(), int)

    # Categories keep every collected value.
    categories_out = Identity()
class YoutubeVideoItem(scrapy.Item):
    """Item with per-video fields: url, category, date, title, views, likes,
    dislikes and comments.

    The processors stored in the field metadata are only applied when the
    item is populated through an ``ItemLoader``.
    """

    # Scalar text fields: each value goes through ``remove_tags`` and the
    # first usable value is kept.
    url = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    category = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    date = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    title = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    views = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    likes = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )
    dislikes = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

    # Collection field: every collected value is kept, unchanged.
    comments = scrapy.Field(output_processor=Identity())
def _strip_text(text):
    """Return *text* without surrounding whitespace.

    Calling the bound ``strip`` method (instead of referencing
    ``unicode.strip``) keeps the loader importable on Python 3, where the
    ``unicode`` builtin no longer exists, and still works for Python 2
    text values.
    """
    return text.strip()


class ScraperProductLoader(ItemLoader):
    """
    Creates items via XPath or CSS expressions.

    Basically, reduces the amount of work involved in scraping items
    because the item loader can take an XPath or CSS expression and
    immediately load that into the item (or add multiple values, if they
    exist).

    As well, the item loader can handle custom input / output processing
    for common operations.

    More details available in the docs:
        https://docs.scrapy.org/en/latest/topics/loaders.html
    """

    # Fields without a dedicated output processor yield their first value.
    default_output_processor = TakeFirst()

    # Name: first extracted value, whitespace-stripped.
    name_in = Compose(TakeFirst(), _strip_text)

    # Description: all fragments joined, stripped, then sanitized.
    description_in = Compose(Join(), _strip_text, sanitize_html)

    # Details: all fragments joined, then sanitized (no stripping).
    details_in = Compose(Join(), sanitize_html)

    # Attributes: DefaultValue is given a factory for an empty dict, then
    # the result goes through MergeDicts.
    attributes_out = Compose(DefaultValue(lambda: {}), MergeDicts())

    # Image URLs keep every collected value.
    image_urls_out = Identity()
def parse_item(self, response):
    """Build a ``MeizituItem`` from *response*.

    Prints the page URL, then loads ``name`` from ``//h2/a/text()`` and
    ``img_urls`` from the ``src`` attributes matched by ``//div/p/img``.

    Returns the loaded item.
    """
    print(' >>>> %s' % response.url)

    item_loader = ItemLoader(item=MeizituItem(), response=response)
    item_loader.add_xpath('name', '//h2/a/text()')
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    item_loader.add_xpath('img_urls', '//div/p/img/@src', Identity())
    return item_loader.load_item()
def parse_item(self, response):
    """Build a ``YouwuItem`` from *response*.

    Loads ``image_urls`` from the ``src`` of the ``img`` with id ``bigimg``,
    ``url`` from the response URL and ``text`` from the ``h1`` inside the
    ``div`` with id ``photos``.

    Returns the loaded item.
    """
    print("parse_item ")

    loader = ItemLoader(item=YouwuItem(), response=response)
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    loader.add_xpath(
        'image_urls',
        "//img[@id='bigimg']/@src",
        Identity(),
    )
    loader.add_value('url', response.url)
    loader.add_xpath('text', "//div[@id='photos']/h1/text()")
    return loader.load_item()
def parse_item(self, response):
    """Build a ``Mm131Item`` from *response*.

    Loads ``image_urls`` from the images linked inside the ``content-pic``
    div, ``url`` from the response URL and ``text`` from the ``h5`` inside
    the ``content`` div.

    Returns the loaded item.
    """
    print("parse_item ")

    loader = ItemLoader(item=Mm131Item(), response=response)
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    loader.add_xpath(
        'image_urls',
        "//div[@class='content-pic']/a/img/@src",
        Identity(),
    )
    loader.add_value('url', response.url)
    loader.add_xpath('text', "//div[@class='content']/h5/text()")
    return loader.load_item()
def parse_item(self, response):
    """Build a ``MeizituItem`` from *response*.

    Loads ``name`` from ``//h2/a/text()``, ``image_url`` from the images
    inside the ``div`` with id ``picture`` and ``url`` from the response URL.

    Returns the loaded item.
    """
    loader = ItemLoader(item=MeizituItem(), response=response)
    loader.add_xpath('name', '//h2/a/text()')
    # NOTE: loading of the 'tag' field is disabled. The XPath it used was:
    #   //div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    loader.add_xpath(
        'image_url',
        "//div[@id='picture']/p/img/@src",
        Identity(),
    )
    loader.add_value('url', response.url)
    return loader.load_item()
class YoutubeFeedItem(scrapy.Item):
    """Item with an ``id`` and a list of ``channels``.

    The processors stored in the field metadata are only applied when the
    item is populated through an ``ItemLoader``.
    """

    # Each value goes through ``remove_tags``; the first usable one is kept.
    id = scrapy.Field(
        input_processor=MapCompose(remove_tags),
        output_processor=TakeFirst(),
    )

    # Every collected value is kept, unchanged.
    channels = scrapy.Field(output_processor=Identity())
def parse_item(self, response):
    """Build a ``DaiqiyangItem`` from *response*.

    Loads ``image_urls`` from the images linked inside the ``showimg`` div,
    ``url`` from the response URL and ``text`` from the ``h1`` inside the
    ``crumbs`` div.

    Returns the loaded item.
    """
    print("parse_item ")

    loader = ItemLoader(item=DaiqiyangItem(), response=response)
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    loader.add_xpath(
        'image_urls',
        "//div[@class='showimg']/a/img/@src",
        Identity(),
    )
    loader.add_value('url', response.url)
    loader.add_xpath('text', "//div[@class='crumbs']/h1/text()")
    return loader.load_item()
def parse_item(self, response):
    """Build a ``Www7160Item`` from *response*.

    Loads ``image_urls`` from the images linked inside the
    ``picsbox picsboxcenter`` div, ``url`` from the response URL and ``text``
    from the ``h1`` inside the ``div`` with id ``photos``.

    Returns the loaded item.
    """
    print("parse_item ")

    loader = ItemLoader(item=Www7160Item(), response=response)
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    loader.add_xpath(
        'image_urls',
        "//div[@class='picsbox picsboxcenter']/p/a/img/@src",
        Identity(),
    )
    loader.add_value('url', response.url)
    loader.add_xpath('text', "//div[@id='photos']/h1/text()")
    return loader.load_item()
def parse_item(self, response):
    """Build an ``ImageItem`` from *response*.

    Loads ``image_urls`` from the ``src`` of every ``input`` of type
    ``image``, ``url`` from the response URL and ``filePath`` from
    ``response.meta['item']``.

    Returns the loaded item.

    Raises:
        KeyError: if the request meta carries no ``'item'`` entry.
    """
    # NOTE: an earlier, disabled variant filled ImageItem['image_urls']
    # directly from //img/@src and yielded the item.
    image_loader = ItemLoader(item=ImageItem(), response=response)
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    image_loader.add_xpath(
        'image_urls',
        "//input[@type='image']/@src",
        Identity(),
    )

    # Value handed over through the request meta under the 'item' key.
    target_path = response.meta['item']

    image_loader.add_value('url', response.url)
    image_loader.add_value('filePath', target_path)
    return image_loader.load_item()
def parse_item(self, response):
    """Build a ``YoumeituItem`` from *response*.

    Loads ``name`` from ``//h2/a/text()``, ``tags`` from the ``p`` elements
    of the ``metaRight`` block, ``image_urls`` from the image inside the
    ``item-tip`` link and ``url`` from the response URL.

    Returns the loaded item.
    """
    loader = ItemLoader(item=YoumeituItem(), response=response)

    loader.add_xpath('name', '//h2/a/text()')
    loader.add_xpath(
        'tags',
        "//div[@id='maincontent']/div[@class='postmeta clearfix']/div[@class='metaRight']/p",
    )
    # Identity() is passed as an add-time processor: extracted values are
    # handed on unchanged.
    loader.add_xpath(
        'image_urls',
        "//div[@id='mainbox']/div[@id='canvasbox']/div[@id='content']/a[@id='item-tip']/img/@src",
        Identity(),
    )
    loader.add_value('url', response.url)

    return loader.load_item()
class URLListLoader(ItemLoader):
    """Item loader that fills its ``urls`` field on construction.

    Two CSS selectors are registered for ``urls`` as soon as the loader is
    created; all collected values are kept (``Identity`` output).
    """

    default_output_processor = Identity()

    def __init__(self, item=None, selector=None, response=None, parent=None,
                 **context):
        """Initialise the base loader, then register the ``urls`` selectors.

        Arguments are forwarded unchanged to ``ItemLoader.__init__``.
        """
        super().__init__(item, selector, response, parent, **context)

        href_queries = (
            # starcity item
            "div.listItem-details > h4.listItem-title > a::attr(href)",
            # gatherer item
            "tr.cardItem > td > a::attr(href)",
        )
        for query in href_queries:
            self.add_css("urls", query)
class ArticleLoader(ItemLoader):
    """Item loader for articles.

    By default each input value goes through ``remove_tags`` and then
    ``str.strip``; the output is the first collected value.
    """

    default_output_processor = TakeFirst()
    default_input_processor = MapCompose(remove_tags, str.strip)

    # URL inputs use Identity instead of the default input processor, so
    # they are passed through unchanged.
    url_in = Identity()