def spider_opened(self, spider):
    '''Set up the JSON exporter when a spider starts.

    Logs a banner containing the spider's name, selects the list of fields
    to export for that spider, opens the output file under ``data/`` and
    starts the exporter.

    On success ``self.file`` and ``self.json_exporter`` are set.  The file
    is left open on purpose so items can be written while the spider runs;
    this function does not close it.

    Raises FeverException when no field list is defined for ``spider.name``.
    '''
    rule = '-' * 79
    log.msg('\n\n{0}\n{1}\n{2}\n\n'.format(rule, spider.name, rule),
            log.INFO)

    # Fields to export for each supported spider, keyed by spider name.
    # The chosen list is handed to the exporter as ``fields_to_export``.
    fields_by_spider = {
        'YelpSpider': ['biz_id', 'timestamp', 'stars', 'review_count'],
        'AptratingsSpider': [
            'biz_id', 'recommended_by', 'total_overall_rating',
            'overall_parking', 'overall_maintenance',
            'overall_construction', 'overall_noise', 'overall_grounds',
            'overall_safety', 'overall_office_staff', 'comment_url',
            'comment_title', 'comment_username', 'comment_post_date',
            'comment_years_stayed', 'comment_message', 'last_edited',
            'replies', 'overall_rating', 'parking', 'maintenance',
            'construction', 'noise', 'grounds', 'safety', 'office_staff',
        ],
    }
    foo_fields = fields_by_spider.get(spider.name)
    if foo_fields is None:
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, which leaked the continuation
        # whitespace into the message text.
        raise FeverException(
            'a foo_fields variable is not yet '
            'defined for "{0}"'.format(spider.name))

    # make_filename is defined elsewhere; presumably it yields a unique
    # name per run -- verify against its definition.
    out_path = os.path.join('data', make_filename(spider.name, 'json'))
    self.file = open(out_path, 'w+b')
    try:
        self.json_exporter = FeverJsonItemExporter(
            self.file, fields_to_export=foo_fields, sort_keys=True, indent=2)
        # start exporting items to JSON file
        self.json_exporter.start_exporting()
    except Exception:
        # Do not leak the open file handle if the exporter cannot be
        # created or started; the original error still propagates.
        self.file.close()
        raise
class FeverbotPipeline(object):
    '''Pipeline for writing scraped items to a JSON file.

    The output file and exporter are created when a spider opens and are
    shut down again when the spider closes.
    '''

    # Fields to export for each supported spider, keyed by spider name.
    # The chosen list is handed to the exporter as ``fields_to_export``.
    _FIELDS_BY_SPIDER = {
        'YelpSpider': ['biz_id', 'timestamp', 'stars', 'review_count'],
        'AptratingsSpider': [
            'biz_id', 'recommended_by', 'total_overall_rating',
            'overall_parking', 'overall_maintenance',
            'overall_construction', 'overall_noise', 'overall_grounds',
            'overall_safety', 'overall_office_staff', 'comment_url',
            'comment_title', 'comment_username', 'comment_post_date',
            'comment_years_stayed', 'comment_message', 'last_edited',
            'replies', 'overall_rating', 'parking', 'maintenance',
            'construction', 'noise', 'grounds', 'safety', 'office_staff',
        ],
    }

    def __init__(self):
        '''Connect the open/close handlers to the spider signals.'''
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        '''Initialize the exporter when the spider first starts.

        Logs a banner with the spider's name, selects the fields to export
        for that spider, opens the output file under ``data/`` and starts
        the exporter.

        Raises FeverException when no field list is defined for
        ``spider.name``.
        '''
        rule = '-' * 79
        log.msg('\n\n{0}\n{1}\n{2}\n\n'.format(rule, spider.name, rule),
                log.INFO)

        foo_fields = self._FIELDS_BY_SPIDER.get(spider.name)
        if foo_fields is None:
            # Adjacent string literals are used instead of a backslash
            # continuation inside the literal, which leaked the
            # continuation whitespace into the message text.
            raise FeverException(
                'a foo_fields variable is not yet '
                'defined for "{0}"'.format(spider.name))

        # make_filename is defined elsewhere; presumably it yields a unique
        # name per run -- verify against its definition.
        out_path = os.path.join('data', make_filename(spider.name, 'json'))
        self.file = open(out_path, 'w+b')
        try:
            # A copy of the list is passed so the shared class-level table
            # cannot be modified through the exporter.
            self.json_exporter = FeverJsonItemExporter(
                self.file, fields_to_export=list(foo_fields),
                sort_keys=True, indent=2)
            # start exporting items to JSON file
            self.json_exporter.start_exporting()
        except Exception:
            # Do not leak the open file handle if the exporter cannot be
            # created or started; the original error still propagates.
            self.file.close()
            raise

    def process_item(self, item, spider):
        '''Export one scraped item and return it unchanged.'''
        # create nested dict here?
        self.json_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        '''Finish the export and close the output file.'''
        try:
            self.json_exporter.finish_exporting()
        finally:
            # The file opened in spider_opened was previously never closed;
            # close it even if finishing the export fails.
            self.file.close()
def spider_opened(self, spider):
    '''Set up the JSON exporter when a spider starts.

    Logs a banner containing the spider's name, selects the list of fields
    to export for that spider, opens the output file under ``data/`` and
    starts the exporter.

    On success ``self.file`` and ``self.json_exporter`` are set.  The file
    is left open on purpose so items can be written while the spider runs;
    this function does not close it.

    Raises FeverException when no field list is defined for ``spider.name``.
    '''
    rule = '-' * 79
    log.msg('\n\n{0}\n{1}\n{2}\n\n'.format(rule, spider.name, rule),
            log.INFO)

    # Fields to export for each supported spider, keyed by spider name.
    # The chosen list is handed to the exporter as ``fields_to_export``.
    fields_by_spider = {
        'YelpSpider': ['biz_id', 'timestamp', 'stars', 'review_count'],
        'AptratingsSpider': [
            'biz_id', 'recommended_by', 'total_overall_rating',
            'overall_parking', 'overall_maintenance',
            'overall_construction', 'overall_noise', 'overall_grounds',
            'overall_safety', 'overall_office_staff', 'comment_url',
            'comment_title', 'comment_username', 'comment_post_date',
            'comment_years_stayed', 'comment_message', 'last_edited',
            'replies', 'overall_rating', 'parking', 'maintenance',
            'construction', 'noise', 'grounds', 'safety', 'office_staff',
        ],
    }
    foo_fields = fields_by_spider.get(spider.name)
    if foo_fields is None:
        # Adjacent string literals are used instead of a backslash
        # continuation inside the literal, which leaked the continuation
        # whitespace into the message text.
        raise FeverException(
            'a foo_fields variable is not yet '
            'defined for "{0}"'.format(spider.name))

    # make_filename is defined elsewhere; presumably it yields a unique
    # name per run -- verify against its definition.
    out_path = os.path.join('data', make_filename(spider.name, 'json'))
    self.file = open(out_path, 'w+b')
    try:
        self.json_exporter = FeverJsonItemExporter(
            self.file, fields_to_export=foo_fields, sort_keys=True, indent=2)
        # start exporting items to JSON file
        self.json_exporter.start_exporting()
    except Exception:
        # Do not leak the open file handle if the exporter cannot be
        # created or started; the original error still propagates.
        self.file.close()
        raise
class FeverbotPipeline(object):
    '''Pipeline for writing scraped items to a JSON file.

    The output file and exporter are created when a spider opens and are
    shut down again when the spider closes.
    '''

    # Fields to export for each supported spider, keyed by spider name.
    # The chosen list is handed to the exporter as ``fields_to_export``.
    _FIELDS_BY_SPIDER = {
        'YelpSpider': ['biz_id', 'timestamp', 'stars', 'review_count'],
        'AptratingsSpider': [
            'biz_id', 'recommended_by', 'total_overall_rating',
            'overall_parking', 'overall_maintenance',
            'overall_construction', 'overall_noise', 'overall_grounds',
            'overall_safety', 'overall_office_staff', 'comment_url',
            'comment_title', 'comment_username', 'comment_post_date',
            'comment_years_stayed', 'comment_message', 'last_edited',
            'replies', 'overall_rating', 'parking', 'maintenance',
            'construction', 'noise', 'grounds', 'safety', 'office_staff',
        ],
    }

    def __init__(self):
        '''Connect the open/close handlers to the spider signals.'''
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        '''Initialize the exporter when the spider first starts.

        Logs a banner with the spider's name, selects the fields to export
        for that spider, opens the output file under ``data/`` and starts
        the exporter.

        Raises FeverException when no field list is defined for
        ``spider.name``.
        '''
        rule = '-' * 79
        log.msg('\n\n{0}\n{1}\n{2}\n\n'.format(rule, spider.name, rule),
                log.INFO)

        foo_fields = self._FIELDS_BY_SPIDER.get(spider.name)
        if foo_fields is None:
            # Adjacent string literals are used instead of a backslash
            # continuation inside the literal, which leaked the
            # continuation whitespace into the message text.
            raise FeverException(
                'a foo_fields variable is not yet '
                'defined for "{0}"'.format(spider.name))

        # make_filename is defined elsewhere; presumably it yields a unique
        # name per run -- verify against its definition.
        out_path = os.path.join('data', make_filename(spider.name, 'json'))
        self.file = open(out_path, 'w+b')
        try:
            # A copy of the list is passed so the shared class-level table
            # cannot be modified through the exporter.
            self.json_exporter = FeverJsonItemExporter(
                self.file, fields_to_export=list(foo_fields),
                sort_keys=True, indent=2)
            # start exporting items to JSON file
            self.json_exporter.start_exporting()
        except Exception:
            # Do not leak the open file handle if the exporter cannot be
            # created or started; the original error still propagates.
            self.file.close()
            raise

    def process_item(self, item, spider):
        '''Export one scraped item and return it unchanged.'''
        # create nested dict here?
        self.json_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        '''Finish the export and close the output file.'''
        try:
            self.json_exporter.finish_exporting()
        finally:
            # The file opened in spider_opened was previously never closed;
            # close it even if finishing the export fails.
            self.file.close()