Exemple #1
0
    def spider_opened(self, spider):
        '''Open the output file and start the JSON exporter for ``spider``.

        Logs a banner containing the spider's name, selects the fields to
        export from ``spider.name``, opens a file under ``data/`` (stored on
        ``self.file``), then creates ``self.json_exporter`` and calls its
        ``start_exporting()``.

        NOTE(review): nothing in this method closes ``self.file``; whoever
        calls ``finish_exporting()`` is presumably expected to close it.

        Raises:
            FeverException: if no field list is defined for ``spider.name``.
        '''
        rule = '-' * 79
        log.msg('\n\n{0}\n{1}\n{2}\n\n'.format(rule, spider.name, rule),
                log.INFO)

        # Pick the fields to export from the spider's name.  The list is
        # handed to the exporter as ``fields_to_export``, so its order is the
        # intended output order.
        if spider.name == 'YelpSpider':
            foo_fields = ['biz_id', 'timestamp', 'stars', 'review_count']
        elif spider.name == 'AptratingsSpider':
            foo_fields = [
                'biz_id', 'recommended_by', 'total_overall_rating',
                'overall_parking', 'overall_maintenance',
                'overall_construction', 'overall_noise', 'overall_grounds',
                'overall_safety', 'overall_office_staff', 'comment_url',
                'comment_title', 'comment_username', 'comment_post_date',
                'comment_years_stayed', 'comment_message', 'last_edited',
                'replies', 'overall_rating', 'parking', 'maintenance',
                'construction', 'noise', 'grounds', 'safety', 'office_staff',
            ]
        else:
            # Adjacent literals instead of a backslash continuation inside
            # the string: the continuation leaked the source indentation
            # into the message text.
            raise FeverException(
                'a foo_fields variable is not yet '
                'defined for "{0}"'.format(spider.name))

        # The field check above runs before the file is opened, so an
        # unsupported spider does not leave an empty output file behind.
        # make_filename() is expected to yield a unique name -- TODO confirm.
        out_path = os.path.join('data', make_filename(spider.name, 'json'))
        self.file = open(out_path, 'w+b')

        self.json_exporter = FeverJsonItemExporter(
            self.file,
            fields_to_export=foo_fields,
            sort_keys=True,
            indent=2)

        # start exporting items to JSON file
        self.json_exporter.start_exporting()
Exemple #2
0
class FeverbotPipeline(object):
    '''Pipeline for writing scraped items to a JSON file.

    One output file is opened per spider in ``spider_opened`` and closed
    again in ``spider_closed``; the two methods are connected to the
    corresponding signals in ``__init__``.
    '''

    # Fields to export for each supported spider name.  The list order is
    # passed to the exporter as ``fields_to_export``.
    _EXPORT_FIELDS = {
        'YelpSpider': ['biz_id', 'timestamp', 'stars', 'review_count'],
        'AptratingsSpider': [
            'biz_id', 'recommended_by', 'total_overall_rating',
            'overall_parking', 'overall_maintenance',
            'overall_construction', 'overall_noise', 'overall_grounds',
            'overall_safety', 'overall_office_staff', 'comment_url',
            'comment_title', 'comment_username', 'comment_post_date',
            'comment_years_stayed', 'comment_message', 'last_edited',
            'replies', 'overall_rating', 'parking', 'maintenance',
            'construction', 'noise', 'grounds', 'safety', 'office_staff',
        ],
    }

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        '''Open the output file and start the JSON exporter for ``spider``.

        Sets ``self.file`` and ``self.json_exporter``.

        Raises:
            FeverException: if no field list is defined for ``spider.name``.
        '''
        rule = '-' * 79
        log.msg('\n\n{0}\n{1}\n{2}\n\n'.format(rule, spider.name, rule),
                log.INFO)

        # figure out which fields to export based on the spider's name
        known_fields = self._EXPORT_FIELDS.get(spider.name)
        if known_fields is None:
            # Adjacent literals instead of a backslash continuation inside
            # the string, which leaked source indentation into the message.
            raise FeverException(
                'a foo_fields variable is not yet '
                'defined for "{0}"'.format(spider.name))
        # Copy so the exporter never shares the class-level list.
        foo_fields = list(known_fields)

        # make_filename() is expected to yield a unique name -- TODO confirm.
        out_path = os.path.join('data', make_filename(spider.name, 'json'))
        self.file = open(out_path, 'w+b')

        self.json_exporter = FeverJsonItemExporter(
            self.file,
            fields_to_export=foo_fields,
            sort_keys=True,
            indent=2)

        # start exporting items to JSON file
        self.json_exporter.start_exporting()

    def process_item(self, item, spider):
        '''Export one scraped item and return it unchanged.
        '''
        # create nested dict here?
        self.json_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        '''Finish the export and close the output file opened for the spider.
        '''
        try:
            self.json_exporter.finish_exporting()
        finally:
            # Close the handle opened in spider_opened even if finishing
            # the export raises; previously it was never closed.
            self.file.close()
    def spider_opened(self, spider):
        '''Initialize the JSON exporter when the spider first starts.

        Logs a banner with the spider's name, chooses the export fields by
        ``spider.name``, opens the output file under ``data/`` as
        ``self.file``, builds ``self.json_exporter`` on top of it and calls
        ``start_exporting()``.

        Raises:
            FeverException: if ``spider.name`` has no field list defined.
        '''
        separator = '-'*79
        log.msg(
            '\n\n{0}\n{1}\n{2}\n\n'.format(separator, spider.name, separator),
            log.INFO)

        # figure out which fields to export based on the spider's name;
        # the list is passed on as fields_to_export in this order
        if spider.name == 'YelpSpider':
            foo_fields = [
                    'biz_id',
                    'timestamp',
                    'stars',
                    'review_count'
                ]
        elif spider.name == 'AptratingsSpider':
            foo_fields = [
                    'biz_id',
                    'recommended_by',
                    'total_overall_rating',
                    'overall_parking',
                    'overall_maintenance',
                    'overall_construction',
                    'overall_noise',
                    'overall_grounds',
                    'overall_safety',
                    'overall_office_staff',

                    'comment_url',
                    'comment_title',
                    'comment_username',
                    'comment_post_date',
                    'comment_years_stayed',
                    'comment_message',
                    'last_edited',
                    'replies',

                    'overall_rating',
                    'parking',
                    'maintenance',
                    'construction',
                    'noise',
                    'grounds',
                    'safety',
                    'office_staff'
                ]
        else:
            # The message is built from two adjacent literals: a backslash
            # continuation inside the string would embed the indentation of
            # the following source line into the text.
            message = ('a foo_fields variable is not yet '
                       'defined for "{0}"')
            raise FeverException(message.format(spider.name))

        # make_filename() is expected to yield a unique name -- TODO confirm.
        filename = make_filename(spider.name, 'json')
        self.file = open(os.path.join('data', filename), 'w+b')

        self.json_exporter = FeverJsonItemExporter(
                                self.file,
                                fields_to_export=foo_fields,
                                sort_keys=True,
                                indent=2)

        # start exporting items to JSON file
        self.json_exporter.start_exporting()
class FeverbotPipeline(object):
    '''Pipeline for writing scraped items to a JSON file.

    ``spider_opened`` opens one output file per spider and ``spider_closed``
    closes it again; both are connected to their signals in ``__init__``.
    '''

    # Export fields per supported spider name; the list order is what gets
    # passed to the exporter as ``fields_to_export``.
    _FIELDS_BY_SPIDER = {
        'YelpSpider': [
            'biz_id',
            'timestamp',
            'stars',
            'review_count',
        ],
        'AptratingsSpider': [
            'biz_id',
            'recommended_by',
            'total_overall_rating',
            'overall_parking',
            'overall_maintenance',
            'overall_construction',
            'overall_noise',
            'overall_grounds',
            'overall_safety',
            'overall_office_staff',

            'comment_url',
            'comment_title',
            'comment_username',
            'comment_post_date',
            'comment_years_stayed',
            'comment_message',
            'last_edited',
            'replies',

            'overall_rating',
            'parking',
            'maintenance',
            'construction',
            'noise',
            'grounds',
            'safety',
            'office_staff',
        ],
    }

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)


    def spider_opened(self, spider):
        '''Initialize the exporter when the spider first starts.

        Sets ``self.file`` and ``self.json_exporter``.

        Raises:
            FeverException: if ``spider.name`` has no field list defined.
        '''
        separator = '-'*79
        log.msg(
            '\n\n{0}\n{1}\n{2}\n\n'.format(separator, spider.name, separator),
            log.INFO)

        # figure out which fields to export based on the spider's name
        if spider.name not in self._FIELDS_BY_SPIDER:
            # Two adjacent literals: a backslash continuation inside the
            # string would embed the next line's indentation in the message.
            message = ('a foo_fields variable is not yet '
                       'defined for "{0}"')
            raise FeverException(message.format(spider.name))
        # Hand the exporter its own copy, not the shared class-level list.
        foo_fields = list(self._FIELDS_BY_SPIDER[spider.name])

        # make_filename() is expected to yield a unique name -- TODO confirm.
        filename = make_filename(spider.name, 'json')
        self.file = open(os.path.join('data', filename), 'w+b')

        self.json_exporter = FeverJsonItemExporter(
                                self.file,
                                fields_to_export=foo_fields,
                                sort_keys=True,
                                indent=2)

        # start exporting items to JSON file
        self.json_exporter.start_exporting()


    def process_item(self, item, spider):
        '''Process scraped items one at a time: export the item, then
        return it unchanged.
        '''
        # create nested dict here?
        self.json_exporter.export_item(item)
        return item


    def spider_closed(self, spider):
        '''Finish exporting and close the output file now that the spider
        is done scraping.
        '''
        try:
            self.json_exporter.finish_exporting()
        finally:
            # The file opened in spider_opened was never closed before;
            # release it even when finish_exporting() raises.
            self.file.close()