    # Assumed imports/context for these App Engine (Python 2) snippets:
    #   import logging
    #   import cloudstorage
    #   from StringIO import StringIO
    #   from google.appengine.api import taskqueue
    #   Event and Site are datastore models; BUCKET_NAME is a module constant.
    def post(self):
        # get args
        self.start_cursor = self.request.get('cursor')
        self.filtering_event_key = self.request.get('event')
        self.filename = self.request.get('filename')
        self.csv_header = self.request.get('csv_header')
        self.worker_url = self.request.get('worker_url')

        self.event = Event.get(self.filtering_event_key) if self.filtering_event_key else None

        # get (base) query, skip query to cursor, filter for sites
        query = self.get_base_query()
        if self.start_cursor:
            query.with_cursor(self.start_cursor)
        fetched_sites = query.fetch(limit=self.sites_per_task)
        sites = self.filter_sites(fetched_sites)

        # write part of csv file to GCS
        csv_part_gcs_fd = cloudstorage.open(
            BUCKET_NAME + '/' + self.filename + '.part.' + self.start_cursor,
            'w',
            content_type='text/csv'
        )
        self._write_csv_rows(csv_part_gcs_fd, sites)
        csv_part_gcs_fd.close()

        # decide what to do next
        self.end_cursor = query.cursor()
        if self.end_cursor and self.start_cursor != self.end_cursor:
            # chain to next task
            taskqueue.add(
                url=self.worker_url,
                params=self.get_continuation_param_dict(),
                retry_options=taskqueue.TaskRetryOptions(task_retry_limit=3),
            )
        else:
            # finish file: combine parts and deduplicate lines
            logging.info(u"Deduplicating to create %s ..." % self.filename)

            sio = StringIO()
            path_prefix = BUCKET_NAME + '/' + self.filename + '.part'
            for gcs_file_stat in cloudstorage.listbucket(path_prefix):
                csv_part_gcs_fd = cloudstorage.open(gcs_file_stat.filename)
                for line in csv_part_gcs_fd:
                    sio.write(line)
                csv_part_gcs_fd.close()
            sio.seek(0)
            # a set removes duplicate rows, but note it also discards row order
            deduplicated_lines = set(line for line in sio)

            # write csv header and deduplicated lines to new file
            csv_complete_gcs_fd = cloudstorage.open(
                BUCKET_NAME + '/' + self.filename,
                'w',
                content_type='text/csv'
            )
            csv_complete_gcs_fd.write(self.csv_header.encode('utf-8'))
            for line in deduplicated_lines:
                csv_complete_gcs_fd.write(line)
            csv_complete_gcs_fd.close()
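The chaining step above relies on get_continuation_param_dict(), which is not
shown in this listing. A minimal sketch of what it plausibly returns, assuming
the next task re-reads the same request arguments parsed at the top of post():

    # hypothetical helper -- mirrors the arguments read via self.request.get()
    def get_continuation_param_dict(self):
        return {
            'cursor': self.end_cursor,
            'event': self.filtering_event_key,
            'filename': self.filename,
            'csv_header': self.csv_header,
            'worker_url': self.worker_url,
        }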
Example #3
    @classmethod
    def _crunch_and_save(cls, event_key):
        event = Event.get(event_key)

        # crunch
        stats_d = crunch_incident_statistics(event)
        csv_content = incident_statistics_csv(stats_d)
        html_content = incident_statistics_html(stats_d)

        # save csv & html
        csv_gcs_fd = cloudstorage.open(BUCKET_NAME + '/' +
                                       incident_statistics_csv_filename(event),
                                       'w',
                                       content_type='text/csv')
        csv_gcs_fd.write(csv_content.encode('utf-8'))
        csv_gcs_fd.close()

        html_gcs_fd = cloudstorage.open(
            BUCKET_NAME + '/' + incident_statistics_html_filename(event),
            'w',
            content_type='text/html')
        html_gcs_fd.write(html_content.encode('utf-8'))
        html_gcs_fd.close()
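A hedged usage sketch: reading one of the saved reports back from GCS. The
filename helper is assumed from the snippet above; cloudstorage.open defaults
to read mode, as the part-combining loop in the first example relies on:

    # hypothetical read-back of the saved CSV report
    csv_gcs_fd = cloudstorage.open(
        BUCKET_NAME + '/' + incident_statistics_csv_filename(event))
    csv_content = csv_gcs_fd.read().decode('utf-8')
    csv_gcs_fd.close()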
Example #4
    def post(self):
        # get args
        self.start_cursor = self.request.get('cursor')
        self.filtering_event_key = self.request.get('event')
        self.filename = self.request.get('filename')
        self.csv_header = self.request.get('csv_header')
        self.worker_url = self.request.get('worker_url')

        self.event = Event.get(self.filtering_event_key) if self.filtering_event_key else None

        # get (base) query, skip query to cursor, filter for sites
        query = self.get_base_query()
        if self.start_cursor:
            query.with_cursor(self.start_cursor)
        fetched_sites = query.fetch(limit=self.sites_per_task)
        sites = self.filter_sites(fetched_sites)

        # delete any previous version of the file before re-uploading
        try:
            logging.info("Deleting existing GCS file %s" % self.filename)
            cloudstorage.delete(BUCKET_NAME + '/' + self.filename)
        except Exception as e:
            logging.error("Deleting GCS file failed: %s" % e)
Example #6
    def get_base_query(self):
        query = Site.all()
        if self.filtering_event_key:
            query.filter('event', Event.get(self.filtering_event_key))
        return query
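Note that this helper re-fetches the Event by key even though post() already
loaded it into self.event. A small sketch of a variant that reuses the cached
entity, assuming post() ran first and set self.event:

    # reuse the entity already fetched in post() to save a datastore round trip
    def get_base_query(self):
        query = Site.all()
        if self.event:
            query.filter('event', self.event)
        return query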