import logging
from StringIO import StringIO

import cloudstorage
from google.appengine.api import taskqueue

# Event, Site, BUCKET_NAME, the incident_statistics_* helpers and instance
# attributes such as self.sites_per_task are defined elsewhere in the project.


def post(self):
    # get args
    self.start_cursor = self.request.get('cursor')
    self.filtering_event_key = self.request.get('event')
    self.filename = self.request.get('filename')
    self.csv_header = self.request.get('csv_header')
    self.worker_url = self.request.get('worker_url')
    self.event = Event.get(self.filtering_event_key) if self.filtering_event_key else None

    # get (base) query, skip query to cursor, filter for sites
    query = self.get_base_query()
    if self.start_cursor:
        query.with_cursor(self.start_cursor)
    fetched_sites = query.fetch(limit=self.sites_per_task)
    sites = self.filter_sites(fetched_sites)

    # write part of csv file to GCS
    csv_part_gcs_fd = cloudstorage.open(
        BUCKET_NAME + '/' + self.filename + '.part.' + self.start_cursor,
        'w',
        content_type='text/csv'
    )
    self._write_csv_rows(csv_part_gcs_fd, sites)
    csv_part_gcs_fd.close()

    # decide what to do next
    self.end_cursor = query.cursor()
    if self.end_cursor and self.start_cursor != self.end_cursor:
        # chain to next task
        taskqueue.add(
            url=self.worker_url,
            params=self.get_continuation_param_dict(),
            retry_options=taskqueue.TaskRetryOptions(task_retry_limit=3),
        )
    else:
        # finish file: combine parts and deduplicate lines
        logging.info(u"Deduplicating to create %s ..." % self.filename)
        sio = StringIO()
        path_prefix = BUCKET_NAME + '/' + self.filename + '.part'
        for gcs_file_stat in cloudstorage.listbucket(path_prefix):
            csv_part_gcs_fd = cloudstorage.open(gcs_file_stat.filename)
            for line in csv_part_gcs_fd:
                sio.write(line)
            csv_part_gcs_fd.close()
        sio.seek(0)
        deduplicated_lines = set(line for line in sio)

        # write csv header and deduplicated lines to new file
        csv_complete_gcs_fd = cloudstorage.open(
            BUCKET_NAME + '/' + self.filename,
            'w',
            content_type='text/csv'
        )
        csv_complete_gcs_fd.write(self.csv_header.encode('utf-8'))
        for line in deduplicated_lines:
            csv_complete_gcs_fd.write(line)
        csv_complete_gcs_fd.close()
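
# The worker chains to itself via get_continuation_param_dict(), which is not
# shown above. A minimal sketch, assuming it simply re-posts the same request
# arguments with the cursor advanced to end_cursor so the next task resumes
# where this one stopped:
def get_continuation_param_dict(self):
    # hypothetical sketch, not the original implementation
    return {
        'cursor': self.end_cursor,
        'event': self.filtering_event_key,
        'filename': self.filename,
        'csv_header': self.csv_header,
        'worker_url': self.worker_url,
    }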
def _crunch_and_save(cls, event_key):
    event = Event.get(event_key)

    # crunch
    stats_d = crunch_incident_statistics(event)
    csv_content = incident_statistics_csv(stats_d)
    html_content = incident_statistics_html(stats_d)

    # save csv & html
    csv_gcs_fd = cloudstorage.open(
        BUCKET_NAME + '/' + incident_statistics_csv_filename(event),
        'w',
        content_type='text/csv'
    )
    csv_gcs_fd.write(csv_content.encode('utf-8'))
    csv_gcs_fd.close()

    html_gcs_fd = cloudstorage.open(
        BUCKET_NAME + '/' + incident_statistics_html_filename(event),
        'w',
        content_type='text/html'
    )
    html_gcs_fd.write(html_content.encode('utf-8'))
    html_gcs_fd.close()
def post(self):
    # get args
    self.start_cursor = self.request.get('cursor')
    self.filtering_event_key = self.request.get('event')
    self.filename = self.request.get('filename')
    self.csv_header = self.request.get('csv_header')
    self.worker_url = self.request.get('worker_url')
    self.event = Event.get(self.filtering_event_key) if self.filtering_event_key else None

    # get (base) query, skip query to cursor, filter for sites
    query = self.get_base_query()
    if self.start_cursor:
        query.with_cursor(self.start_cursor)
    fetched_sites = query.fetch(limit=self.sites_per_task)
    sites = self.filter_sites(fetched_sites)

    # try deleting any previously exported file (the object, not the bucket)
    # before uploading
    try:
        logging.info("trying to delete existing file %s" % self.filename)
        cloudstorage.delete(BUCKET_NAME + '/' + self.filename)
    except Exception as e:
        logging.error("Deleting existing file failed: %s" % e)
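
# Both worker variants delegate row writing to _write_csv_rows(), which is not
# shown above. A minimal sketch, assuming each site can be reduced to a list of
# unicode fields by a hypothetical get_csv_fields() helper, using the standard
# csv module (which in Python 2 only accepts byte strings):
import csv

def _write_csv_rows(self, fd, sites):
    # hypothetical sketch, not the original implementation
    writer = csv.writer(fd)
    for site in sites:
        row = [unicode(value).encode('utf-8') for value in self.get_csv_fields(site)]
        writer.writerow(row)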
def get_base_query(self):
    query = Site.all()
    if self.filtering_event_key:
        query.filter('event', Event.get(self.filtering_event_key))
    return query
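
# How the chain is started is not shown above. A minimal kick-off sketch,
# assuming the worker handler is mounted at a hypothetical
# '/tasks/csv_export_worker' URL and assumed values for the output name and
# header row: the first task is enqueued with an empty cursor, and the worker
# then chains itself until the datastore cursor stops advancing, at which point
# it merges the part files.
from google.appengine.api import taskqueue

def start_csv_export(event=None):
    worker_url = '/tasks/csv_export_worker'  # assumed route for the worker
    taskqueue.add(
        url=worker_url,
        params={
            'cursor': '',                                # start from the beginning
            'event': str(event.key()) if event else '',  # optional event filter
            'filename': 'sites.csv',                     # assumed output object name
            'csv_header': u'id,name,address\n',          # assumed header row
            'worker_url': worker_url,
        },
    )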