def create_aug(cache_temp):
    with contextlib.ExitStack() as stack:
        # Get augmentation data
        newdata = stack.enter_context(
            get_dataset(metadata, task['id'], format='csv'),
        )
        # Get input data if it's a reference to a dataset
        if data_id:
            path = stack.enter_context(
                get_dataset(data_profile, data_id, format='csv'),
            )
            data_file = stack.enter_context(open(path, 'rb'))
        else:
            data_file = io.BytesIO(data)
        # Perform augmentation
        writer = make_writer(cache_temp, format, format_options)
        logger.info("Performing augmentation with supplied data")
        augment(
            data_file,
            newdata,
            data_profile,
            task,
            writer,
            columns=columns,
        )
        # ZIP result if it's a directory
        if os.path.isdir(cache_temp):
            logger.info("Result is a directory, creating ZIP file")
            zip_name = cache_temp + '.zip'
            with zipfile.ZipFile(zip_name, 'w') as zip_:
                make_zip_recursive(zip_, cache_temp)
            shutil.rmtree(cache_temp)
            os.rename(zip_name, cache_temp)
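# make_zip_recursive() is called above but not defined in this excerpt. This
# is a minimal sketch under the assumption that it walks a directory tree and
# adds every file to the already-open ZipFile under its path relative to the
# directory root; the prefix parameter and exact signature are assumptions.
def make_zip_recursive(zip_, source_dir, prefix=''):
    for dirpath, dirnames, filenames in os.walk(source_dir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            # Store the file under its path relative to source_dir
            arcname = os.path.join(
                prefix,
                os.path.relpath(full_path, source_dir),
            )
            zip_.write(full_path, arcname)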
def create_aug(cache_temp):
    try:
        with contextlib.ExitStack() as stack:
            # Get augmentation data
            newdata = stack.enter_context(
                get_dataset(metadata, task['id'], format='csv'),
            )
            # Get input data if it's a reference to a dataset
            if data_id:
                data_file = stack.enter_context(
                    get_dataset(data_profile, data_id, format='csv'),
                )
            else:
                data_file = io.BytesIO(data)
            # Perform augmentation
            logger.info("Performing augmentation with supplied data")
            augment(
                data_file,
                newdata,
                data_profile,
                task,
                columns=columns,
                destination=cache_temp,
            )
    except AugmentationError as e:
        return self.send_error_json(400, str(e))
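# create_aug() receives a cache_temp path, which suggests it is passed as the
# "create" callback of a cache helper such as cache_get_or_set(), the same
# pattern used in materialize_and_process_dataset() further down. A
# hypothetical caller might look like this; the '/cache/aug' directory and
# the wrapper name are assumptions, not part of the excerpt.
def send_cached_augmentation(key):
    # On a cache miss, cache_get_or_set() calls create_aug(cache_temp) to
    # build the entry, then yields the path to the cached result.
    with cache_get_or_set('/cache/aug', key, create_aug) as cache_path:
        with open(cache_path, 'rb') as fp:
            return fp.read()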
async def send_dataset(self, dataset_id, metadata):
    format, format_options, format_ext = self.read_format()

    materialize = metadata.get('materialize', {})
    session_id = self.get_query_argument('session_id', None)

    # If there's a direct download URL
    if ('direct_url' in materialize
            and not session_id
            and format == 'csv'
            and not materialize.get('convert')):
        if format_options:
            return await self.send_error_json(
                400,
                "Invalid output options",
            )
        # Redirect the client to it
        logger.info("Sending redirect to direct_url")
        return self.redirect(materialize['direct_url'])

    with contextlib.ExitStack() as stack:
        try:
            dataset_path = stack.enter_context(
                get_dataset(
                    metadata, dataset_id,
                    format=format, format_options=format_options,
                ))
        except Exception:
            await self.send_error_json(500, "Materializer reports failure")
            raise

        if session_id:
            logger.info("Attaching to session")
            self.application.redis.rpush(
                'session:' + session_id,
                json.dumps(
                    {
                        'type': 'download',
                        'url': (
                            '/download/' + dataset_id + '?'
                            + self.serialize_format(format, format_options)
                        ),
                    },
                    # Compact
                    sort_keys=True, indent=None, separators=(',', ':'),
                ),
            )
            return await self.send_json({'success': "attached to session"})
        else:
            logger.info("Sending file...")
            return await self.send_file(
                dataset_path,
                dataset_id + (format_ext or ''),
            )
async def send_dataset(self, dataset_id, metadata,
                       format='csv', format_options=None):
    materialize = metadata.get('materialize', {})

    # If there's a direct download URL
    if ('direct_url' in materialize
            and format == 'csv'
            and not materialize.get('convert')):
        if format_options:
            return self.send_error_json(400, "Invalid output options")
        # Redirect the client to it
        logger.info("Sending redirect to direct_url")
        return self.redirect(materialize['direct_url'])
    else:
        # We want to catch exceptions from get_dataset(), without catching
        # exceptions from inside the with block
        # https://docs.python.org/3/library/contextlib.html#catching-exceptions-from-enter-methods
        stack = contextlib.ExitStack()
        try:
            dataset_path = stack.enter_context(
                get_dataset(
                    metadata, dataset_id,
                    format=format, format_options=format_options,
                )
            )
        except Exception:
            await self.send_error_json(500, "Materializer reports failure")
            raise
        with stack:
            if zipfile.is_zipfile(dataset_path):
                self.set_header('Content-Type', 'application/zip')
                self.set_header(
                    'Content-Disposition',
                    'attachment; filename="%s.zip"' % dataset_id)
                logger.info("Sending ZIP...")
            else:
                self.set_header('Content-Type', 'application/octet-stream')
                self.set_header('X-Content-Type-Options', 'nosniff')
                self.set_header('Content-Disposition',
                                'attachment; filename="%s"' % dataset_id)
                logger.info("Sending file...")
            with open(dataset_path, 'rb') as fp:
                BUFSIZE = 40960
                buf = fp.read(BUFSIZE)
                while buf:
                    self.write(buf)
                    if len(buf) != BUFSIZE:
                        break
                    buf = fp.read(BUFSIZE)
                    await self.flush()
            return self.finish()
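# Hypothetical GET endpoint wiring for send_dataset() above, shown in the
# same style as the other handler methods in this excerpt. The 'datasets'
# index name and this handler itself are assumptions; read_format() returning
# (format, format_options) matches its use in post() below.
async def get(self, dataset_id):
    # Look up the stored metadata, then delegate to send_dataset()
    metadata = self.application.elasticsearch.get(
        index='datasets', id=dataset_id,
    )['_source']
    format, format_options = self.read_format()
    return await self.send_dataset(
        dataset_id, metadata,
        format, format_options,
    )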
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim, geo_data,
    profile_semaphore,
):
    with contextlib.ExitStack() as stack:
        # Remove converters, we'll discover what's needed
        metadata = dict(metadata)
        materialize = dict(metadata.pop('materialize'))
        materialize.pop('convert', None)

        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(
                    dict(metadata, materialize=materialize),
                    dataset_id,
                )
            )

        def convert_dataset(func, path):
            def convert(cache_temp):
                with open(cache_temp, 'w', newline='') as dst:
                    func(path, dst)

            converted_key = dataset_cache_key(
                dataset_id,
                dict(metadata, materialize=materialize),
                'csv',
                {},
            )
            return stack.enter_context(
                cache_get_or_set(
                    '/cache/datasets', converted_key, convert,
                )
            )

        dataset_path = detect_format_convert_to_csv(
            dataset_path,
            convert_dataset,
            materialize,
        )

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                with tracer.start_as_current_span(
                    'profile',
                    attributes={'dataset': dataset_id},
                ):
                    logger.info("Profiling dataset %r", dataset_id)
                    start = time.perf_counter()
                    metadata = process_dataset(
                        data=dataset_path,
                        dataset_id=dataset_id,
                        metadata=metadata,
                        lazo_client=lazo_client,
                        nominatim=nominatim,
                        geo_data=geo_data,
                        include_sample=True,
                        coverage=True,
                        plots=True,
                    )
                    logger.info(
                        "Profiling dataset %r took %.2fs",
                        dataset_id,
                        time.perf_counter() - start,
                    )

    metadata['materialize'] = materialize
    return metadata
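# prom_incremented() is used above but not defined in this excerpt. A minimal
# sketch, assuming it keeps a prometheus_client Gauge raised while the block
# runs (roughly what Gauge.track_inprogress() provides); the signature is an
# assumption.
import contextlib


@contextlib.contextmanager
def prom_incremented(metric, amount=1):
    # Raise the gauge on entry and lower it again on exit, even on error
    metric.inc(amount)
    try:
        yield
    finally:
        metric.dec(amount)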
def post(self):
    PROM_DOWNLOAD.inc()

    type_ = self.request.headers.get('Content-type', '')

    task = None
    data = None
    format, format_options = self.read_format()
    if type_.startswith('application/json'):
        task = self.get_json()
    elif (type_.startswith('multipart/form-data') or
            type_.startswith('application/x-www-form-urlencoded')):
        task = self.get_body_argument('task', None)
        if task is None and 'task' in self.request.files:
            task = self.request.files['task'][0].body.decode('utf-8')
        if task is not None:
            task = json.loads(task)
        data = self.get_body_argument('data', None)
        if 'data' in self.request.files:
            data = self.request.files['data'][0].body
        elif data is not None:
            data = data.encode('utf-8')
        if 'format' in self.request.files:
            return self.send_error_json(
                400,
                "Sending 'format' in the POST data is no longer "
                "supported, please use query parameters",
            )

    if task is None:
        return self.send_error_json(
            400,
            "Either use multipart/form-data to send the 'data' file and "
            "'task' JSON, or use application/json to send 'task' alone",
        )

    logger.info("Got POST download %s data",
                "without" if data is None else "with")

    # materialize augmentation data
    metadata = task['metadata']

    if not data:
        return self.send_dataset(
            task['id'], metadata, format, format_options,
        )
    else:
        # data
        try:
            data_profile, _ = self.handle_data_parameter(data)
        except ClientError as e:
            return self.send_error_json(400, str(e))

        # first, look for possible augmentation
        search_results = get_augmentation_search_results(
            es=self.application.elasticsearch,
            lazo_client=self.application.lazo_client,
            data_profile=data_profile,
            query_args_main=None,
            query_sup_functions=None,
            query_sup_filters=None,
            tabular_variables=None,
            dataset_id=task['id'],
            union=False,
        )

        if not search_results:
            return self.send_error_json(
                400,
                "The Datamart dataset referenced by 'task' cannot augment "
                "'data'",
            )

        task = search_results[0]

        with get_dataset(metadata, task['id'], format='csv') as newdata:
            # perform augmentation
            logger.info("Performing half-augmentation with supplied data")
            new_path = augment(
                io.BytesIO(data),
                newdata,
                data_profile,
                task,
                return_only_datamart_data=True,
            )
            # FIXME: This always sends in D3M format

            # send a zip file
            self.set_header('Content-Type', 'application/zip')
            self.set_header(
                'Content-Disposition',
                'attachment; filename="augmentation.zip"')
            logger.info("Sending ZIP...")
            writer = RecursiveZipWriter(self.write)
            writer.write_recursive(new_path)
            writer.close()
            shutil.rmtree(os.path.abspath(os.path.join(new_path, '..')))
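# RecursiveZipWriter is used above but not shown in this excerpt. A minimal
# sketch, assuming it streams a ZIP archive through a write callback (here
# Tornado's self.write) by handing zipfile.ZipFile a small file-like wrapper;
# the method names match their use above, everything else is an assumption.
class RecursiveZipWriter(object):
    def __init__(self, write):
        self._write = write
        # ZipFile writes its output through this object's write() method
        self._zip = zipfile.ZipFile(self, 'w')

    def write_recursive(self, src, dst=''):
        # Add a file, or recurse into a directory, under the archive name dst
        if os.path.isdir(src):
            for name in os.listdir(src):
                self.write_recursive(
                    os.path.join(src, name),
                    dst + '/' + name if dst else name,
                )
        else:
            self._zip.write(src, dst or os.path.basename(src))

    def write(self, data):
        # Called by ZipFile; forward the bytes and report how many were taken
        self._write(data)
        return len(data)

    def flush(self):
        pass

    def close(self):
        # Writes the ZIP central directory through the callback
        self._zip.close()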
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim, profile_semaphore,
    cache_invalid=False,
):
    with contextlib.ExitStack() as stack:
        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(metadata, dataset_id, cache_invalid=cache_invalid)
            )
        materialize = metadata.pop('materialize')

        # Check for Excel file format
        try:
            xlrd.open_workbook(dataset_path)
        except xlrd.XLRDError:
            pass
        else:
            logger.info("This is an Excel file")
            materialize.setdefault('convert', []).append({'identifier': 'xls'})

            excel_temp_path = dataset_path + '.xls'
            os.rename(dataset_path, excel_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    xls_to_csv(excel_temp_path, dst)
            finally:
                os.remove(excel_temp_path)

        # Check for TSV file format
        with open(dataset_path, 'r') as fp:
            try:
                dialect = csv.Sniffer().sniff(fp.read(16384))
            except Exception as error:  # csv.Error, UnicodeDecodeError
                logger.error("csv.Sniffer error: %s", error)
                dialect = csv.get_dialect('excel')
        if getattr(dialect, 'delimiter', '') == '\t':
            logger.info("This is a TSV file")
            materialize.setdefault('convert', []).append({'identifier': 'tsv'})

            tsv_temp_path = dataset_path + '.tsv'
            os.rename(dataset_path, tsv_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    tsv_to_csv(tsv_temp_path, dst)
            finally:
                os.remove(tsv_temp_path)

        # Check for pivoted temporal table
        with open(dataset_path, 'r') as fp:
            reader = csv.reader(fp)
            try:
                columns = next(iter(reader))
            except StopIteration:
                columns = []
        if len(columns) >= 3:
            non_matches = [
                i for i, name in enumerate(columns)
                if parse_date(name) is None
            ]
            if len(non_matches) <= max(2.0, 0.20 * len(columns)):
                logger.info("Detected pivoted table")
                materialize.setdefault('convert', []).append({
                    'identifier': 'pivot',
                    'except_columns': non_matches,
                })

                pivot_temp_path = dataset_path + '.pivot.csv'
                os.rename(dataset_path, pivot_temp_path)
                try:
                    with open(dataset_path, 'w', newline='') as dst:
                        pivot_table(pivot_temp_path, dst, non_matches)
                finally:
                    os.remove(pivot_temp_path)

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                logger.info("Profiling dataset %r", dataset_id)
                start = time.perf_counter()
                metadata = process_dataset(
                    data=dataset_path,
                    dataset_id=dataset_id,
                    metadata=metadata,
                    lazo_client=lazo_client,
                    nominatim=nominatim,
                    include_sample=True,
                    coverage=True,
                    plots=True,
                )
                logger.info(
                    "Profiling dataset %r took %.2fs",
                    dataset_id,
                    time.perf_counter() - start,
                )

    metadata['materialize'] = materialize
    return metadata
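# Hypothetical invocation of materialize_and_process_dataset() above: the
# profile_semaphore argument is used as a context manager, so a
# threading.Semaphore bounding concurrent profiling jobs fits. The worker
# function, the limit of 2, and the argument sources are all assumptions.
import threading

MAX_CONCURRENT_PROFILES = 2
profile_semaphore = threading.Semaphore(MAX_CONCURRENT_PROFILES)


def handle_profile_task(dataset_id, metadata, lazo_client, nominatim):
    # Download, convert if needed, and profile one dataset; returns the
    # updated metadata with its 'materialize' block restored
    return materialize_and_process_dataset(
        dataset_id,
        metadata,
        lazo_client=lazo_client,
        nominatim=nominatim,
        profile_semaphore=profile_semaphore,
    )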