def cleanup_upload(object_id, model=None, lang_code="en"):
    """Task to clean up all data related to an upload id."""
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "cleanup_upload"}
        ds_model, _ = get_serializer_by_model(model, logger_context)
        if not ds_model:
            return
        try:
            datasource = ds_model.objects.get(id=object_id)
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found" % (model, object_id), extra=logger_context)
            return
        if datasource.expired_at > timezone.now():
            logger.debug(
                "Skip datasource cleanup %s, expired_at in future %s"
                % (datasource.id, datasource.expired_at.isoformat())
            )
            cleanup_upload.apply_async((datasource.id, model, lang_code), eta=datasource.expired_at)
            return
        datasource_path = os.path.dirname(datasource.file.path)
        shutil.rmtree(datasource_path, ignore_errors=True)
        datasource.delete()
        logger.debug("Remove all data from %s" % datasource_path)


def cleanup_upload(object_id, model=None, lang_code="en"):
    """Task to clean up all data related to an upload id."""
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "cleanup_upload"}
        ds_model, _ = get_serializer_by_model(model, logger_context)
        if not ds_model:
            return
        try:
            datasource = ds_model.objects.get(id=object_id)
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found", model, object_id, extra=logger_context)
            return
        urls = getattr(datasource, "urls", [])
        protocol = None if not urls else get_protocol(urls[0])
        if protocol == "file":
            logger.debug(
                "Skip datasource cleanup %s, file is located in DATAREGISTRY_MEDIA_ROOT",
                datasource.id,
            )
            return
        if datasource.expired_at > timezone.now():
            logger.debug(
                "Skip datasource cleanup %s, expired_at in future %s",
                datasource.id,
                datasource.expired_at.isoformat(),
            )
            cleanup_upload.apply_async((datasource.id, model, lang_code), eta=datasource.expired_at)
            return
        datasource_paths = [os.path.dirname(file.file.path) for file in datasource.files.all()]
        for path in datasource_paths:
            shutil.rmtree(path, ignore_errors=True)
        datasource.delete()
        logger.debug("Remove all data from %s", datasource_paths)
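

# Both cleanup_upload and download_data_source below branch on get_protocol()
# to tell remote URLs apart from "file" URLs that already live under
# DATAREGISTRY_MEDIA_ROOT. That helper is not part of this excerpt; a minimal
# sketch, assuming it only needs to report the URL scheme, could look like
# this (illustrative, not necessarily the project's actual implementation):
from urllib.parse import urlparse


def get_protocol(url):
    """Return the scheme of a URL, e.g. 'file', 'http' or 'https'."""
    return urlparse(url).scheme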


def validate_data(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "validate_data"}
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        channel_layer = get_channel_layer()
        if not ds_model:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Model %s for datasource not found") % model},
            )
            return
        try:
            is_valid = False
            datasource = ds_model.objects.get(id=object_id)
            datasource.status = "validation"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
            logger.debug("Start validation for %s file" % object_id)
            with open(SCHEMA_PATH) as fd:
                schema = json.loads(fd.read())
            resource = ""
            if is_release_package(datasource.file.path):
                resource = "releases"
            elif is_record_package(datasource.file.path):
                resource = "records"
            if resource:
                path = pathlib.Path(datasource.file.path)
                workdir = path.parent
                filename = path.name
                total = path.stat().st_size
                analyzer = FileAnalyzer(
                    workdir,
                    schema=schema,
                    root_key=resource,
                    root_tables=ROOT_TABLES,
                    combined_tables=COMBINED_TABLES,
                )
                timestamp = time.time()
                for read, count in analyzer.analyze_file(filename, with_preview=True):
                    if (time.time() - timestamp) <= 1:
                        continue
                    async_to_sync(channel_layer.group_send)(
                        f"datasource_{datasource.id}",
                        {
                            "type": "task.validate",
                            "datasource": {"id": str(datasource.id)},
                            "progress": {
                                "rows": count,
                                "percentage": (read / total) * 100 if total else 0,
                                "size": total,
                                "read": read,
                            },
                        },
                    )
                    timestamp = time.time()
                is_valid = True
            datasource.validation.is_valid = is_valid
            datasource.root_key = resource
            datasource.validation.save(update_fields=["is_valid"])
            datasource.save(update_fields=["root_key"])
            if is_valid and not datasource.available_tables and not datasource.analyzed_file:
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                analyzer.spec.dump(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(analyzer.spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            elif is_valid and datasource.analyzed_file:
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found" % (model, object_id), extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Datasource %s not found") % object_id},
            )
        except (ijson.JSONError, ijson.IncompleteJSONError) as e:
            logger.info(
                "Error while validating data %s" % object_id,
                extra={"MESSAGE_ID": "validation_exception", "MODEL": model, "ID": object_id, "STR_ERROR": str(e)},
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except OSError as e:
            logger.exception(
                "Error while validating data %s" % object_id,
                extra={"MESSAGE_ID": "validation_exception", "MODEL": model, "ID": object_id, "STR_ERROR": str(e)},
            )
            message = _("Currently, the space limit was reached. Please try again later.")
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except Exception as e:
            logger.exception(
                "Error while validating data %s" % object_id,
                extra={"MESSAGE_ID": "validation_exception", "MODEL": model, "ID": object_id, "STR_ERROR": str(e)},
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )


def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {"FLATTEN_ID": flatten_id, "TASK": "flatten_data", "MODEL": model}
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters" % model, extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            workdir = pathlib.Path(datasource.file.path).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(
                workdir,
                options,
                spec.tables,
                root_key=datasource.root_key,
                **formats,
            )
            timestamp = time.time()
            for count in flattener.flatten_file(datasource.file.path):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {"id": str(flatten.id)},
                        "progress": {
                            "total_rows": total_rows,
                            "processed": count,
                            "percentage": (count / total_rows) * 100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                    os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                    os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found" % (flatten_id, model), extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update({
                "MESSAGE_ID": "flatten_no_left_space",
                "DATASOURCE_ID": str(datasource.id),
                "ERROR_MSG": str(e),
            })
            logger.info("Flatten %s for %s model failed: %s" % (flatten_id, model, e), extra=extra)
            flatten.status = "failed"
            flatten.error = _("Currently, the space limit was reached. Please try again later.")
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except (TypeError, Exception) as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed" % (flatten_id, model, datasource.id),
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )


def download_data_source(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "download_data_source"}
        channel_layer = get_channel_layer()
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        if not ds_model or not serializer:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.download_data_source", "error": _("Model %s for datasource not found") % model},
            )
            return
        try:
            datasource = ds_model.objects.get(id=object_id)
            datasource.status = "downloading"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )
            logger.debug(
                "Start download for %s" % object_id,
                extra={"MESSAGE_ID": "download_start", "UPLOAD_ID": object_id, "URL": datasource.url},
            )
            r = requests.get(datasource.url, stream=True)
            if r.status_code != 200:
                logger.error(
                    "Error while downloading data file for %s" % object_id,
                    extra={
                        "MESSAGE_ID": "download_failed",
                        "DATASOURCE_ID": object_id,
                        "MODEL": model,
                        "STATUS_CODE": r.status_code,
                    },
                )
                datasource.error = _(f"{r.status_code}: {r.reason}")
                datasource.status = "failed"
                datasource.save(update_fields=["error", "status"])
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{object_id}",
                    {
                        "type": "task.download_data_source",
                        "datasource": serializer.to_representation(instance=datasource),
                    },
                )
                return
            size = int(r.headers.get("Content-Length", 0))
            downloaded = 0
            chunk_size = 10240
            _file = ContentFile(b"")
            datasource.file.save("new", _file)
            with open(datasource.file.path, "wb") as fd:
                timestamp = time.time()
                for chunk in r.iter_content(chunk_size=chunk_size):
                    fd.write(chunk)
                    downloaded += chunk_size
                    if size != 0:
                        progress = (downloaded / size) * 100
                        progress = progress if progress < 100 else 100
                    else:
                        progress = size
                    if (time.time() - timestamp) <= 1:
                        continue
                    async_to_sync(channel_layer.group_send)(
                        f"datasource_{object_id}",
                        {
                            "type": "task.download_data_source",
                            "datasource": serializer.to_representation(instance=datasource),
                            "progress": int(progress),
                        },
                    )
                    timestamp = time.time()
            if datasource.analyzed_data_url:
                r = requests.get(datasource.analyzed_data_url, stream=True)
                if r.status_code != 200:
                    logger.error(
                        "Error while downloading data file for %s" % object_id,
                        extra={
                            "MESSAGE_ID": "download_failed",
                            "DATASOURCE_ID": object_id,
                            "MODEL": model,
                            "STATUS_CODE": r.status_code,
                        },
                    )
                    datasource.error = _(f"{r.status_code}: {r.reason}")
                    datasource.status = "failed"
                    datasource.save(update_fields=["error", "status"])
                    async_to_sync(channel_layer.group_send)(
                        f"datasource_{object_id}",
                        {
                            "type": "task.download_data_source",
                            "datasource": serializer.to_representation(instance=datasource),
                        },
                    )
                    return
                downloaded = 0
                datasource.status = "analyzed_data.downloading"
                datasource.save(update_fields=["status"])
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                with open(datasource.analyzed_file.path, "wb") as fd:
                    timestamp = time.time()
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        fd.write(chunk)
                        downloaded += chunk_size
                        progress = (downloaded / size) * 100
                        progress = progress if progress < 100 else 100
                        if (time.time() - timestamp) <= 1:
                            continue
                        async_to_sync(channel_layer.group_send)(
                            f"datasource_{object_id}",
                            {
                                "type": "task.download_data_source",
                                "datasource": serializer.to_representation(instance=datasource),
                                "progress": int(progress),
                            },
                        )
                        timestamp = time.time()
            datasource.status = "queued.validation"
            datasource.downloaded = True
            expired_at = timezone.now() + timedelta(days=settings.JOB_FILES_TIMEOUT)
            datasource.expired_at = expired_at
            datasource.save(update_fields=["status", "downloaded", "expired_at"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )
            logger.info(
                "Complete download for %s" % object_id,
                extra={
                    "MESSAGE_ID": "download_complete",
                    "UPLOAD_ID": object_id,
                    "URL": datasource.url,
                    "EXPIRED_AT": expired_at.isoformat(),
                },
            )
            task = validate_data.delay(object_id, model=model)
            datasource.validation.task_id = task.id
            datasource.validation.save(update_fields=["task_id"])
            logger.info(
                "Schedule validation for %s" % object_id,
                extra={"MESSAGE_ID": "schedule_validation", "UPLOAD_ID": object_id},
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found" % (model, object_id), extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.download_data_source", "error": _("Datasource %s not found") % object_id},
            )
        except OSError as e:
            logger.info(
                "Error while download datasource %s" % object_id,
                extra={
                    "MESSAGE_ID": "download_no_left_space",
                    "DATASOURCE_ID": object_id,
                    "MODEL": model,
                    "ERROR": str(e),
                },
            )
            datasource.error = _("Currently, the space limit was reached. Please try again later.")
            datasource.status = "failed"
            datasource.save(update_fields=["status", "error"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )
        except Exception as e:
            logger.exception(
                "Error while download datasource %s" % object_id,
                extra={
                    "MESSAGE_ID": "download_exception",
                    "DATASOURCE_ID": object_id,
                    "MODEL": model,
                    "ERROR": str(e),
                },
            )
            datasource.error = _("Something went wrong. Contact with support service.")
            datasource.status = "failed"
            datasource.save(update_fields=["status", "error"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )


def validate_data(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "validate_data"}
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        channel_layer = get_channel_layer()
        if not ds_model:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Model %s for datasource not found") % model},
            )
            return
        try:
            is_valid = False
            datasource = ds_model.objects.get(id=object_id)
            datasource.status = "validation"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
            logger.debug("Start validation for %s file", object_id)
            paths = [pathlib.Path(file.file.path) for file in datasource.files.all()]
            workdir = paths[0].parent
            filenames = [pathlib.Path(path).name for path in paths]
            total = sum([
                pathlib.Path(path).stat().st_size if get_reader(path) == open else gz_size(path)
                for path in paths
            ])
            analyzer = FileAnalyzer(workdir, root_tables=ROOT_TABLES, combined_tables=COMBINED_TABLES)
            timestamp = time.time()
            filepaths = [workdir / filename for filename in filenames]
            for read, count in analyzer.analyze_file(filepaths, with_preview=True):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.validate",
                        "datasource": {"id": str(datasource.id)},
                        "progress": {
                            "rows": count,
                            "percentage": (read / total) * 100 if total else 0,
                            "size": total,
                            "read": read,
                        },
                    },
                )
                timestamp = time.time()
            is_valid = True
            datasource.validation.is_valid = is_valid
            datasource.root_key = analyzer.pkg_type
            datasource.validation.save(update_fields=["is_valid"])
            datasource.order = ", ".join(analyzer.order)
            datasource.save()
            if is_valid and not datasource.available_tables and not datasource.analyzed_file:
                _file = ContentFile(b"")
                datasource.analyzed_file.save("new", _file)
                analyzer.spec.dump(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(analyzer.spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            elif is_valid and datasource.analyzed_file:
                spec = DataPreprocessor.restore(datasource.analyzed_file.path)
                available_tables, unavailable_tables = retrieve_tables(spec)
                datasource.available_tables = available_tables
                datasource.unavailable_tables = unavailable_tables
                datasource.save(update_fields=["available_tables", "unavailable_tables"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "datasource": serializer.to_representation(instance=datasource)},
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found", model, object_id, extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.validate", "error": _("Datasource %s not found") % object_id},
            )
        except (ijson.JSONError, ijson.IncompleteJSONError) as e:
            logger.info(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except OSError as e:
            logger.exception(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = (
                _("Currently, the space limit was reached. Please try again later.")
                if "[Errno 28]" in str(e)
                else _("Something went wrong during processing of your file, please contact support")
            )
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
        except Exception as e:
            logger.exception(
                "Error while validating data %s",
                object_id,
                extra={
                    "MESSAGE_ID": "validation_exception",
                    "MODEL": model,
                    "ID": object_id,
                    "STR_EXCEPTION": e.__class__.__name__,
                    "STR_ERROR": str(e),
                },
            )
            message = _("Error while validating data `%s`") % str(e)
            datasource.validation.errors = message
            datasource.validation.is_valid = False
            datasource.validation.save(update_fields=["errors", "is_valid"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.validate", "error": message},
            )
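

# validate_data above sizes the input with gz_size() whenever get_reader()
# reports something other than the plain built-in open(), i.e. a compressed
# file. Neither helper is shown in this excerpt; a minimal sketch of gz_size,
# assuming the standard gzip ISIZE trailer (uncompressed length modulo 2**32)
# is accurate enough for progress reporting, might be:
import struct


def gz_size(path):
    """Approximate the uncompressed size of a gzip file from its ISIZE trailer."""
    with open(path, "rb") as fd:
        fd.seek(-4, 2)  # the last four bytes hold ISIZE as a little-endian uint32
        return struct.unpack("<I", fd.read(4))[0]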


def flatten_data(flatten_id, model=None, lang_code="en_US"):
    with internationalization(lang_code=lang_code):
        logger_context = {"FLATTEN_ID": flatten_id, "TASK": "flatten_data", "MODEL": model}
        channel_layer = get_channel_layer()
        if model not in getters:
            extra = {
                "MESSAGE_ID": "model_not_registered",
                "MODEL": model,
                "TASK": "flatten_data",
                "FLATTEN_ID": flatten_id,
            }
            logger.info("Model %s not registered in getters", model, extra=extra)
            return
        try:
            serializer = FlattenSerializer()
            flatten = Flatten.objects.get(id=flatten_id)
            selection = flatten.dataselection_set.all()[0]
            datasource = getattr(selection, f"{model.lower()}_set").all()[0]
            flatten.status = "processing"
            flatten.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
            spec = DataPreprocessor.restore(datasource.analyzed_file.path)
            total_rows = spec.total_items
            opt = get_flatten_options(selection)
            # If child tables are excluded, 'split' on the root table must be set to False
            # for a proper export.
            # TODO: There should be a better way to handle this (probably on the library side).
            if "exclude" in opt:
                for _table in opt["exclude"]:
                    _parent = spec.tables[_table].parent
                    if _parent != "" and _parent.name in opt["selection"]:
                        opt["selection"][_parent.name]["split"] = False
            logger.debug(
                "Generate options for export",
                extra={
                    "MESSAGE_ID": "generate_flatten_options",
                    "DATASOURCE_ID": str(datasource.id),
                    "MODEL": model,
                    "SELECTION_ID": str(selection.id),
                    "FLATTEN_ID": str(flatten.id),
                    "OPTIONS": opt,
                },
            )
            options = FlattenOptions(**opt)
            files = [file.file.path for file in datasource.files.all()]
            workdir = pathlib.Path(files[0]).parent
            formats = {"csv": None, "xlsx": None}
            if flatten.export_format == flatten.CSV:
                workdir = workdir / "export"
                if not workdir.exists():
                    os.makedirs(workdir)
                formats[flatten.export_format] = workdir
            else:
                formats[flatten.export_format] = "result.xlsx"
            flattener = FileFlattener(
                workdir,
                options,
                tables=spec.tables,
                pkg_type=datasource.root_key,
                multiple_values=getattr(spec, "multiple_values", False),
                schema=spec.schema,
                **formats,
            )
            timestamp = time.time()
            for count in flattener.flatten_file(files):
                if (time.time() - timestamp) <= 1:
                    continue
                async_to_sync(channel_layer.group_send)(
                    f"datasource_{datasource.id}",
                    {
                        "type": "task.flatten",
                        "flatten": {"id": str(flatten.id)},
                        "progress": {
                            "total_rows": total_rows,
                            "processed": count,
                            "percentage": (count / total_rows) * 100 if total_rows else total_rows,
                        },
                    },
                )
                timestamp = time.time()
            if flatten.export_format == flatten.CSV:
                target_file = f"{workdir}/{datasource.id}.zip"
                zip_files(workdir, target_file, extension="csv")
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = f"{datasource.id}.zip"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                    os.remove(fd.name)
            else:
                target_file = f"{workdir}/result.xlsx"
                with open(target_file, "rb") as fd:
                    file_ = File(fd)
                    file_.name = "result.xlsx"
                    flatten.file = file_
                    flatten.status = "completed"
                    flatten.save(update_fields=["file", "status"])
                    os.remove(fd.name)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except ObjectDoesNotExist:
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_not_found"
            logger.info("Flatten %s for %s model not found", flatten_id, model, extra=extra)
        except OSError as e:
            extra = deepcopy(logger_context)
            extra.update({
                "MESSAGE_ID": "flatten_no_left_space",
                "DATASOURCE_ID": str(datasource.id),
                "ERROR_MSG": str(e),
            })
            logger.info("Flatten %s for %s model failed: %s", flatten_id, model, e, extra=extra)
            flatten.status = "failed"
            flatten.error = (
                _("Currently, the space limit was reached. Please try again later.")
                if "[Errno 28]" in str(e)
                else _("Something went wrong during processing of your file, please contact support")
            )
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
        except (TypeError, Exception) as e:
            error_message = str(e)
            extra = deepcopy(logger_context)
            extra["MESSAGE_ID"] = "flatten_failed"
            extra["ERROR_MESSAGE"] = error_message
            logger.error(
                "Flatten %s for %s datasource %s failed",
                flatten_id,
                model,
                datasource.id,
                extra=extra,
                exc_info=True,
            )
            flatten.status = "failed"
            flatten.error = error_message
            flatten.save(update_fields=["error", "status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{datasource.id}",
                {"type": "task.flatten", "flatten": serializer.to_representation(instance=flatten)},
            )
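

# Both flatten_data variants call zip_files() to bundle the per-table CSV
# exports into a single archive before attaching it to the Flatten object.
# The helper is not shown in this excerpt; a minimal sketch matching the call
# zip_files(workdir, target_file, extension="csv") could be (illustrative
# only, not necessarily the project's actual code):
import pathlib
import zipfile


def zip_files(workdir, target_file, extension="csv"):
    """Zip every *.<extension> file found directly under workdir."""
    with zipfile.ZipFile(target_file, "w", compression=zipfile.ZIP_DEFLATED) as archive:
        for path in sorted(pathlib.Path(workdir).glob(f"*.{extension}")):
            archive.write(path, arcname=path.name)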


def download_data_source(object_id, model=None, lang_code="en"):
    with internationalization(lang_code=lang_code):
        logger_context = {"DATASOURCE_ID": object_id, "TASK": "download_data_source"}
        channel_layer = get_channel_layer()
        ds_model, serializer = get_serializer_by_model(model, logger_context)
        if not ds_model or not serializer:
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.download_data_source", "error": _("Model %s for datasource not found") % model},
            )
            return
        try:
            datasource = ds_model.objects.get(id=object_id)
            datasource.status = "downloading"
            datasource.save(update_fields=["status"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )
            logger.debug(
                "Start download for %s",
                object_id,
                extra={"MESSAGE_ID": "download_start", "UPLOAD_ID": object_id, "URL": datasource.urls},
            )
            if get_protocol(datasource.urls[0]) == "file":
                paths = [
                    str(dataregistry_path_resolver(dataregistry_path_formatter(url))).replace(
                        settings.MEDIA_ROOT, ""
                    )
                    for url in datasource.urls
                ]
                files = [DataFile.objects.create() for path in paths]
                files = multiple_file_assigner(files, paths)
                datasource.files.add(*files)
            else:
                r = requests.get(datasource.urls[0], stream=True)
                if r.status_code != 200:
                    logger.error(
                        "Error while downloading data file for %s",
                        object_id,
                        extra={
                            "MESSAGE_ID": "download_failed",
                            "DATASOURCE_ID": object_id,
                            "MODEL": model,
                            "STATUS_CODE": r.status_code,
                        },
                    )
                    datasource.error = f"{r.status_code}: {r.reason}"
                    datasource.status = "failed"
                    datasource.save(update_fields=["error", "status"])
                    async_to_sync(channel_layer.group_send)(
                        f"datasource_{object_id}",
                        {
                            "type": "task.download_data_source",
                            "datasource": serializer.to_representation(instance=datasource),
                        },
                    )
                    return
                size = int(r.headers.get("Content-Length", 0))
                downloaded = 0
                chunk_size = 10240
                _file = ContentFile(b"")
                file_obj = DataFile.objects.create()
                file_obj.file.save("new", _file)
                file_obj.save()
                datasource.files.add(file_obj)
                with open(file_obj.file.path, "wb") as fd:
                    timestamp = time.time()
                    for chunk in r.iter_content(chunk_size=chunk_size):
                        fd.write(chunk)
                        downloaded += chunk_size
                        if size != 0:
                            progress = (downloaded / size) * 100
                            progress = progress if progress < 100 else 100
                        else:
                            progress = size
                        if (time.time() - timestamp) <= 1:
                            continue
                        async_to_sync(channel_layer.group_send)(
                            f"datasource_{object_id}",
                            {
                                "type": "task.download_data_source",
                                "datasource": serializer.to_representation(instance=datasource),
                                "progress": int(progress),
                            },
                        )
                        timestamp = time.time()
            if datasource.analyzed_data_url:
                if get_protocol(datasource.analyzed_data_url) == "file":
                    path = dataregistry_path_formatter(datasource.analyzed_data_url)
                    path = dataregistry_path_resolver(path)
                    path = str(path).replace(settings.MEDIA_ROOT, "")
                    datasource.analyzed_file.name = path
                    datasource.save()
                else:
                    r = requests.get(datasource.analyzed_data_url, stream=True)
                    if r.status_code != 200:
                        logger.error(
                            "Error while downloading data file for %s",
                            object_id,
                            extra={
                                "MESSAGE_ID": "download_failed",
                                "DATASOURCE_ID": object_id,
                                "MODEL": model,
                                "STATUS_CODE": r.status_code,
                            },
                        )
                        datasource.error = f"{r.status_code}: {r.reason}"
                        datasource.status = "failed"
                        datasource.save(update_fields=["error", "status"])
                        async_to_sync(channel_layer.group_send)(
                            f"datasource_{object_id}",
                            {
                                "type": "task.download_data_source",
                                "datasource": serializer.to_representation(instance=datasource),
                            },
                        )
                        return
                    downloaded = 0
                    datasource.status = "analyzed_data.downloading"
                    datasource.save(update_fields=["status"])
                    _file = ContentFile(b"")
                    datasource.analyzed_file.save("new", _file)
                    with open(datasource.analyzed_file.path, "wb") as fd:
                        timestamp = time.time()
                        for chunk in r.iter_content(chunk_size=chunk_size):
                            fd.write(chunk)
                            downloaded += chunk_size
                            progress = (downloaded / size) * 100
                            progress = progress if progress < 100 else 100
                            if (time.time() - timestamp) <= 1:
                                continue
                            async_to_sync(channel_layer.group_send)(
                                f"datasource_{object_id}",
                                {
                                    "type": "task.download_data_source",
                                    "datasource": serializer.to_representation(instance=datasource),
                                    "progress": int(progress),
                                },
                            )
                            timestamp = time.time()
            datasource.status = "queued.validation"
            datasource.downloaded = True
            expired_at = timezone.now() + timedelta(days=settings.JOB_FILES_TIMEOUT)
            datasource.expired_at = expired_at
            datasource.save(update_fields=["status", "downloaded", "expired_at"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )
            logger.info(
                "Complete download for %s",
                object_id,
                extra={
                    "MESSAGE_ID": "download_complete",
                    "UPLOAD_ID": object_id,
                    "URL": datasource.urls,
                    "EXPIRED_AT": expired_at.isoformat(),
                },
            )
            task = validate_data.delay(object_id, model=model)
            datasource.validation.task_id = task.id
            datasource.validation.save(update_fields=["task_id"])
            logger.info(
                "Schedule validation for %s",
                object_id,
                extra={"MESSAGE_ID": "schedule_validation", "UPLOAD_ID": object_id},
            )
        except ObjectDoesNotExist:
            logger_context["MODEL"] = model
            logger_context["MESSAGE_ID"] = "datasource_not_found"
            logger.info("Datasource %s %s not found", model, object_id, extra=logger_context)
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {"type": "task.download_data_source", "error": _("Datasource %s not found") % object_id},
            )
        except (OSError, Exception) as e:
            # Map the caught exception onto one of the two handled categories by
            # isinstance() rather than by exact class name, so subclasses such as
            # FileNotFoundError (or any other Exception subclass) do not raise a
            # KeyError inside this handler.
            error_kind = "OSError" if isinstance(e, OSError) else "Exception"
            message_id = {"OSError": "download_no_left_space", "Exception": "download_exception"}
            message = {
                "OSError": "Something went wrong during processing of your file, please contact support",
                "Exception": "Something went wrong. Contact with support service.",
            }
            log_level = {"OSError": logger.info, "Exception": logger.exception}
            log_level[error_kind](
                "Error while download datasource %s",
                object_id,
                extra={
                    "MESSAGE_ID": message_id[error_kind],
                    "DATASOURCE_ID": object_id,
                    "MODEL": model,
                    "ERROR": str(e),
                    "STR_EXCEPTION": e.__class__.__name__,
                },
            )
            datasource.error = (
                _("Currently, the space limit was reached. Please try again later.")
                if "[Errno 28]" in str(e)
                else _(message[error_kind])
            )
            datasource.status = "failed"
            datasource.save(update_fields=["status", "error"])
            async_to_sync(channel_layer.group_send)(
                f"datasource_{object_id}",
                {
                    "type": "task.download_data_source",
                    "datasource": serializer.to_representation(instance=datasource),
                },
            )
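

# Taken together, these tasks form a pipeline: download_data_source fetches
# the raw files (and, optionally, a pre-analyzed spec) and queues
# validate_data on success; flatten_data runs once the user configures an
# export; cleanup_upload removes everything after expired_at has passed.
# A hedged sketch of how a caller might wire this up (the two helper names
# below are illustrative, only the task calls themselves appear in the code
# above):
def start_processing(datasource, model_name, lang_code="en"):
    """Queue the download; validation is scheduled by the task itself on success."""
    return download_data_source.delay(str(datasource.id), model=model_name, lang_code=lang_code)


def schedule_cleanup(datasource, model_name, lang_code="en"):
    """Defer cleanup until the expiry timestamp set after a successful download."""
    cleanup_upload.apply_async(
        (str(datasource.id), model_name, lang_code),
        eta=datasource.expired_at,
    )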