Example #1
        def create_aug(cache_temp):
            with contextlib.ExitStack() as stack:
                # Get augmentation data
                newdata = stack.enter_context(
                    get_dataset(metadata, task['id'], format='csv'),
                )
                # Get input data if it's a reference to a dataset
                if data_id:
                    path = stack.enter_context(
                        get_dataset(data_profile, data_id, format='csv'),
                    )
                    data_file = stack.enter_context(open(path, 'rb'))
                else:
                    data_file = io.BytesIO(data)
                # Perform augmentation
                writer = make_writer(cache_temp, format, format_options)
                logger.info("Performing augmentation with supplied data")
                augment(
                    data_file,
                    newdata,
                    data_profile,
                    task,
                    writer,
                    columns=columns,
                )

                # ZIP result if it's a directory
                if os.path.isdir(cache_temp):
                    logger.info("Result is a directory, creating ZIP file")
                    zip_name = cache_temp + '.zip'
                    with zipfile.ZipFile(zip_name, 'w') as zip_:
                        make_zip_recursive(zip_, cache_temp)
                    shutil.rmtree(cache_temp)
                    os.rename(zip_name, cache_temp)
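
The make_zip_recursive helper above comes from the surrounding project. As a point of reference, here is a minimal sketch of equivalent recursive zipping using only the standard library (the function name and the relative-path layout are assumptions, not the project's actual implementation):

import os
import zipfile

def zip_directory(zip_path, directory):
    # Walk the tree and store every file under its path relative
    # to the directory root, as a recursive ZIP helper typically does
    with zipfile.ZipFile(zip_path, 'w') as zip_:
        for dirpath, dirnames, filenames in os.walk(directory):
            for name in filenames:
                full = os.path.join(dirpath, name)
                zip_.write(full, os.path.relpath(full, directory))
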
Example #2
def create_aug(cache_temp):
    try:
        with contextlib.ExitStack() as stack:
            # Get augmentation data
            newdata = stack.enter_context(
                get_dataset(metadata, task['id'], format='csv'),
            )
            # Get input data if it's a reference to a dataset
            if data_id:
                data_file = stack.enter_context(
                    get_dataset(data_profile, data_id, format='csv'),
                )
            else:
                data_file = io.BytesIO(data)
            # Perform augmentation
            logger.info("Performing augmentation with supplied data")
            augment(
                data_file,
                newdata,
                data_profile,
                task,
                columns=columns,
                destination=cache_temp,
            )
    except AugmentationError as e:
        return self.send_error_json(400, str(e))
Example #3
    async def send_dataset(self, dataset_id, metadata):
        format, format_options, format_ext = self.read_format()

        materialize = metadata.get('materialize', {})

        session_id = self.get_query_argument('session_id', None)

        # If there's a direct download URL
        if ('direct_url' in materialize and not session_id and format == 'csv'
                and not materialize.get('convert')):
            if format_options:
                return await self.send_error_json(
                    400,
                    "Invalid output options",
                )
            # Redirect the client to it
            logger.info("Sending redirect to direct_url")
            return self.redirect(materialize['direct_url'])

        with contextlib.ExitStack() as stack:
            try:
                dataset_path = stack.enter_context(
                    get_dataset(
                        metadata,
                        dataset_id,
                        format=format,
                        format_options=format_options,
                    ))
            except Exception:
                await self.send_error_json(500, "Materializer reports failure")
                raise

            if session_id:
                logger.info("Attaching to session")
                self.application.redis.rpush(
                    'session:' + session_id,
                    json.dumps(
                        {
                            'type': 'download',
                            'url': ('/download/' + dataset_id + '?' +
                                    self.serialize_format(format, format_options)),
                        },
                        # Compact
                        sort_keys=True,
                        indent=None,
                        separators=(',', ':'),
                    ),
                )
                return await self.send_json({'success': "attached to session"})
            else:
                logger.info("Sending file...")
                return await self.send_file(
                    dataset_path,
                    dataset_id + (format_ext or ''),
                )
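
The separators=(',', ':') argument is what makes the queued JSON message compact; by default json.dumps pads each ',' and ':' with a space. A standalone illustration (the message payload here is made up):

import json

message = {'type': 'download', 'url': '/download/some-id?format=csv'}
# Default formatting inserts a space after ',' and ':'
print(json.dumps(message))
# {"type": "download", "url": "/download/some-id?format=csv"}
# Compact separators drop the padding, saving space in the Redis list
print(json.dumps(message, sort_keys=True, indent=None, separators=(',', ':')))
# {"type":"download","url":"/download/some-id?format=csv"}
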
Example #4
    async def send_dataset(self, dataset_id, metadata,
                           format='csv', format_options=None):
        materialize = metadata.get('materialize', {})

        # If there's a direct download URL
        if ('direct_url' in materialize and
                format == 'csv' and not materialize.get('convert')):
            if format_options:
                return self.send_error_json(400, "Invalid output options")
            # Redirect the client to it
            logger.info("Sending redirect to direct_url")
            return self.redirect(materialize['direct_url'])
        else:
            # We want to catch exceptions from get_dataset(), without catching
            # exceptions from inside the with block
            # https://docs.python.org/3/library/contextlib.html#catching-exceptions-from-enter-methods
            stack = contextlib.ExitStack()
            try:
                dataset_path = stack.enter_context(
                    get_dataset(
                        metadata, dataset_id,
                        format=format, format_options=format_options,
                    )
                )
            except Exception:
                await self.send_error_json(500, "Materializer reports failure")
                raise
            with stack:
                if zipfile.is_zipfile(dataset_path):
                    self.set_header('Content-Type', 'application/zip')
                    self.set_header(
                        'Content-Disposition',
                        'attachment; filename="%s.zip"' % dataset_id)
                    logger.info("Sending ZIP...")
                else:
                    self.set_header('Content-Type', 'application/octet-stream')
                    self.set_header('X-Content-Type-Options', 'nosniff')
                    self.set_header('Content-Disposition',
                                    'attachment; filename="%s"' % dataset_id)
                    logger.info("Sending file...")
                with open(dataset_path, 'rb') as fp:
                    BUFSIZE = 40960
                    buf = fp.read(BUFSIZE)
                    while buf:
                        self.write(buf)
                        if len(buf) != BUFSIZE:
                            break
                        buf = fp.read(BUFSIZE)
                    await self.flush()
                return self.finish()
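
The comment in this example links to the documented ExitStack idiom: enter the context managers inside a try block so that failures from __enter__ can be handled, then hand the stack to a with statement so cleanup still runs for the body. A minimal sketch of the same pattern with a plain file ('dataset.csv' is a placeholder path):

import contextlib

stack = contextlib.ExitStack()
try:
    # Only failures from open() itself land in this except clause
    fp = stack.enter_context(open('dataset.csv', 'rb'))
except OSError:
    print("could not open the file")
    raise
with stack:
    # Exceptions raised in this block propagate normally,
    # and the file is still closed when the block exits
    header = fp.read(100)
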
Example #5
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim, geo_data,
    profile_semaphore,
):
    with contextlib.ExitStack() as stack:
        # Remove converters, we'll discover what's needed
        metadata = dict(metadata)
        materialize = dict(metadata.pop('materialize'))
        materialize.pop('convert', None)

        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(
                    dict(metadata, materialize=materialize),
                    dataset_id,
                )
            )

        def convert_dataset(func, path):
            def convert(cache_temp):
                with open(cache_temp, 'w', newline='') as dst:
                    func(path, dst)
            converted_key = dataset_cache_key(
                dataset_id,
                dict(metadata, materialize=materialize),
                'csv',
                {},
            )
            return stack.enter_context(
                cache_get_or_set(
                    '/cache/datasets',
                    converted_key,
                    convert,
                )
            )

        dataset_path = detect_format_convert_to_csv(
            dataset_path,
            convert_dataset,
            materialize,
        )

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                with tracer.start_as_current_span(
                    'profile',
                    attributes={'dataset': dataset_id},
                ):
                    logger.info("Profiling dataset %r", dataset_id)
                    start = time.perf_counter()
                    metadata = process_dataset(
                        data=dataset_path,
                        dataset_id=dataset_id,
                        metadata=metadata,
                        lazo_client=lazo_client,
                        nominatim=nominatim,
                        geo_data=geo_data,
                        include_sample=True,
                        coverage=True,
                        plots=True,
                    )
                    logger.info(
                        "Profiling dataset %r took %.2fs",
                        dataset_id,
                        time.perf_counter() - start,
                    )

        metadata['materialize'] = materialize
        return metadata
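
cache_get_or_set is a project helper, not part of the standard library; from its usage it yields a cache path for a key, invoking the supplied callback to build the entry on a miss. A rough, hypothetical sketch of that contract (a real shared cache would also need locking and eviction):

import contextlib
import os

@contextlib.contextmanager
def cache_get_or_set(cache_dir, key, create):
    # Hypothetical reimplementation for illustration only
    path = os.path.join(cache_dir, key)
    if not os.path.exists(path):
        temp = path + '.tmp'
        create(temp)           # caller writes the entry to the temp path
        os.rename(temp, path)  # then it is published atomically (POSIX)
    yield path
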
Example #6
    def post(self):
        PROM_DOWNLOAD.inc()

        type_ = self.request.headers.get('Content-type', '')

        task = None
        data = None
        format, format_options = self.read_format()
        if type_.startswith('application/json'):
            task = self.get_json()
        elif (type_.startswith('multipart/form-data') or
                type_.startswith('application/x-www-form-urlencoded')):
            task = self.get_body_argument('task', None)
            if task is None and 'task' in self.request.files:
                task = self.request.files['task'][0].body.decode('utf-8')
            if task is not None:
                task = json.loads(task)
            data = self.get_body_argument('data', None)
            if 'data' in self.request.files:
                data = self.request.files['data'][0].body
            elif data is not None:
                data = data.encode('utf-8')
            if 'format' in self.request.files:
                return self.send_error_json(
                    400,
                    "Sending 'format' in the POST data is no longer "
                    "supported, please use query parameters",
                )
        if task is None:
            return self.send_error_json(
                400,
                "Either use multipart/form-data to send the 'data' file and "
                "'task' JSON, or use application/json to send 'task' alone",
            )

        logger.info("Got POST download %s data",
                    "without" if data is None else "with")

        # materialize augmentation data
        metadata = task['metadata']

        if not data:
            return self.send_dataset(
                task['id'], metadata, format, format_options,
            )
        else:
            # data
            try:
                data_profile, _ = self.handle_data_parameter(data)
            except ClientError as e:
                return self.send_error_json(400, str(e))

            # first, look for possible augmentation
            search_results = get_augmentation_search_results(
                es=self.application.elasticsearch,
                lazo_client=self.application.lazo_client,
                data_profile=data_profile,
                query_args_main=None,
                query_sup_functions=None,
                query_sup_filters=None,
                tabular_variables=None,
                dataset_id=task['id'],
                union=False
            )

            if not search_results:
                return self.send_error_json(
                    400,
                    "The Datamart dataset referenced by 'task' cannot augment "
                    "'data'",
                )

            task = search_results[0]

            with get_dataset(metadata, task['id'], format='csv') as newdata:
                # perform augmentation
                logger.info("Performing half-augmentation with supplied data")
                new_path = augment(
                    io.BytesIO(data),
                    newdata,
                    data_profile,
                    task,
                    return_only_datamart_data=True
                )
                # FIXME: This always sends in D3M format

            # send a zip file
            self.set_header('Content-Type', 'application/zip')
            self.set_header(
                'Content-Disposition',
                'attachment; filename="augmentation.zip"')
            logger.info("Sending ZIP...")
            writer = RecursiveZipWriter(self.write)
            writer.write_recursive(new_path)
            writer.close()
            shutil.rmtree(os.path.abspath(os.path.join(new_path, '..')))
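
The Content-Type dispatch at the top of this handler is a common Tornado idiom: JSON bodies are parsed directly, while form posts may carry fields either as body arguments or as uploaded files. A stripped-down, hypothetical handler showing just that dispatch:

import json
import tornado.web

class DownloadHandler(tornado.web.RequestHandler):
    def post(self):
        type_ = self.request.headers.get('Content-Type', '')
        if type_.startswith('application/json'):
            task = json.loads(self.request.body)
        elif type_.startswith(('multipart/form-data',
                               'application/x-www-form-urlencoded')):
            # A form field can arrive as an argument or as an uploaded file
            task = self.get_body_argument('task', None)
            if task is None and 'task' in self.request.files:
                task = self.request.files['task'][0].body.decode('utf-8')
            if task is not None:
                task = json.loads(task)
        else:
            task = None
        if task is None:
            self.set_status(400)
            self.finish({'error': "missing 'task'"})
            return
        self.finish({'received': task.get('id')})
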
Example #7
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim,
    profile_semaphore,
    cache_invalid=False,
):
    with contextlib.ExitStack() as stack:
        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(metadata, dataset_id, cache_invalid=cache_invalid)
            )
        materialize = metadata.pop('materialize')

        # Check for Excel file format
        try:
            xlrd.open_workbook(dataset_path)
        except xlrd.XLRDError:
            pass
        else:
            logger.info("This is an Excel file")
            materialize.setdefault('convert', []).append({'identifier': 'xls'})
            excel_temp_path = dataset_path + '.xls'
            os.rename(dataset_path, excel_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    xls_to_csv(excel_temp_path, dst)
            finally:
                os.remove(excel_temp_path)

        # Check for TSV file format
        with open(dataset_path, 'r') as fp:
            try:
                dialect = csv.Sniffer().sniff(fp.read(16384))
            except Exception as error:  # csv.Error, UnicodeDecodeError
                logger.error("csv.Sniffer error: %s", error)
                dialect = csv.get_dialect('excel')
        if getattr(dialect, 'delimiter', '') == '\t':
            logger.info("This is a TSV file")
            materialize.setdefault('convert', []).append({'identifier': 'tsv'})
            tsv_temp_path = dataset_path + '.tsv'
            os.rename(dataset_path, tsv_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    tsv_to_csv(tsv_temp_path, dst)
            finally:
                os.remove(tsv_temp_path)

        # Check for pivoted temporal table
        with open(dataset_path, 'r') as fp:
            reader = csv.reader(fp)
            try:
                columns = next(iter(reader))
            except StopIteration:
                columns = []
        if len(columns) >= 3:
            non_matches = [
                i for i, name in enumerate(columns)
                if parse_date(name) is None
            ]
            if len(non_matches) <= max(2.0, 0.20 * len(columns)):
                logger.info("Detected pivoted table")
                materialize.setdefault('convert', []).append({
                    'identifier': 'pivot',
                    'except_columns': non_matches,
                })
                pivot_temp_path = dataset_path + '.pivot.csv'
                os.rename(dataset_path, pivot_temp_path)
                try:
                    with open(dataset_path, 'w', newline='') as dst:
                        pivot_table(pivot_temp_path, dst, non_matches)
                finally:
                    os.remove(pivot_temp_path)

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                logger.info("Profiling dataset %r", dataset_id)
                start = time.perf_counter()
                metadata = process_dataset(
                    data=dataset_path,
                    dataset_id=dataset_id,
                    metadata=metadata,
                    lazo_client=lazo_client,
                    nominatim=nominatim,
                    include_sample=True,
                    coverage=True,
                    plots=True,
                )
                logger.info(
                    "Profiling dataset %r took %.2fs",
                    dataset_id,
                    time.perf_counter() - start,
                )

        metadata['materialize'] = materialize
        return metadata
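
The TSV check in this example relies on csv.Sniffer guessing the delimiter from a sample of the file; a self-contained demonstration on an in-memory sample:

import csv
import io

sample = "id\tname\tvalue\n1\talpha\t10\n2\tbeta\t20\n"
dialect = csv.Sniffer().sniff(sample)
print(dialect.delimiter == '\t')  # True: detected as tab-separated

# Reading with the sniffed dialect splits the columns correctly
rows = list(csv.reader(io.StringIO(sample), dialect))
print(rows[0])  # ['id', 'name', 'value']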