def test_pivot(self):
    """Test pivoting a dataset with date columns"""
    f_out = io.StringIO()
    pivot_table(
        os.path.join(os.path.dirname(__file__), 'data/dates_pivoted.csv'),
        f_out,
        [0],
    )
    with data('dates_pivoted.converted.csv', 'r', newline='') as f_exp:
        self.assertEqual(
            f_out.getvalue(),
            f_exp.read(),
        )
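The data() helper used by these tests is not shown in this listing; a minimal sketch, assuming it simply opens files from a 'data/' directory next to the test module (the name and signature are inferred from the calls above):

import os

def data(filename, mode='r', **kwargs):
    # Hypothetical test helper: open a file from the 'data/' directory
    # that sits next to this test module
    return open(
        os.path.join(os.path.dirname(__file__), 'data', filename),
        mode,
        **kwargs,
    )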
Example #2
def test_pivot_years(self):
    """Test pivoting a dataset with year columns"""
    f_out = io.StringIO()
    pivot_table(
        os.path.join(os.path.dirname(__file__), 'data/years_pivoted.csv'),
        f_out,
        [0],
        'year',
    )
    with data('years_pivoted.converted.csv', 'r', newline='') as f_exp:
        self.assertEqual(
            f_out.getvalue(),
            f_exp.read(),
        )
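For intuition, pivot_table unpivots a wide temporal table into long form, keeping the columns listed in its third argument fixed; a hypothetical sketch of the transformation this test exercises (the data and the output column names are assumptions):

# Hypothetical input (years_pivoted.csv), with column 0 kept fixed:
#     country,2018,2019
#     France,1.0,2.0
# Assumed output of pivot_table(src, dst, [0], 'year'):
#     country,year,value
#     France,2018,1.0
#     France,2019,2.0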
Example #3
def detect_format_convert_to_csv(dataset_path, convert_dataset, materialize):
    """Detect supported formats and convert to CSV.

    :param dataset_path: Input dataset to be processed.
    :param convert_dataset: Function wrapping the conversion, in charge of
        creating the new file and cleaning up the previous one for each
        conversion. Takes a conversion function with signature
        (filename, unicode file object), runs it, and returns the new path.
    :param materialize: Materialization info to be updated with the applied
        conversions.
    """
    with open(dataset_path, 'rb') as fp:
        magic = fp.read(16)

    # Check for Excel XLSX file format (2007+)
    if magic[:4] == b'PK\x03\x04':
        try:
            zip_file = zipfile.ZipFile(dataset_path)
        except zipfile.BadZipFile:
            pass
        else:
            if any(info.filename.startswith('xl/')
                   for info in zip_file.infolist()):
                # Update metadata
                logger.info("This is an Excel XLSX (2007+) file")
                materialize.setdefault('convert', []).append(
                    {'identifier': 'xlsx'},
                )

                # Update file
                dataset_path = convert_dataset(xlsx_to_csv, dataset_path)

    # Check for Excel XLS file format (1997-2003)
    if magic[:8] == b'\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1':
        # Update metadata
        logger.info("This is an Excel XLS (1997-2003) file")
        materialize.setdefault('convert', []).append({'identifier': 'xls'})

        # Update file
        dataset_path = convert_dataset(xls_to_csv, dataset_path)

    # Check for Stata file format
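    # The '<stata_dta>' tag marks newer .dta files (format 117+); the byte
    # signatures below are the old binary headers (format versions 113-115)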
    if magic[:11] == b'<stata_dta>' or magic[:4] in (
            b'\x73\x01\x01\x00',
            b'\x73\x02\x01\x00',
            b'\x72\x01\x01\x00',
            b'\x72\x02\x01\x00',
            b'\x71\x01\x01\x01',
            b'\x71\x02\x01\x01',
    ):
        # Update metadata
        logger.info("This is a Stata file")
        materialize.setdefault('convert', []).append({'identifier': 'stata'})

        # Update file
        dataset_path = convert_dataset(stata_to_csv, dataset_path)

    # Check for SPSS file format
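    # b'$FL2' is the standard .sav magic number; b'$FL3' marks the
    # zlib-compressed zsav variant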
    if magic[:4] in (b'\xC1\xE2\xC3\xC9', b'$FL2', b'$FL3'):
        # Update metadata
        logger.info("This is an SPSS file")
        materialize.setdefault('convert', []).append({'identifier': 'spss'})

        # Update file
        dataset_path = convert_dataset(spss_to_csv, dataset_path)

    # Check for TSV file format
    with open(dataset_path, 'r') as fp:
        # Read at least 65kB and 3 lines, and at most 5MB
        sample = fp.read(65536)
        newlines = sample.count('\n')
        while newlines < 3 and len(sample) < 5242880:
            more = fp.read(65536)
            if not more:
                break
            sample += more
            newlines += more.count('\n')

        # Run the sniffer
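        # DELIMITERS (defined elsewhere in this module) restricts the set of
        # candidate separators the sniffer will consider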
        dialect = csv.get_dialect('excel')
        if newlines >= 3:
            try:
                dialect = csv.Sniffer().sniff(sample, DELIMITERS)
            except Exception as error:  # csv.Error, UnicodeDecodeError
                logger.warning("csv.Sniffer error: %s", error)
        else:
            logger.warning("Lines are too long to use csv.Sniffer")
    if getattr(dialect, 'delimiter', ',') != ',':
        # Update metadata
        logger.info("Detected separator is %r", dialect.delimiter)
        materialize.setdefault('convert', []).append({
            'identifier': 'tsv',
            'separator': dialect.delimiter,
        })

        # Update file
        dataset_path = convert_dataset(
            lambda s, d: tsv_to_csv(s, d, separator=dialect.delimiter),
            dataset_path,
        )

    # Check for non-data rows at the top of the file
    with open(dataset_path, 'r') as fp:
        non_data_rows = count_garbage_rows(fp)
        if non_data_rows > 0:
            # Update metadata
            logger.info("Detected %d lines to skip", non_data_rows)
            materialize.setdefault('convert', []).append({
                'identifier': 'skip_rows',
                'nb_rows': non_data_rows,
            })

            # Update file
            dataset_path = convert_dataset(
                lambda s, d: skip_rows(s, d, nb_rows=non_data_rows),
                dataset_path,
            )

    # Check for pivoted temporal table
    with open(dataset_path, 'r') as fp:
        reader = csv.reader(fp)
        try:
            columns = next(reader)
        except StopIteration:
            columns = []
    if len(columns) >= 3:
        # Look for dates
        non_dates = [
            i for i, name in enumerate(columns) if parse_date(name) is None
        ]

        # Look for years
        def is_year(name, max_year=datetime.utcnow().year + 2):
            if len(name) != 4:
                return False
            try:
                return 1900 <= int(name) <= max_year
            except ValueError:
                return False

        non_years = [i for i, name in enumerate(columns) if not is_year(name)]

        # If there are enough matches, pivot
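        # Tolerate up to 20% non-temporal columns, but never fewer than two:
        # with 10 columns, max(2.0, 0.20 * 10) = 2 may fail to parse and the
        # table is still treated as pivoted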
        non_matches = min([non_dates, non_years], key=len)
        if len(non_matches) <= max(2.0, 0.20 * len(columns)):
            date_label = 'year' if non_matches is non_years else 'date'

            # Update metadata
            logger.info("Detected pivoted table")
            materialize.setdefault('convert', []).append({
                'identifier': 'pivot',
                'except_columns': non_matches,
                'date_label': date_label,
            })

            # Update file
            dataset_path = convert_dataset(
                lambda path, dst: pivot_table(path, dst, non_matches,
                                              date_label),
                dataset_path,
            )

    return dataset_path
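The convert_dataset callable above is supplied by the caller; a minimal sketch of such a wrapper, following the rename/convert/remove pattern that Example #5 below inlines by hand (the '.orig' temporary suffix is a hypothetical choice):

import os

def convert_dataset(convert_func, dataset_path):
    # Move the current file aside, write the converted version in its place,
    # then delete the original; returns the path of the new file
    temp_path = dataset_path + '.orig'
    os.rename(dataset_path, temp_path)
    try:
        with open(dataset_path, 'w', newline='') as dst:
            convert_func(temp_path, dst)
    finally:
        os.remove(temp_path)
    return dataset_path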
Example #4
def detect_format_convert_to_csv(dataset_path, convert_dataset, materialize):
    """Detect supported formats and convert to CSV.

    :param dataset_path: Input dataset to be processed.
    :param convert_dataset: Function wrapping the conversion, in charge of
        creating the new file and cleaning up the previous one for each
        conversion. Takes a conversion function with signature
        (filename, unicode file object), runs it, and returns the new path.
    :param materialize: Materialization info to be updated with the applied
        conversions.
    """
    # Check for Excel file format
    try:
        xlrd.open_workbook(dataset_path)
    except xlrd.XLRDError:
        pass
    else:
        # Update metadata
        logger.info("This is an Excel file")
        materialize.setdefault('convert', []).append({'identifier': 'xls'})

        # Update file
        dataset_path = convert_dataset(xls_to_csv, dataset_path)

    with open(dataset_path, 'rb') as fp:
        magic = fp.read(16)

    # Check for Stata file format
    if magic[:11] == b'<stata_dta>' or magic[:4] in (
            b'\x73\x01\x01\x00',
            b'\x73\x02\x01\x00',
            b'\x72\x01\x01\x00',
            b'\x72\x02\x01\x00',
            b'\x71\x01\x01\x01',
            b'\x71\x02\x01\x01',
    ):
        # Update metadata
        logger.info("This is a Stata file")
        materialize.setdefault('convert', []).append({'identifier': 'stata'})

        # Update file
        dataset_path = convert_dataset(stata_to_csv, dataset_path)

    # Check for SPSS file format
    if magic[:4] in (b'\xC1\xE2\xC3\xC9', b'$FL2', b'$FL3'):
        # Update metadata
        logger.info("This is an SPSS file")
        materialize.setdefault('convert', []).append({'identifier': 'spss'})

        # Update file
        dataset_path = convert_dataset(spss_to_csv, dataset_path)

    # Check for TSV file format
    with open(dataset_path, 'r') as fp:
        try:
            dialect = csv.Sniffer().sniff(fp.read(16384))
        except Exception as error:  # csv.Error, UnicodeDecodeError
            logger.warning("csv.Sniffer error: %s", error)
            dialect = csv.get_dialect('excel')
    if getattr(dialect, 'delimiter', ',') != ',':
        # Update metadata
        logger.info("Detected separator is %r", dialect.delimiter)
        materialize.setdefault('convert', []).append({
            'identifier': 'tsv',
            'separator': dialect.delimiter,
        })

        # Update file
        dataset_path = convert_dataset(
            lambda s, d: tsv_to_csv(s, d, separator=dialect.delimiter),
            dataset_path,
        )

    # Check for pivoted temporal table
    with open(dataset_path, 'r') as fp:
        reader = csv.reader(fp)
        try:
            columns = next(reader)
        except StopIteration:
            columns = []
    if len(columns) >= 3:
        non_matches = [
            i for i, name in enumerate(columns) if parse_date(name) is None
        ]
        if len(non_matches) <= max(2.0, 0.20 * len(columns)):
            # Update metadata
            logger.info("Detected pivoted table")
            materialize.setdefault('convert', []).append({
                'identifier': 'pivot',
                'except_columns': non_matches,
            })

            # Update file
            dataset_path = convert_dataset(
                lambda path, dst: pivot_table(path, dst, non_matches),
                dataset_path,
            )

    return dataset_path
Example #5
def materialize_and_process_dataset(
    dataset_id, metadata,
    lazo_client, nominatim,
    profile_semaphore,
    cache_invalid=False,
):
    with contextlib.ExitStack() as stack:
        with prom_incremented(PROM_DOWNLOADING):
            dataset_path = stack.enter_context(
                get_dataset(metadata, dataset_id, cache_invalid=cache_invalid)
            )
        materialize = metadata.pop('materialize')

        # Check for Excel file format
        try:
            xlrd.open_workbook(dataset_path)
        except xlrd.XLRDError:
            pass
        else:
            logger.info("This is an Excel file")
            materialize.setdefault('convert', []).append({'identifier': 'xls'})
            excel_temp_path = dataset_path + '.xls'
            os.rename(dataset_path, excel_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    xls_to_csv(excel_temp_path, dst)
            finally:
                os.remove(excel_temp_path)

        # Check for TSV file format
        with open(dataset_path, 'r') as fp:
            try:
                dialect = csv.Sniffer().sniff(fp.read(16384))
            except Exception as error:  # csv.Error, UnicodeDecodeError
                logger.error("csv.Sniffer error: %s", error)
                dialect = csv.get_dialect('excel')
        if getattr(dialect, 'delimiter', '') == '\t':
            logger.info("This is a TSV file")
            materialize.setdefault('convert', []).append({'identifier': 'tsv'})
            tsv_temp_path = dataset_path + '.tsv'
            os.rename(dataset_path, tsv_temp_path)
            try:
                with open(dataset_path, 'w', newline='') as dst:
                    tsv_to_csv(tsv_temp_path, dst)
            finally:
                os.remove(tsv_temp_path)

        # Check for pivoted temporal table
        with open(dataset_path, 'r') as fp:
            reader = csv.reader(fp)
            try:
                columns = next(reader)
            except StopIteration:
                columns = []
        if len(columns) >= 3:
            non_matches = [
                i for i, name in enumerate(columns)
                if parse_date(name) is None
            ]
            if len(non_matches) <= max(2.0, 0.20 * len(columns)):
                logger.info("Detected pivoted table")
                materialize.setdefault('convert', []).append({
                    'identifier': 'pivot',
                    'except_columns': non_matches,
                })
                pivot_temp_path = dataset_path + '.pivot.csv'
                os.rename(dataset_path, pivot_temp_path)
                try:
                    with open(dataset_path, 'w', newline='') as dst:
                        pivot_table(pivot_temp_path, dst, non_matches)
                finally:
                    os.remove(pivot_temp_path)

        # Profile
        with profile_semaphore:
            with prom_incremented(PROM_PROFILING):
                logger.info("Profiling dataset %r", dataset_id)
                start = time.perf_counter()
                metadata = process_dataset(
                    data=dataset_path,
                    dataset_id=dataset_id,
                    metadata=metadata,
                    lazo_client=lazo_client,
                    nominatim=nominatim,
                    include_sample=True,
                    coverage=True,
                    plots=True,
                )
                logger.info(
                    "Profiling dataset %r took %.2fs",
                    dataset_id,
                    time.perf_counter() - start,
                )

        metadata['materialize'] = materialize
        return metadata
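prom_incremented is used above as a context manager around the download and profiling phases; a minimal sketch, assuming PROM_DOWNLOADING and PROM_PROFILING are prometheus_client Gauge instances tracking in-flight operations:

import contextlib

@contextlib.contextmanager
def prom_incremented(gauge):
    # Hypothetical helper: keep the gauge incremented while the block runs,
    # decrementing it again on exit (even on error)
    gauge.inc()
    try:
        yield
    finally:
        gauge.dec()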