Beispiel #1
0
    def _test_file_json(self, parquet_file, csv_file):
        """Test the dump utility's json output.

        Given the parquet_file and csv_file representation, converts the
        parquet_file to json using the dump utility and then compares the
        result to the csv_file using column agnostic ordering.
        """
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        actual_raw_data = StringIO.StringIO()
        parquet.dump(parquet_file, Options(format='json'), out=actual_raw_data)
        actual_raw_data.seek(0, 0)
        # One json object per non-empty output line.
        actual_data = [
            json.loads(x.rstrip()) for x in actual_raw_data.read().split("\n")
            if len(x) > 0
        ]

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        # BUG FIX: zip against the parsed rows (actual_data), not the already
        # exhausted StringIO buffer -- iterating the buffer yielded nothing,
        # so the per-row assertions below never executed.
        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    # csv values are positional; json values are keyed by
                    # column name, hence the column-agnostic comparison.
                    assert expected[i] == actual[c]
Beispiel #2
0
    def _test_file_csv(self, parquet_file, csv_file):
        """Test the dump utility's csv output.

        Converts parquet_file to csv via the dump utility and compares the
        result against csv_file, once with default options and once with
        headers enabled (the header row is stripped before comparing).
        """
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        def _dump_rows(options):
            # Run the dump utility and parse its tab-delimited output.
            buf = StringIO.StringIO()
            parquet.dump(parquet_file, options, out=buf)
            buf.seek(0, 0)
            return list(csv.reader(buf, delimiter='\t'))

        actual_data = _dump_rows(Options())
        assert expected_data == actual_data, "{0} != {1}".format(
            str(expected_data), str(actual_data))

        # With headers enabled, drop the leading header row before comparing.
        actual_data = _dump_rows(Options(no_headers=False))[1:]
        assert expected_data == actual_data, "{0} != {1}".format(
            str(expected_data), str(actual_data))
Beispiel #3
0
    def _test_file_csv(self, parquet_file, csv_file):
        """Test the dump utility's csv output.

        Given the parquet_file and csv_file representation, converts the
        parquet_file to a csv using the dump utility and then compares the
        result to the csv_file.
        """
        with io.open(csv_file, 'r', encoding="utf-8") as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        actual_raw_data = io.StringIO()
        parquet.dump(parquet_file, Options(), out=actual_raw_data)
        actual_raw_data.seek(0, 0)
        actual_data = list(csv.reader(actual_raw_data, delimiter=TAB_DELIM))

        # assertListEqual produces a useful element-wise diff on failure.
        self.tc.assertListEqual(expected_data, actual_data)

        actual_raw_data = io.StringIO()
        parquet.dump(parquet_file,
                     Options(no_headers=False),
                     out=actual_raw_data)
        actual_raw_data.seek(0, 0)
        # Headers are enabled here, so skip the header row before comparing.
        actual_data = list(csv.reader(actual_raw_data,
                                      delimiter=TAB_DELIM))[1:]

        self.tc.assertListEqual(expected_data, actual_data)
Beispiel #4
0
    def _test_file_csv(self, parquet_file, csv_file):
        """Compare the dump utility's csv output against a reference file.

        Runs the comparison twice: with the default options, and with
        headers enabled (the header row is discarded before comparing).
        """
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        out = StringIO.StringIO()
        parquet.dump(parquet_file, Options(), out=out)
        out.seek(0, 0)
        rows = list(csv.reader(out, delimiter='\t'))
        assert expected_data == rows, "{0} != {1}".format(
            str(expected_data), str(rows))

        out = StringIO.StringIO()
        parquet.dump(parquet_file, Options(no_headers=False), out=out)
        out.seek(0, 0)
        # Slice off the header row emitted by no_headers=False.
        rows = list(csv.reader(out, delimiter='\t'))[1:]
        assert expected_data == rows, "{0} != {1}".format(
            str(expected_data), str(rows))
Beispiel #5
0
    def _test_file_json(self, parquet_file, csv_file):
        """Test the dump function by outputting to a json file.

        Given the parquet_file and csv_file representation, converts the parquet_file to json using
        the dump utility and then compares the result to the csv_file using column agnostic ordering.
        """
        with io.open(csv_file, 'r', encoding='utf-8') as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        actual_raw_data = io.StringIO()
        parquet.dump(parquet_file, Options(format='json'),
                     out=actual_raw_data)
        actual_raw_data.seek(0, 0)
        # One json object per non-empty output line.
        actual_data = [json.loads(x.rstrip()) for x in
                       actual_raw_data.read().split("\n") if len(x) > 0]

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        # BUG FIX: zip against the parsed rows (actual_data), not the already
        # exhausted StringIO buffer -- iterating the buffer yielded nothing,
        # so the per-row assertions below never executed.
        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    # csv values are positional; json values are keyed by
                    # column name, hence the column-agnostic comparison.
                    assert expected[i] == actual[c]
Beispiel #6
0
def read_parquet_data(filename):
    """Return the contents of the parquet file *filename* as a list of rows.

    The file is rendered through the dump utility's csv format and the
    tab-delimited output is parsed back into row lists.
    """
    buf = StringIO.StringIO()
    parquet.dump(filename, Options(format='csv'), out=buf)
    buf.seek(0, 0)
    return list(csv.reader(buf, delimiter='\t'))
Beispiel #7
0
    def test_limit(self):
        """Test the limit option."""
        limit = 2
        # Only the first `limit` reference rows should appear in the output.
        with io.open(CSV_FILE, 'r', encoding="utf-8") as fo:
            expected_data = list(csv.reader(fo, delimiter='|'))[:limit]

        dump_buf = io.StringIO()
        parquet.dump(TEST_FILE, Options(limit=limit), out=dump_buf)
        dump_buf.seek(0, 0)
        dumped_rows = list(csv.reader(dump_buf, delimiter='\t'))

        self.assertListEqual(expected_data, dumped_rows)
Beispiel #8
0
    def test_limit(self):
        """Verify that the limit option caps the number of rows dumped."""
        limit = 2
        with io.open(CSV_FILE, 'r', encoding="utf-8") as fo:
            all_rows = list(csv.reader(fo, delimiter='|'))
        # Expect exactly the first `limit` reference rows.
        expected_data = all_rows[:limit]

        output = io.StringIO()
        parquet.dump(TEST_FILE, Options(limit=limit), out=output)
        output.seek(0, 0)
        actual_data = list(csv.reader(output, delimiter='\t'))

        self.assertListEqual(expected_data, actual_data)
Beispiel #9
0
def main(argv=None):
    """Run parquet utility application."""
    if argv is None:
        argv = sys.argv[1:]

    # Argument order is preserved so --help output stays identical.
    parser = argparse.ArgumentParser('parquet',
                                     description='Read parquet files')
    parser.add_argument('--metadata', action='store_true',
                        help='show metadata on file')
    parser.add_argument('--row-group-metadata', action='store_true',
                        help="show per row group metadata")
    parser.add_argument('--no-data', action='store_true',
                        help="don't dump any data from the file")
    parser.add_argument('--limit', action='store', type=int, default=-1,
                        help='max records to output')
    parser.add_argument('--col', action='append', type=str,
                        help='only include this column (can be '
                             'specified multiple times)')
    parser.add_argument('--no-headers', action='store_true',
                        help='skip headers in output (only applies if '
                             'format=csv)')
    parser.add_argument('--format', action='store', type=str, default='csv',
                        help='format for the output data. can be csv or json.')
    parser.add_argument('--debug', action='store_true',
                        help='log debug info to stderr')
    parser.add_argument('file', help='path to the file to parse')

    args = parser.parse_args(argv)

    setup_logging(args)

    # Imported here so --help works without the parquet package installed.
    # pylint: disable=import-outside-toplevel
    import parquet

    if args.metadata:
        parquet.dump_metadata(args.file, args.row_group_metadata)
    if not args.no_data:
        parquet.dump(args.file, args)
Beispiel #10
0
    def _test_file_custom(self, parquet_file, csv_file):
        """Test the dump utility with a custom output callable.

        Converts the parquet_file through a custom formatter that rebuilds
        csv-style rows, then compares the result to csv_file using column
        agnostic ordering.
        """
        with open(csv_file, "rb") as f:
            expected_data = list(csv.reader(f, delimiter="|"))

        def _rows_formatter(in_dict, keys):
            """Rebuild rows like the csv outputter from a column dict.

            Could convert to a dataframe like this:
                import pandas
                df = pandas.DataFrame(in_dict)
                return df
            """
            # Transpose column-major data into row tuples.
            return zip(*[in_dict[key] for key in keys])

        actual_data = parquet.dump(parquet_file, Options(format="custom"), out=_rows_formatter)

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]

        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    assert expected[i] == actual[c]
    def _test_file_custom(self, parquet_file, csv_file):
        """Exercise the dump utility's custom-format callback path.

        The parquet_file is rendered through a callback that reconstructs
        csv-like rows; the result is checked against csv_file with column
        agnostic ordering.
        """
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        def _to_rows(in_dict, keys):
            """Return rows like the csv outputter.

            Could convert to a dataframe like this:
                import pandas
                df = pandas.DataFrame(in_dict)
                return df
            """
            columns = [in_dict[key] for key in keys]
            # zip transposes the per-column lists into row tuples.
            return zip(*columns)

        actual_data = parquet.dump(parquet_file, Options(format='custom'), out=_to_rows)

        assert len(expected_data) == len(actual_data)
        schema_cols = [s.name for s in parquet.read_footer(parquet_file).schema]

        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, name in enumerate(schema_cols):
                if name in actual:
                    assert expected[i] == actual[name]
Beispiel #12
0
def import_from_parquet(filename, encoding='utf-8', *args, **kwargs):
    """Import data from a Parquet file."""
    # TODO: should be able to used fobj also

    # dump returns column-major data keyed by field name; transpose it into
    # rows so create_table can consume a header row plus data rows.
    data, field_names = parquet.dump(filename, OPTIONS, _callback)
    row_count = len(data[field_names[0]])
    table_rows = [
        [data[name][index] for name in field_names]
        for index in range(row_count)
    ]

    meta = {'imported_from': 'parquet', 'filename': filename}
    return create_table([field_names] + table_rows, meta=meta, *args, **kwargs)
Beispiel #13
0
def main(argv=None):
    """Run parquet utility application."""
    argv = argv or sys.argv[1:]

    # Argument order is preserved so --help output stays identical.
    parser = argparse.ArgumentParser('parquet',
                                     description='Read parquet files')
    parser.add_argument('--metadata',
                        action='store_true',
                        help='show metadata on file')
    parser.add_argument('--row-group-metadata',
                        action='store_true',
                        help="show per row group metadata")
    parser.add_argument('--no-data',
                        action='store_true',
                        help="don't dump any data from the file")
    parser.add_argument('--limit',
                        action='store',
                        type=int,
                        default=-1,
                        help='max records to output')
    parser.add_argument('--col',
                        action='append',
                        type=str,
                        help='only include this column (can be '
                        'specified multiple times)')
    parser.add_argument('--no-headers',
                        action='store_true',
                        help='skip headers in output (only applies if '
                        'format=csv)')
    parser.add_argument('--format',
                        action='store',
                        type=str,
                        default='csv',
                        help='format for the output data. can be csv or json.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='log debug info to stderr')
    parser.add_argument('file', help='path to the file to parse')

    args = parser.parse_args(argv)

    setup_logging(args)

    # Deferred import so argument parsing works without the package.
    import parquet

    if args.metadata:
        parquet.dump_metadata(args.file, args.row_group_metadata)
    if not args.no_data:
        parquet.dump(args.file, args)
Beispiel #14
0
    def _test_file_csv(self, parquet_file, csv_file):
        """Test the dump function by outputting to a csv file.

        Converts the parquet_file to csv with the dump utility and checks
        the result against csv_file -- first with default options, then
        with headers enabled (header row stripped before comparing).
        """
        with io.open(csv_file, 'r', encoding="utf-8") as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        dump_buf = io.StringIO()
        parquet.dump(parquet_file, Options(), out=dump_buf)
        dump_buf.seek(0, 0)
        rows = list(csv.reader(dump_buf, delimiter=TAB_DELIM))

        self.tc.assertListEqual(expected_data, rows)

        dump_buf = io.StringIO()
        parquet.dump(parquet_file, Options(no_headers=False),
                     out=dump_buf)
        dump_buf.seek(0, 0)
        # Drop the header row produced by no_headers=False.
        rows = list(csv.reader(dump_buf, delimiter=TAB_DELIM))[1:]

        self.tc.assertListEqual(expected_data, rows)