Exemple #1
0
 def test_not_parquet_file(self):
     """A file holding only b"blah" must fail both magic-byte checks."""
     reader = parquet.ParquetMain()
     with tempfile.NamedTemporaryFile() as tmp:
         tmp.write(b"blah")
         # Flush so the bytes are visible to the checks below.
         tmp.flush()
         header_ok = reader._check_header_magic_bytes(tmp)
         self.assertFalse(header_ok)
         footer_ok = reader._check_footer_magic_bytes(tmp)
         self.assertFalse(footer_ok)
Exemple #2
0
    def _test_file_custom(self, parquet_file, csv_file):
        """Check the 'custom' dump format against a pipe-delimited CSV.

        Dumps parquet_file with ``Options(format='custom')``, passing a
        callback as ``out`` that turns column data into row tuples, and
        compares the value returned by ``dump`` with the rows read from
        csv_file.
        """

        def _custom_datatype(in_dict, keys):
            """Return the columns of in_dict, ordered by keys, as row tuples.

            A ``pandas.DataFrame(in_dict)`` could be returned here instead.
            """
            return list(zip(*(in_dict[key] for key in keys)))

        with open(csv_file, 'r') as csv_handle:
            expected_rows = list(csv.reader(csv_handle, delimiter='|'))

        reader = parquet.ParquetMain()
        # Presumably dump() hands back whatever the ``out`` callback
        # returns -- verify against ParquetMain.dump.
        actual_rows = reader.dump(
            parquet_file, Options(format='custom'), out=_custom_datatype)
        assert len(expected_rows) == len(actual_rows)

        column_names = [
            element.name for element in reader.read_footer(parquet_file).schema
        ]
        for expected_row, actual_row in zip(expected_rows, actual_rows):
            assert len(expected_row) == len(actual_row)
            # NOTE(review): if actual_row is one of the tuples built by
            # _custom_datatype, ``name in actual_row`` tests membership among
            # the row's values rather than its column names, so the assertion
            # below is unlikely to ever run -- confirm the intended check.
            for index, name in enumerate(column_names):
                if name in actual_row:
                    assert expected_row[index] == actual_row[name]
Exemple #3
0
    def _test_file_json(self, parquet_file, csv_file):
        """Check the JSON dump format against a pipe-delimited CSV.

        Dumps parquet_file with ``Options(format='json')`` into an in-memory
        buffer, decodes every non-empty output line as one JSON document and
        checks that the number of documents equals the number of CSV rows.
        """
        with open(csv_file, 'r') as csv_handle:
            expected_rows = list(csv.reader(csv_handle, delimiter='|'))

        json_buffer = StringIO()
        reader = parquet.ParquetMain()
        reader.dump(parquet_file, Options(format='json'), out=json_buffer)
        json_buffer.seek(0, 0)

        actual_rows = []
        for line in json_buffer.read().split("\n"):
            if line:
                actual_rows.append(json.loads(line.rstrip()))

        assert len(expected_rows) == len(actual_rows)

        column_names = [
            element.name for element in reader.read_footer(parquet_file).schema
        ]
        # NOTE(review): json_buffer was read to its end above, so iterating it
        # here yields no lines and the per-row assertions below never execute.
        # Zipping against the decoded rows instead would enable them, but they
        # index CSV fields by schema position and compare CSV strings with
        # decoded JSON values -- TODO confirm that comparison holds for the
        # fixtures before switching.
        for expected_row, actual_row in zip(expected_rows, json_buffer):
            assert len(expected_row) == len(actual_row)
            for index, name in enumerate(column_names):
                if name in actual_row:
                    assert expected_row[index] == actual_row[name]
Exemple #4
0
 def test_read_footer(self):
     """The footer schema of the fixture file exposes the expected names.

     Names are compared as sets, so the order of schema elements does not
     matter.
     """
     main = parquet.ParquetMain()
     footer = main.read_footer(self.f)
     # assertEqual replaces the deprecated assertEquals alias, which newer
     # Python versions no longer provide.
     self.assertEqual(
         {s.name for s in footer.schema},
         {"schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"},
     )
Exemple #5
0
    def _test_file_csv(self, parquet_file, csv_file):
        """Check the default (tab-separated) dump against a pipe-delimited CSV.

        The parquet file is dumped twice: once with default ``Options`` and
        once with ``no_headers=False``. In the second case the first output
        row (presumably the header line) is dropped before comparing.
        """
        with open(csv_file, 'r') as csv_handle:
            expected_rows = list(csv.reader(csv_handle, delimiter='|'))

        reader = parquet.ParquetMain()

        def dump_rows(options):
            # Dump into a fresh in-memory buffer and parse it back as TSV.
            buffer = StringIO()
            reader.dump(parquet_file, options, out=buffer)
            buffer.seek(0, 0)
            return list(csv.reader(buffer, delimiter='\t'))

        self._compare_data(expected_rows, dump_rows(Options()))

        rows_after_first = dump_rows(Options(no_headers=False))[1:]
        self._compare_data(expected_rows, rows_after_first)
Exemple #6
0
 def test_dump_metadata(self):
     """Smoke test: dump_metadata runs on the fixture file without raising.

     Nothing is asserted about what ends up in the output buffer.
     """
     sink = BytesIO()
     reader = parquet.ParquetMain()
     reader.dump_metadata(self.f, sink)
Exemple #7
0
 def test_footer_bytes(self):
     """The footer size reported for the fixture file is 327."""
     main = parquet.ParquetMain()
     with open(self.f, 'rb') as fo:
         # assertEqual replaces the deprecated assertEquals alias, which
         # newer Python versions no longer provide.
         self.assertEqual(327, main._get_footer_size(fo))
Exemple #8
0
 def test_footer_magic_bytes(self):
     """Data that ends with the PAR1 marker passes the footer magic check."""
     reader = parquet.ParquetMain()
     with tempfile.NamedTemporaryFile() as tmp:
         tmp.write(b"PAR1_some_bogus_data_PAR1")
         # Flush so the bytes are visible to the check below.
         tmp.flush()
         footer_ok = reader._check_footer_magic_bytes(tmp)
         self.assertTrue(footer_ok)