Example #1
 def test_read_footer(self):
     footer = parquet.read_footer(self.f)
     self.assertEquals(
         set([s.name for s in footer.schema]),
         set([
             "schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"
         ]))
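A minimal standalone sketch of the same call, assuming read_footer accepts a file path (the later examples pass TEST_FILE and parquet_file paths directly) and using an illustrative file name that is not part of the original tests:

    import parquet

    # Illustrative path; the original tests use self.f / TEST_FILE instead.
    footer = parquet.read_footer("nation.impala.parquet")

    # Each schema element exposes a .name attribute, which is what the assertion above checks.
    print([s.name for s in footer.schema])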
Example #2
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, converts the
            parquet_file to json using the dump utility and then compares the
            result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with open(csv_file, "rb") as f:
            expected_data = list(csv.reader(f, delimiter="|"))

        def _custom_datatype(in_dict, keys):
            """
            return rows like the csv outputter

            Could convert to a dataframe like this:
                import pandas
                df = pandas.DataFrame(in_dict)
                return df
            """
            columns = [in_dict[key] for key in keys]
            rows = zip(*columns)
            return rows

        actual_data = parquet.dump(parquet_file, Options(format="custom"), out=_custom_datatype)

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]

        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    assert expected[i] == actual[c]
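The _custom_datatype docstring above notes that the same hook could build a pandas DataFrame instead of row tuples; a sketch of that variant, reusing the dump/Options call shape from this example (Options comes from the surrounding test module and is not defined here; the file name is illustrative):

    import pandas
    import parquet

    def _dataframe_outputter(in_dict, keys):
        # in_dict maps column name -> list of values; keys fixes the column order.
        return pandas.DataFrame(in_dict, columns=list(keys))

    # Hypothetical usage, mirroring the dump call above:
    # df = parquet.dump("nation.impala.parquet", Options(format="custom"),
    #                   out=_dataframe_outputter)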
Example #3
    def _test_file_json(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, converts the
            parquet_file to json using the dump utility and then compares the
            result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        actual_raw_data = StringIO.StringIO()
        parquet.dump(parquet_file, Options(format='json'), out=actual_raw_data)
        actual_raw_data.seek(0, 0)
        actual_data = [
            json.loads(x.rstrip()) for x in actual_raw_data.read().split("\n")
            if len(x) > 0
        ]

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    assert expected[i] == actual[c]
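The same json dump pattern can be driven outside a test with an in-memory buffer; a sketch assuming the Options object constructed by the surrounding test module (not defined in these snippets) and an illustrative file name, using io.StringIO as the later examples do:

    import io
    import json

    import parquet

    # Options is the options helper the test module constructs; it is assumed here.
    buf = io.StringIO()
    parquet.dump("nation.impala.parquet", Options(format="json"), out=buf)

    # dump writes one JSON object per line, so parse the buffer line by line.
    rows = [json.loads(line) for line in buf.getvalue().splitlines() if line]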
Example #4
    def _test_file_custom(self, parquet_file, csv_file):
        """Test the DictReader function against csv data.

        Given the parquet_file and csv_file representation, reads the parquet file using DictReader
        and then compares the result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with io.open(csv_file, 'r', encoding="utf-8") as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        actual_data = []
        with open(parquet_file, "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.tc.assertEquals(len(expected_data), len(actual_data))
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]

        for expected, actual in zip(expected_data, actual_data):
            self.tc.assertEquals(len(expected), len(actual))
            for i, c in enumerate([c for c in cols if c in actual]):
                # csv.reader yields strings, so coerce the parquet value to text
                # before comparing (this makes '0' match 0, for example).
                actual_value = (actual[c].decode('utf-8')
                                if isinstance(actual[c], bytes)
                                else str(actual[c]))
                self.tc.assertEquals(expected[i], actual_value)
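For comparison, the same DictReader pattern outside a test harness; a minimal sketch with an illustrative file name:

    import parquet

    # DictReader yields one dict per row, mapping column name -> value.
    with open("nation.impala.parquet", "rb") as parquet_fo:
        for row in parquet.DictReader(parquet_fo):
            print(row)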
Example #5
 def test_read_footer(self):
     """Test reading the footer."""
     footer = parquet.read_footer(TEST_FILE)
     self.assertEquals(
         set([s.name for s in footer.schema]),
         set(["schema", "n_regionkey", "n_name", "n_nationkey",
              "n_comment"]))
Example #6
    def _test_file_json(self, parquet_file, csv_file):
        """Test the dump function by outputting to a json file.

        Given the parquet_file and csv_file representation, converts the parquet_file to json using
        the dump utility and then compares the result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with io.open(csv_file, 'r', encoding='utf-8') as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        actual_raw_data = io.StringIO()
        parquet.dump(parquet_file, Options(format='json'),
                     out=actual_raw_data)
        actual_raw_data.seek(0, 0)
        actual_data = [json.loads(x.rstrip()) for x in
                       actual_raw_data.read().split("\n") if len(x) > 0]

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    assert expected[i] == actual[c]
Example #7
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, reads the
            parquet_file with DictReader and then compares the result to the
            csv_file using column agnostic ordering.
        """
        expected_data = []
        with io.open(csv_file, 'r', encoding="utf-8") as f:
            expected_data = list(csv.reader(f, delimiter=PIPE_DELIM))

        actual_data = []
        with open(parquet_file, "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.tc.assertEquals(len(expected_data), len(actual_data))
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]

        for expected, actual in zip(expected_data, actual_data):
            self.tc.assertEquals(len(expected), len(actual))
            for i, c in enumerate([c for c in cols if c in actual]):
                # csv.reader yields strings, so coerce the parquet value to text
                # before comparing (this makes '0' match 0, for example).
                actual_value = (actual[c].decode('utf-8')
                                if isinstance(actual[c], bytes)
                                else str(actual[c]))
                self.tc.assertEquals(expected[i], actual_value)
Example #8
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, converts the
            parquet_file using the dump utility with a custom output callable
            and then compares the result to the csv_file using column agnostic
            ordering.
        """
        expected_data = []
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        def _custom_datatype(in_dict, keys):
            '''
            return rows like the csv outputter

            Could convert to a dataframe like this:
                import pandas
                df = pandas.DataFrame(in_dict)
                return df
            '''
            columns = [in_dict[key] for key in keys]
            rows = zip(*columns)
            return rows

        actual_data = parquet.dump(parquet_file, Options(format='custom'), out=_custom_datatype)

        assert len(expected_data) == len(actual_data)
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]

        for expected, actual in zip(expected_data, actual_data):
            assert len(expected) == len(actual)
            for i, c in enumerate(cols):
                if c in actual:
                    assert expected[i] == actual[c]
Example #9
 def test_read_footer(self):
     """Test reading the footer."""
     footer = parquet.read_footer(TEST_FILE)
     self.assertEquals(
         set([s.name for s in footer.schema]),
         set([
             "schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"
         ]))
Example #10
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, converts the
            parquet_file to json using the dump utility and then compares the
            result to the csv_file using column agnostic ordering.
        """
        expected_data = []
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        actual_data = []
        with open(parquet_file, "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.tc.assertEquals(len(expected_data), len(actual_data))
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        for expected, actual in zip(expected_data, actual_data):
            self.tc.assertEquals(len(expected), len(actual))
            for i, c in enumerate([c for c in cols if c in actual]):
                self.tc.assertEquals(expected[i], str(actual[c]))
Example #11
    def _test_file_custom(self, parquet_file, csv_file):
        """ Given the parquet_file and csv_file representation, reads the
            parquet_file with DictReader and then compares the result to the
            csv_file using column agnostic ordering.
        """
        expected_data = []
        with open(csv_file, 'rb') as f:
            expected_data = list(csv.reader(f, delimiter='|'))

        actual_data = []
        with open(parquet_file, "rb") as parquet_fo:
            actual_data = list(parquet.DictReader(parquet_fo))

        self.tc.assertEquals(len(expected_data), len(actual_data))
        footer = parquet.read_footer(parquet_file)
        cols = [s.name for s in footer.schema]
        for expected, actual in zip(expected_data, actual_data):
            self.tc.assertEquals(len(expected), len(actual))
            for i, c in enumerate([c for c in cols if c in actual]):
                self.tc.assertEquals(expected[i], str(actual[c]))
Example #12
 def test_read_footer(self):
     footer = parquet.read_footer(self.f)
     self.assertEquals(
         set([s.name for s in footer.schema]), set(["schema", "n_regionkey", "n_name", "n_nationkey", "n_comment"])
     )