コード例 #1
0
    def test_multicol(self):
        """Check multi-column frequency counting on a large generated file.

        A pipe-delimited file is produced by
        ``generate_test_file(delimiter, 10_000_000)`` and columns 1 and 2 are
        fed to ``ColSetFreaker.build_freq``.  The test then requires that:

        * the result is not flagged as truncated,
        * the frequencies add up to 10_000_000, and
        * every key pairs one of ``A1``-``A4`` with one of ``B1``/``B2``.
        """
        import os  # local import: only needed to close the mkstemp descriptor

        # Use a private subclass instead of assigning onto csv.Dialect itself;
        # writing to the stdlib class would leak these settings into every
        # other user of csv.Dialect in the same process.
        class PipeDialect(csv.Dialect):
            delimiter = '|'
            quoting = csv.QUOTE_MINIMAL
            quotechar = '"'
            has_header = False
            lineterminator = '\n'

        dialect = PipeDialect
        columns = [1, 2]
        files = [generate_test_file(dialect.delimiter, 10_000_000)]

        fd, outfile = tempfile.mkstemp(prefix='FreakerTestOut_')
        # mkstemp hands back an already-open descriptor; only the path is
        # used below, so close it right away rather than leaking it.
        os.close(fd)

        input_handler = file_io.InputHandler(files, dialect)
        output_handler = file_io.OutputHandler(outfile, input_handler.dialect)

        col_freak = mod.ColSetFreaker(input_handler,
                                      output_handler,
                                      col_type='specified',
                                      number=20_000_000,
                                      sampling_method='non',
                                      sampling_rate=None,
                                      sort_order='reverse',
                                      sort_col=1,
                                      max_key_len=50)
        col_freak.build_freq(columns)
        assert not col_freak.truncated
        pp(col_freak.field_freq)
        assert sum(col_freak.field_freq.values()) == 10_000_000
        for key in col_freak.field_freq:
            assert key[0] in ['A1', 'A2', 'A3', 'A4']
            assert key[1] in ['B1', 'B2']
コード例 #2
0
 def test_multicol(self):
     """Verify frequency counts for the multi-column case.

     Builds a ColSetFreaker from the attributes prepared on ``self``, runs
     ``build_freq`` over ``self.columns`` and checks that the result is not
     truncated, totals 1000, and contains 8 distinct keys whose first
     element is an A* value and whose second element is a B* value.
     """
     freaker_args = (
         self.input_handler,
         self.output_handler,
         self.col_type,
         self.number,
         self.sampling_method,
         self.sampling_rate,
         self.sortorder,
         self.sortcol,
         self.max_key_len,
     )
     freaker = mod.ColSetFreaker(*freaker_args)
     freaker.build_freq(self.columns)

     assert not freaker.truncated
     freq = freaker.field_freq
     assert sum(freq.values()) == 1000
     # 4 possible A* values combined with 2 possible B* values
     assert len(freq) == 8

     first_col_values = ('A1', 'A2', 'A3', 'A4')
     second_col_values = ('B1', 'B2')
     for combo in freq:
         assert combo[0] in first_col_values
         assert combo[1] in second_col_values
コード例 #3
0
 def setup_method(self, method):
     """Create the ColSetFreaker under test and store it on ``self.col_freak``.

     Settings come from ``generate_col_freaker_dependencies()``; output is
     directed to ``outfile.txt`` inside a freshly created temp directory,
     and the freaker is built with sort order 'reverse' on sort column 1.
     ``method`` is accepted but not used here.
     """
     dependencies = generate_col_freaker_dependencies()
     # The seventh element of the dependency tuple is not needed here.
     (src_files, src_dialect, col_type, number, sampling_method,
      sampling_rate, _, max_key_len) = dependencies

     work_dir = tempfile.mkdtemp(prefix='test_gristle_freaker_')
     out_path = pjoin(work_dir, 'outfile.txt')

     reader = file_io.InputHandler(src_files, src_dialect)
     writer = file_io.OutputHandler(out_path, reader.dialect)

     self.col_freak = mod.ColSetFreaker(
         reader,
         writer,
         col_type,
         number,
         sampling_method,
         sampling_rate,
         'reverse',
         1,
         max_key_len,
     )
コード例 #4
0
 def test_single_col(self):
     """Verify frequency counts when only column 1 is analysed.

     Overrides ``self.columns`` with ``[1]``, builds the frequency table and
     checks that it is not truncated, totals 1000, and contains 4 keys whose
     first element is one of the A* values.
     """
     self.columns = [1]
     freaker_args = (
         self.input_handler,
         self.output_handler,
         self.col_type,
         self.number,
         self.sampling_method,
         self.sampling_rate,
         self.sortorder,
         self.sortcol,
         self.max_key_len,
     )
     freaker = mod.ColSetFreaker(*freaker_args)
     freaker.build_freq(self.columns)

     assert not freaker.truncated
     freq = freaker.field_freq
     assert sum(freq.values()) == 1000
     # Fewer than 4 entries is possible, but extremely unlikely.
     assert len(freq) == 4

     allowed_values = ('A1', 'A2', 'A3', 'A4')
     for combo in freq:
         assert combo[0] in allowed_values
コード例 #5
0
 def test_multicol_and_truncation(self):
     """Verify that the multi-column result is truncated when number is 4.

     Overrides ``self.number`` with 4, builds the frequency table over
     ``self.columns`` and checks that the freaker reports truncation, that
     4 keys are present, and that each key pairs an A* value with a B* value.
     """
     self.number = 4
     freaker_args = (
         self.input_handler,
         self.output_handler,
         self.col_type,
         self.number,
         self.sampling_method,
         self.sampling_rate,
         self.sortorder,
         self.sortcol,
         self.max_key_len,
     )
     freaker = mod.ColSetFreaker(*freaker_args)
     freaker.build_freq(self.columns)

     assert freaker.truncated
     freq = freaker.field_freq
     # Fewer than 4 entries is possible, but extremely unlikely.
     assert len(freq) == 4

     first_col_values = ('A1', 'A2', 'A3', 'A4')
     second_col_values = ('B1', 'B2')
     for combo in freq:
         assert combo[0] in first_col_values
         assert combo[1] in second_col_values