def test_multicol(self):
    """Build a two-column frequency table over a large generated file.

    The input comes from ``generate_test_file(delimiter, 10_000_000)`` and
    is counted on columns 1 and 2 with a key limit of 20,000,000, so the
    result must not be truncated and the counts must add up to 10,000,000.
    Every key must pair an ``A*`` value with a ``B*`` value.
    """
    # Local import: only needed to release the descriptor mkstemp hands back.
    import os

    # Describe the dialect on a private subclass.  Assigning attributes to
    # csv.Dialect itself rewrites the shared stdlib base class and leaks this
    # test's settings into every other csv user in the same process.
    class PipeDialect(csv.Dialect):
        delimiter = '|'
        quoting = csv.QUOTE_MINIMAL
        quotechar = '"'
        has_header = False
        lineterminator = '\n'

    dialect = PipeDialect
    columns = [1, 2]
    files = [generate_test_file(dialect.delimiter, 10_000_000)]

    # mkstemp returns an already-open OS-level descriptor along with the
    # path; only the path is used below, so close the descriptor right away
    # instead of leaking it for the rest of the test run.
    (fd, outfile) = tempfile.mkstemp(prefix='FreakerTestOut_')
    os.close(fd)

    input_handler = file_io.InputHandler(files, dialect)
    output_handler = file_io.OutputHandler(outfile, input_handler.dialect)

    col_freak = mod.ColSetFreaker(input_handler,
                                  output_handler,
                                  col_type='specified',
                                  number=20_000_000,
                                  sampling_method='non',
                                  sampling_rate=None,
                                  sort_order='reverse',
                                  sort_col=1,
                                  max_key_len=50)
    col_freak.build_freq(columns)

    assert not col_freak.truncated
    assert sum(col_freak.field_freq.values()) == 10_000_000
    for key in col_freak.field_freq.keys():
        assert key[0] in ['A1', 'A2', 'A3', 'A4']
        assert key[1] in ['B1', 'B2']
def test_multicol(self):
    """Count frequencies on the fixture's columns and validate the keys.

    Uses the handlers and settings stored on ``self``.  The run must not be
    truncated, the counts must total 1000, and there must be exactly eight
    distinct keys, each pairing an ``A*`` value with a ``B*`` value.
    """
    valid_first = ('A1', 'A2', 'A3', 'A4')
    valid_second = ('B1', 'B2')

    freaker = mod.ColSetFreaker(
        self.input_handler,
        self.output_handler,
        self.col_type,
        self.number,
        self.sampling_method,
        self.sampling_rate,
        self.sortorder,
        self.sortcol,
        self.max_key_len,
    )
    freaker.build_freq(self.columns)

    assert not freaker.truncated
    freq = freaker.field_freq
    assert sum(freq.values()) == 1000
    # Four possible A* values times two possible B* values.
    assert len(freq) == 8
    for combo in freq.keys():
        assert combo[0] in valid_first
        assert combo[1] in valid_second
def setup_method(self, method):
    """Create the ``ColSetFreaker`` under test and store it on ``self``.

    Inputs come from ``generate_col_freaker_dependencies()``; output goes to
    ``outfile.txt`` inside a freshly created temporary directory.  The
    freaker is configured with sort order ``'reverse'`` on sort column 1.
    """
    dependencies = generate_col_freaker_dependencies()
    # The seventh element of the dependency tuple is not used here.
    (files, dialect, col_type, number,
     sampling_method, sampling_rate, _, max_key_len) = dependencies

    # NOTE(review): this directory is not removed in this method; confirm
    # that a teardown hook elsewhere cleans it up.
    out_path = pjoin(tempfile.mkdtemp(prefix='test_gristle_freaker_'),
                     'outfile.txt')

    reader = file_io.InputHandler(files, dialect)
    writer = file_io.OutputHandler(out_path, reader.dialect)

    self.col_freak = mod.ColSetFreaker(
        reader,
        writer,
        col_type,
        number,
        sampling_method,
        sampling_rate,
        'reverse',
        1,
        max_key_len,
    )
def test_single_col(self):
    """Count frequencies on column 1 only and validate the resulting keys.

    The run must not be truncated, the counts must total 1000, and the four
    keys must each start with one of the ``A*`` values.
    """
    self.columns = [1]
    valid_values = ('A1', 'A2', 'A3', 'A4')

    freaker = mod.ColSetFreaker(
        self.input_handler,
        self.output_handler,
        self.col_type,
        self.number,
        self.sampling_method,
        self.sampling_rate,
        self.sortorder,
        self.sortcol,
        self.max_key_len,
    )
    freaker.build_freq(self.columns)

    assert not freaker.truncated
    freq = freaker.field_freq
    assert sum(freq.values()) == 1000
    # Fewer than four distinct values is possible but extremely unlikely.
    assert len(freq) == 4
    for combo in freq.keys():
        assert combo[0] in valid_values
def test_multicol_and_truncation(self):
    """Count frequencies with ``number`` lowered to 4 and expect truncation.

    The freaker must report ``truncated`` and hold exactly four keys, each
    pairing an ``A*`` value with a ``B*`` value.
    """
    # Presumably `number` caps how many distinct keys are kept -- confirm
    # against ColSetFreaker.
    self.number = 4
    valid_first = ('A1', 'A2', 'A3', 'A4')
    valid_second = ('B1', 'B2')

    freaker = mod.ColSetFreaker(
        self.input_handler,
        self.output_handler,
        self.col_type,
        self.number,
        self.sampling_method,
        self.sampling_rate,
        self.sortorder,
        self.sortcol,
        self.max_key_len,
    )
    freaker.build_freq(self.columns)

    assert freaker.truncated
    freq = freaker.field_freq
    # Fewer than four entries is possible but extremely unlikely.
    assert len(freq) == 4
    for combo in freq.keys():
        assert combo[0] in valid_first
        assert combo[1] in valid_second