def __init__(self,
             in_fqfn: str,
             out_fqfn: str,
             sort_keys_config: SortKeysConfig,
             dialect: csvhelper.Dialect,
             dedupe: bool,
             keep_header: bool = True) -> None:
    """Store the sort options, reset working state, and open the file handlers.

    Args:
        in_fqfn: Name of the input file; passed to the input handler as a
            one-element list.
        out_fqfn: Name of the output file handed to the output handler.
        sort_keys_config: Sort-key configuration, stored as
            ``self.sort_key_config``.
        dialect: CSV dialect used to construct the input handler.
        dedupe: Stored as ``self.dedupe``.
        keep_header: Stored as ``self.keep_header``.
    """
    # Options supplied by the caller.
    self.sort_key_config = sort_keys_config
    self.keep_header = keep_header
    self.dedupe = dedupe

    # Working state, empty until records are loaded.
    self.header_rec = None
    self.keys: List[Tuple[Any, ...]] = []
    self.all_recs: List[str] = []
    self.stats = {'recs_deduped': 0}

    # TODO: handle relative path subtleties; for reference see
    #   https://stackoverflow.com/questions/918154/relative-paths-in-python
    # A validation along these lines is currently disabled:
    #   if not isdir(dirname(out_fqfn)):
    #       raise ValueError('Invalid sort output directory: %s' % out_fqfn)

    reader = file_io.InputHandler([in_fqfn], dialect)
    self.input_handler = reader
    # The output handler takes the dialect exposed by the input handler
    # (not the raw argument), plus sys.stdout as its third positional argument.
    self.output_handler = file_io.OutputHandler(out_fqfn, reader.dialect, sys.stdout)
def test_multicol(self):
    """Check a two-column frequency build over a large generated file.

    A file is requested from ``generate_test_file`` with a count of
    10_000_000, a ``ColSetFreaker`` is built over columns 1 and 2, and the
    test asserts that the result was not truncated, that the frequencies sum
    to 10_000_000, and that every key holds only the expected column values.
    """

    # Use a private subclass rather than assigning attributes on csv.Dialect
    # itself: mutating the stdlib class would leak these settings into every
    # other user of csv.Dialect in the same process.
    class PipeDialect(csv.Dialect):
        delimiter = '|'
        quoting = csv.QUOTE_MINIMAL
        quotechar = '"'
        has_header = False
        lineterminator = '\n'

    dialect = PipeDialect
    columns = [1, 2]
    files = [generate_test_file(dialect.delimiter, 10_000_000)]

    # Reserve an output file name. The context manager closes the handle
    # immediately (mkstemp would leave an open descriptor behind), and
    # delete=False keeps the file on disk for the output handler.
    with tempfile.NamedTemporaryFile(prefix='FreakerTestOut_', delete=False) as tmp:
        outfile = tmp.name

    input_handler = file_io.InputHandler(files,
                                         dialect.delimiter,
                                         dialect.quoting,
                                         dialect.quotechar,
                                         dialect.has_header)
    output_handler = file_io.OutputHandler(outfile, input_handler.dialect)

    col_freak = mod.ColSetFreaker(input_handler,
                                  output_handler,
                                  col_type='specified',
                                  number=20_000_000,
                                  sampling_method='non',
                                  sampling_rate=None,
                                  sort_order='reverse',
                                  sort_col=1,
                                  max_key_len=50)
    col_freak.build_freq(columns)

    assert not col_freak.truncated
    pp(col_freak.field_freq)
    assert sum(col_freak.field_freq.values()) == 10_000_000
    for key in col_freak.field_freq:
        assert key[0] in ['A1', 'A2', 'A3', 'A4']
        assert key[1] in ['B1', 'B2']
def setup_method(self, method):
    """Prepare the dialect, generated input file, options, and handlers.

    Args:
        method: Supplied by the test runner's setup hook; unused here.
    """

    # Use a private subclass rather than assigning attributes on csv.Dialect
    # itself: mutating the stdlib class would leak these settings into every
    # other user of csv.Dialect in the same process.
    class PipeDialect(csv.Dialect):
        delimiter = '|'
        quoting = csv.QUOTE_MINIMAL
        quotechar = '"'
        has_header = False
        lineterminator = '\n'

    self.dialect = PipeDialect

    self.files = [generate_test_file(self.dialect.delimiter, 1000)]

    # Option values kept on the instance for the tests to use.
    self.columns = [1, 2]
    self.col_type = 'specified'
    self.number = 1000
    self.sampling_method = 'non'
    self.sampling_rate = None
    self.sortorder = 'reverse'
    self.sortcol = 1
    self.max_key_len = 50

    self.tempdir = tempfile.mkdtemp(prefix='test_gristle_freaker_')
    self.outfile = pjoin(self.tempdir, 'outfile.txt')

    self.input_handler = file_io.InputHandler(self.files,
                                              self.dialect.delimiter,
                                              self.dialect.quoting,
                                              self.dialect.quotechar,
                                              self.dialect.has_header)
    self.output_handler = file_io.OutputHandler(self.outfile,
                                                self.input_handler.dialect)
def setup_method(self, method):
    """Build ``self.col_freak`` from the shared freaker test dependencies.

    Args:
        method: Supplied by the test runner's setup hook; unused here.
    """
    # The seventh element of the dependency tuple is deliberately ignored.
    (files,
     dialect,
     col_type,
     number,
     sampling_method,
     sampling_rate,
     _,
     max_key_len) = generate_col_freaker_dependencies()

    work_dir = tempfile.mkdtemp(prefix='test_gristle_freaker_')
    out_path = pjoin(work_dir, 'outfile.txt')

    reader = file_io.InputHandler(files, dialect)
    writer = file_io.OutputHandler(out_path, reader.dialect)

    self.col_freak = mod.ColSetFreaker(reader,
                                       writer,
                                       col_type,
                                       number,
                                       sampling_method,
                                       sampling_rate,
                                       'reverse',   # sort order
                                       1,           # sort column
                                       max_key_len)
def _write_stdin_to_file(self):
    """Spool the current input handler's rows to a temp file and re-read from it.

    Every row yielded by ``self.input_handler`` is written to a new temporary
    file, whose name is stored in ``self.temp_fn``. ``self.input_handler`` is
    then replaced by a handler over that file, and the elapsed time is
    reported through ``self._pp``. The temp file is created with
    ``delete=False``, so this method does not remove it.
    """
    start_time = time.time()
    assert self.nconfig.infiles == ['-']

    # Write through the temp-file object itself so its handle is closed on
    # exit; previously the NamedTemporaryFile object was never closed and the
    # path was opened a second time while that handle was still held.
    with tempfile.NamedTemporaryFile(mode='w',
                                     newline='',
                                     encoding='utf-8',
                                     prefix='gristle_slicer_stdin_temp_',
                                     delete=False) as outbuf:
        self.temp_fn = outbuf.name
        # NOTE(review): rows are written with the csv module's default
        # dialect but re-read below with self.nconfig.dialect -- confirm the
        # two are compatible for non-comma-delimited input.
        writer = csv.writer(outbuf)
        writer.writerows(self.input_handler)

    self.input_handler = file_io.InputHandler([self.temp_fn],
                                              self.nconfig.dialect,
                                              return_header=True)
    self._pp(f'--------> write_stdin_to_file duration: {time.time() - start_time:.2f}')
def _setup_files(self) -> None:
    """Create the input and output handlers from ``self.nconfig``.

    The input handler is built from the configured input files and dialect
    with ``return_header=True``. The output handler is built from the
    configured output file and the dialect exposed by the input handler.
    """
    config = self.nconfig
    reader = file_io.InputHandler(config.infiles,
                                  config.dialect,
                                  return_header=True)
    self.input_handler = reader
    self.output_handler = file_io.OutputHandler(config.outfile, reader.dialect)