Ejemplo n.º 1
0
    def __init__(
            self,
            in_fqfn: str,
            out_fqfn: str,
            sort_keys_config: SortKeysConfig,
            dialect: csvhelper.Dialect,
            dedupe: bool,
            keep_header: bool = True) -> None:
        """Record the sort options and build the input and output handlers.

        Args:
            in_fqfn: input file name; passed to the InputHandler as a
                single-element list.
            out_fqfn: output file name; passed to the OutputHandler.
            sort_keys_config: sort-key configuration, stored as
                ``self.sort_key_config``.
            dialect: csv dialect passed to the InputHandler.
            dedupe: stored as ``self.dedupe``.
            keep_header: stored as ``self.keep_header``.
        """
        # Options supplied by the caller.
        self.dedupe = dedupe
        self.sort_key_config = sort_keys_config
        self.keep_header = keep_header

        # Working state; starts out empty.
        self.all_recs: List[str] = []
        self.keys: List[Tuple[Any, ...]] = []
        self.header_rec = None

        # Counters; only 'recs_deduped' is initialized here.
        self.stats = {'recs_deduped': 0}

        # TODO: validate that the directory of out_fqfn exists (raising
        #    ValueError otherwise) once relative-path subtleties are handled.
        #    For reference:
        #    https://stackoverflow.com/questions/918154/relative-paths-in-python

        self.input_handler = file_io.InputHandler([in_fqfn], dialect)

        # The output side reuses whatever dialect the input handler exposes.
        self.output_handler = file_io.OutputHandler(
            out_fqfn, self.input_handler.dialect, sys.stdout)
    def test_multicol(self):
        """Build a frequency on columns 1 and 2 of a large generated file.

        The input comes from ``generate_test_file`` called with the pipe
        delimiter and 10,000,000.  The test asserts that the result was not
        truncated, that the frequency counts sum to 10,000,000, and that
        every key holds only the expected column values.
        """
        import os  # local import: only needed to close the mkstemp descriptor

        # Define the dialect on a local subclass: assigning these attributes
        # directly on csv.Dialect would rewrite the shared stdlib class for
        # everything else running in this process.
        class PipeDialect(csv.Dialect):
            delimiter = '|'
            quoting = csv.QUOTE_MINIMAL
            quotechar = '"'
            has_header = False
            lineterminator = '\n'

        dialect = PipeDialect
        columns = [1, 2]
        files = [generate_test_file(dialect.delimiter, 10_000_000)]

        # mkstemp returns an open OS-level descriptor along with the path.
        # Only the path is used below, so close the descriptor immediately
        # rather than leaking it.
        (fd, outfile) = tempfile.mkstemp(prefix='FreakerTestOut_')
        os.close(fd)

        input_handler = file_io.InputHandler(files, dialect.delimiter,
                                             dialect.quoting,
                                             dialect.quotechar,
                                             dialect.has_header)
        output_handler = file_io.OutputHandler(outfile, input_handler.dialect)

        col_freak = mod.ColSetFreaker(input_handler,
                                      output_handler,
                                      col_type='specified',
                                      number=20_000_000,
                                      sampling_method='non',
                                      sampling_rate=None,
                                      sort_order='reverse',
                                      sort_col=1,
                                      max_key_len=50)
        col_freak.build_freq(columns)
        assert not col_freak.truncated
        pp(col_freak.field_freq)
        assert sum(col_freak.field_freq.values()) == 10_000_000
        for key in col_freak.field_freq.keys():
            assert key[0] in ['A1', 'A2', 'A3', 'A4']
            assert key[1] in ['B1', 'B2']
Ejemplo n.º 3
0
 def setup_method(self, method):
     """Prepare the dialect, input file, options and handlers for a test.

     Stores on ``self`` a pipe-delimited dialect, a list holding one file
     from ``generate_test_file``, the frequency options, a fresh temporary
     directory with an output file path inside it, and the input and
     output handlers built from those values.
     """
     # Define the dialect on a local subclass: assigning these attributes
     # directly on csv.Dialect would rewrite the shared stdlib class for
     # every other user in the process, and would do so again before
     # each test.
     class PipeDialect(csv.Dialect):
         delimiter = '|'
         quoting = csv.QUOTE_MINIMAL
         quotechar = '"'
         has_header = False
         lineterminator = '\n'

     self.dialect = PipeDialect
     self.files = [generate_test_file(self.dialect.delimiter, 1000)]
     self.columns = [1, 2]
     self.col_type = 'specified'
     self.number = 1000
     self.sampling_method = 'non'
     self.sampling_rate = None
     self.sortorder = 'reverse'
     self.sortcol = 1
     self.max_key_len = 50
     self.tempdir = tempfile.mkdtemp(prefix='test_gristle_freaker_')
     self.outfile = pjoin(self.tempdir, 'outfile.txt')
     self.input_handler = file_io.InputHandler(self.files,
                                               self.dialect.delimiter,
                                               self.dialect.quoting,
                                               self.dialect.quotechar,
                                               self.dialect.has_header)
     # The output handler reuses the dialect exposed by the input handler.
     self.output_handler = file_io.OutputHandler(self.outfile,
                                                 self.input_handler.dialect)
Ejemplo n.º 4
0
 def setup_method(self, method):
     """Create ``self.col_freak`` writing into a fresh temporary directory.

     The inputs come from ``generate_col_freaker_dependencies()``; the
     seventh value it returns is not used here.
     """
     deps = generate_col_freaker_dependencies()
     (files, dialect, col_type, number,
      sampling_method, sampling_rate, _, max_key_len) = deps

     out_dir = tempfile.mkdtemp(prefix='test_gristle_freaker_')
     out_path = pjoin(out_dir, 'outfile.txt')

     reader = file_io.InputHandler(files, dialect)
     writer = file_io.OutputHandler(out_path, reader.dialect)

     # Arguments stay positional, in the same order as before; the two
     # literals are the sort order and the sort column.
     self.col_freak = mod.ColSetFreaker(
         reader, writer,
         col_type, number, sampling_method, sampling_rate,
         'reverse', 1,
         max_key_len)
Ejemplo n.º 5
0
    def _write_stdin_to_file(self):
        """Copy the current input handler's rows to a temp file and reopen it.

        Requires ``self.nconfig.infiles`` to be ``['-']``.  Every row
        yielded by ``self.input_handler`` is written to a newly created
        temporary file whose name is stored in ``self.temp_fn``.
        ``self.input_handler`` is then replaced by a handler that reads
        that file, and the elapsed time is reported through ``self._pp``.
        The temporary file is created with ``delete=False`` and is not
        removed here.
        """
        start_time = time.time()
        assert self.nconfig.infiles == ['-']

        # Write through the temp-file handle itself and let the with-block
        # close it.  Previously the handle was left open and the file was
        # opened a second time by name, leaking the first descriptor.
        with tempfile.NamedTemporaryFile(mode='w',
                                         newline='',
                                         encoding='utf-8',
                                         prefix='gristle_slicer_stdin_temp_',
                                         delete=False) as outbuf:
            self.temp_fn = outbuf.name
            # NOTE(review): rows are written with csv's default dialect but
            # read back below with self.nconfig.dialect - confirm these agree.
            writer = csv.writer(outbuf)
            writer.writerows(self.input_handler)

        self.input_handler = file_io.InputHandler([self.temp_fn],
                                                  self.nconfig.dialect,
                                                  return_header=True)
        self._pp(
            f'--------> write_stdin_to_file duration: {time.time() - start_time:.2f}'
        )
Ejemplo n.º 6
0
 def _setup_files(self) -> None:
     """Build the input and output handlers from ``self.nconfig``.

     The input handler is created with ``return_header=True``; the output
     handler is given the dialect exposed by that input handler.
     """
     cfg = self.nconfig

     reader = file_io.InputHandler(
         cfg.infiles, cfg.dialect, return_header=True)
     self.input_handler = reader

     self.output_handler = file_io.OutputHandler(cfg.outfile, reader.dialect)