def analyze_file(self) -> csvhelper.Dialect:
    """ Inspects the file at self.fqfn and stores what was learned about
        its structure on the instance.

        If a delimiter was already supplied, the dialect is assembled from
        the supplied values; otherwise it is taken from self._get_dialect().
        The format type, header flag, field count and record count are then
        recorded on self.

        Returns: the resulting dialect (also stored as self.dialect).
        Raises:  IOErrorEmptyFile if the file has a size of zero bytes.
    """
    file_is_empty = os.path.getsize(self.fqfn) == 0
    if file_is_empty:
        raise IOErrorEmptyFile("Empty File")

    if not self._delimiter:
        # No delimiter supplied: take the dialect from _get_dialect() and
        # copy its quoting & delimiter back onto the instance.
        self.dialect = self._get_dialect()
        self.dialect.lineterminator = '\n'
        self._quoting_num = self.dialect.quoting
        self._delimiter = self.dialect.delimiter
    else:
        # Delimiter supplied: fall back to minimal quoting when no quoting
        # was given, then build the dialect from the supplied values.
        if self._quoting_num is None:
            self._quoting_num = csv.QUOTE_MINIMAL
        self.dialect = csvhelper.Dialect(
            self._delimiter,
            self._get_has_header(),
            self._quoting_num,
            self.quote_char,
            None,
            None,
            '\n',
            False,
        )

    self.format_type = self._get_format_type()
    self.dialect.has_header = self._get_has_header(self._has_header)
    self._has_header = self.dialect.has_header

    # Not dialect attributes themselves, but computed using the csv dialect:
    self.field_cnt = self._get_field_cnt()
    self.record_cnt, self.record_cnt_is_est = self._count_records()

    return self.dialect
def setup_method(self, method):
    """ Builds the fixtures for a diff test: a temp dir, an 'old' and a
        'new' file generated with a tab-delimited dialect, and a Config
        loaded with the comparison properties and one assignment.
    """
    self.temp_dir = tempfile.mkdtemp(prefix='gristle_diff_')

    # The dialect is created with a pipe delimiter and then switched to tab
    self.dialect = csvhelper.Dialect(has_header=False,
                                     quoting=csv.QUOTE_NONE,
                                     delimiter='|')
    self.dialect.delimiter = '\t'

    file1_recs = [
        ['chg-row', '4', '14'],
        ['del-row', '6', '16'],
        ['same-row', '8', '18'],
    ]
    self.file1 = generate_test_file(self.temp_dir, 'old_', '.csv',
                                    self.dialect, file1_recs)

    file2_recs = [
        ['chg-row', '4', '1a'],
        ['new-row', '13a', '45b'],
        ['same-row', '8', '18'],
    ]
    self.file2 = generate_test_file(self.temp_dir, 'new_', '.csv',
                                    self.dialect, file2_recs)

    self.config = Config(self.temp_dir)
    # Properties are added one at a time, in this order
    properties = (
        {'delimiter': 'tab'},
        {'has_header': False},
        {'quoting': csv.QUOTE_NONE},
        {'col_names': ['col0', 'col1', 'col2']},
        {'key_cols': ['0']},
        {'compare_cols': ['2']},
        {'temp_dir': self.temp_dir},
        {'files': [self.file1, self.file2]},
    )
    for prop in properties:
        self.config.add_property(prop)

    self.config.add_assignment('chgnew', 'col1', 'copy', None, 'old', 'col0')
def setup_method(self, method):
    """ Creates an input temp dir holding one test file, a comma-delimited
        unquoted dialect, and a separate output temp dir.
    """
    input_dir = tempfile.mkdtemp(prefix='gristle_test_')
    self.temp_dir = input_dir

    self.dialect = csvhelper.Dialect(has_header=False,
                                     quoting=csv.QUOTE_NONE,
                                     delimiter=',')

    self.fqfn = create_test_file(self.temp_dir)

    output_dir = tempfile.mkdtemp(prefix='gristle_out_')
    self.out_dir = output_dir
def setup_method(self, method):
    """ Generates a test file from a pipe-delimited, unquoted dialect with
        self.record_cnt (100) requested records, then runs
        FileTyper.analyze_file() on it so tests can inspect self.file_typer.
    """
    self.record_cnt = 100
    self.dialect = csvhelper.Dialect(has_header=False,
                                     quoting=csv.QUOTE_NONE,
                                     delimiter='|')

    self.test_fqfn = generate_test_file1(self.dialect, self.record_cnt)

    typer = mod.FileTyper(self.dialect, self.test_fqfn)
    self.file_typer = typer
    self.file_typer.analyze_file()
def setup_method(self, method):
    """ Creates a temp dir, a comma-delimited unquoted dialect, and the
        CreateTestFiles fixture (called with 1000000), printing how long
        the file creation took.
    """
    self.temp_dir = tempfile.mkdtemp(prefix='gristle_diff_')
    self.dialect = csvhelper.Dialect(has_header=False,
                                     quoting=csv.QUOTE_NONE,
                                     delimiter=',')

    started_at = time.time()
    print('\ncreating test files - starting')
    self.files = CreateTestFiles(1000000, self.temp_dir)

    # Whole seconds only: the elapsed time is truncated to an int
    elapsed_secs = int(time.time() - started_at)
    print('creating test files - done with duration of %d seconds' % elapsed_secs)
def test_quote_all(self):
    """ With a QUOTE_ALL pipe-delimited dialect, FileTyper should report
        self.record_cnt records and 4 fields.
    """
    quote_all_dialect = csvhelper.Dialect(has_header=False,
                                          quoting=csv.QUOTE_ALL,
                                          delimiter='|')
    self.test_fqfn = generate_test_file1(quote_all_dialect, self.record_cnt)

    typer = mod.FileTyper(quote_all_dialect, self.test_fqfn)
    typer.analyze_file()

    assert typer.record_cnt == self.record_cnt
    assert typer.field_cnt == 4
def test_load_and_gets(self):
    """ Loads a Header from a team file that has a header row and checks
        the name <-> position lookups.
    """
    team_dialect = csvhelper.Dialect(has_header=True,
                                     quoting=csv.QUOTE_ALL,
                                     delimiter='|')
    team_fqfn = ttools.make_team_file(self.temp_dir, team_dialect, 10)

    header = csvhelper.Header()
    header.load_from_file(team_fqfn, team_dialect)

    # Both name collections must be non-empty after loading
    assert len(header.raw_field_names)
    assert len(header.field_names)

    # Lookups by name and by position
    assert header.get_field_position('role') == 2
    assert header.get_field_name(3) == 'name'

    # The "from_any" lookup accepts a position string as well as a name
    assert header.get_field_position_from_any('3') == 3
    assert header.get_field_position_from_any('name') == 3
def test_empty_file(self):
    """ get_dialect() must raise EOFError when its only input is the file
        produced by make_team_file() with a count of 0.
    """
    dialect = csvhelper.Dialect(delimiter='|', quoting=csv.QUOTE_ALL, has_header=False)
    fqfn = ttools.make_team_file(self.temp_dir, dialect, 0)

    with pytest.raises(EOFError):
        # The call is expected to raise, so its result is deliberately not
        # bound to a name (the former assignment was never read).
        csvhelper.get_dialect([fqfn],
                              delimiter=None,
                              quoting=None,
                              quotechar=None,
                              has_header=None,
                              doublequote=None,
                              escapechar=None,
                              skipinitialspace=False,
                              verbosity='normal')
def test_multiple_files(self):
    """ Passes two team files (made with counts of 0 and 1000) to
        get_dialect() with no overrides and checks the resulting dialect.
    """
    source_dialect = csvhelper.Dialect(has_header=False,
                                       quoting=csv.QUOTE_ALL,
                                       delimiter='|')
    fqfn_0 = ttools.make_team_file(self.temp_dir, source_dialect, 0)
    fqfn_1000 = ttools.make_team_file(self.temp_dir, source_dialect, 1000)
    input_files = [fqfn_0, fqfn_1000]

    found = csvhelper.get_dialect(input_files,
                                  delimiter=None,
                                  quoting=None,
                                  quotechar=None,
                                  has_header=None,
                                  doublequote=None,
                                  escapechar=None,
                                  skipinitialspace=False,
                                  verbosity='normal')

    assert found.delimiter == '|'
    assert found.quoting == csv.QUOTE_ALL
    assert found.quotechar == '"'
    assert found.has_header is False
def test_nondup(self):
    """ Runs gristle_sorter with key '0sf' on a test file created with
        duplicate=True and checks the first field of the first five
        output records.
    """
    in_fqfn = create_test_file(self.temp_dir, duplicate=True)
    dialect = csvhelper.Dialect(delimiter=',',
                                quoting=csv.QUOTE_NONE,
                                quotechar=None,
                                has_header=False,
                                doublequote=False)
    out_fqfn = in_fqfn + '.sorted'

    # The trailing backslash joins the two lines into a single command line
    cmd = f''' {pjoin(SCRIPT_DIR, 'gristle_sorter')} \
               -i {in_fqfn} -o {out_fqfn} -k 0sf
           '''
    executor(cmd, expect_success=True)

    recs = get_file_contents(out_fqfn, dialect)

    # Checked in order; the value '3' is expected twice
    expected_first_fields = ('1', '2', '3', '3', '4')
    for idx, expected in enumerate(expected_first_fields):
        assert recs[idx][0] == expected
def test_two_keys(self):
    """ Runs gristle_sorter with two keys ('0ir' and '1sf') on the complex
        headerless test file and compares the full output to the expected
        record order.
    """
    in_fqfn = create_complex_test_file(self.temp_dir, header=False)
    out_fqfn = in_fqfn + '.sorted'
    dialect = csvhelper.Dialect(delimiter=',',
                                quoting=csv.QUOTE_NONE,
                                quotechar=None,
                                has_header=False,
                                doublequote=False)

    # The trailing backslash joins the two lines into a single command line
    cmd = f''' {pjoin(SCRIPT_DIR, 'gristle_sorter')} \
               -i {in_fqfn} -o {out_fqfn} -k 0ir 1sf -q quote_none -d ',' --has-no-header --no-doublequote
           '''
    executor(cmd, expect_success=True)

    actual_recs = get_file_contents(out_fqfn, dialect)
    expected_recs = [
        ['4', 'aaa', 'a23'],
        ['4', 'aba', 'a23'],
        ['4', 'bbb', 'a23'],
        ['3', 'aaa', 'b23'],
        ['3', 'aaa', 'b23'],
        ['1', 'aaa', 'a23'],
    ]

    # Printed so the actual records are visible in the captured output
    pp(actual_recs)
    assert actual_recs == expected_recs
def test_get_overridden_dialect(self):
    """ Every override passed to get_dialect() should show up in the
        returned dialect, even though the file was made with a different
        (pipe-delimited, QUOTE_ALL, headerless) dialect.
    """
    file_dialect = csvhelper.Dialect(has_header=False,
                                     quoting=csv.QUOTE_ALL,
                                     delimiter='|')
    fqfn = ttools.make_team_file(self.temp_dir, file_dialect, 1000)

    overridden = csvhelper.get_dialect([fqfn],
                                       delimiter=',',
                                       quoting='quote_none',
                                       quotechar='!',
                                       has_header=True,
                                       doublequote=False,
                                       escapechar='\\',
                                       skipinitialspace=False,
                                       verbosity='normal')

    assert overridden.delimiter == ','
    # quoting was passed as the string 'quote_none'; the csv constant is expected back
    assert overridden.quoting == csv.QUOTE_NONE
    assert overridden.quotechar == '!'
    assert overridden.has_header is True
    assert overridden.doublequote is False
    assert overridden.escapechar == '\\'
def test_non_override(self):
    """ override_dialect() called with None for every overridable value
        should return a dialect carrying the original values.
    """
    base_dialect = csvhelper.Dialect(delimiter='|',
                                     has_header=False,
                                     quoting=csv.QUOTE_NONE,
                                     quotechar='!',
                                     doublequote=False,
                                     escapechar='\\')

    result = csvhelper.override_dialect(base_dialect,
                                        delimiter=None,
                                        quoting=None,
                                        quotechar=None,
                                        has_header=None,
                                        doublequote=None,
                                        skipinitialspace=False,
                                        escapechar=None)

    assert result.delimiter == '|'
    assert result.quoting == csv.QUOTE_NONE
    assert result.quotechar == '!'
    assert result.has_header is False
    assert result.doublequote is False
    assert result.escapechar == '\\'
def setup_method(self, method):
    """ Creates a temp dir and a pipe-delimited, unquoted, headerless
        dialect for the tests.
    """
    diff_dir = tempfile.mkdtemp(prefix='gristle_diff_')
    self.temp_dir = diff_dir

    self.dialect = csvhelper.Dialect(has_header=False,
                                     quoting=csv.QUOTE_NONE,
                                     delimiter='|')