def test_header_present_regexes_present(self):
    """Check regex-keyed headers when ``header_present=True`` is forced.

    The input is ``_raw_txt_1`` with its first line removed.  The first row
    produced is expected to be the 'Charles the Great' record -- presumably
    because the leading remaining line is consumed as a header; confirm
    against ``reheadered``.
    """
    body = '\n'.join(_raw_txt_1.splitlines()[1:])
    data = csv.reader(StringIO(body))
    # Raw strings keep the regex escapes (\w, \s, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'name': r'(\w+\s+)+', 'email': r'\w+@\w+\.\w+'}
    row = _next(reheadered(data, headers, header_present=True))
    assert row['name'] == 'Charles the Great'
    assert row['email'] == '*****@*****.**'
def test_header_present_guessed(self):
    """Check regex-keyed headers when ``header_present`` is not passed.

    The complete ``_raw_txt_1`` text is supplied, and the first row produced
    is expected to be the 'Nellie Newsock' record.
    """
    data = csv.reader(StringIO(_raw_txt_1))
    # Raw strings keep the regex escapes (\w, \s, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'name': r'(\w+\s+)+', 'email': r'\w+@\w+\.\w+'}
    row = _next(reheadered(data, headers))
    assert row['name'] == 'Nellie Newsock'
    assert row['email'] == '*****@*****.**'
def test_header_absent_regexes_present(self):
    """Check regex-keyed headers when ``header_present=False`` is forced.

    The input is ``_raw_txt_1`` with its first line removed, and the first
    row produced is expected to be the 'Nellie Newsock' record.
    """
    body = '\n'.join(_raw_txt_1.splitlines()[1:])
    data = csv.reader(StringIO(body))
    # Raw strings keep the regex escapes (\w, \s, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'name': r'(\w+\s+)+', 'email': r'\w+@\w+\.\w+'}
    row = _next(reheadered(data, headers, header_present=False))
    assert row['name'] == 'Nellie Newsock'
    assert row['email'] == '*****@*****.**'
def test_list_of_lists_whitespace_before_headers(self):
    """Every row keeps the expected keys when four newlines precede ``_raw_txt_1``."""
    padded = "\n\n\n\n" + _raw_txt_1
    rows = _data(src=padded, reader=csv.reader, with_headers=True)
    for record in reheadered(rows, ['name', 'email', 'zip']):
        for key in ('name', 'email', 'zip'):
            assert key in record
def test_whitespace_safe_in_data(self):
    """Rows built from ``_raw_txt_2`` expose all requested names at ``minimum_score=100``."""
    # NOTE(review): SOURCE contains a second definition with this same name;
    # if both sit in the same class, the later one replaces the earlier one.
    wanted = ('zipcode', 'Name', 'e-mail')
    records = reheadered(_data(_raw_txt_2), list(wanted), minimum_score=100)
    for record in records:
        for key in wanted:
            assert key in record
def test_regexes_preferred_to_fuzzy_match(self):
    """Check that regex-valued headers select columns by content.

    'columnA' must hold values containing '@'; 'columnB', when non-empty,
    must contain at least one digit.
    """
    # Raw strings keep the regex escapes (\w, \d, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'columnA': r'\w+@\w+\.\w+', 'columnB': r'\d+'}
    for row in reheadered(_data(), headers):
        assert 'columnA' in row
        assert '@' in row['columnA']
        assert 'columnB' in row
        if row['columnB']:
            assert re.search(r'\d+', row['columnB'])
def test_mix_regexes_with_column_name_matches(self):
    """Check a headers dict mixing a regex value with a ``None`` value.

    'columnA' (regex) must hold values containing '@'; 'zip' (``None``) must
    be present and, when non-empty, contain at least one digit.
    """
    # Raw strings keep the regex escapes (\w, \d, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'columnA': r'\w+@\w+\.\w+', 'zip': None}
    for row in reheadered(_data(), headers):
        assert 'columnA' in row
        assert '@' in row['columnA']
        assert 'zip' in row
        if row['zip']:
            assert re.search(r'\d+', row['zip'])
def test_optional_in_regex(self):
    """Check a regex-valued header whose key carries the '?:' prefix.

    The key '?:email' is expected to surface as 'email' in each row.  The
    names are deliberately swapped relative to their patterns: 'zip' must
    hold values containing '@', and 'email', when non-empty, digits.
    """
    # Raw strings keep the regex escapes (\w, \d, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'zip': r'\w+@\w+\.\w+', '?:email': r'\d+'}
    for row in reheadered(_data(), headers):
        assert 'zip' in row
        assert '@' in row['zip']
        assert 'email' in row
        if row['email']:
            assert re.search(r'\d+', row['email'])
def test_prefer_fuzzy(self):
    """Check the ``prefer_fuzzy=True`` option against a conflicting regex.

    'name' is given a digits pattern, yet its non-empty values are expected
    to contain no digits; 'columnA' must still hold values containing '@'.
    """
    # Raw strings keep the regex escapes (\w, \d, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'columnA': r'\w+@\w+\.\w+', 'name': r'\d+'}
    for row in reheadered(_data(), headers, prefer_fuzzy=True):
        assert 'columnA' in row
        assert '@' in row['columnA']
        assert 'name' in row
        if row['name']:
            assert not re.search(r'\d+', row['name'])
def test_compiled_regexes(self):
    """Check that pre-compiled pattern objects work as header values.

    'columnA' must hold values containing '@'; 'columnB', when non-empty,
    must contain at least one digit.
    """
    # NOTE(review): SOURCE contains a second definition with this same name;
    # if both sit in the same class, the later one replaces the earlier one.
    #
    # Under re.VERBOSE a '#' comments out the remainder of its *line*, so
    # each fragment needs its own line.  Written on a single line, only the
    # leading \w+ would stay active and the rest would be a comment.
    columnAregex = re.compile(
        r"""\w+      # name
            @\w+     # email provider
            \.\w+    # domain""",
        re.VERBOSE)
    headers = {'columnA': columnAregex, 'columnB': re.compile(r'\d+')}
    for row in reheadered(_data(), headers):
        assert 'columnA' in row
        assert '@' in row['columnA']
        assert 'columnB' in row
        if row['columnB']:
            assert re.search(r'\d+', row['columnB'])
def test_fuzzy_column_name_match(self):
    """Rows use the requested names ('Name', 'mail', 'zipcode') and not 'name', 'email', 'zip'."""
    requested = ['Name', 'mail', 'zipcode']
    for record in reheadered(_data(), requested):
        assert 'Name' in record
        assert 'name' not in record
        assert 'mail' in record
        # The value stored under 'mail' must be truthy, not merely present.
        assert record['mail']
        assert 'email' not in record
        assert 'zipcode' in record
        assert 'zip' not in record
def test_compiled_regexes(self):
    """Check that pre-compiled pattern objects work as header values.

    'columnA' must hold values containing '@'; 'columnB', when non-empty,
    must contain at least one digit.
    """
    # NOTE(review): SOURCE contains a second definition with this same name;
    # if both sit in the same class, the later one replaces the earlier one.
    #
    # Under re.VERBOSE a '#' comments out the remainder of its *line*, so
    # each fragment needs its own line.  Written on a single line, only the
    # leading \w+ would stay active and the rest would be a comment.
    columnAregex = re.compile(
        r"""\w+      # name
            @\w+     # email provider
            \.\w+    # domain""",
        re.VERBOSE)
    headers = {'columnA': columnAregex, 'columnB': re.compile(r'\d+')}
    for row in reheadered(_data(), headers):
        assert 'columnA' in row
        assert '@' in row['columnA']
        assert 'columnB' in row
        if row['columnB']:
            assert re.search(r'\d+', row['columnB'])
def test_fuzzy_column_name_match_list_of_lists(self):
    """Same renaming checks as the fuzzy-match test, with ``csv.reader`` rows as input."""
    source_rows = _data(reader=csv.reader, with_headers=True)
    requested = ['Name', 'mail', 'zipcode']
    for record in reheadered(source_rows, requested):
        assert 'Name' in record
        assert 'name' not in record
        assert 'mail' in record
        # The value stored under 'mail' must be truthy, not merely present.
        assert record['mail']
        assert 'email' not in record
        assert 'zipcode' in record
        assert 'zip' not in record
def test_list_of_lists_no_data(self):
    """Fetching a row raises StopIteration when only the first line of ``_raw_txt_1`` is fed in."""
    first_line = _raw_txt_1.splitlines()[0]
    data = csv.reader(StringIO(first_line))
    with pytest.raises(StopIteration):
        _next(reheadered(data, ['name', 'email', 'zip']))
def test_perfect_column_name_match(self):
    """Every row carries the keys 'name', 'email' and 'zip' when those names are requested."""
    for record in reheadered(_data(), ['name', 'email', 'zip']):
        for key in ('name', 'email', 'zip'):
            assert key in record
def test_optional_column_marker_tolerated(self):
    """A header requested as '?:mail' appears in rows as 'mail', without the prefix."""
    requested = ['Name', '?:mail', 'zip']
    for record in reheadered(_data(), requested):
        assert 'mail' in record
        assert '?:mail' not in record
def test_custom_optional_marker(self):
    """With ``optional_prefix='OPTIONAL~'``, 'OPTIONAL~nationality' yields no 'nationality' key."""
    requested = ['Name', 'mail', 'zip', 'OPTIONAL~nationality']
    records = reheadered(_data(), requested, optional_prefix='OPTIONAL~')
    for record in records:
        assert 'mail' in record
        assert 'nationality' not in record
def test_dict_of_headers_accepted(self):
    """Smoke test: a dict of regex headers yields a first row without raising."""
    # Raw strings keep the regex escapes (\w, \s, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'name': r'(\w+\s+)+', 'email': r'\w+@\w+\.\w+'}
    data = _data()
    _next(reheadered(data, headers))
def test_low_minimum_score(self):
    """With ``minimum_score=50`` the requested 'zip_code' name is present in every row."""
    requested = ['Name', 'mail', 'zip_code']
    records = reheadered(_data(), requested, minimum_score=50)
    for record in records:
        assert 'zip_code' in record
def test_list_of_lists_accepted(self):
    """Smoke test: ``csv.reader`` rows plus regex headers yield a first row without raising."""
    # Raw strings keep the regex escapes (\w, \s, \.) away from Python's own
    # string-escape handling, which warns on unknown escapes.
    headers = {'name': r'(\w+\s+)+', 'email': r'\w+@\w+\.\w+'}
    data = _data(reader=csv.reader, with_headers=True)
    _next(reheadered(data, headers))
def test_whitespace_safe_in_expected(self):
    """Requested names with a leading space (' name', ' zip') appear in rows without it."""
    padded_names = [' name', 'email', ' zip']
    for record in reheadered(_data(), padded_names):
        for key in ('name', 'email', 'zip'):
            assert key in record
def test_keep_extra_with_fuzzy_match(self):
    """With ``keep_extra=True`` rows hold 'Name' and 'e-mail' plus the unrequested 'zip'."""
    records = reheadered(_data(), ['Name', 'e-mail'], keep_extra=True)
    for record in records:
        for key in ('Name', 'e-mail', 'zip'):
            assert key in record
def test_perfect_column_name_match_list_of_lists(self):
    """Every row carries 'name', 'email' and 'zip' when the input comes from ``csv.reader``."""
    source_rows = _data(reader=csv.reader, with_headers=True)
    for record in reheadered(source_rows, ['name', 'email', 'zip']):
        for key in ('name', 'email', 'zip'):
            assert key in record
def test_keep_extra(self):
    """With ``keep_extra=True`` rows hold 'name' and 'email' plus the unrequested 'zip'."""
    records = reheadered(_data(), ['name', 'email'], keep_extra=True)
    for record in records:
        for key in ('name', 'email', 'zip'):
            assert key in record
def test_high_minimum_score(self):
    """Requesting 'Name', 'mail', 'zip' with ``minimum_score=90`` raises KeyError."""
    requested = ['Name', 'mail', 'zip']
    with pytest.raises(KeyError):
        # Both the call and the first fetch stay inside the context so the
        # error is caught wherever it is raised.
        records = reheadered(_data(), requested, minimum_score=90)
        _next(records)
def test_reheadered_accepts_basic_args(self):
    """Smoke test: calling ``reheadered`` with one empty dict and no headers does not raise."""
    # NOTE(review): SOURCE contains a second definition with this same name;
    # if both sit in the same class, the later one replaces the earlier one.
    single_empty_row = [{}]
    no_headers = []
    # The result is deliberately not iterated; only the call itself is checked.
    reheadered(single_empty_row, no_headers)
def test_keep_extra_false(self):
    """With ``keep_extra=False`` rows hold only requested names: no 'zip' and no '' key."""
    records = reheadered(_data(), ['name', 'email'], keep_extra=False)
    for record in records:
        for present in ('name', 'email'):
            assert present in record
        for absent in ('zip', ''):
            assert absent not in record
def test_fuzzy_column_name_match_failure(self):
    """Requesting the name 'thy one true zip code' raises KeyError."""
    requested = ['Name', 'mail', 'thy one true zip code']
    with pytest.raises(KeyError):
        # Both the call and the first fetch stay inside the context so the
        # error is caught wherever it is raised.
        records = reheadered(_data(), requested)
        _next(records)
def test_optional_column_marker_honored(self):
    """A '?:nationality' request produces no 'nationality' key, while 'mail' is still present."""
    requested = ['Name', 'mail', 'zip', '?:nationality']
    for record in reheadered(_data(), requested):
        assert 'mail' in record
        assert 'nationality' not in record
def test_header_absent_and_no_regexes(self):
    """Plain header names with ``header_present=False`` raise KeyError.

    The input is ``_raw_txt_1`` with its first line removed.
    """
    body = '\n'.join(_raw_txt_1.splitlines()[1:])
    data = csv.reader(StringIO(body))
    requested = ['name', 'email', 'zip']
    with pytest.raises(KeyError):
        # Both the call and the first fetch stay inside the context so the
        # error is caught wherever it is raised.
        records = reheadered(data, requested, header_present=False)
        _next(records)
def test_reheadered_accepts_basic_args(self):
    """Smoke test: calling ``reheadered`` with one empty dict and no headers does not raise."""
    # NOTE(review): SOURCE contains a second definition with this same name;
    # if both sit in the same class, the later one replaces the earlier one.
    single_empty_row = [{}]
    no_headers = []
    # The result is deliberately not iterated; only the call itself is checked.
    reheadered(single_empty_row, no_headers)
def test_whitespace_safe_in_data(self):
    """Rows built from ``_raw_txt_2`` expose all requested names at ``minimum_score=100``."""
    # NOTE(review): SOURCE contains a second definition with this same name;
    # if both sit in the same class, the later one replaces the earlier one.
    wanted = ('zipcode', 'Name', 'e-mail')
    records = reheadered(_data(_raw_txt_2), list(wanted), minimum_score=100)
    for record in records:
        for key in wanted:
            assert key in record