def test_json_with_multiple_fields_multiple_rows(temp_json_file):
    """A thousand two-field objects must yield one record per field per row."""
    row_count = 1000
    payload = [{'field_one': 'abc', 'field_two': 50} for _ in range(row_count)]
    write_json(temp_json_file, payload)

    extracted = list(extract_data_from_json(temp_json_file.name))

    expected = (
        [MetadataRecord('field_one', 'abc')] * row_count
        + [MetadataRecord('field_two', 50)] * row_count
    )
    # Order of emission is not part of the contract, so compare sorted.
    assert sorted(extracted) == sorted(expected)
def test_type_inconsistency():
    """Mixed int/str values for the same field must raise CrawlingError."""
    records = [
        MetadataRecord('wrong_field', 20),
        MetadataRecord('wrong_field', 'abc'),
    ]
    with pytest.raises(CrawlingError) as exc_info:
        list(crawl(records))
    assert exc_info.value.args[0] == \
        "The type of field 'wrong_field' is not consistent"
def test_json_with_multiple_fields(temp_json_file):
    """One object with three fields (str, int, null) yields three records."""
    write_json(temp_json_file, [
        {'field_one': 'abc', 'field_two': 50, 'field_three': None},
    ])

    extracted = list(extract_data_from_json(temp_json_file.name))

    expected = [
        MetadataRecord('field_one', 'abc'),
        MetadataRecord('field_two', 50),
        MetadataRecord('field_three', None),
    ]
    assert sorted(extracted) == sorted(expected)
def test_csv_with_multiple_fields(temp_csv_file):
    """One CSV row with quoted str, int, and null columns yields three records."""
    header = ['field_one', 'field_two', 'field_three']
    write_csv(temp_csv_file, header, [
        {'field_one': '"abc"', 'field_two': 50, 'field_three': 'null'},
    ])

    extracted = list(extract_data_from_csv(temp_csv_file.name))

    expected = [
        MetadataRecord('field_one', 'abc'),
        MetadataRecord('field_two', 50),
        MetadataRecord('field_three', None),
    ]
    assert sorted(extracted) == sorted(expected)
def _perform_extractor(
        file_path: str) -> Generator[MetadataRecord, None, None]:
    """
    Perform the extraction of records from the given JSON file.

    The returned generator object produces one MetadataRecord for each
    key/value pair in each JSON object read from file_path.

    Note: this function reads the whole JSON file and produces the records
    one by one. It must be improved to read objects lazily.

    :param file_path: the path to the file to create records from
    :return: a generator object that produces MetadataRecord objects
    :raises ExtractionError: if extraction fails
    """
    with open(file_path, mode='r') as json_file:
        # TODO: avoid reading the whole file at once
        for obj in json.load(json_file):
            # The top-level document must be a list of objects; any other
            # element type (list, scalar, ...) is a structural error.
            if not isinstance(obj, Mapping):
                raise ExtractionError(
                    'Invalid JSON structure. It must contain a list of objects'
                )
            for key, value in obj.items():
                yield MetadataRecord(key, value)
def test_csv_with_several_str_field(temp_csv_file):
    """100 rows of quoted string values yield 100 unquoted string records."""
    count = 100
    rows = [{'field': f'"{idx}"'} for idx in range(count)]
    write_csv(temp_csv_file, ['field'], rows)

    extracted = list(extract_data_from_csv(temp_csv_file.name))

    expected = [MetadataRecord('field', f'{idx}') for idx in range(count)]
    assert sorted(extracted) == sorted(expected)
def test_invalid_type():
    """An unsupported value type (list) must raise CrawlingError."""
    records = [MetadataRecord('wrong_field', [1, 2, 3])]
    with pytest.raises(CrawlingError) as exc_info:
        list(crawl(records))
    assert exc_info.value.args[0] == \
        "The type of field 'wrong_field' is unknown"
def _perform_extraction(
        file_path: str) -> Generator[MetadataRecord, None, None]:
    """
    Perform the extraction of records from the given CSV file.

    The returned generator object produces N * M MetadataRecord for a CSV
    with N columns and M rows.

    :param file_path: the path to the file to create records from
    :return: a generator object that produces MetadataRecord objects
    :raises ExtractionError: if extraction fails
    """
    with open(file_path, newline='', mode='r') as csv_file:
        csv_reader = DictReader(csv_file, delimiter=',', quoting=QUOTE_NONE)
        for row in csv_reader:
            for key, value in row.items():
                try:
                    key = _sanitize_key(key)
                except AttributeError as err:
                    # A None key means the row has more values than the
                    # header has columns. Chain the cause for debuggability.
                    raise ExtractionError(
                        f"Missing column name for value {value} at line {csv_reader.line_num}"
                    ) from err
                try:
                    value = _sanitize_value(value)
                except ValueError as err:
                    # An empty cell and an unparseable cell both surface as
                    # ValueError; distinguish them in the error message.
                    if not value:
                        raise ExtractionError(
                            f"Missing value for column '{key}' at line {csv_reader.line_num}"
                        ) from err
                    raise ExtractionError(
                        f"Unknown type for value '{value}' (column '{key}') at line "
                        f"{csv_reader.line_num}") from err
                yield MetadataRecord(key, value)
def test_csv_with_single_str_field(temp_csv_file):
    """A single quoted CSV value is extracted as an unquoted string."""
    write_csv(temp_csv_file, ['field'], [{'field': '"string_value"'}])
    extracted = list(extract_data_from_csv(temp_csv_file.name))
    assert extracted == [MetadataRecord('field', "string_value")]
def test_json_with_several_nulls_field(temp_json_file):
    """100 JSON objects with null values yield 100 None records."""
    count = 100
    write_json(temp_json_file, [{'field': None} for _ in range(count)])
    extracted = list(extract_data_from_json(temp_json_file.name))
    assert extracted == [MetadataRecord('field', None)] * count
def test_csv_with_single_null_field(temp_csv_file):
    """A literal 'null' CSV cell is extracted as None."""
    write_csv(temp_csv_file, ['field'], [{'field': 'null'}])
    extracted = list(extract_data_from_csv(temp_csv_file.name))
    assert extracted == [MetadataRecord('field', None)]
import pytest

from crawler import crawl, CrawlingError
from common import MetadataRecord, Metadata


@pytest.mark.parametrize('scenario, expected_result', [
    # Empty input produces no metadata.
    ([], []),
    # Single-record fields: int, str, and null-only.
    ([MetadataRecord('field', 30)], [Metadata('field', 'I', 1, 0)]),
    ([MetadataRecord('field', 'abc')], [Metadata('field', 'S', 1, 0)]),
    ([MetadataRecord('field', None)], [Metadata('field', None, 1, 1)]),
    # Nulls mixed with a typed value keep the field's type.
    ([MetadataRecord('field', 30)] * 10 + [MetadataRecord('field', None)] * 5,
     [Metadata('field', 'I', 15, 5)]),
    ([MetadataRecord('field', 'abc')] * 10 +
     [MetadataRecord('field', None)] * 5,
     [Metadata('field', 'S', 15, 5)]),
    # Interleaved fields are aggregated independently.
    ([MetadataRecord('f1', 1)] * 5 + [MetadataRecord('f2', 'abc')] * 5 +
     [MetadataRecord('f1', 2)] * 10 + [MetadataRecord('f2', None)] * 5,
     [Metadata('f1', 'I', 15, 0), Metadata('f2', 'S', 10, 5)]),
])
def test_crawling_mixed_fields(scenario, expected_result):
    """Crawling aggregates records into per-field Metadata summaries."""
    assert sorted(list(crawl(scenario))) == sorted(expected_result)


def test_type_inconsistency():
    """Mixed int/str values for the same field must raise CrawlingError."""
    # NOTE(review): the original chunk was truncated after the `with`
    # statement; completed to match the full version of this test
    # present elsewhere in the source.
    scenario = [
        MetadataRecord('wrong_field', 20),
        MetadataRecord('wrong_field', 'abc')
    ]
    with pytest.raises(CrawlingError) as exc:
        list(crawl(scenario))
    info = exc.value
    assert info.args[0] == "The type of field 'wrong_field' is not consistent"
def test_json_unicode_value(temp_json_file):
    """A non-ASCII JSON value survives extraction unchanged."""
    write_json(temp_json_file, [{'field': '短消息'}])
    extracted = list(extract_data_from_json(temp_json_file.name))
    assert extracted == [MetadataRecord('field', '短消息')]
def test_json_with_single_int_field(temp_json_file):
    """A single integer JSON field yields one int-valued record."""
    write_json(temp_json_file, [{'field': 50}])
    extracted = list(extract_data_from_json(temp_json_file.name))
    assert extracted == [MetadataRecord('field', 50)]
def test_json_unicode_column_name(temp_json_file):
    """A non-ASCII JSON key survives extraction unchanged."""
    write_json(temp_json_file, [{'短消息': 50}])
    extracted = list(extract_data_from_json(temp_json_file.name))
    assert extracted == [MetadataRecord('短消息', 50)]
def test_csv_with_several_nulls_field(temp_csv_file):
    """100 CSV rows with literal 'null' cells yield 100 None records."""
    count = 100
    rows = [{'field': 'null'} for _ in range(count)]
    write_csv(temp_csv_file, ['field'], rows)
    extracted = list(extract_data_from_csv(temp_csv_file.name))
    assert extracted == [MetadataRecord('field', None)] * count
def test_json_with_several_int_fields(temp_json_file):
    """100 JSON objects with distinct int values yield 100 int records."""
    count = 100
    write_json(temp_json_file, [{'field': idx} for idx in range(count)])
    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('field', idx) for idx in range(count)]
    assert sorted(extracted) == sorted(expected)
def test_csv_unicode_value(temp_csv_file):
    """A non-ASCII quoted CSV value survives extraction unchanged."""
    write_csv(temp_csv_file, ['field'], [{'field': '"短消息"'}])
    extracted = list(extract_data_from_csv(temp_csv_file.name))
    assert extracted == [MetadataRecord('field', '短消息')]
def test_json_with_single_str_field(temp_json_file):
    """A single string JSON field yields one str-valued record."""
    write_json(temp_json_file, [{'field': 'string_value'}])
    extracted = list(extract_data_from_json(temp_json_file.name))
    assert extracted == [MetadataRecord('field', 'string_value')]