Example #1
0
def test_json_with_multiple_fields_multiple_rows(temp_json_file):
    """Extract 1000 identical two-field objects from a JSON file.

    One record per field per object is expected, i.e. 1000 records for
    'field_one' and 1000 records for 'field_two'.
    """
    row_count = 1000
    rows = [{'field_one': 'abc', 'field_two': 50} for _ in range(row_count)]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = ([MetadataRecord('field_one', 'abc')] * row_count +
                [MetadataRecord('field_two', 50)] * row_count)

    # Both sides are sorted so the check does not depend on record order.
    assert sorted(extracted) == sorted(expected)
Example #2
0
def test_type_inconsistency():
    """Crawling a field seen first as an int and then as a str must fail.

    The CrawlingError raised is expected to carry the inconsistency message
    as its first argument.
    """
    conflicting_records = [
        MetadataRecord('wrong_field', 20),
        MetadataRecord('wrong_field', 'abc'),
    ]

    with pytest.raises(CrawlingError) as raised:
        # list() consumes whatever crawl() returns, so an error raised
        # during iteration surfaces inside this block.
        list(crawl(conflicting_records))

    message = raised.value.args[0]
    assert message == "The type of field 'wrong_field' is not consistent"
Example #3
0
def test_json_with_multiple_fields(temp_json_file):
    """Extract a single JSON object holding a str, an int and a null field.

    One record per field is expected, with the null coming back as None.
    """
    single_object = {
        'field_one': 'abc',
        'field_two': 50,
        'field_three': None,
    }
    write_json(temp_json_file, [single_object])

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [
        MetadataRecord('field_one', 'abc'),
        MetadataRecord('field_two', 50),
        MetadataRecord('field_three', None),
    ]

    # Both sides are sorted so the check does not depend on record order.
    assert sorted(extracted) == sorted(expected)
Example #4
0
def test_csv_with_multiple_fields(temp_csv_file):
    """Extract a single CSV row holding a quoted str, an int and a null.

    The cell '"abc"' is expected back as the string 'abc', 50 as the int 50
    and the literal 'null' as None.
    """
    header = ['field_one', 'field_two', 'field_three']
    single_row = {
        'field_one': '"abc"',
        'field_two': 50,
        'field_three': 'null',
    }
    write_csv(temp_csv_file, header, [single_row])

    extracted = list(extract_data_from_csv(temp_csv_file.name))
    expected = [
        MetadataRecord('field_one', 'abc'),
        MetadataRecord('field_two', 50),
        MetadataRecord('field_three', None),
    ]

    # Both sides are sorted so the check does not depend on record order.
    assert sorted(extracted) == sorted(expected)
Example #5
0
def _perform_extractor(
        file_path: str) -> Generator[MetadataRecord, None, None]:
    """
    Perform the extraction of records from the given JSON file.

    The returned generator object produces one MetadataRecord for each
    key/value pair of each JSON object read from file_path.

    Note: this function reads the whole JSON file and then produces the
    records one by one. It should be improved to read objects lazily.

    Errors raised by json.load for malformed JSON are not converted here and
    propagate unchanged.

    :param file_path: the path to the file to create records from
    :return: a generator object that produces MetadataRecord objects
    :raises ExtractionError: if the file does not contain a list of objects
    """
    # JSON text is UTF-8 by specification, so do not rely on the locale's
    # default encoding when decoding the file.
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        # TODO: avoid reading the whole file at once
        document = json.load(json_file)

    # The document is fully parsed at this point, so the file is already
    # closed before any record is handed to the consumer.
    invalid_structure = (
        'Invalid JSON structure. It must contain a list of objects')

    # A top-level value that is not an array (object, number, null, ...)
    # violates the expected structure; report it with the same error used
    # for invalid elements instead of failing on iteration or passing
    # silently.
    if not isinstance(document, list):
        raise ExtractionError(invalid_structure)

    for obj in document:
        if not isinstance(obj, Mapping):
            raise ExtractionError(invalid_structure)

        for key, value in obj.items():
            yield MetadataRecord(key, value)
Example #6
0
def test_csv_with_several_str_field(temp_csv_file):
    """Extract 100 CSV rows whose single cell is a quoted number.

    Each quoted cell is expected back as the corresponding unquoted string.
    """
    row_count = 100
    rows = [{'field': f'"{idx}"'} for idx in range(row_count)]
    write_csv(temp_csv_file, ['field'], rows)

    extracted = list(extract_data_from_csv(temp_csv_file.name))
    expected = [MetadataRecord('field', f'{idx}') for idx in range(row_count)]

    # Both sides are sorted so the check does not depend on record order.
    assert sorted(extracted) == sorted(expected)
Example #7
0
def test_invalid_type():
    """Crawling a record whose value is a list must fail.

    The CrawlingError raised is expected to carry the unknown-type message
    as its first argument.
    """
    unsupported_records = [MetadataRecord('wrong_field', [1, 2, 3])]

    with pytest.raises(CrawlingError) as raised:
        # list() consumes whatever crawl() returns, so an error raised
        # during iteration surfaces inside this block.
        list(crawl(unsupported_records))

    message = raised.value.args[0]
    assert message == "The type of field 'wrong_field' is unknown"
Example #8
0
def _perform_extraction(
        file_path: str) -> Generator[MetadataRecord, None, None]:
    """
    Perform the extraction of records from the given CSV file.

    The returned generator object produces N * M MetadataRecord for a CSV
    with N columns and M rows. Every column name goes through _sanitize_key
    and every cell through _sanitize_value before the record is produced.

    :param file_path: the path to the file to create records from
    :return: a generator object that produces MetadataRecord objects
    :raises ExtractionError: if a column name or a value cannot be sanitized
    """
    # NOTE(review): no explicit encoding is given, so the locale default is
    # used to decode the file -- confirm this matches how the files are
    # written.
    with open(file_path, newline='', mode='r') as csv_file:
        # QUOTE_NONE keeps quote characters in the cells, so the raw text
        # (quotes included) is what reaches _sanitize_value.
        csv_reader = DictReader(csv_file, delimiter=',', quoting=QUOTE_NONE)
        for row in csv_reader:
            for key, value in row.items():
                try:
                    key = _sanitize_key(key)
                except AttributeError as err:
                    # Presumably the key is None here: DictReader stores
                    # surplus cells of a row under its restkey, which
                    # defaults to None -- confirm against _sanitize_key.
                    raise ExtractionError(
                        f"Missing column name for value {value} at line {csv_reader.line_num}"
                    ) from err

                try:
                    value = _sanitize_value(value)
                except ValueError as err:
                    # An empty cell is reported as missing; anything else
                    # is a value whose type could not be determined.
                    if not value:
                        raise ExtractionError(
                            f"Missing value for column '{key}' at line {csv_reader.line_num}"
                        ) from err
                    raise ExtractionError(
                        f"Unknown type for value '{value}' (column '{key}') at line "
                        f"{csv_reader.line_num}") from err

                yield MetadataRecord(key, value)
Example #9
0
def test_csv_with_single_str_field(temp_csv_file):
    """A single quoted CSV cell is extracted as one unquoted str record."""
    rows = [{'field': '"string_value"'}]
    write_csv(temp_csv_file, ['field'], rows)

    extracted = list(extract_data_from_csv(temp_csv_file.name))
    expected = [MetadataRecord('field', "string_value")]

    assert extracted == expected
Example #10
0
def test_json_with_several_nulls_field(temp_json_file):
    """100 JSON objects with a null field yield 100 records holding None."""
    row_count = 100
    rows = [{'field': None} for _ in range(row_count)]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('field', None)] * row_count

    assert extracted == expected
Example #11
0
def test_csv_with_single_null_field(temp_csv_file):
    """A single CSV cell containing 'null' is extracted as a None record."""
    rows = [{'field': 'null'}]
    write_csv(temp_csv_file, ['field'], rows)

    extracted = list(extract_data_from_csv(temp_csv_file.name))
    expected = [MetadataRecord('field', None)]

    assert extracted == expected
Example #12
0
import pytest

from crawler import crawl, CrawlingError
from common import MetadataRecord, Metadata


# Each case pairs the records fed to crawl() with the Metadata expected back.
# NOTE(review): the Metadata arguments look like (name, type code, total
# count, null count) -- confirm against the Metadata definition.
@pytest.mark.parametrize('scenario, expected_result', [
    # No input records, no metadata.
    (
        [],
        [],
    ),
    # A single int, str or null record.
    (
        [MetadataRecord('field', 30)],
        [Metadata('field', 'I', 1, 0)],
    ),
    (
        [MetadataRecord('field', 'abc')],
        [Metadata('field', 'S', 1, 0)],
    ),
    (
        [MetadataRecord('field', None)],
        [Metadata('field', None, 1, 1)],
    ),
    # One field mixing typed values with nulls.
    (
        [MetadataRecord('field', 30)] * 10 +
        [MetadataRecord('field', None)] * 5,
        [Metadata('field', 'I', 15, 5)],
    ),
    (
        [MetadataRecord('field', 'abc')] * 10 +
        [MetadataRecord('field', None)] * 5,
        [Metadata('field', 'S', 15, 5)],
    ),
    # Two interleaved fields, one of them with nulls.
    (
        [MetadataRecord('f1', 1)] * 5 +
        [MetadataRecord('f2', 'abc')] * 5 +
        [MetadataRecord('f1', 2)] * 10 +
        [MetadataRecord('f2', None)] * 5,
        [
            Metadata('f1', 'I', 15, 0),
            Metadata('f2', 'S', 10, 5),
        ],
    ),
])
def test_crawling_mixed_fields(scenario, expected_result):
    """Crawl each scenario and compare the metadata, ignoring order."""
    produced = sorted(crawl(scenario))
    assert produced == sorted(expected_result)


def test_type_inconsistency():
    scenario = [
        MetadataRecord('wrong_field', 20),
        MetadataRecord('wrong_field', 'abc')
    ]

    with pytest.raises(CrawlingError) as exc:
Example #13
0
def test_json_unicode_value(temp_json_file):
    """A non-ASCII JSON string value is extracted unchanged."""
    rows = [{'field': '短消息'}]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('field', '短消息')]

    assert extracted == expected
Example #14
0
def test_json_with_single_int_field(temp_json_file):
    """A single JSON object with an int field yields one int record."""
    rows = [{'field': 50}]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('field', 50)]

    assert extracted == expected
Example #15
0
def test_json_unicode_column_name(temp_json_file):
    """A non-ASCII JSON key is kept unchanged as the record's field name."""
    rows = [{'短消息': 50}]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('短消息', 50)]

    assert extracted == expected
Example #16
0
def test_csv_with_several_nulls_field(temp_csv_file):
    """100 CSV rows containing 'null' yield 100 records holding None."""
    row_count = 100
    rows = [{'field': 'null'} for _ in range(row_count)]
    write_csv(temp_csv_file, ['field'], rows)

    extracted = list(extract_data_from_csv(temp_csv_file.name))
    expected = [MetadataRecord('field', None)] * row_count

    assert extracted == expected
Example #17
0
def test_json_with_several_int_fields(temp_json_file):
    """100 JSON objects holding the ints 0..99 yield one record each."""
    row_count = 100
    rows = [{'field': idx} for idx in range(row_count)]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('field', idx) for idx in range(row_count)]

    # Both sides are sorted so the check does not depend on record order.
    assert sorted(extracted) == sorted(expected)
Example #18
0
def test_csv_unicode_value(temp_csv_file):
    """A quoted non-ASCII CSV cell is extracted as the unquoted string."""
    rows = [{'field': '"短消息"'}]
    write_csv(temp_csv_file, ['field'], rows)

    extracted = list(extract_data_from_csv(temp_csv_file.name))
    expected = [MetadataRecord('field', '短消息')]

    assert extracted == expected
Example #19
0
def test_json_with_single_str_field(temp_json_file):
    """A single JSON object with a str field yields one str record."""
    rows = [{'field': 'string_value'}]
    write_json(temp_json_file, rows)

    extracted = list(extract_data_from_json(temp_json_file.name))
    expected = [MetadataRecord('field', 'string_value')]

    assert extracted == expected