def module_target_schema_with_transient(
        module_source_schema, module_target_track_with_transient) -> Schema:
    """Assemble a target schema whose tracks come from the supplied factory.

    :param module_source_schema: Schema recorded as the ``source`` of the
        returned schema.
    :param module_target_track_with_transient: Callable invoked as
        ``factory(prefix, track_name)``; it is called once with
        ``("t", "temporal")`` and once with ``("i", "immutable")``.
    :return: A ``Schema`` built from the two tracks, linked to the source.
    """
    make_track = module_target_track_with_transient
    temporal_track: Track = make_track("t", "temporal")
    immutable_track: Track = make_track("i", "immutable")

    result: Schema = Schema(temporal_track, immutable_track)
    # The source is attached after construction rather than passed in.
    result.source = module_source_schema
    return result
Beispiel #2
0
 def build(cls, conf_dir, data_dir, name):
     """Construct a task from its YAML specification.

     The specification is read from ``<tasks_dir>/<name>.yaml``, where
     ``tasks_dir`` comes from a ``PathLocator`` built from ``conf_dir`` and
     ``data_dir``. The origin and target schemas named in the specification
     are loaded, ``cls`` is instantiated and the task's steps are loaded.

     :param conf_dir: Configuration base directory.
     :param data_dir: Data base directory.
     :param name: Task name (file name without the ``.yaml`` extension).
     :return: The constructed task.
     """
     logging.info("Constructing task execution plan.")
     path_locator = PathLocator(conf=conf_dir, data=data_dir)
     logging.info(
         "Configuration base directory is %s; data base directory is %s." %
         (conf_dir, data_dir))

     task_path: str = os.path.join(path_locator.tasks_dir, name + '.yaml')
     with open(task_path, 'r') as task_file:
         logging.info("Task configuration loaded from %s." % task_path)
         spec = yaml.safe_load(task_file)

     # The 'resulting_in' section is optional; an empty mapping stands in.
     resulting_in = spec.get('resulting_in', {})

     origin_data = spec['starting_with']['data']
     origin_schema = Schema.load(path_locator, spec['starting_with']['schema'])
     target_data = resulting_in.get('data')
     target_schema = Schema.load(path_locator, resulting_in.get('schema'))

     task = cls(path_locator=path_locator,
                origin_data=origin_data,
                origin_schema=origin_schema,
                target_data=target_data,
                target_schema=target_schema)
     task.load_steps(spec['steps'])

     # Target data may only be omitted when the final step is a Consume step.
     assert task.target_data is not None or isinstance(
         task.steps[-1], Consume)
     return task
Beispiel #3
0
def schema_validate(schema_basepath: str, schema_name: str,
                    schema_source_name: Optional[str]) -> None:
    """Load a schema purely to validate it; the loaded schema is discarded.

    :param schema_basepath: Base path handed to ``Schema.load``.
    :param schema_name: Name of the schema to validate.
    :param schema_source_name: Name of the source schema to load first and
        pass along as ``source_schema``, or ``None`` for no source.
    """
    source: Optional[Schema] = (
        Schema.load(schema_source_name, schema_basepath)
        if schema_source_name is not None
        else None
    )
    Schema.load(schema_name, schema_basepath, source_schema=source)
Beispiel #4
0
 def build(cls, context: Context, name: str) -> "Task":
     """Construct a task from its YAML specification.

     The specification is read from ``<conf_dir>/tasks/<name>.yaml``. The
     origin and target schemas it names are loaded from
     ``context.schemas_dir``, ``cls`` is instantiated and the task's steps
     are loaded.

     :param context: Supplies ``conf_dir``, ``entities_input_dir`` and
         ``schemas_dir``.
     :param name: Task name (file name without the ``.yaml`` extension).
     :return: The constructed task.
     """
     logging.info("Constructing task execution plan.")
     logging.info(
         "Configuration base directory is %s; entities directory is %s." %
         (context.conf_dir, context.entities_input_dir))

     task_path: str = os.path.join(context.conf_dir, 'tasks',
                                   name + '.yaml')
     with open(task_path, 'r') as task_file:
         logging.info("Task configuration loaded from %s." % task_path)
         spec = yaml.safe_load(task_file)

     # The 'resulting_in' section is optional; an empty mapping stands in.
     resulting_in = spec.get('resulting_in', {})

     origin_schema = Schema.load(spec['starting_with']['schema'],
                                 context.schemas_dir)
     assert origin_schema is not None

     origin_data = spec['starting_with']['data']
     target_data = resulting_in.get('data')
     target_schema = Schema.load(resulting_in.get('schema'),
                                 context.schemas_dir)

     task = cls(context=context,
                origin_data=origin_data,
                origin_schema=origin_schema,
                target_data=target_data,
                target_schema=target_schema)
     task.load_steps(spec['steps'])

     # Target data may only be omitted when the final step is a Consume step.
     assert task.target_data is not None or isinstance(
         task.steps[-1], Consume)
     return task
Beispiel #5
0
 def from_files(cls, schema_basepath: str, source_schema: str, target_schema: str, output_file: TextIO) -> None:
     """Load the source and target schemas, run the export, and close the output.

     :param schema_basepath: Base path handed to ``Schema.load``.
     :param source_schema: Name of the source schema.
     :param target_schema: Name of the target schema; it is loaded with the
         source schema attached.
     :param output_file: Open text handle the export writes to. It is always
         closed before this method returns, even if loading or exporting
         raises.
     """
     try:
         source_schema_instance: Optional[Schema] = Schema.load(source_schema, base_path=schema_basepath)
         target_schema_instance: Optional[Schema] = Schema.load(target_schema, source_schema=source_schema_instance,
                                                      base_path=schema_basepath)
         assert target_schema_instance is not None
         export: "ExportLinkages" = cls(target_schema_instance, output_file)
         export()
     finally:
         # Close on every path so a failure does not leak the caller's handle.
         output_file.close()
Beispiel #6
0
    def __init__(self, schema: Schema, list_id: VariableId,
                 argument_id: VariableId, identifier_id: Optional[VariableId]):
        """Resolve the variable paths this object works with.

        :param schema: Schema used to look up each variable id.
        :param list_id: Id of the variable whose absolute path becomes
            ``subjects_path``.
        :param argument_id: Id of the variable whose relative path becomes
            ``arg_path``.
        :param identifier_id: Optional id of the variable whose relative path
            becomes ``identifier_path``; ``None`` leaves it as ``None``.
        """
        list_var = schema.get(list_id)
        self.subjects_path: ListType[str] = list_var.absolute_path

        argument_var = schema.get(argument_id)
        self.arg_path: ListType[str] = argument_var.relative_path

        self.identifier_path: Optional[ListType[str]] = None
        if identifier_id is None:
            return
        self.identifier_path = schema.get(identifier_id).relative_path
    def _make_schema(temporal: bool) -> Schema:
        """Build a schema with ``make_spec()`` on one track and ``{}`` on the other.

        :param temporal: When true the spec goes on the temporal track;
            otherwise it goes on the immutable track.
        :return: The resulting ``Schema``.
        """
        # The populated track is always built first, as in both original branches.
        if not temporal:
            immutable_track: Track = Track.build(make_spec(), None, "immutable")
            temporal_track: Track = Track.build({}, None, "temporal")
            return Schema(temporal_track, immutable_track)

        temporal_track = Track.build(make_spec(), None, "temporal")
        immutable_track = Track.build({}, None, "immutable")
        return Schema(temporal_track, immutable_track)
Beispiel #8
0
 def from_files(cls, schema_basepath: str, source_schema: str, target_schema: str, input_file: TextIO, suffix: str) -> None:
     """Import linkages into the target schema and serialize the result.

     The schemas are loaded, ``cls`` is run over ``input_file``, and the
     target schema is serialized to ``<schema_basepath>/<target_schema>_<suffix>``.

     :param schema_basepath: Base path for loading schemas and for the output
         directory.
     :param source_schema: Name of the source schema.
     :param target_schema: Name of the target schema; it is loaded with the
         source schema attached.
     :param input_file: Open text handle read by the import. It is always
         closed once the import finishes or fails.
     :param suffix: Appended (after an underscore) to ``target_schema`` to
         form the output directory name.
     """
     try:
         source_schema_instance: Optional[Schema] = Schema.load(source_schema, base_path=schema_basepath)
         target_schema_instance: Optional[Schema] = Schema.load(target_schema, source_schema=source_schema_instance,
                                                      base_path=schema_basepath)
         assert target_schema_instance is not None
         do_import: "ImportLinkages" = cls(target_schema_instance, input_file)
         do_import()
     finally:
         # Close on every path so a failure does not leak the caller's handle.
         input_file.close()

     output_schema_relpath: str = "%s_%s" % (target_schema, suffix)
     output_path: str = os.path.join(schema_basepath, output_schema_relpath)
     # exist_ok avoids the race between checking for the directory and creating it.
     os.makedirs(output_path, exist_ok=True)
     target_schema_instance.serialize(output_path)
Beispiel #9
0
def test_get_conflict_raises(track_type):
    """``Schema.get`` raises ``ValueError`` when both tracks define id "A".

    :param track_type: Forwarded to ``Schema.get`` as ``track_type``.
    """
    shared_spec: Dict = {
        "A": {"name": "temporal variable", "data_type": "Text", "sort_order": 0}
    }
    temporal_track = Track.build(shared_spec, None, "temporal")
    # The immutable track gets its own deep copy of the same spec.
    immutable_track = Track.build(copy.deepcopy(shared_spec), None, "immutable")
    conflicted = Schema(temporal_track, immutable_track)

    with pytest.raises(ValueError):
        conflicted.get("A", track_type=track_type)
Beispiel #10
0
    def standalone(cls, context: Context, translate_dir: str, trace_dir: str,
                   source_schema_name: str, target_schema_name: str,
                   output_filename: str) -> None:
        """Load both schemas, build an instance of ``cls`` and run it once.

        :param context: Supplies ``schemas_dir`` and is passed to ``cls``.
        :param translate_dir: Passed through to ``cls``.
        :param trace_dir: Passed through to ``cls``.
        :param source_schema_name: Name of the source schema to load.
        :param target_schema_name: Name of the target schema, loaded with the
            source schema attached.
        :param output_filename: Passed through to ``cls``.
        """
        schemas_dir = context.schemas_dir

        origin: Optional[Schema] = Schema.load(source_schema_name, schemas_dir)
        assert origin is not None

        target: Optional[Schema] = Schema.load(target_schema_name,
                                               context.schemas_dir, origin)
        assert target is not None

        runner: "SourceCoverage" = cls(context, target, translate_dir,
                                       trace_dir, output_filename)
        # The instance is invoked with placeholder arguments.
        runner("dummy", None)
def test_list_na_absent_match(complex_track, empty_track, entity_id):
    """An explicit NA sentinel in the fixture matches an item absent from the actual data."""
    schema: Schema = Schema(empty_track, complex_track)

    fixture: Dict = {
        "outer": [{
            "the_folder": {
                "inner": [{"some_text": "foo"}, {"some_text": POLYTROPOS_NA}],
            }
        }]
    }
    # The second inner item omits "some_text" entirely.
    observed: Dict = {
        "outer": [{
            "the_folder": {
                "inner": [{"some_text": "foo"}, {}],
            }
        }]
    }

    outer_match = ValueMatch(entity_id, "immutable", "/outer", "List",
                             json.dumps(fixture["outer"]))
    expected: Outcome = Outcome()
    expected.matches.append(outer_match)

    actual: Outcome = Outcome()
    CrawlImmutable(entity_id, schema, fixture, observed, actual)()

    assert expected == actual
def nested_list_schema() -> Schema:
    """Build a schema whose immutable track nests a Text inside two Lists.

    The chain is ``outer_list_1`` (List) -> ``inner_list`` (List) ->
    ``name`` (Text); the temporal track is built from an empty spec.
    """
    immutable_spec: Dict = {
        "outer_list_1_id": {
            "name": "outer_list_1", "data_type": "List", "sort_order": 0
        },
        "inner_list_1_id": {
            "name": "inner_list", "data_type": "List",
            "parent": "outer_list_1_id", "sort_order": 0
        },
        "name_1_id": {
            "name": "name", "data_type": "Text",
            "parent": "inner_list_1_id", "sort_order": 0
        },
    }

    immutable_track: Track = Track.build(immutable_spec, None, "immutable")
    temporal_track: Track = Track.build({}, None, "Temporal")
    return Schema(temporal_track, immutable_track)
Beispiel #13
0
def variable_catalog(schema_basepath: str, schema_name: str,
                     fh: TextIO) -> None:
    """Load a schema and write its variable catalog to ``fh``.

    :param schema_basepath: Base path handed to ``Schema.load``.
    :param schema_name: Name of the schema to load.
    :param fh: Open text handle passed to ``write_catalog``. It is always
        closed before this function returns, even if loading or writing
        raises.
    """
    try:
        schema: Optional[Schema] = Schema.load(schema_name,
                                               base_path=schema_basepath)
        assert schema is not None
        write_catalog(schema, fh)
    finally:
        # Close on every path so a failure does not leak the caller's handle.
        fh.close()
Beispiel #14
0
 def _do_nearest_list_test(innermost: str, middle: str, outermost: str,
                           expected: str) -> None:
     """Assert ``nearest_list`` of the innermost variable in a three-level chain.

     :param innermost: Data type of the innermost variable.
     :param middle: Data type of its parent.
     :param outermost: Data type of the root-level variable.
     :param expected: Expected value of the innermost variable's
         ``nearest_list``.
     """
     spec: Dict = {
         "innermost": {
             "data_type": innermost, "name": "innermost",
             "sort_order": 0, "parent": "middle"
         },
         "middle": {
             "data_type": middle, "name": "middle",
             "sort_order": 0, "parent": "outermost"
         },
         "outermost": {
             "data_type": outermost, "name": "outermost", "sort_order": 0
         },
     }
     immutable_track: Track = Track.build(spec, None, "i")
     temporal_track: Track = Track.build({}, None, "t")
     schema: Schema = Schema(temporal_track, immutable_track)

     innermost_var: Variable = schema.get(cast(VariableId, "innermost"))
     assert innermost_var.nearest_list == expected
def test_simple_na_present_with_value_mismatch(simple_track, empty_track, entity_id):
    """An explicit NA sentinel in the fixture mismatches a non-null value in the actual data."""
    schema: Schema = Schema(empty_track, simple_track)

    fixture: Dict = {
        "some_multiple_text": POLYTROPOS_NA,
        "outer": {"some_multiple_text": ["foo", "bar"]},
    }
    observation: Dict = {
        "some_multiple_text": ["123"],
        "outer": {"some_multiple_text": ["foo", "bar"]},
    }

    # The root value differs from the sentinel; the nested value agrees.
    root_mismatch = ValueMismatch(entity_id, "immutable", "/some_multiple_text",
                                  "MultipleText", POLYTROPOS_NA,
                                  json.dumps(["123"]))
    nested_match = ValueMatch(entity_id, "immutable",
                              "/outer/some_multiple_text", "MultipleText",
                              json.dumps(["foo", "bar"]))
    expected: Outcome = Outcome()
    expected.mismatches.append(root_mismatch)
    expected.matches.append(nested_match)

    actual: Outcome = Outcome()
    CrawlImmutable(entity_id, schema, fixture, observation, actual)()

    assert expected == actual
def test_nested_does_not_short_circuit_crawl():
    """Regression test comparing a generated coverage CSV with ``expected.csv``.

    Bug history:
         - Detected around 9/20/2019
         - Isolated minimum reproducible case on 9/24/2019
         - Caused by commit e23b825 (8/27/2019)
         - Regression test based on minimum reproducible case
    """
    spec: Dict = {
        "root": {"name": "return", "data_type": "Folder", "sort_order": 0},
        "application_submissions": {
            "name": "application_submissions", "data_type": "List",
            "parent": "root", "sort_order": 0
        },
        "award_restrict": {
            "name": "award_restrict", "data_type": "Text",
            "parent": "application_submissions", "sort_order": 0
        },
        "filer": {
            "name": "filer", "data_type": "Folder",
            "parent": "root", "sort_order": 1
        },
        "name_org": {
            "name": "name_org", "data_type": "Text",
            "parent": "filer", "sort_order": 0
        },
    }

    temporal_track: Track = Track.build(spec, None, "temporal")
    immutable_track: Track = Track.build({}, None, "immutable")
    schema: Schema = Schema(temporal_track, immutable_track, name="semantic")

    here: str = os.path.dirname(os.path.abspath(__file__))
    composite_path: str = os.path.join(here, "data")

    # NOTE(review): output_path is not defined in this function; presumably a
    # module-level name -- confirm. The directory is recreated from scratch.
    shutil.rmtree(output_path, ignore_errors=True)
    os.makedirs(output_path)
    with Context.build(conf_dir="dummy", data_dir="dummy") as context:
        coverage: CoverageFile = CoverageFile(context, schema, output_path + "/semantic", None, None)
        coverage(composite_path, "dummy")

    expected_path: str = os.path.join(here, "expected.csv")
    actual_path: str = os.path.join(output_path, "semantic_temporal.csv")
    with open(expected_path) as expected_fh, open(actual_path) as actual_fh:
        expected_rows = list(csv.DictReader(expected_fh))
        actual_rows = list(csv.DictReader(actual_fh))
        assert actual_rows == expected_rows
Beispiel #17
0
def outcomes(example_path) -> FixtureOutcomes:
    """Build ``FixtureOutcomes`` for the "simple" schema under ``example_path``.

    :param example_path: Root directory containing ``conf/schemas/simple``,
        ``data/fixtures`` and ``data/observations``.
    """
    data_dir: str = os.path.join(example_path, "data")
    simple_schema_instance: Schema = Schema.load("conf/schemas/simple", example_path)
    return FixtureOutcomes(
        simple_schema_instance,
        os.path.join(data_dir, "fixtures"),
        os.path.join(data_dir, "observations"),
    )
Beispiel #18
0
 def build(
         cls, path_locator: PathLocator, schema: Schema, name: str, target_schema: str, id_var: str,
         input_schema_vars: Dict, output_schema_vars: Dict
 ):
     """Instantiate the aggregation class registered under ``name``.

     :param path_locator: Passed to ``Schema.load`` to find the target schema.
     :param schema: Origin schema; input variable ids are resolved against it.
     :param name: Key of the aggregation class in the mapping from ``load(cls)``.
     :param target_schema: Name of the target schema to load.
     :param id_var: Passed through to the aggregation as ``id_var``.
     :param input_schema_vars: Keyword name -> variable id in the origin schema.
     :param output_schema_vars: Keyword name -> variable id in the target schema.
     :return: An instance of the selected aggregation class.
     """
     target_schema_instance: Schema = Schema.load(path_locator, target_schema)
     aggregations: Dict[str, Type] = load(cls)

     input_variables: Dict[str, Variable] = {}
     for var_name, var_id in input_schema_vars.items():
         input_variables[var_name] = schema.get(var_id)

     output_variables: Dict[str, Variable] = {}
     for var_name, var_id in output_schema_vars.items():
         output_variables[var_name] = target_schema_instance.get(var_id)

     aggregation_class = aggregations[name]
     # The two mappings are splatted separately, so a name in both raises TypeError.
     return aggregation_class(origin_schema=schema, target_schema=target_schema_instance,
                              id_var=id_var, **input_variables, **output_variables)
Beispiel #19
0
def target_schema(source_schema) -> Schema:
    """Build a target schema from ``target_spec.json`` in ``basepath``.

    The JSON spec populates the temporal track and the immutable track is
    built from an empty spec; each track is built against the matching track
    of ``source_schema``.
    """
    with open(os.path.join(basepath, "target_spec.json")) as spec_file:
        temporal_spec: Dict = json.load(spec_file)

    temporal_track: Track = Track.build(temporal_spec, source_schema.temporal, "temporal")
    immutable_track: Track = Track.build({}, source_schema.immutable, "immutable")
    return Schema(temporal_track, immutable_track)
def test_list_has_sentinal_value_raises(complex_track, empty_track, entity_id,
                                        bad_value):
    """Crawling raises ``ValueError`` when the fixture holds the NA sentinel and the observed item holds ``bad_value``.

    The original note explains the intent: if actual data contains the same
    string as the sentinel, a new, truly unique sentinel value for missing
    data must be chosen.
    """
    schema: Schema = Schema(empty_track, complex_track)

    fixture: Dict = {
        "outer": [{
            "the_folder": {
                "inner": [{"some_text": "foo"}, {"some_text": POLYTROPOS_NA}],
            }
        }]
    }
    observed: Dict = {
        "outer": [{
            "the_folder": {
                "inner": [{"some_text": "foo"}, {"some_text": bad_value}],
            }
        }]
    }

    outcome: Outcome = Outcome()
    crawler: CrawlImmutable = CrawlImmutable(entity_id, schema, fixture,
                                             observed, outcome)
    with pytest.raises(ValueError):
        crawler()
def test_folder_na_present_as_none_mismatch(simple_track, empty_track,
                                            entity_id):
    """An explicit NA sentinel in the fixture mismatches an actual value of ``None``."""
    schema: Schema = Schema(empty_track, simple_track)

    fixture: Dict = {
        "some_text": "bar",
        "outer": {"some_text": POLYTROPOS_NA},
    }
    observation: Dict = {
        "some_text": "bar",
        "outer": {"some_text": None},
    }

    # The nested key is present but None, which is not the same as absent.
    nested_mismatch = ValueMismatch(entity_id, "immutable", "/outer/some_text",
                                    "Text", POLYTROPOS_NA, None)
    root_match = ValueMatch(entity_id, "immutable", "/some_text", "Text", "bar")
    expected: Outcome = Outcome()
    expected.mismatches.append(nested_mismatch)
    expected.matches.append(root_match)

    actual: Outcome = Outcome()
    CrawlImmutable(entity_id, schema, fixture, observation, actual)()

    assert expected == actual
Beispiel #22
0
def schema() -> Schema:
    """Build a schema with three numeric temporal sources and one Decimal immutable target.

    Note that the tracks are created with the ``Track`` constructor directly.
    """
    temporal_spec: Dict = {
        "integer_source": {
            "name": "integer_source", "data_type": "Integer", "sort_order": 0
        },
        "decimal_source": {
            "name": "decimal_source", "data_type": "Decimal", "sort_order": 1
        },
        "currency_source": {
            "name": "currency_source", "data_type": "Currency", "sort_order": 2
        },
    }
    immutable_spec: Dict = {
        "target": {"name": "target", "data_type": "Decimal", "sort_order": 0},
    }

    temporal_track: Track = Track(temporal_spec, None, "temporal")
    immutable_track: Track = Track(immutable_spec, None, "immutable")
    return Schema(temporal_track, immutable_track)
def test_underscore_folders_ignored():
    """Casting converts the Binary value and leaves the ``_folder`` content unchanged."""
    spec: Dict = {
        "binary_in_root": {
            "name": "the_binary", "data_type": "Binary", "sort_order": 0
        }
    }
    immutable_track: Track = Track.build(spec, None, "immutable")
    temporal_track: Track = Track.build({}, None, "temporal")
    schema: Schema = Schema(temporal_track, immutable_track)

    content: Dict = {
        "immutable": {
            "the_binary": "true",
            "_folder": {
                "foo": "shouldn't matter",
                "bar": "also shouldn't matter",
            },
        }
    }
    # Only the_binary changes: the string "true" becomes the boolean True.
    expected: Dict = {
        "immutable": {
            "the_binary": True,
            "_folder": {
                "foo": "shouldn't matter",
                "bar": "also shouldn't matter",
            },
        }
    }

    composite: Composite = Composite(schema, content)
    caster: Cast = Cast(schema, {})
    caster(composite)
    assert composite.content == expected
Beispiel #24
0
    def _do_cast_error_test(data_type: str, raw: Optional[Any]):
        """Assert that casting ``raw`` as ``data_type`` is recorded as a cast error.

        After casting, the composite content must consist solely of the raw
        value under ``immutable/qc/_exceptions/cast_errors/the_var``.

        :param data_type: Data type declared for the single variable.
        :param raw: Raw value placed in the immutable content.
        """
        spec: Dict = {
            "var": {"name": "the_var", "data_type": data_type, "sort_order": 0}
        }
        immutable_track: Track = Track.build(spec, None, "immutable")
        temporal_track: Track = Track.build({}, None, "temporal")
        schema: Schema = Schema(temporal_track, immutable_track)

        composite: Composite = Composite(schema, {"immutable": {"the_var": raw}})
        caster: Cast = Cast(schema, {})
        caster(composite)

        cast_errors: Dict = {"the_var": raw}
        expected: Dict = {
            "immutable": {"qc": {"_exceptions": {"cast_errors": cast_errors}}}
        }
        assert composite.content == expected
Beispiel #25
0
def list_in_nested_folder_schema(empty_track) -> Schema:
    """Build a schema with a List holding a Text, nested three Folders deep.

    The chain is ``parent`` -> ``child`` -> ``grandchild`` (Folders) ->
    ``the_list`` (List) -> ``some_text`` (Text). The populated track is
    passed as the first ``Schema`` argument and ``empty_track`` as the second.
    """
    spec: Dict = {
        "folder_in_root": {
            "name": "parent", "data_type": "Folder", "sort_order": 0
        },
        "folder_in_folder": {
            "name": "child", "data_type": "Folder",
            "parent": "folder_in_root", "sort_order": 0
        },
        "folder_in_folder_in_folder": {
            "name": "grandchild", "data_type": "Folder",
            "parent": "folder_in_folder", "sort_order": 0
        },
        "nested_list": {
            "name": "the_list", "data_type": "List",
            "parent": "folder_in_folder_in_folder", "sort_order": 0
        },
        "list_text": {
            "name": "some_text", "data_type": "Text",
            "parent": "nested_list", "sort_order": 0
        },
    }
    populated_track: Track = Track.build(spec, None, "")
    return Schema(populated_track, empty_track)
Beispiel #26
0
def simple_schema(empty_track) -> Schema:
    """Build a schema with one root Text and a Folder holding a Text and an Integer.

    The populated track is passed as the first ``Schema`` argument and
    ``empty_track`` as the second.
    """
    spec: Dict = {
        "text_in_root": {
            "name": "some_text", "data_type": "Text", "sort_order": 0
        },
        "folder_in_root": {
            "name": "the_folder", "data_type": "Folder", "sort_order": 1
        },
        "text_in_folder": {
            "name": "some_text", "data_type": "Text",
            "parent": "folder_in_root", "sort_order": 0
        },
        "int_in_folder": {
            "name": "some_number", "data_type": "Integer",
            "parent": "folder_in_root", "sort_order": 1
        },
    }
    populated_track: Track = Track.build(spec, None, "")
    return Schema(populated_track, empty_track)
Beispiel #27
0
def schema() -> Schema:
    """Build a schema with one temporal Integer and two immutable variables.

    The temporal track holds ``source`` (Integer); the immutable track holds
    ``limit`` (Integer) and ``limit_period`` (Text).
    """
    temporal_track: Track = Track.build(
        {
            "the_subject": {
                "name": "source", "data_type": "Integer", "sort_order": 0
            }
        },
        None,
        "temporal",
    )
    immutable_track: Track = Track.build(
        {
            "the_target": {
                "name": "limit", "data_type": "Integer", "sort_order": 0
            },
            "the_period_id": {
                "name": "limit_period", "data_type": "Text", "sort_order": 1
            },
        },
        None,
        "immutable",
    )
    return Schema(temporal_track, immutable_track)
Beispiel #28
0
 def build(cls, path_locator, schema: Schema, name: str, subjects: Dict):
     """Instantiate the filter class registered under ``name``.

     :param path_locator: Not used by this method.
     :param schema: Schema used to resolve the subject variable ids; also
         passed to the filter as ``schema``.
     :param name: Key of the filter class in the mapping from ``load(cls)``.
     :param subjects: Keyword name -> variable id.
     :return: An instance of the selected filter class.
     """
     logging.info('Building instance of filter class "%s"' % name)
     filters = load(cls)

     variables = {}
     for var_name, var_id in subjects.items():
         variables[var_name] = schema.get(var_id)

     filter_class = filters[name]
     return filter_class(schema=schema, **variables)
Beispiel #29
0
 def _target_schema(source: Schema, data_type: str = "Text") -> Schema:
     """Build a schema named "target" whose tracks are built against ``source``.

     :param source: Source schema; its tracks are passed to ``Track.build``
         and it is recorded as the new schema's source.
     :param data_type: Forwarded to ``target_spec`` for both tracks.
     :return: The target ``Schema``.
     """
     temporal_track: Track = Track.build(target_spec("t", data_type),
                                         source.temporal, "temporal")
     immutable_track: Track = Track.build(target_spec("i", data_type),
                                          source.immutable, "immutable")
     return Schema(temporal_track, immutable_track, name="target",
                   source=source)
Beispiel #30
0
    def standalone(cls, context: Context, schema_name: str, output_prefix: str,
                   t_group: Optional[VariableId], i_group: Optional[VariableId], exclude_trivial: bool = False) -> None:
        """Load a schema, build an instance of ``cls`` and run it over the entities directory.

        :param context: Supplies ``schemas_dir`` and ``entities_input_dir``.
        :param schema_name: Name of the schema to load.
        :param output_prefix: Passed through to ``cls``.
        :param t_group: Passed through to ``cls``.
        :param i_group: Passed through to ``cls``.
        :param exclude_trivial: Passed through to ``cls``.
        """
        loaded: Optional[Schema] = Schema.load(schema_name, context.schemas_dir)
        assert loaded is not None

        # TODO Refactor so unnecessary arguments aren't required.
        runner: "CoverageFile" = cls(context, loaded, output_prefix,
                                     t_group, i_group, exclude_trivial)
        runner(context.entities_input_dir, None)