Example #1
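These tests reference the Serene Python client and the standard library. A minimal import sketch follows; the serene module paths and the TestWithServer helper location are assumptions and may differ in your checkout:

import os
from pprint import pprint

# Assumed client import paths; adjust to match your installation.
from serene.endpoints import DataSetEndpoint, OntologyEndpoint, SSDEndpoint
from serene.elements import SSD, DataSet
from serene.elements.semantics.base import KARMA_DEFAULT_NS
from tests.utils import TestWithServer  # assumed helper providing self._session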
class TestEvaluateSSD(TestWithServer):
    """
    Tests the comparison of SSDs
    """
    def __init__(self, method_name="runTest"):
        super().__init__(method_name)
        self._datasets = None
        self._ontologies = None

        path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                            "resources")
        self._test_owl = os.path.join(path, 'owl',
                                      'dataintegration_report_ontology.ttl')
        self._museum_owl_dir = os.path.join(path, 'owl', 'museum_edm')
        self._business_file = os.path.join(path, 'data', 'businessInfo.csv')
        self._business_ssd = os.path.join(path, 'ssd', 'businessInfo.ssd')
        self._cities_file = os.path.join(path, 'data', 'getCities.csv')
        self._cities_ssd = os.path.join(path, 'ssd', 'getCities.ssd')
        self._tricky_cities_ssd = os.path.join(path, 'ssd', 'tricky.ssd')
        self._objects_owl = os.path.join(path, 'owl', 'objects.ttl')
        self._paintings_file = os.path.join(path, 'data', 'paintings.csv')
        self._paintings_ssd = os.path.join(path, 'ssd', 'paintings.ssd')
        self._museum_file = os.path.join(path, 'data', 'museum.csv')
        self._museum_ssd = os.path.join(path, 'ssd', 'museum.ssd')

        self._ssd_path = os.path.join(path, 'ssd')
        self._data_path = os.path.join(path, 'data')

    def setUp(self):
        self._datasets = DataSetEndpoint(self._session)
        self._ontologies = OntologyEndpoint(self._session)
        self._ssds = SSDEndpoint(self._session, self._datasets,
                                 self._ontologies)
        self._clear_storage()

    def _clear_storage(self):
        """Removes all server elements"""
        for ssd in self._ssds.items:
            self._ssds.remove(ssd)

        for ds in self._datasets.items:
            self._datasets.remove(ds)

        for on in self._ontologies.items:
            self._ontologies.remove(on)

    def tearDown(self):
        """
        Be sure to remove all storage elements once a test is finished...
        :return:
        """
        self._clear_storage()

    def test_evaluate_business(self):
        """
        Tests evaluation for business
        :return:
        """
        dataset = self._datasets.upload(self._business_file)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ontology = self._ontologies.items[0]

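        # bind_ssd binds the dataset to the .ssd template; the third argument
        # is the default namespace (here the ontology's empty-prefix namespace).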
        new_json = dataset.bind_ssd(self._business_ssd, [ontology],
                                    str(ontology._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 4)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         3)  # these are only object properties

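        # Comparing an SSD with itself should yield perfect scores.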
        res = self._ssds.compare(ssd, ssd)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_evaluate_tricky_cities(self):
        """
        Here the ssd has two class nodes of the same type
        :return:
        """
        self._datasets.upload(self._cities_file)
        on = self._ontologies.upload(self._test_owl)

        dataset = self._datasets.items[0]
        assert (issubclass(type(dataset), DataSet))

        ontology = self._ontologies.items[0]

        new_json = dataset.bind_ssd(self._tricky_cities_ssd, [ontology],
                                    str(ontology._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 2)
        self.assertEqual(len(ssd.data_nodes), 2)
        self.assertEqual(len(ssd.mappings), 2)
        self.assertEqual(len(ssd.object_links),
                         1)  # these are only object properties

        res = self._ssds.compare(ssd, ssd)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_ssd_evaluate_tricky_cities(self):
        """
        Here the ssd has two class nodes of the same type.
        We use the evaluate method of the ssd itself rather than the server's compare.
        :return:
        """
        self._datasets.upload(self._cities_file)
        on = self._ontologies.upload(self._test_owl)

        dataset = self._datasets.items[0]
        assert (issubclass(type(dataset), DataSet))

        ontology = self._ontologies.items[0]

        new_json = dataset.bind_ssd(self._tricky_cities_ssd, [ontology],
                                    str(ontology._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 2)
        self.assertEqual(len(ssd.data_nodes), 2)
        self.assertEqual(len(ssd.mappings), 2)
        self.assertEqual(len(ssd.object_links),
                         1)  # these are only object properties

        res = ssd.evaluate(ssd)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_evaluate_country_names(self):
        """
        Tests evaluation for country_names
        :return:
        """
        path = os.path.join(self._data_path, "country_names.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ssd_path = os.path.join(self._ssd_path, "country_names.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
        pprint(ssd.json)

        self.assertEqual(len(ssd.class_nodes), 1)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        #self.assertEqual(len(ssd.links), 4)  # class and data links
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         0)  # these are only object properties

        res = self._ssds.compare(ssd, ssd, False, False)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_evaluate_country_names_zero(self):
        """
        Tests evaluation for country_names
        If we ignore everything, then all scores will be 0
        :return:
        """
        path = os.path.join(self._data_path, "country_names.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        ssd_path = os.path.join(self._ssd_path, "country_names.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

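        # The two boolean flags tell the comparison what to ignore; with
        # everything ignored, all scores collapse to 0.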
        res = self._ssds.compare(ssd, ssd, True, True)
        print(res)

        self.assertEqual(res['precision'], 0)
        self.assertEqual(res['recall'], 0)
        self.assertEqual(res['jaccard'], 0)

    def test_evaluate_places_dif(self):
        """
        Tests evaluation for places_dif
        :return:
        """
        path = os.path.join(self._data_path, "places_dif.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ssd_path = os.path.join(self._ssd_path, "places_dif.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
        pprint(ssd.json)

        self.assertEqual(len(ssd.class_nodes), 4)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        self.assertEqual(len(ssd.links), 7)  # class and data links
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         3)  # these are only object properties

        res = self._ssds.compare(ssd, ssd)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_evaluate_places_mix(self):
        """
        Tests evaluation for places_mix
        :return:
        """
        path = os.path.join(self._data_path, "places_mix.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ssd_path = os.path.join(self._ssd_path, "places_mix.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
        pprint(ssd.json)

        self.assertEqual(len(ssd.class_nodes), 2)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        self.assertEqual(len(ssd.links), 5)  # class and data links
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         1)  # these are only object properties

        res = self._ssds.compare(ssd, ssd, False, True)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_evaluate_paintings(self):
        """
        Here we have a class node with no data nodes
        :return:
        """
        self._datasets.upload(self._paintings_file)
        on = self._ontologies.upload(self._objects_owl)

        dataset = self._datasets.items[0]
        #print(dataset)
        assert (issubclass(type(dataset), DataSet))

        ontology = self._ontologies.items[0]
        #print("namespaces: ", ontology._prefixes)
        #print("class nodes: ", list(ontology._iclass_nodes()))
        #print("data nodes: ", list(ontology._idata_nodes()))
        #print("links: ", list(ontology._ilinks()))

        new_json = dataset.bind_ssd(self._paintings_ssd, [ontology],
                                    str(ontology._prefixes['']))

        # print("************************")
        # print("new json...")
        # pprint(new_json)

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
        # pprint(ssd.json)

        self.assertEqual(len(ssd.class_nodes), 3)
        self.assertEqual(len(ssd.links), 4)
        self.assertEqual(len(ssd.data_links), 2)
        self.assertEqual(len(ssd.object_links), 2)
        self.assertEqual(len(ssd.data_nodes), 2)
        self.assertEqual(len(ssd.mappings), 2)

        res = self._ssds.compare(ssd, ssd, True, True)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)

    def test_evaluate_museum(self):
        """
        Here we have a class node with no data nodes, a list of ontologies, and a class-instance link.
        Not all columns from the file get mapped.
        :return:
        """
        dataset = self._datasets.upload(self._museum_file)

        ontologies = []
        for path in os.listdir(self._museum_owl_dir):
            f = os.path.join(self._museum_owl_dir, path)
            ontologies.append(self._ontologies.upload(f))

        new_json = dataset.bind_ssd(self._museum_ssd, ontologies,
                                    KARMA_DEFAULT_NS)

        empty_ssd = SSD(dataset, ontologies)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 6)
        self.assertEqual(len(ssd.links),
                         14)  # class instance, data property, object property
        self.assertEqual(len(ssd.data_nodes), 10)
        self.assertEqual(len(ssd.mappings), 10)

        res = self._ssds.compare(ssd, ssd, False, False)
        print(res)

        self.assertEqual(res['precision'], 1)
        self.assertEqual(res['recall'], 1)
        self.assertEqual(res['jaccard'], 1)
Example #2
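Example #2 additionally uses tarfile, csv, and a module-level logger. A sketch of the extra imports (the _logger name is an assumption):

import csv
import logging
import tarfile

_logger = logging.getLogger(__name__)  # assumed module-level logger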
class TestMuseum(TestWithServer):
    """
    Tests the JsonReader/JsonWriter for SSD
    """
    def __init__(self, method_name="runTest"):
        super().__init__(method_name)
        self._datasets = None
        self._ontologies = None

        self._benchmark_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources", "museum_benchmark")
        self._museum_owl_dir = os.path.join(self._benchmark_path, "owl")
        self._museum_data = os.path.join(self._benchmark_path, 'dataset')
        self._museum_ssd = os.path.join(self._benchmark_path, 'ssd')

    def setUp(self):
        self._datasets = DataSetEndpoint(self._session)
        self._ontologies = OntologyEndpoint(self._session)
        self._ssds = SSDEndpoint(self._session, self._datasets, self._ontologies)
        self._clear_storage()

        _logger.debug("SetUp complete")

    def tearDown(self):
        """
        Be sure to remove all storage elements once a test is finished...
        :return:
        """
        self._clear_storage()

    def _add_owls(self):
        # add ontologies
        ontologies = []
        for path in os.listdir(self._museum_owl_dir):
            f = os.path.join(self._museum_owl_dir, path)
            ontologies.append(self._ontologies.upload(f))
        return ontologies

    def _add_datasets(self, ontologies):
        # we need to unzip first the data
        if os.path.exists(os.path.join(self._museum_data, "data.tar.gz")):
            with tarfile.open(os.path.join(self._museum_data, "data.tar.gz")) as f:
                f.extractall(path=self._museum_data)

        # add datasets with their ssds
        for ds in os.listdir(self._museum_data):
            if ds.endswith(".gz"):
                continue # skip archive
            ds_f = os.path.join(self._museum_data, ds)
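            # Pair each dataset with the reference SSD that shares its base name.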
            ds_name = os.path.splitext(ds)[0]
            ssd_f = os.path.join(self._museum_ssd, ds_name + ".ssd")

            _logger.debug("Adding dataset: {}".format(ds_f))
            dataset = self._datasets.upload(ds_f, description="museum_benchmark")

            _logger.debug("Adding ssd: {}".format(ssd_f))
            new_json = dataset.bind_ssd(ssd_f, ontologies, KARMA_DEFAULT_NS)
            empty_ssd = SSD(dataset, ontologies)
            ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
            self._ssds.upload(ssd)
            # we remove the csv dataset
            os.remove(ds_f)

    def _clear_storage(self):
        """Removes all server elements"""
        for ssd in self._ssds.items:
            self._ssds.remove(ssd)

        for ds in self._datasets.items:
            self._datasets.remove(ds)

        for on in self._ontologies.items:
            self._ontologies.remove(on)

    def test_evaluate_museum(self):
        """
        Tests that the museum benchmark can be uploaded.
        :return:
        """
        ontologies = self._add_owls()
        self._add_datasets(ontologies)
        self.assertEqual(len(self._ssds.items), 29)
        self.assertEqual(len(self._datasets.items), 29)
        self.assertEqual(len(self._ontologies.items), 11)

    def test_mappings(self):
        """
        Get label data for museum benchmark files
        :return:
        """
        ontologies = self._add_owls()
        self._add_datasets(ontologies)
        self.assertEqual(len(self._ssds.items), 29)
        self.assertEqual(len(self._datasets.items), 29)
        self.assertEqual(len(self._ontologies.items), 11)

        for ssd in self._ssds.items:
            print("name: ", ssd.name)
            print("mappings: ",ssd.mappings)
            csvF = ssd.name + ".columnmap.txt"
            with open(csvF, "w+") as csvfile:
                csvwriter = csv.writer(csvfile, dialect="excel")
                csvwriter.writerow(["key", "column_name", "source_name", "semantic_type"])
                for key, value in ssd.mappings.items():
                    label = key.class_node.label + "---" + key.label
                    column = value.name
                    print("column: ", column, ", label:", label)
                    csvwriter.writerow([column, column, ssd.name, label])

        self.fail()  # deliberate failure, presumably to surface the printed output

    def test_s27(self):
        """
        Tests binding and uploading the s27-s-the-huntington dataset and SSD.
        :return:
        """
        ontologies = self._add_owls()
        self.assertEqual(len(self._ontologies.items), 11)
        self._add_datasets(ontologies)
        self.assertEqual(len(self._ssds.items), 29)
        self.assertEqual(len(self._datasets.items), 29)

        # first grab the JSON object from the dataset...
        datasets = self._datasets.items
        s27_ds = ""
        for ds in datasets:
            if "s27-s-the-huntington.json" in ds.filename:
                s27_ds = ds

        # secondly grab the reference SSD
        s27_ssd = os.path.join(self._benchmark_path, "ssd", "s27-s-the-huntington.json.ssd")

        # bind the dataset and ssd together...
        new_json = s27_ds.bind_ssd(s27_ssd, ontologies, KARMA_DEFAULT_NS)

        # create a new ssd and update with the reference JSON
        empty_ssd = SSD(s27_ds, ontologies)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        # upload to the server and ensure it is parsed correctly...
        f = self._ssds.upload(ssd)
        self.assertEqual(set(ssd.mappings), set(f.mappings))
Example #3
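Example #3 additionally uses the element classes and the SSD JSON writer. A sketch of the extra imports (paths are assumptions):

import datetime

# Assumed client import paths.
from serene.elements import Column, ClassNode, DataNode
from serene.elements.semantics.ssd import SSDJsonWriter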
class TestSSD(TestWithServer):
    """
    Tests the SSD class
    """
    def __init__(self, method_name="runTest"):
        super().__init__(method_name)
        self._datasets = None
        self._ontologies = None
        self._ssds = None

        path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "resources")
        self._test_file = os.path.join(path, 'data', 'businessInfo.csv')
        self._test_owl = os.path.join(path, 'owl', 'dataintegration_report_ontology.ttl')
        self._test_ssd = os.path.join(path, 'ssd', 'businessInfo.ssd')

    def setUp(self):
        self._datasets = DataSetEndpoint(self._session)
        self._ontologies = OntologyEndpoint(self._session)
        self._ssds = SSDEndpoint(self._session, self._datasets, self._ontologies)
        self._clear_storage()
        assert(os.path.isfile(self._test_file))

    def _clear_storage(self):
        """Removes all server elements"""
        for ssd in self._ssds.items:
            self._ssds.remove(ssd)

        for ds in self._datasets.items:
            self._datasets.remove(ds)

        for on in self._ontologies.items:
            self._ontologies.remove(on)

    def tearDown(self):
        """
        Be sure to remove all storage elements once a test is finished...
        :return:
        """
        self._clear_storage()

    def test_create(self):
        """
        Tests the SSD creation
        :return:
        """
        self._build_simple()

    def _build_simple(self):

        ds = self._datasets.upload(self._test_file)
        on = self._ontologies.upload(self._test_owl)

        single = SSD(dataset=ds, ontology=on)

        self.assertEqual(len(single.data_nodes), 0)
        self.assertEqual(len(single.links), 0)
        self.assertEqual(len(single.columns), 4)
        return single

    def test_map_simple(self):
        """
        Tests the map function for SSD mapping with one map
        :return:
        """
        simple = self._build_simple()

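        # Map the "ceo" column onto the "name" data property of a Person class node.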
        simple.map(Column("ceo"), DataNode(ClassNode("Person"), "name"))

        self.assertEqual(len(simple.class_nodes), 1)
        self.assertEqual(len(simple.data_nodes), 1)
        self.assertEqual(len(simple.data_links), 1)
        self.assertEqual(len(simple.object_links), 0)

    def test_map_full(self):
        """
        Tests the map function for SSD mapping with full map
        :return:
        """
        simple = self._build_simple()

        (simple
         .map(Column("company"), DataNode(ClassNode("Organization"), "name"))
         .map(Column("ceo"), DataNode(ClassNode("Person"), "name"))
         .map(Column("city"), DataNode(ClassNode("City"), "name"))
         .map(Column("state"), DataNode(ClassNode("State"), "name")))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 0)

    def test_duplicate_column_block(self):
        """
        Tests the map function can only map a column once
        :return:
        """
        simple = self._build_simple()

        with self.assertRaises(Exception):
            (simple
             .map(Column("company"), "Organization.name")
             .map(Column("ceo"), "Person.name")
             .map(Column("city"), "City.name")
             .map(Column("state"), "State.name")
             .map(Column("company"), "State.name")
             .map(Column("company"), "Person.name"))

    def test_duplicate_data_node_block(self):
        """
        Tests the map function can only map a data node once
        :return:
        """
        simple = self._build_simple()

        with self.assertRaises(Exception):
            (simple
             .map(Column("company"), "Organization.name")
             .map(Column("ceo"), "Person.name")
             .map(Column("city"), "Person.name")
             .map(Column("state"), "State.name"))

    def test_map_short_hand(self):
        """
        Tests the map function for SSD mapping with short hand strings
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name"))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 0)

    def test_remove(self):
        """
        Tests the removal function when removing a data node
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .remove(DataNode(ClassNode("Person"), "name")))

        self.assertEqual(len(simple.class_nodes), 3)
        self.assertEqual(len(simple.data_nodes), 3)
        self.assertEqual(len(simple.data_links), 3)
        self.assertEqual(len(simple.object_links), 0)

    def test_remove_column(self):
        """
        Tests the removal function when removing a data node and then a column
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .remove(DataNode(ClassNode("Person"), "name"))
         .remove(Column("city")))

        self.assertEqual(len(simple.class_nodes), 2)
        self.assertEqual(len(simple.data_nodes), 2)
        self.assertEqual(len(simple.data_links), 2)
        self.assertEqual(len(simple.object_links), 0)

    def test_remove_restore(self):
        """
        Tests the removal function and then adding back
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .remove(DataNode(ClassNode("Person"), "name"))
         .remove(Column("city"))
         .map("ceo", "Person.name")
         .map("city", "City.name"))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 0)

    def test_remove_links(self):
        """
        Tests removing an object link via remove_link
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .link("Organization", "ceo", "Person")
         .remove_link("ceo"))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 0)

    def test_map_multi_instance(self):
        """
        Tests the map function with multiple instances

        :return:
        """
        simple = self._build_simple()

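        # The integer gives each Person class node a distinct instance index.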
        (simple
         .map(Column("company"),
              DataNode(ClassNode("Person", 0), "name"))
         .map(Column("ceo"),
              DataNode(ClassNode("Person", 1), "name"))
         .map(Column("city"),
              DataNode(ClassNode("Person", 2), "name"))
         .map(Column("state"),
              DataNode(ClassNode("Person", 3), "name")))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 0)

    def test_map_multi_instance_links(self):
        """
        Tests the map function with multiple instances in the links

        :return:
        """
        simple = self._build_simple()

        (simple
         .map(Column("company"),
              DataNode(ClassNode("City", 0), "name"))
         .map(Column("ceo"),
              DataNode(ClassNode("City", 1), "name"))
         .map(Column("city"),
              DataNode(ClassNode("City", 2), "name"))
         .map(Column("state"),
              DataNode(ClassNode("City", 3), "name"))
         .link(ClassNode("City", 0), "nearby", ClassNode("City", 1))
         .link(ClassNode("City", 1), "nearby", ClassNode("City", 2))
         .link(ClassNode("City", 2), "nearby", ClassNode("City", 3)))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 3)

    def test_map_ambiguous_class(self):
        """
        Should raise an error so the ambiguous mapping is not uploaded
        :return:
        """
        simple = self._build_simple()

        with self.assertRaises(Exception):
            (simple
             .map("English", "Place.name")
             .map("German", "Place.name")
             .map("French", "Place.name")
             .map("Russian", "Place.name"))

    def test_map_multi_data_prop(self):
        """
        Multiple data properties should be allowed on a single class if an index is specified
        """
        simple = self._build_simple()
        # should be ok.
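        # The trailing index distinguishes the multiple "name" data nodes on the one Place class node.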
        (simple
         .map("company", DataNode(ClassNode("Place"), "name", 0))
         .map("ceo", DataNode(ClassNode("Place"), "name", 1))
         .map("city", DataNode(ClassNode("Place"), "name", 2))
         .map("state", DataNode(ClassNode("Place"), "name", 3)))

        self.assertEqual(len(simple.class_nodes), 1)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 0)

    def test_map_overwrite(self):
        """
        Tests that re-specifying a link or class node overwrites the original
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Person.name")
         .map("ceo", "Person.birthDate")
         .map("city", "City.name")
         .map("state", "State.name")
         .link("City", "state", "State")
         .link("Person", "bornIn", "City")
         .link("City", "state", "State")
         .link("Person", "bornIn", "City"))

        self.assertEqual(len(simple.class_nodes), 3)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 2)

    def test_map_links(self):
        """
        Tests the map function for SSD mapping with full map
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .link("City", "state", "State")
         .link("Organization", "location", "City")
         .link("Person", "worksFor", "Organization"))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 3)

    def test_default_map_prefix(self):
        """
        Tests that the map function adds the default prefixes when missing
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name"))

        prefixes = [z.prefix for z in simple.data_nodes]

        self.assertEqual(prefixes, [simple.default_namespace for _ in prefixes])

        prefixes = [z.prefix for z in simple.class_nodes]

        self.assertEqual(prefixes, [simple.default_namespace for _ in prefixes])

    def test_default_link_prefix(self):
        """
        Tests that the link function adds the default prefixes when missing
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", DataNode(ClassNode("City", prefix=simple.default_namespace), "name"))
         .map("state", DataNode(ClassNode("State", prefix=simple.default_namespace), "name"))
         .link("City", "isPartOf", "State"))

        # The last link should use the default namespace; otherwise there
        # would be an ambiguity error...

        prefixes = [z.prefix for z in simple.data_nodes]

        self.assertEqual(prefixes, [simple.default_namespace for _ in prefixes])

        prefixes = [z.prefix for z in simple.class_nodes]

        self.assertEqual(prefixes, [simple.default_namespace for _ in prefixes])

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 1)

    def test_stored(self):
        """
        Tests that the storage flag resets when changes are made locally
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .link("Organization", "ceo", "Person")
         .link("Person", "livesIn", "City")
         .link("City", "state", "State"))

        uploaded = self._ssds.upload(simple)

        self.assertTrue(uploaded.stored)

        uploaded.link("City", "isPartOf", "State")

        self.assertFalse(uploaded.stored)

        self._ssds.remove(uploaded)

    def test_map_link_with_no_data_nodes(self):
        """
        Tests the link function when there are no data nodes
        :return:
        """
        simple = self._build_simple()

        # the object links route through Person (which has no data nodes)
        #
        #          .--- Person -----.
        #         /                  \
        # Organization              Place             Event
        #    /                      /     \             |
        #  name                   name   postalCode   endDate

        (simple
         .map("company", "Organization.name")
         .map("city", "Place.name")
         .map("state", "Place.postalCode")
         .map("ceo", "Event.endDate")
         .link("Person", "livesIn", "Place")
         .link("Organization", "ceo", "Person"))

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.data_links), 4)
        self.assertEqual(len(simple.object_links), 2)

    def test_json_writer(self):
        """
        Tests the json writer
        :return:
        """
        simple = self._build_simple()

        (simple
         .map("company", "Organization.name")
         .map("ceo", "Person.name")
         .map("city", "City.name")
         .map("state", "State.name")
         .link("City", "state", "State")
         .link("Organization", "location", "City")
         .link("Person", "worksFor", "Organization"))

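        # Serialize the SSD to a dict, stamp the date fields (update() appears
        # to require them), then rebuild a second SSD and compare piece-wise.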
        j = SSDJsonWriter(simple).to_dict()
        now = datetime.datetime.now().strftime(format="%Y-%m-%dT%H:%M:%S.%f")
        j["dateCreated"] = now
        j["dateModified"] = now

        test = self._build_simple().update(j, self._datasets, self._ontologies)

        self.assertEqual(len(simple.class_nodes), 4)
        self.assertEqual(len(simple.data_nodes), 4)
        self.assertEqual(len(simple.object_links), 3)
        self.assertEqual(len(simple.data_links), 4)
        self.assertSetEqual(set(simple.class_nodes), set(test.class_nodes))
        self.assertSetEqual(set(simple.data_nodes), set(test.data_nodes))
        self.assertSetEqual(set(simple.object_links), set(test.object_links))
        self.assertDictEqual(simple.mappings, test.mappings)
        self.assertSetEqual(set(simple.data_links), set(test.data_links))


class TestDataSetEndpoint(TestWithServer):
    """
    Tests the dataset endpoint
    """
    def __init__(self, method_name="runTest"):
        super().__init__(method_name)
        self._datasets = None
        path = os.path.join(os.path.dirname(__file__), "resources")
        self._test_file = os.path.join(path, 'data', 'businessInfo.csv')

    def setUp(self):
        self._datasets = DataSetEndpoint(self._session)
        self._clear_datasets()
        assert (os.path.isfile(self._test_file))

    def _clear_datasets(self):
        """Removes all datasets"""
        for ds in self._datasets.items:
            self._datasets.remove(ds)

    def tearDown(self):
        """
        Be sure to remove all datasets once a test is finished...
        :return:
        """
        self._clear_datasets()

    def test_upload(self):
        """
        Tests the uploading of a dataset
        :return:
        """
        self.assertEqual(len(self._datasets.items), 0)

        # only a dataset string path should be allowed
        with self.assertRaises(Exception):
            self._datasets.upload(1234)

        # only an existing dataset string path should be allowed
        with self.assertRaises(Exception):
            self._datasets.upload('resources/data/doesnt-exist.csv')

        # now upload a nice dataset
        d = self._datasets.upload(self._test_file)

        self.assertEqual(len(self._datasets.items), 1)
        self.assertEqual(d.filename, os.path.basename(self._test_file))
        self.assertEqual([c.name for c in d.columns],
                         ['company', 'ceo', 'city', 'state'])

    def test_remove(self):
        """
        Tests the removal of a dataset
        :return:
        """
        self.assertEqual(len(self._datasets.items), 0)

        # upload some datasets
        d1 = self._datasets.upload(self._test_file)
        d2 = self._datasets.upload(self._test_file)
        self.assertEqual(len(self._datasets.items), 2)

        # now let's remove one
        self._datasets.remove(d1)
        self.assertEqual(len(self._datasets.items), 1)
        self.assertEqual(self._datasets.items[0].id, d2.id)

        # let's remove the other by id
        self._datasets.remove(d2.id)
        self.assertEqual(len(self._datasets.items), 0)

    def test_items(self):
        """
        Tests item manipulation for the dataset list
        :return:
        """
        self.assertEqual(len(self._datasets.items), 0)

        # upload some datasets
        d1 = self._datasets.upload(self._test_file)
        d2 = self._datasets.upload(self._test_file)
        self.assertEqual(len(self._datasets.items), 2)

        # now let's remove one
        self._datasets.remove(d1)
        self.assertEqual(len(self._datasets.items), 1)

        # make sure the right one remains
        self.assertEqual(self._datasets.items[0], d2)

        # now let's remove the last one
        self._datasets.remove(d2)
        self.assertEqual(len(self._datasets.items), 0)


class TestSSDJson(TestWithServer):
    """
    Tests the JsonReader/JsonWriter for SSD
    """
    def __init__(self, method_name="runTest"):
        super().__init__(method_name)
        self._datasets = None
        self._ontologies = None

        path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                            "resources")
        self._test_owl = os.path.join(path, 'owl',
                                      'dataintegration_report_ontology.ttl')
        self._museum_owl_dir = os.path.join(path, 'owl', 'museum_edm')
        self._business_file = os.path.join(path, 'data', 'businessInfo.csv')
        self._business_ssd = os.path.join(path, 'ssd', 'businessInfo.ssd')
        self._cities_file = os.path.join(path, 'data', 'getCities.csv')
        self._cities_ssd = os.path.join(path, 'ssd', 'getCities.ssd')
        self._tricky_cities_ssd = os.path.join(path, 'ssd', 'tricky.ssd')
        self._objects_owl = os.path.join(path, 'owl', 'objects.ttl')
        self._paintings_file = os.path.join(path, 'data', 'paintings.csv')
        self._paintings_ssd = os.path.join(path, 'ssd', 'paintings.ssd')
        self._museum_file = os.path.join(path, 'data', 'museum.csv')
        self._museum_ssd = os.path.join(path, 'ssd', 'museum.ssd')

        self._ssd_path = os.path.join(path, 'ssd')
        self._data_path = os.path.join(path, 'data')

    def setUp(self):
        self._datasets = DataSetEndpoint(self._session)
        self._ontologies = OntologyEndpoint(self._session)
        self._ssds = SSDEndpoint(self._session, self._datasets,
                                 self._ontologies)
        self._clear_storage()

    def _clear_storage(self):
        """Removes all server elements"""
        for ssd in self._ssds.items:
            self._ssds.remove(ssd)

        for ds in self._datasets.items:
            self._datasets.remove(ds)

        for on in self._ontologies.items:
            self._ontologies.remove(on)

    def tearDown(self):
        """
        Be sure to remove all storage elements once a test is finished...
        :return:
        """
        self._clear_storage()

    def test_business(self):
        """
        Tests business info
        :return:
        """
        dataset = self._datasets.upload(self._business_file)
        ontology = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        new_json = dataset.bind_ssd(self._business_ssd, [ontology],
                                    str(ontology._prefixes['']))

        empty_ssd = SSD(dataset, [ontology])
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
        pprint(ssd.json)

        self.assertEqual(len(ssd.class_nodes), 4)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        self.assertEqual(len(ssd.links), 7)  # class and data links
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         3)  # these are only object properties

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(self._ssds.items), 1)
        self.assertEqual(len(uploaded.class_nodes), 4)
        self.assertEqual(len(uploaded.data_nodes), 4)
        self.assertEqual(len(uploaded.mappings), 4)
        self.assertEqual(len(uploaded.links), 7)  # class and data links
        self.assertEqual(len(uploaded.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(uploaded.object_links),
                         3)  # these are only object properties

    def test_tricky_cities(self):
        """
        Here the ssd has two class nodes of the same type
        :return:
        """
        self._datasets.upload(self._cities_file)
        on = self._ontologies.upload(self._test_owl)
        dataset = self._datasets.items[0]

        assert (issubclass(type(dataset), DataSet))

        ontology = self._ontologies.items[0]
        new_json = dataset.bind_ssd(self._tricky_cities_ssd, [ontology],
                                    str(ontology._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)
        #pprint(ssd.json)

        self.assertEqual(len(ssd.class_nodes), 2)
        self.assertEqual(len(ssd.data_nodes), 2)
        self.assertEqual(len(ssd.mappings), 2)
        self.assertEqual(len(ssd.object_links),
                         1)  # these are only object properties

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(self._ssds.items), 1)
        self.assertEqual(len(uploaded.class_nodes), 2)
        self.assertEqual(len(uploaded.data_nodes), 2)
        self.assertEqual(len(uploaded.mappings), 2)
        self.assertEqual(len(uploaded.object_links),
                         1)  # these are only object properties

    def test_evaluate_country_names(self):
        """
        Tests evaluation for country_names
        :return:
        """
        path = os.path.join(self._data_path, "country_names.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ssd_path = os.path.join(self._ssd_path, "country_names.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 1)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         0)  # these are only object properties

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(uploaded.class_nodes), 1)
        self.assertEqual(len(uploaded.data_nodes), 4)
        self.assertEqual(len(uploaded.mappings), 4)
        self.assertEqual(len(uploaded.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(uploaded.object_links),
                         0)  # these are only object properties

    def test_evaluate_places_dif(self):
        """
        Tests evaluation for places_dif
        :return:
        """
        path = os.path.join(self._data_path, "places_dif.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ssd_path = os.path.join(self._ssd_path, "places_dif.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 4)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        self.assertEqual(len(ssd.links), 7)  # class and data links
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         3)  # these are only object properties

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(uploaded.class_nodes), 4)
        self.assertEqual(len(uploaded.data_nodes), 4)
        self.assertEqual(len(uploaded.mappings), 4)
        self.assertEqual(len(uploaded.links), 7)  # class and data links
        self.assertEqual(len(uploaded.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(uploaded.object_links),
                         3)  # these are only object properties

    def test_evaluate_places_mix(self):
        """
        Tests evaluation for places_mix
        :return:
        """
        path = os.path.join(self._data_path, "places_mix.csv")
        dataset = self._datasets.upload(path)
        on = self._ontologies.upload(self._test_owl)

        assert (issubclass(type(dataset), DataSet))

        ssd_path = os.path.join(self._ssd_path, "places_mix.ssd")
        new_json = dataset.bind_ssd(ssd_path, [on], str(on._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 2)
        self.assertEqual(len(ssd.data_nodes), 4)
        self.assertEqual(len(ssd.mappings), 4)
        self.assertEqual(len(ssd.links), 5)  # class and data links
        self.assertEqual(len(ssd.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(ssd.object_links),
                         1)  # these are only object properties

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(uploaded.class_nodes), 2)
        self.assertEqual(len(uploaded.data_nodes), 4)
        self.assertEqual(len(uploaded.mappings), 4)
        self.assertEqual(len(uploaded.links), 5)  # class and data links
        self.assertEqual(len(uploaded.data_links),
                         4)  # these are only data properties
        self.assertEqual(len(uploaded.object_links),
                         1)  # these are only object properties

    def test_evaluate_paintings(self):
        """
        Here we have a class node with no data nodes
        :return:
        """
        self._datasets.upload(self._paintings_file)
        on = self._ontologies.upload(self._objects_owl)

        dataset = self._datasets.items[0]
        assert (issubclass(type(dataset), DataSet))

        ontology = self._ontologies.items[0]

        new_json = dataset.bind_ssd(self._paintings_ssd, [ontology],
                                    str(ontology._prefixes['']))

        empty_ssd = SSD(dataset, on)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 3)
        self.assertEqual(len(ssd.links), 4)
        self.assertEqual(len(ssd.data_links), 2)
        self.assertEqual(len(ssd.object_links), 2)
        self.assertEqual(len(ssd.data_nodes), 2)
        self.assertEqual(len(ssd.mappings), 2)

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(uploaded.class_nodes), 3)
        self.assertEqual(len(uploaded.links), 4)
        self.assertEqual(len(uploaded.data_links), 2)
        self.assertEqual(len(uploaded.object_links), 2)
        self.assertEqual(len(uploaded.data_nodes), 2)
        self.assertEqual(len(uploaded.mappings), 2)

    def test_evaluate_museum(self):
        """
        Here we have a class node with no data nodes, a list of ontologies, and a class-instance link.
        Not all columns from the file get mapped.
        :return:
        """
        dataset = self._datasets.upload(self._museum_file)

        ontologies = []
        for path in os.listdir(self._museum_owl_dir):
            f = os.path.join(self._museum_owl_dir, path)
            ontologies.append(self._ontologies.upload(f))

        assert (issubclass(type(dataset), DataSet))

        self.assertEqual(len(ontologies), 11)

        new_json = dataset.bind_ssd(self._museum_ssd, ontologies,
                                    KARMA_DEFAULT_NS)

        empty_ssd = SSD(dataset, ontologies)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        self.assertEqual(len(ssd.class_nodes), 6)
        self.assertEqual(len(ssd.links),
                         14)  # class instance, data property, object property
        self.assertEqual(len(ssd.data_links),
                         9)  # class instance, data property
        self.assertEqual(len(ssd.object_links), 5)  # object property
        self.assertEqual(len(ssd.data_nodes), 10)
        self.assertEqual(len(ssd.mappings), 10)

        uploaded = self._ssds.upload(ssd)
        self.assertTrue(uploaded.stored)
        self.assertEqual(len(uploaded.class_nodes), 6)
        self.assertEqual(len(uploaded.links),
                         14)  # class instance, data property, object property
        self.assertEqual(len(uploaded.data_links),
                         9)  # class instance, data property
        self.assertEqual(len(uploaded.object_links), 5)  # object property
        self.assertEqual(len(uploaded.data_nodes), 10)
        self.assertEqual(len(uploaded.mappings), 10)

    def test_show_multi_levels(self):
        """
        Tests ssd.show() on an SSD with several levels of class nodes.
        :return:
        """
        data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                 "resources", "museum_benchmark", "data_copy",
                                 "s07-s-13.json.csv")
        dataset = self._datasets.upload(data_path)

        print("--------------------1")

        ontologies = []
        for path in os.listdir(self._museum_owl_dir):
            f = os.path.join(self._museum_owl_dir, path)
            ontologies.append(self._ontologies.upload(f))

        print("--------------------2")

        self.assertEqual(len(ontologies), 11)

        ssd_path = os.path.join(self._ssd_path, "s07_many_levels.ssd")
        new_json = dataset.bind_ssd(ssd_path, ontologies, KARMA_DEFAULT_NS)

        empty_ssd = SSD(dataset, ontologies)
        ssd = empty_ssd.update(new_json, self._datasets, self._ontologies)

        print("HERE!!!!!!!!!!!!!!!!!!!")
        ssd.show()
        print(ssd)

        self.fail()  # deliberate failure, presumably to surface the rendered output