def test_semantic_types(self, data_set, test_sizes):
        """Evaluate semantic-type prediction within a single data set.

        For every training size in ``test_sizes`` each source of
        ``self.dataset_map[data_set]`` is used once as the test source, and
        the ``size`` sources that follow it (wrapping around cyclically) are
        used as training sources. Every column of the test source that has a
        semantic type is scored, and the per-column predictions, per-column
        scores and a per-size summary line are written to the module-level
        ``file_write`` handle.

        Args:
            data_set: Key into ``self.dataset_map``; also used as the index
                name passed to the searcher.
            test_sizes: Iterable of training-set sizes (number of sources).

        Returns:
            None. All results are written to ``file_write``.
        """
        logging.info("Testing semantic types.")
        thresholds = [0.01]
        rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
        count_map = defaultdict(lambda: defaultdict(lambda: 0))

        index_config = {'name': data_set}
        source_map = self.dataset_map[data_set]
        # The source list is doubled so that the training window starting
        # right after the test source can wrap around the end of the list.
        # NOTE(review): when size >= number of sources the window wraps far
        # enough to include the test source itself -- confirm callers keep
        # every size below the source count.
        double_name_list = list(source_map.values()) * 2
        file_write.write("Dataset: " + data_set + "\n")
        for size in test_sizes:
            start_time = time.time()

            for idx, source in enumerate(source_map.values()):
                train_names = [
                    train_source.index_name
                    for train_source in double_name_list[idx + 1:idx + size + 1]
                ]
                train_examples_map = searcher.search_types_data(
                    index_config, train_names)

                for column in source.column_map.values():
                    if not column.semantic_type:
                        continue

                    textual_train_map = searcher.search_similar_text_data(
                        index_config, column.value_text, train_names)
                    semantic_types = column.predict_type(
                        train_examples_map, textual_train_map,
                        self.random_forest)
                    logging.debug("    semantic types: %s", semantic_types)

                    for threshold in thresholds:
                        found = False
                        rank = 1
                        rank_score = 0
                        # Each prediction is indexed as (score, types).
                        for prediction in semantic_types:
                            if column.semantic_type in prediction[1]:
                                if (prediction[0] > threshold
                                        and prediction[0] != 0):
                                    rank_score = 1.0 / rank
                                found = True
                                break
                            # Only predictions with a non-zero score push the
                            # true type further down the ranking.
                            if prediction[0] != 0:
                                rank += len(prediction[1])

                        # A miss still scores 1 when even the top prediction
                        # is below the threshold. An empty prediction list
                        # keeps score 0 instead of raising IndexError.
                        if (not found and semantic_types
                                and semantic_types[0][0] < threshold):
                            rank_score = 1
                        file_write.write(column.name + "\t" +
                                         column.semantic_type + "\t" +
                                         str(semantic_types) + "\n")
                        file_write.write(str(rank_score) + "\n")
                        rank_score_map[size][threshold] += rank_score
                        count_map[size][threshold] += 1
            running_time = time.time() - start_time
            for threshold in thresholds:
                count = count_map[size][threshold]
                # Guard against ZeroDivisionError when no column was scored.
                if count:
                    mean_score = rank_score_map[size][threshold] * 1.0 / count
                else:
                    mean_score = 0.0
                file_write.write("Size: " + str(size) + " F-measure: " +
                                 str(mean_score) + " Time: " +
                                 str(running_time) + " Count: " +
                                 str(count) + "\n")
    def test_semantic_types(self, data_set, test_sizes):
        """Evaluate top-1 semantic-type prediction within a single data set.

        For every training size in ``test_sizes`` each source of
        ``self.dataset_map[data_set]`` is used once as the test source, and
        the ``size`` sources that follow it (wrapping around cyclically) are
        used as training sources. Only the first prediction returned for a
        column is examined. Per-column scores and a per-size summary line are
        written to the module-level ``file_write`` handle.

        Args:
            data_set: Key into ``self.dataset_map``; also used as the index
                name passed to the searcher.
            test_sizes: Iterable of training-set sizes (number of sources).

        Returns:
            None. All results are written to ``file_write``.
        """
        thresholds = [0.0]
        rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
        count_map = defaultdict(lambda: defaultdict(lambda: 0))

        index_config = {'name': data_set}
        source_map = self.dataset_map[data_set]
        # dict.values() is a view on Python 3 and cannot be multiplied, so it
        # is materialised first. The list is doubled so the training window
        # can wrap around the end of the source list.
        double_name_list = list(source_map.values()) * 2
        file_write.write("Dataset: " + data_set + "\n")
        for size in test_sizes:
            start_time = time.time()

            for idx, source in enumerate(source_map.values()):
                train_names = [
                    train_source.index_name
                    for train_source in double_name_list[idx + 1:idx + size + 1]
                ]
                train_examples_map = searcher.search_types_data(
                    index_config, train_names)

                for column in source.column_map.values():
                    if not column.semantic_type:
                        continue

                    textual_train_map = searcher.search_similar_text_data(
                        index_config, column.value_text, train_names)
                    semantic_types = column.predict_type(
                        train_examples_map, textual_train_map,
                        self.random_forest)

                    for threshold in thresholds:
                        found = False
                        rank = 1
                        rank_score = 0
                        # Only the top prediction is considered ([:1]); each
                        # prediction is indexed as (score, types).
                        for prediction in semantic_types[:1]:
                            if column.semantic_type in prediction[1]:
                                if (prediction[0] > threshold
                                        and prediction[0] != 0):
                                    rank_score = 1.0 / rank
                                found = True
                                break
                            if prediction[0] != 0:
                                rank += len(prediction[1])

                        # A miss still scores 1 when the top prediction is
                        # below the threshold. An empty prediction list keeps
                        # score 0 instead of raising IndexError.
                        if (not found and semantic_types
                                and semantic_types[0][0] < threshold):
                            rank_score = 1
                        file_write.write(str(rank_score) + "\n")
                        rank_score_map[size][threshold] += rank_score
                        count_map[size][threshold] += 1
            running_time = time.time() - start_time
            for threshold in thresholds:
                count = count_map[size][threshold]
                # Guard against ZeroDivisionError when no column was scored.
                if count:
                    mean_score = rank_score_map[size][threshold] * 1.0 / count
                else:
                    mean_score = 0.0
                file_write.write(
                    "Size: " + str(size) + " F-measure: " + str(mean_score) +
                    " Time: " + str(running_time) + " Count: " + str(count) +
                    "\n")
    def test_semantic_types_from_2_sets(self, train_set, test_set):
        """Predict semantic types for ``test_set`` using ``train_set`` data.

        The class mapping for ``test_set`` is loaded from
        ``data/datasets/<test_set>/classes.csv``. Test sources missing from
        ``self.file_class_map`` are skipped, and each remaining source is
        trained on the single index name the mapping gives for it. Only
        columns that have a semantic type containing ``"ontology"`` and a
        non-empty ``value_list`` are scored. Predictions, per-threshold
        scores and a final summary per threshold are written to the
        module-level ``file_write`` handle.

        Args:
            train_set: Name of the index holding the training data.
            test_set: Key into ``self.dataset_map`` selecting the test sources.

        Returns:
            dict mapping each evaluated source name to a dict of
            ``{column name: predictions}``.
        """
        thresholds = [0.1, 0.15, 0.2, 0.25, 0.5]
        self.read_class_type_from_csv("data/datasets/%s/classes.csv" %
                                      test_set)
        print(self.file_class_map.keys())
        rank_score_map = defaultdict(lambda: 0)
        count_map = defaultdict(lambda: 0)

        source_result_map = {}
        train_index_config = {'name': train_set}
        test_sources = self.dataset_map[test_set]

        for source_name in test_sources:
            if source_name not in self.file_class_map:
                continue
            train_names = [self.file_class_map[source_name]]
            train_examples_map = searcher.search_types_data(
                train_index_config, train_names)

            source = test_sources[source_name]

            column_result_map = {}
            for column in source.column_map.values():
                if (not column.semantic_type or not column.value_list
                        or "ontology" not in column.semantic_type):
                    continue

                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, train_names)

                semantic_types = column.predict_type(train_examples_map,
                                                     textual_train_map,
                                                     self.random_forest)
                # Record the predictions so the returned map is not left
                # holding empty per-source dicts.
                column_result_map[column.name] = semantic_types

                print(column.name)

                file_write.write(column.name + "\t" + column.semantic_type +
                                 "\t" + str(semantic_types) + "\n")

                for threshold in thresholds:
                    rank = 0
                    found = False
                    rank_score = 0
                    # prediction[0] is itself indexable here: element [1] is
                    # compared with the threshold, element [0] gates ranking.
                    for prediction in semantic_types:
                        if column.semantic_type in prediction[1]:
                            if prediction[0][1] >= threshold:
                                rank_score = 1.0 / (rank + 1)
                            found = True

                        # The rank stops growing once the true type is seen.
                        if not found and prediction[0][0] != 0:
                            rank += len(prediction[1])

                    # A miss still scores 1 when the top prediction is below
                    # the threshold. An empty prediction list keeps score 0
                    # instead of raising IndexError.
                    if (not found and semantic_types
                            and semantic_types[0][0][1] < threshold):
                        rank_score = 1
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[threshold] += rank_score
                    count_map[threshold] += 1

            source_result_map[source_name] = column_result_map

        for threshold in thresholds:
            count = count_map[threshold]
            # Guard against ZeroDivisionError when no column was scored.
            mean_score = rank_score_map[threshold] * 1.0 / count if count else 0.0
            file_write.write(" MRR: " + str(mean_score) +
                             " Count: " + str(count) + "\n")
        return source_result_map
    def test_semantic_types_from_2_sets(self, train_set, test_set):
        """Predict semantic types for ``test_set`` using ``train_set`` data.

        All sources of ``self.dataset_map[train_set]`` serve as training
        sources. Every column of every test source that has both a semantic
        type and a non-empty ``value_list`` is predicted and scored against
        several thresholds, looking only at the first prediction. Predictions,
        per-threshold scores and a final summary per threshold are written to
        the module-level ``file_write`` handle.

        Args:
            train_set: Key into ``self.dataset_map`` for the training sources;
                also used as the index name passed to the searcher.
            test_set: Key into ``self.dataset_map`` selecting the test sources.

        Returns:
            dict mapping each test source name to a dict of
            ``{column name: predictions}``.
        """
        thresholds = [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]
        rank_score_map = defaultdict(lambda: 0)
        count_map = defaultdict(lambda: 0)

        source_result_map = {}
        train_index_config = {'name': train_set}
        train_names = [
            train_source.index_name
            for train_source in self.dataset_map[train_set].values()
        ]
        test_sources = self.dataset_map[test_set]

        for source_name in test_sources:
            # NOTE(review): the arguments do not change between iterations;
            # the call stays inside the loop in case the searcher or
            # predict_type relies on a fresh result per source -- confirm
            # before hoisting.
            train_examples_map = searcher.search_types_data(
                train_index_config, train_names)

            source = test_sources[source_name]
            self.logger.info("Test source: %s", source_name)

            column_result_map = {}
            for column in source.column_map.values():
                if not column.semantic_type or not column.value_list:
                    continue

                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, train_names)

                semantic_types = column.predict_type(
                    train_examples_map, textual_train_map, self.random_forest)
                column_result_map[column.name] = semantic_types
                self.logger.info("    -> column: %s", column.name)

                file_write.write(column.name + "\t" + column.semantic_type +
                                 "\t" + str(semantic_types) + "\n")

                for threshold in thresholds:
                    found = False
                    rank = 1
                    rank_score = 0
                    # Only the top prediction is considered ([:1]); each
                    # prediction is indexed as (score, types).
                    for prediction in semantic_types[:1]:
                        if column.semantic_type in prediction[1]:
                            if prediction[0] > threshold and prediction[0] != 0:
                                rank_score = 1.0 / rank
                            found = True
                            break
                        if prediction[0] != 0:
                            rank += len(prediction[1])

                    # A miss still scores 1 when the top prediction is below
                    # the threshold. An empty prediction list keeps score 0
                    # instead of raising IndexError.
                    if (not found and semantic_types
                            and semantic_types[0][0] < threshold):
                        rank_score = 1
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[threshold] += rank_score
                    count_map[threshold] += 1

            source_result_map[source_name] = column_result_map

        for threshold in thresholds:
            count = count_map[threshold]
            # Guard against ZeroDivisionError when no column was scored.
            mean_score = rank_score_map[threshold] * 1.0 / count if count else 0.0
            file_write.write(
                " MRR: " + str(mean_score) + " Count: " + str(count) +
                " threshold=" + str(threshold) + "\n")
        return source_result_map
    # Example #5
    # 0
    def test_semantic_types_from_2_sets(self, train_set, test_set):
        """Predict semantic types for ``test_set`` using ``train_set`` data.

        All sources of ``self.dataset_map[train_set]`` serve as training
        sources. Every column of every test source that has both a semantic
        type and a non-empty ``value_list`` is predicted and scored against
        several thresholds, looking only at the first prediction. Predictions
        and per-threshold scores are written to the module-level
        ``file_write`` handle; no aggregate summary is written.

        If ``predict_type`` raises ``KeyError`` for a column, "KEY ERROR" is
        printed and the remaining columns of that source are skipped. The
        columns predicted so far are still returned for that source.

        Args:
            train_set: Key into ``self.dataset_map`` for the training sources;
                also used as the index name passed to the searcher.
            test_set: Key into ``self.dataset_map`` selecting the test sources.

        Returns:
            dict mapping each test source name to a dict of
            ``{column name: predictions}``.
        """
        thresholds = [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]

        source_result_map = {}
        train_index_config = {'name': train_set}
        train_names = [
            train_source.index_name
            for train_source in self.dataset_map[train_set].values()
        ]
        self.logger.info("Train source: %s", train_names)
        test_sources = self.dataset_map[test_set]

        for source_name in test_sources:
            # NOTE(review): the arguments do not change between iterations;
            # the call stays inside the loop in case the searcher or
            # predict_type relies on a fresh result per source -- confirm
            # before hoisting.
            train_examples_map = searcher.search_types_data(
                train_index_config, train_names)

            source = test_sources[source_name]
            self.logger.info("Test source: %s", source_name)

            column_result_map = {}
            for column in source.column_map.values():
                if not column.semantic_type or not column.value_list:
                    continue

                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, train_names)

                try:
                    semantic_types = column.predict_type(
                        train_examples_map, textual_train_map,
                        self.random_forest)
                except KeyError:
                    # Give up on the rest of this source but keep the
                    # columns already predicted.
                    print("KEY ERROR")
                    break

                column_result_map[column.name] = semantic_types
                self.logger.info("    -> column: %s", column.name)

                file_write.write(column.name + "\t" + column.semantic_type +
                                 "\t" + str(semantic_types) + "\n")

                for threshold in thresholds:
                    found = False
                    rank = 1
                    rank_score = 0
                    # Only the top prediction is considered ([:1]); each
                    # prediction is indexed as (score, types).
                    for prediction in semantic_types[:1]:
                        if column.semantic_type in prediction[1]:
                            if prediction[0] > threshold and prediction[0] != 0:
                                rank_score = 1.0 / rank
                            found = True
                            break
                        if prediction[0] != 0:
                            rank += len(prediction[1])

                    # A miss still scores 1 when the top prediction is below
                    # the threshold. An empty prediction list keeps score 0
                    # instead of raising IndexError.
                    if (not found and semantic_types
                            and semantic_types[0][0] < threshold):
                        rank_score = 1
                    file_write.write(str(rank_score) + "\n")

            source_result_map[source_name] = column_result_map

        return source_result_map