def test_semantic_types(self, data_set, test_sizes):
    """Evaluate semantic-type prediction on one dataset via leave-one-source-out.

    For each training size in ``test_sizes``, each source in ``data_set`` is
    tested against the next ``size`` sources (cyclically, using a doubled
    source list).  A reciprocal-rank score per column is accumulated and a
    summary line is appended to ``file_write`` for each size/threshold pair.

    :param data_set: key into ``self.dataset_map`` naming the dataset.
    :param test_sizes: iterable of training-set sizes to evaluate.
    """
    logging.info("Testing semantic types.")
    # rank_score_map[size][threshold] accumulates reciprocal-rank scores;
    # count_map[size][threshold] counts the columns evaluated.
    rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
    count_map = defaultdict(lambda: defaultdict(lambda: 0))
    index_config = {'name': data_set}
    source_map = self.dataset_map[data_set]
    # Doubling the source list lets a slice wrap around the end cyclically.
    double_name_list = list(source_map.values()) * 2
    file_write.write("Dataset: " + data_set + "\n")
    for size in test_sizes:
        start_time = time.time()
        for idx, source_name in enumerate(list(source_map.keys())):
            # The `size` sources following this one serve as training data.
            train_names = [
                source.index_name
                for source in double_name_list[idx + 1:idx + size + 1]
            ]
            train_examples_map = searcher.search_types_data(
                index_config, train_names)
            source = source_map[source_name]
            for column in source.column_map.values():
                if not column.semantic_type:
                    # Unlabeled columns cannot be scored.
                    continue
                textual_train_map = searcher.search_similar_text_data(
                    index_config, column.value_text, train_names)
                semantic_types = column.predict_type(
                    train_examples_map, textual_train_map,
                    self.random_forest)
                logging.debug(
                    " semantic types: {}".format(semantic_types))
                for threshold in [0.01]:
                    found = False
                    rank = 1
                    rank_score = 0
                    for prediction in semantic_types:
                        # prediction is (score, [labels]); a hit above the
                        # threshold scores 1/rank (MRR-style).
                        if column.semantic_type in prediction[1]:
                            if (prediction[0] > threshold
                                    and prediction[0] != 0):
                                rank_score = 1.0 / rank
                                found = True
                                break
                        if prediction[0] != 0:
                            rank += len(prediction[1])
                    # Guard against an empty prediction list (previously an
                    # IndexError).  A below-threshold top score counts as
                    # correctly predicting "no semantic type".
                    if (not found and semantic_types
                            and semantic_types[0][0] < threshold):
                        rank_score = 1
                    file_write.write(column.name + "\t" +
                                     column.semantic_type + "\t" +
                                     str(semantic_types) + "\n")
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[size][threshold] += rank_score
                    count_map[size][threshold] += 1
        running_time = time.time() - start_time
        for threshold in [0.01]:
            file_write.write(
                "Size: " + str(size) + " F-measure: " +
                str(rank_score_map[size][threshold] * 1.0 /
                    count_map[size][threshold]) + " Time: " +
                str(running_time) + " Count: " +
                str(count_map[size][threshold]) + "\n")
def test_semantic_types(self, data_set, test_sizes):
    """Evaluate top-1 semantic-type prediction on one dataset.

    Like the sibling ``test_semantic_types`` variant, but scores only the
    first prediction (``semantic_types[:1]``) at threshold 0.0.

    NOTE(review): this definition shadows the earlier ``test_semantic_types``
    in the same class — confirm which variant is intended to be active.

    :param data_set: key into ``self.dataset_map`` naming the dataset.
    :param test_sizes: iterable of training-set sizes to evaluate.
    """
    rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
    count_map = defaultdict(lambda: defaultdict(lambda: 0))
    index_config = {'name': data_set}
    source_map = self.dataset_map[data_set]
    # BUG FIX: in Python 3, dict.values() returns a view which cannot be
    # multiplied (TypeError); materialize it as a list first.  Doubling lets
    # a slice wrap around the end of the source list cyclically.
    double_name_list = list(source_map.values()) * 2
    file_write.write("Dataset: " + data_set + "\n")
    for size in test_sizes:
        start_time = time.time()
        for idx, source_name in enumerate(source_map.keys()):
            # The `size` sources following this one serve as training data.
            train_names = [
                source.index_name
                for source in double_name_list[idx + 1:idx + size + 1]
            ]
            train_examples_map = searcher.search_types_data(
                index_config, train_names)
            source = source_map[source_name]
            for column in source.column_map.values():
                if not column.semantic_type:
                    # Unlabeled columns cannot be scored.
                    continue
                textual_train_map = searcher.search_similar_text_data(
                    index_config, column.value_text, train_names)
                semantic_types = column.predict_type(
                    train_examples_map, textual_train_map,
                    self.random_forest)
                for threshold in [0.0]:
                    found = False
                    rank = 1
                    rank_score = 0
                    # Only the top prediction is considered (top-1 accuracy).
                    for prediction in semantic_types[:1]:
                        if column.semantic_type in prediction[1]:
                            if (prediction[0] > threshold
                                    and prediction[0] != 0):
                                rank_score = 1.0 / rank
                                found = True
                                break
                        if prediction[0] != 0:
                            rank += len(prediction[1])
                    # Guard against an empty prediction list (previously an
                    # IndexError).  A below-threshold top score counts as
                    # correctly predicting "no semantic type".
                    if (not found and semantic_types
                            and semantic_types[0][0] < threshold):
                        rank_score = 1
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[size][threshold] += rank_score
                    count_map[size][threshold] += 1
        running_time = time.time() - start_time
        for threshold in [0.0]:
            file_write.write(
                "Size: " + str(size) + " F-measure: " + str(
                    rank_score_map[size][threshold] * 1.0 /
                    count_map[size][threshold]) + " Time: " + str(
                    running_time) + " Count: " +
                str(count_map[size][threshold]) + "\n")
def test_semantic_types_from_2_sets(self, train_set, test_set):
    """Evaluate semantic-type prediction training on one set, testing on another.

    Each test source is matched to a single training source via
    ``self.file_class_map`` (loaded from the test set's ``classes.csv``).
    Reciprocal-rank scores are accumulated per threshold and an MRR summary
    is appended to ``file_write``.

    :param train_set: dataset name used for training examples.
    :param test_set: dataset name whose columns are predicted and scored.
    :return: dict mapping source name -> {column name: predicted types}.
    """
    self.read_class_type_from_csv("data/datasets/%s/classes.csv" % test_set)
    print(self.file_class_map.keys())
    rank_score_map = defaultdict(lambda: 0)
    count_map = defaultdict(lambda: 0)
    source_result_map = {}
    train_index_config = {'name': train_set}
    for idx, source_name in enumerate(self.dataset_map[test_set]):
        if source_name not in self.file_class_map:
            # No class mapping for this source — cannot pick training data.
            continue
        train_examples_map = searcher.search_types_data(
            train_index_config, [self.file_class_map[source_name]])
        source = self.dataset_map[test_set][source_name]
        column_result_map = {}
        for column in source.column_map.values():
            if (not column.semantic_type or not column.value_list
                    or "ontology" not in column.semantic_type):
                # Only labeled, non-empty ontology-typed columns are scored.
                continue
            textual_train_map = searcher.search_similar_text_data(
                train_index_config, column.value_text,
                [self.file_class_map[source_name]])
            semantic_types = column.predict_type(
                train_examples_map, textual_train_map, self.random_forest)
            # BUG FIX: record the per-column predictions; previously this
            # map was never populated, so the returned source_result_map
            # only ever contained empty dicts (the sibling variants of this
            # method do record it).
            column_result_map[column.name] = semantic_types
            print(column.name)
            file_write.write(column.name + "\t" + column.semantic_type +
                             "\t" + str(semantic_types) + "\n")
            for threshold in [0.1, 0.15, 0.2, 0.25, 0.5]:
                rank = 0
                found = False
                rank_score = 0
                for prediction in semantic_types:
                    # prediction is (score-pair, [labels]); score-pair[1]
                    # is compared against the confidence threshold.
                    if column.semantic_type in prediction[1]:
                        if prediction[0][1] >= threshold:
                            rank_score = 1.0 / (rank + 1)
                            found = True
                    if not found and prediction[0][0] != 0:
                        rank += len(prediction[1])
                if not found:
                    # A below-threshold top score counts as correctly
                    # predicting "no semantic type".
                    if semantic_types[0][0][1] < threshold:
                        rank_score = 1
                file_write.write(str(rank_score) + "\n")
                rank_score_map[threshold] += rank_score
                count_map[threshold] += 1
        source_result_map[source_name] = column_result_map
    for threshold in [0.1, 0.15, 0.2, 0.25, 0.5]:
        file_write.write(" MRR: " +
                         str(rank_score_map[threshold] * 1.0 /
                             count_map[threshold]) + " Count: " +
                         str(count_map[threshold]) + "\n")
    return source_result_map
def test_semantic_types_from_2_sets(self, train_set, test_set):
    """Cross-dataset evaluation: train on every source of ``train_set``,
    predict and score every labeled column of ``test_set``.

    Per-threshold reciprocal-rank totals are written to ``file_write``
    as MRR summary lines, and the raw predictions are returned as
    ``{source name: {column name: predictions}}``.
    """
    thresholds = [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]
    rank_score_map = defaultdict(lambda: 0)
    count_map = defaultdict(lambda: 0)
    source_result_map = {}
    train_index_config = {'name': train_set}
    # Every source of the training set contributes examples.
    train_names = [
        src.index_name for src in self.dataset_map[train_set].values()
    ]
    for source_name in self.dataset_map[test_set]:
        train_examples_map = searcher.search_types_data(
            train_index_config, train_names)
        source = self.dataset_map[test_set][source_name]
        self.logger.info("Test source: %s", source_name)
        column_result_map = {}
        for column in source.column_map.values():
            # Only labeled, non-empty columns can be scored.
            if not column.semantic_type or not column.value_list:
                continue
            textual_train_map = searcher.search_similar_text_data(
                train_index_config, column.value_text, train_names)
            semantic_types = column.predict_type(
                train_examples_map, textual_train_map, self.random_forest)
            column_result_map[column.name] = semantic_types
            self.logger.info(" -> column: %s", column.name)
            file_write.write(column.name + "\t" + column.semantic_type +
                             "\t" + str(semantic_types) + "\n")
            for threshold in thresholds:
                hit = False
                rank = 1
                rank_score = 0
                # Only the top prediction is considered (top-1 scoring).
                for prediction in semantic_types[:1]:
                    if column.semantic_type in prediction[1]:
                        if prediction[0] > threshold and prediction[0] != 0:
                            rank_score = 1.0 / rank
                            hit = True
                            break
                    if prediction[0] != 0:
                        rank += len(prediction[1])
                # A below-threshold top score counts as correctly
                # predicting "no semantic type".
                if not hit and semantic_types[0][0] < threshold:
                    rank_score = 1
                file_write.write(str(rank_score) + "\n")
                rank_score_map[threshold] += rank_score
                count_map[threshold] += 1
        source_result_map[source_name] = column_result_map
    for threshold in thresholds:
        file_write.write(
            " MRR: " + str(
                rank_score_map[threshold] * 1.0 / count_map[threshold]) +
            " Count: " + str(count_map[threshold]) +
            " threshold=" + str(threshold) + "\n")
    return source_result_map
def test_semantic_types_from_2_sets(self, train_set, test_set):
    """Cross-dataset evaluation variant that tolerates prediction failures.

    Trains on every source of ``train_set`` and predicts each labeled,
    non-empty column of ``test_set``.  A ``KeyError`` from
    ``column.predict_type`` aborts the current source and moves on.
    Per-column reciprocal-rank scores are written to ``file_write``;
    returns ``{source name: {column name: predictions}}``.
    """
    thresholds = [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]
    rank_score_map = defaultdict(lambda: 0)
    count_map = defaultdict(lambda: 0)
    source_result_map = {}
    train_index_config = {'name': train_set}
    train_names = [
        src.index_name for src in self.dataset_map[train_set].values()
    ]
    self.logger.info("Train source: %s", train_names)
    # NOTE(review): `valid` is set on failure but never read afterwards
    # within this method — kept for behavioral parity; confirm intent.
    valid = True
    for source_name in self.dataset_map[test_set]:
        train_examples_map = searcher.search_types_data(
            train_index_config, train_names)
        source = self.dataset_map[test_set][source_name]
        self.logger.info("Test source: %s", source_name)
        column_result_map = {}
        for column in source.column_map.values():
            # Only labeled, non-empty columns can be scored.
            if not column.semantic_type or not column.value_list:
                continue
            textual_train_map = searcher.search_similar_text_data(
                train_index_config, column.value_text, train_names)
            try:
                semantic_types = column.predict_type(
                    train_examples_map, textual_train_map,
                    self.random_forest)
            except KeyError:
                # Prediction failed for this source; skip its remaining
                # columns and continue with the next source.
                print("KEY ERROR")
                valid = False
                break
            column_result_map[column.name] = semantic_types
            self.logger.info(" -> column: %s", column.name)
            file_write.write(column.name + "\t" + column.semantic_type +
                             "\t" + str(semantic_types) + "\n")
            for threshold in thresholds:
                hit = False
                rank = 1
                rank_score = 0
                # Only the top prediction is considered (top-1 scoring).
                for prediction in semantic_types[:1]:
                    if column.semantic_type in prediction[1]:
                        if prediction[0] > threshold and prediction[0] != 0:
                            rank_score = 1.0 / rank
                            hit = True
                            break
                    if prediction[0] != 0:
                        rank += len(prediction[1])
                # A below-threshold top score counts as correctly
                # predicting "no semantic type".
                if not hit and semantic_types[0][0] < threshold:
                    rank_score = 1
                file_write.write(str(rank_score) + "\n")
                rank_score_map[threshold] += rank_score
                count_map[threshold] += 1
        source_result_map[source_name] = column_result_map
    # Aggregate MRR reporting is intentionally disabled in this variant.
    return source_result_map