# NOTE: module-level imports are not shown in this excerpt. The code below
# assumes at least `os` from the standard library plus the project's own
# `Source` and `MyRandomForest` classes; later variants additionally use
# `json`, `csv`, `re`, `time`, `logging`, `collections` (OrderedDict,
# defaultdict) and the project's `Column`, `indexer`, `searcher`,
# `file_write`, `not_allowed_chars` and `get_logger` helpers.


# SemanticLabeler variant that wraps a Spark context (`sc`); evaluation is
# still a TODO here.
class SemanticLabeler:
    def __init__(self, sc):
        self.sc = sc
        self.source_map = {}
        self.random_forest = None

    def read_data_sources(self, folder_path):
        data_folder_path = os.path.join(folder_path, "data")
        model_folder_path = os.path.join(folder_path, "model")
        for filename in os.listdir(data_folder_path):
            extension = os.path.splitext(filename)[1]
            source = Source(os.path.splitext(filename)[0], self.sc)
            file_path = os.path.join(data_folder_path, filename)
            if extension == ".csv":
                source.read_data_from_csv(file_path)
            elif extension == ".json":
                source.read_data_from_json(file_path)
            elif extension == ".xml":
                source.read_data_from_xml(file_path)
            self.source_map[filename] = source
        for filename in os.listdir(model_folder_path):
            source = self.source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
            source.read_semantic_type_json(os.path.join(model_folder_path, filename))

    def train_random_forest(self, train_size):
        self.random_forest = MyRandomForest()
        self.random_forest.train()

    def train_semantic_types(self, size_list):
        for idx in range(len(self.source_map)):
            for size in size_list:
                # Doubling the key list gives a circular sliding window of
                # `size` sources following the current one (Python 2 keys()).
                for source_name in (self.source_map.keys() * 2)[idx + 1: idx + size + 1]:
                    source = self.source_map[source_name]
                    source.save(index_config={'size': size})

    # TODO
    def test_semantic_types(self):
        pass
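# Hedged usage sketch (added for illustration; not part of the original
# source). It drives the Spark-based variant defined directly above. The
# SparkContext setup, the dataset folder "data/datasets/soccer" and the
# window sizes are assumptions; note that test_semantic_types() of this
# variant is still a TODO, so only reading, training and indexing are shown.
def _example_run_spark_variant():
    from pyspark import SparkContext  # assumed dependency of this variant

    sc = SparkContext(appName="semantic-labeler")
    labeler = SemanticLabeler(sc)
    # The folder is expected to contain "data" and "model" sub-folders.
    labeler.read_data_sources("data/datasets/soccer")
    labeler.train_random_forest(train_size=2)
    labeler.train_semantic_types([1, 2, 4])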
# SemanticLabeler variant with a class logger and a MEMEX pre-processing
# step; originally written against Python 2 (print statements converted to
# the function form, which behaves identically for single arguments).
class SemanticLabeler:
    logger = get_logger("SemanticLabeler", level=logging.DEBUG)

    def __init__(self):
        self.dataset_map = {}
        self.file_class_map = {}
        self.random_forest = None

    def preprocess_memex_data_sources(self, folder_path):
        source_map = OrderedDict()
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            print(file_path)
            with open(file_path, "r") as f:
                for json_line in f.readlines():
                    json_obj = json.loads(json_line)
                    source_name = json_obj["tld"]
                    if source_name not in source_map:
                        source_map[source_name] = Source(source_name)
                    source = source_map[source_name]
                    for attr in json_obj:
                        if attr.startswith("inferlink"):
                            attr_name = attr.split("_")[1]
                            if attr_name not in source.column_map:
                                source.column_map[attr_name] = Column(attr_name, source.name)
                                source.column_map[attr_name].semantic_type = attr_name
                            for ele1 in json_obj[attr]:
                                if isinstance(ele1["result"], dict):
                                    source.column_map[attr_name].add_value(ele1["result"]["value"])
                                else:
                                    for ele2 in ele1["result"]:
                                        source.column_map[attr_name].add_value(ele2["value"])
        for source in source_map.values():
            if source.column_map:
                source.write_csv_file("data/datasets/memex/%s" % source.name)

    def read_data_sources(self, folder_paths):
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)
            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "data")
            model_folder_path = os.path.join(folder_path, "model")
            for filename in sorted(os.listdir(data_folder_path)):
                extension = os.path.splitext(filename)[1]
                if ".DS" in filename:
                    continue
                self.logger.debug(" -> read: %s", filename)
                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)
                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type:
                        if len(column.value_list) == 0:
                            del source.column_map[key]
                            source.empty_val_columns[key] = column
                            logging.warning(
                                "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                column.name, source.name)

                for column in source.column_map.values():
                    semantic_type_set.add(column.semantic_type)
                attr_count += len(source.column_map.values())

            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue
                    try:
                        source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                    except:
                        source = source_map[filename]
                    extension = os.path.splitext(filename)[1]
                    if extension == ".json":
                        source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
            self.dataset_map[folder_name] = source_map
        # print semantic_type_set
        print(len(semantic_type_set))
        print(attr_count)

    def train_random_forest(self, train_sizes, data_sets):
        self.random_forest = MyRandomForest(data_sets, self.dataset_map, "model/lr.pkl")
        self.random_forest.train(train_sizes)

    def train_semantic_types(self, dataset_list):
        for name in dataset_list:
            self.logger.debug("Indexing dataset %s", name)
            index_config = {'name': re.sub(not_allowed_chars, "!", name)}
            indexer.init_analyzers(index_config)
            source_map = self.dataset_map[name]
            for idx, key in enumerate(source_map.keys()):
                source = source_map[key]
                source.save(index_config={'name': re.sub(not_allowed_chars, "!", name)})
                self.logger.debug(" + finish index source: %s", key)

    def predict_semantic_type_for_column(self, column):
        train_examples_map = searcher.search_types_data("index_name", [])
        textual_train_map = searcher.search_similar_text_data("index_name", column.value_text, [])
        return column.predict_type(train_examples_map, textual_train_map, self.random_forest)

    def test_semantic_types(self, data_set, test_sizes):
        rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
        count_map = defaultdict(lambda: defaultdict(lambda: 0))
        index_config = {'name': data_set}
        source_map = self.dataset_map[data_set]
        # Python 2: values() returns a list; doubling it gives a circular window.
        double_name_list = source_map.values() * 2
        file_write.write("Dataset: " + data_set + "\n")
        for size in test_sizes:
            start_time = time.time()
            for idx, source_name in enumerate(source_map.keys()):
                train_names = [source.index_name for source in double_name_list[idx + 1: idx + size + 1]]
                train_examples_map = searcher.search_types_data(index_config, train_names)
                source = source_map[source_name]
                for column in source.column_map.values():
                    if column.semantic_type:
                        textual_train_map = searcher.search_similar_text_data(
                            index_config, column.value_text, train_names)
                        semantic_types = column.predict_type(
                            train_examples_map, textual_train_map, self.random_forest)
                        for threshold in [0.0]:
                            found = False
                            rank = 1
                            rank_score = 0
                            for prediction in semantic_types[:1]:
                                if column.semantic_type in prediction[1]:
                                    if prediction[0] > threshold and prediction[0] != 0:
                                        rank_score = 1.0 / rank
                                        found = True
                                        break
                                if prediction[0] != 0:
                                    rank += len(prediction[1])
                            if not found and semantic_types[0][0] < threshold:
                                rank_score = 1
                            # file_write.write(
                            #     column.name + "\t" + column.semantic_type + "\t" + str(semantic_types) + "\n")
                            file_write.write(str(rank_score) + "\n")
                            rank_score_map[size][threshold] += rank_score
                            count_map[size][threshold] += 1
            running_time = time.time() - start_time
            for threshold in [0.0]:
                file_write.write(
                    "Size: " + str(size) + " F-measure: " + str(
                        rank_score_map[size][threshold] * 1.0 / count_map[size][threshold]) + " Time: " + str(
                        running_time) + " Count: " + str(count_map[size][threshold]) + "\n")

    def read_class_type_from_csv(self, file_path):
        self.file_class_map = {}
        with open(file_path, "r") as f:
            csv_reader = csv.reader(f)
            for row in csv_reader:
                self.file_class_map[row[0].replace(".tar.gz", ".csv")] = row[1]

    def test_semantic_types_from_2_sets(self, train_set, test_set):
        # self.read_class_type_from_csv("data/datasets/%s/classes.csv" % test_set)
        # print self.file_class_map.keys()
        rank_score_map = defaultdict(lambda: 0)
        count_map = defaultdict(lambda: 0)
        source_result_map = {}
        train_index_config = {'name': train_set}
        train_names = [source.index_name for source in self.dataset_map[train_set].values()]
        for idx, source_name in enumerate(self.dataset_map[test_set]):
            # if source_name not in self.file_class_map:
            #     continue
            train_examples_map = searcher.search_types_data(train_index_config, train_names)
            source = self.dataset_map[test_set][source_name]
            self.logger.info("Test source: %s", source_name)
            column_result_map = {}
            for column in source.column_map.values():
                # if not column.semantic_type or not column.value_list or "ontology" not in column.semantic_type:
                #     continue
                if not column.semantic_type or not column.value_list:
                    continue
                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, train_names)
                semantic_types = column.predict_type(
                    train_examples_map, textual_train_map, self.random_forest)
                column_result_map[column.name] = semantic_types
                self.logger.info(" -> column: %s", column.name)
                file_write.write(
                    column.name + "\t" + column.semantic_type + "\t" + str(semantic_types) + "\n")
                for threshold in [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]:
                    found = False
                    rank = 1
                    rank_score = 0
                    for prediction in semantic_types[:1]:
                        if column.semantic_type in prediction[1]:
                            if prediction[0] > threshold and prediction[0] != 0:
                                rank_score = 1.0 / rank
                                found = True
                                break
                        if prediction[0] != 0:
                            rank += len(prediction[1])
                    if not found and semantic_types[0][0] < threshold:
                        rank_score = 1
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[threshold] += rank_score
                    count_map[threshold] += 1
            source_result_map[source_name] = column_result_map
        for threshold in [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]:
            file_write.write(
                " MRR: " + str(
                    rank_score_map[threshold] * 1.0 / count_map[threshold]) + " Count: " + str(
                    count_map[threshold]) + " threshold=" + str(threshold) + "\n")
        return source_result_map

    def write_data_for_transform(self, name):
        for source_name, source in self.dataset_map[name].items():
            for attribute in source.column_map.values():
                attribute.write_to_data_file()
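# Hedged helper sketch (added for illustration; not part of the original
# source). It restates the rank-score computed inside test_semantic_types
# and test_semantic_types_from_2_sets above: predictions arrive as
# (score, [labels]) groups, the rank of the correct label is one plus the
# number of labels in the non-zero groups ranked before it, and each column
# contributes 1.0 / rank to the reported MRR. The threshold handling of the
# original loops is omitted here, and the sample predictions are made up.
def _example_reciprocal_rank(semantic_types, correct_type):
    rank = 1
    for score, labels in semantic_types:
        if correct_type in labels:
            return 1.0 / rank
        if score != 0:
            rank += len(labels)
    return 0.0


# _example_reciprocal_rank([(0.9, ["city"]), (0.4, ["club", "name"])], "club")
# returns 0.5, because one label ("city") is ranked ahead of the correct one.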
class SemanticLabeler:
    def __init__(self, data_folder=os.path.join("data", "datasets")):
        logging.info("Initializing semantic labeler with data folder: {}".format(data_folder))
        self.data_folder = data_folder
        self.dataset_map = {}
        self.file_class_map = {}
        self.random_forest = None

    def reset(self):
        logging.info("Resetting semantic labeler")
        self.dataset_map = {}
        self.file_class_map = {}
        self.random_forest = None
        logging.info("Cleaning elasticsearch indexer")
        indexer.clean()

    def read_data_sources(self, folder_paths):
        logging.info("Reading data sources...")
        for folder_name in folder_paths:
            folder_path = os.path.join(self.data_folder, folder_name)
            logging.info("-->folder: {}".format(folder_path))
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "data")
            model_folder_path = os.path.join(folder_path, "model")
            for filename in os.listdir(data_folder_path):
                extension = os.path.splitext(filename)[1]
                if ".DS" in filename:
                    continue
                logging.info(" ...file: {}".format(filename))
                print(filename)
                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)
                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source
            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue
                    try:
                        source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                    except:
                        source = source_map[filename]
                    extension = os.path.splitext(filename)[1]
                    if extension == ".json":
                        source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
            self.dataset_map[folder_name] = source_map

    def write_data_sources(self, limit=500, filter_unknown=False):
        logging.info("Writing available sources from semantic_labeler")
        for folder_name, source_map in self.dataset_map.items():
            for filename, source in source_map.items():
                filepath = os.path.join(
                    "data", "write_csv_datasets_" + str(filter_unknown), filename + ".csv")
                source.write_csv_file(filepath, limit, filter_unknown)
                filepath = os.path.join(
                    "data", "write_columnmap_" + str(filter_unknown), filename + ".columnmap.txt")
                source.write_column_map(filepath, filter_unknown)
                # with open(filepath, "w+") as f:
                #     f.write(str(source.column_map))

    def train_random_forest(self, train_sizes, data_sets):
        logging.info("Training random forest on {} datasets.".format(len(data_sets)))
        self.random_forest = MyRandomForest(data_sets, self.dataset_map)
        self.random_forest.train(train_sizes)

    def train_semantic_types(self, dataset_list):
        logging.info("Training semantic types on {} datasets.".format(len(dataset_list)))
        for name in dataset_list:
            logging.info(" training semantic types on {} ".format(name))
            index_config = {'name': re.sub(not_allowed_chars, "!", name)}
            indexer.init_analyzers(index_config)
            source_map = self.dataset_map[name]
            for source in source_map.values():
                # source = source_map[source_map.keys()[idx]]
                source.save(index_config={'name': re.sub(not_allowed_chars, "!", name)})

    def predict_semantic_type_for_column(self, column):
        logging.info("Predicting semantic type for column: {}.".format(column))
        if self.random_forest is None:
            logging.error("Prediction not possible. Model not trained.")
            raise Exception("Prediction not possible. Model not trained.")
        start_time = time.time()
        # source_name = ""
        # if column.source_name:
        #     index_name = re.sub(not_allowed_chars, "", column.source_name)
        #     source_name = column.source_name
        #     index_config = {'name': index_name}
        #     train_examples_map = searcher.search_types_data(index_config, [])
        #     textual_train_map = searcher.search_similar_text_data(index_config, column.value_text, [])
        # else:
        #     train_examples_map = searcher.search_types_data("", [])
        #     textual_train_map = searcher.search_similar_text_data("", column.value_text, [])
        #
        # index_config = {'name': "train_data"}
        index_config = ""
        train_examples_map = searcher.search_types_data(index_config, [])
        textual_train_map = searcher.search_similar_text_data(index_config, column.value_text, [])
        logging.info("Train examples map size {}".format(len(train_examples_map)))
        cur_res = {
            'source_name': column.source_name,
            'column_name': column.name,
            'correct_label': column.semantic_type,
            'scores': [(1.0, 'fail')]
        }
        try:
            semantic_types = column.predict_type(train_examples_map, textual_train_map, self.random_forest)
            all_preds = []
            for (score, labels) in semantic_types:
                all_preds += [(score, l) for l in labels]
            # normalize scores so that they sum up to 1
            total = sum([element[0] for element in all_preds])
            if total > 0:
                cur_res['scores'] = [(score / total, l) for score, l in all_preds]
            else:
                cur_res['scores'] = [(score, l) for score, l in all_preds]
            logging.info("Scores normalized")
        except Exception as e:
            logging.warning("Could not get predictions for column {} due to {}".format(column.name, e))
            cur_res['scores'] = [(1.0, 'fail')]
        running_time = time.time() - start_time
        return {
            "folder_name": "",
            "running_time": running_time,
            "predictions": [cur_res]
        }

    def predict_folder_semantic_types(self, folder_name):
        """
        Predict semantic types for all sources in folder
        :param folder_name:
        :return:
        """
        logging.info("Predicting semantic types for folder: {}.".format(folder_name))
        if self.random_forest is None:
            logging.error("Prediction not possible. Model not trained.")
            raise Exception("Prediction not possible. Model not trained.")
        if folder_name not in self.dataset_map:
            logging.error("Prediction not possible: folder is not indexed by semantic labeler.")
            raise Exception("Prediction not possible: folder is not indexed by semantic labeler.")
        result = []
        source_map = self.dataset_map[folder_name]
        start_time = time.time()
        for source in source_map.values():
            # we need to index the source
            index_config = {'name': source.index_name}
            source.save(index_config)
            for column in source.column_map.values():
                cur_res = {
                    'source_name': source.name,
                    'column_name': column.name,
                    'correct_label': column.semantic_type,
                    'scores': []
                }
                train_examples_map = searcher.search_types_data(index_config, [])
                textual_train_map = searcher.search_similar_text_data(index_config, column.value_text, [])
                try:
                    semantic_types = column.predict_type(train_examples_map, textual_train_map, self.random_forest)
                    logging.info("Column <{}> predicted semantic types {}".format(column.name, semantic_types))
                    all_preds = []
                    for (score, labels) in semantic_types:
                        all_preds += [(score, l) for l in labels]
                    # normalize scores so that they sum up to 1
                    total = sum([element[0] for element in all_preds])
                    if total > 0:
                        cur_res['scores'] = [(score / total, l) for score, l in all_preds]
                    else:
                        cur_res['scores'] = [(score, l) for score, l in all_preds]
                    logging.info("Scores normalized")
                except Exception as e:
                    logging.warning("Could not get predictions for column {} due to {}".format(column.name, e))
                    cur_res['scores'] = [(1.0, 'fail')]
                result.append(cur_res)
        running_time = time.time() - start_time
        return {
            "folder_name": folder_name,
            "running_time": running_time,
            "predictions": result
        }

    def test_semantic_types(self, data_set, test_sizes):
        logging.info("Testing semantic types.")
        rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
        count_map = defaultdict(lambda: defaultdict(lambda: 0))
        index_config = {'name': data_set}
        source_map = self.dataset_map[data_set]
        double_name_list = list(source_map.values()) * 2
        file_write.write("Dataset: " + data_set + "\n")
        for size in test_sizes:
            start_time = time.time()
            for idx, source_name in enumerate(list(source_map.keys())):
                train_names = [source.index_name for source in double_name_list[idx + 1:idx + size + 1]]
                train_examples_map = searcher.search_types_data(index_config, train_names)
                source = source_map[source_name]
                for column in source.column_map.values():
                    if column.semantic_type:
                        textual_train_map = searcher.search_similar_text_data(
                            index_config, column.value_text, train_names)
                        semantic_types = column.predict_type(
                            train_examples_map, textual_train_map, self.random_forest)
                        logging.debug(" semantic types: {}".format(semantic_types))
                        for threshold in [0.01]:
                            found = False
                            rank = 1
                            rank_score = 0
                            for prediction in semantic_types:
                                if column.semantic_type in prediction[1]:
                                    if prediction[0] > threshold and prediction[0] != 0:
                                        rank_score = 1.0 / rank
                                        found = True
                                        break
                                if prediction[0] != 0:
                                    rank += len(prediction[1])
                            if not found and semantic_types[0][0] < threshold:
                                rank_score = 1
                            file_write.write(column.name + "\t" + column.semantic_type + "\t" +
                                             str(semantic_types) + "\n")
                            file_write.write(str(rank_score) + "\n")
                            rank_score_map[size][threshold] += rank_score
                            count_map[size][threshold] += 1
            running_time = time.time() - start_time
            for threshold in [0.01]:
                file_write.write("Size: " + str(size) + " F-measure: " +
                                 str(rank_score_map[size][threshold] * 1.0 / count_map[size][threshold]) +
                                 " Time: " + str(running_time) + " Count: " +
                                 str(count_map[size][threshold]) + "\n")

    def read_class_type_from_csv(self, file_path):
        self.file_class_map = {}
        with open(file_path, "r") as f:
            csv_reader = csv.reader(f)
            for row in csv_reader:
                self.file_class_map[row[0].replace(".tar.gz", ".csv")] = row[1]

    def test_semantic_types_from_2_sets(self, train_set, test_set):
        self.read_class_type_from_csv("data/datasets/%s/classes.csv" % test_set)
        print(self.file_class_map.keys())
        rank_score_map = defaultdict(lambda: 0)
        count_map = defaultdict(lambda: 0)
        source_result_map = {}
        train_index_config = {'name': train_set}
        for idx, source_name in enumerate(self.dataset_map[test_set]):
            if source_name not in self.file_class_map:
                continue
            train_examples_map = searcher.search_types_data(
                train_index_config, [self.file_class_map[source_name]])
            source = self.dataset_map[test_set][source_name]
            column_result_map = {}
            for column in source.column_map.values():
                if not column.semantic_type or not column.value_list or "ontology" not in column.semantic_type:
                    continue
                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, [self.file_class_map[source_name]])
                semantic_types = column.predict_type(train_examples_map, textual_train_map, self.random_forest)
                print(column.name)
                file_write.write(column.name + "\t" + column.semantic_type + "\t" + str(semantic_types) + "\n")
                for threshold in [0.1, 0.15, 0.2, 0.25, 0.5]:
                    rank = 0
                    found = False
                    rank_score = 0
                    for prediction in semantic_types:
                        if column.semantic_type in prediction[1]:
                            if prediction[0][1] >= threshold:
                                rank_score = 1.0 / (rank + 1)
                                found = True
                        if not found and prediction[0][0] != 0:
                            rank += len(prediction[1])
                    if not found:
                        if semantic_types[0][0][1] < threshold:
                            rank_score = 1
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[threshold] += rank_score
                    count_map[threshold] += 1
            source_result_map[source_name] = column_result_map
        for threshold in [0.1, 0.15, 0.2, 0.25, 0.5]:
            file_write.write(" MRR: " + str(rank_score_map[threshold] * 1.0 / count_map[threshold]) +
                             " Count: " + str(count_map[threshold]) + "\n")
        return source_result_map
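# Hedged usage sketch (added for illustration; not part of the original
# source). It drives the service-style variant defined directly above;
# "train_folder" and "test_folder" are placeholder names and are assumed to
# exist under data/datasets with the "data"/"model" layout expected by
# read_data_sources.
def _example_run_service_variant():
    labeler = SemanticLabeler()
    labeler.reset()                                   # drop any stale elasticsearch indices
    labeler.read_data_sources(["train_folder", "test_folder"])
    labeler.train_semantic_types(["train_folder"])    # index the training sources
    labeler.train_random_forest([2], ["train_folder"])
    result = labeler.predict_folder_semantic_types("test_folder")
    print(result["running_time"], len(result["predictions"]))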
# SemanticLabeler variant (Python 3) that reads dataset folders laid out as
# "tables"/"models" and adds basic error handling around indexing and
# prediction.
class SemanticLabeler:
    logger = get_logger("SemanticLabeler", level=logging.DEBUG)

    def __init__(self):
        self.dataset_map = {}
        self.file_class_map = {}
        self.random_forest = None

    def preprocess_memex_data_sources(self, folder_path):
        source_map = OrderedDict()
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            print(file_path)
            with open(file_path, "r") as f:
                for json_line in f.readlines():
                    json_obj = json.loads(json_line)
                    source_name = json_obj["tld"]
                    if source_name not in source_map:
                        source_map[source_name] = Source(source_name)
                    source = source_map[source_name]
                    for attr in json_obj:
                        if attr.startswith("inferlink"):
                            attr_name = attr.split("_")[1]
                            if attr_name not in source.column_map:
                                source.column_map[attr_name] = Column(attr_name, source.name)
                                source.column_map[attr_name].semantic_type = attr_name
                            for ele1 in json_obj[attr]:
                                if isinstance(ele1["result"], dict):
                                    source.column_map[attr_name].add_value(ele1["result"]["value"])
                                else:
                                    for ele2 in ele1["result"]:
                                        source.column_map[attr_name].add_value(ele2["value"])
        for source in source_map.values():
            if source.column_map:
                source.write_csv_file("data/datasets/memex/%s" % source.name)

    def read_data_sources(self, folder_paths):
        semantic_type_set = set()
        attr_count = 0
        for folder_name in folder_paths:
            self.logger.debug("Read dataset: %s", folder_name)
            folder_path = "data/datasets/%s" % folder_name
            source_map = OrderedDict()
            data_folder_path = os.path.join(folder_path, "tables")
            model_folder_path = os.path.join(folder_path, "models")
            for filename in sorted(os.listdir(data_folder_path)):
                extension = os.path.splitext(filename)[1]
                if ".DS" in filename:
                    continue
                self.logger.debug(" -> read: %s", filename)
                source = Source(os.path.splitext(filename)[0])
                file_path = os.path.join(data_folder_path, filename)
                if "full" in data_folder_path:
                    source.read_data_from_wc_csv(file_path)
                elif extension == ".csv":
                    source.read_data_from_csv(file_path)
                elif extension == ".json":
                    source.read_data_from_json(file_path)
                elif extension == ".xml":
                    source.read_data_from_xml(file_path)
                else:
                    source.read_data_from_text_file(file_path)
                source_map[filename] = source

                if 'rowNumber' in source.column_map:
                    del source.column_map['rowNumber']

                # NOTE: BINH delete empty columns here!!!, blindly follows the code in indexer:36
                for key in list(source.column_map.keys()):
                    column = source.column_map[key]
                    if column.semantic_type:
                        if len(column.value_list) == 0:
                            del source.column_map[key]
                            source.empty_val_columns[key] = column
                            logging.warning(
                                "Indexer: IGNORE COLUMN `%s` in source `%s` because of empty values",
                                column.name, source.name)

                for column in source.column_map.values():
                    semantic_type_set.add(column.semantic_type)
                attr_count += len(source.column_map.values())

            if os.path.exists(model_folder_path):
                for filename in os.listdir(model_folder_path):
                    if ".DS" in filename:
                        continue
                    try:
                        source = source_map[os.path.splitext(os.path.splitext(filename)[0])[0]]
                    except:
                        source = source_map[filename]
                    extension = os.path.splitext(filename)[1]
                    if extension == ".json":
                        source.read_semantic_type_json(os.path.join(model_folder_path, filename))
                    else:
                        print(source)
                        source.read_semantic_type_from_gold(os.path.join(model_folder_path, filename))
            self.dataset_map[folder_name] = source_map
        # print semantic_type_set
        print(len(semantic_type_set))
        print(attr_count)

    def train_random_forest(self, train_sizes, data_sets):
        self.random_forest = MyRandomForest(data_sets, self.dataset_map, "model/lr_all.pkl")
        self.random_forest.train(train_sizes)

    def train_semantic_types(self, dataset_list):
        print("train_semantic_types")
        for name in dataset_list:
            self.logger.debug("Indexing dataset %s", name)
            index_config = {'name': re.sub(not_allowed_chars, "!", name)}
            indexer.init_analyzers(index_config)
            source_map = self.dataset_map[name]
            for idx, key in enumerate(source_map.keys()):
                source = source_map[key]
                print("Index ", key)
                successful = source.save(index_config={'name': re.sub(not_allowed_chars, "!", name)})
                if not successful:
                    self.logger.info("Error while parsing file %s", key)
                    print("Error while parsing file.")
                self.logger.debug(" + finish index source: %s", key)

    def predict_semantic_type_for_column(self, column):
        train_examples_map = searcher.search_types_data("index_name", [])
        textual_train_map = searcher.search_similar_text_data("index_name", column.value_text, [])
        return column.predict_type(train_examples_map, textual_train_map, self.random_forest)

    def test_semantic_types(self, data_set, test_sizes):
        print("test_semantic_types")
        rank_score_map = defaultdict(lambda: defaultdict(lambda: 0))
        count_map = defaultdict(lambda: defaultdict(lambda: 0))
        index_config = {'name': data_set}
        source_map = self.dataset_map[data_set]
        # list() is needed on Python 3, where dict views cannot be repeated with `*`.
        double_name_list = list(source_map.values()) * 2
        file_write.write("Dataset: " + data_set + "\n")
        for size in test_sizes:
            start_time = time.time()
            for idx, source_name in enumerate(source_map.keys()):
                train_names = [source.index_name for source in double_name_list[idx + 1:idx + size + 1]]
                train_examples_map = searcher.search_types_data(index_config, train_names)
                source = source_map[source_name]
                for column in source.column_map.values():
                    if column.semantic_type:
                        textual_train_map = searcher.search_similar_text_data(
                            index_config, column.value_text, train_names)
                        semantic_types = column.predict_type(
                            train_examples_map, textual_train_map, self.random_forest)
                        for threshold in [0.0]:
                            found = False
                            rank = 1
                            rank_score = 0
                            for prediction in semantic_types[:1]:
                                if column.semantic_type in prediction[1]:
                                    if prediction[0] > threshold and prediction[0] != 0:
                                        rank_score = 1.0 / rank
                                        found = True
                                        break
                                if prediction[0] != 0:
                                    rank += len(prediction[1])
                            if not found and semantic_types[0][0] < threshold:
                                rank_score = 1
                            # file_write.write(
                            #     column.name + "\t" + column.semantic_type + "\t" + str(semantic_types) + "\n")
                            file_write.write(str(rank_score) + "\n")
                            rank_score_map[size][threshold] += rank_score
                            count_map[size][threshold] += 1
            running_time = time.time() - start_time
            for threshold in [0.0]:
                file_write.write("Size: " + str(size) + " F-measure: " +
                                 str(rank_score_map[size][threshold] * 1.0 / count_map[size][threshold]) +
                                 " Time: " + str(running_time) + " Count: " +
                                 str(count_map[size][threshold]) + "\n")

    def read_class_type_from_csv(self, file_path):
        self.file_class_map = {}
        with open(file_path, "r") as f:
            csv_reader = csv.reader(f)
            for row in csv_reader:
                self.file_class_map[row[0].replace(".tar.gz", ".csv")] = row[1]

    def test_semantic_types_from_2_sets(self, train_set, test_set):
        # self.read_class_type_from_csv("data/datasets/%s/classes.csv" % test_set)
        # print self.file_class_map.keys()
        rank_score_map = defaultdict(lambda: 0)
        count_map = defaultdict(lambda: 0)
        source_result_map = {}
        train_index_config = {'name': train_set}
        train_names = [source.index_name for source in self.dataset_map[train_set].values()]
        self.logger.info("Train source: %s", train_names)
        valid = True
        for idx, source_name in enumerate(self.dataset_map[test_set]):
            # if source_name not in self.file_class_map:
            #     continue
            train_examples_map = searcher.search_types_data(train_index_config, train_names)
            source = self.dataset_map[test_set][source_name]
            self.logger.info("Test source: %s", source_name)
            column_result_map = {}
            for column in source.column_map.values():
                # if not column.semantic_type or not column.value_list or "ontology" not in column.semantic_type:
                #     continue
                if not column.semantic_type or not column.value_list:
                    continue
                textual_train_map = searcher.search_similar_text_data(
                    train_index_config, column.value_text, train_names)
                # self.logger.info(textual_train_map)
                try:
                    semantic_types = column.predict_type(
                        train_examples_map, textual_train_map, self.random_forest)
                except KeyError:
                    print("KEY ERROR")
                    valid = False
                    break
                # if(not semantic_types):
                #     self.logger.info("Could not do " + column.name)
                #     continue
                column_result_map[column.name] = semantic_types
                self.logger.info(" -> column: %s", column.name)
                file_write.write(column.name + "\t" + column.semantic_type + "\t" + str(semantic_types) + "\n")
                for threshold in [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]:
                    found = False
                    rank = 1
                    rank_score = 0
                    for prediction in semantic_types[:1]:
                        if column.semantic_type in prediction[1]:
                            if prediction[0] > threshold and prediction[0] != 0:
                                rank_score = 1.0 / rank
                                found = True
                                break
                        if prediction[0] != 0:
                            rank += len(prediction[1])
                    if not found and semantic_types[0][0] < threshold:
                        rank_score = 1
                    file_write.write(str(rank_score) + "\n")
                    rank_score_map[threshold] += rank_score
                    count_map[threshold] += 1
            source_result_map[source_name] = column_result_map
        # for threshold in [0.0, 0.1, 0.15, 0.2, 0.25, 0.5]:
        #     file_write.write(
        #         " MRR: " + str(
        #             rank_score_map[threshold] * 1.0 / count_map[threshold]) + " Count: " + str(
        #             count_map[threshold]) + " threshold=" + str(threshold) + "\n")
        return source_result_map

    def write_data_for_transform(self, name):
        for source_name, source in self.dataset_map[name].items():
            for attribute in source.column_map.values():
                attribute.write_to_data_file()
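# Hedged usage sketch (added for illustration; not part of the original
# source). It exercises the cross-dataset evaluation of the variant defined
# directly above; "museum" and "weather" are placeholder dataset names and
# are assumed to exist under data/datasets with the "tables"/"models" layout,
# and the training set is indexed before test_semantic_types_from_2_sets
# searches it.
def _example_cross_dataset_evaluation():
    labeler = SemanticLabeler()
    labeler.read_data_sources(["museum", "weather"])
    labeler.train_semantic_types(["museum"])
    labeler.train_random_forest([2], ["museum"])
    results = labeler.test_semantic_types_from_2_sets("museum", "weather")
    for source_name, column_result_map in results.items():
        print(source_name, sorted(column_result_map.keys()))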