def get_vectors(self):
    """
    Load the embedding vector of each candidate qnode, either from a local
    TSV file or from an embedding web service.
    """
    qnodes = set(self.loaded_file.loc[:, self.input_column_name])
    embedding_file = self.kwargs['embedding_file']
    url = self.kwargs['embedding_url']
    if embedding_file:
        with open(embedding_file, 'rt') as fd:
            for line in fd:
                fields = line.strip().split('\t')
                qnode = fields[0]
                if qnode in qnodes:
                    self.vectors_map[qnode] = np.asarray(
                        list(map(float, fields[1:])))
    elif url:
        found_one = False
        for i, qnode in enumerate(qnodes):
            # Use str, because of missing values (nan)
            response = requests.get(url + str(qnode))
            if response.status_code == 200:
                result = response.json()
                if result['found']:
                    found_one = True
                    self.vectors_map[qnode] = np.asarray(
                        list(map(float,
                                 result['_source']['embedding'].split())))
            # bail out early if the first 100 lookups all missed
            if i > 100 and not found_one:
                raise TLException(
                    f'Failing to find vectors: {url} {qnode}')
        if not found_one:
            raise TLException(f'Failed to find any vectors: {url} {qnode}')
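# Illustrative sketch (not part of the original module) of the two input
# shapes get_vectors() consumes; the qnode id and numbers are made up.
#
# TSV file: one qnode per line, id first, then the vector components:
#
#     Q42<TAB>0.12<TAB>-0.5<TAB>0.33
#
# URL endpoint: GET url + qnode returns JSON of the form
# {"found": true, "_source": {"embedding": "0.12 -0.5 0.33"}}.
import numpy as np

line = "Q42\t0.12\t-0.5\t0.33"
fields = line.strip().split('\t')
vector = np.asarray(list(map(float, fields[1:])))
assert fields[0] == "Q42" and vector.shape == (3,)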
def process_vectors(self):
    """
    Apply the corresponding vector strategy to process the calculated vectors.
    :return:
    """
    vector_strategy = self.kwargs.get("column_vector_strategy",
                                      "centroid-of-singletons")
    if vector_strategy == "centroid-of-singletons":
        if not self._centroid_of_singletons():
            raise TLException(
                f'column_vector_strategy {vector_strategy} failed')
    else:
        raise TLException(
            f'Unknown column_vector_strategy {vector_strategy}')
def _get_centroid(self, vector_strategy: str):
    """
    Calculate the column-vector (centroid) value for each column group.
    """
    n_value = int(self.kwargs.pop("n_value"))

    if vector_strategy == "ground-truth":
        if "GT_kg_id" not in self.loaded_file:
            raise TLException(
                "The input file does not have `GT_kg_id` column! "
                "Can't run with ground-truth strategy")
        candidate_nodes = list(set(self.loaded_file["GT_kg_id"].tolist()))
    elif vector_strategy == "exact-matches":
        candidate_nodes = list(set(self.loaded_file["kg_id"].tolist()))
    else:
        raise TLException(
            "Unknown vector strategy {}".format(vector_strategy))

    candidate_nodes = [
        each for each in candidate_nodes
        if each != "" and each is not np.nan
    ]

    # get the corresponding column group of each candidate node
    nodes_map = defaultdict(set)
    for each_node in candidate_nodes:
        for group, nodes in self.groups.items():
            if each_node in nodes:
                nodes_map[group].add(each_node)

    # randomly sample nodes if needed
    nodes_map_updated = {}
    for group, nodes in nodes_map.items():
        if n_value != 0 and n_value < len(nodes):
            # sample from a sorted list: random.sample on a set is
            # deprecated since Python 3.9
            nodes_map_updated[group] = random.sample(sorted(nodes), n_value)
        else:
            nodes_map_updated[group] = nodes

    # get the centroid of each column group
    for group, nodes in nodes_map_updated.items():
        temp = []
        for each_node in sorted(list(nodes)):
            temp.append(self.vectors_map[each_node])
        each_centroid = np.mean(np.array(temp), axis=0)
        self.centroid[group] = each_centroid
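# Worked sketch (the values are made up): the centroid of a column group is
# the element-wise mean of its member vectors, exactly what
# np.mean(..., axis=0) computes in _get_centroid above.
import numpy as np

vectors = np.array([[1.0, 2.0],
                    [3.0, 4.0]])
centroid = np.mean(vectors, axis=0)
assert (centroid == np.array([2.0, 3.0])).all()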
def run(**kwargs):
    try:
        from tl.features.external_embedding import EmbeddingVector
        vector_transformer = EmbeddingVector(kwargs)
        vector_transformer.get_vectors()
        vector_transformer.process_vectors()
        vector_transformer.add_score_column()
        vector_transformer.print_output()
    except Exception:
        message = 'Command: score-using-embedding\n'
        message += 'Error Message: {}\n'.format(traceback.format_exc())
        raise TLException(message)
def compute_distance(self, v1: np.ndarray, v2: np.ndarray):
    if self.kwargs["distance_function"] == "cosine":
        # cosine() returns the distance, so 1 - distance is the similarity
        val = 1 - cosine(v1, v2)
    elif self.kwargs["distance_function"] == "euclidean":
        val = euclidean(v1, v2)
        # because we need higher scores to be better, use the reciprocal value
        if val == 0:
            val = float("inf")
        else:
            val = 1 / val
    else:
        raise TLException("Unknown distance function {}".format(
            self.kwargs["distance_function"]))
    return val
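# Worked sketch of the two scoring conventions in compute_distance (the
# vectors are made up): cosine scores are already "higher is better" after
# 1 - distance, while euclidean distances are inverted so that closer
# vectors score higher (identical vectors score inf).
import numpy as np
from scipy.spatial.distance import cosine, euclidean

v1 = np.array([1.0, 0.0])
v2 = np.array([1.0, 1.0])
cosine_score = 1 - cosine(v1, v2)        # ~0.707
euclidean_score = 1 / euclidean(v1, v2)  # 1.0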
def run(**kwargs):
    try:
        import pandas as pd
        from tl.features.feature_voting import feature_voting

        input_file_path = kwargs.pop("input_file")
        input_column_names = kwargs.pop("input_column_names")
        df = pd.read_csv(input_file_path)
        feature_col_names = input_column_names.split(',')
        odf = feature_voting(feature_col_names, df)
        odf.to_csv(sys.stdout, index=False)
    except Exception:
        message = 'Command: feature-voting\n'
        message += 'Error Message: {}\n'.format(traceback.format_exc())
        raise TLException(message)
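# Hypothetical usage sketch (the file name and feature columns below are
# assumptions, not from the original code): the command reads a candidate
# CSV and votes over a comma-separated list of feature columns, writing the
# result to stdout.
#
#     run(input_file="candidates.csv",
#         input_column_names="pagerank,smallest_qnode_number")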
def run_one_pipeline(config: dict, timeout=3600):
    """
    Main running subprocess for one pipeline
    """
    running_option = ""
    # set up the specific GPU usage if given
    if config.get("gpu_id"):
        running_option += "export CUDA_VISIBLE_DEVICES={}\n".format(
            config["gpu_id"])

    input_file = config["input"]
    update_part_name = input_file.split("/")[-1].replace(".csv", "")

    if not config["command"].startswith("tl"):
        running_option += "tl "
    cli_file_path = os.path.abspath(__file__).replace(
        "utility/utility.py", "cli")
    all_commands = set([each.replace(".py", "")
                        for each in os.listdir(cli_file_path)
                        if each.endswith(".py") and not each.startswith("__")])
    # splice the input file path in right after the first tl sub-command
    re_match = re.compile("(" + "|".join(all_commands) + "){1}")
    file_insert_pos = re_match.search(config["command"])
    config["command"] = config["command"][:file_insert_pos.end()] + " " + \
        input_file + " " + config["command"][file_insert_pos.end():]

    # main running table linker function
    running_option += config["command"].replace("{}", update_part_name)
    if timeout:
        res = timeout_call(timeout, Utility.execute_shell_code,
                           [running_option, config["debug"]])
    else:
        res = Utility.execute_shell_code(running_option,
                                         debug=config["debug"])
    if res is None:
        raise TLException(
            "Timeout on {} seconds when running pipeline on {}!".format(
                timeout, update_part_name))
    if res == "":
        raise TLException(
            "Executing Error when running pipeline on {}!".format(
                update_part_name))

    res_io = StringIO(res)
    output_file = pd.read_csv(res_io, dtype=object)

    # add ground truth if ground truth given
    if "GT_kg_id" not in res and config.get(
            "ground_truth_directory") != "":
        name = config.get("ground_truth_pattern").replace(
            "{}", update_part_name)
        gt_file_path = os.path.join(config.get("ground_truth_directory"),
                                    name)
        output_file = evaluation.ground_truth_labeler(gt_file_path,
                                                      df=output_file)

    # if output folder given, write the output of each pipeline
    if config.get("output_folder") != "":
        output_name = config.get("output_name")
        name = output_name.replace("{}", update_part_name)
        output_path = os.path.join(config.get("output_folder"), name)
        output_file.to_csv(output_path, index=False)

    # evaluate the prediction if we can
    if "GT_kg_id" in res:
        evaluation_res = evaluation.metrics(column=config["score_column"],
                                            df=output_file)
    else:
        evaluation_res = pd.DataFrame()
    return evaluation_res
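# Illustrative sketch of the command rewriting in run_one_pipeline (the
# command string and file names are made up): the input file path is spliced
# in right after the first recognized tl sub-command, and every "{}" in the
# pipeline is replaced by the input file's base name.
import re

all_commands = {"clean", "get-exact-matches"}
command = "clean -c label / get-exact-matches -c label_clean > {}.csv"
input_file = "input/table_01.csv"
update_part_name = "table_01"

pos = re.compile("(" + "|".join(all_commands) + "){1}").search(command).end()
command = command[:pos] + " " + input_file + " " + command[pos:]
command = command.replace("{}", update_part_name)
# -> "clean input/table_01.csv  -c label / ... > table_01.csv"
# (the doubled space from the splice is harmless in the shell)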
def run(**kwargs):
    if len(kwargs.get("pipeline")) == 0:
        raise TLException("pipeline command must be given.")
    parallel_count = int(kwargs['parallel_count'])
    input_files = kwargs["input"]
    running_configs = []
    gpu_resources = kwargs.get("gpu_resources")

    # set up the running config for each input file
    pipeline_cleaned = kwargs['pipeline']
    for i, each in enumerate(input_files):
        each_config = {
            "input": each,
            "command": pipeline_cleaned,
            "output_folder": kwargs.get("output_folder"),
            "output_name": kwargs.get("output_name"),
            "ground_truth_pattern": kwargs.get("ground_truth_pattern"),
            "ground_truth_directory": kwargs.get("ground_truth_directory", ""),
            "score_column": kwargs.get("score_column"),
            "debug": kwargs.get("debug", False)
        }
        # assign GPUs round-robin if any were given
        if len(gpu_resources) > 0:
            each_config["gpu_id"] = gpu_resources[i % len(gpu_resources)]
        else:
            each_config["gpu_id"] = None
        running_configs.append(each_config)

    # start running
    try:
        from multiprocessing import Pool
        from tqdm import tqdm
        import time
        import pandas as pd
        from io import StringIO
        from tl.utility.run_pipelines_utility import PipelineUtility

        if parallel_count == 1:
            results = []
            for each in tqdm(running_configs):
                results.append(PipelineUtility.run_one_pipeline(each))
        else:
            from multiprocessing import set_start_method
            set_start_method("spawn")
            # use a multiprocessing pool to run in parallel mode
            p = Pool(parallel_count)
            result = p.map_async(PipelineUtility.run_one_pipeline,
                                 running_configs)
            pbar = tqdm(total=len(running_configs))
            previous_remain = len(running_configs)
            while not result.ready():
                # _number_left is a private attribute of AsyncResult,
                # used here only to drive the progress bar
                remain_job = result._number_left
                if remain_job != previous_remain:
                    pbar.update(previous_remain - remain_job)
                    previous_remain = remain_job
                time.sleep(2)
            pbar.close()
            results = result.get()
            p.close()
            p.join()

        PipelineUtility.print_pipeline_running_results(
            results, omit_header=kwargs['omit_headers'],
            input_files=input_files, tag=kwargs.get('tag'))
    except Exception:
        message = 'Command: run-pipeline\n'
        message += 'Error Message: {}\n'.format(traceback.format_exc())
        raise TLException(message)
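# Minimal sketch of the round-robin GPU assignment used above (the ids and
# file names are made up): each input file is pinned to one of the given
# GPUs in turn via CUDA_VISIBLE_DEVICES.
gpu_resources = ["0", "1"]
input_files = ["a.csv", "b.csv", "c.csv"]
assigned = [gpu_resources[i % len(gpu_resources)]
            for i, _ in enumerate(input_files)]
assert assigned == ["0", "1", "0"]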