def get_vectors(self):
     qnodes = set(self.loaded_file.loc[:, self.input_column_name])
     embedding_file = self.kwargs['embedding_file']
     url = self.kwargs['embedding_url']
     if embedding_file:
         with open(embedding_file, 'rt') as fd:
             for line in fd:
                 fields = line.strip().split('\t')
                 qnode = fields[0]
                 if qnode in qnodes:
                     self.vectors_map[qnode] = np.asarray(
                         list(map(float, fields[1:])))
     elif url:
         found_one = False
         for i, qnode in enumerate(qnodes):
             # Use str, because of missing values (nan)
             response = requests.get(url + str(qnode))
             if response.status_code == 200:
                 result = response.json()
                 if result['found']:
                     found_one = True
                     self.vectors_map[qnode] = np.asarray(
                         list(
                             map(float,
                                 result['_source']['embedding'].split())))
             if i > 100 and not found_one:
                 raise TLException(
                     f'Failed to find any vectors in the first {i} qnodes: {url} {qnode}')
         if not found_one:
             raise TLException(f'Failed to find any vectors: {url} {qnode}')
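The parsing above implies two input shapes: a tab-separated embedding file whose first field is the qnode followed by the vector components, or an HTTP endpoint that returns a JSON document with a found flag and a space-separated embedding under _source.embedding. A minimal sketch of both, with made-up qnodes and values:

# Illustrative only; the qnodes and vector values below are made up.
# Tab-separated embedding file, one "<qnode>\t<component>\t<component>..." record per line:
sample_tsv_lines = [
    "Q42\t0.12\t-0.03\t0.77",
    "Q90\t0.05\t0.61\t-0.14",
]

# JSON body expected from embedding_url + qnode, as implied by the parser above:
sample_response = {
    "found": True,
    "_source": {"embedding": "0.12 -0.03 0.77"},
}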
 def process_vectors(self):
     """
     apply corresponding vector strategy to process the calculated vectors
     :return:
     """
     vector_strategy = self.kwargs.get("column_vector_strategy",
                                       "centroid-of-singletons")
     if vector_strategy == "centroid-of-singletons":
         if not self._centroid_of_singletons():
             raise TLException(
                 f'column_vector_strategy {vector_strategy} failed')
     else:
         raise TLException(f'Unknown column_vector_strategy {vector_strategy}')
    def _get_centroid(self, vector_strategy: str):
        """
            function used to calculate the column-vector(centroid) value
        """
        n_value = int(self.kwargs.pop("n_value"))

        if vector_strategy == "ground-truth":
            if "GT_kg_id" not in self.loaded_file:
                raise TLException(
                    "The input file does not have `GT_kg_id` column! Can't run with ground-truth "
                    "strategy")
            candidate_nodes = list(set(self.loaded_file["GT_kg_id"].tolist()))
        elif vector_strategy == "exact-matches":
            candidate_nodes = list(set(self.loaded_file["kg_id"].tolist()))
        else:
            raise TLException(
                "Unknown vector vector strategy {}".format(vector_strategy))
        candidate_nodes = [
            each for each in candidate_nodes
            if each != "" and each is not np.nan
        ]

        # map each candidate node to the column group(s) it belongs to
        nodes_map = defaultdict(set)
        for each_node in candidate_nodes:
            for group, nodes in self.groups.items():
                if each_node in nodes:
                    nodes_map[group].add(each_node)

        # randomly sample nodes if needed
        nodes_map_updated = {}

        for group, nodes in nodes_map.items():
            if n_value != 0 and n_value < len(nodes):
                nodes_map_updated[group] = random.sample(list(nodes), n_value)
            else:
                nodes_map_updated[group] = nodes

        # get centroid for each column
        for group, nodes in nodes_map_updated.items():
            temp = []
            for each_node in sorted(list(nodes)):
                temp.append(self.vectors_map[each_node])
            each_centroid = np.mean(np.array(temp), axis=0)
            self.centroid[group] = each_centroid
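As a toy illustration of the centroid step above, the column vector for a group is simply the element-wise mean of its (possibly sampled) candidate vectors:

import numpy as np

# Toy vectors only; the real values come from self.vectors_map.
vectors = [np.array([1.0, 0.0]), np.array([0.0, 1.0]), np.array([1.0, 1.0])]
centroid = np.mean(np.array(vectors), axis=0)  # array([0.66666667, 0.66666667])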
def run(**kwargs):
    try:
        from tl.features.external_embedding import EmbeddingVector
        vector_transformer = EmbeddingVector(kwargs)
        vector_transformer.get_vectors()
        vector_transformer.process_vectors()
        vector_transformer.add_score_column()
        vector_transformer.print_output()
    except Exception:
        message = 'Command: score-using-embedding\n'
        message += 'Error Message:  {}\n'.format(traceback.format_exc())
        raise TLException(message)
Example #5
 def compute_distance(self, v1: np.ndarray, v2: np.ndarray):
     if self.kwargs["distance_function"] == "cosine":
         # scipy's cosine() is a distance, so 1 - distance yields similarity (higher is better)
         val = 1 - cosine(v1, v2)
     elif self.kwargs["distance_function"] == "euclidean":
         val = euclidean(v1, v2)
         # a higher score should be better, so use the reciprocal of the euclidean distance
         if val == 0:
             val = float("inf")
         else:
             val = 1 / val
     else:
         raise TLException("Unknown distance function {}".format(
             self.kwargs["distance_function"]))
     return val
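A small usage sketch of the scoring convention in compute_distance, assuming cosine and euclidean come from scipy.spatial.distance (the import is not shown in this snippet): both branches yield a score where higher means a better match.

import numpy as np
from scipy.spatial.distance import cosine, euclidean

# Toy vectors; values are made up.
v1, v2 = np.array([1.0, 0.0]), np.array([1.0, 1.0])

cosine_score = 1 - cosine(v1, v2)                           # cosine similarity, ~0.707
dist = euclidean(v1, v2)                                    # 1.0
euclidean_score = float("inf") if dist == 0 else 1 / dist   # reciprocal distance, 1.0 here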
def run(**kwargs):
    try:
        import pandas as pd
        from tl.features.feature_voting import feature_voting
        input_file_path = kwargs.pop("input_file")
        input_column_names = kwargs.pop("input_column_names")
        df = pd.read_csv(input_file_path)
        feature_col_names = input_column_names.split(',')

        odf = feature_voting(feature_col_names, df)

        odf.to_csv(sys.stdout, index=False)

    except Exception:
        message = 'Command: feature-voting\n'
        message += 'Error Message:  {}\n'.format(traceback.format_exc())
        raise TLException(message)
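A hypothetical invocation of the feature-voting command above; the keyword names match the kwargs popped in run, while the file name and feature columns are placeholders:

# Hypothetical example; the file and column names are placeholders.
run(input_file="candidates.csv",
    input_column_names="pagerank,string-similarity")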
Example #7
    def run_one_pipeline(config: dict, timeout=3600):
        """
            Main running subprocess for one pipeline
        """
        running_option = ""
        # setup the specific gpu usage if given
        if config.get("gpu_id"):
            running_option += "export CUDA_VISIBLE_DEVICES={}\n".format(
                config["gpu_id"])
        input_file = config["input"]
        update_part_name = input_file.split("/")[-1].replace(".csv", "")

        if not config["command"].startswith("tl"):
            running_option += "tl "

        cli_file_path = os.path.abspath(__file__).replace(
            "utility/utility.py", "cli")
        all_commands = {
            each.replace(".py", "")
            for each in os.listdir(cli_file_path)
            if each.endswith(".py") and not each.startswith("__")
        }

        re_match = re.compile("(" + "|".join(all_commands) + "){1}")
        file_insert_pos = re_match.search(config["command"])
        config["command"] = config["command"][:file_insert_pos.end()] + " " + \
                            input_file + " " + config["command"][file_insert_pos.end():]
        # the main table-linker command to run
        running_option += config["command"].replace("{}", update_part_name)

        if timeout:
            res = timeout_call(timeout, Utility.execute_shell_code,
                               [running_option, config["debug"]])
        else:
            res = Utility.execute_shell_code(running_option,
                                             debug=config["debug"])

        if res is None:
            raise TLException(
                "Timeout on {} seconds when running pipeline on {}!".format(
                    timeout, update_part_name))
        if res == "":
            raise TLException(
                "Executing Error when running pipeline on {}!".format(
                    update_part_name))

        res_io = StringIO(res)
        output_file = pd.read_csv(res_io, dtype=object)

        # add ground truth if ground truth given
        if "GT_kg_id" not in res and config.get(
                "ground_truth_directory") != "":
            name = config.get("ground_truth_pattern").replace(
                "{}", update_part_name)
            gt_file_path = os.path.join(config.get("ground_truth_directory"),
                                        name)
            output_file = evaluation.ground_truth_labeler(gt_file_path,
                                                          df=output_file)

        # if output folder given, write the output of each pipeline
        if config.get("output_folder") != "":
            output_name = config.get("output_name")
            name = output_name.replace("{}", update_part_name)
            output_path = os.path.join(config.get("output_folder"), name)
            output_file.to_csv(output_path, index=False)

        # evaluate the prediction if we can
        if "GT_kg_id" in res:
            evaluation_res = evaluation.metrics(column=config["score_column"],
                                                df=output_file)
        else:
            evaluation_res = pd.DataFrame()
        return evaluation_res
Example #8
def run(**kwargs):
    if not kwargs.get("pipeline"):
        raise TLException("pipeline command must be given.")

    parallel_count = int(kwargs['parallel_count'])
    input_files = kwargs["input"]
    running_configs = []
    gpu_resources = kwargs.get("gpu_resources") or []

    # setup the running config
    pipeline_cleaned = kwargs['pipeline']
    for i, each in enumerate(input_files):
        each_config = {
            "input": each,
            "command": pipeline_cleaned,
            "output_folder": kwargs.get("output_folder"),
            "output_name": kwargs.get("output_name"),
            "ground_truth_pattern": kwargs.get("ground_truth_pattern"),
            "ground_truth_directory": kwargs.get("ground_truth_directory", ""),
            "score_column": kwargs.get("score_column"),
            "debug": kwargs.get("debug", False)
        }
        # assign gpus in round-robin fashion if gpu resources were given
        if len(gpu_resources) > 0:
            each_config["gpu_id"] = gpu_resources[i % len(gpu_resources)]
        else:
            each_config["gpu_id"] = None
        running_configs.append(each_config)

    # start running
    try:
        from multiprocessing import Pool
        from tqdm import tqdm
        import time
        import pandas as pd
        from io import StringIO
        from tl.utility.run_pipelines_utility import PipelineUtility
        if parallel_count == 1:
            results = []
            for each in tqdm(running_configs):
                results.append(PipelineUtility.run_one_pipeline(each))
        else:
            from multiprocessing import set_start_method
            set_start_method("spawn")

            # use multiprocess pool function to run in parallel mode
            p = Pool(parallel_count)
            result = p.map_async(PipelineUtility.run_one_pipeline,
                                 running_configs)
            pbar = tqdm(total=len(running_configs))
            previous_remain = len(running_configs)
            while not result.ready():
                remain_job = result._number_left
                if remain_job != previous_remain:
                    pbar.update(previous_remain - remain_job)
                    previous_remain = remain_job
                time.sleep(2)
            pbar.close()
            results = result.get()
            p.close()
            p.join()

        PipelineUtility.print_pipeline_running_results(
            results,
            omit_header=kwargs['omit_headers'],
            input_files=input_files,
            tag=kwargs.get('tag'))
    except Exception:
        message = 'Command: run-pipeline\n'
        message += 'Error Message:  {}\n'.format(traceback.format_exc())
        raise TLException(message)
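A hypothetical configuration for the run-pipeline command above; the key names mirror the kwargs read in run, while the pipeline string, paths, and tag are placeholders:

# Hypothetical values throughout; only the key names come from the code above.
run(
    pipeline="clean -c label / get-exact-matches -c label_clean",  # placeholder pipeline
    parallel_count=1,
    input=["tables/table_1.csv", "tables/table_2.csv"],
    gpu_resources=[],
    output_folder="",
    output_name="{}_scored.csv",
    ground_truth_pattern="{}_gt.csv",
    ground_truth_directory="",
    score_column="embedding-score",
    debug=False,
    omit_headers=False,
    tag="experiment-1",
)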