Example #1
    def save_remove_n_edges(edges: pandas.DataFrame, n):
        """
        removes n edges from 'edges' so that no node is removed in the process, i.e. the total number
        of nodes in 'edges' stays the same
        :param edges: original edges
        :param n: number of how many edges should be removed
        :return:
        """
        if n < 1:
            return edges
        # collect every node id from both endpoint columns and count its occurrences (its degree)
        all_edges = SortedList(list(edges[globConst.NODE1_ID_COL_NAME].append(edges[globConst.NODE2_ID_COL_NAME])))
        edges_list = set(all_edges)
        edges_count_dict = {x: all_edges.count(x) for x in edges_list}
        i = 0

        for _ in range(1000):
            drop_indices_candidates = random.sample(edges.index.values.tolist(), n)
            drop_indices = []
            tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
            for drop_index in tqdm(drop_indices_candidates, file=tqdmbuffer):
                if i == n:
                    break
                drop_edge_candidate = edges.loc[drop_index]
                if (
                    edges_count_dict[drop_edge_candidate[globConst.NODE1_ID_COL_NAME]] > 1
                    and edges_count_dict[drop_edge_candidate[globConst.NODE2_ID_COL_NAME]] > 1
                ):
                    drop_indices.append(drop_index)
                    # keep the degree counts up to date so that a node cannot lose all of its
                    # edges when several of its edges are drawn as candidates
                    edges_count_dict[drop_edge_candidate[globConst.NODE1_ID_COL_NAME]] -= 1
                    edges_count_dict[drop_edge_candidate[globConst.NODE2_ID_COL_NAME]] -= 1
                    i += 1
            edges.drop(inplace=True, index=drop_indices)
            if i == n:
                break
        edges.reset_index(drop=True, inplace=True)
        return edges
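For readers who want to try the node-preserving removal outside the project, here is a minimal standalone sketch of the same idea. It uses plain column names ('node1', 'node2') and a Counter in place of the project's globConst constants and SortedList, and it is an illustration, not the project's implementation.

    # Standalone sketch: drop n edges while keeping every node's degree above zero.
    # 'node1'/'node2' stand in for globConst.NODE1_ID_COL_NAME / NODE2_ID_COL_NAME.
    import random
    from collections import Counter

    import pandas as pd

    def remove_n_edges_keep_nodes(edges: pd.DataFrame, n: int, seed: int = 0) -> pd.DataFrame:
        if n < 1:
            return edges
        rng = random.Random(seed)
        degree = Counter(edges["node1"]) + Counter(edges["node2"])  # node id -> degree
        drop = []
        for idx in rng.sample(list(edges.index), len(edges.index)):
            if len(drop) == n:
                break
            row = edges.loc[idx]
            if degree[row["node1"]] > 1 and degree[row["node2"]] > 1:
                drop.append(idx)
                degree[row["node1"]] -= 1  # keep counts current so no node drops to zero
                degree[row["node2"]] -= 1
        return edges.drop(index=drop).reset_index(drop=True)

    edges = pd.DataFrame({"node1": ["a", "a", "b", "c"], "node2": ["b", "c", "c", "d"]})
    print(remove_n_edges_keep_nodes(edges, 1))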
    def meta_edges_to_graph(self, edge_metadata_list, tn=None):
        edges_dic = {}
        nodes_dic = {}
        namespaces_set = set()
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        it = tqdm(edge_metadata_list,
                  file=tqdmbuffer,
                  desc="meta edges to graph")
        for d in it:
            it.write(f"Converting {d}")
            nodes1, nodes2, edges = self.create_nodes_and_edges(d, tn)
            if str(d.edgeType) in edges_dic:
                edges_dic[str(d.edgeType)].update(edges)
            else:
                edges_dic[str(d.edgeType)] = edges
            if str(d.node1_type) in nodes_dic:
                nodes_dic[str(d.node1_type)].update(nodes1)
            else:
                nodes_dic[str(d.node1_type)] = nodes1
            if str(d.node2_type) in nodes_dic:
                nodes_dic[str(d.node2_type)].update(nodes2)
            else:
                nodes_dic[str(d.node2_type)] = nodes2
            namespaces_set.update([str(node.namespace) for node in nodes1])
            namespaces_set.update([str(node.namespace) for node in nodes2])
        return nodes_dic, edges_dic, namespaces_set
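The repeated update-or-create bookkeeping in meta_edges_to_graph can be expressed more compactly with dict.setdefault. The snippet below is an illustrative sketch with made-up type keys, not project code.

    # Illustrative only: the same update-or-create bookkeeping with dict.setdefault.
    def merge_by_type(groups: dict, type_key: str, items: set) -> None:
        # create an empty set for unseen types, then merge the new items in place
        groups.setdefault(type_key, set()).update(items)

    nodes_dic = {}
    merge_by_type(nodes_dic, "GENE", {"g1", "g2"})
    merge_by_type(nodes_dic, "GENE", {"g3"})
    print(nodes_dic)  # {'GENE': {'g1', 'g2', 'g3'}}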
    def create_input_files(self):
        logging.info("## Start creating input files ##")
        skip = None
        for_all = False
        if not globalConfig.INTERACTIVE_MODE:
            skip = globalConfig.SKIP_EXISTING_FILES
            for_all = True
        if not os.path.exists(gcConst.IN_FILE_PATH):
            os.makedirs(gcConst.IN_FILE_PATH)
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        it = tqdm(self.file_readers, file=tqdmbuffer)
        for reader in it:
            if reader.readerType not in self.readerType_processor_map:
                it.write(f"There is no processor for the reader {reader.readerType}")
                continue
            it.write(f"Reading: {reader.__class__.__name__}")
            # check beforehand whether the read-in content has already been processed, as parsing can be time-consuming
            all_files_exist = all(
                os.path.isfile(
                    os.path.join(gcConst.IN_FILE_PATH, self.infileType_inMetadata_map[processor.infileType].csv_name)
                )
                for processor in self.readerType_processor_map[reader.readerType]
            )
            if all_files_exist and not for_all and self.readerType_processor_map[reader.readerType]:
                first_processor = self.readerType_processor_map[reader.readerType][0]
                first_processor_out_path = os.path.join(
                    gcConst.IN_FILE_PATH, (self.infileType_inMetadata_map[first_processor.infileType]).csv_name
                )
                if globConst.GUI_MODE:
                    from openbiolink.gui.gui import skipExistingFiles

                    skip, for_all = skipExistingFiles(first_processor_out_path)
                else:
                    skip, for_all = Cli.skip_existing_files(first_processor_out_path)

            if not skip or not all_files_exist:
                # execute processors
                in_data = reader.read_file()
                # fixme  ResourceWarning: Enable tracemalloc to get the object allocation traceback
                for processor in self.readerType_processor_map[reader.readerType]:
                    out_file_path = os.path.join(
                        gcConst.IN_FILE_PATH, (self.infileType_inMetadata_map[processor.infileType]).csv_name
                    )
                    if not for_all:
                        if globConst.GUI_MODE:
                            from openbiolink.gui.gui import skipExistingFiles

                            skip, for_all = skipExistingFiles(out_file_path)
                        else:
                            skip, for_all = Cli.skip_existing_files(out_file_path)
                    if not (skip and os.path.isfile(out_file_path)):
                        out_data = processor.process(in_data)
                        FileWriter.write_to_file(out_data, out_file_path)
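The "skip expensive parsing when every output file already exists" check used above boils down to an all(...) over os.path.isfile. A tiny standalone sketch, with a made-up directory and file names:

    # Illustrative sketch; directory and file names are made up.
    import os

    def all_outputs_exist(out_dir, file_names):
        return all(os.path.isfile(os.path.join(out_dir, name)) for name in file_names)

    if all_outputs_exist("input_files", ["edges.csv", "nodes.csv"]):
        print("all input files already present, parsing can be skipped")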
Example #4
    def download(url, o_file_path):
        opener = urllib.request.build_opener()
        opener.addheaders = [("User-agent", "Mozilla/5.0")]
        urllib.request.install_opener(opener)
        try:
            tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
            with tqdm(unit="B", unit_scale=True, file=tqdmbuffer) as t:
                reporthook = FileDownloader.download_progress_hook(t)
                urllib.request.urlretrieve(url, o_file_path, reporthook)
        except urllib.error.HTTPError as err:
            logging.error("HTTP %s %s: %s" % (err.code, err.msg, err.geturl()))
            sys.exit()
        except urllib.error.URLError as err:
            # URLError carries the failure cause in .reason (it has no .msg attribute)
            logging.error("URL Error: %s" % err.reason)
            sys.exit()
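FileDownloader.download_progress_hook is not shown in this example; a common way to build such a reporthook for urllib.request.urlretrieve with tqdm looks roughly like the sketch below. The hook signature (block_num, block_size, total_size) is urllib's; the URL and output file name are placeholders, and this is an assumption about the hook's shape rather than the project's exact code.

    # Sketch of a tqdm-based reporthook for urlretrieve (an assumption about what
    # FileDownloader.download_progress_hook does, not the project's code).
    import urllib.request
    from tqdm import tqdm

    def make_progress_hook(t: tqdm):
        last = [0]
        def hook(block_num, block_size, total_size):
            if total_size > 0:
                t.total = total_size
            downloaded = block_num * block_size
            t.update(downloaded - last[0])  # advance by the bytes received since the last call
            last[0] = downloaded
        return hook

    with tqdm(unit="B", unit_scale=True) as t:
        urllib.request.urlretrieve("https://example.org/", "example.html", make_progress_hook(t))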
    def download_db_files(
        self, skip_existing: bool = True,
    ):
        logging.info("## Start downloading files ##")
        directory = gcConst.O_FILE_PATH
        os.makedirs(directory, exist_ok=True)
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        it = tqdm(self.db_file_metadata, file=tqdmbuffer, desc="Downloading files")
        for db_file in it:
            path = os.path.join(directory, db_file.ofile_name)
            if skip_existing and os.path.exists(path):
                it.write(f"Skipping: {db_file.NAME}")
                continue
            if not globConst.GUI_MODE:
                it.write(f"Downloading: {db_file.NAME}")
            FileDownloader.download(db_file.url, path)
Example #6
    def generate_random_neg_samples(self, pos_samples, distrib="orig"):
        col_names = globConst.COL_NAMES_EDGES
        pos_samples = pos_samples[col_names]
        neg_samples = pandas.DataFrame(columns=col_names)
        pos_samples = self.add_edge_type_key_column(pos_samples)

        # generate distribution of meta_edge types for negative samples
        meta_edges = list(self.meta_edges_dic.keys())
        meta_edges.sort()
        neg_samples_count_meta_edges = {}
        if distrib == "uni":
            num_tp_examples, _ = pos_samples.shape
            neg_samples_metaEdges = list(
                numpy.random.choice(meta_edges, num_tp_examples))
            neg_samples_metaEdges.sort()
            neg_samples_count_meta_edges = {
                e: neg_samples_metaEdges.count(e)
                for e in set(neg_samples_metaEdges)
                if neg_samples_metaEdges.count(e) > 0
            }
        elif distrib == "orig":
            for key in self.meta_edges_dic.keys():
                num_entry = len(pos_samples.loc[(
                    pos_samples[ttsConst.EDGE_TYPE_KEY_NAME] == key)])
                if num_entry > 0:
                    neg_samples_count_meta_edges[key] = num_entry

        # generate a negative sub-sample for each negative meta_edge type
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for meta_edge_triple_key, count in tqdm(sorted(
                neg_samples_count_meta_edges.items()),
                                                file=tqdmbuffer):
            node_type_1, edge_type, node_type_2 = self.meta_edges_dic[
                meta_edge_triple_key]
            pos_samples_of_meta_edge = pos_samples.loc[(pos_samples[
                ttsConst.EDGE_TYPE_KEY_NAME] == meta_edge_triple_key)]

            if (
                    edge_type in self.tn_edgeTypes
            ):  # only ontology edge types can appear multiple times; there should be no ontology true negatives
                neg_samples = neg_samples.append(
                    self.subsample_with_tn(
                        meta_edge_triple_key=meta_edge_triple_key,
                        subsample_size=count,
                        exclude_df=pos_samples_of_meta_edge[col_names],
                    ),
                    ignore_index=True,
                )
            else:
                neg_samples = neg_samples.append(
                    self.generate_n_random_samples(
                        n=count,
                        node_type_1=node_type_1,
                        edge_type=edge_type,
                        node_type_2=node_type_2,
                        exclude_df=pos_samples_of_meta_edge[col_names],
                    ),
                    ignore_index=True,
                )
        neg_samples[globConst.VALUE_COL_NAME] = 0

        return neg_samples[col_names + [globConst.VALUE_COL_NAME]]
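As a quick illustration of the two distribution modes above: "orig" reuses the per-meta-edge counts of the positive samples, while "uni" draws meta-edge types uniformly at random. The keys and counts below are invented.

    # Illustrative only; meta-edge keys and counts are invented.
    from collections import Counter
    import numpy as np

    pos_counts = {"GENE_DIS": 70, "GENE_GO": 30}  # counts per meta-edge type in the positive set
    meta_edges = sorted(pos_counts)
    num_pos = sum(pos_counts.values())

    neg_counts_orig = dict(pos_counts)                                                  # distrib == "orig"
    neg_counts_uni = Counter(str(m) for m in np.random.choice(meta_edges, num_pos))     # distrib == "uni"
    print(neg_counts_orig, dict(neg_counts_uni))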
    def evaluate_ranked_metrics_1(self,
                                  ks,
                                  metrics,
                                  unfiltered_setting=True,
                                  filtered_setting=False):
        metric_results = {}

        # get corrupted triples
        pos_test_examples = self.test_examples[self.test_examples[
            globConst.VALUE_COL_NAME] == 1]
        pos_test_examples_array = pos_test_examples.values
        nodes_array = self.nodes.values

        mapped_pos_triples, mapped_nodes = self.get_mapped_triples_and_nodes(
            triples=pos_test_examples_array, nodes=nodes_array)

        node_types = np.unique(mapped_nodes[:, 1])
        nodes_dic = {
            nodeType: mapped_nodes[np.where(mapped_nodes[:, 1] == nodeType)][:, 0]
            for nodeType in node_types
        }

        filtered_ranks_corrupted_heads = []
        filtered_ranks_corrupted_tails = []
        unfiltered_ranks_corrupted_heads = []
        unfiltered_ranks_corrupted_tails = []

        print("calculating corrupted triples")
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for pos_example in tqdm(mapped_pos_triples,
                                total=mapped_pos_triples.shape[0],
                                file=tqdmbuffer):
            (
                unfiltered_corrupted_head,
                unfiltered_corrupted_tail,
                filtered_corrupted_head,
                filtered_corrupted_tail,
            ) = utils.calc_corrupted_triples(
                pos_example=pos_example,
                nodes=mapped_nodes,
                nodes_dic=nodes_dic,
                filtered=filtered_setting,
                pos_examples=mapped_pos_triples,
            )
            if unfiltered_setting:
                unfiltered_ranks_corrupted_heads.append(
                    self.get_rank_for_corrupted_examples(
                        unfiltered_corrupted_head, pos_example))
                unfiltered_ranks_corrupted_tails.append(
                    self.get_rank_for_corrupted_examples(
                        unfiltered_corrupted_tail, pos_example))
            if filtered_setting:
                filtered_ranks_corrupted_heads.append(
                    self.get_rank_for_corrupted_examples(
                        filtered_corrupted_head, pos_example))
                filtered_ranks_corrupted_tails.append(
                    self.get_rank_for_corrupted_examples(
                        filtered_corrupted_tail, pos_example))

        filtered_num_examples = len(filtered_ranks_corrupted_heads)
        unfiltered_num_examples = len(unfiltered_ranks_corrupted_heads)

        # HITS@K
        if RankMetricType.HITS_AT_K in metrics:
            metric_results[
                RankMetricType.HITS_AT_K] = self.calculate_hits_at_k(
                    ks=ks,
                    ranks_corrupted_heads=filtered_ranks_corrupted_heads,
                    ranks_corrupted_tails=filtered_ranks_corrupted_tails,
                    num_examples=filtered_num_examples,
                )
        # HITS@K unfiltered
        if RankMetricType.HITS_AT_K_UNFILTERED in metrics:
            metric_results[RankMetricType.HITS_AT_K_UNFILTERED] = self.calculate_hits_at_k(
                ks=ks,
                ranks_corrupted_heads=unfiltered_ranks_corrupted_heads,
                ranks_corrupted_tails=unfiltered_ranks_corrupted_tails,
                num_examples=unfiltered_num_examples,
            )
        # MRR
        if RankMetricType.MRR in metrics:
            metric_results[RankMetricType.MRR] = self.calculate_mrr(
                ranks_corrupted_heads=filtered_ranks_corrupted_heads,
                ranks_corrupted_tails=filtered_ranks_corrupted_tails,
                num_examples=filtered_num_examples,
            )
        # MRR unfiltered
        if RankMetricType.MRR_UNFILTERED in metrics:
            metric_results[RankMetricType.MRR_UNFILTERED] = self.calculate_mrr(
                ranks_corrupted_heads=unfiltered_ranks_corrupted_heads,
                ranks_corrupted_tails=unfiltered_ranks_corrupted_tails,
                num_examples=unfiltered_num_examples,
            )
        return metric_results
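The ranked metrics themselves are simple functions of the collected rank lists. Below is a minimal self-contained sketch of hits@k and MRR over invented ranks (1 = best), shown only to make the formulas concrete; the project's calculate_hits_at_k / calculate_mrr may differ in detail.

    # Invented ranks, 1 = best; head and tail ranks are pooled for illustration.
    def hits_at_k(ranks_corrupted_heads, ranks_corrupted_tails, k):
        ranks = ranks_corrupted_heads + ranks_corrupted_tails
        return sum(r <= k for r in ranks) / len(ranks)

    def mrr(ranks_corrupted_heads, ranks_corrupted_tails):
        ranks = ranks_corrupted_heads + ranks_corrupted_tails
        return sum(1.0 / r for r in ranks) / len(ranks)

    heads, tails = [1, 4, 2], [3, 1, 10]
    print(hits_at_k(heads, tails, k=3), mrr(heads, tails))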
    def evaluate_ranked_metrics_3(self,
                                  ks,
                                  metrics,
                                  unfiltered_setting=True,
                                  filtered_setting=False):
        metric_results = {}
        k_raw_corrupted_head = [[] for _ in ks]
        k_raw_corrupted_tail = [[] for _ in ks]

        # get corrupted triples
        pos_test_examples = self.test_examples[self.test_examples[
            globConst.VALUE_COL_NAME] == 1]
        pos_test_examples_array = pos_test_examples.values
        nodes_array = self.nodes.values

        mapped_pos_triples, mapped_nodes = self.get_mapped_triples_and_nodes(
            triples=pos_test_examples_array, nodes=nodes_array)
        nodeTypes = np.unique(mapped_nodes[:, 1])
        nodes_dic = {
            nodeType: np.unique(mapped_nodes[np.where(mapped_nodes[:, 1] == nodeType)][:, 0])
            for nodeType in nodeTypes
        }
        head_tuples = mapped_pos_triples[:, 0:2]
        head_tuples = np.unique(head_tuples, axis=0)
        tail_tuples = mapped_pos_triples[:, 1:3]
        tail_tuples = np.unique(tail_tuples, axis=0)
        head_ranks = []
        # corrupting tail
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for head, relation in tqdm(head_tuples, file=tqdmbuffer):
            data = mapped_pos_triples[np.where(
                (mapped_pos_triples[:, 0] == head) *
                (mapped_pos_triples[:, 1] == relation))]

            ranked_pos_examples, _ = self.model.get_ranked_and_sorted_predictions(
                data)
            _, corrupted_examples, _, _ = utils.calc_corrupted_triples(
                pos_example=data[0],
                nodes=mapped_nodes,
                nodes_dic=nodes_dic,
                filtered=False,
                pos_examples=mapped_pos_triples,
            )
            all_examples = np.unique(
                np.row_stack((corrupted_examples, np.column_stack((data, [0] * len(data))))),
                axis=0,
            )  # todo VERY WRONG!
            ranked_all_examples, _ = self.model.get_ranked_and_sorted_predictions(
                all_examples)
            increase_search_frame_by = [0] * len(ks)
            for example in ranked_pos_examples:
                search_data = ranked_all_examples[
                    0:ks[-1] + 1, :]  # fixme this should be more?
                for i, k in enumerate(ks):
                    current_k = k + increase_search_frame_by[i]
                    current_k = min(current_k, len(search_data))
                    index = np.where(search_data[:, 2] == example[2])[0]
                    if index <= current_k:
                        k_raw_corrupted_tail[i].append(1)
                        increase_search_frame_by[i] += 1
                    else:
                        k_raw_corrupted_tail[i].append(0)

        # corrupting head
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for relation, tail in tqdm(tail_tuples, file=tqdmbuffer):
            data = mapped_pos_triples[np.where(
                (mapped_pos_triples[:, 1] == relation) *
                (mapped_pos_triples[:, 2] == tail))]

            ranked_pos_examples, _ = self.model.get_ranked_and_sorted_predictions(
                data)
            corrupted_examples, _, _, _ = utils.calc_corrupted_triples(
                pos_example=data[0],
                nodes=mapped_nodes,
                nodes_dic=nodes_dic,
                filtered=False,
                pos_examples=mapped_pos_triples,
            )
            all_examples = np.unique(
                np.row_stack((corrupted_examples, np.column_stack((data, [0] * len(data))))),
                axis=0,
            )  # todo VERY WRONG!
            ranked_all_examples, _ = self.model.get_ranked_and_sorted_predictions(
                all_examples)
            increase_search_frame_by = [0] * len(ks)
            for example in ranked_pos_examples:
                search_data = ranked_all_examples[0:ks[-1] + 1, :]
                for i, k in enumerate(ks):
                    current_k = k + increase_search_frame_by[i]
                    current_k = min(current_k, len(search_data))
                    index = np.where(search_data[:, 0] == example[0])[0] + 1
                    if index <= current_k:
                        k_raw_corrupted_head[i].append(1)
                        increase_search_frame_by[i] += 1
                    else:
                        k_raw_corrupted_head[i].append(0)
        k_results_corrupted_head = []
        for i, k in enumerate(ks):
            k_results_corrupted_head.append(
                sum(k_raw_corrupted_head[i]) / len(k_raw_corrupted_head[i]))
        k_results_corrupted_tail = []
        for i, k in enumerate(ks):
            k_results_corrupted_tail.append(
                sum(k_raw_corrupted_tail[i]) / len(k_raw_corrupted_tail[i]))

        metric_results[RankMetricType.HITS_AT_K] = (k_results_corrupted_head,
                                                    k_results_corrupted_tail)
        return metric_results
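utils.calc_corrupted_triples is not listed on this page; conceptually it replaces the head (or tail) of a positive triple with other candidate nodes. The sketch below shows only that corruption step with invented node ids and no type filtering, so it is a simplification of the real helper.

    # Simplified: replace the head or the tail with every other candidate node.
    import numpy as np

    def corrupt(triple, candidate_nodes):
        head, rel, tail = triple
        corrupted_heads = np.array([[h, rel, tail] for h in candidate_nodes if h != head])
        corrupted_tails = np.array([[head, rel, t] for t in candidate_nodes if t != tail])
        return corrupted_heads, corrupted_tails

    heads, tails = corrupt((0, 5, 2), candidate_nodes=[0, 1, 2, 3])
    print(heads)
    print(tails)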
Example #9
    def train(self, pos_train_triples, neg_train_triples, pos_valid_triples,
              neg_valid_triples, mapped_nodes):

        all_triples = np.concatenate((pos_train_triples, neg_train_triples))

        # testme
        self.config[keenConst.NUM_ENTITIES] = len(
            np.unique(np.concatenate((all_triples[:, 0], all_triples[:, 2]))))
        self.config[keenConst.NUM_RELATIONS] = len(np.unique(all_triples[:, 1]))

        ## prepare model
        self.kge_model = pipeline.get_kge_model(config=self.config)
        self.kge_model = self.kge_model.to(self.device)

        optimizer = optim.SGD(self.kge_model.parameters(),
                              lr=self.config[keenConst.LEARNING_RATE])
        loss_per_epoch = []
        num_pos_examples = pos_train_triples.shape[0]
        num_neg_examples = neg_train_triples.shape[0]

        ### train model
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for _ in tqdm(range(self.config[keenConst.NUM_EPOCHS]),
                      file=tqdmbuffer):
            # create batches
            indices_pos = np.arange(num_pos_examples)
            np.random.shuffle(indices_pos)
            pos_train_triples = pos_train_triples[indices_pos]
            pos_batches = self._split_list_in_batches(
                input_list=pos_train_triples,
                batch_size=self.config["batch_size"])
            indices_neg = np.arange(num_neg_examples)
            np.random.shuffle(indices_neg)
            neg_train_triples = neg_train_triples[indices_neg]
            neg_batches = self._split_list_in_batches(
                input_list=neg_train_triples,
                batch_size=self.config["batch_size"])
            current_epoch_loss = 0.0
            #tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
            for pos_batch, neg_batch in tqdm(zip(pos_batches, neg_batches),
                                             total=len(neg_batches)):
                current_batch_size = len(pos_batch)

                # if not len(pos_batch) == len(neg_batch):
                #    raise RuntimeError('Pos and neg batches are not the same length')

                pos_batch_tensor = torch.tensor(pos_batch,
                                                dtype=torch.long,
                                                device=self.device)
                neg_batch_tensor = torch.tensor(neg_batch,
                                                dtype=torch.long,
                                                device=self.device)
                # Recall that torch *accumulates* gradients. Before passing in a
                # new instance, you need to zero out the gradients from the old instance
                optimizer.zero_grad()
                loss = self.kge_model(pos_batch_tensor, neg_batch_tensor)
                current_epoch_loss += loss.item() * current_batch_size

                loss.backward()
                optimizer.step()

            loss_per_epoch.append(current_epoch_loss / len(pos_train_triples))

        ### prepare results for output
        entity_to_embedding = {
            id: embedding.detach().cpu().numpy()
            for id, embedding in enumerate(
                self.kge_model.entity_embeddings.weight)
        }
        relation_to_embedding = {
            id: embedding.detach().cpu().numpy()
            for id, embedding in enumerate(
                self.kge_model.relation_embeddings.weight)
        }

        results = {
            "trained_model": self.kge_model,
            "loss_per_epoch": loss_per_epoch,
            "entity_to_embedding": entity_to_embedding,
            "relation_to_embedding": relation_to_embedding,
            "config": self.config,
        }
        self.output_results(results)

        return self.kge_model
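The batching in the training loop relies on _split_list_in_batches, which presumably slices the shuffled array into chunks of batch_size. A standalone sketch of that shuffle-and-split pattern:

    # Illustrative sketch of the shuffle-and-split batching (assumed behaviour of
    # _split_list_in_batches, not its actual implementation).
    import numpy as np

    def split_in_batches(arr, batch_size):
        return [arr[i:i + batch_size] for i in range(0, len(arr), batch_size)]

    triples = np.arange(30).reshape(10, 3)
    indices = np.arange(len(triples))
    np.random.shuffle(indices)
    batches = split_in_batches(triples[indices], batch_size=4)
    print([b.shape for b in batches])  # [(4, 3), (4, 3), (2, 3)]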