def save_remove_n_edges(edges: pandas.DataFrame, n):
    """
    Removes n edges from 'edges' such that no node is removed in the process,
    i.e. the total number of nodes in 'edges' stays the same.

    :param edges: original edges
    :param n: number of edges that should be removed
    :return: the reduced edges DataFrame
    """
    if n < 1:
        return edges
    all_edges = SortedList(list(edges[globConst.NODE1_ID_COL_NAME].append(edges[globConst.NODE2_ID_COL_NAME])))
    all_nodes = set(all_edges)
    # degree of every node over all edges
    edges_count_dict = {x: all_edges.count(x) for x in all_nodes}
    i = 0
    for _ in range(1000):
        drop_indices_candidates = random.sample(edges.index.values.tolist(), n)
        drop_indices = []
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for drop_index in tqdm(drop_indices_candidates, file=tqdmbuffer):
            if i == n:
                break
            drop_edge_candidate = edges.loc[drop_index]
            node1 = drop_edge_candidate[globConst.NODE1_ID_COL_NAME]
            node2 = drop_edge_candidate[globConst.NODE2_ID_COL_NAME]
            # only drop the edge if both endpoints keep at least one other edge
            if edges_count_dict[node1] > 1 and edges_count_dict[node2] > 1:
                drop_indices.append(drop_index)
                # decrement the degrees so a later candidate in the same batch cannot strip a node bare
                edges_count_dict[node1] -= 1
                edges_count_dict[node2] -= 1
                i += 1
        edges.drop(inplace=True, index=drop_indices)
        if i == n:
            break
    edges.reset_index(drop=True, inplace=True)
    return edges
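# Usage sketch (illustrative only, not part of the module): a toy triangle graph
# where every node has degree 2, so removing a single edge can never remove a
# node. The literal column names "node1" / "node2" are an assumption standing in
# for globConst.NODE1_ID_COL_NAME / globConst.NODE2_ID_COL_NAME.
#
#   toy = pandas.DataFrame({"node1": ["a", "a", "b"], "node2": ["b", "c", "c"]})
#   reduced = save_remove_n_edges(toy, 1)
#   assert set(reduced["node1"]).union(reduced["node2"]) == {"a", "b", "c"}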
def meta_edges_to_graph(self, edge_metadata_list, tn=None):
    edges_dic = {}
    nodes_dic = {}
    namespaces_set = set()
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    it = tqdm(edge_metadata_list, file=tqdmbuffer, desc="meta edges to graph")
    for d in it:
        it.write(f"Converting {d}")
        nodes1, nodes2, edges = self.create_nodes_and_edges(d, tn)
        if str(d.edgeType) in edges_dic:
            edges_dic[str(d.edgeType)].update(edges)
        else:
            edges_dic[str(d.edgeType)] = edges
        if str(d.node1_type) in nodes_dic:
            nodes_dic[str(d.node1_type)].update(nodes1)
        else:
            nodes_dic[str(d.node1_type)] = nodes1
        if str(d.node2_type) in nodes_dic:
            nodes_dic[str(d.node2_type)].update(nodes2)
        else:
            nodes_dic[str(d.node2_type)] = nodes2
        namespaces_set.update([str(node.namespace) for node in nodes1])
        namespaces_set.update([str(node.namespace) for node in nodes2])
    return nodes_dic, edges_dic, namespaces_set
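# Shape of the return values (sketch; create_nodes_and_edges is assumed to yield
# sets of node and edge objects): nodes_dic maps a node-type string to all nodes
# of that type, edges_dic maps an edge-type string to all edges of that type,
# and namespaces_set collects every namespace string seen on any node.
#
#   nodes_dic, edges_dic, namespaces = graph_creator.meta_edges_to_graph(meta_edge_list)
#   # "graph_creator" and "meta_edge_list" are placeholder names for this sketch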
def create_input_files(self):
    logging.info("## Start creating input files ##")
    skip = None
    for_all = False
    if not globalConfig.INTERACTIVE_MODE:
        skip = globalConfig.SKIP_EXISTING_FILES
        for_all = True
    if not os.path.exists(gcConst.IN_FILE_PATH):
        os.makedirs(gcConst.IN_FILE_PATH)
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    it = tqdm(self.file_readers, file=tqdmbuffer)
    for reader in it:
        if reader.readerType not in self.readerType_processor_map:
            it.write(f"There is no processor for the reader {reader.readerType}")
            continue
        it.write(f"Reading: {reader.__class__.__name__}")
        # check beforehand if the read-in content is already processed, as parsing can be time consuming
        all_files_exist = all(
            os.path.isfile(
                os.path.join(gcConst.IN_FILE_PATH, self.infileType_inMetadata_map[processor.infileType].csv_name)
            )
            for processor in self.readerType_processor_map[reader.readerType]
        )
        if all_files_exist and not for_all and self.readerType_processor_map[reader.readerType]:
            first_processor = self.readerType_processor_map[reader.readerType][0]
            first_processor_out_path = os.path.join(
                gcConst.IN_FILE_PATH, (self.infileType_inMetadata_map[first_processor.infileType]).csv_name
            )
            if globConst.GUI_MODE:
                from openbiolink.gui.gui import skipExistingFiles

                skip, for_all = skipExistingFiles(first_processor_out_path)
            else:
                skip, for_all = Cli.skip_existing_files(first_processor_out_path)
        if not skip or not all_files_exist:
            # execute processors
            in_data = reader.read_file()
            # fixme ResourceWarning: Enable tracemalloc to get the object allocation traceback
            for processor in self.readerType_processor_map[reader.readerType]:
                out_file_path = os.path.join(
                    gcConst.IN_FILE_PATH, (self.infileType_inMetadata_map[processor.infileType]).csv_name
                )
                if not for_all:
                    if globConst.GUI_MODE:
                        from openbiolink.gui.gui import skipExistingFiles

                        skip, for_all = skipExistingFiles(out_file_path)
                    else:
                        skip, for_all = Cli.skip_existing_files(out_file_path)
                if not (skip and os.path.isfile(out_file_path)):
                    out_data = processor.process(in_data)
                    FileWriter.write_to_file(out_data, out_file_path)
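# The skip prompts above are assumed to return a (skip, for_all) pair: `skip`
# states whether the file at hand should be skipped, and `for_all` states
# whether that answer applies to all remaining files so no further prompt is
# shown. A minimal non-interactive stand-in could look like this:
#
#   skip, for_all = Cli.skip_existing_files(out_file_path)  # CLI prompt
#   # or, hard-coded for batch runs (hypothetical):
#   skip, for_all = True, True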
def download(url, o_file_path):
    opener = urllib.request.build_opener()
    opener.addheaders = [("User-agent", "Mozilla/5.0")]
    urllib.request.install_opener(opener)
    try:
        tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        with tqdm(unit="B", unit_scale=True, file=tqdmbuffer) as t:
            reporthook = FileDownloader.download_progress_hook(t)
            urllib.request.urlretrieve(url, o_file_path, reporthook)
    except urllib.error.HTTPError as err:
        logging.error("HTTP %s %s: %s" % (err.code, err.msg, err.geturl()))
        sys.exit()
    except urllib.error.URLError as err:
        # URLError carries the failure cause in .reason, not .msg
        logging.error("Url Error: %s" % err.reason)
        sys.exit()
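# Usage sketch (URL and file name are placeholders): download a single file with
# a browser-like User-agent and a tqdm progress bar. download_progress_hook is
# assumed to adapt the tqdm instance to urlretrieve's
# reporthook(block_num, block_size, total_size) callback signature.
#
#   FileDownloader.download(
#       "https://example.org/data.tsv.gz",
#       os.path.join(gcConst.O_FILE_PATH, "data.tsv.gz"),
#   )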
def download_db_files(
    self,
    skip_existing: bool = True,
):
    logging.info("## Start downloading files ##")
    directory = gcConst.O_FILE_PATH
    os.makedirs(directory, exist_ok=True)
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    it = tqdm(self.db_file_metadata, file=tqdmbuffer, desc="Downloading files")
    for db_file in it:
        path = os.path.join(directory, db_file.ofile_name)
        if skip_existing and os.path.exists(path):
            it.write(f"Skipping: {db_file.NAME}")
            continue
        if not globConst.GUI_MODE:
            it.write(f"Downloading: {db_file.NAME}")
        FileDownloader.download(db_file.url, path)
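# Usage sketch ("downloader" is a placeholder instance name): files already
# present in gcConst.O_FILE_PATH are skipped by default; pass
# skip_existing=False to force every source file to be re-fetched.
#
#   downloader.download_db_files(skip_existing=True)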
def generate_random_neg_samples(self, pos_samples, distrib="orig"):
    col_names = globConst.COL_NAMES_EDGES
    pos_samples = pos_samples[col_names]
    neg_samples = pandas.DataFrame(columns=col_names)
    pos_samples = self.add_edge_type_key_column(pos_samples)

    # generate distribution of meta_edge types for negative samples
    meta_edges = list(self.meta_edges_dic.keys())
    meta_edges.sort()
    neg_samples_count_meta_edges = {}
    if distrib == "uni":
        num_tp_examples, _ = pos_samples.shape
        neg_samples_metaEdges = list(numpy.random.choice(meta_edges, num_tp_examples))
        neg_samples_metaEdges.sort()
        neg_samples_count_meta_edges = {
            e: neg_samples_metaEdges.count(e)
            for e in set(neg_samples_metaEdges)
            if neg_samples_metaEdges.count(e) > 0
        }
    elif distrib == "orig":
        for key in self.meta_edges_dic.keys():
            num_entry = len(pos_samples.loc[(pos_samples[ttsConst.EDGE_TYPE_KEY_NAME] == key)])
            if num_entry > 0:
                neg_samples_count_meta_edges[key] = num_entry

    # generate a negative sub-sample for each negative meta_edge type
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    for meta_edge_triple_key, count in tqdm(sorted(neg_samples_count_meta_edges.items()), file=tqdmbuffer):
        node_type_1, edge_type, node_type_2 = self.meta_edges_dic[meta_edge_triple_key]
        pos_samples_of_meta_edge = pos_samples.loc[(pos_samples[ttsConst.EDGE_TYPE_KEY_NAME] == meta_edge_triple_key)]
        if edge_type in self.tn_edgeTypes:  # only onto edgeTypes can appear multiple times, there should be no onto tn
            neg_samples = neg_samples.append(
                self.subsample_with_tn(
                    meta_edge_triple_key=meta_edge_triple_key,
                    subsample_size=count,
                    exclude_df=pos_samples_of_meta_edge[col_names],
                ),
                ignore_index=True,
            )
        else:
            neg_samples = neg_samples.append(
                self.generate_n_random_samples(
                    n=count,
                    node_type_1=node_type_1,
                    edge_type=edge_type,
                    node_type_2=node_type_2,
                    exclude_df=pos_samples_of_meta_edge[col_names],
                ),
                ignore_index=True,
            )
    neg_samples[globConst.VALUE_COL_NAME] = 0
    return neg_samples[col_names + [globConst.VALUE_COL_NAME]]
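# The two sampling distributions in a nutshell (sketch, counts are made up):
# with 900 GENE_GENE and 100 GENE_DIS positive edges, distrib="orig" draws
# 900/100 negatives per meta-edge type (mirroring the positive distribution),
# while distrib="uni" draws each negative's meta-edge type uniformly at random,
# so the expected split over the two types is roughly 500/500.
#
#   neg = sampler.generate_random_neg_samples(pos_samples, distrib="orig")
#   # "sampler" is a placeholder name for an instance of this class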
def evaluate_ranked_metrics_1(self, ks, metrics, unfiltered_setting=True, filtered_setting=False):
    metric_results = {}

    # get corrupted triples
    pos_test_examples = self.test_examples[self.test_examples[globConst.VALUE_COL_NAME] == 1]
    pos_test_examples_array = pos_test_examples.values
    nodes_array = self.nodes.values
    mapped_pos_triples, mapped_nodes = self.get_mapped_triples_and_nodes(
        triples=pos_test_examples_array, nodes=nodes_array
    )
    node_types = np.unique(mapped_nodes[:, 1])
    nodes_dic = {
        nodeType: mapped_nodes[np.where(mapped_nodes[:, 1] == nodeType)][:, 0] for nodeType in node_types
    }

    filtered_ranks_corrupted_heads = []
    filtered_ranks_corrupted_tails = []
    unfiltered_ranks_corrupted_heads = []
    unfiltered_ranks_corrupted_tails = []

    print("calculating corrupted triples")
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    for pos_example in tqdm(mapped_pos_triples, total=mapped_pos_triples.shape[0], file=tqdmbuffer):
        (
            unfiltered_corrupted_head,
            unfiltered_corrupted_tail,
            filtered_corrupted_head,
            filtered_corrupted_tail,
        ) = utils.calc_corrupted_triples(
            pos_example=pos_example,
            nodes=mapped_nodes,
            nodes_dic=nodes_dic,
            filtered=filtered_setting,
            pos_examples=mapped_pos_triples,
        )
        if unfiltered_setting:
            unfiltered_ranks_corrupted_heads.append(
                self.get_rank_for_corrupted_examples(unfiltered_corrupted_head, pos_example)
            )
            unfiltered_ranks_corrupted_tails.append(
                self.get_rank_for_corrupted_examples(unfiltered_corrupted_tail, pos_example)
            )
        if filtered_setting:
            filtered_ranks_corrupted_heads.append(
                self.get_rank_for_corrupted_examples(filtered_corrupted_head, pos_example)
            )
            filtered_ranks_corrupted_tails.append(
                self.get_rank_for_corrupted_examples(filtered_corrupted_tail, pos_example)
            )
    filtered_num_examples = len(filtered_ranks_corrupted_heads)
    unfiltered_num_examples = len(unfiltered_ranks_corrupted_heads)

    # HITS@K
    if RankMetricType.HITS_AT_K in metrics:
        metric_results[RankMetricType.HITS_AT_K] = self.calculate_hits_at_k(
            ks=ks,
            ranks_corrupted_heads=filtered_ranks_corrupted_heads,
            ranks_corrupted_tails=filtered_ranks_corrupted_tails,
            num_examples=filtered_num_examples,
        )
    # HITS@K unfiltered
    if RankMetricType.HITS_AT_K_UNFILTERED in metrics:
        metric_results[RankMetricType.HITS_AT_K_UNFILTERED] = self.calculate_hits_at_k(
            ks=ks,
            ranks_corrupted_heads=unfiltered_ranks_corrupted_heads,
            ranks_corrupted_tails=unfiltered_ranks_corrupted_tails,
            num_examples=unfiltered_num_examples,
        )
    # MRR
    if RankMetricType.MRR in metrics:
        metric_results[RankMetricType.MRR] = self.calculate_mrr(
            ranks_corrupted_heads=filtered_ranks_corrupted_heads,
            ranks_corrupted_tails=filtered_ranks_corrupted_tails,
            num_examples=filtered_num_examples,
        )
    # MRR unfiltered
    if RankMetricType.MRR_UNFILTERED in metrics:
        # store under the unfiltered key so the filtered MRR is not overwritten
        metric_results[RankMetricType.MRR_UNFILTERED] = self.calculate_mrr(
            ranks_corrupted_heads=unfiltered_ranks_corrupted_heads,
            ranks_corrupted_tails=unfiltered_ranks_corrupted_tails,
            num_examples=unfiltered_num_examples,
        )
    return metric_results
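# Reference for the two helpers used above (stand-alone sketch, assuming ranks
# are 1-based; not necessarily identical to calculate_mrr / calculate_hits_at_k):
# MRR is the mean of 1/rank over both corruption directions, and hits@k is the
# fraction of ranks that are <= k.
#
#   def _mrr(ranks_heads, ranks_tails):
#       ranks = ranks_heads + ranks_tails
#       return sum(1.0 / r for r in ranks) / len(ranks)
#
#   def _hits_at_k(ranks_heads, ranks_tails, k):
#       ranks = ranks_heads + ranks_tails
#       return sum(1 for r in ranks if r <= k) / len(ranks)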
def evaluate_ranked_metrics_3(self, ks, metrics, unfiltered_setting=True, filtered_setting=False):
    metric_results = {}
    k_raw_corrupted_head = []
    for _ in ks:
        k_raw_corrupted_head.append([])
    k_raw_corrupted_tail = []
    for _ in ks:
        k_raw_corrupted_tail.append([])

    # get corrupted triples
    pos_test_examples = self.test_examples[self.test_examples[globConst.VALUE_COL_NAME] == 1]
    pos_test_examples_array = pos_test_examples.values
    nodes_array = self.nodes.values
    mapped_pos_triples, mapped_nodes = self.get_mapped_triples_and_nodes(
        triples=pos_test_examples_array, nodes=nodes_array
    )
    nodeTypes = np.unique(mapped_nodes[:, 1])
    nodes_dic = {
        nodeType: np.unique(mapped_nodes[np.where(mapped_nodes[:, 1] == nodeType)][:, 0]) for nodeType in nodeTypes
    }

    head_tuples = mapped_pos_triples[:, 0:2]
    head_tuples = np.unique(head_tuples, axis=0)
    tail_tuples = mapped_pos_triples[:, 1:3]
    tail_tuples = np.unique(tail_tuples, axis=0)
    head_ranks = []

    # corrupting tail
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    for head, relation in tqdm(head_tuples, file=tqdmbuffer):
        data = mapped_pos_triples[
            np.where((mapped_pos_triples[:, 0] == head) * (mapped_pos_triples[:, 1] == relation))
        ]
        ranked_pos_examples, _ = self.model.get_ranked_and_sorted_predictions(data)
        _, corrupted_examples, _, _ = utils.calc_corrupted_triples(
            pos_example=data[0],
            nodes=mapped_nodes,
            nodes_dic=nodes_dic,
            filtered=False,
            pos_examples=mapped_pos_triples,
        )
        all_examples = np.unique(
            np.row_stack((corrupted_examples, np.column_stack((data, [0] * len(data))))), axis=0
        )  # todo VERY WRONG!
        ranked_all_examples, _ = self.model.get_ranked_and_sorted_predictions(all_examples)
        increase_search_frame_by = [0] * len(ks)
        for example in ranked_pos_examples:
            search_data = ranked_all_examples[0 : ks[-1] + 1, :]  # fixme this should be more?
            for i, k in enumerate(ks):
                current_k = k + increase_search_frame_by[i]
                current_k = min(current_k, len(search_data))
                index = np.where(search_data[:, 2] == example[2])[0]
                if index <= current_k:
                    k_raw_corrupted_tail[i].append(1)
                    increase_search_frame_by[i] += 1
                else:
                    k_raw_corrupted_tail[i].append(0)

    # corrupting head
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    for relation, tail in tqdm(tail_tuples, file=tqdmbuffer):
        data = mapped_pos_triples[
            np.where((mapped_pos_triples[:, 1] == relation) * (mapped_pos_triples[:, 2] == tail))
        ]
        ranked_pos_examples, _ = self.model.get_ranked_and_sorted_predictions(data)
        corrupted_examples, _, _, _ = utils.calc_corrupted_triples(
            pos_example=data[0],
            nodes=mapped_nodes,
            nodes_dic=nodes_dic,
            filtered=False,
            pos_examples=mapped_pos_triples,
        )
        all_examples = np.unique(
            np.row_stack((corrupted_examples, np.column_stack((data, [0] * len(data))))), axis=0
        )  # todo VERY WRONG!
        ranked_all_examples, _ = self.model.get_ranked_and_sorted_predictions(all_examples)
        increase_search_frame_by = [0] * len(ks)
        for example in ranked_pos_examples:
            search_data = ranked_all_examples[0 : ks[-1] + 1, :]
            for i, k in enumerate(ks):
                current_k = k + increase_search_frame_by[i]
                current_k = min(current_k, len(search_data))
                index = np.where(search_data[:, 0] == example[0])[0] + 1
                if index <= current_k:
                    k_raw_corrupted_head[i].append(1)
                    increase_search_frame_by[i] += 1
                else:
                    k_raw_corrupted_head[i].append(0)

    k_results_corrupted_head = []
    for i, k in enumerate(ks):
        k_results_corrupted_head.append(sum(k_raw_corrupted_head[i]) / len(k_raw_corrupted_head[i]))
    k_results_corrupted_tail = []
    for i, k in enumerate(ks):
        k_results_corrupted_tail.append(sum(k_raw_corrupted_tail[i]) / len(k_raw_corrupted_tail[i]))

    metric_results[RankMetricType.HITS_AT_K] = (k_results_corrupted_head, k_results_corrupted_tail)
    return metric_results
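# For orientation (sketch, deliberately simpler than the grouped search above):
# a conventional hits@k for one (head, relation) group scores every candidate
# tail, ranks the true tails among all candidates, and counts a hit when a true
# tail appears within the top k. All names below are hypothetical.
#
#   def _grouped_hits_at_k(scores_by_tail, true_tails, k):
#       # scores_by_tail: dict tail_id -> score, higher is better
#       ranking = sorted(scores_by_tail, key=scores_by_tail.get, reverse=True)
#       return sum(1 for t in true_tails if ranking.index(t) < k) / len(true_tails)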
def train(self, pos_train_triples, neg_train_triples, pos_valid_triples, neg_valid_triples, mapped_nodes):
    all_triples = np.concatenate((pos_train_triples, neg_train_triples))  # testme
    self.config[keenConst.NUM_ENTITIES] = len(np.unique(np.concatenate((all_triples[:, 0], all_triples[:, 2]))))
    self.config[keenConst.NUM_RELATIONS] = len(np.unique(all_triples[:, 1]))

    ## prepare model
    self.kge_model = pipeline.get_kge_model(config=self.config)
    self.kge_model = self.kge_model.to(self.device)
    optimizer = optim.SGD(self.kge_model.parameters(), lr=self.config[keenConst.LEARNING_RATE])
    loss_per_epoch = []
    num_pos_examples = pos_train_triples.shape[0]
    num_neg_examples = neg_train_triples.shape[0]

    ### train model
    tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
    for _ in tqdm(range(self.config[keenConst.NUM_EPOCHS]), file=tqdmbuffer):
        # create batches
        indices_pos = np.arange(num_pos_examples)
        np.random.shuffle(indices_pos)
        pos_train_triples = pos_train_triples[indices_pos]
        pos_batches = self._split_list_in_batches(input_list=pos_train_triples, batch_size=self.config["batch_size"])
        indices_neg = np.arange(num_neg_examples)
        np.random.shuffle(indices_neg)
        neg_train_triples = neg_train_triples[indices_neg]
        neg_batches = self._split_list_in_batches(input_list=neg_train_triples, batch_size=self.config["batch_size"])
        current_epoch_loss = 0.0
        # tqdmbuffer = TqdmBuffer() if globConst.GUI_MODE else None
        for pos_batch, neg_batch in tqdm(zip(pos_batches, neg_batches), total=len(neg_batches)):
            current_batch_size = len(pos_batch)
            # if not len(pos_batch) == len(neg_batch):
            #     raise RuntimeError('Pos and neg batches are not the same length')
            pos_batch_tensor = torch.tensor(pos_batch, dtype=torch.long, device=self.device)
            neg_batch_tensor = torch.tensor(neg_batch, dtype=torch.long, device=self.device)
            # Recall that torch *accumulates* gradients. Before passing in a new instance,
            # you need to zero out the gradients from the old instance.
            optimizer.zero_grad()
            loss = self.kge_model(pos_batch_tensor, neg_batch_tensor)
            current_epoch_loss += loss.item() * current_batch_size
            loss.backward()
            optimizer.step()
        loss_per_epoch.append(current_epoch_loss / len(pos_train_triples))

    ### prepare results for output
    entity_to_embedding = {
        id: embedding.detach().cpu().numpy() for id, embedding in enumerate(self.kge_model.entity_embeddings.weight)
    }
    relation_to_embedding = {
        id: embedding.detach().cpu().numpy()
        for id, embedding in enumerate(self.kge_model.relation_embeddings.weight)
    }
    results = {
        "trained_model": self.kge_model,
        "loss_per_epoch": loss_per_epoch,
        "entity_to_embedding": entity_to_embedding,
        "relation_to_embedding": relation_to_embedding,
        "config": self.config,
    }
    self.output_results(results)
    return self.kge_model
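# The batching helper used above is assumed to chunk an array into consecutive
# batches of at most batch_size rows; a minimal stand-alone sketch:
#
#   def _split_list_in_batches(input_list, batch_size):
#       return [input_list[i : i + batch_size] for i in range(0, len(input_list), batch_size)]
#
# With that, loss_per_epoch holds the example-weighted mean loss per epoch: each
# batch loss is multiplied by its batch size and the sum is divided by the
# number of positive training triples at the end of the epoch.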