def create_and_write_cross_val(self, df: pandas.DataFrame, nodes_in_train_val_set, n_folds):
    """Split *df* into cross-validation folds and write each train/val pair.

    :param df: train+val edge set to split
    :param nodes_in_train_val_set: node ids present in the overall train/val set
    :param n_folds: int > 1 (number of folds) or float in (0, 1) (validation fraction)
    :raises ValueError: if n_folds is 0, 1, or a non-integer value > 1
             (ValueError subclasses Exception, so existing callers still catch it)
    :return: concatenation of the LAST fold's train and validation sets
    """
    if n_folds == 0 or n_folds == 1 or (n_folds > 1 and not float(n_folds).is_integer()):
        logging.error("provided folds are not possible!")
        raise ValueError(
            "fold entry must be either an int>1 (number of folds) or a float >0 and <1 (validation fraction)"
        )
    if n_folds < 1:  # n_folds is fraction
        n_folds = math.ceil(1 / n_folds)
    n_folds = int(n_folds)
    rand_index = list(df.index)
    random.shuffle(rand_index)
    chunks = np.array_split(rand_index, n_folds)
    for i in range(n_folds):
        logging.info(f"Creating fold number {i+1} ...")
        # rotate which chunk serves as validation set; the other n_folds-1 chunks train
        train_chunk_indices = [(x + i) % n_folds for x in range(n_folds - 1)]
        val_chunk_index = (n_folds - 1 + i) % n_folds
        train_indices = [
            element for chunk_index in train_chunk_indices for element in chunks[chunk_index]
        ]
        val_indices = chunks[val_chunk_index]
        train_set = df.loc[train_indices]
        val_set = df.loc[val_indices]
        # nodes that occur in the fold but not in its training half
        new_val_nodes = self.get_additional_nodes(
            old_nodes_list=train_set[globalConfig.NODE1_ID_COL_NAME].tolist()
            + train_set[globalConfig.NODE2_ID_COL_NAME].tolist(),
            new_nodes_list=nodes_in_train_val_set,
        )
        if len(new_val_nodes) > 0:  # nicetohave (6)
            logging.info(
                f"Validation set {i+1} contains nodes that are"
                f" not present in the training set. These edges will be dropped."
            )
            val_set = self.remove_edges_with_nodes(val_set, new_val_nodes)
        if graphProp.DIRECTED:
            train_set = utils.remove_reverse_edges(remain_set=train_set, remove_set=val_set)
            val_set = utils.remove_parent_duplicates_and_reverses(
                remain_set=val_set, remove_set=train_set)
        self.writer.write_train_val_set(train_set, val_set, new_val_nodes, i)
    # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
    return pandas.concat([train_set, val_set])
def time_slice_split(self):
    """Create a train/test split from two time slices of the graph.

    Training edges come from the earlier slice ("tmo" attributes); test edges are
    those present only in the later slice. Negative samples are generated per
    slice; results are written via self.writer.
    """
    # nicetohave (4) like that, neg samples are restricted to edge_types appearing in test_sample --> good idea?
    # nicetohave (4) idea: calculate nodes like above, then tmo_nodes = test_nodes --> more choice for neg examples
    tmo_positive_samples = self.tmo_all_tp
    tmo_negative_sampler = NegativeSampler(
        self.meta_edges_dic, self.tmo_tn_edgeTypes, self.tmo_all_tn, self.tmo_nodes, self.identifier2type
    )
    tmo_negative_samples = tmo_negative_sampler.generate_random_neg_samples(tmo_positive_samples)
    # todo remove not consistent edges
    tmo_negative_samples[globalConfig.VALUE_COL_NAME] = 0
    # DataFrame.append was removed in pandas 2.0; use concat instead
    tmo_all_samples = pandas.concat(
        [tmo_positive_samples, tmo_negative_samples], ignore_index=True
    ).reset_index(drop=True)
    train_set = tmo_all_samples
    # edges of the later slice that are new, and edges that vanished between slices
    test_positive_samples, vanished_positive_samples = utils.get_diff(
        self.all_tp, self.tmo_all_tp, ignore_qscore=True
    )
    test_tn_samples, vanished_tn_samples = utils.get_diff(self.all_tn, self.tmo_all_tn, ignore_qscore=True)
    if not vanished_positive_samples.empty or not vanished_tn_samples.empty:
        logging.info("Some edges existing in the first time slice are no longer present in the second one")
        self.writer.print_vanished_edges(pandas.concat([vanished_positive_samples, vanished_tn_samples]))
    test_negative_sampler = NegativeSampler(self.meta_edges_dic, self.tn_edgeTypes, test_tn_samples,
                                            self.all_nodes, self.identifier2type)
    test_negative_samples = test_negative_sampler.generate_random_neg_samples(test_positive_samples)
    test_negative_samples[globalConfig.VALUE_COL_NAME] = 0
    test_set = pandas.concat(
        [test_positive_samples, test_negative_samples], ignore_index=True
    ).reset_index(drop=True)
    test_set = utils.remove_parent_duplicates_and_reverses(remain_set=test_set, remove_set=train_set)
    new_test_nodes = self.get_additional_nodes(
        old_nodes_list=self.tmo_nodes[globalConfig.ID_NODE_COL_NAME].tolist(),
        new_nodes_list=self.all_nodes[globalConfig.ID_NODE_COL_NAME].tolist(),
    )
    if new_test_nodes:
        logging.info(
            "The test set contains nodes that are not present in the trainings-set. These edges will be removed."
        )  # nicetohave (6)
        test_set = self.remove_edges_with_nodes(test_set, new_test_nodes)
    if graphProp.DIRECTED:
        train_set = utils.remove_reverse_edges(remain_set=train_set, remove_set=test_set)
    train_set_nodes = (
        train_set[globalConfig.NODE1_ID_COL_NAME].tolist() + train_set[globalConfig.NODE2_ID_COL_NAME].tolist()
    )
    test_set_nodes = (
        test_set[globalConfig.NODE1_ID_COL_NAME].tolist() + test_set[globalConfig.NODE2_ID_COL_NAME].tolist()
    )
    self.writer.write_set(train_set, ttsConf.TRAIN_FILE_NAME)
    self.writer.write_nodes(set(train_set_nodes), ttsConf.TRAIN_VAL_NODES_FILE_NAME)
    self.writer.write_set(test_set, ttsConf.TEST_FILE_NAME)
    self.writer.write_nodes(set(test_set_nodes), ttsConf.TEST_NODES_FILE_NAME)
    self.writer.write_new_nodes(new_test_nodes, ttsConf.NEW_TEST_NODES_FILE_NAME)
def random_edge_split(self, test_frac=None, val=None, crossval=None):
    """Randomly split all edges into train/val and test sets and write them.

    :param test_frac: fraction of edges held out for testing (default 0.05)
    :param val: validation fraction (0<val<1) or fold count, forwarded to
        create_and_write_cross_val when crossval is truthy (default 0.05)
    :param crossval: if truthy, perform cross validation on the training set
    """
    if not val:
        val = 0.05
    if not test_frac:
        test_frac = 0.05

    # create positive and negative examples
    positive_samples = self.all_tp.copy()
    negative_sampler = NegativeSampler(self.meta_edges_dic, self.tn_edgeTypes,
                                       self.all_tn.copy(), self.all_nodes,
                                       self.identifier2type)
    negative_samples = negative_sampler.generate_random_neg_samples(positive_samples)
    # DataFrame.append was removed in pandas 2.0; use concat instead
    all_samples = pandas.concat(
        [positive_samples, negative_samples], ignore_index=True
    ).reset_index(drop=True)
    all_samples = utils.remove_inconsistent_edges(all_samples).reset_index(drop=True)

    # generate train-, test-, validation-sets
    test_set = all_samples.sample(frac=test_frac, random_state=globalConfig.RANDOM_STATE)
    train_val_set = all_samples.drop(list(test_set.index.values))
    test_set = utils.remove_parent_duplicates_and_reverses(
        remain_set=test_set, remove_set=train_val_set)
    train_val_nodes = (
        train_val_set[globalConfig.NODE1_ID_COL_NAME].tolist()
        + train_val_set[globalConfig.NODE2_ID_COL_NAME].tolist())
    # bugfix: compare against node IDs, not node types — NODE_TYPE_COL_NAME would
    # make every comparison fail because ids never match type labels (the sibling
    # implementations use ID_NODE_COL_NAME here)
    new_test_nodes = self.get_additional_nodes(
        old_nodes_list=train_val_nodes,
        new_nodes_list=self.all_nodes[globalConfig.ID_NODE_COL_NAME].tolist())
    if new_test_nodes:
        logging.info(
            "The test set contains nodes, that are not present in the trainings-set. These edges will be dropped."
        )
        # nicetohave (6): option to keep edges with new nodes
        test_set = self.remove_edges_with_nodes(test_set, new_test_nodes)
    test_set_nodes = (test_set[globalConfig.NODE1_ID_COL_NAME].tolist()
                      + test_set[globalConfig.NODE2_ID_COL_NAME].tolist())
    if graphProp.DIRECTED:
        train_val_set = utils.remove_reverse_edges(
            remain_set=train_val_set, remove_set=test_set)
    if crossval:
        logging.info("Performing cross validation on trainingset...")
        train_val_set = self.create_and_write_cross_val(
            train_val_set, train_val_nodes, val)
    self.writer.write_train_test_set(train_val_set, train_val_nodes, test_set,
                                     test_set_nodes, new_test_nodes)
def perform_val_split(self, train_val_set, nodes_in_train_val_set, train_indices, val_indices):
    """Partition a train/val edge set into a training set and a validation set.

    Validation edges touching nodes that never appear in the training half are
    dropped; for directed graphs, reverse edges are removed from the training
    set and parent duplicates/reverses from the validation set.

    :return: tuple ``(train_set, val_set, new_val_nodes)``
    """
    train_set, val_set = train_val_set.loc[train_indices], train_val_set.loc[val_indices]
    training_nodes = (train_set[globalConfig.NODE1_ID_COL_NAME].tolist()
                      + train_set[globalConfig.NODE2_ID_COL_NAME].tolist())
    new_val_nodes = self.get_additional_nodes(
        old_nodes_list=training_nodes,
        new_nodes_list=nodes_in_train_val_set,
    )
    if len(new_val_nodes) > 0:
        # nicetohave (6)
        logging.info(
            "Validation set contains nodes that are"
            " not present in the training set. These edges will be dropped."
        )
        val_set = self.remove_edges_with_nodes(val_set, new_val_nodes)
    if graphProp.DIRECTED:
        train_set = utils.remove_reverse_edges(remain_set=train_set, remove_set=val_set)
        val_set = utils.remove_parent_duplicates_and_reverses(remain_set=val_set,
                                                              remove_set=train_set)
    return train_set, val_set, new_val_nodes
def random_edge_split(self, test_frac=None, val=None, crossval=None):
    """Randomly split all edges into train/(val)/test sets and write them.

    Depending on *val* and *crossval* this performs either a single
    train/val/test split, a plain train/test split, or cross validation.
    Negative samples are only generated/written when self.neg_train_val or
    self.neg_test is set.

    :param test_frac: fraction of edges sampled into the test set
        (no default applied here — presumably validated by the caller; TODO confirm)
    :param val: validation fraction; a fractional value in (0, 1) selects the
        single-split or crossval paths below
    :param crossval: if truthy (together with fractional val), run cross validation
    """
    logging.info(f"Creating random edge split with test_frac: {test_frac}, val_frac: {val}, crossval: {crossval}")
    # create positive and negative examples
    # NOTE(review): this log line precedes negative sampling, but the actual
    # remove_inconsistent_edges call happens further below — message placement looks stale
    logging.info("Removing inconsistent edges from positive edges...")
    positive_samples = self.all_tp.copy()
    if self.neg_train_val or self.neg_test:
        logging.info("Generating negative samples")
        negative_sampler = NegativeSampler(self.meta_edges_dic, self.tn_edgeTypes, self.all_tn.copy(),
                                           self.all_nodes, self.identifier2type)
        negative_samples = negative_sampler.generate_random_neg_samples(positive_samples)
        all_samples = (positive_samples.append(negative_samples, ignore_index=True)).reset_index(drop=True)
    else:
        # no negatives requested anywhere --> work with positives only
        all_samples = positive_samples
    all_samples = utils.remove_inconsistent_edges(all_samples).reset_index(drop=True)
    # generate, train-, test-, validation-sets
    logging.info("Creating testset...")
    test_set = all_samples.sample(frac=test_frac, random_state=globalConfig.RANDOM_STATE)
    train_val_set = all_samples.drop(list(test_set.index.values))
    logging.info("Removing parent duplicates and reverses from testset...")
    test_set = utils.remove_parent_duplicates_and_reverses(remain_set=test_set, remove_set=train_val_set)
    train_val_nodes = self.get_nodes(train_val_set, self.neg_train_val)
    # NOTE(review): filter() is called again inside both branches below with the
    # same/updated arguments — presumably idempotent; confirm before deduplicating
    test_set, new_test_nodes = self.filter(train_val_set, self.neg_train_val, test_set, self.neg_test, "test")
    if graphProp.DIRECTED:
        logging.info("Removing reverse edges from train-val set")
        train_val_set = utils.remove_reverse_edges(remain_set=train_val_set, remove_set=test_set)
    # single train/test/val split
    if 0 < val < 1 and not float(val).is_integer() and not crossval:
        logging.info("Creating validation set ...")
        rand_index = list(train_val_set.index)
        random.shuffle(rand_index)
        # first val-fraction of the shuffled index becomes the validation set
        val_indices, train_indices = np.array_split(rand_index, [int(len(rand_index) * val)])
        train_set, val_set, new_val_nodes = self.perform_val_split(train_val_set, train_val_nodes,
                                                                   train_indices, val_indices)
        # re-filter test set against the (smaller) training set
        test_set, new_test_nodes = self.filter(train_set, self.neg_train_val, test_set, self.neg_test, "test")
        test_set_nodes = self.get_nodes(test_set, self.neg_test)
        positive_train_samples, negative_train_samples = self.split_positive_negative(train_set)
        positive_test_samples, negative_test_samples = self.split_positive_negative(test_set)
        positive_val_samples, negative_val_samples = self.split_positive_negative(val_set)
        # write train set
        self.writer.write_set(positive_train_samples, ttsConf.TRAIN_FILE_NAME)
        self.writer.write_nodes(set(train_val_nodes), ttsConf.TRAIN_VAL_NODES_FILE_NAME)
        # write test set
        self.writer.write_set(positive_test_samples, ttsConf.TEST_FILE_NAME)
        self.writer.write_nodes(set(test_set_nodes), ttsConf.TEST_NODES_FILE_NAME)
        self.writer.write_new_nodes(new_test_nodes, ttsConf.NEW_TEST_NODES_FILE_NAME)
        # write val set
        self.writer.write_set(positive_val_samples, ttsConf.VAL_FILE_NAME)
        self.writer.write_new_nodes(new_val_nodes, ttsConf.NEW_VAL_NODES_FILE_NAME)
        if self.neg_train_val:
            self.writer.write_set(negative_train_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TRAIN_FILE_NAME)
            self.writer.write_set(negative_val_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.VAL_FILE_NAME)
        if self.neg_test:
            self.writer.write_set(negative_test_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TEST_FILE_NAME)
    # only train/test split or crossval
    else:
        test_set, new_test_nodes = self.filter(train_val_set, self.neg_train_val, test_set, self.neg_test, "test")
        test_set_nodes = self.get_nodes(test_set, self.neg_test)
        positive_train_val_samples, negative_train_val_samples = self.split_positive_negative(train_val_set)
        positive_test_samples, negative_test_samples = self.split_positive_negative(test_set)
        # write train_val set
        self.writer.write_set(positive_train_val_samples, ttsConf.TRAIN_FILE_NAME)
        self.writer.write_nodes(set(train_val_nodes), ttsConf.TRAIN_VAL_NODES_FILE_NAME)
        # write test set
        self.writer.write_set(positive_test_samples, ttsConf.TEST_FILE_NAME)
        self.writer.write_nodes(set(test_set_nodes), ttsConf.TEST_NODES_FILE_NAME)
        self.writer.write_new_nodes(new_test_nodes, ttsConf.NEW_TEST_NODES_FILE_NAME)
        # generate and write negative samples
        if self.neg_train_val:
            self.writer.write_set(negative_train_val_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TRAIN_FILE_NAME)
        if self.neg_test:
            self.writer.write_set(negative_test_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TEST_FILE_NAME)
        if 0 < val < 1 and not float(val).is_integer() and crossval:
            # split with crossvalidation
            logging.info("Performing cross validation on trainingset...")
            self.create_and_write_cross_val(train_val_set, train_val_nodes, test_set, val)
    logging.info("Done splitting!")
def random_edge_split(self, test_frac=None, val=None, crossval=None):
    """Randomly split all edges into train/val tuples and a test set.

    :param test_frac: fraction of edges held out for testing (defaults to 0.2)
    :param val: validation fraction / fold parameter forwarded to create_cross_val
        when crossval is truthy (defaults to 0.2)
    :param crossval: if truthy, build multiple (train, val) fold tuples
    :return: tuple ``(train_val_set_tuples, test_set)`` where the first element
        is a list of (train_set, val_set) DataFrame pairs
    """
    if not val:
        val = 0.2
    if not test_frac:
        test_frac = 0.2

    # create positive and negative examples
    positive_samples = self.all_tp.copy()
    negative_sampler = NegativeSampler(self.meta_edges_dic, self.tn_edgeTypes,
                                       self.all_tn.copy(), self.all_nodes)
    negative_samples = negative_sampler.generate_random_neg_samples(positive_samples)
    all_samples = (positive_samples.append(negative_samples,
                                           ignore_index=True)).reset_index(drop=True)
    all_samples = utils.remove_inconsistent_edges(all_samples).reset_index(drop=True)

    # generate, train-, test-, validation-sets
    # NOTE(review): glob.RANDOM_STATE here vs globalConfig.* elsewhere — presumably
    # both alias the same config module; confirm against the file's imports
    test_set = all_samples.sample(frac=test_frac, random_state=glob.RANDOM_STATE)
    train_val_set = all_samples.drop(list(test_set.index.values))
    test_set = utils.remove_parent_duplicates_and_reverses(remain_set=test_set,
                                                           remove_set=train_val_set)
    nodes_in_train_val_set = train_val_set[globalConfig.NODE1_ID_COL_NAME].tolist() \
        + train_val_set[globalConfig.NODE2_ID_COL_NAME].tolist()
    # test-set nodes never seen in the train/val edges
    new_test_nodes = self.get_additional_nodes(
        old_nodes_list=nodes_in_train_val_set,
        new_nodes_list=self.all_nodes[globalConfig.ID_NODE_COL_NAME].tolist())
    if new_test_nodes:
        logging.info(
            'The test set contains nodes, that are not present in the trainings-set. These edges will be dropped.'
        )  # nicetohave (6): option to keep edges with new nodes
        test_set = self.remove_edges_with_nodes(test_set, new_test_nodes)
    nodes_in_test_set = test_set[globalConfig.NODE1_ID_COL_NAME].tolist() \
        + test_set[globalConfig.NODE2_ID_COL_NAME].tolist()
    if graphProp.DIRECTED:
        train_val_set = utils.remove_reverse_edges(remain_set=train_val_set,
                                                   remove_set=test_set)
    if crossval:
        train_val_set_tuples = self.create_cross_val(train_val_set, val)
        # NOTE(review): new_val_nodes keeps only the LAST fold's value when passed
        # to print_sets below — confirm this is intended
        new_val_nodes = None
        for i, train_val_set_tuple in enumerate(train_val_set_tuples):
            train_set, val_set = train_val_set_tuple
            new_val_nodes = self.get_additional_nodes(
                old_nodes_list=train_set[globalConfig.NODE1_ID_COL_NAME].tolist()
                + train_set[globalConfig.NODE2_ID_COL_NAME].tolist(),
                new_nodes_list=nodes_in_train_val_set)
            if new_val_nodes:  # nicetohave (6)
                logging.info(
                    'Validation set %d contains nodes, that are not present in the trainings-set. These edges will be dropped.'
                    % i)
                val_set = self.remove_edges_with_nodes(val_set, new_val_nodes)
            train_val_set_tuples[i] = (train_set, val_set)
    else:
        # single split: everything stays in train, empty validation set
        train_val_set_tuples = [(train_val_set, pandas.DataFrame())]
        new_val_nodes = None
    if graphProp.DIRECTED:
        train_val_set_tuples = [(utils.remove_reverse_edges(remain_set=t, remove_set=v), v)
                                for t, v in train_val_set_tuples]
        train_val_set_tuples = [
            (t, utils.remove_parent_duplicates_and_reverses(remain_set=v, remove_set=t))
            for t, v in train_val_set_tuples
        ]
    self.writer.print_sets(
        train_val_set_tuples=train_val_set_tuples,
        new_val_nodes=new_val_nodes,
        test_set=test_set,
        new_test_nodes=new_test_nodes,
        nodes_in_train_val_set=set(nodes_in_train_val_set),
        nodes_in_test_set=set(nodes_in_test_set))
    # nicetohave (3) option to remove examples with new nodes
    return train_val_set_tuples, test_set