Example #1
0
    def create_and_write_cross_val(self, df: pandas.DataFrame,
                                   nodes_in_train_val_set, n_folds):
        """Split ``df`` into cross-validation folds and write each fold.

        The row labels of ``df`` are shuffled and cut into ``n_folds`` chunks.
        In fold ``i`` one chunk serves as validation set and the remaining
        chunks form the training set. Validation edges touching nodes reported
        by ``self.get_additional_nodes`` are dropped, both sets are cleaned
        with the ``utils`` helpers and the fold is handed to
        ``self.writer.write_train_val_set``.

        Args:
            df: Edges to split; rows are selected by index label.
            nodes_in_train_val_set: Node ids passed to
                ``self.get_additional_nodes`` as ``new_nodes_list``.
            n_folds: Either an integral number greater than 1 (number of
                folds) or a fraction strictly between 0 and 1 (validation
                fraction, converted to ``ceil(1 / n_folds)`` folds).

        Returns:
            The training and validation set of the last fold, concatenated.

        Raises:
            ValueError: If ``n_folds`` is neither a valid fold count nor a
                valid validation fraction (this includes negative values).
        """
        is_fraction = 0 < n_folds < 1
        is_fold_count = n_folds > 1 and float(n_folds).is_integer()
        if not (is_fraction or is_fold_count):
            # Negative values are rejected here as well; previously they
            # slipped through and failed later while splitting the index.
            logging.error("provided folds are not possible!")
            raise ValueError(
                "fold entry must be either an int>1 (number of folds) or a float >0 and <1 (validation fraction)"
            )

        if is_fraction:
            n_folds = math.ceil(1 / n_folds)
        n_folds = int(n_folds)

        rand_index = list(df.index)
        random.shuffle(rand_index)
        chunks = np.array_split(rand_index, n_folds)

        for i in range(n_folds):
            logging.info(f"Creating fold number {i+1} ...")
            # Rotate through the chunks: the last chunk of the rotation is the
            # validation chunk, all others belong to the training set.
            train_chunk_indices = [(x + i) % n_folds
                                   for x in range(n_folds - 1)]
            val_chunk_index = (n_folds - 1 + i) % n_folds
            train_indices = [
                element for chunk_index in train_chunk_indices
                for element in chunks[chunk_index]
            ]
            val_indices = chunks[val_chunk_index]
            train_set = df.loc[train_indices]
            val_set = df.loc[val_indices]

            train_nodes = (
                train_set[globalConfig.NODE1_ID_COL_NAME].tolist() +
                train_set[globalConfig.NODE2_ID_COL_NAME].tolist())
            new_val_nodes = self.get_additional_nodes(
                old_nodes_list=train_nodes,
                new_nodes_list=nodes_in_train_val_set,
            )
            if len(new_val_nodes) > 0:  # nicetohave (6)
                logging.info(
                    f"Validation set {i+1} contains nodes that are"
                    f" not present in the training set. These edges will be dropped."
                )
                val_set = self.remove_edges_with_nodes(val_set, new_val_nodes)
            if graphProp.DIRECTED:
                train_set = utils.remove_reverse_edges(remain_set=train_set,
                                                       remove_set=val_set)
            val_set = utils.remove_parent_duplicates_and_reverses(
                remain_set=val_set, remove_set=train_set)

            self.writer.write_train_val_set(train_set, val_set, new_val_nodes,
                                            i)

        # DataFrame.append no longer exists in pandas >= 2.0; concat yields the
        # same frame (index labels preserved) on old and new versions.
        return pandas.concat([train_set, val_set])
Example #2
0
    def time_slice_split(self):
        """Build a train/test split from two time slices and write it out.

        The training set is made of the positive edges of the first time
        slice (``self.tmo_all_tp``) plus randomly generated negative samples.
        The test set is made of the first result of
        ``utils.get_diff(self.all_tp, self.tmo_all_tp)`` plus negative samples
        generated for it. The second result of ``get_diff`` (the vanished
        edges) is only reported through ``self.writer.print_vanished_edges``.

        Test edges touching nodes that ``self.get_additional_nodes`` reports
        as new are removed. Train/test sets, their node sets and the new test
        nodes are written through ``self.writer``. Nothing is returned.
        """
        # Function-scope import so the block does not rely on how the module
        # aliases pandas at file level.
        import pandas

        # nicetohave (4) like that, neg samples are restricted to edge_types appearing in test_sample --> good idea?
        # nicetohave (4) idea: calculate nodes like above, then tmo_nodes= test_nodes --> more choice for neg examples
        tmo_positive_samples = self.tmo_all_tp
        tmo_negative_sampler = NegativeSampler(
            self.meta_edges_dic, self.tmo_tn_edgeTypes, self.tmo_all_tn, self.tmo_nodes, self.identifier2type
        )
        tmo_negative_samples = tmo_negative_sampler.generate_random_neg_samples(tmo_positive_samples)
        # todo remove not consistent edges
        tmo_negative_samples[globalConfig.VALUE_COL_NAME] = 0
        # DataFrame.append was removed in pandas 2.0; concat with
        # ignore_index=True already yields a fresh 0..n-1 index.
        train_set = pandas.concat([tmo_positive_samples, tmo_negative_samples], ignore_index=True)

        test_positive_samples, vanished_positive_samples = utils.get_diff(
            self.all_tp, self.tmo_all_tp, ignore_qscore=True
        )
        test_tn_samples, vanished_tn_samples = utils.get_diff(self.all_tn, self.tmo_all_tn, ignore_qscore=True)
        if not vanished_positive_samples.empty or not vanished_tn_samples.empty:
            logging.info("Some edges existing in the first time slice are no longer present in the second one")
            self.writer.print_vanished_edges(pandas.concat([vanished_positive_samples, vanished_tn_samples]))

        test_negative_sampler = NegativeSampler(self.meta_edges_dic, self.tn_edgeTypes, test_tn_samples, self.all_nodes,
                                                self.identifier2type)
        test_negative_samples = test_negative_sampler.generate_random_neg_samples(test_positive_samples)
        test_negative_samples[globalConfig.VALUE_COL_NAME] = 0

        test_set = pandas.concat([test_positive_samples, test_negative_samples], ignore_index=True)
        test_set = utils.remove_parent_duplicates_and_reverses(remain_set=test_set, remove_set=train_set)
        new_test_nodes = self.get_additional_nodes(
            old_nodes_list=self.tmo_nodes[globalConfig.ID_NODE_COL_NAME].tolist(),
            new_nodes_list=self.all_nodes[globalConfig.ID_NODE_COL_NAME].tolist(),
        )
        if new_test_nodes:
            logging.info(
                "The test set contains nodes that are not present in the trainings-set. These edges will be removed."
            )  # nicetohave (6)
            test_set = self.remove_edges_with_nodes(test_set, new_test_nodes)

        if graphProp.DIRECTED:
            train_set = utils.remove_reverse_edges(remain_set=train_set, remove_set=test_set)

        # Node lists are collected after all filtering so they match the
        # edges that are actually written.
        train_set_nodes = (
            train_set[globalConfig.NODE1_ID_COL_NAME].tolist() + train_set[globalConfig.NODE2_ID_COL_NAME].tolist()
        )
        test_set_nodes = (
            test_set[globalConfig.NODE1_ID_COL_NAME].tolist() + test_set[globalConfig.NODE2_ID_COL_NAME].tolist()
        )

        self.writer.write_set(train_set, ttsConf.TRAIN_FILE_NAME)
        self.writer.write_nodes(set(train_set_nodes), ttsConf.TRAIN_VAL_NODES_FILE_NAME)
        self.writer.write_set(test_set, ttsConf.TEST_FILE_NAME)
        self.writer.write_nodes(set(test_set_nodes), ttsConf.TEST_NODES_FILE_NAME)
        self.writer.write_new_nodes(new_test_nodes, ttsConf.NEW_TEST_NODES_FILE_NAME)
Example #3
0
    def random_edge_split(self, test_frac=None, val=None, crossval=None):
        """Randomly split all edges into a train/validation and a test set.

        Positive edges (``self.all_tp``) and generated negative samples are
        combined, inconsistent edges are removed and ``test_frac`` of the
        rows is sampled as test set. Test edges touching nodes that do not
        occur in the train/validation edges are dropped. With ``crossval``
        the train/validation set is additionally split and written by
        ``self.create_and_write_cross_val``. The result is written through
        ``self.writer.write_train_test_set``; nothing is returned.

        Args:
            test_frac: Fraction of all samples used as test set; a falsy
                value falls back to 0.05.
            val: Value forwarded to ``create_and_write_cross_val`` as
                ``n_folds``; a falsy value falls back to 0.05.
            crossval: If truthy, cross-validation folds are created.
        """
        # Function-scope import so the block does not rely on how the module
        # aliases pandas at file level.
        import pandas

        if not val:
            val = 0.05
        if not test_frac:
            test_frac = 0.05

        # create positive and negative examples
        positive_samples = self.all_tp.copy()
        negative_sampler = NegativeSampler(self.meta_edges_dic,
                                           self.tn_edgeTypes,
                                           self.all_tn.copy(), self.all_nodes,
                                           self.identifier2type)
        negative_samples = negative_sampler.generate_random_neg_samples(
            positive_samples)
        # DataFrame.append was removed in pandas 2.0; concat with
        # ignore_index=True already yields a fresh 0..n-1 index.
        all_samples = pandas.concat([positive_samples, negative_samples],
                                    ignore_index=True)
        all_samples = utils.remove_inconsistent_edges(all_samples).reset_index(
            drop=True)

        # generate, train-, test-, validation-sets
        test_set = all_samples.sample(frac=test_frac,
                                      random_state=globalConfig.RANDOM_STATE)
        train_val_set = all_samples.drop(list(test_set.index.values))
        test_set = utils.remove_parent_duplicates_and_reverses(
            remain_set=test_set, remove_set=train_val_set)

        train_val_nodes = (
            train_val_set[globalConfig.NODE1_ID_COL_NAME].tolist() +
            train_val_set[globalConfig.NODE2_ID_COL_NAME].tolist())
        # The old list holds node ids, so the candidate list must hold node
        # ids too; the node-type column used before compared ids to types.
        new_test_nodes = self.get_additional_nodes(
            old_nodes_list=train_val_nodes,
            new_nodes_list=self.all_nodes[
                globalConfig.ID_NODE_COL_NAME].tolist())
        if new_test_nodes:
            logging.info(
                "The test set contains nodes, that are not present in the trainings-set. These edges will be dropped."
            )  # nicetohave (6): option to keep edges with new nodes
            test_set = self.remove_edges_with_nodes(test_set, new_test_nodes)
        test_set_nodes = (test_set[globalConfig.NODE1_ID_COL_NAME].tolist() +
                          test_set[globalConfig.NODE2_ID_COL_NAME].tolist())
        if graphProp.DIRECTED:
            train_val_set = utils.remove_reverse_edges(
                remain_set=train_val_set, remove_set=test_set)

        if crossval:
            logging.info("Performing cross validation on trainingset...")
            train_val_set = self.create_and_write_cross_val(
                train_val_set, train_val_nodes, val)

        self.writer.write_train_test_set(train_val_set, train_val_nodes,
                                         test_set, test_set_nodes,
                                         new_test_nodes)
Example #4
0
    def perform_val_split(self, train_val_set, nodes_in_train_val_set, train_indices, val_indices):
        """Cut ``train_val_set`` into a training and a validation part.

        Rows are selected by index label. Validation edges touching nodes
        reported by ``self.get_additional_nodes`` are removed, then the
        ``utils`` helpers strip reverse edges (directed graphs only) from the
        training part and parent duplicates/reverses from the validation part.

        Returns:
            Tuple ``(train_set, val_set, new_val_nodes)``.
        """
        training = train_val_set.loc[train_indices]
        validation = train_val_set.loc[val_indices]

        # All node ids that occur on either side of a training edge.
        source_ids = training[globalConfig.NODE1_ID_COL_NAME].tolist()
        target_ids = training[globalConfig.NODE2_ID_COL_NAME].tolist()
        unseen_nodes = self.get_additional_nodes(
            old_nodes_list=source_ids + target_ids,
            new_nodes_list=nodes_in_train_val_set,
        )

        if 0 < len(unseen_nodes):  # nicetohave (6)
            logging.info(
                "Validation set contains nodes that are not present in the training set. These edges will be dropped."
            )
            validation = self.remove_edges_with_nodes(validation, unseen_nodes)

        if graphProp.DIRECTED:
            training = utils.remove_reverse_edges(remain_set=training, remove_set=validation)

        validation = utils.remove_parent_duplicates_and_reverses(
            remain_set=validation,
            remove_set=training,
        )
        return training, validation, unseen_nodes
Example #5
0
    def random_edge_split(self, test_frac=None, val=None, crossval=None):
        """Randomly split all edges into train, test and optional validation sets.

        Positive edges (``self.all_tp``) are combined with generated negative
        samples when ``self.neg_train_val`` or ``self.neg_test`` is set,
        inconsistent edges are removed and ``test_frac`` of the rows is
        sampled as test set. The sets are then filtered and written through
        ``self.writer``; positive and negative samples go to separate files.

        Args:
            test_frac: Fraction of all samples used as test set.
                NOTE(review): with ``None`` pandas' ``sample`` falls back to a
                single row; confirm callers always pass a fraction.
            val: Validation fraction strictly between 0 and 1. Any other
                value (including ``None``) means no validation split.
            crossval: If truthy and ``val`` is a fraction, cross-validation
                folds are created instead of a single validation set.
        """
        # Function-scope import so the block does not rely on how the module
        # aliases pandas at file level.
        import pandas

        logging.info(f"Creating random edge split with test_frac: {test_frac}, val_frac: {val}, crossval: {crossval}")
        # create positive and negative examples
        logging.info("Removing inconsistent edges from positive edges...")
        positive_samples = self.all_tp.copy()
        if self.neg_train_val or self.neg_test:
            logging.info("Generating negative samples")
            negative_sampler = NegativeSampler(self.meta_edges_dic, self.tn_edgeTypes, self.all_tn.copy(),
                                               self.all_nodes, self.identifier2type)
            negative_samples = negative_sampler.generate_random_neg_samples(positive_samples)
            # DataFrame.append was removed in pandas 2.0; concat with
            # ignore_index=True already yields a fresh 0..n-1 index.
            all_samples = pandas.concat([positive_samples, negative_samples], ignore_index=True)
        else:
            all_samples = positive_samples
        all_samples = utils.remove_inconsistent_edges(all_samples).reset_index(drop=True)

        # generate, train-, test-, validation-sets
        logging.info("Creating testset...")
        test_set = all_samples.sample(frac=test_frac, random_state=globalConfig.RANDOM_STATE)
        train_val_set = all_samples.drop(list(test_set.index.values))

        logging.info("Removing parent duplicates and reverses from testset...")
        test_set = utils.remove_parent_duplicates_and_reverses(remain_set=test_set, remove_set=train_val_set)

        train_val_nodes = self.get_nodes(train_val_set, self.neg_train_val)

        test_set, new_test_nodes = self.filter(train_val_set, self.neg_train_val, test_set, self.neg_test, "test")

        if graphProp.DIRECTED:
            logging.info("Removing reverse edges from train-val set")
            train_val_set = utils.remove_reverse_edges(remain_set=train_val_set, remove_set=test_set)

        # ``val`` defaults to None, which cannot be compared with ints; treat
        # it as "no validation split" instead of raising a TypeError. A value
        # strictly between 0 and 1 is never integral, so no further check is
        # needed.
        val_is_fraction = val is not None and 0 < val < 1

        # single train/test/val split
        if val_is_fraction and not crossval:
            logging.info("Creating validation set ...")
            rand_index = list(train_val_set.index)
            random.shuffle(rand_index)

            # The first ``val`` share of the shuffled labels is the validation part.
            val_indices, train_indices = np.array_split(rand_index, [int(len(rand_index) * val)])

            train_set, val_set, new_val_nodes = self.perform_val_split(train_val_set, train_val_nodes, train_indices, val_indices)

            # Filter the test set again, this time against the final training set.
            test_set, new_test_nodes = self.filter(train_set, self.neg_train_val, test_set, self.neg_test, "test")
            test_set_nodes = self.get_nodes(test_set, self.neg_test)

            positive_train_samples, negative_train_samples = self.split_positive_negative(train_set)
            positive_test_samples, negative_test_samples = self.split_positive_negative(test_set)
            positive_val_samples, negative_val_samples = self.split_positive_negative(val_set)

            # write train set
            self.writer.write_set(positive_train_samples, ttsConf.TRAIN_FILE_NAME)
            self.writer.write_nodes(set(train_val_nodes), ttsConf.TRAIN_VAL_NODES_FILE_NAME)

            # write test set
            self.writer.write_set(positive_test_samples, ttsConf.TEST_FILE_NAME)
            self.writer.write_nodes(set(test_set_nodes), ttsConf.TEST_NODES_FILE_NAME)
            self.writer.write_new_nodes(new_test_nodes, ttsConf.NEW_TEST_NODES_FILE_NAME)

            # write val set
            self.writer.write_set(positive_val_samples, ttsConf.VAL_FILE_NAME)
            self.writer.write_new_nodes(new_val_nodes, ttsConf.NEW_VAL_NODES_FILE_NAME)

            if self.neg_train_val:
                self.writer.write_set(negative_train_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TRAIN_FILE_NAME)
                self.writer.write_set(negative_val_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.VAL_FILE_NAME)
            if self.neg_test:
                self.writer.write_set(negative_test_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TEST_FILE_NAME)

        # only train/test split or crossval
        else:
            test_set, new_test_nodes = self.filter(train_val_set, self.neg_train_val, test_set, self.neg_test, "test")
            test_set_nodes = self.get_nodes(test_set, self.neg_test)

            positive_train_val_samples, negative_train_val_samples = self.split_positive_negative(train_val_set)
            positive_test_samples, negative_test_samples = self.split_positive_negative(test_set)

            # write train_val set
            self.writer.write_set(positive_train_val_samples, ttsConf.TRAIN_FILE_NAME)
            self.writer.write_nodes(set(train_val_nodes), ttsConf.TRAIN_VAL_NODES_FILE_NAME)

            # write test set
            self.writer.write_set(positive_test_samples, ttsConf.TEST_FILE_NAME)
            self.writer.write_nodes(set(test_set_nodes), ttsConf.TEST_NODES_FILE_NAME)
            self.writer.write_new_nodes(new_test_nodes, ttsConf.NEW_TEST_NODES_FILE_NAME)

            # generate and write negative samples
            if self.neg_train_val:
                self.writer.write_set(negative_train_val_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TRAIN_FILE_NAME)
            if self.neg_test:
                self.writer.write_set(negative_test_samples, ttsConf.NEGATIVE_PREFIX + ttsConf.TEST_FILE_NAME)

            if val_is_fraction and crossval:
                # split with crossvalidation
                logging.info("Performing cross validation on trainingset...")
                self.create_and_write_cross_val(train_val_set, train_val_nodes, test_set, val)
        logging.info("Done splitting!")
Example #6
0
    def random_edge_split(self, test_frac=None, val=None, crossval=None):
        """Randomly split all edges into train/validation tuples and a test set.

        Positive edges (``self.all_tp``) and generated negative samples are
        combined, inconsistent edges are removed and ``test_frac`` of the
        rows is sampled as test set. Test edges touching nodes that do not
        occur in the train/validation edges are dropped. With ``crossval``
        the train/validation set is split by ``self.create_cross_val``;
        otherwise a single tuple with an empty validation frame is used. All
        sets are passed to ``self.writer.print_sets``.

        Args:
            test_frac: Fraction of all samples used as test set; a falsy
                value falls back to 0.2.
            val: Value forwarded to ``self.create_cross_val``; a falsy value
                falls back to 0.2.
            crossval: If truthy, cross-validation tuples are created.

        Returns:
            Tuple ``(train_val_set_tuples, test_set)`` where the first item
            is a list of ``(train_set, val_set)`` pairs.
        """
        if not val:
            val = 0.2
        if not test_frac:
            test_frac = 0.2

        # create positive and negative examples
        positive_samples = self.all_tp.copy()
        negative_sampler = NegativeSampler(self.meta_edges_dic,
                                           self.tn_edgeTypes,
                                           self.all_tn.copy(), self.all_nodes)
        negative_samples = negative_sampler.generate_random_neg_samples(
            positive_samples)
        # DataFrame.append was removed in pandas 2.0; concat with
        # ignore_index=True already yields a fresh 0..n-1 index.
        all_samples = pandas.concat([positive_samples, negative_samples],
                                    ignore_index=True)
        all_samples = utils.remove_inconsistent_edges(all_samples).reset_index(
            drop=True)

        # generate, train-, test-, validation-sets
        # The seed is read from globalConfig like every other constant in
        # this method instead of going through a second module name.
        test_set = all_samples.sample(frac=test_frac,
                                      random_state=globalConfig.RANDOM_STATE)
        train_val_set = all_samples.drop(list(test_set.index.values))
        test_set = utils.remove_parent_duplicates_and_reverses(
            remain_set=test_set, remove_set=train_val_set)

        nodes_in_train_val_set = (
            train_val_set[globalConfig.NODE1_ID_COL_NAME].tolist() +
            train_val_set[globalConfig.NODE2_ID_COL_NAME].tolist())
        new_test_nodes = self.get_additional_nodes(
            old_nodes_list=nodes_in_train_val_set,
            new_nodes_list=self.all_nodes[
                globalConfig.ID_NODE_COL_NAME].tolist())
        if new_test_nodes:
            logging.info(
                'The test set contains nodes, that are not present in the trainings-set. These edges will be dropped.'
            )  # nicetohave (6): option to keep edges with new nodes
            test_set = self.remove_edges_with_nodes(test_set, new_test_nodes)
        nodes_in_test_set = (
            test_set[globalConfig.NODE1_ID_COL_NAME].tolist() +
            test_set[globalConfig.NODE2_ID_COL_NAME].tolist())
        if graphProp.DIRECTED:
            train_val_set = utils.remove_reverse_edges(
                remain_set=train_val_set, remove_set=test_set)

        # NOTE(review): only the new nodes of the last fold reach the writer;
        # confirm that this is what print_sets expects.
        new_val_nodes = None
        if crossval:
            train_val_set_tuples = self.create_cross_val(train_val_set, val)
            for i, (train_set, val_set) in enumerate(train_val_set_tuples):
                train_nodes = (
                    train_set[globalConfig.NODE1_ID_COL_NAME].tolist() +
                    train_set[globalConfig.NODE2_ID_COL_NAME].tolist())
                new_val_nodes = self.get_additional_nodes(
                    old_nodes_list=train_nodes,
                    new_nodes_list=nodes_in_train_val_set)
                if new_val_nodes:  # nicetohave (6)
                    logging.info(
                        'Validation set %d contains nodes, that are not present in the trainings-set. These edges will be dropped.',
                        i)
                    val_set = self.remove_edges_with_nodes(
                        val_set, new_val_nodes)
                    # Same-index assignment keeps the list length stable, so
                    # it is safe while enumerating.
                    train_val_set_tuples[i] = (train_set, val_set)
        else:
            train_val_set_tuples = [(train_val_set, pandas.DataFrame())]

        if graphProp.DIRECTED:
            train_val_set_tuples = [(utils.remove_reverse_edges(remain_set=t,
                                                                remove_set=v),
                                     v) for t, v in train_val_set_tuples]
        train_val_set_tuples = [
            (t,
             utils.remove_parent_duplicates_and_reverses(remain_set=v,
                                                         remove_set=t))
            for t, v in train_val_set_tuples
        ]

        self.writer.print_sets(
            train_val_set_tuples=train_val_set_tuples,
            new_val_nodes=new_val_nodes,
            test_set=test_set,
            new_test_nodes=new_test_nodes,
            nodes_in_train_val_set=set(nodes_in_train_val_set),
            nodes_in_test_set=set(nodes_in_test_set))
        # nicetohave (3) option to remove examples with new nodes
        return train_val_set_tuples, test_set