Example #1
def main(args):
    """
    Main function of program for predicting similarity of 
    two source code samples

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)

    _checkpoint = getCheckpoint(args.ckpt_dir, args.ckpt)

    _ds = BagTokSimilarityDS(args.dataset,
                             min_n_solutions = args.min_solutions,
                             max_n_problems = args.problems,
                             short_code_th = args.short_code,
                             long_code_th = args.long_code,
                             test = args.testpart)
    print("Restoring from", _checkpoint)
    _dnn = tf.keras.models.load_model(_checkpoint)
    _test_ds, _labels, _annotations = \
                    _ds.testDataset(args.valsize, args.similpart)
    _eval_loss, _eval_acc = _dnn.evaluate(_test_ds, _labels,
                                          verbose = args.progress)

    _prob = _dnn.predict(_test_ds, verbose = args.progress)
    _confusion = SimilConfusAnalysis(_prob, _labels, 
                                     _ds.solution_names,
                                     _ds.problems, _annotations)
    print("\n")
    print("Evaluation accuracy is {:5.2f}%".format(_eval_acc * 100))
    print("Evaluation loss is {:5.2f}".format(_eval_loss))
    _confusion.writeReport()
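#A minimal sketch of the command line parser this main() assumes; the flag
#names mirror the args attributes read above, while defaults and help texts
#are illustrative assumptions only.
import argparse

def parseArgs():
    _parser = argparse.ArgumentParser(
        description = "Predict similarity of two source code samples")
    _parser.add_argument("dataset", help = "Path to dataset")
    _parser.add_argument("--ckpt_dir", required = True,
                         help = "Directory with checkpoints")
    _parser.add_argument("--ckpt", default = None,
                         help = "Checkpoint to restore; latest if omitted")
    _parser.add_argument("--seed_ds", type = int, default = None)
    _parser.add_argument("--min_solutions", type = int, default = 8)
    _parser.add_argument("--problems", type = int, default = None)
    _parser.add_argument("--short_code", type = int, default = None)
    _parser.add_argument("--long_code", type = int, default = None)
    _parser.add_argument("--testpart", type = float, default = 0.2)
    _parser.add_argument("--valsize", type = int, default = 10000)
    _parser.add_argument("--similpart", type = float, default = 0.5)
    _parser.add_argument("--progress", type = int, default = 1)
    return _parser.parse_args()

if __name__ == "__main__":
    main(parseArgs())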
Example #2
    def testDataset(self, size, similar_part):
        """
        Make test dataset
        The test dataset is always constructed from problems different
        from those used for training and validation.
        It uses the last problems in the list.
        More details are in comments to self.trainValidDsDifferentProblems

        Parameters:
        - size         -- Size of dataset to create
        - similar_part -- Fraction of samples of the created dataset 
                          representing similar source code samples
        Returns:
        - Test dataset having the following items:
        - constructed dataset
          * either as a single numpy array
          * or a list of 2 of them
          * or a TF dataset
        - labels as numpy array
        - list of similarity samples in the form of 4-tuples
          <problem1, solution1, problem2, solution2>
        """
        print("Constructing test dataset")
        if not self.test_problem_solutions:
            sys.exit("Test datset cannot be created as it was not defined.")
        DataRand.setSeed("SIMIL_TEST_DS_SEED")
        _start_problem = self.n_problems - self.n_test_problems
        self.test_ds = self._makeDs(_start_problem,
                        self.test_problem_solutions, size, similar_part) 
        with open(f"{self.report_dir}/TestDatasetStatistics.lst", 'w') as _f:
            _f.write("PROBLEM DISTRIBUTION IN TEST DATASET\n")
            self.writeProblemDistribution(_start_problem, 
                            self.n_test_problems, self.test_ds[2], _f)
        self.writeSamplesCsv(self.test_ds[2], "test_samples.csv")
        return self.test_ds
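#Hypothetical usage sketch of testDataset(); the constructor arguments and
#the requested size and similar-sample fraction are illustrative values only.
_ds = BagTokSimilarityDS("dataset_dir", min_n_solutions = 8, test = 0.2)
_test_data, _labels, _annotations = _ds.testDataset(10000, 0.5)
print("Test dataset has", len(_labels), "samples")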
Example #3
def main(args):
    """
    Main function of program for classifying source code

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)

    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', 
                                                  patience=args.patience)
    callbacks = [early_stop]
    if args.ckpt_dir:
        latest_checkpoint = setupCheckpoint(args.ckpt_dir)
        checkpoint_callback = makeCkptCallback(args.ckpt_dir)
        callbacks.append(checkpoint_callback)
    else:
        latest_checkpoint = None

    _ds = SeqTokDataset(args.dataset,
                        min_n_solutions = max(args.min_solutions, 3),
                        max_n_problems = args.problems,
                        short_code_th = args.short_code,
                        long_code_th = args.long_code,
                        max_seq_length = args.seq_len,
                        test_part = args.testpart,
                        balanced_split = args.balanced_split)

    print(f"Classification of source code among {_ds.n_labels} classes")
    print("Technique of convolutional neural network on sequence of tokens\n")
    #Create parallelization strategy for multi GPU mode
    #It can be either MirroredStrategy or MultiWorkerMirroredStrategy,
    #but MultiWorkerMirroredStrategy works better
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    #strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    #Construct DNN in the scope of the parallelization strategy.
    with strategy.scope():
        # Everything that creates variables should be under the strategy scope.
        # In general this is only model construction & `compile()`.
        if latest_checkpoint:
            print("Restoring DNN from", latest_checkpoint)
            _dnn = tf.keras.models.load_model(latest_checkpoint)
        else:
            print("Constructing DNN")
            _dnn = makeDNN(_ds.n_token_types, _ds.n_labels, args)
    _val_ds, _train_ds = _ds.trainValidDs(args.valpart, args.batch)

    _tds = _train_ds[0]
    _tds = _tds.shuffle(50, reshuffle_each_iteration=True,
                        seed = UniqueSeed.getSeed()).prefetch(2)

    history = _dnn.fit(_tds,
                       validation_data = _val_ds[0].prefetch(2),
                       epochs = args.epochs, verbose = args.progress,
                       callbacks = callbacks)
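    #The other training drivers in this collection pickle the Keras history
    #for later analysis; assuming args.history names the output file here as
    #well, the same pattern would be:
    with open(args.history, 'wb') as _jar:
        pickle.dump(history.history, _jar)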
Example #4
def main(args):
    """
    Main function of program for predicting similarity of 
    two source code samples

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)

    latest_checkpoint = getCheckpoint(args.ckpt_dir, args.ckpt)

    _ds = SeqTok2WaySimDsTF(args.dataset,
            min_n_solutions = args.min_solutions,
            max_n_problems = args.problems,
            short_code_th = args.short_code,
            long_code_th = args.long_code,
            max_seq_length = args.seq_len,
            test = args.testpart,
            batch = args.batch,
            labels01 = not args.symmetric_labels)

    #Create parallelization strategy for multi GPU mode
    #It can be either MirroredStrategy or MultiWorkerMirroredStrategy,
    #but MultiWorkerMirroredStrategy works better
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    #strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    #Construct DNN in the scope of the parallelization strategy.
    with strategy.scope():
        # Everything that creates variables should be under the strategy scope.
        # In general this is only model construction & `compile()`.
        print("Restoring from", latest_checkpoint)
        _dnn = tf.keras.models.load_model(latest_checkpoint)
    _test_ds, _labels, _annotations = \
                    _ds.testDataset(args.valsize, args.similpart)
    _eval_loss, _eval_acc = _dnn.evaluate(_test_ds.prefetch(2), 
                                          verbose = args.progress)

    _prob = _dnn.predict(_test_ds.prefetch(2), verbose = args.progress)
    _confusion = SimilConfusAnalysis(_prob, _labels, 
                                     _ds.solution_names,
                                     _ds.problems, _annotations,
                                     labels01 = not args.symmetric_labels)
    print("\n")
    print("Evaluation accuracy is {:5.2f}%".format(_eval_acc * 100))
    print("Evaluation loss is {:5.2f}".format(_eval_loss))
    _confusion.writeReport()
Example #5
def main(args):
    """
    Main function of program for predicting similarity of 
    two source code samples

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)
    if args.ckpt_dir:
        _latest_checkpoint = setupCheckpoint(args.ckpt_dir)
        _checkpoint_callback = makeCkptCallback(args.ckpt_dir)
        _callbacks = [_checkpoint_callback]
    else:
        _latest_checkpoint = None
        _callbacks = None
    _ds = BagTokSimilarityDS(args.dataset,
                             min_n_solutions=args.min_solutions,
                             max_n_problems=args.problems,
                             short_code_th=args.short_code,
                             long_code_th=args.long_code,
                             test=args.testpart)

    _val_ds, _train_ds = \
        _ds.trainValidDsSameProblems(
            args.valpart, args.valsize, args.trainsize,
            args.similpart) if args.validation == "same" else \
        _ds.trainValidDsDifferentProblems(
            args.valpart, args.valsize, args.trainsize,
            args.similpart)

    _model_factory = SeqModelFactory(_ds.n_token_types * 2, 1)
    if _latest_checkpoint:
        print("Restoring from", _latest_checkpoint)
        _dnn = tf.keras.models.load_model(_latest_checkpoint)
    else:
        _dnn = _model_factory.denseDNN(args.dense)

    _history = _dnn.fit(_train_ds[0],
                        _train_ds[1],
                        epochs=args.epochs,
                        batch_size=args.batch,
                        validation_data=(_val_ds[0], _val_ds[1]),
                        verbose=args.progress,
                        callbacks=_callbacks)

    with open(args.history, 'wb') as _jar:
        pickle.dump(_history.history, _jar)
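#A minimal sketch of what SeqModelFactory(_ds.n_token_types * 2, 1) with
#denseDNN plausibly builds: a dense stack over the concatenated token bags
#of the two code samples, ending in a single sigmoid similarity score.
#Layer sizes and compile settings are assumptions, not the factory's code.
import tensorflow as tf

def denseSimilaritySketch(n_token_types, dense = (256, 128)):
    _model = tf.keras.Sequential()
    _model.add(tf.keras.layers.InputLayer(input_shape = (n_token_types * 2,)))
    for _width in dense:
        _model.add(tf.keras.layers.Dense(_width, activation = "relu"))
    _model.add(tf.keras.layers.Dense(1, activation = "sigmoid"))
    _model.compile(optimizer = "adam", loss = "binary_crossentropy",
                   metrics = ["accuracy"])
    return _model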
Example #6
def main(args):
    """
    Main function of program for classifying source code
    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)

    if args.ckpt_dir:
        _latest_checkpoint = setupCheckpoint(args.ckpt_dir)
        _checkpoint_callback = makeCkptCallback(args.ckpt_dir)
        _callbacks = [_checkpoint_callback]
    else:
        _latest_checkpoint = None
        _callbacks = None

    _ds = BagTokDataset(args.dataset,
                        min_n_solutions=max(args.min_solutions, 3),
                        max_n_problems=args.problems,
                        short_code_th=args.short_code,
                        long_code_th=args.long_code,
                        test_part=args.testpart,
                        balanced_split=args.balanced_split)
    print(f"Classification of source code among {_ds.n_labels} classes")
    print("Technique of fully connected neural network on bag of tokens\n")
    _model_factory = SeqModelFactory(_ds.n_token_types, _ds.n_labels)
    if _latest_checkpoint:
        print("Restoring DNN from", _latest_checkpoint)
        _dnn = tf.keras.models.load_model(_latest_checkpoint)
    else:
        print("Constructing DNN")
        _dnn = _model_factory.denseDNN(args.dense)

    _val_ds, _train_ds = _ds.trainValidDs(args.valpart, args.batch)

    _history = _dnn.fit(_train_ds[0],
                        _train_ds[1],
                        epochs=args.epochs,
                        batch_size=args.batch,
                        validation_data=(_val_ds[0], _val_ds[1]),
                        verbose=args.progress,
                        callbacks=_callbacks)
    with open(args.history, 'wb') as _jar:
        pickle.dump(_history.history, _jar)
Example #7
def main(args):
    """
    Main function of program for classifying source code

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)

    _ds = SeqTokDataset(args.dataset,
                        min_n_solutions=max(args.min_solutions, 3),
                        max_n_problems=args.problems,
                        short_code_th=args.short_code,
                        long_code_th=args.long_code,
                        max_seq_length=args.seq_len,
                        balanced_split=args.balanced_split)

    print(f"Classification of source code among {_ds.n_labels} classes")
    print("Technique of convolutional neural network on sequence of tokens\n")
    _model_factory = SeqModelFactory(_ds.n_token_types, _ds.n_labels)
    _convolutions = list(
        zip(args.filters, args.kernels, args.strides) if args.strides
        else zip(args.filters, args.kernels))
    _dnn = _model_factory.cnnDNN(_convolutions,
                                 args.dense,
                                 pool=args.pool,
                                 conv_act=args.conv_act,
                                 regular=(args.l1, args.l2),
                                 input_type=args.coding,
                                 optimizer=args.optimizer,
                                 embedding_dim=args.embed)

    _val_ds, _train_ds = _ds.trainValidDs(args.valpart, args.batch)

    train(_dnn, _val_ds[0], _train_ds[0], args.epochs, args.history,
          args.progress)
    _ds, _labels, _sample_names, _label_names = _val_ds
    confusionAnalysis(_dnn, _ds, _labels, _sample_names, _label_names)
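#Worked example of the _convolutions construction above (illustrative values):
#zipping per-layer filters, kernel sizes and, when given, strides produces
#one configuration tuple per convolutional layer.
_filters, _kernels, _strides = [64, 32], [5, 3], [2, 1]
print(list(zip(_filters, _kernels, _strides)))  # [(64, 5, 2), (32, 3, 1)]
print(list(zip(_filters, _kernels)))            # [(64, 5), (32, 3)]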
Example #8
def main(args):
    """
    Main function of program for classifying source code samples
    and evaluating its accuracy

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)
    
    _checkpoint = getCheckpoint(args.ckpt_dir, args.ckpt)

    _ds = BagTokDataset(args.dataset,
                        min_n_solutions = max(args.min_solutions, 3),
                        max_n_problems = args.problems,
                        short_code_th = args.short_code,
                        long_code_th = args.long_code,
                        test_part = args.testpart,
                        balanced_split = args.balanced_split)

    print("Restoring from", _checkpoint)
    _dnn = tf.keras.models.load_model(_checkpoint)

    _test_ds, _labels, _sample_names, _label_names = \
                                _ds.testDS(args.batch)
    _eval_loss, _eval_acc = _dnn.evaluate(_test_ds[0], _test_ds[1],
                                  verbose = args.progress)
    _prob = _dnn.predict(_test_ds[0], verbose = args.progress)
    _confusion = ClassConfusAnalysis(_prob, _labels, _sample_names,
                                     _label_names)
    _confusion.writeReport()
    print("\n")
    print("Evaluation accuracy is {:5.2f}%".format(_eval_acc * 100))
    print("Evaluation loss is {:5.2f}".format(_eval_loss))
Example #9
def main(args):
    """
    Main function of program for predicting similarity of 
    two source code samples

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)
    early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=args.patience)
    callbacks = [early_stop]

    #callbacks = []

    def lrScheduler(epoch, lr):
        """
        Utility function of learning rate scheduler
        Parameters:
        - epoch   -- current epoch
        - lr      -- current learning rate
        Returns new learning rate
        """
        if epoch < 10:
            return lr
        else:
            return lr * tf.math.exp(-0.1)

    lrUpdate = tf.keras.callbacks.LearningRateScheduler(lrScheduler,
                                                        verbose=1)
    #callbacks.append(lrUpdate)

    lrOnPlateaur = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                        factor=0.98,
                                                        patience=2,
                                                        cooldown=0,
                                                        min_delta=0.0001,
                                                        min_lr=0.00025,
                                                        verbose=2)
    #callbacks.append(lrOnPlateaur)

    if args.ckpt_dir:
        latest_checkpoint = setupCheckpoint(args.ckpt_dir)
        checkpoint_callback = makeCkptCallback(args.ckpt_dir)
        callbacks.append(checkpoint_callback)
    else:
        latest_checkpoint = None

    _ds = SeqTok2WaySimDsTF(args.dataset,
                            min_n_solutions=args.min_solutions,
                            max_n_problems=args.problems,
                            short_code_th=args.short_code,
                            long_code_th=args.long_code,
                            max_seq_length=args.seq_len,
                            test=args.testpart,
                            batch=args.batch,
                            labels01=not args.symmetric_labels)

    #Create parallelization strategy for multi GPU mode
    #It can be either MirroredStrategy or MultiWorkerMirroredStrategy,
    #but MultiWorkerMirroredStrategy works better
    strategy = tf.distribute.MultiWorkerMirroredStrategy()
    #strategy = tf.distribute.MirroredStrategy()
    print("Number of devices: {}".format(strategy.num_replicas_in_sync))
    #Construct DNN in the scope of the parallelization strategy.
    with strategy.scope():
        # Everything that creates variables should be under the strategy scope.
        # In general this is only model construction & `compile()`.
        if latest_checkpoint:
            print("Restoring from", latest_checkpoint)
            _dnn = tf.keras.models.load_model(latest_checkpoint)
        else:
            print("Constructing DNN")
            _dnn = makeDNN(_ds.n_token_types, args)

    _val_ds, _train_ds = \
        _ds.trainValidDsSameProblems(
            args.valpart, args.valsize, args.trainsize,
            args.similpart) \
        if args.validation == "same" else \
           _ds.trainValidDsDifferentProblems(
               args.valpart, args.valsize, args.trainsize,
               args.similpart)

    if args.sim_weight:
        _w_sim = args.sim_weight / (1.0 + args.sim_weight)
        _w_dissim = 1 - _w_sim
        history = _dnn.fit(_train_ds[0],
                           validation_data=_val_ds[0],
                           class_weight={
                               0: _w_dissim,
                               1: _w_sim
                           },
                           epochs=args.epochs,
                           steps_per_epoch=args.steps_per_epoch,
                           verbose=args.progress,
                           callbacks=callbacks)
    else:
        history = _dnn.fit(_train_ds[0].repeat(),
                           validation_data=_val_ds[0],
                           epochs=args.epochs,
                           steps_per_epoch=args.steps_per_epoch,
                           verbose=args.progress,
                           callbacks=callbacks)

    with open(args.history, 'wb') as _jar:
        pickle.dump(history.history, _jar)
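#Worked example of the class-weight normalization above: sim_weight = 3.0
#(an illustrative value) weights similar pairs three times as heavily as
#dissimilar ones while keeping the two weights summing to one.
_sim_weight = 3.0
_w_sim = _sim_weight / (1.0 + _sim_weight)  # 0.75
_w_dissim = 1 - _w_sim                      # 0.25
print({0: _w_dissim, 1: _w_sim})            # {0: 0.25, 1: 0.75}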
Example #10
def main(args):
    """
    Main function of program for predicting similarity of 
    two source code samples

    Parameters:
    - args  -- Parsed command line arguments
               as object returned by ArgumentParser
    """
    resetSeeds()
    DataRand.setDsSeeds(args.seed_ds)

    _convolutions = list(
        zip(args.filters, args.kernels, args.strides) if args.strides
        else zip(args.filters, args.kernels))

    if args.model == 'basic':
        from SeqTokSimDataset import SeqTokSimilarityDS
        from SeqModelMaker import SeqModelFactory
        _ds = SeqTokSimilarityDS(args.dataset,
                                 min_n_solutions=args.min_solutions,
                                 max_n_problems=args.problems,
                                 short_code_th=args.short_code,
                                 long_code_th=args.long_code,
                                 max_seq_length=args.seq_len)

        _model_factory = SeqModelFactory(_ds.n_token_types * 2, 1)
        _dnn = _model_factory.cnnDNN(_convolutions,
                                     args.dense,
                                     input_type=args.coding,
                                     pool=args.pool,
                                     conv_act=args.conv_act,
                                     regular=(args.l1, args.l2),
                                     optimizer=args.optimizer)
        print("Basic model for sequence similarity is constructed")
    else:
        from SeqTok2WaySimDataset import SeqTok2WaySimDS
        from FuncModelMaker import FuncModelFactory
        _ds = SeqTok2WaySimDS(args.dataset,
                              min_n_solutions=args.min_solutions,
                              max_n_problems=args.problems,
                              short_code_th=args.short_code,
                              long_code_th=args.long_code,
                              max_seq_length=args.seq_len)

        _model_factory = FuncModelFactory(1, regularizer=(args.l1, args.l2))
        _dnn = _model_factory.twoWaySimilarityCNN(
            _ds.n_token_types,
            _convolutions,
            args.dense,
            pool=args.pool,
            side_dense=[],
            conv_act=args.conv_act,
            shared=(args.model == 'symmetric'),
            input_type=args.coding,
            embedding_dim=args.embed,
            optimizer=args.optimizer)
        print("Two way model for sequence similarity is constructed")

    _val_ds, _train_ds = \
        _ds.trainValidDsSameProblems(
            args.valpart, args.valsize, args.trainsize,
            args.similpart) \
        if args.validation == "same" else \
           _ds.trainValidDsDifferentProblems(
               args.valpart, args.valsize, args.trainsize,
               args.similpart)
    train(_dnn, _train_ds, _val_ds, args.epochs, args.batch, args.history,
          args.progress)

    del _train_ds
    _val_samples, _labels, _annotations = _val_ds
    _prob = predict(_dnn, _val_samples, batch=args.batch)
    _confusion = SimilConfusAnalysis(_prob, _labels, _ds.solution_names,
                                     _ds.problems, _annotations)
    _confusion.writeReport()
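#A minimal sketch of the "two way" idea behind twoWaySimilarityCNN: each code
#sample runs through a convolutional encoder (weights shared when the model
#is 'symmetric'), and the merged encodings are scored with a sigmoid unit.
#All layer sizes here are assumptions, not the factory's configuration.
import tensorflow as tf

def twoWaySimilaritySketch(n_token_types, seq_len, shared = True):
    def makeEncoder():
        return tf.keras.Sequential([
            tf.keras.layers.Embedding(n_token_types, 32),
            tf.keras.layers.Conv1D(64, 5, activation = "relu"),
            tf.keras.layers.GlobalMaxPooling1D()])
    _input1 = tf.keras.Input(shape = (seq_len,))
    _input2 = tf.keras.Input(shape = (seq_len,))
    _encoder1 = makeEncoder()
    _encoder2 = _encoder1 if shared else makeEncoder()
    _merged = tf.keras.layers.concatenate(
        [_encoder1(_input1), _encoder2(_input2)])
    _output = tf.keras.layers.Dense(1, activation = "sigmoid")(_merged)
    _model = tf.keras.Model([_input1, _input2], _output)
    _model.compile(optimizer = "adam", loss = "binary_crossentropy",
                   metrics = ["accuracy"])
    return _model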
Example #11
    def trainValidDsDifferentProblems(self, val_train_split,
                                      val_size, train_size,
                                      similar_part):
        """
        Make datasets for training source code similarity analyser

        Both training and validation parts of dataset are made.

        The validation part is made from solutions of problems 
        different from those used for making the training part 
        of the dataset.
        Validation uses the problems at the beginning of the list;
        training uses the problems between the validation and test ones.

        The function provides the following:
        - No code samples (problems) are shared between 
          the validation and training datasets
        - Validation and training datasets are created from 
          solutions of different problems
        - Number of samples of similar solutions of each problem
          is proportional to the number of all pairs of solutions 
          of that problem.
        - All pairs of similar solutions are constructed from
          different samples, i.e. no pair contains the same 
          solution twice; no self-similar pair is constructed.
          !!!Possibly this condition should be parameterized.
        - Pairs of dissimilar solutions are constructed such that
          each problem is selected with probability 
          proportional to the number of its solutions.
        Similar samples are constructed as pairs of solutions
        of the same problem.
        Dissimilar samples are constructed as pairs of solutions
        of different problems.
        All problems are split into ones used for training 
        and ones used for validation. The solutions 
        of the resulting subsets of problems are used for 
        constructing the training and validation datasets.

        Parameters:
        - val_train_split  -- Fraction of original problems used 
                              for constructing the validation dataset.
                              It is specified as a float.
                              The set of all problems is split 
                              according to this parameter
        - val_size         -- Size of validation dataset to create
        - train_size       -- Size of training dataset to create
        - similar_part     -- Fraction of samples of the created dataset 
                              representing similar source code samples
        Returns:
        - validation and training datasets:
        Each of them has the following items:
        - constructed dataset 
          * either as a single numpy array 
          * or a list of 2 of them 
          * or a TF dataset
        - labels as numpy array
        - list of similarity samples in the form of 4-tuples
          <problem1, solution1, problem2, solution2>
        """
        print("Constructing training and validation datasets " + 
              "from solutions of different problems")
        _n_val_probls = int(float(self.n_tran_ds_problems) * 
                            val_train_split)
        print(f"Validation and training use {_n_val_probls} " + 
              f"and {self.n_tran_ds_problems - _n_val_probls} problems respectively")
        if _n_val_probls < 2 or self.n_tran_ds_problems - _n_val_probls < 2:
            sys.exit(f"Too few problems for training " + 
                     f"{self.n_tran_ds_problems - _n_val_probls} " + 
                     f"or validation {_n_val_probls}")
        _val_problem_solutions = \
                self.train_ds_probl_solutions[: _n_val_probls]
        _train_problem_solutions = \
                self.train_ds_probl_solutions[_n_val_probls :]
        self.writeProblemList(self.train_ds_problems[: _n_val_probls],
                              "validation_problems.txt")
        self.writeProblemList(self.train_ds_problems[_n_val_probls :],
                              "training_problems.txt")
        DataRand.setSeed("SIMIL_VALID_DS_SEED")
        self.val_ds = self._makeDs(0, _val_problem_solutions,
                                   val_size, similar_part)
        DataRand.setSeed("SIMIL_TRAIN_DS_SEED")
        self.train_ds = self._makeDs(_n_val_probls,
            _train_problem_solutions, train_size, similar_part)
        self.reportDatasetStatistics(0, _n_val_probls, self.val_ds[2], 
            _n_val_probls, self.n_tran_ds_problems - _n_val_probls, 
            self.train_ds[2])
        self.writeSamplesCsv(self.val_ds[2], "val_samples.csv")
        self.writeSamplesCsv(self.train_ds[2], "train_samples.csv")
        return self.val_ds, self.train_ds
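#Hypothetical usage sketch: reserve 20% of the training-pool problems for
#validation; dataset sizes and the similar-sample fraction are illustrative.
_val_ds, _train_ds = _ds.trainValidDsDifferentProblems(
    0.2, val_size = 10000, train_size = 100000, similar_part = 0.5)
_train_data, _train_labels, _train_samples = _train_ds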
Example #12
    def trainValidDsSameProblems(self, val_train_split,
                                 val_size, train_size,
                                 similar_part):
        """
        Make datasets for training source code similarity analyser
        Both training and validation parts of dataset are made.

        The validation part is made from solutions of the same 
        problems that were used for making the training part 
        of the dataset.
        However, the test dataset is made from solutions of 
        problems different from those used here for training 
        and validation.

        The function provides the following:
        - No code samples (problem solutions) are shared between 
          the validation and training datasets
        - Each problem is represented in both created datasets
          with the same fraction of its code samples
        - Number of samples of similar solutions of each problem
          is proportional to the number of all pairs of solutions 
          of that problem.
        - All pairs of similar solutions are constructed from
          different samples, i.e. no pair contains the same 
          solution twice; no self-similar pair is constructed.
          !!!Possibly this condition should be parameterized.
        - Pairs of dissimilar solutions are constructed such that
          each problem is selected with probability 
          proportional to the number of its solutions.
        Similar samples are constructed as pairs of solutions
        of the same problem.
        Dissimilar samples are constructed as pairs of solutions
        of different problems.
        The solution samples of each problem are split individually.
        The obtained subsets are used for constructing the training 
        and validation datasets.

        Parameters:
        - val_train_split  -- Fraction of training samples used 
                              for constructing the validation dataset.
                              It is specified as a float.
                              The solution samples of each problem are 
                              split individually according to this parameter
        - val_size         -- Size of validation dataset to create
        - train_size       -- Size of training dataset to create
        - similar_part     -- Fraction of samples of the created dataset 
                              representing similar source code samples
        Returns:
        - validation and training datasets:
        Each of them has the following items:
        - constructed dataset 
          * either as a single numpy array 
          * or a list of 2 of them 
          * or a TF dataset
        - labels as numpy array
        - list of similarity samples in the form of 4-tuples
          <problem1, solution1, problem2, solution2>
        """
        print("Constructing training and validation datasets " + 
              "from different solutions of the same problems")
        _val_problem_solutions = \
            [_solutions[: int(float(len(_solutions)) * 
                                   val_train_split)] 
             for _solutions in self.train_ds_probl_solutions]
        _train_problem_solutions = \
            [_solutions[int(float(len(_solutions)) * 
                                 val_train_split) :] 
             for _solutions in self.train_ds_probl_solutions]
        DataRand.setSeed("SIMIL_TRAIN_DS_SEED")
        self.train_ds = self._makeDs(0, _train_problem_solutions,
                                     train_size, similar_part)
        DataRand.setSeed("SIMIL_VALID_DS_SEED")
        self.val_ds = self._makeDs(0, _val_problem_solutions,
                                   val_size, similar_part)
        self.reportDatasetStatistics(0, self.n_tran_ds_problems, 
                                     self.val_ds[2], 
                                     0, self.n_tran_ds_problems, 
                                     self.train_ds[2])
        return self.val_ds, self.train_ds
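#Worked example of the per-problem split above (illustrative numbers): with
#val_train_split = 0.25, a problem with 40 solutions contributes its first
#10 solutions to the validation pool and the remaining 30 to training.
_solutions = list(range(40))
_n_val = int(float(len(_solutions)) * 0.25)
_val_part, _train_part = _solutions[: _n_val], _solutions[_n_val :]
print(len(_val_part), len(_train_part))  # 10 30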