def main(args): """ Main function of program for predicting similarity of two source code samples Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) _checkpoint = getCheckpoint(args.ckpt_dir, args.ckpt) _ds = BagTokSimilarityDS(args.dataset, min_n_solutions = args.min_solutions, max_n_problems = args.problems, short_code_th = args.short_code, long_code_th = args.long_code, test = args.testpart) print("Restoring from", _checkpoint) _dnn = tf.keras.models.load_model(_checkpoint) _test_ds, _labels, _annotations = \ _ds.testDataset(args.valsize, args.similpart) _eval_loss, _eval_acc = _dnn.evaluate(_test_ds, _labels, verbose = args.progress) _prob = _dnn.predict(_test_ds, verbose = args.progress) _confusion = SimilConfusAnalysis(_prob, _labels, _ds.solution_names, _ds.problems, _annotations) print("\n") print("Evaluation accuracy is {:5.2f}%".format(_eval_acc * 100)) print("Evaluation loss is {:5.2f}".format(_eval_loss)) _confusion.writeReport()
def testDataset(self, size, similar_part):
    """
    Make test dataset
    The test dataset is always constructed from problems different
    from the ones used for training and validation.
    It uses the last problems in the list.
    More details are in comments to self.trainValidDsDifferentProblems
    Parameters:
    - size          -- Size of dataset to create
    - similar_part  -- Fraction of samples of the created dataset
                       representing similar source code samples
    Returns:
    - Test dataset having the following items:
      - constructed dataset
        * either as a single numpy array
        * or a list of 2 of them
        * or a TF dataset
      - labels as numpy array
      - list of similarity samples in form of 4-tuples
        <problem1, solution1, problem2, solution2>
    """
    print("Constructing test dataset")
    if not self.test_problem_solutions:
        sys.exit("Test dataset cannot be created as it was not defined.")
    DataRand.setSeed("SIMIL_TEST_DS_SEED")
    _start_problem = self.n_problems - self.n_test_problems
    self.test_ds = self._makeDs(_start_problem,
                                self.test_problem_solutions,
                                size, similar_part)
    with open(f"{self.report_dir}/TestDatasetStatistics.lst", 'w') as _f:
        _f.write("PROBLEM DISTRIBUTION IN TEST DATASET\n")
        self.writeProblemDistribution(_start_problem,
                                      self.n_test_problems,
                                      self.test_ds[2], _f)
    self.writeSamplesCsv(self.test_ds[2], "test_samples.csv")
    return self.test_ds

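# A small usage sketch of the annotations returned above, relying only on
# the 4-tuple layout <problem1, solution1, problem2, solution2> documented
# in the docstring. It tallies how often each problem occurs in the test
# pairs, which is the kind of distribution writeProblemDistribution reports.
from collections import Counter

def problemPairCounts(annotations):
    """Count occurrences of each problem across similarity pairs"""
    _counts = Counter()
    for _problem1, _solution1, _problem2, _solution2 in annotations:
        _counts[_problem1] += 1
        _counts[_problem2] += 1
    return _counts
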
def main(args): """ Main function of program for classifying source code Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=args.patience) callbacks = [early_stop] if args.ckpt_dir: latest_checkpoint = setupCheckpoint(args.ckpt_dir) checkpoint_callback = makeCkptCallback(args.ckpt_dir) callbacks.append(checkpoint_callback) else: latest_checkpoint = None _ds = SeqTokDataset(args.dataset, min_n_solutions = max(args.min_solutions, 3), max_n_problems = args.problems, short_code_th = args.short_code, long_code_th = args.long_code, max_seq_length = args.seq_len, test_part = args.testpart, balanced_split = args.balanced_split) print(f"Classification of source code among {_ds.n_labels} classes") print("Technique of convolutional neural network on sequence of tokens\n") #Create parallelization strategy for multi GPU mode #It also can be either MirroredStrategy or MultiWorkerMirroredStrategy #But MultiWorkerMirroredStrategy works better strategy = tf.distribute.MultiWorkerMirroredStrategy() #distribute.MirroredStrategy() print("Number of devices: {}".format(strategy.num_replicas_in_sync)) #Construct DNN in the scope of parrelization strategy. with strategy.scope(): # Everything that creates variables should be under the strategy scope. # In general this is only model construction & `compile()`. if latest_checkpoint: print("Restoring DNN from", latest_checkpoint) _dnn = tf.keras.models.load_model(latest_checkpoint) else: print("Constructing DNN") _dnn = makeDNN(_ds.n_token_types, _ds.n_labels, args) _val_ds, _train_ds = _ds.trainValidDs(args.valpart, args.batch) _tds = _train_ds[0] _tds = _tds.shuffle(50, reshuffle_each_iteration=True, seed = UniqueSeed.getSeed()).prefetch(2) history = _dnn.fit(_tds, validation_data = _val_ds[0].prefetch(2), epochs = args.epochs, verbose = args.progress, callbacks = callbacks)
def main(args): """ Main function of program for predicting similarity of two source code samples Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) latest_checkpoint = getCheckpoint(args.ckpt_dir, args.ckpt) _ds = SeqTok2WaySimDsTF(args.dataset, min_n_solutions = args.min_solutions, max_n_problems = args.problems, short_code_th = args.short_code, long_code_th = args.long_code, max_seq_length = args.seq_len, test = args.testpart, batch = args.batch, labels01 = not args.symmetric_labels) #Create parallelization strategy for multi GPU mode #It also can be either MirroredStrategy or MultiWorkerMirroredStrategy #But MultiWorkerMirroredStrategy works better strategy = tf.distribute.MultiWorkerMirroredStrategy() #distribute.MirroredStrategy() print("Number of devices: {}".format(strategy.num_replicas_in_sync)) #Construct DNN in the scope of parallelization strategy. with strategy.scope(): # Everything that creates variables should be under the strategy scope. # In general this is only model construction & `compile()`. print("Restoring from", latest_checkpoint) _dnn = tf.keras.models.load_model(latest_checkpoint) _test_ds, _labels, _annotations = \ _ds.testDataset(args.valsize, args.similpart) _eval_loss, _eval_acc = _dnn.evaluate(_test_ds.prefetch(2), verbose = args.progress) _prob = _dnn.predict(_test_ds.prefetch(2), verbose = args.progress) _confusion = SimilConfusAnalysis(_prob, _labels, _ds.solution_names, _ds.problems, _annotations, labels01 = not args.symmetric_labels) print("\n") print("Evaluation accuracy is {:5.2f}%".format(_eval_acc * 100)) print("Evaluation loss is {:5.2f}".format(_eval_loss)) _confusion.writeReport()
def main(args): """ Main function of program for predicting similarity of two source code samples Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) if args.ckpt_dir: _latest_checkpoint = setupCheckpoint(args.ckpt_dir) _checkpoint_callback = makeCkptCallback(args.ckpt_dir) _callbacks = [_checkpoint_callback] else: _latest_checkpoint = None _callbacks = None _ds = BagTokSimilarityDS(args.dataset, min_n_solutions=args.min_solutions, max_n_problems=args.problems, short_code_th=args.short_code, long_code_th=args.long_code, test=args.testpart) _val_ds, _train_ds = \ _ds.trainValidDsSameProblems( args.valpart, args.valsize, args.trainsize, args.similpart) if args.validation == "same" else \ _ds.trainValidDsDifferentProblems( args.valpart, args.valsize, args.trainsize, args.similpart) _model_factory = SeqModelFactory(_ds.n_token_types * 2, 1) if _latest_checkpoint: print("Restoring from", _latest_checkpoint) _dnn = tf.keras.models.load_model(_latest_checkpoint) else: _dnn = _model_factory.denseDNN(args.dense) _history = _dnn.fit(_train_ds[0], _train_ds[1], epochs=args.epochs, batch_size=args.batch, validation_data=(_val_ds[0], _val_ds[1]), verbose=args.progress, callbacks=_callbacks) with open(args.history, 'wb') as _jar: pickle.dump(_history.history, _jar)
def main(args): """ Main function of program for classifying source code Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) if args.ckpt_dir: _latest_checkpoint = setupCheckpoint(args.ckpt_dir) _checkpoint_callback = makeCkptCallback(args.ckpt_dir) _callbacks = [_checkpoint_callback] else: _latest_checkpoint = None _callbacks = None _ds = BagTokDataset(args.dataset, min_n_solutions=max(args.min_solutions, 3), max_n_problems=args.problems, short_code_th=args.short_code, long_code_th=args.long_code, test_part=args.testpart, balanced_split=args.balanced_split) print(f"Classification of source code among {_ds.n_labels} classes") print("Technique of fully connected neural network on bag of tokens\n") _model_factory = SeqModelFactory(_ds.n_token_types, _ds.n_labels) if _latest_checkpoint: print("Restoring DNN from", _latest_checkpoint) _dnn = tf.keras.models.load_model(_latest_checkpoint) else: print("Constructing DNN") _dnn = _model_factory.denseDNN(args.dense) _val_ds, _train_ds = _ds.trainValidDs(args.valpart, args.batch) _history = _dnn.fit(_train_ds[0], _train_ds[1], epochs=args.epochs, batch_size=args.batch, validation_data=(_val_ds[0], _val_ds[1]), verbose=args.progress, callbacks=_callbacks) with open(args.history, 'wb') as _jar: pickle.dump(_history.history, _jar)
def main(args): """ Main function of program for classifying source code Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) _ds = SeqTokDataset(args.dataset, min_n_solutions=max(args.min_solutions, 3), max_n_problems=args.problems, short_code_th=args.short_code, long_code_th=args.long_code, max_seq_length=args.seq_len, balanced_split=args.balanced_split) print(f"Classification of source code among {_ds.n_labels} classes") print("Technique of convolutional neural network on sequence of tokens\n") _model_factory = SeqModelFactory(_ds.n_token_types, _ds.n_labels) _convolutions = list( zip(args.filters, args.kernels, args.strides) if args. strides else zip(args.filters, args.kernels)) _dnn = _model_factory.cnnDNN(_convolutions, args.dense, pool=args.pool, conv_act=args.conv_act, regular=(args.l1, args.l2), input_type=args.coding, optimizer=args.optimizer, embedding_dim=args.embed) _val_ds, _train_ds = _ds.trainValidDs(args.valpart, args.batch) train(_dnn, _val_ds[0], _train_ds[0], args.epochs, args.history, args.progress) _ds, _labels, _sample_names, _label_names = _val_ds confusionAnalysis(_dnn, _ds, _labels, _sample_names, _label_names)
def main(args): """ Main function of program for classifying source code samples and evaluating its accuracy Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) _checkpoint = getCheckpoint(args.ckpt_dir, args.ckpt) _ds = BagTokDataset(args.dataset, min_n_solutions = max(args.min_solutions, 3), max_n_problems = args.problems, short_code_th = args.short_code, long_code_th = args.long_code, test_part = args.testpart, balanced_split = args.balanced_split) print("Restoring from", _checkpoint) _dnn = tf.keras.models.load_model(_checkpoint) _test_ds, _labels, _sample_names, _label_names = \ _ds.testDS(args.batch) _eval_loss, _eval_acc = _dnn.evaluate(_test_ds[0], _test_ds[1], verbose = args.progress) _prob = _dnn.predict(_test_ds[0], verbose = args.progress) _confusion = ClassConfusAnalysis(_prob, _labels, _sample_names, _label_names) _confusion.writeReport() print("\n") print("Evaluation accuracy is {:5.2f}%".format(_eval_acc * 100)) print("Evaluation loss is {:5.2f}".format(_eval_loss))
def main(args): """ Main function of program for predicting similarity of two source code samples Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=args.patience) callbacks = [early_stop] #callbacks = [] def lrScheduler(epoch, lr): """ Utility function of learning rate scheduler Parameters: - epoch -- current epoch - lr -- current learning rate Returns new learning rate """ if epoch < 10: return lr else: return lr * tf.math.exp(-0.1) lrUpdate = \ tf.keras.callbacks.LearningRateScheduler( lrScheduler, verbose=1) lrOnPlateaur = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.98, patience=2, cooldown=0, min_delta=0.0001, min_lr=0.00025, verbose=2) #callbacks.append(lrOnPlateaur) if args.ckpt_dir: latest_checkpoint = setupCheckpoint(args.ckpt_dir) checkpoint_callback = makeCkptCallback(args.ckpt_dir) callbacks.append(checkpoint_callback) else: latest_checkpoint = None _ds = SeqTok2WaySimDsTF(args.dataset, min_n_solutions=args.min_solutions, max_n_problems=args.problems, short_code_th=args.short_code, long_code_th=args.long_code, max_seq_length=args.seq_len, test=args.testpart, batch=args.batch, labels01=not args.symmetric_labels) #Create parallelization strategy for multi GPU mode #It also can be either MirroredStrategy or MultiWorkerMirroredStrategy #But MultiWorkerMirroredStrategy works better strategy = tf.distribute.MultiWorkerMirroredStrategy() #distribute.MirroredStrategy() print("Number of devices: {}".format(strategy.num_replicas_in_sync)) #Construct DNN in the scope of parrelization strategy. with strategy.scope(): # Everything that creates variables should be under the strategy scope. # In general this is only model construction & `compile()`. if latest_checkpoint: print("Restoring from", latest_checkpoint) _dnn = tf.keras.models.load_model(latest_checkpoint) else: print("Constructing DNN") _dnn = makeDNN(_ds.n_token_types, args) _val_ds, _train_ds = \ _ds.trainValidDsSameProblems( args.valpart, args.valsize, args.trainsize, args.similpart) \ if args.validation == "same" else \ _ds.trainValidDsDifferentProblems( args.valpart, args.valsize, args.trainsize, args.similpart) if args.sim_weight: _w_sim = args.sim_weight / (1.0 + args.sim_weight) _w_dissim = 1 - _w_sim history = _dnn.fit(_train_ds[0], validation_data=_val_ds[0], class_weight={ 0: _w_dissim, 1: _w_sim }, epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, verbose=args.progress, callbacks=callbacks) else: history = _dnn.fit(_train_ds[0].repeat(), validation_data=_val_ds[0], epochs=args.epochs, steps_per_epoch=args.steps_per_epoch, verbose=args.progress, callbacks=callbacks) with open(args.history, 'wb') as _jar: pickle.dump(history.history, _jar)
def main(args): """ Main function of program for predicting similarity of two source code samples Parameters: - args -- Parsed command line arguments as object returned by ArgumentParser """ resetSeeds() DataRand.setDsSeeds(args.seed_ds) _convolutions = list( zip(args.filters, args.kernels, args.strides) if args. strides else zip(args.filters, args.kernels)) if args.model == 'basic': from SeqTokSimDataset import SeqTokSimilarityDS from SeqModelMaker import SeqModelFactory _ds = SeqTokSimilarityDS(args.dataset, min_n_solutions=args.min_solutions, max_n_problems=args.problems, short_code_th=args.short_code, long_code_th=args.long_code, max_seq_length=args.seq_len) _model_factory = SeqModelFactory(_ds.n_token_types * 2, 1) _dnn = _model_factory.cnnDNN(_convolutions, args.dense, input_type=args.coding, pool=args.pool, conv_act=args.conv_act, regular=(args.l1, args.l2), optimizer=args.optimizer) print("Basic model for sequence similarity is constructed") else: from SeqTok2WaySimDataset import SeqTok2WaySimDS from FuncModelMaker import FuncModelFactory _ds = SeqTok2WaySimDS(args.dataset, min_n_solutions=args.min_solutions, max_n_problems=args.problems, short_code_th=args.short_code, long_code_th=args.long_code, max_seq_length=args.seq_len) _model_factory = FuncModelFactory(1, regularizer=(args.l1, args.l2)) _dnn = _model_factory.twoWaySimilarityCNN( _ds.n_token_types, _convolutions, args.dense, pool=args.pool, side_dense=[], conv_act=args.conv_act, shared=(args.model == 'symmetric'), input_type=args.coding, embedding_dim=args.embed, optimizer=args.optimizer) print("Two way model for sequence similarity is constructed") _val_ds, _train_ds = \ _ds.trainValidDsSameProblems( args.valpart, args.valsize, args.trainsize, args.similpart) \ if args.validation == "same" else \ _ds.trainValidDsDifferentProblems( args.valpart, args.valsize, args.trainsize, args.similpart) train(_dnn, _train_ds, _val_ds, args.epochs, args.batch, args.history, args.progress) del _train_ds _val_samples, _labels, _annotations = _val_ds _prob = predict(_dnn, _val_samples, batch=args.batch) _confusion = SimilConfusAnalysis(_prob, _labels, _ds.solution_names, _ds.problems, _annotations) _confusion.writeReport()
def trainValidDsDifferentProblems(self, val_train_split, val_size,
                                  train_size, similar_part):
    """
    Make datasets for training source code similarity analyser
    Both training and validation parts of the dataset are made.
    The validation part is made from solutions of problems different
    from those used for making the training part of the dataset.
    Validation uses the problems at the beginning of the list;
    training uses the problems between the validation and test ones.
    The function provides the following:
    - No code samples (problems) are used for creating both the
      validation and training datasets
    - Validation and training datasets are created from solutions
      of different problems
    - The number of similar-solution samples of each problem is
      proportional to the number of all pairs of solutions
      of that problem.
    - All pairs of similar solutions are constructed from different
      samples, i.e. no pair contains the same solution twice.
      No self-similar pair is constructed.
      !!!Possibly this condition should be parameterized.
    - Pairs of dissimilar solutions are constructed such that each
      problem is selected with probability proportional to the
      number of its solutions.
    Similar samples are constructed as pairs of solutions
    of the same problem.
    Dissimilar samples are constructed as pairs of solutions
    of different problems.
    All problems are split into those used for training and those
    used for validation. The solutions of the resulting subsets of
    problems are used for constructing the training and validation
    datasets.
    Parameters:
    - val_train_split -- Fraction of the original problems used for
                         constructing the validation dataset,
                         specified as float.
                         The set of all problems is split according
                         to this parameter.
    - val_size        -- Size of validation dataset to create
    - train_size      -- Size of training dataset to create
    - similar_part    -- Fraction of samples of the created dataset
                         representing similar source code samples
    Returns:
    - validation and training datasets.
      Each of them has the following items:
      - constructed dataset
        * either as a single numpy array
        * or a list of 2 of them
        * or a TF dataset
      - labels as numpy array
      - list of similarity samples in form of 4-tuples
        <problem1, solution1, problem2, solution2>
    """
    print("Constructing training and validation datasets " +
          "from solutions of different problems")
    _n_val_probls = int(float(self.n_tran_ds_problems) * val_train_split)
    print(f"Validation and training use {_n_val_probls} and " +
          f"{self.n_tran_ds_problems - _n_val_probls} problems respectively")
    if _n_val_probls < 2 or self.n_tran_ds_problems - _n_val_probls < 2:
        sys.exit(f"Too few problems for training " +
                 f"{self.n_tran_ds_problems - _n_val_probls} " +
                 f"or validation {_n_val_probls}")
    _val_problem_solutions = \
        self.train_ds_probl_solutions[: _n_val_probls]
    _train_problem_solutions = \
        self.train_ds_probl_solutions[_n_val_probls :]
    self.writeProblemList(self.train_ds_problems[: _n_val_probls],
                          "validation_problems.txt")
    self.writeProblemList(self.train_ds_problems[_n_val_probls :],
                          "training_problems.txt")
    DataRand.setSeed("SIMIL_VALID_DS_SEED")
    self.val_ds = self._makeDs(0, _val_problem_solutions,
                               val_size, similar_part)
    DataRand.setSeed("SIMIL_TRAIN_DS_SEED")
    self.train_ds = self._makeDs(_n_val_probls, _train_problem_solutions,
                                 train_size, similar_part)
    self.reportDatasetStatistics(0, _n_val_probls, self.val_ds[2],
                                 _n_val_probls,
                                 self.n_tran_ds_problems - _n_val_probls,
                                 self.train_ds[2])
    self.writeSamplesCsv(self.val_ds[2], "val_samples.csv")
    self.writeSamplesCsv(self.train_ds[2], "train_samples.csv")
    return self.val_ds, self.train_ds

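# A worked example of the problem-level split above, with illustrative
# numbers: 100 training-pool problems and val_train_split = 0.2 give
# problems [0, 20) to validation and [20, 100) to training, so no problem
# contributes solutions to both datasets.
_n_problems = 100
_split = 0.2
_n_val = int(float(_n_problems) * _split)    # 20
_val_problems = range(0, _n_val)             # validation problems
_train_problems = range(_n_val, _n_problems) # training problems
assert not set(_val_problems) & set(_train_problems)
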
def trainValidDsSameProblems(self, val_train_split, val_size,
                             train_size, similar_part):
    """
    Make datasets for training source code similarity analyser
    Both training and validation parts of the dataset are made.
    The validation part is made from solutions of the same problems
    that are used for making the training part of the dataset.
    However, the test dataset is made from solutions of problems
    different from the ones used here for training and validation.
    The function provides the following:
    - No code samples (problem solutions) are used for creating
      both the validation and training datasets
    - Each problem is represented in both created datasets with
      the same fraction of its code samples
    - The number of similar-solution samples of each problem is
      proportional to the number of all pairs of solutions
      of that problem.
    - All pairs of similar solutions are constructed from different
      samples, i.e. no pair contains the same solution twice.
      No self-similar pair is constructed.
      !!!Possibly this condition should be parameterized.
    - Pairs of dissimilar solutions are constructed such that each
      problem is selected with probability proportional to the
      number of its solutions.
    Similar samples are constructed as pairs of solutions
    of the same problem.
    Dissimilar samples are constructed as pairs of solutions
    of different problems.
    The solutions of each problem are split individually.
    The obtained subsets are used for constructing the training
    and validation datasets.
    Parameters:
    - val_train_split -- Fraction of training samples used for
                         constructing the validation dataset,
                         specified as float.
                         The solutions of each problem are split
                         individually according to this parameter.
    - val_size        -- Size of validation dataset to create
    - train_size      -- Size of training dataset to create
    - similar_part    -- Fraction of samples of the created dataset
                         representing similar source code samples
    Returns:
    - validation and training datasets.
      Each of them has the following items:
      - constructed dataset
        * either as a single numpy array
        * or a list of 2 of them
        * or a TF dataset
      - labels as numpy array
      - list of similarity samples in form of 4-tuples
        <problem1, solution1, problem2, solution2>
    """
    print("Constructing training and validation datasets " +
          "from different solutions of the same problems")
    _val_problem_solutions = \
        [_solutions[: int(float(len(_solutions)) * val_train_split)]
         for _solutions in self.train_ds_probl_solutions]
    _train_problem_solutions = \
        [_solutions[int(float(len(_solutions)) * val_train_split) :]
         for _solutions in self.train_ds_probl_solutions]
    DataRand.setSeed("SIMIL_TRAIN_DS_SEED")
    self.train_ds = self._makeDs(0, _train_problem_solutions,
                                 train_size, similar_part)
    DataRand.setSeed("SIMIL_VALID_DS_SEED")
    self.val_ds = self._makeDs(0, _val_problem_solutions,
                               val_size, similar_part)
    self.reportDatasetStatistics(0, self.n_tran_ds_problems,
                                 self.val_ds[2],
                                 0, self.n_tran_ds_problems,
                                 self.train_ds[2])
    return self.val_ds, self.train_ds

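# A worked example of the per-problem split above, with hypothetical
# solution names: each problem's solution list is cut individually, so
# every problem appears in both datasets while no individual solution does.
# With 10 solutions and val_train_split = 0.25, int(10 * 0.25) = 2
# solutions go to validation and the remaining 8 to training.
_solutions = [f"sol{_i}" for _i in range(10)]
_split = 0.25
_cut = int(float(len(_solutions)) * _split)  # 2
_val_part = _solutions[: _cut]
_train_part = _solutions[_cut :]
assert not set(_val_part) & set(_train_part)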