# Imports required by the snippets below (standard library / third-party only;
# repo-internal names such as PathContextReader, Code2VecVocabs, EstimatorAction,
# VocabType, NetType and `common` come from the repository's own modules).
import os
import time
from functools import partial
from typing import Iterable, List, Optional

import numpy as np
import tensorflow as tf


def _create_data_reader(self, estimator_action: EstimatorAction, repeat_endlessly: bool = False):
    return PathContextReader(
        vocabs=self.vocabs,
        config=self.config,
        model_input_tensors_former=_KerasModelInputTensorsFormer(estimator_action=estimator_action),
        estimator_action=estimator_action,
        repeat_endlessly=repeat_endlessly)
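# Usage sketch (an assumption, not part of the original file): how the Keras
# variant's helper above might be used to build its input pipelines. `model` is
# an assumed already-constructed model instance; `repeat_endlessly=True` lets
# Keras' fit() consume the training stream across epochs without re-creating
# the reader.
train_reader = model._create_data_reader(EstimatorAction.Train, repeat_endlessly=True)
eval_reader = model._create_data_reader(EstimatorAction.Evaluate)
train_dataset = train_reader.get_dataset()  # a tf.data.Dataset of path-context batches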
def _get_vocab_embedding_as_np_array(self, vocab_type: VocabType) -> np.ndarray:
    assert vocab_type in VocabType
    vocab_tf_variable_name = self.vocab_type_to_tf_variable_name_mapping[vocab_type]

    if self.eval_reader is None:
        self.eval_reader = PathContextReader(vocabs=self.vocabs,
                                             model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(),
                                             config=self.config,
                                             estimator_action=EstimatorAction.Evaluate)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(self.eval_reader.get_dataset())
        _, _, _, _, _, _, _, _ = self._build_tf_test_graph(input_iterator.get_next())

    if vocab_type is VocabType.Token:
        shape = (self.vocabs.token_vocab.size, self.config.TOKEN_EMBEDDINGS_SIZE)
    elif vocab_type is VocabType.Target:
        shape = (self.vocabs.target_vocab.size, self.config.TARGET_EMBEDDINGS_SIZE)
    elif vocab_type is VocabType.Path:
        shape = (self.vocabs.path_vocab.size, self.config.PATH_EMBEDDINGS_SIZE)

    with tf.compat.v1.variable_scope('model', reuse=True):
        embeddings = tf.compat.v1.get_variable(vocab_tf_variable_name, shape=shape)
    self.saver = tf.compat.v1.train.Saver()
    self._initialize_session_variables()
    self._load_inner_model(self.sess)
    vocab_embedding_matrix = self.sess.run(embeddings)
    return vocab_embedding_matrix
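# Usage sketch (assumed, not from the original file): dump the learned token
# embeddings in word2vec text format for offline inspection (e.g. with gensim).
# `model` is an assumed trained model instance; `index_to_word` follows the
# repo's Vocab API as used by the methods in this file.
token_embeddings = model._get_vocab_embedding_as_np_array(VocabType.Token)  # (vocab_size, emb_size)
with open('token_embeddings.txt', 'w') as out_file:
    # word2vec text-format header: "<vocab_size> <embedding_size>"
    out_file.write('%d %d\n' % token_embeddings.shape)
    for index, row in enumerate(token_embeddings):
        word = model.vocabs.token_vocab.index_to_word[index]
        out_file.write(word + ' ' + ' '.join('%f' % value for value in row) + '\n')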
def predict(self, predict_data_lines: Iterable[str]) -> List[ModelPredictionResults]:
    if self.predict_reader is None:
        self.predict_reader = PathContextReader(vocabs=self.vocabs,
                                                model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(),
                                                config=self.config,
                                                estimator_action=EstimatorAction.Predict)
        self.predict_placeholder = tf.compat.v1.placeholder(tf.string)
        reader_output = self.predict_reader.process_input_row(self.predict_placeholder)

        self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op, \
            self.attention_weights_op, self.predict_source_string, self.predict_path_string, \
            self.predict_path_target_string, self.predict_code_vectors = \
            self._build_tf_test_graph(reader_output, normalize_scores=True)

        self._initialize_session_variables()
        self.saver = tf.compat.v1.train.Saver()
        self._load_inner_model(sess=self.sess)

    prediction_results: List[ModelPredictionResults] = []
    for line in predict_data_lines:
        batch_top_words, batch_top_scores, batch_original_name, batch_attention_weights, \
            batch_path_source_strings, batch_path_strings, batch_path_target_strings, \
            batch_code_vectors = self.sess.run(
                [self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op,
                 self.attention_weights_op, self.predict_source_string, self.predict_path_string,
                 self.predict_path_target_string, self.predict_code_vectors],
                feed_dict={self.predict_placeholder: line})

        # Shapes:
        #   batch_top_words, batch_top_scores: (batch, top_k)
        #   batch_original_name: (batch, )
        #   batch_attention_weights: (batch, max_context, 1)
        #   batch_path_source_strings, batch_path_strings, batch_path_target_strings: (batch, max_context)
        #   batch_code_vectors: (batch, code_vector_size)

        # Remove the first axis: (batch=1, ...)
        assert all(tensor.shape[0] == 1 for tensor in
                   (batch_top_words, batch_top_scores, batch_original_name, batch_attention_weights,
                    batch_path_source_strings, batch_path_strings, batch_path_target_strings,
                    batch_code_vectors))
        top_words = np.squeeze(batch_top_words, axis=0)
        top_scores = np.squeeze(batch_top_scores, axis=0)
        original_name = batch_original_name[0]
        attention_weights = np.squeeze(batch_attention_weights, axis=0)
        path_source_strings = np.squeeze(batch_path_source_strings, axis=0)
        path_strings = np.squeeze(batch_path_strings, axis=0)
        path_target_strings = np.squeeze(batch_path_target_strings, axis=0)
        code_vectors = np.squeeze(batch_code_vectors, axis=0)

        top_words = common.binary_to_string_list(top_words)
        original_name = common.binary_to_string(original_name)
        attention_per_context = self._get_attention_weight_per_context(
            path_source_strings, path_strings, path_target_strings, attention_weights)
        prediction_results.append(ModelPredictionResults(
            original_name=original_name,
            topk_predicted_words=top_words,
            topk_predicted_words_scores=top_scores,
            attention_per_context=attention_per_context,
            code_vector=(code_vectors if self.config.EXPORT_CODE_VECTORS else None)
        ))
    return prediction_results
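# Usage sketch (assumed): run predict() on extractor-produced context lines and
# print the top-k name suggestions with their (normalized) scores. `model` and
# `predict_lines` are assumed names; the exact textual format of each line is
# whatever process_input_row() expects.
results = model.predict(predict_lines)
for result in results:
    print('original name:', result.original_name)
    for word, score in zip(result.topk_predicted_words, result.topk_predicted_words_scores):
        print('    %-30s score: %f' % (word, score))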
def test_get_dataset():
    config.config.CREATE_VOCAB = True
    config.config.VEC_TRAINING_FREQ_DICTS_PATH = "dataset/java-small.c2v.dict"
    c2v_vocabs = Code2VecVocabs()
    pcr = PathContextReader(is_train=True,
                            vocabs=c2v_vocabs,
                            csv_path="dataset/java-small.train_vec.csv")
    dataset = pcr.get_dataset()
    it = iter(dataset)
    batch = it.get_next()
    # Target indices and context tensors must agree on the batch dimension.
    assert batch.target_index.shape[0] == batch.path_source_token_indices.shape[0]
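# A complementary sanity check (a sketch under the same assumed reader API):
# iterate a few batches eagerly and verify the batch dimension stays consistent,
# not just for the single batch the test above inspects.
def test_batch_dims_consistent():
    c2v_vocabs = Code2VecVocabs()
    pcr = PathContextReader(is_train=True,
                            vocabs=c2v_vocabs,
                            csv_path="dataset/java-small.train_vec.csv")
    for batch_index, batch in enumerate(pcr.get_dataset()):
        if batch_index >= 3:  # a few batches are enough for a smoke test
            break
        assert batch.target_index.shape[0] == batch.path_source_token_indices.shape[0]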
def train(self):
    self.log('Starting training')
    start_time = time.time()

    batch_num = 0
    sum_loss = 0
    multi_batch_start_time = time.time()
    num_batches_to_save_and_eval = max(
        int(self.config.train_steps_per_epoch * self.config.SAVE_EVERY_EPOCHS), 1)

    train_reader = PathContextReader(vocabs=self.vocabs,
                                     model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
                                     config=self.config,
                                     estimator_action=EstimatorAction.Train)
    input_iterator = tf.compat.v1.data.make_initializable_iterator(train_reader.get_dataset())
    input_iterator_reset_op = input_iterator.initializer
    input_tensors = input_iterator.get_next()

    optimizer, train_loss = self._build_tf_training_graph(input_tensors)
    self.saver = tf.compat.v1.train.Saver(max_to_keep=self.config.MAX_TO_KEEP)

    self.log('Number of trainable params: {}'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in tf.compat.v1.trainable_variables()])))
    for variable in tf.compat.v1.trainable_variables():
        self.log("variable name: {} -- shape: {} -- #params: {}".format(
            variable.name, variable.get_shape(), np.prod(variable.get_shape().as_list())))

    self._initialize_session_variables()

    if self.config.MODEL_LOAD_PATH:
        self._load_inner_model(self.sess)

    self.sess.run(input_iterator_reset_op)
    time.sleep(1)
    self.log('Started reader...')

    # Run training in a loop until the reader iterator is exhausted.
    try:
        while True:
            # Each iteration = one batch. We iterate as long as the tf iterator (reader) yields batches.
            batch_num += 1

            # Actual training for the current batch.
            _, batch_loss = self.sess.run([optimizer, train_loss])
            sum_loss += batch_loss

            if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                self._trace_training(sum_loss, batch_num, multi_batch_start_time)
                # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist
                # since the migration to the new reader.
                # self.log('Number of waiting examples in queue: %d' % self.sess.run(
                #     "shuffle_batch/random_shuffle_queue_Size:0"))
                sum_loss = 0
                multi_batch_start_time = time.time()
            if batch_num % num_batches_to_save_and_eval == 0:
                epoch_num = int((batch_num / num_batches_to_save_and_eval) * self.config.SAVE_EVERY_EPOCHS)
                save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(epoch_num)
                self._save_inner_model(save_path)
                self.log('Saved after %d epochs in: %s' % (epoch_num, save_path))
                evaluation_results = self.evaluate()
                evaluation_results_str = (str(evaluation_results).replace(
                    'topk', 'top{}'.format(self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                self.log('After {nr_epochs} epochs -- {evaluation_results}'.format(
                    nr_epochs=epoch_num, evaluation_results=evaluation_results_str))
    except tf.errors.OutOfRangeError:
        pass  # The reader iterator is exhausted and has no more batches to produce.

    self.log('Done training')

    if self.config.MODEL_SAVE_PATH:
        self._save_inner_model(self.config.MODEL_SAVE_PATH)
        self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

    elapsed = int(time.time() - start_time)
    self.log("Training time: %sH:%sM:%sS\n" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
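# Worked example of the save/eval cadence above (illustrative numbers only):
# with train_steps_per_epoch = 1000 and SAVE_EVERY_EPOCHS = 2,
# num_batches_to_save_and_eval = max(int(1000 * 2), 1) = 2000, so the model is
# saved and evaluated after batch 2000 (as MODEL_SAVE_PATH + '_iter2'),
# after batch 4000 ('_iter4'), and so on.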
def evaluate(self) -> Optional[ModelEvaluationResults]:
    eval_start_time = time.time()
    if self.eval_reader is None:
        self.eval_reader = PathContextReader(vocabs=self.vocabs,
                                             model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(),
                                             config=self.config,
                                             estimator_action=EstimatorAction.Evaluate)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(self.eval_reader.get_dataset())
        self.eval_input_iterator_reset_op = input_iterator.initializer
        input_tensors = input_iterator.get_next()

        # _build_tf_test_graph() returns 8 values (see predict() above); we only
        # need the top-k words/scores, the original names, and the code vectors.
        self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, _, _, _, _, \
            self.eval_code_vectors = self._build_tf_test_graph(input_tensors)
        self.saver = tf.compat.v1.train.Saver()

    if self.config.MODEL_LOAD_PATH and not self.config.TRAIN_DATA_PATH_PREFIX:
        self._initialize_session_variables()
        self._load_inner_model(self.sess)
        if self.config.RELEASE:
            release_name = self.config.MODEL_LOAD_PATH + '.release'
            self.log('Releasing model, output model: %s' % release_name)
            self.saver.save(self.sess, release_name)
            return None  # FIXME: why do we return None here?

    with open('log.txt', 'w') as log_output_file:
        if self.config.EXPORT_CODE_VECTORS:
            code_vectors_file = open(self.config.TEST_DATA_PATH + '.vectors', 'w')
        total_predictions = 0
        total_prediction_batches = 0
        subtokens_evaluation_metric = SubtokensEvaluationMetric(
            partial(common.filter_impossible_names, self.vocabs.target_vocab.special_words))
        topk_accuracy_evaluation_metric = TopKAccuracyEvaluationMetric(
            self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION,
            partial(common.get_first_match_word_from_top_predictions, self.vocabs.target_vocab.special_words))
        start_time = time.time()

        self.sess.run(self.eval_input_iterator_reset_op)

        self.log('Starting evaluation')

        # Run evaluation in a loop until the iterator is exhausted.
        # Each iteration = one batch. We iterate as long as the tf iterator (reader) yields batches.
        try:
            while True:
                top_words, top_scores, original_names, code_vectors = self.sess.run(
                    [self.eval_top_words_op, self.eval_top_values_op,
                     self.eval_original_names_op, self.eval_code_vectors],
                )

                # Shapes:
                #   top_words: (batch, top_k); top_scores: (batch, top_k)
                #   original_names: (batch, ); code_vectors: (batch, code_vector_size)

                top_words = common.binary_to_string_matrix(top_words)  # (batch, top_k)
                original_names = common.binary_to_string_list(original_names)  # (batch,)

                self._log_predictions_during_evaluation(zip(original_names, top_words), log_output_file)
                topk_accuracy_evaluation_metric.update_batch(zip(original_names, top_words))
                subtokens_evaluation_metric.update_batch(zip(original_names, top_words))

                total_predictions += len(original_names)
                total_prediction_batches += 1
                if self.config.EXPORT_CODE_VECTORS:
                    self._write_code_vectors(code_vectors_file, code_vectors)
                if total_prediction_batches % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                    elapsed = time.time() - start_time
                    # start_time = time.time()
                    self._trace_evaluation(total_predictions, elapsed)
        except tf.errors.OutOfRangeError:
            pass  # The reader iterator is exhausted and has no more batches to produce.

        self.log('Done evaluating, epoch reached')
        log_output_file.write(str(topk_accuracy_evaluation_metric.topk_correct_predictions) + '\n')

    if self.config.EXPORT_CODE_VECTORS:
        code_vectors_file.close()

    elapsed = int(time.time() - eval_start_time)
    self.log("Evaluation time: %sH:%sM:%sS" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
    return ModelEvaluationResults(
        topk_acc=topk_accuracy_evaluation_metric.topk_correct_predictions,
        subtoken_precision=subtokens_evaluation_metric.precision,
        subtoken_recall=subtokens_evaluation_metric.recall,
        subtoken_f1=subtokens_evaluation_metric.f1)
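# For reference, the subtoken metrics returned above follow the usual
# per-subtoken definitions (a sketch; the exact accounting lives in
# SubtokensEvaluationMetric). E.g. predicting 'get|file|name' for the ground
# truth 'get|name' gives TP=2 ('get', 'name'), FP=1 ('file'), FN=0, so:
#   precision = TP / (TP + FP) = 2/3
#   recall    = TP / (TP + FN) = 1.0
#   f1        = 2 * precision * recall / (precision + recall) = 0.8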
def train(self):
    self.log('Starting training')
    start_time = time.time()

    batch_num = 0
    sum_loss = 0
    multi_batch_start_time = time.time()
    num_batches_to_save_and_eval = max(
        int(self.config.train_steps_per_epoch * self.config.SAVE_EVERY_EPOCHS), 1)

    train_reader = PathContextReader(vocabs=self.vocabs,
                                     model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
                                     config=self.config,
                                     estimator_action=EstimatorAction.Train)
    input_iterator = tf.compat.v1.data.make_initializable_iterator(train_reader.get_dataset())
    input_iterator_reset_op = input_iterator.initializer
    input_tensors = input_iterator.get_next()

    optimizer, train_loss = self._build_tf_training_graph(input_tensors)
    self.saver = tf.compat.v1.train.Saver(max_to_keep=self.config.MAX_TO_KEEP)

    self.log('Number of trainable params: {}'.format(
        np.sum([np.prod(v.get_shape().as_list()) for v in tf.compat.v1.trainable_variables()])))
    for variable in tf.compat.v1.trainable_variables():
        self.log("variable name: {} -- shape: {} -- #params: {}".format(
            variable.name, variable.get_shape(), np.prod(variable.get_shape().as_list())))

    self._initialize_session_variables()

    if self.config.MODEL_LOAD_PATH:
        self._load_inner_model(self.sess)

    self.sess.run(input_iterator_reset_op)
    time.sleep(1)
    self.log('Started reader...')

    training_logger = None
    os.makedirs('losses_logs/', exist_ok=True)
    loss_log_path = 'losses_logs/losses_log' + common.now_str()[:-2] + '.csv'
    # Separate file for the extra-validation losses. (The original reused the
    # same name pattern for both logs, which made them append to one file;
    # a distinct 'e_' prefix is assumed here.)
    e_loss_log_path = ('losses_logs/e_losses_log' + common.now_str()[:-2] + '.csv'
                       if EXTRA_VALIDATION_PERIOD > 0 else None)
    if self.config.USE_TENSORBOARD:
        log_dir = "logs/scalars/train_" + common.now_str()[:-2]
        training_logger = tf.summary.create_file_writer(log_dir)
        self.sess.run(training_logger.init())
        training_logger.set_as_default()

    # Run training in a loop until the reader iterator is exhausted.
    try:
        epoch_losses = []
        extra_losses = []
        while True:
            # Each iteration = one batch. We iterate as long as the tf iterator (reader) yields batches.
            batch_num += 1

            # Actual training for the current batch.
            _, batch_loss = self.sess.run([optimizer, train_loss])
            sum_loss += batch_loss
            epoch_losses.append(batch_loss)
            extra_losses.append(batch_loss)
            # if self.config.USE_TENSORBOARD:
            #     self.sess.run(tf.summary.scalar('batch_loss', batch_loss, step=batch_num))

            if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                self._trace_training(sum_loss, batch_num, multi_batch_start_time)
                # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist
                # since the migration to the new reader.
                # self.log('Number of waiting examples in queue: %d' % self.sess.run(
                #     "shuffle_batch/random_shuffle_queue_Size:0"))
                sum_loss = 0
                multi_batch_start_time = time.time()

            if EXTRA_VALIDATION_PERIOD > 0 and batch_num % EXTRA_VALIDATION_PERIOD == 0:
                evaluation_results = self.evaluate()
                evaluation_results_str = (str(evaluation_results).replace(
                    'topk', 'top{}'.format(self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                extra_mean_train_loss = np.mean(extra_losses) / self.config.TRAIN_BATCH_SIZE
                extra_losses.clear()
                print(f'Losses: train: {extra_mean_train_loss}, validation: {evaluation_results.loss}')
                with open(e_loss_log_path, 'at') as loss_log_file:
                    loss_log_file.write(f'{extra_mean_train_loss},{evaluation_results.loss}\n')
                if self.config.USE_TENSORBOARD:
                    self.sess.run([
                        tf.summary.scalar('e_precision', evaluation_results.subtoken_precision, step=batch_num),
                        tf.summary.scalar('e_recall', evaluation_results.subtoken_recall, step=batch_num),
                        tf.summary.scalar('e_f1', evaluation_results.subtoken_f1, step=batch_num),
                        tf.summary.scalar('e_train_loss', extra_mean_train_loss, step=batch_num),
                        tf.summary.scalar('e_validation_loss', evaluation_results.loss, step=batch_num),
                    ])
                    self.sess.run([
                        tf.summary.scalar(f'e_top{i}_acc', top_i_acc, step=batch_num)
                        for i, top_i_acc in enumerate(evaluation_results.topk_acc)
                    ])
                    self.sess.run(training_logger.flush())
                self.log(f'After {batch_num} batches -- {evaluation_results_str}')

            if batch_num % num_batches_to_save_and_eval == 0:
                epoch_num = int((batch_num / num_batches_to_save_and_eval) * self.config.SAVE_EVERY_EPOCHS)
                model_save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(epoch_num)
                self.save(model_save_path)
                self.log('Saved after %d epochs in: %s' % (epoch_num, model_save_path))
                evaluation_results = self.evaluate()
                evaluation_results_str = (str(evaluation_results).replace(
                    'topk', 'top{}'.format(self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                epoch_mean_train_loss = np.mean(epoch_losses) / self.config.TRAIN_BATCH_SIZE
                epoch_losses.clear()
                print(f'Losses: train: {epoch_mean_train_loss}, validation: {evaluation_results.loss}')
                with open(loss_log_path, 'at') as loss_log_file:
                    loss_log_file.write(f'{epoch_mean_train_loss},{evaluation_results.loss}\n')
                if self.config.USE_TENSORBOARD:
                    self.sess.run([
                        tf.summary.scalar('precision', evaluation_results.subtoken_precision, step=epoch_num),
                        tf.summary.scalar('recall', evaluation_results.subtoken_recall, step=epoch_num),
                        tf.summary.scalar('f1', evaluation_results.subtoken_f1, step=epoch_num),
                        tf.summary.scalar('train_loss', epoch_mean_train_loss, step=epoch_num),
                        tf.summary.scalar('validation_loss', evaluation_results.loss, step=epoch_num),
                    ])
                    self.sess.run([
                        tf.summary.scalar(f'top{i}_acc', top_i_acc, step=epoch_num)
                        for i, top_i_acc in enumerate(evaluation_results.topk_acc)
                    ])
                    self.sess.run(training_logger.flush())
                self.log('After {nr_epochs} epochs -- {evaluation_results}'.format(
                    nr_epochs=epoch_num, evaluation_results=evaluation_results_str))
    except tf.errors.OutOfRangeError:
        pass  # The reader iterator is exhausted and has no more batches to produce.

    self.log('Done training')

    if self.config.MODEL_SAVE_PATH:
        self._save_inner_model(self.config.MODEL_SAVE_PATH)
        self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

    elapsed = int(time.time() - start_time)
    self.log("Training time: %sH:%sM:%sS\n" % ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
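# Note on the loss logs written above: the CSV files have no header row; each
# line is "<mean train loss>,<validation loss>", appended once per extra-
# validation period (e_losses_log...) or once per save/eval cycle (losses_log...).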
dest="checkpoints_dir", help="Dir for checkpoints", required=False, default="training") parser.add_argument("--net", dest="net", help="net destination type var or vec", required=False, default="vec") args = parser.parse_args() print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU'))) if args.train: print(f"dataset/{args.dataset_name}/{args.dataset_name}.{args.net}.csv") c2v_vocabs = Code2VecVocabs(net=NetType(args.net)) pcr = PathContextReader(is_train=True, vocabs=c2v_vocabs, csv_path=f"dataset/{args.dataset_name}/{args.dataset_name}.{args.net}.csv") dataset = pcr.get_dataset() val_dataset, test_dataset = pcr.get_subdatasets() # init lookups c2v_vocabs.target_vocab.get_word_to_index_lookup_table() c2v_vocabs.token_vocab.get_word_to_index_lookup_table() c2v_vocabs.path_vocab.get_word_to_index_lookup_table() TOKEN_VOCAB_SIZE = c2v_vocabs.token_vocab.lookup_table_word_to_index.size().numpy() TARGET_VOCAB_SIZE = c2v_vocabs.target_vocab.lookup_table_word_to_index.size().numpy() PATH_VOCAB_SIZE = c2v_vocabs.path_vocab.lookup_table_word_to_index.size().numpy() tf.random.set_seed(42) model = code2vec(token_vocab_size=TOKEN_VOCAB_SIZE, target_vocab_size=TARGET_VOCAB_SIZE, path_vocab_size=PATH_VOCAB_SIZE,