Example #1
    def _create_data_reader(self, estimator_action: EstimatorAction, repeat_endlessly: bool = False):
        return PathContextReader(
            vocabs=self.vocabs,
            config=self.config,
            model_input_tensors_former=_KerasModelInputTensorsFormer(estimator_action=estimator_action),
            estimator_action=estimator_action,
            repeat_endlessly=repeat_endlessly)
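A minimal usage sketch (assuming `model` is an instance of the enclosing Keras model class; `EstimatorAction` is the enum seen throughout these examples):

    # One-pass reader for evaluation.
    eval_reader = model._create_data_reader(EstimatorAction.Evaluate)
    # Endlessly repeating reader, for callers that decide externally when to stop.
    train_reader = model._create_data_reader(EstimatorAction.Train, repeat_endlessly=True)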
Example #2
    def _get_vocab_embedding_as_np_array(self, vocab_type: VocabType) -> np.ndarray:
        assert vocab_type in VocabType
        vocab_tf_variable_name = self.vocab_type_to_tf_variable_name_mapping[vocab_type]
        
        if self.eval_reader is None:
            self.eval_reader = PathContextReader(vocabs=self.vocabs,
                                                 model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(),
                                                 config=self.config, estimator_action=EstimatorAction.Evaluate)
            input_iterator = tf.compat.v1.data.make_initializable_iterator(self.eval_reader.get_dataset())
            _, _, _, _, _, _, _, _ = self._build_tf_test_graph(input_iterator.get_next())  # build the graph so the embedding variables exist

        if vocab_type is VocabType.Token:
            shape = (self.vocabs.token_vocab.size, self.config.TOKEN_EMBEDDINGS_SIZE)
        elif vocab_type is VocabType.Target:
            shape = (self.vocabs.target_vocab.size, self.config.TARGET_EMBEDDINGS_SIZE)
        elif vocab_type is VocabType.Path:
            shape = (self.vocabs.path_vocab.size, self.config.PATH_EMBEDDINGS_SIZE)

        with tf.compat.v1.variable_scope('model', reuse=True):
            embeddings = tf.compat.v1.get_variable(vocab_tf_variable_name, shape=shape)
        self.saver = tf.compat.v1.train.Saver()
        self._initialize_session_variables() 
        self._load_inner_model(self.sess) 
        vocab_embedding_matrix = self.sess.run(embeddings)
        return vocab_embedding_matrix
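A hedged usage sketch (assuming `model` is a constructed model instance and its checkpoint is available at the configured load path):

    token_embeddings = model._get_vocab_embedding_as_np_array(VocabType.Token)
    # Expected shape: (token_vocab_size, TOKEN_EMBEDDINGS_SIZE).
    print(token_embeddings.shape)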
Example #3
    def predict(self, predict_data_lines: Iterable[str]) -> List[ModelPredictionResults]:
        if self.predict_reader is None:
            self.predict_reader = PathContextReader(vocabs=self.vocabs,
                                                    model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(),
                                                    config=self.config, estimator_action=EstimatorAction.Predict)
            self.predict_placeholder = tf.compat.v1.placeholder(tf.string)
            reader_output = self.predict_reader.process_input_row(self.predict_placeholder)

            self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op, \
            self.attention_weights_op, self.predict_source_string, self.predict_path_string, \
            self.predict_path_target_string, self.predict_code_vectors = \
                self._build_tf_test_graph(reader_output, normalize_scores=True)

            self._initialize_session_variables()
            self.saver = tf.compat.v1.train.Saver()
            self._load_inner_model(sess=self.sess)

        prediction_results: List[ModelPredictionResults] = []
        for line in predict_data_lines:
            batch_top_words, batch_top_scores, batch_original_name, batch_attention_weights, batch_path_source_strings,\
                batch_path_strings, batch_path_target_strings, batch_code_vectors = self.sess.run(
                    [self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op,
                     self.attention_weights_op, self.predict_source_string, self.predict_path_string,
                     self.predict_path_target_string, self.predict_code_vectors],
                    feed_dict={self.predict_placeholder: line})
            # shapes:
            #   batch_top_words, top_scores: (batch, top_k)
            #   batch_original_name: (batch, )
            #   batch_attention_weights: (batch, max_context, 1)
            #   batch_path_source_strings, batch_path_strings, batch_path_target_strings: (batch, max_context)
            #   batch_code_vectors: (batch, code_vector_size)

            # remove first axis: (batch=1, ...)
            assert all(tensor.shape[0] == 1 for tensor in (batch_top_words, batch_top_scores, batch_original_name,
                                                           batch_attention_weights, batch_path_source_strings,
                                                           batch_path_strings, batch_path_target_strings,
                                                           batch_code_vectors))
            top_words = np.squeeze(batch_top_words, axis=0)
            top_scores = np.squeeze(batch_top_scores, axis=0)
            original_name = batch_original_name[0]
            attention_weights = np.squeeze(batch_attention_weights, axis=0)
            path_source_strings = np.squeeze(batch_path_source_strings, axis=0)
            path_strings = np.squeeze(batch_path_strings, axis=0)
            path_target_strings = np.squeeze(batch_path_target_strings, axis=0)
            code_vectors = np.squeeze(batch_code_vectors, axis=0)

            top_words = common.binary_to_string_list(top_words)
            original_name = common.binary_to_string(original_name)
            attention_per_context = self._get_attention_weight_per_context(
                path_source_strings, path_strings, path_target_strings, attention_weights)
            prediction_results.append(ModelPredictionResults(
                original_name=original_name,
                topk_predicted_words=top_words,
                topk_predicted_words_scores=top_scores,
                attention_per_context=attention_per_context,
                code_vector=(code_vectors if self.config.EXPORT_CODE_VECTORS else None)
            ))
        return prediction_results
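An illustrative call (`extracted_lines` is a hypothetical name; each line must already be in the extractor's path-context format):

    results = model.predict(extracted_lines)
    for result in results:
        # Each result pairs the top-k predicted names with their scores.
        for word, score in zip(result.topk_predicted_words,
                               result.topk_predicted_words_scores):
            print(result.original_name, word, score)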
Example #4
def test_get_dataset():
    config.config.CREATE_VOCAB = True
    config.config.VEC_TRAINING_FREQ_DICTS_PATH = "dataset/java-small.c2v.dict"
    c2v_vocabs = Code2VecVocabs()
    pcr = PathContextReader(is_train=True,
                            vocabs=c2v_vocabs,
                            csv_path="dataset/java-small.train_vec.csv")
    dataset = pcr.get_dataset()
    batch = next(iter(dataset))
    # All per-example tensors in a batch share the same leading (batch) dimension.
    assert batch.target_index.shape[0] == batch.path_source_token_indices.shape[0]
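The same invariant can be checked over several batches, reusing the dataset built above (`take` limits a tf.data.Dataset to its first n batches):

    for batch in dataset.take(3):
        assert batch.target_index.shape[0] == batch.path_source_token_indices.shape[0]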
Example #5
    def train(self):
        self.log('Starting training')
        start_time = time.time()

        batch_num = 0
        sum_loss = 0
        multi_batch_start_time = time.time()
        num_batches_to_save_and_eval = max(
            int(self.config.train_steps_per_epoch *
                self.config.SAVE_EVERY_EPOCHS), 1)

        train_reader = PathContextReader(
            vocabs=self.vocabs,
            model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
            config=self.config,
            estimator_action=EstimatorAction.Train)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(
            train_reader.get_dataset())
        input_iterator_reset_op = input_iterator.initializer
        input_tensors = input_iterator.get_next()

        optimizer, train_loss = self._build_tf_training_graph(input_tensors)
        self.saver = tf.compat.v1.train.Saver(
            max_to_keep=self.config.MAX_TO_KEEP)

        self.log('Number of trainable params: {}'.format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.compat.v1.trainable_variables()
            ])))
        for variable in tf.compat.v1.trainable_variables():
            self.log("variable name: {} -- shape: {} -- #params: {}".format(
                variable.name, variable.get_shape(),
                np.prod(variable.get_shape().as_list())))

        self._initialize_session_variables()

        if self.config.MODEL_LOAD_PATH:
            self._load_inner_model(self.sess)

        self.sess.run(input_iterator_reset_op)
        time.sleep(1)
        self.log('Started reader...')
        # Run training in a loop until the input iterator is exhausted.
        try:
            while True:
                # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
                batch_num += 1

                # Actual training for the current batch.
                _, batch_loss = self.sess.run([optimizer, train_loss])

                sum_loss += batch_loss
                if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                    self._trace_training(sum_loss, batch_num,
                                         multi_batch_start_time)
                    # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist since the migration to the new reader.
                    # self.log('Number of waiting examples in queue: %d' % self.sess.run(
                    #    "shuffle_batch/random_shuffle_queue_Size:0"))
                    sum_loss = 0
                    multi_batch_start_time = time.time()
                if batch_num % num_batches_to_save_and_eval == 0:
                    epoch_num = int(
                        (batch_num / num_batches_to_save_and_eval) *
                        self.config.SAVE_EVERY_EPOCHS)
                    save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(
                        epoch_num)
                    self._save_inner_model(save_path)
                    self.log('Saved after %d epochs in: %s' %
                             (epoch_num, save_path))
                    evaluation_results = self.evaluate()
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    self.log(
                        'After {nr_epochs} epochs -- {evaluation_results}'.
                        format(nr_epochs=epoch_num,
                               evaluation_results=evaluation_results_str))
        except tf.errors.OutOfRangeError:
            pass  # The reader iterator is exhausted and has no more batches to produce.

        self.log('Done training')

        if self.config.MODEL_SAVE_PATH:
            self._save_inner_model(self.config.MODEL_SAVE_PATH)
            self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

        elapsed = int(time.time() - start_time)
        self.log("Training time: %sH:%sM:%sS\n" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
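The save-and-evaluate cadence follows directly from num_batches_to_save_and_eval; a worked example with illustrative numbers:

    # With train_steps_per_epoch = 1000 and SAVE_EVERY_EPOCHS = 2:
    num_batches_to_save_and_eval = max(int(1000 * 2), 1)   # -> 2000
    # At batch 4000 the model is saved and evaluated, labelled
    # epoch_num = int((4000 / 2000) * 2) = 4.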
Example #6
    def evaluate(self) -> Optional[ModelEvaluationResults]:
        eval_start_time = time.time()
        if self.eval_reader is None:
            self.eval_reader = PathContextReader(
                vocabs=self.vocabs,
                model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(),
                config=self.config,
                estimator_action=EstimatorAction.Evaluate)
            input_iterator = tf.compat.v1.data.make_initializable_iterator(
                self.eval_reader.get_dataset())
            self.eval_input_iterator_reset_op = input_iterator.initializer
            input_tensors = input_iterator.get_next()

            self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, _, _, _, _, _, _, \
                self.eval_code_vectors = self._build_tf_test_graph(input_tensors)
            self.saver = tf.compat.v1.train.Saver()

        if self.config.MODEL_LOAD_PATH and not self.config.TRAIN_DATA_PATH_PREFIX:
            self._initialize_session_variables()
            self._load_inner_model(self.sess)
            if self.config.RELEASE:
                release_name = self.config.MODEL_LOAD_PATH + '.release'
                self.log('Releasing model, output model: %s' % release_name)
                self.saver.save(self.sess, release_name)
                return None  # FIXME: why do we return none here?

        with open('log.txt', 'w') as log_output_file:
            if self.config.EXPORT_CODE_VECTORS:
                code_vectors_file = open(
                    self.config.TEST_DATA_PATH + '.vectors', 'w')
            total_predictions = 0
            total_prediction_batches = 0
            subtokens_evaluation_metric = SubtokensEvaluationMetric(
                partial(common.filter_impossible_names,
                        self.vocabs.target_vocab.special_words))
            topk_accuracy_evaluation_metric = TopKAccuracyEvaluationMetric(
                self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION,
                partial(common.get_first_match_word_from_top_predictions,
                        self.vocabs.target_vocab.special_words))
            start_time = time.time()

            self.sess.run(self.eval_input_iterator_reset_op)

            self.log('Starting evaluation')

            # Run evaluation in a loop until iterator is exhausted.
            # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
            try:
                while True:
                    top_words, top_scores, original_names, code_vectors = self.sess.run(
                        [
                            self.eval_top_words_op, self.eval_top_values_op,
                            self.eval_original_names_op, self.eval_code_vectors
                        ])

                    # shapes:
                    #   top_words: (batch, top_k);   top_scores: (batch, top_k)
                    #   original_names: (batch, );   code_vectors: (batch, code_vector_size)

                    top_words = common.binary_to_string_matrix(
                        top_words)  # (batch, top_k)
                    original_names = common.binary_to_string_list(
                        original_names)  # (batch,)

                    self._log_predictions_during_evaluation(
                        zip(original_names, top_words), log_output_file)
                    topk_accuracy_evaluation_metric.update_batch(
                        zip(original_names, top_words))
                    subtokens_evaluation_metric.update_batch(
                        zip(original_names, top_words))

                    total_predictions += len(original_names)
                    total_prediction_batches += 1
                    if self.config.EXPORT_CODE_VECTORS:
                        self._write_code_vectors(code_vectors_file,
                                                 code_vectors)
                    if total_prediction_batches % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                        elapsed = time.time() - start_time
                        # start_time = time.time()
                        self._trace_evaluation(total_predictions, elapsed)
            except tf.errors.OutOfRangeError:
                pass  # reader iterator is exhausted and has no more batches to produce.
            self.log('Done evaluating, epoch reached')
            log_output_file.write(
                str(topk_accuracy_evaluation_metric.topk_correct_predictions) +
                '\n')
        if self.config.EXPORT_CODE_VECTORS:
            code_vectors_file.close()

        elapsed = int(time.time() - eval_start_time)
        self.log("Evaluation time: %sH:%sM:%sS" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
        return ModelEvaluationResults(
            topk_acc=topk_accuracy_evaluation_metric.topk_correct_predictions,
            subtoken_precision=subtokens_evaluation_metric.precision,
            subtoken_recall=subtokens_evaluation_metric.recall,
            subtoken_f1=subtokens_evaluation_metric.f1)
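The subtoken metrics compare the predicted and the true method name split into subtokens; an illustrative computation (a sketch, not the project's exact implementation):

    predicted, actual = {'get', 'file', 'name'}, {'get', 'name'}
    true_positives = len(predicted & actual)             # 2
    precision = true_positives / len(predicted)          # 2/3
    recall = true_positives / len(actual)                # 1.0
    f1 = 2 * precision * recall / (precision + recall)   # 0.8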
Example #7
    def train(self):
        self.log('Starting training')
        start_time = time.time()

        batch_num = 0
        sum_loss = 0
        multi_batch_start_time = time.time()
        num_batches_to_save_and_eval = max(
            int(self.config.train_steps_per_epoch *
                self.config.SAVE_EVERY_EPOCHS), 1)

        train_reader = PathContextReader(
            vocabs=self.vocabs,
            model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
            config=self.config,
            estimator_action=EstimatorAction.Train)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(
            train_reader.get_dataset())
        input_iterator_reset_op = input_iterator.initializer
        input_tensors = input_iterator.get_next()

        optimizer, train_loss = self._build_tf_training_graph(input_tensors)
        self.saver = tf.compat.v1.train.Saver(
            max_to_keep=self.config.MAX_TO_KEEP)

        self.log('Number of trainable params: {}'.format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.compat.v1.trainable_variables()
            ])))
        for variable in tf.compat.v1.trainable_variables():
            self.log("variable name: {} -- shape: {} -- #params: {}".format(
                variable.name, variable.get_shape(),
                np.prod(variable.get_shape().as_list())))

        self._initialize_session_variables()

        if self.config.MODEL_LOAD_PATH:
            self._load_inner_model(self.sess)

        self.sess.run(input_iterator_reset_op)
        time.sleep(1)
        self.log('Started reader...')

        training_logger = None
        os.makedirs('losses_logs/', exist_ok=True)
        loss_log_path = 'losses_logs/losses_log' + common.now_str()[:-2] + '.csv'
        # Note: this resolves to the same CSV file as loss_log_path, so per-epoch
        # and extra-validation losses are appended to one file.
        e_loss_log_path = loss_log_path if EXTRA_VALIDATION_PERIOD > 0 else None
        if self.config.USE_TENSORBOARD:
            log_dir = "logs/scalars/train_" + common.now_str()[:-2]
            training_logger = tf.summary.create_file_writer(log_dir)
            self.sess.run(training_logger.init())
            training_logger.set_as_default()
        # Run training in a loop until the input iterator is exhausted.
        try:
            epoch_losses = []
            extra_losses = []
            while True:
                # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
                batch_num += 1

                # Actual training for the current batch.
                _, batch_loss = self.sess.run([optimizer, train_loss])

                sum_loss += batch_loss
                epoch_losses.append(batch_loss)
                extra_losses.append(batch_loss)
                # if self.config.USE_TENSORBOARD:
                #     self.sess.run(tf.summary.scalar('batch_loss', batch_loss, step=batch_num))
                if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                    self._trace_training(sum_loss, batch_num,
                                         multi_batch_start_time)
                    # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist since the migration to the new reader.
                    # self.log('Number of waiting examples in queue: %d' % self.sess.run(
                    #    "shuffle_batch/random_shuffle_queue_Size:0"))
                    sum_loss = 0
                    multi_batch_start_time = time.time()
                if EXTRA_VALIDATION_PERIOD > 0 and batch_num % EXTRA_VALIDATION_PERIOD == 0:
                    evaluation_results = self.evaluate()
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    extra_mean_train_loss = np.mean(
                        extra_losses) / self.config.TRAIN_BATCH_SIZE
                    extra_losses.clear()
                    print(
                        f'Losses: train: {extra_mean_train_loss}, validation: {evaluation_results.loss}'
                    )
                    with open(e_loss_log_path, 'at') as loss_log_file:
                        loss_log_file.write(
                            f'{extra_mean_train_loss},{evaluation_results.loss}\n'
                        )
                    if self.config.USE_TENSORBOARD:
                        self.sess.run([
                            tf.summary.scalar(
                                'e_precision',
                                evaluation_results.subtoken_precision,
                                step=batch_num),
                            tf.summary.scalar(
                                'e_recall',
                                evaluation_results.subtoken_recall,
                                step=batch_num),
                            tf.summary.scalar('e_f1',
                                              evaluation_results.subtoken_f1,
                                              step=batch_num),
                            tf.summary.scalar('e_train_loss',
                                              extra_mean_train_loss,
                                              step=batch_num),
                            tf.summary.scalar('e_validation_loss',
                                              evaluation_results.loss,
                                              step=batch_num),
                        ])
                        self.sess.run([
                            tf.summary.scalar(f'e_top{i}_acc',
                                              top_i_acc,
                                              step=batch_num) for i, top_i_acc
                            in enumerate(evaluation_results.topk_acc)
                        ])
                        self.sess.run(training_logger.flush())
                    self.log(
                        f'After {batch_num} batches -- {evaluation_results_str}'
                    )
                if batch_num % num_batches_to_save_and_eval == 0:
                    epoch_num = int(
                        (batch_num / num_batches_to_save_and_eval) *
                        self.config.SAVE_EVERY_EPOCHS)
                    model_save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(
                        epoch_num)
                    self.save(model_save_path)
                    self.log('Saved after %d epochs in: %s' %
                             (epoch_num, model_save_path))
                    evaluation_results = self.evaluate()
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    epoch_mean_train_loss = np.mean(
                        epoch_losses) / self.config.TRAIN_BATCH_SIZE
                    epoch_losses.clear()
                    print(
                        f'Losses: train: {epoch_mean_train_loss}, validation: {evaluation_results.loss}'
                    )
                    with open(loss_log_path, 'at') as loss_log_file:
                        loss_log_file.write(
                            f'{epoch_mean_train_loss},{evaluation_results.loss}\n'
                        )
                    if self.config.USE_TENSORBOARD:
                        self.sess.run([
                            tf.summary.scalar(
                                'precision',
                                evaluation_results.subtoken_precision,
                                step=epoch_num),
                            tf.summary.scalar(
                                'recall',
                                evaluation_results.subtoken_recall,
                                step=epoch_num),
                            tf.summary.scalar('f1',
                                              evaluation_results.subtoken_f1,
                                              step=epoch_num),
                            tf.summary.scalar('train_loss',
                                              epoch_mean_train_loss,
                                              step=epoch_num),
                            tf.summary.scalar('validation_loss',
                                              evaluation_results.loss,
                                              step=epoch_num),
                        ])
                        self.sess.run([
                            tf.summary.scalar(f'top{i}_acc',
                                              top_i_acc,
                                              step=epoch_num) for i, top_i_acc
                            in enumerate(evaluation_results.topk_acc)
                        ])
                        self.sess.run(training_logger.flush())
                    self.log(
                        'After {nr_epochs} epochs -- {evaluation_results}'.
                        format(nr_epochs=epoch_num,
                               evaluation_results=evaluation_results_str))
        except tf.errors.OutOfRangeError:
            pass  # The reader iterator is exhausted and has no more batches to produce.

        self.log('Done training')

        if self.config.MODEL_SAVE_PATH:
            self._save_inner_model(self.config.MODEL_SAVE_PATH)
            self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

        elapsed = int(time.time() - start_time)
        self.log("Training time: %sH:%sM:%sS\n" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
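When USE_TENSORBOARD is enabled, the scalars written above can be inspected with the standard TensorBoard CLI pointed at the same directory:

    tensorboard --logdir logs/scalars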
Example #8
                        dest="checkpoints_dir",
                        help="Dir for checkpoints",
                        required=False,
                        default="training")
    parser.add_argument("--net",
                        dest="net",
                        help="net destination type var or vec",
                        required=False,
                        default="vec")
    args = parser.parse_args()

    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
    if args.train:
        print(f"dataset/{args.dataset_name}/{args.dataset_name}.{args.net}.csv")
        c2v_vocabs = Code2VecVocabs(net=NetType(args.net))
        pcr = PathContextReader(is_train=True, vocabs=c2v_vocabs,
                                csv_path=f"dataset/{args.dataset_name}/{args.dataset_name}.{args.net}.csv")
        dataset = pcr.get_dataset()
        val_dataset, test_dataset = pcr.get_subdatasets()
        # init lookups

        c2v_vocabs.target_vocab.get_word_to_index_lookup_table()
        c2v_vocabs.token_vocab.get_word_to_index_lookup_table()
        c2v_vocabs.path_vocab.get_word_to_index_lookup_table()

        TOKEN_VOCAB_SIZE = c2v_vocabs.token_vocab.lookup_table_word_to_index.size().numpy()
        TARGET_VOCAB_SIZE = c2v_vocabs.target_vocab.lookup_table_word_to_index.size().numpy()
        PATH_VOCAB_SIZE = c2v_vocabs.path_vocab.lookup_table_word_to_index.size().numpy()
        tf.random.set_seed(42)
        model = code2vec(token_vocab_size=TOKEN_VOCAB_SIZE,
                         target_vocab_size=TARGET_VOCAB_SIZE,
                         path_vocab_size=PATH_VOCAB_SIZE,