Ejemplo n.º 1
0
def test_get_dataset():
    """Smoke-test PathContextReader.get_dataset on the java-small training CSV.

    Forces vocabulary creation, builds a reader over the training CSV, pulls
    one batch and checks that the target indices and the path-source token
    indices agree on the batch dimension.
    """
    config.config.CREATE_VOCAB = True
    config.config.VEC_TRAINING_FREQ_DICTS_PATH = "dataset/java-small.c2v.dict"
    vocabs = Code2VecVocabs()
    reader = PathContextReader(is_train=True,
                               vocabs=vocabs,
                               csv_path="dataset/java-small.train_vec.csv")
    batch_iterator = iter(reader.get_dataset())
    first_batch = batch_iterator.get_next()
    assert first_batch.target_index.shape[0] == first_batch.path_source_token_indices.shape[0]
Ejemplo n.º 2
0
    def train(self):
        """Run the TF1-style training loop until the dataset reader is exhausted.

        Builds a fresh training graph from a ``PathContextReader`` dataset
        iterator, initializes (and optionally restores) variables, then runs
        one optimizer step per batch. Progress is logged every
        ``NUM_BATCHES_TO_LOG_PROGRESS`` batches; the model is checkpointed and
        evaluated every ``num_batches_to_save_and_eval`` batches.
        """
        self.log('Starting training')
        start_time = time.time()

        batch_num = 0
        sum_loss = 0
        multi_batch_start_time = time.time()
        # Save/evaluate every SAVE_EVERY_EPOCHS worth of per-epoch steps,
        # but at least once per batch if that product rounds down to zero.
        num_batches_to_save_and_eval = max(
            int(self.config.train_steps_per_epoch *
                self.config.SAVE_EVERY_EPOCHS), 1)

        train_reader = PathContextReader(
            vocabs=self.vocabs,
            model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
            config=self.config,
            estimator_action=EstimatorAction.Train)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(
            train_reader.get_dataset())
        input_iterator_reset_op = input_iterator.initializer
        input_tensors = input_iterator.get_next()

        optimizer, train_loss = self._build_tf_training_graph(input_tensors)
        self.saver = tf.compat.v1.train.Saver(
            max_to_keep=self.config.MAX_TO_KEEP)

        # Log parameter counts, overall and per trainable variable.
        self.log('Number of trainable params: {}'.format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.compat.v1.trainable_variables()
            ])))
        for variable in tf.compat.v1.trainable_variables():
            self.log("variable name: {} -- shape: {} -- #params: {}".format(
                variable.name, variable.get_shape(),
                np.prod(variable.get_shape().as_list())))

        self._initialize_session_variables()

        # Restoring (if configured) must happen after the initializers ran.
        if self.config.MODEL_LOAD_PATH:
            self._load_inner_model(self.sess)

        self.sess.run(input_iterator_reset_op)
        time.sleep(1)
        self.log('Started reader...')
        # run evaluation in a loop until iterator is exhausted.
        try:
            while True:
                # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
                batch_num += 1

                # Actual training for the current batch.
                _, batch_loss = self.sess.run([optimizer, train_loss])

                sum_loss += batch_loss
                if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                    self._trace_training(sum_loss, batch_num,
                                         multi_batch_start_time)
                    # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist since the migration to the new reader.
                    # self.log('Number of waiting examples in queue: %d' % self.sess.run(
                    #    "shuffle_batch/random_shuffle_queue_Size:0"))
                    sum_loss = 0
                    multi_batch_start_time = time.time()
                if batch_num % num_batches_to_save_and_eval == 0:
                    epoch_num = int(
                        (batch_num / num_batches_to_save_and_eval) *
                        self.config.SAVE_EVERY_EPOCHS)
                    save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(
                        epoch_num)
                    self._save_inner_model(save_path)
                    self.log('Saved after %d epochs in: %s' %
                             (epoch_num, save_path))
                    evaluation_results = self.evaluate()
                    # Rename 'topk' to the concrete k used, for readability.
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    self.log(
                        'After {nr_epochs} epochs -- {evaluation_results}'.
                        format(nr_epochs=epoch_num,
                               evaluation_results=evaluation_results_str))
        except tf.errors.OutOfRangeError:
            pass  # The reader iterator is exhausted and has no more batches to produce.

        self.log('Done training')

        if self.config.MODEL_SAVE_PATH:
            self._save_inner_model(self.config.MODEL_SAVE_PATH)
            self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

        elapsed = int(time.time() - start_time)
        self.log("Training time: %sH:%sM:%sS\n" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
Ejemplo n.º 3
0
class Code2VecModel(Code2VecModelBase):
    """TF1 (``tf.compat.v1``) implementation of the code2vec model.

    Owns a single TF session; evaluation and prediction sub-graphs (and their
    readers) are built lazily on first use and cached on the instance.
    """

    def __init__(self, config: Config):
        self.sess = tf.compat.v1.Session()
        self.saver = None

        # Lazily created by evaluate() / predict() on their first call.
        self.eval_reader = None
        self.eval_input_iterator_reset_op = None
        self.predict_reader = None

        # self.eval_placeholder = None
        self.predict_placeholder = None
        self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, self.eval_code_vectors = None, None, None, None
        self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op = None, None, None

        # TF graph variable name under which each vocabulary's embedding
        # matrix is created / looked up (also the checkpoint key).
        self.vocab_type_to_tf_variable_name_mapping: Dict[VocabType, str] = {
            VocabType.Token: 'WORDS_VOCAB',
            VocabType.Target: 'TARGET_WORDS_VOCAB',
            VocabType.Path: 'PATHS_VOCAB'
        }

        super(Code2VecModel, self).__init__(config)

    def train(self):
        """Run the TF1-style training loop until the dataset reader is exhausted.

        Builds a fresh training graph from a ``PathContextReader`` dataset
        iterator, initializes (and optionally restores) variables, then runs
        one optimizer step per batch. Progress is logged every
        ``NUM_BATCHES_TO_LOG_PROGRESS`` batches; the model is checkpointed and
        evaluated every ``num_batches_to_save_and_eval`` batches.
        """
        self.log('Starting training')
        start_time = time.time()

        batch_num = 0
        sum_loss = 0
        multi_batch_start_time = time.time()
        # Save/evaluate every SAVE_EVERY_EPOCHS worth of per-epoch steps,
        # but at least once per batch if that product rounds down to zero.
        num_batches_to_save_and_eval = max(
            int(self.config.train_steps_per_epoch *
                self.config.SAVE_EVERY_EPOCHS), 1)

        train_reader = PathContextReader(
            vocabs=self.vocabs,
            model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
            config=self.config,
            estimator_action=EstimatorAction.Train)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(
            train_reader.get_dataset())
        input_iterator_reset_op = input_iterator.initializer
        input_tensors = input_iterator.get_next()

        optimizer, train_loss = self._build_tf_training_graph(input_tensors)
        self.saver = tf.compat.v1.train.Saver(
            max_to_keep=self.config.MAX_TO_KEEP)

        # Log parameter counts, overall and per trainable variable.
        self.log('Number of trainable params: {}'.format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.compat.v1.trainable_variables()
            ])))
        for variable in tf.compat.v1.trainable_variables():
            self.log("variable name: {} -- shape: {} -- #params: {}".format(
                variable.name, variable.get_shape(),
                np.prod(variable.get_shape().as_list())))

        self._initialize_session_variables()

        # Restoring (if configured) must happen after the initializers ran.
        if self.config.MODEL_LOAD_PATH:
            self._load_inner_model(self.sess)

        self.sess.run(input_iterator_reset_op)
        time.sleep(1)
        self.log('Started reader...')
        # run evaluation in a loop until iterator is exhausted.
        try:
            while True:
                # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
                batch_num += 1

                # Actual training for the current batch.
                _, batch_loss = self.sess.run([optimizer, train_loss])

                sum_loss += batch_loss
                if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                    self._trace_training(sum_loss, batch_num,
                                         multi_batch_start_time)
                    # Uri: the "shuffle_batch/random_shuffle_queue_Size:0" op does not exist since the migration to the new reader.
                    # self.log('Number of waiting examples in queue: %d' % self.sess.run(
                    #    "shuffle_batch/random_shuffle_queue_Size:0"))
                    sum_loss = 0
                    multi_batch_start_time = time.time()
                if batch_num % num_batches_to_save_and_eval == 0:
                    epoch_num = int(
                        (batch_num / num_batches_to_save_and_eval) *
                        self.config.SAVE_EVERY_EPOCHS)
                    save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(
                        epoch_num)
                    self._save_inner_model(save_path)
                    self.log('Saved after %d epochs in: %s' %
                             (epoch_num, save_path))
                    evaluation_results = self.evaluate()
                    # Rename 'topk' to the concrete k used, for readability.
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    self.log(
                        'After {nr_epochs} epochs -- {evaluation_results}'.
                        format(nr_epochs=epoch_num,
                               evaluation_results=evaluation_results_str))
        except tf.errors.OutOfRangeError:
            pass  # The reader iterator is exhausted and has no more batches to produce.

        self.log('Done training')

        if self.config.MODEL_SAVE_PATH:
            self._save_inner_model(self.config.MODEL_SAVE_PATH)
            self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

        elapsed = int(time.time() - start_time)
        self.log("Training time: %sH:%sM:%sS\n" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))

    def evaluate(self) -> Optional[ModelEvaluationResults]:
        """Evaluate the model over the evaluation dataset.

        Lazily builds the evaluation reader and sub-graph on first call and
        caches them on the instance. Writes per-example prediction logs to
        'log.txt', optionally exports code vectors to
        ``TEST_DATA_PATH + '.vectors'``, and returns aggregated top-k accuracy
        and subtoken precision/recall/F1. Returns None when the call only
        releases a model (``config.RELEASE``).
        """
        eval_start_time = time.time()
        if self.eval_reader is None:
            self.eval_reader = PathContextReader(
                vocabs=self.vocabs,
                model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(
                ),
                config=self.config,
                estimator_action=EstimatorAction.Evaluate)
            input_iterator = tf.compat.v1.data.make_initializable_iterator(
                self.eval_reader.get_dataset())
            self.eval_input_iterator_reset_op = input_iterator.initializer
            input_tensors = input_iterator.get_next()

            self.eval_top_words_op, self.eval_top_values_op, self.eval_original_names_op, _, _, _, _, _, _, \
                self.eval_code_vectors = self._build_tf_test_graph(input_tensors)
            self.saver = tf.compat.v1.train.Saver()

        # Standalone evaluation (not interleaved with training): initialize
        # and restore weights from disk before running the eval graph.
        if self.config.MODEL_LOAD_PATH and not self.config.TRAIN_DATA_PATH_PREFIX:
            self._initialize_session_variables()
            self._load_inner_model(self.sess)
            if self.config.RELEASE:
                release_name = self.config.MODEL_LOAD_PATH + '.release'
                self.log('Releasing model, output model: %s' % release_name)
                self.saver.save(self.sess, release_name)
                return None  # FIXME: why do we return none here?

        with open('log.txt', 'w') as log_output_file:
            if self.config.EXPORT_CODE_VECTORS:
                # NOTE(review): closed manually at the end, but leaks if an
                # exception escapes this block -- consider a context manager.
                code_vectors_file = open(
                    self.config.TEST_DATA_PATH + '.vectors', 'w')
            total_predictions = 0
            total_prediction_batches = 0
            subtokens_evaluation_metric = SubtokensEvaluationMetric(
                partial(common.filter_impossible_names,
                        self.vocabs.target_vocab.special_words))
            topk_accuracy_evaluation_metric = TopKAccuracyEvaluationMetric(
                self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION,
                partial(common.get_first_match_word_from_top_predictions,
                        self.vocabs.target_vocab.special_words))
            start_time = time.time()

            self.sess.run(self.eval_input_iterator_reset_op)

            self.log('Starting evaluation')

            # Run evaluation in a loop until iterator is exhausted.
            # Each iteration = batch. We iterate as long as the tf iterator (reader) yields batches.
            try:
                while True:
                    top_words, top_scores, original_names, code_vectors = self.sess.run(
                        [
                            self.eval_top_words_op, self.eval_top_values_op,
                            self.eval_original_names_op, self.eval_code_vectors
                        ], )

                    # shapes:
                    #   top_words: (batch, top_k);   top_scores: (batch, top_k)
                    #   original_names: (batch, );   code_vectors: (batch, code_vector_size)

                    top_words = common.binary_to_string_matrix(
                        top_words)  # (batch, top_k)
                    original_names = common.binary_to_string_list(
                        original_names)  # (batch,)

                    self._log_predictions_during_evaluation(
                        zip(original_names, top_words), log_output_file)
                    topk_accuracy_evaluation_metric.update_batch(
                        zip(original_names, top_words))
                    subtokens_evaluation_metric.update_batch(
                        zip(original_names, top_words))

                    total_predictions += len(original_names)
                    total_prediction_batches += 1
                    if self.config.EXPORT_CODE_VECTORS:
                        self._write_code_vectors(code_vectors_file,
                                                 code_vectors)
                    if total_prediction_batches % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                        elapsed = time.time() - start_time
                        # start_time = time.time()
                        self._trace_evaluation(total_predictions, elapsed)
            except tf.errors.OutOfRangeError:
                pass  # reader iterator is exhausted and has no more batches to produce.
            self.log('Done evaluating, epoch reached')
            log_output_file.write(
                str(topk_accuracy_evaluation_metric.topk_correct_predictions) +
                '\n')
        if self.config.EXPORT_CODE_VECTORS:
            code_vectors_file.close()

        elapsed = int(time.time() - eval_start_time)
        self.log("Evaluation time: %sH:%sM:%sS" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
        return ModelEvaluationResults(
            topk_acc=topk_accuracy_evaluation_metric.topk_correct_predictions,
            subtoken_precision=subtokens_evaluation_metric.precision,
            subtoken_recall=subtokens_evaluation_metric.recall,
            subtoken_f1=subtokens_evaluation_metric.f1)

    def _build_tf_training_graph(self, input_tensors):
        """Build the training sub-graph; return (optimizer_op, loss_tensor).

        Under variable scope 'model', creates the token/target/path embedding
        matrices and the attention parameter, computes attention-weighted code
        vectors, and a sparse-softmax cross-entropy loss over target words
        averaged per example in the batch, minimized with Adam.
        """
        # Use `_TFTrainModelInputTensorsFormer` to access input tensors by name.
        input_tensors = _TFTrainModelInputTensorsFormer(
        ).from_model_input_form(input_tensors)
        # shape of (batch, 1) for input_tensors.target_index
        # shape of (batch, max_contexts) for others:
        #   input_tensors.path_source_token_indices, input_tensors.path_indices,
        #   input_tensors.path_target_token_indices, input_tensors.context_valid_mask

        with tf.compat.v1.variable_scope('model'):
            tokens_vocab = tf.compat.v1.get_variable(
                self.vocab_type_to_tf_variable_name_mapping[VocabType.Token],
                shape=(self.vocabs.token_vocab.size,
                       self.config.TOKEN_EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.compat.v1.initializers.variance_scaling(
                    scale=1.0, mode='fan_out', distribution="uniform"))
            targets_vocab = tf.compat.v1.get_variable(
                self.vocab_type_to_tf_variable_name_mapping[VocabType.Target],
                shape=(self.vocabs.target_vocab.size,
                       self.config.TARGET_EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.compat.v1.initializers.variance_scaling(
                    scale=1.0, mode='fan_out', distribution="uniform"))
            # NOTE(review): ATTENTION is declared (CODE_VECTOR_SIZE, 1) here
            # but (context_vector_size, 1) in the test graph; these sizes must
            # coincide for variable reuse -- TODO confirm config guarantees it.
            attention_param = tf.compat.v1.get_variable(
                'ATTENTION',
                shape=(self.config.CODE_VECTOR_SIZE, 1),
                dtype=tf.float32)
            paths_vocab = tf.compat.v1.get_variable(
                self.vocab_type_to_tf_variable_name_mapping[VocabType.Path],
                shape=(self.vocabs.path_vocab.size,
                       self.config.PATH_EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.compat.v1.initializers.variance_scaling(
                    scale=1.0, mode='fan_out', distribution="uniform"))

            code_vectors, _, _, _ = self._calculate_weighted_contexts(
                tokens_vocab, paths_vocab, attention_param,
                input_tensors.path_source_token_indices,
                input_tensors.path_indices,
                input_tensors.path_target_token_indices,
                input_tensors.context_valid_mask)

            # logits: (batch, target_vocab_size)
            logits = tf.matmul(code_vectors, targets_vocab, transpose_b=True)
            batch_size = tf.cast(tf.shape(input_tensors.target_index)[0],
                                 dtype=tf.float32)
            # Mean cross-entropy per example (sum over batch / batch size).
            loss = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=tf.reshape(input_tensors.target_index, [-1]),
                    logits=logits)) / batch_size

            optimizer = tf.compat.v1.train.AdamOptimizer().minimize(loss)

        return optimizer, loss

    def _calculate_weighted_contexts(self,
                                     tokens_vocab,
                                     paths_vocab,
                                     attention_param,
                                     source_input,
                                     path_input,
                                     target_input,
                                     valid_mask,
                                     is_evaluating=False):
        """Compute attention-weighted code vectors from path-context inputs.

        Embeds source-token / path / target-token ids, concatenates them into
        per-context vectors, applies a tanh(dense) transform, then softmax
        attention over contexts (padded contexts masked out). Dropout is
        applied to the context embeddings only when ``is_evaluating`` is False.

        Returns (code_vectors, attention_weights, flat_embed, path_embed).
        """
        source_word_embed = tf.nn.embedding_lookup(
            params=tokens_vocab,
            ids=source_input)  # (batch, max_contexts, dim)
        path_embed = tf.nn.embedding_lookup(
            params=paths_vocab, ids=path_input)  # (batch, max_contexts, dim)
        target_word_embed = tf.nn.embedding_lookup(
            params=tokens_vocab,
            ids=target_input)  # (batch, max_contexts, dim)

        context_embed = tf.concat(
            [source_word_embed, path_embed, target_word_embed],
            axis=-1)  # (batch, max_contexts, dim * 3)

        if not is_evaluating:
            context_embed = tf.nn.dropout(context_embed,
                                          rate=1 -
                                          self.config.DROPOUT_KEEP_RATE)

        flat_embed = tf.reshape(context_embed,
                                [-1, self.config.context_vector_size
                                 ])  # (batch * max_contexts, dim * 3)
        transform_param = tf.compat.v1.get_variable(
            'TRANSFORM',
            shape=(self.config.context_vector_size,
                   self.config.CODE_VECTOR_SIZE),
            dtype=tf.float32)

        flat_embed = tf.tanh(tf.matmul(
            flat_embed, transform_param))  # (batch * max_contexts, dim * 3)

        contexts_weights = tf.matmul(
            flat_embed, attention_param)  # (batch * max_contexts, 1)
        batched_contexts_weights = tf.reshape(
            contexts_weights,
            [-1, self.config.MAX_CONTEXTS, 1])  # (batch, max_contexts, 1)
        # log(valid_mask) maps 0-entries to -inf, so after the softmax below
        # padded contexts receive exactly zero attention weight.
        mask = tf.math.log(valid_mask)  # (batch, max_contexts)
        mask = tf.expand_dims(mask, axis=2)  # (batch, max_contexts, 1)
        batched_contexts_weights += mask  # (batch, max_contexts, 1)
        attention_weights = tf.nn.softmax(batched_contexts_weights,
                                          axis=1)  # (batch, max_contexts, 1)

        batched_embed = tf.reshape(
            flat_embed,
            shape=[-1, self.config.MAX_CONTEXTS, self.config.CODE_VECTOR_SIZE])
        # Weighted sum over the contexts axis yields one vector per example.
        code_vectors = tf.reduce_sum(tf.multiply(batched_embed,
                                                 attention_weights),
                                     axis=1)  # (batch, dim * 3)

        return code_vectors, attention_weights, flat_embed, path_embed

    def _build_tf_test_graph(self, input_tensors, normalize_scores=False):
        """Build the evaluation/prediction sub-graph (variables frozen).

        Reuses the training variables when training data is configured (see
        ``get_should_reuse_variables``). Computes code vectors without dropout
        and scores every target word, returning the top-k candidate words and
        scores (softmax-normalized when ``normalize_scores`` is True) along
        with attention weights, embeddings, the input context strings and the
        code vectors.
        """
        with tf.compat.v1.variable_scope(
                'model', reuse=self.get_should_reuse_variables()):
            tokens_vocab = tf.compat.v1.get_variable(
                self.vocab_type_to_tf_variable_name_mapping[VocabType.Token],
                shape=(self.vocabs.token_vocab.size,
                       self.config.TOKEN_EMBEDDINGS_SIZE),
                dtype=tf.float32,
                trainable=False)
            targets_vocab = tf.compat.v1.get_variable(
                self.vocab_type_to_tf_variable_name_mapping[VocabType.Target],
                shape=(self.vocabs.target_vocab.size,
                       self.config.TARGET_EMBEDDINGS_SIZE),
                dtype=tf.float32,
                trainable=False)
            # NOTE(review): ATTENTION is (context_vector_size, 1) here but the
            # training graph declares it as (CODE_VECTOR_SIZE, 1); variable
            # reuse requires these to be equal -- TODO confirm via config.
            attention_param = tf.compat.v1.get_variable(
                'ATTENTION',
                shape=(self.config.context_vector_size, 1),
                dtype=tf.float32,
                trainable=False)
            paths_vocab = tf.compat.v1.get_variable(
                self.vocab_type_to_tf_variable_name_mapping[VocabType.Path],
                shape=(self.vocabs.path_vocab.size,
                       self.config.PATH_EMBEDDINGS_SIZE),
                dtype=tf.float32,
                trainable=False)

            targets_vocab = tf.transpose(
                targets_vocab)  # (dim * 3, target_word_vocab)

            # Use `_TFEvaluateModelInputTensorsFormer` to access input tensors by name.
            input_tensors = _TFEvaluateModelInputTensorsFormer(
            ).from_model_input_form(input_tensors)
            # shape of (batch, 1) for input_tensors.target_string
            # shape of (batch, max_contexts) for the other tensors

            code_vectors, attention_weights, flat_embed, path_embed = self._calculate_weighted_contexts(
                tokens_vocab,
                paths_vocab,
                attention_param,
                input_tensors.path_source_token_indices,
                input_tensors.path_indices,
                input_tensors.path_target_token_indices,
                input_tensors.context_valid_mask,
                is_evaluating=True)

        scores = tf.matmul(code_vectors,
                           targets_vocab)  # (batch, target_word_vocab)

        # k is capped by the target vocabulary size.
        topk_candidates = tf.nn.top_k(
            scores,
            k=tf.minimum(self.config.TOP_K_WORDS_CONSIDERED_DURING_PREDICTION,
                         self.vocabs.target_vocab.size))
        top_indices = topk_candidates.indices
        top_words = self.vocabs.target_vocab.lookup_word(top_indices)
        original_words = input_tensors.target_string
        top_scores = topk_candidates.values
        if normalize_scores:
            top_scores = tf.nn.softmax(top_scores)

        return top_words, top_scores, original_words, attention_weights, flat_embed, path_embed, \
               input_tensors.path_source_token_strings, input_tensors.path_strings, \
               input_tensors.path_target_token_strings, code_vectors

    def predict(
            self,
            predict_data_lines: Iterable[str]) -> List[ModelPredictionResults]:
        """Predict method names for raw input lines, one result per line.

        Lazily builds the prediction sub-graph (fed through a string
        placeholder) on first call. Each input line is processed as a
        single-example batch; the leading batch axis is squeezed away before
        assembling a ``ModelPredictionResults`` with top-k words/scores,
        per-context attention and embeddings, and (optionally) the code vector.
        """
        if self.predict_reader is None:
            self.predict_reader = PathContextReader(
                vocabs=self.vocabs,
                model_input_tensors_former=_TFEvaluateModelInputTensorsFormer(
                ),
                config=self.config,
                estimator_action=EstimatorAction.Predict)
            self.predict_placeholder = tf.compat.v1.placeholder(tf.string)
            reader_output = self.predict_reader.process_input_row(
                self.predict_placeholder)

            self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op, \
            self.attention_weights_op, self.flat_embed_op, self.path_embed_op, self.predict_source_string,\
            self.predict_path_string, self.predict_path_target_string, self.predict_code_vectors = \
                self._build_tf_test_graph(reader_output, normalize_scores=True)

            self._initialize_session_variables()
            self.saver = tf.compat.v1.train.Saver()
            self._load_inner_model(sess=self.sess)

        prediction_results: List[ModelPredictionResults] = []
        for line in predict_data_lines:
            batch_top_words, batch_top_scores, batch_original_name, batch_attention_weights, flat_embed, \
                batch_path_embed, batch_path_source_strings, batch_path_strings, \
                batch_path_target_strings, batch_code_vectors \
                = self.sess.run(
                    [self.predict_top_words_op, self.predict_top_values_op, self.predict_original_names_op,
                     self.attention_weights_op, self.flat_embed_op, self.path_embed_op, self.predict_source_string,
                     self.predict_path_string, self.predict_path_target_string, self.predict_code_vectors],
                    feed_dict={self.predict_placeholder: line})
            # shapes:
            #   batch_top_words, top_scores: (batch, top_k)
            #   batch_original_name: (batch, )
            #   batch_attention_weights: (batch, max_context, 1)
            #   batch_path_source_strings, batch_path_strings, batch_path_target_strings: (batch, max_context)
            #   batch_code_vectors: (batch, code_vector_size)

            # remove first axis: (batch=1, ...)
            assert all(
                tensor.shape[0] == 1
                for tensor in (batch_top_words, batch_top_scores,
                               batch_original_name, batch_attention_weights,
                               batch_path_embed, batch_path_source_strings,
                               batch_path_strings, batch_path_target_strings,
                               batch_code_vectors))
            top_words = np.squeeze(batch_top_words, axis=0)
            top_scores = np.squeeze(batch_top_scores, axis=0)
            original_name = batch_original_name[0]
            attention_weights = np.squeeze(batch_attention_weights, axis=0)
            path_embed = np.squeeze(batch_path_embed, axis=0)
            path_source_strings = np.squeeze(batch_path_source_strings, axis=0)
            path_strings = np.squeeze(batch_path_strings, axis=0)
            path_target_strings = np.squeeze(batch_path_target_strings, axis=0)
            code_vectors = np.squeeze(batch_code_vectors, axis=0)

            # Decode byte strings produced by the TF graph into Python str.
            top_words = common.binary_to_string_list(top_words)
            original_name = common.binary_to_string(original_name)
            attention_per_context = self._get_attention_weight_per_context(
                path_source_strings, path_strings, path_target_strings,
                attention_weights)
            embed_per_context = self._get_embed_per_context(
                path_source_strings, path_strings, path_target_strings,
                flat_embed)
            embed_per_path = self._get_embed_per_path(path_strings, path_embed)
            prediction_results.append(
                ModelPredictionResults(
                    original_name=original_name,
                    topk_predicted_words=top_words,
                    topk_predicted_words_scores=top_scores,
                    attention_per_context=attention_per_context,
                    embed_per_context=embed_per_context,
                    embed_per_path=embed_per_path,
                    code_vector=(code_vectors if
                                 self.config.EXPORT_CODE_VECTORS else None)))
        return prediction_results

    def _save_inner_model(self, path: str):
        """Checkpoint the current session's weights under *path* via the Saver."""
        self.saver.save(self.sess, path)

    def _load_inner_model(self, sess=None):
        """Restore model weights from ``config.MODEL_LOAD_PATH`` into *sess*.

        Does nothing when *sess* is None.
        """
        if sess is None:
            return
        load_path = self.config.MODEL_LOAD_PATH
        self.log('Loading model weights from: ' + load_path)
        self.saver.restore(sess, load_path)
        self.log('Done loading model weights')

    def _get_vocab_embedding_as_np_array(self,
                                         vocab_type: VocabType) -> np.ndarray:
        """Return the embedding matrix for *vocab_type* as a numpy array.

        Restores model weights from ``config.MODEL_LOAD_PATH`` into the
        session before reading the variable's current value.
        """
        assert vocab_type in VocabType
        vocab_tf_variable_name = self.vocab_type_to_tf_variable_name_mapping[
            vocab_type]
        # NOTE(review): get_variable is called without a shape, so the
        # variable must already be resolvable in scope 'model' under the
        # inherited reuse setting -- confirm how callers set up the graph.
        with tf.compat.v1.variable_scope('model', reuse=None):
            embeddings = tf.compat.v1.get_variable(vocab_tf_variable_name)
            self.saver = tf.compat.v1.train.Saver()
            self._load_inner_model(self.sess)
            vocab_embedding_matrix = self.sess.run(embeddings)
            return vocab_embedding_matrix

    def get_should_reuse_variables(self):
        """Return True when training data is configured (so the test graph
        reuses the training graph's variables); otherwise None so that
        ``variable_scope`` inherits its parent's reuse setting."""
        return True if self.config.TRAIN_DATA_PATH_PREFIX else None

    def _log_predictions_during_evaluation(self, results, output_file):
        """Write one log entry per evaluated example.

        *results* yields (original_name, top_predicted_words) pairs. Rank-1
        matches, lower-rank matches and misses are reported with different
        message formats.
        """
        for original_name, top_predicted_words in results:
            found_match = common.get_first_match_word_from_top_predictions(
                self.vocabs.target_vocab.special_words, original_name,
                top_predicted_words)
            if found_match is None:
                output_file.write('No results for predicting: ' +
                                  original_name)
                continue
            prediction_idx, predicted_word = found_match
            if prediction_idx == 0:
                output_file.write('Original: ' + original_name +
                                  ', predicted 1st: ' + predicted_word +
                                  '\n')
            else:
                output_file.write('\t\t predicted correctly at rank: ' +
                                  str(prediction_idx + 1) + '\n')

    def _trace_training(self, sum_loss, batch_num, multi_batch_start_time):
        """Log the average per-sample loss and throughput since the last trace."""
        elapsed = time.time() - multi_batch_start_time
        # Number of samples seen since the last trace call.
        samples = self.config.NUM_BATCHES_TO_LOG_PROGRESS * self.config.TRAIN_BATCH_SIZE
        avg_loss = sum_loss / samples
        # Guard against a zero (or negative) elapsed interval.
        throughput = samples / (elapsed if elapsed > 0 else 1)
        self.log('Average loss at batch %d: %f, \tthroughput: %d samples/sec' %
                 (batch_num, avg_loss, throughput))

    def _trace_evaluation(self, total_predictions, elapsed):
        """Log evaluation progress and the prediction throughput so far."""
        # Avoid division by zero on a (near-)instant interval.
        safe_elapsed = elapsed if elapsed > 0 else 1
        self.log('Evaluated %d examples...' % total_predictions)
        self.log("Prediction throughput: %d samples/sec" %
                 int(total_predictions / safe_elapsed))

    def close_session(self):
        """Release the resources held by the underlying TF session."""
        self.sess.close()

    def _initialize_session_variables(self):
        """Run the global, local and table initializers for the current graph.

        Must be called after the graph is built and before any variable is
        read, restored over, or trained.
        """
        self.sess.run(
            tf.group(tf.compat.v1.global_variables_initializer(),
                     tf.compat.v1.local_variables_initializer(),
                     tf.compat.v1.tables_initializer()))
        # Fixed typo in the log message ('Initalized' -> 'Initialized').
        self.log('Initialized variables')
Ejemplo n.º 4
0
    def train(self):
        """Train the model until the input reader is exhausted.

        Builds the TF1 training graph over an initializable dataset iterator
        and loops over batches. Periodically:
          * logs average loss / throughput (every NUM_BATCHES_TO_LOG_PROGRESS
            batches),
          * runs an extra validation pass (every EXTRA_VALIDATION_PERIOD
            batches, when that constant is > 0),
          * saves a checkpoint and evaluates (every
            `num_batches_to_save_and_eval` batches).
        Train/validation losses are appended to CSV files under
        'losses_logs/', and optionally mirrored to TensorBoard.

        Bug fix: the extra-validation loss log previously used the exact same
        path expression as the per-epoch loss log, so both loops appended
        into ONE file and their rows interleaved. The extra-validation log
        now gets its own 'e_losses_log' file, and a single timestamp is
        captured once so all artifacts of a run share the same name
        (previously `common.now_str()` was called per path and could tick
        between calls).
        """
        self.log('Starting training')
        start_time = time.time()

        batch_num = 0
        sum_loss = 0
        multi_batch_start_time = time.time()
        # Number of batches between checkpoint + full-evaluation rounds.
        num_batches_to_save_and_eval = max(
            int(self.config.train_steps_per_epoch *
                self.config.SAVE_EVERY_EPOCHS), 1)

        train_reader = PathContextReader(
            vocabs=self.vocabs,
            model_input_tensors_former=_TFTrainModelInputTensorsFormer(),
            config=self.config,
            estimator_action=EstimatorAction.Train)
        input_iterator = tf.compat.v1.data.make_initializable_iterator(
            train_reader.get_dataset())
        input_iterator_reset_op = input_iterator.initializer
        input_tensors = input_iterator.get_next()

        optimizer, train_loss = self._build_tf_training_graph(input_tensors)
        self.saver = tf.compat.v1.train.Saver(
            max_to_keep=self.config.MAX_TO_KEEP)

        self.log('Number of trainable params: {}'.format(
            np.sum([
                np.prod(v.get_shape().as_list())
                for v in tf.compat.v1.trainable_variables()
            ])))
        for variable in tf.compat.v1.trainable_variables():
            self.log("variable name: {} -- shape: {} -- #params: {}".format(
                variable.name, variable.get_shape(),
                np.prod(variable.get_shape().as_list())))

        self._initialize_session_variables()

        if self.config.MODEL_LOAD_PATH:
            self._load_inner_model(self.sess)

        self.sess.run(input_iterator_reset_op)
        time.sleep(1)
        self.log('Started reader...')

        training_logger = None
        os.makedirs('losses_logs/', exist_ok=True)
        # Capture one timestamp for the whole run so all log files match.
        run_timestamp = common.now_str()[:-2]
        loss_log_path = 'losses_logs/losses_log' + run_timestamp + '.csv'
        # Distinct file for the extra-validation losses (see docstring).
        e_loss_log_path = ('losses_logs/e_losses_log' + run_timestamp + '.csv'
                           if EXTRA_VALIDATION_PERIOD > 0 else None)
        if self.config.USE_TENSORBOARD:
            log_dir = "logs/scalars/train_" + run_timestamp
            training_logger = tf.summary.create_file_writer(log_dir)
            self.sess.run(training_logger.init())
            training_logger.set_as_default()
        # Run training in a loop until the dataset iterator is exhausted.
        try:
            epoch_losses = []
            extra_losses = []
            while True:
                # Each iteration = one batch; the tf iterator raises
                # OutOfRangeError when there are no more batches.
                batch_num += 1

                # Actual training step for the current batch.
                _, batch_loss = self.sess.run([optimizer, train_loss])

                sum_loss += batch_loss
                epoch_losses.append(batch_loss)
                extra_losses.append(batch_loss)
                if batch_num % self.config.NUM_BATCHES_TO_LOG_PROGRESS == 0:
                    self._trace_training(sum_loss, batch_num,
                                         multi_batch_start_time)
                    sum_loss = 0
                    multi_batch_start_time = time.time()
                if EXTRA_VALIDATION_PERIOD > 0 and batch_num % EXTRA_VALIDATION_PERIOD == 0:
                    # Lightweight mid-epoch validation pass.
                    evaluation_results = self.evaluate()
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    extra_mean_train_loss = np.mean(
                        extra_losses) / self.config.TRAIN_BATCH_SIZE
                    extra_losses.clear()
                    print(
                        f'Losses: train: {extra_mean_train_loss}, validation: {evaluation_results.loss}'
                    )
                    with open(e_loss_log_path, 'at') as loss_log_file:
                        loss_log_file.write(
                            f'{extra_mean_train_loss},{evaluation_results.loss}\n'
                        )
                    if self.config.USE_TENSORBOARD:
                        self.sess.run([
                            tf.summary.scalar(
                                'e_precision',
                                evaluation_results.subtoken_precision,
                                step=batch_num),
                            tf.summary.scalar(
                                'e_recall',
                                evaluation_results.subtoken_recall,
                                step=batch_num),
                            tf.summary.scalar('e_f1',
                                              evaluation_results.subtoken_f1,
                                              step=batch_num),
                            tf.summary.scalar('e_train_loss',
                                              extra_mean_train_loss,
                                              step=batch_num),
                            tf.summary.scalar('e_validation_loss',
                                              evaluation_results.loss,
                                              step=batch_num),
                        ])
                        self.sess.run([
                            tf.summary.scalar(f'e_top{i}_acc',
                                              top_i_acc,
                                              step=batch_num) for i, top_i_acc
                            in enumerate(evaluation_results.topk_acc)
                        ])
                        self.sess.run(training_logger.flush())
                    self.log(
                        f'After {batch_num} batches -- {evaluation_results_str}'
                    )
                if batch_num % num_batches_to_save_and_eval == 0:
                    # Checkpoint + full evaluation round ("epoch" boundary).
                    epoch_num = int(
                        (batch_num / num_batches_to_save_and_eval) *
                        self.config.SAVE_EVERY_EPOCHS)
                    model_save_path = self.config.MODEL_SAVE_PATH + '_iter' + str(
                        epoch_num)
                    self.save(model_save_path)
                    self.log('Saved after %d epochs in: %s' %
                             (epoch_num, model_save_path))
                    evaluation_results = self.evaluate()
                    evaluation_results_str = (str(evaluation_results).replace(
                        'topk', 'top{}'.format(
                            self.config.
                            TOP_K_WORDS_CONSIDERED_DURING_PREDICTION)))
                    epoch_mean_train_loss = np.mean(
                        epoch_losses) / self.config.TRAIN_BATCH_SIZE
                    epoch_losses.clear()
                    print(
                        f'Losses: train: {epoch_mean_train_loss}, validation: {evaluation_results.loss}'
                    )
                    with open(loss_log_path, 'at') as loss_log_file:
                        loss_log_file.write(
                            f'{epoch_mean_train_loss},{evaluation_results.loss}\n'
                        )
                    if self.config.USE_TENSORBOARD:
                        self.sess.run([
                            tf.summary.scalar(
                                'precision',
                                evaluation_results.subtoken_precision,
                                step=epoch_num),
                            tf.summary.scalar(
                                'recall',
                                evaluation_results.subtoken_recall,
                                step=epoch_num),
                            tf.summary.scalar('f1',
                                              evaluation_results.subtoken_f1,
                                              step=epoch_num),
                            tf.summary.scalar('train_loss',
                                              epoch_mean_train_loss,
                                              step=epoch_num),
                            tf.summary.scalar('validation_loss',
                                              evaluation_results.loss,
                                              step=epoch_num),
                        ])
                        self.sess.run([
                            tf.summary.scalar(f'top{i}_acc',
                                              top_i_acc,
                                              step=epoch_num) for i, top_i_acc
                            in enumerate(evaluation_results.topk_acc)
                        ])
                        self.sess.run(training_logger.flush())
                    self.log(
                        'After {nr_epochs} epochs -- {evaluation_results}'.
                        format(nr_epochs=epoch_num,
                               evaluation_results=evaluation_results_str))
        except tf.errors.OutOfRangeError:
            pass  # The reader iterator is exhausted and has no more batches to produce.

        self.log('Done training')

        if self.config.MODEL_SAVE_PATH:
            self._save_inner_model(self.config.MODEL_SAVE_PATH)
            self.log('Model saved in file: %s' % self.config.MODEL_SAVE_PATH)

        elapsed = int(time.time() - start_time)
        self.log("Training time: %sH:%sM:%sS\n" %
                 ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
Ejemplo n.º 5
0
                        required=False,
                        default="training")
    parser.add_argument("--net",
                        dest="net",
                        help="net destination type var or vec",
                        required=False,
                        default="vec")
    args = parser.parse_args()

    print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
    if args.train:
        print(f"dataset/{args.dataset_name}/{args.dataset_name}.{args.net}.csv")
        c2v_vocabs = Code2VecVocabs(net=NetType(args.net))
        pcr = PathContextReader(is_train=True, vocabs=c2v_vocabs,
                                csv_path=f"dataset/{args.dataset_name}/{args.dataset_name}.{args.net}.csv")
        dataset = pcr.get_dataset()
        val_dataset, test_dataset = pcr.get_subdatasets()
        # init lookups

        c2v_vocabs.target_vocab.get_word_to_index_lookup_table()
        c2v_vocabs.token_vocab.get_word_to_index_lookup_table()
        c2v_vocabs.path_vocab.get_word_to_index_lookup_table()

        TOKEN_VOCAB_SIZE = c2v_vocabs.token_vocab.lookup_table_word_to_index.size().numpy()
        TARGET_VOCAB_SIZE = c2v_vocabs.target_vocab.lookup_table_word_to_index.size().numpy()
        PATH_VOCAB_SIZE = c2v_vocabs.path_vocab.lookup_table_word_to_index.size().numpy()
        tf.random.set_seed(42)
        model = code2vec(token_vocab_size=TOKEN_VOCAB_SIZE,
                         target_vocab_size=TARGET_VOCAB_SIZE,
                         path_vocab_size=PATH_VOCAB_SIZE,
                         custom_metrics=["accuracy"])