Example 1
    def evaluate(self, release=False):
        eval_start_time = time.time()
        if self.eval_queue is None:
            self.eval_queue = androidreader.Reader(
                subtoken_to_index=self.subtoken_to_index,
                node_to_index=self.node_to_index,
                target_to_index=self.target_to_index,
                config=self.config,
                is_evaluating=True)
            reader_output = self.eval_queue.get_output()
            self.eval_predicted_indices_op, self.eval_topk_values, _, _, self.method_embedding = \
                self.build_test_graph(reader_output)
            self.eval_true_target_strings_op = reader_output[
                androidreader.TARGET_STRING_KEY]
            self.eval_tag_key_op = reader_output[androidreader.TARGET_TAG_KEY]
            self.saver = tf.train.Saver(max_to_keep=10)

        if self.config.LOAD_PATH and not self.config.TRAIN_PATH:
            self.initialize_session_variables(self.sess)
            self.load_model(self.sess)
            if release:
                release_name = self.config.LOAD_PATH + '.release'
                print('Releasing model, output model: %s' % release_name)
                self.saver.save(self.sess, release_name)
                shutil.copyfile(src=self.config.LOAD_PATH + '.dict',
                                dst=release_name + '.dict')
                return None
        model_dirname = os.path.dirname(
            self.config.SAVE_PATH if self.config.SAVE_PATH else self.config.LOAD_PATH)
        ref_file_name = model_dirname + '/ref.txt'
        predicted_file_name = model_dirname + '/pred.txt'
        embedding_file_name = model_dirname + '/embedding.txt'
        if not os.path.exists(model_dirname):
            os.makedirs(model_dirname)

        # print("itern decoder size is " + str(self.config.DECODER_SIZE))
        with open(model_dirname + '/log.txt', 'w') as output_file, \
                open(ref_file_name, 'w') as ref_file, \
                open(predicted_file_name, 'w') as pred_file, \
                open(embedding_file_name, 'w') as embedding_file:
            num_correct_predictions = 0 if self.config.BEAM_WIDTH == 0 \
                else np.zeros([self.config.BEAM_WIDTH], dtype=np.int32)
            total_predictions = 0
            total_prediction_batches = 0
            true_positive, false_positive, false_negative = 0, 0, 0
            self.eval_queue.reset(self.sess)
            start_time = time.time()

            try:
                while True:
                    predicted_indices, true_target_strings, top_values, method_embeddings, tag = self.sess.run(
                        [
                            self.eval_predicted_indices_op,
                            self.eval_true_target_strings_op,
                            self.eval_topk_values, self.method_embedding,
                            self.eval_tag_key_op
                        ])
                    true_target_strings = Common.binary_to_string_list(
                        true_target_strings)
                    ref_file.write('\n'.join([
                        name.replace(Common.internal_delimiter, ' ')
                        for name in true_target_strings
                    ]) + '\n')
                    if self.config.BEAM_WIDTH > 0:
                        # predicted indices: (batch, time, beam_width)
                        predicted_strings = [[[
                            self.index_to_target[i] for i in timestep
                        ] for timestep in example]
                                             for example in predicted_indices]
                        predicted_strings = [
                            list(map(list, zip(*example)))
                            for example in predicted_strings
                        ]  # (batch, top-k, target_length)
                        # NOTE: only the first example of the batch is written
                        # to the prediction file when beam search is enabled.
                        pred_file.write('\n'.join([
                            ' '.join(Common.filter_impossible_names(words))
                            for words in predicted_strings[0]
                        ]) + '\n')
                    else:
                        predicted_strings = [[
                            self.index_to_target[i] for i in example
                        ] for example in predicted_indices]
                        pred_file.write('\n'.join([
                            ' '.join(Common.filter_impossible_names(words))
                            for words in predicted_strings
                        ]) + '\n')

                    num_correct_predictions = self.update_correct_predictions(
                        num_correct_predictions, output_file,
                        zip(true_target_strings, predicted_strings))
                    true_positive, false_positive, false_negative = self.update_per_subtoken_statistics(
                        zip(true_target_strings, predicted_strings),
                        true_positive, false_positive, false_negative)

                    total_predictions += len(true_target_strings)
                    total_prediction_batches += 1
                    if total_prediction_batches % self.num_batches_to_log == 0:
                        elapsed = time.time() - start_time
                        self.trace_evaluation(output_file,
                                              num_correct_predictions,
                                              total_predictions, elapsed)
                    embedding_file.write('\n'.join([
                        Common.binary_to_string(tag[i]) + ',' + ','.join([
                            str(method_embeddings[i, j])
                            for j in range(method_embeddings.shape[1])
                        ]) for i in range(method_embeddings.shape[0])
                    ]) + '\n')
            except tf.errors.OutOfRangeError:
                pass

            print('Done testing, epoch reached')
            output_file.write(
                str(num_correct_predictions / total_predictions) + '\n')
            # Common.compute_bleu(ref_file_name, predicted_file_name)

        elapsed = int(time.time() - eval_start_time)
        precision, recall, f1 = self.calculate_results(true_positive,
                                                       false_positive,
                                                       false_negative)
        try:
            files_rouge = FilesRouge()
            rouge = files_rouge.get_scores(hyp_path=predicted_file_name,
                                           ref_path=ref_file_name,
                                           avg=True,
                                           ignore_empty=True)
        except ValueError:
            rouge = 0
        print("Evaluation time: %sh%sm%ss" %
              ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
        return num_correct_predictions / total_predictions, \
               precision, recall, f1, rouge
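
calculate_results is referenced above but not shown in this excerpt. A minimal sketch of the standard subtoken-level precision/recall/F1 it presumably computes (the zero-denominator guards are an assumption):

def calculate_results(true_positive, false_positive, false_negative):
    # Standard precision/recall/F1 over the accumulated subtoken counts;
    # returns 0 instead of dividing by zero when a denominator is empty.
    precision = true_positive / (true_positive + false_positive) \
        if true_positive + false_positive > 0 else 0
    recall = true_positive / (true_positive + false_negative) \
        if true_positive + false_negative > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) \
        if precision + recall > 0 else 0
    return precision, recall, f1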
Example 2
    def evaluate(self, release=False):
        eval_start_time = time.time()
        if self.eval_queue is None:
            self.eval_queue = reader.Reader(
                subtoken_to_index=self.subtoken_to_index,
                node_to_index=self.node_to_index,
                target_to_index=self.target_to_index,
                config=self.config,
                is_evaluating=True)
            reader_output = self.eval_queue.get_output()
            self.eval_predicted_indices_op, self.eval_topk_values, _, _ = \
                self.build_test_graph(reader_output)
            self.eval_true_target_strings_op = reader_output[
                reader.TARGET_STRING_KEY]
            self.saver = tf.train.Saver(max_to_keep=10)

        if self.config.LOAD_PATH and not self.config.TRAIN_PATH:
            self.initialize_session_variables(self.sess)
            self.load_model(self.sess)
            if release:
                release_name = self.config.LOAD_PATH + '.release'
                print('Releasing model, output model: %s' % release_name)
                self.saver.save(self.sess, release_name)
                shutil.copyfile(src=self.config.LOAD_PATH + '.dict',
                                dst=release_name + '.dict')
                return None
        model_dirname = os.path.dirname(
            self.config.SAVE_PATH if self.config.SAVE_PATH else self.config.LOAD_PATH)
        ref_file_name = model_dirname + '/ref.txt'
        predicted_file_name = model_dirname + '/pred.txt'
        if not os.path.exists(model_dirname):
            os.makedirs(model_dirname)

        with open(model_dirname + '/log.txt', 'w') as output_file, \
                open(ref_file_name, 'w') as ref_file, \
                open(predicted_file_name, 'w') as pred_file:
            num_correct_predictions = 0 if self.config.BEAM_WIDTH == 0 \
                else np.zeros([self.config.BEAM_WIDTH], dtype=np.int32)
            total_predictions = 0
            total_prediction_batches = 0
            true_positive, false_positive, false_negative = 0, 0, 0
            self.eval_queue.reset(self.sess)
            start_time = time.time()

            try:
                while True:
                    predicted_indices, true_target_strings, top_values = self.sess.run(
                        [
                            self.eval_predicted_indices_op,
                            self.eval_true_target_strings_op,
                            self.eval_topk_values
                        ])
                    true_target_strings = Common.binary_to_string_list(
                        true_target_strings)
                    ref_file.write('\n'.join([
                        name.replace(Common.internal_delimiter, ' ')
                        for name in true_target_strings
                    ]) + '\n')
                    if self.config.BEAM_WIDTH > 0:
                        # predicted indices: (batch, time, beam_width)
                        predicted_strings = [[[
                            self.index_to_target[i] for i in timestep
                        ] for timestep in example]
                                             for example in predicted_indices]
                        predicted_strings = [
                            list(map(list, zip(*example)))
                            for example in predicted_strings
                        ]  # (batch, top-k, target_length)
                        # NOTE: only the first example of the batch is written
                        # to the prediction file when beam search is enabled.
                        pred_file.write('\n'.join([
                            ' '.join(Common.filter_impossible_names(words))
                            for words in predicted_strings[0]
                        ]) + '\n')
                    else:
                        predicted_strings = [[
                            self.index_to_target[i] for i in example
                        ] for example in predicted_indices]
                        pred_file.write('\n'.join([
                            ' '.join(Common.filter_impossible_names(words))
                            for words in predicted_strings
                        ]) + '\n')

                    num_correct_predictions = self.update_correct_predictions(
                        num_correct_predictions, output_file,
                        zip(true_target_strings, predicted_strings))
                    true_positive, false_positive, false_negative = self.update_per_subtoken_statistics(
                        zip(true_target_strings, predicted_strings),
                        true_positive, false_positive, false_negative)

                    total_predictions += len(true_target_strings)
                    total_prediction_batches += 1
                    if total_prediction_batches % self.num_batches_to_log == 0:
                        elapsed = time.time() - start_time
                        self.trace_evaluation(output_file,
                                              num_correct_predictions,
                                              total_predictions, elapsed)
            except tf.errors.OutOfRangeError:
                pass

            print('Done testing, epoch reached')
            output_file.write(
                str(num_correct_predictions / total_predictions) + '\n')
            # Common.compute_bleu(ref_file_name, predicted_file_name)

        elapsed = int(time.time() - eval_start_time)
        precision, recall, f1 = self.calculate_results(true_positive,
                                                       false_positive,
                                                       false_negative)
        print("Evaluation time: %sh%sm%ss" %
              ((elapsed // 60 // 60), (elapsed // 60) % 60, elapsed % 60))
        return num_correct_predictions / total_predictions, precision, recall, f1
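
update_per_subtoken_statistics is likewise referenced but not defined here. A sketch of the usual accumulation, assuming the non-beam case where each prediction is a flat list of subtokens, and reusing the Common helpers that appear above:

def update_per_subtoken_statistics(results, true_positive, false_positive, false_negative):
    # results yields (reference_name, predicted_subtokens) pairs.
    for original_name, predicted in results:
        filtered_predicted = Common.filter_impossible_names(predicted)
        filtered_original = original_name.split(Common.internal_delimiter)
        if ''.join(filtered_original) == ''.join(filtered_predicted):
            # Exact match: every reference subtoken is a true positive.
            true_positive += len(filtered_original)
            continue
        for subtoken in filtered_predicted:
            if subtoken in filtered_original:
                true_positive += 1
            else:
                false_positive += 1
        for subtoken in filtered_original:
            if subtoken not in filtered_predicted:
                false_negative += 1
    return true_positive, false_positive, false_negative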
Example 3
    sess = tf.InteractiveSession()
    tf.group(tf.global_variables_initializer(),
             tf.local_variables_initializer(), tf.tables_initializer()).run()
    reader.reset(sess)

    try:
        while True:
            target_indices, target_strings, target_lengths, path_source_indices, \
            node_indices, path_target_indices, valid_context_mask, path_source_lengths, \
            path_lengths, path_target_lengths, path_source_strings, path_strings, \
            path_target_strings = sess.run(
                [target_index_op, target_string_op, target_length_op, path_source_indices_op,
                 node_indices_op, path_target_indices_op, valid_context_mask_op, path_source_lengths_op,
                 path_lengths_op, path_target_lengths_op, path_source_strings_op, path_strings_op,
                 path_target_strings_op])

            print('Target strings: ',
                  Common.binary_to_string_list(target_strings))
            print(
                'Context strings: ',
                Common.binary_to_string_3d(
                    np.concatenate([
                        path_source_strings, path_strings, path_target_strings
                    ], -1)))
            print('Target indices: ', target_indices)
            print('Target lengths: ', target_lengths)
            print('Path source strings: ',
                  Common.binary_to_string_3d(path_source_strings))
            print('Path source indices: ', path_source_indices)
            print('Path source lengths: ', path_source_lengths)
            print('Path strings: ', Common.binary_to_string_3d(path_strings))
            print('Node indices: ', node_indices)
            print('Path lengths: ', path_lengths)
    except tf.errors.OutOfRangeError:
        pass
Example 4
    sess = tf.InteractiveSession()
    tf.group(tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()).run()
    reader.reset(sess)

    try:
        while True:
            target_indices, target_strings, target_lengths, path_source_indices, \
            node_indices, path_target_indices, valid_context_mask, path_source_lengths, \
            path_lengths, path_target_lengths, path_source_strings, path_strings, \
            path_target_strings = sess.run(
                [target_index_op, target_string_op, target_length_op, path_source_indices_op,
                 node_indices_op, path_target_indices_op, valid_context_mask_op, path_source_lengths_op,
                 path_lengths_op, path_target_lengths_op, path_source_strings_op, path_strings_op,
                 path_target_strings_op])

            print('Target strings: ', Common.binary_to_string_list(target_strings))
            print('Context strings: ', Common.binary_to_string_3d(
                np.concatenate([path_source_strings, path_strings, path_target_strings], -1)))
            print('Target indices: ', target_indices)
            print('Target lengths: ', target_lengths)
            print('Path source strings: ', Common.binary_to_string_3d(path_source_strings))
            print('Path source indices: ', path_source_indices)
            print('Path source lengths: ', path_source_lengths)
            print('Path strings: ', Common.binary_to_string_3d(path_strings))
            print('Node indices: ', node_indices)
            print('Path lengths: ', path_lengths)
            print('Path target strings: ', Common.binary_to_string_3d(path_target_strings))
            print('Path target indices: ', path_target_indices)
            print('Path target lengths: ', path_target_lengths)
            print('Valid context mask: ', valid_context_mask)
    except tf.errors.OutOfRangeError:
        pass
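
The Common.binary_to_string* helpers used throughout these debug loops decode the byte strings that a TF1 session returns for string tensors. Hypothetical minimal versions, for reference:

def binary_to_string(binary_string):
    return binary_string.decode('utf-8')

def binary_to_string_list(binary_string_list):
    return [binary_to_string(s) for s in binary_string_list]

def binary_to_string_3d(binary_string_tensor):
    # Decode a 3-D array of byte strings into nested Python lists.
    return [[[binary_to_string(s) for s in row] for row in matrix]
            for matrix in binary_string_tensor]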
Example 5
    def get_the_processed_input(self, input_tensors):
        target_string = input_tensors[reader.TARGET_STRING_KEY]
        target_index = input_tensors[reader.TARGET_INDEX_KEY]
        target_lengths = input_tensors[reader.TARGET_LENGTH_KEY]
        path_source_indices = input_tensors[reader.PATH_SOURCE_INDICES_KEY]
        node_indices = input_tensors[reader.NODE_INDICES_KEY]
        path_target_indices = input_tensors[reader.PATH_TARGET_INDICES_KEY]
        valid_context_mask = input_tensors[reader.VALID_CONTEXT_MASK_KEY]
        path_source_lengths = input_tensors[reader.PATH_SOURCE_LENGTHS_KEY]
        path_lengths = input_tensors[reader.PATH_LENGTHS_KEY]
        path_target_lengths = input_tensors[reader.PATH_TARGET_LENGTHS_KEY]

        with tf.variable_scope('model'):

            source_word_embed = tf.nn.embedding_lookup(
                params=self.subtoken_vocab, ids=path_source_indices
            )  # (batch, max_contexts, max_name_parts, dim)
            path_embed = tf.nn.embedding_lookup(
                params=self.nodes_vocab, ids=node_indices
            )  # (batch, max_contexts, max_path_length+1, dim)
            target_word_embed = tf.nn.embedding_lookup(
                params=self.subtoken_vocab, ids=path_target_indices
            )  # (batch, max_contexts, max_name_parts, dim)

        # Open a temporary session just to materialize the lookups as NumPy
        # arrays; variables and lookup tables must be initialized first.
        sess = tf.InteractiveSession()
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer(),
                 tf.tables_initializer()).run()
        self.predict_queue.reset(sess)
        path_source_indices_matrix, node_indices_matrix, path_target_indices_matrix, \
            source_word_embed_matrix, path_embed_matrix, target_word_embed_matrix, \
            np_target_string = sess.run(
                [path_source_indices, node_indices, path_target_indices,
                 source_word_embed, path_embed, target_word_embed, target_string])
        sess.close()

        self.predict_path_source_indices_matrix, self.predict_node_indices_matrix, self.predict_path_target_indices_matrix \
            = np.array(path_source_indices_matrix), np.array(node_indices_matrix), np.array(path_target_indices_matrix)
        self.predict_source_word_embed_matrix, self.predict_path_embed_matrix, self.predict_target_word_embed_matrix  = \
            np.array(source_word_embed_matrix), np.array(path_embed_matrix), np.array(target_word_embed_matrix)

        print(
            "=============================================================\n")

        print('Path source indices: ',
              self.predict_path_source_indices_matrix.shape)
        print("------------------------------\n")

        print('Node indices: ', self.predict_node_indices_matrix.shape)
        print("------------------------------\n")

        print('Path target indices: ',
              self.predict_path_target_indices_matrix.shape)
        print("------------------------------\n")

        print('source_word_embed: ',
              self.predict_source_word_embed_matrix.shape)
        print("------------------------------\n")

        print('path_embed: ', self.predict_path_embed_matrix.shape)
        print("------------------------------\n")

        print('target_word_embed: ',
              self.predict_target_word_embed_matrix.shape)

        print(
            "=============================================================\n")
        print('np_target_string: ',
              Common.binary_to_string_list(np_target_string))
        print('np_target_string shape: ', np_target_string.shape)

        self.predict_group_for_each_data = np.full(
            len(self.predict_source_word_embed_matrix), 0)
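
The shape comments on the embedding_lookup calls above follow from how the lookup broadcasts over the id tensor: for a vocabulary of shape (vocab_size, dim) and ids of shape (batch, max_contexts, max_name_parts), the result has shape (batch, max_contexts, max_name_parts, dim). A self-contained TF1 sketch with made-up sizes:

import numpy as np
import tensorflow as tf

vocab = tf.constant(np.random.rand(100, 8), dtype=tf.float32)    # (vocab_size, dim)
ids = tf.constant(np.random.randint(0, 100, size=(4, 200, 5)))   # (batch, max_contexts, max_name_parts)
embedded = tf.nn.embedding_lookup(params=vocab, ids=ids)

with tf.Session() as sess:
    print(sess.run(tf.shape(embedded)))  # [  4 200   5   8]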
Example 6
    def get_the_processed_data(self, input_tensors):
        source_string = input_tensors[reader.PATH_SOURCE_STRINGS_KEY]
        path_string = input_tensors[reader.PATH_STRINGS_KEY]
        path_target_string = input_tensors[reader.PATH_TARGET_STRINGS_KEY]
        target_string = input_tensors[reader.TARGET_STRING_KEY]
        target_index = input_tensors[reader.TARGET_INDEX_KEY]
        target_lengths = input_tensors[reader.TARGET_LENGTH_KEY]
        path_source_indices = input_tensors[reader.PATH_SOURCE_INDICES_KEY]
        node_indices = input_tensors[reader.NODE_INDICES_KEY]
        path_target_indices = input_tensors[reader.PATH_TARGET_INDICES_KEY]
        valid_context_mask = input_tensors[reader.VALID_CONTEXT_MASK_KEY]
        path_source_lengths = input_tensors[reader.PATH_SOURCE_LENGTHS_KEY]
        path_lengths = input_tensors[reader.PATH_LENGTHS_KEY]
        path_target_lengths = input_tensors[reader.PATH_TARGET_LENGTHS_KEY]

        with tf.variable_scope('model'):
            subtoken_vocab = tf.get_variable(
                'SUBTOKENS_VOCAB',
                shape=(self.subtoken_vocab_size, self.config.EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, mode='FAN_OUT', uniform=True))
            target_words_vocab = tf.get_variable(
                'TARGET_WORDS_VOCAB',
                shape=(self.target_vocab_size, self.config.EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, mode='FAN_OUT', uniform=True))
            nodes_vocab = tf.get_variable(
                'NODES_VOCAB',
                shape=(self.nodes_vocab_size, self.config.EMBEDDINGS_SIZE),
                dtype=tf.float32,
                initializer=tf.contrib.layers.variance_scaling_initializer(
                    factor=1.0, mode='FAN_OUT', uniform=True))
            # (batch, max_contexts, decoder_size)
            source_word_embed, path_embed, target_word_embed = self.compute_contexts(
                subtoken_vocab=subtoken_vocab,
                nodes_vocab=nodes_vocab,
                source_input=path_source_indices,
                nodes_input=node_indices,
                target_input=path_target_indices,
                valid_mask=valid_context_mask,
                path_source_lengths=path_source_lengths,
                path_lengths=path_lengths,
                path_target_lengths=path_target_lengths)

            self.subtoken_vocab, self.target_words_vocab, self.nodes_vocab = subtoken_vocab, target_words_vocab, nodes_vocab

            batch_size = tf.shape(target_index)[0]

        # At this point the graph has produced the processed data.
        # Every context has the form [source, path, target]:
        #   path_source_indices is the index matrix for all sources,
        #   node_indices is the index matrix for all paths,
        #   path_target_indices is the index matrix for all targets,
        # and source_word_embed / path_embed / target_word_embed are the
        # corresponding embeddings.

        # Now we transfer all of these tensors to NumPy arrays.

        # Open a temporary session just to materialize the lookups as NumPy
        # arrays; variables and lookup tables must be initialized first.
        sess = tf.InteractiveSession()
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer(),
                 tf.tables_initializer()).run()
        self.queue_thread.reset(sess)
        path_source_indices_matrix, node_indices_matrix, path_target_indices_matrix, \
            source_word_embed_matrix, path_embed_matrix, target_word_embed_matrix, \
            np_target_string, self.source_string, self.path_string, self.path_target_string = sess.run(
                [path_source_indices, node_indices, path_target_indices,
                 source_word_embed, path_embed, target_word_embed, target_string,
                 source_string, path_string, path_target_string])
        sess.close()

        self.path_source_indices_matrix, self.node_indices_matrix, self.path_target_indices_matrix \
            = np.array(path_source_indices_matrix), np.array(node_indices_matrix), np.array(path_target_indices_matrix)
        self.source_word_embed_matrix, self.path_embed_matrix, self.target_word_embed_matrix  = \
            np.array(source_word_embed_matrix), np.array(path_embed_matrix), np.array(target_word_embed_matrix)

        print(
            "=============================================================\n")

        print('Path source indices: ', self.path_source_indices_matrix.shape)
        print("------------------------------\n")

        print('Node indices: ', self.node_indices_matrix.shape)
        print("------------------------------\n")

        print('Path target indices: ', self.path_target_indices_matrix.shape)
        print("------------------------------\n")

        print('source_word_embed: ', self.source_word_embed_matrix.shape)
        print("------------------------------\n")

        print('path_embed: ', self.path_embed_matrix.shape)
        print("------------------------------\n")

        print('target_word_embed: ', self.target_word_embed_matrix.shape)

        print(
            "=============================================================\n")
        self.target_string = Common.binary_to_string_list(np_target_string)
        print('np_target_string: ', self.target_string)
        print('np_target_string shape: ', np_target_string.shape)


        self.group_for_each_data = np.full(len(self.source_word_embed_matrix), 0)

        # Generate K random initial centers for the clustering.
        self.index_of_initial_center = self.generate_center(
            0, len(self.source_word_embed_matrix) - 1)
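
generate_center is called above but not defined in this excerpt; from the call site it apparently selects the indices of the initial cluster centers from the range [0, n-1]. A hypothetical sketch (the parameter k and the use of np.random.choice are assumptions, not the author's code):

import numpy as np

def generate_center(low, high, k=10):
    # Sample k distinct example indices in [low, high] to seed the clustering;
    # k=10 is an arbitrary placeholder for the configured number of centers.
    return np.random.choice(np.arange(low, high + 1), size=k, replace=False)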