Example #1
def gigaword_generator(dataset_name, dataset_split):
    article_path = os.path.join(kaiqiang_data_dir, dataset_name, dataset_split + '.Ndocument')
    abstract_path = os.path.join(kaiqiang_data_dir, dataset_name, dataset_split + '.Nsummary')
    with open(article_path) as f:
        giga_article_lines = [line.strip() for line in f]
    with open(abstract_path) as f:
        giga_abstract_lines = [line.strip() for line in f]

    if len(giga_article_lines) != len(giga_abstract_lines):
        util.print_vars(giga_article_lines, giga_abstract_lines)
        raise Exception('len(article_lines) != len(abstract_lines)')
    for article_idx in range(len(giga_abstract_lines)):
        article_line = giga_article_lines[article_idx]
        abstract_line = giga_abstract_lines[article_idx]

        article = ''
        doc_indices = ''
        raw_article_sents = []

        orig_sent = article_line
        tokenized_sent = util.process_sent(orig_sent)
        # if is_quote(tokenized_sent):
        #     continue
        sent = ' '.join(tokenized_sent)
        article += sent + ' '

        doc_indices_for_tokens = [0] * len(tokenized_sent)
        doc_indices_str = ' '.join(str(x) for x in doc_indices_for_tokens)
        doc_indices += doc_indices_str + ' '
        raw_article_sents.append(orig_sent)

        article = article.strip()

        abstracts_unprocessed = [[abstract_line]]
        abstracts = []
        for abstract_lines in abstracts_unprocessed:
            abstract = process_abstract(abstract_lines)
            abstracts.append(abstract)
        # yield article, abstracts, doc_indices, raw_article_sents
        example = make_example(article, abstracts, doc_indices, raw_article_sents, None)
        yield example
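A hedged usage sketch (not part of the original source): assuming make_example returns a tf.train.Example proto, the generated examples could be written to a length-prefixed binary file in the style commonly used by pointer-generator data pipelines. The helper name and output path below are hypothetical.

import struct

def write_split(dataset_name, dataset_split, out_path):
    # Hypothetical helper: serialize each generated tf.train.Example with a
    # length prefix so downstream batchers can stream the file.
    with open(out_path, 'wb') as writer:
        for tf_example in gigaword_generator(dataset_name, dataset_split):
            example_str = tf_example.SerializeToString()
            writer.write(struct.pack('q', len(example_str)))
            writer.write(struct.pack('%ds' % len(example_str), example_str))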
Example #2
    def __init__(self, is_testing):
        super().__init__()
        self.is_testing = is_testing
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            regularizer = layers.l2_regularizer(1e-4)
            self.name = "%s %s" % (self.revision, self.message)
            self.train, self.valid, self.test = self.encode_data(sudoku())

            print("Building graph...")
            self.session = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True))
            self.global_step = tf.Variable(initial_value=0, trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)
            self.mode = tf.placeholder(tf.string)

            edges = self.sudoku_edges()
            edges = [(i + (b * 81), j + (b * 81))
                     for b in range(self.batch_size) for i, j in edges]
            ridx = [edges.index((j, i)) for i, j in edges]
            edge_indices = tf.constant(edges, tf.int32)
            n_edges = tf.shape(edge_indices)[0]

            positions = tf.constant([[(i, j) for i in range(9)
                                      for j in range(9)]
                                     for b in range(self.batch_size)],
                                    tf.int32)  # (bs, 81, 2)
            rows = layers.embed_sequence(positions[:, :, 0],
                                         9,
                                         self.emb_size,
                                         scope='row-embeddings',
                                         unique=True)  # bs, 81, emb_size
            cols = layers.embed_sequence(positions[:, :, 1],
                                         9,
                                         self.emb_size,
                                         scope='cols-embeddings',
                                         unique=True)  # bs, 81, emb_size

            def avg_n(x):
                return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

            towers = []
            with tf.variable_scope(tf.get_variable_scope()):
                for device_nr, device in enumerate(self.devices):
                    with tf.device('/cpu:0'):

                        if self.is_testing:
                            (quizzes, answers
                             ), edge_keep_prob = self.test.get_next(), 1.0
                        else:
                            (quizzes, answers), edge_keep_prob = tf.cond(
                                tf.equal(self.mode, "train"),
                                true_fn=lambda:
                                (self.train.get_next(), self.edge_keep_prob),
                                false_fn=lambda: (self.valid.get_next(), 1.0))

                        x = layers.embed_sequence(
                            quizzes,
                            10,
                            self.emb_size,
                            scope='nr-embeddings',
                            unique=True)  # bs, 81, emb_size
                        x = tf.concat([x, rows, cols], axis=2)
                        x = tf.reshape(x, (-1, 3 * self.emb_size))

                    with tf.device(device), tf.name_scope("device-%s" %
                                                          device_nr):

                        def mlp(x, scope, n_out):
                            with tf.variable_scope(scope):
                                for i in range(3):
                                    x = layers.fully_connected(
                                        x,
                                        n_out,
                                        weights_regularizer=regularizer)
                                return layers.fully_connected(
                                    x,
                                    n_out,
                                    weights_regularizer=regularizer,
                                    activation_fn=None)

                        x = mlp(x, 'C1', self.n_hidden)
                        dependents = tf.zeros((n_edges, 10))
                        outputs = []
                        log_losses = []
                        with tf.variable_scope('steps'):
                            for step in range(self.n_steps):
                                # M_F = c2(c1(x, p), c1(x, N_F\p), d_pF)
                                # d_pF = sum_{q \in N_F\p} (M_F)
                                # p(y_p|x) = softmax(sum(M_F))

                                logits, messages = message_passing(
                                    x, edge_indices, dependents,
                                    lambda x: mlp(x, 'C2', 10))
                                dependents = tf.gather(
                                    logits, edge_indices[:, 0]) - tf.gather(
                                        messages, ridx)
                                out = tf.reshape(logits, (-1, 81, 10))
                                outputs.append(out)
                                log_losses.append(
                                    tf.reduce_mean(
                                        tf.nn.
                                        sparse_softmax_cross_entropy_with_logits(
                                            labels=answers, logits=out)))
                                tf.get_variable_scope().reuse_variables()

                        reg_loss = sum(
                            tf.get_collection(
                                tf.GraphKeys.REGULARIZATION_LOSSES))
                        loss = log_losses[-1] + reg_loss

                        towers.append({
                            'loss':
                            loss,
                            'grads':
                            [(tf.clip_by_value(g, -10.0, 10.0), v)
                             for g, v in self.optimizer.compute_gradients(loss)
                             ],
                            'log_losses':
                            tf.stack(log_losses),  # (n_steps, 1)
                            'quizzes':
                            quizzes,  # (bs, 81)
                            'answers':
                            answers,  # (bs, 81)
                            'outputs':
                            tf.stack(outputs)  # n_steps, bs, 81, 10
                        })

                        tf.get_variable_scope().reuse_variables()

            self.loss = avg_n([t['loss'] for t in towers])
            self.out = tf.concat([t['outputs'] for t in towers],
                                 axis=1)  # n_steps, bs, 81, 10
            self.predicted = tf.cast(tf.argmax(self.out, axis=3), tf.int32)
            self.answers = tf.concat([t['answers'] for t in towers], axis=0)
            self.quizzes = tf.concat([t['quizzes'] for t in towers], axis=0)

            tf.summary.scalar('losses/total', self.loss)
            tf.summary.scalar('losses/reg', reg_loss)
            log_losses = avg_n([t['log_losses'] for t in towers])

            for step in range(self.n_steps):
                equal = tf.equal(self.answers, self.predicted[step])

                digit_acc = tf.reduce_mean(tf.to_float(equal))
                tf.summary.scalar('steps/%d/digit-acc' % step, digit_acc)

                puzzle_acc = tf.reduce_mean(
                    tf.to_float(tf.reduce_all(equal, axis=1)))
                tf.summary.scalar('steps/%d/puzzle-acc' % step, puzzle_acc)

                tf.summary.scalar('steps/%d/losses/log' % step,
                                  log_losses[step])

            avg_gradients = util.average_gradients(
                [t['grads'] for t in towers])
            self.train_step = self.optimizer.apply_gradients(
                avg_gradients, global_step=self.global_step)

            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            util.print_vars(tf.trainable_variables())

            self.train_writer = tf.summary.FileWriter(
                self.tensorboard_dir + '/sudoku/%s/train/%s' %
                (self.revision, self.name), self.session.graph)
            self.test_writer = tf.summary.FileWriter(
                self.tensorboard_dir + '/sudoku/%s/test/%s' %
                (self.revision, self.name), self.session.graph)
            self.summaries = tf.summary.merge_all()
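The helper self.sudoku_edges() is not shown in this excerpt. A minimal sketch of what it could look like, assuming it returns a plain list of directed (i, j) cell-index pairs connecting every cell to the other cells in its row, column, and 3x3 box (a list is needed because the constructor later calls edges.index((j, i))); this is a hypothetical reconstruction, not necessarily the author's implementation.

import numpy as np

def sudoku_edges(self):
    # One directed edge (i, j) for every ordered pair of distinct cells that
    # share a row, a column, or a 3x3 box.
    def ordered_pairs(cells):
        cells = cells.flatten()
        return [(int(i), int(j)) for i in cells for j in cells if i != j]

    idx = np.arange(81).reshape(9, 9)
    edges = set()
    for k in range(9):
        edges.update(ordered_pairs(idx[k, :]))  # rows
        edges.update(ordered_pairs(idx[:, k]))  # columns
    for r in range(3):
        for c in range(3):
            edges.update(ordered_pairs(idx[3 * r:3 * r + 3, 3 * c:3 * c + 3]))  # boxes
    return sorted(edges)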
"""Question: https://leetcode.com/problems/delete-node-in-a-linked-list/
"""

from datastruct import ListNode
from util import print_vars


class Solution:
    def deleteNode(self, node: ListNode) -> None:
        """
        :type node: ListNode
        :rtype: void Do not return anything, modify node in-place instead.
        """
        node.val = node.next.val
        node.next = node.next.next


if __name__ == '__main__':
    head = ListNode.from_list([4, 5, 1, 9])
    node = head.next
    print_vars(head, node)
    print('delete node: ', node)
    Solution().deleteNode(node)
    print_vars(head, node)
Example #4
def main(unused_argv):
    print('Running statistics on %s' % exp_name)

    if len(unused_argv
           ) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'

    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
        tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(
            get_single_sent_features(0, sent_term_matrix,
                                     [['single', '.'], ['sentence', '.']],
                                     [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(
            get_pair_sent_features([0, 1], sent_term_matrix,
                                   [['single', '.'], ['sentence', '.']],
                                   [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))

        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))
        total = len(source_files) * 1000 if (
            'cnn' in in_dataset or 'newsroom' in in_dataset
            or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(source_dir + '/' + split +
                                                   '*',
                                                   True,
                                                   False,
                                                   should_check_valid=False)
        # for example in tqdm(example_generator, total=total):
        ex_gen = example_generator_extended(example_generator, total,
                                            single_feat_len, pair_feat_len,
                                            FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        if FLAGS.num_instances != -1:
            ex_list = ex_list[:FLAGS.num_instances]
        print('Converting...')
        # all_features = pool.map(convert_article_to_lambdamart_features, ex_list)

        # all_features = ray.get([convert_article_to_lambdamart_features.remote(ex) for ex in ex_list])

        if FLAGS.lr:
            all_instances = list(
                futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = [inst.features for inst in all_instances]
            x = np.array(x)
            y = [inst.relevance for inst in all_instances]
            y = np.expand_dims(np.array(y), 1)
            x_y = np.concatenate((x, y), 1)
            np.save(writer, x_y)
        else:
            list(futures.map(convert_article_to_lambdamart_features, ex_list))
            # writer.write(''.join(all_features))

        # all_features = []
        # for example  in tqdm(ex_gen, total=total):
        #     all_features.append(convert_article_to_lambdamart_features(example))

        # all_features = util.flatten_list_of_lists(all_features)
        # num1 = sum(x == 1 for x in all_features)
        # num2 = sum(x == 2 for x in all_features)
        # print 'Single sent: %d instances. Pair sent: %d instances.' % (num1, num2)

        # for example in tqdm(ex_gen, total=total):
        #     features = convert_article_to_lambdamart_features(example)
        #     writer.write(features)

        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        # Open in text mode ('w'): the per-example files are read back as text below.
        with open(final_out_path, 'w') as writer:
            for file_name in tqdm(file_names):
                with open(file_name) as f:
                    text = f.read()
                writer.write(text)
    util.print_execution_time(start_time)
Example #5
    def __init__(self, is_testing):
        super().__init__()
        self.is_testing = is_testing

        print("Preparing data...")
        self.train, self.valid, self.test, self.vocab = self.encode_data(
            bAbI('en-valid-10k'))

        print("Creating graph...")
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            regularizer = layers.l2_regularizer(1e-4)

            self.session = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True))
            self.global_step = tf.Variable(initial_value=0, trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)

            self.facts_ph = tf.placeholder(tf.int32,
                                           shape=(None,
                                                  None))  # (bs*#facts, seq)
            self.facts_pos_ph = tf.placeholder(tf.int32,
                                               shape=(None, ))  # (bs*#facts, )
            self.question_ph = tf.placeholder(tf.int32,
                                              shape=(None, None))  # (bs, seq)
            self.answers_ph = tf.placeholder(tf.int32,
                                             shape=(None, ))  # (bs, )
            self.edge_indices_ph = tf.placeholder(tf.int32, shape=(None, 2))
            self.fact_segments_ph = tf.placeholder(tf.int32, shape=(None, ))
            self.edge_segments_ph = tf.placeholder(tf.int32, shape=(None, ))
            self.q_seq_length_ph = tf.placeholder(tf.int32, shape=(None, ))
            self.f_seq_length_ph = tf.placeholder(tf.int32, shape=(None, ))
            self.task_indices_ph = tf.placeholder(tf.int32, shape=(None, ))
            self.edge_keep_prob_ph = tf.placeholder(tf.float32, shape=())
            self.is_training_ph = tf.placeholder(tf.bool)

            placeholders = [
                self.facts_ph, self.facts_pos_ph, self.question_ph,
                self.answers_ph, self.edge_indices_ph, self.fact_segments_ph,
                self.edge_segments_ph, self.q_seq_length_ph,
                self.f_seq_length_ph, self.task_indices_ph,
                self.edge_keep_prob_ph
            ]

            self.train_queue = tf.FIFOQueue(self.qsize,
                                            [ph.dtype for ph in placeholders],
                                            name='train-queue')
            self.val_queue = tf.FIFOQueue(self.qsize,
                                          [ph.dtype for ph in placeholders],
                                          name='val-queue')

            self.train_enqueue_op = self.train_queue.enqueue(placeholders)
            self.train_qsize_op = self.train_queue.size()
            tf.summary.scalar('queues/train', self.train_qsize_op)

            self.val_enqueue_op = self.val_queue.enqueue(placeholders)
            self.val_qsize_op = self.val_queue.size()
            tf.summary.scalar('queues/val', self.val_qsize_op)

            def avg_n(x):
                return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

            towers = []
            with tf.variable_scope(tf.get_variable_scope()):
                for device_nr, device in enumerate(self.devices):
                    with tf.device('/cpu:0'):
                        if self.is_testing:
                            facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph, edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob = placeholders
                        else:
                            facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph, edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob = tf.cond(
                                self.is_training_ph,
                                true_fn=lambda: self.train_queue.dequeue(),
                                false_fn=lambda: self.val_queue.dequeue(),
                            )

                            vars = (facts_ph, facts_pos_ph, question_ph,
                                    answers_ph, edge_indices_ph,
                                    fact_segments_ph, edge_segments_ph,
                                    q_seq_length_ph, f_seq_length_ph,
                                    task_indices_ph, edge_keep_prob)
                            for v, ph in zip(vars, placeholders):
                                v.set_shape(ph.get_shape())

                        facts_emb = layers.embed_sequence(
                            facts_ph,
                            self.vocab.size(),
                            self.emb_size,
                            scope='word-embeddings')
                        questions_emb = layers.embed_sequence(
                            question_ph,
                            self.vocab.size(),
                            self.emb_size,
                            scope='word-embeddings',
                            reuse=True)

                    with tf.device(device), tf.name_scope("device-%s" %
                                                          device_nr):

                        def mlp(x, scope, n_hidden):
                            with tf.variable_scope(scope):
                                for i in range(3):
                                    x = layers.fully_connected(
                                        x,
                                        n_hidden,
                                        weights_regularizer=regularizer)
                                return layers.fully_connected(
                                    x,
                                    n_hidden,
                                    weights_regularizer=regularizer,
                                    activation_fn=None)

                        _, (_, f_encoding) = tf.nn.dynamic_rnn(
                            tf.nn.rnn_cell.LSTMCell(32),
                            facts_emb,
                            dtype=tf.float32,
                            sequence_length=f_seq_length_ph,
                            scope='fact-encoder')

                        random_pos_offsets = tf.random_uniform(
                            tf.shape(answers_ph),
                            minval=0,
                            maxval=self.num_facts,
                            dtype=tf.int32)
                        fact_pos = facts_pos_ph + tf.gather(
                            random_pos_offsets, fact_segments_ph)
                        facts_pos_encoding = tf.one_hot(
                            fact_pos, 2 * self.num_facts)
                        f_encoding = tf.concat(
                            [f_encoding, facts_pos_encoding], axis=1)

                        _, (_, q_encoding) = tf.nn.dynamic_rnn(
                            tf.nn.rnn_cell.LSTMCell(32),
                            questions_emb,
                            dtype=tf.float32,
                            sequence_length=q_seq_length_ph,
                            scope='question-encoder')

                        def graph_fn(x):
                            with tf.variable_scope('graph-fn'):
                                x = layers.fully_connected(
                                    x,
                                    self.n_hidden,
                                    weights_regularizer=regularizer)
                                x = layers.fully_connected(
                                    x,
                                    self.n_hidden,
                                    weights_regularizer=regularizer)
                                return layers.fully_connected(
                                    x,
                                    self.vocab.size(),
                                    activation_fn=None,
                                    weights_regularizer=regularizer)

                        x = tf.concat([
                            f_encoding,
                            tf.gather(q_encoding, fact_segments_ph)
                        ], 1)
                        x0 = mlp(x, 'pre', self.n_hidden)
                        edge_features = tf.gather(q_encoding, edge_segments_ph)
                        x = x0
                        outputs = []
                        log_losses = []
                        with tf.variable_scope('steps'):
                            lstm_cell = LSTMCell(self.n_hidden)
                            state = lstm_cell.zero_state(
                                tf.shape(x)[0], tf.float32)

                            for step in range(self.n_steps):
                                x = message_passing(
                                    x, edge_indices_ph, edge_features,
                                    lambda x: mlp(x, 'message-fn', self.
                                                  n_hidden), edge_keep_prob)
                                x = mlp(tf.concat([x, x0], axis=1), 'post-fn',
                                        self.n_hidden)
                                x, state = lstm_cell(x, state)
                                with tf.variable_scope('graph-sum'):
                                    graph_sum = tf.segment_sum(
                                        x, fact_segments_ph)
                                    out = graph_fn(graph_sum)
                                    outputs.append(out)
                                    log_losses.append(
                                        tf.reduce_mean(
                                            tf.nn.
                                            sparse_softmax_cross_entropy_with_logits(
                                                labels=answers_ph,
                                                logits=out)))

                                tf.get_variable_scope().reuse_variables()

                        reg_loss = sum(
                            tf.get_collection(
                                tf.GraphKeys.REGULARIZATION_LOSSES))
                        loss = avg_n(log_losses) + reg_loss

                        towers.append({
                            'loss':
                            loss,
                            'grads':
                            self.optimizer.compute_gradients(loss),
                            'log_losses':
                            tf.stack(log_losses),  # (n_steps, 1)
                            'answers':
                            answers_ph,  # (batch_size, )
                            'outputs':
                            tf.stack(
                                outputs),  # (n_steps, batch_size, vocab_size)
                            'task_indices':
                            task_indices_ph  # (batch_size, )
                        })

                        tf.get_variable_scope().reuse_variables()

            self.loss = avg_n([t['loss'] for t in towers])
            self.out = tf.concat([t['outputs'] for t in towers], axis=1)
            self.answers = tf.concat([t['answers'] for t in towers], axis=0)
            self.task_indices = tf.concat([t['task_indices'] for t in towers],
                                          axis=0)

            tf.summary.scalar('losses/total', self.loss)
            tf.summary.scalar('losses/reg', reg_loss)
            log_losses = avg_n([t['log_losses'] for t in towers])
            for i in range(self.n_steps):
                tf.summary.scalar('steps/%d/losses/log' % i, log_losses[i])

            avg_gradients = util.average_gradients(
                [t['grads'] for t in towers])
            self.train_step = self.optimizer.apply_gradients(
                avg_gradients, global_step=self.global_step)

            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            util.print_vars(tf.trainable_variables())

            self.train_writer = tf.summary.FileWriter(
                '/tmp/tensorboard/bAbI/%s/train/%s' %
                (self.revision, self.name), self.session.graph)
            self.test_writer = tf.summary.FileWriter(
                '/tmp/tensorboard/bAbI/%s/test/%s' %
                (self.revision, self.name), self.session.graph)

            self.summaries = tf.summary.merge_all()

        print("Starting data loaders...")
        train_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
        val_mp_queue = mp.Manager().Queue(maxsize=self.qsize)

        data_loader_processes = [
            mp.Process(target=self.data_loader, args=(train_mp_queue, True))
            for i in range(4)
        ]
        val_data_loader_processes = [
            mp.Process(target=self.data_loader, args=(val_mp_queue, False))
            for i in range(1)
        ]

        for p in data_loader_processes + val_data_loader_processes:
            p.daemon = True
            p.start()

        queue_putter_threads = [
            threading.Thread(target=self.queue_putter,
                             args=(train_mp_queue, self.train_enqueue_op,
                                   'train', 1000)),
            threading.Thread(target=self.queue_putter,
                             args=(val_mp_queue, self.val_enqueue_op, 'val',
                                   1)),
        ]
        for t in queue_putter_threads:
            t.daemon = True
            t.start()

        train_qsize, val_qsize = 0, 0
        print("Waiting for queue to fill...")
        while train_qsize < self.qsize or val_qsize < self.qsize:
            train_qsize = self.session.run(self.train_qsize_op)
            val_qsize = self.session.run(self.val_qsize_op)
            print('train_qsize: %d, val_qsize: %d' % (train_qsize, val_qsize),
                  flush=True)
            time.sleep(1)
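message_passing is referenced above but not defined in this excerpt. A minimal sketch of a helper matching the bAbI call (node states, (i, j) edge index pairs, per-edge features, a message MLP, and an edge keep probability) that returns updated node states; this is an assumption-based reconstruction rather than the original implementation, and the earlier Sudoku example uses a variant with a different signature (it returns both logits and messages).

import tensorflow as tf

def message_passing(nodes, edges, edge_features, message_fn, keep_prob=1.0):
    # For each directed edge (i, j): build a message from the states of both
    # endpoints plus the edge features, optionally drop whole edges, then sum
    # the incoming messages at each receiving node j.
    n_nodes = tf.shape(nodes)[0]
    n_features = nodes.get_shape()[1].value
    n_edges = tf.shape(edges)[0]

    endpoint_states = tf.gather(nodes, edges)  # (n_edges, 2, n_features)
    message_inputs = tf.reshape(endpoint_states, (-1, 2 * n_features))
    message_inputs = tf.concat([message_inputs, edge_features], axis=1)
    messages = message_fn(message_inputs)  # (n_edges, n_out)
    messages = tf.nn.dropout(messages, keep_prob, noise_shape=tf.stack([n_edges, 1]))
    return tf.unsorted_segment_sum(messages, edges[:, 1], n_nodes)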
Example #6
    def __init__(self, is_testing):
        super().__init__()
        self.is_testing = is_testing

        print("Preparing data...")
        # Load and encode data (Disk -> Memory), see more details in encode_data()
        # Also see data_loader(), the next processing stage.
        self.train, self.valid, self.test, self.vocab = self.encode_data(bAbI('en-valid-10k'))

        print("Creating graph...")
        with tf.Graph().as_default(), tf.device('/cpu:0'):
            regularizer = layers.l2_regularizer(1e-4)  # regularizer applied to fully-connected network

            # allow_soft_placement=True: if cannot find specific device, allow tf to choose the device
            self.session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            self.global_step = tf.Variable(initial_value=0, trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=2e-4)

            self.facts_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs*#facts, seq)
            self.facts_pos_ph = tf.placeholder(tf.int32, shape=(None,))  # (bs*#facts, )
            self.question_ph = tf.placeholder(tf.int32, shape=(None, None))  # (bs, seq)
            self.answers_ph = tf.placeholder(tf.int32, shape=(None,))  # (bs, )
            self.edge_indices_ph = tf.placeholder(tf.int32, shape=(None, 2))
            self.fact_segments_ph = tf.placeholder(tf.int32, shape=(None,))
            self.edge_segments_ph = tf.placeholder(tf.int32, shape=(None,))
            self.q_seq_length_ph = tf.placeholder(tf.int32, shape=(None,))
            self.f_seq_length_ph = tf.placeholder(tf.int32, shape=(None,))
            self.task_indices_ph = tf.placeholder(tf.int32, shape=(None,))
            self.edge_keep_prob_ph = tf.placeholder(tf.float32, shape=())
            self.is_training_ph = tf.placeholder(tf.bool)

            # device: CPU:0
            placeholders = [self.facts_ph, self.facts_pos_ph, self.question_ph, self.answers_ph, self.edge_indices_ph,
                            self.fact_segments_ph, self.edge_segments_ph, self.q_seq_length_ph, self.f_seq_length_ph,
                            self.task_indices_ph, self.edge_keep_prob_ph]

            # each element of train_queue is a training batch
            self.train_queue = tf.FIFOQueue(self.qsize, [ph.dtype for ph in placeholders], name='train-queue')
            # each element of val_queue is a validation batch
            self.val_queue = tf.FIFOQueue(self.qsize, [ph.dtype for ph in placeholders], name='val-queue')

            self.train_enqueue_op = self.train_queue.enqueue(placeholders)
            self.train_qsize_op = self.train_queue.size()
            # record the size of the train_queue every batch
            tf.summary.scalar('queues/train', self.train_qsize_op)

            self.val_enqueue_op = self.val_queue.enqueue(placeholders)
            self.val_qsize_op = self.val_queue.size()
            # record the size of the val_queue every batch
            tf.summary.scalar('queues/val', self.val_qsize_op)

            def avg_n(x):
                return tf.reduce_mean(tf.stack(x, axis=0), axis=0)

            towers = []
            with tf.variable_scope(tf.get_variable_scope()):
                for device_nr, device in enumerate(self.devices):
                    with tf.device('/cpu:0'):
                        if self.is_testing:
                            facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph, edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob = placeholders
                        else:
                            facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph, edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob = tf.cond(
                                self.is_training_ph,
                                true_fn=lambda: self.train_queue.dequeue(),
                                false_fn=lambda: self.val_queue.dequeue(),
                            )
                            # device: CPU:0, CPU:0, CPU:0 (In a 3 GPU machine, these placeholders are in triplicate.)
                            vars = (facts_ph, facts_pos_ph, question_ph, answers_ph, edge_indices_ph, fact_segments_ph,
                                    edge_segments_ph, q_seq_length_ph, f_seq_length_ph, task_indices_ph, edge_keep_prob)

                            for v, ph in zip(vars, placeholders):
                                v.set_shape(ph.get_shape())
                        # device: CPU:0, CPU:0, CPU:0
                        facts_emb = layers.embed_sequence(facts_ph, self.vocab.size(), self.emb_size,
                                                          scope='word-embeddings')
                        # device: CPU:0, CPU:0, CPU:0
                        questions_emb = layers.embed_sequence(question_ph, self.vocab.size(), self.emb_size,
                                                              scope='word-embeddings', reuse=True)

                    with tf.device(device), tf.name_scope("device-%s" % device_nr):
                        # A 4-layer fully-connected MLP (the last layer has no activation)
                        def mlp(x, scope, n_hidden):
                            with tf.variable_scope(scope):
                                for i in range(3):
                                    x = layers.fully_connected(x, n_hidden, weights_regularizer=regularizer)
                                return layers.fully_connected(x, n_hidden, weights_regularizer=regularizer,
                                                              activation_fn=None)

                        # get the final hidden state for the sentences(facts), f_encoding shape: (bs*#facts, state_size)
                        _, (_, f_encoding) = tf.nn.dynamic_rnn(tf.nn.rnn_cell.LSTMCell(32), facts_emb, dtype=tf.float32,
                                                               sequence_length=f_seq_length_ph, scope='fact-encoder')

                        # shape: (bs, ), the same as answers_ph; elements are drawn
                        # uniformly at random from [0, num_facts)
                        random_pos_offsets = tf.random_uniform(tf.shape(answers_ph), minval=0, maxval=self.num_facts,
                                                               dtype=tf.int32)
                        # Add the random offset to each fact position; all facts within the same task (graph) share one offset.
                        fact_pos = facts_pos_ph + tf.gather(random_pos_offsets, fact_segments_ph)
                        # Considering the offset, the depth for the positional one-hot encoding should be 2*num_facts
                        facts_pos_encoding = tf.one_hot(fact_pos, 2 * self.num_facts)

                        # concatenate the encoding of content and position; device: GPU:0, GPU:1, GPU:2
                        f_encoding = tf.concat([f_encoding, facts_pos_encoding], axis=1)

                        # Questions need no positional encoding; only their content is encoded.
                        # q_encoding shape: (bs, state_size); device: GPU:0, GPU:1, GPU:2
                        _, (_, q_encoding) = tf.nn.dynamic_rnn(tf.nn.rnn_cell.LSTMCell(32), questions_emb,
                                                               dtype=tf.float32, sequence_length=q_seq_length_ph,
                                                               scope='question-encoder')

                        # A 3-layer fully-connected MLP used to process the graph readout.
                        # The last layer has vocab.size() outputs, producing the logits.
                        def graph_fn(x):
                            with tf.variable_scope('graph-fn'):
                                x = layers.fully_connected(x, self.n_hidden, weights_regularizer=regularizer)
                                x = layers.fully_connected(x, self.n_hidden, weights_regularizer=regularizer)
                                return layers.fully_connected(x, self.vocab.size(), activation_fn=None,
                                                              weights_regularizer=regularizer)

                        # concatenate the fact_encoding and the question_encoding
                        x = tf.concat([f_encoding, tf.gather(q_encoding, fact_segments_ph)], 1)

                        # x0 represents the "fact embedding given the question"
                        # (obtained by concatenating the question encoding with the fact encoding)
                        # device: GPU:0, GPU:1, GPU:2
                        x0 = mlp(x, 'pre', self.n_hidden)

                        # generate the question encoding for every edge
                        # edge_features shape: (bs*(#facts**2), LSTM state_size)
                        edge_features = tf.gather(q_encoding, edge_segments_ph)

                        x = x0
                        outputs = []
                        log_losses = []
                        with tf.variable_scope('steps'):
                            lstm_cell = LSTMCell(self.n_hidden)
                            state = lstm_cell.zero_state(tf.shape(x)[0], tf.float32)

                            for step in range(self.n_steps):
                                x = message_passing(x, edge_indices_ph, edge_features,
                                                    lambda x: mlp(x, 'message-fn', self.n_hidden), edge_keep_prob)

                                x = mlp(tf.concat([x, x0], axis=1), 'post-fn', self.n_hidden)
                                # x=hidden state, state=<cell state, hidden state>
                                # device: (GPU:0)*5, (GPU:1)*5, (GPU:2)*5 (5 is the number of time steps)
                                x, state = lstm_cell(x, state)
                                with tf.variable_scope('graph-sum'):
                                    # At every step, sum the node output vectors for each task (graph),
                                    # i.e. graph_sum shape: (bs, n_hidden)
                                    graph_sum = tf.segment_sum(x, fact_segments_ph)
                                    out = graph_fn(graph_sum)  # shape: (bs, vocab_size)
                                    outputs.append(out)
                                    # softmax cross-entropy loss, a scalar Tensor
                                    log_loss = tf.reduce_mean(
                                        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=answers_ph, logits=out))
                                    # log_losses is a list of scalar Tensors, one per time step
                                    log_losses.append(log_loss)

                                # reuse the LSTM variables across time steps
                                tf.get_variable_scope().reuse_variables()
                        # scalar Tensor: the sum of all regularization losses
                        reg_loss = sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
                        # avg_n(log_losses) is the mean loss over time steps, so "loss" is a scalar Tensor
                        # device: GPU:0, GPU:1, GPU:2
                        loss = avg_n(log_losses) + reg_loss

                        # device: GPU:0, GPU:1, GPU:2
                        stat = {
                            'loss': loss,  # scalar Tensor
                            'grads': self.optimizer.compute_gradients(loss),
                            'log_losses': tf.stack(log_losses),  # (n_steps, )
                            'answers': answers_ph,  # (batch_size, )
                            'outputs': tf.stack(outputs),  # (n_steps, batch_size, vocab_size)
                            'task_indices': task_indices_ph  # (batch_size, )
                        }
                        towers.append(stat)
                        print('line 159: ')
                        print('"' + tf.get_variable_scope().name + '"')
                        # reuse the variables of the embeddings, encoders, and some MLPs across devices
                        tf.get_variable_scope().reuse_variables()

            # device of the following 4 vars is CPU:0
            self.loss = avg_n([t['loss'] for t in towers])
            self.out = tf.concat([t['outputs'] for t in towers], axis=1)
            self.answers = tf.concat([t['answers'] for t in towers], axis=0)
            self.task_indices = tf.concat([t['task_indices'] for t in towers], axis=0)

            tf.summary.scalar('losses/total', self.loss)
            tf.summary.scalar('losses/reg', reg_loss)
            log_losses = avg_n([t['log_losses'] for t in towers])
            for i in range(self.n_steps):
                tf.summary.scalar('steps/%d/losses/log' % i, log_losses[i])

            avg_gradients = util.average_gradients([t['grads'] for t in towers])

            # global_step increases by 1 after the gradient is updated
            self.train_step = self.optimizer.apply_gradients(avg_gradients, global_step=self.global_step)

            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            util.print_vars(tf.trainable_variables())

            self.train_writer = tf.summary.FileWriter('/tmp/tensorboard/bAbI/%s/train/%s' % (self.revision, self.name),
                                                      self.session.graph)
            self.test_writer = tf.summary.FileWriter('/tmp/tensorboard/bAbI/%s/test/%s' % (self.revision, self.name),
                                                     self.session.graph)

            self.summaries = tf.summary.merge_all()

        print("Starting data loaders...")
        train_mp_queue = mp.Manager().Queue(maxsize=self.qsize)
        val_mp_queue = mp.Manager().Queue(maxsize=self.qsize)

        # After the data has been loaded from disk (in `self.encode_data(bAbI('en-valid-10k'))`),
        # use 4+1=5 processes to construct and encode batches, then put them on the corresponding queue.
        # See random_batch() and encode_batch() for more details.
        data_loader_processes = [mp.Process(target=self.data_loader, args=(train_mp_queue, True)) for i in range(4)]
        val_data_loader_processes = [mp.Process(target=self.data_loader, args=(val_mp_queue, False)) for i in range(1)]

        # start the processes
        for p in data_loader_processes + val_data_loader_processes:
            p.daemon = True
            p.start()

        # Use 2 threads to transfer data from train_mp_queue (val_mp_queue) to train_queue (val_queue).
        # Each batch in train_mp_queue is a numpy ndarray; these two threads convert every batch
        # into Tensors and enqueue it onto train_queue (val_queue).
        # See the placeholders defined above for the format of each batch.
        queue_putter_threads = [
            threading.Thread(target=self.queue_putter, args=(train_mp_queue, self.train_enqueue_op, 'train', 1000)),
            threading.Thread(target=self.queue_putter, args=(val_mp_queue, self.val_enqueue_op, 'val', 1)),
        ]
        # start data transferring
        for t in queue_putter_threads:
            t.daemon = True
            t.start()

        train_qsize, val_qsize = 0, 0
        print("Waiting for queue to fill...")
        while train_qsize < self.qsize or val_qsize < self.qsize:
            # poll the current sizes of the training and validation queues
            train_qsize = self.session.run(self.train_qsize_op)
            val_qsize = self.session.run(self.val_qsize_op)
            print('train_qsize: %d, val_qsize: %d' % (train_qsize, val_qsize), flush=True)
            time.sleep(1)
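The queue_putter method used as the thread target above is not part of this excerpt. A hedged sketch of what it could look like; it assumes the placeholder list is also stored on the instance (e.g. as self.placeholders), which the constructor above does not show.

def queue_putter(self, mp_queue, enqueue_op, name, print_every):
    # Move batches from a multiprocessing queue into the corresponding TF
    # FIFOQueue by feeding them through the enqueue op's placeholders.
    i = 0
    while True:
        batch = mp_queue.get()
        self.session.run(enqueue_op,
                         feed_dict=dict(zip(self.placeholders, batch)))
        i += 1
        if i % print_every == 0:
            print('%s batches enqueued: %d' % (name, i), flush=True)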
"""Question: https://leetcode.com/problems/middle-of-the-linked-list/
"""

from datastruct import ListNode
from util import print_vars


class Solution:
    def middleNode(self, head: ListNode) -> ListNode:
        slow = fast = head
        while fast and fast.next:
            slow = slow.next
            fast = fast.next.next
        return slow


if __name__ == '__main__':
    head = ListNode.from_list([1, 2, 3, 4, 5])
    output = Solution().middleNode(head)
    print_vars(head, output)

    head = ListNode.from_list([1, 2, 3, 4, 5, 6])
    output = Solution().middleNode(head)
    print_vars(head, output)
Example #8
    def my_pow_binary_bit(self, x: float, n: int) -> float:
        if x == 0.0:
            return 0.0
        if n < 0:
            x, n = 1 / x, -n
        res = 1
        while n:
            if n & 1:
                res *= x
            x *= x
            n >>= 1
        return res

    def myPow(self, x: float, n: int) -> float:
        return x**n


if __name__ == '__main__':
    x, n = 2.0, 10
    output = Solution().myPow(x, n)
    print_vars(x, n, output)

    x, n = 2.0, 10
    output = Solution().my_pow_binary(x, n)
    print_vars(x, n, output)

    x, n = 2.0, 10
    output = Solution().my_pow_binary_bit(x, n)
    print_vars(x, n, output)
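The __main__ block above also calls my_pow_binary, which (like the class Solution header and the print_vars import) falls outside this excerpt. A hypothetical recursive exponentiation-by-squaring variant it could correspond to:

    def my_pow_binary(self, x: float, n: int) -> float:
        # Hypothetical reconstruction, not from the original excerpt.
        if n < 0:
            return 1 / self.my_pow_binary(x, -n)
        if n == 0:
            return 1.0
        half = self.my_pow_binary(x, n // 2)
        return half * half if n % 2 == 0 else half * half * x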
Example #9
    def __init__(self):
        self.train = train = TabSeparated('tasks/parsing/data/%s/train.tsv' % self.type, self.output_length)
        self.train_iterator = self.iterator(train)

        valid = TabSeparated('tasks/parsing/data/%s/valid.tsv' % self.type, self.output_length)
        self.valid_iterator = self.iterator(valid)

        parser = {'amounts': AmountParser(self.batch_size), 'dates': DateParser(self.batch_size)}[self.type]

        print("Building graph...")
        config = tf.ConfigProto(allow_soft_placement=False)
        self.session = tf.Session(config=config)
        self.is_training_ph = tf.placeholder(tf.bool)

        source, self.targets = tf.cond(
            self.is_training_ph,
            true_fn=lambda: self.train_iterator.get_next(),
            false_fn=lambda: self.valid_iterator.get_next()
        )
        self.sources = source

        oh_inputs = tf.one_hot(source, train.n_output)  # (bs, seq, n_out)

        context = tf.zeros(
            (self.batch_size, self.context_size),
            dtype=tf.float32,
            name=None
        )

        output_logits = parser.parse(oh_inputs, context, self.is_training_ph)

        with tf.variable_scope('loss'):
            mask = tf.logical_not(tf.equal(self.targets, train.pad_idx))
            label_cross_entropy = tf.reduce_mean(tf.losses.sparse_softmax_cross_entropy(self.targets, output_logits, reduction=Reduction.NONE) * tf.to_float(mask)) / tf.log(2.)

            chars = tf.argmax(output_logits, axis=2, output_type=tf.int32)
            equal = tf.equal(self.targets, chars)
            acc = tf.reduce_mean(tf.to_float(tf.reduce_all(tf.logical_or(equal, tf.logical_not(mask)), axis=1)))

        self.actual = chars
        self.loss = label_cross_entropy

        self.global_step = tf.Variable(initial_value=0, trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)

        self.train_step = self.optimizer.minimize(self.loss, global_step=self.global_step, colocate_gradients_with_ops=True)

        self.session.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        util.print_vars(tf.trainable_variables())

        if self.continue_from:
            print("Restoring " + self.continue_from + "...")
            self.saver.restore(self.session, self.continue_from)

        tf.summary.scalar('loss', self.loss)
        tf.summary.scalar('label cross entropy', label_cross_entropy)
        tf.summary.scalar('acc', acc)

        tensorboard_dir = os.environ.get('TENSORBOARD_DIR') or '/tmp/tensorboard'
        self.train_writer = tf.summary.FileWriter(tensorboard_dir + '/parse/%s/%s/train' % (self.type, self.experiment), self.session.graph)
        self.test_writer = tf.summary.FileWriter(tensorboard_dir + '/parse/%s/%s/test' % (self.type, self.experiment), self.session.graph)
        self.summaries = tf.summary.merge_all()
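A hedged usage sketch (not part of the original source) showing how the attributes defined above could drive training; the step count and function name are hypothetical.

def run_training_steps(model, n_steps):
    # Uses only attributes created in __init__ above: session, train_step,
    # loss, summaries, global_step, is_training_ph, train_writer.
    for _ in range(n_steps):
        _, loss, summaries, step = model.session.run(
            [model.train_step, model.loss, model.summaries, model.global_step],
            feed_dict={model.is_training_ph: True})
        model.train_writer.add_summary(summaries, step)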
Example #10
"""Question: https://leetcode.com/problems/implement-strstr/
"""

from util import print_vars


class Solution:
    def strStr(self, haystack: str, needle: str) -> int:
        if haystack == "" and needle == "":
            return 0
        for i in range(len(haystack) - len(needle) + 1):
            if haystack[i:i + len(needle)] == needle:
                return i
        return -1


if __name__ == "__main__":
    haystack = "hello"
    needle = "ll"
    output = Solution().strStr(haystack, needle)
    print_vars(haystack, needle, output)
    assert output == 2
Example #11
from typing import List

from util import print_vars


class Solution:
    def lemonadeChange(self, bills: List[int]) -> bool:
        five, ten = 0, 0
        for i in bills:
            if i == 5:
                five += 1
            elif i == 10:
                five -= 1
                ten += 1
            elif ten > 0:  # i = 20
                five -= 1
                ten -= 1
            else:  # i = 20 and no 10 in hand
                five -= 3

            if five < 0:
                return False
        return True


if __name__ == "__main__":
    bills = [5, 5, 5, 10, 20]
    output = Solution().lemonadeChange(bills)
    print_vars(bills, output)
"""Question: https://leetcode.com/problems/minimum-depth-of-binary-tree/
"""

from datastruct import TreeNode
from util import print_vars


class Solution:
    def minDepth(self, root: TreeNode) -> int:
        if root is None:
            return 0
        left = self.minDepth(root.left)
        right = self.minDepth(root.right)
        if left and right:
            return min(left, right) + 1
        else:
            return left + right + 1


if __name__ == "__main__":
    root = TreeNode.deserialize("[3,9,20,null,null,15,7]")
    output = Solution().minDepth(root)
    print_vars(root, output)
Example #13
            else:
                nums[i] = 0

    def moveZeroes(self, nums: List[int]) -> None:
        """
        Do not return anything, modify nums in-place instead.
        """
        for i in range(len(nums)):
            if nums[i] == 0:
                for j in range(i + 1, len(nums)):
                    if nums[j] != 0:
                        nums[i], nums[j] = nums[j], nums[i]
                        break


if __name__ == "__main__":
    nums = [0, 1, 0, 3, 12]
    print_vars(nums)
    Solution().moveZeroes(nums)
    print_vars(nums)

    nums = [0, 1, 0, 3, 12]
    print_vars(nums)
    Solution().move_zeros_save_non_zero(nums)
    print_vars(nums)

    nums = [0, 1, 0, 3, 12]
    print_vars(nums)
    Solution().move_zeros_two_pointer(nums)
    print_vars(nums)
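The __main__ block above also calls move_zeros_save_non_zero and move_zeros_two_pointer, which fall outside this excerpt. A hypothetical sketch of the two-pointer variant as a Solution method:

    def move_zeros_two_pointer(self, nums: List[int]) -> None:
        # Hypothetical reconstruction: keep a write pointer to the next slot for
        # a non-zero value; swap non-zeros forward so zeros drift to the end.
        last_non_zero = 0
        for i in range(len(nums)):
            if nums[i] != 0:
                nums[last_non_zero], nums[i] = nums[i], nums[last_non_zero]
                last_non_zero += 1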
Example #14
"""Question: https://leetcode.com/problems/intersection-of-two-arrays/
"""

from typing import List

from util import print_vars


class Solution:
    def intersection(self, nums1: List[int], nums2: List[int]) -> List[int]:
        return list(set(nums1) & set(nums2))


if __name__ == '__main__':
    nums1, nums2 = [1, 2, 2, 1], [2, 2]
    output = Solution().intersection(nums1, nums2)
    print_vars(nums1, nums2, output)

    nums1, nums2 = [4, 9, 5], [9, 4, 9, 8, 4]
    output = Solution().intersection(nums1, nums2)
    print_vars(nums1, nums2, output)
Example #15
"""Question: https://leetcode.com/problems/longest-common-prefix/
"""

from typing import List

from util import print_vars


class Solution:
    def longestCommonPrefix(self, strs: List[str]) -> str:
        if not strs:
            return ""
        prefix = strs[0]
        for s in strs:
            while prefix and prefix != s[:len(prefix)]:
                prefix = prefix[:-1]
            if not prefix:
                return ""
        return prefix


if __name__ == '__main__':
    strs = ["flower", "flow", "flight"]
    output = Solution().longestCommonPrefix(strs)
    print_vars(strs, output)
Example #16
    def decode_iteratively(self, example_generator, total, names_to_types,
                           ssi_list, hps):
        attn_vis_idx = 0
        for example_idx, example in enumerate(
                tqdm(example_generator, total=total)):
            raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, groundtruth_article_lcs_paths_list = util.unpack_tf_example(
                example, names_to_types)
            article_sent_tokens = [
                util.process_sent(sent) for sent in raw_article_sents
            ]
            groundtruth_summ_sents = [[
                sent.strip()
                for sent in groundtruth_summary_text.strip().split('\n')
            ]]
            groundtruth_summ_sent_tokens = [
                sent.split(' ') for sent in groundtruth_summ_sents[0]
            ]

            if ssi_list is None:  # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth)
                sys_ssi = groundtruth_similar_source_indices_list
                sys_alp_list = groundtruth_article_lcs_paths_list
                if FLAGS.singles_and_pairs == 'singles':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 1)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 1)
                elif FLAGS.singles_and_pairs == 'both':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 2)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 2)
                sys_ssi, sys_alp_list = util.replace_empty_ssis(
                    sys_ssi, raw_article_sents, sys_alp_list=sys_alp_list)
            else:
                gt_ssi, sys_ssi, ext_len, sys_token_probs_list = ssi_list[
                    example_idx]
                sys_alp_list = ssi_functions.list_labels_from_probs(
                    sys_token_probs_list, FLAGS.tag_threshold)
                if FLAGS.singles_and_pairs == 'singles':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 1)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 1)
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list, 1)
                    gt_ssi = util.enforce_sentence_limit(gt_ssi, 1)
                elif FLAGS.singles_and_pairs == 'both':
                    sys_ssi = util.enforce_sentence_limit(sys_ssi, 2)
                    sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 2)
                    groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                        groundtruth_similar_source_indices_list, 2)
                    gt_ssi = util.enforce_sentence_limit(gt_ssi, 2)
                # if gt_ssi != groundtruth_similar_source_indices_list:
                #     raise Exception('Example %d has different groundtruth source indices: ' + str(groundtruth_similar_source_indices_list) + ' || ' + str(gt_ssi))
                if FLAGS.dataset_name == 'xsum':
                    sys_ssi = [sys_ssi[0]]

            final_decoded_words = []
            final_decoded_outputs = ''
            best_hyps = []
            highlight_html_total = '<u>System Summary</u><br><br>'
            for ssi_idx, ssi in enumerate(sys_ssi):
                # selected_article_lcs_paths = None
                selected_article_lcs_paths = sys_alp_list[ssi_idx]
                ssi, selected_article_lcs_paths = util.make_ssi_chronological(
                    ssi, selected_article_lcs_paths)
                selected_article_lcs_paths = [selected_article_lcs_paths]
                selected_raw_article_sents = util.reorder(
                    raw_article_sents, ssi)
                selected_article_text = ' '.join([
                    ' '.join(sent)
                    for sent in util.reorder(article_sent_tokens, ssi)
                ])
                selected_doc_indices_str = '0 ' * len(
                    selected_article_text.split())
                if FLAGS.upper_bound:
                    selected_groundtruth_summ_sent = [[
                        groundtruth_summ_sents[0][ssi_idx]
                    ]]
                else:
                    selected_groundtruth_summ_sent = groundtruth_summ_sents

                batch = create_batch(selected_article_text,
                                     selected_groundtruth_summ_sent,
                                     selected_doc_indices_str,
                                     selected_raw_article_sents,
                                     selected_article_lcs_paths,
                                     FLAGS.batch_size, hps, self._vocab)

                original_article = batch.original_articles[0]  # string
                original_abstract = batch.original_abstracts[0]  # string
                article_withunks = data.show_art_oovs(original_article,
                                                      self._vocab)  # string
                abstract_withunks = data.show_abs_oovs(
                    original_abstract, self._vocab,
                    (batch.art_oovs[0]
                     if FLAGS.pointer_gen else None))  # string

                if FLAGS.first_intact and ssi_idx == 0:
                    decoded_words = selected_article_text.strip().split()
                    decoded_output = selected_article_text
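                    # No decode_example call in this branch, so no new best_hyp is
                    # appended; the attn_vis block below assumes decode_example ran.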
                else:
                    decoded_words, decoded_output, best_hyp = decode_example(
                        self._sess, self._model, self._vocab, batch,
                        example_idx, hps)
                    best_hyps.append(best_hyp)
                final_decoded_words.extend(decoded_words)
                final_decoded_outputs += decoded_output

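                # Highlight HTML is only written for a sample of examples
                # (indices < 100 or 2000-2099).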
                if example_idx < 100 or (example_idx >= 2000
                                         and example_idx < 2100):
                    min_matched_tokens = 2
                    selected_article_sent_tokens = [
                        util.process_sent(sent)
                        for sent in selected_raw_article_sents
                    ]
                    highlight_summary_sent_tokens = [decoded_words]
                    highlight_ssi_list, lcs_paths_list, highlight_article_lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list(
                        highlight_summary_sent_tokens,
                        selected_article_sent_tokens, None, 2,
                        min_matched_tokens)
                    highlighted_html = ssi_functions.html_highlight_sents_in_article(
                        highlight_summary_sent_tokens,
                        highlight_ssi_list,
                        selected_article_sent_tokens,
                        lcs_paths_list=lcs_paths_list,
                        article_lcs_paths_list=
                        highlight_smooth_article_lcs_paths_list)
                    highlight_html_total += highlighted_html + '<br>'

                if FLAGS.attn_vis and example_idx < 200:
                    self.write_for_attnvis(
                        article_withunks, abstract_withunks, decoded_words,
                        best_hyp.attn_dists, best_hyp.p_gens, attn_vis_idx
                    )  # write info to .json file for visualization tool
                    attn_vis_idx += 1

                if len(final_decoded_words) >= 100:
                    break

            gt_ssi_list, gt_alp_list = util.replace_empty_ssis(
                groundtruth_similar_source_indices_list,
                raw_article_sents,
                sys_alp_list=groundtruth_article_lcs_paths_list)
            highlight_html_gt = '<u>Reference Summary</u><br><br>'
            for ssi_idx, ssi in enumerate(gt_ssi_list):
                selected_article_lcs_paths = gt_alp_list[ssi_idx]
                try:
                    ssi, selected_article_lcs_paths = util.make_ssi_chronological(
                        ssi, selected_article_lcs_paths)
                except Exception:
                    util.print_vars(ssi, example_idx,
                                    selected_article_lcs_paths)
                    raise
                selected_raw_article_sents = util.reorder(
                    raw_article_sents, ssi)

                if example_idx < 100 or (example_idx >= 2000
                                         and example_idx < 2100):
                    min_matched_tokens = 2
                    selected_article_sent_tokens = [
                        util.process_sent(sent)
                        for sent in selected_raw_article_sents
                    ]
                    highlight_summary_sent_tokens = [
                        groundtruth_summ_sent_tokens[ssi_idx]
                    ]
                    highlight_ssi_list, lcs_paths_list, highlight_article_lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list(
                        highlight_summary_sent_tokens,
                        selected_article_sent_tokens, None, 2,
                        min_matched_tokens)
                    highlighted_html = ssi_functions.html_highlight_sents_in_article(
                        highlight_summary_sent_tokens,
                        highlight_ssi_list,
                        selected_article_sent_tokens,
                        lcs_paths_list=lcs_paths_list,
                        article_lcs_paths_list=
                        highlight_smooth_article_lcs_paths_list)
                    highlight_html_gt += highlighted_html + '<br>'

            if example_idx < 100 or (example_idx >= 2000
                                     and example_idx < 2100):
                self.write_for_human(raw_article_sents, groundtruth_summ_sents,
                                     final_decoded_words, example_idx)
                highlight_html_total = ssi_functions.put_html_in_two_columns(
                    highlight_html_total, highlight_html_gt)
                ssi_functions.write_highlighted_html(highlight_html_total,
                                                     self._highlight_dir,
                                                     example_idx)

            # if example_idx % 100 == 0:
            #     attn_dir = os.path.join(self._decode_dir, 'attn_vis_data')
            #     attn_selections.process_attn_selections(attn_dir, self._decode_dir, self._vocab)

            rouge_functions.write_for_rouge(
                groundtruth_summ_sents,
                None,
                example_idx,
                self._rouge_ref_dir,
                self._rouge_dec_dir,
                decoded_words=final_decoded_words,
                log=False
            )  # write ref summary and decoded summary to file, to eval with pyrouge later
            # if FLAGS.attn_vis:
            #     self.write_for_attnvis(article_withunks, abstract_withunks, decoded_words, best_hyp.attn_dists, best_hyp.p_gens, example_idx) # write info to .json file for visualization tool
            example_idx += 1  # this is how many examples we've decoded

        logging.info("Decoder has finished reading dataset for single_pass.")
        logging.info("Output has been saved in %s and %s.",
                     self._rouge_ref_dir, self._rouge_dec_dir)
        if len(os.listdir(self._rouge_ref_dir)) != 0:
            l_param = 100  # same value for every dataset, including xsum
            logging.info("Now starting ROUGE eval...")
            results_dict = rouge_functions.rouge_eval(self._rouge_ref_dir,
                                                      self._rouge_dec_dir,
                                                      l_param=l_param)
            rouge_functions.rouge_log(results_dict, self._decode_dir)
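
For context, the ROUGE step above can also be run on its own once the reference and decoded summaries are on disk. The sketch below is a minimal, hedged driver that reuses only the two rouge_functions calls shown in the method; the import and the directory paths are placeholders, not the project's actual layout.

# Minimal standalone ROUGE driver (sketch); assumes rouge_functions is importable
# the same way as in the file above, and uses placeholder directories.
import rouge_functions

ref_dir = '/path/to/decode_dir/rouge_ref'  # placeholder: reference summaries
dec_dir = '/path/to/decode_dir/rouge_dec'  # placeholder: decoded summaries
log_dir = '/path/to/decode_dir'            # placeholder: where rouge_log writes results

results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=100)
rouge_functions.rouge_log(results_dict, log_dir)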
Example No. 17
"""Question: https://leetcode.com/problems/find-common-characters/
"""

from collections import Counter
from typing import List

from util import print_vars


class Solution:
    def commonChars(self, A: List[str]) -> List[str]:
        # Intersect per-word letter counts; the surviving multiset contains each
        # character that appears in every word, with its minimum multiplicity.
        cnt = Counter(A[0])
        for word in A:
            cnt &= Counter(word)
        return list(cnt.elements())


if __name__ == '__main__':
    a = ["bella", "label", "roller"]
    output = Solution().commonChars(a)
    print_vars(a, output)
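
As a quick sanity check, the LeetCode sample above should yield the letters e, l and l. The assertion below is an illustrative addition (not part of the original solution) and assumes the Solution class defined above is in scope.

# Expected result for the LeetCode sample input; sorted() makes the check order-independent.
assert sorted(Solution().commonChars(["bella", "label", "roller"])) == ["e", "l", "l"]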
"""Question: https://leetcode.com/problems/sum-of-even-numbers-after-queries/
"""

from typing import List

from util import print_vars


class Solution:
    def sumEvenAfterQueries(self, A: List[int],
                            queries: List[List[int]]) -> List[int]:
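        # Keep a running sum of the even values: before applying each query, remove
        # the old A[index] from the sum if it was even; after applying it, add the
        # new A[index] back if it is even.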
        res = []
        sum_even = sum(i for i in A if i % 2 == 0)
        for val, index in queries:
            if A[index] % 2 == 0:
                sum_even -= A[index]
            A[index] += val
            if A[index] % 2 == 0:
                sum_even += A[index]
            res.append(sum_even)
        return res


if __name__ == "__main__":
    A, queries = [1, 2, 3, 4], [[1, 0], [-3, 1], [-4, 0], [2, 3]]
    output = Solution().sumEvenAfterQueries(A, queries)
    print_vars(A, queries, output)
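
As a quick sanity check, the LeetCode sample above should produce [8, 6, 2, 4]. The assertion below is an illustrative addition (not part of the original solution); fresh inputs are passed because sumEvenAfterQueries mutates A in place.

# Expected result for the LeetCode sample input.
assert Solution().sumEvenAfterQueries(
    [1, 2, 3, 4], [[1, 0], [-3, 1], [-4, 0], [2, 3]]) == [8, 6, 2, 4]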