Example #1
    def report_statics(self):
        # Log a short summary of the FSA: number of states and links,
        # plus the names of the start and end states.
        mylog_section("FSA")
        mylog_subsection("FSA Info")
        mylog("Number of States: {}".format(len(self.states)))
        mylog("Number of Links: {}".format(self.num_links))
        mylog("Start state: {}".format(self.start_state.name))
        mylog("End state: {}".format(self.end_state.name))
Example #2
    def __init__(self,
                 buckets,
                 size,
                 from_vocab_size,
                 target_vocab_size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 optimizer="adam",
                 forward_only=False,
                 dropoutRate=1.0,
                 run_options=None,
                 run_metadata=None,
                 devices_per_model=None,
                 topk_n=30,
                 dtype=tf.float32,
                 with_attention=False,
                 beam_search=False,
                 beam_buckets=None,
                 n_samples=500,
                 with_sampled_softmax=False,
                 attention_style="additive",
                 attention_scale=True,
                 num_models=4,
                 tie_input_output_embedding=False):
        '''
        LocalReplica: Model1[GPU0,GPU1] Model2[GPU3,GPU4], ...

        Each model replica keeps its own copy of the variables. After every
        step, the gradients are summed across the replicas and each replica
        applies the aggregated update locally on its own GPUs.

        devices_per_model: [["/gpu:0", ...], ...] where devices_per_model[m][l]
        is the device that hosts layer l of model m (see the construction
        sketch after this example).
        '''

        self.models = []
        self.devices_per_model = devices_per_model
        self.variable_mgr = VariableMgrLocalReplicated()
        self.num_models = num_models
        self.buckets = buckets
        self.run_options = run_options
        self.run_metadata = run_metadata

        # Generate models
        for d, devices_each_model in enumerate(self.devices_per_model):
            with tf.device(devices_each_model[0]):
                with self.variable_mgr.create_outer_variable_scope(
                        d), tf.name_scope("tower_{}".format(d)) as name_scope:
                    mylog("creating model #{} at devices: {}".format(
                        d, devices_each_model))
                    seqModel = SeqModel(
                        buckets,
                        size,
                        from_vocab_size,
                        target_vocab_size,
                        num_layers,
                        max_gradient_norm,
                        batch_size,
                        learning_rate,
                        learning_rate_decay_factor,
                        optimizer=optimizer,
                        forward_only=forward_only,
                        dropoutRate=dropoutRate,
                        devices=devices_each_model,
                        run_options=run_options,
                        run_metadata=run_metadata,
                        topk_n=topk_n,
                        dtype=dtype,
                        with_attention=with_attention,
                        beam_search=beam_search,
                        beam_buckets=beam_buckets,
                        n_samples=n_samples,
                        with_sampled_softmax=with_sampled_softmax,
                        attention_style=attention_style,
                        attention_scale=attention_scale,
                        standalone=False,  # ! do not init the optimizer now
                        n_distributed_models=self.num_models,
                        tie_input_output_embedding=tie_input_output_embedding)

                    self.models.append(seqModel)

        # collect the learning_rate_decay_op
        self.learning_rate_dacay_ops = []
        self.dropout10_ops = []
        self.dropoutAssign_ops = []
        for model in self.models:
            self.learning_rate_dacay_ops.append(model.learning_rate_decay_op)
            self.dropout10_ops.append(model.dropout10_op)
            self.dropoutAssign_ops.append(model.dropoutAssign_op)

        # Aggregate the gradients

        section = "Aggregate Gradients "
        mylog_section(section)

        agg_grads = []

        for b in xrange(len(buckets)):

            mylog_subsection("Bucket {}".format(b))

            # for each bucket
            gradients = []  # [[grad * n_variable] * n_model]
            params = []  # [[param * n_variable] * n_model]
            for model in self.models:
                gradients.append(model.gradients[b])
                params.append(model.params)

            # Record which gradients are aggregated on each GPU, so that the
            # aggregation work can be balanced across devices.
            agg_grad_per_gpu = {}

            agg_grads_per_bucket = []

            for param_id in xrange(len(params[0])):

                grads_per_model = []
                params_per_model = []

                for model_id in xrange(len(params)):
                    params_per_model.append(params[model_id][param_id])
                    grads_per_model.append(gradients[model_id][param_id])

                # choose the device that currently hosts the fewest
                # aggregations (greedy load balancing)
                device_for_agg = None

                min_n_agg = 1000000

                for param in params_per_model:
                    dev = param.device
                    if dev not in agg_grad_per_gpu:
                        agg_grad_per_gpu[dev] = []
                    n_agg = len(agg_grad_per_gpu[dev])
                    if min_n_agg > n_agg:
                        min_n_agg = n_agg
                        device_for_agg = dev

                agg_grad_per_gpu[device_for_agg].append(params[0][param_id])

                with tf.device(device_for_agg):
                    if isinstance(grads_per_model[0], tf.IndexedSlices):
                        # Sparse gradients (e.g. from embedding lookups):
                        # concatenate values and indices across models
                        # instead of densifying them.
                        values = tf.concat(
                            [x.values for x in grads_per_model], 0)
                        indices = tf.concat(
                            [x.indices for x in grads_per_model], 0)
                        agg_grad = tf.IndexedSlices(values, indices)
                    else:
                        # Dense gradients: element-wise sum across models.
                        agg_grad = tf.add_n(grads_per_model)

                agg_grads_per_bucket.append(agg_grad)

            # show aggregation device placement
            for device in agg_grad_per_gpu:
                mylog("Aggregated On {}:".format(device))
                for param in agg_grad_per_gpu[device]:
                    mylog("\t" + param.name)
            agg_grads.append(agg_grads_per_bucket)

        # send the aggregated grads to each model on different gpus
        for d, devices_each_model in enumerate(self.devices_per_model):
            self.models[d].init_agg_updates(agg_grads)

        # combine losses, updates and gradients norm
        self.losses = []  # per bucket
        self.updates = []
        self.gradient_norms = []

        for b in xrange(len(buckets)):
            losses = []
            updates = []
            gradient_norms = []
            for i, model in enumerate(self.models):
                losses.append(model.losses[b])
                updates.append(model.updates[b])
                gradient_norms.append(model.gradient_norms[b])

            loss = tf.add_n(losses)
            self.losses.append(loss)
            self.updates.append(updates)
            self.gradient_norms.append(gradient_norms)

        # get init ops group
        self.var_init_op = tf.global_variables_initializer()
        self.broadcast_ops = self.variable_mgr.get_post_init_ops()

        # for saver
        all_vars = tf.global_variables()
        self.train_vars = []
        for var in all_vars:
            if var.name.startswith("v0"):
                self.train_vars.append(var)

        self.saver = tf.train.Saver(self.train_vars)
        self.best_saver = tf.train.Saver(self.train_vars)
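The constructor above expects devices_per_model in the [[device per layer] per model] layout described in its docstring. A hypothetical construction sketch; the enclosing class name SeqModelDistributed, the device layout, and the hyperparameter values below are illustrative assumptions, not taken from the source:

import tensorflow as tf

# Hypothetical: two replicas, each spanning two GPUs, one device per layer.
devices_per_model = [
    ["/gpu:0", "/gpu:1"],   # model 0: layer 0 on gpu:0, layer 1 on gpu:1
    ["/gpu:2", "/gpu:3"],   # model 1: layer 0 on gpu:2, layer 1 on gpu:3
]

model = SeqModelDistributed(          # assumed name of the class shown above
    buckets=[(10, 15), (20, 25)],
    size=512,
    from_vocab_size=40000,
    target_vocab_size=40000,
    num_layers=2,
    max_gradient_norm=5.0,
    batch_size=64,
    learning_rate=0.5,
    learning_rate_decay_factor=0.83,
    devices_per_model=devices_per_model,
    num_models=2)

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(model.var_init_op)     # initialize every replica's variables
    sess.run(model.broadcast_ops)   # copy replica 0's weights to the others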
Example #3
def log_flags(_FLAGS):
    # _FLAGS is a tf.app.flags.FLAGS object; the old-style API keeps the
    # parsed flags in the private __flags dict, whose keys are the flag names.
    members = _FLAGS.__dict__['__flags'].keys()
    mylog_section("FLAGS")
    for attr in members:
        mylog("{}={}".format(attr, getattr(_FLAGS, attr)))
Example #4
def train():

    # Read Data
    mylog_section("READ DATA")

    from_train = None
    to_train = None
    from_dev = None
    to_dev = None

    from_train, to_train, from_dev, to_dev, _, _ = data_utils.prepare_data(
        FLAGS.data_cache_dir,
        FLAGS.train_path_from,
        FLAGS.train_path_to,
        FLAGS.dev_path_from,
        FLAGS.dev_path_to,
        FLAGS.from_vocab_size,
        FLAGS.to_vocab_size,
        preprocess_data = FLAGS.preprocess_data
    )


    train_data_bucket = read_data(from_train, to_train, _buckets)
    dev_data_bucket = read_data(from_dev, to_dev, _buckets)
    _, _, real_vocab_size_from, real_vocab_size_to = data_utils.get_vocab_info(FLAGS.data_cache_dir)
    
    FLAGS._buckets = _buckets
    FLAGS.real_vocab_size_from = real_vocab_size_from
    FLAGS.real_vocab_size_to = real_vocab_size_to

    train_n_targets = np.sum([np.sum([len(items[1]) for items in x]) for x in train_data_bucket])
    train_n_tokens = np.sum([np.sum([len(items[1])+len(items[0]) for items in x]) for x in train_data_bucket])
    
    train_bucket_sizes = [len(train_data_bucket[b]) for b in xrange(len(_buckets))]
    train_total_size = float(sum(train_bucket_sizes))
    train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size for i in xrange(len(train_bucket_sizes))]
    dev_bucket_sizes = [len(dev_data_bucket[b]) for b in xrange(len(_buckets))]
    dev_total_size = int(sum(dev_bucket_sizes))

    mylog_section("REPORT")
    # steps
    batch_size = FLAGS.batch_size
    n_epoch = FLAGS.n_epoch
    steps_per_epoch = int(train_total_size / batch_size / FLAGS.num_models)
    steps_per_dev = int(dev_total_size / batch_size)
    if FLAGS.checkpoint_steps == 0:
        steps_per_checkpoint = int(steps_per_epoch / FLAGS.checkpoint_frequency)
    else:
        steps_per_checkpoint = FLAGS.checkpoint_steps
        
    total_steps = steps_per_epoch * n_epoch

    # reports
    mylog("from_vocab_size: {}".format(FLAGS.real_vocab_size_from))
    mylog("to_vocab_size: {}".format(FLAGS.real_vocab_size_to))
    mylog("_buckets: {}".format(FLAGS._buckets))
    mylog("Train:")
    mylog("total: {}".format(train_total_size))
    mylog("bucket sizes: {}".format(train_bucket_sizes))
    mylog("Dev:")
    mylog("total: {}".format(dev_total_size))
    mylog("bucket sizes: {}".format(dev_bucket_sizes))
    mylog("Steps_per_epoch: {}".format(steps_per_epoch))
    mylog("Total_steps:{}".format(total_steps))
    mylog("Steps_per_checkpoint: {}".format(steps_per_checkpoint))


    mylog_section("IN TENSORFLOW")
    
    config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    config.gpu_options.allow_growth = FLAGS.allow_growth

    with tf.Session(config=config) as sess:
        
        # runtime profile
        if FLAGS.profile:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
        else:
            run_options = None
            run_metadata = None

        mylog_section("MODEL/SUMMARY/WRITER")

        mylog("Creating Model.. (this can take a few minutes)")
        model = create_model(sess, run_options, run_metadata)

        if FLAGS.with_summary:
            mylog("Creating ModelSummary")
            modelSummary = ModelSummary()

            mylog("Creating tf.summary.FileWriter")
            summaryWriter = tf.summary.FileWriter(os.path.join(FLAGS.summary_dir , "train.summary"), sess.graph)

        mylog_section("All Variables")
        show_all_variables()

        # Data Iterators
        mylog_section("Data Iterators")

        dite = DataIterator(model, train_data_bucket, len(train_buckets_scale), batch_size, train_buckets_scale)
        
        iteType = 0
        if iteType == 0:
            mylog("Itetype: withRandom")
            ite = dite.next_random()
        elif iteType == 1:
            mylog("Itetype: withSequence")
            ite = dite.next_sequence()
        
        # statistics during training
        step_time, loss = 0.0, 0.0
        get_batch_time = 0.0
        current_step = 0
        previous_losses = []
        low_ppx = float("inf")
        low_ppx_step = 0
        steps_per_report = 30
        n_targets_report = 0
        n_sources_report = 0
        report_time = 0
        n_valid_sents = 0
        n_valid_words = 0
        patience = FLAGS.patience
        
        mylog_section("TRAIN")

        
        while current_step < total_steps:
            
            # start
            start_time = time.time()
            
            # data and train
            source_inputs, target_inputs, target_outputs, target_weights, bucket_id = ite.next()

            get_batch_time += (time.time() - start_time) / steps_per_checkpoint
            
            L, norm = model.step(sess, source_inputs, target_inputs, target_outputs, target_weights, bucket_id)

            
            # loss and time
            step_time += (time.time() - start_time) / steps_per_checkpoint
            
            loss += L
            current_step += 1
            n_valid_sents += np.sum(np.sign(target_weights[0]))

            # double sum because each model's target_weights may have a different shape
            n_valid_words += np.sum(np.sum(target_weights))
            
            # for report
            report_time += (time.time() - start_time)
            
            n_targets_report += np.sum(np.sum(target_weights))
            n_sources_report += np.sum(np.sum(np.sign(source_inputs)))
    
            if current_step % steps_per_report == 1:
                sect_name = "STEP {}".format(current_step)
                msg = "StepTime: {:.4f} sec Speed: {:.4f} words/s Total_words: {} get_batch_time_ratio: {:.4f}".format(report_time/steps_per_report, (n_sources_report+n_targets_report)*1.0 / report_time, train_n_tokens, get_batch_time / step_time)
                mylog_line(sect_name,msg)

                report_time = 0
                n_targets_report = 0
                n_sources_report = 0

                # Create the Timeline object, and write it to a json
                if FLAGS.profile:
                    tl = timeline.Timeline(run_metadata.step_stats)
                    ctf = tl.generate_chrome_trace_format()
                    with open('timeline.json', 'w') as f:
                        f.write(ctf)
                    exit()
                    
            
            if current_step % steps_per_checkpoint == 1:

                i_checkpoint = int(current_step / steps_per_checkpoint)
                
                # train_ppx: rescale the accumulated loss to a per-word average,
                # then exponentiate it to get the training perplexity.
                loss = loss * FLAGS.batch_size * FLAGS.num_models
                loss = loss / n_valid_words
                train_ppx = math.exp(float(loss)) if loss < 300 else float("inf")
                learning_rate = model.get_learning_rate(sess)

                # dev_ppx
                dev_loss, dev_ppx = evaluate(sess, model, dev_data_bucket)

                # report
                sect_name = "CHECKPOINT {} STEP {}".format(i_checkpoint, current_step)
                msg = "Learning_rate: {:.4f} Dev_ppx: {:.4f} Train_ppx: {:.4f} Norm: {:.4f}".format(learning_rate, dev_ppx, train_ppx, norm)
                mylog_line(sect_name, msg)

                if FLAGS.with_summary:
                    # save summary
                    _summaries = modelSummary.step_record(sess, train_ppx, dev_ppx)
                    for _summary in _summaries:
                        summaryWriter.add_summary(_summary, i_checkpoint)
                
                # save model per checkpoint
                if FLAGS.saveCheckpoint:
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "model")
                    s = time.time()
                    model.saver.save(sess, checkpoint_path, global_step=i_checkpoint, write_meta_graph = False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time()-s, checkpoint_path)
                    mylog_line(sect_name, msg)
                    
                # save best model
                if dev_ppx < low_ppx:
                    patience = FLAGS.patience
                    low_ppx = dev_ppx
                    low_ppx_step = current_step
                    checkpoint_path = os.path.join(FLAGS.saved_model_dir, "best")
                    s = time.time()
                    model.best_saver.save(sess, checkpoint_path, global_step=0, write_meta_graph = False)
                    msg = "Model saved using {:.4f} sec at {}".format(time.time()-s, checkpoint_path)
                    mylog_line(sect_name, msg)
                else:
                    patience -= 1

                    # decay the learning rate
                    if FLAGS.decay_learning_rate:
                        sess.run(model.learning_rate_dacay_ops)
                        msg = "New learning_rate: {:.4f} Dev_ppx: {:.4f} Lowest_dev_ppx: {:.4f}".format(model.get_learning_rate(sess), dev_ppx, low_ppx)
                        mylog_line(sect_name, msg)

                    

                if patience <= 0:
                    mylog("Training finished. Running out of patience.")
                    break

                # Zero the timers and loss accumulators for the next checkpoint period.
                step_time, loss, n_valid_sents, n_valid_words = 0.0, 0.0, 0, 0
                get_batch_time = 0
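DataIterator is not shown in these examples. Based on how train_buckets_scale is built above (a cumulative fraction of training pairs per bucket), next_random() presumably samples a bucket in proportion to its size. A minimal sketch of that sampling step, with hypothetical names; the iterator's real implementation may differ:

import random

def sample_bucket_id(train_buckets_scale):
    # train_buckets_scale[i] is the cumulative fraction of training pairs in
    # buckets 0..i (the last entry is 1.0), so a uniform draw in [0, 1)
    # selects bucket i with probability proportional to its size.
    r = random.random()
    return min(i for i in range(len(train_buckets_scale))
               if train_buckets_scale[i] > r)

# e.g. bucket_id = sample_bucket_id(train_buckets_scale)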