Ejemplo n.º 1
0
                restored = saver.restore(sess, model_file)
            except:
                log.error('restore failed. model_file: %s' % model_file)
            try:
                for i, s in enumerate(sentences):
                    log.info('')
                    log.info('[%s] in : "%s"' % (i, s))
                    _features, _labels = WordSpacing.sentence2features_labels(s, left_gram, right_gram)
                    dataset = DataSet(features=_features, labels=_labels, features_vector=features_vector, labels_vector=labels_vector)
                    dataset.convert_to_one_hot_vector()
                    if len(dataset) > 0:
                        _predicted, _accuracy = sess.run([predicted, accuracy], feed_dict={X: dataset.features, Y: dataset.labels})  # Accuracy report

                        sentence_hat = WordSpacing.spacing(s.replace(' ', ''), _predicted)
                        sim, correct, total = WordSpacing.sim_two_sentence(s, sentence_hat, left_gram=left_gram, right_gram=right_gram)

                        accuracies.append(_accuracy)
                        sims.append(sim)

                        log.info('[%s] out: "%s" (accuracy: %.1f%%, sim: %.1f%%=%s/%s)' % (i, sentence_hat, _accuracy * 100, sim * 100, correct, total))
            except:
                log.error(traceback.format_exc())

        log.info('chek result OK.')
        # noinspection PyStringFormat
        log.info('mean(accuracy): %.2f%%, mean(sim): %.2f%%' % (np.mean(accuracies) * 100, np.mean(sims) * 100))
        log.info('secs/sentence: %.4f' % (watch.elapsed('run tensorflow') / len(sentences)))
        log.info(watch.summary())
    except:
        log.error(traceback.format_exc())
Ejemplo n.º 2
0
                                            valid_writer.add_summary(_summary, global_step=nth_batch)
                                            if _valid_cost < min_valid_cost:
                                                min_valid_cost = _valid_cost
                                                min_valid_epoch = epoch
                                            log.info('[epoch: %s, nth_batch: %s] train cost: %.8f, valid cost: %.8f' % (
                                                epoch, nth_batch, _train_cost, _valid_cost))
                                            if min_valid_epoch == epoch:  # save the lastest best model
                                                saver.save(sess, model_file)

                                    if save_model_each_epochs:
                                        saver.save(sess, model_file, global_step=epoch)

                                log.info('')
                                log.info(
                                    '"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s,  %.2f secs (batch_size: %s,  total_input_data: %s, total_epochs: %s, total_train_time: %s secs)' % (
                                        model_name, min_valid_cost, min_valid_epoch, watch.elapsed(),
                                        batch_size, NumUtil.comma_str(batch_size * nth_batch), epoch, total_train_time))
                                log.info('')
                            except:
                                log.info(traceback.format_exc())
                            finally:
                                coordinator.request_stop()
                                coordinator.join(threads)  # Wait for threads to finish.
                        else:  # testing
                            x, y, learning_rate, W1, b1, y_hat, cost, train_step, summary = create_graph(model_name, scope_name, verbose=False)
                            test_x_batch, test_y_batch = input_pipeline([test_file], batch_size=n_test, delim='\t', splits=3)

                            log.info('')
                            log.info('model loaded... %s' % model_file)
                            saver = tf.train.Saver(max_to_keep=None)
                            saver.restore(sess, model_file)
Ejemplo n.º 3
0
    def train(self, iterations: int, batch: int, embedding: Word2VecEmbedding,
              args: argparse.Namespace) -> str:
        """Train for up to `iterations` further epochs with early stopping.

        After every epoch the mean per-batch loss is compared with the best
        mean loss seen so far in this call. If the epoch is worse than the
        best one, or the loss is not finite, training stops; otherwise the
        embedding is saved and the best loss is updated.

        Args:
            iterations: maximum number of epochs to run, starting from
                ``self.epoch + 1``.
            batch: batch size; used only to estimate the number of batches
                per epoch for progress reporting.
            embedding: object whose ``save()`` persists the trained vectors.
            args: run arguments; ``args.epoch`` is overwritten with the
                current epoch before the output path is derived from it.

        Returns:
            Whatever the last successful ``embedding.save()`` returned, or
            ``None`` if no epoch was saved.
        """
        batches_in_epoch = int(numpy.ceil(
            len(self.dataloader.dataset) / batch))
        total_batches = batches_in_epoch * iterations
        nth_total_batch = 0
        log.info(f'batches_in_epoch: {batches_in_epoch}')
        log.info(f'total_batches: {total_batches}')

        watch = WatchUtil(auto_stop=False)
        watch.start()
        best_loss = float("inf")
        first_epoch, last_epoch = self.epoch + 1, self.epoch + iterations + 1
        last_embedding_file = None
        # Bind `lr` before the loops: the per-epoch summary below reads it
        # even when no batch completed (empty dataloader, or a RuntimeError
        # on the very first batch).
        lr = PytorchUtil.get_learning_rate(self.optim)

        log.info(Word2VecEmbedding.get_filenpath(args))
        # The loop variable is `self.epoch` on purpose, so the instance
        # always carries the epoch currently being trained.
        for self.epoch in range(first_epoch, last_epoch):
            log.info(f"[e{self.epoch:2d}] {self}")
            loss_list = []
            for nth, (iword, owords) in enumerate(self.dataloader, 1):
                try:
                    loss = self.sgns(iword, owords)
                except RuntimeError as error:
                    # Treat a failed forward pass as divergence: the -inf
                    # marker makes the epoch check below stop training.
                    log.info(
                        f"[e{self.epoch:2d}][b{nth:5d}] sgns() failed: {error}"
                    )
                    loss_list = [float('-inf')]
                    break

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

                if self.learning_decay != 0:
                    PytorchUtil.set_learning_rate(self.optim,
                                                  self.epoch,
                                                  gamma=self.learning_decay,
                                                  base_lr=self.init_lr,
                                                  min_lr=1e-10,
                                                  decay_start=2,
                                                  decay_interval=3)

                lr = PytorchUtil.get_learning_rate(self.optim)

                # Normalize the batch loss by the size of owords' second
                # dimension.
                _, negatives = owords.size()
                # NOTE(review): `loss.data[0]` fails on 0-dim tensors in
                # PyTorch >= 0.4 (use `loss.item()` there) - confirm the
                # targeted PyTorch version before changing.
                real_loss = loss.data[0] / float(negatives)

                loss_list.append(real_loss)

                # Progress / remaining-time estimate over the whole run.
                nth_total_batch += 1
                progressed = nth_total_batch / total_batches
                seconds_per_batch = float(
                    watch.elapsed()) / float(nth_total_batch)
                remain_batches = total_batches - nth_total_batch
                remain_secs = int(seconds_per_batch * remain_batches)

                if nth == 1 or nth == batches_in_epoch or nth % 1000 == 0:
                    log.info(
                        f"[e{self.epoch:2d}][b{nth:5d}/{batches_in_epoch:5d}][{progressed*100:.1f}% remain: {DateUtil.secs_to_string(remain_secs)}][window: {self.window}][lr: {lr:.0e}] loss: {real_loss:.7f}"
                    )

            total_loss = numpy.mean(loss_list)
            log.info(
                f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss: {total_loss:.7f}, best_loss: {best_loss:.7f}"
            )
            # Stop when this epoch is worse than the best one, or when the
            # loss diverged (inf, -inf or NaN).
            if total_loss > best_loss or not numpy.isfinite(total_loss):
                log.info('')
                log.info(
                    f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] total_loss > best_loss BREAK"
                )
                log.info('')
                break

            # Here total_loss <= best_loss, so it becomes the new best.
            best_loss = total_loss
            log.info(
                f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save()..."
            )
            args.epoch = self.epoch
            # NOTE(review): `trainer` is not defined in this method, so it
            # must be a module-level name; presumably `self.embedding` was
            # intended - confirm against the enclosing class.
            last_embedding_file = embedding.save(
                idx2vec=trainer.embedding,
                filepath=Word2VecEmbedding.get_filenpath(args))
            log.info(
                f"[e{self.epoch:2d}][window: {self.window}][lr: {lr:.0e}] embedding.save() OK. {os.path.basename(embedding.filepath)}"
            )
        return last_embedding_file
Ejemplo n.º 4
0
                                                   _train_cost, _valid_cost))
                                            if min_valid_epoch == epoch:  # save the lastest best model
                                                saver.save(sess, model_file)

                                    if save_model_each_epochs:
                                        saver.save(sess,
                                                   model_file,
                                                   global_step=epoch)

                                log.info('')
                                log.info(
                                    '"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s,  %.2f secs (batch_size: %s,  total_input_data: %s, total_epochs: %s, total_train_time: %s secs)'
                                    %
                                    (model_name,
                                     min_valid_cost, min_valid_epoch,
                                     watch.elapsed(), batch_size,
                                     NumUtil.comma_str(batch_size * nth_batch),
                                     epoch, total_train_time))
                                log.info('')
                            except:
                                log.info(traceback.format_exc())
                            finally:
                                coordinator.request_stop()
                                coordinator.join(
                                    threads)  # Wait for threads to finish.
                        else:  # testing
                            x, y, learning_rate, use_first_pipeline, W1, b1, y_hat, cost, train_step, summary = create_graph(
                                model_name,
                                scope_name,
                                first_pipeline=test_pipeline,
                                second_pipeline=test_pipeline,
Ejemplo n.º 5
0
                                                     _train_cost, _valid_cost))
                                                if min_valid_epoch == epoch:  # save the lastest best model
                                                    saver.save(
                                                        sess, model_file)

                                    if save_model_each_epochs:
                                        saver.save(sess,
                                                   model_file,
                                                   global_step=epoch)
                                log.info('')
                                log.info(
                                    '"%s" train: min_valid_cost: %.8f, min_valid_epoch: %s,  %.2f secs (batch_size: %s,  total_input_data: %s, total_epochs: %s, total_train_time: %s secs)'
                                    %
                                    (model_name,
                                     min_valid_cost, min_valid_epoch,
                                     watch.elapsed(), batch_size,
                                     NumUtil.comma_str(batch_size * nth_batch),
                                     epoch, total_train_time))
                                log.info('')
                            except:
                                log.info(traceback.format_exc())
                        else:  # testing
                            log.info('')
                            log.info('model loaded... %s' % model_file)
                            saver.restore(sess, model_file)
                            log.info('model loaded OK. %s' % model_file)

                            try:
                                watch = WatchUtil()
                                watch.start()
                                for _x_batch, _y_batch in next_batch_in_memory(
Ejemplo n.º 6
0
                                                             Y: dataset.labels
                                                         })  # Accuracy report

                        generated_sentence = WordSpacing.spacing(
                            s.replace(' ', ''), _predicted)
                        sim, correct, total = WordSpacing.sim_two_sentence(
                            s,
                            generated_sentence,
                            left_gram=left_gram,
                            right_gram=right_gram)

                        accuracies.append(_accuracy)
                        sims.append(sim)

                        log.info(
                            '[%s] out: "%s" (accuracy: %.1f%%, sim: %.1f%%=%s/%s)'
                            % (i, generated_sentence, _accuracy * 100,
                               sim * 100, correct, total))
            except:
                log.error(traceback.format_exc())

        log.info('chek result OK.')
        # noinspection PyStringFormat
        log.info('mean(accuracy): %.2f%%, mean(sim): %.2f%%' %
                 (np.mean(accuracies) * 100, np.mean(sims) * 100))
        log.info('secs/sentence: %.4f' %
                 (watch.elapsed('run tensorflow') / len(sentences)))
        log.info(watch.summary())
    except:
        log.error(traceback.format_exc())