def home():
    obj = DataReader()
    obj.parse_country()
    obj.parse_city()
    obj.parse_features()
    obj.parse_prediction()
    return jsonify(obj.final_data)
Example #2
    def pretrain_model(self, src1_path, src2_path, tgt_path, epochs):
        datareader = DataReader()
        data = datareader.read_parallel_data(self.model, src1_path, src2_path,
                                             tgt_path)
        self.seq2seq_trainer.train(
            train_data=data,
            val_data=[],
            epochs=epochs,
            pretrain=True,
        )
Example #3
    def init_from_config(self, config):
        # self.model = Model(config)
        self.model = Transformer(config, config.test.devices)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.train.logdir))

        self.data_reader = DataReader(config)
Example #4
    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        if is_debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)
Example #5
    def init_from_config(self, config):

        logger = logging.getLogger('')

        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        # Print the number of total parameters
        print_num_of_total_parameters()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)
Example #6
    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config)

        # Restore model.
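        # If the latest checkpoint cannot be loaded, roll back to the previous
        # saved version and retry the restore.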
        try:
            tf.train.Saver().restore(
                self.sess, tf.train.latest_checkpoint(config.model_dir))
        except tf.errors.NotFoundError:
            roll_back_to_previous_version(config)
            tf.train.Saver().restore(
                self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)
Example #7
    def __init__(self, options):
        """Gonna need a db, and some creds."""
        log.info("Starting AG Chatter Bot.")
        self.options = options
        # Build Constructors
        self.idx2word = Database(
                host=options.redis_host, pass_=options.redis_pass, db=0
            )
        self.word2idx = Database(
                host=options.redis_host, pass_=options.redis_pass, db=1
            )
        self.dataReader = DataReader(
                self.options, self.idx2word, self.word2idx
            )
        self.model = Model(
                self.options
            )
        log.debug(options)
        log.info("Init complete.")
Example #8
class Seq2SeqTester:
    def __init__(self, model, output_name):
        self.model = model
        self.datareader = DataReader()
        self.metrics = ErrorMetrics()
        self.output_name = output_name

    def test(self, src1, src2, tgt):
        if tgt:
            data = self.datareader.read_parallel_data(self.model, src1, src2,
                                                      tgt)
            output_name = "{}_{}".format(self.output_name, src1.split("/")[-1])
            cer, wer = self.metrics.get_average_cer(
                self.model,
                data,
                output_file=open("{}.output".format(output_name),
                                 "w",
                                 encoding="utf-8"),
                write_pgens=False,
            )
            with open("{}.metrics".format(output_name), "w") as output_file:
                output_file.write("TEST CER: %0.4f\n" % (cer))
                output_file.write("TEST WER: %0.4f\n" % (wer))
        else:
            output_file = open(
                "{}_{}.output".format(self.output_name,
                                      src1.split("/")[-1]),
                "w",
                encoding="utf8",
            )
            data = self.datareader.read_test_data(self.model, src1, src2)
            for src1, src2 in data:
                if len(src1) == 0 or len(src2) == 0:
                    output_file.write("\n")
                    continue
                dy.renew_cg()
                output, _ = self.model.generate_beam(src1, src2)
                output_file.write(str(output) + "\n")
            output_file.close()
Example #9
class Chatter(object):
    """Chatter App."""

    def __init__(self, options):
        """Gonna need a db, and some creds."""
        log.info("Starting AG Chatter Bot.")
        self.options = options
        # Build Constructors
        self.idx2word = Database(
                host=options.redis_host, pass_=options.redis_pass, db=0
            )
        self.word2idx = Database(
                host=options.redis_host, pass_=options.redis_pass, db=1
            )
        self.dataReader = DataReader(
                self.options, self.idx2word, self.word2idx
            )
        self.model = Model(
                self.options
            )
        log.debug(options)
        log.info("Init complete.")

    def sanity(self):
        """This kind of thing should be standardized."""
        log.info("Starting Sanity Check")
        key = "stuff"
        value = "morestuff"
        self.idx2word.write_data(key, value)
        new_value = self.idx2word.read_data(key)
        assert value == new_value
        log.debug("Passed Sanity Check")
        return True

    def main(self):
        """This kind of thing should be standardized."""
        if self.sanity():
            # Add the path to files in the config.yaml
            dataset = self.dataReader.make_buckets()
            print(dataset)
            return True
        return False
Example #10
logfile = sys.argv[6]

ckpt_file = "ckpt"

x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)

sess = tf.InteractiveSession()

# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)

# Get data reader
data_reader = DataReader(train_dataset_dir,
                         batch_size=batch_size,
                         file_names=False,
                         resize_to=(224, 224))

tf.add_to_collection('x', x)
tf.add_to_collection('y', y)

tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)

# Initialize variables first, so a restored checkpoint is not overwritten
# by the initializer afterwards.
sess.run(tf.global_variables_initializer())

ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
Example #11
def train(config, num_epoch, last_pretrain_model_dir, pretrain_model_dir,
          model_dir, block_idx_enc, block_idx_dec):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    config.num_blocks_enc = block_idx_enc
    config.num_blocks_dec = block_idx_dec
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(
    #         block_idx - 1) + '|' + 'encoder/src_embedding' + '|' + 'decoder/dst_embedding'
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(
    #         block_idx - 1)
    logger.info("config.num_blocks_enc=" + str(config.num_blocks_enc) +
                ",config.num_blocks_dec=" + str(config.num_blocks_dec) +
                ',config.train.var_filter=' + str(config.train.var_filter))
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(pretrain_model_dir,
                                           graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(last_pretrain_model_dir):
            available_vars = available_variables_without_global_step(
                last_pretrain_model_dir)
            # available_vars = available_variables(last_pretrain_model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(
                    sess, tf.train.latest_checkpoint(last_pretrain_model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(
            **config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            feat_batch, target_batch = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run([
                model.global_step, model.learning_rate, model.loss,
                model.train_op
            ],
                                         feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                logger.info('pretrain summary_writer...')
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
                summary_writer.flush()
            return step, lr, loss

        def maybe_save_model(model_dir, is_save_global_step=True):
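            # When is_save_global_step is False, every variable except the global
            # step is saved (presumably so a later run restarts its own step count).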
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(
                **config.dev) if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = model_dir + '/pretrain_model_step_{}'.format(step)

                # model.saver.save(sess, mp)
                if is_save_global_step:
                    model.saver.save(sess, mp)
                else:
                    variables_without_global_step = global_variables_without_global_step(
                    )
                    saver = tf.train.Saver(
                        var_list=variables_without_global_step, max_to_keep=10)
                    saver.save(sess, mp)

                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, num_epoch + 1):
            for batch in data_reader.get_training_batches_with_buckets():
                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'
                    .format(epoch, step, lr, loss,
                            time.time() - start_time))

                if config.train.num_steps and step >= config.train.num_steps:
                    break

            # Early stop
            if toleration <= 0:
                break

        maybe_save_model(pretrain_model_dir)
        if model_dir:
            maybe_save_model(model_dir, False)
        logger.info("Finish pretrain block_idx_enc=" + str(block_idx_enc) +
                    ',block_idx_dec=' + str(block_idx_dec))
Example #12
    def init_from_frozen_graphdef(self, config):
        frozen_graph_path = os.path.join(config.model_dir,
                                         'freeze_graph_test.py')
        # If the file doesn't exist, create it.
        if not os.path.exists(frozen_graph_path):
            logging.warning(
                'The frozen graph does not exist; use \'init_from_config\' instead '
                'and create a frozen graph for next use.')
            self.init_from_config(config)
            saver = tf.train.Saver()
            save_dir = '/tmp/graph-{}'.format(os.getpid())
            os.mkdir(save_dir)
            save_path = '{}/ckpt'.format(save_dir)
            saver.save(sess=self.sess, save_path=save_path)

            with tf.Session(graph=tf.Graph()) as sess:
                clear_devices = True
                output_node_names = ['loss_sum', 'predictions']
                # We import the meta graph in the current default Graph
                saver = tf.train.import_meta_graph(save_path + '.meta',
                                                   clear_devices=clear_devices)

                # We restore the weights
                saver.restore(sess, save_path)

                # We use a built-in TF helper to export variables to constants
                output_graph_def = tf.graph_util.convert_variables_to_constants(
                    sess,  # The session is used to retrieve the weights
                    tf.get_default_graph().as_graph_def(
                    ),  # The graph_def is used to retrieve the nodes
                    output_node_names  # The output node names are used to select the useful nodes
                )

                # Finally we serialize and dump the output graph to the filesystem
                with tf.gfile.GFile(frozen_graph_path, "wb") as f:
                    f.write(output_graph_def.SerializeToString())
                    logging.info("%d ops in the final graph." %
                                 len(output_graph_def.node))

                # Remove temp files.
                os.system('rm -rf ' + save_dir)
        else:
            sess_config = tf.ConfigProto()
            sess_config.gpu_options.allow_growth = True
            sess_config.allow_soft_placement = True
            self.sess = tf.Session(config=sess_config)
            self.data_reader = DataReader(config)

            # We load the protobuf file from the disk and parse it to retrieve the
            # unserialized graph_def
            with tf.gfile.GFile(frozen_graph_path, "rb") as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())

            # Import the graph_def into the current default graph.
            tf.import_graph_def(graph_def)
            graph = tf.get_default_graph()
            self.model = AttrDict()

            def collect_placeholders(prefix):
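                # Collect the numbered placeholder tensors (import/<prefix>_0:0,
                # import/<prefix>_1:0, ...) until a missing index raises KeyError.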
                ret = []
                idx = 0
                while True:
                    try:
                        ret.append(
                            graph.get_tensor_by_name('import/{}_{}:0'.format(
                                prefix, idx)))
                        idx += 1
                    except KeyError:
                        return tuple(ret)

            self.model['src_pls'] = collect_placeholders('src_pl')
            self.model['dst_pls'] = collect_placeholders('dst_pl')
            self.model['predictions'] = graph.get_tensor_by_name(
                'import/predictions:0')
Example #13
    "pert_id": ['BRD-U41416256', 'BRD-U60236422'],
    "pert_type": ["trt_cp"],
    "cell_id": ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC'],
    "pert_idose":
    ["0.04 um", "0.12 um", "0.37 um", "1.11 um", "3.33 um", "10.0 um"]
}

# check cuda
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Use GPU: %s" % torch.cuda.is_available())

data = DataReader(drug_file, gene_file, gene_expression_file_train,
                  gene_expression_file_dev, gene_expression_file_test, filter,
                  device)
print('#Train: %d' % len(data.train_feature['drug']))
print('#Dev: %d' % len(data.dev_feature['drug']))
print('#Test: %d' % len(data.test_feature['drug']))

# model creation
model = DeepCE(drug_input_dim=drug_input_dim,
               drug_emb_dim=drug_embed_dim,
               conv_size=conv_size,
               degree=degree,
               gene_input_dim=np.shape(data.gene)[1],
               gene_emb_dim=gene_embed_dim,
               num_gene=np.shape(data.gene)[0],
               hid_dim=hid_dim,
               dropout=dropout,
Example #14
    target_col = len(col_name) - 1

    # ============================================ #
    # Data location
    wd = os.path.dirname(os.path.abspath(__file__)) + '/'
    data_path = wd + 'data/'
    data_path += 'prototype/'
    output_path = wd + 'output/'

    # ============================================ #
    # Read data
    data_files = os.listdir(data_path)
    for i in range(len(data_files)):
        data_files[i] = data_path + data_files[i]

    dr = DataReader(data_files, col_idx)
    ds = DataScaler()
    dp = DataParser()

    print('======== Supplying data ============')
    for file_id in range(len(data_files)):
        dr_tmp = DataReader([data_files[file_id]], col_idx)
        dr_tmp.read(delimiter='\t')
        data = dr_tmp.getData()
        data = parse_data(dp, data, col_name, target_col)
        dr.append(data)

        del data
        del dr_tmp

        print(file_id + 1, ' - ', data_files[file_id], ': ',
Example #15
model_path = sys.argv[5]
logfile = sys.argv[6]

ckpt_file = "ckpt"

x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)

sess = tf.InteractiveSession()

# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)

# Get data reader
data_reader = DataReader(train_dataset_dir, batch_size=batch_size,
                         file_names=False)

tf.add_to_collection('x', x)
tf.add_to_collection('y', y)

tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)

# Initialize variables first, so a restored checkpoint is not overwritten
# by the initializer afterwards.
sess.run(tf.global_variables_initializer())

ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
Example #16
class Evaluator(object):
    """
    Evaluate the model.
    """
    def __init__(self):
        pass

    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        if is_debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        # Restore model.
        self.model.saver.restore(self.sess,
                                 tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)

    def init_from_existed(self, model, sess, data_reader):
        assert model.graph == sess.graph
        self.sess = sess
        self.model = model
        self.data_reader = data_reader

    def beam_search(self, X):
        return self.sess.run(self.model.prediction,
                             feed_dict=expand_feed_dict(
                                 {self.model.src_pls: X}))

    def loss(self, X, Y):
        return self.sess.run(self.model.loss_sum,
                             feed_dict=expand_feed_dict({
                                 self.model.src_pls: X,
                                 self.model.dst_pls: Y
                             }))

    def translate(self, src_path, output_path, batch_size):
        logging.info('Translate %s.' % src_path)
        tmp = output_path + '.tmp'
        fd = codecs.open(tmp, 'w', 'utf8')
        count = 0
        token_count = 0
        start = time.time()
        for X, uttids in self.data_reader.get_test_batches(
                src_path, batch_size):
            Y = self.beam_search(X)
            sents = self.data_reader.indices_to_words(Y)
            assert len(X) == len(sents)
            for sent, uttid in zip(sents, uttids):
                print(uttid + '\t' + sent, file=fd)
            count += len(X)
            token_count += np.sum(np.not_equal(Y, 3))  # 3: </s>
            time_span = time.time() - start
            logging.info(
                '{0} sentences ({1} tokens) processed in {2:.2f} minutes (speed: {3:.4f} sec/token).'
                .format(count, token_count, time_span / 60,
                        time_span / token_count))
        fd.close()
        # Remove BPE markers, if any.
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp, output_path))
        os.remove(tmp)
        logging.info('The result file was saved in %s.' % output_path)

    def ppl(self, src_path, dst_path, batch_size):
        logging.info('Calculate PPL for %s and %s.' % (src_path, dst_path))
        token_count = 0
        loss_sum = 0
        for batch in self.data_reader.get_test_batches_with_target(
                src_path, dst_path, batch_size):
            X, Y = batch
            loss_sum += self.loss(X, Y)
            token_count += np.sum(np.greater(Y, 0))
        # Compute PPL
        ppl = np.exp(loss_sum / token_count)
        logging.info('PPL: %.4f' % ppl)
        return ppl

    def evaluate(self, batch_size, **kargs):
        """Evaluate the model on dev set."""
        src_path = kargs['src_path']
        output_path = kargs['output_path']
        cmd = kargs['cmd'] if 'cmd' in kargs else\
            "perl multi-bleu.perl {ref} < {output} 2>/dev/null | awk '{{print($3)}}' | awk -F, '{{print $1}}'"
        self.translate(src_path, output_path, batch_size)
        # if 'ref_path' in kargs:
        #     ref_path = kargs['ref_path']
        #     bleu = commands.getoutput(cmd.format(**{'ref': ref_path, 'output': output_path}))
        #     logging.info('BLEU: {}'.format(bleu))
        #     return float(bleu)
        # if 'dst_path' in kargs:
        #     self.ppl(src_path, kargs['dst_path'], batch_size)
        return None
Example #17
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next,
        vocab=vocab, max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        predict_prev=args.predict_prev,
        predict_cur=args.predict_cur,
        predict_next=args.predict_next)
    print(model)

    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value,
                                global_step=step)

    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
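        # Accumulate sequence cross-entropy over each decoder head, shifting the
        # targets by one step so position t predicts token t+1.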
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)

        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {},
                   'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True),
                 tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True),
                 tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
Example #18
def location_filter(point_clouds):
    results = {}
    for color in point_clouds:
        results[color] = []
        for x, y in point_clouds[color]:
            if x > 300 and y > 300:
                results[color].append([x, y])
    return results


if __name__ == "__main__":
    target_case = os.listdir(target_folder)
    failure_count = 0
    cannot_fix = 0
    with tqdm(target_case) as t:
        for case_name in t:
            point_clouds = DataReader.parse_annotation(
                os.path.join(target_folder, case_name, file_name))
            stats = {c: len(point_clouds[c]) for c in point_clouds}
            if not landmark_num_checker(point_clouds):
                print("case '{}' is not legal\n{}".format(
                    case_name, json.dumps(point_clouds)))
                new_point_clouds = location_filter(point_clouds)
                if not landmark_num_checker(new_point_clouds):
                    print(
                        "location filtered case '{}' is not legal\n{}".format(
                            case_name, json.dumps(new_point_clouds)))
                    cannot_fix += 1
                failure_count += 1
            t.set_postfix(stats)
    print("Summary")
    print("In folder {}, {} failures found, {} cannot be auto-fixed".format(
        target_folder, failure_count, cannot_fix))
Example #19
def main(unused_argv):
    # Raise an error if any unexpected command-line flags were passed.
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    # Get hyperparameters. We only read a subset of them here; the rest are fed to the Model directly.
    #logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir

    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local

    # make the directory for logs
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        #load or save the DR class from local dir
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        #load or save the id data from local dir
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        #load or save the DR class from global dir
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        #load or save the id data from global dir
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # load and make the data for training
        DR = DataReader(vocab_size=vocab_size,
                        pinyin_dict_path=pinyin_dict_path)
        #input_data,target_data = DR.make_data_from_scratch(file_path = data_file_path,build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path,
            build_dictionary=True,
            max_rows=train_size)
        #save the DR class to local dir
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)

        #save the ids data to local dir
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)

    # make the batch
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # create the model
    model = SpellChecker(hps=FLAGS)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
    epoch = 0.0
    print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
    print('start training...')
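    # Each pass of the loop below covers half an epoch: n_iter_per_epoch uses a
    # batch_size * 2 divisor and `epoch` advances by 0.5, so it runs n_epoch * 2 times.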
    for _ in range(n_epoch * 2):
        epoch += 0.5
        avg_loss = 0.0
        print("----- Epoch {}/{} -----".format(epoch, n_epoch))
        for t in tqdm(range(1, n_iter_per_epoch + 1)):
            batch_full = next(train_data_full)
            src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full

            src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
            src_word_list = np.asarray(src_word_list, dtype=np.int32)
            src_length_list = np.asarray(src_length_list, dtype=np.int32)
            tgt_list = np.asarray(tgt_list, dtype=np.int32)
            keep_ratio = FLAGS.keep_ratio

            #tgt_length_list = np.asarray(tgt_length_list,dtype = np.int32)
            loss = model.train_one_step(src_pinyin_list, src_word_list,
                                        src_length_list, tgt_list, keep_ratio,
                                        sess)
            avg_loss += loss
        avg_loss /= n_iter_per_epoch
        print('the avg_loss is {}'.format(avg_loss))

        if epoch == 1.5:
            print('Build model for serving...')
            model.build_model_for_serving(sess)
            print('Build model serving done!')
Example #20
    col_idx = (1, 2, 3, 4, 5, 6)
    target_col = len(col_name) - 1

    # ============================================ #
    # Data location
    wd = os.path.dirname(os.path.abspath(__file__)) + '/'
    data_path = wd + 'data/'
    output_path = wd + 'output/'

    # ============================================ #
    # Read data
    data_files = os.listdir(data_path)
    for i in range(len(data_files)):
        data_files[i] = data_path + data_files[i]

    dr = DataReader(data_files, col_idx)
    ds = DataScaler()
    dp = DataParser()

    print('======== Supplying data ============')
    dr.read()

    print('======== Extracting data ============')
    # ============================================ #
    # Split data
    X = dr.data[:, :target_col]
    y = dr.data[:, target_col]
    alias = list(np.unique(y))
    y = dp.convertTextTarget(y, alias)
    #dump_result(output_path + 'accidents.csv', np.array(alias), ['accident'])
    print('Accident types: ', alias)
Example #21
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    train_op, loss_op = model.get_train_op(name=None)
    global_saver = tf.train.Saver()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir)

    with tf.Session(config=sess_config) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess, tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(**config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch, loss_op, train_op):
            feed_dict = expand_feed_dict({model.src_pls: batch[0], model.dst_pls: batch[1]})
            step, lr, loss, _ = sess.run(
                [model.global_step, model.learning_rate,
                 loss_op, train_op],
                feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
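            # When dev evaluation is enabled, save a checkpoint while BLEU does not
            # drop (or always if toleration is None); otherwise save unconditionally.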
            global dev_bleu, toleration

            def save():
                mp = config.model_dir + '/model_step_{}'.format(step)
                global_saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)

            if config.train.eval_on_dev:
                new_dev_bleu = evaluator.evaluate(**config.dev)

                summary = tf.Summary(value=[tf.Summary.Value(tag="dev_bleu",
                                                             simple_value=new_dev_bleu)])

                summary_writer.add_summary(summary, step)

                if config.train.toleration is None:
                    save()
                else:
                    if new_dev_bleu >= dev_bleu:
                        save()
                        toleration = config.train.toleration
                        dev_bleu = new_dev_bleu
                    else:
                        toleration -= 1
            else:
                save()

        try:
            step = 0
            for epoch in range(1, config.train.num_epochs+1):
                for batch in data_reader.get_training_batches(epoches=1):

                    # Train normal instances.
                    start_time = time.time()
                    step, lr, loss = train_one_step(batch, loss_op, train_op)
                    logger.info(
                        'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'.
                        format(epoch, step, lr, loss, time.time() - start_time))
                    # Save model
                    if config.train.save_freq > 0 \
                       and step > 0 \
                       and step % config.train.save_freq == 0:
                        maybe_save_model()

                    if config.train.num_steps is not None and step >= config.train.num_steps:
                        raise BreakLoopException("BreakLoop")

                    if toleration is not None and toleration <= 0:
                        raise BreakLoopException("BreakLoop")

                # Save the model once per epoch if config.train.save_freq is less than or equal to zero
                if config.train.save_freq <= 0:
                    maybe_save_model()
        except BreakLoopException as e:
            logger.info(e)

        logger.info("Finish training.")
Example #22
class Evaluator(object):
    """
    Evaluate the model.
    """
    def __init__(self):
        pass

    def init_from_config(self, config):

        logger = logging.getLogger('')

        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        # Print the number of total parameters
        print_num_of_total_parameters()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        # Restore model.
        self.model.saver.restore(self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)

    def init_from_existed(self, model, sess, data_reader):
        assert model.graph == sess.graph
        self.sess = sess
        self.model = model
        self.data_reader = data_reader

    def beam_search(self, X):
        return self.sess.run(self.model.prediction, feed_dict=expand_feed_dict({self.model.src_pls: X}))

    def beam_search_label(self, X, Y, Z, X_lens):
        return self.sess.run([self.model.prediction, self.model.prediction_label], feed_dict=expand_feed_dict({self.model.src_pls: X, self.model.dst_pls: Y, self.model.label_pls: Z, self.model.src_len_pls: X_lens}))

    def loss(self, X, Y):
        return self.sess.run(self.model.loss_sum, feed_dict=expand_feed_dict({self.model.src_pls: X, self.model.dst_pls: Y}))

    def loss_label(self, X, Y, Z):
        return self.sess.run(self.model.loss_sum, feed_dict=expand_feed_dict({self.model.src_pls: X, self.model.dst_pls: Y, self.model.label_pls: Z}))

    def translate(self, src_path, dst_path, lbl_path, output_path, output_label_path, batch_size):
        logging.info('Translate %s.' % src_path)
        _, tmp = mkstemp()
        fd = codecs.open(tmp, 'w', 'utf8')

        _, tmp_label = mkstemp()
        fd_label = codecs.open(tmp_label, 'w', 'utf8')

        count = 0
        token_count = 0
        start = time.time()
        for X, ref, label, src_lens in self.data_reader.get_test_batches_with_target_with_label(src_path, dst_path, lbl_path, batch_size):
            Y, Z = self.beam_search_label(X, ref, label, src_lens)
            sents = self.data_reader.indices_to_words(Y, src_lens)
            assert len(X) == len(sents)
            for sent in sents:
                print(sent, file=fd)
            count += len(X)
            token_count += np.sum(np.not_equal(Y, 3))  # 3: </s>
            time_span = time.time() - start
            logging.info('{0} sentences ({1} tokens) processed in {2:.2f} minutes (speed: {3:.4f} sec/token).'.
                         format(count, token_count, time_span / 60, time_span / token_count))

            # Save the prediction of label
            sents_label = self.data_reader.indices_to_words(Z, src_lens, o='lbl')
            assert len(X) == len(sents_label)
            for sent in sents_label:
                print(sent, file=fd_label)

        fd.close()

        # Remove BPE markers, if any.
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp, output_path))
        os.remove(tmp)
        logging.info('The result file was saved in %s.' % output_path)

        fd_label.close()
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp_label, output_label_path))
        os.remove(tmp_label)
        logging.info('The label file was saved in %s.' % output_label_path)

    def ppl(self, src_path, dst_path, batch_size):
        logging.info('Calculate PPL for %s and %s.' % (src_path, dst_path))
        token_count = 0
        loss_sum = 0
        for batch in self.data_reader.get_test_batches_with_target(src_path, dst_path, batch_size):
            X, Y = batch
            loss_sum += self.loss(X, Y)
            token_count += np.sum(np.greater(Y, 0))
        # Compute PPL
        ppl = np.exp(loss_sum / token_count)
        logging.info('PPL: %.4f' % ppl)
        return ppl

    def fscore(self, lbl_path, output_label_path):
        logging.info('Calculate P/R/F for %s and %s.' % (lbl_path, output_label_path))
        ref_file = codecs.open(lbl_path, 'r', 'utf8')
        pred_file = codecs.open(output_label_path, 'r', 'utf8')

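        # Counts start at 1 rather than 0, which keeps the precision/recall
        # denominators below from ever being zero (presumably a smoothing choice).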
        tp, fp, fn = 1, 1, 1
        err = 0
        # assert len(target) == len(prediction)
        line = 0
        for ref, pred in zip(ref_file, pred_file):
            line += 1
            if len(ref) != len(pred):
                # print(line)
                err += 1
                continue
            for x, y in zip(ref, pred):
                if x == y and x == 'E':
                    tp += 1
                elif y == 'E':
                    fp += 1
                elif x == 'E':
                    fn += 1
                else:
                    pass
        print('tp:{}, fp:{}, fn:{}, err:{}'.format(tp, fp, fn, err))
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        fscore = (2 * precision * recall / (precision + recall))

        ref_file.close()
        pred_file.close()

        logging.info('precision: %.4f' % precision)
        logging.info('recall: %.4f' % recall)
        logging.info('fscore: %.4f' % fscore)
        return precision, recall, fscore

    def evaluate(self, batch_size, **kargs):
        """Evaluate the model on dev set."""
        src_path = kargs['src_path']
        dst_path = kargs['ref_path']
        lbl_path = kargs['label_path']
        output_path = kargs['output_path']
        output_label_path = kargs['output_label_path']
        cmd = kargs['cmd'] if 'cmd' in kargs else\
            "perl multi-bleu.perl {ref} < {output} 2>/dev/null | awk '{{print($3)}}' | awk -F, '{{print $1}}'"
        self.translate(src_path, dst_path, lbl_path, output_path, output_label_path, batch_size)

        if 'dst_path' in kargs:
            self.ppl(src_path, kargs['dst_path'], batch_size)

        # calculate the fscore of label result
        if 'label_path' in kargs:
            precision, recall, f_score = self.fscore(lbl_path, output_label_path)
            return float(f_score)

        return None
Example #23
def main(unused_argv):
    # Raise an error if any unexpected command-line flags were passed.
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)
    
    # Get hyperparameters. We only read a subset of them here; the rest are fed to the Model directly.
    #logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir
    
    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local
        
    
    # make the directory for logs
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        #load or save the DR class from local dir
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        #load or save the id data from local dir
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        #load or save the DR class from global dir
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        #load or save the id data from global dir
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # load and make the data for training
        DR = DataReader(vocab_size=vocab_size, pinyin_dict_path=pinyin_dict_path)
        #input_data,target_data = DR.make_data_from_scratch(file_path = data_file_path,build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path, build_dictionary=True, max_rows=train_size)
        #save the DR class to local dir
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)

        #save the ids data to local dir
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)
    
    # make the batch
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # create the model
    model = SpellChecker(hps=FLAGS)

    
    # create the supervisor
    with model.graph.as_default():
        # print the variables of tensorflow
        print("Number of sets of parameters: {}".format(len(tf.trainable_variables())))
        print("Number of parameters: {}".format(
                np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()])))
        for v in tf.trainable_variables():
            print(v)

        sv = tf.train.Supervisor(logdir=log_root,
                                 saver=model.saver,
                                 summary_op=None,
                                 save_model_secs=60,
                                 global_step=model.global_step,
                                 init_op=model.init_op)  # Do not run the summary service


        # train the model 
        with sv.managed_session() as sess:
            n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
            epoch = 0.0
            print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
            print('start training...')     
            for _ in range(n_epoch * 2):
                epoch += 0.5
                avg_loss = 0.0
                print("----- Epoch {}/{} -----".format(epoch, n_epoch))
                for t in tqdm(range(1, n_iter_per_epoch + 1)):
                    batch_full = next(train_data_full)
                    src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full
                    
                    #if epoch == 0.5:
                        #print(src_list[1])
                        #print(len(src_list[1]))
                        #print(src_length_list[1])
                        #print(tgt_list[1])
                        #print(len(tgt_list[1]))
                        #print(tgt_length_list[1])
                    
                    src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
                    src_word_list = np.asarray(src_word_list, dtype=np.int32)
                    src_length_list = np.asarray(src_length_list, dtype=np.int32)
                    tgt_list = np.asarray(tgt_list, dtype=np.int32)
                    #tgt_length_list = np.asarray(tgt_length_list,dtype = np.int32)
                    loss = model.train_one_step(src_pinyin_list, src_word_list,
                                                src_length_list, tgt_list, sess)
                    avg_loss += loss
                avg_loss /= n_iter_per_epoch
                print('the avg_loss is {}'.format(avg_loss))
Example #24
class Seq2SeqTrainer:
    def __init__(self, model, output_name=None):
        self.model = model
        self.datareader = DataReader()
        self.metrics = ErrorMetrics()
        self.output_name = output_name

    def train(
        self,
        train_data,
        val_data,
        epochs=EPOCHS,
        patience=PATIENCE,
        pretrain=False,
        minibatch_size=1,
    ):
        trainer = dy.SimpleSGDTrainer(self.model.model)

        logging.info("Training data length: %d" % len(train_data))
        logging.info("Validation data length: %d" % len(val_data))

        best_val_epoch = 0  # ensures the patience check below always has a value
        for e in range(epochs):
            start_time = time.time()
            logging.info("Epoch: %d" % e)
            epoch_loss = 0.0
            random.shuffle(train_data)

            for i in range(0, len(train_data), minibatch_size):
                cur_size = min(minibatch_size, len(train_data) - i)
                losses = []
                dy.renew_cg()
                for (src1, src2, tgt) in train_data[i:i + cur_size]:
                    losses.append(self.model.get_loss(src1, src2, tgt))
                batch_loss = dy.esum(losses)
                batch_loss.backward()
                trainer.update()
                epoch_loss += batch_loss.scalar_value()
            logging.info("Epoch loss: %0.4f" % (epoch_loss / len(train_data)))

            if not pretrain:
                cur_cer, cur_wer = self.metrics.get_average_cer(
                    self.model, val_data, output_file=None, write_pgens=False)
                if cur_cer < self.model.best_val_cer:
                    self.model.save()
                    self.model.best_val_cer = cur_cer
                    best_val_epoch = e
                    logging.info(
                        "Model saved at epoch: {}".format(best_val_epoch))
                logging.info("VAL CER: %0.4f" % (cur_cer))
                logging.info("VAL WER: %0.4f" % (cur_wer))
                if cur_cer == 0:
                    logging.info("Validation CER is zero. End training.")
                    break

            logging.info("--- %s seconds ---" % (time.time() - start_time))
            logging.info("\n")

            if not pretrain:
                if e - best_val_epoch > patience:
                    logging.info("Patience reached. End training.")
                    break

    def train_model(self, train_src1, train_src2, train_tgt, val_src1,
                    val_src2, val_tgt):
        train_data = self.datareader.read_parallel_data(
            self.model, train_src1, train_src2, train_tgt)
        val_data = self.datareader.read_parallel_data(self.model, val_src1,
                                                      val_src2, val_tgt)
        self.train(train_data, val_data)
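
# Usage sketch for Seq2SeqTrainer (illustrative only; the model constructor and
# file paths below are assumptions). Any DyNet-backed model that exposes
# get_loss(src1, src2, tgt), save(), and a best_val_cer attribute fits the
# interface used by train() above.
#
#   model = Seq2SeqModel(...)  # hypothetical model class
#   trainer = Seq2SeqTrainer(model, output_name="run1")
#   trainer.train_model("train.src1", "train.src2", "train.tgt",
#                       "dev.src1", "dev.src2", "dev.tgt")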
Example #25
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir, graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables in disk.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess,
                              tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(
            **config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            feat_batch, target_batch, batch_size = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run([
                model.global_step, model.learning_rate, model.loss,
                model.train_op
            ],
                                         feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(
                **config.dev) if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = config.model_dir + '/model_step_{}'.format(step)
                model.saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, config.train.num_epochs + 1):
            for batch in data_reader.get_training_batches_with_buckets():

                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}\tbatch_size: {5}'
                    .format(epoch, step, lr, loss,
                            time.time() - start_time, batch[2]))
                # Save model
                if config.train.save_freq > 0 and step % config.train.save_freq == 0:
                    maybe_save_model()

                if config.train.num_steps and step >= config.train.num_steps:
                    break

            # Save model once per epoch if config.train.save_freq is less than or equal to zero
            if config.train.save_freq <= 0:
                maybe_save_model()

            # Early stop
            if toleration <= 0:
                break
        logger.info("Finish training.")
def build_data_loader(args, char_dict, intent_dict):
    """[decorate samples for dataloader]
    
    Arguments:
        args {[type]} -- [description]
        char_dict {[type]} -- [description]
        intent_dict {[type]} -- [description]
    
    Returns:
        [type] -- [description]
    """
    loader_res = {}
    if args.do_train:
        train_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        train_data_generator = train_processor.prepare_data(
            data_path=args.data_dir + "train.txt",
            batch_size=args.batch_size,
            mode='train')
        loader_res["train_data_generator"] = train_data_generator
        num_train_examples = train_processor._get_num_examples()
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Num train steps: %d" % (math.ceil(num_train_examples * 1.0 / args.batch_size) * \
                                            args.epoch // DEV_COUNT))
        if math.ceil(
                num_train_examples * 1.0 / args.batch_size) // DEV_COUNT <= 0:
            logger.error(
                "Number of train steps is zero or negative; exiting.")
            exit(1)
    if args.do_eval:
        eval_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        eval_data_generator = eval_processor.prepare_data(
            data_path=args.data_dir + "eval.txt",
            batch_size=args.batch_size,
            mode='eval')
        loader_res["eval_data_generator"] = eval_data_generator
        num_eval_examples = eval_processor._get_num_examples()
        logger.info("Num eval examples: %d" % num_eval_examples)
    if args.do_test:
        test_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        test_data_generator = test_processor.prepare_data(
            data_path=args.data_dir + "test.txt",
            batch_size=args.batch_size,
            mode='test')
        loader_res["test_data_generator"] = test_data_generator
    return loader_res
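
# Usage sketch for build_data_loader (argument names follow the attributes read
# above; the concrete values and dictionaries are assumptions):
#
#   from argparse import Namespace
#   args = Namespace(do_train=True, do_eval=True, do_test=False,
#                    data_dir='data/', batch_size=32, max_seq_len=128, epoch=10)
#   loaders = build_data_loader(args, char_dict, intent_dict)
#   train_generator = loaders['train_data_generator']
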
 def __init__(self, model, output_name):
     self.model = model
     self.datareader = DataReader()
     self.metrics = ErrorMetrics()
     self.output_name = output_name
                    M, C_pc, C_rpc, p, q, 
                    loss_fun='square_loss', alpha=0.2)
            H = - np.sum(dP * np.log(dP + 1e-5))
            # print(k, H)
            P = P + dP
            # print(k, np.argmax(dP, axis=1))
        re_ranking[k] = np.argmax(P, axis=1).tolist()

    return re_ranking


if __name__ == '__main__':
    golden_annotations = defaultdict(list)
    for case in golden_cases:
        file = os.path.join(golden_train_dir, case, "annotation.txt")
        annotation = DataReader.parse_annotation(file)
        for k, v in annotation.items():
            golden_annotations[k].append(np.asarray(v))
    t = tqdm(train_cases)
    for case in t:
        raw_annotation = os.path.join(train_dir, case, "annotation.txt")
        annotation = {k: np.asarray(v)
            for k, v in DataReader.parse_annotation(raw_annotation).items()}
        re_ranking = align_annotation(
            golden_annotations, annotation, align_method='FGW')
        if re_ranking:
            re_annotation = {k: np.asarray([v[i] for i in re_ranking[k]])
                            for k, v in annotation.items()}
            save_refined_annotation(sorted_train_dir, case, re_annotation)
        else:
            fail_list.write(case + "\n")
Example #29
class Evaluator(object):
    """
    Evaluate the model.
    """
    def __init__(self):
        pass

    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config)

        # Restore model.
        try:
            tf.train.Saver().restore(
                self.sess, tf.train.latest_checkpoint(config.model_dir))
        except tf.errors.NotFoundError:
            roll_back_to_previous_version(config)
            tf.train.Saver().restore(
                self.sess, tf.train.latest_checkpoint(config.model_dir))

        self.data_reader = DataReader(config)

    def init_from_frozen_graphdef(self, config):
        frozen_graph_path = os.path.join(config.model_dir,
                                         'freeze_graph_test.py')
        # If the file doesn't exist, create it.
        if not os.path.exists(frozen_graph_path):
            logging.warning(
                'The frozen graph does not exist; using \'init_from_config\' instead '
                'and creating a frozen graph for next use.')
            self.init_from_config(config)
            saver = tf.train.Saver()
            save_dir = '/tmp/graph-{}'.format(os.getpid())
            os.mkdir(save_dir)
            save_path = '{}/ckpt'.format(save_dir)
            saver.save(sess=self.sess, save_path=save_path)

            with tf.Session(graph=tf.Graph()) as sess:
                clear_devices = True
                output_node_names = ['loss_sum', 'predictions']
                # We import the meta graph in the current default Graph
                saver = tf.train.import_meta_graph(save_path + '.meta',
                                                   clear_devices=clear_devices)

                # We restore the weights
                saver.restore(sess, save_path)

                # We use a built-in TF helper to export variables to constants
                output_graph_def = tf.graph_util.convert_variables_to_constants(
                    sess,  # The session is used to retrieve the weights
                    tf.get_default_graph().as_graph_def(
                    ),  # The graph_def is used to retrieve the nodes
                    output_node_names  # The output node names are used to select the useful nodes
                )

                # Finally we serialize and dump the output graph to the filesystem
                with tf.gfile.GFile(frozen_graph_path, "wb") as f:
                    f.write(output_graph_def.SerializeToString())
                    logging.info("%d ops in the final graph." %
                                 len(output_graph_def.node))

                # Remove temp files.
                os.system('rm -rf ' + save_dir)
        else:
            sess_config = tf.ConfigProto()
            sess_config.gpu_options.allow_growth = True
            sess_config.allow_soft_placement = True
            self.sess = tf.Session(config=sess_config)
            self.data_reader = DataReader(config)

            # We load the protobuf file from the disk and parse it to retrieve the
            # unserialized graph_def
            with tf.gfile.GFile(frozen_graph_path, "rb") as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())

            # Import the graph_def into the current default graph.
            tf.import_graph_def(graph_def)
            graph = tf.get_default_graph()
            self.model = AttrDict()

            def collect_placeholders(prefix):
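                # Gather 'import/<prefix>_<idx>:0' tensors until the name lookup fails.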
                ret = []
                idx = 0
                while True:
                    try:
                        ret.append(
                            graph.get_tensor_by_name('import/{}_{}:0'.format(
                                prefix, idx)))
                        idx += 1
                    except KeyError:
                        return tuple(ret)

            self.model['src_pls'] = collect_placeholders('src_pl')
            self.model['dst_pls'] = collect_placeholders('dst_pl')
            self.model['predictions'] = graph.get_tensor_by_name(
                'import/predictions:0')

    def init_from_existed(self, model, sess, data_reader):
        self.sess = sess
        self.model = model
        self.data_reader = data_reader

    def beam_search(self, X):
        return self.sess.run(self.model.predictions,
                             feed_dict=expand_feed_dict(
                                 {self.model.src_pls: X}))

    def loss(self, X, Y):
        return self.sess.run(self.model.loss_sum,
                             feed_dict=expand_feed_dict({
                                 self.model.src_pls: X,
                                 self.model.dst_pls: Y
                             }))

    def translate(self, src_path, output_path, batch_size):
        logging.info('Translate %s.' % src_path)
        _, tmp = mkstemp()
        fd = codecs.open(tmp, 'w', 'utf8')
        count = 0
        token_count = 0
        epsilon = 1e-6
        start = time.time()
        for X in self.data_reader.get_test_batches(src_path, batch_size):
            Y = self.beam_search(X)
            Y = Y[:len(X)]
            sents = self.data_reader.indices_to_words(Y)
            assert len(X) == len(sents)
            for sent in sents:
                print(sent, file=fd)
            count += len(X)
            token_count += np.sum(np.not_equal(Y, 3))  # 3: </s>
            time_span = time.time() - start
            logging.info(
                '{0} sentences ({1} tokens) processed in {2:.2f} minutes (speed: {3:.4f} sec/token).'
                .format(count, token_count, time_span / 60,
                        time_span / (token_count + epsilon)))
        fd.close()
        # Remove BPE markers, if any.
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp, output_path))
        os.remove(tmp)
        logging.info('The result file was saved in %s.' % output_path)

    def ppl(self, src_path, dst_path, batch_size):
        logging.info('Calculate PPL for %s and %s.' % (src_path, dst_path))
        token_count = 0
        loss_sum = 0
        for batch in self.data_reader.get_test_batches_with_target(
                src_path, dst_path, batch_size):
            X, Y = batch
            loss_sum += self.loss(X, Y)
            token_count += np.sum(np.greater(Y, 0))
        # Compute PPL
        ppl = np.exp(loss_sum / token_count)
        logging.info('PPL: %.4f' % ppl)
        return ppl

    def evaluate(self, batch_size, **kargs):
        """Evaluate the model on dev set."""
        src_path = kargs['src_path']
        output_path = kargs['output_path']
        cmd = kargs['cmd'] if 'cmd' in kargs else\
            "perl multi-bleu.perl {ref} < {output} 2>/dev/null | awk '{{print($3)}}' | awk -F, '{{print $1}}'"
        cmd = cmd.strip()
        logging.info('Evaluation command: ' + cmd)
        self.translate(src_path, output_path, batch_size)
        bleu = None
        if 'ref_path' in kargs:
            ref_path = kargs['ref_path']
            try:
                bleu = commands.getoutput(
                    cmd.format(**{
                        'ref': ref_path,
                        'output': output_path
                    }))
                bleu = float(bleu)
            except ValueError as e:
                logging.warning(
                    'An error was raised when calculating BLEU: {}'.format(e))
                bleu = 0
            logging.info('BLEU: {}'.format(bleu))
        if 'dst_path' in kargs:
            self.ppl(src_path, kargs['dst_path'], batch_size)
        return bleu
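
# Usage sketch for the Evaluator above (illustrative only; the file names are
# assumptions, the keyword arguments mirror those read in evaluate()). Passing
# dst_path as well triggers the perplexity computation, i.e. exp of the summed
# loss divided by the token count.
#
#   evaluator = Evaluator()
#   evaluator.init_from_config(config)
#   bleu = evaluator.evaluate(batch_size=32,
#                             src_path='dev.src',
#                             ref_path='dev.ref',
#                             output_path='dev.out')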
Example #30
    def train(self, model, save=False, make_chart=False):
        """
		Trains an input model. Makes Calculations, Charts, and Saves
		the model if necessary.

		Parameters
		----------
		model:     SKLearn Model The regression model to use
		save:      Boolean Whether or not the model should be saved
		make_chart Boolean Whether or not to make/save a chart

		Returns
		-------
		float, float, float: The Average CV Mean Squared Error, Mean Absolute Error, and Test MSE 
		"""
        #get/split data
        reader = DataReader()
        df = reader.create_input_data()
        df = self.preprocess(df)
        self.X_train, self.X_test, self.y_train, self.y_test = self.split_data(
            df)

        parameters = {
            'n_estimators': [1, 5, 10, 20, 30],
            'max_depth': [1, 5, 10]
        }
        rf = RandomForestRegressor()
        self.model = GridSearchCV(rf, parameters, cv=10)
        #train model
        self.model.fit(self.X_train, self.y_train)

        #Feature importance
        importances = self.model.best_estimator_.feature_importances_
        cols = self.X_train.columns
        for i in range(len(importances)):
            print(cols[i], importances[i])

        if save:
            joblib.dump(self.model.best_estimator_,
                        "../models/" + self.name + "_2017.joblib")

        print("------------------------")
        MSEs = cross_val_score(estimator=self.model,
                               X=self.X_train,
                               y=self.y_train,
                               scoring='neg_mean_squared_error',
                               cv=8)

        predicted = self.model.predict(self.X_test)
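        # cross_val_score with 'neg_mean_squared_error' returns negated MSEs,
        # hence the abs() of the mean below.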
        print("Average CV Mean Squared Error: ", abs(np.mean(MSEs)))
        print(
            "Testing Mean Absolute Error: ",
            mean_absolute_error(self.y_test, self.model.predict(self.X_test)))
        print("Testing MSE: ", mean_squared_error(self.y_test, predicted))
        #print(self.model.feature_importances_)
        if make_chart:
            print("Generating Chart...")
            plt.style.use('dark_background')
            fig, ax = plt.subplots(nrows=1, ncols=1)
            ax.set_ylabel('HDI')
            ax.set_xlabel("Municipality Codmun ID")
            ax.set_title(self.name + ' Real vs Predicted')
            green, = ax.plot(np.arange(20),
                             self.y_test[0:100:5],
                             'g',
                             label='True')
            red, = ax.plot(np.arange(20),
                           predicted[0:100:5],
                           'r',
                           label='Predicted')
            ax.set_xticks(np.arange(20))
            x_labels = self.X_test.iloc[0:100:5]['codmun'].tolist()
            ax.set_xticklabels([str(int(y)) for y in x_labels],
                               rotation='vertical')
            plt.legend(handles=[green, red], labels=["True", "Predicted"])
            plt.tight_layout()
            fig.savefig(self.name + "_real_v_predicted")
            for x in range(0, 100, 5):
                print(predicted[x], x_labels[int(x / 5)])
            print(x_labels, predicted[0:100:5])

        return np.mean(MSEs), mean_absolute_error(
            self.y_test, self.model.predict(self.X_test)), mean_squared_error(
                self.y_test, predicted)
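
# Usage sketch (the enclosing class is not shown in this snippet; the name and
# constructor below are assumptions). Note that the model argument is currently
# overridden internally by a GridSearchCV over a RandomForestRegressor.
#
#   trainer = HDIRegressionTrainer(name="brazil_hdi")  # hypothetical class
#   cv_mse, test_mae, test_mse = trainer.train(model=None, save=True,
#                                              make_chart=False)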