Example #1
    def __init_model(self, entry):
        if entry == 'train':
            self.train_manager = NERDataset(model_path=self.model_path,
                                            data_path='data/ner_train.txt',
                                            data_type='train',
                                            tags=self.tags,
                                            max_len=self.embedding_size,
                                            batch_size=self.batch_size)
            self.train_manager.dump_data_map()
            self.total_size = (len(self.train_manager) + self.batch_size -
                               1) // self.batch_size
            dev_manager = NERDataset(model_path=self.model_path,
                                     data_path='data/ner_test.txt',
                                     data_type='dev',
                                     tags=self.tags,
                                     max_len=self.embedding_size,
                                     batch_size=self.batch_size)
            self.dev_batch = dev_manager.batch_iter()

            self.model = BiLSTMCRF(
                self.device,
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == 'predict':
            data_map = self.load_params()
            self.tag_map = data_map.get('tag_map')
            self.vocab = data_map.get('vocab')
            self.model = BiLSTMCRF(self.device,
                                   tag_map=self.tag_map,
                                   vocab_size=len(self.vocab),
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        self.model.to(self.device)
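
This method reappears in full context in Example #6, where it belongs to an NERModel class constructed as NERModel(device, entry). A minimal driver for the two entry modes (a sketch, assuming that surrounding class and a standard PyTorch install):

import torch

# 'train' builds the datasets and the BiLSTM-CRF from scratch; 'predict'
# instead reloads the saved tag map and vocabulary before restoring weights.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ner = NERModel(device, entry='train')
ner.train()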
Example #2
def predict(token_vocab, target_vocab, sent):
    os.environ[
        'CUDA_VISIBLE_DEVICES'] = '-1'  # force CPU-only execution for prediction
    model_dir = "./trained_models"

    # prepare sentence conversion: turn the raw sentence into id data
    pred_data = N2NTextData(sent, mode='sentence')
    pred_id_data = N2NConverter.convert(pred_data, target_vocab, token_vocab)
    pred_data_set = NERDataset(pred_id_data, 1, 128)
    a_batch_data = next(pred_data_set.predict_iterator)  # a single prediction batch
    b_nes_id, b_token_ids, b_weight = a_batch_data

    # Restore the graph.
    # Note that frozen_graph.tf.pb contains the graph definition with parameter values in binary format.
    _graph_fn = os.path.join(model_dir, 'frozen_graph.tf.pb')
    with tf.gfile.GFile(_graph_fn, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)

    with tf.Session(graph=graph) as sess:
        # to check load graph
        #for n in tf.get_default_graph().as_graph_def().node: print(n.name)

        # make interface for input
        pl_token = graph.get_tensor_by_name('import/model/pl_tokens:0')
        pl_weight = graph.get_tensor_by_name('import/model/pl_weight:0')
        pl_keep_prob = graph.get_tensor_by_name('import/model/pl_keep_prob:0')

        # make interface for output
        step_out_preds = graph.get_tensor_by_name(
            'import/model/step_out_preds:0')
        step_out_probs = graph.get_tensor_by_name(
            'import/model/step_out_probs:0')

        # predict sentence
        b_best_step_pred_indexs, b_step_pred_probs = sess.run(
            [step_out_preds, step_out_probs],
            feed_dict={
                pl_token: b_token_ids,
                pl_weight: b_weight,
                pl_keep_prob: 1.0,
            })
        best_step_pred_indexs = b_best_step_pred_indexs[0]
        step_pred_probs = b_step_pred_probs[0]

        step_best_targets = []
        step_best_target_probs = []
        for time_step, best_pred_index in enumerate(best_step_pred_indexs):
            _target_class = target_vocab.get_symbol(best_pred_index)
            step_best_targets.append(_target_class)
            _prob = step_pred_probs[time_step][best_pred_index]
            step_best_target_probs.append(_prob)

        for idx, char in enumerate(list(sent)):
            print('{}\t{}\t{}'.format(char, step_best_targets[idx],
                                      step_best_target_probs[idx]))
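
tf.gfile.GFile, tf.GraphDef, and tf.Session are TensorFlow 1.x APIs, so on a TensorFlow 2.x install this snippet only runs through the compatibility layer. A minimal shim (an assumption about the environment, not part of the original code):

# Run the TF1-style graph code above under TensorFlow 2.x.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restores graph-mode Session semantics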
Example #3
def train(train_id_data, num_vocabs, num_target_class):
    #
    # train the NER model on the given train_id_data
    #
    max_epoch = 300
    model_dir = "./trained_models"
    hps = NER.get_default_hparams()
    hps.update(batch_size=100,
               num_steps=128,
               emb_size=50,
               enc_dim=100,
               vocab_size=num_vocabs,
               num_target_class=num_target_class)

    with tf.variable_scope("model"):
        model = NER(hps, "train")

    sv = tf.train.Supervisor(is_chief=True,
                             logdir=model_dir,
                             summary_op=None,
                             global_step=model.global_step)

    # tf assign compatible operators for gpu and cpu
    tf_config = tf.ConfigProto(allow_soft_placement=True)

    with sv.managed_session(config=tf_config) as sess:
        local_step = 0
        prev_global_step = sess.run(model.global_step)

        train_data_set = NERDataset(train_id_data, hps.batch_size,
                                    hps.num_steps)
        losses = []
        while not sv.should_stop():
            fetches = [model.global_step, model.loss, model.train_op]
            a_batch_data = next(train_data_set.iterator)
            y, x, w = a_batch_data
            fetched = sess.run(
                fetches, {
                    model.x: x,
                    model.y: y,
                    model.w: w,
                    model.keep_prob: hps.keep_prob,
                })

            local_step += 1

            _global_step = fetched[0]
            _loss = fetched[1]
            losses.append(_loss)
            if local_step < 10 or local_step % 10 == 0:
                epoch = train_data_set.get_epoch_num()
                print("Epoch = {:3d} Step = {:7d} loss = {:5.3f}".format(
                    epoch, _global_step, np.mean(losses)))
                losses = []  # reset the running loss after logging
                if epoch >= max_epoch: break

        print("Training is done.")
    sv.stop()

    # model.out_pred, model.out_probs
    freeze_graph(
        model_dir, "model/step_out_preds,model/step_out_probs",
        "frozen_graph.tf.pb")  ## freeze graph with params to probobuf format


if __name__ == '__main__':
    train_id_data, token_vocab, target_vocab = load_data()
    num_vocabs = token_vocab.get_num_tokens()
    num_target_class = target_vocab.get_num_targets()

    train_data_set = NERDataset(train_id_data, 5, 128)
    train(train_id_data, num_vocabs, num_target_class)

    # Korean sample sentences for NER prediction
    predict(token_vocab, target_vocab,
            '의정지기단은 첫 사업으로 45 명 시의원들의 선거 공약을 수집해 개인별로 카드를 만들었다.')
    predict(token_vocab, target_vocab,
            '한국소비자보호원은 19일 시판중인 선물세트의 상당수가 과대 포장된 것으로 드러났다고 밝혔다.')
Example #5
        # Randomly initialized initial (h, c) states for an LSTM.
        return (Variable(
            weight.new(self.n_layers, batch_size, self.hidden_dim).uniform_()),
                Variable(
                    weight.new(self.n_layers, batch_size,
                               self.hidden_dim).uniform_()))


if __name__ == "__main__":

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 100
    BATCH_SIZE = 256

    vocab = build_vocab('data')
    word_vocab, label_vocab = vocab
    train_dataset = NERDataset('data', vocab, type='/train')
    train_loader = DataLoader(train_dataset,
                              batch_size=BATCH_SIZE,
                              num_workers=2,
                              collate_fn=custom_collate,
                              shuffle=True)
    sample_data, sample_target, sample_len = next(iter(train_loader))
    sample_data = sample_data.long()

    model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab))
    hidden = model.init_hidden(BATCH_SIZE)

    with torch.no_grad():
        tag_scores = model(sample_data, hidden)
        print(tag_scores.shape)
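
The custom_collate function handed to the DataLoader is not defined in this fragment. A minimal sketch of what such a collate function typically does for variable-length NER batches, assuming each dataset item is a (token_ids, label_ids) pair (the item layout is an assumption):

import torch
from torch.nn.utils.rnn import pad_sequence

def custom_collate(batch):
    # Zero-pad tokens and labels to the longest sequence in the batch and
    # return the original lengths, matching the (data, target, len)
    # unpacking used above.
    seqs, labels = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs])
    data = pad_sequence([torch.as_tensor(s) for s in seqs], batch_first=True)
    target = pad_sequence([torch.as_tensor(l) for l in labels],
                          batch_first=True)
    return data, target, lengths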
Example #6
class NERModel(object):
    def __init__(self, device, entry='train'):
        self.device = device
        self.load_config()
        self.__init_model(entry)

    def __init_model(self, entry):
        if entry == 'train':
            self.train_manager = NERDataset(model_path=self.model_path,
                                            data_path='data/ner_train.txt',
                                            data_type='train',
                                            tags=self.tags,
                                            max_len=self.embedding_size,
                                            batch_size=self.batch_size)
            self.train_manager.dump_data_map()
            self.total_size = (len(self.train_manager) + self.batch_size -
                               1) // self.batch_size
            dev_manager = NERDataset(model_path=self.model_path,
                                     data_path='data/ner_test.txt',
                                     data_type='dev',
                                     tags=self.tags,
                                     max_len=self.embedding_size,
                                     batch_size=self.batch_size)
            self.dev_batch = dev_manager.batch_iter()

            self.model = BiLSTMCRF(
                self.device,
                tag_map=self.train_manager.tag_map,
                batch_size=self.batch_size,
                vocab_size=len(self.train_manager.vocab),
                dropout=self.dropout,
                embedding_dim=self.embedding_size,
                hidden_dim=self.hidden_size,
            )
            self.restore_model()
        elif entry == 'predict':
            data_map = self.load_params()
            self.tag_map = data_map.get('tag_map')
            self.vocab = data_map.get('vocab')
            self.model = BiLSTMCRF(self.device,
                                   tag_map=self.tag_map,
                                   vocab_size=len(self.vocab),
                                   embedding_dim=self.embedding_size,
                                   hidden_dim=self.hidden_size)
            self.restore_model()
        self.model.to(self.device)

    def load_config(self):
        try:
            with open('config/ner_config.yml') as fopen:
                config = yaml.safe_load(fopen)
        except Exception as error:
            logger.warning(f'Load config failed, using default config: {error}')
            with open('config/ner_config.yml', 'w') as fopen:
                config = {
                    'embedding_size': 200,
                    'hidden_size': 128,
                    'batch_size': 128,
                    'dropout': 0.5,
                    'model_path': 'model/',
                    'tags': ['ORG', 'PER', 'LOC', 'COM']
                }
                yaml.dump(config, fopen)
        self.embedding_size = config.get('embedding_size')
        self.hidden_size = config.get('hidden_size')
        self.batch_size = config.get('batch_size')
        self.model_path = config.get('model_path')
        self.tags = config.get('tags')
        self.dropout = config.get('dropout')

    def restore_model(self):
        try:
            self.model.load_state_dict(
                torch.load(os.path.join(self.model_path, 'params.pkl')))
            logger.info('model restore success!')
        except Exception as error:
            logger.warning(f'model restore failed! {error}')

    def load_params(self):
        with codecs.open('ner_model/data.pkl', 'rb') as fopen:
            data_map = pickle.load(fopen)
        return data_map

    def train(self):
        optimizer = optim.Adam(self.model.parameters())
        # optimizer = optim.SGD(self.model.parameters(), lr=0.01)
        epoch_num = 1
        for epoch in range(epoch_num):
            progress = tqdm(self.train_manager.batch_iter(),
                            desc=f'NER Epoch#{epoch + 1}/{epoch_num}',
                            total=self.total_size,
                            dynamic_ncols=True)
            for batch in progress:
                self.model.zero_grad()
                sentences, tags = zip(*batch)
                sentences_tensor = torch.tensor(
                    sentences, dtype=torch.long).to(self.device)
                tags_tensor = torch.tensor(tags,
                                           dtype=torch.long).to(self.device)
                trained_tags = self.model(sentences_tensor)
                loss = -self.model.crf(trained_tags,
                                       tags_tensor)  # neg_log_likelihood
                progress.set_postfix({
                    'loss': loss.item(),
                })
                loss.backward()
                optimizer.step()
            torch.save(self.model.state_dict(),
                       os.path.join(self.model_path, 'params.pkl'))

    def evaluate(self):
        sentences, labels = zip(*next(self.dev_batch))
        _, paths = self.model(sentences)
        for tag in self.tags:
            pass
            # f1_score(labels, paths, tag, self.model.tag_map)

    def predict(self, input_str=''):
        if not input_str:
            input_str = input('Enter text: ')
        input_vec = [self.vocab.get(i, 0) for i in input_str]
        # convert to tensor
        sentences = torch.tensor(input_vec).to(self.device).view(1, -1)
        # _, paths = self.model(sentences)
        id2tag = [
            k for (k, v) in sorted(self.tag_map.items(), key=lambda x: x[1])
        ]
        results = {}
        for tag in id2tag:
            results.update({tag.split('-')[-1]: []})
        trained_tags = self.model(sentences)
        entities = self.model.crf.decode(trained_tags)
        tags = list(map(lambda x: id2tag[x[0]], entities))
        return tags
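
A minimal way to exercise both entry points of this class (a sketch, assuming the module-level imports the class relies on, such as torch, yaml, pickle, and tqdm, are in scope):

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Training entry: builds the datasets, trains, and saves params.pkl.
NERModel(device, entry='train').train()

# Prediction entry: reloads tag_map and vocab from ner_model/data.pkl first.
tagger = NERModel(device, entry='predict')
print(tagger.predict('Some sample text'))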
Example #7
            tags_pred = model.decode(seqs.to(device), masks.to(device))
            for tp in tags_pred:
                y_pred.append([ix_to_tag[ix] for ix in tp])
            # true
            lens = masks.sum(0).tolist()
            tags_l = tags.t().tolist()
            for t, ln in zip(tags_l, lens):
                y_true.append([ix_to_tag[ix] for ix in t[:ln]])
    return score(y_true, y_pred)


if __name__ == "__main__":
    data_dir = f"data/{args.dataset}/processed"

    # Load dataset
    train_data = NERDataset(os.path.join(data_dir, "train.pkl"))
    test_data = NERDataset(os.path.join(data_dir, "test.pkl"))
    dev_data = NERDataset(os.path.join(data_dir, "dev.pkl"))

    # Load vocabs
    word_to_ix = load_obj(os.path.join(data_dir, "word_to_ix.pkl"))
    tag_to_ix = load_obj(os.path.join(data_dir, "tag_to_ix.pkl"))

    ix_to_tag = {v: k for k, v in tag_to_ix.items()}

    # DataLoaders
    train_loader = DataLoader(
        train_data,
        batch_size=args.batch_size,
        collate_fn=BatchPadding(),
        shuffle=True,
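
Neither score nor BatchPadding is defined in this fragment. score presumably computes entity-level metrics over the decoded tag sequences; a plausible stand-in using the seqeval package (an assumption, not the project's actual helper):

from seqeval.metrics import classification_report, f1_score

def score(y_true, y_pred):
    # Entity-level precision/recall/F1 over BIO-tagged label sequences.
    print(classification_report(y_true, y_pred))
    return f1_score(y_true, y_pred)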
Example #8

if __name__ == '__main__':
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 64
    BATCH_SIZE = 64
    EPOCH = 20
    LR_RATE = 1e-4
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()
    writer.flush()

    # Create train dataloader
    vocab = build_vocab('data')
    word_vocab, label_vocab = vocab
    train_dataset = NERDataset('data', vocab, type='/train')
    train_loader = DataLoader(train_dataset,
                              batch_size=128,
                              num_workers=2,
                              collate_fn=custom_collate,
                              shuffle=True)
    val_dataset = NERDataset('data', vocab, type='/val')
    val_loader = DataLoader(val_dataset,
                            batch_size=128,
                            num_workers=2,
                            collate_fn=custom_collate,
                            shuffle=True)

    # Model initialisation
    model = RNN(EMBEDDING_DIM, HIDDEN_DIM, len(word_vocab), len(label_vocab))
    model.to(device)
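
The fragment stops after moving the model to the device. A minimal sketch of the training loop that would typically follow, assuming custom_collate yields (tokens, labels, lengths) batches, that label index 0 is padding, and that the model returns per-token tag scores as in Example #5:

import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=0)  # assumption: 0 pads labels
optimizer = torch.optim.Adam(model.parameters(), lr=LR_RATE)

for epoch in range(EPOCH):
    model.train()
    total_loss = 0.0
    for data, target, lengths in train_loader:
        data, target = data.long().to(device), target.long().to(device)
        hidden = model.init_hidden(data.size(0))
        optimizer.zero_grad()
        tag_scores = model(data, hidden)  # (batch, seq_len, num_tags)
        loss = criterion(tag_scores.view(-1, tag_scores.size(-1)),
                         target.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    writer.add_scalar('loss/train', total_loss / len(train_loader), epoch)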