def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)

    # docopt returns option values as strings
    random_seed = int(argv['--random_seed'])
    np.random.seed(random_seed)
    random.seed(random_seed)

    mini_batch_size = int(argv['--mini_batch_size'])

    def read_ids(file):
        ids = []
        with open(file, 'r') as fp:
            for row in fp:
                ids.append(row.strip())
        return ids

    test_ids = read_ids(argv['<test_ids>'])

    # pickle requires the file to be opened in binary mode
    with open(argv['--model'], 'rb') as fp:
        tmp = pickle.load(fp)

    ld = tmp['token']
    mod = BiLSTM(ld.embs,
                 ld.pos,
                 ld.pospeech,
                 ld.chunk,
                 nc=5,
                 nh=2048,
                 de=ld.embs.shape[1])
    mod.__setstate__(tmp['model_params'])

    pairs_idx, chunk_idx, pos_idx, pos_e1_idx, pos_e2_idx, _, subj_y, pred_y, obj_y, idents, e1_ids, e2_ids = ld.transform(
        argv['--dataset'], test_ids)

    test_idxs = list(range(len(pairs_idx)))

    all_test_preds = []
    scores = []
    for start, end in zip(
            range(0, len(test_idxs), mini_batch_size),
            range(mini_batch_size,
                  len(test_idxs) + mini_batch_size, mini_batch_size)):
        if len(test_idxs[start:end]) == 0:
            continue
        tpairs = ld.pad_data([pairs_idx[i] for i in test_idxs[start:end]])
        te1 = ld.pad_data([pos_e1_idx[i] for i in test_idxs[start:end]])
        te2 = ld.pad_data([pos_e2_idx[i] for i in test_idxs[start:end]])
        preds = mod.predict_proba(tpairs, te1, te2, np.float32(1.))

        for x in preds:
            if x > 0.5:
                all_test_preds.append(1)
            else:
                all_test_preds.append(0)

    test_f1 = f1_score(y, all_test_preds, average='binary')
    print("test_f1: %.4f" % (test_f1))
    sys.stdout.flush()
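
# Standalone illustration (added, not part of the example above): the
# zip(range(...), range(...)) pairing used for mini-batching yields matching
# [start, end) offsets, and list slicing clips the final partial batch.
_demo_n, _demo_bs = 10, 4
assert list(zip(range(0, _demo_n, _demo_bs),
                range(_demo_bs, _demo_n + _demo_bs, _demo_bs))) == [(0, 4), (4, 8), (8, 12)]
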
Example #2
    def __init__(self, vocab_size, out_size, crf=True):
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        self.embedding_size = LSTMConfig.embedding_size
        self.hidden_size = LSTMConfig.hidden_size

        self.crf = crf

        # no CRF (conditional random field): plain BiLSTM
        if not crf:
            self.model = BiLSTM(vocab_size, self.embedding_size,
                                self.hidden_size, out_size).to(self.device)
            self.cal_loss_func = cal_loss
        else:
            self.model = BiLSTM_CRF(vocab_size, self.embedding_size,
                                    self.hidden_size, out_size).to(self.device)
            self.cal_loss_func = cal_lstm_crf_loss

        self.epoches = TrainingConfig.epoches
        self.print_step = TrainingConfig.print_step
        self.lr = TrainingConfig.lr
        self.batch_size = TrainingConfig.batch_size

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        self.step = 0
        # Best validation loss so far, initialized to a very large value
        self._best_val_loss = 1e18
        self.best_model = None
Example #3
def get_model(config, embedding):
    model = None
    print(config['model_name'])
    try:
        if config['model_name'] == "BiLSTM":
            model = BiLSTM(config, embedding)
    except Exception as e:
        logging.error("load model Exception: %s", e)
        exit()

    return model
Example #4
    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """初始化参数:
            vocab_size:字典的大小
            emb_size:词向量的维数
            hidden_size:隐向量的维数
            out_size:标注的种类
        """
        super(BiLSTM_CRF, self).__init__()
        self.bilstm = BiLSTM(vocab_size, emb_size, hidden_size, out_size)

        # The CRF just learns one extra transition matrix of shape
        # [out_size, out_size], initialized here to a uniform distribution
        self.transition = nn.Parameter(
            torch.ones(out_size, out_size) / out_size)
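
# Hedged sketch (added; not the repository's code): how a learned transition
# matrix like self.transition above is typically combined with the BiLSTM's
# emission scores to score a single tag sequence in a linear-chain CRF.
import torch

def crf_sequence_score(emissions, tags, transition):
    """emissions: [seq_len, out_size]; tags: LongTensor [seq_len];
    transition: [out_size, out_size]."""
    score = emissions[0, tags[0]]
    for t in range(1, tags.size(0)):
        score = score + transition[tags[t - 1], tags[t]] + emissions[t, tags[t]]
    return score

# toy check with random numbers
print(crf_sequence_score(torch.randn(5, 4),
                         torch.tensor([0, 1, 1, 3, 2]),
                         torch.ones(4, 4) / 4))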
Example #6
from sklearn import metrics


# Load the model configuration
json_config = {}
with open("config_file", 'r', encoding='utf-8') as f:
    json_config = json.load(f)
label_id2name = {'0': '负向情感', '1': '正向情感'}  # negative / positive sentiment
# print(json_config['word_to_id'])
print(label_id2name)
# Note: e.g. {'1': 0, '0': 1} -- the keys are the labels in the raw text data,
# the values are the indices assigned by torchtext
tag_to_id = json_config['tag_to_id']
id_to_tag = {v: k for k, v in tag_to_id.items()}
print('tag_to_id=', tag_to_id)  # keys: labels from the raw text data
print('id_to_tag=', id_to_tag)  # keys: indices assigned by torchtext

model = BiLSTM(json_config['vocab_size'], json_config['embedding_dim'], json_config['hidden_size'],
               json_config['num_layers'], json_config['pad_idx'], json_config['unk_idx'])
model.load_state_dict(torch.load(json_config['ckpts'], map_location='cpu'))  # can run standalone on CPU
model.eval()

import jieba

# Define a tokenizer
def chi_tokenizer(sentence):
    return list(jieba.cut(sentence))

def transform_data(record, word_to_id, tag_to_id, batch_size):
    tokens = chi_tokenizer(record['data'])
    res = []
    for token in tokens:
        res.append(word_to_id.get(token, 0))
    PAD_IX = [1] * (batch_size - len(res))
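
# Hedged sketch (added): the snippet above stops after computing PAD_IX; a
# plausible continuation -- an assumption, not the source's code -- pads the
# id list to a fixed length and returns a tensor the model can consume.
import torch

def pad_token_ids(token_ids, target_len, pad_id=1):
    padded = token_ids + [pad_id] * (target_len - len(token_ids))
    return torch.tensor(padded).unsqueeze(0)  # shape [1, target_len]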
Example #7
class BiLSTMClassifier:
    """
    This class implements a Bidirectional LSTM classifier based on the version from PyTorch
    It deals with the various aspects of the training, such as converting the data into the appropriate
    format and logging the training process via TensorBoard

    Attributes
    ----------
        device: torch.device
            torch.device indicating on which device the model and the inputs should be, either on the GPU or the
            CPU. The default behaviour is to put the model and the inputs on the GPU when available.

        model: nn.Module
            The main model used for classification, in this case the Bidirectional LSTM model

        num_outputs: int
            Integer specifying the number of outputs of the model. This should be set to the number of unique classes
            in the dataset. (the 'get_num_labels_from_file' method can be used to retrieve this from the csv file
            when this is not known)

        has_trained: bool
            Boolean specifying whether the model has already been trained. This is used to ensure that the evaluation
            or scoring is not accidentally run on an untrained model.

        _TEXT: torchtext.data.Field
            torchtext.data.Field instance specifying several parameters of the reading of the data such as
            whether or not to convert all text to lowercase and the type and language of the tokenizer used.

        _words: list
            list with all the words present in the Dutch embedding file

        _embed_dict: dict
            dictionary mapping words in the embeddings file to indices into the embedding matrix

        _embeddings: torch.Tensor
            torch.Tensor of size [num_words, embedding_dim] containing the word embeddings

        _criterion: nn.Module
            criterion used for the training and evaluation of the model. This is saved in the train methods
            for later use in the evaluation methods

        _embed_dim: int
            Integer specifying the dimension of the embeddings used in the embedding file

        _label_names: list
            list containing the names of the unique labels in the dataset; this is used for converting the
            integer representation used in training back to the original labels for easier interpretation

    """

    def __init__(self, num_outputs, hidden_dim=256, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                 word_embedding_path: str = ROOT_DIR+'/resources/word_embeddings/combined-320.tar/320/',
                 max_seq_len=None):
        """
        :param num_outputs: integer specifying the number of outputs of the model, when unknown in advance,
        this can be retrieved by using the 'get_num_labels_from_file' method
        :param hidden_dim: integer specifying the hidden dimension of the Bidirectional model
        :param device: torch.device specifying the device on which the inputs and the model should be put.
        By default the model will be put on the GPU if one is available
        :param word_embedding_path: string specifying the path of the word embedding text and pt files
        :param max_seq_len: the maximum length to which sentences are clipped; this can be used when some
        sentences are very long, which can cause memory issues when using larger batch sizes.
        """
        # Load in the vectors when they are not already present in the package
        if not embeddings_available():
            download_word_embeddings_nl()
            print("--- Constructing the Pytorch embedding matrix file ---")
            torchtext.vocab.Vectors('combined-320.txt', cache=word_embedding_path)

        vocab_data = torch.load(word_embedding_path+"combined-320.txt.pt")

        self.device = device

        self._words, self._embed_dict, self._embeddings, self._embed_dim = vocab_data

        self.model = BiLSTM(vocab=torch.zeros(size=(1, 1)), hidden_dim=hidden_dim, output_dim=num_outputs,
                            device=device)

        self._TEXT = Field(lower=True, tokenize="spacy", tokenizer_language="nl_core_news_sm", include_lengths=True,
                           batch_first=True, fix_length=max_seq_len)

        self.num_outputs = num_outputs

        self._criterion = None

        self._label_names = None

        self.has_trained = False

    def train_from_file(self, file_name: str, batch_size: int, num_epochs: int, delimiter: str = ",",
                        quotechar: str = '"', text_col_name: str = 'text', label_col_name='label', learning_rate=1.0,
                        logging_dir: str = ROOT_DIR+'/runs/') -> None:
        """
        The main method of this class, implementing a training procedure for the model and handling
        the proper loading of the dataset

        :param file_name: string specifying the location and name of the file that contains the training data
        :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into the \
        model; this can be set lower if memory issues occur
        :param num_epochs: integer specifying the number of epochs for which the model is trained. The right number of \
        epochs can differ between datasets and it is recommended to inspect the produced TensorBoard logs \
        to see if the model has converged
        :param delimiter: string specifying the delimiter used in the training csv file
        :param quotechar: string specifying the quotechar used in the training csv file
        :param text_col_name: string specifying the name of the column containing the mails in the csv file
        :param label_col_name: string specifying the name of the column containing the labels of the mails in \
        the csv file
        :param learning_rate: float specifying the learning rate of the model, this can affect the speed of \
        convergence of the model
        :param logging_dir: directory to which the Tensorboard logging files are saved

        """
        print("--- Starting with reading in the dataset ---")
        dataset_loader = CSVDataset(text_field=self._TEXT, file_name=file_name)
        dataset = dataset_loader.load(delimiter=delimiter, quotechar=quotechar, text_col_name=text_col_name,
                                      label_col_name=label_col_name)
        print("--- Finished with reading in the dataset ---")

        dloader = CustomDataLoader(dataset)
        data_iterator = dloader.construct_iterators(batch_size=batch_size, text_col_name=text_col_name,
                                                    label_col_name=label_col_name)

        self._TEXT.vocab.set_vectors(self._embed_dict, self._embeddings, self._embed_dim)

        self.model.set_new_embedding_matrix(self._TEXT.vocab.vectors)
        self._label_names = dataset.fields[label_col_name].vocab.itos

        weights = single_task_class_weighting(data_iterator)
        criterion = nn.CrossEntropyLoss(weight=weights.to(self.device))
        self._criterion = criterion

        optimizer = optim.SGD(self.model.parameters(), lr=learning_rate)
        scheduler = StepLR(optimizer, step_size=50, gamma=0.9)

        generic_training(self.model, criterion, optimizer, scheduler, data_iterator, device=self.device,
                         tensorboard_dir=logging_dir, n_epochs=num_epochs, clip_val=0.0)

        self.has_trained = True
        return None

    def classify_from_file(self, file_name, delimiter: str = ",", quotechar: str = '"', text_col_name: str = "text",
                           batch_size: int = 64) -> list:
        """

        method used for classifying the examples in a file with a trained classifier

        This method reads in a file, parses it into the correct format and classifies the contents
        of the file. Throws an error when the model is not trained.

        :param file_name: string specifying the location and name of the file that contains the data to classify
        :param delimiter: string specifying the delimiter used in the csv file
        :param quotechar: string specifying the quotechar used in the csv file
        :param text_col_name: string specifying the name of the column containing the mails in the csv file
        :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into \
        the model; this can be set lower if memory issues occur
        :return: returns a list of results, where the result indices from the model have been converted back \
         to the original class names from the file
        """
        assert self.has_trained

        strings = pd.read_csv(file_name, sep=delimiter, quotechar=quotechar)[text_col_name].tolist()

        if isinstance(strings, str):
            strings = [strings]
        if isinstance(strings, list):
            strings = [[string] for string in strings]

        fields = [('text', self._TEXT)]

        list_of_examples = [Example.fromlist(string, fields) for string in strings]
        dataset = torchtext.data.Dataset(list_of_examples, fields)

        data = Iterator(dataset, batch_size=batch_size, device=torch.device("cpu"), sort=False, sort_within_batch=False,
                        repeat=False, shuffle=False)

        predictions = []

        for item in data:
            x = item.text
            self.model.to(self.device)
            self.model = self.model.eval()
            outputs = self.model([x[0].to(self.device), x[1].to(self.device)])
            predictions.extend(outputs.detach().cpu().argmax(1).tolist())
        results = [self._label_names[i] for i in predictions]
        return results

    def classify_from_strings(self, strings: Union[List[str], str]) -> list:
        """

        method that can be used for classifying one or multiple examples with a trained classifier

        :param strings: a single string or a list of strings representing the pieces of text that should be classified
        :return: list containing the predictions of the models for the inputted pieces of text
        """
        assert self.has_trained
        if isinstance(strings, str):
            strings = [strings]
        if isinstance(strings, list):
            strings = [[string] for string in strings]

        fields = [('text', self._TEXT)]

        list_of_examples = [Example.fromlist(string, fields) for string in strings]
        dataset = torchtext.data.Dataset(list_of_examples, fields)

        data = Iterator(dataset, batch_size=1, device=torch.device("cpu"), sort=False, sort_within_batch=False,
                        repeat=False, shuffle=False)

        predictions = []

        for item in data:
            x = item.text
            self.model.to(self.device)
            self.model = self.model.eval()
            outputs = self.model([x[0].to(self.device), x[1].to(self.device)])
            predictions.extend(outputs.detach().cpu().argmax(1).tolist())
        results = [self._label_names[i] for i in predictions]
        return results

    def score(self, file_name: str, delimiter: str = ",", quotechar='"', text_col_name: str = 'text',
              label_col_name: str = 'label', batch_size: int = 64) -> None:
        """

        method that can be used to score the model on an unseen test file

        :param file_name: string specifying the location and name of the file that contains the test data
        :param delimiter: string specifying the delimiter used in the csv file
        :param quotechar: string specifying the quotechar used in the csv file
        :param text_col_name: string specifying the name of the column containing the mails in the csv file
        :param label_col_name: string specifying the name of the column containing the labels of the mails \
        in the csv file
        :param batch_size: integer specifying the batch size, this will affect the size of the batches fed into \
        the model; this can be set lower if memory issues occur
        """
        assert self.has_trained
        print("Evaluating model")

        print("--- Starting with reading in the dataset ---")
        dataset_loader = CSVDataset(text_field=self._TEXT, file_name=file_name)
        dataset = dataset_loader.load(delimiter=delimiter, quotechar=quotechar, text_col_name=text_col_name,
                                      label_col_name=label_col_name)
        print("--- Finished with reading in the dataset ---")

        dloader = CustomDataLoader(dataset)
        data_iterator = dloader.construct_iterators(batch_size=batch_size, text_col_name=text_col_name,
                                                    label_col_name=label_col_name, is_test_set=True)

        generic_evaluation(self.model, data_iterator, self._criterion, device=self.device)
        return None

    def save_model(self, filename: str) -> None:
        """

        method that can be used to save a (trained) classifier

        :param filename: string specifying the location and name of the destination of the saved model
        """
        assert filename.split(".")[-1] == "pt"
        torch.save(self.model.state_dict(), filename)
        return None

    def load_model(self, filename: str) -> None:
        """

        method that can be used to load a classifier saved in the .pt format

        :param filename: string specifying the name and location of the saved model to be loaded
        """
        assert filename.split(".")[-1] == "pt"
        self.model.load_state_dict(torch.load(filename))
        return None
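
# Hedged usage sketch (added): the file names, column defaults and class count
# below are placeholders, not values taken from the source repository.
if __name__ == '__main__':
    clf = BiLSTMClassifier(num_outputs=2)
    clf.train_from_file('mails.csv', batch_size=32, num_epochs=5)
    print(clf.classify_from_strings('Dit is een voorbeeldzin.'))
    clf.score('mails_test.csv')
    clf.save_model('bilstm_classifier.pt')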
Example #8
def create_model(args, num_classes, embedding_vector):
    nl_str = args.nonlin.lower()
    if nl_str == 'relu':
        nonlin = nn.ReLU
    elif nl_str == 'threshrelu':
        nonlin = ThresholdReLU
    elif nl_str == 'sign11':
        nonlin = partial(Sign11, targetprop_rule=args.tp_rule)
    elif nl_str == 'qrelu':
        nonlin = partial(qReLU, targetprop_rule=args.tp_rule, nsteps=3)
    else:
        raise NotImplementedError(
            'no other non-linearities currently supported')

    # input size
    if args.ds == 'sentiment140' or args.ds == 'tsad':
        input_shape, target_shape = (1, 60, 50), None
    elif args.ds == 'semeval':
        input_shape, target_shape = (1, 60, 100), (1, 6, 100)
    else:
        raise NotImplementedError('no other datasets currently supported')

    # create a model with the specified architecture
    if args.arch == 'cnn':
        model = CNN(input_shape, num_classes, embedding_vector, nonlin=nonlin)
    elif args.arch == 'lstm':
        model = LSTM(input_shape, num_classes, embedding_vector)
    elif args.arch == 'cnn-lstm':
        model = CNN_LSTM(input_shape,
                         num_classes,
                         embedding_vector,
                         nonlin=nonlin)
    elif args.arch == 'lstm-cnn':
        model = LSTM_CNN(input_shape,
                         num_classes,
                         embedding_vector,
                         nonlin=nonlin)
    elif args.arch == 'textcnn':
        model = TextCNN(input_shape,
                        num_classes,
                        embedding_vector,
                        nonlin=nonlin)
    elif args.arch == 'bilstm':
        model = BiLSTM(input_shape,
                       target_shape,
                       num_classes,
                       embedding_vector,
                       nonlin=nonlin)
    else:
        raise NotImplementedError('other models not yet supported')

    logging.info("{} model has {} parameters and non-linearity={} ({})".format(
        args.arch, sum([p.data.nelement() for p in model.parameters()]),
        nl_str, args.tp_rule.name))

    if len(args.gpus) > 1:
        model = nn.DataParallel(model)

    if args.cuda:
        model.cuda()

    return model
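
# Standalone illustration (added) of the functools.partial pattern used for
# the non-linearities above: partial pins constructor kwargs so the class can
# later be instantiated with no arguments, just like nn.ReLU.
from functools import partial

class _DemoAct:
    def __init__(self, slope=1.0):
        self.slope = slope

_make_act = partial(_DemoAct, slope=0.5)
assert _make_act().slope == 0.5  # equivalent to _DemoAct(slope=0.5)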
Example #9
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    data_loader = TextLoader(True, FLAGS.train_path, FLAGS.map_file_path,
                             FLAGS.batch_size, FLAGS.seq_length, None, None,
                             None, 'utf8', False)
    valid_data_loader = TextLoader(False, FLAGS.valid_path,
                                   FLAGS.map_file_path, FLAGS.batch_size,
                                   FLAGS.seq_length, data_loader.vocab,
                                   data_loader.labels,
                                   data_loader.std_label_map, 'utf8', False)
    tf.logging.info("vocab_size: " + str(data_loader.vocab_size))
    FLAGS.vocab_size = data_loader.vocab_size
    tf.logging.info("label_size: " + str(data_loader.label_size))
    FLAGS.label_size = data_loader.label_size
    bilstm = BiLSTM(FLAGS)
    init = tf.global_variables_initializer()
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        sess.run(init)
        idx = 0
        test_best_acc = 0
        for epcho in range(FLAGS.num_epcho):  # for each epoch
            data_loader.reset_batch_pointer()
            for train_batch_num in range(
                    data_loader.num_batches):  # for each batch
                input_x, input_y, x_len, _ = data_loader.next_batch()
                feed = {
                    bilstm.input_x: input_x,
                    bilstm.input_y: input_y,
                    bilstm.x_len: x_len,
                    bilstm.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, global_step_op, train_loss, train_acc = sess.run(
                    [
                        bilstm.train_step, bilstm.global_step, bilstm.loss,
                        bilstm.acc
                    ],
                    feed_dict=feed)
                tf.logging.info(
                    "training...........global_step = {}, epoch = {}, current_batch = {}, "
                    "train_loss = {:.4f}, accuracy = {:.4f}".format(
                        global_step_op, epcho, train_batch_num, train_loss,
                        train_acc))
                idx += 1
                if idx % FLAGS.check_every == 0:
                    all_num = 0
                    acc_num = 0
                    valid_data_loader.reset_batch_pointer()
                    write_result = []
                    for _ in range(valid_data_loader.num_batches):
                        input_x_valid, input_y_valid, x_len_valid, _ = valid_data_loader.next_batch(
                        )
                        feed = {
                            bilstm.input_x: input_x_valid,
                            bilstm.input_y: input_y_valid,
                            bilstm.x_len: x_len_valid,
                            bilstm.dropout_keep_prob: 1.0
                        }
                        prediction, arg_index = sess.run(
                            [bilstm.prediction, bilstm.arg_index],
                            feed_dict=feed)
                        all_num = all_num + len(input_y_valid)
                        # write_str = ""
                        for i, indexs in enumerate(arg_index):
                            pre_label_id = indexs[0]
                            real_label_id = input_y_valid[i]
                            if pre_label_id == real_label_id:
                                acc_num = acc_num + 1
                            # if real_label_id in valid_data_loader.id_2_label:
                            #     write_str = valid_data_loader.id_2_label.get(real_label_id)
                            # else:
                            #     write_str = "__label__unknown"
                            # for index in indexs:
                            #     cur_label = valid_data_loader.id_2_label.get(index)
                            #     cur_score = prediction[i][index]
                            #     write_str = write_str + " " + cur_label + ":" + str(cur_score)
                            # write_str = write_str + "\n"
                            # write_result.append(write_str)
                    test_acc = acc_num * 1.0 / all_num
                    tf.logging.info(
                        "testing...........global_step = {}, epoch = {}, accuracy = {:.4f}, cur_best_acc = {}"
                        .format(global_step_op, epcho, test_acc,
                                test_best_acc))
                    if test_best_acc < test_acc:
                        test_best_acc = test_acc
                        # save_model
                        if not os.path.exists(FLAGS.model_path):
                            os.makedirs(FLAGS.model_path)
                        checkpoint_path = os.path.join(FLAGS.model_path,
                                                       'lstm_model')
                        bilstm.saver.save(sess,
                                          checkpoint_path,
                                          global_step=global_step_op)
                        # export model
                        export_path = os.path.join(FLAGS.model_path,
                                                   'lstm_tf_serving')
                        if os.path.isdir(export_path):
                            shutil.rmtree(export_path)
                        bilstm.export_model(export_path, sess)
                        # resultfile = open(FLAGS.result_file, 'w', encoding='utf-8')
                        # for pre_sen in write_result:
                        #     resultfile.write(pre_sen)
                        tf.logging.info(
                            "has saved model and write.result..................................................................."
                        )
                        # resultfile.close()
                        # save label and vocab
                        vocabfile = open(FLAGS.vocab_file,
                                         'w',
                                         encoding='utf-8')
                        for key, value in data_loader.vocab.items():
                            vocabfile.write(
                                str(key) + "\t" + str(value) + '\n')
                        vocabfile.close()
                        labelfile = open(FLAGS.label_file,
                                         'w',
                                         encoding='utf-8')
                        for key, value in data_loader.labels.items():
                            labelfile.write(
                                str(key) + "\t" + str(value) + '\n')
                        labelfile.close()
Example #10
    print('vocab_size = ', vocab_size)
    print('embedding_dim = ', embedding_dim)
    # Print the parameter info for the four statements above

    ## Save the parameters
    word_to_id = dict(TEXT.vocab.stoi)
    tag_to_id = dict(LABEL.vocab.stoi)
    param_config = config_model(vocab_size, embedding_dim, pad_idx, unk_idx,
                                word_to_id, tag_to_id)
    save_config(param_config, 'config_file')
    # Initialize the model
    pre_trained_embedding = TEXT.vocab.vectors
    model = BiLSTM(vocab_size,
                   embedding_dim,
                   config.hidden_size,
                   config.num_layers,
                   pad_idx,
                   unk_idx,
                   pre_trained_embedding=pre_trained_embedding)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    loss_func = nn.CrossEntropyLoss()
    # Model training
    ## train/val/save the best model
    print('=>model training ...<=')
    best_val_loss = float('inf')
    N_EPOCH = args.epoches
    for epoch in range(N_EPOCH):
        t1 = time.time()

        # train
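
# Hedged sketch (added) of what the truncated epoch body above might look like.
# The iterator and batch attribute names (batch.text, batch.label) follow
# common torchtext usage and are assumptions, not the source's code.
import torch

def run_epoch(model, data_iter, optimizer, loss_func, train=True):
    if train:
        model.train()
    else:
        model.eval()
    total_loss = 0.0
    with torch.set_grad_enabled(train):
        for batch in data_iter:
            logits = model(batch.text)
            loss = loss_func(logits, batch.label)
            if train:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            total_loss += loss.item()
    return total_loss / max(len(data_iter), 1)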
Example #11
def main():
    parser = argparse.ArgumentParser(description='Train Neural Network.')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=25,
                        help='Number of updates to make.')
    parser.add_argument('--num_models',
                        type=int,
                        default=5,
                        help='Number of models to train.')
    parser.add_argument('--lstm_hidden_state',
                        type=int,
                        default=128,
                        help='LSTM hidden state size.')
    parser.add_argument('--word_vectors',
                        default=None,
                        help='Word vectors filepath.')
    parser.add_argument('--checkpoint_dir',
                        default='./experiments/exp1/checkpoints/',
                        help='Checkpoint directory.')
    parser.add_argument('--checkpoint_name',
                        default='checkpoint',
                        help='Checkpoint File Name.')
    parser.add_argument('--hidden_state',
                        type=int,
                        default=2048,
                        help='hidden layer size.')
    parser.add_argument('--learn_embeddings',
                        type=bool,
                        default=True,
                        help='Learn Embedding Parameters.')
    parser.add_argument('--min_df',
                        type=int,
                        default=5,
                        help='Min word count.')
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='Learning Rate.')
    parser.add_argument('--penalty',
                        type=float,
                        default=0.0,
                        help='Regularization Parameter.')
    parser.add_argument('--p_penalty',
                        type=float,
                        default=0.0,
                        help='Self-Regularization Parameter.')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.5,
                        help='Dropout Value.')
    parser.add_argument('--lstm_dropout',
                        type=float,
                        default=0.5,
                        help='LSTM Dropout Value.')
    parser.add_argument('--lr_decay',
                        type=float,
                        default=1e-6,
                        help='Learning Rate Decay.')
    parser.add_argument('--minibatch_size',
                        type=int,
                        default=50,
                        help='Mini-batch Size.')
    parser.add_argument('--val_minibatch_size',
                        type=int,
                        default=256,
                        help='Val Mini-batch Size.')
    parser.add_argument('--model_type', help='Neural Net Architecture.')
    parser.add_argument('--train_data_X', help='Training Data.')
    parser.add_argument('--train_data_Y', help='Training Labels.')
    parser.add_argument('--val_data_X', help='Validation Data.')
    parser.add_argument('--val_data_Y', help='Validation Labels.')
    parser.add_argument('--seed', default=42, type=int, help='Random Seed.')
    parser.add_argument('--grad_clip',
                        type=float,
                        default=None,
                        help='Gradient Clip Value.')
    parser.add_argument('--cnn_conv_size',
                        nargs='+',
                        type=int,
                        default=[4, 3, 2, 1],
                        help='CNN Convolution Sizes (widths)')
    parser.add_argument('--num_feat_maps',
                        default=300,
                        type=int,
                        help='Number of CNN Feature Maps.')
    parser.add_argument('--num_att',
                        default=30,
                        type=int,
                        help='Number of Attention Vectors.')

    args = parser.parse_args()

    np.random.seed(args.seed)
    random.seed(args.seed)

    # Load & Process Data
    train_txt, train_Y = load_data_file(args.train_data_X, args.train_data_Y)
    val_txt, val_Y = load_data_file(args.val_data_X, args.val_data_Y)

    data_processor = ProcessData(args.word_vectors,
                                 lower=True,
                                 min_df=args.min_df)
    X_train = data_processor.fit_transform(train_txt)
    X_val = data_processor.transform(val_txt)

    ml_vec = CustomLabelBinarizer()
    ml_vec.fit(train_Y)
    Y_train = ml_vec.transform(train_Y)
    Y_val = ml_vec.transform(val_Y)

    print("Init Model")
    sys.stdout.flush()
    # Init Model
    if args.model_type == 'bilstm':
        from models.bilstm import BiLSTM
        clf = BiLSTM(data_processor.embs,
                     nc=Y_train.shape[1],
                     nh=args.lstm_hidden_state,
                     de=data_processor.embs.shape[1],
                     lr=args.lr,
                     train_emb=args.learn_embeddings,
                     p_lstm_drop=args.lstm_dropout,
                     p_drop=args.dropout,
                     penalty=args.penalty,
                     lr_decay=args.lr_decay,
                     clip=args.grad_clip)
    elif args.model_type == 'cnn':
        from models.cnn import CNN
        clf = CNN(data_processor.embs,
                  nc=Y_train.shape[1],
                  de=data_processor.embs.shape[1],
                  lr=args.lr,
                  p_drop=args.dropout,
                  decay=args.lr_decay,
                  clip=args.grad_clip,
                  fs=args.cnn_conv_size,
                  penalty=args.penalty,
                  train_emb=args.learn_embeddings)
        print(
            "CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s"
            % (args.hidden_state, data_processor.embs.shape[1], args.lr,
               args.lr_decay, args.learn_embeddings, args.dropout,
               args.num_feat_maps, args.penalty, args.cnn_conv_size))
    elif args.model_type == 'att_cnn':
        from models.att_cnn import CNN
        clf = CNN(data_processor.embs,
                  nc=Y_train.shape[1],
                  de=data_processor.embs.shape[1],
                  lr=args.lr,
                  p_drop=args.dropout,
                  decay=args.lr_decay,
                  clip=args.grad_clip,
                  fs=args.cnn_conv_size,
                  penalty=args.penalty,
                  train_emb=args.learn_embeddings)
        print(
            "ATT_CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s"
            % (args.hidden_state, data_processor.embs.shape[1], args.lr,
               args.lr_decay, args.learn_embeddings, args.dropout,
               args.num_feat_maps, args.penalty, args.cnn_conv_size))
    elif args.model_type == 'cnn_att_word':
        from models.cnn_att_word_reg import CNN
        clf = CNN(data_processor.embs,
                  nc=Y_train.shape[1],
                  de=data_processor.embs.shape[1],
                  lr=args.lr,
                  p_drop=args.dropout,
                  decay=args.lr_decay,
                  clip=args.grad_clip,
                  fs=args.cnn_conv_size,
                  penalty=args.penalty,
                  train_emb=args.learn_embeddings)
        print(
            "ATT_CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s"
            % (args.hidden_state, data_processor.embs.shape[1], args.lr,
               args.lr_decay, args.learn_embeddings, args.dropout,
               args.num_feat_maps, args.penalty, args.cnn_conv_size))
    elif args.model_type == 'bow':
        from models.bow import BoW
        clf = BoW(data_processor.embs,
                  nc=Y_train.shape[1],
                  nh=args.hidden_state,
                  de=data_processor.embs.shape[1],
                  lr=args.lr,
                  decay=args.lr_decay,
                  clip=args.grad_clip,
                  train_emb=args.learn_embeddings,
                  penalty=args.penalty,
                  p_drop=args.dropout)
        print(
            "BoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f penalty: %.5f"
            %
            (args.hidden_state, data_processor.embs.shape[1], args.lr,
             args.lr_decay, args.learn_embeddings, args.dropout, args.penalty))
    elif args.model_type == 'att':
        from models.att import AttBoW
        clf = AttBoW(data_processor.embs,
                     nc=Y_train.shape[1],
                     nh=args.hidden_state,
                     de=data_processor.embs.shape[1],
                     lr=args.lr,
                     decay=args.lr_decay,
                     clip=args.grad_clip,
                     train_emb=args.learn_embeddings,
                     penalty=args.penalty,
                     p_drop=args.dropout,
                     na=args.num_att,
                     penalty_p=args.p_penalty)
        print(
            "AttBoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_att: %d penalty: %.5f penalty_p: %.5f"
            % (args.hidden_state, data_processor.embs.shape[1], args.lr,
               args.lr_decay, args.learn_embeddings, args.dropout,
               args.num_att, args.penalty, args.p_penalty))
    else:
        raise ValueError('Incorrect Model Specified')

    print("Training Model")
    sys.stdout.flush()
    train_idxs = list(range(len(X_train)))
    val_idxs = list(range(len(X_val)))
    # Train Model
    best_val_f1 = 0
    for epoch in range(1, args.num_epochs + 1):
        mean_loss = []
        mean_f1 = []
        random.shuffle(train_idxs)
        epoch_t0 = time()
        for start, end in zip(
                range(0, len(train_idxs), args.minibatch_size),
                range(args.minibatch_size,
                      len(train_idxs) + args.minibatch_size,
                      args.minibatch_size)):
            if len(train_idxs[start:end]) == 0:
                continue
            mini_batch_sample = data_processor.pad_data(
                [X_train[i] for i in train_idxs[start:end]])
            cost, preds = clf.train_batch(
                mini_batch_sample,
                Y_train[train_idxs[start:end]].astype('int32'), np.float32(0.))

            f1 = f1_score(Y_train[train_idxs[start:end]].argmax(axis=1),
                          preds,
                          average='macro',
                          labels=[0, 1])
            mean_f1.append(f1)
            mean_loss.append(cost)
            sys.stdout.write(
                "Epoch: %d train_avg_loss: %.4f train_avg_f1: %.4f\r" %
                (epoch, np.mean(mean_loss), np.mean(mean_f1)))
            sys.stdout.flush()

        # Validate Model
        final_preds = []
        val_loss = []
        for start, end in zip(
                range(0, len(val_idxs), args.val_minibatch_size),
                range(args.val_minibatch_size,
                      len(val_idxs) + args.val_minibatch_size,
                      args.val_minibatch_size)):
            if len(val_idxs[start:end]) == 0:
                continue
            mini_batch_sample = data_processor.pad_data(
                [X_val[i] for i in val_idxs[start:end]])
            preds, cost = clf.predict_loss(mini_batch_sample,
                                           Y_val[val_idxs[start:end]],
                                           np.float32(1.))
            final_preds += list(preds.flatten())
            val_loss.append(cost)

        f1 = f1_score(Y_val.argmax(axis=1),
                      final_preds,
                      average='macro',
                      labels=[0, 1])
        sys.stdout.write(
            "epoch: %d val_loss %.4f val_f1: %.4f train_avg_loss: %.4f train_avg_f1: %.4f time: %.1f\n"
            % (epoch, np.mean(val_loss), f1, np.mean(mean_loss),
               np.mean(mean_f1), time() - epoch_t0))
        sys.stdout.flush()

        # Checkpoint Model
        if f1 > best_val_f1:
            best_val_f1 = f1
            with open(
                    os.path.abspath(args.checkpoint_dir) + '/' +
                    args.checkpoint_name + '.pkl', 'wb') as out_file:
                pickle.dump(
                    {
                        'model_params': clf.__getstate__(),
                        'token': data_processor,
                        'ml_bin': ml_vec,
                        'args': args,
                        'last_train_avg_loss': np.mean(mean_loss),
                        'last_train_avg_f1': np.mean(mean_f1),
                        'val_f1': f1
                    }, out_file, pickle.HIGHEST_PROTOCOL)
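
# Example invocation (added; the script name and file paths are placeholders,
# the flags come from the argument parser defined above):
#
#   python train.py --model_type bilstm \
#       --train_data_X train_X.txt --train_data_Y train_Y.txt \
#       --val_data_X val_X.txt --val_data_Y val_Y.txt \
#       --word_vectors vectors.bin --checkpoint_dir ./experiments/exp1/checkpoints/
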
def main():
    parser = argparse.ArgumentParser(description='Test Neural Network.')
    parser.add_argument('--checkpoint_model', help='Checkpoint Model.')
    parser.add_argument('--data_X', help='Test/Validation Data.')
    parser.add_argument('--data_Y', help='Test/Validation Labels.')
    parser.add_argument('--scoring',
                        default='macro',
                        help='Evaluation Measure.')
    parser.add_argument('--minibatch_size',
                        type=int,
                        default=256,
                        help='Mini-batch Size.')
    parser.add_argument('--name_count',
                        default='cnn_1',
                        help='Run identifier (used in the output file name).')

    args = parser.parse_args()

    if args.scoring not in ['binary', 'micro', 'macro', 'prf']:
        raise ValueError('Incorrect Evaluation Measure Specified')

    # Load Checkpoint Model
    with open(args.checkpoint_model, 'rb') as in_file:
        chk_pt = pickle.load(in_file)

    # Load & Process Data
    test_txt, test_Y = load_data_file(args.data_X, args.data_Y)
    X = chk_pt['token'].transform(test_txt)
    Y = chk_pt['ml_bin'].transform(test_Y)

    data_processor = chk_pt['token']

    print("Init Model")
    # Init Model
    if chk_pt['args'].model_type == 'bilstm':
        from models.bilstm import BiLSTM
        clf = BiLSTM(data_processor.embs,
                     nc=Y.shape[1],
                     nh=chk_pt['args'].lstm_hidden_state,
                     de=data_processor.embs.shape[1],
                     lr=chk_pt['args'].lr,
                     train_emb=chk_pt['args'].learn_embeddings,
                     p_lstm_drop=chk_pt['args'].lstm_dropout,
                     p_drop=chk_pt['args'].dropout,
                     penalty=chk_pt['args'].penalty,
                     lr_decay=chk_pt['args'].lr_decay,
                     clip=chk_pt['args'].grad_clip)
        clf.__setstate__(chk_pt['model_params'])
    elif chk_pt['args'].model_type == 'cnn':
        from models.cnn import CNN
        clf = CNN(data_processor.embs,
                  nc=Y.shape[1],
                  de=data_processor.embs.shape[1],
                  lr=chk_pt['args'].lr,
                  p_drop=chk_pt['args'].dropout,
                  decay=chk_pt['args'].lr_decay,
                  clip=chk_pt['args'].grad_clip,
                  fs=chk_pt['args'].cnn_conv_size,
                  penalty=chk_pt['args'].penalty,
                  train_emb=chk_pt['args'].learn_embeddings)
        clf.__setstate__(chk_pt['model_params'])
        print(
            "CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s"
            % (chk_pt['args'].hidden_state, data_processor.embs.shape[1],
               chk_pt['args'].lr, chk_pt['args'].lr_decay,
               chk_pt['args'].learn_embeddings, chk_pt['args'].dropout,
               chk_pt['args'].num_feat_maps, chk_pt['args'].penalty,
               chk_pt['args'].cnn_conv_size))
    elif chk_pt['args'].model_type == 'cnn_att_word':
        from models.cnn_att_word_reg import CNN
        clf = CNN(data_processor.embs,
                  nc=Y.shape[1],
                  de=data_processor.embs.shape[1],
                  lr=chk_pt['args'].lr,
                  p_drop=chk_pt['args'].dropout,
                  decay=chk_pt['args'].lr_decay,
                  clip=chk_pt['args'].grad_clip,
                  fs=chk_pt['args'].cnn_conv_size,
                  penalty=chk_pt['args'].penalty,
                  train_emb=chk_pt['args'].learn_embeddings)
        clf.__setstate__(chk_pt['model_params'])
        print(
            "CNN: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_feat_maps: %d penalty: %.5f conv_widths: %s"
            % (chk_pt['args'].hidden_state, data_processor.embs.shape[1],
               chk_pt['args'].lr, chk_pt['args'].lr_decay,
               chk_pt['args'].learn_embeddings, chk_pt['args'].dropout,
               chk_pt['args'].num_feat_maps, chk_pt['args'].penalty,
               chk_pt['args'].cnn_conv_size))

    elif chk_pt['args'].model_type == 'bow':
        from models.bow import BoW
        clf = BoW(data_processor.embs,
                  nc=Y.shape[1],
                  nh=chk_pt['args'].hidden_state,
                  de=data_processor.embs.shape[1],
                  lr=chk_pt['args'].lr,
                  decay=chk_pt['args'].lr_decay,
                  clip=chk_pt['args'].grad_clip,
                  train_emb=chk_pt['args'].learn_embeddings,
                  penalty=chk_pt['args'].penalty,
                  p_drop=chk_pt['args'].dropout)
        clf.__setstate__(chk_pt['model_params'])
        print(
            "BoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f penalty: %.5f"
            % (chk_pt['args'].hidden_state, data_processor.embs.shape[1],
               chk_pt['args'].lr, chk_pt['args'].lr_decay,
               chk_pt['args'].learn_embeddings, chk_pt['args'].dropout,
               chk_pt['args'].penalty))
    elif chk_pt['args'].model_type == 'att':
        from models.att import AttBoW
        clf = AttBoW(data_processor.embs,
                     nc=Y.shape[1],
                     nh=chk_pt['args'].hidden_state,
                     de=data_processor.embs.shape[1],
                     lr=chk_pt['args'].lr,
                     decay=chk_pt['args'].lr_decay,
                     clip=chk_pt['args'].grad_clip,
                     train_emb=chk_pt['args'].learn_embeddings,
                     penalty=chk_pt['args'].penalty,
                     p_drop=chk_pt['args'].dropout,
                     na=chk_pt['args'].num_att,
                     penalty_p=chk_pt['args'].p_penalty)
        clf.__setstate__(chk_pt['model_params'])
        print(
            "AttBoW: hidden_state: %d word_vec_size: %d lr: %.5f decay: %.6f learn_emb: %s dropout: %.3f num_att: %d penalty: %.5f penalty_p: %.5f"
            % (chk_pt['args'].hidden_state, data_processor.embs.shape[1],
               chk_pt['args'].lr, chk_pt['args'].lr_decay,
               chk_pt['args'].learn_embeddings, chk_pt['args'].dropout,
               chk_pt['args'].num_att, chk_pt['args'].penalty,
               chk_pt['args'].p_penalty))
    else:
        raise ValueError('Incorrect Model Specified')

    # Get Predictions
    idxs = list(range(len(X)))
    all_preds = []
    all_proba = np.array([])
    for start, end in zip(
            range(0, len(idxs), args.minibatch_size),
            range(args.minibatch_size,
                  len(idxs) + args.minibatch_size, args.minibatch_size)):
        if len(idxs[start:end]) == 0:
            continue
        mini_batch_sample = data_processor.pad_data(
            [X[i] for i in idxs[start:end]])
        preds = clf.predict(mini_batch_sample, np.float32(1.))
        proba = clf.predict_proba(mini_batch_sample, np.float32(1.))
        if len(all_proba) < 1:
            all_proba = proba
        else:
            all_proba = np.concatenate((all_proba, proba))
        all_preds += list(preds.flatten())

    # Evaluate
    prf1 = None
    filename = '/home/sehan2/Amia/task2/nn_model/probas_full1/' + str(
        args.name_count) + '.npy'
    all_proba.dump(filename)
def run_train():
    '''
    This method creates the TensorFlow graph and session and runs the training loop.
    '''

    # load preprocessed token, label, shape, char maps
    labels_str_id_map, labels_id_str_map, vocab_str_id_map, vocab_id_str_map, \
    shape_str_id_map, shape_id_str_map, char_str_id_map, char_id_str_map = load_intmaps(FLAGS.train_dir)

    # create intmaps for label types and bio (used later for evaluation, calculating F1 scores, etc.)
    # TODO right now these aren't used
    type_int_int_map, bilou_int_int_map, type_set, bilou_set = create_type_maps(
        labels_str_id_map)

    # load the embeddings
    embeddings = load_embeddings(vocab_str_id_map)

    labels_size = len(labels_str_id_map)
    char_domain_size = len(char_id_str_map)
    vocab_size = len(vocab_str_id_map)
    shape_domain_size = len(shape_id_str_map)

    # create TF graph
    with tf.Graph().as_default():
        # create batchers
        train_batcher = Batcher(
            FLAGS.train_dir,
            FLAGS.batch_size) if FLAGS.memmap_train else SeqBatcher(
                FLAGS.train_dir, FLAGS.batch_size)
        dev_batcher = SeqBatcher(FLAGS.dev_dir,
                                 FLAGS.batch_size,
                                 num_buckets=0,
                                 num_epochs=1)

        train_eval_batcher = SeqBatcher(FLAGS.train_dir,
                                        FLAGS.batch_size,
                                        num_buckets=0,
                                        num_epochs=1)

        # create character embedding model
        if FLAGS.char_dim > 0 and FLAGS.char_model == "lstm":
            print("creating and training character embeddings")
            char_embedding_model = BiLSTMChar(char_domain_size, FLAGS.char_dim,
                                              int(FLAGS.char_tok_dim / 2))
        # elif FLAGS.char_dim > 0 and FLAGS.char_model == "cnn":
        #     char_embedding_model = CNNChar(char_domain_size, FLAGS.char_dim, FLAGS.char_tok_dim, layers_map[0][1]['width'])
        else:
            char_embedding_model = None
        char_embeddings = char_embedding_model.outputs if char_embedding_model is not None else None

        # create BiLSTM model
        if FLAGS.model == 'bilstm':
            model = BiLSTM(
                num_classes=labels_size,
                vocab_size=vocab_size,
                shape_domain_size=shape_domain_size,
                char_domain_size=char_domain_size,
                char_size=FLAGS.char_dim,
                embedding_size=FLAGS.embed_dim,
                shape_size=FLAGS.shape_dim,
                lex_size=FLAGS.lex_dim,
                nonlinearity=FLAGS.nonlinearity,
                viterbi=False,  #viterbi=FLAGS.viterbi,
                hidden_dim=FLAGS.lstm_dim,
                char_embeddings=char_embeddings,
                embeddings=embeddings,
                use_geometric_feats=FLAGS.use_geometric_feats,
                use_lexicons=FLAGS.use_lexicons)
        # elif FLAGS.model == 'lstm':
        #     model = LSTM(
        #         num_classes=labels_size,
        #         vocab_size=vocab_size,
        #         shape_domain_size=shape_domain_size,
        #         char_domain_size=char_domain_size,
        #         char_size=FLAGS.char_dim,
        #         embedding_size=FLAGS.embed_dim,
        #         shape_size=FLAGS.shape_dim,
        #         nonlinearity=FLAGS.nonlinearity,
        #         viterbi=False,  # viterbi=FLAGS.viterbi,
        #         hidden_dim=FLAGS.lstm_dim,
        #         char_embeddings=char_embeddings,
        #         embeddings=embeddings,
        #         use_geometric_feats=FLAGS.use_geometric_feats,
        #         use_lexicons=FLAGS.use_lexicons)

        # Define Training procedure
        global_step = tf.Variable(0, name='global_step', trainable=False)

        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr,
                                           beta1=FLAGS.beta1,
                                           beta2=FLAGS.beta2,
                                           epsilon=FLAGS.epsilon,
                                           name="optimizer")

        model_vars = [
            v for v in tf.all_variables() if 'context_agg' not in v.name
        ]

        train_op = optimizer.minimize(model.loss,
                                      global_step=global_step,
                                      var_list=model_vars)

        print("model vars: %d" % len(model_vars))
        print([v.name for v in model_vars])
        print()
        sys.stdout.flush()
        get_trainable_params()

        tf.initialize_all_variables()

        frontend_opt_vars = [
            optimizer.get_slot(s, n) for n in optimizer.get_slot_names()
            for s in model_vars if optimizer.get_slot(s, n) is not None
        ]

        model_vars += frontend_opt_vars

        # load pretrained model if one is provided
        if FLAGS.load_dir:
            reader = tf.train.NewCheckpointReader(FLAGS.load_dir + ".tf")
            saved_var_map = reader.get_variable_to_shape_map()
            intersect_vars = [
                k for k in tf.all_variables()
                if k.name.split(':')[0] in saved_var_map
                and k.get_shape() == saved_var_map[k.name.split(':')[0]]
            ]
            leftovers = [
                k for k in tf.all_variables()
                if k.name.split(':')[0] not in saved_var_map
                or k.get_shape() != saved_var_map[k.name.split(':')[0]]
            ]
            print("WARNING: Loading pretrained frontend, but not loading: ",
                  map(lambda v: v.name, leftovers))
            frontend_loader = tf.train.Saver(var_list=intersect_vars)

        else:
            frontend_loader = tf.train.Saver(var_list=model_vars)

        frontend_saver = tf.train.Saver(var_list=model_vars)

        # create a supervisor
        sv = tf.train.Supervisor(
            logdir=FLAGS.model_dir if FLAGS.model_dir != '' else None,
            global_step=global_step,
            saver=None,
            save_model_secs=0,
            save_summaries_secs=0)
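        # The Supervisor only manages session creation and variable initialization here;
        # automatic checkpointing and summary writing are disabled (saver=None,
        # save_model_secs=0) because saving is handled manually via frontend_saver.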

        training_start_time = time.time()

        # create session
        with sv.managed_session(
                FLAGS.master,
                config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            print("session created")
            sys.stdout.flush()

            # start queue runner threads
            threads = tf.train.start_queue_runners(sess=sess)
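            # The queue runner threads started above feed the TensorFlow input queues
            # that the batchers below presumably read their examples from.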

            # load model if applicable
            if FLAGS.load_dir != '':
                print("Deserializing model: " + FLAGS.load_dir + ".tf")
                frontend_loader.restore(sess, FLAGS.load_dir + ".tf")

            # load batches
            print()
            dev_batches, train_batches, num_dev_examples, num_train_examples \
                = load_batches(sess, train_batcher, train_eval_batcher, dev_batcher)
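            # The dev and train-eval batches are materialized once up front, presumably
            # so that every evaluation pass sees the same fixed set of examples.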

            # just run the evaluation if applicable
            if FLAGS.evaluate_only:
                if FLAGS.train_eval:
                    w_f1, accuracy, preds, labels = evaluation.run_evaluation(
                        sess, model, char_embedding_model, train_batches,
                        labels_str_id_map, labels_id_str_map, "TRAIN")
                print()
                w_f1, accuracy, preds, labels = evaluation.run_evaluation(
                    sess, model, char_embedding_model, dev_batches,
                    labels_str_id_map, labels_id_str_map, "TEST", True,
                    vocab_str_id_map, vocab_id_str_map)

                # write test set predictions to disk (for further analysis)
                print("writing predictions to disk:")
                # with open(FLAGS.model_dir + os.sep + 'test_preds.txt', 'w') as f:
                #     for pred in preds:
                #         f.write(pred + "\n")
                # with open(FLAGS.model_dir + os.sep + 'test_golds.txt', 'w') as f:
                #     for label in labels:
                #         f.write(label + "\n")
                np.save(FLAGS.model_dir + os.sep + "test_preds.npy", preds)
                np.save(FLAGS.model_dir + os.sep + "test_labels.npy", labels)

            # train a model
            else:
                best_score = 0
                total_iterations = 0

                # always train the front-end unless load dir was passed
                if FLAGS.load_dir == '' or (FLAGS.load_dir != ''
                                            and FLAGS.layers2 == ''):
                    best_score, training_iteration, train_speed = train(
                        sess, sv, model, char_embedding_model, train_batches,
                        dev_batches, num_train_examples, num_dev_examples,
                        train_batcher, labels_str_id_map, labels_id_str_map,
                        train_op, frontend_saver, vocab_str_id_map)
                    total_iterations += training_iteration
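                    # Reload the best front-end checkpoint written during training
                    # (presumably saved by train() whenever the dev score improved).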
                    if FLAGS.model_dir:
                        print("Deserializing model: " + FLAGS.model_dir +
                              "-frontend.tf")
                        frontend_saver.restore(
                            sess, FLAGS.model_dir + "-frontend.tf")

        sv.coord.request_stop()
        sv.coord.join(threads)
        sess.close()

    total_time = time.time() - training_start_time
    if FLAGS.evaluate_only:
        print("Testing time: %d seconds" % (total_time))
    else:
        print(
            "Training time: %d minutes, %d iterations (%3.2f minutes/iteration)"
            % (total_time / 60, total_iterations, total_time /
               (60 * total_iterations)))
        print("Avg training speed: %f examples/second" % (train_speed))
        print("Best dev F1: %2.2f" % (best_score * 100))
Beispiel #14
    print("--------Training---------")
    print("Encoder Input Shape - {}".format(encoder_in.shape))
    print("Decoder Input Shape - {}".format(decoder_in.shape))
    print("Decoder Output Shape- {}".format(decoder_out.shape))
    print("-------Validation--------")
    print("Encoder Input Shape - {}".format(val_encoder_in.shape))
    print("Decoder Input Shape - {}".format(val_decoder_in.shape))
    print("Decoder Output Shape- {}".format(val_decoder_out.shape))

    train = batchgen(encoder_in, decoder_in, decoder_out, batchsize)
    val = batchgen(val_encoder_in, val_decoder_in, val_decoder_out, batchsize)
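    # `batchgen` is not defined in this snippet; it is assumed to be a generator that
    # yields Keras-style ([encoder_in, decoder_in], decoder_out) batches of `batchsize`
    # examples and cycles over the data indefinitely, roughly along these lines:
    #
    #     def batchgen(enc_in, dec_in, dec_out, batchsize):
    #         while True:
    #             for i in range(0, len(enc_in), batchsize):
    #                 yield ([enc_in[i:i + batchsize], dec_in[i:i + batchsize]],
    #                        dec_out[i:i + batchsize])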

if model_type == 'bilstm':
    modelname = 'BiLSTM'
    from models.bilstm import BiLSTM
    model = BiLSTM(comwords, refwords, seqlen, targetlen)
    model, encoder, decoder = model.create_model()
    print(model.summary())
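    # create_model() is assumed to return the compiled training model plus standalone
    # encoder and decoder sub-models for step-by-step decoding at inference time.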
elif model_type == 'bilstm-f':
    modelname = 'BiLSTMF'
    from models.bilstm_f import BiLSTM_F
    model = BiLSTM_F(comwords,
                     len(srctok.word_counts) + 1,
                     len(reftok.word_counts) + 1, seqlen, srclen, targetlen)
    model, encoder, decoder = model.create_model()
    print(model.summary())
elif model_type == 'bilstm-csatt':
    modelname = 'BiLSTMCSATT'
    from models.bilstm_csatt import BiLSTM_CSAtt
    model = BiLSTM_CSAtt(
        len(comtok.word_counts) + 1,
Beispiel #15
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)

    # docopt returns option values as strings, so cast the numeric ones to int
    num_epochs = int(argv['--num_epochs'])
    mini_batch_size = int(argv['--mini_batch_size'])
    val_mini_batch_size = 64
    num_classes = int(argv['--num_classes'])
    lstm_hidden_state_size = int(argv['--lstm_hidden_state'])
    random_seed = int(argv['--random_seed'])

    np.random.seed(random_seed)
    random.seed(random_seed)

    def read_ids(filename):
        ids = []
        with open(filename, 'r') as fp:
            for row in fp:
                ids.append(row.strip())
        return ids

    train_ids = read_ids(argv['--train_ids'])
    val_ids = read_ids(argv['--dev_ids'])
    test_ids = read_ids(argv['--test_ids'])

    ld = LoadData(argv['--word2vec'])

    train_pairs, train_e1, train_e2, train_y, train_ids, _, _ = ld.fit_transform(
        argv['--dataset'], train_ids)
    dev_pairs, dev_e1, dev_e2, dev_y, val_ids, dev_e1_ids, dev_e2_ids = ld.transform(
        argv['--dataset'], val_ids)
    test_pairs, test_e1, test_e2, test_y, test_ids, e1_ids, e2_ids = ld.transform(
        argv['--dataset'], test_ids)

    idxs = list(range(len(train_pairs)))
    dev_idxs = list(range(len(dev_pairs)))
    test_idxs = list(range(len(test_pairs)))

    last_loss = None
    avg_loss = []
    avg_f1 = []
    check_preds = None
    mod = BiLSTM(ld.embs,
                 ld.pos,
                 nc=num_classes,
                 nh=lstm_hidden_state_size,
                 de=ld.embs.shape[1])
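    # ld.embs holds the pretrained word-embedding matrix and ld.pos the position
    # vocabulary built by LoadData; nc, nh and de presumably stand for the number of
    # output classes, the LSTM hidden-state size and the word-embedding dimensionality.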
    best_dev_f1 = 0
    for epoch in range(1, num_epochs + 1):
        mean_loss = []
        random.shuffle(idxs)
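        # The zip of two ranges below produces (start, end) index pairs for successive
        # mini-batches; the second range runs one batch past the end so the final
        # (possibly partial) slice is generated, and partial batches are then skipped.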
        for start, end in zip(
                range(0, len(idxs), mini_batch_size),
                range(mini_batch_size,
                      len(idxs) + mini_batch_size, mini_batch_size)):
            idxs_sample = idxs[start:end]
            if len(idxs_sample) < mini_batch_size:
                continue
            batch_labels = np.array(train_y[idxs_sample], dtype='int32')
            tpairs = ld.pad_data([train_pairs[i] for i in idxs_sample])
            te1 = ld.pad_data([train_e1[i] for i in idxs_sample])
            te2 = ld.pad_data([train_e2[i] for i in idxs_sample])
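            # NOTE: `negs` used below is not defined anywhere in this example; it is
            # presumably an array of negative-sample indices produced by code omitted
            # from the snippet.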
            cost = mod.train_batch(tpairs, te1, te2,
                                   train_y[idxs_sample].astype('int32'),
                                   np.float32(0.),
                                   np.array(negs).astype('int32'))
            mean_loss.append(cost)
            print("EPOCH: %d loss: %.4f train_loss: %.4f" %
                  (epoch, cost, np.mean(mean_loss)))
            sys.stdout.flush()

        all_dev_preds = []
        scores = []
        for start, end in zip(
                range(0, len(dev_idxs), val_mini_batch_size),
                range(val_mini_batch_size,
                      len(dev_idxs) + val_mini_batch_size,
                      val_mini_batch_size)):
            if len(dev_idxs[start:end]) == 0:
                continue
            vpairs = ld.pad_data([dev_pairs[i] for i in dev_idxs[start:end]])
            ve1 = ld.pad_data([dev_e1[i] for i in dev_idxs[start:end]])
            ve2 = ld.pad_data([dev_e2[i] for i in dev_idxs[start:end]])
            preds = mod.predict_proba(vpairs, ve1, ve2, np.float32(1.))
            for x in preds:
                if x > 0.5:
                    all_dev_preds.append(1)
                else:
                    all_dev_preds.append(0)

        dev_f1 = f1_score(dev_y, all_dev_preds, average='binary')
        print("EPOCH: %d train_loss: %.4f dev_f1: %.4f" %
              (epoch, np.mean(mean_loss), dev_f1))
        sys.stdout.flush()

        if dev_f1 > best_dev_f1:
            with open(argv['--model'], 'wb') as fp:  # binary mode for pickle
                pickle.dump({
                    'model_params': mod.__getstate__(),
                    'token': ld
                }, fp, pickle.HIGHEST_PROTOCOL)
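            # The model parameters and the fitted LoadData object are pickled together
            # whenever dev F1 improves, so that whatever reloads the checkpoint can
            # rebuild the same vocabulary and preprocessing.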
            best_dev_f1 = dev_f1
            all_test_preds = []
            scores = []
            for start, end in zip(
                    range(0, len(test_idxs), val_mini_batch_size),
                    range(val_mini_batch_size,
                          len(test_idxs) + val_mini_batch_size,
                          val_mini_batch_size)):
                if len(test_idxs[start:end]) == 0:
                    continue
                tpairs = ld.pad_data(
                    [test_pairs[i] for i in test_idxs[start:end]])
                te1 = ld.pad_data([test_e1[i] for i in test_idxs[start:end]])
                te2 = ld.pad_data([test_e2[i] for i in test_idxs[start:end]])
                preds = mod.predict_proba(tpairs, te1, te2, np.float32(1.))
                for x in preds:
                    if x > 0.5:
                        all_test_preds.append(1)
                    else:
                        all_test_preds.append(0)
            test_f1 = f1_score(test_y, all_test_preds, average='binary')
            print("EPOCH: %d train_loss: %.4f dev_f1: %.4f test_f1: %.4f" %
                  (epoch, np.mean(mean_loss), dev_f1, test_f1))
            sys.stdout.flush()