Example #1
class DataLoader:
    def __init__(self, vocPath, labelsPath):

        with open(vocPath, 'r') as vf:
            vocab = json.load(vf)
            self.voc = Vocab(vocab)

        with open(labelsPath, 'r') as lf:
            self.labels = json.load(lf)
        self.change = None
        self.indices = None
        self.lenArt = 0
        self.noSample = 0
Example #2
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]
    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))
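load_dataset, preprocess_dataset, Vocab, create_dataset, the model classes and InferenceAPI are project-level helpers that this listing does not show. Purely as a hypothetical illustration (the actual column layout of ja.wikipedia.conll is an assumption), a minimal reader for a CoNLL-style file with a token column and a tag column could look like this:

def load_conll_sketch(path):
    """Hypothetical CoNLL reader: one 'token ... tag' line per word,
    blank lines separating sentences."""
    sentences, labels = [], []
    words, tags = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:
                # Blank line: close the current sentence.
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
                continue
            parts = line.split()
            words.append(parts[0])   # first column: token
            tags.append(parts[-1])   # last column: tag
    if words:
        sentences.append(words)
        labels.append(tags)
    return sentences, labels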
Example #3
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))
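The Vocab class used in Examples #2 and #3 also comes from the surrounding project. The code only relies on a small interface (fit, size, and the num_words/oov_token/lower options), so the following is a minimal sketch of that kind of word-index mapping, written as an assumption rather than the project's actual implementation:

from collections import Counter


class SimpleVocab:
    """Hypothetical stand-in for the Vocab interface used above."""

    def __init__(self, num_words=None, oov_token='<UNK>', lower=True):
        self.num_words = num_words
        self.oov_token = oov_token
        self.lower = lower
        self.word_index = {'<PAD>': 0, oov_token: 1}

    def fit(self, sequences):
        # Count word frequencies and keep the num_words most frequent ones.
        counts = Counter(self._norm(w) for seq in sequences for w in seq)
        for word, _ in counts.most_common(self.num_words):
            self.word_index.setdefault(word, len(self.word_index))
        return self

    @property
    def size(self):
        return len(self.word_index)

    def encode(self, sequence):
        # Unknown words fall back to the OOV index.
        oov_id = self.word_index[self.oov_token]
        return [self.word_index.get(self._norm(w), oov_id) for w in sequence]

    def _norm(self, word):
        return word.lower() if self.lower else word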
Example #4
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/'
    pretrained_model_name_or_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    maxlen = 250

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')
    # model = BertModel.from_pretrained (pretrained_model_name_or_path)
    # config =  BertConfig(pretrained_model_name_or_path)
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    target_vocab = Vocab(lower=False).fit(y_train)
    features_train, labels_train = convert_examples_to_features(
        x_train,
        y_train,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)
    features_test, labels_test = convert_examples_to_features(
        x_test,
        y_test,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)

    # Build the model.
    model = build_model(pretrained_model_name_or_path, target_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(target_vocab.size))

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Prediction and evaluation.
    evaluate(model, target_vocab, features_test, labels_test)
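build_model, convert_examples_to_features, loss_func and evaluate are defined elsewhere in the project and are not reproduced in this listing. As a hypothetical sketch of the kind of conversion such a helper typically performs (WordPiece-tokenize each already-split word, assign the word's tag to its first sub-token, pad to max_seq_length), one might write the following; label2id is a plain dict standing in for target_vocab, and every name here is illustrative:

import numpy as np


def to_bert_features_sketch(x, y, label2id, max_seq_length, tokenizer):
    """Hypothetical feature builder for token classification with BERT."""
    all_ids, all_label_ids = [], []
    for words, tags in zip(x, y):
        tokens, token_tags = ['[CLS]'], ['O']
        for word, tag in zip(words, tags):
            pieces = tokenizer.tokenize(word) or ['[UNK]']
            tokens += pieces
            # The word-level tag goes to the first sub-token only.
            token_tags += [tag] + ['O'] * (len(pieces) - 1)
        tokens = tokens[:max_seq_length - 1] + ['[SEP]']
        token_tags = token_tags[:max_seq_length - 1] + ['O']
        ids = tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [label2id[t] for t in token_tags]
        # Pad both sequences to the fixed length with id 0.
        pad = max_seq_length - len(ids)
        all_ids.append(ids + [0] * pad)
        all_label_ids.append(label_ids + [0] * pad)
    return np.array(all_ids), np.array(all_label_ids)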
Example #5
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/'
    pretrained_model_name_or_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    maxlen = 250

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    target_vocab = Vocab(lower=False).fit(y_train)
    features_train, labels_train = convert_examples_to_features(
        x_train,
        y_train,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)
    features_test, labels_test = convert_examples_to_features(
        x_test,
        y_test,
        target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)

    # Build model.
    model = build_model(pretrained_model_name_or_path, target_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(target_vocab.size))

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)
    model.save_pretrained(model_path)

    # Evaluation.
    evaluate(model, target_vocab, features_test, labels_test)
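loss_func is likewise project code that is not shown here. One common choice for this kind of sequence-labelling setup, offered only as a guess at what it might do, is a sparse categorical cross-entropy averaged over non-padding positions; at minimum its signature matches this pattern of taking the label count and returning a Keras-compatible loss callable:

import tensorflow as tf


def masked_sparse_categorical_crossentropy(num_labels):
    """Hypothetical loss: standard sparse categorical cross-entropy,
    averaged only over positions whose label id is not 0 (padding)."""
    def loss(y_true, y_pred):
        y_true = tf.reshape(y_true, (-1,))
        y_pred = tf.reshape(y_pred, (-1, num_labels))
        mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)
        per_token = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        return tf.reduce_sum(per_token * mask) / (tf.reduce_sum(mask) + 1e-8)
    return loss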
Example #6
class DataLoader:
    def __init__(self, vocPath, labelsPath):
        """Load the vocabulary and the per-article labels from JSON files."""
        with open(vocPath, 'r') as vf:
            vocab = json.load(vf)
            self.voc = Vocab(vocab)

        with open(labelsPath, 'r') as lf:
            self.labels = json.load(lf)
        self.change = None
        self.indices = None
        self.lenArt = 0
        self.noSample = 0

    def readData(self, trainPath):
        """Return [article id, sentence vectors, article label] per article."""
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)

        # Note: map() is lazy in Python 3, so the sentence vectors are only
        # materialised when each entry is iterated.
        trarticles = [[
            a,
            map(lambda x: self.voc.s2v(x).long(), trainArticles[a]),
            int(self.labels[a])
        ] for a in trainArticles.keys()]

        self.lenArt = len(trainArticles)

        return trarticles

    def readData_sentence(self, trainPath, bert=0):
        """Return [article id, [(sentence, sentence label), ...], article flag]
        for every labelled article. With bert=0 sentences are converted to
        index tensors; otherwise the raw sentences are kept for a BERT
        tokenizer downstream.
        """
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)

        trarticles = []
        co = 0  # number of sentences longer than one token
        for id in trainArticles.keys():

            if id in self.labels.keys():
                trarticles.append([id, []])
                ap = 0  # reset per article: set to 1 if any sentence label is 1

                for i, s in enumerate(trainArticles[id]):
                    if len(s) > 1:
                        co += 1
                        if bert == 0:
                            sv = self.voc.s2v(s).long()
                            if sv.size()[0] > 0:
                                trarticles[-1][1].append(
                                    (sv, self.labels[id][i]))
                                if self.labels[id][i] == 1:
                                    ap = 1

                        else:
                            sv = s
                            if len(sv) > 0:
                                trarticles[-1][1].append(
                                    (sv, self.labels[id][i]))
                                if self.labels[id][i] == 1:
                                    ap = 1

                trarticles[-1].append(ap)

        self.lenArt = len(trarticles)

        return trarticles

    def readData_ub(self, trainPath):

        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)

        trarticles = [[
            a,
            map(lambda x: self.voc.s2v(x).long(), trainArticles[a]),
            int(self.labels[a])
        ] for a in trainArticles.keys()]

        if self.change is None:
            # Find the index of the first positive-label article; the data
            # is assumed to be ordered with all negative articles first.
            for i, a in enumerate(trainArticles.keys()):
                if int(self.labels[a]) > 0:
                    self.change = i
                    break

            # Per-article sampling weights: half of the probability mass on
            # the negative block, half on the positive block.
            self.lenArt = len(trainArticles)
            indices = [1 / (self.change * 2)] * self.lenArt
            indices[i:] = [1 / ((self.lenArt - i) * 2)] * (self.lenArt - i)
            self.indices = indices
            self.noSample = 2 * (self.lenArt - i)

        return trarticles

    def readData_bert(self, trainPath):

        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)

        trarticles = [[a, trainArticles[a],
                       int(self.labels[a])] for a in trainArticles.keys()]

        if self.change is None:
            # Same sampling-weight bookkeeping as in readData_ub.
            for i, a in enumerate(trainArticles.keys()):
                if int(self.labels[a]) > 0:
                    self.change = i
                    break

            self.lenArt = len(trainArticles)
            indices = [1 / (self.change * 2)] * self.lenArt
            indices[i:] = [1 / ((self.lenArt - i) * 2)] * (self.lenArt - i)
            self.indices = indices
            self.noSample = 2 * (self.lenArt - i)

        return trarticles

    def readData_with_padding(self, trainPath):

        # Register a padding token at the end of the vocabulary.
        self.voc.w2i['<PAD>'] = self.voc.keysize
        self.voc.keysize += 1

        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)

        trarticles = [(map(lambda x: self.voc.s2v(x).long(),
                           trainArticles[a]), int(self.labels[a]))
                      for a in trainArticles.keys()]

        if self.change is None:
            # Same sampling-weight bookkeeping as in readData_ub.
            for i, a in enumerate(trainArticles.keys()):
                if int(self.labels[a]) > 0:
                    self.change = i
                    break

            self.lenArt = len(trainArticles)
            indices = [1 / (self.change * 2)] * self.lenArt
            indices[i:] = [1 / ((self.lenArt - i) * 2)] * (self.lenArt - i)
            self.indices = indices
            self.noSample = 2 * (self.lenArt - i)

        return trarticles
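None of the file names below appear in the original code; they are placeholders in a minimal usage sketch of the loader defined above:

# Hypothetical usage; 'vocab.json', 'labels.json' and 'train.json'
# are placeholder paths, not files referenced by the original code.
loader = DataLoader('vocab.json', 'labels.json')
articles = loader.readData_sentence('train.json', bert=0)
for art_id, sentence_pairs, has_positive in articles:
    # sentence_pairs holds (sentence tensor, sentence label) tuples;
    # has_positive flags whether the article contains a positive sentence.
    print(art_id, len(sentence_pairs), has_positive)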