def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    num_words = 15000

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build models.
    models = [
        UnidirectionalModel(num_words, target_vocab.size).build(),
        BidirectionalModel(num_words, target_vocab.size).build(),
    ]

    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, source_vocab, target_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))
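# The listing above calls UnidirectionalModel(...).build() and
# BidirectionalModel(...).build() without showing their definitions. The sketch
# below is one plausible implementation, not the original classes: an Embedding
# layer feeding a (bidirectional) LSTM with a per-token softmax on top. The
# embedding and hidden sizes (100) are assumptions.
import tensorflow as tf
from tensorflow.keras.layers import Bidirectional, Dense, Embedding, Input, LSTM


class UnidirectionalModel:
    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100):
        self.input_dim = input_dim      # vocabulary size (num_words)
        self.output_dim = output_dim    # number of tags (target_vocab.size)
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim

    def build(self):
        words = Input(shape=(None,), dtype='int32')
        x = Embedding(self.input_dim, self.emb_dim, mask_zero=True)(words)
        x = LSTM(self.hid_dim, return_sequences=True)(x)
        outputs = Dense(self.output_dim, activation='softmax')(x)
        return tf.keras.Model(inputs=words, outputs=outputs)


class BidirectionalModel(UnidirectionalModel):
    def build(self):
        words = Input(shape=(None,), dtype='int32')
        x = Embedding(self.input_dim, self.emb_dim, mask_zero=True)(words)
        x = Bidirectional(LSTM(self.hid_dim, return_sequences=True))(x)
        outputs = Dense(self.output_dim, activation='softmax')(x)
        return tf.keras.Model(inputs=words, outputs=outputs)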
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/bidirectional_model.h5'
    num_words = 15000

    # Load the dataset.
    x, y = load_dataset('./data/ja.wikipedia.conll')

    # Pre-process the dataset.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    source_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    target_vocab = Vocab(lower=False).fit(y_train)
    x_train = create_dataset(x_train, source_vocab)
    y_train = create_dataset(y_train, target_vocab)

    # Build the model.
    # model = UnidirectionalModel(num_words, target_vocab.size).build()
    model = BidirectionalModel(num_words, target_vocab.size).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True)
    ]

    # Train the model.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Predict and evaluate.
    model = load_model(model_path)
    api = InferenceAPI(model, source_vocab, target_vocab)
    y_pred = api.predict_from_sequences(x_test)
    print(classification_report(y_test, y_pred, digits=4))
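# load_dataset() above reads ./data/ja.wikipedia.conll but is not defined in
# this listing. The function below is a sketch of a plausible loader for a
# CoNLL-style file (one "token<TAB>tag" pair per line, blank lines between
# sentences); the exact column layout of ja.wikipedia.conll is an assumption.
def load_dataset(filename, encoding='utf-8'):
    sents, labels = [], []
    words, tags = [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                cols = line.split('\t')
                words.append(cols[0])    # surface token
                tags.append(cols[-1])    # tag in the last column
            elif words:
                sents.append(words)
                labels.append(tags)
                words, tags = [], []
    if words:                            # file may not end with a blank line
        sents.append(words)
        labels.append(tags)
    return sents, labels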
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # model_path = 'models/unidirectional_model.h5'
    model_path = 'models/'
    pretrained_model_name_or_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    maxlen = 250

    # Load the dataset.
    x, y = load_dataset('./data/ja.wikipedia.conll')
    # model = BertModel.from_pretrained(pretrained_model_name_or_path)
    # config = BertConfig(pretrained_model_name_or_path)
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-process the dataset.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    target_vocab = Vocab(lower=False).fit(y_train)
    features_train, labels_train = convert_examples_to_features(
        x_train, y_train, target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)
    features_test, labels_test = convert_examples_to_features(
        x_test, y_test, target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)

    # Build the model.
    model = build_model(pretrained_model_name_or_path, target_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(target_vocab.size))

    # Prepare callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)

    # Predict and evaluate.
    evaluate(model, target_vocab, features_test, labels_test)
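# loss_func(target_vocab.size) above returns the training loss, but its body is
# not shown in this listing. The sketch below is one plausible choice, not the
# original function: a sparse categorical cross-entropy that ignores padded
# positions. Assumptions: label id 0 is the padding id, y_true has shape
# (batch, max_seq_length), and y_pred holds per-token probabilities of shape
# (batch, max_seq_length, num_labels).
import tensorflow as tf


def loss_func(num_labels):
    # num_labels is kept only to match the call site; sparse cross-entropy
    # does not need it.
    def loss(y_true, y_pred):
        losses = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred)
        mask = tf.cast(tf.not_equal(tf.cast(y_true, tf.int32), 0), losses.dtype)
        mask = tf.reshape(mask, tf.shape(losses))
        # Average only over real (non-padding) tokens.
        return tf.reduce_sum(losses * mask) / (tf.reduce_sum(mask) + 1e-8)
    return loss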
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/'
    pretrained_model_name_or_path = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    maxlen = 250

    # Data loading.
    x, y = load_dataset('./data/ja.wikipedia.conll')
    tokenizer = BertJapaneseTokenizer.from_pretrained(
        pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-processing.
    x = preprocess_dataset(x)
    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=42)
    target_vocab = Vocab(lower=False).fit(y_train)
    features_train, labels_train = convert_examples_to_features(
        x_train, y_train, target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)
    features_test, labels_test = convert_examples_to_features(
        x_test, y_test, target_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer)

    # Build model.
    model = build_model(pretrained_model_name_or_path, target_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(target_vocab.size))

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)
    model.save_pretrained(model_path)

    # Evaluation.
    evaluate(model, target_vocab, features_test, labels_test)
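# build_model() above is not defined in this listing. Because main() later
# calls model.save_pretrained(model_path), one plausible implementation is to
# return a Hugging Face token-classification model directly, as sketched below;
# the original build_model may differ (e.g. a hand-built head on TFBertModel).
from transformers import TFBertForTokenClassification


def build_model(pretrained_model_name_or_path, num_labels):
    # Depending on the checkpoint, from_pt=True may be needed to load
    # PyTorch-only weights into the TF model.
    return TFBertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path, num_labels=num_labels)
# Note: this model outputs logits, so a loss used with it should be computed
# with from_logits=True (or a softmax layer added on top).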
import json


class DataLoader:
    """Loads articles from JSON, converts sentences to id tensors via Vocab,
    and keeps sampling statistics for unbalanced data. `Vocab` is assumed to
    be provided by the project's vocabulary module."""

    def __init__(self, vocPath, labelsPath):
        with open(vocPath, 'r') as vf:
            vocab = json.load(vf)
        self.voc = Vocab(vocab)
        with open(labelsPath, 'r') as lf:
            self.labels = json.load(lf)
        self.change = None     # index of the first positively labeled article
        self.indices = None    # per-article sampling weights
        self.lenArt = 0
        self.noSample = 0

    def readData(self, trainPath):
        """Returns [article_id, sentence id tensors, article label] triples."""
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)
        # Materialize the map so the tensors can be iterated more than once.
        trarticles = [[a,
                       list(map(lambda x: self.voc.s2v(x).long(), trainArticles[a])),
                       int(self.labels[a])]
                      for a in trainArticles.keys()]
        self.lenArt = len(trainArticles)
        return trarticles

    def readData_sentence(self, trainPath, bert=0):
        """Returns per-article lists of (sentence, sentence label) pairs plus an
        article-level flag that is 1 once any sentence is labeled 1. With
        bert != 0 the raw sentences are kept instead of id tensors."""
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)
        trarticles = []
        ap = 0
        co = 0
        for id in trainArticles.keys():
            if id in self.labels.keys():
                trarticles.append([id, []])
                for i, s in enumerate(trainArticles[id]):
                    if len(s) > 1:
                        co += 1
                        if bert == 0:
                            sv = self.voc.s2v(s).long()
                            if sv.size()[0] > 0:
                                trarticles[-1][1].append((sv, self.labels[id][i]))
                                if self.labels[id][i] == 1:
                                    ap = 1
                        else:
                            sv = s
                            if len(sv) > 0:
                                trarticles[-1][1].append((sv, self.labels[id][i]))
                                if self.labels[id][i] == 1:
                                    ap = 1
                trarticles[-1].append(ap)
        self.lenArt = len(trarticles)
        return trarticles

    def readData_ub(self, trainPath):
        """Like readData, but also prepares weights for unbalanced sampling:
        articles before the first positive label (negatives) and from it on
        (positives) each receive half of the probability mass."""
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)
        trarticles = [[a,
                       list(map(lambda x: self.voc.s2v(x).long(), trainArticles[a])),
                       int(self.labels[a])]
                      for a in trainArticles.keys()]
        if self.change is None:
            for i, a in enumerate(trainArticles.keys()):
                if int(self.labels[a]) > 0:
                    self.change = i
                    break
        self.lenArt = len(trainArticles)
        indices = [1 / (self.change * 2)] * self.lenArt
        indices[self.change:] = ([1 / ((self.lenArt - self.change) * 2)]
                                 * (self.lenArt - self.change))
        self.indices = indices
        self.noSample = 2 * (self.lenArt - self.change)
        return trarticles

    def readData_bert(self, trainPath):
        """Same sampling weights as readData_ub, but keeps the raw sentences
        for a BERT tokenizer instead of converting them to id tensors."""
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)
        trarticles = [[a, trainArticles[a], int(self.labels[a])]
                      for a in trainArticles.keys()]
        if self.change is None:
            for i, a in enumerate(trainArticles.keys()):
                if int(self.labels[a]) > 0:
                    self.change = i
                    break
        self.lenArt = len(trainArticles)
        indices = [1 / (self.change * 2)] * self.lenArt
        indices[self.change:] = ([1 / ((self.lenArt - self.change) * 2)]
                                 * (self.lenArt - self.change))
        self.indices = indices
        self.noSample = 2 * (self.lenArt - self.change)
        return trarticles

    def readData_with_padding(self, trainPath):
        """Adds a <PAD> entry to the vocabulary and returns (sentence tensors,
        label) pairs together with the unbalanced-sampling weights."""
        self.voc.w2i['<PAD>'] = self.voc.keysize
        self.voc.keysize += 1
        with open(trainPath, 'r') as trfile:
            trainArticles = json.load(trfile)
        trarticles = [(list(map(lambda x: self.voc.s2v(x).long(), trainArticles[a])),
                       int(self.labels[a]))
                      for a in trainArticles.keys()]
        if self.change is None:
            for i, a in enumerate(trainArticles.keys()):
                if int(self.labels[a]) > 0:
                    self.change = i
                    break
        self.lenArt = len(trainArticles)
        indices = [1 / (self.change * 2)] * self.lenArt
        indices[self.change:] = ([1 / ((self.lenArt - self.change) * 2)]
                                 * (self.lenArt - self.change))
        self.indices = indices
        self.noSample = 2 * (self.lenArt - self.change)
        return trarticles
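# A minimal usage sketch for DataLoader. The file names below (vocab.json,
# labels.json, train.json) are placeholders, not paths from the original
# project; Vocab.s2v() is assumed to map a sentence to a tensor of word ids.
loader = DataLoader('vocab.json', 'labels.json')

articles = loader.readData_ub('train.json')   # [id, sentence tensors, label] per article
print(loader.lenArt, loader.noSample)         # corpus size and samples drawn per epoch

# loader.indices can serve as sampling weights, e.g. with
# torch.utils.data.WeightedRandomSampler, so that positive and negative
# articles are drawn with equal total probability.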