def do(config):
    """Read and preprocess the corpus, then train and save a Word2Vec model.

    Args:
        config: configuration object providing data_path, sg ('CBOW' or
            skip-gram otherwise), size, window, min_count, workers,
            save_directory and ckpt_name.
    """
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    X = preprocessing.do(data)

    print('Train model')
    # The original had two branches that differed only in the `sg` flag;
    # fold them into a single constructor call (sg=0 -> CBOW, sg=1 -> skip-gram).
    model = Word2Vec(
        sentences=X,
        size=config.size,
        window=config.window,
        min_count=config.min_count,
        workers=config.workers,
        sg=0 if config.sg == 'CBOW' else 1
    )

    print(model.wv.vectors.shape)
    model.save(os.path.join(config.save_directory, config.ckpt_name))
def do(config):
    """Build the tokenizer vocabulary, train the model, and save it.

    Args:
        config: configuration object providing at least save_directory,
            plus whatever Preprocessing/build require.
    """
    # Read data & preprocess: emits vocab.json and merges.txt
    print("Make vocab.json and merges.txt")
    preprocessing = Preprocessing(config)
    preprocessing.do()

    print("Model build..")
    trainer = build(config)
    trainer.train()
    # os.path.join with a single argument is a no-op, so pass the directory directly.
    trainer.save_model(config.save_directory)

    # Create an empty modelcard file — works around a transformers-library bug
    # that expects the file to exist. Use a context manager so the handle is
    # closed even if open() is interrupted.
    with open(file=os.path.join(config.save_directory, 'modelcard.json'), mode='w'):
        pass

    print("Training complete!")
def do(config):
    """Read the corpus, preprocess it, fit the model, and save a checkpoint.

    Args:
        config: configuration object providing data_path, epoch, batch_size,
            save_directory and ckpt_name.
    """
    # Load the raw corpus
    print("Read data")
    dataset = Datasets(config.data_path)
    raw = dataset.read_data()

    # Turn raw text into training tensors
    print("Data preprocessing..")
    prep = Preprocessing(config)
    x_train, y_train = prep.do(raw)

    # Build the model/callbacks and fit with a 20% validation split
    print("Model build..")
    model, callback = build(config, prep.vocab_size)
    model.fit(
        x_train,
        y_train,
        epochs=config.epoch,
        callbacks=callback,
        batch_size=config.batch_size,
        validation_split=0.2,
    )
    model.save(os.path.join(config.save_directory, config.ckpt_name))