def load_vocabulary():
    vocab_path = CKPT_PATH + config['TRAIN']['VOCABULARY']
    if os.path.exists(vocab_path):
        word2index = {}
        with open(vocab_path) as file:
            for line in file:
                # Each line is "<word> <index>"; drop the trailing newline.
                line_spl = line[:-1].split()
                word2index[line_spl[0]] = int(line_spl[1])
        # Invert the mapping for id-to-word lookups.
        index2word = dict(zip(word2index.values(), word2index.keys()))
        vocab = Vocabulary()
        vocab.word2index = word2index
        vocab.index2word = index2word
        return vocab
    else:
        # `raise` needs an exception, not a bare string; building the path
        # first also avoids the %-before-+ precedence bug in the original.
        raise FileNotFoundError('not found %s' % vocab_path)
def load_vocabulary():
    if os.path.exists(config.vocabulary_path):
        word2index = {}
        with open(config.vocabulary_path) as file:
            for line in file:
                # Each line is "<word> <index>"; drop the trailing newline.
                line_spl = line[:-1].split()
                word2index[line_spl[0]] = int(line_spl[1])
        index2word = dict(zip(word2index.values(), word2index.keys()))
        vocab = Vocabulary()
        vocab.word2index = word2index
        vocab.index2word = index2word
        return vocab
    else:
        # `raise` needs an exception, not a bare string.
        raise FileNotFoundError('not found %s' % config.vocabulary_path)
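# Both load_vocabulary variants above assume a Vocabulary container exposing
# word2index / index2word dicts. A minimal sketch of that assumed class
# (hypothetical; the real class likely adds special tokens, persistence, and
# a different unknown-token policy):
class Vocabulary:
    def __init__(self):
        self.word2index = {}
        self.index2word = {}

    @property
    def vocab_size(self):
        return len(self.word2index)

    def encode(self, tokens):
        # Unknown tokens fall back to id 0 here; that default is an assumption.
        return [self.word2index.get(t, 0) for t in tokens]

    def decode(self, ids):
        # Joining without separators assumes a character-level vocabulary.
        return ''.join(self.index2word[i] for i in ids)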
def main(_):
    vocabulary = Vocabulary()
    vocabulary.load_vocab(FLAGS.vocab_file)
    if os.path.isdir(FLAGS.checkpoint_path):
        # Resolve a checkpoint directory to its most recent checkpoint file.
        FLAGS.checkpoint_path = \
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    model = LSTMModel(vocabulary.vocab_size,
                      sampling=True,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size)
    model.load(FLAGS.checkpoint_path)
    # Seed generation with the start string, then sample up to max_length ids.
    start = vocabulary.encode(FLAGS.start_string)
    arr = model.predict(FLAGS.max_length, start, vocabulary.vocab_size)
    print(vocabulary.decode(arr))
def main(config, local):
    n_gpu = int(GPU_NUM)
    n_gpu = 1 if n_gpu == 0 else n_gpu
    np.random.seed(config.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Create data instances
    vocab = Vocabulary(config.vocab_path)

    if config.mode == 'train':
        # Prepare train and validation data loaders
        train_dataset, val_dataset = Dataset(vocab), Dataset(vocab)
        train_path = os.path.join(config.data_dir, 'train_data/train_data')
        val_path = os.path.join(config.data_dir, 'train_data/val_data')
        train_dataset.create_instances(train_path, config.max_seq_length, type='train')
        val_dataset.create_instances(val_path, config.max_seq_length, type='val')
        # Scale the batch size by the number of GPUs used for data parallelism.
        train_loader = DataLoader(train_dataset,
                                  batch_size=config.batch_size * n_gpu,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=config.batch_size * n_gpu)
    else:
        train_loader, val_loader = None, None

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, config)
        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        trainer.train()
def main(_):
    if not os.path.exists(checkpoint_path):
        os.makedirs(checkpoint_path)

    # Read the training text
    with open(datafile, 'r', encoding='utf-8') as f:
        train_data = f.read()

    # Load an existing vocabulary, or build one from the training text
    vocabulary = Vocabulary()
    if FLAGS.vocab_file:
        vocabulary.load_vocab(FLAGS.vocab_file)
    else:
        vocabulary.build_vocab(train_data)
        vocabulary.save(FLAGS.vocab_file)

    input_ids = vocabulary.encode(train_data)
    g = batch_generator(input_ids, FLAGS.batch_size, FLAGS.num_steps)
    model = LSTMModel(vocabulary.vocab_size,
                      batch_size=FLAGS.batch_size,
                      num_steps=FLAGS.num_steps,
                      lstm_size=FLAGS.lstm_size,
                      num_layers=FLAGS.num_layers,
                      learning_rate=FLAGS.learning_rate,
                      train_keep_prob=FLAGS.train_keep_prob,
                      use_embedding=FLAGS.use_embedding,
                      embedding_size=FLAGS.embedding_size)
    model.train(g,
                FLAGS.max_steps,
                checkpoint_path,
                FLAGS.save_every_n,
                FLAGS.log_every_n)
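# A minimal sketch of the batch_generator the training script above relies
# on, assuming the usual truncated-BPTT contract: endlessly yield (x, y)
# pairs of shape (batch_size, num_steps) with y being x shifted one step
# left. The real generator's shuffling and wrap-around behaviour are
# assumptions here.
import numpy as np

def batch_generator(arr, batch_size, num_steps):
    arr = np.asarray(arr)
    chars_per_batch = batch_size * num_steps
    n_batches = len(arr) // chars_per_batch
    # Trim to a whole number of batches, then lay out one long row of ids
    # per sequence in the batch.
    arr = arr[:n_batches * chars_per_batch].reshape((batch_size, -1))
    while True:
        for n in range(0, arr.shape[1], num_steps):
            x = arr[:, n:n + num_steps]
            # Targets are inputs shifted left by one; the last column wraps
            # around to the window's first input.
            y = np.zeros_like(x)
            y[:, :-1], y[:, -1] = x[:, 1:], x[:, 0]
            yield x, y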
def load_data(config, vocab=None):
    test_df = pd.read_csv(config.test_file, header=0,
                          names=['face_id', 'content', 'label'])
    test_data, test_label, test_num_sent, test_num_word = build_data(
        test_df['content'], test_df['label'])
    if vocab is None:
        # Build the vocabulary from the data itself when none is supplied.
        vocab = Vocabulary()
        [[vocab.add_sentence(x, y) for (x, y) in zip(data, test_label)]
         for data in test_data]
    # Map every word of every sentence of every document to its id.
    test_input = [[[vocab.word_to_id(word) for word in sent] for sent in doc]
                  for doc in test_data]
    test_label = [vocab.tag_to_id(label) for label in test_label]
    test_input = pad_sequence(test_input, True, config.max_sent,
                              config.max_word)
    test_dataset = myDataset(test_input, test_label)
    return test_dataset, vocab
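# Typical consumption of load_data (a sketch; the batch size and shuffle
# settings are assumptions, not taken from the original script):
from torch.utils.data import DataLoader

test_dataset, vocab = load_data(config)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)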
subset = subset_df(df, n_samples=n_subset)

# Create train, val, test sets
train, validation, test = split_df(subset, size_train=train_size,
                                   size_valtest=valtest_size)

# Compute main target class weights
target_weights = class_weights(train, target='overall', p_expect=(1 / 3))
np.savetxt("train_class_weights.csv", target_weights, delimiter=",")

# Compute conditionally independent sample weights
train = sample_weights(train)

# Create vocab on the train set
vocab = Vocabulary(freq_threshold=5)
wordidx, idxword = vocab.build_vocab(train['reviewText'].tolist())

# Save train, val, test sets
train.to_csv(save_train, index=False)
validation.to_csv(save_val, index=False)
test.to_csv(save_test, index=False)

# Save wordidx and idxword, one "key,value" row per entry
with open(save_wordidx, 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in wordidx.items():
        writer.writerow([key, value])
with open(save_idxword, 'w') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in idxword.items():
        writer.writerow([key, value])
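# Reading a saved mapping back in (a sketch mirroring the one-row-per-entry
# CSV format written above; load_wordidx is a hypothetical helper):
import csv

def load_wordidx(path):
    wordidx = {}
    with open(path) as csv_file:
        for row in csv.reader(csv_file):
            if row:  # csv.writer opened without newline='' can emit blank rows
                wordidx[row[0]] = int(row[1])
    return wordidx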