def evaluate():
    # Load model
    weight_path = 'model/09031344_epoch_4_train_loss_3.7933.h5'

    # Load data
    X, Sources, Targets = load_test_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()

    model = TransformerModel(in_vocab_len=len(idx2de), out_vocab_len=len(idx2en),
                             max_len=hp.maxlen)
    model.load_model(weight_path)

    for i in range(len(X) // hp.batch_size):
        x = X[i * hp.batch_size:(i + 1) * hp.batch_size]
        sources = Sources[i * hp.batch_size:(i + 1) * hp.batch_size]
        targets = Targets[i * hp.batch_size:(i + 1) * hp.batch_size]
        preds = model.translate(x, idx2en)
        for source, target, pred in zip(sources, targets, preds):
            print('source:', source)
            print('expected:', target)
            print('pred:', pred)
            print()
def evaluate_train():
    # Load model
    weight_path = 'model/09031925_epoch_0_train_loss_5.9855.h5'

    # Load data
    Sources, Targets = load_train_data()
    de2idx, idx2de = load_de_vocab()
    en2idx, idx2en = load_en_vocab()
    batch_size = 5

    model = TransformerModel(in_vocab_len=len(idx2de), out_vocab_len=len(idx2en),
                             max_len=hp.maxlen)
    model.load_model(weight_path)

    for i in range(5 // batch_size):
        x = Sources[i * batch_size:(i + 1) * batch_size]
        sources = Sources[i * batch_size:(i + 1) * batch_size]
        targets = Targets[i * batch_size:(i + 1) * batch_size]
        preds = model.translate_with_ans(sources, targets, idx2en)
        # preds = model.translate(x, idx2en)
        for source, target, pred in zip(sources, targets, preds):
            print('source:', ' '.join(idx2de[idx] for idx in source))
            print('expected:', ' '.join(idx2en[idx] for idx in target))
            print('pred:', pred)
            print()
def inference(self, name=None):
    with tf.variable_scope("tf_inference"):
        X = self.sentence_placeholder
        seqEmds = tf.nn.embedding_lookup(self.words, X)
        shapeS = tf.shape(seqEmds)
        clsH = tf.tile(self.cls, [shapeS[0]])
        clsH = tf.reshape(clsH, [
            shapeS[0],
            self.embedding_size - self.pos_embedding_size - self.seg_embedding_size])
        clsH = tf.expand_dims(clsH, axis=1)
        # now X is [Batch, kMaxSeqLen, kMaxSubToken, embedding]
        X = tf.concat([clsH, seqEmds], axis=1)
        xs = tf.zeros([shapeS[0], kMaxSeqLen])
        # [Batch, kMaxSeqLen, embedding2]
        Xpos = positional_encoding(xs, self.pos_embedding_size)
        amask = tf.sequence_mask(
            self.a_length, self.max_token_per_sentence + 1, dtype=tf.int32)
        abmask = tf.sequence_mask(
            self.totalLength, self.max_token_per_sentence + 1, dtype=tf.int32)
        totalmask = amask + abmask
        segE = tf.nn.embedding_lookup(self.segs, totalmask)
        X = tf.concat([X, Xpos, segE], axis=2)
        X = layer_norm_and_dropout(X, self.dropout_h)
        print("now X is: %r" % (X))
        X = TransformerModel(8, 6, X, self.dropout_h, mask=abmask)
        return X
def inference(self, name=None):
    with tf.variable_scope("tf_inference"):
        X = self.sentence_placeholder
        seqEmds = tf.nn.embedding_lookup(self.words, X)
        shapeS = tf.shape(seqEmds)
        clsH = tf.tile(self.cls, [shapeS[0]])
        # self.embedding_size - self.pos_embedding_size - self.seg_embedding_size == embeddingSize
        clsH = tf.reshape(clsH, [
            shapeS[0],
            self.embedding_size - self.pos_embedding_size - self.seg_embedding_size])
        clsH = tf.expand_dims(clsH, axis=1)
        # prepend the [cls] embedding
        X = tf.concat([clsH, seqEmds], axis=1)
        # positions also cover [cls]: cls + max_token_per_sentence = final_length = 200
        xs = tf.tile([0], [shapeS[0]])
        xs = tf.reshape(xs, [shapeS[0], 1])
        # positional embedding
        Xpos = positional_encoding(
            tf.concat([xs, self.sentence_placeholder], axis=1),
            self.pos_embedding_size)
        amask = tf.sequence_mask(
            self.a_length, self.max_token_per_sentence + 1, dtype=tf.int32)
        abmask = tf.sequence_mask(
            self.ab_length, self.max_token_per_sentence + 1, dtype=tf.int32)
        totalmask = amask + abmask
        # segment-id embedding: default 0, 2 for tokens of sentence A, 1 for tokens of sentence B
        segE = tf.nn.embedding_lookup(self.segs, totalmask)
        X = tf.concat([X, Xpos, segE], axis=2)
        print("now X is: %r" % (X))
        # 10 heads, 3 layers; abmask marks the valid (non-padding) token positions of the input
        # output: [batch_size, length, self.embedding_size]
        X = TransformerModel(10, 3, X, self.dropout_h, mask=abmask)
        return X
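# Both inference() variants above call a positional_encoding() helper that is not
# included in this section. As a rough reference only, here is a minimal NumPy
# sketch of the standard sinusoidal table from "Attention Is All You Need"; the
# helper actually used here (fixed vs. trainable, exact shape handling) may differ,
# and the function name below is hypothetical.
import numpy as np

def sinusoidal_position_table(max_len, dim):
    """Return a [max_len, dim] table of fixed sinusoidal position encodings (dim assumed even)."""
    positions = np.arange(max_len)[:, None]                              # [max_len, 1]
    div_terms = np.exp(np.arange(0, dim, 2) * -(np.log(10000.0) / dim))  # [dim / 2]
    table = np.zeros((max_len, dim), dtype=np.float32)
    table[:, 0::2] = np.sin(positions * div_terms)   # even dimensions
    table[:, 1::2] = np.cos(positions * div_terms)   # odd dimensions
    return table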
def main(settings): """ Translates a source language file (or STDIN) into a target language file (or STDOUT). """ # Start logging. level = logging.DEBUG if settings.verbose else logging.INFO logging.basicConfig(level=level, format='%(levelname)s: %(message)s') # Create the TensorFlow session. tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True session = tf.Session(config=tf_config) # Load config file for each model. configs = [] for model in settings.models: config = load_config_from_json_file(model) setattr(config, 'reload', model) configs.append(config) # Create the model graphs and restore their variables. logging.debug("Loading models\n") models = [] # ============= 19/8/16 KP ============ warning('='*20 + 'Model Config to Load') warning(settings.models) # ===================================== for i, config in enumerate(configs): with tf.variable_scope("model%d" % i) as scope: if config.model_type == "transformer": model = TransformerModel(config) else: model = rnn_model.RNNModel(config) saver = model_loader.init_or_restore_variables(config, session, ensemble_scope=scope) model.sampling_utils = SamplingUtils(settings) models.append(model) # ============= 19/8/16 KP ============ model_summary() # ===================================== # TODO Ensembling is currently only supported for RNNs, so if # TODO len(models) > 1 then check models are all rnn # Translate the source file. inference.translate_file(input_file=settings.input, output_file=settings.output, session=session, models=models, configs=configs, beam_size=settings.beam_size, nbest=settings.n_best, minibatch_size=settings.minibatch_size, maxibatch_size=settings.maxibatch_size, normalization_alpha=settings.normalization_alpha)
def main(settings): """ Translates a source language file (or STDIN) into a target language file (or STDOUT). """ # Create the TensorFlow session. tf_config = tf.ConfigProto() tf_config.allow_soft_placement = True session = tf.Session(config=tf_config) # Load config file for each model. configs = [] for model in settings.models: config = load_config_from_json_file(model) setattr(config, 'reload', model) configs.append(config) # Create the model graphs. logging.debug("Loading models\n") models = [] for i, config in enumerate(configs): with tf.variable_scope("model%d" % i) as scope: if config.model_type == "transformer": model = TransformerModel(config) else: model = rnn_model.RNNModel(config) model.sampling_utils = SamplingUtils(settings) models.append(model) # Add smoothing variables (if the models were trained with smoothing). #FIXME Assumes either all models were trained with smoothing or none were. if configs[0].exponential_smoothing > 0.0: smoothing = ExponentialSmoothing(configs[0].exponential_smoothing) # Restore the model variables. for i, config in enumerate(configs): with tf.variable_scope("model%d" % i) as scope: _ = model_loader.init_or_restore_variables(config, session, ensemble_scope=scope) # Swap-in the smoothed versions of the variables. if configs[0].exponential_smoothing > 0.0: session.run(fetches=smoothing.swap_ops) # TODO Ensembling is currently only supported for RNNs, so if # TODO len(models) > 1 then check models are all rnn # Translate the source file. inference.translate_file(input_file=settings.input, output_file=settings.output, session=session, models=models, configs=configs, beam_size=settings.beam_size, nbest=settings.n_best, minibatch_size=settings.minibatch_size, maxibatch_size=settings.maxibatch_size, normalization_alpha=settings.normalization_alpha)
def main():
    num_gpus = torch.cuda.device_count()

    # Parameters
    args = argparsing()
    emsize = 200   # embedding dimension
    nhid = 200     # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 4    # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2      # the number of heads in the multiheadattention models
    dropout = 0.5  # the dropout value
    bptt = 35
    lr = 5.0       # learning rate
    batch_size = 20
    layerdrop = args.layerdrop

    if num_gpus < 1:
        raise Exception('No GPUs available!')
    elif num_gpus > 1:
        lr *= num_gpus
        batch_size *= num_gpus

    # Dataset
    print('Create dataloaders')
    dataloaders, info = get_loaders(name='WikiText2', batch_size=batch_size)
    ntokens = info['ntokens']

    # Model
    print('Create model')
    model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout,
                             layerdrop=layerdrop)
    if num_gpus > 1:
        device_ids = list(range(num_gpus))
        model = torch.nn.DataParallel(model, device_ids=device_ids)

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
    criterion = nn.CrossEntropyLoss()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('Create trainer')
    trainer = Trainer(model, dataloaders, optimizer, scheduler, criterion, bptt,
                      ntokens, device)
    print('Start training')
    trainer.run(epochs=1)
def _load_models(self, process_id, sess):
    """
    Loads models and returns them
    """
    logging.debug("Process '%s' - Loading models\n" % (process_id))

    import tensorflow as tf
    models = []
    for i, options in enumerate(self._options):
        with tf.variable_scope("model%d" % i) as scope:
            if options.model_type == "transformer":
                model = TransformerModel(options)
            else:
                model = rnn_model.RNNModel(options)
            saver = model_loader.init_or_restore_variables(
                options, sess, ensemble_scope=scope)
            models.append(model)

    logging.info("NOTE: Length of translations is capped to {}".format(
        self._options[0].translation_maxlen))
    return models
def train(config, sess): assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \ "MAP training requires a prior model file: Use command-line option --prior_model" # Construct the graph, with one model replica per GPU num_gpus = len(tf_utils.get_available_gpus()) num_replicas = max(1, num_gpus) if config.loss_function == 'MRT': assert config.gradient_aggregation_steps == 1 assert config.max_sentences_per_device == 0, "MRT mode does not support sentence-based split" if config.max_tokens_per_device != 0: assert (config.samplesN * config.maxlen <= config.max_tokens_per_device), "need to make sure candidates of a sentence could be " \ "feed into the model" else: assert num_replicas == 1, "MRT mode does not support sentence-based split" assert (config.samplesN * config.maxlen <= config.token_batch_size), "need to make sure candidates of a sentence could be " \ "feed into the model" logging.info('Building model...') replicas = [] for i in range(num_replicas): device_type = "GPU" if num_gpus > 0 else "CPU" device_spec = tf.DeviceSpec(device_type=device_type, device_index=i) with tf.device(device_spec): with tf.variable_scope(tf.get_variable_scope(), reuse=(i>0)): if config.model_type == "transformer": model = TransformerModel(config) else: model = rnn_model.RNNModel(config) replicas.append(model) init = tf.zeros_initializer(dtype=tf.int32) global_step = tf.get_variable('time', [], initializer=init, trainable=False) if config.learning_schedule == "constant": schedule = learning_schedule.ConstantSchedule(config.learning_rate) elif config.learning_schedule == "transformer": schedule = learning_schedule.TransformerSchedule( global_step=global_step, dim=config.state_size, warmup_steps=config.warmup_steps) elif config.learning_schedule == "warmup-plateau-decay": schedule = learning_schedule.WarmupPlateauDecaySchedule( global_step=global_step, peak_learning_rate=config.learning_rate, warmup_steps=config.warmup_steps, plateau_steps=config.plateau_steps) else: logging.error('Learning schedule type is not valid: {}'.format( config.learning_schedule)) sys.exit(1) if config.optimizer == 'adam': optimizer = tf.train.AdamOptimizer(learning_rate=schedule.learning_rate, beta1=config.adam_beta1, beta2=config.adam_beta2, epsilon=config.adam_epsilon) else: logging.error('No valid optimizer defined: {}'.format(config.optimizer)) sys.exit(1) if config.summary_freq: summary_dir = (config.summary_dir if config.summary_dir is not None else os.path.abspath(os.path.dirname(config.saveto))) writer = tf.summary.FileWriter(summary_dir, sess.graph) else: writer = None updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step, writer) if config.exponential_smoothing > 0.0: smoothing = ExponentialSmoothing(config.exponential_smoothing) saver, progress = model_loader.init_or_restore_variables( config, sess, train=True) global_step.load(progress.uidx, sess) if config.sample_freq: random_sampler = RandomSampler( models=[replicas[0]], configs=[config], beam_size=1) if config.beam_freq or config.valid_script is not None: beam_search_sampler = BeamSearchSampler( models=[replicas[0]], configs=[config], beam_size=config.beam_size) #save model options write_config_to_json_file(config, config.saveto) text_iterator, valid_text_iterator = load_data(config) _, _, num_to_source, num_to_target = util.load_dictionaries(config) total_loss = 0. 
n_sents, n_words = 0, 0 last_time = time.time() logging.info("Initial uidx={}".format(progress.uidx)) # set epoch = 1 if print per-token-probability if config.print_per_token_pro: config.max_epochs = progress.eidx+1 for progress.eidx in range(progress.eidx, config.max_epochs): logging.info('Starting epoch {0}'.format(progress.eidx)) for source_sents, target_sents in text_iterator: if len(source_sents[0][0]) != config.factors: logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0]))) sys.exit(1) x_in, x_mask_in, y_in, y_mask_in = util.prepare_data( source_sents, target_sents, config.factors, maxlen=None) if x_in is None: logging.info('Minibatch with zero sample under length {0}'.format(config.maxlen)) continue write_summary_for_this_batch = config.summary_freq and ((progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0)) (factors, seqLen, batch_size) = x_in.shape output = updater.update( sess, x_in, x_mask_in, y_in, y_mask_in, num_to_target, write_summary_for_this_batch) if config.print_per_token_pro == False: total_loss += output else: # write per-token probability into the file f = open(config.print_per_token_pro, 'a') for pro in output: pro = str(pro) + '\n' f.write(pro) f.close() n_sents += batch_size n_words += int(numpy.sum(y_mask_in)) progress.uidx += 1 # Update the smoothed version of the model variables. # To reduce the performance overhead, we only do this once every # N steps (the smoothing factor is adjusted accordingly). if config.exponential_smoothing > 0.0 and progress.uidx % smoothing.update_frequency == 0: sess.run(fetches=smoothing.update_ops) if config.disp_freq and progress.uidx % config.disp_freq == 0: duration = time.time() - last_time disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]') logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(disp_time, progress.eidx, progress.uidx, total_loss/n_words, n_words/duration, n_sents/duration)) last_time = time.time() total_loss = 0. 
n_sents = 0 n_words = 0 if config.sample_freq and progress.uidx % config.sample_freq == 0: x_small = x_in[:, :, :10] x_mask_small = x_mask_in[:, :10] y_small = y_in[:, :10] samples = translate_utils.translate_batch( sess, random_sampler, x_small, x_mask_small, config.translation_maxlen, 0.0) assert len(samples) == len(x_small.T) == len(y_small.T), \ (len(samples), x_small.shape, y_small.shape) for xx, yy, ss in zip(x_small.T, y_small.T, samples): source = util.factoredseq2words(xx, num_to_source) target = util.seq2words(yy, num_to_target) sample = util.seq2words(ss[0][0], num_to_target) logging.info('SOURCE: {}'.format(source)) logging.info('TARGET: {}'.format(target)) logging.info('SAMPLE: {}'.format(sample)) if config.beam_freq and progress.uidx % config.beam_freq == 0: x_small = x_in[:, :, :10] x_mask_small = x_mask_in[:, :10] y_small = y_in[:,:10] samples = translate_utils.translate_batch( sess, beam_search_sampler, x_small, x_mask_small, config.translation_maxlen, config.normalization_alpha) assert len(samples) == len(x_small.T) == len(y_small.T), \ (len(samples), x_small.shape, y_small.shape) for xx, yy, ss in zip(x_small.T, y_small.T, samples): source = util.factoredseq2words(xx, num_to_source) target = util.seq2words(yy, num_to_target) logging.info('SOURCE: {}'.format(source)) logging.info('TARGET: {}'.format(target)) for i, (sample_seq, cost) in enumerate(ss): sample = util.seq2words(sample_seq, num_to_target) msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format( i, sample, cost, len(sample), cost/len(sample)) logging.info(msg) if config.valid_freq and progress.uidx % config.valid_freq == 0: if config.exponential_smoothing > 0.0: sess.run(fetches=smoothing.swap_ops) valid_ce = validate(sess, replicas[0], config, valid_text_iterator) sess.run(fetches=smoothing.swap_ops) else: valid_ce = validate(sess, replicas[0], config, valid_text_iterator) if (len(progress.history_errs) == 0 or valid_ce < min(progress.history_errs)): progress.history_errs.append(valid_ce) progress.bad_counter = 0 save_non_checkpoint(sess, saver, config.saveto) progress_path = '{0}.progress.json'.format(config.saveto) progress.save_to_json(progress_path) else: progress.history_errs.append(valid_ce) progress.bad_counter += 1 if progress.bad_counter > config.patience: logging.info('Early Stop!') progress.estop = True break if config.valid_script is not None: if config.exponential_smoothing > 0.0: sess.run(fetches=smoothing.swap_ops) score = validate_with_script(sess, beam_search_sampler) sess.run(fetches=smoothing.swap_ops) else: score = validate_with_script(sess, beam_search_sampler) need_to_save = (score is not None and (len(progress.valid_script_scores) == 0 or score > max(progress.valid_script_scores))) if score is None: score = 0.0 # ensure a valid value is written progress.valid_script_scores.append(score) if need_to_save: progress.bad_counter = 0 save_path = config.saveto + ".best-valid-script" save_non_checkpoint(sess, saver, save_path) write_config_to_json_file(config, save_path) progress_path = '{}.progress.json'.format(save_path) progress.save_to_json(progress_path) if config.save_freq and progress.uidx % config.save_freq == 0: saver.save(sess, save_path=config.saveto, global_step=progress.uidx) write_config_to_json_file(config, "%s-%s" % (config.saveto, progress.uidx)) progress_path = '{0}-{1}.progress.json'.format(config.saveto, progress.uidx) progress.save_to_json(progress_path) if config.finish_after and progress.uidx % config.finish_after == 0: logging.info("Maximum number of updates reached") 
                saver.save(sess, save_path=config.saveto,
                           global_step=progress.uidx)
                write_config_to_json_file(
                    config, "%s-%s" % (config.saveto, progress.uidx))
                progress.estop = True
                progress_path = '{0}-{1}.progress.json'.format(
                    config.saveto, progress.uidx)
                progress.save_to_json(progress_path)
                break

        if progress.estop:
            break
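# The training loop above runs smoothing.update_ops every smoothing.update_frequency
# steps and smoothing.swap_ops around validation, but the ExponentialSmoothing class
# itself is not shown. As a sketch of the underlying idea only, with plain Python
# standing in for the TensorFlow ops and `shadow`, `params`, `decay` as placeholder
# names, not the class's real interface:
def ema_update(shadow, params, decay):
    """One smoothing step: shadow <- decay * shadow + (1 - decay) * params."""
    return [decay * s + (1.0 - decay) * p for s, p in zip(shadow, params)]

def ema_swap(shadow, params):
    """Exchange live and shadow values, as swap_ops does before/after validation."""
    return params, shadow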
def train(config, sess):
    ####################################################
    assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c == 0.0)), \
        "MAP training requires a prior model file: Use command-line option --prior_model"

    # Construct the graph, with one model replica per GPU
    num_gpus = len(util.get_available_gpus())
    num_replicas = max(1, num_gpus)

    logging.info('Building model...')
    replicas = []
    for i in range(num_replicas):
        device_type = "GPU" if num_gpus > 0 else "CPU"
        device_spec = tf.DeviceSpec(device_type=device_type, device_index=i)
        with tf.device(device_spec):
            with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)):
                if config.model_type == "transformer":
                    model = TransformerModel(config)
                else:
                    model = rnn_model.RNNModel(config)
                replicas.append(model)

    init = tf.zeros_initializer(dtype=tf.int32)
    global_step = tf.get_variable('time', [], initializer=init, trainable=False)

    if config.learning_schedule == "constant":
        schedule = ConstantSchedule(config.learning_rate)
    elif config.learning_schedule == "transformer":
        schedule = TransformerSchedule(global_step=global_step,
                                       dim=config.state_size,
                                       warmup_steps=config.warmup_steps)
    else:
        logging.error('Learning schedule type is not valid: {}'.format(
            config.learning_schedule))
        sys.exit(1)

    if config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(
            learning_rate=schedule.learning_rate,
            beta1=config.adam_beta1,
            beta2=config.adam_beta2,
            epsilon=config.adam_epsilon)
    else:
        logging.error('No valid optimizer defined: {}'.format(config.optimizer))
        sys.exit(1)

    if config.summary_freq:
        summary_dir = (config.summary_dir if config.summary_dir is not None
                       else os.path.abspath(os.path.dirname(config.saveto)))
        writer = tf.summary.FileWriter(summary_dir, sess.graph)
    else:
        writer = None

    updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step,
                           writer)

    saver, progress = model_loader.init_or_restore_variables(config, sess,
                                                             train=True)

    ############################################################
    # add: pretrain
    if config.pretrain:
        logging.info("Start pre-training")
        # pre-training hyperparameters
        pre_batch_size = 1000
        epochs = 20
        pre_learning_rate = 0.001
        pre_optimizer = tf.train.GradientDescentOptimizer(
            pre_learning_rate).minimize(replicas[0].loss_pre_train)
        # load the pre-training data and the related dictionaries
        gvocab, gvectors = util.pre_load_data(config.pretrain_vocab,
                                              config.pretrain_vectors)
        pre_vocab_list = list(gvocab.keys())
        # oversampling
        pre_train_list = []
        with open('/media/ntfs-3/EXP/MULTI/mix/zh-en/data3/glove/vocab.txt', 'r', encoding='utf-8') as f:
            for line in f:
                k, v = line.strip().split()
                pre_train_list.extend([k] * int(v))
        utf8_dict = json.load(
            open(config.source_dicts[0], 'r', encoding='utf-8'))
        embedding_list = []
        # start training
        for i in range(epochs):
            logging.info("epoch:{}".format(i))
            if i == epochs - 1:
                source_x, source_y, _vocab = util.get_data(pre_vocab_list,
                                                           pre_batch_size,
                                                           gvocab, gvectors,
                                                           utf8_dict,
                                                           shuffle=False)
            else:
                source_x, source_y, _vocab = util.get_data(pre_train_list,
                                                           pre_batch_size,
                                                           gvocab, gvectors,
                                                           utf8_dict,
                                                           shuffle=True)
            for idx, [s_x, s_y] in enumerate(zip(source_x, source_y)):
                assert len(s_x) == len(s_y), "{}, {}".format(len(s_x), len(s_y))
                sx, sy = util.pre_prepare_data(s_x, s_y)
                feed_dict = {}
                feed_dict[replicas[0].pre_inputs.x] = sx
                feed_dict[replicas[0].pre_inputs.y] = sy
                _, loss, embedding = sess.run([pre_optimizer,
                                               replicas[0].loss_pre_train,
                                               replicas[0].pre_embedding],
                                              feed_dict=feed_dict)
                if idx % 100 == 0:
                    logging.info("loss:{}".format(loss))
                if i == epochs - 1:
                    embedding_list.append(embedding)

        assert _vocab == pre_vocab_list
        emb = embedding_list[0]
        for e in embedding_list[1:]:
            emb = numpy.concatenate((emb, e))
        numpy.save("pre_emb/pre_emb.npy", emb)
        with open("pre_emb/vocab", "w", encoding="utf-8") as f:
            f.write("\n".join(pre_vocab_list))
        # t-SNE visualization
        tsne = util.get_tsne(emb, "pre_emb/tsne.npy")
        gtsne = numpy.load(config.pretrain_tsne)
        # util.plot_tsne(_vocab, tsne, gvocab, gtsne, top=20)
        # exit(0)
    ##################################################################################

    global_step.load(progress.uidx, sess)

    # Use an InferenceModelSet to abstract over model types for sampling and
    # beam search. Multi-GPU sampling and beam search are not currently
    # supported, so we just use the first replica.
    model_set = inference.InferenceModelSet([replicas[0]], [config])

    # save model options
    write_config_to_json_file(config, config.saveto)

    text_iterator, valid_text_iterator = load_data(config)
    _, _, num_to_source, num_to_target = util.load_dictionaries(config)
    total_loss = 0.
    n_sents, n_words = 0, 0
    last_time = time.time()

    logging.info("Initial uidx={}".format(progress.uidx))
    for progress.eidx in range(progress.eidx, config.max_epochs):
        logging.info('Starting epoch {0}'.format(progress.eidx))
        for pre_source_sents, source_sents, target_sents in text_iterator:
            # if len(source_sents[0][0]) != config.factors:
            #     logging.error('Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n'.format(config.factors, len(source_sents[0][0])))
            #     sys.exit(1)
            px_in, x_in, x_mask_in, y_in, y_mask_in = util.prepare_data(
                source_sents, target_sents, config.factors, pre_source_sents,
                maxlen=None)
            if x_in is None:
                logging.info('Minibatch with zero sample under length {0}'.format(
                    config.maxlen))
                continue
            write_summary_for_this_batch = config.summary_freq and (
                (progress.uidx % config.summary_freq == 0) or
                (config.finish_after and progress.uidx % config.finish_after == 0))
            (factors, seqLen, uLen, batch_size) = x_in.shape

            loss = updater.update(sess, px_in, x_in, x_mask_in, y_in, y_mask_in,
                                  write_summary_for_this_batch)
            total_loss += loss
            n_sents += batch_size
            n_words += int(numpy.sum(y_mask_in))
            progress.uidx += 1

            if config.disp_freq and progress.uidx % config.disp_freq == 0:
                duration = time.time() - last_time
                disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]')
                logging.info('{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}'.format(
                    disp_time, progress.eidx, progress.uidx,
                    total_loss / n_words, n_words / duration,
                    n_sents / duration))
                last_time = time.time()
                total_loss = 0.
n_sents = 0 n_words = 0 if config.sample_freq and progress.uidx % config.sample_freq == 0: x_small, x_mask_small, y_small = x_in[:, :, :, : 10], x_mask_in[:, :, : 10], y_in[:, : 10] samples = model_set.sample(sess, x_small, x_mask_small) assert len(samples) == len(x_small.T) == len( y_small.T), (len(samples), x_small.shape, y_small.shape) for xx, yy, ss in zip(x_small.T, y_small.T, samples): #source = util.factoredseq2words(xx, num_to_source) target = util.seq2words(yy, num_to_target) sample = util.seq2words(ss, num_to_target) #logging.info('SOURCE: {}'.format(source)) #logging.info('SOURCE: {}'.format(xx)) logging.info('TARGET: {}'.format(target)) logging.info('SAMPLE: {}'.format(sample)) if config.beam_freq and progress.uidx % config.beam_freq == 0: x_small, x_mask_small, y_small = x_in[:, :, :, : 10], x_mask_in[:, :, : 10], y_in[:, : 10] samples = model_set.beam_search( sess, x_small, x_mask_small, config.beam_size, normalization_alpha=config.normalization_alpha) # samples is a list with shape batch x beam x len assert len(samples) == len(x_small.T) == len( y_small.T), (len(samples), x_small.shape, y_small.shape) for xx, yy, ss in zip(x_small.T, y_small.T, samples): #source = util.factoredseq2words(xx, num_to_source) target = util.seq2words(yy, num_to_target) #logging.info('SOURCE: {}'.format(source)) logging.info('TARGET: {}'.format(target)) for i, (sample_seq, cost) in enumerate(ss): sample = util.seq2words(sample_seq, num_to_target) msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format( i, sample, cost, len(sample), cost / len(sample)) logging.info(msg) if config.valid_freq and progress.uidx % config.valid_freq == 0: valid_ce = validate(sess, replicas[0], config, valid_text_iterator) if (len(progress.history_errs) == 0 or valid_ce < min(progress.history_errs)): progress.history_errs.append(valid_ce) progress.bad_counter = 0 save_non_checkpoint(sess, saver, config.saveto) progress_path = '{0}.progress.json'.format(config.saveto) progress.save_to_json(progress_path) else: progress.history_errs.append(valid_ce) progress.bad_counter += 1 if progress.bad_counter > config.patience: logging.info('Early Stop!') progress.estop = True break if config.valid_script is not None: score = validate_with_script(sess, replicas[0], config) need_to_save = ( score is not None and (len(progress.valid_script_scores) == 0 or score > max(progress.valid_script_scores))) if score is None: score = 0.0 # ensure a valid value is written progress.valid_script_scores.append(score) if need_to_save: progress.bad_counter = 0 save_path = config.saveto + ".best-valid-script" save_non_checkpoint(sess, saver, save_path) write_config_to_json_file(config, save_path) progress_path = '{}.progress.json'.format(save_path) progress.save_to_json(progress_path) if config.save_freq and progress.uidx % config.save_freq == 0: saver.save(sess, save_path=config.saveto, global_step=progress.uidx) write_config_to_json_file( config, "%s-%s" % (config.saveto, progress.uidx)) progress_path = '{0}-{1}.progress.json'.format( config.saveto, progress.uidx) progress.save_to_json(progress_path) if config.finish_after and progress.uidx % config.finish_after == 0: logging.info("Maximum number of updates reached") saver.save(sess, save_path=config.saveto, global_step=progress.uidx) write_config_to_json_file( config, "%s-%s" % (config.saveto, progress.uidx)) progress.estop = True progress_path = '{0}-{1}.progress.json'.format( config.saveto, progress.uidx) progress.save_to_json(progress_path) break if progress.estop: break
######################################################################
# The model is set up with the hyperparameter below. The vocab size is
# equal to the length of the vocab object.
#
ntokens = len(input_vocab)  # the size of vocabulary
nclstokens = 4  # D0, D1, S0, S1 + PAD
ntagtokens = 1  # binary O or D
emsize = 512    # embedding dimension
nhid = 512      # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6     # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8       # the number of heads in the multiheadattention models
dropout = 0.1   # the dropout value

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

model = TransformerModel(ntokens, nclstokens, ntagtokens, emsize, nhead, nhid,
                         nlayers, dropout).to(device)

state_dict = torch.load("%s/%s" % (model_dir, args.iter), map_location='cpu')

from collections import OrderedDict
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    if k.startswith("module."):
        name = k[7:]  # remove 'module.' of dataparallel
    else:
        name = k
    new_state_dict[name] = v
model.load_state_dict(new_state_dict)

import time
spiece_id_to_tokens = {v: k for k, v in spiece_ids.items()}
eos_token = '</s>'
eos_id = spiece_ids[eos_token]

import sentencepiece as spm

src_tokenizer = spm.SentencePieceProcessor()
src_tokenizer.Load(source_tokenizer_model_path)
trg_tokenizer = spm.SentencePieceProcessor()
trg_tokenizer.Load(target_tokenizer_model_path)

model = TransformerModel(intoken=src_vocab_size,
                         outtoken=trg_vocab_size,
                         hidden=d_model,
                         d_ff=d_ff,
                         nlayers=nlayers,
                         dropout=0.0).to(device)
model = load_transformer_weights(model, weights_path)
print(f"Done loading pretrained model weights!")
model.eval()

src_pieces = src_tokenizer.encode_as_pieces(args.text_to_translate)
src_input_ids = [spiece_ids[piece] for piece in src_pieces] + [eos_id]
src_tensor = torch.LongTensor(src_input_ids).unsqueeze(1).to(device)

trg_indexes = [eos_id]
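# The decoding loop that consumes trg_indexes is not part of this snippet. Below is
# a minimal greedy-decoding sketch for orientation only: the forward signature
# model(src_tensor, trg_tensor) returning logits of shape (trg_len, 1, vocab) is an
# assumption, as is the hypothetical max_decode_len cap; a beam search would replace
# the per-step argmax.
max_decode_len = 100  # hypothetical cap on the generated length
with torch.no_grad():
    for _ in range(max_decode_len):
        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)
        logits = model(src_tensor, trg_tensor)   # assumed shape: (trg_len, 1, vocab)
        next_id = logits[-1, 0].argmax().item()  # greedy pick for the last position
        trg_indexes.append(next_id)
        if next_id == eos_id:                    # stop once </s> is produced
            break

# Map ids back to pieces and detokenize with the target SentencePiece model.
out_pieces = [spiece_id_to_tokens[i] for i in trg_indexes[1:] if i != eos_id]
print(trg_tokenizer.decode_pieces(out_pieces))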
def train(config, sess): assert (config.prior_model != None and (tf.train.checkpoint_exists(os.path.abspath(config.prior_model))) or (config.map_decay_c==0.0)), \ "MAP training requires a prior model file: Use command-line option --prior_model" # Construct the graph, with one model replica per GPU num_gpus = len(util.get_available_gpus()) num_replicas = max(1, num_gpus) logging.info('Building model...') replicas = [] for i in range(num_replicas): device_type = "GPU" if num_gpus > 0 else "CPU" device_spec = tf.DeviceSpec(device_type=device_type, device_index=i) with tf.device(device_spec): with tf.variable_scope(tf.get_variable_scope(), reuse=(i > 0)): if config.model_type == "transformer": model = TransformerModel(config) else: model = rnn_model.RNNModel(config) replicas.append(model) init = tf.zeros_initializer(dtype=tf.int32) global_step = tf.get_variable('time', [], initializer=init, trainable=False) if config.learning_schedule == "constant": schedule = ConstantSchedule(config.learning_rate) elif config.learning_schedule == "transformer": schedule = TransformerSchedule(global_step=global_step, dim=config.state_size, warmup_steps=config.warmup_steps) else: logging.error('Learning schedule type is not valid: {}'.format( config.learning_schedule)) sys.exit(1) if config.optimizer == 'adam': optimizer = tf.train.AdamOptimizer( learning_rate=schedule.learning_rate, beta1=config.adam_beta1, beta2=config.adam_beta2, epsilon=config.adam_epsilon) else: logging.error('No valid optimizer defined: {}'.format( config.optimizer)) sys.exit(1) if config.summary_freq: summary_dir = (config.summary_dir if config.summary_dir is not None else os.path.abspath(os.path.dirname(config.saveto))) writer = tf.summary.FileWriter(summary_dir, sess.graph) else: writer = None updater = ModelUpdater(config, num_gpus, replicas, optimizer, global_step, writer) saver, progress = model_loader.init_or_restore_variables(config, sess, train=True) global_step.load(progress.uidx, sess) # Use an InferenceModelSet to abstract over model types for sampling and # beam search. Multi-GPU sampling and beam search are not currently # supported, so we just use the first replica. model_set = inference.InferenceModelSet([replicas[0]], [config]) #save model options write_config_to_json_file(config, config.saveto) text_iterator, valid_text_iterator = load_data(config) _, _, num_to_source, num_to_target = util.load_dictionaries(config) total_loss = 0. 
n_sents, n_words = 0, 0 last_time = time.time() logging.info("Initial uidx={}".format(progress.uidx)) for progress.eidx in range(progress.eidx, config.max_epochs): logging.info('Starting epoch {0}'.format(progress.eidx)) for source_sents, target_sents in text_iterator: if len(source_sents[0][0]) != config.factors: logging.error( 'Mismatch between number of factors in settings ({0}), and number in training corpus ({1})\n' .format(config.factors, len(source_sents[0][0]))) sys.exit(1) x_in, x_mask_in, y_in, y_mask_in = util.prepare_data( source_sents, target_sents, config.factors, maxlen=None) if x_in is None: logging.info( 'Minibatch with zero sample under length {0}'.format( config.maxlen)) continue write_summary_for_this_batch = config.summary_freq and ( (progress.uidx % config.summary_freq == 0) or (config.finish_after and progress.uidx % config.finish_after == 0)) (factors, seqLen, batch_size) = x_in.shape loss = updater.update(sess, x_in, x_mask_in, y_in, y_mask_in, write_summary_for_this_batch) total_loss += loss n_sents += batch_size n_words += int(numpy.sum(y_mask_in)) progress.uidx += 1 if config.disp_freq and progress.uidx % config.disp_freq == 0: duration = time.time() - last_time disp_time = datetime.now().strftime('[%Y-%m-%d %H:%M:%S]') logging.info( '{0} Epoch: {1} Update: {2} Loss/word: {3} Words/sec: {4} Sents/sec: {5}' .format(disp_time, progress.eidx, progress.uidx, total_loss / n_words, n_words / duration, n_sents / duration)) last_time = time.time() total_loss = 0. n_sents = 0 n_words = 0 if config.sample_freq and progress.uidx % config.sample_freq == 0: x_small, x_mask_small, y_small = x_in[:, :, : 10], x_mask_in[:, : 10], y_in[:, : 10] samples = model_set.sample(sess, x_small, x_mask_small) assert len(samples) == len(x_small.T) == len( y_small.T), (len(samples), x_small.shape, y_small.shape) for xx, yy, ss in zip(x_small.T, y_small.T, samples): source = util.factoredseq2words(xx, num_to_source) target = util.seq2words(yy, num_to_target) sample = util.seq2words(ss, num_to_target) logging.info('SOURCE: {}'.format(source)) logging.info('TARGET: {}'.format(target)) logging.info('SAMPLE: {}'.format(sample)) if config.beam_freq and progress.uidx % config.beam_freq == 0: x_small, x_mask_small, y_small = x_in[:, :, : 10], x_mask_in[:, : 10], y_in[:, : 10] samples = model_set.beam_search( sess, x_small, x_mask_small, config.beam_size, normalization_alpha=config.normalization_alpha) # samples is a list with shape batch x beam x len assert len(samples) == len(x_small.T) == len( y_small.T), (len(samples), x_small.shape, y_small.shape) for xx, yy, ss in zip(x_small.T, y_small.T, samples): source = util.factoredseq2words(xx, num_to_source) target = util.seq2words(yy, num_to_target) logging.info('SOURCE: {}'.format(source)) logging.info('TARGET: {}'.format(target)) for i, (sample_seq, cost) in enumerate(ss): sample = util.seq2words(sample_seq, num_to_target) msg = 'SAMPLE {}: {} Cost/Len/Avg {}/{}/{}'.format( i, sample, cost, len(sample), cost / len(sample)) logging.info(msg) if config.valid_freq and progress.uidx % config.valid_freq == 0: valid_ce = validate(sess, replicas[0], config, valid_text_iterator) if (len(progress.history_errs) == 0 or valid_ce < min(progress.history_errs)): progress.history_errs.append(valid_ce) progress.bad_counter = 0 save_non_checkpoint(sess, saver, config.saveto) progress_path = '{0}.progress.json'.format(config.saveto) progress.save_to_json(progress_path) else: progress.history_errs.append(valid_ce) progress.bad_counter += 1 if progress.bad_counter > 
config.patience: logging.info('Early Stop!') progress.estop = True break if config.valid_script is not None: score = validate_with_script(sess, replicas[0], config) need_to_save = ( score is not None and (len(progress.valid_script_scores) == 0 or score > max(progress.valid_script_scores))) if score is None: score = 0.0 # ensure a valid value is written progress.valid_script_scores.append(score) if need_to_save: save_path = config.saveto + ".best-valid-script" save_non_checkpoint(sess, saver, save_path) write_config_to_json_file(config, save_path) progress_path = '{}.progress.json'.format(save_path) progress.save_to_json(progress_path) if config.save_freq and progress.uidx % config.save_freq == 0: saver.save(sess, save_path=config.saveto, global_step=progress.uidx) write_config_to_json_file( config, "%s-%s" % (config.saveto, progress.uidx)) progress_path = '{0}-{1}.progress.json'.format( config.saveto, progress.uidx) progress.save_to_json(progress_path) if config.finish_after and progress.uidx % config.finish_after == 0: logging.info("Maximum number of updates reached") saver.save(sess, save_path=config.saveto, global_step=progress.uidx) write_config_to_json_file( config, "%s-%s" % (config.saveto, progress.uidx)) progress.estop = True progress_path = '{0}-{1}.progress.json'.format( config.saveto, progress.uidx) progress.save_to_json(progress_path) break if progress.estop: break
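# The "transformer" learning schedule selected in the train() function above
# (TransformerSchedule with dim=config.state_size and warmup_steps) is not defined
# in this section. For reference, a minimal sketch of the standard schedule from
# "Attention Is All You Need"; the project's class may scale or wrap it differently,
# and the function name here is hypothetical.
def transformer_lr(step, dim, warmup_steps):
    """lr = dim^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)"""
    step = max(step, 1)  # avoid division by zero on the first update
    return dim ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)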
######################################################################
# The model is set up with the hyperparameter below. The vocab size is
# equal to the length of the vocab object.
#
ntokens = len(input_vocab)  # the size of vocabulary
nclstokens = 4  # D0, D1, S0, S1
ntagtokens = 1  # binary classification O or D
emsize = 512    # embedding dimension
nhid = 512      # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6     # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8       # the number of heads in the multiheadattention models
dropout = 0.1   # the dropout value
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transformer = TransformerModel(ntokens, nclstokens, ntagtokens, emsize, nhead,
                               nhid, nlayers, dropout)
if args.model:
    transformer.load_state_dict(torch.load(args.model))
else:
    # save random init model
    torch.save(transformer.state_dict(), "init.mdl")
model = nn.DataParallel(transformer).to(device)

######################################################################
# Run the model
# -------------
#
cls_criterion = nn.CrossEntropyLoss()
lr = float(args.lr)  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
src_ntokens = len(EN_TEXT.vocab.stoi)  # the size of vocabulary
tgt_ntokens = len(FR_TEXT.vocab.stoi)  # the size of vocabulary
emsize = 200    # embedding dimension
nhid = 200      # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2     # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2       # the number of heads in the multiheadattention models
dropout = 0.2   # the dropout value

from transformer import TransformerModel
from lstm_seq2seq import Seq2Seq

model_type = 'LSTM'  # LSTM or Transformer
if model_type == 'LSTM':
    model = Seq2Seq(src_ntokens, tgt_ntokens, emsize, nhid, device).to(device)
else:
    model = TransformerModel(src_ntokens, tgt_ntokens, emsize, nhead, nhid,
                             nlayers, dropout).to(device)

import numpy as np
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print('# parameters: {:e}'.format(params))

criterion = torch.nn.CrossEntropyLoss(ignore_index=1)
optimizer = torch.optim.Adam(model.parameters())

model.train()
log_interval = 200
step = 0
while step < 100000:
    for batch in iter(train_iter):
        optimizer.zero_grad()
def main(settings): """ Translates a source language file (or STDIN) into a target language file (or STDOUT). """ # Create the TensorFlow session. g = tf.Graph() with g.as_default(): tf_config = tf.compat.v1.ConfigProto() tf_config.allow_soft_placement = True session = tf.compat.v1.Session(config=tf_config) # Load config file for each model. configs = [] for model in settings.models: config = load_config_from_json_file(model) setattr(config, 'reload', model) setattr(config, 'translation_maxlen', settings.translation_maxlen) configs.append(config) # Create the model graphs. logging.debug("Loading models\n") models = [] for i, config in enumerate(configs): with tf.compat.v1.variable_scope("model%d" % i) as scope: if config.model_type == "transformer": model = TransformerModel( config, consts_config_str=settings.config_str) else: model = rnn_model.RNNModel(config) model.sampling_utils = SamplingUtils(settings) models.append(model) # Add smoothing variables (if the models were trained with smoothing). # FIXME Assumes either all models were trained with smoothing or none were. if configs[0].exponential_smoothing > 0.0: smoothing = ExponentialSmoothing(configs[0].exponential_smoothing) # Restore the model variables. for i, config in enumerate(configs): with tf.compat.v1.variable_scope("model%d" % i) as scope: _ = model_loader.init_or_restore_variables( config, session, ensemble_scope=scope) # Swap-in the smoothed versions of the variables. if configs[0].exponential_smoothing > 0.0: session.run(fetches=smoothing.swap_ops) max_translation_len = settings.translation_maxlen # Create a BeamSearchSampler / RandomSampler. if settings.translation_strategy == 'beam_search': sampler = BeamSearchSampler(models, configs, settings.beam_size) else: assert settings.translation_strategy == 'sampling' sampler = RandomSampler(models, configs, settings.beam_size) # Warn about the change from neg log probs to log probs for the RNN. if settings.n_best: model_types = [config.model_type for config in configs] if 'rnn' in model_types: logging.warn( 'n-best scores for RNN models have changed from ' 'positive to negative (as of commit 95793196...). ' 'If you are using the scores for reranking etc, then ' 'you may need to update your scripts.') # Translate the source file. translate_utils.translate_file( input_file=settings.input, output_file=settings.output, session=session, sampler=sampler, config=configs[0], max_translation_len=max_translation_len, normalization_alpha=settings.normalization_alpha, consts_config_str=settings.config_str, nbest=settings.n_best, minibatch_size=settings.minibatch_size, maxibatch_size=settings.maxibatch_size)
model = torch.load(args.load, map_location=device)
vocab = model.vocab
test_source, _test_target = read_data(args.test)
test_source = index_data(test_source, vocab)
for i, source in enumerate(batchify_data(test_source)):
    output = model.decode(source)
    for words in output:
        print(' '.join(words))
exit(0)

if args.model == 'baseline':
    model = BaselineModel(vocab).to(device)
elif args.model == 'transformer':
    model = TransformerModel(vocab).to(device)
else:
    print('error: invalid model or model not specified (--model)',
          file=sys.stderr)
    sys.exit()

for p in model.parameters():
    if p.dim() > 1:
        torch.nn.init.xavier_uniform_(p)

criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)
lr = 5  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

bos_token = vocab.numberize('<BOS>')
######################################################################
# The model is set up with the hyperparameter below. The vocab size is
# equal to the length of the vocab object.
#
ntokens = len(input_vocab)  # the size of vocabulary
nclstokens = 4  # D0, D1, S0, S1
ntagtokens = 1  # binary O or D
emsize = 512    # embedding dimension
nhid = 512      # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 6     # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 8       # the number of heads in the multiheadattention models
dropout = 0.1   # the dropout value
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

transformer = TransformerModel(ntokens, nclstokens, ntagtokens, emsize, nhead,
                               nhid, nlayers, dropout)
transformer.load_state_dict(torch.load("%s/%s" % (model_dir, args.iter)))
model = transformer.to(device)

import time

######################################################################
# Run the model
# -------------
#
lr = float(args.lr)  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.995)

import time
def main(hparams):
    # Args
    batch_size = 20
    eval_batch_size = 10
    bptt = 35

    dataset = Dataset()
    train_data = dataset.batchify(dataset.train_txt, batch_size)
    val_data = dataset.batchify(dataset.val_txt, eval_batch_size)
    test_data = dataset.batchify(dataset.test_txt, eval_batch_size)

    # Transformer model architecture
    ntokens = len(dataset.TEXT.vocab.stoi)  # the size of vocabulary
    emsize = 200   # embedding dimension
    nhid = 200     # the dimension of the feedforward network model in nn.TransformerEncoder
    nlayers = 2    # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
    nhead = 2      # the number of heads in the multiheadattention models
    dropout = 0.2  # the dropout value
    transformer = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout)

    # Optimizer.
    criterion = nn.CrossEntropyLoss()  # loss function
    lr = 5.0  # learning rate
    optimizer = torch.optim.SGD(transformer.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

    # bittensor:
    # Load bittensor config from hparams.
    config = bittensor.Config(hparams)

    # Build the neuron from configs.
    neuron = bittensor.Neuron(config)

    # Init a trainable request router.
    router = bittensor.Router(x_dim=784, key_dim=100, topk=10)

    # Build local network.
    net = Net()

    # Subscribe the local network to the network
    neuron.subscribe(transformer)

    # Start the neuron backend.
    neuron.start()

    def train(dataset, transformer):
        transformer.train()  # Turn on the train mode
        total_loss = 0.
        start_time = time.time()
        ntokens = len(dataset.TEXT.vocab.stoi)
        for batch, i in enumerate(range(0, train_data.size(0) - 1, dataset.bptt)):
            data, targets = dataset.get_batch(train_data, i)
            optimizer.zero_grad()

            # data
            print(data.shape)

            # Flatten encoder inputs
            inputs = data.view(-1, bptt, emsize)
            inputs = torch.flatten(inputs, start_dim=1)

            # Query the remote network.
            synapses = neuron.synapses()  # Returns a list of synapses on the network.
            requests, scores = router.route(inputs, synapses)  # routes inputs to network.
            responses = neuron(requests, synapses)  # Makes network calls.
            remote = router.join(responses)  # Joins responses based on scores.

            # Encode sequence inputs.
            encodings = transformer.encode(data)  # (seq_len, batch_size, embedding_size)

            # Get nodes from metagraph.
            # and map nodes to torch keys.
            axons = neuron.axons()  # List[bittensor_pb2.Node]
            keys = keymap.toKeys(axons)  # (-1, key_dim)

            # Learning a map from the gate_inputs to keys
            # gates[i, j] = score for the jth key for input i
            gate_inputs = encodings.view(batch_size, x_dim)  # (batch_size, seq_len * embedding_size)
            gates = gate(gate_inputs, keys, topk=min(len(keys), topk))

            # Dispatch data to inputs for each key.
            # when gates[i, j] == 0, the key j does not receive input i
            dispatch_inputs = data.view(batch_size, -1)  # (batch_size, sequence_length)
            dispatch = dispatcher.dispatch(dispatch_inputs, gates)  # List[(-1, seq_len)]

            # Query the network by mapping from keys to node endpoints.
            # results = list[torch.Tensor], len(results) = len(keys)
            axons = keymap.toAxons(keys)  # List[bittensor_pb2.Node]
            query = neuron(dispatch, axons)  # List[(-1, embedding_size)]

            # Join results using gates to combine inputs.
            results = dispatcher.combine(query, gates)  # (batch_size, seq_len * embedding_size)

            # Decode responses.
            results = results.view(-1, batch_size, emsize)  # (seq_len, batch_size, embedding_size)
            to_decode = results + encodings
            output = transformer.decode(to_decode)  # (target_len, batch_size, embedding_size)

            # Loss and optimizer step
            loss = criterion(output.view(-1, ntokens), targets)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(transformer.parameters(), 0.5)
            optimizer.step()

            # Update bittensor weights
            weights = neuron.getweights(axons)
            weights = (0.95) * weights + (0.05) * torch.mean(gates, dim=0)
            neuron.setweights(axons, weights)

            total_loss += loss.item()
            log_interval = 1
            if batch % log_interval == 0 and batch > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | '
                      'lr {:02.2f} | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, batch, len(train_data) // dataset.bptt,
                          scheduler.get_lr()[0], elapsed * 1000 / log_interval,
                          cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    # `epochs` is expected to be provided by the surrounding script / hparams.
    for epoch in range(1, epochs + 1):
        epoch_start_time = time.time()
        train(dataset, transformer)
        scheduler.step()
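# The loop above relies on dataset.batchify() and dataset.get_batch(), which are not
# shown here. They appear to follow the PyTorch word-language-model tutorial; the
# sketch below reproduces that standard formulation as a reference only (the Dataset
# class used here may differ in details such as device placement).
def batchify(data, bsz):
    """Reshape a 1-D token tensor into bsz columns, trimming the remainder."""
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    return data.view(bsz, -1).t().contiguous()

def get_batch(source, i, bptt=35):
    """Return (data, targets): targets are the inputs shifted by one token."""
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    targets = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, targets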
def main(): parser = argparse.ArgumentParser(description='OGBN (GNN)') parser.add_argument('--device', type=int, default=0) parser.add_argument('--project', type=str, default='lcgnn') parser.add_argument('--dataset', type=str, default='flickr') parser.add_argument('--log_steps', type=int, default=1) parser.add_argument('--num_layers', type=int, default=4) parser.add_argument('--num_heads', type=int, default=2) parser.add_argument('--ego_size', type=int, default=64) parser.add_argument('--hidden_size', type=int, default=64) parser.add_argument('--input_dropout', type=float, default=0.2) parser.add_argument('--hidden_dropout', type=float, default=0.4) parser.add_argument('--weight_decay', type=float, default=0.005) parser.add_argument('--lr', type=float, default=0.001) parser.add_argument('--epochs', type=int, default=500) parser.add_argument('--early_stopping', type=int, default=50) parser.add_argument('--batch_size', type=int, default=512) parser.add_argument('--eval_batch_size', type=int, default=2048) parser.add_argument('--layer_norm', type=int, default=0) parser.add_argument('--src_scale', type=int, default=0) parser.add_argument('--num_workers', type=int, default=4, help='number of workers') parser.add_argument('--pe_type', type=int, default=0) parser.add_argument('--mask', type=int, default=0) parser.add_argument('--mlp', type=int, default=0) parser.add_argument("--optimizer", type=str, default='adamw', choices=['adam', 'adamw'], help="optimizer") parser.add_argument("--scheduler", type=str, default='noam', choices=['noam', 'linear'], help="scheduler") parser.add_argument("--method", type=str, default='acl', choices=['acl', 'l1reg'], help="method for local clustering") parser.add_argument('--warmup', type=int, default=10000) parser.add_argument('--seed', type=int, default=0) parser.add_argument('--load_path', type=str, default='') parser.add_argument('--exp_name', type=str, default='') args = parser.parse_args() print(args) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) para_dic = { 'nl': args.num_layers, 'nh': args.num_heads, 'es': args.ego_size, 'hs': args.hidden_size, 'id': args.input_dropout, 'hd': args.hidden_dropout, 'bs': args.batch_size, 'pe': args.pe_type, 'op': args.optimizer, 'lr': args.lr, 'wd': args.weight_decay, 'ln': args.layer_norm, 'sc': args.src_scale, 'sd': args.seed, 'md': args.method } para_dic['warm'] = args.warmup para_dic['mask'] = args.mask exp_name = get_exp_name(args.dataset, para_dic, args.exp_name) wandb_name = exp_name.replace('_sd' + str(args.seed), '') wandb.init(name=wandb_name, project=args.project) wandb.config.update(args) device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu' device = torch.device(device) if args.dataset == 'papers100M': dataset = MyNodePropPredDataset(name=args.dataset) elif args.dataset in ['flickr', 'reddit', 'yelp', 'amazon']: dataset = SAINTDataset(name=args.dataset) else: dataset = PygNodePropPredDataset(name=f'ogbn-{args.dataset}') split_idx = dataset.get_idx_split() train_idx = set(split_idx['train'].cpu().numpy()) valid_idx = set(split_idx['valid'].cpu().numpy()) test_idx = set(split_idx['test'].cpu().numpy()) if args.method != "acl": ego_graphs_unpadded = np.load( f'data/{args.dataset}-lc-{args.method}-ego-graphs-{args.ego_size}.npy', allow_pickle=True) conds_unpadded = np.load( f'data/{args.dataset}-lc-{args.method}-conds-{args.ego_size}.npy', allow_pickle=True) else: tmp_ego_size = 256 if args.dataset == 'products' else args.ego_size if args.ego_size < 64: 
tmp_ego_size = 64 ego_graphs_unpadded = np.load( f'data/{args.dataset}-lc-ego-graphs-{tmp_ego_size}.npy', allow_pickle=True) conds_unpadded = np.load( f'data/{args.dataset}-lc-conds-{tmp_ego_size}.npy', allow_pickle=True) ego_graphs_train, ego_graphs_valid, ego_graphs_test = [], [], [] cut_train, cut_valid, cut_test = [], [], [] for i, x in enumerate(ego_graphs_unpadded): idx = x[0] assert len(x) == len(conds_unpadded[i]) if len(x) > args.ego_size: x = x[:args.ego_size] conds_unpadded[i] = conds_unpadded[i][:args.ego_size] ego_graph = -np.ones(args.ego_size, dtype=np.int32) ego_graph[:len(x)] = x cut_position = np.argmin(conds_unpadded[i]) cut = np.zeros(args.ego_size, dtype=np.float32) cut[:cut_position + 1] = 1.0 if idx in train_idx: ego_graphs_train.append(ego_graph) cut_train.append(cut) elif idx in valid_idx: ego_graphs_valid.append(ego_graph) cut_valid.append(cut) elif idx in test_idx: ego_graphs_test.append(ego_graph) cut_test.append(cut) else: print(f"{idx} not in train/valid/test idx") ego_graphs_train, ego_graphs_valid, ego_graphs_test = torch.LongTensor( ego_graphs_train), torch.LongTensor( ego_graphs_valid), torch.LongTensor(ego_graphs_test) cut_train, cut_valid, cut_test = torch.FloatTensor( cut_train), torch.FloatTensor(cut_valid), torch.FloatTensor(cut_test) pe = None if args.pe_type == 1: pe = torch.load(f'data/{args.dataset}-embedding-{args.hidden_size}.pt') elif args.pe_type == 2: pe = np.fromfile("data/paper100m.pro", dtype=np.float32).reshape(-1, 128) pe = torch.FloatTensor(pe) if args.hidden_size < 128: pe = pe[:, :args.hidden_size] data = dataset[0] if len(data.y.shape) == 1: data.y = data.y.unsqueeze(1) adj = None if args.mask: adj = torch.BoolTensor(~np.load( f'data/{args.dataset}-ego-graphs-adj-{args.ego_size}.npy')) num_classes = dataset.num_classes train_dataset = NodeClassificationDataset(data.x, data.y, ego_graphs_train, pe, args, num_classes, adj, cut_train) train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=batcher(train_dataset), pin_memory=True) valid_dataset = NodeClassificationDataset(data.x, data.y, ego_graphs_valid, pe, args, num_classes, adj, cut_valid) valid_loader = DataLoader(valid_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=batcher(valid_dataset), pin_memory=True) test_dataset = NodeClassificationDataset(data.x, data.y, ego_graphs_test, pe, args, num_classes, adj, cut_test) test_loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=batcher(test_dataset), pin_memory=True) model = TransformerModel(data.x.size(1) + 1, args.hidden_size, args.num_heads, args.hidden_size, args.num_layers, num_classes, args.input_dropout, args.hidden_dropout, layer_norm=args.layer_norm, src_scale=args.src_scale, mlp=args.mlp).to(device) wandb.watch(model, log='all') if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] 
on 3 GPUs model = nn.DataParallel(model) pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) print('model parameters:', pytorch_total_params) if not os.path.exists('saved'): os.mkdir('saved') if torch.cuda.device_count() > 1: model.module.init_weights() else: model.init_weights() if args.load_path: model.load_state_dict(torch.load(args.load_path, map_location='cuda:0')) valid_acc, valid_loss = test(model, valid_loader, device, args) valid_output = f'Valid: {100 * valid_acc:.2f}% ' cor_train_acc, _ = test(model, train_loader, device, args) cor_test_acc, cor_test_loss = test(model, test_loader, device, args) train_output = f'Train: {100 * cor_train_acc:.2f}%, ' test_output = f'Test: {100 * cor_test_acc:.2f}%' print(train_output + valid_output + test_output) return best_val_acc = 0 cor_train_acc = 0 cor_test_acc = 0 patience = 0 if args.optimizer == 'adam': optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer == 'adamw': optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) else: raise NotImplementedError if args.warmup > 0: if args.scheduler == 'noam': optimizer = NoamOptim( optimizer, args.hidden_size if args.hidden_size > 0 else data.x.size(1), n_warmup_steps=args.warmup) #, init_lr=args.lr) elif args.scheduler == 'linear': optimizer = LinearOptim(optimizer, n_warmup_steps=args.warmup, n_training_steps=args.epochs * len(train_loader), init_lr=args.lr) for epoch in range(1, 1 + args.epochs): # lp = LineProfiler() # lp_wrapper = lp(train) # loss = lp_wrapper(model, train_loader, device, optimizer, args) # lp.print_stats() loss = train(model, train_loader, device, optimizer, args) train_output = valid_output = test_output = '' if epoch >= 10 and epoch % args.log_steps == 0: valid_acc, valid_loss = test(model, valid_loader, device, args) valid_output = f'Valid: {100 * valid_acc:.2f}% ' if valid_acc > best_val_acc: best_val_acc = valid_acc # cor_train_acc, _ = test(model, train_loader, device, args) cor_test_acc, cor_test_loss = test(model, test_loader, device, args) # train_output = f'Train: {100 * cor_train_acc:.2f}%, ' test_output = f'Test: {100 * cor_test_acc:.2f}%' patience = 0 try: if torch.cuda.device_count() > 1: torch.save(model.module.state_dict(), 'saved/' + exp_name + '.pt') else: torch.save(model.state_dict(), 'saved/' + exp_name + '.pt') wandb.save('saved/' + exp_name + '.pt') except FileNotFoundError as e: print(e) else: patience += 1 if patience >= args.early_stopping: print('Early stopping...') break wandb.log({ 'Train Loss': loss, 'Valid Acc': valid_acc, 'best_val_acc': best_val_acc, 'cor_test_acc': cor_test_acc, 'LR': get_lr(optimizer), 'Valid Loss': valid_loss, 'cor_test_loss': cor_test_loss }) else: wandb.log({'Train Loss': loss, 'LR': get_lr(optimizer)}) # train_output + print(f'Epoch: {epoch:02d}, ' f'Loss: {loss:.4f}, ' + valid_output + test_output)
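# get_lr(optimizer), logged to wandb above, is not defined in this section. A
# plausible one-line helper is sketched here as an assumption only; note that with
# the NoamOptim / LinearOptim wrappers the current rate may instead be exposed by
# the wrapper object itself.
def get_lr(optimizer):
    """Report the learning rate of the first parameter group."""
    return optimizer.param_groups[0]['lr']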
        self.mode = "test"

        ## Transformer hyperparameters
        self.dropout = 0.5
        self.max_len = 5000
        self.nhead = 2
        # data_path = "E:/study_series/2020_3/re_write_classify/data/"
        # data_path = "/mnt/data3/wuchunsheng/code/nlper/NLP_task/text_classification/my_classification_cnews/2020_3_30/text_classify/data/"


config = Config()
train_iter, valid_iter, test_iter, TEXT = generate_data(config)
# model = RNNModel(config, TEXT).to(config.device)
model = TransformerModel(config, TEXT).to(config.device)
model = load_model(config, model)

# sen = "目" * 50
sen = "体育快讯"  # "sports news flash"
# sen = "".join(['c', 'o', 'n', 't', 'e', 'x', 't', ',', 'l', 'a', 'b', 'e', 'l'])
# res = test_sentence(config, model, TEXT, sen)
# print(sen)
# print(res)

# res = test(config, model, TEXT, test_iter)
# print(res)
print("=========================")
sen = "篮球"  # "basketball"
# sen = "体育"  # "sports"
sen_ori = sen
logger = logging.getLogger('test')
logger.info(f"Test Log")
logger.info(opt)

device = torch.device(f"cuda:{opt.device}" if opt.device.isdigit() else 'cpu')

if __name__ == '__main__':
    src_vocab_list = VocabField.load_vocab(opt.src_vocab_file)
    src_vocab = VocabField(src_vocab_list, vocab_size=opt.src_vocab_size)

    if opt.model_name == 'textrcnn':
        model = TextRCNN(opt.src_vocab_size, opt.embedding_size, opt.hidden_size)
    elif opt.model_name == 'rnn':
        model = RNN(opt.src_vocab_size, opt.embedding_size, opt.hidden_size)
    elif opt.model_name == 'transformer':
        model = TransformerModel(opt.src_vocab_size, opt.embedding_size,
                                 opt.hidden_size)

    last_checkpoint = None
    if opt.resume and not opt.load_checkpoint:
        last_checkpoint = get_last_checkpoint(opt.best_model_dir)
        if last_checkpoint:
            opt.load_checkpoint = os.path.join(opt.model_dir, last_checkpoint)
            opt.skip_steps = int(last_checkpoint.strip('.pt').split('/')[-1])

    if opt.load_checkpoint:
        model.load_state_dict(torch.load(opt.load_checkpoint))
        opt.skip_steps = int(opt.load_checkpoint.strip('.pt').split('/')[-1])
        logger.info(f"\nLoad from {opt.load_checkpoint}\n")
    else:
        for param in model.parameters():
            param.data.uniform_(-opt.init_weight, opt.init_weight)