# Framework imports used below; ut, mdl, ScoringService and model_path are defined
# elsewhere in this serving module.
import os
from io import StringIO

import flask
import pandas as pd


def transformation():
    """Do an inference on a single batch of data.

    In this sample server, we take data as CSV, convert it to a pandas data frame for
    internal use, and then convert the predictions back to CSV (which really just means
    one prediction per line, since there's a single column).
    """
    # Convert from CSV to pandas
    if flask.request.content_type == 'text/csv':
        data = flask.request.data.decode('utf-8')
        df = pd.read_csv(StringIO(data))
    else:
        return flask.Response(response='This predictor only supports CSV data',
                              status=415, mimetype='text/plain')

    # Read the S3 location of the prediction dataset from the input
    bucket = df.loc[0, 'bucket']
    s3_file_path = df.loc[0, 'file_path']
    pred_file_path = 'gluonts_ds.pkl'  # os.environ['SM_CHANNEL_PRED']

    # Download the prediction file from S3
    try:
        ut.download_file_from_S3(bucket, s3_file_path, pred_file_path)
    except Exception as exc:
        print('Failed to download {} from bucket {}: {}'.format(s3_file_path, bucket, exc))

    print('current dir:', os.listdir('.'))
    print('model dir:', os.listdir(model_path))

    # Read the downloaded file into a GluonTS dataset
    pred_ds = mdl.train_input_fn(pred_file_path)

    # Do the prediction
    predictions = ScoringService.predict(pred_ds)

    # Convert from numpy back to CSV
    out = StringIO()
    pd.DataFrame({'results': predictions}).to_csv(out, header=False, index=False)
    result = out.getvalue()

    return flask.Response(response=result, status=200, mimetype='text/csv')
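# A minimal client-side sketch of how the handler above can be exercised. It assumes the
# function is wired to the usual SageMaker-style '/invocations' route on localhost:8080;
# the route, port, bucket name and object key below are illustrative assumptions, not
# values defined in this file.
import requests

# The CSV body carries the two columns the handler reads: 'bucket' and 'file_path'.
payload = 'bucket,file_path\nmy-example-bucket,path/to/gluonts_ds.pkl\n'

response = requests.post('http://localhost:8080/invocations',   # assumed route and port
                         data=payload,
                         headers={'Content-Type': 'text/csv'})
print(response.status_code)
print(response.text)  # one prediction per line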
import os

import tensorflow as tf

from model import (train_input_fn, eval_input_fn, model_fn,
                   eval_on_train_data_input_fn, serving_input_fn)

TRAINING_STEPS = 1000
EVAL_STEPS = 100

# Path of the training data inside the input channel
trainfolderpath = os.path.join(os.environ['DATASPINE_INPUT_PATH'], 'training')
print('Training data path:', trainfolderpath)

train_func = train_input_fn(trainfolderpath, "training.csv")

estimator = tf.estimator.Estimator(model_fn=model_fn)
estimator.train(input_fn=train_func, steps=TRAINING_STEPS)

# Export the trained model as a SavedModel
serving_func = serving_input_fn(hyperparameters={})
export_path = os.environ['DATASPINE_OUTPUT_PATH']
exported_model = estimator.export_savedmodel(
    export_dir_base=export_path,
    serving_input_receiver_fn=serving_func)

print()
print(exported_model)
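# The script above imports serving_input_fn from model.py. A minimal sketch of what such a
# function can look like is shown below, assuming a single float feature named 'inputs' with
# shape [None, 1]; the real signature and feature spec live in model.py and may differ.
import tensorflow as tf


def example_serving_input_fn(hyperparameters):
    def _serving_input_receiver_fn():
        # Placeholder for a batch of single-feature float inputs (the shape is an assumption).
        inputs = tf.compat.v1.placeholder(tf.float32, shape=[None, 1], name='inputs')
        # Hand the raw placeholder to the model as both the feature dict and the receiver tensors.
        return tf.estimator.export.ServingInputReceiver(
            features={'inputs': inputs},
            receiver_tensors={'inputs': inputs})
    # Like the imported serving_input_fn, return a no-argument callable that
    # estimator.export_savedmodel invokes at export time.
    return _serving_input_receiver_fn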
# Standard-library and framework imports used by test() and train(); load_data,
# load_test_data, train_input_fn, train_test_split, Encoder and Decoder come from this
# project's own modules.
import os
import time

import tensorflow as tf
from tqdm import tqdm


def test():
    # Dropout is disabled in the test setup
    BATCH_SIZE = args.batch_size
    EMBED_DIM = args.embeddingDim
    MAXLEN = args.maxLen
    NUM_UNITS = args.units
    CKPT = args.checkpoint
    LEARNING_RATE = args.learning_rate
    EPOCH = 1
    DROPOUT = 0
    start_word = "<s>"
    end_word = "</s>"

    test_source_tensor, test_source_tokenizer, test_target_tensor, test_target_tokenizer = \
        load_test_data(test_translate_from=r"./data/newstest2015.en",
                       test_translate_to=r"./data/newstest2015.de",
                       vocab_from=r"./data/vocab.50K.en",
                       vocab_to=r"./data/vocab.50K.de",
                       pad_length=90,
                       limit=args.limit)
    print(len(test_source_tensor))
    vocab_source_size = len(test_source_tokenizer.word_index) + 1
    print("vocab_input_size: ", vocab_source_size)
    vocab_target_size = len(test_target_tokenizer.word_index) + 1
    print("vocab_target_size: ", vocab_target_size)

    buffer_size = len(test_source_tensor)
    test_steps = len(test_source_tensor) // BATCH_SIZE

    dataset = train_input_fn(test_source_tensor, test_target_tensor, buffer_size,
                             EPOCH, BATCH_SIZE)

    encoder = Encoder(vocab_source_size, EMBED_DIM, NUM_UNITS,
                      dropout_rate=DROPOUT, batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_target_size, EMBED_DIM, NUM_UNITS,
                      batch_size=BATCH_SIZE, method=None, dropout_rate=DROPOUT)
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001)

    # Restore the latest training checkpoint
    ckpt = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)
    manager = tf.train.CheckpointManager(ckpt, CKPT, max_to_keep=10)
    ckpt.restore(manager.latest_checkpoint)

    per_epoch_loss, per_epoch_plex = 0, 0

    def test_wrapper(source, target):
        result = ""
        source_out, source_state = encoder(source, encoder_state, activation="tanh")
        initial = tf.expand_dims(
            [test_target_tokenizer.word_index[start_word]] * BATCH_SIZE, 1)
        attention_state = tf.zeros((BATCH_SIZE, 1, EMBED_DIM))
        apply_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
        # cur_total_loss is the sum of the per-step losses, i.e. the batch loss
        cur_total_loss, cur_total_plex, cur_loss = 0, 0, 0
        for i in range(target.shape[1]):
            output_state, source_state, attention_state = decoder(
                initial, source_state, source_out, attention_state)
            cur_loss = apply_loss(target[:, i], output_state)
            perplex = tf.nn.sparse_softmax_cross_entropy_with_logits(
                target[:, i], output_state)
            # Greedy choice for the first sequence in the batch; used to build the decoded string
            current_ind = tf.argmax(output_state[0])
            # 0 is the padding id in target; real tokens should never be 0, so mask padded
            # positions (mask is a 0/1 tensor) out of the loss and perplexity.
            mask = tf.math.logical_not(tf.math.equal(target[:, i], 0))
            mask = tf.cast(mask, dtype=cur_loss.dtype)
            cur_loss *= mask
            perplex *= mask
            cur_total_loss += tf.reduce_mean(cur_loss)
            cur_total_plex += tf.reduce_mean(perplex)
            if current_ind.numpy() == 0:
                # 0 is the padding id, so it is not recorded in the decoded string
                continue
            result += test_target_tokenizer.index_word[current_ind.numpy()]
            if test_target_tokenizer.index_word[current_ind.numpy()] == end_word:
                break
            initial = tf.expand_dims(target[:, i], 1)
        batch_loss = cur_total_loss / target.shape[1]
        batch_perplex = cur_total_plex / target.shape[1]
        return batch_loss, batch_perplex, result

    encoder_hidden = encoder.initialize_hidden_state()
    encoder_ceil = encoder.initialize_cell_state()
    encoder_state = [[encoder_hidden, encoder_ceil],
                     [encoder_hidden, encoder_ceil],
                     [encoder_hidden, encoder_ceil],
                     [encoder_hidden, encoder_ceil]]
    # TODO: double check that all re-initialization is performed

    result_by_batch = []
    for idx, data in tqdm(enumerate(dataset.take(test_steps)), total=args.limit):
        source, target = data
        batch_loss, batch_perplex, result = test_wrapper(source, target)
        with open("checkpoint/test_logger.txt", "a") as filelogger:
            print("The validation loss in batch " + str(idx) + " is: ",
                  str(batch_loss.numpy()), file=filelogger)
            print("The validation perplexity in batch " + str(idx) + " is: ",
                  str(batch_perplex.numpy()), file=filelogger)
        per_epoch_loss += batch_loss
        per_epoch_plex += batch_perplex
        assert type(result) == str
        result_by_batch.append(result)

    with open("checkpoint/test_logger.txt", "a") as filelogger:
        print("The validation loss is: ",
              str(per_epoch_loss.numpy() / (idx + 1.0)), file=filelogger)
        # Perplexity is the exponential of the average per-step cross-entropy
        print("The validation perplexity is: ",
              str(tf.exp(per_epoch_plex / (idx + 1.0)).numpy()), file=filelogger)
    return test_target_tokenizer, result_by_batch
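# A small sketch of how the values returned by test() can be consumed. test() greedily
# decodes one hypothesis string per batch, so result_by_batch is a list of strings. The
# output path below is an assumption chosen for illustration.
def save_decoded_results(result_by_batch, out_path="checkpoint/decoded_hypotheses.txt"):
    with open(out_path, "w") as out_file:
        for idx, sentence in enumerate(result_by_batch):
            out_file.write("batch {}: {}\n".format(idx, sentence))


# Example usage (assumes `args` has been parsed at module level, as test() requires):
# target_tokenizer, hypotheses = test()
# save_decoded_results(hypotheses)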
def train():
    # args is a global variable in this script
    BATCH_SIZE = args.batch_size
    EPOCH = args.epoch
    EMBED_DIM = args.embeddingDim
    MAXLEN = args.maxLen
    NUM_UNITS = args.units
    LEARNING_RATE = args.learning_rate
    DROPOUT = args.dropout
    METHOD = args.method
    GPUNUM = args.gpuNum
    CKPT = args.checkpoint
    LIMIT = args.limit
    start_word = "<s>"
    end_word = "</s>"

    # The tokenizers only hold the vocabulary used to split the data;
    # they are not part of the data themselves.
    train_source_tensor, train_source_tokenizer, train_target_tensor, train_target_tokenizer = \
        load_data(pad_length=MAXLEN, limit=LIMIT)
    buffer_size = len(train_source_tensor)
    train_source_tensor, val_source_tensor, train_target_tensor, val_target_tensor = \
        train_test_split(train_source_tensor, train_target_tensor, random_state=2019)
    # TODO: check whether we need the target tokenizer
    training_steps = len(train_source_tensor) // BATCH_SIZE
    vocab_source_size = len(train_source_tokenizer.word_index) + 1
    print("vocab_input_size: ", vocab_source_size)
    vocab_target_size = len(train_target_tokenizer.word_index) + 1
    print("vocab_target_size: ", vocab_target_size)

    step = tf.Variable(0, trainable=False)
    # An alternative considered here was a piecewise-constant learning-rate schedule
    # (tf.compat.v1.train.piecewise_constant) driven by `step`.
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.001)

    # Set up the checkpoint directory
    if not os.path.exists(CKPT):
        os.makedirs(CKPT)
    else:
        print("Warning: the checkpoint dir already exists!",
              "\nPlease consider choosing a new dir to save your checkpoints.")
    checkpoint_prefix = os.path.join(CKPT, "ckpt")

    dataset = train_input_fn(train_source_tensor, train_target_tensor, buffer_size,
                             EPOCH, BATCH_SIZE)
    apply_loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
    encoder = Encoder(vocab_source_size, EMBED_DIM, NUM_UNITS,
                      dropout_rate=DROPOUT, batch_size=BATCH_SIZE)
    decoder = Decoder(vocab_target_size, EMBED_DIM, NUM_UNITS,
                      batch_size=BATCH_SIZE, method=None, dropout_rate=DROPOUT)
    # Checkpoint the optimizer together with the encoder and decoder so that test() can
    # restore all three.
    checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

    def train_wrapper(source, target):
        with tf.GradientTape() as tape:
            source_out, source_state = encoder(source, encoder_state, activation="tanh")
            initial = tf.expand_dims(
                [train_target_tokenizer.word_index[start_word]] * BATCH_SIZE, 1)
            attention_state = tf.zeros((BATCH_SIZE, 1, EMBED_DIM))
            # cur_total_loss is the sum of the per-step losses, i.e. the batch loss
            cur_total_loss, cur_loss = 0, 0
            for i in range(1, target.shape[1]):
                output_state, source_state, attention_state = decoder(
                    initial, source_state, source_out, attention_state)
                cur_loss = apply_loss(target[:, i], output_state)
                # 0 is the padding id in target; real tokens should never be 0, so mask
                # padded positions (mask is a 0/1 tensor) out of the loss.
                mask = tf.math.logical_not(tf.math.equal(target[:, i], 0))
                mask = tf.cast(mask, dtype=cur_loss.dtype)
                cur_loss *= mask
                cur_total_loss += tf.reduce_mean(cur_loss)
                initial = tf.expand_dims(target[:, i], 1)  # teacher forcing
            batch_loss = cur_total_loss / target.shape[1]
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(cur_total_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables), global_step=step)
        return batch_loss

    for epoch in range(EPOCH):
        per_epoch_loss = 0
        start = time.time()
        encoder_hidden = encoder.initialize_hidden_state()
        encoder_ceil = encoder.initialize_cell_state()
        encoder_state = [[encoder_hidden, encoder_ceil],
                         [encoder_hidden, encoder_ceil],
                         [encoder_hidden, encoder_ceil],
                         [encoder_hidden, encoder_ceil]]
        # TODO: double check that all re-initialization is performed
        for idx, data in enumerate(dataset.take(training_steps)):
            source, target = data
            batch_loss = train_wrapper(source, target)
            per_epoch_loss += batch_loss
            if idx % 10 == 0:
                print("current learning rate is: " + str(optimizer._learning_rate))
                print('Epoch {}/{} Batch {}/{} Loss {:.4f}'.format(
                    epoch + 1, EPOCH, idx + 1, training_steps, batch_loss.numpy()))
                if step >= 5:
                    optimizer._learning_rate /= 2.0
        print('Epoch {}/{} Total Loss per epoch {:.4f} - {} sec'.format(
            epoch + 1, EPOCH, per_epoch_loss / training_steps, time.time() - start))
        # TODO: for evaluation, add a BLEU score
        if epoch % 10 == 0:
            print('Saving a checkpoint every 10 epochs')
            checkpoint.save(file_prefix=checkpoint_prefix)
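# Both train() and test() read hyperparameters from a module-level `args`. A sketch of an
# argparse setup that provides every attribute referenced above is shown below; the default
# values are assumptions for illustration, not the values used in the original experiments.
import argparse


def build_arg_parser():
    parser = argparse.ArgumentParser(description="Seq2seq NMT training/testing")
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--epoch", type=int, default=10)
    parser.add_argument("--embeddingDim", type=int, default=256)
    parser.add_argument("--maxLen", type=int, default=90)
    parser.add_argument("--units", type=int, default=512)
    parser.add_argument("--learning_rate", type=float, default=0.001)
    parser.add_argument("--dropout", type=float, default=0.2)
    parser.add_argument("--method", type=str, default=None)   # attention method
    parser.add_argument("--gpuNum", type=int, default=1)
    parser.add_argument("--checkpoint", type=str, default="checkpoint")
    parser.add_argument("--limit", type=int, default=None)
    return parser


# Example usage:
# args = build_arg_parser().parse_args()
# train()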