def train():
    encoder_input_data, encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
    s_encoder_input_data, s_encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_seq_vecs_v2.npy')
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')

    num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH / 'py_code_proc_v2.dpkl')
    s_num_encoder_tokens, s_enc_pp = load_text_processor(OUTPUT_PATH / 'py_seq_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH / 'py_comment_proc_v2.dpkl')

    seq2seq_Model = build_seq2seq_model(
        word_emb_dim=128,
        hidden_state_dim=128,
        encoder_seq_len=encoder_seq_len,
        s_encoder_seq_len=s_encoder_seq_len,
        num_encoder_tokens=num_encoder_tokens,
        num_s_encoder_tokens=s_num_encoder_tokens,
        num_decoder_tokens=num_decoder_tokens)

    seq2seq_Model.summary()

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))
    model_checkpoint = ModelCheckpoint(
        '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(
            script_name_base),
        save_best_only=True)

    batch_size = 100
    epochs = 50
    history = seq2seq_Model.fit(
        [encoder_input_data, s_encoder_input_data, decoder_input_data],
        np.expand_dims(decoder_target_data, -1),
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.12,
        callbacks=[csv_logger, model_checkpoint])

    seq2seq_Model.save("seqmodel.hdf5")
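# --- Illustrative usage sketch (not part of the original source) ---
# After train() finishes, the checkpoint it saves can be reloaded for
# inference or further fine-tuning. This assumes build_seq2seq_model uses
# only standard Keras layers, so load_model needs no custom_objects.
from keras.models import load_model

if __name__ == '__main__':
    train()
    reloaded = load_model("seqmodel.hdf5")
    reloaded.summary()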
                    type=int, default=get_value_as_int('TRAIN_EPOCHS', 7))
parser.add_argument("--batch_size", type=int,
                    default=get_value_as_int('BATCH_SIZE', 1200))
parser.add_argument("--validation_split", type=float,
                    default=get_value_as_float('VALIDATION_SPLIT', 0.12))

args = parser.parse_args()
print(args)

learning_rate = float(args.learning_rate)

encoder_input_data, doc_length = load_encoder_inputs(
    args.input_train_body_vecs_npy)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    args.input_train_title_vecs_npy)

num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

###############
# Encoder Model.
###############
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
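# --- Illustrative sketch (not part of the original source) ---
# get_value_as_int / get_value_as_float are used above but defined elsewhere
# in the project. A plausible implementation reads an environment variable
# and falls back to the supplied default when it is unset or unparsable.
import os

def get_value_as_int(name, default):
    try:
        return int(os.environ[name])
    except (KeyError, ValueError):
        return default

def get_value_as_float(name, default):
    try:
        return float(os.environ[name])
    except (KeyError, ValueError):
        return default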
def build_model(self, learning_rate):
    """Build a keras model."""
    logging.info("starting")

    if self.job_name and self.job_name.lower() in ["ps"]:
        logging.info("ps doesn't build model")
        return

    self.encoder_input_data, doc_length = load_encoder_inputs(
        self.preprocessed_bodies)
    self.decoder_input_data, self.decoder_target_data = load_decoder_inputs(
        self.preprocessed_titles)

    num_encoder_tokens, self.body_pp = load_text_processor(
        self.body_pp_file)
    num_decoder_tokens, self.title_pp = load_text_processor(
        self.title_pp_file)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ##### Define Model Architecture ######

    ########################
    #### Encoder Model ####
    encoder_inputs = keras.layers.Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = keras.layers.Embedding(
        num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
        mask_zero=False)(encoder_inputs)
    x = keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = keras.layers.GRU(latent_dim, return_state=True,
                                  name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = keras.Model(inputs=encoder_inputs, outputs=state_h,
                                name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ########################
    #### Decoder Model ####
    decoder_inputs = keras.layers.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = keras.layers.Embedding(
        num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding',
        mask_zero=False)(decoder_inputs)
    dec_bn = keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # TODO(https://github.com/kubeflow/examples/issues/196):
    # With TF.Estimator we hit https://github.com/keras-team/keras/issues/9761
    # and the model won't train.
    decoder_gru = keras.layers.GRU(
        latent_dim, return_state=True, return_sequences=True,
        name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=[seq2seq_encoder_out])
    x = keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = keras.layers.Dense(
        num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model ####
    self.seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    self.seq2seq_Model.compile(
        optimizer=keras.optimizers.Nadam(lr=learning_rate),
        loss='sparse_categorical_crossentropy')
    # TODO(jlewi): Computing accuracy causes a dimension mismatch.
    # tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [869] vs. [79,11] # pylint: disable=line-too-long
    # [[{{node metrics/acc/Equal}} = Equal[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](metrics/acc/Reshape, metrics/acc/Cast)]] # pylint: disable=line-too-long
    # metrics=['accuracy'])

    self.seq2seq_Model.summary()
        dpickle.dump(title_pp, f)

    # Save the processed data
    np.save(data_dir + 'train_title_vecs.npy', train_title_vecs)
    np.save(data_dir + 'train_body_vecs.npy', train_body_vecs)
else:
    time.sleep(120)
    while True:
        if os.path.isfile(data_dir + 'train_body_vecs.npy'):
            break
        print("Waiting for dataset")
        time.sleep(2)

encoder_input_data, doc_length = load_encoder_inputs(data_dir + 'train_body_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(
    data_dir + 'train_title_vecs.npy')

num_encoder_tokens, body_pp = load_text_processor(data_dir + 'body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor(data_dir + 'title_pp.dpkl')

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = tf.keras.layers.Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
title_pkl_file = args.output_dir + '/title_pp.dpkl'
train_title_vecs_file = args.output_dir + '/train_title_vecs.npy'

# Save the preprocessor
with open(body_pkl_file, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(title_pkl_file, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data
np.save(train_title_vecs_file, train_title_vecs)
np.save(train_body_vecs_file, train_body_vecs)

encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs_file)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    train_title_vecs_file)

num_encoder_tokens, body_pp = load_text_processor(body_pkl_file)
num_decoder_tokens, title_pp = load_text_processor(title_pkl_file)

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

# Word embedding for encoder (ex: Issue Body)
x = Embedding(num_encoder_tokens,
# Save the processed data
np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code)
np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment)

# Arrange data for modeling

# In[5]:

from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'py_comment_proc_v2.dpkl')

# If you don't have the above files on disk because you set `use_cache = True` you can download
# the files for the above function calls here:
#
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_code_vecs_v2.npy
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_comment_vecs_v2.npy
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl

# # Build Seq2Seq Model For Summarizing Code
#
# We will build a model to predict the docstring given a function or a method. While this is a
# very cool task in itself, it is not the end goal of this exercise. The motivation for training
# this model is to learn a general purpose feature extractor for code that we can use for the
# task of code search.
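# --- Illustrative sketch (not part of the original source) ---
# One way to fetch the cached artifacts listed above when they are not on
# disk. The file names and OUTPUT_PATH come from the notebook; the download
# loop itself is an assumption, using only the standard library.
from urllib.request import urlretrieve

CACHE_BASE = 'https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/'
for fname in ['py_t_code_vecs_v2.npy', 'py_t_comment_vecs_v2.npy',
              'py_code_proc_v2.dpkl', 'py_comment_proc_v2.dpkl']:
    dest = OUTPUT_PATH / fname
    if not dest.exists():
        urlretrieve(CACHE_BASE + fname, str(dest))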
import numpy as np

if not use_cache:
    # Save the preprocessor
    with open(OUTPUT_PATH / 'py_code_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(code_proc, f)

    with open(OUTPUT_PATH / 'py_comment_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(comment_proc, f)

    # Save the processed data
    np.save(OUTPUT_PATH / 'py_t_code_vecs_v2.npy', t_code)
    np.save(OUTPUT_PATH / 'py_t_comment_vecs_v2.npy', t_comment)

# Arrange data for modeling
from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

encoder_input_data, encoder_seq_length = load_encoder_inputs(
    OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(
    OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH / 'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH / 'py_comment_proc_v2.dpkl')

# Build Seq2Seq Model for summarizing code
from seq2seq_utils import build_seq2seq_model

seq2seq_Model = build_seq2seq_model(word_emb_dim=800,
                                    hidden_state_dim=1000,
                                    encoder_seq_len=encoder_seq_length,
                                    num_encoder_tokens=num_encoder_tokens,
                                    num_decoder_tokens=num_decoder_tokens)
seq2seq_Model.summary()

# Train Seq2Seq Model
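# --- Illustrative sketch (not part of the original source) ---
# A training step that could follow the "Train Seq2Seq Model" heading above,
# mirroring the compile/fit pattern used in train() earlier in this section.
# The learning rate, batch size, and epoch count here are assumptions, not
# values taken from the original script.
from keras import optimizers

seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                      loss='sparse_categorical_crossentropy')

history = seq2seq_Model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=1200,
    epochs=16,
    validation_split=0.12)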