def train():
    encoder_input_data, encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
    s_encoder_input_data, s_encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_seq_vecs_v2.npy')
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')

    num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH / 'py_code_proc_v2.dpkl')
    s_num_encoder_tokens, s_enc_pp = load_text_processor(OUTPUT_PATH / 'py_seq_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH / 'py_comment_proc_v2.dpkl')

    seq2seq_Model = build_seq2seq_model(
        word_emb_dim=128,
        hidden_state_dim=128,
        encoder_seq_len=encoder_seq_len,
        s_encoder_seq_len=s_encoder_seq_len,
        num_encoder_tokens=num_encoder_tokens,
        num_s_encoder_tokens=s_num_encoder_tokens,
        num_decoder_tokens=num_decoder_tokens)
    seq2seq_Model.summary()
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))
    model_checkpoint = ModelCheckpoint(
        '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(
            script_name_base),
        save_best_only=True)

    batch_size = 100
    epochs = 50
    history = seq2seq_Model.fit(
        [encoder_input_data, s_encoder_input_data, decoder_input_data],
        np.expand_dims(decoder_target_data, -1),
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.12,
        callbacks=[csv_logger, model_checkpoint])
    seq2seq_Model.save("seqmodel.hdf5")
parser.add_argument("--tempfile", default=True) parser.add_argument("--epochs", type=int, default=get_value_as_int('TRAIN_EPOCHS', 7)) parser.add_argument("--batch_size", type=int, default=get_value_as_int('BATCH_SIZE', 1200)) parser.add_argument("--validation_split", type=float, default=get_value_as_float('BATCH_SIZE', 0.12)) args = parser.parse_args() print(args) learning_rate = float(args.learning_rate) encoder_input_data, doc_length = load_encoder_inputs( args.input_train_body_vecs_npy) decoder_input_data, decoder_target_data = load_decoder_inputs( args.input_train_title_vecs_npy) num_encoder_tokens, body_pp = load_text_processor( args.input_body_preprocessor_dpkl) num_decoder_tokens, title_pp = load_text_processor( args.input_title_preprocessor_dpkl) # Arbitrarly set latent dimension for embedding and hidden units latent_dim = 300 ############### # Encoder Model. ############### encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')
def main():  # pylint: disable=too-many-statements
  # Parsing flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument("--input_data_gcs_bucket", type=str, default="")
  parser.add_argument("--input_data_gcs_path", type=str, default="")

  parser.add_argument("--output_model_gcs_bucket", type=str, default="")
  parser.add_argument("--output_model_gcs_path", type=str, default="")

  parser.add_argument("--output_body_preprocessor_dpkl",
                      type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl",
                      type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy",
                      type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy",
                      type=str,
                      default="train_body_vecs.npy")
  parser.add_argument("--output_model_h5", type=str, default="output_model.h5")

  args = parser.parse_args()
  logging.info(args)

  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  print("Download input file")
  if args.input_data_gcs_bucket != "" and args.input_data_gcs_path != "":
    bucket = storage.Bucket(storage.Client(), args.input_data_gcs_bucket)
    storage.Blob(args.input_data_gcs_path,
                 bucket).download_to_filename('github-issues.zip')
  else:
    urllib.request.urlretrieve(
        "https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip",
        'github-issues.zip')

  print("Unzip input file")
  zip_ref = zipfile.ZipFile('github-issues.zip', 'r')
  zip_ref.extractall('.')
  zip_ref.close()

  # Read in the data and sample 2M rows (for speed of tutorial).
  traindf, testdf = train_test_split(
      pd.read_csv('github_issues.csv').sample(n=args.sample_size),
      test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70. Also, retain only the top 8,000 words in the vocabulary and
  # set the remaining words to 1, which becomes the common index for rare words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different parameters.
  title_pp = processor(append_indicators=True,
                       keep_n=4500,
                       padding_maxlen=12,
                       padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

  num_encoder_tokens, body_pp = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, title_pp = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set latent dimension for embedding and hidden units
  latent_dim = 300

  ###############
  # Encoder Model.
  ###############
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for encoder (e.g. Issue Body)
  x = Embedding(num_encoder_tokens,
                latent_dim,
                name='Body-Word-Embedding',
                mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs,
                        outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  ################
  decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for decoder (e.g. Issue Titles)
  dec_emb = Embedding(num_decoder_tokens,
                      latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using `decoder_state_input` as initial state.
  decoder_gru = GRU(latent_dim,
                    return_state=True,
                    return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction
  decoder_dense = Dense(num_decoder_tokens,
                        activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  ################
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  #############
  # Save model.
  #############
  seq2seq_Model.save(args.output_model_h5)

  ######################
  # Upload model to GCS.
  ######################
  if args.output_model_gcs_bucket != "":
    bucket = storage.Bucket(storage.Client(), args.output_model_gcs_bucket)
    storage.Blob(args.output_model_gcs_path,
                 bucket).upload_from_filename(args.output_model_h5)
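# The decoder above is trained with teacher forcing (the ground-truth title is
# fed in as `Decoder-Input`). Below is a minimal greedy-decoding sketch, not
# part of the training script, showing how the trained model can be used
# autoregressively at inference time; `start_tok` / `end_tok` stand in for the
# indicator token ids that the title preprocessor adds (append_indicators=True).
import numpy as np

def greedy_decode(model, body_vec, start_tok, end_tok, max_len=12):
  """Hypothetical sketch: generate a title token by token."""
  decoded = [start_tok]
  for _ in range(max_len):
    # Feed the issue body plus the tokens generated so far; take the argmax
    # of the final time step as the next token.
    probs = model.predict([body_vec.reshape(1, -1), np.array([decoded])])
    next_tok = int(np.argmax(probs[0, -1, :]))
    if next_tok == end_tok:
      break
    decoded.append(next_tok)
  return decoded[1:]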
def build_model(self, learning_rate):
  """Build a keras model."""
  logging.info("starting")

  if self.job_name and self.job_name.lower() in ["ps"]:
    logging.info("ps doesn't build model")
    return

  self.encoder_input_data, doc_length = load_encoder_inputs(
      self.preprocessed_bodies)
  self.decoder_input_data, self.decoder_target_data = load_decoder_inputs(
      self.preprocessed_titles)

  num_encoder_tokens, self.body_pp = load_text_processor(
      self.body_pp_file)
  num_decoder_tokens, self.title_pp = load_text_processor(
      self.title_pp_file)

  # Arbitrarily set latent dimension for embedding and hidden units
  latent_dim = 300

  ##### Define Model Architecture ######

  ########################
  #### Encoder Model ####
  encoder_inputs = keras.layers.Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for encoder (e.g. Issue Body)
  x = keras.layers.Embedding(
      num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
      mask_zero=False)(encoder_inputs)

  x = keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = keras.layers.GRU(latent_dim, return_state=True,
                                name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = keras.Model(inputs=encoder_inputs, outputs=state_h,
                              name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ########################
  #### Decoder Model ####
  decoder_inputs = keras.layers.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for decoder (e.g. Issue Titles)
  dec_emb = keras.layers.Embedding(
      num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding',
      mask_zero=False)(decoder_inputs)

  dec_bn = keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # TODO(https://github.com/kubeflow/examples/issues/196):
  # With tf.estimator we hit https://github.com/keras-team/keras/issues/9761
  # and the model won't train.
  decoder_gru = keras.layers.GRU(
      latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=[seq2seq_encoder_out])
  x = keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction
  decoder_dense = keras.layers.Dense(
      num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ########################
  #### Seq2Seq Model ####
  self.seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs],
                                   decoder_outputs)

  self.seq2seq_Model.compile(
      optimizer=keras.optimizers.Nadam(lr=learning_rate),
      loss='sparse_categorical_crossentropy',)
  # TODO(jlewi): Computing accuracy causes a dimension mismatch.
  # tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [869] vs. [79,11]  # pylint: disable=line-too-long
  # [[{{node metrics/acc/Equal}} = Equal[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](metrics/acc/Reshape, metrics/acc/Cast)]]  # pylint: disable=line-too-long
  # metrics=['accuracy'])

  self.seq2seq_Model.summary()
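# The training step is not part of this chunk. Below is a minimal sketch of a
# hypothetical companion method that fits the compiled model, mirroring the
# fit pattern used elsewhere in these examples; the epoch count, batch size,
# and validation split are illustrative placeholders, not values from this file.
import numpy as np

def train_model(self, epochs=7, batch_size=1200):
  """Hypothetical sketch: fit the model built by build_model."""
  # The extra trailing axis on the targets matches what
  # sparse_categorical_crossentropy expects for the (batch, time, vocab)
  # softmax output.
  self.history = self.seq2seq_Model.fit(
      [self.encoder_input_data, self.decoder_input_data],
      np.expand_dims(self.decoder_target_data, -1),
      batch_size=batch_size,
      epochs=epochs,
      validation_split=0.12)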
#
# 1. Load the seq2seq model and extract the encoder (remember seq2seq models have an encoder and a decoder).
# 2. Freeze the weights of the encoder.
# 3. Add some dense layers on top of the encoder.
# 4. Train this new model by supplying `(code, docstring-embeddings)` pairs. We will call this model `code2emb_model`.
# 5. Unfreeze the entire model, and resume training. This helps fine-tune the model a little more towards this task.
# 6. Encode all of the code, including code that does not contain a docstring, and save that into a search index for future use.

# ### Load seq2seq model from Step 2 and extract the encoder
#
# First load the seq2seq model from Step 2, then extract the encoder (we do not need the decoder).

# In[2]:

# Load the pre-processed data for the encoder (we don't care about the decoder in this step)
encoder_input_data, doc_length = load_encoder_inputs(seq2seq_path / 'py_t_code_vecs_v2.npy')
seq2seq_Model = load_model(seq2seq_path / 'code_summary_seq2seq_model.h5')

# In[3]:

# Extract the encoder from the seq2seq model
encoder_model = extract_encoder_model(seq2seq_Model)
# Get a summary of the encoder and its layers
encoder_model.summary()

# Freeze the encoder

# In[4]:

# Freeze the encoder model
for l in encoder_model.layers:
    l.trainable = False
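# Below is a minimal sketch of the `code2emb_model` described in steps 3-4
# above. This is an illustration rather than the notebook's exact code: the
# hidden layer size, the output dimension `emb_dim`, and the loss are
# assumptions; the idea is simply dense layers on top of the frozen encoder,
# regressed against the pre-computed docstring embeddings.
from keras.layers import BatchNormalization, Dense, Input
from keras.models import Model

def build_code2emb_model(encoder_model, doc_length, emb_dim=500):
    """Hypothetical helper: dense head on top of the frozen encoder."""
    code_input = Input(shape=(doc_length,), name='Code-Input')
    enc_out = encoder_model(code_input)  # frozen encoder (step 2)
    x = Dense(512, activation='relu', name='Code2Emb-Dense-1')(enc_out)  # step 3
    x = BatchNormalization(name='Code2Emb-Batchnorm-1')(x)
    out = Dense(emb_dim, name='Docstring-Embedding')(x)
    code2emb_model = Model(inputs=code_input, outputs=out, name='Code2Emb-Model')
    # Step 4: train on (code, docstring-embedding) pairs; the loss choice here
    # (mean squared error) is illustrative.
    code2emb_model.compile(optimizer='nadam', loss='mse')
    return code2emb_model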
def main():  # pylint: disable=too-many-statements
  # Parsing flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument("--input_data", type=str, default="",
                      help="The input location, a local file path.")

  parser.add_argument(
      "--output_model", type=str, default="",
      help="The output location for the model, a local file path.")

  #####################################################
  # Optional section, based on what your model needs
  #####################################################
  parser.add_argument("--output_body_preprocessor_dpkl",
                      type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl",
                      type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy",
                      type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy",
                      type=str,
                      default="train_body_vecs.npy")
  ########################################################
  # End of optional args section
  #
  # Be sure to add your args at the appropriate sections
  # of the training code
  ########################################################

  args = parser.parse_args()

  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)
  logging.info(args)

  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  ##################################################
  # Reading input file(s)
  # Make changes as needed
  ##################################################

  # Read the input data file.
  ext = os.path.splitext(args.input_data)[-1]
  if ext.lower() == '.zip':
    zip_ref = zipfile.ZipFile(args.input_data, 'r')
    zip_ref.extractall('.')
    zip_ref.close()
    # TODO(jlewi): Hardcoding the file in the archive to use is brittle.
    # We should probably just require the input to be a CSV file.
    csv_file = 'github_issues.csv'
  else:
    csv_file = args.input_data

  ###################################################
  # Fill in your model training code starting here
  ###################################################

  # Read in the data and sample 2M rows (for speed of tutorial).
  traindf, testdf = train_test_split(
      pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70. Also, retain only the top 8,000 words in the vocabulary and
  # set the remaining words to 1, which becomes the common index for rare words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different parameters.
  title_pp = processor(append_indicators=True,
                       keep_n=4500,
                       padding_maxlen=12,
                       padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

  num_encoder_tokens, body_pp = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, title_pp = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set latent dimension for embedding and hidden units
  latent_dim = 300

  ###############
  # Encoder Model.
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for encoder (e.g. Issue Body)
  x = Embedding(num_encoder_tokens,
                latent_dim,
                name='Body-Word-Embedding',
                mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs,
                        outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for decoder (e.g. Issue Titles)
  dec_emb = Embedding(num_decoder_tokens,
                      latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using `decoder_state_input` as initial state.
  decoder_gru = GRU(latent_dim,
                    return_state=True,
                    return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction
  decoder_dense = Dense(num_decoder_tokens,
                        activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  ########################################################
  # End of your training code
  #
  # * Be sure to save your model to args.output_model
  #   such as Model.save(args.output_model)
  ########################################################

  # Save model.
  seq2seq_Model.save(args.output_model)
    with open(data_dir + 'title_pp.dpkl', 'wb') as f:
      dpickle.dump(title_pp, f)

    # Save the processed data
    np.save(data_dir + 'train_title_vecs.npy', train_title_vecs)
    np.save(data_dir + 'train_body_vecs.npy', train_body_vecs)
  else:
    time.sleep(120)
    while True:
      if os.path.isfile(data_dir + 'train_body_vecs.npy'):
        break
      print("Waiting for dataset")
      time.sleep(2)

  encoder_input_data, doc_length = load_encoder_inputs(
      data_dir + 'train_body_vecs.npy')
  decoder_input_data, decoder_target_data = load_decoder_inputs(
      data_dir + 'train_title_vecs.npy')
  num_encoder_tokens, body_pp = load_text_processor(data_dir + 'body_pp.dpkl')
  num_decoder_tokens, title_pp = load_text_processor(data_dir + 'title_pp.dpkl')

  # Arbitrarily set latent dimension for embedding and hidden units
  latent_dim = 300

  ##### Define Model Architecture ######

  ########################
  #### Encoder Model ####
  encoder_inputs = tf.keras.layers.Input(shape=(doc_length,),
                                         name='Encoder-Input')
def main():  # pylint: disable=too-many-statements
  # Parsing flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument(
      "--input_data", type=str, default="",
      help="The input location. Can be a GCS or local file path.")

  # TODO(jlewi): The following arguments are deprecated; just
  # use input_data. We should remove them as soon as all call sites
  # are updated.
  parser.add_argument("--input_data_gcs_bucket", type=str,
                      default="kubeflow-examples")
  parser.add_argument(
      "--input_data_gcs_path", type=str,
      default="github-issue-summarization-data/github-issues.zip")

  parser.add_argument(
      "--output_model", type=str, default="",
      help="The output location for the model, a GCS or local file path.")

  # TODO(jlewi): We should get rid of the following arguments and just use
  # --output_model_h5. If the output is a gs:// location we should use
  # a local file and then upload it to GCS.
  parser.add_argument("--output_model_gcs_bucket", type=str, default="")
  parser.add_argument(
      "--output_model_gcs_path", type=str,
      default="github-issue-summarization-data/output_model.h5")

  parser.add_argument("--output_body_preprocessor_dpkl",
                      type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl",
                      type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy",
                      type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy",
                      type=str,
                      default="train_body_vecs.npy")
  parser.add_argument("--output_model_h5", type=str,
                      default="output_model.h5")

  args = parser.parse_args()

  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)
  logging.info(args)

  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  # For backwards compatibility
  input_data_gcs_bucket = None
  input_data_gcs_path = None

  if not args.input_data:
    # Since input_data isn't set, fall back on the old arguments.
    input_data_gcs_bucket = args.input_data_gcs_bucket
    input_data_gcs_path = args.input_data_gcs_path
  else:
    if args.input_data.startswith('gs://'):
      input_data_gcs_bucket, input_data_gcs_path = split_gcs_uri(
          args.input_data)

  if input_data_gcs_bucket:
    logging.info("Download bucket %s object %s.", input_data_gcs_bucket,
                 input_data_gcs_path)
    bucket = storage.Bucket(storage.Client(), input_data_gcs_bucket)
    args.input_data = 'github-issues.zip'
    storage.Blob(input_data_gcs_path,
                 bucket).download_to_filename(args.input_data)

  ext = os.path.splitext(args.input_data)[-1]
  if ext.lower() == '.zip':
    zip_ref = zipfile.ZipFile(args.input_data, 'r')
    zip_ref.extractall('.')
    zip_ref.close()
    # TODO(jlewi): Hardcoding the file in the archive to use is brittle.
    # We should probably just require the input to be a CSV file.
    csv_file = 'github_issues.csv'
  else:
    csv_file = args.input_data

  # Read in the data and sample 2M rows (for speed of tutorial).
  traindf, testdf = train_test_split(
      pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70.
  # Also, retain only the top 8,000 words in the vocabulary and set
  # the remaining words to 1, which becomes the common index for rare words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different parameters.
  title_pp = processor(append_indicators=True,
                       keep_n=4500,
                       padding_maxlen=12,
                       padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

  num_encoder_tokens, body_pp = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, title_pp = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set latent dimension for embedding and hidden units
  latent_dim = 300

  ###############
  # Encoder Model.
  ###############
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for encoder (e.g. Issue Body)
  x = Embedding(num_encoder_tokens,
                latent_dim,
                name='Body-Word-Embedding',
                mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs,
                        outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  ################
  decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for decoder (e.g. Issue Titles)
  dec_emb = Embedding(num_decoder_tokens,
                      latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using `decoder_state_input` as initial state.
  decoder_gru = GRU(latent_dim,
                    return_state=True,
                    return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction
  decoder_dense = Dense(num_decoder_tokens,
                        activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  ################
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  #############
  # Save model.
  #############
  seq2seq_Model.save(args.output_model_h5)

  ######################
  # Upload model to GCS.
  ######################

  # For backwards compatibility
  output_model_gcs_bucket = None
  output_model_gcs_path = None

  if not args.output_model:
    # Since output_model isn't set, fall back on the old arguments.
    output_model_gcs_bucket = args.output_model_gcs_bucket
    output_model_gcs_path = args.output_model_gcs_path
  else:
    if args.output_model.startswith('gs://'):
      output_model_gcs_bucket, output_model_gcs_path = split_gcs_uri(
          args.output_model)

  if output_model_gcs_bucket:
    logging.info("Uploading model to bucket %s path %s.",
                 output_model_gcs_bucket, output_model_gcs_path)
    bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
    storage.Blob(output_model_gcs_path,
                 bucket).upload_from_filename(args.output_model_h5)
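# `split_gcs_uri`, used above, is defined elsewhere in this example; below is a
# minimal sketch of its assumed behavior (splitting "gs://bucket/path/to/object"
# into the bucket name and the object path):
def split_gcs_uri(gcs_uri):
  """Sketch (assumed behavior): split a gs:// URI into (bucket, path)."""
  no_scheme = gcs_uri[len('gs://'):]
  bucket, _, path = no_scheme.partition('/')
  return bucket, path

# Example: split_gcs_uri('gs://kubeflow-examples/models/output_model.h5')
# returns ('kubeflow-examples', 'models/output_model.h5').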
train_body_vecs_file = args.output_dir + '/train_body_vecs.npy'
title_pkl_file = args.output_dir + '/title_pp.dpkl'
train_title_vecs_file = args.output_dir + '/train_title_vecs.npy'

# Save the preprocessors
with open(body_pkl_file, 'wb') as f:
  dpickle.dump(body_pp, f)

with open(title_pkl_file, 'wb') as f:
  dpickle.dump(title_pp, f)

# Save the processed data
np.save(train_title_vecs_file, train_title_vecs)
np.save(train_body_vecs_file, train_body_vecs)

encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs_file)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    train_title_vecs_file)
num_encoder_tokens, body_pp = load_text_processor(body_pkl_file)
num_decoder_tokens, title_pp = load_text_processor(title_pkl_file)

# Arbitrarily set latent dimension for embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')
# Save the processed data
np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code)
np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment)

# Arrange data for modeling

# In[5]:

from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor

encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'py_comment_proc_v2.dpkl')

# If you don't have the above files on disk because you set `use_cache = True`, you can download the files for the above function calls here:
#
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_code_vecs_v2.npy
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_comment_vecs_v2.npy
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl
# - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl

# # Build Seq2Seq Model For Summarizing Code
#
# We will build a model to predict the docstring given a function or a method. While this is a very cool task in itself, this is not the end goal of this exercise. The motivation for training this model is to learn a general purpose feature extractor for code that we can use for the task of code search.
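# A minimal sketch of how the encoder could later serve as a feature extractor
# for code search (illustrative only; `encoder_model` is assumed to be the
# encoder extracted from the trained seq2seq model, `code_vecs` the
# preprocessed code vectors, and `query_vec` a single preprocessed query):
import numpy as np

def rank_by_code_similarity(encoder_model, code_vecs, query_vec, top_k=5):
    """Hypothetical sketch: rank code snippets by cosine similarity."""
    # Encode every function body, and the query, into fixed-size vectors.
    code_emb = encoder_model.predict(code_vecs, batch_size=1024)
    query_emb = encoder_model.predict(query_vec[None, :])[0]
    # Cosine similarity against the query embedding, highest first.
    code_norm = code_emb / np.linalg.norm(code_emb, axis=1, keepdims=True)
    query_norm = query_emb / np.linalg.norm(query_emb)
    sims = code_norm @ query_norm
    return np.argsort(-sims)[:top_k]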