def predict():
    """Score previously generated comments against the hold-out comments.

    Loads the trained seq2seq model and the code/comment text processors,
    reads the generated lines from ``generatetag.txt`` and passes them,
    together with the hold-out comments returned by ``read_training_files``,
    to ``Seq2Seq_Inference.evaluate_model``. The resulting score is printed.
    """
    # Only the hold-out comments are needed for scoring.
    _, _, _, holdout_comment = read_training_files(
        '../../data/processed_data/')

    # NOTE(review): absolute, machine-specific model path -- confirm whether
    # this should be derived from OUTPUT_PATH instead.
    loc = "/home/bohong/文档/mygit/cdpensearch/cdpensearch/oneEncoder/seqmodel.hdf5"
    seq2seq_Model = load_model(loc)

    num_encoder_tokens, enc_pp = load_text_processor(
        OUTPUT_PATH / 'py_code_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(
        OUTPUT_PATH / 'py_comment_proc_v2.dpkl')

    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=enc_pp,
                                    decoder_preprocessor=dec_pp,
                                    seq2seq_model=seq2seq_Model)

    # Read inside a context manager so the handle is released even when
    # reading or the later evaluation raises.
    with open("generatetag.txt") as f:
        generated = f.readlines()

    score = seq2seq_inf.evaluate_model(generated, holdout_comment,
                                       max_len=None)
    print(score)
def load_seq2seq_model(self):
    """Load the code-summary model and build ``self.seq2seq_inf``.

    Calls ``K.clear_session()`` first, then loads the model file and the
    code/comment text processors from ``self.seq2seq_path`` and wraps them
    in a ``Seq2Seq_Inference`` object stored on the instance.
    """
    K.clear_session()

    model_file = str(self.seq2seq_path / 'code_summary_seq2seq_model.h5')
    summary_model = load_model(model_file)

    # Only the processors are needed here; the token counts are discarded.
    _, code_pp = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')
    _, comment_pp = load_text_processor(
        self.seq2seq_path / 'py_comment_proc_v2.dpkl')

    self.seq2seq_inf = Seq2Seq_Inference(
        encoder_preprocessor=code_pp,
        decoder_preprocessor=comment_pp,
        seq2seq_model=summary_model,
    )
def detect(inputs, input_model_h5, input_title_preprocessor_dpkl,
           input_body_preprocessor_dpkl):
    """Generate an issue title for the first element of ``inputs``.

    Args:
        inputs: Indexable collection; only ``inputs[0]`` is used.
        input_model_h5: Path of the saved Keras model to load.
        input_title_preprocessor_dpkl: Path of the title (decoder) text
            processor.
        input_body_preprocessor_dpkl: Path of the body (encoder) text
            processor.

    Returns:
        Whatever ``Seq2Seq_Inference.generate_issue_title`` returns for
        ``inputs[0]``.
    """
    # Load model, preprocessors.
    seq2seq_Model = keras.models.load_model(input_model_h5)
    num_encoder_tokens, body_pp = load_text_processor(
        input_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        input_title_preprocessor_dpkl)

    # Prepare inference.
    seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                    decoder_preprocessor=title_pp,
                                    seq2seq_model=seq2seq_Model)

    # Use the ``inputs`` parameter: the previous code indexed the builtin
    # ``input`` function, which raises TypeError.
    return seq2seq_inf.generate_issue_title(inputs[0])
def create_vector(self, postgres, file_id):
    """Embed the paragraphs of one file and save the vectors as ``.npy``.

    Loads the code2emb model and the code text processor onto ``self``,
    fetches the paragraphs for ``file_id`` through
    ``postgres.get_paragraphs_fileid``, transforms them, runs the model and
    saves the result to ``self.npy_path / "<file_id>####nodoc_vecs.npy"``.
    ``K.clear_session()`` is called before loading and after saving.
    """
    K.clear_session()

    print("Going to load code2emb_model")
    model_file = str(self.code2emb_path / 'code2emb_model.hdf5')
    self.code2emb_model = load_model(model_file,
                                     custom_objects=None,
                                     compile=False)

    print("Going to load_text_processor")
    processor_file = self.seq2seq_path / 'py_code_proc_v2.dpkl'
    self.num_encoder_tokens_vector, self.enc_pp_vector = load_text_processor(
        processor_file)

    # Four values are returned; only the paragraphs are used here.
    paras, _paraids, _autotags, _manualtags = postgres.get_paragraphs_fileid(
        file_id)
    no_docstring_funcs = list(map(str, paras))
    print("no_docstring_funcs = ", no_docstring_funcs)

    print("Going to transform_parallel")
    encinp = self.enc_pp_vector.transform(no_docstring_funcs)

    print("Going to create the vector")
    nodoc_vecs = self.code2emb_model.predict(encinp, batch_size=200)

    # One output row is expected per input row.
    assert nodoc_vecs.shape[0] == encinp.shape[0]

    npy_filename = "{}####nodoc_vecs.npy".format(file_id)
    np.save(self.npy_path / npy_filename, nodoc_vecs)

    K.clear_session()
    print("Vector is created")
def load_code2emb_model(self):
    """Load the code2emb model and the code text processor onto ``self``.

    Calls ``K.clear_session()`` first. The model is loaded uncompiled
    (``compile=False``) from ``self.code2emb_path``; the text processor is
    read from ``self.seq2seq_path``.
    """
    K.clear_session()

    model_file = str(self.code2emb_path / 'code2emb_model.hdf5')
    self.code2emb_model = load_model(model_file,
                                     custom_objects=None,
                                     compile=False)

    processor_file = self.seq2seq_path / 'py_code_proc_v2.dpkl'
    self.num_encoder_tokens_vector, self.enc_pp_vector = load_text_processor(
        processor_file)
def train():
    """Train the seq2seq model on the vectorised data under ``OUTPUT_PATH``.

    Two encoder inputs (code vectors and sequence vectors) and one decoder
    input (comment vectors) are loaded, the model is built with
    ``build_seq2seq_model``, compiled with Nadam and fitted for 50 epochs.
    Training progress goes to ``py_func_sum_v9_.log``, the best checkpoints
    are written per epoch, and the final model is saved to ``seqmodel.hdf5``.
    """
    # Vectorised training inputs.
    code_vecs, code_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
    seq_vecs, seq_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_seq_vecs_v2.npy')
    decoder_in, decoder_target = load_decoder_inputs(
        OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')

    # Only the token counts are needed to size the model.
    code_token_count, _ = load_text_processor(
        OUTPUT_PATH / 'py_code_proc_v2.dpkl')
    seq_token_count, _ = load_text_processor(
        OUTPUT_PATH / 'py_seq_proc_v2.dpkl')
    comment_token_count, _ = load_text_processor(
        OUTPUT_PATH / 'py_comment_proc_v2.dpkl')

    model = build_seq2seq_model(
        word_emb_dim=128,
        hidden_state_dim=128,
        encoder_seq_len=code_seq_len,
        s_encoder_seq_len=seq_seq_len,
        num_encoder_tokens=code_token_count,
        num_s_encoder_tokens=seq_token_count,
        num_decoder_tokens=comment_token_count,
    )
    model.summary()
    model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                  loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    training_callbacks = [
        CSVLogger(script_name_base + '.log'),
        # The {epoch}/{val_loss} placeholders are left for Keras to fill in.
        ModelCheckpoint(
            script_name_base + '.epoch{epoch:02d}-val{val_loss:.5f}.hdf5',
            save_best_only=True),
    ]

    model.fit(
        [code_vecs, seq_vecs, decoder_in],
        np.expand_dims(decoder_target, -1),
        batch_size=100,
        epochs=50,
        validation_split=0.12,
        callbacks=training_callbacks,
    )
    model.save("seqmodel.hdf5")
def load_summarizer(seq2seq_model_path, text_processor_path):
    """
    Load the code summarizer model and return the inference object used
    for predicting docstrings, together with the default TensorFlow graph.

    Input:
    -----
    seq2seq_model_path: string prefix of the directory holding
        'py_func_sum_v9_.epoch16-val2.55276.hdf5' (joined with '/')
    text_processor_path: string prefix of the directory holding
        'py_code_proc_v2.dpkl' and 'py_comment_proc_v2.dpkl'

    Returns:
    -----
    (Seq2Seq_Inference object, graph) where graph is the value of
    tf.get_default_graph() taken after the model and processors are loaded

    Author: Tyler Medlin
    """
    # The upstream code relies on many soon-to-be deprecated functions;
    # silence the resulting warnings.
    tf.logging.set_verbosity('ERROR')
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    warnings.filterwarnings("ignore")

    # Pre-trained model.
    logging.warning('Loading pre-trained model...')
    model_file = seq2seq_model_path + '/py_func_sum_v9_.epoch16-val2.55276.hdf5'
    summarizer_model = load_model(model_file)

    # Encoder (code) pre-processor.
    logging.warning('Loading text processor (encoder)...')
    code_pp_file = text_processor_path + '/py_code_proc_v2.dpkl'
    _, code_pp = load_text_processor(code_pp_file)

    # Decoder (docstrings/comments) pre-processor.
    logging.warning('Loading text processor (decoder)...')
    comment_pp_file = text_processor_path + '/py_comment_proc_v2.dpkl'
    _, comment_pp = load_text_processor(comment_pp_file)

    graph = tf.get_default_graph()

    inference = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                  decoder_preprocessor=comment_pp,
                                  seq2seq_model=summarizer_model)
    return inference, graph
def create_autotag(self, postgres, file_id):
    """Generate auto-tags for the paragraphs of one file and persist them.

    Loads the code-summary model and text processors into
    ``self.seq2seq_inf``, fetches the paragraphs for ``file_id`` through
    ``postgres.get_paragraphs_fileid``, asks the inference object for
    predictions, then writes each prediction to
    ``self.data_path / 'without_docstrings.autotag'`` and stores it via
    ``postgres.update_autotag`` under the paragraph id at the same position.
    ``K.clear_session()`` is called before loading and after writing.
    """
    K.clear_session()

    model_file = str(self.seq2seq_path / 'code_summary_seq2seq_model.h5')
    summary_model = load_model(model_file)
    _, code_pp = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')
    _, comment_pp = load_text_processor(
        self.seq2seq_path / 'py_comment_proc_v2.dpkl')
    self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                         decoder_preprocessor=comment_pp,
                                         seq2seq_model=summary_model)

    # Four values are returned; only paragraphs and their ids are used.
    paras, paraids, _autotags, _manualtags = postgres.get_paragraphs_fileid(
        file_id)
    code_snippets = [str(para) for para in paras]

    print("no_docstring_paraids = ", paraids)
    print("size of paragraphs = ", len(code_snippets))
    print("size of paraids = ", len(paraids))

    frame = pd.DataFrame({
        'code': code_snippets,
        'comment': '',
        'ref': ''
    })
    auto_tag = self.seq2seq_inf.demo_model_predictions(n=15, df=frame)
    print("size of auto_tag = ", len(auto_tag))

    tag_file = self.data_path / 'without_docstrings.autotag'
    with open(tag_file, 'w', encoding='utf-8') as out:
        for position, tag in enumerate(auto_tag):
            out.write("%s\n" % tag)
            # Predictions are matched to paragraph ids by position.
            postgres.update_autotag(paraids[position], tag)

    K.clear_session()
def load_models(self):
    """Load both models and their text processors onto ``self``.

    After ``K.clear_session()``, builds ``self.seq2seq_inf`` from the
    code-summary model plus the code/comment processors, then loads the
    uncompiled code2emb model and a second copy of the code processor into
    ``self.code2emb_model``, ``self.num_encoder_tokens_vector`` and
    ``self.enc_pp_vector``. Progress is printed before each load.
    """
    K.clear_session()

    print("Going to load 'code_summary_seq2seq_model.h5'")
    summary_model_file = str(
        self.seq2seq_path / 'code_summary_seq2seq_model.h5')
    summary_model = load_model(summary_model_file)

    print("Going to load 'py_code_proc_v2.dpkl'")
    _, code_pp = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')

    print("Going to load 'py_comment_proc_v2.dpkl'")
    _, comment_pp = load_text_processor(
        self.seq2seq_path / 'py_comment_proc_v2.dpkl')

    print("Going to load 'Seq2Seq_Inference'")
    self.seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=code_pp,
                                         decoder_preprocessor=comment_pp,
                                         seq2seq_model=summary_model)

    print("Going to load 'code2emb_model.hdf5'")
    embedding_model_file = str(self.code2emb_path / 'code2emb_model.hdf5')
    self.code2emb_model = load_model(embedding_model_file,
                                     custom_objects=None,
                                     compile=False)

    print("Going to load 'py_code_proc_v2.dpkl'")
    self.num_encoder_tokens_vector, self.enc_pp_vector = load_text_processor(
        self.seq2seq_path / 'py_code_proc_v2.dpkl')
def main():  # pylint: disable=too-many-statements
    """Preprocess the GitHub issues data, build the seq2seq model, save it.

    Steps, in order:
      1. Parse command-line flags.
      2. Download ``github-issues.zip`` (from the given GCS bucket/path when
         both are set, otherwise from the public URL) and extract it.
      3. Sample ``--sample_size`` rows of ``github_issues.csv`` and split
         them into train/test sets.
      4. Fit the body and title text processors, then save the processors
         and the vectorised training data.
      5. Build and compile the encoder/decoder model and save it to
         ``--output_model_h5``.
      6. Upload the saved model to GCS when ``--output_model_gcs_bucket``
         is set.
    """
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data_gcs_bucket", type=str, default="")
    parser.add_argument("--input_data_gcs_path", type=str, default="")

    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument("--output_model_gcs_path", type=str, default="")

    parser.add_argument("--output_body_preprocessor_dpkl", type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl", type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy", type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy", type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5", type=str,
                        default="output_model.h5")

    args = parser.parse_args()

    # Without this the root logger stays at WARNING and every
    # logging.info() call below is silently dropped.
    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    print("Download iput file")
    if args.input_data_gcs_bucket != "" and args.input_data_gcs_path != "":
        bucket = storage.Bucket(storage.Client(), args.input_data_gcs_bucket)
        storage.Blob(args.input_data_gcs_path,
                     bucket).download_to_filename('github-issues.zip')
    else:
        urllib.request.urlretrieve(
            "https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip",
            'github-issues.zip')

    print("unzip iput file")
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile('github-issues.zip', 'r') as zip_ref:
        zip_ref.extractall('.')

    # Read in data sample 2M rows (for speed of tutorial)
    traindf, testdf = train_test_split(
        pd.read_csv('github_issues.csv').sample(n=args.sample_size),
        test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0],
                 testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and apply padding / truncating such that each document
    # length = 70. Also, retain only the top 8,000 words in the vocabulary and
    # set the remaining words to 1 which will become common index for rare
    # words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different
    # parameters.
    title_pp = processor(append_indicators=True, keep_n=4500,
                         padding_maxlen=12, padding='post')

    # Process the title data.
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s',
                 train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    # Re-load from disk through the shared helpers, which also supply the
    # document length and the token counts used to size the model.
    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units.
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output` just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True,
                     name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens, latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder output as initial state.
    decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################
    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    if args.output_model_gcs_bucket != "":
        bucket = storage.Bucket(storage.Client(),
                                args.output_model_gcs_bucket)
        storage.Blob(args.output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
def build_model(self, learning_rate):
    """Build and compile the Keras seq2seq model, storing it on ``self``.

    Returns early without building anything when ``self.job_name`` is
    "ps" (case-insensitive). Otherwise loads the preprocessed bodies and
    titles plus both text processors onto ``self``, assembles the
    encoder/decoder network and compiles it with Nadam at ``learning_rate``.
    The result is stored in ``self.seq2seq_Model``.
    """
    logging.info("starting")

    job_name = self.job_name
    if job_name and job_name.lower() == "ps":
        logging.info("ps doesn't build model")
        return

    self.encoder_input_data, doc_length = load_encoder_inputs(
        self.preprocessed_bodies)
    self.decoder_input_data, self.decoder_target_data = load_decoder_inputs(
        self.preprocessed_titles)

    num_encoder_tokens, self.body_pp = load_text_processor(
        self.body_pp_file)
    num_decoder_tokens, self.title_pp = load_text_processor(
        self.title_pp_file)

    # Arbitrarily chosen latent dimension for embedding and hidden units.
    latent_dim = 300
    layers = keras.layers

    # ----- Encoder -----
    encoder_inputs = layers.Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for the encoder (ex: Issue Body).
    body_embedded = layers.Embedding(
        num_encoder_tokens,
        latent_dim,
        name='Body-Word-Embedding',
        mask_zero=False)(encoder_inputs)
    body_normalized = layers.BatchNormalization(
        name='Encoder-Batchnorm-1')(body_embedded)

    # Only the hidden state is needed, not the encoder output.
    _, state_h = layers.GRU(
        latent_dim,
        return_state=True,
        name='Encoder-Last-GRU')(body_normalized)

    # Keep the encoder as its own model so it can be used to encode
    # without decoding.
    encoder_model = keras.Model(inputs=encoder_inputs,
                                outputs=state_h,
                                name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    # ----- Decoder -----
    # Decoder input, used for teacher forcing.
    decoder_inputs = layers.Input(shape=(None,), name='Decoder-Input')

    # Word embedding for the decoder (ex: Issue Titles).
    title_embedded = layers.Embedding(
        num_decoder_tokens,
        latent_dim,
        name='Decoder-Word-Embedding',
        mask_zero=False)(decoder_inputs)
    title_normalized = layers.BatchNormalization(
        name='Decoder-Batchnorm-1')(title_embedded)

    # TODO(https://github.com/kubeflow/examples/issues/196):
    # With TF.Estimator we hit https://github.com/keras-team/keras/issues/9761
    # and the model won't train.
    decoder_gru = layers.GRU(
        latent_dim,
        return_state=True,
        return_sequences=True,
        name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(
        title_normalized, initial_state=[seq2seq_encoder_out])
    decoder_normalized = layers.BatchNormalization(
        name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction.
    decoder_dense = layers.Dense(
        num_decoder_tokens,
        activation='softmax',
        name='Final-Output-Dense')
    decoder_outputs = decoder_dense(decoder_normalized)

    # ----- Seq2Seq -----
    self.seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs],
                                     decoder_outputs)

    # TODO(jlewi): Computing accuracy causes a dimension mismatch
    # (InvalidArgumentError: Incompatible shapes: [869] vs. [79,11] at
    # node metrics/acc/Equal), so metrics=['accuracy'] is left out.
    self.seq2seq_Model.compile(
        optimizer=keras.optimizers.Nadam(lr=learning_rate),
        loss='sparse_categorical_crossentropy')

    self.seq2seq_Model.summary()
# We want to vectorize all of the code without docstrings so we can test the
# efficacy of the search on the code that was never seen by the model.

# In[9]:

from keras.models import load_model
from pathlib import Path
import numpy as np
from seq2seq_utils import load_text_processor

code2emb_path = Path('./data/code2emb/')
seq2seq_path = Path('./data/seq2seq/')
data_path = Path('./data/processed_data/')

# In[10]:

# Pass the model location as a plain string: Keras releases that only accept
# str paths reject pathlib.Path objects.
code2emb_model = load_model(str(code2emb_path / 'code2emb_model.hdf5'))
num_encoder_tokens, enc_pp = load_text_processor(
    seq2seq_path / 'py_code_proc_v2.dpkl')

with open(data_path / 'without_docstrings.function', 'r') as f:
    no_docstring_funcs = f.readlines()

# ### Pre-process code without docstrings for input into `code2emb` model
#
# We use the same transformer we used to train the original model.

# In[13]:

# Tokenized functions that did not contain docstrings (notebook preview;
# this expression has no effect when run as a script).
no_docstring_funcs[:5]

# In[11]:
def main():  # pylint: disable=too-many-statements
    """Preprocess the issues data, build the seq2seq model and save it.

    Steps, in order:
      1. Parse command-line flags and configure logging.
      2. Resolve the input: a ``.zip`` given as ``--input_data`` is
         extracted into the current directory and ``github_issues.csv`` is
         read from it; any other value is used as the CSV path directly.
      3. Sample ``--sample_size`` rows and split them into train/test sets.
      4. Fit the body and title text processors, then save the processors
         and the vectorised training data.
      5. Build and compile the encoder/decoder model and save it to
         ``--output_model``.
    """
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument(
        "--input_data",
        type=str,
        default="",
        help="The input location, a local file path.")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model, a local file path.")

    #####################################################
    # Optional section, based on what your model needs
    #####################################################
    parser.add_argument("--output_body_preprocessor_dpkl", type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl", type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy", type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy", type=str,
                        default="train_body_vecs.npy")

    ########################################################
    # End of optional args section
    #
    # Be sure to add your args at the appropriate sections
    # of the training code
    ########################################################

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    ##################################################
    # Reading input file(s)
    # Make changes as needed
    ##################################################

    # Reading input data file
    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(args.input_data, 'r') as zip_ref:
            zip_ref.extractall('.')

        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data

    ###################################################
    # Fill in your model training code starting here
    ###################################################

    # Read in data sample 2M rows (for speed of tutorial)
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0],
                 testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and apply padding / truncating such that each document
    # length = 70. Also, retain only the top 8,000 words in the vocabulary and
    # set the remaining words to 1 which will become common index for rare
    # words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different
    # parameters.
    title_pp = processor(append_indicators=True, keep_n=4500,
                         padding_maxlen=12, padding='post')

    # Process the title data.
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s',
                 train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    # Re-load from disk through the shared helpers, which also supply the
    # document length and the token counts used to size the model.
    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units.
    latent_dim = 300

    ###############
    # Encoder Model.
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output` just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True,
                     name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens, latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder output as initial state.
    decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    ########################################################
    # End of your training code
    #
    # * Be sure to save your model to args.output_model
    #   such as Model.save(args.output_model)
    ########################################################

    # Save model.
    seq2seq_Model.save(args.output_model)
np.save(data_dir + 'train_title_vecs.npy', train_title_vecs) np.save(data_dir + 'train_body_vecs.npy', train_body_vecs) else: time.sleep(120) while True: if os.path.isfile(data_dir + 'train_body_vecs.npy'): break print("Waiting for dataset") time.sleep(2) encoder_input_data, doc_length = load_encoder_inputs(data_dir + 'train_body_vecs.npy') decoder_input_data, decoder_target_data = load_decoder_inputs( data_dir + 'train_title_vecs.npy') num_encoder_tokens, body_pp = load_text_processor(data_dir + 'body_pp.dpkl') num_decoder_tokens, title_pp = load_text_processor(data_dir + 'title_pp.dpkl') #arbitrarly set latent dimension for embedding and hidden units latent_dim = 300 ##### Define Model Architecture ###### ######################## #### Encoder Model #### encoder_inputs = tf.keras.layers.Input(shape=(doc_length, ), name='Encoder-Input') # Word embeding for encoder (ex: Issue Body) x = tf.keras.layers.Embedding(num_encoder_tokens, latent_dim,
def main():  # pylint: disable=too-many-statements
  """Preprocess the issue data, build the seq2seq model and save it.

  Steps, in order:
    1. Parse command line flags.
    2. Obtain the input data: download it from GCS when a bucket is
       configured, and extract it when it is a ``.zip`` archive.
    3. Sample the CSV, split it into train/test sets and fit the body and
       title text preprocessors on the training rows.
    4. Persist the preprocessors and the vectorized training data.
    5. Build and compile the encoder/decoder (GRU) model and save it to
       ``--output_model_h5``.
    6. Upload the saved model to GCS when an output bucket is configured.

  NOTE(review): this function never calls ``fit`` on the model, so the file
  written to ``--output_model_h5`` holds freshly initialised weights.
  Confirm that training is expected to happen elsewhere.
  """
  # Parsing flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument(
      "--input_data",
      type=str,
      default="",
      help="The input location. Can be a GCS or local file path.")

  # TODO(jlewi): The following arguments are deprecated; just
  # use input_data. We should remove them as soon as all call sites
  # are updated.
  parser.add_argument("--input_data_gcs_bucket", type=str,
                      default="kubeflow-examples")
  parser.add_argument(
      "--input_data_gcs_path",
      type=str,
      default="github-issue-summarization-data/github-issues.zip")

  parser.add_argument(
      "--output_model",
      type=str,
      default="",
      help="The output location for the model GCS or local file path.")

  # TODO(jlewi): We should get rid of the following arguments and just use
  # --output_model_h5. If the output is a gs:// location we should use
  # a local file and then upload it to GCS.
  parser.add_argument("--output_model_gcs_bucket", type=str, default="")
  parser.add_argument(
      "--output_model_gcs_path",
      type=str,
      default="github-issue-summarization-data/output_model.h5")

  parser.add_argument("--output_body_preprocessor_dpkl", type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl", type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy", type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy", type=str,
                      default="train_body_vecs.npy")
  parser.add_argument("--output_model_h5", type=str,
                      default="output_model.h5")

  args = parser.parse_args()

  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  # basicConfig() does nothing when the root logger already has handlers,
  # so the level is also set explicitly.
  logging.getLogger().setLevel(logging.INFO)
  logging.info(args)

  # The flag is declared without a type, so it arrives as a string.
  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  # For backwards compatibility.
  input_data_gcs_bucket = None
  input_data_gcs_path = None

  if not args.input_data:
    # Since input_data isn't set fall back on old arguments.
    input_data_gcs_bucket = args.input_data_gcs_bucket
    input_data_gcs_path = args.input_data_gcs_path
  elif args.input_data.startswith('gs://'):
    input_data_gcs_bucket, input_data_gcs_path = split_gcs_uri(
        args.input_data)

  if input_data_gcs_bucket:
    logging.info("Download bucket %s object %s.", input_data_gcs_bucket,
                 input_data_gcs_path)
    bucket = storage.Bucket(storage.Client(), input_data_gcs_bucket)
    # From here on args.input_data refers to the local copy.
    args.input_data = 'github-issues.zip'
    storage.Blob(input_data_gcs_path,
                 bucket).download_to_filename(args.input_data)

  ext = os.path.splitext(args.input_data)[-1]
  if ext.lower() == '.zip':
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(args.input_data, 'r') as zip_ref:
      zip_ref.extractall('.')

    # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
    # We should probably just require the input to be a CSV file.
    csv_file = 'github_issues.csv'
  else:
    csv_file = args.input_data

  # Read in data sample 2M rows (for speed of tutorial)
  issues = pd.read_csv(csv_file)
  # DataFrame.sample() raises ValueError when asked for more rows than exist
  # (sampling is without replacement), so clamp to the available row count.
  sample_size = min(args.sample_size, len(issues))
  if sample_size < args.sample_size:
    logging.warning(
        'Requested sample size %d exceeds the %d available rows; '
        'using all rows.', args.sample_size, sample_size)
  traindf, testdf = train_test_split(issues.sample(n=sample_size),
                                     test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0],
               traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70. Also, retain only the top 8,000 words in the vocabulary and
  # set the remaining words to 1 which will become common index for rare
  # words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different
  # parameters.
  title_pp = processor(append_indicators=True, keep_n=4500,
                       padding_maxlen=12, padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s',
               train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  # Read the artifacts back from disk to obtain the model dimensions; only
  # the document length and the two token counts are needed below.
  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

  num_encoder_tokens, _ = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, _ = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set latent dimension for embedding and hidden units.
  latent_dim = 300

  ###############
  # Encoder Model.
  ###############
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for encoder (ex: Issue Body)
  x = Embedding(num_encoder_tokens,
                latent_dim,
                name='Body-Word-Embedding',
                mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output` just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  ################
  decoder_inputs = Input(shape=(None,),
                         name='Decoder-Input')  # for teacher forcing

  # Word Embedding For Decoder (ex: Issue Titles)
  dec_emb = Embedding(num_decoder_tokens,
                      latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using the encoder's hidden state as initial state.
  decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn,
                                      initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction
  decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  ################
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  #############
  # Save model.
  #############
  seq2seq_Model.save(args.output_model_h5)

  ######################
  # Upload model to GCS.
  ######################
  # For backwards compatibility.
  output_model_gcs_bucket = None
  output_model_gcs_path = None

  if not args.output_model:
    # Since output_model isn't set fall back on old arguments.
    output_model_gcs_bucket = args.output_model_gcs_bucket
    output_model_gcs_path = args.output_model_gcs_path
  elif args.output_model.startswith('gs://'):
    output_model_gcs_bucket, output_model_gcs_path = split_gcs_uri(
        args.output_model)

  # With the default flags the bucket is empty and nothing is uploaded.
  if output_model_gcs_bucket:
    logging.info("Uploading model to bucket %s path %s.",
                 output_model_gcs_bucket, output_model_gcs_path)
    bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
    storage.Blob(output_model_gcs_path,
                 bucket).upload_from_filename(args.output_model_h5)
type=int, default=get_value_as_int('BATCH_SIZE', 1200)) parser.add_argument("--validation_split", type=float, default=get_value_as_float('BATCH_SIZE', 0.12)) args = parser.parse_args() print(args) learning_rate = float(args.learning_rate) encoder_input_data, doc_length = load_encoder_inputs( args.input_train_body_vecs_npy) decoder_input_data, decoder_target_data = load_decoder_inputs( args.input_train_title_vecs_npy) num_encoder_tokens, body_pp = load_text_processor( args.input_body_preprocessor_dpkl) num_decoder_tokens, title_pp = load_text_processor( args.input_title_preprocessor_dpkl) # Arbitrarly set latent dimension for embedding and hidden units latent_dim = 300 ############### # Encoder Model. ############### encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input') # Word embeding for encoder (ex: Issue Body) x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code) np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment) # Arrange data for modeling # In[5]: from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs_v2.npy') decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs_v2.npy') num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'py_code_proc_v2.dpkl') num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'py_comment_proc_v2.dpkl') # If you don't have the above files on disk because you set `use_cache = True` you can download the files for the above function calls here: # # - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_code_vecs_v2.npy # - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_comment_vecs_v2.npy # - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl # - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl # # Build Seq2Seq Model For Summarizing Code # # We will build a model to predict the docstring given a function or a method. While this is a very cool task in itself, this is not the end goal of this exercise. The motivation for training this model is to learn a general purpose feature extractor for code that we can use for the task of code search. # In[6]:
# Save the preprocessor with open(body_pkl_file, 'wb') as f: dpickle.dump(body_pp, f) with open(title_pkl_file, 'wb') as f: dpickle.dump(title_pp, f) # Save the processed data np.save(train_title_vecs_file, train_title_vecs) np.save(train_body_vecs_file, train_body_vecs) encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs_file) decoder_input_data, decoder_target_data = load_decoder_inputs( train_title_vecs_file) num_encoder_tokens, body_pp = load_text_processor(body_pkl_file) num_decoder_tokens, title_pp = load_text_processor(title_pkl_file) #arbitrarly set latent dimension for embedding and hidden units latent_dim = 300 ##### Define Model Architecture ###### ######################## #### Encoder Model #### encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input') # Word embeding for encoder (ex: Issue Body) x = Embedding(num_encoder_tokens, latent_dim, name='Body-Word-Embedding',
else: filename = 'data/seq2seq/code_summary_seq2seq_model.h5' seq2seq_Model = load_model(filename) loc = "" # Load encoder (code) pre-processor from url if(args['download']): loc = get_file(fname='py_code_proc_v2.dpkl', origin='https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl') else: loc="data/seq2seq/py_code_proc_v2.dpkl" num_encoder_tokens, enc_pp = load_text_processor(loc) loc = "" # Load encoder (code) pre-processor from url if(args['download']): # Load decoder (docstrings/comments) pre-processor from url loc = get_file(fname='py_comment_proc_v2.dpkl', origin='https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl') else: loc="data/seq2seq/py_comment_proc_v2.dpkl" num_decoder_tokens, dec_pp = load_text_processor(loc) from seq2seq_utils import Seq2Seq_Inference import pandas as pd