Example 1
def train():
    encoder_input_data, encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_code_vecs_v2.npy')
    s_encoder_input_data, s_encoder_seq_len = load_encoder_inputs(
        OUTPUT_PATH / 'py_t_seq_vecs_v2.npy')
    decoder_input_data, decoder_target_data = load_decoder_inputs(
        OUTPUT_PATH / 'py_t_comment_vecs_v2.npy')
    num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_code_proc_v2.dpkl')
    s_num_encoder_tokens, s_enc_pp = load_text_processor(OUTPUT_PATH /
                                                         'py_seq_proc_v2.dpkl')
    num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH /
                                                     'py_comment_proc_v2.dpkl')

    seq2seq_Model = build_seq2seq_model(
        word_emb_dim=128,
        hidden_state_dim=128,
        encoder_seq_len=encoder_seq_len,
        s_encoder_seq_len=s_encoder_seq_len,
        num_encoder_tokens=num_encoder_tokens,
        num_s_encoder_tokens=s_num_encoder_tokens,
        num_decoder_tokens=num_decoder_tokens)

    seq2seq_Model.summary()
    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=0.0005),
                          loss='sparse_categorical_crossentropy')

    script_name_base = 'py_func_sum_v9_'
    csv_logger = CSVLogger('{:}.log'.format(script_name_base))

    model_checkpoint = ModelCheckpoint(
        '{:}.epoch{{epoch:02d}}-val{{val_loss:.5f}}.hdf5'.format(
            script_name_base),
        save_best_only=True)

    batch_size = 100
    epochs = 50
    history = seq2seq_Model.fit(
        [encoder_input_data, s_encoder_input_data, decoder_input_data],
        np.expand_dims(decoder_target_data, -1),
        batch_size=batch_size,
        epochs=epochs,
        validation_split=0.12,
        callbacks=[csv_logger, model_checkpoint])
    seq2seq_Model.save("seqmodel.hdf5")
Example 2
parser.add_argument("--tempfile", default=True)
parser.add_argument("--epochs",
                    type=int,
                    default=get_value_as_int('TRAIN_EPOCHS', 7))
parser.add_argument("--batch_size",
                    type=int,
                    default=get_value_as_int('BATCH_SIZE', 1200))
parser.add_argument("--validation_split",
                    type=float,
                    default=get_value_as_float('BATCH_SIZE', 0.12))
args = parser.parse_args()
print(args)

learning_rate = float(args.learning_rate)

encoder_input_data, doc_length = load_encoder_inputs(
    args.input_train_body_vecs_npy)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    args.input_train_title_vecs_npy)

num_encoder_tokens, body_pp = load_text_processor(
    args.input_body_preprocessor_dpkl)
num_decoder_tokens, title_pp = load_text_processor(
    args.input_title_preprocessor_dpkl)

# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

###############
# Encoder Model.
###############
encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')
Example 3
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data_gcs_bucket", type=str, default="")
    parser.add_argument("--input_data_gcs_path", type=str, default="")

    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument("--output_model_gcs_path", type=str, default="")

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5",
                        type=str,
                        default="output_model.h5")

    args = parser.parse_args()
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)
    print("Download iput file")
    if args.input_data_gcs_bucket != "" and args.input_data_gcs_path != "":
        bucket = storage.Bucket(storage.Client(), args.input_data_gcs_bucket)
        storage.Blob(args.input_data_gcs_path,
                     bucket).download_to_filename('github-issues.zip')
    else:
        urllib.request.urlretrieve(
            "https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip",
            'github-issues.zip')

    print("unzip iput file")
    zip_ref = zipfile.ZipFile('github-issues.zip', 'r')
    zip_ref.extractall('.')
    zip_ref.close()

    # Read in a sample of the data (2M rows by default) for tutorial speed.
    traindf, testdf = train_test_split(
        pd.read_csv('github_issues.csv').sample(n=args.sample_size),
        test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and pad / truncate so that each document has length 70.
    # Also, retain only the top 8,000 words in the vocabulary and map the
    # remaining words to index 1, which becomes the common index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set the latent dimension for the embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder output (`seq2seq_encoder_out`) as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    if args.output_model_gcs_bucket != "":
        bucket = storage.Bucket(storage.Client(), args.output_model_gcs_bucket)
        storage.Blob(args.output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
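
The `processor` used in this example comes from the ktext library, whose implementation is not shown here. As a rough illustration of the behavior described in the comments above (keep the top 8,000 words, map rarer words to index 1, pad or truncate every document to length 70), here is a hedged approximation built on Keras utilities; the Tokenizer-based approach and the `<unk>` token are assumptions for illustration, not the actual ktext code:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Keep roughly the top 8,000 words; anything rarer maps to the OOV index 1.
tokenizer = Tokenizer(num_words=8000, oov_token='<unk>')
tokenizer.fit_on_texts(train_body_raw)
sequences = tokenizer.texts_to_sequences(train_body_raw)

# Pad / truncate every document to a fixed length of 70 tokens.
body_vecs = pad_sequences(sequences, maxlen=70)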
Example 4
  def build_model(self, learning_rate):
    """Build a keras model."""
    logging.info("starting")

    if self.job_name and self.job_name.lower() in ["ps"]:
      logging.info("ps doesn't build model")
      return

    self.encoder_input_data, doc_length = load_encoder_inputs(
      self.preprocessed_bodies)
    self.decoder_input_data, self.decoder_target_data = load_decoder_inputs(
      self.preprocessed_titles)

    num_encoder_tokens, self.body_pp = load_text_processor(
      self.body_pp_file)
    num_decoder_tokens, self.title_pp = load_text_processor(
      self.title_pp_file)

    # Arbitrarily set the latent dimension for the embedding and hidden units
    latent_dim = 300

    ##### Define Model Architecture ######

    ########################
    #### Encoder Model ####
    encoder_inputs = keras.layers.Input(shape=(doc_length,), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = keras.layers.Embedding(
      num_encoder_tokens, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
    x = keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    #  encode without decoding if we want to.

    encoder_model = keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')
    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ########################
    #### Decoder Model ####
    decoder_inputs = keras.layers.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = keras.layers.Embedding(
      num_decoder_tokens, latent_dim, name='Decoder-Word-Embedding',
      mask_zero=False)(decoder_inputs)
    dec_bn = keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # TODO(https://github.com/kubeflow/examples/issues/196):
    # With TF.Estimator we hit https://github.com/keras-team/keras/issues/9761
    # and the model won't train.
    decoder_gru = keras.layers.GRU(
      latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')

    decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=[seq2seq_encoder_out])
    x = keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = keras.layers.Dense(
      num_decoder_tokens, activation='softmax', name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ########################
    #### Seq2Seq Model ####

    self.seq2seq_Model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

    self.seq2seq_Model.compile(
      optimizer=keras.optimizers.Nadam(lr=learning_rate),
      loss='sparse_categorical_crossentropy',)
      #  TODO(jlewi): Computing accuracy causes a dimension mismatch.
      # tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [869] vs. [79,11] # pylint: disable=line-too-long
      # [[{{node metrics/acc/Equal}} = Equal[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](metrics/acc/Reshape, metrics/acc/Cast)]]  # pylint: disable=line-too-long
      # metrics=['accuracy'])

    self.seq2seq_Model.summary()
Example 5
#
# 1. Load the seq2seq model and extract the encoder (remember seq2seq models have an encoder and a decoder).
# 2. Freeze the weights of the encoder.
# 3. Add some dense layers on top of the encoder.
# 4. Train this new model by supplying `(code, docstring-embedding)` pairs.  We will call this model `code2emb_model` (a sketch of steps 3-4 follows the freeze step below).
# 5. Unfreeze the entire model, and resume training.  This helps fine tune the model a little more towards this task.
# 6. Encode all of the code, including code that does not contain a docstring and save that into a search index for future use.

# ### Load seq2seq model from Step 2 and extract the encoder

# First load the seq2seq model from Step 2, then extract the encoder (we do not need the decoder).

# In[2]:

# load the pre-processed data for the encoder (we don't care about the decoder in this step)
encoder_input_data, doc_length = load_encoder_inputs(seq2seq_path /
                                                     'py_t_code_vecs_v2.npy')
seq2seq_Model = load_model(seq2seq_path / 'code_summary_seq2seq_model.h5')

# In[3]:

# Extract Encoder from seq2seq model
encoder_model = extract_encoder_model(seq2seq_Model)
# Get a summary of the encoder and its layers
encoder_model.summary()

# Freeze the encoder

# In[4]:

# Freeze Encoder Model
for l in encoder_model.layers:
    l.trainable = False
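
# A minimal sketch of steps 3 and 4 from the list at the top of this example:
# add a few dense layers on top of the frozen encoder and compile the resulting
# `code2emb_model`. The layer sizes, the `docstring_emb_dim` placeholder, the
# regression loss, and the tf.keras imports are assumptions for illustration,
# not the notebook's exact definition.

from tensorflow.keras.layers import Input, Dense, BatchNormalization
from tensorflow.keras.models import Model

docstring_emb_dim = 500  # hypothetical dimensionality of the docstring embeddings

# Feed code sequences through the frozen encoder, then through new dense layers.
code_input = Input(shape=(doc_length,), name='Code-Input')
enc_out = encoder_model(code_input)
x = Dense(512, activation='relu', name='Code2Emb-Dense-1')(enc_out)
x = BatchNormalization(name='Code2Emb-Batchnorm-1')(x)
emb_out = Dense(docstring_emb_dim, name='Code2Emb-Output')(x)

# Train on (code, docstring-embedding) pairs with a regression loss.
code2emb_model = Model(inputs=code_input, outputs=emb_out)
code2emb_model.compile(optimizer='nadam', loss='mse')
code2emb_model.summary()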
Example 6
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data",
                        type=str,
                        default="",
                        help="The input location, a local file path.")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model, a local file path.")

    #####################################################
    #  Optional section, based on what your model needs
    #####################################################

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")

    ########################################################
    #  End of optional args section
    #
    #  Be sure to add your args at the appropriate sections
    #  of the training code
    ########################################################

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    ##################################################
    #  Reading input file(s)
    #  Make changes as needed
    ##################################################

    # Reading input data file
    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        zip_ref = zipfile.ZipFile(args.input_data, 'r')
        zip_ref.extractall('.')
        zip_ref.close()
        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data


    ###################################################
    #  Fill in your model training code starting here
    ###################################################

    # Read in a sample of the data (2M rows by default) for tutorial speed.
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and pad / truncate so that each document has length 70.
    # Also, retain only the top 8,000 words in the vocabulary and map the
    # remaining words to index 1, which becomes the common index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set the latent dimension for the embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder output (`seq2seq_encoder_out`) as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    ########################################################
    #  End of your training code
    #
    #  * Be sure to save your model to args.output_model
    #     such as Model.save(args.output_model)
    ########################################################

    # Save model.
    seq2seq_Model.save(args.output_model)
Example 7
    with open(data_dir + 'title_pp.dpkl', 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data
    np.save(data_dir + 'train_title_vecs.npy', train_title_vecs)
    np.save(data_dir + 'train_body_vecs.npy', train_body_vecs)
else:
    time.sleep(120)

while True:
    if os.path.isfile(data_dir + 'train_body_vecs.npy'):
        break
    print("Waiting for dataset")
    time.sleep(2)
encoder_input_data, doc_length = load_encoder_inputs(data_dir +
                                                     'train_body_vecs.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(
    data_dir + 'train_title_vecs.npy')

num_encoder_tokens, body_pp = load_text_processor(data_dir + 'body_pp.dpkl')
num_decoder_tokens, title_pp = load_text_processor(data_dir + 'title_pp.dpkl')

# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = tf.keras.layers.Input(shape=(doc_length, ),
                                       name='Encoder-Input')
Example 8
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument(
        "--input_data",
        type=str,
        default="",
        help="The input location. Can be a GCS or local file path.")

    # TODO(jlewi): The following arguments are deprecated; just
    # use input_data. We should remove them as soon as all call sites
    # are updated.
    parser.add_argument("--input_data_gcs_bucket",
                        type=str,
                        default="kubeflow-examples")
    parser.add_argument(
        "--input_data_gcs_path",
        type=str,
        default="github-issue-summarization-data/github-issues.zip")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model GCS or local file path.")

    # TODO(jlewi): We should get rid of the following arguments and just use
    # --output_model_h5. If the output is a gs:// location we should use
    # a local file and then upload it to GCS.
    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument(
        "--output_model_gcs_path",
        type=str,
        default="github-issue-summarization-data/output_model.h5")

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5",
                        type=str,
                        default="output_model.h5")

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    # For backwards compatibility
    input_data_gcs_bucket = None
    input_data_gcs_path = None

    if not args.input_data:
        # Since input_data isn't set, fall back on the old arguments.
        input_data_gcs_bucket = args.input_data_gcs_bucket
        input_data_gcs_path = args.input_data_gcs_path
    else:
        if args.input_data.startswith('gs://'):
            input_data_gcs_bucket, input_data_gcs_path = split_gcs_uri(
                args.input_data)

    if input_data_gcs_bucket:
        logging.info("Download bucket %s object %s.", input_data_gcs_bucket,
                     input_data_gcs_path)
        bucket = storage.Bucket(storage.Client(), input_data_gcs_bucket)
        args.input_data = 'github-issues.zip'
        storage.Blob(input_data_gcs_path,
                     bucket).download_to_filename(args.input_data)

    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        zip_ref = zipfile.ZipFile(args.input_data, 'r')
        zip_ref.extractall('.')
        zip_ref.close()
        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data

    # Read in a sample of the data (2M rows by default) for tutorial speed.
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and pad / truncate so that each document has length 70.
    # Also, retain only the top 8,000 words in the vocabulary and map the
    # remaining words to index 1, which becomes the common index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set the latent dimension for the embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder output (`seq2seq_encoder_out`) as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    # For backwards compatibility
    output_model_gcs_bucket = None
    output_model_gcs_path = None

    if not args.output_model:
        # Since output_model isn't set, fall back on the old arguments.
        output_model_gcs_bucket = args.output_model_gcs_bucket
        output_model_gcs_path = args.output_model_gcs_path
    else:
        if args.output_model.startswith('gs://'):
            output_model_gcs_bucket, output_model_gcs_path = split_gcs_uri(
                args.output_model)

    if output_model_gcs_bucket:
        logging.info("Uploading model to bucket %s path %s.",
                     output_model_gcs_bucket, output_model_gcs_path)
        bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
        storage.Blob(output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
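
The `split_gcs_uri` helper called above is imported from elsewhere in the Kubeflow example and is not part of this snippet. A minimal sketch of what such a helper might look like, for illustration only (not the project's actual implementation):

def split_gcs_uri(gcs_uri):
    """Split a gs://bucket/path URI into (bucket, path). Illustrative only."""
    if not gcs_uri.startswith('gs://'):
        raise ValueError('Not a GCS URI: {}'.format(gcs_uri))
    bucket, _, path = gcs_uri[len('gs://'):].partition('/')
    return bucket, path

For example, `split_gcs_uri('gs://my-bucket/models/output_model.h5')` would return `('my-bucket', 'models/output_model.h5')`.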
Example 9
train_body_vecs_file = args.output_dir + '/train_body_vecs.npy'
title_pkl_file = args.output_dir + '/title_pp.dpkl'
train_title_vecs_file = args.output_dir + '/train_title_vecs.npy'

# Save the preprocessor
with open(body_pkl_file, 'wb') as f:
    dpickle.dump(body_pp, f)

with open(title_pkl_file, 'wb') as f:
    dpickle.dump(title_pp, f)

# Save the processed data
np.save(train_title_vecs_file, train_title_vecs)
np.save(train_body_vecs_file, train_body_vecs)

encoder_input_data, doc_length = load_encoder_inputs(train_body_vecs_file)
decoder_input_data, decoder_target_data = load_decoder_inputs(
    train_title_vecs_file)

num_encoder_tokens, body_pp = load_text_processor(body_pkl_file)
num_decoder_tokens, title_pp = load_text_processor(title_pkl_file)

# Arbitrarily set the latent dimension for the embedding and hidden units
latent_dim = 300

##### Define Model Architecture ######

########################
#### Encoder Model ####
encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')
Example 10
    # Save the processed data
    np.save(OUTPUT_PATH/'py_t_code_vecs_v2.npy', t_code)
    np.save(OUTPUT_PATH/'py_t_comment_vecs_v2.npy', t_comment)


# Arrange data for modeling

# In[5]:



from seq2seq_utils import load_decoder_inputs, load_encoder_inputs, load_text_processor


encoder_input_data, encoder_seq_len = load_encoder_inputs(OUTPUT_PATH/'py_t_code_vecs_v2.npy')
decoder_input_data, decoder_target_data = load_decoder_inputs(OUTPUT_PATH/'py_t_comment_vecs_v2.npy')
num_encoder_tokens, enc_pp = load_text_processor(OUTPUT_PATH/'py_code_proc_v2.dpkl')
num_decoder_tokens, dec_pp = load_text_processor(OUTPUT_PATH/'py_comment_proc_v2.dpkl')


# If you don't have the above files on disk because you set `use_cache = True`, you can download the files used in the above function calls here:
# 
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_code_vecs_v2.npy
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_t_comment_vecs_v2.npy
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_code_proc_v2.dpkl
#  - https://storage.googleapis.com/kubeflow-examples/code_search/data/seq2seq/py_comment_proc_v2.dpkl
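
# If you prefer to fetch them programmatically, here is a small sketch using the
# URLs listed above; it assumes OUTPUT_PATH is the same pathlib.Path used in the
# cells above:

import urllib.request

base_url = ('https://storage.googleapis.com/kubeflow-examples/'
            'code_search/data/seq2seq/')
for fname in ['py_t_code_vecs_v2.npy', 'py_t_comment_vecs_v2.npy',
              'py_code_proc_v2.dpkl', 'py_comment_proc_v2.dpkl']:
    urllib.request.urlretrieve(base_url + fname, str(OUTPUT_PATH / fname))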

# # Build Seq2Seq Model For Summarizing Code
# 
# We will build a model to predict the docstring given a function or a method.  While this is a very cool task in itself, this is not the end goal of this exercise.  The motivation for training this model is to learn a general purpose feature extractor for code that we can use for the task of code search.
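
# The notebook goes on to assemble this model (Example 1 shows such a call to
# `build_seq2seq_model` from seq2seq_utils). As a rough sketch of the kind of
# architecture that helper builds, mirroring the GRU encoder/decoder pattern in
# Examples 3, 6, and 8; the layer names, sizes, and tf.keras imports here are
# assumptions for illustration:

from tensorflow.keras.layers import Input, Embedding, BatchNormalization, GRU, Dense
from tensorflow.keras.models import Model

latent_dim = 300

# Encoder: embed the code tokens and keep only the final GRU hidden state.
enc_in = Input(shape=(encoder_seq_len,), name='Encoder-Input')
x = Embedding(num_encoder_tokens, latent_dim, name='Code-Word-Embedding')(enc_in)
x = BatchNormalization(name='Encoder-Batchnorm-1')(x)
_, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Decoder: teacher forcing on the docstring tokens, initialized with the encoder state.
dec_in = Input(shape=(None,), name='Decoder-Input')
d = Embedding(num_decoder_tokens, latent_dim, name='Comment-Word-Embedding')(dec_in)
d = BatchNormalization(name='Decoder-Batchnorm-1')(d)
dec_out, _ = GRU(latent_dim, return_sequences=True, return_state=True,
                 name='Decoder-GRU')(d, initial_state=state_h)
dec_out = BatchNormalization(name='Decoder-Batchnorm-2')(dec_out)
outputs = Dense(num_decoder_tokens, activation='softmax',
                name='Final-Output-Dense')(dec_out)

seq2seq_Model = Model([enc_in, dec_in], outputs)
seq2seq_Model.compile(optimizer='nadam', loss='sparse_categorical_crossentropy')
seq2seq_Model.summary()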