Example #1
  def preprocess(self, data_file, num_samples=None):
    """Preprocess the input.

    Trains preprocessors and splits the data into train and test sets.

    Args:
      data_file: The datafile to process
      num_samples: Number of samples to use. Set to None to use
        entire dataset.
    """
    # Only preprocess the data if we are the master/chief, or if we
    # aren't running distributed.
    if self.job_name and self.job_name.lower() not in ["master", "chief"]:
      return

    # TODO(jlewi): The test data isn't being used for anything. How can
    # we configure evaluation?
    if num_samples:
      sampled = pd.read_csv(data_file).sample(n=num_samples)
      traindf, self.test_df = train_test_split(sampled, test_size=.10)
    else:
      traindf, self.test_df = train_test_split(pd.read_csv(data_file), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and apply padding / truncating such that each document
    # length = 70. Also, retain only the top 8,000 words in the vocabulary and set
    # the remaining words to 1, which becomes the common index for rare words.
    self.body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = self.body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    self.title_pp = processor(append_indicators=True, keep_n=4500,
                              padding_maxlen=12, padding='post')

    # process the title data
    train_title_vecs = self.title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor
    with open(self.body_pp_file, 'wb') as f:
      dpickle.dump(self.body_pp, f)

    with open(self.title_pp_file, 'wb') as f:
      dpickle.dump(self.title_pp, f)

    # Save the processed data
    np.save(self.preprocessed_titles, train_title_vecs)
    np.save(self.preprocessed_bodies, train_body_vecs)
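The artifacts written above can be reloaded later for training or inference. A minimal sketch, assuming `dill` is imported as `dpickle` (as in the later examples) and that the files were written under the default names used by the training scripts on this page:

import dill as dpickle
import numpy as np

# Reload the fitted body/title preprocessors.
with open('body_preprocessor.dpkl', 'rb') as f:
    body_pp = dpickle.load(f)
with open('title_preprocessor.dpkl', 'rb') as f:
    title_pp = dpickle.load(f)

# Reload the tokenized, padded arrays saved with np.save.
train_body_vecs = np.load('train_body_vecs.npy')
train_title_vecs = np.load('train_title_vecs.npy')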
Example #2
def datapre():
    train_code, holdout_code, train_comment, holdout_comment = read_training_files(
        './data/processed_data2/')

    assert len(train_code) == len(train_comment)
    assert len(holdout_code) == len(holdout_comment)
    code_proc = processor(heuristic_pct_padding=.7, keep_n=40000)
    print("start")
    t_code = code_proc.fit_transform(train_code)
    print("finish code")
    comment_proc = processor(append_indicators=True,
                             heuristic_pct_padding=.7,
                             keep_n=40000,
                             padding='post')
    t_comment = comment_proc.fit_transform(train_comment)
    print("finish comment")
    with open(OUTPUT_PATH / 'py_code_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(code_proc, f)

    with open(OUTPUT_PATH / 'py_comment_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(comment_proc, f)

    np.save(OUTPUT_PATH / 'py_t_code_vecs_v2.npy', t_code)
    np.save(OUTPUT_PATH / 'py_t_comment_vecs_v2.npy', t_comment)
parser.add_argument("--output_body_preprocessor_dpkl")
parser.add_argument("--output_title_preprocessor_dpkl")
parser.add_argument("--output_train_title_vecs_npy")
parser.add_argument("--output_train_body_vecs_npy")
args = parser.parse_args()
print(args)

# Read data.
traindf = pd.read_csv(args.input_traindf_csv)
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()

# Clean, tokenize, and apply padding / truncating such that each document
# length = 70. Also, retain only the top 8,000 words in the vocabulary and set
# the remaining words to 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)

print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])

# Instantiate a text processor for the titles, with some different parameters.
title_pp = processor(append_indicators=True,
                     keep_n=4500,
                     padding_maxlen=12,
                     padding='post')

# process the title data
train_title_vecs = title_pp.fit_transform(train_title_raw)

print('Example original title:', train_title_raw[0])
Example #4
    def preprocess(self, data_glob, num_samples=None):
        """Preprocess the input.

        Trains preprocessors and splits the data into train and test sets.

        Parameters
        ----------
        data_glob: list
            The datafiles to process
        num_samples: int or None
            Number of samples to use. Set to None to use entire dataset.

        Returns
        -------
        None
        """
        def strip_list_html(t_list):
            return ([
                BeautifulSoup(text, "html5lib").get_text() for text in t_list
            ])

        # Only preprocess the data if we are the master/chief, or if we
        # aren't running distributed.
        if self.job_name and self.job_name.lower() not in ["master", "chief"]:
            return

        print("DATA GLOB", data_glob)

        # TODO: The test data isn't being used for anything.
        # How can we configure evaluation?
        data = pd.concat(
            [
                pd.read_csv(f, usecols=['body', 'title', 'link'])
                for f in data_glob
            ],
            ignore_index=True)
        if num_samples:
            data = data.sample(n=num_samples)
        traindf, self.test_df = train_test_split(data, test_size=.10)

        # Print stats about the shape of the data.
        logging.info('Train: %d rows %d columns', traindf.shape[0],
                     traindf.shape[1])

        train_body_raw = traindf.body.tolist()
        train_title_raw = traindf.title.tolist()

        # Clean, tokenize, and apply padding / truncating so that each
        # document length = self.body_maxlen. Also, retain only the top
        # self.body_keep_n words in the vocabulary and set the remaining
        # words to 1, which becomes the common index for rare words.
        self.body_pp = processor(keep_n=self.body_keep_n,
                                 padding_maxlen=self.body_maxlen)
        train_body_vecs = self.body_pp.fit_transform(
            strip_list_html(train_body_raw))

        logging.info('Example original body: %s', train_body_raw[0])
        logging.info('Example body after pre-processing: %s',
                     train_body_vecs[0])

        self.title_pp = processor(append_indicators=True,
                                  keep_n=self.title_keep_n,
                                  padding_maxlen=self.title_maxlen,
                                  padding='post')
        train_title_vecs = self.title_pp.fit_transform(train_title_raw)

        logging.info('Example original title: %s', train_title_raw[0])
        logging.info('Example title after pre-processing: %s',
                     train_title_vecs[0])

        # Save the preprocessor
        with open(self.body_pp_file, 'wb') as f:
            dpickle.dump(self.body_pp, f)

        with open(self.title_pp_file, 'wb') as f:
            dpickle.dump(self.title_pp, f)

        # Save the processed data
        np.save(self.preprocessed_titles, train_title_vecs)
        np.save(self.preprocessed_bodies, train_body_vecs)
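The `strip_list_html` helper above only strips markup before tokenization. A small illustration of what it does (the sample string is invented; requires `beautifulsoup4` and `html5lib`):

from bs4 import BeautifulSoup

sample = ['<p>Pod fails to <b>start</b> when the image tag is missing</p>']
print([BeautifulSoup(text, 'html5lib').get_text() for text in sample])
# ['Pod fails to start when the image tag is missing']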
Example #5
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data_gcs_bucket", type=str, default="")
    parser.add_argument("--input_data_gcs_path", type=str, default="")

    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument("--output_model_gcs_path", type=str, default="")

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5",
                        type=str,
                        default="output_model.h5")

    args = parser.parse_args()
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)
    print("Download iput file")
    if args.input_data_gcs_bucket != "" and args.input_data_gcs_path != "":
        bucket = storage.Bucket(storage.Client(), args.input_data_gcs_bucket)
        storage.Blob(args.input_data_gcs_path,
                     bucket).download_to_filename('github-issues.zip')
    else:
        urllib.request.urlretrieve(
            "https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip",
            'github-issues.zip')

    print("unzip iput file")
    zip_ref = zipfile.ZipFile('github-issues.zip', 'r')
    zip_ref.extractall('.')
    zip_ref.close()

    # Read in a data sample (default 2M rows, for speed of the tutorial).
    traindf, testdf = train_test_split(
        pd.read_csv('github_issues.csv').sample(n=args.sample_size),
        test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and apply padding / truncating such that each document
    # length = 70. Also, retain only the top 8,000 words in the vocabulary and set
    # the remaining words to 1, which becomes the common index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    if args.output_model_gcs_bucket != "":
        bucket = storage.Bucket(storage.Client(), args.output_model_gcs_bucket)
        storage.Blob(args.output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
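Because the trained model is written out as an HDF5 file, it can be restored later with Keras' standard loader. A brief sketch, assuming the same Keras version used above and the default `output_model.h5` file name:

from keras.models import load_model

# Reload the saved seq2seq model and confirm its architecture.
seq2seq_Model = load_model('output_model.h5')
seq2seq_Model.summary()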
Example #6
import pandas as pd
from ktext.preprocess import processor

data_url = 'https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv'
body = pd.read_csv(data_url).head(2000).text.tolist()
issue_body_proc = processor(heuristic_pct_padding=.7, keep_n=5000)
train_result = issue_body_proc.fit_transform(body)
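Once fitted, the processor can vectorize unseen documents and expose its learned vocabulary. A short sketch (the example strings are invented; `transform_parallel` and `id2token` are used the same way in the other examples on this page):

# Vectorize new, unseen issue bodies with the already-fitted preprocessor.
new_docs = ['Add support for GPU scheduling', 'Crash when the config file is missing']
new_vecs = issue_body_proc.transform_parallel(new_docs)

# Size of the learned vocabulary (including the padding / rare-word indices).
num_tokens = max(issue_body_proc.id2token.keys()) + 1
print(num_tokens, new_vecs[0])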
Example #7
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument("--input_data",
                        type=str,
                        default="",
                        help="The input location, a local file path.")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model, a local file path.")

    #####################################################
    #  Optional section, based on what your model needs
    #####################################################

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")

    ########################################################
    #  End of optional args section
    #
    #  Be sure to add your args at the appropriate sections
    #  of the training code
    ########################################################

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    ##################################################
    #  Reading input file(s)
    #  Make changes as needed
    ##################################################

    # Reading input data file
    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        zip_ref = zipfile.ZipFile(args.input_data, 'r')
        zip_ref.extractall('.')
        zip_ref.close()
        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data


    ###################################################
    #  Fill in your model training code starting here
    ###################################################

    # Read in a data sample (default 2M rows, for speed of the tutorial).
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and apply padding / truncating such that each document
    # length = 70. Also, retain only the top 8,000 words in the vocabulary and set
    # the remaining words to 1, which becomes the common index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    ########################################################
    #  End of your training code
    #
    #  * Be sure to save your model to args.output_model
    #     such as Model.save(args.output_model)
    ########################################################

    # Save model.
    seq2seq_Model.save(args.output_model)
Example #8
def main():  # pylint: disable=too-many-statements
    # Parsing flags.
    parser = argparse.ArgumentParser()
    parser.add_argument("--sample_size", type=int, default=2000000)
    parser.add_argument("--learning_rate", default="0.001")

    parser.add_argument(
        "--input_data",
        type=str,
        default="",
        help="The input location. Can be a GCS or local file path.")

    # TODO(jlewi): The following arguments are deprecated; just
    # use input_data. We should remove them as soon as all call sites
    # are updated.
    parser.add_argument("--input_data_gcs_bucket",
                        type=str,
                        default="kubeflow-examples")
    parser.add_argument(
        "--input_data_gcs_path",
        type=str,
        default="github-issue-summarization-data/github-issues.zip")

    parser.add_argument(
        "--output_model",
        type=str,
        default="",
        help="The output location for the model GCS or local file path.")

    # TODO(jlewi): We should get rid of the following arguments and just use
    # --output_model_h5. If the output is a gs:// location we should use
    # a local file and then upload it to GCS.
    parser.add_argument("--output_model_gcs_bucket", type=str, default="")
    parser.add_argument(
        "--output_model_gcs_path",
        type=str,
        default="github-issue-summarization-data/output_model.h5")

    parser.add_argument("--output_body_preprocessor_dpkl",
                        type=str,
                        default="body_preprocessor.dpkl")
    parser.add_argument("--output_title_preprocessor_dpkl",
                        type=str,
                        default="title_preprocessor.dpkl")
    parser.add_argument("--output_train_title_vecs_npy",
                        type=str,
                        default="train_title_vecs.npy")
    parser.add_argument("--output_train_body_vecs_npy",
                        type=str,
                        default="train_body_vecs.npy")
    parser.add_argument("--output_model_h5",
                        type=str,
                        default="output_model.h5")

    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format=('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s'),
        datefmt='%Y-%m-%dT%H:%M:%S',
    )
    logging.getLogger().setLevel(logging.INFO)
    logging.info(args)

    learning_rate = float(args.learning_rate)

    pd.set_option('display.max_colwidth', 500)

    # For backwards compatibility
    input_data_gcs_bucket = None
    input_data_gcs_path = None

    if not args.input_data:
        # Since input_data isn't set, fall back on the old arguments.
        input_data_gcs_bucket = args.input_data_gcs_bucket
        input_data_gcs_path = args.input_data_gcs_path
    else:
        if args.input_data.startswith('gs://'):
            input_data_gcs_bucket, input_data_gcs_path = split_gcs_uri(
                args.input_data)

    if input_data_gcs_bucket:
        logging.info("Download bucket %s object %s.", input_data_gcs_bucket,
                     input_data_gcs_path)
        bucket = storage.Bucket(storage.Client(), input_data_gcs_bucket)
        args.input_data = 'github-issues.zip'
        storage.Blob(input_data_gcs_path,
                     bucket).download_to_filename(args.input_data)

    ext = os.path.splitext(args.input_data)[-1]
    if ext.lower() == '.zip':
        zip_ref = zipfile.ZipFile(args.input_data, 'r')
        zip_ref.extractall('.')
        zip_ref.close()
        # TODO(jlewi): Hardcoding the file in the Archive to use is brittle.
        # We should probably just require the input to be a CSV file.
        csv_file = 'github_issues.csv'
    else:
        csv_file = args.input_data

    # Read in a data sample (default 2M rows, for speed of the tutorial).
    traindf, testdf = train_test_split(
        pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

    # Print stats about the shape of the data.
    logging.info('Train: %d rows %d columns', traindf.shape[0],
                 traindf.shape[1])
    logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

    train_body_raw = traindf.body.tolist()
    train_title_raw = traindf.issue_title.tolist()

    # Clean, tokenize, and apply padding / truncating such that each document
    # length = 70. Also, retain only the top 8,000 words in the vocabulary and set
    # the remaining words to 1, which becomes the common index for rare words.
    body_pp = processor(keep_n=8000, padding_maxlen=70)
    train_body_vecs = body_pp.fit_transform(train_body_raw)

    logging.info('Example original body: %s', train_body_raw[0])
    logging.info('Example body after pre-processing: %s', train_body_vecs[0])

    # Instantiate a text processor for the titles, with some different parameters.
    title_pp = processor(append_indicators=True,
                         keep_n=4500,
                         padding_maxlen=12,
                         padding='post')

    # process the title data
    train_title_vecs = title_pp.fit_transform(train_title_raw)

    logging.info('Example original title: %s', train_title_raw[0])
    logging.info('Example title after pre-processing: %s', train_title_vecs[0])

    # Save the preprocessor.
    with open(args.output_body_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(body_pp, f)

    with open(args.output_title_preprocessor_dpkl, 'wb') as f:
        dpickle.dump(title_pp, f)

    # Save the processed data.
    np.save(args.output_train_title_vecs_npy, train_title_vecs)
    np.save(args.output_train_body_vecs_npy, train_body_vecs)

    _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)

    num_encoder_tokens, body_pp = load_text_processor(
        args.output_body_preprocessor_dpkl)
    num_decoder_tokens, title_pp = load_text_processor(
        args.output_title_preprocessor_dpkl)

    # Arbitrarily set latent dimension for embedding and hidden units
    latent_dim = 300

    ###############
    # Encoder Model.
    ###############
    encoder_inputs = Input(shape=(doc_length, ), name='Encoder-Input')

    # Word embedding for encoder (ex: Issue Body)
    x = Embedding(num_encoder_tokens,
                  latent_dim,
                  name='Body-Word-Embedding',
                  mask_zero=False)(encoder_inputs)
    x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

    # We do not need the `encoder_output`, just the hidden state.
    _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

    # Encapsulate the encoder as a separate entity so we can just
    # encode without decoding if we want to.
    encoder_model = Model(inputs=encoder_inputs,
                          outputs=state_h,
                          name='Encoder-Model')

    seq2seq_encoder_out = encoder_model(encoder_inputs)

    ################
    # Decoder Model.
    ################
    decoder_inputs = Input(shape=(None, ),
                           name='Decoder-Input')  # for teacher forcing

    # Word Embedding For Decoder (ex: Issue Titles)
    dec_emb = Embedding(num_decoder_tokens,
                        latent_dim,
                        name='Decoder-Word-Embedding',
                        mask_zero=False)(decoder_inputs)
    dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

    # Set up the decoder, using the encoder's final hidden state as the initial state.
    decoder_gru = GRU(latent_dim,
                      return_state=True,
                      return_sequences=True,
                      name='Decoder-GRU')
    decoder_gru_output, _ = decoder_gru(dec_bn,
                                        initial_state=seq2seq_encoder_out)
    x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

    # Dense layer for prediction
    decoder_dense = Dense(num_decoder_tokens,
                          activation='softmax',
                          name='Final-Output-Dense')
    decoder_outputs = decoder_dense(x)

    ################
    # Seq2Seq Model.
    ################

    seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                          loss='sparse_categorical_crossentropy')

    seq2seq_Model.summary()

    #############
    # Save model.
    #############
    seq2seq_Model.save(args.output_model_h5)

    ######################
    # Upload model to GCS.
    ######################
    # For backwards compatibility
    output_model_gcs_bucket = None
    output_model_gcs_path = None

    if not args.output_model:
        # Since output_model isn't set, fall back on the old arguments.
        output_model_gcs_bucket = args.output_model_gcs_bucket
        output_model_gcs_path = args.output_model_gcs_path
    else:
        if args.output_model.startswith('gs://'):
            output_model_gcs_bucket, output_model_gcs_path = split_gcs_uri(
                args.output_model)

    if output_model_gcs_bucket:
        logging.info("Uploading model to bucket %s path %s.",
                     output_model_gcs_bucket, output_model_gcs_path)
        bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
        storage.Blob(output_model_gcs_path,
                     bucket).upload_from_filename(args.output_model_h5)
Example #9
print('class name to integer check:')
print(df[['class_int', 'c_bug', 'c_feature',
          'c_question']].groupby('class_int').max())

#split data into train/test
traindf, testdf = train_test_split(df, test_size=.15, random_state=0)
# Clean, tokenize, and apply padding / truncating such that each document length
# equals the 75th percentile for the dataset. Also, retain only the top keep_n words
# in the vocabulary and set the remaining words to 1, which becomes the common index
# for rare words.

train_body_raw = traindf.body.tolist()
train_title_raw = traindf.title.tolist()

# process the issue body data
body_pp = processor(.75, keep_n=8000)
train_body_vecs = body_pp.fit_transform(train_body_raw)

# process the title data
title_pp = processor(.75, keep_n=4500)
train_title_vecs = title_pp.fit_transform(train_title_raw)

# apply transformations to test data
test_body_raw = testdf.body.tolist()
test_title_raw = testdf.title.tolist()

test_body_vecs = body_pp.transform_parallel(test_body_raw)
test_title_vecs = title_pp.transform_parallel(test_title_raw)

# extract labels
train_labels = np.expand_dims(traindf.class_int.values, -1)
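For completeness, the matching test labels can be extracted the same way (a one-line sketch; it assumes `testdf` carries the same `class_int` column as `traindf`):

# extract test labels to pair with test_body_vecs / test_title_vecs
test_labels = np.expand_dims(testdf.class_int.values, -1)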
Example #10
assert len(train_code) == len(train_comment)
assert len(holdout_code) == len(holdout_comment)


# # Tokenize Text
# 
# In this step, we are going to pre-process the raw text for modeling. For an explanation of what this section does, see the [Prepare & Clean Data section of this Tutorial](https://towardsdatascience.com/how-to-create-data-products-that-are-magical-using-sequence-to-sequence-models-703f86a231f8)

# In[7]:


from ktext.preprocess import processor

if not use_cache:    
    code_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
    t_code = code_proc.fit_transform(train_code)

    comment_proc = processor(append_indicators=True, hueristic_pct_padding=.7, keep_n=14000, padding ='post')
    t_comment = comment_proc.fit_transform(train_comment)

elif use_cache:
    logging.warning('Not fitting transform function because use_cache=True')


# **Save tokenized text** (You will reuse this for step 4)

# In[10]:


import dill as dpickle
Example #11
        train_code2 = f.readlines()
    train_code = train_code1 + train_code2
    with open(PATH/'test.function', 'r') as f:
        holdout_code = f.readlines()
    with open(PATH/'train.docstring', 'r') as f:
        train_docstring1 = f.readlines()
    with open(PATH/'valid.docstring', 'r') as f:
        train_docstring2 = f.readlines()
    train_docstring = train_docstring1 + train_docstring2
    with open(PATH/'test.docstring', 'r') as f:
        holdout_docstring = f.readlines()
    return train_code, holdout_code, train_docstring, holdout_docstring


def save_processors(code, docstring):
    code_processor = processor(heuristic_pct_padding=.7, keep_n=20000)
    code_pp = code_processor.fit_transform(code)
    docstring_processor = processor(append_indicators=True, heuristic_pct_padding=.7, keep_n=14000, padding ='post')
    docstring_pp = docstring_processor.fit_transform(docstring)
    with open('seq2seq/py_code_processor_v2.dpkl', 'wb') as f:
        dpickle.dump(code_processor, f)
    with open('seq2seq/py_docstring_processor_v2.dpkl', 'wb') as f:
        dpickle.dump(docstring_processor, f)
    # Save the vectorized (tokenized and padded) arrays rather than the raw text.
    np.save('seq2seq/py_train_code_vecs_v2.npy', code_pp)
    np.save('seq2seq/py_train_docstring_vecs_v2.npy', docstring_pp)

def load_text_processor(fname='title_pp.dpkl'):
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)
    num_tokens = max(pp.id2token.keys()) + 1
    return num_tokens, pp
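A short usage sketch for `load_text_processor`, mirroring how the training scripts above consume the saved `.dpkl` files (the body file name is hypothetical, following the same pattern as the default `title_pp.dpkl`):

# Recover the vocabulary size and the fitted processor for each field.
num_decoder_tokens, title_pp = load_text_processor('title_pp.dpkl')
num_encoder_tokens, body_pp = load_text_processor('body_pp.dpkl')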
Example #12
import logging
import glob
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 500)
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# Read in a 50,000-row sample of the data (for speed of the tutorial).
traindf, testdf = train_test_split(
    pd.read_csv('github_issues.csv').sample(n=50000), test_size=.10)

# Print out stats about the shape of the data.
print('Train:', traindf.shape[0], 'rows', traindf.shape[1], 'columns')
print('Test:', testdf.shape[0], 'rows', testdf.shape[1], 'columns')

# preview data
traindf.head(3)

train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()
#preview output of first element
train_body_raw[0]

from ktext.preprocess import processor
# Clean, tokenize, and apply padding / truncating such that each document length = 70.
# Also, retain only the top 8,000 words in the vocabulary and set the remaining words
# to 1, which becomes the common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)