def preprocess(self, data_file, num_samples=None):
  """Preprocess the input.

  Trains preprocessors and splits the data into train and test sets.

  Args:
    data_file: The data file to process.
    num_samples: Number of samples to use. Set to None to use the entire dataset.
  """
  # Preprocess the data only if we are the master/chief,
  # or if we aren't running distributed.
  if self.job_name and self.job_name.lower() not in ["master", "chief"]:
    return

  # TODO(jlewi): The test data isn't being used for anything. How can
  # we configure evaluation?
  if num_samples:
    sampled = pd.read_csv(data_file).sample(n=num_samples)
    traindf, self.test_df = train_test_split(sampled, test_size=.10)
  else:
    traindf, self.test_df = train_test_split(pd.read_csv(data_file),
                                              test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70. Also, retain only the top 8,000 words in the vocabulary and
  # set the remaining words to 1, which becomes a common index for rare words.
  self.body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = self.body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  self.title_pp = processor(append_indicators=True, keep_n=4500,
                            padding_maxlen=12, padding='post')

  # Process the title data.
  train_title_vecs = self.title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(self.body_pp_file, 'wb') as f:
    dpickle.dump(self.body_pp, f)

  with open(self.title_pp_file, 'wb') as f:
    dpickle.dump(self.title_pp, f)

  # Save the processed data.
  np.save(self.preprocessed_titles, train_title_vecs)
  np.save(self.preprocessed_bodies, train_body_vecs)
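# preprocess() above persists its artifacts with dill and numpy. A minimal
# sketch (not part of the original code) of how those artifacts might be
# loaded back before training; the helper name and argument names are
# illustrative, only dill.load and np.load are assumed.
import dill as dpickle
import numpy as np

def load_saved_artifacts(body_pp_file, title_pp_file, titles_npy, bodies_npy):
    # Re-load the fitted ktext processors saved by preprocess().
    with open(body_pp_file, 'rb') as f:
        body_pp = dpickle.load(f)
    with open(title_pp_file, 'rb') as f:
        title_pp = dpickle.load(f)

    # Re-load the vectorized training data.
    train_title_vecs = np.load(titles_npy)
    train_body_vecs = np.load(bodies_npy)
    return body_pp, title_pp, train_title_vecs, train_body_vecs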
def datapre():
  train_code, holdout_code, train_comment, holdout_comment = read_training_files(
      './data/processed_data2/')

  assert len(train_code) == len(train_comment)
  assert len(holdout_code) == len(holdout_comment)

  code_proc = processor(heuristic_pct_padding=.7, keep_n=40000)
  print("start")
  t_code = code_proc.fit_transform(train_code)
  print("finish code")

  comment_proc = processor(append_indicators=True, heuristic_pct_padding=.7,
                           keep_n=40000, padding='post')
  t_comment = comment_proc.fit_transform(train_comment)
  print("finish comment")

  # Save the preprocessors.
  with open(OUTPUT_PATH / 'py_code_proc_v2.dpkl', 'wb') as f:
    dpickle.dump(code_proc, f)
  with open(OUTPUT_PATH / 'py_comment_proc_v2.dpkl', 'wb') as f:
    dpickle.dump(comment_proc, f)

  # Save the processed data.
  np.save(OUTPUT_PATH / 'py_t_code_vecs_v2.npy', t_code)
  np.save(OUTPUT_PATH / 'py_t_comment_vecs_v2.npy', t_comment)
parser.add_argument("--output_body_preprocessor_dpkl") parser.add_argument("--output_title_preprocessor_dpkl") parser.add_argument("--output_train_title_vecs_npy") parser.add_argument("--output_train_body_vecs_npy") args = parser.parse_args() print(args) # Read data. traindf = pd.read_csv(args.input_traindf_csv) train_body_raw = traindf.body.tolist() train_title_raw = traindf.issue_title.tolist() # Clean, tokenize, and apply padding / truncating such that each document # length = 70. Also, retain only the top 8,000 words in the vocabulary and set # the remaining words to 1 which will become common index for rare words. body_pp = processor(keep_n=8000, padding_maxlen=70) train_body_vecs = body_pp.fit_transform(train_body_raw) print('Example original body:', train_body_raw[0]) print('Example body after pre-processing:', train_body_vecs[0]) # Instantiate a text processor for the titles, with some different parameters. title_pp = processor(append_indicators=True, keep_n=4500, padding_maxlen=12, padding='post') # process the title data train_title_vecs = title_pp.fit_transform(train_title_raw) print('Example original title:', train_title_raw[0])
def preprocess(self, data_glob, num_samples=None):
  """Preprocess the input.

  Trains preprocessors and splits the data into train and test sets.

  Parameters
  ----------
  data_glob: list
    The data files to process.
  num_samples: int or None
    Number of samples to use. Set to None to use the entire dataset.

  Returns
  -------
  None
  """

  def strip_list_html(t_list):
    return [BeautifulSoup(text, "html5lib").get_text() for text in t_list]

  # Preprocess the data only if we are the master/chief,
  # or if we aren't running distributed.
  if self.job_name and self.job_name.lower() not in ["master", "chief"]:
    return

  print("DATA GLOB", data_glob)

  # TODO: The test data isn't being used for anything.
  # How can we configure evaluation?
  if num_samples:
    traindf, self.test_df = train_test_split(
        pd.concat(
            [pd.read_csv(f, usecols=['body', 'title', 'link']) for f in data_glob],
            ignore_index=True).sample(n=num_samples),
        test_size=.10)
  else:
    traindf, self.test_df = train_test_split(
        pd.concat(
            [pd.read_csv(f, usecols=['body', 'title', 'link']) for f in data_glob],
            ignore_index=True),
        test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = body_maxlen. Also, retain only the top body_keep_n words in the
  # vocabulary and set the remaining words to 1, which becomes a common index
  # for rare words.
  self.body_pp = processor(keep_n=self.body_keep_n,
                           padding_maxlen=self.body_maxlen)
  train_body_vecs = self.body_pp.fit_transform(strip_list_html(train_body_raw))

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  self.title_pp = processor(append_indicators=True, keep_n=self.title_keep_n,
                            padding_maxlen=self.title_maxlen, padding='post')
  train_title_vecs = self.title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(self.body_pp_file, 'wb') as f:
    dpickle.dump(self.body_pp, f)

  with open(self.title_pp_file, 'wb') as f:
    dpickle.dump(self.title_pp, f)

  # Save the processed data.
  np.save(self.preprocessed_titles, train_title_vecs)
  np.save(self.preprocessed_bodies, train_body_vecs)
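# A small standalone check of the HTML-stripping step used above (not part of
# the original code). The sample strings are made up for illustration; only
# BeautifulSoup with the html5lib parser is assumed, as in preprocess().
from bs4 import BeautifulSoup

def strip_list_html(t_list):
    # Same cleaning rule as in preprocess(): drop HTML tags, keep visible text.
    return [BeautifulSoup(text, "html5lib").get_text() for text in t_list]

sample_bodies = ["<p>Pod crashes with <code>OOMKilled</code></p>",
                 "Plain text body with no markup"]
print(strip_list_html(sample_bodies))
# ['Pod crashes with OOMKilled', 'Plain text body with no markup']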
def main():  # pylint: disable=too-many-statements
  # Parse flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument("--input_data_gcs_bucket", type=str, default="")
  parser.add_argument("--input_data_gcs_path", type=str, default="")

  parser.add_argument("--output_model_gcs_bucket", type=str, default="")
  parser.add_argument("--output_model_gcs_path", type=str, default="")

  parser.add_argument("--output_body_preprocessor_dpkl",
                      type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl",
                      type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy",
                      type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy",
                      type=str,
                      default="train_body_vecs.npy")
  parser.add_argument("--output_model_h5", type=str, default="output_model.h5")

  args = parser.parse_args()
  logging.info(args)

  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  print("Download input file")
  if args.input_data_gcs_bucket != "" and args.input_data_gcs_path != "":
    bucket = storage.Bucket(storage.Client(), args.input_data_gcs_bucket)
    storage.Blob(args.input_data_gcs_path,
                 bucket).download_to_filename('github-issues.zip')
  else:
    urllib.request.urlretrieve(
        "https://storage.googleapis.com/kubeflow-examples/github-issue-summarization-data/github-issues.zip",
        'github-issues.zip')

  print("Unzip input file")
  zip_ref = zipfile.ZipFile('github-issues.zip', 'r')
  zip_ref.extractall('.')
  zip_ref.close()

  # Read in the data and sample 2M rows (for speed of tutorial).
  traindf, testdf = train_test_split(
      pd.read_csv('github_issues.csv').sample(n=args.sample_size),
      test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70. Also, retain only the top 8,000 words in the vocabulary and
  # set the remaining words to 1, which becomes a common index for rare words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different parameters.
  title_pp = processor(append_indicators=True, keep_n=4500,
                       padding_maxlen=12, padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)
  num_encoder_tokens, body_pp = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, title_pp = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set the latent dimension for embedding and hidden units.
  latent_dim = 300

  ################
  # Encoder Model.
  ################
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for the encoder (e.g. issue body).
  x = Embedding(num_encoder_tokens, latent_dim,
                name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  ################
  decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for the decoder (e.g. issue titles).
  dec_emb = Embedding(num_decoder_tokens, latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using `decoder_state_input` as the initial state.
  decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction.
  decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  ################
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  #############
  # Save model.
  #############
  seq2seq_Model.save(args.output_model_h5)

  ######################
  # Upload model to GCS.
  ######################
  if args.output_model_gcs_bucket != "":
    bucket = storage.Bucket(storage.Client(), args.output_model_gcs_bucket)
    storage.Blob(args.output_model_gcs_path,
                 bucket).upload_from_filename(args.output_model_h5)
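# The script above compiles and saves the seq2seq model but never calls fit().
# A minimal training sketch, assuming the artifacts written above with their
# default names; the batch size, epoch count, and validation split are
# illustrative, not taken from the original code.
import numpy as np
from keras.models import load_model

model = load_model('output_model.h5')
encoder_input_data = np.load('train_body_vecs.npy')
title_vecs = np.load('train_title_vecs.npy')

# Teacher forcing: the decoder input is the title shifted right by one token,
# and the target is the title shifted left by one token.
decoder_input_data = title_vecs[:, :-1]
decoder_target_data = title_vecs[:, 1:]

model.fit([encoder_input_data, decoder_input_data],
          np.expand_dims(decoder_target_data, -1),
          batch_size=1200, epochs=7, validation_split=0.12)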
import pandas as pd
from ktext.preprocess import processor

data_url = 'https://storage.googleapis.com/issue_label_bot/pre_processed_data/processed_part0000.csv'
body = pd.read_csv(data_url).head(2000).text.tolist()

issue_body_proc = processor(heuristic_pct_padding=.7, keep_n=5000)
train_result = issue_body_proc.fit_transform(body)
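# A quick sanity check of the fitted processor (an illustrative addition, not
# part of the original snippet): fit_transform returns a padded integer
# matrix, and the vocabulary mapping is exposed via id2token, the same
# attribute the load_text_processor helper elsewhere in this document uses.
print(train_result.shape)   # (num_documents, inferred_padding_length)
print(train_result[0])      # first issue body as a padded vector of token ids

num_tokens = max(issue_body_proc.id2token.keys()) + 1
print('vocabulary size (including padding/rare-word ids):', num_tokens)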
def main():  # pylint: disable=too-many-statements
  # Parse flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument("--input_data",
                      type=str,
                      default="",
                      help="The input location, a local file path.")

  parser.add_argument(
      "--output_model",
      type=str,
      default="",
      help="The output location for the model, a local file path.")

  #####################################################
  # Optional section, based on what your model needs
  #####################################################
  parser.add_argument("--output_body_preprocessor_dpkl",
                      type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl",
                      type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy",
                      type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy",
                      type=str,
                      default="train_body_vecs.npy")
  ########################################################
  # End of optional args section
  #
  # Be sure to add your args at the appropriate sections
  # of the training code
  ########################################################

  args = parser.parse_args()

  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)
  logging.info(args)

  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  ##################################################
  # Reading input file(s)
  # Make changes as needed
  ##################################################
  ext = os.path.splitext(args.input_data)[-1]
  if ext.lower() == '.zip':
    zip_ref = zipfile.ZipFile(args.input_data, 'r')
    zip_ref.extractall('.')
    zip_ref.close()
    # TODO(jlewi): Hardcoding the file in the archive to use is brittle.
    # We should probably just require the input to be a CSV file.
    csv_file = 'github_issues.csv'
  else:
    csv_file = args.input_data

  ###################################################
  # Fill in your model training code starting here
  ###################################################

  # Read in the data and sample 2M rows (for speed of tutorial).
  traindf, testdf = train_test_split(
      pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70. Also, retain only the top 8,000 words in the vocabulary and
  # set the remaining words to 1, which becomes a common index for rare words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different parameters.
  title_pp = processor(append_indicators=True, keep_n=4500,
                       padding_maxlen=12, padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)
  num_encoder_tokens, body_pp = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, title_pp = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set the latent dimension for embedding and hidden units.
  latent_dim = 300

  ################
  # Encoder Model.
  ################
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for the encoder (e.g. issue body).
  x = Embedding(num_encoder_tokens, latent_dim,
                name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  ################
  decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for the decoder (e.g. issue titles).
  dec_emb = Embedding(num_decoder_tokens, latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using `decoder_state_input` as the initial state.
  decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction.
  decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  ################
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  ########################################################
  # End of your training code
  #
  # * Be sure to save your model to args.output_model
  #   such as Model.save(args.output_model)
  ########################################################

  # Save model.
  seq2seq_Model.save(args.output_model)
def main():  # pylint: disable=too-many-statements
  # Parse flags.
  parser = argparse.ArgumentParser()
  parser.add_argument("--sample_size", type=int, default=2000000)
  parser.add_argument("--learning_rate", default="0.001")

  parser.add_argument(
      "--input_data",
      type=str,
      default="",
      help="The input location. Can be a GCS or local file path.")

  # TODO(jlewi): The following arguments are deprecated; just
  # use input_data. We should remove them as soon as all call sites
  # are updated.
  parser.add_argument("--input_data_gcs_bucket", type=str,
                      default="kubeflow-examples")
  parser.add_argument(
      "--input_data_gcs_path",
      type=str,
      default="github-issue-summarization-data/github-issues.zip")

  parser.add_argument(
      "--output_model",
      type=str,
      default="",
      help="The output location for the model; a GCS or local file path.")

  # TODO(jlewi): We should get rid of the following arguments and just use
  # --output_model_h5. If the output is a gs:// location we should use
  # a local file and then upload it to GCS.
  parser.add_argument("--output_model_gcs_bucket", type=str, default="")
  parser.add_argument(
      "--output_model_gcs_path",
      type=str,
      default="github-issue-summarization-data/output_model.h5")

  parser.add_argument("--output_body_preprocessor_dpkl",
                      type=str,
                      default="body_preprocessor.dpkl")
  parser.add_argument("--output_title_preprocessor_dpkl",
                      type=str,
                      default="title_preprocessor.dpkl")
  parser.add_argument("--output_train_title_vecs_npy",
                      type=str,
                      default="train_title_vecs.npy")
  parser.add_argument("--output_train_body_vecs_npy",
                      type=str,
                      default="train_body_vecs.npy")
  parser.add_argument("--output_model_h5", type=str, default="output_model.h5")

  args = parser.parse_args()

  logging.basicConfig(
      level=logging.INFO,
      format=('%(levelname)s|%(asctime)s'
              '|%(pathname)s|%(lineno)d| %(message)s'),
      datefmt='%Y-%m-%dT%H:%M:%S',
  )
  logging.getLogger().setLevel(logging.INFO)
  logging.info(args)

  learning_rate = float(args.learning_rate)

  pd.set_option('display.max_colwidth', 500)

  # For backwards compatibility.
  input_data_gcs_bucket = None
  input_data_gcs_path = None

  if not args.input_data:
    # Since input_data isn't set, fall back on the old arguments.
    input_data_gcs_bucket = args.input_data_gcs_bucket
    input_data_gcs_path = args.input_data_gcs_path
  else:
    if args.input_data.startswith('gs://'):
      input_data_gcs_bucket, input_data_gcs_path = split_gcs_uri(
          args.input_data)

  if input_data_gcs_bucket:
    logging.info("Download bucket %s object %s.", input_data_gcs_bucket,
                 input_data_gcs_path)
    bucket = storage.Bucket(storage.Client(), input_data_gcs_bucket)
    args.input_data = 'github-issues.zip'
    storage.Blob(input_data_gcs_path,
                 bucket).download_to_filename(args.input_data)

  ext = os.path.splitext(args.input_data)[-1]
  if ext.lower() == '.zip':
    zip_ref = zipfile.ZipFile(args.input_data, 'r')
    zip_ref.extractall('.')
    zip_ref.close()
    # TODO(jlewi): Hardcoding the file in the archive to use is brittle.
    # We should probably just require the input to be a CSV file.
    csv_file = 'github_issues.csv'
  else:
    csv_file = args.input_data

  # Read in the data and sample 2M rows (for speed of tutorial).
  traindf, testdf = train_test_split(
      pd.read_csv(csv_file).sample(n=args.sample_size), test_size=.10)

  # Print stats about the shape of the data.
  logging.info('Train: %d rows %d columns', traindf.shape[0], traindf.shape[1])
  logging.info('Test: %d rows %d columns', testdf.shape[0], testdf.shape[1])

  train_body_raw = traindf.body.tolist()
  train_title_raw = traindf.issue_title.tolist()

  # Clean, tokenize, and apply padding / truncating such that each document
  # length = 70.
  # Also, retain only the top 8,000 words in the vocabulary and set
  # the remaining words to 1, which becomes a common index for rare words.
  body_pp = processor(keep_n=8000, padding_maxlen=70)
  train_body_vecs = body_pp.fit_transform(train_body_raw)

  logging.info('Example original body: %s', train_body_raw[0])
  logging.info('Example body after pre-processing: %s', train_body_vecs[0])

  # Instantiate a text processor for the titles, with some different parameters.
  title_pp = processor(append_indicators=True, keep_n=4500,
                       padding_maxlen=12, padding='post')

  # Process the title data.
  train_title_vecs = title_pp.fit_transform(train_title_raw)

  logging.info('Example original title: %s', train_title_raw[0])
  logging.info('Example title after pre-processing: %s', train_title_vecs[0])

  # Save the preprocessors.
  with open(args.output_body_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(body_pp, f)

  with open(args.output_title_preprocessor_dpkl, 'wb') as f:
    dpickle.dump(title_pp, f)

  # Save the processed data.
  np.save(args.output_train_title_vecs_npy, train_title_vecs)
  np.save(args.output_train_body_vecs_npy, train_body_vecs)

  _, doc_length = load_encoder_inputs(args.output_train_body_vecs_npy)
  num_encoder_tokens, body_pp = load_text_processor(
      args.output_body_preprocessor_dpkl)
  num_decoder_tokens, title_pp = load_text_processor(
      args.output_title_preprocessor_dpkl)

  # Arbitrarily set the latent dimension for embedding and hidden units.
  latent_dim = 300

  ################
  # Encoder Model.
  ################
  encoder_inputs = Input(shape=(doc_length,), name='Encoder-Input')

  # Word embedding for the encoder (e.g. issue body).
  x = Embedding(num_encoder_tokens, latent_dim,
                name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)
  x = BatchNormalization(name='Encoder-Batchnorm-1')(x)

  # We do not need the `encoder_output`, just the hidden state.
  _, state_h = GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

  # Encapsulate the encoder as a separate entity so we can just
  # encode without decoding if we want to.
  encoder_model = Model(inputs=encoder_inputs, outputs=state_h,
                        name='Encoder-Model')

  seq2seq_encoder_out = encoder_model(encoder_inputs)

  ################
  # Decoder Model.
  ################
  decoder_inputs = Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

  # Word embedding for the decoder (e.g. issue titles).
  dec_emb = Embedding(num_decoder_tokens, latent_dim,
                      name='Decoder-Word-Embedding',
                      mask_zero=False)(decoder_inputs)
  dec_bn = BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

  # Set up the decoder, using `decoder_state_input` as the initial state.
  decoder_gru = GRU(latent_dim, return_state=True, return_sequences=True,
                    name='Decoder-GRU')
  decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
  x = BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

  # Dense layer for prediction.
  decoder_dense = Dense(num_decoder_tokens, activation='softmax',
                        name='Final-Output-Dense')
  decoder_outputs = decoder_dense(x)

  ################
  # Seq2Seq Model.
  ################
  seq2seq_Model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
  seq2seq_Model.compile(optimizer=optimizers.Nadam(lr=learning_rate),
                        loss='sparse_categorical_crossentropy')

  seq2seq_Model.summary()

  #############
  # Save model.
  #############
  seq2seq_Model.save(args.output_model_h5)

  ######################
  # Upload model to GCS.
  ######################

  # For backwards compatibility.
  output_model_gcs_bucket = None
  output_model_gcs_path = None

  if not args.output_model:
    # Since output_model isn't set, fall back on the old arguments.
    output_model_gcs_bucket = args.output_model_gcs_bucket
    output_model_gcs_path = args.output_model_gcs_path
  else:
    if args.output_model.startswith('gs://'):
      output_model_gcs_bucket, output_model_gcs_path = split_gcs_uri(
          args.output_model)

  if output_model_gcs_bucket:
    logging.info("Uploading model to bucket %s path %s.",
                 output_model_gcs_bucket, output_model_gcs_path)
    bucket = storage.Bucket(storage.Client(), output_model_gcs_bucket)
    storage.Blob(output_model_gcs_path,
                 bucket).upload_from_filename(args.output_model_h5)
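# split_gcs_uri is called above but not defined in these excerpts. A minimal
# sketch of what such a helper typically looks like; this is an assumption for
# illustration, not the original implementation.
def split_gcs_uri(gcs_uri):
    """Split 'gs://bucket/path/to/object' into ('bucket', 'path/to/object')."""
    if not gcs_uri.startswith('gs://'):
        raise ValueError('Not a GCS URI: ' + gcs_uri)
    bucket, _, path = gcs_uri[len('gs://'):].partition('/')
    return bucket, path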
print('class name to integer check:')
print(df[['class_int', 'c_bug', 'c_feature', 'c_question']].groupby('class_int').max())

# Split data into train/test.
traindf, testdf = train_test_split(df, test_size=.15, random_state=0)

# Clean, tokenize, and apply padding / truncating such that each document
# length = the 75th percentile for the dataset. Also, retain only the top
# keep_n words in the vocabulary and set the remaining words to 1, which
# becomes a common index for rare words.
train_body_raw = traindf.body.tolist()
train_title_raw = traindf.title.tolist()

# Process the issue body data.
body_pp = processor(.75, keep_n=8000)
train_body_vecs = body_pp.fit_transform(train_body_raw)

# Process the title data.
title_pp = processor(.75, keep_n=4500)
train_title_vecs = title_pp.fit_transform(train_title_raw)

# Apply the fitted transformations to the test data.
test_body_raw = testdf.body.tolist()
test_title_raw = testdf.title.tolist()

test_body_vecs = body_pp.transform_parallel(test_body_raw)
test_title_vecs = title_pp.transform_parallel(test_title_raw)

# Extract labels.
train_labels = np.expand_dims(traindf.class_int.values, -1)
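# The snippet above extracts only the training labels. An illustrative
# continuation (not in the original) extracting the test labels the same way
# and checking that vectors and labels line up.
test_labels = np.expand_dims(testdf.class_int.values, -1)

print('train vecs/labels:', train_body_vecs.shape, train_title_vecs.shape,
      train_labels.shape)
print('test vecs/labels:', test_body_vecs.shape, test_title_vecs.shape,
      test_labels.shape)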
assert len(train_code) == len(train_comment)
assert len(holdout_code) == len(holdout_comment)

# # Tokenize Text
#
# In this step, we are going to pre-process the raw text for modeling. For an
# explanation of what this section does, see the [Prepare & Clean Data section
# of this tutorial](https://towardsdatascience.com/how-to-create-data-products-that-are-magical-using-sequence-to-sequence-models-703f86a231f8)

# In[7]:

from ktext.preprocess import processor

if not use_cache:
    code_proc = processor(hueristic_pct_padding=.7, keep_n=20000)
    t_code = code_proc.fit_transform(train_code)

    comment_proc = processor(append_indicators=True, hueristic_pct_padding=.7,
                             keep_n=14000, padding='post')
    t_comment = comment_proc.fit_transform(train_comment)

elif use_cache:
    logging.warning('Not fitting transform function because use_cache=True')

# **Save tokenized text** (You will reuse this for step 4)

# In[10]:

import dill as dpickle
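# The save cell is cut off after the dill import. A minimal sketch of the save
# step, assuming the dill/numpy conventions used in the other snippets in this
# document; the file names are illustrative.
import numpy as np

if not use_cache:
    # Persist the fitted preprocessors so later steps can reuse them.
    with open('py_code_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(code_proc, f)
    with open('py_comment_proc_v2.dpkl', 'wb') as f:
        dpickle.dump(comment_proc, f)

    # Persist the vectorized training data.
    np.save('py_t_code_vecs_v2.npy', t_code)
    np.save('py_t_comment_vecs_v2.npy', t_comment)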
        train_code2 = f.readlines()

    train_code = train_code1 + train_code2

    with open(PATH/'test.function', 'r') as f:
        holdout_code = f.readlines()

    with open(PATH/'train.docstring', 'r') as f:
        train_docstring1 = f.readlines()

    with open(PATH/'valid.docstring', 'r') as f:
        train_docstring2 = f.readlines()

    train_docstring = train_docstring1 + train_docstring2

    with open(PATH/'test.docstring', 'r') as f:
        holdout_docstring = f.readlines()

    return train_code, holdout_code, train_docstring, holdout_docstring


def save_processors(code, docstring):
    code_processor = processor(heuristic_pct_padding=.7, keep_n=20000)
    code_pp = code_processor.fit_transform(code)

    docstring_processor = processor(append_indicators=True,
                                    heuristic_pct_padding=.7,
                                    keep_n=14000, padding='post')
    docstring_pp = docstring_processor.fit_transform(docstring)

    with open('seq2seq/py_code_processor_v2.dpkl', 'wb') as f:
        dpickle.dump(code_processor, f)

    with open('seq2seq/py_docstring_processor_v2.dpkl', 'wb') as f:
        dpickle.dump(docstring_processor, f)

    # Save the vectorized data (the fit_transform results), not the raw text,
    # which isn't in scope inside this function.
    np.save('seq2seq/py_train_code_vecs_v2.npy', code_pp)
    np.save('seq2seq/py_train_docstring_vecs_v2.npy', docstring_pp)


def load_text_processor(fname='title_pp.dpkl'):
    with open(fname, 'rb') as f:
        pp = dpickle.load(f)

    num_tokens = max(pp.id2token.keys()) + 1
    return num_tokens, pp
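# An illustrative round trip with load_text_processor (not in the original
# excerpt), assuming processors were saved with the dill conventions above.
# The sample function string is made up.
num_code_tokens, code_processor = load_text_processor(
    'seq2seq/py_code_processor_v2.dpkl')
num_docstring_tokens, docstring_processor = load_text_processor(
    'seq2seq/py_docstring_processor_v2.dpkl')

print('code vocabulary size:', num_code_tokens)
print('docstring vocabulary size:', num_docstring_tokens)

# transform_parallel applies the already-fitted cleaning/tokenizing/padding
# pipeline to new, unseen text.
new_vecs = code_processor.transform_parallel(
    ['def add(a, b):\n    return a + b\n'])
print(new_vecs.shape)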
import logging
import glob

from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 500)

logger = logging.getLogger()
logger.setLevel(logging.WARNING)

# Read in the data and sample rows (for speed of tutorial).
traindf, testdf = train_test_split(
    pd.read_csv('github_issues.csv').sample(n=50000), test_size=.10)

# Print out stats about the shape of the data.
print('Train: ', traindf.shape)
print('Test: ', testdf.shape)

# Preview the data.
traindf.head(3)

train_body_raw = traindf.body.tolist()
train_title_raw = traindf.issue_title.tolist()

# Preview the output of the first element.
train_body_raw[0]

from ktext.preprocess import processor

# Clean, tokenize, and apply padding / truncating such that each document
# length = 70. Also, retain only the top 8,000 words in the vocabulary and
# set the remaining words to 1, which becomes a common index for rare words.
body_pp = processor(keep_n=8000, padding_maxlen=70)
train_body_vecs = body_pp.fit_transform(train_body_raw)
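# A quick check of the preprocessed output (an illustrative addition, not part
# of the original notebook cell): compare a raw body with its padded,
# tokenized integer representation.
print('Shape of preprocessed bodies:', train_body_vecs.shape)
print('Example original body:', train_body_raw[0])
print('Example body after pre-processing:', train_body_vecs[0])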