def load_data(hypes):
    data_directory = 'working_dir/data/%s' % hypes['data_directory']

    # Read data control dictionaries.
    metadata = load_metadata(hypes)

    # Read the serialized numpy arrays. np.loads was a deprecated alias for
    # pickle.loads and has been removed from NumPy, so call pickle directly.
    idx_q = pickle.loads(storage.get('%s/idx_q.npy' % data_directory))
    idx_a = pickle.loads(storage.get('%s/idx_a.npy' % data_directory))

    (trainX, trainY), (testX, testY), (validX, validY) = split_dataset(
        idx_q, idx_a)

    trainX = trainX.tolist()
    trainY = trainY.tolist()
    testX = testX.tolist()
    testY = testY.tolist()
    validX = validX.tolist()
    validY = validY.tolist()

    # Strip padding tokens from every split.
    trainX = tl.prepro.remove_pad_sequences(trainX)
    trainY = tl.prepro.remove_pad_sequences(trainY)
    validX = tl.prepro.remove_pad_sequences(validX)
    validY = tl.prepro.remove_pad_sequences(validY)
    testX = tl.prepro.remove_pad_sequences(testX)
    testY = tl.prepro.remove_pad_sequences(testY)

    return metadata, trainX, trainY, testX, testY, validX, validY
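# load_data leans on split_dataset and split_ratios, which are defined
# elsewhere in the repo. A minimal sketch of that dependency, assuming a
# contiguous train/test/validation split; the ratio values below are
# illustrative, not the repo's actual configuration.
split_ratios = [0.7, 0.15, 0.15]  # assumed (train, test, valid) fractions


def split_dataset(idx_q, idx_a):
    # Slice the question/answer arrays into train, test and validation
    # blocks, mirroring the unpacking order used in load_data above.
    n = len(idx_q)
    train_end = int(n * split_ratios[0])
    test_end = train_end + int(n * split_ratios[1])
    return ((idx_q[:train_end], idx_a[:train_end]),
            (idx_q[train_end:test_end], idx_a[train_end:test_end]),
            (idx_q[test_end:], idx_a[test_end:]))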
def get_climate_change_questions_and_answers(config):
    questions = []
    answer_tokens = []
    answer_metatokens = {}

    csvs = storage.get('data/climate_augmented_dataset.csv').decode('utf-8')
    augmented_rows = csv.reader(StringIO(csvs), delimiter=',')
    for row in augmented_rows:
        safe_qs = [q.replace('\n', ' ').replace('\r', '') for q in row[0:9]]
        safe_as = row[9].replace('\n', ' ').replace('\r', '')

        # Tokenize the output as a random string.
        output_length = random.randrange(15, 20, 1)
        output_metatoken = ''.join(
            random.SystemRandom().choice(string.ascii_uppercase)
            for _ in range(output_length))
        answer_metatokens[output_metatoken] = safe_as

        for question in safe_qs:
            questions.append(question)
            answer_tokens.append([output_metatoken])

    # This also truncates climate questions that are beyond the config limit.
    questions = [
        q[:config['limit']['maxq']] for q in process_raw_lines(questions)
    ]

    multiplier = config.get('climate_multiplier', 1)
    print("Climate multiplier %i" % multiplier)
    return (questions * multiplier, answer_tokens * multiplier,
            answer_metatokens)
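# The random metatokens act as a stand-in answer vocabulary: the model learns
# to emit a single metatoken, and the caller maps it back to the full
# free-text answer. A hedged illustration of that lookup (the variable names
# below are hypothetical):
#
#   predicted_token = answer_tokens[0][0]        # e.g. 'KQZPWXMLRTYUABC'
#   full_answer = answer_metatokens[predicted_token]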
def length(data_directory, mode):
    idx_q = pickle.loads(storage.get('%s/idx_q.npy' % data_directory))
    # Cast to int: the split ratios are fractions, and the estimator expects
    # an integral step count.
    if mode == ModeKeys.TRAIN:
        return int(len(idx_q) * split_ratios[0])
    elif mode == ModeKeys.EVAL:
        return int(len(idx_q) * split_ratios[2])
    elif mode == ModeKeys.PREDICT:
        return int(len(idx_q) * split_ratios[1])
def get_conversations():
    conv_lines = storage.get('data/movie_conversations.txt').decode(
        'utf-8', 'ignore').split('\n')
    convs = []
    for line in conv_lines[:-1]:
        # The last field is a list literal of line ids, e.g.
        # "['L194', 'L195']"; strip the brackets, quotes and spaces,
        # then split on commas.
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(
            " ", "")
        convs.append(_line.split(','))
    return convs
def get_id2line():
    lines = storage.get('data/movie_lines.txt').decode(
        'utf-8', 'ignore').split('\n')
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            # Field 0 is the line id, field 4 is the utterance text.
            id2line[_line[0]] = _line[4]
    return id2line
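# get_conversations and get_id2line together reconstruct the Cornell
# Movie-Dialogs corpus. A minimal sketch of how they are typically combined
# into (question, answer) training pairs; gather_qa_pairs is hypothetical,
# not a function from this repo.
def gather_qa_pairs():
    id2line = get_id2line()
    questions, answers = [], []
    for conv in get_conversations():
        # Treat each consecutive pair of utterances as one training example.
        for i in range(len(conv) - 1):
            questions.append(id2line.get(conv[i], ''))
            answers.append(id2line.get(conv[i + 1], ''))
    return questions, answers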
def __init__(self, job_id):
    hypes = {'cell_fn': 'LSTM'}
    hypes.update(data.load_hypes('working_dir/runs/%s' % job_id))
    self.hypes = hypes

    metadata = data.load_metadata(hypes)
    self.metadata = metadata

    # np.loads was a deprecated alias for pickle.loads and has been removed
    # from NumPy, so unpickle the stored weight arrays directly.
    net_out_values = pickle.loads(
        storage.get('working_dir/runs/%s/%s' %
                    (job_id, data.NET_OUT_FILENAME)))
    net_rnn_values = pickle.loads(
        storage.get('working_dir/runs/%s/%s' %
                    (job_id, data.NET_RNN_FILENAME)))

    sess = tf.Session()
    self.sess = sess

    inference_model = model.initialize_inference_model(hypes, metadata)
    self.inference_model = inference_model

    # Restore the trained decoder and RNN weights into the inference graph.
    tl.files.assign_params(sess, net_out_values, inference_model['net_out'])
    tl.files.assign_params(sess, net_rnn_values, inference_model['net_rnn'])
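# Hypothetical usage, assuming this __init__ belongs to an inference wrapper
# class (called Bot here purely for illustration):
#
#   bot = Bot('my_job_id')
#   # bot.sess now holds a TF1 session with the trained net_out/net_rnn
#   # weights assigned into the rebuilt inference graph.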
def execute(hypes, metadata, job_directory):
    data_directory = 'working_dir/data/%s' % (hypes['data_directory'])
    hypes['data'] = json.loads(
        storage.get('%s/config.json' % data_directory).decode('utf-8'))
    storage.write(json.dumps(hypes, indent=2, sort_keys=True),
                  "%s/hypes.json" % job_directory)

    estimator = model.build_estimator(hypes, metadata, job_directory)

    train_input_fn = model.get_input_fn(hypes, ModeKeys.TRAIN)
    train_steps = hypes['epochs'] * data.length(data_directory,
                                                ModeKeys.TRAIN)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=train_steps)

    eval_input_fn = model.get_input_fn(hypes, ModeKeys.EVAL)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=hypes['eval_steps'],
        throttle_secs=hypes['eval_throttle_seconds'])

    # Run the training job.
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
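# A sketch of the hyperparameter keys this training path reads; every key
# appears in the code above, but the values are illustrative only.
example_hypes = {
    'cell_fn': 'LSTM',             # RNN cell type (default set by callers)
    'data_directory': 'cornell',   # subdirectory under working_dir/data
    'epochs': 50,                  # multiplied by data.length() for max_steps
    'eval_steps': 100,             # steps per evaluation pass
    'eval_throttle_seconds': 600,  # minimum seconds between evaluations
}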
def load_hypes(job_directory):
    return json.loads(
        storage.get("%s/hypes.json" % job_directory).decode('utf-8'))
def load_metadata(hypes):
    data_directory = 'working_dir/data/%s' % hypes['data_directory']
    return pickle.loads(storage.get("%s/metadata.pkl" % data_directory))
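# Every function here goes through the storage helper. A minimal sketch of
# the contract its call sites imply, assuming a thin wrapper over
# google-cloud-storage; the repo's real implementation may differ.
from google.cloud import storage as gcs

_bucket = None


def set_bucket(bucket_name):
    # Select the GCS bucket used by all subsequent get/write calls.
    global _bucket
    _bucket = gcs.Client().bucket(bucket_name)


def get(path):
    # Return the object's contents as bytes, matching the .decode() and
    # pickle.loads() calls above.
    return _bucket.blob(path).download_as_bytes()


def write(data, path):
    # Upload a string to the given object path.
    _bucket.blob(path).upload_from_string(data)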
if __name__ == '__main__':
    # NOTE: This is the entry point for distributed cloud execution.
    # Use {root}/train.py for running the estimator locally.
    parser = argparse.ArgumentParser()
    parser.add_argument('--hypes_path',
                        help='Path to hypes on GCS for this job run.',
                        default='/hypes.json')
    parser.add_argument('--bucket_name',
                        help='Name of GCS bucket',
                        required=True)
    parser.add_argument('--job_directory',
                        help='Name of job directory under working_dir/runs.',
                        required=True)
    args = parser.parse_args()
    arguments = vars(args)

    storage.set_bucket(arguments['bucket_name'])

    hypes = {'cell_fn': 'LSTM'}
    hypes.update(
        json.loads(storage.get(arguments['hypes_path']).decode('utf-8')))

    job_directory = 'working_dir/runs/%s' % (arguments['job_directory'])
    metadata = data.load_metadata(hypes)
    execute(hypes, metadata, job_directory)
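# Example invocation for a cloud run; the module path and argument values are
# illustrative only.
#
#   python -m trainer.task \
#       --bucket_name=my-chatbot-bucket \
#       --job_directory=run_001 \
#       --hypes_path=working_dir/runs/run_001/hypes.json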