Example #1
def load_data(hypes):
    data_directory = 'working_dir/data/%s' % hypes['data_directory']

    # read data control dictionaries
    metadata = load_metadata(hypes)
    # read numpy arrays
    idx_q = np.loads(storage.get('%s/idx_q.npy' % data_directory))
    idx_a = np.loads(storage.get('%s/idx_a.npy' % data_directory))

    (trainX, trainY), (testX, testY), (validX,
                                       validY) = split_dataset(idx_q, idx_a)
    trainX = trainX.tolist()
    trainY = trainY.tolist()
    testX = testX.tolist()
    testY = testY.tolist()
    validX = validX.tolist()
    validY = validY.tolist()

    # strip padding ids so each sequence keeps only real tokens
    trainX = tl.prepro.remove_pad_sequences(trainX)
    trainY = tl.prepro.remove_pad_sequences(trainY)
    validX = tl.prepro.remove_pad_sequences(validX)
    validY = tl.prepro.remove_pad_sequences(validY)
    testX = tl.prepro.remove_pad_sequences(testX)
    testY = tl.prepro.remove_pad_sequences(testY)

    return metadata, trainX, trainY, testX, testY, validX, validY
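
A minimal usage sketch for this loader (assuming storage.set_bucket has already been called; the 'data_directory' value below is hypothetical):

hypes = {'data_directory': 'cornell_movie_corpus'}  # hypothetical directory name
metadata, trainX, trainY, testX, testY, validX, validY = load_data(hypes)
print('train/test/valid examples: %d / %d / %d'
      % (len(trainX), len(testX), len(validX)))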
Example #2
def get_climate_change_questions_and_answers(config):
    questions = []
    answer_tokens = []
    answer_metatokens = {}
    csvs = storage.get('data/climate_augmented_dataset.csv').decode('utf-8')
    augmented_rows = csv.reader(StringIO(csvs), delimiter=',')
    for row in augmented_rows:
        safe_qs = [q.replace('\n', ' ').replace('\r', '') for q in row[0:9]]
        safe_as = row[9].replace('\n', ' ').replace('\r', '')

        # Replace the answer with a random uppercase metatoken and remember
        # the mapping back to the full answer text.
        output_length = random.randrange(15, 20, 1)
        output_metatoken = ''.join(
            random.SystemRandom().choice(string.ascii_uppercase)
            for _ in range(output_length))
        answer_metatokens[output_metatoken] = safe_as

        for question in safe_qs:
            questions.append(question)
            answer_tokens.append([output_metatoken])
    # Process the raw question lines and truncate any that exceed the
    # configured maxq limit.
    questions = [
        q[:config['limit']['maxq']] for q in process_raw_lines(questions)
    ]
    multiplier = config.get('climate_multiplier', 1)
    print("Climate multiplier %i" % multiplier)
    return questions * multiplier, answer_tokens * multiplier, answer_metatokens
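
The random uppercase metatokens stand in for full canned answers during training; at inference time a predicted metatoken can be expanded back into its answer text. A sketch of that lookup (expand_metatokens is a hypothetical helper, not part of this module):

def expand_metatokens(decoded_words, answer_metatokens):
    # Replace any known metatoken with its stored answer text; leave
    # ordinary words untouched.
    return [answer_metatokens.get(word, word) for word in decoded_words]

# e.g. expand_metatokens(['HELLOWORLDTOKEN'], {'HELLOWORLDTOKEN': 'Climate change is real.'})
# -> ['Climate change is real.']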
Example #3
def length(data_directory, mode):
    # Approximate number of examples available for the given mode,
    # derived from the stored question array and the split ratios.
    idx_q = np.loads(storage.get('%s/idx_q.npy' % data_directory))
    if mode == ModeKeys.TRAIN:
        return len(idx_q) * split_ratios[0]
    elif mode == ModeKeys.EVAL:
        return len(idx_q) * split_ratios[2]
    elif mode == ModeKeys.PREDICT:
        return len(idx_q) * split_ratios[1]
Example #4
def get_conversations():
    conv_lines = storage.get('data/movie_conversations.txt').decode(
        'utf-8', 'ignore').split('\n')
    convs = []
    for line in conv_lines[:-1]:
        # The last field looks like "['L194', 'L195', ...]"; strip the
        # brackets, quotes and spaces, then split it into line ids.
        _line = line.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
        convs.append(_line.split(','))
    return convs
Example #5
def get_id2line():
    lines = storage.get('data/movie_lines.txt').decode('utf-8',
                                                       'ignore').split('\n')
    id2line = {}
    for line in lines:
        _line = line.split(' +++$+++ ')
        if len(_line) == 5:
            # map the line id (field 0) to its utterance text (field 4)
            id2line[_line[0]] = _line[4]
    return id2line
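
Examples #4 and #5 parse the Cornell Movie-Dialogs files; a typical next step (a sketch under that assumption, using a hypothetical helper name rather than this project's exact function) pairs consecutive lines of each conversation into question/answer strings:

def gather_question_answer_pairs(convs, id2line):
    # Each conversation is a list of line ids; pair line i with line i + 1
    # so every utterance becomes a question and the reply its answer.
    questions, answers = [], []
    for conv in convs:
        for i in range(len(conv) - 1):
            if conv[i] in id2line and conv[i + 1] in id2line:
                questions.append(id2line[conv[i]])
                answers.append(id2line[conv[i + 1]])
    return questions, answers

questions, answers = gather_question_answer_pairs(get_conversations(), get_id2line())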
Example #6
    def __init__(self, job_id):
        hypes = {'cell_fn': 'LSTM'}
        hypes.update(data.load_hypes('working_dir/runs/%s' % job_id))
        self.hypes = hypes
        metadata = data.load_metadata(hypes)
        self.metadata = metadata

        net_out_values = np.loads(
            storage.get('working_dir/runs/%s/%s' %
                        (job_id, data.NET_OUT_FILENAME)))
        net_rnn_values = np.loads(
            storage.get('working_dir/runs/%s/%s' %
                        (job_id, data.NET_RNN_FILENAME)))

        sess = tf.Session()
        self.sess = sess
        inference_model = model.initialize_inference_model(hypes, metadata)
        self.inference_model = inference_model
        # Restore the trained weights into the inference network.
        tl.files.assign_params(sess, net_out_values,
                               inference_model['net_out'])
        tl.files.assign_params(sess, net_rnn_values,
                               inference_model['net_rnn'])
Example #7
def execute(hypes, metadata, job_directory):
    data_directory = 'working_dir/data/%s' % (hypes['data_directory'])
    hypes['data'] = json.loads(
        storage.get('%s/config.json' % data_directory).decode('utf-8'))

    storage.write(json.dumps(hypes, indent=2, sort_keys=True),
                  "%s/hypes.json" % job_directory)

    estimator = model.build_estimator(hypes, metadata, job_directory)

    train_input_fn = model.get_input_fn(hypes, ModeKeys.TRAIN)
    train_steps = hypes['epochs'] * data.length(data_directory, ModeKeys.TRAIN)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=train_steps)

    eval_input_fn = model.get_input_fn(hypes, ModeKeys.EVAL)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=hypes['eval_steps'],
        throttle_secs=hypes['eval_throttle_seconds'])

    # Run the training job
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
Example #8
def load_hypes(job_directory):
    return json.loads(
        storage.get("%s/hypes.json" % job_directory).decode('utf-8'))
Example #9
def load_metadata(hypes):
    data_directory = 'working_dir/data/%s' % hypes['data_directory']
    return pickle.loads(storage.get("%s/metadata.pkl" % data_directory))
Example #10
    # Run the training job
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)


if __name__ == '__main__':
    # NOTE: This is the entry point for distributed cloud execution.
    # Use {root}/train.py to run the estimator locally.
    parser = argparse.ArgumentParser()
    parser.add_argument('--hypes_path',
                        help='Path to hypes on GCS for this job run.',
                        default='/hypes.json')
    parser.add_argument('--bucket_name',
                        help='Name of GCS bucket',
                        required=True)
    parser.add_argument('--job_directory',
                        help='Name of job directory under working_dir/runs.',
                        required=True)
    args = parser.parse_args()
    arguments = vars(args)

    storage.set_bucket(arguments['bucket_name'])

    hypes = {'cell_fn': 'LSTM'}
    hypes.update(
        json.loads(storage.get(arguments['hypes_path']).decode('utf-8')))

    job_directory = 'working_dir/runs/%s' % (arguments['job_directory'])
    metadata = data.load_metadata(hypes)
    execute(hypes, metadata, job_directory)