Example no. 1
 def testEmptyFilename(self):
     f = file_io.FileIO("", mode="r")
     with self.assertRaises(errors.NotFoundError):
         _ = f.read()
def train(job_dir=None, job_id=None,
          use_transposed_conv=True, score_metric='mse', loss='binary_crossentropy',
          learning_rate = 0.001, lr_decay=0.001, optimizer_name='adam', n_epochs=100,
          patience=5, batch_norm_before_activation=False, pool_method='max', **kwargs):
    '''main training function'''

    global do_batch_norm_before_activation
    do_batch_norm_before_activation = batch_norm_before_activation
    print('--> batch_norm_before_activation== {}!!!\n'.format(do_batch_norm_before_activation))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    original_img_size = (img_rows, img_cols, img_chns)

    n_filters = 8 # number of convolutional filters to use
    kernel_size = 3 # convolution kernel size
    batch_size = 1000

    lr = learning_rate
    decay = lr_decay
    opt = optimizer_name

    if job_dir is None:
        job_dir = './tmp/'

    if job_id is None:
        ctime = time.ctime().split()
        time_str = ctime[4]+ctime[1]+ctime[2]+"_"+''.join(ctime[3].split(":")[0:2])
        job_id = time_str
        checkpoint_filename = 'mnist_autoencoder_checkpoint_{}.hdf5'.format(job_id)
    else:
        checkpoint_filename = '{}.hdf5'.format(job_id)

    if use_transposed_conv is True:
        print('--> use_transposed_conv is True!!!\n')
        job_id = 'transposed_conv_' + job_id


    ############ Encoder ###############
    x = Input(shape=original_img_size) # input image tensor of shape (28, 28, 1)

    conv1 = conv_block(x, n_filters, kernel_size)
    conv1 = conv_block(conv1, n_filters, kernel_size)
    conv1 = pool_layer(conv1, method=pool_method, pool_size=(2,2), padding='same')
    
    conv2 = conv_block(conv1, n_filters*2, kernel_size)
    conv2 = conv_block(conv2, n_filters*2, kernel_size)
    conv2 = pool_layer(conv2, method=pool_method, pool_size=(2,2), padding='same')

    conv3 = conv_block(conv2, n_filters*4, kernel_size)
    conv3 = conv_block(conv3, n_filters*4, kernel_size)
    encoded = pool_layer(conv3, method=pool_method, pool_size=(2,2), padding='same')

    # End of encoder. The compressed representation is (4, 4, n_filters*4)

    conv4 = conv_block(encoded, n_filters*4, kernel_size, use_transposed_conv=use_transposed_conv)
    conv4 = conv_block(conv4, n_filters*4, kernel_size, use_transposed_conv=use_transposed_conv)
    conv4 = UpSampling2D((2, 2))(conv4)

    conv5 = conv_block(conv4, n_filters*2, kernel_size, use_transposed_conv=use_transposed_conv)
    conv5 = conv_block(conv5, n_filters*2, kernel_size, use_transposed_conv=use_transposed_conv)
    conv5 = UpSampling2D((2, 2))(conv5)

    conv6 = conv_block(conv5, n_filters, kernel_size, use_transposed_conv=use_transposed_conv)
    conv6 = conv_block(conv6, n_filters, kernel_size, use_transposed_conv=use_transposed_conv)
    conv6 = UpSampling2D((2, 2))(conv6)

    decoded = conv_block(conv6, 1, kernel_size=kernel_size, activation='sigmoid', padding='same',
                         batch_norm=False, use_transposed_conv=use_transposed_conv) # sigmoid activation because the pixel values are bounded between 0 and 1, with many exact zeros
    decoded = Cropping2D(cropping=((2, 2), (2, 2)))(decoded) # crop 2 on each side of the img to get 28x28

    # Put all layers together into a model graph
    autoencoder = Model(x, decoded)

    ######### End of decoder ###################
    ######### Now config models for training and logging ##########

    if opt == 'adam':
        optimizer = Adam(lr=lr, decay=decay)
    elif opt == 'sgd':
        optimizer = SGD(lr=lr, momentum=0.9, decay=decay, nesterov=True)
    else:
        raise ValueError('Unsupported optimizer_name: {}'.format(opt))

    autoencoder.compile(optimizer=optimizer, loss = loss, metrics = [score_metric])
    autoencoder.summary()


    # data from MNIST digits
    (x_train, _), (x_test, y_test) = mnist.load_data()

    # reshape data to (data_size, n_pix, n_pix, n_channels)
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)

    print('x_train.shape:', x_train.shape)
    print('Initial evaluation of random model ={}\n'.format(autoencoder.evaluate(x_test, x_test, batch_size=batch_size)))

    callbacks = [EarlyStopping(monitor='val_loss', patience=patience, verbose=2, mode='min', min_delta=0.0005),
                 ModelCheckpoint(checkpoint_filename, monitor='val_loss', verbose=2, save_best_only=True),
                 TensorBoard(log_dir=job_dir)]

    history = autoencoder.fit(x=x_train, y=x_train, shuffle=True, epochs=n_epochs, batch_size=batch_size, callbacks=callbacks,
                        verbose=2, validation_data=(x_test, x_test))

    test_score = autoencoder.evaluate(x_test, x_test, verbose=0, batch_size=batch_size)
    print('Final test score:', test_score)

    if score_metric == 'mae':
        history_key_validation = 'val_mean_absolute_error'
        history_key_train = 'mean_absolute_error'
    if score_metric == 'mse':
        history_key_validation = 'val_mean_squared_error'
        history_key_train = 'mean_squared_error'

    validation_history = history.history[history_key_validation]
    training_history = history.history[history_key_train]


    # Save model to gs
    if 'gs://' in job_dir:
        # Save model hdf5 to google storage
        with file_io.FileIO(checkpoint_filename, mode='rb') as input_f:
            with file_io.FileIO(job_dir + checkpoint_filename, mode='wb') as output_f:
                output_f.write(input_f.read())


    #%% Plot a learning curve
    fig_name = 'lr_{}.pdf'.format(job_id)
    if 'gs://' not in job_dir:
        fig_name = job_dir + fig_name

    f, axes = plt.subplots(2, sharex=True, figsize=(8,7))
    axes[0].plot(training_history)
    axes[0].set_ylabel('Training score ({})'.format(score_metric))
    axes[0].set_title('Final test score ({0}) = {1:2.4f}\n LR={2}, decay={3}, optimizer={4}, pool_method={5}\n \
                    use_transposed_conv={6}, loss={7}, batch_norm_before_activation={8}'.format(score_metric,
                    test_score[1], lr, decay, opt, pool_method, use_transposed_conv, loss, do_batch_norm_before_activation), fontsize=9)

    axes[1].plot(validation_history)
    axes[1].set_xlabel('Epochs')
    axes[1].set_ylabel('Validation score ({})'.format(score_metric))
    #f.suptitle('Config file :{}'.format(train_config_file), fontsize=10)
    f.subplots_adjust(hspace=0.05)
    f.savefig(fig_name)

    if 'gs://' in job_dir:
        #Save figure to GS
        with file_io.FileIO(fig_name, mode='rb') as input_f:
            with file_io.FileIO(job_dir + fig_name, mode='wb') as output_f:
                output_f.write(input_f.read())


    #%% Sample a few test images and compare with reconstructed ones
    n_imgs_to_show = 30
    x_test_sub = np.random.permutation(x_test)[0:n_imgs_to_show]

    reconstructed_test = autoencoder.predict(x_test_sub, batch_size=n_imgs_to_show)
    #reconstructed_train = autoencoder.predict(x_train[0:10000].reshape((10000, 28, 28, 1)), batch_size=batch_size)

    # plot reconstructed images and compare
    fig_name = 'compare_{}.pdf'.format(job_id)
    if 'gs://' not in job_dir:
        fig_name = job_dir + fig_name

    n_rows = 3 # split original images into 3 rows
    n_cols = n_imgs_to_show//n_rows
    f, axes = plt.subplots(n_rows*2, n_cols, sharey=True, figsize=(10,10))

    for i in range(n_imgs_to_show):
        axes[i//n_cols * 2, i % n_cols].imshow(x_test_sub[i,:,:,0])
        axes[i//n_cols * 2, 0].set_ylabel('Original')
        axes[i//n_cols * 2 +1, i % n_cols].imshow(reconstructed_test[i,:,:,0])
        axes[i//n_cols * 2 +1, 0].set_ylabel('Reconstructed')
    f.savefig(fig_name)

    if 'gs://' in job_dir:
        #Save figure to GS
        with file_io.FileIO(fig_name, mode='rb') as input_f:
            with file_io.FileIO(job_dir + fig_name, mode='wb') as output_f:
                output_f.write(input_f.read())
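
Note: the training function above calls two helpers, conv_block and pool_layer, that are defined elsewhere in the trainer and are not shown. The sketch below is only a guess at their shape, consistent with how they are called above (filter count, batch-norm ordering via the do_batch_norm_before_activation global, and the use_transposed_conv switch); the real implementations may differ.

# Hypothetical helpers assumed by train() above -- a sketch, not the original code.
from keras.layers import (Activation, AveragePooling2D, BatchNormalization,
                          Conv2D, Conv2DTranspose, MaxPooling2D)

do_batch_norm_before_activation = False  # overridden by train() via `global`

def conv_block(inputs, n_filters, kernel_size, activation='relu',
               padding='same', batch_norm=True, use_transposed_conv=False):
    """(Transposed) convolution + batch norm + activation, order configurable."""
    conv_cls = Conv2DTranspose if use_transposed_conv else Conv2D
    h = conv_cls(n_filters, kernel_size, padding=padding)(inputs)
    if batch_norm and do_batch_norm_before_activation:
        h = BatchNormalization()(h)
        h = Activation(activation)(h)
    else:
        h = Activation(activation)(h)
        if batch_norm:
            h = BatchNormalization()(h)
    return h

def pool_layer(inputs, method='max', pool_size=(2, 2), padding='same'):
    """Max or average pooling, selected by `method`."""
    pool_cls = MaxPooling2D if method == 'max' else AveragePooling2D
    return pool_cls(pool_size=pool_size, padding=padding)(inputs)
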
Example no. 3
def main(argv=None):
    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--predictions',
                        type=str,
                        help='GCS path of prediction file pattern.')
    parser.add_argument('--output',
                        type=str,
                        help='GCS path of the output directory.')
    parser.add_argument(
        '--target_lambda',
        type=str,
        help='a lambda function as a string to compute target.' +
        'For example, "lambda x: x[\'a\'] + x[\'b\']"' +
        'If not set, the input must include a "target" column.')
    args = parser.parse_args()

    on_cloud = args.output.startswith('gs://')
    if not on_cloud and not os.path.exists(args.output):
        os.makedirs(args.output)

    schema_file = os.path.join(os.path.dirname(args.predictions),
                               'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]
    dfs = []
    files = file_io.get_matching_files(args.predictions)
    for file in files:
        with file_io.FileIO(file, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))

    df = pd.concat(dfs)
    if args.target_lambda:
        df['target'] = df.apply(eval(args.target_lambda), axis=1)

    vocab = list(df['target'].unique())
    cm = confusion_matrix(df['target'], df['predicted'], labels=vocab)
    data = []
    for target_index, target_row in enumerate(cm):
        for predicted_index, count in enumerate(target_row):
            data.append((vocab[target_index], vocab[predicted_index], count))

    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    cm_file = os.path.join(args.output, 'confusion_matrix.csv')
    with file_io.FileIO(cm_file, 'w') as f:
        df_cm.to_csv(f,
                     columns=['target', 'predicted', 'count'],
                     header=False,
                     index=False)

    metadata = {
        'outputs': [{
            'type':
            'confusion_matrix',
            'storage':
            'gcs',
            'format':
            'csv',
            'schema': [
                {
                    'name': 'target',
                    'type': 'CATEGORY'
                },
                {
                    'name': 'predicted',
                    'type': 'CATEGORY'
                },
                {
                    'name': 'count',
                    'type': 'NUMBER'
                },
            ],
            'source':
            cm_file,
            # Convert vocab to string because for boolean values we want "True|False" to match the csv data.
            'labels':
            list(map(str, vocab)),
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    accuracy = accuracy_score(df['target'], df['predicted'])
    metrics = {
        'metrics': [{
            'name': 'accuracy-score',
            'numberValue': accuracy,
            'format': "PERCENTAGE",
        }]
    }
    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)
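
For reference, a component like this is usually driven by command-line arguments; a hypothetical local invocation is sketched below (the paths are placeholders, and a schema.json file is expected next to the prediction files).

# Hypothetical invocation of main() above; paths are placeholders only.
import sys
sys.argv = ['confusion_matrix.py',
            '--predictions', '/tmp/predict-output/part-*',
            '--output', '/tmp/cm-analysis']
main()  # writes confusion_matrix.csv plus the two /mlpipeline-*.json files
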
Example no. 4
def run_local_analysis(output_dir, csv_file_pattern, schema,
                       inverted_features):
    """Use pandas to analyze csv files.

  Produces a stats file and vocab files.

  Args:
    output_dir: output folder
    csv_file_pattern: list of csv file paths, may contain wildcards
    schema: BQ schema list
    inverted_features: inverted_features dict

  Raises:
    ValueError: on unknown transforms/schemas
  """
    sys.stdout.write('Expanding any file patterns...\n')
    sys.stdout.flush()
    header = [column['name'] for column in schema]
    input_files = []
    for file_pattern in csv_file_pattern:
        input_files.extend(file_io.get_matching_files(file_pattern))
    sys.stdout.write('file list computed.\n')
    sys.stdout.flush()

    # Make a copy of inverted_features and update the target transform to be
    # identity or one hot depending on the schema.
    inverted_features_target = copy.deepcopy(inverted_features)
    for name, transform_set in six.iteritems(inverted_features_target):
        if transform_set == set([constant.TARGET_TRANSFORM]):
            target_schema = next(col['type'].lower() for col in schema
                                 if col['name'] == name)
            if target_schema in constant.NUMERIC_SCHEMA:
                inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
            else:
                inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

    # initialize the results
    def _init_numerical_results():
        return {
            'min': float('inf'),
            'max': float('-inf'),
            'count': 0,
            'sum': 0.0
        }

    numerical_results = collections.defaultdict(_init_numerical_results)
    vocabs = collections.defaultdict(lambda: collections.defaultdict(int))

    num_examples = 0
    # for each file, update the numerical stats from that file, and update the set
    # of unique labels.
    for input_file in input_files:
        sys.stdout.write('Analyzing file %s...\n' % input_file)
        sys.stdout.flush()
        with file_io.FileIO(input_file, 'r') as f:
            for line in csv.reader(f):
                if len(header) != len(line):
                    raise ValueError(
                        'Schema has %d columns but a csv line only has %d columns.'
                        % (len(header), len(line)))
                parsed_line = dict(zip(header, line))
                num_examples += 1

                for col_name, transform_set in six.iteritems(
                        inverted_features_target):
                    # All transforms in transform_set require the same analysis. So look
                    # at the first transform.
                    transform_name = next(iter(transform_set))
                    if transform_name in constant.TEXT_TRANSFORMS:
                        split_strings = parsed_line[col_name].split(' ')

                        # If a label is in the row N times, increase its vocab count by 1.
                        # This is needed for TFIDF, but it's also an interesting stat.
                        for one_label in set(split_strings):
                            # Filter out empty strings
                            if one_label:
                                vocabs[col_name][one_label] += 1
                    elif transform_name in constant.CATEGORICAL_TRANSFORMS:
                        if parsed_line[col_name]:
                            vocabs[col_name][parsed_line[col_name]] += 1
                    elif transform_name in constant.NUMERIC_TRANSFORMS:
                        if not parsed_line[col_name].strip():
                            continue

                        numerical_results[col_name]['min'] = (min(
                            numerical_results[col_name]['min'],
                            float(parsed_line[col_name])))
                        numerical_results[col_name]['max'] = (max(
                            numerical_results[col_name]['max'],
                            float(parsed_line[col_name])))
                        numerical_results[col_name]['count'] += 1
                        numerical_results[col_name]['sum'] += float(
                            parsed_line[col_name])

        sys.stdout.write('file %s analyzed.\n' % input_file)
        sys.stdout.flush()

    # Write the vocab files. Each label is on its own line.
    vocab_sizes = {}
    for name, label_count in six.iteritems(vocabs):
        # df is now:
        # label1,count
        # label2,count
        # ...
        # where label1 is the most frequent label, and label2 is the 2nd most, etc.
        df = pd.DataFrame([{
            'label': label,
            'count': count
        } for label, count in sorted(
            six.iteritems(label_count), key=lambda x: x[1], reverse=True)],
                          columns=['label', 'count'])
        csv_string = df.to_csv(index=False, header=False)

        file_io.write_string_to_file(
            os.path.join(output_dir, constant.VOCAB_ANALYSIS_FILE % name),
            csv_string)

        vocab_sizes[name] = {'vocab_size': len(label_count)}

    # Update numerical_results to just have min/max/mean
    for col_name in numerical_results:
        if float(numerical_results[col_name]['count']) == 0:
            raise ValueError('Column %s has a zero count' % col_name)
        mean = (numerical_results[col_name]['sum'] /
                float(numerical_results[col_name]['count']))
        del numerical_results[col_name]['sum']
        del numerical_results[col_name]['count']
        numerical_results[col_name]['mean'] = mean

    # Write the stats file.
    numerical_results.update(vocab_sizes)
    stats = {'column_stats': numerical_results, 'num_examples': num_examples}
    file_io.write_string_to_file(
        os.path.join(output_dir, constant.STATS_FILE),
        json.dumps(stats, indent=2, separators=(',', ': ')))
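
A minimal call of run_local_analysis might look like the following. The schema and inverted_features values are made up for illustration; the constant.* transform names are the ones already referenced inside the function, under the assumption that IDENTITY_TRANSFORM counts as numeric and ONE_HOT_TRANSFORM as categorical.

# Illustrative inputs only; real values come from the surrounding package.
schema = [{'name': 'age', 'type': 'INTEGER'},
          {'name': 'color', 'type': 'STRING'},
          {'name': 'label', 'type': 'STRING'}]
inverted_features = {'age': {constant.IDENTITY_TRANSFORM},
                     'color': {constant.ONE_HOT_TRANSFORM},
                     'label': {constant.TARGET_TRANSFORM}}
run_local_analysis(output_dir='/tmp/analysis',
                   csv_file_pattern=['/tmp/train-*.csv'],
                   schema=schema,
                   inverted_features=inverted_features)
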
Example no. 5
def run_predict(output_dir, data_path, schema, target_name, model_export_dir,
                project, mode, batch_size):
    """Run predictions with given model using DataFlow.
    Args:
      output_dir: output folder
      data_path: test data file path.
      schema: schema list.
      target_name: target column name.
      model_export_dir: GCS or local path of exported model trained with tft preprocessed data.
      project: the project to run dataflow in.
      mode: 'local' to run with the direct runner or 'cloud' to run on Dataflow.
      batch_size: batch size when running prediction.
    """

    target_type = next(x for x in schema if x['name'] == target_name)['type']
    labels_file = os.path.join(model_export_dir, 'assets',
                               'vocab_' + target_name)
    is_classification = file_io.file_exists(labels_file)

    output_file_prefix = os.path.join(output_dir, 'prediction_results')
    output_schema_file = os.path.join(output_dir, 'schema.json')
    names = [x['name'] for x in schema]

    output_schema = [x for x in schema if x['name'] != target_name]
    if is_classification:
        with file_io.FileIO(labels_file, mode='r') as f:
            labels = [x.strip() for x in f.readlines()]

        output_schema.append({'name': 'target', 'type': 'CATEGORY'})
        output_schema.append({'name': 'predicted', 'type': 'CATEGORY'})
        output_schema.extend([{'name': x, 'type': 'NUMBER'} for x in labels])
    else:
        output_schema.append({'name': 'target', 'type': 'NUMBER'})
        output_schema.append({'name': 'predicted', 'type': 'NUMBER'})

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name':
            'pipeline-predict-' +
            datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'temp_location':
            os.path.join(output_dir, 'tmp'),
            'project':
            project,
            'setup_file':
            './setup.py',
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        raw_results = (p
                       | 'read data' >> beam.io.ReadFromText(data_path)
                       | 'move target to last' >> beam.ParDo(
                           TargetToLastDoFn(names, target_name))
                       | 'batch' >> beam.ParDo(EmitAsBatchDoFn(batch_size))
                       |
                       'predict' >> beam.ParDo(PredictDoFn(model_export_dir)))

        if is_classification:
            processed_results = (
                raw_results
                | 'unbatch' >>
                beam.FlatMap(lambda x: zip(x['source'], x['scores']))
                | 'get predicted' >>
                beam.Map(lambda x: x[0] + [labels[x[1].argmax()]] + list(x[1]))
            )
        else:
            processed_results = (
                raw_results
                | 'unbatch' >>
                beam.FlatMap(lambda x: zip(x['source'], x['outputs']))
                | 'get predicted' >> beam.Map(lambda x: x[0] + list(x[1])))

        results_save = (
            processed_results
            | 'write csv lines' >> beam.ParDo(ListToCsvDoFn())
            | 'write file' >> beam.io.WriteToText(output_file_prefix))

        (results_save
         | 'fixed one' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1)
         | 'set schema' >> beam.Map(lambda path: json.dumps(output_schema))
         | 'write schema file' >> beam.io.WriteToText(output_schema_file,
                                                      shard_name_template=''))
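
The pipeline above relies on several DoFns (TargetToLastDoFn, EmitAsBatchDoFn, PredictDoFn, ListToCsvDoFn) that are defined elsewhere. Under the assumption that each element read from the CSV is a plain text line and each processed element is a list of field values, the two simplest ones could look roughly like this; the real implementations may differ.

# Rough sketches of two of the DoFns referenced above (assumptions, not originals).
import csv
import io

import apache_beam as beam

class TargetToLastDoFn(beam.DoFn):
    """Parses a CSV line and moves the target column to the end (sketch)."""
    def __init__(self, names, target_name):
        self._target_idx = names.index(target_name)

    def process(self, element):
        fields = next(csv.reader([element]))
        target = fields.pop(self._target_idx)
        yield fields + [target]

class ListToCsvDoFn(beam.DoFn):
    """Serializes a list of values back into a single CSV line (sketch)."""
    def process(self, element):
        buf = io.StringIO()
        csv.writer(buf).writerow(element)
        yield buf.getvalue().rstrip('\r\n')
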
Example no. 6
 def _copy_file_to_gcs(job_dir, file_path):
     gcs_url = os.path.join(job_dir, file_path)
     logger.info('Saving models to GCS: %s' % gcs_url)
     with file_io.FileIO(file_path, mode='rb') as input_f:
         with file_io.FileIO(gcs_url, mode='wb+') as output_f:
             output_f.write(input_f.read())

model.summary()

model.fit(train_parsed_dataset,
          epochs=args.num_epochs, 
          steps_per_epoch=int(train_records/batch_size)+1,
          validation_data=valid_parsed_dataset,
          validation_steps=int(valid_records/batch_size)+1,
          callbacks=[tensorboard_cb, checkpoint]
         )

# Export the model to a local SavedModel directory/cloud storage
export_path = tf.contrib.saved_model.save_keras_model(model, saved_model_path=os.path.join(output_dir,"model"))

# export checkpoint to cloud storage output directory
if args.hypertune=="hypertune":
    with file_io.FileIO(checkpoint_path, mode='rb') as input_f:
        with file_io.FileIO(os.path.join(output_dir,'checkpoint',checkpoint_path), mode='wb+') as output_f:
            output_f.write(input_f.read())


test_parsed_dataset = test_parsed_dataset.batch(batch_size)
test_parsed_dataset = test_parsed_dataset.repeat()
model.evaluate(test_parsed_dataset,steps=int(test_records/batch_size)+1)





Example no. 8
def train_model(train_file='data/mnist.pkl',
                job_dir='./tmp/mnist_mlp',
                **args):
    # set the logging path for ML Engine logging to Storage bucket
    logs_path = job_dir + '/logs/' + datetime.now().isoformat()
    print('Using logs_path located at {}'.format(logs_path))

    # Reading in the pickle file. Pickle works differently with Python 2 vs 3
    f = file_io.FileIO(train_file, mode='rb')
    if sys.version_info < (3, ):
        data = pickle.load(f)
    else:
        data = pickle.load(f, encoding='bytes')

    # with open(train_file, 'rb') as file:
    #     data = pickle.load(file, encoding='bytes')

    # the data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = data

    x_train = x_train.reshape(60000, 784)
    x_test = x_test.reshape(10000, 784)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784, )))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    history = model.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        callbacks=[keras.callbacks.TensorBoard(log_dir=logs_path)],
        validation_data=(x_test, y_test))

    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model locally
    model.save('model.h5')

    # Save the model to the Cloud Storage bucket's jobs directory
    with file_io.FileIO('model.h5', mode='rb') as input_f:
        with file_io.FileIO(job_dir + '/model.h5', mode='wb+') as output_f:
            output_f.write(input_f.read())
Example no. 9
        '--config_file',
        help=
        'A YAML config file that contains all other trainer input parameters',
        required=True)

    parser.add_argument('--data_path',
                        help='GS path for training data',
                        required=True)

    args = parser.parse_args()
    arguments = args.__dict__

    ########### Load config file and config parameters ################
    if args.config_file is None:
        raise ValueError('config YAML file must not be None!!!')
    if not file_io.file_exists(args.config_file):
        # use tf's file_io for both GS and local files
        raise ValueError('config file does not exist!!!  {}'.format(
            args.config_file))

    with file_io.FileIO(
            args.config_file,
            'r') as f:  # This reads BOTH local files and GS bucket files!!!
        config = yaml.safe_load(f)

    # actually training happens here
    mnist_autoencoder_deconv_simple.train(job_dir=args.job_dir,
                                          job_id=args.job_id,
                                          data_path=args.data_path,
                                          **config)
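
For context, the YAML file loaded here is expected to hold keyword arguments that are forwarded to train() (the function shown in Example no. 1); a minimal, made-up config could look like this.

# Illustrative config contents only; keys mirror train()'s keyword arguments.
import yaml

example_config = """
use_transposed_conv: true
score_metric: mse
loss: binary_crossentropy
learning_rate: 0.001
lr_decay: 0.001
optimizer_name: adam
n_epochs: 100
pool_method: max
batch_norm_before_activation: false
"""
print(yaml.safe_load(example_config))
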
Example no. 10
def train_model(train_file='russian punch cards/processedarrays.npy',
                job_dir='./tmp/example-5',
                log_dir='./tmp/logs',
                dropout=0.5,
                rnn_size=128,
                rnn_activation='tanh',
                rnn_layers=1,
                rnn_cell='LSTM',
                lr_decay=0,
                batch_size=64,
                epochs=100,
                saved_model='model.h5',
                test=False,
                **args):
    file_stream = file_io.FileIO(train_file, mode='rb')
    data_dict = np.load(file_stream)
    data_list = list(data_dict[()].values())
    data_unfolded = [
        np.ravel(d, order='C').astype(np.uint8) for d in data_list
        if d.shape[1] == 24
    ]

    MAX_LEN = 2400
    data_repeated = [
        np.tile(x, MAX_LEN // x.shape[0] + 1) for x in data_unfolded
    ]
    pad = pad_sequences(data_repeated,
                        maxlen=MAX_LEN,
                        dtype=np.uint8,
                        value=2,
                        padding='post',
                        truncating='post')

    if rnn_cell == 'LSTM':
        if test:
            cell = LSTM
        else:
            cell = CuDNNLSTM
    elif rnn_cell == 'GRU':
        cell = CuDNNGRU
    else:
        print('unknown rnn cell type, defaulting to LSTM')
        cell = CuDNNLSTM

    model = Sequential()
    model.add(
        cell(rnn_size,
             return_sequences=True,
             batch_input_shape=(None, None, 1)))
    model.add(Dropout(dropout))
    for i in range(rnn_layers - 1):
        model.add(cell(rnn_size, return_sequences=True))
        model.add(Dropout(dropout))
    model.add(Activation(rnn_activation))
    model.add(TimeDistributed(Dense(1, activation='sigmoid')))

    optimizer = RMSprop(clipnorm=1., decay=lr_decay)

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['binary_accuracy'])

    if test:
        X = pad[:256, :-1, None]
        y = pad[:256, 1:, None]
    else:
        X = pad[:, :-1, None]
        y = pad[:, 1:, None]

    ckpt = GCSModelCheckpoint('epoch_{epoch}_' + saved_model,
                              job_dir + '/models',
                              monitor='val_binary_accuracy',
                              save_best_only=True,
                              period=10)
    tb = TensorBoard(log_dir=log_dir + '/' + job_dir.split('/')[-1])
    model.fit(x=X,
              y=y,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=[tb, ckpt])

    model.save(saved_model)

    # Save model.h5 on to google storage
    with file_io.FileIO(saved_model, mode='rb') as input_f:
        with file_io.FileIO(job_dir + '/' + saved_model,
                            mode='w+') as output_f:
            output_f.write(input_f.read())
confmat = np.array([[53, 23, 15], [124, 51, 26], [2934, 540, 2634]])
vocab = ['Class A', 'Class B', 'Class C']
# vocab = np.arange(3)

cdata = []
for target_index, target_row in enumerate(confmat):
    print(target_index, target_row)
    print("\n")
    for pred_idx, count in enumerate(target_row):
        cdata.append((vocab[target_index], vocab[pred_idx], count))

df_cm = pd.DataFrame(cdata, columns=['target', 'predicted', 'count'])
cm_file = os.path.join('gs://data-folder/kubeflow_data_trial1',
                       'confusion_matrix.csv')

with file_io.FileIO(cm_file, 'w') as fl:
    df_cm.to_csv(fl,
                 columns=['target', 'predicted', 'count'],
                 header=False,
                 index=False)

# metadata = {
# 	'outputs' : [

# 	]
# }
# with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as fl:
# 	json.dump(metadata, fl)

metrics = {
    'metrics': [
Example no. 12
 def testWriteBinaryMode(self):
     file_path = os.path.join(self._base_dir, "temp_file")
     file_io.FileIO(file_path, "wb").write("testing")
     with file_io.FileIO(file_path, mode="r") as f:
         self.assertEqual("testing", f.read())
Example no. 13
 def testReadBinaryMode(self):
     file_path = os.path.join(self._base_dir, "temp_file")
     file_io.write_string_to_file(file_path, "testing")
     with file_io.FileIO(file_path, mode="rb") as f:
         self.assertEqual(b"testing", f.read())
Example no. 14
 def testUTF8StringPath(self):
     file_path = os.path.join(self._base_dir, "UTF8测试_file")
     file_io.write_string_to_file(file_path, "testing")
     with file_io.FileIO(file_path, mode="rb") as f:
         self.assertEqual(b"testing", f.read())
Example no. 15
def _batch_csv_reader(csv_file, n):
    with file_io.FileIO(csv_file, 'r') as f:
        args = [f] * n
        return six.moves.zip_longest(*args)
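
This is the standard "grouper" idiom: zipping n references to the same file iterator yields tuples of n consecutive lines, with the last tuple padded with None. A quick self-contained illustration (not part of the original):

# Demonstration of the zip_longest batching used above, on an in-memory file.
import io
import six

def _batch_lines(f, n):
    args = [f] * n
    return six.moves.zip_longest(*args)

lines = io.StringIO('a\nb\nc\nd\ne\n')
for batch in _batch_lines(lines, 2):
    print(batch)  # ('a\n', 'b\n'), then ('c\n', 'd\n'), then ('e\n', None)
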
Example no. 16
def main(job_dir):
    EPOCHS = 100
    INIT_LR = 1e-3
    BS = 50
    IMAGE_DIMS = (96, 96, 3)

    data = []
    labels = []
    current_dir = os.path.dirname(os.path.abspath(__file__))

    print('downloading!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    with file_io.FileIO('gs://data_bbp/data.zip', mode='rb') as f:
        with file_io.FileIO(current_dir + '/data.zip', mode='wb+') as output_f:
            output_f.write(f.read())
    print('downloaded!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    with zipfile.ZipFile(current_dir + '/data.zip', 'r') as f:

        for member in f.infolist():
            f.extract(member, current_dir + '/data')

    print('unzipped!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    imagePaths = sorted(list(paths.list_images(current_dir)))

    random.seed(42)
    random.shuffle(imagePaths)

    for imagePath in imagePaths:
        image = cv2.imread(imagePath)
        image = cv2.resize(image, (IMAGE_DIMS[1], IMAGE_DIMS[0]))
        image = img_to_array(image)
        data.append(image)

        label = imagePath.split(os.path.sep)[-2]
        labels.append(label)

    print('processed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    data = np.array(data, dtype="float") / 255.0
    labels = np.array(labels)

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    (trainX, testX, trainY, testY) = train_test_split(data,
                                                      labels,
                                                      test_size=0.1,
                                                      random_state=42)

    aug = ImageDataGenerator(rotation_range=25,
                             width_shift_range=0.1,
                             height_shift_range=0.1,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=False,
                             fill_mode="nearest")

    model = build(width=IMAGE_DIMS[1],
                  height=IMAGE_DIMS[0],
                  depth=IMAGE_DIMS[2],
                  classes=len(lb.classes_))
    model.summary()

    opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)
    model.compile(loss="categorical_crossentropy",
                  optimizer=opt,
                  metrics=["accuracy"])

    model.fit_generator(aug.flow(trainX, trainY, batch_size=BS),
                        validation_data=(testX, testY),
                        steps_per_epoch=len(trainX) // BS,
                        epochs=EPOCHS,
                        verbose=1)

    model.save('model.h5')
    with file_io.FileIO('model.h5', mode='rb') as input_f:
        with file_io.FileIO('gs://bbp_model_bucket/model/model.h5',
                            mode='wb+') as output_f:
            output_f.write(input_f.read())
Example no. 17
def train_model(train_file='data/',
                job_dir='./tmp/mnist_mlp',
                dropout_one=0.2,
                dropout_two=0.2,
                **args):

    # Labels: "Horizontal" = wide (landscape), "Vertical" = tall (portrait)
    label = ['Horizontal', 'Vertical']

    # Load the rectangle training data
    [X_train,
     y_train] = load_rectangles_data(train_file + 'rectangles_train.amat')

    # Load the rectangle test data
    [X_test,
     y_test] = load_rectangles_data(train_file + 'rectangles_test.amat')

    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # Convert the labels into arrays with one entry per class
    # e.g. y_train: [0 1 0 0] -> Y_train: [[1 0],[0 1],[1 0],[1 0]]
    Y_train = keras.utils.to_categorical(y_train, num_classes)
    Y_test = keras.utils.to_categorical(y_test, num_classes)

    # Build the multilayer perceptron network:
    # 784-dimensional input (28x28), final output sized to the number of classes
    model = Sequential()
    model.add(Dense(512, activation='relu', input_dim=784, init='uniform'))
    model.add(Dropout(dropout_one))
    model.add(Dense(512, activation='relu', init='uniform'))
    model.add(Dropout(dropout_two))
    model.add(
        Dense(num_classes, activation='softmax', input_dim=512,
              init='uniform'))

    model.summary()

    # Binary classification, so binary cross-entropy is used; RMSprop as the optimizer
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                       patience=0,
                                       verbose=0,
                                       mode='auto')
    history = model.fit(X_train,
                        Y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1,
                        validation_data=(X_test, Y_test),
                        callbacks=[es])

    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model locally
    model.save('model.h5')

    # Save the model to the job directory on Google Cloud Storage
    with file_io.FileIO('model.h5', mode='rb') as input_f:
        with file_io.FileIO(job_dir + '/model.h5', mode='wb+') as output_f:
            output_f.write(input_f.read())
Example no. 18
def dispatch(
        train_files,
        learning_rate,
        job_dir,
        train_batch_size=64,
        num_epochs=100,
        steps_per_epoch=15,
        cv=1,
        val_ratio=0.2,  # cross validation
        decay=0.01,  # learning rate decay
        # num of epochs without improvement to trigger early stopping
        patience=15,
        fc_layers=[512],
        dropouts=[0.5],  # fully connected layers
        trainable_layers=166,  # trainable transfer learning model layers
        do_predict_test=False,
        test_file=''  # predict test data
):
    # log parameters.
    logging.info('start dispatch')
    # Preserve input parameters for saving them later
    parameters = locals()

    # Training data
    # Original Data
    with file_io.FileIO(train_files[0], mode='r') as train_input:
        train_data = json.load(train_input)
    train_df = pd.DataFrame(train_data)
    train_target = train_df['is_iceberg']
    # TODO: add reading test data.

    # Preprocess
    # Images: reshape to 75x75 with 2 channels, and scale each channel to
    # the range -1 to 1.
    band_1 = np.array([
        np.array(band).astype(np.float64).reshape(75, 75)
        for band in train_df["band_1"]
    ])
    # Scale the input graph to -1 to 1
    # preserve those values for later scaling test data use
    band_1_max = band_1.max()
    band_1_min = band_1.min()
    band_1 = (band_1 - band_1_min) / (band_1_max - band_1_min) * 2 - 1

    band_2 = np.array([
        np.array(band).astype(np.float64).reshape(75, 75)
        for band in train_df["band_2"]
    ])
    band_2_max = band_2.max()
    band_2_min = band_2.min()
    band_2 = (band_2 - band_2_min) / (band_2_max - band_2_min) * 2 - 1

    X = np.concatenate(
        [band_1[:, :, :, np.newaxis], band_2[:, :, :, np.newaxis]], axis=-1)

    # Incident angles: fill nan with 0, and scale to 0 - 1.
    train_df.inc_angle = train_df.inc_angle.replace('na', 0)
    X_inc = np.array(train_df.inc_angle)
    X_inc_max = X_inc.max()
    X_inc = X_inc / X_inc_max

    # Ids: for saving prediction use later
    X_id = train_df['id']

    # Testing data
    # Only load them if needed
    if do_predict_test:
        with file_io.FileIO(test_file, mode='r') as test_input:
            test_data = json.load(test_input)
        test_df = pd.DataFrame(test_data)

        # Preprocess
        # Images: reshape to 75x75 with 2 channels, and scale each channel
        # using the same -1 to 1 scaling as the training data.
        band_1_test = np.array([
            np.array(band).astype(np.float64).reshape(75, 75)
            for band in test_df["band_1"]
        ])
        # Scale the test graph using the same scale as training
        band_1_test = (band_1_test - band_1_min) / (band_1_max -
                                                    band_1_min) * 2 - 1

        band_2_test = np.array([
            np.array(band).astype(np.float64).reshape(75, 75)
            for band in test_df["band_2"]
        ])
        band_2_test = (band_2_test - band_2_min) / (band_2_max -
                                                    band_2_min) * 2 - 1

        X_test = np.concatenate([
            band_1_test[:, :, :, np.newaxis], band_2_test[:, :, :, np.newaxis]
        ],
                                axis=-1)

        # Incident angles: fill nan with 0, and using the same scale as training
        test_df.inc_angle = test_df.inc_angle.replace('na', 0)
        X_inc_test = np.array(test_df.inc_angle)
        X_inc_test = X_inc_test / X_inc_max

        test_id = test_df['id']

    # Set up cross validation: randomly divide the data into several
    # training and validation splits, validation size can be set through
    # val_ratio, default to 20% of the total data.
    sample_size = len(train_target)
    validate_size = int(sample_size * val_ratio)
    np.random.seed(CV_RANDOM_SEED)  # set random seed for reproducing results.
    folds = []
    for i in range(cv):
        # generate a shuffle.
        permutation = np.random.permutation(sample_size)
        # validation set.
        X_id_val = X_id[permutation[:validate_size]]
        X_val = X[permutation[:validate_size]]
        X_inc_val = X_inc[permutation[:validate_size]]
        y_val = train_target[permutation[:validate_size]]
        # training set.
        X_id_train = X_id[permutation[validate_size:]]
        X_train = X[permutation[validate_size:]]
        X_inc_train = X_inc[permutation[validate_size:]]
        y_train = train_target[permutation[validate_size:]]
        # add to folds.
        folds.append((X_id_train, X_train, X_inc_train, y_train, X_id_val,
                      X_val, X_inc_val, y_val))

    # Training, cross validation and predict on test if needed.
    avg_val_score = 0
    avg_train_score = 0
    avg_pred_test = None
    pred_dir = os.path.join(job_dir, 'predictions')

    for i, (X_id_train, X_train, X_inc_train, y_train, X_id_val, X_val,
            X_inc_val, y_val) in enumerate(folds):
        logging.info('===================FOLD=%d' % i)
        # sanity check
        train_size = sample_size - validate_size
        assert len(X_id_train) == train_size
        assert len(X_train) == train_size
        assert len(X_inc_train) == train_size
        assert len(y_train) == train_size
        assert len(X_id_val) == validate_size
        assert len(X_val) == validate_size
        assert len(X_inc_val) == validate_size
        assert len(y_val) == validate_size

        # TODO:
        # 1. save the best model
        # 2. predict on test set
        # 3. record prediction on training and validation for analysis

        model = get_model(fc_layers, dropouts, trainable_layers)

        # optimizer
        optimizer = Adam(
            lr=learning_rate,
            decay=decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-08,
        )

        # compile model
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # data flow generator, with image data augmented.
        # generator = ImageDataGenerator(
        #     horizontal_flip=True,
        #     vertical_flip=True
        # )
        generator = ImageDataGenerator(rotation_range=20,
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       width_shift_range=0.1,
                                       height_shift_range=0.1,
                                       zoom_range=0.1)

        gen_flow = gen_flow_for_two_inputs(X_train, X_inc_train, y_train,
                                           generator, train_batch_size)

        # Callbacks
        # TensorBoard callback, used to record training process for later
        # plotting using TensorBoard
        tensorboard = TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                  write_graph=False)

        # EarlyStopping callback. By default monitoring val_loss decrease
        early_stopping = EarlyStopping(patience=patience)

        # ModelCheckpoint callback. By default monitoring val_loss min
        # TODO: add a callback to record models so that we can pick up one and
        # keep training
        model_dir = os.path.join(job_dir, 'models')
        if job_dir.startswith("gs://"):
            # Work-around for h5py not being able to write to Google Cloud
            # Storage. Save to local first then copy to GCS
            best_model_path = 'best_model_%d.hdf5' % i
        else:
            if not os.path.exists(model_dir):
                os.mkdir(model_dir)
            best_model_path = os.path.join(model_dir, 'best_model_%d.hdf5' % i)

        model_checkpoint = ModelCheckpoint(
            best_model_path,
            save_best_only=True,
            save_weights_only=True  # model architecture won't change when load
        )

        # Train model and validate along the way
        model.fit_generator(
            gen_flow,
            # TODO: investigate if the gen_flow shuffle before every epoch,
            # else, each epoch will be seeing the same samples
            steps_per_epoch=steps_per_epoch,
            epochs=num_epochs,
            shuffle=True,
            verbose=1,
            validation_data=([X_val, X_inc_val], y_val),
            callbacks=[tensorboard, early_stopping, model_checkpoint])

        # Load the best model and save train and validation predictions
        model.load_weights(filepath=best_model_path)

        if job_dir.startswith("gs://"):
            # Work-around for pandas not being able to write to GCS.
            # Save to local first then copy to GCS.
            pred_val_path = 'pred_val_%d.csv' % i
            pred_train_path = 'pred_train_%d.csv' % i
        else:
            if not os.path.exists(pred_dir):
                os.mkdir(pred_dir)
            pred_val_path = os.path.join(pred_dir, 'pred_val_%d.csv' % i)
            pred_train_path = os.path.join(pred_dir, 'pred_train_%d.csv' % i)

        # Get validation Score.
        pred_val = model.predict([X_val, X_inc_val
                                  ]).ravel()  # flatten the 2-d array to 1-d
        avg_val_score += log_loss(y_val, pred_val)
        pred_val_df = pd.DataFrame({
            'id': X_id_val,
            'pred': pred_val,
            'is_iceberg': y_val
        })
        pred_val_df.to_csv(pred_val_path)

        pred_train = model.predict([X_train, X_inc_train]).ravel()
        avg_train_score += log_loss(y_train, pred_train)
        pred_train_df = pd.DataFrame({
            'id': X_id_train,
            'pred': pred_train,
            'is_iceberg': y_train
        })
        pred_train_df.to_csv(pred_train_path)

        # Copy files to GCS if running on cloud
        if job_dir.startswith("gs://"):
            # best model
            copy_file_to_gcs(model_dir, best_model_path)
            # predictions
            copy_file_to_gcs(pred_dir, pred_val_path)
            copy_file_to_gcs(pred_dir, pred_train_path)

        if do_predict_test:
            pred_test = model.predict([X_test, X_inc_test]).flatten()
            if avg_pred_test is None:
                avg_pred_test = pred_test
            else:
                avg_pred_test += pred_test

    # Add average validation and training score to record
    parameters['avg_val_score'] = avg_val_score / cv
    parameters['avg_train_score'] = avg_train_score / cv
    parameters_json_str = json.dumps(parameters, indent=2)
    logging.info(parameters_json_str)

    # Write parameters and scores to a file for experiment analysis
    with file_io.FileIO(os.path.join(job_dir, 'records.json'),
                        mode='w') as records_output:
        records_output.write(parameters_json_str)

    if do_predict_test:
        avg_pred_test = avg_pred_test / cv
        leaky_angle = [34.4721, 42.5591, 33.6352, 36.1061, 39.2340]
        mask = [X_inc_test[i] in leaky_angle for i in range(len(test_id))]
        avg_pred_test[mask] = 1

        # Save average test prediction
        if job_dir.startswith("gs://"):
            pred_test_path = 'pred_test_%s.csv' % job_dir.split('/')[-1]
        else:
            pred_test_path = os.path.join(pred_dir, 'pred_test.csv')
        save_submit(test_id, avg_pred_test, pred_test_path)
        if job_dir.startswith("gs://"):
            copy_file_to_gcs(pred_dir, pred_test_path)
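
get_model, gen_flow_for_two_inputs, copy_file_to_gcs and save_submit are helpers defined elsewhere in this trainer. A common way to implement gen_flow_for_two_inputs (keeping the augmented images aligned with the untouched incidence-angle input by reusing the same shuffle seed) is sketched below as an assumption, not the original code.

# Hypothetical sketch of gen_flow_for_two_inputs; the real helper may differ.
def gen_flow_for_two_inputs(X, X_inc, y, generator, batch_size):
    """Yields ([image_batch, angle_batch], label_batch) indefinitely."""
    seed = 55
    gen_img = generator.flow(X, y, batch_size=batch_size, seed=seed)
    gen_ang = generator.flow(X, X_inc, batch_size=batch_size, seed=seed)
    while True:
        img_batch, y_batch = next(gen_img)  # augmented images + labels
        _, ang_batch = next(gen_ang)        # same shuffle order for the angles
        yield [img_batch, ang_batch], y_batch
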
Example no. 19
def copy_file_to_gcs(job_dir, file_path):
    with file_io.FileIO(file_path, mode='rb') as input_f:
        with file_io.FileIO(os.path.join(job_dir, file_path),
                            mode='wb+') as output_f:
            output_f.write(input_f.read())
Example no. 20
def main(input_file_name, output_file_name, job_dir, batch_size, num_units,
         num_layers, learning_rate, num_epoches, check_point, output_len,
         use_gpu):

    with file_io.FileIO(input_file_name, 'r') as input_f:
        sample = list(input_f.read())

    seq_len = 1
    output_file = os.path.join(job_dir, output_file_name)
    output = file_io.FileIO(output_file, 'w')

    char_to_idx, idx_to_char = build_dataset(sample)

    # use gpu
    device_name = '/device:GPU:0'
    if use_gpu != 1:
        device_name = '/cpu:0'

    vocab_len = len(char_to_idx)
    with tf.device(device_name):
        model = LSTM(num_units, num_layers, seq_len, batch_size, vocab_len)

    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(
        model.loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        print('training...')
        pred_inputs, _ = next(
            next_batch(sample, batch_size, seq_len, char_to_idx))
        for i in range(num_epoches):
            print('===============')
            print('epoch: %d' % i)
            print('===============')

            cur_state = sess.run(model.initial_states)
            for inputs, targets in next_batch(sample, batch_size, seq_len,
                                              char_to_idx):
                feed_dict = {
                    model.inputs: inputs,
                    model.targets: targets,
                    model.initial_states: cur_state
                }

                _, state, loss = sess.run(
                    [optimizer, model.final_state, model.loss],
                    feed_dict=feed_dict)
                cur_state = state
                print(loss)

            if i != 0 and i % check_point == 0:
                output.write('[epoch %d] predicting...\n' % i)

                with tf.device(device_name):
                    pred = model.predict(sess, pred_inputs, output_len,
                                         seq_len, idx_to_char)
                output.write("".join(pred))
                output.write("\n\n")
                print("".join(pred))

            # reset state
            # sess.run([model.reset_state])

        print('predicting...\n')
        with tf.device(device_name):
            pred = model.predict(sess, pred_inputs, output_len, seq_len,
                                 idx_to_char)
        output.write("".join(pred))
        output.write("\n\n")
        print("".join(pred))

    output.close()
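
build_dataset, next_batch and the LSTM wrapper are imported from the trainer package and not shown here. A minimal sketch of what next_batch could look like for this character model (index arrays of shape (batch_size, seq_len), with targets shifted by one character) is given below as an assumption.

# Hypothetical sketch of next_batch; the real generator may differ.
import numpy as np

def next_batch(sample, batch_size, seq_len, char_to_idx):
    """Yields (inputs, targets) index arrays of shape (batch_size, seq_len)."""
    encoded = np.array([char_to_idx[c] for c in sample])
    step = batch_size * seq_len
    for start in range(0, len(encoded) - step - 1, step):
        chunk = encoded[start:start + step]
        shifted = encoded[start + 1:start + step + 1]
        yield (chunk.reshape(batch_size, seq_len),
               shifted.reshape(batch_size, seq_len))
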
Example no. 21
    def __init__(self,
                 datadir,
                 verbose=False,
                 temporal_samples=None,
                 section="dataset",
                 augment=False):
        self.verbose = verbose

        self.augment = augment

        # parser reads serialized tfrecords file and creates a feature object
        parser = utils.S2parser()
        self.parsing_function = parser.parse_example

        self.temp_samples = temporal_samples
        self.section = section

        # if datadir is None:
        #    dataroot=os.environ["datadir"]
        # else:
        dataroot = datadir

        # csv list of geotransforms of each tile: tileid, xmin, xres, 0, ymax, 0, -yres, srid
        # use querygeotransform.py or querygeotransforms.sh to generate csv
        # fills dictionary:
        # geotransforms[<tileid>] = (xmin, xres, 0, ymax, 0, -yres)
        # srid[<tileid>] = srid
        self.geotransforms = dict()
        # https://en.wikipedia.org/wiki/Spatial_reference_system#Identifier
        self.srids = dict()
        with file_io.FileIO(os.path.join(dataroot, "geotransforms.csv"),
                            'r') as f:  # gcp
            # with open(os.path.join(dataroot, "geotransforms.csv"),'r') as f:
            reader = csv.reader(f, delimiter=',')
            for row in reader:
                # float(row[1]), int(row[2]), int(row[3]), float(row[4]), int(row[5]), int(row[6]))
                self.geotransforms[str(row[0])] = (float(row[1]),
                                                   float(row[2]), int(row[3]),
                                                   float(row[4]), int(row[5]),
                                                   float(row[6]))
                self.srids[str(row[0])] = int(row[7])

        classes = os.path.join(dataroot, "classes.txt")
        with file_io.FileIO(classes, 'r') as f:  # gcp
            # with open(classes, 'r') as f:
            classes = f.readlines()

        self.ids = list()
        self.classes = list()
        for row in classes:
            row = row.replace("\n", "")
            if '|' in row:
                id, cl = row.split('|')
                self.ids.append(int(id))
                self.classes.append(cl)

        ## create a lookup table to map labelids to dimension ids

        # map data ids [0, 2, 4,..., nclasses_originalID]
        labids = tf.constant(self.ids, dtype=tf.int64)

        # to dimensions [0, 1, 2, ... nclasses_orderID]
        dimids = tf.constant(list(range(0, len(self.ids), 1)), dtype=tf.int64)

        self.id_lookup_table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(labids, dimids),
            default_value=-1)

        self.inverse_id_lookup_table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(dimids, labids),
            default_value=-1)

        # self.classes = [cl.replace("\n","") for cl in f.readlines()]

        cfgpath = os.path.join(dataroot, "dataset.ini")
        # load dataset configs
        datacfg = configparser.ConfigParser()
        with file_io.FileIO(cfgpath, 'r') as f:  # gcp
            datacfg.readfp(f)
        cfg = datacfg[section]

        self.tileidfolder = os.path.join(dataroot, "tileids")
        self.datadir = os.path.join(dataroot, cfg["datadir"])

        assert 'pix10' in cfg.keys()
        assert 'nobs' in cfg.keys()
        assert 'nbands10' in cfg.keys()
        assert 'nbands20' in cfg.keys()
        assert 'nbands60' in cfg.keys()

        self.tiletable = cfg["tiletable"]

        self.nobs = int(cfg["nobs"])

        self.expected_shapes = self.calc_expected_shapes(
            int(cfg["pix10"]), int(cfg["nobs"]), int(cfg["nbands10"]),
            int(cfg["nbands20"]), int(cfg["nbands60"]))

        # expected datatypes as read from disk
        self.expected_datatypes = (tf.float32, tf.float32, tf.float32,
                                   tf.float32, tf.float32, tf.int64)
Example no. 22
def main(argv=None):
    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--predictions',
                        type=str,
                        help='GCS path of prediction file pattern.')
    parser.add_argument('--trueclass',
                        type=str,
                        help='The name of the class as true value.')
    parser.add_argument(
        '--target_lambda',
        type=str,
        help='a lambda function as a string to determine positive or negative.'
        + 'For example, "lambda x: x[\'a\'] and x[\'b\']". If missing, ' +
        'trueclass must be set and input must have a "target" column.')
    parser.add_argument('--output',
                        type=str,
                        help='GCS path of the output directory.')
    args = parser.parse_args()

    if not args.target_lambda and not args.trueclass:
        raise ValueError('Either target_lambda or trueclass must be set.')

    schema_file = os.path.join(os.path.dirname(args.predictions),
                               'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]
    dfs = []
    files = file_io.get_matching_files(args.predictions)
    for file in files:
        with file_io.FileIO(file, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))

    df = pd.concat(dfs)
    if args.target_lambda:
        df['target'] = df.apply(eval(args.target_lambda), axis=1)
    else:
        df['target'] = df['target'].apply(lambda x: 1
                                          if x == args.trueclass else 0)
    fpr, tpr, thresholds = roc_curve(df['target'], df[args.trueclass])
    df_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})
    roc_file = os.path.join(args.output, 'roc.csv')
    with file_io.FileIO(roc_file, 'w') as f:
        df_roc.to_csv(f,
                      columns=['fpr', 'tpr', 'thresholds'],
                      header=False,
                      index=False)

    metadata = {
        'outputs': [{
            'type':
            'roc',
            'storage':
            'gcs',
            'format':
            'csv',
            'schema': [
                {
                    'name': 'fpr',
                    'type': 'NUMBER'
                },
                {
                    'name': 'tpr',
                    'type': 'NUMBER'
                },
                {
                    'name': 'thresholds',
                    'type': 'NUMBER'
                },
            ],
            'source':
            roc_file
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)
Example no. 23
def save_model(model, output_path):
    '''Save the model locally, then copy the file to GCS.'''
    model.save(MODEL_FILE)
    with file_io.FileIO(MODEL_FILE, mode='rb') as input_f:
        with file_io.FileIO(output_path + '/' + MODEL_FILE, mode='wb+') as output_f:
            output_f.write(input_f.read())
    print('saved model to', output_path)
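
A minimal usage sketch for save_model, assuming MODEL_FILE is the module-level filename constant; the tiny Keras model and the GCS path are placeholders.

from tensorflow import keras

MODEL_FILE = 'model.h5'  # assumed value of the module-level constant

# Tiny stand-in model; any compiled Keras model would do.
model = keras.Sequential([keras.Input(shape=(4,)), keras.layers.Dense(1)])
model.compile(optimizer='adam', loss='mse')

save_model(model, 'gs://my-bucket/jobs/job-001')  # bucket path is a placeholder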
Esempio n. 24
0
def load_inputs_from_input_arg_string(inputs_str, input_exprs_str,
                                      input_examples_str):
  """Parses input arg strings and create inputs feed_dict.

  Parses the '--inputs' string for inputs to be loaded from file, the
  '--input_exprs' string for inputs to be evaluated from a python expression,
  and the '--input_examples' string for inputs to be created from a tf.Example
  feature dictionary list.

  Args:
    inputs_str: A string that specified where to load inputs. Each input is
        separated by semicolon.
        * For each input key:
            '<input_key>=<filename>' or
            '<input_key>=<filename>[<variable_name>]'
        * The optional 'variable_name' key will be set to None if not specified.
        * File specified by 'filename' will be loaded using numpy.load. Inputs
            can be loaded from only .npy, .npz or pickle files.
        * The "[variable_name]" key is optional depending on the input file type
            as described in more detail below.
        When loading from a npy file, which always contains a numpy ndarray, the
        content will be directly assigned to the specified input tensor. If a
        variable_name is specified, it will be ignored and a warning will be
        issued.
        When loading from a npz zip file, user can specify which variable within
        the zip file to load for the input tensor inside the square brackets. If
        nothing is specified, this function will check that only one file is
        included in the zip and load it for the specified input tensor.
        When loading from a pickle file, if no variable_name is specified in the
        square brackets, whatever that is inside the pickle file will be passed
        to the specified input tensor, else SavedModel CLI will assume a
        dictionary is stored in the pickle file and the value corresponding to
        the variable_name will be used.
    input_exprs_str: A string that specifies python expressions for inputs.
        * In the format of: '<input_key>=<python expression>'.
        * numpy module is available as np.
    input_examples_str: A string that specifies tf.Example with dictionary.
        * In the format of: '<input_key>=<[{feature:value list}]>'

  Returns:
    A dictionary that maps input tensor keys to numpy ndarrays.

  Raises:
    RuntimeError: An error when a key is specified, but the input file contains
        multiple numpy ndarrays, none of which matches the given key.
    RuntimeError: An error when no key is specified, but the input file contains
        more than one numpy ndarray.
  """
  tensor_key_feed_dict = {}

  inputs = preprocess_inputs_arg_string(inputs_str)
  input_exprs = preprocess_input_exprs_arg_string(input_exprs_str)
  input_examples = preprocess_input_examples_arg_string(input_examples_str)

  for input_tensor_key, (filename, variable_name) in inputs.items():
    data = np.load(file_io.FileIO(filename, mode='rb'))

    # When a variable_name key is specified for the input file
    if variable_name:
      # if file contains a single ndarray, ignore the input name
      if isinstance(data, np.ndarray):
        warnings.warn(
            'Input file %s contains a single ndarray. Name key \"%s\" ignored.'
            % (filename, variable_name))
        tensor_key_feed_dict[input_tensor_key] = data
      else:
        if variable_name in data:
          tensor_key_feed_dict[input_tensor_key] = data[variable_name]
        else:
          raise RuntimeError(
              'Input file %s does not contain variable with name \"%s\".' %
              (filename, variable_name))
    # When no key is specified for the input file.
    else:
      # Check if npz file only contains a single numpy ndarray.
      if isinstance(data, np.lib.npyio.NpzFile):
        variable_name_list = data.files
        if len(variable_name_list) != 1:
          raise RuntimeError(
              'Input file %s contains more than one ndarray. Please specify '
              'the name of ndarray to use.' % filename)
        tensor_key_feed_dict[input_tensor_key] = data[variable_name_list[0]]
      else:
        tensor_key_feed_dict[input_tensor_key] = data

  # When input is a python expression:
  for input_tensor_key, py_expr_evaluated in input_exprs.items():
    if input_tensor_key in tensor_key_feed_dict:
      warnings.warn(
          'input_key %s has been specified with both --inputs and --input_exprs'
          ' options. Value in --input_exprs will be used.' % input_tensor_key)
    tensor_key_feed_dict[input_tensor_key] = py_expr_evaluated

  # When input is a tf.Example:
  for input_tensor_key, example in input_examples.items():
    if input_tensor_key in tensor_key_feed_dict:
      warnings.warn(
          'input_key %s has been specified in multiple options. Value in '
          '--input_examples will be used.' % input_tensor_key)
    tensor_key_feed_dict[input_tensor_key] = example
  return tensor_key_feed_dict
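
A sketch of the argument strings described in the docstring above; the file is created locally so the example is self-contained, and the tensor key names are placeholders.

import numpy as np

# Create a small .npy file to feed the 'x' input tensor.
np.save('x0.npy', np.zeros((1, 28, 28, 1), dtype=np.float32))

feed_dict = load_inputs_from_input_arg_string(
    inputs_str='x=x0.npy',            # load input 'x' from the .npy file
    input_exprs_str='keep_prob=1.0',  # evaluate a Python expression for 'keep_prob'
    input_examples_str='')            # no tf.Example inputs in this sketch
print(feed_dict['x'].shape, feed_dict['keep_prob'])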
Esempio n. 25
0
    def from_frozen_graph(cls,
                          graph_def_file,
                          input_arrays,
                          output_arrays,
                          input_shapes=None):
        """Creates a TFLiteConverter class from a file containing a frozen GraphDef.

        Args:
          graph_def_file: Full filepath of file containing frozen GraphDef.
          input_arrays: List of input tensors to freeze graph with.
          output_arrays: List of output tensors to freeze graph with.
          input_shapes: Dict of strings representing input tensor names to list of
            integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
            Automatically determined when input shapes is None (e.g., {"foo" :
            None}). (default None)

        Returns:
          TFLiteConverter class.

        Raises:
          IOError:
            File not found.
            Unable to parse input file.
          ValueError:
            The graph is not frozen.
            input_arrays or output_arrays contains an invalid tensor name.
            input_shapes is not correctly defined when required.
        """
        with _ops.Graph().as_default():
            with _session.Session() as sess:
                # Read GraphDef from file.
                if not _file_io.file_exists(graph_def_file):
                    raise IOError(
                        "File '{0}' does not exist.".format(graph_def_file))
                with _file_io.FileIO(graph_def_file, "rb") as f:
                    file_content = f.read()

                try:
                    graph_def = _graph_pb2.GraphDef()
                    graph_def.ParseFromString(file_content)
                except (_text_format.ParseError, DecodeError):
                    try:
                        print("Ignore 'tcmalloc: large alloc' warnings.")

                        if not isinstance(file_content, str):
                            if PY3:
                                file_content = file_content.decode("utf-8")
                            else:
                                file_content = file_content.encode("utf-8")
                        graph_def = _graph_pb2.GraphDef()
                        _text_format.Merge(file_content, graph_def)
                    except (_text_format.ParseError, DecodeError):
                        raise IOError(
                            "Unable to parse input file '{}'.".format(
                                graph_def_file))

                # Handles models with custom TFLite ops that cannot be resolved in
                # TensorFlow.
                load_model_in_session = True
                try:
                    _import_graph_def(graph_def, name="")
                except _NotFoundError:
                    load_model_in_session = False

                if load_model_in_session:
                    # Check if graph is frozen.
                    if not _is_frozen_graph(sess):
                        raise ValueError(
                            "Please freeze the graph using freeze_graph.py.")

                    # Get input and output tensors.
                    input_tensors = _get_tensors_from_tensor_names(
                        sess.graph, input_arrays)
                    output_tensors = _get_tensors_from_tensor_names(
                        sess.graph, output_arrays)
                    _set_tensor_shapes(input_tensors, input_shapes)

                    return cls(sess.graph_def, input_tensors, output_tensors)
                else:
                    if not input_shapes:
                        raise ValueError(
                            "input_shapes must be defined for this model.")
                    if set(input_arrays) != set(input_shapes.keys()):
                        raise ValueError(
                            "input_shapes must contain a value for each item "
                            "in input_array.")

                    input_arrays_with_shape = [(name, input_shapes[name])
                                               for name in input_arrays]
                    return cls(graph_def,
                               input_tensors=None,
                               output_tensors=None,
                               input_arrays_with_shape=input_arrays_with_shape,
                               output_arrays=output_arrays)
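
A hypothetical call to this classmethod, assuming the enclosing class is the TF 1.x-style TFLiteConverter; the frozen-graph path and tensor names are placeholders.

converter = TFLiteConverter.from_frozen_graph(
    graph_def_file='frozen_graph.pb',          # placeholder path
    input_arrays=['input'],                    # placeholder input tensor name
    output_arrays=['softmax'],                 # placeholder output tensor name
    input_shapes={'input': [1, 224, 224, 3]})  # assumed input shape
tflite_model = converter.convert()
with open('model.tflite', 'wb') as f:
    f.write(tflite_model)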
Esempio n. 26
0
    def prepare_for_training(self):

        '''Set up the variables needed for training and build the one-hot vectors.
        '''
        print("preparation is called")
        if not self.isTrain:
            print("This function is for training objects; it cannot be used with predict objects.")
            return
        self.input_chars.append('UKW')
        #self.target_texts.append('UKW')
        self.num_encoder_tokens = len(self.input_chars) + 1
        self.num_decoder_tokens = len(set(self.target_texts)) + 1
        self.max_encoder_seq_length = max(
            [len(txt) for txt in self.input_texts])
        self.max_decoder_seq_length = max(
            [len(txt) for txt in self.target_texts])
        input_chars_indexed = dict([
            (char, i) for i, char in enumerate(self.input_chars, start=1)
        ])
        #target_chars_indexed = dict([(char, i) for i, char in enumerate(self.target_chars, start=1)])
        target_token_indexed = dict([
            (text, i) for i, text in enumerate(set(self.target_texts), start=1)
        ])
        input_chars_indexed['PAD'] = 0
        #target_chars_indexed['PAD'] = 0
        target_token_indexed['PAD'] = 0
        with file_io.FileIO(os.path.join(self.output_path, 'fo_input.json'),
                            'w') as fo_input:
            json.dump(input_chars_indexed, fo_input)
        with file_io.FileIO(os.path.join(self.output_path, 'fo_target.json'),
                            'w') as fo_target:
            json.dump(target_token_indexed, fo_target)

        # Build the encoder and decoder data used for training.
        self.input_sequence = np.zeros(
            (len(self.input_texts), self.max_encoder_seq_length),
            dtype='int32')
        self.target_sequence = np.zeros(
            (len(self.target_texts), self.num_decoder_tokens), dtype='int32')
        ukw_input_index = input_chars_indexed['UKW']
        #ukw_target_index = target_token_indexed['UKW']
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(self.input_texts)
        self.input_sequence = np.array(
            tokenizer.texts_to_matrix(self.input_texts))
        tokenizer.fit_on_texts(self.target_texts)
        self.target_sequence = np.array(
            tokenizer.texts_to_matrix(self.target_texts))
        print('input_sequence', self.input_sequence.dtype,
              self.input_sequence.shape)
        print('target_sequence', self.target_sequence.dtype,
              self.target_sequence.shape)

        # Build the one-hot vectors.
        for i, (input_text, target_text) in enumerate(
                zip(self.input_texts, self.target_texts)):
            for j, char in enumerate(input_text):
                if input_chars_indexed.get(char, 0) > 0:
                    w = input_chars_indexed[char]
                else:
                    w = ukw_input_index
                self.input_sequence[i, j] = int(w)
            k = target_token_indexed.get(target_text, 0)
            self.target_sequence[i, k] = 1
Esempio n. 27
0
def gen_dtu_resized_path(dtu_data_folder, mode='training'):
    """ generate data paths for dtu dataset """
    sample_list = []

    # parse camera pairs
    cluster_file_path = dtu_data_folder + '/Cameras/pair.txt'

    # cluster_list = open(cluster_file_path).read().split()
    cluster_list = file_io.FileIO(cluster_file_path, mode='r').read().split()

    # 3 sets
    training_set = [
        2, 6, 7, 8, 14, 16, 18, 19, 20, 22, 30, 31, 36, 39, 41, 42, 44, 45, 46,
        47, 50, 51, 52, 53, 55, 57, 58, 60, 61, 63, 64, 65, 68, 69, 70, 71, 72,
        74, 76, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
        100, 101, 102, 103, 104, 105, 107, 108, 109, 111, 112, 113, 115, 116,
        119, 120, 121, 122, 123, 124, 125, 126, 127, 128
    ]
    validation_set = [
        3, 5, 17, 21, 28, 35, 37, 38, 40, 43, 56, 59, 66, 67, 82, 86, 106, 117
    ]

    data_set = []
    if mode == 'training':
        data_set = training_set
    elif mode == 'validation':
        data_set = validation_set

    # for each dataset
    for i in data_set:

        image_folder = os.path.join(dtu_data_folder,
                                    ('Rectified/scan%d_train' % i))
        cam_folder = os.path.join(dtu_data_folder, 'Cameras/train')
        depth_folder = os.path.join(dtu_data_folder,
                                    ('Depths/scan%d_train' % i))

        if mode == 'training':
            # for each lighting
            for j in range(0, 7):
                # for each reference image
                for p in range(0, int(cluster_list[0])):
                    paths = []
                    # ref image
                    ref_index = int(cluster_list[22 * p + 1])
                    ref_image_path = os.path.join(image_folder,
                                                  ('rect_%03d_%d_r5000.png' %
                                                   ((ref_index + 1), j)))
                    ref_cam_path = os.path.join(cam_folder,
                                                ('%08d_cam.txt' % ref_index))
                    paths.append(ref_image_path)
                    paths.append(ref_cam_path)
                    # view images
                    for view in range(FLAGS.view_num - 1):
                        view_index = int(cluster_list[22 * p + 2 * view + 3])
                        view_image_path = os.path.join(
                            image_folder,
                            ('rect_%03d_%d_r5000.png' % ((view_index + 1), j)))
                        view_cam_path = os.path.join(
                            cam_folder, ('%08d_cam.txt' % view_index))
                        paths.append(view_image_path)
                        paths.append(view_cam_path)
                    # depth path
                    depth_image_path = os.path.join(
                        depth_folder, ('depth_map_%04d.pfm' % ref_index))
                    paths.append(depth_image_path)
                    sample_list.append(paths)
        elif mode == 'validation':
            j = 3
            # for each reference image
            for p in range(0, int(cluster_list[0])):
                paths = []
                # ref image
                ref_index = int(cluster_list[22 * p + 1])
                ref_image_path = os.path.join(image_folder,
                                              ('rect_%03d_%d_r5000.png' %
                                               ((ref_index + 1), j)))
                ref_cam_path = os.path.join(cam_folder,
                                            ('%08d_cam.txt' % ref_index))
                paths.append(ref_image_path)
                paths.append(ref_cam_path)
                # view images
                for view in range(FLAGS.view_num - 1):
                    view_index = int(cluster_list[22 * p + 2 * view + 3])
                    view_image_path = os.path.join(image_folder,
                                                   ('rect_%03d_%d_r5000.png' %
                                                    ((view_index + 1), j)))
                    view_cam_path = os.path.join(cam_folder,
                                                 ('%08d_cam.txt' % view_index))
                    paths.append(view_image_path)
                    paths.append(view_cam_path)
                # depth path
                depth_image_path = os.path.join(
                    depth_folder, ('depth_map_%04d.pfm' % ref_index))
                paths.append(depth_image_path)
                sample_list.append(paths)

    return sample_list
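
A brief sketch of using the path generator; the DTU root directory is a placeholder and FLAGS.view_num is assumed to be defined before the call.

samples = gen_dtu_resized_path('/data/dtu', mode='training')
print(len(samples), 'training samples')
# Each sample is [ref_image, ref_cam, view_image, view_cam, ..., depth_map].
print(samples[0])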
Esempio n. 28
0
    def predict_model(self, is_transfer):
        '''
        Run the model's predictions and save the results to a file.
        If requested, perform transfer learning first.
        # Arguments
            is_transfer: Whether to perform transfer learning. If True,
                previously trained weights are loaded before predicting.
        '''

        if self.isTrain:
            print("This function is for predict objects; it cannot be used with training objects.")
            return
        if is_transfer:  # Load previously trained weights for transfer learning.
            with file_io.FileIO(os.path.join(self.path_cloud, 'lstm_tag.hdf5'),
                                'r') as reader:
                with file_io.FileIO('lstm_tag.hdf5', 'w+') as writer:
                    writer.write(reader.read())
            self.model.load_weights('lstm_tag.hdf5', by_name=True)

        def decode_sequence(input_seq):
            predict_decode_input = np.zeros((1, self.max_decoder_seq_length),
                                            dtype=int)
            predict_decode_input[0, 0] = self.target_token_index['\t']
            predict_out = np.zeros((10, self.max_decoder_seq_length))
            predicted_tag = self.model.predict(
                [input_seq, predict_decode_input])
            top_n = predicted_tag[0, 0, :].argsort()[-self.num_tags - 1:]
            for i in range(self.num_tags):
                if self.reverse_target_char_index[
                        top_n[i]] == 'EOS' or top_n[i] == 0:
                    top_n[i] = predicted_tag[
                        0, 0, self.num_tags].argsort()[-self.num_tags - 1:]
            for i in range(self.num_tags):
                predict_decode_input[0, 1] = top_n[i]
                for j in range(1, 9):
                    predicted_tag = self.model.predict(
                        [input_seq, predict_decode_input])
                    tops = np.argsort(predicted_tag[0, j])[::-1]
                    top = np.argmax(predicted_tag[0, j, :])
                    if self.reverse_target_char_index[
                            top] == 'EOS' or self.reverse_target_char_index[
                                top] == 'PAD':
                        top = tops[1]
                    if self.reverse_target_char_index[
                            top] == 'EOS' or self.reverse_target_char_index[
                                top] == 'PAD':
                        top = tops[2]
                    if top in predict_decode_input[0]:
                        j = 8
                    if j == 8:
                        predict_out[i] = predict_decode_input[0]
                    else:
                        predict_decode_input[0, j + 1] = top

            # Convert the one-hot predictions back into words.
            words = [[] for i in range(self.num_tags)]
            for i in range(self.num_tags):
                for j in range(self.num_words):
                    words[i].append(self.reverse_target_char_index[int(
                        predict_out[i, j])])
            return words

        # Save the results to a file.
        f = file_io.FileIO(os.path.join(self.path_output, self.type + '.tsv'),
                           'w')
        for seq_index in range(len(self.encoder_input_data)):
            input_seq = self.encoder_input_data[seq_index:seq_index + 1]
            decoded_sentence = decode_sequence(input_seq)
            for i in range(len(decoded_sentence)):
                f.write(str(self.tag_data.iloc[seq_index, 0]) + '\t')
                for j in range(len(decoded_sentence[i])):
                    f.write(decoded_sentence[i][j])
                f.write('\t' + str(i + 1) + '\n')
        f.close()
Esempio n. 29
0
def _load_saved_object_graph_proto(filename):
    with file_io.FileIO(filename, "rb") as f:
        contents = f.read()
        return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
Esempio n. 30
0
 def testFileDelete(self):
     file_path = os.path.join(self._base_dir, "temp_file")
     file_io.FileIO(file_path, mode="w").write("testing")
     file_io.delete_file(file_path)
     self.assertFalse(file_io.file_exists(file_path))