def testEmptyFilename(self):
    """Reading through a FileIO opened on an empty path raises NotFoundError."""
    # The FileIO object is constructed outside the assertRaises block; only
    # the read itself is expected to raise.
    empty_path_reader = file_io.FileIO("", mode="r")
    with self.assertRaises(errors.NotFoundError):
        empty_path_reader.read()
def train(job_dir=None, job_id=None, use_transposed_conv=True,
          score_metric='mse', loss='binary_crossentropy',
          learning_rate=0.001, lr_decay=0.001, optimizer_name='adam',
          n_epochs=100, patience=5, batch_norm_before_activation=False,
          pool_method='max', **kwargs):
    """Train a convolutional autoencoder on MNIST and save its artifacts.

    Builds a three-stage encoder and decoder out of ``conv_block`` /
    ``pool_layer``, trains the model to reconstruct MNIST digits, and writes
    the best checkpoint plus two PDF figures (a learning curve and an
    original-vs-reconstructed comparison). When ``job_dir`` contains
    ``gs://`` the checkpoint and figures are written locally first and then
    copied to ``job_dir``.

    Side effect: sets the module-level ``do_batch_norm_before_activation``.

    Args:
        job_dir: Output location. It is used as a raw string prefix, so it
            should end with a path separator. Defaults to ``'./tmp/'``.
        job_id: Identifier used in output file names. When omitted, a
            timestamp is generated.
        use_transposed_conv: Forwarded to the decoder ``conv_block`` calls;
            when True the job id is prefixed with ``'transposed_conv_'``.
        score_metric: ``'mse'`` or ``'mae'``; the Keras metric to track.
        loss: Keras loss passed to ``compile``.
        learning_rate: Optimizer learning rate.
        lr_decay: Optimizer learning-rate decay.
        optimizer_name: ``'adam'`` or ``'sgd'``.
        n_epochs: Maximum number of training epochs.
        patience: Early-stopping patience, in epochs.
        batch_norm_before_activation: Value stored in the module-level
            ``do_batch_norm_before_activation`` flag.
        pool_method: Forwarded to ``pool_layer``.
        **kwargs: Accepted and ignored, so a config dict with extra keys can
            be splatted into this function.

    Raises:
        ValueError: If ``score_metric`` or ``optimizer_name`` is unsupported.
    """
    global do_batch_norm_before_activation

    # Validate up front: previously an unsupported value only surfaced as a
    # NameError (for the metric, after the whole training run).
    history_keys_by_metric = {
        'mae': 'mean_absolute_error',
        'mse': 'mean_squared_error',
    }
    if score_metric not in history_keys_by_metric:
        raise ValueError(
            "score_metric must be 'mae' or 'mse', got {!r}".format(score_metric))
    if optimizer_name not in ('adam', 'sgd'):
        raise ValueError(
            "optimizer_name must be 'adam' or 'sgd', got {!r}".format(
                optimizer_name))

    do_batch_norm_before_activation = batch_norm_before_activation
    print('--> batch_norm_before_activation== {}!!!\n'.format(do_batch_norm_before_activation))

    # input image dimensions
    img_rows, img_cols, img_chns = 28, 28, 1
    original_img_size = (img_rows, img_cols, img_chns)
    n_filters = 8  # number of convolutional filters to use
    kernel_size = 3  # convolution kernel size
    batch_size = 1000
    lr = learning_rate
    decay = lr_decay
    opt = optimizer_name

    if job_dir is None:
        job_dir = './tmp/'
    if job_id is None:
        ctime = time.ctime().split()
        time_str = ctime[4]+ctime[1]+ctime[2]+"_"+''.join(ctime[3].split(":")[0:2])
        job_id = time_str
        checkpoint_filename = 'mnist_autoencoder_checkpoint_{}.hdf5'.format(job_id)
    else:
        checkpoint_filename = '{}.hdf5'.format(job_id)
    if use_transposed_conv is True:
        print('--> use_transposed_conv is True!!!\n')
        # Note: only affects the figure names; the checkpoint name is
        # already fixed above.
        job_id = 'transposed_conv_' + job_id

    def _copy_to_job_dir(local_name):
        # HDF5 and PDF payloads are binary, so both ends are opened in
        # binary mode; a text-mode read would try to decode them.
        with file_io.FileIO(local_name, mode='rb') as input_f:
            with file_io.FileIO(job_dir + local_name, mode='wb') as output_f:
                output_f.write(input_f.read())

    ############ Encoder ###############
    x = Input(shape=original_img_size)
    conv1 = conv_block(x, n_filters, kernel_size)
    conv1 = conv_block(conv1, n_filters, kernel_size)
    conv1 = pool_layer(conv1, method=pool_method, pool_size=(2,2), padding='same')
    conv2 = conv_block(conv1, n_filters*2, kernel_size)
    conv2 = conv_block(conv2, n_filters*2, kernel_size)
    conv2 = pool_layer(conv2, method=pool_method, pool_size=(2,2), padding='same')
    conv3 = conv_block(conv2, n_filters*4, kernel_size)
    conv3 = conv_block(conv3, n_filters*4, kernel_size)
    encoded = pool_layer(conv3, method=pool_method, pool_size=(2,2), padding='same')
    # End of encoder.
    # NOTE(review): the original comment gave the compressed shape as
    # (4, 4, 8); with n_filters*4 filters in the last stage the channel count
    # is presumably 32 -- confirm against autoencoder.summary().

    ############ Decoder ###############
    conv4 = conv_block(encoded, n_filters*4, kernel_size, use_transposed_conv=use_transposed_conv)
    conv4 = conv_block(conv4, n_filters*4, kernel_size, use_transposed_conv=use_transposed_conv)
    conv4 = UpSampling2D((2, 2))(conv4)
    conv5 = conv_block(conv4, n_filters*2, kernel_size, use_transposed_conv=use_transposed_conv)
    conv5 = conv_block(conv5, n_filters*2, kernel_size, use_transposed_conv=use_transposed_conv)
    conv5 = UpSampling2D((2, 2))(conv5)
    conv6 = conv_block(conv5, n_filters, kernel_size, use_transposed_conv=use_transposed_conv)
    conv6 = conv_block(conv6, n_filters, kernel_size, use_transposed_conv=use_transposed_conv)
    conv6 = UpSampling2D((2, 2))(conv6)
    # the activation here is sigmoid b/c the pixel values are bounded b/w
    # 0-1, and there are lots of 0s
    decoded = conv_block(conv6, 1, kernel_size=kernel_size, activation='sigmoid',
                         padding='same', batch_norm=False,
                         use_transposed_conv=use_transposed_conv)
    # crop 2 on each side of the img to get 28x28
    decoded = Cropping2D(cropping=((2, 2), (2, 2)))(decoded)

    # Put all layers together into a model graph
    autoencoder = Model(x, decoded)
    ######### End of decoder ###################

    ######### Now config models for training and logging ##########
    if opt == 'adam':
        optimizer = Adam(lr=lr, decay=decay)
    else:  # 'sgd' (validated above)
        optimizer = SGD(lr=lr, momentum=0.9, decay=decay, nesterov=True)
    autoencoder.compile(optimizer=optimizer, loss=loss, metrics=[score_metric])
    autoencoder.summary()

    # data from MNIST digits; labels are not needed for an autoencoder
    (x_train, _), (x_test, _) = mnist.load_data()
    # reshape data to (data_size, n_pix, n_pix, n_channels)
    x_train = x_train.astype('float32') / 255.
    x_train = x_train.reshape((x_train.shape[0],) + original_img_size)
    x_test = x_test.astype('float32') / 255.
    x_test = x_test.reshape((x_test.shape[0],) + original_img_size)
    print('x_train.shape:', x_train.shape)
    print('Initial evaluation of random model ={}\n'.format(autoencoder.evaluate(x_test, x_test, batch_size=batch_size)))

    callbacks = [
        # Honor the caller's ``patience`` (it used to be hard-coded to 5).
        EarlyStopping(monitor='val_loss', patience=patience, verbose=2,
                      mode='min', min_delta=0.0005),
        ModelCheckpoint(checkpoint_filename, monitor='val_loss', verbose=2,
                        save_best_only=True),
        TensorBoard(log_dir=job_dir),
    ]
    history = autoencoder.fit(x=x_train, y=x_train, shuffle=True,
                              epochs=n_epochs, batch_size=batch_size,
                              callbacks=callbacks, verbose=2,
                              validation_data=(x_test, x_test))
    test_score = autoencoder.evaluate(x_test, x_test, verbose=0, batch_size=batch_size)
    print('Final test score:', test_score)

    history_key_train = history_keys_by_metric[score_metric]
    history_key_validation = 'val_' + history_key_train
    validation_history = history.history[history_key_validation]
    training_history = history.history[history_key_train]

    # Save model hdf5 to google storage
    if 'gs://' in job_dir:
        _copy_to_job_dir(checkpoint_filename)

    #%% Plot a learning curve
    fig_name = 'lr_{}.pdf'.format(job_id)
    # Only prefix for local runs: matplotlib cannot write to gs:// paths, so
    # on GCS the figure is saved locally and copied afterwards (same scheme
    # as the comparison figure below).
    if 'gs://' not in job_dir:
        fig_name = job_dir + fig_name
    f, axes = plt.subplots(2, sharex=True, figsize=(8,7))
    axes[0].plot(training_history)
    axes[0].set_ylabel('Training score ({})'.format(score_metric))
    axes[0].set_title(
        ('Final test score ({0}) = {1:2.4f}\n LR={2}, decay={3}, optimizer={4}, pool_method={5}\n '
         'use_transposed_conv={6}, loss={7}, batch_norm_before_activation={8}').format(
             score_metric, test_score[1], lr, decay, opt, pool_method,
             use_transposed_conv, loss, do_batch_norm_before_activation),
        fontsize=9)
    axes[1].plot(validation_history)
    axes[1].set_xlabel('Epochs')
    axes[1].set_ylabel('Validation score ({})'.format(score_metric))
    f.subplots_adjust(hspace=0.05)
    f.savefig(fig_name)
    if 'gs://' in job_dir:
        # Save figure to GS
        _copy_to_job_dir(fig_name)

    #%% Sample a few test images and compare with reconstructed ones
    n_imgs_to_show = 30
    x_test_sub = np.random.permutation(x_test)[0:n_imgs_to_show]
    reconstructed_test = autoencoder.predict(x_test_sub, batch_size=n_imgs_to_show)

    # plot reconstructed images and compare
    fig_name = 'compare_{}.pdf'.format(job_id)
    if 'gs://' not in job_dir:
        fig_name = job_dir + fig_name
    n_rows = 3  # split original images into 3 rows
    n_cols = n_imgs_to_show//n_rows
    # Rows alternate: originals on even rows, reconstructions on odd rows.
    f, axes = plt.subplots(n_rows*2, n_cols, sharey=True, figsize=(10,10))
    for i in range(n_imgs_to_show):
        axes[i//n_cols * 2, i % n_cols].imshow(x_test_sub[i,:,:,0])
        axes[i//n_cols * 2, 0].set_ylabel('Original')
        axes[i//n_cols * 2 +1, i % n_cols].imshow(reconstructed_test[i,:,:,0])
        axes[i//n_cols * 2 +1, 0].set_ylabel('Reconstructed')
    f.savefig(fig_name)
    if 'gs://' in job_dir:
        # Save figure to GS
        _copy_to_job_dir(fig_name)
def main(argv=None):
    """Compute a confusion matrix and accuracy from prediction CSV files.

    Reads every file matching ``--predictions`` (column names come from the
    ``schema.json`` next to them), writes ``confusion_matrix.csv`` into
    ``--output``, and writes the pipeline UI metadata and metrics JSON files
    to ``/mlpipeline-ui-metadata.json`` and ``/mlpipeline-metrics.json``.

    Args:
        argv: Optional list of command-line arguments. ``None`` makes
            argparse fall back to ``sys.argv[1:]``.

    Raises:
        ValueError: If no file matches the ``--predictions`` pattern.
    """
    parser = argparse.ArgumentParser(description='ML Trainer')
    # Both paths are dereferenced unconditionally below, so make argparse
    # report a missing flag instead of failing later on ``None``.
    parser.add_argument('--predictions', type=str, required=True,
                        help='GCS path of prediction file pattern.')
    parser.add_argument('--output', type=str, required=True,
                        help='GCS path of the output directory.')
    parser.add_argument(
        '--target_lambda', type=str,
        help='a lambda function as a string to compute target.' +
             'For example, "lambda x: x[\'a\'] + x[\'b\']"' +
             'If not set, the input must include a "target" column.')
    # Pass argv through; it was previously accepted but ignored.
    args = parser.parse_args(argv)

    on_cloud = args.output.startswith('gs://')
    if not on_cloud and not os.path.exists(args.output):
        os.makedirs(args.output)

    schema_file = os.path.join(os.path.dirname(args.predictions), 'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]

    files = file_io.get_matching_files(args.predictions)
    if not files:
        raise ValueError(
            'No prediction files match pattern %s' % args.predictions)
    dfs = []
    for prediction_file in files:
        with file_io.FileIO(prediction_file, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))
    df = pd.concat(dfs)

    if args.target_lambda:
        # SECURITY: eval() executes arbitrary code taken from the command
        # line. Only run this with trusted --target_lambda values.
        df['target'] = df.apply(eval(args.target_lambda), axis=1)

    vocab = list(df['target'].unique())
    cm = confusion_matrix(df['target'], df['predicted'], labels=vocab)
    # Flatten the matrix into (target, predicted, count) rows.
    data = [
        (vocab[target_index], vocab[predicted_index], count)
        for target_index, target_row in enumerate(cm)
        for predicted_index, count in enumerate(target_row)
    ]

    df_cm = pd.DataFrame(data, columns=['target', 'predicted', 'count'])
    cm_file = os.path.join(args.output, 'confusion_matrix.csv')
    with file_io.FileIO(cm_file, 'w') as f:
        df_cm.to_csv(f, columns=['target', 'predicted', 'count'],
                     header=False, index=False)

    metadata = {
        'outputs': [{
            'type': 'confusion_matrix',
            'storage': 'gcs',
            'format': 'csv',
            'schema': [
                {'name': 'target', 'type': 'CATEGORY'},
                {'name': 'predicted', 'type': 'CATEGORY'},
                {'name': 'count', 'type': 'NUMBER'},
            ],
            'source': cm_file,
            # Convert vocab to string because for boolean values we want
            # "True|False" to match csv data.
            'labels': list(map(str, vocab)),
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)

    accuracy = accuracy_score(df['target'], df['predicted'])
    metrics = {
        'metrics': [{
            'name': 'accuracy-score',
            'numberValue': accuracy,
            'format': "PERCENTAGE",
        }]
    }
    with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
        json.dump(metrics, f)
def run_local_analysis(output_dir, csv_file_pattern, schema, inverted_features):
    """Analyze csv files in-process; write a stats file and vocab files.

    Rows are read with the ``csv`` module; pandas is only used to serialize
    each vocabulary.

    Args:
      output_dir: output folder
      csv_file_pattern: list of csv file paths, may contain wildcards
      schema: BQ schema list
      inverted_features: inverted_features dict

    Raises:
      ValueError: if a csv row does not have as many columns as the schema,
        or a numeric column ends up with a zero count.
    """

    def _report(message):
        # Progress goes to stdout and is flushed immediately.
        sys.stdout.write(message)
        sys.stdout.flush()

    def _new_numeric_stats():
        return {
            'min': float('inf'),
            'max': float('-inf'),
            'count': 0,
            'sum': 0.0
        }

    _report('Expanding any file patterns...\n')
    header = [column['name'] for column in schema]
    input_files = []
    for file_pattern in csv_file_pattern:
        input_files.extend(file_io.get_matching_files(file_pattern))
    _report('file list computed.\n')

    # Work on a copy of inverted_features in which the target transform is
    # replaced by identity (numeric schema) or one hot (anything else).
    inverted_features_target = copy.deepcopy(inverted_features)
    for name, transform_set in six.iteritems(inverted_features_target):
        if transform_set != {constant.TARGET_TRANSFORM}:
            continue
        target_schema = next(
            col['type'].lower() for col in schema if col['name'] == name)
        if target_schema in constant.NUMERIC_SCHEMA:
            inverted_features_target[name] = {constant.IDENTITY_TRANSFORM}
        else:
            inverted_features_target[name] = {constant.ONE_HOT_TRANSFORM}

    # Accumulators filled while scanning the files.
    numerical_results = collections.defaultdict(_new_numeric_stats)
    vocabs = collections.defaultdict(lambda: collections.defaultdict(int))
    num_examples = 0

    # Every file contributes to the numeric stats and to the label counts.
    for input_file in input_files:
        _report('Analyzing file %s...\n' % input_file)
        with file_io.FileIO(input_file, 'r') as f:
            for line in csv.reader(f):
                if len(header) != len(line):
                    raise ValueError(
                        'Schema has %d columns but a csv line only has %d columns.'
                        % (len(header), len(line)))
                parsed_line = dict(zip(header, line))
                num_examples += 1

                for col_name, transform_set in six.iteritems(
                        inverted_features_target):
                    # Every transform in the set needs the same analysis,
                    # so inspecting the first one is enough.
                    transform_name = next(iter(transform_set))
                    cell = parsed_line[col_name]
                    if transform_name in constant.TEXT_TRANSFORMS:
                        # A label that appears N times in one row counts
                        # once. TFIDF needs this, and it is an interesting
                        # stat on its own.
                        for one_label in set(cell.split(' ')):
                            if one_label:  # skip empty strings
                                vocabs[col_name][one_label] += 1
                    elif transform_name in constant.CATEGORICAL_TRANSFORMS:
                        if cell:
                            vocabs[col_name][cell] += 1
                    elif transform_name in constant.NUMERIC_TRANSFORMS:
                        if not cell.strip():
                            continue
                        number = float(cell)
                        column_stats = numerical_results[col_name]
                        column_stats['min'] = min(column_stats['min'], number)
                        column_stats['max'] = max(column_stats['max'], number)
                        column_stats['count'] += 1
                        column_stats['sum'] += number

        _report('file %s analyzed.\n' % input_file)

    # Write the vocab files, one "label,count" pair per line, ordered from
    # the most frequent label to the least frequent one.
    vocab_sizes = {}
    for name, label_count in six.iteritems(vocabs):
        ranked = sorted(
            six.iteritems(label_count), key=lambda pair: pair[1], reverse=True)
        records = [{'label': label, 'count': count} for label, count in ranked]
        df = pd.DataFrame(records, columns=['label', 'count'])
        csv_string = df.to_csv(index=False, header=False)

        vocab_path = os.path.join(
            output_dir, constant.VOCAB_ANALYSIS_FILE % name)
        file_io.write_string_to_file(vocab_path, csv_string)

        vocab_sizes[name] = {'vocab_size': len(label_count)}

    # Reduce numerical_results to min/max/mean.
    for col_name in numerical_results:
        column_stats = numerical_results[col_name]
        if float(column_stats['count']) == 0:
            raise ValueError('Column %s has a zero count' % col_name)
        mean = column_stats['sum'] / float(column_stats['count'])
        del column_stats['sum']
        del column_stats['count']
        column_stats['mean'] = mean

    # Write the stats file.
    numerical_results.update(vocab_sizes)
    stats = {'column_stats': numerical_results, 'num_examples': num_examples}
    stats_json = json.dumps(stats, indent=2, separators=(',', ': '))
    file_io.write_string_to_file(
        os.path.join(output_dir, constant.STATS_FILE), stats_json)
def run_predict(output_dir, data_path, schema, target_name, model_export_dir,
                project, mode, batch_size):
    """Run predictions with given model using DataFlow.

    Writes the prediction rows as text files prefixed with
    ``prediction_results`` and a ``schema.json`` describing their columns,
    both under ``output_dir``.

    Args:
      output_dir: output folder
      data_path: test data file path.
      schema: schema list.
      target_name: target column name.
      model_export_dir: GCS or local path of exported model trained with tft
        preprocessed data.
      project: the project to run dataflow in.
      mode: 'local' to use the DirectRunner, 'cloud' to run on Dataflow.
      batch_size: batch size when running prediction.

    Raises:
      ValueError: if ``mode`` is neither 'local' nor 'cloud'.
    """
    # The value is unused, but the lookup is kept so a target_name missing
    # from the schema still fails here, before any pipeline is built.
    target_type = next(x for x in schema if x['name'] == target_name)['type']
    labels_file = os.path.join(model_export_dir, 'assets',
                               'vocab_' + target_name)
    # A vocab file for the target exists only for classification models.
    is_classification = file_io.file_exists(labels_file)

    output_file_prefix = os.path.join(output_dir, 'prediction_results')
    output_schema_file = os.path.join(output_dir, 'schema.json')
    names = [x['name'] for x in schema]

    # Must be a real list: filter() returns an iterator on Python 3, which
    # has no append()/extend().
    output_schema = [x for x in schema if x['name'] != target_name]
    if is_classification:
        with file_io.FileIO(labels_file, mode='r') as f:
            labels = [x.strip() for x in f.readlines()]

        output_schema.append({'name': 'target', 'type': 'CATEGORY'})
        output_schema.append({'name': 'predicted', 'type': 'CATEGORY'})
        output_schema.extend([{'name': x, 'type': 'NUMBER'} for x in labels])
    else:
        output_schema.append({'name': 'target', 'type': 'NUMBER'})
        output_schema.append({'name': 'predicted', 'type': 'NUMBER'})

    if mode == 'local':
        pipeline_options = None
        runner = 'DirectRunner'
    elif mode == 'cloud':
        options = {
            'job_name': 'pipeline-predict-' +
                        datetime.datetime.now().strftime('%y%m%d-%H%M%S'),
            'temp_location': os.path.join(output_dir, 'tmp'),
            'project': project,
            'setup_file': './setup.py',
        }
        pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
        runner = 'DataFlowRunner'
    else:
        raise ValueError("Invalid mode %s." % mode)

    with beam.Pipeline(runner, options=pipeline_options) as p:
        raw_results = (
            p
            | 'read data' >> beam.io.ReadFromText(data_path)
            | 'move target to last' >> beam.ParDo(
                TargetToLastDoFn(names, target_name))
            | 'batch' >> beam.ParDo(EmitAsBatchDoFn(batch_size))
            | 'predict' >> beam.ParDo(PredictDoFn(model_export_dir)))

        if is_classification:
            # Row = source columns + predicted label + per-label scores.
            processed_results = (
                raw_results
                | 'unbatch' >> beam.FlatMap(
                    lambda x: zip(x['source'], x['scores']))
                | 'get predicted' >> beam.Map(
                    lambda x: x[0] + [labels[x[1].argmax()]] + list(x[1])))
        else:
            processed_results = (
                raw_results
                | 'unbatch' >> beam.FlatMap(
                    lambda x: zip(x['source'], x['outputs']))
                | 'get predicted' >> beam.Map(lambda x: x[0] + list(x[1])))

        results_save = (
            processed_results
            | 'write csv lines' >> beam.ParDo(ListToCsvDoFn())
            | 'write file' >> beam.io.WriteToText(output_file_prefix))

        # Chain the schema write after the results write: sample a single
        # element from its output and replace it with the schema JSON.
        (results_save
         | 'fixed one' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1)
         | 'set schema' >> beam.Map(lambda path: json.dumps(output_schema))
         | 'write schema file' >> beam.io.WriteToText(
             output_schema_file, shard_name_template=''))
def _copy_file_to_gcs(job_dir, file_path):
    """Copy the file at ``file_path`` to ``job_dir/file_path``.

    The source is read as bytes in a single read and written to the
    destination in one call.
    """
    destination = os.path.join(job_dir, file_path)
    logger.info('Saving models to GCS: %s' % destination)
    with file_io.FileIO(file_path, mode='rb') as source, \
            file_io.FileIO(destination, mode='w+') as target:
        payload = source.read()
        target.write(payload)
model.summary() model.fit(train_parsed_dataset, epochs=args.num_epochs, steps_per_epoch=int(train_records/batch_size)+1, validation_data=valid_parsed_dataset, validation_steps=int(valid_records/batch_size)+1, callbacks=[tensorboard_cb, checkpoint] ) # Export the model to a local SavedModel directory/cloud storage export_path = tf.contrib.saved_model.save_keras_model(model, saved_model_path=os.path.join(output_dir,"model")) # export checkpoint to cloud storage output directory if args.hypertune=="hypertune": with file_io.FileIO(checkpoint_path, mode='rb') as input_f: with file_io.FileIO(os.path.join(output_dir,'checkpoint',checkpoint_path), mode='wb+') as output_f: output_f.write(input_f.read()) test_parsed_dataset = test_parsed_dataset.batch(batch_size) test_parsed_dataset = test_parsed_dataset.repeat() model.evaluate(test_parsed_dataset,steps=int(test_records/batch_size)+1)
def train_model(train_file='data/mnist.pkl', job_dir='./tmp/mnist_mlp', **args):
    """Train a two-hidden-layer MLP on pickled MNIST data and save it.

    Relies on the module-level ``num_classes``, ``batch_size`` and
    ``epochs``. The trained model is saved locally as ``model.h5`` and then
    copied to ``job_dir + '/model.h5'``.

    Args:
        train_file: Local or GCS path of a pickle holding
            ``((x_train, y_train), (x_test, y_test))``.
        job_dir: Directory that receives the TensorBoard logs and the model.
        **args: Accepted and ignored.
    """
    # set the logging path for ML Engine logging to Storage bucket
    logs_path = job_dir + '/logs/' + datetime.now().isoformat()
    print('Using logs_path located at {}'.format(logs_path))

    # Pickle data is binary, so read it in 'rb' mode (text mode hands str to
    # pickle on Python 3) and close the handle once loading is done.
    # SECURITY: unpickling executes code from the file; only load trusted
    # data.
    with file_io.FileIO(train_file, mode='rb') as f:
        # Pickle works differently with Python 2 vs 3
        if sys.version_info < (3, ):
            data = pickle.load(f)
        else:
            data = pickle.load(f, encoding='bytes')

    # the data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = data

    # Flatten each image to 784 values; the sample count is taken from the
    # data instead of being hard-coded to 60000/10000.
    x_train = x_train.reshape(x_train.shape[0], 784)
    x_test = x_test.reshape(x_test.shape[0], 784)
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')

    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)

    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784, )))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(10, activation='softmax'))

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    history = model.fit(
        x_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        callbacks=[keras.callbacks.TensorBoard(log_dir=logs_path)],
        validation_data=(x_test, y_test))

    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model locally
    model.save('model.h5')

    # Save the model to the Cloud Storage bucket's jobs directory. HDF5 is
    # binary, so copy it byte-for-byte.
    with file_io.FileIO('model.h5', mode='rb') as input_f:
        with file_io.FileIO(job_dir + '/model.h5', mode='wb') as output_f:
            output_f.write(input_f.read())
'--config_file', help= 'A YAML config file that contrains all other trainer input parameters', required=True) parser.add_argument('--data_path', help='GS path for training data', required=True) args = parser.parse_args() arguments = args.__dict__ ########### Load config file and config parameters ################ if args.config_file is None: raise ValueError('config YAML file must not be None!!!') if file_io.file_exists(args.config_file) is not True: # use tf's file_io for both GS and local files raise ValueError('config file does not exsit!!! {}'.format( args.config_file)) with file_io.FileIO( args.config_file, 'r') as f: # This reads BOTH local files and GS bucket files!!! config = yaml.load(f) # actually training happens here mnist_autoencoder_deconv_simple.train(job_dir=args.job_dir, job_id=args.job_id, data_path=args.data_path, **config)
def train_model(train_file='russian punch cards/processedarrays.npy',
                job_dir='./tmp/example-5',
                log_dir='./tmp/logs',
                dropout=0.5,
                rnn_size=128,
                rnn_activation='tanh',
                rnn_layers=1,
                rnn_cell='LSTM',
                lr_decay=0,
                batch_size=64,
                epochs=100,
                saved_model='model.h5',
                test=False,
                **args):
    """Train a next-step sequence model on flattened punch-card arrays.

    Args:
        train_file: Path of a ``.npy`` file holding a pickled dict of 2-D
            arrays; only arrays whose second dimension is 24 are used.
        job_dir: Output location; checkpoints go to ``job_dir + '/models'``
            and the final model to ``job_dir + '/' + saved_model``.
        log_dir: Base TensorBoard directory; the last component of
            ``job_dir`` is appended to it.
        dropout: Dropout rate applied after every recurrent layer.
        rnn_size: Units per recurrent layer.
        rnn_activation: Activation applied after each additional layer.
        rnn_layers: Total number of recurrent layers.
        rnn_cell: ``'LSTM'`` or ``'GRU'``; anything else falls back to LSTM.
        lr_decay: RMSprop learning-rate decay.
        batch_size: Training batch size.
        epochs: Number of training epochs.
        saved_model: File name of the final saved model.
        test: When True, train on the first 256 sequences only and use the
            non-CuDNN ``LSTM`` layer for the ``'LSTM'`` cell type.
        **args: Accepted and ignored.
    """
    # Load inside a context manager so the handle is closed afterwards.
    # The file stores a pickled dict in a 0-d object array, which current
    # NumPy refuses to load unless allow_pickle=True.
    # SECURITY: unpickling executes code from the file; only load trusted
    # data.
    with file_io.FileIO(train_file, mode='rb') as file_stream:
        data_dict = np.load(file_stream, allow_pickle=True)
        data_list = list(data_dict[()].values())

    # Flatten every 24-column array into a 1-D uint8 sequence.
    data_unfolded = [
        np.ravel(d, order='C').astype(np.uint8) for d in data_list
        if d.shape[1] == 24
    ]

    MAX_LEN = 2400
    # Tile each sequence until it is at least MAX_LEN long;
    # pad_sequences then truncates it to exactly MAX_LEN.
    data_repeated = [
        np.tile(x, MAX_LEN // x.shape[0] + 1) for x in data_unfolded
    ]
    pad = pad_sequences(data_repeated,
                        maxlen=MAX_LEN,
                        dtype=np.uint8,
                        value=2,
                        padding='post',
                        truncating='post')

    if rnn_cell == 'LSTM':
        if test:
            cell = LSTM
        else:
            cell = CuDNNLSTM
    elif rnn_cell == 'GRU':
        cell = CuDNNGRU
    else:
        print('unknown rnn cell type, defaulting to LSTM')
        cell = CuDNNLSTM

    model = Sequential()
    model.add(
        cell(rnn_size, return_sequences=True,
             batch_input_shape=(None, None, 1)))
    model.add(Dropout(dropout))
    for _ in range(rnn_layers - 1):
        model.add(cell(rnn_size, return_sequences=True))
        model.add(Dropout(dropout))
        model.add(Activation(rnn_activation))
    model.add(TimeDistributed(Dense(1, activation='sigmoid')))

    # try using different optimizers and different optimizer configs
    optimizer = RMSprop(clipnorm=1., decay=lr_decay)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['binary_accuracy'])

    # Next-step prediction: inputs are steps [0, n-1), targets are steps
    # [1, n); a trailing feature axis of size 1 is added.
    if test:
        X = pad[:256, :-1, None]
        y = pad[:256, 1:, None]
    else:
        X = pad[:, :-1, None]
        y = pad[:, 1:, None]

    ckpt = GCSModelCheckpoint('epoch_{epoch}_' + saved_model,
                              job_dir + '/models',
                              monitor='val_binary_accuracy',
                              save_best_only=True,
                              period=10)
    tb = TensorBoard(log_dir=log_dir + '/' + job_dir.split('/')[-1])

    model.fit(x=X,
              y=y,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2,
              callbacks=[tb, ckpt])

    model.save(saved_model)

    # Save model.h5 on to google storage
    with file_io.FileIO(saved_model, mode='rb') as input_f:
        with file_io.FileIO(job_dir + '/' + saved_model,
                            mode='wb') as output_f:
            output_f.write(input_f.read())
confmat = np.array([[53, 23, 15], [124, 51, 26], [2934, 540, 2634]]) vocab = ['Class A', 'Class B', 'Class C'] # vocab = np.arange(3) cdata = [] for target_index, target_row in enumerate(confmat): print(target_index, target_row) print("\n") for pred_idx, count in enumerate(target_row): cdata.append((vocab[target_index], vocab[pred_idx], count)) df_cm = pd.DataFrame(cdata, columns=['target', 'predicted', 'count']) cm_file = os.path.join('gs://data-folder/kubeflow_data_trial1', 'confusion_matrix.csv') with file_io.FileIO(cm_file, 'w') as fl: df_cm.to_csv(fl, columns=['target', 'predicted', 'count'], header=False, index=False) # metadata = { # 'outputs' : [ # ] # } # with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as fl: # json.dump(metadata, fl) metrics = { 'metrics': [
def testWriteBinaryMode(self):
    """Data written through a "wb" FileIO can be read back in text mode."""
    file_path = os.path.join(self._base_dir, "temp_file")
    # Close the writer explicitly so the data is flushed before the read
    # below, instead of relying on garbage collection of a temporary.
    with file_io.FileIO(file_path, "wb") as writer:
        writer.write("testing")
    with file_io.FileIO(file_path, mode="r") as f:
        self.assertEqual("testing", f.read())
def testReadBinaryMode(self):
    """A file written as a string reads back as bytes in "rb" mode."""
    target_path = os.path.join(self._base_dir, "temp_file")
    file_io.write_string_to_file(target_path, "testing")
    with file_io.FileIO(target_path, mode="rb") as reader:
        contents = reader.read()
        self.assertEqual(b"testing", contents)
def testUTF8StringPath(self):
    """A path containing non-ASCII characters can be written and read."""
    utf8_path = os.path.join(self._base_dir, "UTF8测试_file")
    file_io.write_string_to_file(utf8_path, "testing")
    with file_io.FileIO(utf8_path, mode="rb") as reader:
        contents = reader.read()
        self.assertEqual(b"testing", contents)
def _batch_csv_reader(csv_file, n):
    """Yield tuples of ``n`` consecutive items read from ``csv_file``.

    The same file iterator is passed ``n`` times to ``zip_longest``, so each
    tuple holds the next ``n`` items; the last tuple is padded with ``None``
    when the item count is not a multiple of ``n``.

    Implemented as a generator so the file stays open for as long as the
    caller is iterating. Returning the lazy ``zip_longest`` object directly
    would leave the ``with`` block before anything had been read.

    Args:
        csv_file: Path of the file to read.
        n: Number of items per batch.
    """
    with file_io.FileIO(csv_file, 'r') as f:
        args = [f] * n
        for batch in six.moves.zip_longest(*args):
            yield batch
def main(job_dir):
    """Download the image archive, train a classifier, upload the model.

    Downloads ``gs://data_bbp/data.zip`` next to this source file, extracts
    it, trains the network returned by ``build`` on the images found below
    this file's directory (the label is each image's parent directory
    name), and uploads the saved model to
    ``gs://bbp_model_bucket/model/model.h5``.

    Args:
        job_dir: Not used by this function; both bucket paths are
            hard-coded.
    """
    EPOCHS = 100
    INIT_LR = 1e-3
    BS = 50
    IMAGE_DIMS = (96, 96, 3)

    data = []
    labels = []

    current_dir = os.path.dirname(os.path.abspath(__file__))

    print('downloading!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
    # The bucket object is the *source* of the copy, so open it read-only;
    # it was previously opened in a write mode ('wb+').
    with file_io.FileIO('gs://data_bbp/data.zip', mode='rb') as f:
        with file_io.FileIO(current_dir + '/data.zip', mode='wb') as output_f:
            output_f.write(f.read())
    print('downloaded!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    with zipfile.ZipFile(current_dir + '/data.zip', 'r') as f:
        for member in f.infolist():
            f.extract(member, current_dir + '/data')
    print('unzipped!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    # Fixed seed so the shuffled order is reproducible.
    imagePaths = sorted(list(paths.list_images(current_dir)))
    random.seed(42)
    random.shuffle(imagePaths)

    for imagePath in imagePaths:
        image = cv2.imread(imagePath)
        image = cv2.resize(image, (IMAGE_DIMS[1], IMAGE_DIMS[0]))
        image = img_to_array(image)
        data.append(image)

        # The class label is the name of the image's parent directory.
        label = imagePath.split(os.path.sep)[-2]
        labels.append(label)
    print('processed!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')

    # Scale pixel values by 1/255.
    data = np.array(data, dtype="float") / 255.0
    labels = np.array(labels)

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    (trainX, testX, trainY, testY) = train_test_split(data,
                                                      labels,
                                                      test_size=0.1,
                                                      random_state=42)

    aug = ImageDataGenerator(rotation_range=25,
                             width_shift_range=0.1,
                             height_shift_range=0.1,
                             shear_range=0.2,
                             zoom_range=0.2,
                             horizontal_flip=False,
                             fill_mode="nearest")

    model = build(width=IMAGE_DIMS[1],
                  height=IMAGE_DIMS[0],
                  depth=IMAGE_DIMS[2],
                  classes=len(lb.classes_))
    model.summary()

    opt = Adam(lr=INIT_LR, decay=INIT_LR / EPOCHS)
    model.compile(loss="categorical_crossentropy",
                  optimizer=opt,
                  metrics=["accuracy"])

    model.fit_generator(aug.flow(trainX, trainY, batch_size=BS),
                        validation_data=(testX, testY),
                        steps_per_epoch=len(trainX) // BS,
                        epochs=EPOCHS,
                        verbose=1)

    model.save('model.h5')

    # Upload the saved model byte-for-byte.
    with file_io.FileIO('model.h5', mode='rb') as input_f:
        with file_io.FileIO('gs://bbp_model_bucket/model/model.h5',
                            mode='wb') as output_f:
            output_f.write(input_f.read())
def train_model(train_file='data/',
                job_dir='./tmp/mnist_mlp',
                dropout_one=0.2,
                dropout_two=0.2,
                **args):
    """Train an MLP on the rectangles dataset and save it to ``job_dir``.

    Relies on the module-level ``num_classes``, ``epochs`` and
    ``batch_size``. The model is saved locally as ``model.h5`` and then
    copied to ``job_dir + '/model.h5'``.

    Args:
        train_file: Directory prefix (used by plain string concatenation, so
            it should end with a separator) containing
            ``rectangles_train.amat`` and ``rectangles_test.amat``.
        job_dir: Directory that receives the saved model.
        dropout_one: Dropout rate after the first hidden layer.
        dropout_two: Dropout rate after the second hidden layer.
        **args: Accepted and ignored.
    """
    # Classes: "Horizontal" (wider than tall) and "Vertical" (taller than
    # wide).

    # Load the rectangle training data.
    [X_train, y_train] = load_rectangles_data(train_file +
                                              'rectangles_train.amat')
    # Load the rectangle test data.
    [X_test, y_test] = load_rectangles_data(train_file +
                                            'rectangles_test.amat')

    print(X_train.shape[0], 'train samples')
    print(X_test.shape[0], 'test samples')

    # Convert the labels to one-hot arrays sized by the number of classes,
    # e.g. y_train: [0 1 0 0] -> Y_train: [[1 0], [0 1], [1 0], [1 0]]
    Y_train = keras.utils.to_categorical(y_train, num_classes)
    Y_test = keras.utils.to_categorical(y_test, num_classes)

    # Build the multi-layer perceptron: 784-dimensional input (28x28) and a
    # final output with one unit per class.
    model = Sequential()
    model.add(Dense(512, activation='relu', input_dim=784, init='uniform'))
    model.add(Dropout(dropout_one))
    model.add(Dense(512, activation='relu', init='uniform'))
    model.add(Dropout(dropout_two))
    model.add(
        Dense(num_classes, activation='softmax', input_dim=512,
              init='uniform'))
    model.summary()

    # Two-class problem, so binary cross-entropy is used; RMSprop is the
    # optimizer.
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(),
                  metrics=['accuracy'])

    # patience=0: stop at the first epoch without val_loss improvement.
    es = keras.callbacks.EarlyStopping(monitor='val_loss',
                                       patience=0,
                                       verbose=0,
                                       mode='auto')
    history = model.fit(X_train,
                        Y_train,
                        epochs=epochs,
                        batch_size=batch_size,
                        verbose=1,
                        validation_data=(X_test, Y_test),
                        callbacks=[es])

    score = model.evaluate(X_test, Y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Save the model locally.
    model.save('model.h5')

    # Copy the model to the job directory on Google Cloud Storage. HDF5 is
    # binary, so read and write bytes rather than text.
    with file_io.FileIO('model.h5', mode='rb') as input_f:
        with file_io.FileIO(job_dir + '/model.h5', mode='wb') as output_f:
            output_f.write(input_f.read())
def dispatch(
        train_files,
        learning_rate,
        job_dir,
        train_batch_size=64,
        num_epochs=100,
        steps_per_epoch=15,
        cv=1,
        val_ratio=0.2,  # cross validation
        decay=0.01,  # learning rate decay
        # num of epochs without improvement to trigger early stopping
        patience=15,
        fc_layers=[512],
        dropouts=[0.5],  # fully connected layers
        trainable_layers=166,  # trainable transfer learning model layers
        do_predict_test=False,
        test_file=''  # predict test data
):
    """Train the iceberg classifier with random-split cross validation.

    For each of ``cv`` random train/validation splits a fresh model from
    ``get_model`` is trained with augmented data, the best weights are
    reloaded, and train/validation predictions are written to CSV. The input
    parameters plus the averaged log-loss scores are written to
    ``records.json`` under ``job_dir``. When ``job_dir`` starts with
    ``gs://``, files are written locally first and then copied with
    ``copy_file_to_gcs``.

    Args:
        train_files: List of paths; only the first is read (a JSON file with
            ``band_1``, ``band_2``, ``inc_angle``, ``id`` and ``is_iceberg``
            fields).
        learning_rate: Adam learning rate.
        job_dir: Local or ``gs://`` output directory.
        train_batch_size: Batch size of the augmented data generator.
        num_epochs: Maximum epochs per fold.
        steps_per_epoch: Generator steps per epoch.
        cv: Number of random train/validation splits.
        val_ratio: Fraction of samples held out for validation per split.
        decay: Adam learning-rate decay.
        patience: Early-stopping patience in epochs.
        fc_layers: Forwarded to ``get_model``. The list default is kept for
            interface compatibility and is never mutated here.
        dropouts: Forwarded to ``get_model``; same note as ``fc_layers``.
        trainable_layers: Forwarded to ``get_model``.
        do_predict_test: When True, also predict on ``test_file`` and save
            the prediction averaged over all folds.
        test_file: Path of the test JSON file; read only when
            ``do_predict_test`` is True.
    """
    # log parameters.
    logging.info('start dispatch')

    # Preserve input parameters for saving them later. Take a *copy*:
    # locals() may hand back the live frame dict, and this mapping is later
    # extended with the scores and JSON-serialized, so it must not pick up
    # other local variables.
    parameters = dict(locals())

    # Training data
    # Original Data
    with file_io.FileIO(train_files[0], mode='r') as train_input:
        train_data = json.load(train_input)

    train_df = pd.DataFrame(train_data)
    train_target = train_df['is_iceberg']

    # Preprocess
    # Images: reshape each band to 75*75, giving 2 channels per image.
    band_1 = np.array([
        np.array(band).astype(np.float64).reshape(75, 75)
        for band in train_df["band_1"]
    ])
    # Scale each channel to the range -1 to 1; keep min/max so the test
    # data can be scaled the same way later.
    band_1_max = band_1.max()
    band_1_min = band_1.min()
    band_1 = (band_1 - band_1_min) / (band_1_max - band_1_min) * 2 - 1

    band_2 = np.array([
        np.array(band).astype(np.float64).reshape(75, 75)
        for band in train_df["band_2"]
    ])
    band_2_max = band_2.max()
    band_2_min = band_2.min()
    band_2 = (band_2 - band_2_min) / (band_2_max - band_2_min) * 2 - 1

    X = np.concatenate(
        [band_1[:, :, :, np.newaxis], band_2[:, :, :, np.newaxis]], axis=-1)

    # Incident angles: replace 'na' with 0, and divide by the maximum.
    train_df.inc_angle = train_df.inc_angle.replace('na', 0)
    X_inc = np.array(train_df.inc_angle)
    X_inc_max = X_inc.max()
    X_inc = X_inc / X_inc_max

    # Ids: for saving prediction use later
    X_id = train_df['id']

    # Testing data
    # Only load them if needed
    if do_predict_test:
        with file_io.FileIO(test_file, mode='r') as test_input:
            test_data = json.load(test_input)

        test_df = pd.DataFrame(test_data)

        # Preprocess exactly like the training images, re-using the
        # training min/max so both sets share one scale.
        band_1_test = np.array([
            np.array(band).astype(np.float64).reshape(75, 75)
            for band in test_df["band_1"]
        ])
        band_1_test = (band_1_test - band_1_min) / (band_1_max -
                                                    band_1_min) * 2 - 1

        band_2_test = np.array([
            np.array(band).astype(np.float64).reshape(75, 75)
            for band in test_df["band_2"]
        ])
        band_2_test = (band_2_test - band_2_min) / (band_2_max -
                                                    band_2_min) * 2 - 1

        X_test = np.concatenate([
            band_1_test[:, :, :, np.newaxis], band_2_test[:, :, :, np.newaxis]
        ], axis=-1)

        # Incident angles: replace 'na' with 0, and use the training scale.
        test_df.inc_angle = test_df.inc_angle.replace('na', 0)
        X_inc_test = np.array(test_df.inc_angle)
        X_inc_test = X_inc_test / X_inc_max

        test_id = test_df['id']

    # Set up cross validation: randomly divide the data into several
    # training and validation splits, validation size can be set through
    # val_ratio, default to 20% of the total data.
    sample_size = len(train_target)
    validate_size = int(sample_size * val_ratio)
    np.random.seed(CV_RANDOM_SEED)  # set random seed for reproducing results.
    folds = []
    for i in range(cv):
        # generate a shuffle.
        permutation = np.random.permutation(sample_size)
        # validation set.
        X_id_val = X_id[permutation[:validate_size]]
        X_val = X[permutation[:validate_size]]
        X_inc_val = X_inc[permutation[:validate_size]]
        y_val = train_target[permutation[:validate_size]]
        # training set.
        X_id_train = X_id[permutation[validate_size:]]
        X_train = X[permutation[validate_size:]]
        X_inc_train = X_inc[permutation[validate_size:]]
        y_train = train_target[permutation[validate_size:]]
        # add to folds.
        folds.append((X_id_train, X_train, X_inc_train, y_train, X_id_val,
                      X_val, X_inc_val, y_val))

    # Training, cross validation and predict on test if needed.
    avg_val_score = 0
    avg_train_score = 0
    avg_pred_test = None
    pred_dir = os.path.join(job_dir, 'predictions')
    for i, (X_id_train, X_train, X_inc_train, y_train, X_id_val, X_val,
            X_inc_val, y_val) in enumerate(folds):
        logging.info('===================FOLD=%d' % i)
        # sanity check
        train_size = sample_size - validate_size
        assert len(X_id_train) == train_size
        assert len(X_train) == train_size
        assert len(X_inc_train) == train_size
        assert len(y_train) == train_size
        assert len(X_id_val) == validate_size
        assert len(X_val) == validate_size
        assert len(X_inc_val) == validate_size
        assert len(y_val) == validate_size

        model = get_model(fc_layers, dropouts, trainable_layers)
        # optimizer
        optimizer = Adam(
            lr=learning_rate,
            decay=decay,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-08,
        )
        # compile model
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        # data flow generator, with image data augmented.
        generator = ImageDataGenerator(rotation_range=20,
                                       horizontal_flip=True,
                                       vertical_flip=True,
                                       width_shift_range=0.1,
                                       height_shift_range=0.1,
                                       zoom_range=0.1)
        gen_flow = gen_flow_for_two_inputs(X_train, X_inc_train, y_train,
                                           generator, train_batch_size)

        # Callbacks
        # TensorBoard callback, used to record training process for later
        # plotting using TensorBoard
        tensorboard = TensorBoard(log_dir=os.path.join(job_dir, 'logs'),
                                  write_graph=False)
        # EarlyStopping callback. By default monitoring val_loss decrease
        early_stopping = EarlyStopping(patience=patience)
        # ModelCheckpoint callback. By default monitoring val_loss min
        # TODO: add a callback to record models so that we can pick up one
        # and keep training
        model_dir = os.path.join(job_dir, 'models')
        if job_dir.startswith("gs://"):
            # Work-around for h5py not able to handle writing to Google
            # Cloud Storage. Save to local first then copy to GCS
            best_model_path = 'best_model_%d.hdf5' % i
        else:
            if not os.path.exists(model_dir):
                os.mkdir(model_dir)
            best_model_path = os.path.join(model_dir,
                                           'best_model_%d.hdf5' % i)
        model_checkpoint = ModelCheckpoint(
            best_model_path,
            save_best_only=True,
            save_weights_only=True  # model architecture won't change when load
        )

        # Train model and validate along the way
        model.fit_generator(
            gen_flow,
            # TODO: investigate if the gen_flow shuffle before every epoch,
            # else, each epoch will be seeing the same samples
            steps_per_epoch=steps_per_epoch,
            epochs=num_epochs,
            shuffle=True,
            verbose=1,
            validation_data=([X_val, X_inc_val], y_val),
            callbacks=[tensorboard, early_stopping, model_checkpoint])

        # Load the best model and save train and validation predictions
        model.load_weights(filepath=best_model_path)

        if job_dir.startswith("gs://"):
            # Work-around for pandas not able to handle writing to GCS.
            # Save to local first then copy to GCS.
            pred_val_path = 'pred_val_%d.csv' % i
            pred_train_path = 'pred_train_%d.csv' % i
        else:
            if not os.path.exists(pred_dir):
                os.mkdir(pred_dir)
            pred_val_path = os.path.join(pred_dir, 'pred_val_%d.csv' % i)
            pred_train_path = os.path.join(pred_dir, 'pred_train_%d.csv' % i)

        # Get validation score.
        # ravel: flatten the 2-d array to 1-d
        pred_val = model.predict([X_val, X_inc_val]).ravel()
        avg_val_score += log_loss(y_val, pred_val)
        pred_val_df = pd.DataFrame({
            'id': X_id_val,
            'pred': pred_val,
            'is_iceberg': y_val
        })
        pred_val_df.to_csv(pred_val_path)

        pred_train = model.predict([X_train, X_inc_train]).ravel()
        avg_train_score += log_loss(y_train, pred_train)
        pred_train_df = pd.DataFrame({
            'id': X_id_train,
            'pred': pred_train,
            'is_iceberg': y_train
        })
        pred_train_df.to_csv(pred_train_path)

        # Copy files to GCS if running on cloud
        if job_dir.startswith("gs://"):
            # best model
            copy_file_to_gcs(model_dir, best_model_path)
            # predictions
            copy_file_to_gcs(pred_dir, pred_val_path)
            copy_file_to_gcs(pred_dir, pred_train_path)

        if do_predict_test:
            # Accumulate this fold's test prediction; averaged after the
            # loop.
            pred_test = model.predict([X_test, X_inc_test]).flatten()
            if avg_pred_test is None:
                avg_pred_test = pred_test
            else:
                avg_pred_test += pred_test

    # Add average validation and training score to record
    parameters['avg_val_score'] = avg_val_score / cv
    parameters['avg_train_score'] = avg_train_score / cv
    parameters_json_str = json.dumps(parameters, indent=2)
    logging.info(parameters_json_str)
    # Write parameters and scores to a file for experiment analysis
    with file_io.FileIO(os.path.join(job_dir, 'records.json'),
                        mode='w') as records_output:
        records_output.write(parameters_json_str)

    if do_predict_test:
        avg_pred_test = avg_pred_test / cv
        # Force the prediction to 1 for samples whose (scaled) incident
        # angle exactly equals one of these values.
        # NOTE(review): X_inc_test was divided by X_inc_max above while
        # these constants look like raw angles -- confirm the comparison is
        # meant to be on the scaled values.
        leaky_angle = [34.4721, 42.5591, 33.6352, 36.1061, 39.2340]
        mask = [X_inc_test[i] in leaky_angle for i in range(len(test_id))]
        avg_pred_test[mask] = 1

        # Save average test prediction
        if job_dir.startswith("gs://"):
            pred_test_path = 'pred_test_%s.csv' % job_dir.split('/')[-1]
        else:
            pred_test_path = os.path.join(pred_dir, 'pred_test.csv')
        save_submit(test_id, avg_pred_test, pred_test_path)
        if job_dir.startswith("gs://"):
            copy_file_to_gcs(pred_dir, pred_test_path)
def copy_file_to_gcs(job_dir, file_path):
    """Copy the file at ``file_path`` to ``os.path.join(job_dir, file_path)``.

    Both ends go through ``file_io.FileIO``; the source is read in binary
    mode and its whole content is written to the destination in one call.

    Args:
        job_dir: Destination directory (presumably a ``gs://`` location,
            judging by the function name -- confirm against callers).
        file_path: Path of the file to copy; it is also used as the path
            of the copy relative to ``job_dir``.
    """
    destination = os.path.join(job_dir, file_path)
    with file_io.FileIO(file_path, mode='rb') as source_file, \
            file_io.FileIO(destination, mode='w+') as target_file:
        payload = source_file.read()
        target_file.write(payload)
def main(input_file_name, output_file_name, job_dir, batch_size, num_units,
         num_layers, learning_rate, num_epoches, check_point, output_len,
         use_gpu):
    """Train a character-level LSTM and write sampled text to a file.

    The input file is read as a flat list of characters, an ``LSTM`` model
    is trained on it for ``num_epoches`` epochs, and text generated by
    ``model.predict`` is both printed and appended to the output file.

    Args:
        input_file_name: Path of the training text, read via ``file_io``.
        output_file_name: Name of the output file created inside ``job_dir``.
        job_dir: Directory that receives the output file.
        batch_size: Batch size passed to ``LSTM`` and ``next_batch``.
        num_units: Passed to the ``LSTM`` constructor.
        num_layers: Passed to the ``LSTM`` constructor.
        learning_rate: Learning rate of the Adam optimizer.
        num_epoches: Number of passes over the training text.
        check_point: A sample is generated on every epoch ``i`` with
            ``i != 0 and i % check_point == 0``.
        output_len: Passed to ``model.predict`` for every generated sample.
        use_gpu: The GPU device is used only when this equals ``1``;
            any other value selects the CPU.
    """
    with file_io.FileIO(input_file_name, 'r') as input_file:
        sample = list(input_file.read())

    seq_len = 1
    output_file = os.path.join(job_dir, output_file_name)
    char_to_idx, idx_to_char = build_dataset(sample)

    # use gpu
    device_name = '/device:GPU:0' if use_gpu == 1 else '/cpu:0'
    vocab_len = len(char_to_idx)

    with tf.device(device_name):
        model = LSTM(num_units, num_layers, seq_len, batch_size, vocab_len)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(model.loss)

    # The output file is managed by the ``with`` statement so that it is
    # closed even when training or prediction raises.
    with file_io.FileIO(output_file, 'w') as output, tf.Session() as sess:

        def emit_prediction():
            # Generate one sample, append it to the output file and echo it.
            with tf.device(device_name):
                pred = model.predict(sess, pred_inputs, output_len, seq_len,
                                     idx_to_char)
            text = "".join(pred)
            output.write(text)
            output.write("\n\n")
            print(text)

        sess.run(tf.global_variables_initializer())
        print('training...')

        # The first batch doubles as the seed input for every prediction.
        pred_inputs, _ = next(
            next_batch(sample, batch_size, seq_len, char_to_idx))

        for i in range(num_epoches):
            print('===============')
            print('epoch: %d' % i)
            print('===============')
            cur_state = sess.run(model.initial_states)
            for inputs, targets in next_batch(sample, batch_size, seq_len,
                                              char_to_idx):
                feed_dict = {
                    model.inputs: inputs,
                    model.targets: targets,
                    model.initial_states: cur_state
                }
                _, state, loss = sess.run(
                    [optimizer, model.final_state, model.loss],
                    feed_dict=feed_dict)
                # Carry the recurrent state over to the next batch.
                cur_state = state
            # NOTE(review): the original indentation was lost; the loss is
            # reported once per epoch here (last batch) -- confirm that it
            # was not meant to be printed for every batch.
            print(loss)

            if i != 0 and i % check_point == 0:
                output.write('[epoch %d] predicting...\n' % i)
                emit_prediction()

        print('predicting...\n')
        emit_prediction()
def __init__(self, datadir, verbose=False, temporal_samples=None,
             section="dataset", augment=False):
    """Load the dataset metadata stored under ``datadir``.

    Reads three files from ``datadir``: ``geotransforms.csv`` (one row per
    tile), ``classes.txt`` (``<id>|<name>`` rows) and ``dataset.ini``
    (the section named by ``section``), and builds the TensorFlow lookup
    tables that translate between original class ids and dense indices.

    Args:
        datadir: Root directory of the dataset.
        verbose: Stored on the instance as ``self.verbose``.
        temporal_samples: Stored on the instance as ``self.temp_samples``.
        section: Name of the ``dataset.ini`` section to read.
        augment: Stored on the instance as ``self.augment``.

    Raises:
        AssertionError: If the config section lacks one of ``pix10``,
            ``nobs``, ``nbands10``, ``nbands20`` or ``nbands60``.
    """
    self.verbose = verbose
    self.augment = augment

    # parser reads serialized tfrecords file and creates a feature object
    parser = utils.S2parser()
    self.parsing_function = parser.parse_example

    self.temp_samples = temporal_samples
    self.section = section

    dataroot = datadir

    # csv list of geotransforms of each tile:
    #   tileid, xmin, xres, 0, ymax, 0, -yres, srid
    # use querygeotransform.py or querygeotransforms.sh to generate csv
    # fills dictionary:
    #   geotransforms[<tileid>] = (xmin, xres, 0, ymax, 0, -yres)
    #   srid[<tileid>] = srid
    self.geotransforms = dict()
    # https://en.wikipedia.org/wiki/Spatial_reference_system#Identifier
    self.srids = dict()
    geotransform_path = os.path.join(dataroot, "geotransforms.csv")
    with file_io.FileIO(geotransform_path, 'r') as f:  # gcp
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            tile_id = str(row[0])
            self.geotransforms[tile_id] = (
                float(row[1]), float(row[2]), int(row[3]),
                float(row[4]), int(row[5]), float(row[6]))
            self.srids[tile_id] = int(row[7])

    classes_path = os.path.join(dataroot, "classes.txt")
    with file_io.FileIO(classes_path, 'r') as f:  # gcp
        class_rows = f.readlines()

    self.ids = list()
    self.classes = list()
    for row in class_rows:
        row = row.replace("\n", "")
        # Rows without a '|' separator are skipped.
        if '|' in row:
            class_id, class_name = row.split('|')
            self.ids.append(int(class_id))
            self.classes.append(class_name)

    ## create a lookup table to map labelids to dimension ids
    # map data ids [0, 2, 4,..., nclasses_originalID]
    labids = tf.constant(self.ids, dtype=tf.int64)
    # to dimensions [0, 1, 2, ... nclasses_orderID]
    dimids = tf.constant(list(range(0, len(self.ids), 1)), dtype=tf.int64)

    self.id_lookup_table = tf.contrib.lookup.HashTable(
        tf.contrib.lookup.KeyValueTensorInitializer(labids, dimids),
        default_value=-1)
    self.inverse_id_lookup_table = tf.contrib.lookup.HashTable(
        tf.contrib.lookup.KeyValueTensorInitializer(dimids, labids),
        default_value=-1)

    cfgpath = os.path.join(dataroot, "dataset.ini")

    # load dataset configs
    datacfg = configparser.ConfigParser()
    with file_io.FileIO(cfgpath, 'r') as f:  # gcp
        # ``readfp`` was deprecated and is gone in Python 3.12;
        # ``read_file`` is its direct replacement.
        datacfg.read_file(f)
    cfg = datacfg[section]

    self.tileidfolder = os.path.join(dataroot, "tileids")
    self.datadir = os.path.join(dataroot, cfg["datadir"])

    assert 'pix10' in cfg.keys()
    assert 'nobs' in cfg.keys()
    assert 'nbands10' in cfg.keys()
    assert 'nbands20' in cfg.keys()
    assert 'nbands60' in cfg.keys()

    self.tiletable = cfg["tiletable"]
    self.nobs = int(cfg["nobs"])

    self.expected_shapes = self.calc_expected_shapes(
        int(cfg["pix10"]), int(cfg["nobs"]), int(cfg["nbands10"]),
        int(cfg["nbands20"]), int(cfg["nbands60"]))

    # expected datatypes as read from disk
    self.expected_datatypes = (tf.float32, tf.float32, tf.float32,
                               tf.float32, tf.float32, tf.int64)
def main(argv=None):
    """Compute an ROC curve from prediction CSV files.

    Reads every file matching ``--predictions`` (column names come from a
    ``schema.json`` next to them), derives a binary ``target`` column,
    writes ``roc.csv`` into ``--output`` and a UI metadata file to
    ``/mlpipeline-ui-metadata.json``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse fall back to ``sys.argv[1:]``.

    Raises:
        ValueError: If neither ``--target_lambda`` nor ``--trueclass``
            is given.
    """
    parser = argparse.ArgumentParser(description='ML Trainer')
    parser.add_argument('--predictions', type=str,
                        help='GCS path of prediction file pattern.')
    parser.add_argument('--trueclass', type=str,
                        help='The name of the class as true value.')
    parser.add_argument(
        '--target_lambda', type=str,
        help='a lambda function as a string to determine positive or negative.' +
        'For example, "lambda x: x[\'a\'] and x[\'b\']". If missing, ' +
        'trueclass must be set and input must have a "target" column.')
    parser.add_argument('--output', type=str,
                        help='GCS path of the output directory.')
    # Honour the ``argv`` parameter; it used to be silently ignored.
    args = parser.parse_args(argv)

    if not args.target_lambda and not args.trueclass:
        raise ValueError('Either target_lambda or trueclass must be set.')

    schema_file = os.path.join(os.path.dirname(args.predictions),
                               'schema.json')
    schema = json.loads(file_io.read_file_to_string(schema_file))
    names = [x['name'] for x in schema]

    dfs = []
    for path in file_io.get_matching_files(args.predictions):
        with file_io.FileIO(path, 'r') as f:
            dfs.append(pd.read_csv(f, names=names))
    df = pd.concat(dfs)

    if args.target_lambda:
        # SECURITY NOTE: this evaluates an arbitrary expression taken from
        # the command line; only run with trusted arguments.
        df['target'] = df.apply(eval(args.target_lambda), axis=1)
    else:
        df['target'] = df['target'].apply(
            lambda x: 1 if x == args.trueclass else 0)

    # NOTE(review): the score column is always ``args.trueclass``, so this
    # lookup fails when only --target_lambda is supplied -- confirm intent.
    fpr, tpr, thresholds = roc_curve(df['target'], df[args.trueclass])
    df_roc = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds})

    roc_file = os.path.join(args.output, 'roc.csv')
    with file_io.FileIO(roc_file, 'w') as f:
        df_roc.to_csv(f, columns=['fpr', 'tpr', 'thresholds'],
                      header=False, index=False)

    metadata = {
        'outputs': [{
            'type': 'roc',
            'storage': 'gcs',
            'format': 'csv',
            'schema': [
                {'name': 'fpr', 'type': 'NUMBER'},
                {'name': 'tpr', 'type': 'NUMBER'},
                {'name': 'thresholds', 'type': 'NUMBER'},
            ],
            'source': roc_file
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)
def save_model(model, output_path):
    """Save ``model`` locally as ``MODEL_FILE`` and copy it to ``output_path``.

    The model is first written to the local file ``MODEL_FILE`` with
    ``model.save`` and that file is then copied byte-for-byte to
    ``output_path + '/' + MODEL_FILE`` through ``file_io``.

    Args:
        model: Object exposing a ``save(path)`` method.
        output_path: Destination directory (the original comment says GCS).
    """
    # Save model file to GCS
    model.save(MODEL_FILE)
    with file_io.FileIO(MODEL_FILE, mode='rb') as input_f:
        with file_io.FileIO(output_path + '/' + MODEL_FILE,
                            mode='wb+') as output_f:
            output_f.write(input_f.read())
    # Report success only after the copy finished; the message used to be
    # printed before anything had been saved.
    print('saved model to ', output_path)
def load_inputs_from_input_arg_string(inputs_str, input_exprs_str,
                                      input_examples_str):
    """Parses input arg strings and create inputs feed_dict.

    Parses '--inputs' string for inputs to be loaded from file, and parses
    '--input_exprs' string for inputs to be evaluated from python expression.
    '--input_examples' string for inputs to be created from tf.example feature
    dictionary list.

    Args:
      inputs_str: A string that specified where to load inputs. Each input is
          separated by semicolon.
          * For each input key:
              '<input_key>=<filename>' or
              '<input_key>=<filename>[<variable_name>]'
          * The optional 'variable_name' key will be set to None if not
            specified.
          * File specified by 'filename' will be loaded using numpy.load.
            Inputs can be loaded from only .npy, .npz or pickle files.
          * The "[variable_name]" key is optional depending on the input file
            type as described in more details below.
          When loading from a npy file, which always contains a numpy ndarray,
          the content will be directly assigned to the specified input tensor.
          If a variable_name is specified, it will be ignored and a warning
          will be issued.
          When loading from a npz zip file, user can specify which variable
          within the zip file to load for the input tensor inside the square
          brackets. If nothing is specified, this function will check that
          only one file is included in the zip and load it for the specified
          input tensor.
          When loading from a pickle file, if no variable_name is specified in
          the square brackets, whatever that is inside the pickle file will be
          passed to the specified input tensor, else SavedModel CLI will
          assume a dictionary is stored in the pickle file and the value
          corresponding to the variable_name will be used.
      input_exprs_str: A string that specifies python expressions for inputs.
          * In the format of: '<input_key>=<python expression>'.
          * numpy module is available as np.
      input_examples_str: A string that specifies tf.Example with dictionary.
          * In the format of: '<input_key>=<[{feature:value list}]>'

    Returns:
      A dictionary that maps input tensor keys to numpy ndarrays.

    Raises:
      RuntimeError: An error when a key is specified, but the input file
          contains multiple numpy ndarrays, none of which matches the given
          key.
      RuntimeError: An error when no key is specified, but the input file
          contains more than one numpy ndarrays.
    """
    tensor_key_feed_dict = {}

    inputs = preprocess_inputs_arg_string(inputs_str)
    input_exprs = preprocess_input_exprs_arg_string(input_exprs_str)
    input_examples = preprocess_input_examples_arg_string(input_examples_str)

    for input_tensor_key, (filename, variable_name) in inputs.items():
        # allow_pickle must be requested explicitly: recent NumPy releases
        # default it to False, which breaks the documented pickle support.
        # SECURITY NOTE: unpickling executes code from the file, so only
        # load input files from trusted sources.
        # The handle is intentionally left open: np.load reads .npz members
        # lazily from it.
        data = np.load(file_io.FileIO(filename, mode='rb'), allow_pickle=True)

        # When a variable_name key is specified for the input file
        if variable_name:
            # if file contains a single ndarray, ignore the input name
            if isinstance(data, np.ndarray):
                warnings.warn(
                    'Input file %s contains a single ndarray. Name key \"%s\" ignored.'
                    % (filename, variable_name))
                tensor_key_feed_dict[input_tensor_key] = data
            else:
                if variable_name in data:
                    tensor_key_feed_dict[input_tensor_key] = data[variable_name]
                else:
                    raise RuntimeError(
                        'Input file %s does not contain variable with name \"%s\".'
                        % (filename, variable_name))
        # When no key is specified for the input file.
        else:
            # Check if npz file only contains a single numpy ndarray.
            if isinstance(data, np.lib.npyio.NpzFile):
                variable_name_list = data.files
                if len(variable_name_list) != 1:
                    raise RuntimeError(
                        'Input file %s contains more than one ndarrays. Please specify '
                        'the name of ndarray to use.' % filename)
                tensor_key_feed_dict[input_tensor_key] = data[
                    variable_name_list[0]]
            else:
                tensor_key_feed_dict[input_tensor_key] = data

    # When input is a python expression:
    for input_tensor_key, py_expr_evaluated in input_exprs.items():
        if input_tensor_key in tensor_key_feed_dict:
            warnings.warn(
                'input_key %s has been specified with both --inputs and --input_exprs'
                ' options. Value in --input_exprs will be used.' % input_tensor_key)
        tensor_key_feed_dict[input_tensor_key] = py_expr_evaluated

    # When input is a tf.Example:
    for input_tensor_key, example in input_examples.items():
        if input_tensor_key in tensor_key_feed_dict:
            warnings.warn(
                'input_key %s has been specified in multiple options. Value in '
                '--input_examples will be used.' % input_tensor_key)
        tensor_key_feed_dict[input_tensor_key] = example

    return tensor_key_feed_dict
def from_frozen_graph(cls, graph_def_file, input_arrays, output_arrays,
                      input_shapes=None):
    """Creates a TFLiteConverter class from a file containing a frozen GraphDef.

    The file is parsed as a binary GraphDef first and, if that fails, as a
    text-format GraphDef. When the graph cannot be imported into the
    session (``_NotFoundError``), the converter is built from the raw
    GraphDef plus explicit input shapes instead of session tensors.

    Args:
      graph_def_file: Full filepath of file containing frozen GraphDef.
      input_arrays: List of input tensors to freeze graph with.
      output_arrays: List of output tensors to freeze graph with.
      input_shapes: Dict of strings representing input tensor names to list of
        integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
        Automatically determined when input shapes is None (e.g.,
        {"foo" : None}). (default None)

    Returns:
      TFLiteConverter class.

    Raises:
      IOError:
        File not found.
        Unable to parse input file.
      ValueError:
        The graph is not frozen.
        input_arrays or output_arrays contains an invalid tensor name.
        input_shapes is not correctly defined when required
    """
    with _ops.Graph().as_default(), _session.Session() as sess:
        # Read GraphDef from file.
        if not _file_io.file_exists(graph_def_file):
            raise IOError(
                "File '{0}' does not exist.".format(graph_def_file))
        with _file_io.FileIO(graph_def_file, "rb") as graph_file:
            serialized_graph = graph_file.read()

        try:
            # First attempt: binary protobuf.
            graph_def = _graph_pb2.GraphDef()
            graph_def.ParseFromString(serialized_graph)
        except (_text_format.ParseError, DecodeError):
            # Second attempt: text-format protobuf.
            try:
                print("Ignore 'tcmalloc: large alloc' warnings.")

                if not isinstance(serialized_graph, str):
                    if PY3:
                        serialized_graph = serialized_graph.decode("utf-8")
                    else:
                        serialized_graph = serialized_graph.encode("utf-8")
                graph_def = _graph_pb2.GraphDef()
                _text_format.Merge(serialized_graph, graph_def)
            except (_text_format.ParseError, DecodeError):
                raise IOError(
                    "Unable to parse input file '{}'.".format(
                        graph_def_file))

        # Handles models with custom TFLite ops that cannot be resolved in
        # TensorFlow.
        try:
            _import_graph_def(graph_def, name="")
            imported_into_session = True
        except _NotFoundError:
            imported_into_session = False

        if not imported_into_session:
            # Without session tensors the shapes must come from the caller.
            if not input_shapes:
                raise ValueError(
                    "input_shapes must be defined for this model.")
            if set(input_arrays) != set(input_shapes.keys()):
                raise ValueError(
                    "input_shapes must contain a value for each item "
                    "in input_array.")

            shaped_inputs = [(array_name, input_shapes[array_name])
                             for array_name in input_arrays]
            return cls(graph_def,
                       input_tensors=None,
                       output_tensors=None,
                       input_arrays_with_shape=shaped_inputs,
                       output_arrays=output_arrays)

        # Check if graph is frozen.
        if not _is_frozen_graph(sess):
            raise ValueError(
                "Please freeze the graph using freeze_graph.py.")

        # Get input and output tensors.
        input_tensors = _get_tensors_from_tensor_names(
            sess.graph, input_arrays)
        output_tensors = _get_tensors_from_tensor_names(
            sess.graph, output_arrays)
        _set_tensor_shapes(input_tensors, input_shapes)

        return cls(sess.graph_def, input_tensors, output_tensors)
def prepare_for_training(self):
    """Set up the variables needed for training and build the one-hot data.

    Builds the input-character and target-token index tables, writes them
    as JSON to ``fo_input.json`` and ``fo_target.json`` under
    ``self.output_path``, and fills ``self.input_sequence`` and
    ``self.target_sequence``. Does nothing (apart from printing a message)
    when the object is not a training object (``self.isTrain`` is falsy).
    """
    print("prepararation is called")
    if not self.isTrain:
        print("このfunctionはtrainingオブジェクト用です。predictオブジェクトでは使用できません。")
        return

    # 'UKW' is the index used below for characters missing from the table.
    self.input_chars.append('UKW')
    target_tokens = set(self.target_texts)

    self.num_encoder_tokens = len(self.input_chars) + 1
    self.num_decoder_tokens = len(target_tokens) + 1
    self.max_encoder_seq_length = max(
        [len(txt) for txt in self.input_texts])
    self.max_decoder_seq_length = max(
        [len(txt) for txt in self.target_texts])

    # Index 0 is reserved for padding, so real entries start at 1.
    input_chars_indexed = {
        char: i for i, char in enumerate(self.input_chars, start=1)
    }
    target_token_indexed = {
        text: i for i, text in enumerate(target_tokens, start=1)
    }
    input_chars_indexed['PAD'] = 0
    target_token_indexed['PAD'] = 0

    with file_io.FileIO(os.path.join(self.output_path, 'fo_input.json'),
                        'w') as fo_input:
        json.dump(input_chars_indexed, fo_input)
    with file_io.FileIO(os.path.join(self.output_path, 'fo_target.json'),
                        'w') as fo_target:
        # Dump the table that actually exists; the previously referenced
        # ``target_chars_indexed`` is only defined in commented-out code
        # and raised NameError.
        json.dump(target_token_indexed, fo_target)

    # Create the encoder and decoder data used for training.
    # NOTE(review): these zero arrays are overwritten by the tokenizer
    # matrices a few lines below -- confirm which of the two is intended.
    self.input_sequence = np.zeros(
        (len(self.input_texts), self.max_encoder_seq_length), dtype='int32')
    self.target_sequence = np.zeros(
        (len(self.target_texts), self.num_decoder_tokens), dtype='int32')

    ukw_input_index = input_chars_indexed['UKW']

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(self.input_texts)
    self.input_sequence = np.array(
        tokenizer.texts_to_matrix(self.input_texts))
    tokenizer.fit_on_texts(self.target_texts)
    self.target_sequence = np.array(
        tokenizer.texts_to_matrix(self.target_texts))
    print('input_sequence', self.input_sequence.dtype,
          self.input_sequence.shape)
    print('target_sequence', self.target_sequence.dtype,
          self.target_sequence.shape)

    # Create the one-hot vectors.
    for i, (input_text, target_text) in enumerate(
            zip(self.input_texts, self.target_texts)):
        for j, char in enumerate(input_text):
            if input_chars_indexed.get(char, 0) > 0:
                w = input_chars_indexed[char]
            else:
                # Unknown characters (and 'PAD') map to the 'UKW' index.
                w = ukw_input_index
            self.input_sequence[i, j] = int(w)
        k = target_token_indexed.get(target_text, 0)
        self.target_sequence[i, k] = 1
def gen_dtu_resized_path(dtu_data_folder, mode='training'):
    """Generate data paths for the DTU dataset.

    Each returned sample is a flat list
    ``[ref_image, ref_cam, view_image_1, view_cam_1, ..., depth_image]``
    with ``FLAGS.view_num - 1`` view entries.

    Args:
        dtu_data_folder: Root folder that contains ``Cameras/pair.txt``,
            ``Rectified/`` and ``Depths/``.
        mode: ``'training'`` walks the training scans under lighting
            indices 0-6; ``'validation'`` walks the validation scans under
            lighting index 3 only. Any other value yields an empty list.

    Returns:
        A list of samples in scan, lighting, reference-image order.
    """
    sample_list = []

    # parse camera pairs; the file is closed as soon as it has been read
    cluster_file_path = dtu_data_folder + '/Cameras/pair.txt'
    with file_io.FileIO(cluster_file_path, mode='r') as cluster_file:
        cluster_list = cluster_file.read().split()

    # 3 sets
    training_set = [
        2, 6, 7, 8, 14, 16, 18, 19, 20, 22, 30, 31, 36, 39, 41, 42, 44, 45,
        46, 47, 50, 51, 52, 53, 55, 57, 58, 60, 61, 63, 64, 65, 68, 69, 70,
        71, 72, 74, 76, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
        97, 98, 99, 100, 101, 102, 103, 104, 105, 107, 108, 109, 111, 112,
        113, 115, 116, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128
    ]
    validation_set = [
        3, 5, 17, 21, 28, 35, 37, 38, 40, 43, 56, 59, 66, 67, 82, 86, 106, 117
    ]

    # Both modes build samples the same way; they differ only in the scans
    # and in the lighting indices that are enumerated.
    if mode == 'training':
        data_set = training_set
        lightings = range(0, 7)
    elif mode == 'validation':
        data_set = validation_set
        lightings = [3]
    else:
        return sample_list

    num_refs = int(cluster_list[0])
    cam_folder = os.path.join(dtu_data_folder, 'Cameras/train')

    # for each dataset
    for i in data_set:
        image_folder = os.path.join(dtu_data_folder,
                                    ('Rectified/scan%d_train' % i))
        depth_folder = os.path.join(dtu_data_folder,
                                    ('Depths/scan%d_train' % i))

        # for each lighting
        for j in lightings:
            # for each reference image
            for p in range(num_refs):
                paths = []

                # ref image
                # NOTE(review): presumably every reference view occupies 22
                # tokens of pair.txt after the leading count -- confirm
                # against the file format.
                ref_index = int(cluster_list[22 * p + 1])
                paths.append(os.path.join(
                    image_folder,
                    ('rect_%03d_%d_r5000.png' % ((ref_index + 1), j))))
                paths.append(os.path.join(
                    cam_folder, ('%08d_cam.txt' % ref_index)))

                # view images
                for view in range(FLAGS.view_num - 1):
                    view_index = int(cluster_list[22 * p + 2 * view + 3])
                    paths.append(os.path.join(
                        image_folder,
                        ('rect_%03d_%d_r5000.png' % ((view_index + 1), j))))
                    paths.append(os.path.join(
                        cam_folder, ('%08d_cam.txt' % view_index)))

                # depth path
                paths.append(os.path.join(
                    depth_folder, ('depth_map_%04d.pfm' % ref_index)))
                sample_list.append(paths)

    return sample_list
def predict_model(self, is_transfer):
    """Run the model on the encoder inputs and save the results to a file.

    Optionally loads previously trained weights first (transfer learning).
    The decoded tags of every input sequence are written as tab-separated
    rows to ``sample.tsv`` under ``self.path_output``. Does nothing (apart
    from printing a message) when called on a training object.

    Args:
        is_transfer: Whether to load previously trained weights before
            predicting; they are loaded when this is truthy.
    """
    if self.isTrain:
        print("このfunctionはpredictオブジェクト用です。trainingオブジェクトでは使用できません。")
        return

    if is_transfer:
        # Load previously trained weights for transfer learning. HDF5 is a
        # binary format, so the copy must use binary modes; text mode would
        # try to decode the bytes.
        with file_io.FileIO(os.path.join(self.path_cloud, 'lstm_tag.hdf5'),
                            'rb') as reader:
            with file_io.FileIO('lstm_tag.hdf5', 'wb+') as writer:
                writer.write(reader.read())
        self.model.load_weights('lstm_tag.hdf5', by_name=True)

    def decode_sequence(input_seq):
        # ``np.int`` was removed from NumPy; the builtin ``int`` is the
        # same dtype.
        predict_decode_input = np.zeros((1, self.max_decoder_seq_length),
                                        dtype=int)
        predict_decode_input[0, 0] = self.target_token_index['\t']
        predict_out = np.zeros((10, self.max_decoder_seq_length))

        predicted_tag = self.model.predict(
            [input_seq, predict_decode_input])
        top_n = predicted_tag[0, 0, :].argsort()[-self.num_tags - 1:]
        for i in range(self.num_tags):
            if self.reverse_target_char_index[
                    top_n[i]] == 'EOS' or top_n[i] == 0:
                top_n[i] = predicted_tag[
                    0, 0, self.num_tags].argsort()[-self.num_tags - 1:]

        for i in range(self.num_tags):
            predict_decode_input[0, 1] = top_n[i]
            for j in range(1, 9):
                predicted_tag = self.model.predict(
                    [input_seq, predict_decode_input])
                tops = np.argsort(predicted_tag[0, j])[::-1]
                top = np.argmax(predicted_tag[0, j, :])
                # Fall back to the next-best candidates when the best one
                # is an end or padding token.
                if self.reverse_target_char_index[
                        top] == 'EOS' or self.reverse_target_char_index[
                            top] == 'PAD':
                    top = tops[1]
                    if self.reverse_target_char_index[
                            top] == 'EOS' or self.reverse_target_char_index[
                                top] == 'PAD':
                        top = tops[2]
                # NOTE(review): assigning ``j`` does not stop the ``for``
                # loop; it only selects the branch below for this
                # iteration -- confirm that this is intended.
                if top in predict_decode_input[0]:
                    j = 8
                if j == 8:
                    predict_out[i] = predict_decode_input[0]
                else:
                    predict_decode_input[0, j + 1] = top

        # Convert the predicted indices into words.
        words = [[] for i in range(self.num_tags)]
        for i in range(self.num_tags):
            for j in range(self.num_words):
                words[i].append(self.reverse_target_char_index[int(
                    predict_out[i, j])])
        return words

    # Save to a file.
    # NOTE(review): a handle for ``self.type + '.tsv'`` used to be opened
    # here and was immediately replaced by the one for "sample.tsv" without
    # ever being written to or closed; only the effective target is kept --
    # confirm which file name is intended.
    with file_io.FileIO(os.path.join(self.path_output, "sample.tsv"),
                        "w") as f:
        for seq_index in range(len(self.encoder_input_data)):
            input_seq = self.encoder_input_data[seq_index:seq_index + 1]
            decoded_sentence = decode_sequence(input_seq)
            for i in range(len(decoded_sentence)):
                f.write(str(self.tag_data.iloc[seq_index, 0]) + '\t')
                for j in range(len(decoded_sentence[i])):
                    f.write(decoded_sentence[i][j])
                f.write('\t' + str(i + 1) + '\n')
def _load_saved_object_graph_proto(filename):
    """Read ``filename`` and parse its bytes as a ``SavedObjectGraph`` proto.

    Args:
        filename: Path of the serialized proto, opened in binary mode.

    Returns:
        The message returned by ``SavedObjectGraph.FromString``.
    """
    with file_io.FileIO(filename, "rb") as proto_file:
        serialized = proto_file.read()
    proto_cls = saved_object_graph_pb2.SavedObjectGraph
    return proto_cls.FromString(serialized)
def testFileDelete(self):
    """A file removed with ``delete_file`` no longer exists afterwards."""
    file_path = os.path.join(self._base_dir, "temp_file")
    # Close the writer explicitly so the content is flushed before the
    # delete, instead of relying on garbage collection of the handle.
    with file_io.FileIO(file_path, mode="w") as f:
        f.write("testing")
    file_io.delete_file(file_path)
    self.assertFalse(file_io.file_exists(file_path))