import datetime
import pickle
import gzip
import os
from os import path

import tool_set

# Path to the training-set files
dir_name = r''
model_save_path = r"model/"
if not os.path.isdir(model_save_path):
    os.makedirs(model_save_path)  # create the output directory for the model
model_fn = path.join(model_save_path, 'save_net.ckpt')

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; "data" contains features + labels
data = tool_set.read_and_decode(dir_name + "new.pkl")

isize = 10
img_channel = 3
img_pixel = isize

'''
# Complete CNN training script
'''
# Parameters
training_epochs = 200
batch_size = 128
display_step = 10
channels = img_channel
per_process_gpu_memory_fraction = 1
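# --- Hedged sketch (not the project's actual helper) ---------------------------
# tool_set.read_and_decode is project-local and not shown here. A minimal sketch of
# what it is assumed to do, given the call sites in this repo: load a pickled array
# whose rows are flattened pixel values followed by a label in the last column.
import numpy as np

def read_and_decode_sketch(pkl_path):
    """Load a pickled (features + label) array; a sketch, not tool_set's real code."""
    with open(pkl_path, 'rb') as f:
        data = pickle.load(f)
    return np.asarray(data)  # shape: (num_samples, isize * isize * channels + 1)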
def writeMNIST(sc, dir_name, img_pixel, channels, output, format, num_partitions):
    # Adapt this to your own data format as needed
    '''
    :remark Convert the original data into the required format and store it on HDFS
    :param sc: SparkContext (do not change)
    :param dir_name: input image directory, e.g. containing folders 0 and 1 (2 classes)
    :param img_pixel: image size (e.g. 28 for MNIST)
    :param channels: number of image bands (e.g. 1 for MNIST)
    :param output: output path on HDFS
    :param format: target data format
    :param num_partitions: actual number of image classes; MNIST has 10 classes, so 10
    :return: HDFS
    '''
    """Writes MNIST image/label vectors into parallelized files on HDFS"""
    '''
    # load MNIST gzip into memory
    with open(input_images, 'rb') as f:
        images = numpy.array(mnist.extract_images(f))  # extract all image data into a numpy array
    with open(input_labels, 'rb') as f:  # extract all label data into a numpy array
        if format == "csv2":  # data format
            labels = numpy.array(mnist.extract_labels(f, one_hot=False))  # array
        else:
            labels = numpy.array(mnist.extract_labels(f, one_hot=True))  # array
    '''
    # tool_set.create_pickle_train(dir_name, img_pixel, channels)
    # data = tool_set.read_and_decode(dir_name + "/train_data.pkl", img_pixel, channels)
    '''
    Read the images directly:
    data = tool_set.create_pickle_train(dir_name, img_pixel, channels)
    '''
    # Read the pickle
    data = tool_set.read_and_decode(dir_name, img_pixel, channels)  # images --> numpy array
    # data = create_pickle_train(dir_name, img_pixel, channels)  # (image + label)

    # Shuffle the data by rows
    index = [i for i in range(len(data))]  # len(data) gives the number of rows
    np.random.shuffle(index)               # shuffle the indices
    data = data[index]
    del index

    labels_dense = data[:, -1]  # take out the label column
    if format == "csv2":        # data format
        labels = labels_dense
    else:                       # convert to one-hot
        labels = tool_set.dense_to_one_hot2(labels_dense, num_partitions)
    del labels_dense

    images_ = data[:, 0:img_pixel * img_pixel * channels]
    images = images_.reshape((-1, img_pixel, img_pixel, channels))
    del data

    # Labels --> float, image data --> int.
    # These casts are required; otherwise training later fails with assorted errors
    # because the dtypes do not match what TensorFlowOnSpark expects.
    labels = labels.astype(np.float16)
    images = images.astype(np.uint8)

    # To use your own data, modify the two open() calls above so that your data ends
    # up as numpy arrays.
    shape = images.shape  # number of images x 28 x 28 x 1 (bands)
    print("images.shape: {0}".format(shape))          # 60000 x 28 x 28 for MNIST (28x28x1, classes 0-9)
    print("labels.shape: {0}".format(labels.shape))   # 60000 x 10

    # create RDDs of vectors
    imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2] * shape[3]), num_partitions)  # [-1, 28*28*1]
    # imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2] * nBands), num_partitions)  # nBands = number of bands
    labelRDD = sc.parallelize(labels, num_partitions)

    output_images = output + "/images"  # output path
    output_labels = output + "/labels"  # output path

    # save RDDs as specific format
    if format == "pickle":
        imageRDD.saveAsPickleFile(output_images)  # save as pickle files
        labelRDD.saveAsPickleFile(output_labels)
    elif format == "csv":
        imageRDD.map(toCSV).saveAsTextFile(output_images)  # convert to CSV, then save as text
        labelRDD.map(toCSV).saveAsTextFile(output_labels)
    elif format == "csv2":
        # image + label combined into a single text file
        imageRDD.map(toCSV).zip(labelRDD).map(lambda x: str(x[1]) + "|" + x[0]).saveAsTextFile(output)
    else:  # format == "tfr"
        tfRDD = imageRDD.zip(labelRDD).map(lambda x: (bytearray(toTFExample(x[0], x[1])), None))  # convert to .tfrecord
        # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar
        tfRDD.saveAsNewAPIHadoopFile(output, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
                                     keyClass="org.apache.hadoop.io.BytesWritable",
                                     valueClass="org.apache.hadoop.io.NullWritable")
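# --- Hedged usage sketch -------------------------------------------------------
# How writeMNIST might be invoked from a Spark driver. The app name, paths and
# argument values below are placeholders, not taken from the original project, and
# the toCSV / toTFExample helpers used above must be defined elsewhere in the real
# script (e.g. as in the TensorFlowOnSpark MNIST example).
if __name__ == "__main__":
    from pyspark import SparkConf, SparkContext
    conf = SparkConf().setAppName("write_train_data")
    sc = SparkContext(conf=conf)
    writeMNIST(sc,
               dir_name="train_data.pkl",                 # placeholder input pickle
               img_pixel=28,
               channels=1,
               output="hdfs:///user/example/mnist/csv2",  # placeholder output path
               format="csv2",
               num_partitions=10)
    sc.stop()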
import pyximport
pyximport.install()
import tool_set
import numpy as np
from datetime import datetime

if __name__ == "__main__":
    # path = "/home/wu/Water_extract/data/0_1.tif"
    # img = tool_set.Multiband2Array(path)
    # print(img.shape)
    # pass
    dir_name = "/home/wu/Water_extract/data/data/"
    start = datetime.now()
    tool_set.create_pickle_train(dir_name, 10, 4)
    # data0 = tool_set.read_and_decode(dir_name + 'train_data.pkl', 10, 4)
    # print(data0.shape)
    data1 = tool_set.read_and_decode(dir_name + 'train_data_1.pkl', 10, 4)
    print(data1.shape)
    # data = np.vstack((data0, data1))
    # print(data.shape)
    print(datetime.now() - start)
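# --- Hedged sketch (assumed behavior of tool_set.dense_to_one_hot2) ------------
# The real implementation lives in tool_set and is not shown here; this is only what
# the call sites above suggest it does: turn a 1-D array of class indices into a
# one-hot float matrix with num_classes columns.
def dense_to_one_hot2_sketch(labels_dense, num_classes):
    labels_dense = np.asarray(labels_dense, dtype=np.int64).ravel()
    one_hot = np.zeros((labels_dense.shape[0], num_classes), dtype=np.float32)
    one_hot[np.arange(labels_dense.shape[0]), labels_dense] = 1.0  # set the column for each label
    return one_hot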
# Path to the training-set files
dir_name = 'F:/water_detect/pkl/'
# The final model is saved under the "model" folder
model_save_path = "model/"

# Output file path
fpa_path = path.join(dir_name, 'train_output.txt')
fpa = open(fpa_path, "a")  # this file does not seem to be used -- by xjxf
# fpa.close()

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; "data" contains features + labels
data = tool_set.read_and_decode(dir_name + "train_data_64.pkl", 64)

isize = 9
img_channel = 4
img_pixel = isize

'''
# Complete CNN training script
'''
# Parameters
training_epochs = 500
batch_size = 920
display_step = 10
channels = img_channel

# Network Parameters
img_size = isize * isize * channels  # flattened data input (isize * isize * channels)
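# --- Hedged sketch -------------------------------------------------------------
# The script is assumed to define its TF 1.x input placeholders from img_size next;
# n_classes and keep_prob are assumptions, not values taken from the original file.
import tensorflow as tf

n_classes = 2  # assumed number of output classes (e.g. water / non-water)
x = tf.placeholder(tf.float32, [None, img_size])   # flattened image batch
y = tf.placeholder(tf.float32, [None, n_classes])  # one-hot labels
keep_prob = tf.placeholder(tf.float32)             # dropout keep probability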
# Define loss and optimizer
cost = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=pred))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Evaluate model
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize all ops
init = tf.global_variables_initializer()

if __name__ == '__main__':
    # Load the pickled data; "data" contains features + labels
    data = tool_set.read_and_decode(dir_name + "train_data.pkl", img_pixel, channels)
    # data_1 = tool_set.read_and_decode(dir_name_1 + "train_data.pkl", img_pixel, channels)
    # data = np.vstack((data, data_1))  # merge the two datasets by rows

    saver = tf.train.Saver()  # by default, saves all variables

    with tf.Session() as sess:
        sess.run(init)
        total_batch = int(img_nums / batch_size)
        for epoch in range(training_epochs):
            # At the start of each epoch, shuffle the data by rows
            index = [i for i in range(len(data))]  # len(data) gives the number of rows
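            # --- Hedged sketch: how this epoch loop is assumed to continue. The batch
            # --- slicing, n_classes and model_fn symbols are assumptions based on the
            # --- sibling scripts in this repo, not the original code of this file.
            np.random.shuffle(index)    # shuffle the row indices
            data = data[index]
            for i in range(total_batch):
                batch = data[i * batch_size:(i + 1) * batch_size]
                batch_xs = batch[:, 0:img_pixel * img_pixel * channels].astype(np.float32)
                batch_ys = tool_set.dense_to_one_hot2(batch[:, -1], n_classes)  # one-hot labels
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs, y: batch_ys})
            if epoch % display_step == 0:
                print("Epoch:", epoch, "cost =", c)
        saver.save(sess, model_fn)      # persist the trained variables (model_fn assumed)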
import datetime
import pickle
import gzip
import os
from os import path

import tool_set

# Path to the training-set files
dir_name = r''
model_save_path = r"model/"
if not os.path.isdir(model_save_path):
    os.makedirs(model_save_path)  # create the output directory for the model
model_fn = path.join(model_save_path, 'save_net.ckpt')  # stores the mask imagery

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; "data" contains features + labels
data = tool_set.read_and_decode(dir_name + "04_pool_50p.pkl")

isize = 2
img_channel = 4
img_pixel = isize

'''
# Complete CNN training script
'''
# Parameters
training_epochs = 50
batch_size = 128
display_step = 10
channels = img_channel

# Network Parameters
dir_name = ''
dir_summary_name = ''

# The final model is saved under the "model" folder
model_save_path = "model/"

# Output file path
fpa_path = path.join(dir_name, 'train_output.txt')
fpa = open(fpa_path, "a")  # this file does not seem to be used -- by xjxf
# fpa.close()

start_time = datetime.datetime.now()
print("startTime: ", start_time)

# Load the pickled data; "data" contains features + labels
data = tool_set.read_and_decode(dir_name + "train_data_400_all.pkl", 3)

isize = 400
img_channel = 3
img_pixel = isize

'''
# Complete CNN training script
'''
# Parameters
training_epochs = 2500
batch_size = 1
display_step = 1
channels = img_channel

# Network Parameters
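# --- Hedged sketch -------------------------------------------------------------
# dir_summary_name above suggests TensorBoard summaries are written during training;
# a minimal sketch of the TF 1.x summary pattern. The function name, scalar tag and
# usage are assumptions, not taken from the original script.
import tensorflow as tf

def make_summary_writer(summary_dir, graph, cost_tensor):
    tf.summary.scalar('cost', cost_tensor)      # log the loss value per step
    merged = tf.summary.merge_all()             # single op evaluating all summaries
    writer = tf.summary.FileWriter(summary_dir, graph)
    return merged, writer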