def ParseOther(self, baseDir, withMS=False):
  self.baseDir = baseDir
  pathDir = os.path.join(baseDir, "*.npy")
  files = glob.glob(pathDir)
  instanceCount = 0
  dataPb = deepnet_pb2.Dataset()
  for i, feature in enumerate(self.featureGroups):
    data = deepnet_pb2.Dataset.Data()
    data.name = feature + "_" + os.path.basename(baseDir)
    data.file_pattern = "*" + feature + "*.npy"
    if withMS:
      data.dimensions.extend(
          [self.featureGroupsIndex[i + 1] - self.featureGroupsIndex[i]])
    else:
      dimensions = 0
      for entry in self.featureGroupsDict[feature]:
        dimensions = dimensions + entry[1] - entry[0]
      data.dimensions.extend([dimensions])
    dataPb.data.extend([data])
  data = deepnet_pb2.Dataset.Data()
  data.name = "label_" + os.path.basename(baseDir)
  data.dimensions.extend([1])
  data.file_pattern = "*label.npy"
  dataPb.data.extend([data])

  MS = "withMS" if withMS else "withoutMS"
  outputProtoFile = os.path.join(baseDir, MS, "data_%s.pbtxt" % MS)
  dataPb.name = os.path.basename(baseDir) + "_" + MS
  dirPath = os.path.join(baseDir, MS)
  dataPb.prefix = dirPath
  # Make sure the output subdirectory exists before writing into it.
  if not os.path.isdir(dirPath):
    os.makedirs(dirPath)

  for fileEntry in files:
    tempData = np.load(fileEntry)
    # Skip arrays that are not the full (N, 17593) matrix:
    # 17592 feature columns plus one trailing label column.
    if len(tempData.shape) == 1 or tempData.shape[1] != 17593:
      continue
    instanceCount += tempData.shape[0]
    baseName = os.path.basename(fileEntry)
    fileName = os.path.join(dirPath, os.path.splitext(baseName)[0]) + "_" + MS
    np.save(fileName + '_label.npy', tempData[:, 17592])
    if withMS:
      for i, feature in enumerate(self.featureGroups):
        np.save(fileName + '_' + feature + "_withMS.npy",
                tempData[:, self.featureGroupsIndex[i]:
                         self.featureGroupsIndex[i + 1]])
    else:
      for feature in self.featureGroups:
        # Stitch together the (possibly non-contiguous) column ranges that
        # make up this feature group.
        tempTuple = self.featureGroupsDict[feature][0]
        tempArray = tempData[:, tempTuple[0]:tempTuple[1]]
        for i in range(1, len(self.featureGroupsDict[feature])):
          tempTuple = self.featureGroupsDict[feature][i]
          tempArray = np.concatenate(
              (tempArray, tempData[:, tempTuple[0]:tempTuple[1]]), axis=1)
        np.save(fileName + '_' + feature + "_withoutMS.npy", tempArray)

  for entry in dataPb.data:
    entry.size = instanceCount
  with open(outputProtoFile, 'w') as f:
    text_format.PrintMessage(dataPb, f)
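# Usage sketch: ParseOther is a method, so it runs on an instance of the
# parser class that defines featureGroups, featureGroupsIndex and
# featureGroupsDict; the class name below is a hypothetical stand-in.
#
#   parser = FeatureParser()
#   parser.ParseOther('/data/sessions/session01', withMS=True)
#   # writes per-feature .npy splits and data_withMS.pbtxt under
#   # /data/sessions/session01/withMS/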
def main():
  data_pbtxt = sys.argv[1]
  output_dir = sys.argv[2]
  prefix = sys.argv[3]
  r = int(sys.argv[4])
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
  rep_dict, stats_files = MakeDict(data_pbtxt)
  reps = rep_dict.keys()

  indices_file = os.path.join(prefix, 'splits', 'train_indices_%d.npy' % r)
  if os.path.exists(indices_file):
    train = np.load(indices_file)
    valid = np.load(os.path.join(prefix, 'splits', 'valid_indices_%d.npy' % r))
    test = np.load(os.path.join(prefix, 'splits', 'test_indices_%d.npy' % r))
  else:
    print 'Creating new split.'
    indices = np.arange(25000)
    np.random.shuffle(indices)
    train = indices[:10000]
    valid = indices[10000:15000]
    test = indices[15000:]
    np.save(os.path.join(prefix, 'splits', 'train_indices_%d.npy' % r), train)
    np.save(os.path.join(prefix, 'splits', 'valid_indices_%d.npy' % r), valid)
    np.save(os.path.join(prefix, 'splits', 'test_indices_%d.npy' % r), test)

  print 'Splitting data'
  dataset_pb = deepnet_pb2.Dataset()
  dataset_pb.name = 'flickr_split_%d' % r
  dataset_pb.gpu_memory = gpu_mem
  dataset_pb.main_memory = main_mem
  for rep in reps:
    data = rep_dict[rep]
    stats_file = stats_files[rep]
    DumpDataSplit(data[train], output_dir, 'train_%s' % rep, dataset_pb, stats_file)
    DumpDataSplit(data[valid], output_dir, 'valid_%s' % rep, dataset_pb, stats_file)
    DumpDataSplit(data[test], output_dir, 'test_%s' % rep, dataset_pb, stats_file)

  print 'Splitting labels'
  labels = np.load(os.path.join(prefix, 'labels.npy')).astype('float32')
  DumpLabelSplit(labels[train], output_dir, 'train_labels', dataset_pb)
  DumpLabelSplit(labels[valid], output_dir, 'valid_labels', dataset_pb)
  DumpLabelSplit(labels[test], output_dir, 'test_labels', dataset_pb)

  #d = 'indices'
  #np.save(os.path.join(output_dir, 'train_%s.npy' % d), train)
  #np.save(os.path.join(output_dir, 'valid_%s.npy' % d), valid)
  #np.save(os.path.join(output_dir, 'test_%s.npy' % d), test)

  with open(os.path.join(output_dir, 'data.pbtxt'), 'w') as f:
    text_format.PrintMessage(dataset_pb, f)
  print 'Output written in directory %s' % output_dir
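# Expected invocation (a sketch; the script name is hypothetical, the argument
# order follows the sys.argv parsing above): data pbtxt, output directory,
# prefix directory holding splits/ and labels.npy, split number, then GPU and
# main memory budgets.
#
#   python split_flickr.py data.pbtxt /path/out /path/flickr 0 2G 16G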
def withPbtxt(dbPbtxt, modality1, modality2, outputpath):
  datapb = util.ReadData(dbPbtxt)
  datasets = ["train", "validation", "test"]
  datapbNew = deepnet_pb2.Dataset()
  namePrefix = modality1 + "_" + modality2 + "_"
  datapbNew.prefix = outputpath
  datapbNew.name = namePrefix + "combined_input"
  for dataset in datasets:
    fileNames1 = []
    fileNames2 = []
    for dataEntry in datapb.data:
      if modality1 in dataEntry.name and dataset in dataEntry.name:
        fileNames1 = sorted(glob.glob(
            os.path.join(datapb.prefix, dataEntry.file_pattern)))
      if modality2 in dataEntry.name and dataset in dataEntry.name:
        fileNames2 = sorted(glob.glob(
            os.path.join(datapb.prefix, dataEntry.file_pattern)))
    for i, (file1, file2) in enumerate(zip(fileNames1, fileNames2)):
      data1 = np.load(file1)
      data2 = np.load(file2)
      dataCombined = np.concatenate((data1, data2), axis=1)
      if i == 0:
        data = dataCombined
      else:
        data = np.concatenate((data, dataCombined), axis=0)
    if not os.path.exists(os.path.join(outputpath, dataset)):
      os.makedirs(os.path.join(outputpath, dataset))
    np.save(os.path.join(outputpath, dataset, "data"), data)
    dataItem = deepnet_pb2.Dataset.Data()
    dataItem.name = namePrefix + "combined_" + dataset
    dataItem.dimensions.extend([data.shape[1]])
    dataItem.size = data.shape[0]
    dataItem.file_pattern = os.path.join(dataset, "data.npy")
    datapbNew.data.extend([dataItem])
  with open(os.path.join(outputpath, "input_data.pbtxt"), 'w') as f:
    text_format.PrintMessage(datapbNew, f)
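# Usage sketch: the modality arguments must be substrings of the Data entry
# names in the input pbtxt (e.g. 'image' and 'text'); the paths here are
# assumptions for illustration. Each modality is concatenated feature-wise
# (axis=1) per file pair, and file chunks are stacked instance-wise (axis=0).
#
#   withPbtxt('/data/flickr/data.pbtxt', 'image', 'text', '/data/combined')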
def ParsePerson(self, baseDir, ne=True, withMS=False):
  self.baseDir = baseDir
  for person in self.subPersonDir:
    instanceCount = 0
    dataPb = deepnet_pb2.Dataset()
    for i, feature in enumerate(self.featureGroups):
      data = deepnet_pb2.Dataset.Data()
      data.name = person + "_" + feature
      data.file_pattern = "*" + feature + ".npy"
      if withMS:
        data.dimensions.extend(
            [self.featureGroupsIndex[i + 1] - self.featureGroupsIndex[i]])
      else:
        dimensions = 0
        for entry in self.featureGroupsDict[feature]:
          dimensions = dimensions + entry[1] - entry[0]
        data.dimensions.extend([dimensions])
      dataPb.data.extend([data])
    data = deepnet_pb2.Dataset.Data()
    data.name = person + "_label"
    data.dimensions.extend([1])
    data.file_pattern = "*label.npy"
    dataPb.data.extend([data])
    dataPb.prefix = os.path.join(self.baseDir, person)
    if withMS:
      dataPb.name = os.path.basename(baseDir) + "withMS"
      outputProtoFile = os.path.join(baseDir, 'data_withMS.pbtxt')
    else:
      dataPb.name = os.path.basename(baseDir) + "withoutMS"
      outputProtoFile = os.path.join(baseDir, 'data_withoutMS.pbtxt')

    if ne:
      filePath = os.path.join(self.baseDir, person, "*.npy")
      files = glob.glob(filePath)
      for fileEntry in files:
        tempData = np.load(fileEntry)
        # 17592 feature columns plus one trailing label column.
        assert tempData.shape[1] == 17593
        instanceCount += tempData.shape[0]
        fileName = os.path.splitext(fileEntry)[0]
        if withMS:
          for i, feature in enumerate(self.featureGroups):
            np.save(fileName + '_' + feature + "_withMS.npy",
                    tempData[:, self.featureGroupsIndex[i]:
                             self.featureGroupsIndex[i + 1]])
        else:
          for feature in self.featureGroups:
            tempTuple = self.featureGroupsDict[feature][0]
            tempArray = tempData[:, tempTuple[0]:tempTuple[1]]
            for i in range(1, len(self.featureGroupsDict[feature])):
              tempTuple = self.featureGroupsDict[feature][i]
              tempArray = np.concatenate(
                  (tempArray, tempData[:, tempTuple[0]:tempTuple[1]]), axis=1)
            np.save(fileName + '_' + feature + "_withoutMS.npy", tempArray)
        np.save(fileName + '_label.npy', tempData[:, 17592])
    else:
      for fType in self.subTypeDir:
        filePath = os.path.join(self.baseDir, person, fType, "*.npy")
        files = glob.glob(filePath)
        for fileEntry in files:
          tempData = np.load(fileEntry)
          assert tempData.shape[1] == 17593
          instanceCount += tempData.shape[0]
          baseName = os.path.splitext(os.path.basename(fileEntry))[0]
          fileName = os.path.join(self.baseDir, person, baseName)
          if withMS:
            for i, feature in enumerate(self.featureGroups):
              np.save(fileName + '_' + feature + "_withMS.npy",
                      tempData[:, self.featureGroupsIndex[i]:
                               self.featureGroupsIndex[i + 1]])
          else:
            for feature in self.featureGroups:
              tempTuple = self.featureGroupsDict[feature][0]
              tempArray = tempData[:, tempTuple[0]:tempTuple[1]]
              for i in range(1, len(self.featureGroupsDict[feature])):
                tempTuple = self.featureGroupsDict[feature][i]
                tempArray = np.concatenate(
                    (tempArray, tempData[:, tempTuple[0]:tempTuple[1]]),
                    axis=1)
              np.save(fileName + '_' + feature + "_withoutMS.npy", tempArray)
          np.save(fileName + '_label.npy', tempData[:, 17592])

    for entry in dataPb.data:
      entry.size = instanceCount
    with open(outputProtoFile, 'w') as f:
      text_format.PrintMessage(dataPb, f)
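# Usage sketch (hypothetical class name, as above). ne=True reads the .npy
# files directly under each person directory; ne=False walks the sub-type
# directories listed in subTypeDir instead:
#
#   parser = FeatureParser()
#   parser.ParsePerson('/data/persons', ne=False, withMS=False)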
def main():
  model_file = sys.argv[1]
  base_output_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  prefix = sys.argv[4]
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]

  model = util.ReadModel(model_file)
  data_pb = deepnet_pb2.Dataset()
  data_pb.name = model.name
  data_pb.gpu_memory = gpu_mem
  data_pb.main_memory = main_mem
  output_dir = os.path.join(base_output_dir, 'validation')
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
  output_proto_file = os.path.join(base_output_dir, 'data.pbtxt')

  # IMAGE PATHWAY
  img_input_pbtxt = os.path.join(prefix, 'flickr.pbtxt')
  img_hidden1_pbtxt = os.path.join(rep_dir, 'image_rbm1_LAST', 'data.pbtxt')
  img_hidden2_pbtxt = os.path.join(rep_dir, 'image_rbm2_LAST', 'data.pbtxt')

  # TEXT PATHWAY
  text_input_pbtxt = os.path.join(prefix, 'flickr_nnz.pbtxt')
  text_hidden1_pbtxt = os.path.join(rep_dir, 'text_rbm1_LAST', 'data.pbtxt')
  text_hidden2_pbtxt = os.path.join(rep_dir, 'text_rbm2_LAST', 'data.pbtxt')
  text_pbtxt_z = os.path.join(rep_dir, 'generated_text', 'data.pbtxt')

  joint_pbtxt = os.path.join(rep_dir, 'joint_rbm_LAST', 'data.pbtxt')

  img_input_pb = util.ReadData(img_input_pbtxt)
  data = next(d for d in img_input_pb.data if d.name == 'image_labelled')
  data.file_pattern = os.path.join(img_input_pb.prefix, data.file_pattern)
  data.stats_file = os.path.join(img_input_pb.prefix, data.stats_file)
  data.name = 'image_input'
  data_pb.data.extend([data])

  img_hidden1_pb = util.ReadData(img_hidden1_pbtxt)
  data = next(d for d in img_hidden1_pb.data
              if d.name == 'image_hidden1_validation')
  data.file_pattern = os.path.join(img_hidden1_pb.prefix, data.file_pattern)
  data.name = 'image_hidden1'
  data_pb.data.extend([data])

  img_hidden2_pb = util.ReadData(img_hidden2_pbtxt)
  data = next(d for d in img_hidden2_pb.data
              if d.name == 'image_hidden2_validation')
  data.file_pattern = os.path.join(img_hidden2_pb.prefix, data.file_pattern)
  data.name = 'image_hidden2'
  data_pb.data.extend([data])

  # The labelled text set was partitioned into rows with at least one nonzero
  # word count (nnz) and all-zero rows (z); the saved index arrays let the two
  # partitions be merged back into the original row order.
  indices_file = os.path.join(prefix, 'text', 'indices_labelled.npz')
  indices = np.load(indices_file)
  nnz_indices = indices['nnz_indices']
  z_indices = indices['z_indices']
  text_pb_z = util.ReadData(text_pbtxt_z)

  text_input_pb = util.ReadData(text_input_pbtxt)
  data_nnz = next(d for d in text_input_pb.data if d.name == 'text_labelled')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'text_input_layer_validation')
  output_file = os.path.join(output_dir, 'text_input-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               text_input_pb.prefix, 'text_input', output_file)
  data_pb.data.extend([data])

  text_hidden1_pb = util.ReadData(text_hidden1_pbtxt)
  data_nnz = next(d for d in text_hidden1_pb.data
                  if d.name == 'text_hidden1_validation')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'text_hidden1_validation')
  output_file = os.path.join(output_dir, 'text_hidden1-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               text_hidden1_pb.prefix, 'text_hidden1', output_file)
  data_pb.data.extend([data])

  text_hidden2_pb = util.ReadData(text_hidden2_pbtxt)
  data_nnz = next(d for d in text_hidden2_pb.data
                  if d.name == 'text_hidden2_validation')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'text_hidden2_validation')
  output_file = os.path.join(output_dir, 'text_hidden2-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               text_hidden2_pb.prefix, 'text_hidden2', output_file)
  data_pb.data.extend([data])

  joint_pb = util.ReadData(joint_pbtxt)
  data_nnz = next(d for d in joint_pb.data
                  if d.name == 'joint_hidden_validation')
  data_z = next(d for d in text_pb_z.data
                if d.name == 'joint_hidden_validation')
  output_file = os.path.join(output_dir, 'joint_hidden-00001-of-00001.npy')
  data = Merge(data_nnz, data_z, nnz_indices, z_indices, text_pb_z.prefix,
               joint_pb.prefix, 'joint_hidden', output_file)
  data_pb.data.extend([data])

  with open(output_proto_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
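# A minimal sketch of the assumed semantics of Merge (inferred from its call
# sites above, not the repo's actual implementation): rows from the nnz and z
# partitions are scattered back to their original positions using the saved
# index arrays. All names below are hypothetical.
#
#   merged = np.zeros((len(nnz_indices) + len(z_indices), dim),
#                     dtype='float32')
#   merged[nnz_indices] = data_nnz_array
#   merged[z_indices] = data_z_array
#   np.save(output_file, merged)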
def main():
  model_file = sys.argv[1]
  base_output_dir = sys.argv[2]
  rep_dir = sys.argv[3]
  prefix = sys.argv[4]
  gpu_mem = sys.argv[5]
  main_mem = sys.argv[6]

  model = util.ReadModel(model_file)
  data_pb = deepnet_pb2.Dataset()
  data_pb.name = model.name
  data_pb.gpu_memory = gpu_mem
  data_pb.main_memory = main_mem
  output_dir = os.path.join(base_output_dir, 'validation')
  if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
  output_proto_file = os.path.join(base_output_dir, 'data.pbtxt')

  # RNA1seq PATHWAY (plays the role of the image pathway in the flickr script)
  img_input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')
  img_hidden1_pbtxt = os.path.join(rep_dir, 'RNA1seq_rbm1_LAST', 'data.pbtxt')
  #img_hidden2_pbtxt = os.path.join(rep_dir, 'RNA1seq_rbm2_LAST', 'data.pbtxt')

  # RNA2seq PATHWAY (plays the role of the text pathway)
  text_input_pbtxt = os.path.join(prefix, 'RNAseq.pbtxt')
  text_hidden1_pbtxt = os.path.join(rep_dir, 'RNA2seq_rbm1_LAST', 'data.pbtxt')
  #text_hidden2_pbtxt = os.path.join(rep_dir, 'RNA2seq_rbm2_LAST', 'data.pbtxt')
  #text_pbtxt_z = os.path.join(rep_dir, 'generated_text', 'data.pbtxt')

  joint_pbtxt = os.path.join(rep_dir, 'joint_rbm_LAST', 'data.pbtxt')
  joint2_pbtxt = os.path.join(rep_dir, 'joint_rbm2_LAST', 'data.pbtxt')

  img_input_pb = util.ReadData(img_input_pbtxt)
  data = next(d for d in img_input_pb.data if d.name == 'RNA1seq_train')
  data.file_pattern = os.path.join(img_input_pb.prefix, data.file_pattern)
  #data.stats_file = os.path.join(img_input_pb.prefix, data.stats_file)
  data.name = 'RNA1seq_input'
  data_pb.data.extend([data])

  img_hidden1_pb = util.ReadData(img_hidden1_pbtxt)
  data = next(d for d in img_hidden1_pb.data
              if d.name == 'RNA1seq_hidden1_train')
  data.file_pattern = os.path.join(img_hidden1_pb.prefix, data.file_pattern)
  data.name = 'RNA1seq_hidden1'
  data_pb.data.extend([data])

  #img_hidden2_pb = util.ReadData(img_hidden2_pbtxt)
  #data = next(d for d in img_hidden2_pb.data if d.name == 'RNA1seq_hidden2_train')
  #data.file_pattern = os.path.join(img_hidden2_pb.prefix, data.file_pattern)
  #data.name = 'RNA1seq_hidden2'
  #data_pb.data.extend([data])

  #indices_file = os.path.join(prefix, 'text', 'indices_labelled.npz')
  #indices = np.load(indices_file)
  #nnz_indices = indices['nnz_indices']
  #z_indices = indices['z_indices']
  #text_pb_z = util.ReadData(text_pbtxt_z)

  text_input_pb = util.ReadData(text_input_pbtxt)
  data = next(d for d in text_input_pb.data if d.name == 'RNA2seq_train')
  data.file_pattern = os.path.join(text_input_pb.prefix, data.file_pattern)
  data.name = 'RNA2seq_input'
  data_pb.data.extend([data])

  text_hidden1_pb = util.ReadData(text_hidden1_pbtxt)
  data = next(d for d in text_hidden1_pb.data
              if d.name == 'RNA2seq_hidden1_train')
  data.file_pattern = os.path.join(text_hidden1_pb.prefix, data.file_pattern)
  data.name = 'RNA2seq_hidden1'
  data_pb.data.extend([data])

  #text_hidden2_pb = util.ReadData(text_hidden2_pbtxt)
  #data = next(d for d in text_hidden2_pb.data if d.name == 'RNA2seq_hidden2_train')
  #data.file_pattern = os.path.join(text_hidden2_pb.prefix, data.file_pattern)
  #data.name = 'RNA2seq_hidden2'
  #data_pb.data.extend([data])

  joint_pb = util.ReadData(joint_pbtxt)
  data = next(d for d in joint_pb.data if d.name == 'joint_hidden_train')
  data.file_pattern = os.path.join(joint_pb.prefix, data.file_pattern)
  data.name = 'joint_hidden'
  data_pb.data.extend([data])

  joint2_pb = util.ReadData(joint2_pbtxt)
  data = next(d for d in joint2_pb.data if d.name == 'joint_hidden2_train')
  data.file_pattern = os.path.join(joint2_pb.prefix, data.file_pattern)
  data.name = 'joint_hidden2'
  data_pb.data.extend([data])

  with open(output_proto_file, 'w') as f:
    text_format.PrintMessage(data_pb, f)
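# Expected invocation mirrors the flickr variant above (script name
# hypothetical; arguments follow the sys.argv parsing):
#
#   python collect_rnaseq_reps.py model.pbtxt /path/out /path/reps /path/prefix 2G 16G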