def torch_read_tfrecord(image_path, part_idx=0):
    """Stream one TFRecord shard and print the labels of every batch.

    Expects ``<image_path>_part_<part_idx>.tfrecord`` with a matching
    ``.idx`` index file next to it.
    """
    shard_stem = f"{image_path}_part_{part_idx}"
    tfrecord_path = shard_stem + ".tfrecord"
    index_path = shard_stem + ".idx"
    # index_path = None
    schema = {
        "image": "byte",
        "label": "int",
        "index": "int",
        "name": "byte",
    }
    n_batch = 1
    n_workers = 6
    dataset = TFRecordDataset(tfrecord_path,
                              index_path,
                              schema,
                              shuffle_queue_size=n_batch * n_workers,
                              transform=decode_image)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=n_batch,
                                         shuffle=False,
                                         pin_memory=False,
                                         drop_last=False,
                                         num_workers=n_workers)
    # 1-based batch counter, printed alongside each batch's labels.
    for counter, data in enumerate(tqdm(loader), start=1):
        print("data", counter, len(data["label"]), data["label"])
        # print(data["label"],data["image"])
def tfrec_extract(filename):
    """Decode one ``.tfrec`` shard: write each image to disk as ``.jpg`` and
    append the remaining per-record fields as a new row of the module-level
    ``train`` / ``test`` dataframes (row counters are also module globals).
    """
    global train_row
    global test_row
    global train
    global test
    tfrecord_path = os.path.join(tfrec_dir, filename)
    index_path = tfrecord_path.replace('.tfrec', '.index')
    is_train = 'train' in filename
    savedir = train_dir if is_train else test_dir
    dataset = TFRecordDataset(tfrecord_path, index_path, transform=decode_image)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
    tfrec_stem = filename.replace('.tfrec', '')
    for data in T(loader):
        # print(len(loader))
        if is_train:
            train_row += 1
        else:
            test_row += 1
        # Pick the dataframe/row this record belongs to.
        df, row = (train, train_row) if is_train else (test, test_row)
        # 'image_name' arrives as a tensor of character codes; rebuild the string.
        name_codes = data['image_name'].squeeze().data.cpu().numpy().copy()
        img_name = os.path.join(savedir, ''.join(map(chr, name_codes))) + '.jpg'
        cv2.imwrite(img_name, data['image'].squeeze().data.cpu().numpy())
        # Image payloads are written to disk, not the dataframe.
        del data['image']
        del data['image_name']
        for k, v in data.items():
            df.loc[row, 'image_name'] = img_name
            df.loc[row, k] = v.squeeze().data.cpu().numpy()
            df.loc[row, 'tfrec'] = tfrec_stem
def __init__(self):
    """Open the training TFRecord (no index file) and wrap it in a
    single-sample DataLoader iterator that can be advanced with next()."""
    train_cfg = cfg['datasets']['train']
    self.HR_size = train_cfg['HR_size']
    # self.batch_size = cfg['datasets']['train']['batch_size']
    self.dataset = TFRecordDataset(train_cfg['tfrecord_path'], None)
    self.loader = iter(torch.utils.data.DataLoader(self.dataset, batch_size=1))
def __init__(self):
    """Open the training TFRecord plus a recursive listing of mask PNGs,
    and expose a single-sample DataLoader iterator."""
    train_cfg = cfg['datasets']['train']
    self.mask_dir = train_cfg['masks']
    # All .png files anywhere under the mask directory.
    self.mask_files = glob.glob(self.mask_dir + '/**/*.png', recursive=True)
    self.HR_size = train_cfg['HR_size']
    # self.batch_size = cfg['datasets']['train']['batch_size']
    self.dataset = TFRecordDataset(train_cfg['tfrecord_path'], None)
    self.loader = iter(torch.utils.data.DataLoader(self.dataset, batch_size=1))
def main():
    """Train the AE autoencoder on volumes read from a TFRecord file.

    Reads the "vol_raw" byte feature from ``tfrecord_path`` (module global),
    feeds it through ``input_pipeline``, and minimises the MSE reconstruction
    error between the decoder output and the network input.

    Fixes vs. the original: the loss was never computed (``loss = 0``), so
    ``loss.item()`` raised AttributeError on the first logging step, and
    ``optimizer.step()`` ran without any gradients (``loss.backward()`` was
    commented out), making the loop a no-op.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    auto_encoder = AE().to(device)
    optimizer = torch.optim.Adam(auto_encoder.parameters(), lr=LR, weight_decay=0.01)
    loss_func = nn.MSELoss()

    # load input data — no index file is available for this record
    description = {"vol_raw": "byte"}
    test_dataset = TFRecordDataset(tfrecord_path, index_path=None, description=description)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size)

    for epoch in range(n_epochs):
        for step, batch in enumerate(test_loader):
            # ===================== forward =========================
            input_x = input_pipeline(batch['vol_raw'].float())
            encoder_out, decoder_out = auto_encoder(input_x)
            decoder_out = decoder_out.float()
            # Autoencoder objective: reconstruct the pipeline input.
            loss = loss_func(decoder_out, input_x.float())
            # ===================== backward ========================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step % 100 == 0:
                print('Epoch:{}, Train_loss:{:.8f}'.format(epoch, loss.item()))
def load_dataset(transform=None, filename="zoom_blur_1"):
    """Open one ImageNet-C corruption shard as a TFRecordDataset.

    Args:
        transform: optional per-record transform passed to TFRecordDataset.
        filename: shard basename under ``imagenetc_path`` (module global);
            defaults to the previously hard-coded "zoom_blur_1", so existing
            callers are unaffected.

    Returns:
        A TFRecordDataset over ``<imagenetc_path><filename>.tfrecords`` using
        its matching ``.tfrecords_index`` file.
    """
    return TFRecordDataset(
        data_path=imagenetc_path + filename + '.tfrecords',
        index_path=imagenetc_path + filename + '.tfrecords_index',
        description={
            'height': 'int',
            'width': 'int',
            'depth': 'int',
            'corruption_type': 'byte',
            'severity_level': 'int',
            'class_label': 'byte',
            'image_raw': 'byte',
        },
        transform=transform)
def __init__(self, tfrecord_path, description=None, index_path=None, batch_size=16, transform_fn=None):
    """Wrap a TFRecord file in a dataset plus a batched DataLoader.

    When no feature description is supplied, a default image/label schema
    is assumed.
    """
    schema = description if description is not None else {"image": "byte", "label": "float"}
    self.dataset = TFRecordDataset(tfrecord_path, index_path, schema, transform=transform_fn)
    self.loader = torch.utils.data.DataLoader(self.dataset, batch_size=batch_size)
def viz(model_name, tf_records_path, record_num, word_num, table_num, pred_thresh):
    """Visualise table-structure predictions for one table of one TFRecord.

    Loads the named VexMoutNet checkpoint, reads the ``record_num``-th file
    from the given data directory, pulls the ``table_num``-th batch out of
    it, runs the model and renders cell / row / column adjacency overlays.

    Returns:
        (img_cells, img_rows, img_cols) as produced by ``visualize``.

    NOTE(review): model and data paths are hard-coded absolute Windows
    paths — this only runs on that machine as-is.
    """
    model_path = "C:/Users/Jesper/Desktop/TableRecognition/Table_Detection_and_Recognition/Table_Recognition/models/{}/model.pt".format(
        model_name)
    model = VexMoutNet()
    model.load_state_dict(torch.load(model_path))
    model.eval()
    path = "C:/Users/Jesper/Desktop/TableRecognition/Table_Detection_and_Recognition/Table_Recognition/Data/{}".format(
        tf_records_path)
    files = os.listdir(path)
    # Select one shard by position in the (unsorted) directory listing.
    record = files[record_num]
    device = torch.device("cpu")
    batch_size = 1
    # variables for tfrecord loader
    index_path = None
    tfrecord_description = {
        "imgs": "float",
        "num_words": "int",
        "vertex_features": "float",
        "adjacency_matrix_cells": "int",
        "adjacency_matrix_cols": "int",
        "adjacency_matrix_rows": "int",
        "num_edges": 'int',
        "edge_indexes": 'int'
    }
    tfrecord_path = os.path.join(path, record)
    dataset = TFRecordDataset(tfrecord_path, index_path, tfrecord_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
    # Walk the loader until the requested table; if table_num exceeds the
    # number of batches, the LAST batch is silently used instead.
    for idx, tmp in enumerate(loader):
        batch = tmp
        if idx == table_num:
            break
    data_dict = tfrecord_preparer(batch, device=device, batch_size=batch_size)
    preds_dict = model(data_dict, device, 0.5)
    # Each prediction is a flat vector over word pairs; fold it back into a
    # square matrix and squash logits to probabilities.
    for k, v in preds_dict.items():
        reshap = int(math.sqrt(v.shape[0]))
        preds_dict[k] = torch.sigmoid(v.reshape(reshap, reshap))
    img_cells = visualize(word_num, data_dict, preds_dict, pred_thresh, 'cells')
    img_rows = visualize(word_num, data_dict, preds_dict, pred_thresh, 'rows')
    img_cols = visualize(word_num, data_dict, preds_dict, pred_thresh, 'cols')
    return img_cells, img_rows, img_cols
def __getitem__(self, idx):
    """Load the ``n``-th record of the TFRecord shard mapped to ``idx``.

    Returns:
        (img, label) where img is the transformed float32 image scaled to
        [0, 1] at ``self.n_bits_x`` bit depth.

    Raises:
        IndexError: if the shard holds fewer than ``n + 1`` records (same
        contract as the original ``list(dataset)[n]``).
    """
    tfr_file, n = self.data_index[idx]
    # load tfrecord; the index file lives next to it with an 'index' suffix.
    # NOTE(review): str.replace swaps EVERY 'tfrecords' occurrence, including
    # ones inside directory names — confirm paths never contain it twice.
    dataset = TFRecordDataset(tfr_file, tfr_file.replace('tfrecords', 'index'))
    # Iterate lazily to the n-th record instead of materialising the whole
    # shard with list(dataset)[n] — the old form was O(shard) time AND memory
    # for every single __getitem__ call.
    for pos, record in enumerate(dataset):
        if pos == n:
            single_data = record
            break
    else:
        raise IndexError(f"record {n} out of range for {tfr_file}")
    img = single_data['data'].reshape(
        single_data['shape']).astype('float32')
    # normalize image to [0, 1]
    img = (img / 2**(8 - self.n_bits_x)).round() / (2.**self.n_bits_x)
    # apply transformation
    img = self.transform(img)
    # return img
    return img, single_data['label'][0]
def main(dataset, split, tfr_path, lmdb_path): assert split in {'train', 'validation'} # create target directory if not os.path.exists(lmdb_path): os.makedirs(lmdb_path, exist_ok=True) if dataset == 'celeba' and split in {'train', 'validation'}: num_shards = {'train': 120, 'validation': 40}[split] lmdb_path = os.path.join(lmdb_path, '%s.lmdb' % split) tfrecord_path_template = os.path.join( tfr_path, '%s/%s-r08-s-%04d-of-%04d.tfrecords') elif dataset == 'imagenet-oord_32': num_shards = {'train': 2000, 'validation': 80}[split] # imagenet_oord_lmdb_path += '_32' lmdb_path = os.path.join(lmdb_path, '%s.lmdb' % split) tfrecord_path_template = os.path.join( tfr_path, '%s/%s-r05-s-%04d-of-%04d.tfrecords') elif dataset == 'imagenet-oord_64': num_shards = {'train': 2000, 'validation': 80}[split] # imagenet_oord_lmdb_path += '_64' lmdb_path = os.path.join(lmdb_path, '%s.lmdb' % split) tfrecord_path_template = os.path.join( tfr_path, '%s/%s-r06-s-%04d-of-%04d.tfrecords') else: raise NotImplementedError # create lmdb env = lmdb.open(lmdb_path, map_size=1e12) count = 0 with env.begin(write=True) as txn: for tf_ind in range(num_shards): # read tf_record tfrecord_path = tfrecord_path_template % (split, split, tf_ind, num_shards) index_path = None description = {'shape': 'int', 'data': 'byte', 'label': 'int'} dataset = TFRecordDataset(tfrecord_path, index_path, description) loader = torch.utils.data.DataLoader(dataset, batch_size=1) # put the data in lmdb for data in loader: im = data['data'][0].cpu().numpy() txn.put(str(count).encode(), im) count += 1 if count % 100 == 0: print(count) print('added %d items to the LMDB dataset.' % count)
def get_train_loader(conf, worker_rank, use_hdf5=True, use_tfrecord=False):
    """Build the training DataLoader for one worker from HDF5, TFRecord or
    a plain image folder, in that order of precedence.

    Args:
        conf: config object — reads batch_size, pin_memory, num_workers and
            the dataset folder paths from it.
        worker_rank: shard index baked into the per-worker file name.
        use_hdf5: read ``datasets_miracle_v2_part<rank>.hdf5`` (144752 classes).
        use_tfrecord: read ``100WID_part_<rank>.tfrecord``/``.idx``
            (1013232 classes).

    Returns:
        (loader, class_num, train_sampler) — train_sampler is None except in
        the image-folder branch, which uses a DistributedSampler.
    """
    if use_hdf5:
        class_num = 144752
        # class_num = 1013232
        train_sampler = None
        hdf5file = str(conf.ms1m_folder) + '/datasets_miracle_v2_part' + str(worker_rank) + '.hdf5'
        # hdf5file = '/sdd_data/100WID_part' + str(worker_rank) + '.hdf5'
        print("using hdf5 file,", hdf5file, ",DataLoader with multi process")
        # datah5=dataset_h5(hdf5file)
        datah5 = dataset_h5_concurrency(hdf5file)
        # loader =DataLoader(dataset=datah5, batch_size=conf.batch_size, shuffle=False, pin_memory=conf.pin_memory,
        #                    num_workers=0)
        loader = DataLoader(dataset=datah5, batch_size=conf.batch_size, shuffle=True, pin_memory=conf.pin_memory,
                            num_workers=conf.num_workers)
    elif use_tfrecord:
        # class_num=144752
        # tfrecord_path =str(conf.ms1m_folder) + '/datasets_miracle_v2_part_' + str(worker_rank) + '.tfrecord'
        # index_path = str(conf.ms1m_folder) + '/datasets_miracle_v2_part_' + str(worker_rank) + '.idx'
        # tfrecord_path = str(conf.ms1m_folder) + '/100WID_part_' + str(worker_rank) + '.tfrecord'
        # index_path = str(conf.ms1m_folder) + '/100WID_part_' + str(worker_rank) + '.idx'
        tfrecord_path = str(conf.data_path) + '/100WID_part_' + str(worker_rank) + '.tfrecord'
        index_path = str(conf.data_path) + '/100WID_part_' + str(worker_rank) + '.idx'
        class_num = 1013232
        train_sampler = None
        print("use tfrecord file", tfrecord_path)
        description = {"image": "byte", "label": "int", "index": "int", "name": "byte"}
        # Shuffle queue sized at 10x (batch * workers) — TFRecordDataset has no
        # global shuffle, only this bounded reservoir.
        dataset = TFRecordDataset(tfrecord_path, index_path, description,
                                  shuffle_queue_size=conf.batch_size * conf.num_workers * 10,
                                  transform=decode_image)
        # dataset = TFRecordDataset(tfrecord_path, index_path, description, transform=decode_image)
        # loader = torch.utils.data.DataLoader(dataset, batch_size=conf.batch_size ,shuffle=False, pin_memory=conf.pin_memory,
        #                                      num_workers=conf.num_workers,drop_last=False,sampler=train_sampler)
        loader = DataLoaderX(dataset, batch_size=conf.batch_size, shuffle=False, pin_memory=conf.pin_memory,
                             num_workers=conf.num_workers, drop_last=False, sampler=train_sampler)
    else:
        print("use data folder")
        # ds, class_num = get_train_dataset(conf.ms1m_folder / 'datasets_miracle_v2')
        ds, class_num = get_train_dataset(conf.data_path / '100WID')
        train_sampler = torch.utils.data.distributed.DistributedSampler(ds)
        # shuffle must be off whenever a sampler is supplied.
        loader = DataLoader(ds, batch_size=conf.batch_size, shuffle=(train_sampler is None), pin_memory=conf.pin_memory,
                            num_workers=0, drop_last=False, sampler=train_sampler)
        # loader = DataLoader(ds, batch_size=conf.batch_size, shuffle=(train_sampler is None), pin_memory=conf.pin_memory,
        #                     num_workers=conf.num_workers, drop_last=False, sampler=train_sampler)
        print('ms1m ImageFlod generated:class ', class_num)
    print("check Data class num:", class_num)
    return loader, class_num, train_sampler
def build_loader(self):
    """Create ``self.sample_loader`` over this object's TFRecord shard.

    Uses the instance's tfrecord/index paths, batch size and worker count;
    shuffling happens only inside TFRecordDataset's bounded queue, so the
    DataLoader itself runs unshuffled with no sampler.
    """
    train_sampler = None
    print("read tfrecord file:", self.tfrecord_path)
    feature_schema = {
        "image": "byte",
        "label": "int",
        "index": "int",
        "name": "byte",
    }
    queue_size = self.batch_size * self.num_workers * 10
    record_ds = TFRecordDataset(self.tfrecord_path,
                                self.index_path,
                                feature_schema,
                                shuffle_queue_size=queue_size,
                                transform=decode_image)
    self.sample_loader = DataLoaderX(record_ds,
                                     batch_size=self.batch_size,
                                     shuffle=False,
                                     pin_memory=self.pin_memory,
                                     num_workers=self.num_workers,
                                     drop_last=False,
                                     sampler=train_sampler)
####################################################################################################### for epoch in range(num_epochs): train_loss = 0 val_loss = 0 ct_train = 0 ct_val = 0 model.train() #load filenames of folder: tfrecord_files = os.listdir(Train_path) loop = tqdm(enumerate(tfrecord_files), total=len(tfrecord_files)) for idx, record in loop: tfrecord_path = os.path.join(Train_path, record) dataset = TFRecordDataset(tfrecord_path, config.index_path, config.tfrecord_description) loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size) for batch in loader: ct_train += 1 data_dict = tfrecord_preparer(batch, device=device, batch_size=batch_size) optimizer.zero_grad() loss_cells, loss_cols, loss_rows, stat_dict = model( data_dict, device, prediction_thres) total_loss = loss_cells + loss_cols + loss_rows
# Move the ALBERT pretraining model to GPU and prepare optimizer + data.
albert_pretrain.cuda()
print(albert_pretrain.device)
# Create optimizer — single param group over every named parameter.
optimizer = Lamb([{
    "params": [p for n, p in list(albert_pretrain.named_parameters())]
}], lr=LEARNING_RATE)
# FP16 — NVIDIA apex mixed precision, O2 ("almost FP16") mode.
albert_pretrain, optimizer = amp.initialize(albert_pretrain, optimizer, opt_level="O2")
albert_pretrain.train()
# Pretraining examples come from a TFRecord file (no index — sequential read);
# drop_last keeps every batch exactly BATCH_SIZE.
dataset = TFRecordDataset(pretrain_file, index_path=None, description=feat_map)
loader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, drop_last=True)
tmp_loss = 0
start_time = time.time()
# Resume from a local checkpoint file when present.
# NOTE(review): amp state is not restored here — resumed FP16 loss scaling
# restarts from scratch; confirm that's intended.
if os.path.isfile('pretrain_checkpoint'):
    print(f"--- Load from checkpoint ---")
    checkpoint = torch.load("pretrain_checkpoint")
    albert_pretrain.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    losses = checkpoint['losses']
from grid import *

# Experiment switches and constants for imbalanced CIFAR-100 training.
MIXUP = False
GRIDMASK = False
NUMLABEL = 100
EPOCHS = 200
# SET GPU ID 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# CONFIGURE DATA — hard-coded paths to the imbalance-factor-0.05 split.
tfrecord_train = "/home/lizhaochen/FYP/data/cifar-100-data-im-0.05/train.tfrecords"
tfrecord_val = "/home/lizhaochen/FYP/data/cifar-100-data-im-0.05/eval.tfrecords"
index_path = None
description = {"image": "byte", "label": "int"}
train_dataset = TFRecordDataset(tfrecord_train, index_path, description)
val_dataset = TFRecordDataset(tfrecord_val, index_path, description)
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=512)
valloader = torch.utils.data.DataLoader(val_dataset, batch_size=64)

# function for MIX UP
# NOTE(review): this definition looks truncated in this chunk — the mixing of
# x by `index` and the return statement are not visible here; confirm the
# remainder of the function before relying on it.
def mixup_data(x, y, alpha=1.0):
    '''Returns mixed inputs, pairs of targets, and lambda'''
    # lambda ~ Beta(alpha, alpha); alpha <= 0 disables mixing entirely.
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1
    batch_size = x.size()[0]
    # Random pairing permutation, kept on GPU.
    index = torch.randperm(batch_size).cuda()
def predict(args, model):
    """Run TyDi-QA prediction over precomputed TFRecord shards and write
    post-processed predictions as JSON lines to ``args.output_prediction_file``.

    Args:
        args: needs output_dir, predict_file, precomputed_predict_file,
            predict_batch_size, candidate_beam, output_prediction_file.
        model: callable returning (start_logits, end_logits, answer_type_logits).
    """
    tf.gfile.MakeDirs(args.output_dir)
    # read prediction candidates from entire prediction input jsonl.gz file
    candidates_dict = read_candidates(args.predict_file)
    # Prediction!
    logging.info("start predicting!")
    # define following routines to call
    full_tydi_pred_dict = {}
    total_num_examples = 0
    # Each shard is paired with placeholder (0, 0) example/feature counts,
    # so the per-shard count logging below always reports zeros.
    shards_iter = enumerate(
        ((f, 0, 0) for f in sorted(tf.gfile.Glob(args.precomputed_predict_file))), 1)
    # Iterating through different shards to get results as we want
    for shard_num, (shard_filename, shard_num_examples, shard_num_features) in shards_iter:
        all_results = []
        total_num_examples += shard_num_examples
        logging.info(
            "Shard %d: Running prediction for %s; %d examples, %d features.",
            shard_num, shard_filename, shard_num_examples, shard_num_features)
        print(shard_filename)
        # use tfrecord_dataset to read tfrecord into dataset for pytorch
        eval_dataset = TFRecordDataset(shard_filename, index_path=None)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=args.predict_batch_size,
                                     shuffle=False)
        for step, batch in enumerate(eval_dataloader):
            # Turn on model evaluation mode, and set frame to no_grad
            model.eval()
            with torch.no_grad():
                outputs = model(
                    is_training=False,
                    input_ids=batch["input_ids"].long().to(DEVICE),
                    attention_mask=batch['input_mask'].long().to(DEVICE),
                    token_type_ids=batch['segment_ids'].long().to(DEVICE),
                )
            # print(torch.max(outputs[0], -1))
            # write results into RawResult format for post-process
            for num, (i, j, k) in enumerate(
                    zip(outputs[0], outputs[1], outputs[2])):
                unique_ids = int(batch['unique_ids'][num])
                start_logits = [float(x) for x in i]
                end_logits = [float(x) for x in j]
                answer_type_logits = [float(x) for x in k]
                all_results.append(
                    RawResult(unique_id=unique_ids,
                              start_logits=start_logits,
                              end_logits=end_logits,
                              answer_type_logits=answer_type_logits))
            # NOTE(review): %-style placeholders passed to print(), not logging —
            # this prints the template and args separately, likely unintended.
            print('We at step %d of shard % d', step, shard_filename)
        # Re-read the raw shard as tf.train.Examples for post-processing.
        predict_features = [
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(shard_filename)
        ]
        logging.info("Shard %d: Post-processing predictions.", shard_num)
        logging.info(
            "  Num candidate examples loaded (includes all shards): %d",
            len(candidates_dict))
        logging.info("  Num candidate features loaded: %d", len(predict_features))
        logging.info("  Num prediction result features: %d", len(all_results))
        logging.info("  Num shard features: %d", shard_num_features)
        # pass candidates dict, raw results and features to postproc for later use
        tydi_pred_dict = postproc.compute_pred_dict(
            candidates_dict,
            predict_features, [r._asdict() for r in all_results],
            candidate_beam=args.candidate_beam)
        logging.info("Shard %d: Post-processed predictions.", shard_num)
        logging.info("  Num shard examples: %d", shard_num_examples)
        logging.info("  Num post-processed results: %d", len(tydi_pred_dict))
        if shard_num_examples != len(tydi_pred_dict):
            logging.warning("  Num missing predictions: %d",
                            shard_num_examples - len(tydi_pred_dict))
        # Merge this shard's predictions, warning on duplicate keys.
        for key, value in tydi_pred_dict.items():
            if key in full_tydi_pred_dict:
                logging.warning("ERROR: '%s' already in full_tydi_pred_dict!", key)
            full_tydi_pred_dict[key] = value
        # break
    # Finish up predictions for all shards and start logging
    logging.info("Prediction finished for all shards.")
    logging.info("  Total input examples: %d", total_num_examples)
    logging.info("  Total output predictions: %d", len(full_tydi_pred_dict))
    with tf.gfile.Open(args.output_prediction_file, "w") as output_file:
        for prediction in full_tydi_pred_dict.values():
            output_file.write((json.dumps(prediction) + "\n").encode())
# @Description : # https://github.com/spotify/tfreader # https://www.w3cschool.cn/tensorflow_python from pathlib import Path # Torch # https://github.com/vahidk/tfrecord import torch from tfrecord.torch.dataset import TFRecordDataset p = "/Users/yuanjie/Desktop/Projects/Spark/MIPush/test-output.tfrecord" tfrecord_paths = list(map(str, Path(p).glob("part*"))) index_path = None description = {"id": "int", "feature": "int"} dataset = TFRecordDataset(tfrecord_paths[0], index_path, description) loader = torch.utils.data.DataLoader(dataset, batch_size=32) data = next(iter(loader)) print(data) # TF # https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset import tensorflow as tf # d = tf.data.TFRecordDataset(input_file) # d = d.shard(num_workers, worker_index) # d = d.repeat(num_epochs) # d = d.shuffle(shuffle_buffer_size) # d = d.map(parser_fn, num_parallel_calls=num_map_threads)