def load_data(self):
    """Read tab-separated ``text<TAB>label`` lines, split each text into
    sentences, and convert everything to ids via ``self.convert_data2id``.

    Returns:
        (datas, segment_ids, input_masks, labels) — labels stay aligned with
        the length-sorted datas.
    """
    datas = []
    labels = []
    with codecs.open(self.path + self.data_name, "r", encoding="utf-8") as fin:
        for ele in fin:
            if len(ele) < 10:  # skip blank / malformed short lines
                continue
            data, label = ele.split("\t")
            data = data.split()
            if not data:
                continue
            datas.append(data[0])
            labels.append(self.label_dic[label[:-1]])  # [:-1] drops the newline
    # Split each text into sentences on the Chinese full stop.
    datas = [data.split("。") for data in datas]
    # BUGFIX: sort datas and labels together; sorting datas alone left the
    # returned labels misaligned with the length-sorted datas.
    if datas:
        datas, labels = (list(t) for t in zip(
            *sorted(zip(datas, labels), key=lambda x: len(x[0]), reverse=True)))
    datas, segment_ids, input_masks = self.convert_data2id(datas)
    return datas, segment_ids, input_masks, labels
def __getitem__(self, index):
    """Return one sample: (image tensor, time-of-day target, weather target)."""
    fields = self.datas[index].split(' ')
    rel_path = fields[0].replace('\\', '/')  # windows -> posix separators
    time_name = fields[1]
    weather_name = fields[2].strip('\n')
    image = cv2.imread(data_path + rel_path)
    image = cv2.resize(image, (224, 224))
    # HWC -> CHW, then to a float tensor for the model.
    tensor = torch.from_numpy(image.transpose(2, 0, 1)).float()
    time = torch.tensor(period.index(time_name))
    weather = torch.tensor(weathers.index(weather_name))
    return tensor, time, weather
def __init__(self, data_root, data_list, transform=None):
    """Index an image-list file.

    Args:
        data_root: root directory of the images.
        data_list: text file with one ``<img_path> <label>`` per line.
        transform: optional transform, stored for later use by the loader.
    """
    self.root = data_root
    self.transform = transform
    # Use a context manager so the list file is closed even on error
    # (the original leaked the handle).
    with open(data_list, 'r') as f:
        lines = f.readlines()
    self.n_data = len(lines)
    self.img_paths = []
    self.img_labels = []
    for line in lines:
        # Split once per line instead of twice.
        parts = line.split()
        self.img_paths.append(parts[0])
        self.img_labels.append(parts[1])
def __init__(self, path, config):
    """Load a CoNLL-style token/tag corpus (blank-line separated sentences,
    one ``token<TAB>tag`` per line) and select the matching tokenizer.

    Args:
        path: a file path, or a list of file paths to concatenate.
        config: settings dict; ``config['model']`` chooses Bert or XLNet.
    """
    self.config = config
    # Read everything into one string; ``with`` guarantees the files are
    # closed (the original leaked every handle in the list branch on error).
    if isinstance(path, list):
        data = ""
        for element in path:
            with open(element, 'r', encoding='utf-8', errors='ignore') as f:
                data += f.read()
        data = data.split('\n\n')
    else:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            data = f.read().split('\n\n')
    # Each non-empty block becomes a list of (token, tag) pairs.
    pairs = []
    for block in data:
        if len(block.strip('\n')) == 0:
            continue
        sentence = []
        for line in block.split('\n'):
            if len(line) == 0:
                continue
            columns = line.split('\t')
            sentence.append((columns[0], columns[1]))
        pairs.append(sentence)
    self.sentenceList = [[pair[0] for pair in sentence] for sentence in pairs]
    self.tagList = [[pair[1] for pair in sentence] for sentence in pairs]
    if self.config['model']['pretrained_model'] == 'Bert':
        self.tokenizer = BertTokenizer.from_pretrained(
            self.config['model']['bert_base_chinese'], do_lower_case=True)
    if self.config['model']['pretrained_model'] == 'XLNet':
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.config['model']['xlnet_base_chinese'], do_lower_case=False)
def load_data(filename):
    """Read *filename* and return its whitespace-separated tokens."""
    with open(filename, 'r') as source:
        return source.read().split()
def read_data(self):
    """Load the text8 corpus and build the vocabulary maps.

    Builds ``self.word2freq`` (word frequencies) and ``self.word2id`` /
    ``self.id2word``, dropping words rarer than ``self.min_count``.

    Returns:
        The list of corpus tokens.
    """
    # Close the file deterministically (the original leaked the handle).
    with open(self.path + "/text8.txt") as f:
        data = f.read().split()
    # Word frequencies (dict.get default instead of a None check).
    self.word2freq = {}
    for word in data:
        self.word2freq[word] = self.word2freq.get(word, 0) + 1
    # Build word2id / id2word, skipping rare words; ids follow insertion order.
    word2id = {}
    id2word = {}
    for word, freq in self.word2freq.items():
        if freq < self.min_count:
            continue
        if word not in word2id:
            word2id[word] = len(word2id)
            id2word[len(id2word)] = word
    self.word2id = word2id
    self.id2word = id2word
    print(len(self.word2id))
    return data
def __init__(self, root='/vulcanscratch/koutilya/kitti', data_file='train.txt', phase='train', img_transform=None, joint_transform=None, depth_transform=None, depth_resize='bilinear'):
    """Index a KITTI split file: each line holds left RGB, right RGB and a
    camera-intrinsics path; a pseudo-label depth path is derived from the
    left RGB path and the chosen resize mode."""
    self.root = root
    self.data_file = data_file
    self.files = []
    self.phase = phase
    self.img_transform = img_transform
    self.joint_transform = joint_transform
    self.depth_transform = depth_transform
    self.depth_resize = depth_resize
    # Map the resize mode to its pseudo-label directory ('' for other modes).
    resize_dirs = {'bilinear': 'Bilinear_model_pseudo_labels',
                   'bicubic': 'Bicubic_model_pseudo_labels'}
    depth_path = resize_dirs.get(self.depth_resize, '')
    with open(osp.join(self.root, self.data_file), 'r') as f:
        lines = f.read().split('\n')
    for line in lines:
        if not line:
            continue
        parts = line.split(' ')
        self.files.append({
            "l_rgb": parts[0],
            "r_rgb": parts[1],
            "cam_intrin": parts[2],
            "depth": osp.join('Depth_baseline_all_syn', depth_path, parts[0])
        })
def latent_space_transition(items):  # input is list of tuples of (a,b)
    """Interpolate between latent codes of consecutive images and save a grid
    (2 source images + 11 interpolation steps per row) to ../imgs/trans.jpg."""
    load_last_model()
    model.eval()
    # Flatten the tuples, dropping the last element of each one.
    data = [im for item in items for im in item[:-1]]
    data = [totensor(i) for i in data]
    data = torch.stack(data, dim=0)
    data = Variable(data)
    if args.cuda:
        data = data.cuda()
    z = model.get_latent_var(data.view(-1, model.nc, model.ndf, model.ngf))
    # Zipping one shared iterator with itself pairs consecutive latent codes.
    it = iter(z.split(1))
    z = zip(it, it)
    zs = []
    numsample = 11
    # Linear interpolation i -> j in `numsample` evenly spaced steps.
    for i, j in z:
        for factor in np.linspace(0, 1, numsample):
            zs.append(i + (j - i) * factor)
    z = torch.cat(zs, 0)
    recon = model.decode(z)
    # Rows: the two source images followed by their numsample reconstructions.
    # The repeated shared iterator makes zip pull consecutive recon slices.
    it1 = iter(data.split(1))
    it2 = [iter(recon.split(1))] * numsample
    result = zip(it1, it1, *it2)
    result = [im for item in result for im in item]
    result = torch.cat(result, 0)
    torchvision.utils.save_image(result.data, '../imgs/trans.jpg', nrow=2 + numsample, padding=2)
def preprocess(self):
    """Preprocess the Pixels attribute file.

    train mode: the normal vector is parsed out of each training file name
    (fields separated by '_', each apparently ``key=value`` — the key names
    are not visible here).  test mode: the first file in the test dir is an
    annotation file whose backslash-separated lines hold index, normal, 10
    map indices and 10 map values.
    """
    if self.mode == 'train':
        trainFileList = os.listdir(self.train_image_dir)
        # Deterministic shuffle so the train order is reproducible.
        random.seed(1234)
        random.shuffle(trainFileList)
        for i, trainFile in enumerate(trainFileList):
            filename = trainFile
            # [:-4] strips a 3-char extension plus dot before splitting on '_'.
            n = trainFile[:-4].split('_')
            # ind = int(n[0])
            nx = float(n[1].split('=')[1])
            ny = float(n[2].split('=')[1])
            nz = float(n[3].split('=')[1])
            normal = [nx, ny, nz]
            self.train_dataset.append([filename, normal])
    if self.mode == 'test':
        testFileList = os.listdir(self.test_image_dir)
        # NOTE(review): only the first listed file is read, and the handle is
        # never closed — confirm both are intentional.
        f = open(os.path.join(self.test_image_dir, testFileList[0]))
        data = f.read()
        lines = data.split('\n')
        # len - 1: skip the empty element after the file's final newline.
        for i in range(len(lines) - 1):
            line = lines[i].split('\\')
            ind = int(line[0])
            nx = float(line[1])
            ny = float(line[2])
            nz = float(line[3])
            normal = [nx, ny, nz]
            # Columns 4-13: map indices; columns 14-23: map values.
            mapsind = [int(l) for l in line[4:14]]
            maps = [float(l) for l in line[14:24]]
            self.test_dataset.append([ind, normal, mapsind, maps])
    print('Finished preprocessing the Pixels dataset...')
def __init__(self, root='./datasets', data_file='train.txt', phase='train', img_transform=None, depth_transform=None, joint_transform=None):
    """Index the virtual-KITTI split file: one ``<rgb> <depth>`` pair per line."""
    self.root = root
    self.data_file = data_file
    self.files = []
    self.phase = phase
    self.img_transform = img_transform
    self.depth_transform = depth_transform
    self.joint_transform = joint_transform
    self.to_tensor = torchvision.transforms.ToTensor()
    with open(osp.join('./datasets/vkitti/', self.data_file), 'r') as f:
        entries = f.read().split('\n')
    for entry in entries:
        if not entry:
            continue
        parts = entry.split(' ')
        self.files.append({"rgb": parts[0], "depth": parts[1]})
def __init__(self, root='./datasets/kitti', data_file='train.list', phase='train', joint_transform=None):
    """Index a KITTI depth-completion split: ``rgb sparse [gt]`` per line.
    When the ground-truth column is missing, ``self.no_gt`` is set."""
    self.root = root
    self.data_file = data_file
    self.files = []
    self.joint_transform = joint_transform
    self.phase = phase
    self.no_gt = False
    with open(osp.join(self.root, self.data_file), 'r') as f:
        lines = f.read().split('\n')
    for line in lines:
        if not line:
            continue
        parts = line.split(' ')
        entry = {"rgb": parts[0], "sparse": parts[1]}
        if len(parts) == 3:
            entry["gt"] = parts[2]
        else:
            self.no_gt = True
        self.files.append(entry)
    self.nSamples = len(self.files)
def __init__(self, root='./datasets', data_file='test.list', phase='test', img_transform=None, joint_transform=None, depth_transform=None):
    """Index a test split file; only the first column (RGB path) is kept."""
    self.root = root
    self.data_file = data_file
    self.files = []
    self.phase = phase
    self.img_transform = img_transform
    self.joint_transform = joint_transform
    with open(osp.join(self.root, self.data_file), 'r') as f:
        lines = f.read().split('\n')
    self.files = [{"rgb": line.split(' ')[0]} for line in lines if len(line) != 0]
def __obtain(self, file_paths):
    """Parse annotation list files into image paths and face boxes, wrapping
    each image in a Point_Meta label.

    Args:
        file_paths: list of annotation files; each line has 6 or 7
            space-separated fields (path, label, x1, y1, x2, y2[, extra]).

    Returns:
        (datas, labels): image path strings and their Point_Meta objects.
    """
    datas, boxes = [], []
    for file_idx, file_path in enumerate(file_paths):
        assert osp.isfile(
            file_path), 'The path : {} is not a file.'.format(file_path)
        # Context manager closes the file even if read() raises
        # (the original explicit close() was skipped on error).
        with open(file_path, 'r') as listfile:
            listdata = listfile.read().splitlines()
        print('Load [{:d}/{:d}]-th list : {:} with {:} images'.format(
            file_idx, len(file_paths), file_path, len(listdata)))
        for idx, data in enumerate(listdata):
            alls = data.split(' ')
            # Drop the first empty field produced by a doubled space, if any.
            if '' in alls:
                alls.remove('')
            assert len(alls) == 6 or len(
                alls) == 7, 'The {:04d}-th line is wrong : {:}'.format(
                    idx, data)
            datas.append(alls[0])
            boxes.append(np.array([
                float(alls[2]), float(alls[3]),
                float(alls[4]), float(alls[5])
            ]))
    labels = []
    for idx, data in enumerate(datas):
        assert isinstance(
            data, str), 'The type of data is not correct : {}'.format(data)
        labels.append(Point_Meta(1, None, boxes[idx], data, self.dataset_name))
    return datas, labels
def load_list(self, file_lists, num_pts, reset):
    """Parse annotation lines (``path label x1 y1 x2 y2 [face_size]``) and
    forward the parsed columns to ``self.load_data``.

    Args:
        file_lists: list files passed to ``load_file_lists``.
        num_pts: number of landmark points, forwarded unchanged.
        reset: forwarded unchanged.
    """
    lists = load_file_lists(file_lists)
    print('GeneralDataset : load-list : load {:} lines'.format(len(lists)))
    datas, labels, boxes, face_sizes = [], [], [], []
    for idx, data in enumerate(lists):
        alls = [x for x in data.split(' ') if x != '']
        # BUGFIX: the message template has three '{:}' fields but was given
        # only (idx, data), so a failing assert raised IndexError instead of
        # the message; pass file_lists for the middle slot.
        assert len(alls) == 6 or len(alls) == 7, \
            'The {:04d}-th line in {:} is wrong : {:}'.format(idx, file_lists, data)
        datas.append(alls[0])
        # The literal string 'None' marks a sample without a label file.
        labels.append(None if alls[1] == 'None' else alls[1])
        boxes.append(np.array([
            float(alls[2]), float(alls[3]),
            float(alls[4]), float(alls[5])
        ]))
        # Optional 7th column: face size.
        face_sizes.append(float(alls[6]) if len(alls) == 7 else None)
    self.load_data(datas, labels, boxes, face_sizes, num_pts, reset)
def perform_latent_space_arithmatics(
        items):  # input is list of tuples of 3 [(a1,b1,c1), (a2,b2,c2)]
    """Compute latent-space vector arithmetic (a - b) * t + c over consecutive
    triples and save a grid (3 inputs + 11 steps per row) to ../imgs/vec_math.jpg."""
    load_last_model()
    model.eval()
    # Flatten all triples into one list of images.
    data = [im for item in items for im in item]
    data = [totensor(i) for i in data]
    data = torch.stack(data, dim=0)
    data = Variable(data)
    if args.cuda:
        data = data.cuda()
    z = model.get_latent_var(data.view(-1, model.nc, model.ndf, model.ngf))
    # Zipping one shared iterator three times groups consecutive latent triples.
    it = iter(z.split(1))
    z = zip(it, it, it)
    zs = []
    numsample = 11
    # Sweep the arithmetic factor over [0, 1] in `numsample` steps.
    for i, j, k in z:
        for factor in np.linspace(0, 1, numsample):
            zs.append((i - j) * factor + k)
    z = torch.cat(zs, 0)
    recon = model.decode(z)
    # Rows: the three source images followed by their numsample reconstructions.
    # The repeated shared iterator makes zip pull consecutive recon slices.
    it1 = iter(data.split(1))
    it2 = [iter(recon.split(1))] * numsample
    result = zip(it1, it1, it1, *it2)
    result = [im for item in result for im in item]
    result = torch.cat(result, 0)
    torchvision.utils.save_image(result.data, '../imgs/vec_math.jpg', nrow=3 + numsample, padding=2)
def _makelist(self, readtxt):
    """Return the lines of *readtxt* as a list (without line terminators).

    Uses ``splitlines()`` so the final line is kept even when the file does
    not end with a newline; the old ``split('\\n')`` + ``pop()`` always
    removed the last element and silently dropped real data in that case.
    """
    with open(readtxt, 'r') as f:
        return f.read().splitlines()
def load_data(self):
    """Load the review corpus: tab-tab separated fields where field 2 is the
    1-based rating and the last field is the review text (sentences marked
    with ``<sssss>``).

    Returns:
        (datas, labels): id-converted documents and their 0-based labels.
    """
    # Close the file deterministically (the original leaked the handle).
    with open(self.path + self.data_name, encoding="utf-8") as f:
        datas = f.read().splitlines()
    # Token list of the review text, with its rating appended last.
    datas = [
        data.split("\t\t")[-1].split() + [data.split("\t\t")[2]]
        for data in datas
    ]
    # Sort by length so similarly sized documents end up batched together.
    datas = sorted(datas, key=lambda x: len(x), reverse=True)
    labels = [int(data[-1]) - 1 for data in datas]  # 1-based rating -> 0-based
    datas = [data[0:-1] for data in datas]
    if self.word2id is None:  # fixed: identity comparison instead of == None
        self.get_word2id(datas)
    # Re-split each document into sentences on the <sssss> marker.
    for i, data in enumerate(datas):
        datas[i] = " ".join(data).split("<sssss>")
        for j, sentence in enumerate(datas[i]):
            datas[i][j] = sentence.split()
    datas = self.convert_data2id(datas)
    return datas, labels
def __getitem__(self, index):
    """Return one (image, age) pair; the image is resized, mean-shifted,
    scaled, converted to CHW and moved to the GPU."""
    record = self.info[index].split()
    img = Image.open(self.img_dir + record[0])
    arr = (np.asarray(img.resize((224, 224))).astype(np.float32) -
           np.array([104., 117., 124.])) / 127.5
    image = torch.Tensor(np.transpose(arr, [2, 0, 1])).cuda()
    age = torch.Tensor([int(record[1])]).long().cuda()
    return image, age
def __getitem__(self, index):
    """Return (Y-channel tensor, bicubic tensor, filename) for one image."""
    name = self.ims[index]
    image = Image.open(os.path.join(self.file_path, name))
    bic_im = self.trans_bic(image)
    # Work on the luminance channel only; cb/cr are discarded here.
    y_channel, cb, cr = image.convert("YCbCr").split()
    tensor_y = self.trans(y_channel)
    # batch must contain tensors, numbers, dicts or lists;
    # data_dict = {'bic':bic_im,'name':self.ims[index]}
    return tensor_y, bic_im, name
def padding(self, data, max_len):
    """Tokenise *data*, map tokens to ids via self.w2id (3 = unknown), wrap
    with BOS(1)/EOS(2) and zero-pad the result to *max_len*."""
    tokens = data.split()
    # Reserve two slots for the BOS/EOS markers.
    if len(tokens) > max_len - 2:
        tokens = tokens[:max_len - 2]
    ids = [1] + [self.w2id.get(tok, 3) for tok in tokens] + [2]
    pad = torch.zeros(max_len - len(ids)).long()
    return torch.cat([torch.LongTensor(ids), pad])
def __init__(self, state="train", k=0, embedding_type="word2vec"):
    """Build the MR (movie review) sentiment dataset.

    Args:
        state: "train", "valid" or "test" — which k-fold split to expose.
        k: fold index (fold k is held out; the rest is split 90/10 into
           train/valid).
        embedding_type: "word2vec" or "glove" — which embedding to load.
    """
    self.path = os.path.abspath('')
    if "data" not in self.path:
        self.path += "/data"
    # Load the raw positive/negative review files.
    pos_samples = open(self.path + "/MR/rt-polarity.pos",
                       errors="ignore").readlines()
    neg_samples = open(self.path + "/MR/rt-polarity.neg",
                       errors="ignore").readlines()
    datas = pos_samples + neg_samples
    #datas = [nltk.word_tokenize(data) for data in datas]
    datas = [data.split() for data in datas]
    # Longest sentence length; every sentence is padded to this length.
    max_sample_length = max([len(sample) for sample in datas])
    labels = [1] * len(pos_samples) + [0] * len(neg_samples)
    word2id = {"<pad>": 0}  # build word2id; id 0 is the padding token
    for i, data in enumerate(datas):
        for j, word in enumerate(data):
            if word2id.get(word) == None:
                word2id[word] = len(word2id)
            # Replace the token in place by its id.
            datas[i][j] = word2id[word]
        # Right-pad with the <pad> id up to the maximum length.
        datas[i] = datas[i] + [0] * (max_sample_length - len(datas[i]))
    self.n_vocab = len(word2id)
    self.word2id = word2id
    if embedding_type == "word2vec":
        self.get_word2vec()
    elif embedding_type == "glove":
        self.get_glove_embedding()
    else:
        pass
        #self.get_word2vec()
    # Shuffle samples and labels together with a fixed seed.
    c = list(zip(datas, labels))
    random.seed(1)
    random.shuffle(c)
    datas[:], labels[:] = zip(*c)
    if state == "train":  # build the training split: drop fold k, keep first 90%
        self.datas = datas[:int(k * len(datas) / 10)] + datas[int(
            (k + 1) * len(datas) / 10):]
        self.labels = labels[:int(k * len(datas) / 10)] + labels[int(
            (k + 1) * len(labels) / 10):]
        self.datas = np.array(self.datas[0:int(0.9 * len(self.datas))])
        self.labels = np.array(self.labels[0:int(0.9 * len(self.labels))])
    elif state == "valid":  # build the validation split: drop fold k, keep last 10%
        self.datas = datas[:int(k * len(datas) / 10)] + datas[int(
            (k + 1) * len(datas) / 10):]
        self.labels = labels[:int(k * len(datas) / 10)] + labels[int(
            (k + 1) * len(labels) / 10):]
        self.datas = np.array(self.datas[int(0.9 * len(self.datas)):])
        self.labels = np.array(self.labels[int(0.9 * len(self.labels)):])
    elif state == "test":  # build the test split: fold k itself
        self.datas = np.array(
            datas[int(k * len(datas) / 10):int((k + 1) * len(datas) / 10)])
        self.labels = np.array(labels[int(k * len(datas) / 10):int((k +
            1) * len(datas) / 10)])
def extract_vertices(pth):
    """Parse an ICDAR-style ground-truth file.

    Each non-empty line is ``x1,y1,x2,y2,x3,y3,x4,y4,<label>``; a field equal
    to ``###`` marks an ignored region (label 0), anything else is label 1.

    Args:
        pth: path to the annotation file.

    Returns:
        (vertices, labels): int32 array of shape (N, 4, 2) and an array of
        N 0/1 flags.
    """
    # Context manager closes the file (the original leaked the handle);
    # the old per-line lstrip("") was a no-op and is dropped.
    with open(pth, "r") as f:
        rows = [line.split(',') for line in f.read().split('\n') if line != '']
    vertices = np.array([list(map(int, row[:8])) for row in rows]).astype(np.int32)
    labels = np.array([0 if '###' in row else 1 for row in rows])
    return vertices.reshape(labels.shape[0], 4, 2), labels
def __init__(self, path, seq_len, split=b'\n', pad_idx=0, eos_idx=1, cache_dir='.cache'):
    """Byte-level dataset: one fixed-length id row per chunk of *path*.

    Args:
        path: input file, read as raw bytes and split on *split*.
        seq_len: fixed sample length; longer chunks are truncated.
        split: chunk separator (a str is encoded to bytes first).
        pad_idx: id used to pad short chunks.
        eos_idx: id written after each chunk, or None to disable it.
        cache_dir: directory for the cached tensor, or None to disable caching.
    """
    super().__init__()
    self.path = path
    self.seq_len = seq_len
    self.pad_idx = pad_idx
    self.eos_idx = eos_idx
    # NOTE(review): vocab is fixed at 128 but ids are raw byte values —
    # bytes >= 128 would exceed it and are not validated here.
    self.vocab_size = 128
    data = Path(path).read_bytes()
    # Cache key: file content plus every parameter that shapes the tensor.
    m = hashlib.sha256()
    m.update(data)
    m.update(str((self.seq_len, self.pad_idx, self.eos_idx)).encode())
    if cache_dir is not None:
        cache_path = Path(cache_dir) / (m.hexdigest() + '.p')
        if cache_path.exists():
            print(f'Using cached file {cache_path}')
            self.samples = torch.load(str(cache_path))
            return
    if isinstance(split, str):
        split = split.encode('utf8')
    lines = data.split(split)
    # Pre-filled with pad_idx; each row then gets its (truncated) bytes.
    self.samples = torch.full([len(lines), seq_len], fill_value=pad_idx, dtype=torch.long)
    for i, line in enumerate(tqdm(lines)):
        line = line[:seq_len]
        self.samples[i, :len(line)] = torch.tensor(list(line), dtype=torch.long)
    if eos_idx is not None:
        # EOS goes at each line's length, clamped so a full-length line has
        # its final byte overwritten by EOS.
        lengths = torch.tensor(list(map(len, lines)), dtype=torch.long).clamp_max(seq_len - 1)
        self.samples[torch.arange(len(lines)), lengths] = eos_idx
    if cache_dir is not None:
        cache_path = Path(cache_dir) / (m.hexdigest() + '.p')
        cache_path.parent.mkdir(exist_ok=True)
        torch.save(self.samples, cache_path)
def __init__(self, root, transform=None, target_transform=None, download=True):
    """Load a Semeion-style digit dataset: each row holds 256 pixel values
    (one 16x16 image) followed by a one-hot label vector.

    Args:
        root: dataset directory containing ``self.filename``.
        transform / target_transform: optional callables, stored only.
        download: fetch the data via ``self.download()`` first.

    Raises:
        RuntimeError: if the integrity check fails.
    """
    self.root = os.path.expanduser(root)
    self.transform = transform
    self.target_transform = target_transform
    if download:
        self.download()
    if not self._check_integrity():
        raise RuntimeError('Dataset not found or corrupted.' +
                           ' You can use download=True to download it')
    self.data = []
    self.labels = []
    fp = os.path.join(root, self.filename)
    # Close the file deterministically.
    with open(fp, 'r') as file:
        data = file.read()
    # [:-1] drops the empty element after the trailing newline.
    for row in data.split("\n")[:-1]:
        # Strip the trailing blank, then split into number strings.
        fields = (row[:-1]).split(" ")
        # First 256 columns are pixel values -> one 16x16 float image.
        # BUGFIX: the old vstack loop seeded the array with row 0 and then
        # stacked rows [0:16], [16:32], ... again, duplicating the first row
        # and producing a 17x16 image; reshape gives the intended 16x16.
        pixels = [float(value) for value in fields[:256]]
        self.data.append(np.array(pixels).reshape(16, 16))
        # Remaining columns: one-hot label -> int array.
        self.labels.append(np.array([int(value) for value in fields[256:]]))
def plot_text_loss():
    """Plot the validation-accuracy curve stored in p1a_validate.txt
    (one ``<step> <accuracy>`` pair per line)."""
    txt_file = 'p1a_validate.txt'
    steps, values = [], []
    with open(txt_file, 'r') as f:
        for line in f:
            fields = line.strip().split(' ')
            steps.append(int(fields[0]))
            values.append(float(fields[1]))
    plt.plot(steps, values, 'b')
    plt.title('validate accuracy')
    plt.show()
def plot_training_loss():
    """Plot the training-loss curve stored in p1a_trainloss.txt
    (one ``<step> <loss>`` pair per line)."""
    txt_file = 'p1a_trainloss.txt'
    steps, values = [], []
    with open(txt_file, 'r') as f:
        for line in f:
            fields = line.strip().split(' ')
            steps.append(int(fields[0]))
            values.append(float(fields[1]))
    plt.plot(steps, values, 'b')
    plt.title('training loss')
    plt.show()
def handle_data(self, data):
    """HTML-parser hook: while inside the table, resolve image links to local
    files and collect <li> texts as annotations for the current image."""
    if not self.in_table:
        return
    if data == 'Image Not Found':
        self.current_img = None
    elif self.current_tag == 'a':
        # Second-to-last path component is the image id; glob for the
        # matching local jpg.
        stem = data.split('/')[-2]
        pattern = os.path.join(self.root, stem + '_*.jpg')
        match = glob.glob(pattern)[0]
        self.current_img = match
        self.annotations[match] = []
    elif self.current_tag == 'li' and self.current_img:
        self.annotations[self.current_img].append(data.strip())
def _parse_list(self):
    """Populate self.video_list from self.list_file.

    Each list line: ``<path> <field1> <field2>``; fields 1 and 2 are handed
    to VideoRecord unparsed (NOTE(review): they come from the raw line, so
    field 2 keeps its trailing newline — confirm VideoRecord strips it).
    Only entries whose directory exists under self.root_path are kept.
    """
    self.video_list = []
    if self.phase == 'Fntest':
        # Full test: one record per (time copy i, crop j) combination,
        # all sharing the same per-line video id.
        vid = 0
        for x in open(self.list_file):
            idx = 0
            for i in range(self.n_times):
                for j in range(self.crop_num):
                    # First column is the video path; its basename without
                    # extension names the frame directory.
                    data = x.strip().split(' ')[0]
                    name = data.split('/')[-1].split('.')[0]
                    path = self.root_path
                    if os.path.exists(os.path.join(path, name)):
                        self.video_list.append(VideoRecord([name, x.split(' ')[1], x.split(
                            ' ')[2]], self.root_path, phase='Val', copy_id=i, crop=j, vid=vid))
                        idx += 1
            vid += 1
    elif self.phase == 'Val':
        for x in open(self.list_file):
            data = x.strip().split(' ')[0]
            name = data.split('/')[-1].split('.')[0]
            path = self.root_path
            if os.path.exists(os.path.join(path, name)):
                self.video_list.append(VideoRecord(
                    [name, x.split(' ')[1], x.split(' ')[2]],
                    self.root_path,
                ))
    else:
        # NOTE(review): this branch duplicates 'Val' except for the final
        # shuffle — consider factoring the shared parsing out.
        for x in open(self.list_file):
            data = x.strip().split(' ')[0]
            name = data.split('/')[-1].split('.')[0]
            path = self.root_path
            if os.path.exists(os.path.join(path, name)):
                self.video_list.append(VideoRecord(
                    [name, x.split(' ')[1], x.split(' ')[2]],
                    self.root_path,
                ))
        self.rng.shuffle(self.video_list)
def _create_class_idx_dict_val(self):
    """Read val_annotations.txt (tab-separated: image, class-id, ...) and
    build the image->class map plus class<->index dictionaries."""
    val_annotations_file = os.path.join(self.val_dir, "val_annotations.txt")
    self.val_img_to_class = {}
    class_names = set()
    with open(val_annotations_file, 'r') as fo:
        for row in fo.readlines():
            fields = row.split("\t")
            self.val_img_to_class[fields[0]] = fields[1]
            class_names.add(fields[1])
    self.len_dataset = len(list(self.val_img_to_class.keys()))
    ordered = sorted(list(class_names))
    self.class_to_tgt_idx = {name: i for i, name in enumerate(ordered)}
    self.tgt_idx_to_class = {i: name for i, name in enumerate(ordered)}
def padding(data, max_len, language):
    """Convert a sentence into a fixed-length id tensor.

    Args:
        data: whitespace-separated sentence.
        max_len: output length, including BOS(1)/EOS(2); padded with 0.
        language: 'src' or 'tgt', selecting the vocabulary.

    Returns:
        LongTensor of length max_len; unknown tokens map to id 3.

    Raises:
        ValueError: if *language* is neither 'src' nor 'tgt' (the old code
        fell through and crashed later with an unrelated NameError).
    """
    if language == 'src':  # source language
        vocabs = src_vocabs
    elif language == 'tgt':  # target language
        vocabs = tgt_vocabs
    else:
        raise ValueError("language must be 'src' or 'tgt', got %r" % (language,))
    tokens = data.split()
    # Reserve two slots for the BOS/EOS markers.
    if len(tokens) > max_len - 2:
        tokens = tokens[:max_len - 2]
    ids = [1] + [vocabs.get(t, 3) for t in tokens] + [2]
    return torch.cat(
        [torch.LongTensor(ids), torch.zeros(max_len - len(ids)).long()])
def __getitem__(self, index):
    """Return one 48x48 facial-expression sample.

    Test mode: (image index, transformed image); otherwise
    (one-hot label tensor, transformed image).
    """
    index = self.img_idxs[index]
    raw = self.image_data[index].split()
    # Pixel strings -> 48x48 float array scaled to [0, 1].
    pixels = np.array([float(v) for v in raw]).reshape([48, 48]) / 255.
    image = self.transforms(Image.fromarray(pixels))
    if self.test:
        return index, image
    # label = self.image_label[index]
    # One-hot label vector precomputed in self.label_tensor.
    return self.label_tensor[index], image
def load_list(self, file_lists, num_pts, reset):
    """Parse annotation lines (``path label x1 y1 x2 y2 [face_size]``) and
    forward the parsed columns to ``self.load_data``.

    Args:
        file_lists: list files passed to ``load_file_lists``.
        num_pts: number of landmark points, forwarded unchanged.
        reset: forwarded unchanged.
    """
    lists = load_file_lists(file_lists)
    print('GeneralDataset : load-list : load {:} lines'.format(len(lists)))
    datas, labels, boxes, face_sizes = [], [], [], []
    for idx, data in enumerate(lists):
        alls = [x for x in data.split(' ') if x != '']
        # BUGFIX: the message template has three '{:}' fields but was given
        # only (idx, data), so a failing assert raised IndexError instead of
        # the message; pass file_lists for the middle slot.
        assert len(alls) == 6 or len(alls) == 7, \
            'The {:04d}-th line in {:} is wrong : {:}'.format(idx, file_lists, data)
        datas.append(alls[0])
        # The literal string 'None' marks a sample without a label file.
        labels.append(None if alls[1] == 'None' else alls[1])
        boxes.append(np.array([
            float(alls[2]), float(alls[3]),
            float(alls[4]), float(alls[5])
        ]))
        # Optional 7th column: face size.
        face_sizes.append(float(alls[6]) if len(alls) == 7 else None)
    self.load_data(datas, labels, boxes, face_sizes, num_pts, reset)