def __init__(self, opt):
    """
    Attributes:
        _data (dict): preprocessed data: all image file names plus the processed captions
        all_imgs (tensor): image features extracted with ResNet-50, shape (200000, 2048)
        captions (list): list of length 200,000 holding the caption for each image
        ix2id (dict): maps an image index to its file name
        start_ (int): starting index; 0 for the training set and 190000 for the
            validation set, i.e. the first 190000 images are the training set and
            the remaining 10000 are the validation set
        len_ (int): dataset size; 190000 for the training set, 10000 for validation
        training (bool): True for the training set, False for the validation set

    Effectively maps image features to text.
    """
    self.opt = opt
    data = t.load(opt.caption_data_path)
    word2ix = data['word2ix']
    self.captions = data['caption']
    self.padding = word2ix.get(data.get('padding'))
    self.end = word2ix.get(data.get('end'))
    self._data = data
    self.ix2id = data['ix2id']
    self.all_imgs = t.load(opt.img_feature_path)
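# The docstring above documents start_, len_ and training, but __init__ never sets
# them; other variants below call self.train(). A minimal sketch of what such a
# train()/__getitem__/__len__ trio might look like, inferred from the documented
# 190000/10000 split (hypothetical, not the author's confirmed implementation):
def train(self, training=True):
    self.training = training
    self.start_ = 0 if training else 190000
    self.len_ = 190000 if training else 10000
    return self

def __getitem__(self, index):
    # offset into the train or val region; pair the image feature with its caption
    ix = self.start_ + index
    return self.all_imgs[ix], self.captions[ix], ix

def __len__(self):
    return self.len_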
def process_line(cls, data, tokenizer, tokenize):
    prompts, texts = [], []
    text = data.get("title", "") + data.get("abstract", "") + data.get("content", "")
    if text:
        p = cls.process_sample("", tokenizer, tokenize)
        t = cls.process_sample(text, tokenizer, tokenize)
        prompts.append(p)
        texts.append(t)
    return prompts, texts
def __init__(self, opt):
    self.opt = opt
    data = t.load(opt.caption_data_path)
    word2ix = data['word2ix']
    self.padding = word2ix.get(data.get('padding'))
    self.end = word2ix.get(data.get('end'))
    self._data = data
    self.img_feats = t.load(opt.feats_path)
    self.train()
def __init__(self, opt, transforms=None, training=True):
    self.opt = opt
    self.training = training
    data = t.load(opt.caption_data_path)
    word2ix = data['word2ix']
    self.padding = word2ix.get(data.get('padding'))
    self.end = word2ix.get(data.get('end'))
    self._data = data
    self.normalize = tv.transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
    self.train()
def __init__(self, opt):
    self.opt = opt
    data = t.load(opt.caption_data_path)
    self.data = data
    self.captions = data['caption']
    self.word2ix = data['word2ix']
    self.padding = self.word2ix[data.get('padding')]
    self.end = self.word2ix[data.get('end')]
    self._data = data
    self.ix2word = data['ix2word']
    self.all_imgs = t.load(opt.img_feature_path)  # 200k x 2048
def __init__(self, spec):
    self.mode = spec['mode']  # either 'train' or 'test'
    self.root = spec['root']
    self.images = None
    self.labels = None
    # self.transforms = spec['transforms']  # already transformed (?)
    with h5py.File('usps.h5', 'r') as hf:
        data = hf.get(self.mode)
        self.images = data.get('data')[:]
        self.images = torch.as_tensor(self.images).reshape(-1, 1, 16, 16)
        self.labels = data.get('target')[:]
        self.labels = torch.as_tensor(self.labels)
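# A PyTorch Dataset also needs __len__/__getitem__; a minimal sketch for the USPS
# loader above (hypothetical -- the real class may apply transforms here):
def __len__(self):
    return self.labels.shape[0]

def __getitem__(self, idx):
    return self.images[idx], self.labels[idx]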
def __init__(self):
    data = t.load('caption.pth')
    word2ix = data['word2ix']
    self.ix2word = data['ix2word']
    self.captions = data['caption']
    self.padding = word2ix.get(data.get('padding'))
    self.end = word2ix.get(data.get('end'))
    self._data = data
    self.ix2id = data['ix2id']
    self.all_low = t.load('results.pth')
    all_pic_r = t.load('data_save/multi_label_extract_pic.pth')
    self.all_pic = all_pic_r['multi_label_extract_pic']
    all_block_r = t.load('data_save/imageai_multi_label_extract_block.pth')
    self.all_block = all_block_r['multi_label_extract_block']
def process_line(self, data, tokenizer, tokenize):
    text = data.get("text", None)
    if text:
        prompt = self.process_sample("", tokenizer, tokenize)
        text = self.process_sample(text, tokenizer, tokenize)
        return [prompt], [text]
    else:
        return [], []
def process_line(self, data, tokenizer, tokenize):
    text = ""
    title = data.get("title", None)
    description = data.get("description", None)
    maintext = data.get("maintext", None)
    if title:
        text += title.strip() + " "
    # skip the description when the main text already starts with it
    if description and (not maintext or not maintext.startswith(description)):
        text += description.strip() + " "
    if maintext:
        text += maintext
    if len(text) > 100:
        prompt = self.process_sample("", tokenizer, tokenize)
        text = self.process_sample(text, tokenizer, tokenize)
        return [prompt], [text]
    else:
        return [], []
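# The process_line variants above share one contract: take a parsed JSON record and
# return parallel (prompts, texts) lists, empty when the record is rejected. A
# hedged driver sketch (the file layout, tokenizer and tokenize flag are
# assumptions, not part of the original code):
import json

def iter_samples(path, processor, tokenizer, tokenize=True):
    with open(path, encoding="utf-8") as f:
        for line in f:
            prompts, texts = processor.process_line(json.loads(line), tokenizer, tokenize)
            yield from zip(prompts, texts)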
def __init__(self, opt, transforms=None):
    '''
    Attributes:
        _data (dict): preprocessed data: all image file names plus the processed captions
        all_imgs (tensor): image features extracted with ResNet-50, shape (200000, 2048)
        captions (list): list of length 200,000 holding the caption for each image
        ix2id (dict): maps an image index to its file name
        start_ (int): starting index; 0 for the training set and 190000 for the
            validation set, i.e. the first 190000 images are the training set and
            the remaining 10000 are the validation set
        len_ (int): dataset size; 190000 for the training set, 10000 for validation
        training (bool): True for the training set, False for the validation set
    '''
    self.opt = opt
    data = t.load(opt.caption_data_path)
    word2ix = data['word2ix']
    self.captions = data['caption']
    self.padding = word2ix.get(data.get('padding'))
    self.end = word2ix.get(data.get('end'))
    self._data = data
    self.ix2id = data['ix2id']
    self.all_imgs = t.load(opt.img_feature_path)
def process_line(cls, data, tokenizer, tokenize):
    if "title" not in data:
        return [], []
    prompts, texts = [], []
    qtitle = data["title"]
    qcontent = data.get("content", "")
    qcontent = cls.trim_field(qcontent, max_length=100)
    prompt = cls.qtitle_prefix + qtitle + cls.qcontent_prefix + qcontent + cls.answer_prefix
    prompt = cls.process_sample(prompt, tokenizer, tokenize)
    if "best_answer" in data:
        text = data["best_answer"]["content"]
        if len(text) > 10:
            text = cls.process_sample(text, tokenizer, tokenize)
            prompts.append(prompt)
            texts.append(text)
    for answer in data.get("other_answers", []):
        text = answer["content"]
        if len(text) > 100:
            text = cls.process_sample(text, tokenizer, tokenize)
            prompts.append(prompt)
            texts.append(text)
    return prompts, texts
def process_line(cls, data, tokenizer, tokenize):
    prompts, texts = [], []
    ans_length = len(data.get("ans-content", ""))
    ans_up = data.get("ans-up-num", "")
    ans_up = int(ans_up) if ans_up else 0
    # keep only substantial or highly upvoted answers
    if ans_length > 100 or ans_up > 1000:
        qtitle = data["q_title"]
        qcontent = data["q-content"]
        if qcontent is None:
            qcontent = ""
        qcontent = cls.trim_field(qcontent, max_length=100)
        user = data.get("user-signature", "")
        prompt = (cls.qtitle_prefix + qtitle + cls.qcontent_prefix + qcontent +
                  cls.user_prefix + user + cls.answer_prefix)
        text = data["ans-content"]
        prompt = cls.process_sample(prompt, tokenizer, tokenize)
        text = cls.process_sample(text, tokenizer, tokenize)
        prompts.append(prompt)
        texts.append(text)
    return prompts, texts
def process_line(self, data, tokenizer, tokenize):
    source = data["meta"].get("pile_set_name", None)
    text = data.get("text", None)
    if source and text:
        if source in self.filtered_sources:
            return [], [], None
        elif (source in self.downsample_sources
              and random.random() > self.downsample_sources[source]):
            return [], [], None
        else:
            prompt = self.process_sample("", tokenizer, tokenize)
            text = self.process_sample(text, tokenizer, tokenize)
            return [prompt], [text], source
    else:
        return [], [], None
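# filtered_sources / downsample_sources drive the logic above: filtered sources are
# dropped outright, downsampled ones are kept with the given probability. A hedged
# configuration sketch (the subset names and rates are illustrative, not the
# project's actual settings):
filtered_sources = {"Github"}
downsample_sources = {"Pile-CC": 0.5}  # keep roughly 50% of Pile-CC records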
def read_normalize(self):
    try:
        with open(os.path.join(self.path, "dataSetJson.json"), "r") as json_file:
            data = json.load(json_file)
        print(os.path.join(self.path, "dataSetJson.json"))
        if data.get('normalize') is not None:
            norm = data['normalize']
            if norm.get('mean') is not None and norm.get('dev_std') is not None:
                self.mean = tuple(norm['mean'])
                print(self.mean)
                self.dev_std = tuple(norm['dev_std'])
                print(self.dev_std)
                self.transform = transforms.Compose([
                    transforms.Resize(self.resize),
                    transforms.ToTensor(),
                    transforms.Normalize(self.mean, self.dev_std)
                ])
    except (OSError, json.JSONDecodeError, KeyError) as e:
        sys.stderr.write("Error: could not read normalize settings: %s\n" % e)
        sys.exit(1)  # non-zero exit status on the error path
def eval_split(model, crit, loader, args):
    verbose = True
    num_images = 10
    split = 'val'
    lang_eval = 0
    dataset = 'coco'
    beam_size = 1

    model.eval()
    loader.reset_iterator(split)

    n = 0
    loss = 0
    loss_sum = 0
    loss_evals = 1e-8
    predictions = []
    while True:
        data = loader.get_batch(split)
        n = n + loader.batch_size
        if data.get('labels', None) is not None:
            tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
            # torch.no_grad() stands in for the deprecated Variable(..., volatile=True)
            with torch.no_grad():
                tmp = [torch.from_numpy(_).cuda() for _ in tmp]
                fc_feats, att_feats, labels, masks = tmp
                loss = crit(model(fc_feats, att_feats, labels),
                            labels[:, 1:], masks[:, 1:]).item()  # .item() extracts the scalar
            loss_sum = loss_sum + loss
            loss_evals = loss_evals + 1

        # sample one caption per image (first entry of every seq_per_img group)
        tmp = [
            data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
            data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]
        ]
        with torch.no_grad():
            fc_feats, att_feats = [torch.from_numpy(_).cuda() for _ in tmp]
            seq, _ = model.sample(fc_feats, att_feats)
        sents = decode_sequence(loader.get_vocab(), seq)

        for k, sent in enumerate(sents):
            entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
            entry['file_name'] = data['infos'][k]['file_path']
            predictions.append(entry)
            cmd = ('cp "' + os.path.join(args.data_dir, data['infos'][k]['file_path'])
                   + '" vis/imgs/img' + str(len(predictions)) + '.jpg')
            print('image %s: %s' % (entry['image_id'], entry['caption']))
            os.system(cmd)

        ix0 = data['bounds']['it_pos_now']
        ix1 = data['bounds']['it_max']
        if num_images != -1:
            ix1 = min(ix1, num_images)
        for i in range(n - ix1):
            predictions.pop()
        if data['bounds']['new_epoch']:
            break
        if num_images >= 0 and n >= num_images:
            break

    lang_stats = language_eval(predictions)
    return lang_stats
def go_eval(netG, dataset_dt, dt_kw=None, g_kw={}, crop_size=74):
    # dt_kw defaults to None (not []): the loop below tests `dt_kw is None`, and an
    # empty-list default would crash on dt_kw[0]
    scale = None

    def _overlap_crop_forward(x, shave=10, min_size=100000, bic=None):
        """Chop the input into overlapping quadrants for less memory use at test time."""
        n_GPUs = 2
        b, c, h, w = x.size()
        h_half, w_half = h // 2, w // 2
        h_size, w_size = h_half + shave, w_half + shave
        lr_list = [
            x[:, :, 0:h_size, 0:w_size],
            x[:, :, 0:h_size, (w - w_size):w],
            x[:, :, (h - h_size):h, 0:w_size],
            x[:, :, (h - h_size):h, (w - w_size):w]
        ]
        if bic is not None:
            bic_h_size = h_size * scale
            bic_w_size = w_size * scale
            bic_h = h * scale
            bic_w = w * scale
            bic_list = [
                bic[:, :, 0:bic_h_size, 0:bic_w_size],
                bic[:, :, 0:bic_h_size, (bic_w - bic_w_size):bic_w],
                bic[:, :, (bic_h - bic_h_size):bic_h, 0:bic_w_size],
                bic[:, :, (bic_h - bic_h_size):bic_h, (bic_w - bic_w_size):bic_w]
            ]
        if w_size * h_size < min_size:
            sr_list = []
            for i in range(0, 4, n_GPUs):
                lr_batch = torch.cat(lr_list[i:(i + n_GPUs)], dim=0)
                if bic is not None:
                    bic_batch = torch.cat(bic_list[i:(i + n_GPUs)], dim=0)
                sr_batch_temp = netG(lr_batch, scale=4)
                if isinstance(sr_batch_temp, list):
                    sr_batch = sr_batch_temp[-1]
                else:
                    sr_batch = sr_batch_temp
                sr_list.extend(sr_batch.chunk(n_GPUs, dim=0))
        else:
            # patches still too large: recurse on each quadrant
            sr_list = [
                _overlap_crop_forward(patch, shave=shave, min_size=min_size)
                for patch in lr_list
            ]

        h, w = scale * h, scale * w
        h_half, w_half = scale * h_half, scale * w_half
        h_size, w_size = scale * h_size, scale * w_size
        shave *= scale

        # stitch the four upscaled quadrants back together
        output = x.new(b, c, h, w)
        output[:, :, 0:h_half, 0:w_half] = sr_list[0][:, :, 0:h_half, 0:w_half]
        output[:, :, 0:h_half, w_half:w] = sr_list[1][:, :, 0:h_half, (w_size - w + w_half):w_size]
        output[:, :, h_half:h, 0:w_half] = sr_list[2][:, :, (h_size - h + h_half):h_size, 0:w_half]
        output[:, :, h_half:h, w_half:w] = sr_list[3][:, :, (h_size - h + h_half):h_size, (w_size - w + w_half):w_size]
        return output

    dl = torch.utils.data.DataLoader(dataset_dt, batch_size=1)
    psnr_list = []
    ssim_list = []
    for i, data in enumerate(dl):
        if dt_kw is None:
            lr, hr = data
        else:
            lr, hr = data.get(dt_kw[0]), data.get(dt_kw[1])
        scale = g_kw.get("scale", 4)
        o_hr = _overlap_crop_forward(lr)
        assert o_hr.shape == hr.shape
        o_hr = Tensor2np(o_hr[0].cpu())
        hr = Tensor2np(hr[0].cpu())
        psnr, ssim = metric.calc_metrics(o_hr, hr, crop_border=scale)
        psnr_list.append(psnr)
        ssim_list.append(ssim)
    print("psnr:", np.mean(psnr_list), "ssim:", np.mean(ssim_list))
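# Hedged call sketch for go_eval (the dataset and batch key names are assumptions):
#   go_eval(netG, val_dataset, dt_kw=("LR", "HR"), g_kw={"scale": 4})
# With dt_kw=None the loader is expected to yield (lr, hr) tuples directly.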
def row(self, data):
    # truncate each field to its column width (Python 3: items(), not iteritems())
    return self.fmt.format(**{
        k: str(data.get(k, ''))[:w]
        for k, w in self.width.items()
    })
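# A hedged usage sketch of how fmt and width plausibly fit together (the field
# names and widths are illustrative, not the class's actual configuration):
width = {'name': 12, 'status': 8}
fmt = '{name:<12} {status:<8}'
data = {'name': 'job-42', 'status': 'running but truncated'}
print(fmt.format(**{k: str(data.get(k, ''))[:w] for k, w in width.items()}))
# prints: job-42       running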
def __getitem__(self, indexs):
    if self.data is None:
        self.data = np.load(self.hparams.ds_name + '.npz', allow_pickle=True)
    ret = []
    if not isinstance(indexs, list):
        _indexs = [indexs]
    else:
        _indexs = indexs
    for index in _indexs:
        data = self.data[self.audio_and_text_keys[index]].item()
        text = np.array(data['text'], np.int64)  # np.int was removed from modern NumPy
        if self.hparams.multispeaker:
            spk_id = data.get('spk_id', 0)
        else:
            spk_id = 0

        if self.hparams.add_sil == 2:
            text = text.reshape(-1, 3 if not self.hparams.use_pinyin else 2)
            text = np.concatenate(
                [text, 2 * np.ones([text.shape[0], 1], np.int64)], -1)  # [L, 4]
            text = text.reshape(-1)  # [L + L//3]
        elif self.hparams.add_sil == 3:
            text = np.stack([text, 128 + text, 256 + text], -1)  # [L, 3]
            text = text.reshape(-1, 9 if not self.hparams.use_pinyin else 2)  # [L/3, 9]
            text = np.concatenate(
                [text, 2 * np.ones([text.shape[0], 1], np.int64)], -1)  # [L/3, 10]
            text = text.reshape(-1)  # [10L/3]
        text = torch.from_numpy(text)

        mel = torch.from_numpy(np.array(data['mels']).reshape(-1, 80).T)
        if self.hparams.use_linear or self.hparams.linear_directly:
            linear = torch.from_numpy(
                np.array(data['linear']).reshape(-1, self.hparams.num_freq).T)
        else:
            linear = None
        if self.hparams.speech and self.type != 'val':
            mel = mel[:, :1550]
            text = text[:350]
            if linear is not None:  # `if linear:` is ambiguous for a multi-element tensor
                linear = linear[:1550]

        pitch = None
        if self.hparams.use_pitch:
            pitch_key = 'pitches' if not self.hparams.use_smooth_pitch else 'smooth_pitches'
            pitch = torch.from_numpy(np.array(data[pitch_key], np.int64))
        utt_ids = torch.from_numpy(np.array(data['utt_id'], np.int64))

        if self.hparams.prefix_len > 0:
            text_len = int(self.hparams.prefix_len * text.shape[0] / mel.shape[1])
            text = text[:text_len]
            mel = mel[:, :self.hparams.prefix_len]
            if pitch is not None:
                pitch = pitch[:self.hparams.prefix_len]

        attn = None
        if self.hparams.use_ali or self.hparams.use_ali_mask:
            attn = np.zeros((mel.shape[1], text.shape[0]))
            if self.hparams.use_phoneme_align:
                mel_splits = [
                    int(x * self.hparams.audio_sample_rate / self.hparams.hop_size)
                    for x in data['splits']
                ]
                last = 0
                for t_idx, s in enumerate(mel_splits):
                    attn[last:s, t_idx] = 1
                    last = s  # advance to the start of the next phoneme
            else:
                splits_begin = np.clip(np.array(data['splits'], np.int64), 0, mel.shape[1] - 1)
                splits_end = np.clip(np.array(data['splits_end'], np.int64), 0, mel.shape[1] - 1)
                splits_begin = [0] + list(splits_begin)
                splits_end = [0] + list(splits_end)
                if not self.hparams.use_ali_mask2:  # TODO: PINYIN?
                    if self.hparams.use_pinyin:
                        for i in range(text.shape[0] // 3):
                            splits_begin_step = (splits_begin[i + 1] - splits_begin[i] - 3) / 2
                            if self.hparams.attn_step_clip10:
                                splits_begin_step = np.clip(splits_begin_step, 0, 10)
                            attn[int(splits_begin[i]):int(splits_begin[i] + splits_begin_step),
                                 i * 3] += 1
                            attn[int(splits_begin[i] + splits_begin_step):
                                 int(splits_begin[i] + splits_begin_step * 2), i * 3 + 1] += 1
                            attn[int(splits_begin[i + 1]) - 3:int(splits_begin[i + 1]),
                                 i * 3 + 2] += 1
                    else:
                        if self.hparams.add_sil == 0:
                            for i in range(text.shape[0] // 3):
                                splits_begin_step = (splits_begin[i + 1] - splits_begin[i]) / 3
                                splits_end_step = (splits_end[i + 1] - splits_end[i]) / 3
                                if self.hparams.attn_step_clip10:
                                    splits_begin_step = np.clip(splits_begin_step, 0, 10)
                                    splits_end_step = np.clip(splits_end_step, 0, 10)
                                attn[int(splits_begin[i]):
                                     int(splits_begin[i] + splits_begin_step), i * 3] += 0.5
                                attn[int(splits_begin[i] + splits_begin_step):
                                     int(splits_begin[i] + splits_begin_step * 2), i * 3 + 1] += 0.5
                                attn[int(splits_begin[i] + splits_begin_step * 2):
                                     int(splits_begin[i + 1]), i * 3 + 2] += 0.5
                                attn[int(splits_end[i]):
                                     int(splits_end[i] + splits_end_step), i * 3] += 0.5
                                attn[int(splits_end[i] + splits_end_step):
                                     int(splits_end[i] + splits_end_step * 2), i * 3 + 1] += 0.5
                                attn[int(splits_end[i] + splits_end_step * 2):
                                     int(splits_end[i + 1]), i * 3 + 2] += 0.5
                        elif self.hparams.add_sil == 2:
                            for i in range(text.shape[0] // 4):
                                splits_begin_step = (splits_begin[i + 1] - splits_begin[i] - 3) / 3
                                splits_end_step = (splits_end[i + 1] - splits_end[i] - 3) / 3
                                if self.hparams.attn_step_clip10:
                                    splits_begin_step = np.clip(splits_begin_step, 0, 10)
                                    splits_end_step = np.clip(splits_end_step, 0, 10)
                                attn[int(splits_begin[i]):
                                     int(splits_begin[i] + splits_begin_step), i * 4] += 0.5
                                attn[int(splits_begin[i] + splits_begin_step):
                                     int(splits_begin[i] + splits_begin_step * 2), i * 4 + 1] += 0.5
                                attn[int(splits_begin[i] + splits_begin_step * 2):
                                     int(splits_begin[i + 1]) - 3, i * 4 + 2] += 0.5
                                attn[int(splits_begin[i + 1]) - 3:
                                     int(splits_begin[i + 1]), i * 4 + 3] += 0.5
                                attn[int(splits_end[i]):
                                     int(splits_end[i] + splits_end_step), i * 4] += 0.5
                                attn[int(splits_end[i] + splits_end_step):
                                     int(splits_end[i] + splits_end_step * 2), i * 4 + 1] += 0.5
                                attn[int(splits_end[i] + splits_end_step * 2):
                                     int(splits_end[i + 1]) - 3, i * 4 + 2] += 0.5
                                attn[int(splits_end[i + 1]) - 3:
                                     int(splits_end[i + 1]), i * 4 + 3] += 0.5
                else:
                    for i in range(text.shape[0] // 3):
                        attn[int(splits_begin[i]):int(splits_begin[i + 1]),
                             i * 3:(i + 1) * 3] = 1
                        attn[int(splits_end[i]):int(splits_end[i + 1]),
                             i * 3:(i + 1) * 3] = 1
            attn = torch.from_numpy(attn)

        if self.hparams.use_wavenet:
            wav = torch.from_numpy(np.array(data['raw_wav']))
            max_time_steps = self.hparams.wavenet_max_time
            if wav.shape[0] > max_time_steps:
                max_time_frames = max_time_steps // audio.get_hop_size(self.hparams)
                start_cond_idx = torch.randint(mel.shape[1] - max_time_frames, [])
            else:
                start_cond_idx = 0
        else:
            wav = None
            start_cond_idx = None

        if self.hparams.linear_directly:
            mel = linear

        ret.append([text, mel, pitch, utt_ids, attn, linear, spk_id, wav, start_cond_idx])

    if not isinstance(indexs, list):
        return ret[0]
    else:
        return ret
model = torchvision.models.resnet152(pretrained=False).to(device)
# model = torch.load(MODEL).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
EPOCHS = 50

model.train()
for epoch in range(EPOCHS):
    print("Epoch =", epoch)
    i = 0
    cum_loss = 0
    total_train = 0
    correct_train = 0
    for data in trainset:
        X = data.get("image").to(device)
        y = data.get("solution").to(device)
        optimizer.zero_grad()
        output = model(X)
        loss = criterion(output, y.long())
        loss.backward()
        optimizer.step()
        i += BATCH_SIZE
        cum_loss += loss.item()  # .item() avoids keeping the autograd graph alive
        _, prediction = torch.max(output.data, 1)
        print(loss.item())
        print(i)
        total_train += y.nelement()
        correct_train += prediction.eq(y.data).sum().item()
    train_accuracy = 100 * correct_train / total_train
    print("Accuracy =", train_accuracy, "%")
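# The commented-out torch.load(MODEL) above implies a checkpoint exists; a minimal
# hedged sketch of the matching save step (the MODEL path is an assumption taken
# from that comment):
torch.save(model, MODEL)  # or torch.save(model.state_dict(), MODEL) for weights only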