def get_distance(result):
    # Collect every distinct geohash that appears as a start or end location.
    locs = list(set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
    # Drop missing values before decoding (NaN cannot be decoded; identity
    # checks against np.nan are fragile, so use pandas' null test instead).
    locs = [loc for loc in locs if pd.notnull(loc)]
    deloc = []
    for loc in locs:
        deloc.append(geohash.decode_exactly(loc))
    loc_dict = dict(zip(locs, deloc))
    geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values
    distance = []
    manhattan_distance = []
    for i in geohashed_loc:
        if pd.notnull(i[0]) and pd.notnull(i[1]):
            lat1, lon1, _, _ = loc_dict[i[0]]
            lat2, lon2, _, _ = loc_dict[i[1]]
            distance.append(cal_distance(float(lat1), float(lon1), float(lat2), float(lon2)))
            manhattan_distance.append(manhattan(float(lat1), float(lon1), float(lat2), float(lon2)))
        else:
            distance.append(np.nan)
            manhattan_distance.append(np.nan)
    result.loc[:, 'distance'] = distance
    result.loc[:, 'manhattan'] = manhattan_distance
    return result
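# get_distance() relies on cal_distance() and manhattan(), which are defined
# elsewhere in this project. A minimal sketch of what they might look like,
# assuming cal_distance is a haversine great-circle distance in meters and
# manhattan sums the two axis-aligned haversine legs, with the (lat1, lon1,
# lat2, lon2) argument order used by get_distance() above. Both are
# assumptions, not the project's actual implementation.
from math import radians, sin, cos, asin, sqrt

def cal_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in meters between two lat/lon points."""
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    return 2 * 6371000 * asin(sqrt(a))

def manhattan(lat1, lon1, lat2, lon2):
    """Manhattan-style distance: north-south leg plus east-west leg."""
    return cal_distance(lat1, lon1, lat2, lon1) + cal_distance(lat2, lon1, lat2, lon2)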
def find_similar_data():
    '''
    Similar data: training orders whose destination port lies within a
    distance threshold of a test-set destination port.
    :return:
    '''
    # input_path = r'D:\baiph\BDC2020\data\train\data\train-clean'
    # out_path = r'D:\baiph\BDC2020\data\train\data\similar\train'
    input_path = r'D:\baiph\BDC2020\data\train\cleans\clean2'
    out_path = r'D:\baiph\BDC2020\data\train\cleans\silimar2'
    files = os.listdir(input_path)
    test_path = r'D:\baiph\BDC2020\data\test/testA.csv'
    test_data = pd.read_csv(test_path).groupby('loadingOrder')
    # One destination port (longitude, latitude) per test loading order.
    test_dest_ports = []
    for key, group in test_data:
        test_dest_ports.append(group[['end_longitude', 'end_latitude']].values[0])
    threshold = 30000  # distance threshold to a test destination port
    for file in tqdm(files):
        datas = pd.read_csv(os.path.join(input_path, file)).groupby('loadingOrder')
        for key, data in datas:
            data.reset_index(drop=True, inplace=True)
            # The last GPS point of the order is taken as its destination port.
            dest_port = data[['longitude', 'latitude']].values[-1]
            for port in test_dest_ports:
                if cal_distance(dest_port[1], port[1], dest_port[0], port[0]) < threshold:
                    data.to_csv(os.path.join(out_path, key + '.csv'), index=None)
                    break
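# find_similar_data() writes one CSV per kept loadingOrder into out_path.
# A minimal sketch of loading those filtered orders back into a single frame,
# assuming a downstream feature-building step (not shown in this project):
import glob
import os
import pandas as pd

similar_dir = r'D:\baiph\BDC2020\data\train\cleans\silimar2'  # out_path used above
similar_orders = pd.concat(
    (pd.read_csv(f) for f in glob.glob(os.path.join(similar_dir, '*.csv'))),
    ignore_index=True)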
def clean_train_data():
    '''
    Clean the raw GPS training data.
    :return:
    '''
    # train_gps_path = r'D:\baiph\BDC2020\data\train\train'
    # out_path = r'D:\baiph\BDC2020\data\train\data\train-clean'
    train_gps_path = r'D:\baiph\BDC2020\data\train\cleans\clean1'
    out_path = r'D:\baiph\BDC2020\data\train\cleans\clean1-train'
    mmsis = os.listdir(train_gps_path)
    threshold = 30000  # distance threshold to the final recorded position
    for mm in tqdm(mmsis):
        path = os.path.join(train_gps_path, mm)
        datas = pd.read_csv(path)
        datas['timestamp'] = pd.to_datetime(datas['timestamp'], infer_datetime_format=True)
        datas.sort_values(['loadingOrder', 'timestamp'], inplace=True)
        groups = datas.groupby('loadingOrder')
        result = []
        for key, group in groups:
            group = group.reset_index(drop=True)
            # Clean the departure phase: drop leading records where speed is 0.
            first_moving = group['speed'].ne(0).idxmax()
            group = group.iloc[first_moving:]
            group.reset_index(drop=True, inplace=True)
            # Clean the arrival phase: cut at the first point that is both
            # within the threshold of the final position and has speed 0.
            last_address = group[['longitude', 'latitude']].values[-1]
            address = group[['longitude', 'latitude']].values
            count = 0
            for i, add in enumerate(address):
                a = cal_distance(last_address[1], add[1], last_address[0], add[0])
                if a < threshold and group['speed'].iloc[i] == 0:
                    count = i
                    break
            if count != 0:
                group = group.iloc[0:count]
                group.reset_index(drop=True, inplace=True)
            result.append(group)
        if len(result) != 0:
            result = pd.concat(result, axis=0)
            result.to_csv(os.path.join(out_path, mm), index=None)
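# A minimal driver sketch for the two preprocessing steps above, assuming the
# hard-coded input/output paths are adjusted so the cleaning output feeds the
# similarity filter. This __main__ block is an illustration, not part of the
# original project.
if __name__ == '__main__':
    clean_train_data()    # drop pre-departure and post-arrival GPS records
    find_similar_data()   # keep orders whose destination is near a test port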
def run_train(self, mixpoet, tool, optimizerRec, optimizerDis, optimizerGen, logger, epoch):
    logger.set_start_time()
    for step in range(0, tool.train_batch_num):
        batch = tool.train_batches[step]
        batch_keys = batch[0].to(device)
        batch_poems = batch[1].to(device)
        batch_dec_inps = [dec_inp.to(device) for dec_inp in batch[2]]
        batch_labels = batch[3].to(device)
        batch_label_mask = batch[4].to(device)
        batch_lengths = batch[5].to(device)

        # train the classifier, recognition network and decoder
        rec_loss, cl_loss_w, cl_loss_xw, entro_loss, outs_post, clabels1, clabels2 = \
            self.run_rec_step(mixpoet, optimizerRec, batch_keys, batch_poems,
                              batch_dec_inps, batch_labels, batch_label_mask, batch_lengths)
        logger.add_rec_losses(rec_loss, cl_loss_w, cl_loss_xw, entro_loss)
        logger.set_rate("learning_rate", optimizerRec.rate())

        # train discriminator
        if logger.total_steps > self.hps.rec_warm_steps:
            dis_loss = 0
            for i in range(0, self.hps.ndis):
                step_dis_loss = self.run_dis_step(mixpoet, optimizerDis, batch_keys,
                                                  batch_poems, batch_labels, batch_label_mask)
                dis_loss += step_dis_loss
            dis_loss /= self.hps.ndis
            logger.add_dis_loss(dis_loss)
            logger.set_rate('noise_weight', self.noise_decay_tool.do_step())

        if logger.total_steps > self.hps.rec_warm_steps:
            # train prior and posterior generators
            adv_loss = self.run_adv_step(mixpoet, optimizerGen, batch_keys,
                                         batch_poems, batch_labels, batch_label_mask)
            logger.add_adv_loss(adv_loss)

        # temperature annealing
        mixpoet.set_tau(self.tau_decay_tool.do_step())
        logger.set_rate('temperature', self.tau_decay_tool.get_rate())

        if (step % 40 == 0) and (logger.total_steps > self.hps.rec_warm_steps):
            dist = utils.cal_distance(mixpoet, batch_keys, batch_poems,
                                      batch_labels, batch_label_mask)
            if not np.isnan(dist):
                logger.add_distance(dist)

            fadist = utils.factor_distance(mixpoet, batch_keys, self.hps.n_class1,
                                           self.hps.n_class2, device)
            if not np.isnan(fadist):
                logger.add_factor_distance(fadist)

        if step % self.hps.log_steps == 0:
            logger.set_end_time()
            outs_prior = self.gen_from_prior(mixpoet, batch_keys, batch_poems, batch_dec_inps,
                                             batch_labels, batch_label_mask, batch_lengths)
            utils.sample_mix(batch_keys, batch_dec_inps, batch_labels, clabels1, clabels2,
                             outs_post, outs_prior, self.hps.sample_num, tool)
            logger.print_log()
            logger.draw_curves()
            logger.set_start_time()
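# run_train() anneals the Gumbel-softmax temperature and the discriminator
# noise weight through decay-tool objects exposing do_step()/get_rate().
# A minimal sketch of such a tool, assuming simple exponential decay with a
# floor (ExpDecayTool is a hypothetical name; the project's actual
# noise_decay_tool / tau_decay_tool may use a different schedule):
class ExpDecayTool(object):
    def __init__(self, init_rate, decay, min_rate):
        self._rate = init_rate
        self._decay = decay
        self._min_rate = min_rate

    def do_step(self):
        # decay once per training step, never dropping below the floor
        self._rate = max(self._rate * self._decay, self._min_rate)
        return self._rate

    def get_rate(self):
        return self._rate

# e.g. tau_decay_tool = ExpDecayTool(init_rate=1.0, decay=0.9999, min_rate=0.01)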
def test(model, test_loader, cost, print_freq=40, batch_num=None, denoise=None, random_layer=None):
    batch_time = AverageMeter()
    losses = AverageMeter()
    error = AverageMeter()
    raw_error = AverageMeter()
    cat_error = {}
    raw_cat_error = {}
    meanD = AverageMeter()
    end = time.time()

    model.eval()
    if denoise:
        denoise.eval()
    if random_layer:
        random_layer.eval()

    custdv, cumean = torch.tensor(stdv).cuda(), torch.tensor(mean).cuda()
    """
    print(mean, stdv)
    tran = transforms.Compose([
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
    ])
    data = datasets.ImageFolder("/home/data/wanghao/tianchi/data/IJCAI_2019_AAAC_test", transform=tran)
    test_loader1 = DataLoader(data, batch_size=args.batch_size, shuffle=True, num_workers=4)
    """
    # Valid per-channel pixel range after normalization.
    nx = np.array([[0., 0., 0.], [1., 1., 1.]])
    x_min, x_max = (nx - mean) / stdv
    epss = [40 / 255]  # FGSM step sizes to sample from

    for i, (clean_images, labels) in enumerate(test_loader):
        clean_images, labels = clean_images.cuda(), labels.cuda()
        # Craft adversarial examples with FGSM at a randomly chosen epsilon.
        adversarial_images = FGSM(clean_images, labels, model,
                                  torch.nn.functional.cross_entropy,
                                  eps=epss[random.randint(0, len(epss) - 1)],
                                  x_min=x_min, x_max=x_max)

        if i % print_freq == 0:
            # Periodically dump a clean/adversarial image pair for inspection.
            ims1, clean_imgs = convert(adversarial_images.detach(), cumean, custdv, clean_images.cuda())
            for im, imc in zip(ims1, clean_imgs):
                im = np.rint(im * 255.)
                imc = np.rint(imc * 255.)
                dd = cal_distance(im, imc)
                im = np.rint(im).astype(np.uint8)
                imc = np.rint(imc).astype(np.uint8)
                cv2.imwrite("./output" + "/" + str(i // print_freq % print_freq) + "ad.jpg", im)
                cv2.imwrite("./output" + "/" + str(i // print_freq % print_freq) + "test.jpg", imc)

        if denoise:
            d_adversarial_images = denoise(adversarial_images)
            d_clean_images = denoise(clean_images)
            if i % print_freq == 0:
                # Dump the denoised adversarial image next to the clean one.
                ims1, clean_imgs = convert(d_adversarial_images.detach(), cumean, custdv, clean_images.cuda())
                for im, imc in zip(ims1, clean_imgs):
                    im = im * 255.
                    imc = imc * 255.
                    dd = cal_distance(im, imc)
                    im = np.rint(im).astype(np.uint8)
                    imc = np.rint(imc).astype(np.uint8)
                    cv2.imwrite("./output" + "/" + str(i // print_freq % print_freq) + "adrec.jpg", im)
                    cv2.imwrite("./output" + "/" + str(i // print_freq % print_freq) + "testrec.jpg", imc)
        else:
            # Without a denoiser, evaluate the model on the raw images directly.
            d_adversarial_images = adversarial_images
            d_clean_images = clean_images

        if random_layer:
            d_adversarial_images = random_layer(d_adversarial_images)

        outputs = model(d_adversarial_images)
        outputs_clean = model(d_clean_images)
        loss = cost(outputs, labels)
        batch_size = labels.size(0)
        outputs = outputs.max(1)[1]
        outputs_clean = outputs_clean.max(1)[1]

        # Per-class accuracy on adversarial and clean images.
        for j in range(batch_size):
            real_cat = labels[j].item()
            raw_cat = outputs_clean[j].item()
            ad_cat = outputs[j].item()
            if real_cat not in cat_error.keys():
                cat_error[real_cat] = AverageMeter()
            cat_error[real_cat].update(1.0 if real_cat == ad_cat else 0.0)
            if real_cat not in raw_cat_error.keys():
                raw_cat_error[real_cat] = AverageMeter()
            raw_cat_error[real_cat].update(1.0 if real_cat == raw_cat else 0.0)

        error.update(torch.ne(outputs.cpu(), labels.cpu()).float().sum().item() / batch_size, batch_size)
        raw_error.update(torch.ne(outputs_clean.cpu(), labels.cpu()).float().sum().item() / batch_size, batch_size)
        losses.update(loss.item(), batch_size)
        # meanD.update(where((outputs == labels).cpu(), diffs.float(), 0.).mean().item(), batch_size)

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            res = '\t'.join([
                'test',
                'Iter: [%d/%d]' % (i + 1, batch_num),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                'Error %.4f (%.4f)' % (error.val, error.avg),
                'raw_error %.4f (%.4f)' % (raw_error.val, raw_error.avg),
                'meanD %.4f (%.4f)' % (meanD.val, meanD.avg)
            ])
            print(res)

    return batch_time.avg, losses.avg, error.avg, meanD.avg, raw_cat_error, cat_error
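# test() and generate_adversial_examles() call an FGSM() helper defined
# elsewhere in this repo. A minimal sketch of a matching implementation,
# assuming the standard one-step fast gradient sign method with optional
# per-channel clipping to the normalized pixel range. The signature mirrors
# how it is called above; the project's real helper may differ.
import torch

def FGSM(x, y, model, loss_fn, eps, x_min=None, x_max=None):
    """One-step fast gradient sign method: x + eps * sign(dL/dx)."""
    x_adv = x.clone().detach().requires_grad_(True)
    loss = loss_fn(model(x_adv), y)
    loss.backward()
    x_adv = x_adv + eps * x_adv.grad.sign()
    if x_min is not None and x_max is not None:
        # clip each channel back into the valid normalized pixel range
        lo = torch.as_tensor(x_min, dtype=x.dtype, device=x.device).view(1, -1, 1, 1)
        hi = torch.as_tensor(x_max, dtype=x.dtype, device=x.device).view(1, -1, 1, 1)
        x_adv = torch.max(torch.min(x_adv, hi), lo)
    return x_adv.detach()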
def generate_adversial_examles(model, data_dir, adversarial_method, save_dir, batch_size=16,
                               true_label=True, pho_size=299, eps=0.03, iteration=None):
    batch_time = AverageMeter()
    meanD = AverageMeter()
    end = time.time()

    data = Dataset_filename(data_dir, w=pho_size, h=pho_size, mean=mean, stdv=stdv, need_filename=True)
    dataLoader = DataLoader(data, batch_size=batch_size, shuffle=True, num_workers=4)
    custdv, cumean = torch.tensor(stdv).cuda(), torch.tensor(mean).cuda()

    for i, (inputs, labels, filenames) in enumerate(dataLoader):
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
        model.eval()

        # Labels to attack: the true labels, the model's predicted labels for
        # (i-)FGSM, or the least-likely class for step-ll.
        if true_label:
            output = labels
        elif adversarial_method == "fgsm" or adversarial_method == "i-fgsm":
            output = model(inputs).max(1)[1]
        else:
            output = model(inputs).min(1)[1]
        output = output.detach()

        if adversarial_method == "fgsm":
            adv_input = FGSM(inputs, output, model, torch.nn.functional.cross_entropy, eps=eps)
        elif adversarial_method == "step-ll":
            adv_input = adversary.step_ll(inputs, output, model, torch.nn.functional.cross_entropy, eps=eps)
        elif adversarial_method == "i-fgsm":
            # Valid per-channel pixel range after normalization.
            nx = np.array([[0., 0., 0.], [1., 1., 1.]])
            x_min, x_max = (nx - mean) / stdv
            adv_input = adversary.i_fgsm(inputs, output, model, torch.nn.functional.cross_entropy,
                                         targeted=False, eps=eps, alpha=1.0 / 255 / 0.3,
                                         iteration=iteration, x_val_min=x_min, x_val_max=x_max)
        adv_input = adv_input.data.cuda()

        # De-normalize back to image space before saving.
        if i % 40 == 0:
            adv_input, inputs = convert(adv_input, cumean, custdv, inputs)
            adv_input = np.rint(adv_input).astype(np.uint8)
            inputs = np.rint(inputs).astype(np.uint8)
        else:
            adv_input = convert(adv_input, cumean, custdv)
            adv_input = np.rint(adv_input).astype(np.uint8)

        for idx, filename in enumerate(filenames):
            # Save each adversarial image under a sub-folder named by its label.
            save_path = os.path.join(save_dir, str(labels[idx].item()))
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            im = Image.fromarray(adv_input[idx])
            im.save(os.path.join(save_path, filename))
            if i % 40 == 0:
                meanD.update(cal_distance(adv_input[idx].astype(float), inputs[idx].astype(float)))

        batch_time.update(time.time() - end)
        end = time.time()
        if i % 40 == 0:
            res = '\t'.join([
                'generate',
                'Iter: [%d/%d]' % (i + 1, len(dataLoader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'meanD %.3f (%.3f)' % (meanD.val, meanD.avg),
            ])
            print(res)
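# A minimal usage sketch for generate_adversial_examles(), assuming a CUDA
# machine and hypothetical directory names (./clean_dir and ./adv_dir are
# placeholders, and the torchvision ResNet-50 stands in for whatever
# classifier this project actually attacks):
import torchvision

model = torchvision.models.resnet50().cuda()  # weight loading omitted in this sketch
generate_adversial_examles(model,
                           data_dir='./clean_dir',      # hypothetical input folder
                           adversarial_method='fgsm',   # or 'step-ll' / 'i-fgsm'
                           save_dir='./adv_dir',        # hypothetical output folder
                           batch_size=16,
                           eps=0.03)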