Example #1
import numpy as np
import geohash  # python-geohash: decode_exactly -> (lat, lon, lat_err, lon_err)


def get_distance(result):
    # Collect every distinct geohash that appears as a start or end location.
    locs = list(
        set(result['geohashed_start_loc']) | set(result['geohashed_end_loc']))
    if np.nan in locs:
        locs.remove(np.nan)
    # Decode each geohash once and cache the coordinates.
    deloc = []
    for loc in locs:
        deloc.append(geohash.decode_exactly(loc))
    loc_dict = dict(zip(locs, deloc))
    geohashed_loc = result[['geohashed_start_loc', 'geohashed_end_loc']].values
    distance = []
    manhattan_distance = []
    for start, end in geohashed_loc:
        if start is not np.nan and end is not np.nan:
            lat1, lon1, _, _ = loc_dict[start]
            lat2, lon2, _, _ = loc_dict[end]
            distance.append(
                cal_distance(float(lat1), float(lon1), float(lat2),
                             float(lon2)))
            manhattan_distance.append(
                manhattan(float(lat1), float(lon1), float(lat2), float(lon2)))
        else:
            # Either endpoint is missing: no distance can be computed.
            distance.append(np.nan)
            manhattan_distance.append(np.nan)
    result.loc[:, 'distance'] = distance
    result.loc[:, 'manhattan'] = manhattan_distance
    return result
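cal_distance and manhattan are project helpers whose definitions are not shown on this page. A minimal sketch consistent with the (lat1, lon1, lat2, lon2) call order above, assuming haversine great-circle distance in metres:

import math

EARTH_RADIUS_M = 6371000.0  # mean Earth radius; an assumed constant


def cal_distance(lat1, lon1, lat2, lon2):
    # Haversine great-circle distance in metres between two lat/lon points.
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
    a = (math.sin((lat2 - lat1) / 2) ** 2 +
         math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * EARTH_RADIUS_M * math.asin(math.sqrt(a))


def manhattan(lat1, lon1, lat2, lon2):
    # L1 variant: the north-south leg plus the east-west leg.
    return (cal_distance(lat1, lon1, lat2, lon1) +
            cal_distance(lat2, lon1, lat2, lon2))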
Example #2
import os

import pandas as pd
from tqdm import tqdm


def find_similar_data():
    '''
    Similar data: orders whose destination port lies within a distance
    threshold of a test-set destination port.
    :return:
    '''
    # input_path = r'D:\baiph\BDC2020\data\train\data\train-clean'
    # out_path = r'D:\baiph\BDC2020\data\train\data\similar\train'
    input_path = r'D:\baiph\BDC2020\data\train\cleans\clean2'
    out_path = r'D:\baiph\BDC2020\data\train\cleans\silimar2'
    files = os.listdir(input_path)

    test_path = r'D:\baiph\BDC2020\data\test\testA.csv'
    test_data = pd.read_csv(test_path).groupby('loadingOrder')

    # Destination port (longitude, latitude) of every test order.
    test_global_ports = []
    for key, group in test_data:
        test_global_ports.append(group[['end_longitude',
                                        'end_latitude']].values[0])

    threshold = 30000  # metres
    for file in tqdm(files):
        datas = pd.read_csv(os.path.join(input_path,
                                         file)).groupby('loadingOrder')
        for key, data in datas:
            data.reset_index(drop=True, inplace=True)
            # The last GPS fix of a trajectory is taken as its destination.
            global_port = data[['longitude', 'latitude']].values[-1]
            for port in test_global_ports:
                if cal_distance(global_port[1], port[1], global_port[0],
                                port[0]) < threshold:
                    data.to_csv(os.path.join(out_path, key + '.csv'),
                                index=False)
                    break
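Note the argument order: this snippet (and Example #3 below, from the same project) calls cal_distance(lat1, lat2, lon1, lon2), unlike the (lat1, lon1, lat2, lon2) order in Example #1; the snippets come from different codebases. A compatible sketch, assuming the same haversine formula as above:

import math


def cal_distance(lat1, lat2, lon1, lon2):
    # Haversine distance in metres, with the argument order this project uses.
    lat1, lat2, lon1, lon2 = map(math.radians, (lat1, lat2, lon1, lon2))
    a = (math.sin((lat2 - lat1) / 2) ** 2 +
         math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2) ** 2)
    return 2 * 6371000.0 * math.asin(math.sqrt(a))  # assumed 6371 km radius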
Example #3
import os

import pandas as pd
from tqdm import tqdm


def clean_train_data():
    '''
    Clean the raw training data.
    :return:
    '''
    # train_gps_path = r'D:\baiph\BDC2020\data\train\train'
    # out_path = r'D:\baiph\BDC2020\data\train\data\train-clean'
    train_gps_path = r'D:\baiph\BDC2020\data\train\cleans\clean1'
    out_path = r'D:\baiph\BDC2020\data\train\cleans\clean1-train'
    mmsis = os.listdir(train_gps_path)
    threshold = 30000  # metres
    for mm in tqdm(mmsis):
        path = os.path.join(train_gps_path, mm)
        datas = pd.read_csv(path)
        datas['timestamp'] = pd.to_datetime(datas['timestamp'],
                                            infer_datetime_format=True)
        datas.sort_values(['loadingOrder', 'timestamp'], inplace=True)
        groups = datas.groupby('loadingOrder')

        result = []
        for key, group in groups:
            group = group.reset_index(drop=True)
            # Trim the departure phase: drop the leading rows where speed is 0.
            first_move = group['speed'].ne(0).idxmax()
            group = group.iloc[first_move:]
            group.reset_index(drop=True, inplace=True)

            # Trim the arrival phase: cut the trajectory at the first point
            # that is within the threshold of the final position and has zero
            # speed (i.e. the vessel has berthed).
            last_address = group[['longitude', 'latitude']].values[-1]
            address = group[['longitude', 'latitude']].values
            count = 0
            for i, add in enumerate(address):
                a = cal_distance(last_address[1], add[1], last_address[0],
                                 add[0])
                if a < threshold and group['speed'].iloc[i] == 0:
                    count = i
                    break
            if count != 0:
                group = group.iloc[0:count]
            group.reset_index(drop=True, inplace=True)
            result.append(group)
        if len(result) != 0:
            result = pd.concat(result, axis=0)
            result.to_csv(os.path.join(out_path, mm), index=False)
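The departure trim above relies on a pandas idiom: Series.ne(0) marks rows with non-zero speed, and idxmax() returns the index of the first True. A small worked example with made-up data:

import pandas as pd

group = pd.DataFrame({'speed': [0, 0, 3, 0, 5]})
first_move = group['speed'].ne(0).idxmax()  # -> 2, the first non-zero speed
print(group.iloc[first_move:])              # keeps rows 2..4

Note that if every speed is 0, idxmax() returns 0 and nothing is trimmed.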
Example #4
    def run_train(self, mixpoet, tool, optimizerRec, optimizerDis,
                  optimizerGen, logger, epoch):

        logger.set_start_time()

        for step in range(0, tool.train_batch_num):

            batch = tool.train_batches[step]
            # Batch layout: keys, poems, decoder inputs, labels, label mask,
            # and sequence lengths.
            batch_keys = batch[0].to(device)
            batch_poems = batch[1].to(device)
            batch_dec_inps = [dec_inp.to(device) for dec_inp in batch[2]]
            batch_labels = batch[3].to(device)
            batch_label_mask = batch[4].to(device)
            batch_lengths = batch[5].to(device)

            # train the classifier, recognition network and decoder
            rec_loss, cl_loss_w, cl_loss_xw, entro_loss, outs_post, clabels1, clabels2 = \
                self.run_rec_step(mixpoet, optimizerRec,
                    batch_keys, batch_poems, batch_dec_inps,
                    batch_labels, batch_label_mask, batch_lengths)

            logger.add_rec_losses(rec_loss, cl_loss_w, cl_loss_xw, entro_loss)
            logger.set_rate("learning_rate", optimizerRec.rate())

            # train discriminator
            if logger.total_steps > self.hps.rec_warm_steps:
                dis_loss = 0
                for i in range(0, self.hps.ndis):
                    step_dis_loss = self.run_dis_step(mixpoet, optimizerDis,
                                                      batch_keys, batch_poems,
                                                      batch_labels,
                                                      batch_label_mask)
                    dis_loss += step_dis_loss
                dis_loss /= self.hps.ndis
                logger.add_dis_loss(dis_loss)
                logger.set_rate('noise_weight',
                                self.noise_decay_tool.do_step())

            if logger.total_steps > self.hps.rec_warm_steps:
                # train prior and posterior generators
                adv_loss = self.run_adv_step(mixpoet, optimizerGen, batch_keys,
                                             batch_poems, batch_labels,
                                             batch_label_mask)

                logger.add_adv_loss(adv_loss)

            # temperature annealing
            mixpoet.set_tau(self.tau_decay_tool.do_step())
            logger.set_rate('temperature', self.tau_decay_tool.get_rate())

            if (step % 40 == 0
                    and logger.total_steps > self.hps.rec_warm_steps):
                dist = utils.cal_distance(mixpoet, batch_keys, batch_poems,
                                          batch_labels, batch_label_mask)
                if not np.isnan(dist):
                    logger.add_distance(dist)
                #------------
                fadist = utils.factor_distance(mixpoet, batch_keys,
                                               self.hps.n_class1,
                                               self.hps.n_class2, device)
                if not np.isnan(fadist):
                    logger.add_factor_distance(fadist)

            if step % self.hps.log_steps == 0:
                logger.set_end_time()
                outs_prior = self.gen_from_prior(mixpoet, batch_keys,
                                                 batch_poems, batch_dec_inps,
                                                 batch_labels,
                                                 batch_label_mask,
                                                 batch_lengths)

                utils.sample_mix(batch_keys, batch_dec_inps, batch_labels,
                                 clabels1, clabels2, outs_post, outs_prior,
                                 self.hps.sample_num, tool)
                logger.print_log()
                logger.draw_curves()
                logger.set_start_time()
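The noise_decay_tool and tau_decay_tool used above are project helpers exposing do_step() and get_rate(). A minimal annealing-schedule sketch matching that interface; the initial rate, decay factor, and floor are illustrative assumptions, not values from the source:

class ExpDecayTool(object):
    # Hypothetical stand-in: each do_step() multiplies the current rate by a
    # decay factor, clamped at a minimum value.
    def __init__(self, init_rate=1.0, decay=0.9999, min_rate=0.1):
        self.rate = init_rate
        self.decay = decay
        self.min_rate = min_rate

    def do_step(self):
        self.rate = max(self.rate * self.decay, self.min_rate)
        return self.rate

    def get_rate(self):
        return self.rate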
Example #5
import os
import random
import time

import cv2
import numpy as np
import torch
from PIL import Image
from torch.utils.data import DataLoader

# AverageMeter, FGSM, convert, cal_distance, adversary, Dataset_filename,
# and the normalisation constants mean / stdv are defined elsewhere in this
# project.


def test(model,
         test_loader,
         cost,
         print_freq=40,
         batch_num=None,
         denoise=None,
         random_layer=None):
    batch_time = AverageMeter()
    losses = AverageMeter()
    error = AverageMeter()  # error rate on adversarial images
    raw_error = AverageMeter()  # error rate on clean images
    cat_error = {}  # per-class accuracy on adversarial images
    raw_cat_error = {}  # per-class accuracy on clean images
    meanD = AverageMeter()

    end = time.time()
    model.eval()
    if denoise:
        denoise.eval()
    if random_layer:
        random_layer.eval()
    custdv, cumean = torch.tensor(stdv).cuda(), torch.tensor(mean).cuda()
    """
    print(mean,stdv)
    tran = transforms.Compose([
        transforms.Resize((299,299)),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=stdv),
        ])
    
    data = datasets.ImageFolder("/home/data/wanghao/tianchi/data/IJCAI_2019_AAAC_test",transform=tran)

    test_loader1=DataLoader(data, batch_size=args.batch_size, shuffle=True,num_workers=4)
    """
    # Valid pixel range after normalisation with the dataset mean / stdv.
    nx = np.array([[0., 0., 0.], [1., 1., 1.]])
    x_min, x_max = (nx - mean) / stdv
    epss = [40 / 255]  # candidate attack strengths; one is sampled per batch
    for i, (clean_images, labels) in enumerate(test_loader):
        clean_images, labels = clean_images.cuda(), labels.cuda()
        adversarial_images = FGSM(clean_images,
                                  labels,
                                  model,
                                  torch.nn.functional.cross_entropy,
                                  eps=random.choice(epss),
                                  x_min=x_min,
                                  x_max=x_max)

        if i % print_freq == 0:
            # Periodically dump a clean / adversarial image pair to disk.
            ims1, clean_imgs = convert(adversarial_images.detach(), cumean,
                                       custdv, clean_images.cuda())
            for im, imc in zip(ims1, clean_imgs):
                im = np.rint(im * 255.)
                imc = np.rint(imc * 255.)
                dd = cal_distance(im, imc)  # clean-vs-adversarial distance
                im = np.rint(im).astype(np.uint8)
                imc = np.rint(imc).astype(np.uint8)
                cv2.imwrite(
                    "./output" + "/" + str(i // print_freq % print_freq) +
                    "ad.jpg", im)
                cv2.imwrite(
                    "./output" + "/" + str(i // print_freq % print_freq) +
                    "test.jpg", imc)

        if denoise:
            d_adversarial_images = denoise(adversarial_images)
            d_clean_images = denoise(clean_images)
        else:
            # No denoiser: evaluate the images as-is.
            d_adversarial_images = adversarial_images
            d_clean_images = clean_images

        if i % print_freq == 0:
            # Dump the denoised pair as well, for visual comparison.
            ims1, clean_imgs = convert(d_adversarial_images.detach(), cumean,
                                       custdv, clean_images.cuda())
            for im, imc in zip(ims1, clean_imgs):
                im = im * 255.
                imc = imc * 255.
                dd = cal_distance(im, imc)  # distance after denoising
                im = np.rint(im).astype(np.uint8)
                imc = np.rint(imc).astype(np.uint8)
                cv2.imwrite(
                    "./output" + "/" + str(i // print_freq % print_freq) +
                    "adrec.jpg", im)
                cv2.imwrite(
                    "./output" + "/" + str(i // print_freq % print_freq) +
                    "testrec.jpg", imc)

        if random_layer:
            d_adversarial_images = random_layer(d_adversarial_images)

        outputs = model(d_adversarial_images)
        outputs_clean = model(d_clean_images)
        loss = cost(outputs, labels)

        batch_size = labels.size(0)
        outputs = outputs.max(1)[1]
        outputs_clean = outputs_clean.max(1)[1]

        # Track per-class accuracy for clean and adversarial predictions.
        for j in range(batch_size):
            real_cat = labels[j].item()
            raw_cat = outputs_clean[j].item()
            ad_cat = outputs[j].item()
            if real_cat not in cat_error.keys():
                cat_error[real_cat] = AverageMeter()
            cat_error[real_cat].update(1.0 if real_cat == ad_cat else 0.0)
            if real_cat not in raw_cat_error.keys():
                raw_cat_error[real_cat] = AverageMeter()
            raw_cat_error[real_cat].update(1.0 if real_cat == raw_cat else 0.0)
        error.update(
            torch.ne(outputs.cpu(), labels.cpu()).float().sum().item() /
            batch_size, batch_size)
        raw_error.update(
            torch.ne(outputs_clean.cpu(), labels.cpu()).float().sum().item() /
            batch_size, batch_size)
        losses.update(loss.item(), batch_size)

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            res = '\t'.join([
                'test',
                'Iter: [%d/%d]' % (i + 1, batch_num),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'Loss %.4f (%.4f)' % (losses.val, losses.avg),
                'Error %.4f (%.4f)' % (error.val, error.avg),
                'raw_error %.4f (%.4f)' % (raw_error.val, raw_error.avg),
                'meanD %.4f (%.4f)' % (meanD.val, meanD.avg)
            ])
            print(res)
    return batch_time.avg, losses.avg, error.avg, meanD.avg, raw_cat_error, cat_error
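In this project cal_distance compares two decoded image arrays rather than coordinates. Its definition is not shown here; a plausible sketch, assuming it reports a root-mean-square per-pixel difference:

import numpy as np


def cal_distance(im1, im2):
    # Root-mean-square per-pixel difference between two H x W x C arrays.
    diff = im1.astype(np.float64) - im2.astype(np.float64)
    return float(np.sqrt(np.mean(diff ** 2)))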

def generate_adversarial_examples(model,
                                  data_dir,
                                  adversarial_method,
                                  save_dir,
                                  batch_size=16,
                                  true_label=True,
                                  pho_size=299,
                                  eps=0.03,
                                  iteration=None):
    batch_time = AverageMeter()
    meanD = AverageMeter()
    end = time.time()
    data = Dataset_filename(data_dir,
                            w=pho_size,
                            h=pho_size,
                            mean=mean,
                            stdv=stdv,
                            need_filename=True)
    dataLoader = DataLoader(data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=4)
    custdv, cumean = torch.tensor(stdv).cuda(), torch.tensor(mean).cuda()
    for i, (inputs, labels, filenames) in enumerate(dataLoader):
        if torch.cuda.is_available():
            inputs = inputs.cuda()
            labels = labels.cuda()
        model.eval()
        # Attack target: the true label if available, otherwise the model's
        # most likely class (fgsm / i-fgsm) or least likely class (step-ll).
        if true_label:
            output = labels
        elif adversarial_method in ("fgsm", "i-fgsm"):
            output = model(inputs).max(1)[1]
        else:
            output = model(inputs).min(1)[1]
        output = output.detach()
        if adversarial_method == "fgsm":
            adv_input = FGSM(inputs,
                             output,
                             model,
                             torch.nn.functional.cross_entropy,
                             eps=eps)
        elif adversarial_method == "step-ll":
            adv_input = adversary.step_ll(inputs,
                                          output,
                                          model,
                                          torch.nn.functional.cross_entropy,
                                          eps=eps)
        elif adversarial_method == "i-fgsm":
            nx = np.array([[0., 0., 0.], [1., 1., 1.]])
            x_min, x_max = (nx - mean) / stdv
            adv_input = adversary.i_fgsm(inputs,
                                         output,
                                         model,
                                         torch.nn.functional.cross_entropy,
                                         targeted=False,
                                         eps=eps,
                                         alpha=1.0 / 255 / 0.3,
                                         iteration=iteration,
                                         x_val_min=x_min,
                                         x_val_max=x_max)

        adv_input = adv_input.data.cuda()

        if i % 40 == 0:
            # Also recover the clean inputs so perturbation size can be logged.
            adv_input, inputs = convert(adv_input, cumean, custdv, inputs)
            adv_input = np.rint(adv_input).astype(np.uint8)
            inputs = np.rint(inputs).astype(np.uint8)
        else:
            adv_input = convert(adv_input, cumean, custdv)
            adv_input = np.rint(adv_input).astype(np.uint8)

        for idx, filename in enumerate(filenames):
            # Save each adversarial image under a per-label sub-directory.
            save_path = os.path.join(save_dir, str(labels[idx].item()))
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            im = Image.fromarray(adv_input[idx])
            im.save(os.path.join(save_path, filename))
            if i % 40 == 0:
                meanD.update(
                    cal_distance(adv_input[idx].astype(float),
                                 inputs[idx].astype(float)))

        batch_time.update(time.time() - end)
        end = time.time()
        if i % 40 == 0:
            res = '\t'.join([
                'generate',
                'Iter: [%d/%d]' % (i + 1, len(dataLoader)),
                'Time %.3f (%.3f)' % (batch_time.val, batch_time.avg),
                'meanD %.3f (%.3f)' % (meanD.val, meanD.avg),
            ])
            print(res)
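A hypothetical invocation of the generator above; the model, directories, and hyper-parameters are placeholders, not values from the source:

import torchvision

# Any ImageNet-style classifier works; resnet50 is just an example.
model = torchvision.models.resnet50(pretrained=True).cuda()
generate_adversarial_examples(model,
                              data_dir='./data/clean',  # placeholder path
                              adversarial_method='i-fgsm',
                              save_dir='./data/adv',  # placeholder path
                              eps=8 / 255,
                              iteration=10)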