Example #1
    def prepare_data(self):
        self.dataset_class = DataLoader(self.train_path, self.test_path)

        self.train_data_df, self.test_data_df, self.n_users, self.n_items = self.dataset_class.load_file_as_dataFrame()
        self.train_matrix, _ = self.dataset_class.dataFrame_to_matrix(self.train_data_df, self.n_users, self.n_items)
        self.test_matrix, _ = self.dataset_class.dataFrame_to_matrix(self.test_data_df, self.n_users, self.n_items)
        pass
Example #2
def greedy_tag(to_pred, model_file, feature_map, out_name="greedy_pred"):
    out_file = open(out_name, "wt")

    model = pickle.load(open(model_file, "rb"))
    ftr_builders = [TransitionFtr(out_dim=LEN_FTR), EmmisionFtr(out_dim=LEN_FTR), SuffixPrefix(out_dim=LEN_FTR),
                    CombinationsWordsPos(out_dim=LEN_FTR), CostumeFtr()]
    dl = DataLoader(to_pred, feature_map, ftr_builders)

    all_count = 0
    true_count = 0
    len_data = len(dl)
    for j, (all_pos, all_words) in enumerate(dl.data):
        if (100 * j / len_data) % 10 == 0:
            print(str((100 * j / len_data)) + "%")
        prev_pos = [START, START]
        for i, (word, pos) in enumerate(zip(all_words, all_pos)):
            curr_pred = model.predict(dl.to_sparse(all_words, prev_pos, i))
            prev_pos.append(pos)
            all_count += 1
            curr_pred_label = dl.idx_to_label(int(curr_pred[0]))
            out_file.write(word + "/" + curr_pred_label + " ")
            true_count += 1 if pos == curr_pred_label else 0
            # print(word, pos, dl.idx_to_label(int(curr_pred[0])))
        out_file.write("\n")
    out_file.close()
    print(all_count, true_count, "\t~" + str(int(100*true_count/all_count)) + "%")
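A minimal sketch of how greedy_tag might be invoked; the three file paths below are placeholders, not files from the original project, and the module-level constants (LEN_FTR, START) are assumed to be defined alongside the function.

if __name__ == "__main__":
    # placeholder paths; substitute the project's actual test file, pickled model and feature map
    greedy_tag("data/test.wtag", "model.pkl", "feature_map.pkl",
               out_name="greedy_pred")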
Example #3
 def prepare_data(self):
     self.path_train = './data/%s/%s_train.dat' % (self.data_set,
                                                   self.data_set)
     path_test = './data/%s/%s_test.dat' % (self.data_set, self.data_set)
     self.dataset_class = DataLoader(self.path_train, path_test)
     self.train_data_df, _, self.n_users, self.n_items = self.dataset_class.load_file_as_dataFrame(
     )
Example #4
 def __init__(self,
              exp_counter,
              low_freq=0.1,
              hi_freq=3,
              pick_channels=['Cz'],
              signal_tmin=-3,
              signal_tmax=5,
              noise_tmin=3,
              noise_tmax=11,
              generate_report=False):
     self.exp_counter = exp_counter
     self.pick_channels = pick_channels
     self.data_loader = DataLoader(exp_counter=self.exp_counter)
     self.data_loader.init_task_dependent_variables()
     self.data_loader.load_data()
     self.exp_name = self.data_loader.exp_name
     self.channel_dict = self.data_loader.channel_dict
     self.fs = self.data_loader.fs
     self.low_freq = low_freq
     self.hi_freq = hi_freq
     self.signal_tmin = signal_tmin
     self.signal_tmax = signal_tmax
     self.noise_tmin = noise_tmin
     self.noise_tmax = noise_tmax
     self.report = mne.Report(verbose=True)
     self.generate_report = generate_report
Example #5
def train(model, model_name):

    loader = DataLoader()
    pretrain_data, pretrain_labels, pretrain_names = loader.load_pretrain_datasets(
    )

    # pretrain model
    model.fit(pretrain_data,
              pretrain_labels,
              batch_size=BATCH_SIZE,
              epochs=PRETRAIN_EPOCHS)

    deep_utils.create_directory("../models")
    model_filename = "../models/pretrained_" + model_name + ".h5"
    model.save(model_filename)

    train_data, train_labels, train_names = loader.load_train_datasets()
    test_data, test_labels, test_names = loader.load_test_datasets()

    # train model
    model.fit(train_data,
              train_labels,
              validation_data=(test_data, test_labels),
              batch_size=BATCH_SIZE,
              epochs=TRAIN_EPOCHS)

    deep_utils.create_directory("../models")
    model_filename = "../models/fine_tuned_" + model_name + ".h5"
    model.save(model_filename)

    # evaluate model
    scores = model.evaluate(test_data, test_labels, verbose=1)
    return scores
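A hedged sketch of how this two-stage train() routine could be driven; build_model, BATCH_SIZE, PRETRAIN_EPOCHS and TRAIN_EPOCHS are assumptions about the surrounding module, not part of the example.

# model = build_model()                      # any compiled Keras model (assumed helper)
# scores = train(model, model_name="baseline_cnn")
# print("held-out evaluation:", scores)      # loss and metrics from model.evaluate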
Example #6
def best_threshold(model):
    def metrics(Y):
        positive = sum([y['target'] for y in Y])

        thresholds = [0.5, 0.45, 0.4, 0.35, 0.3, 0.25]
        index = 0
        right, wrong = 0, 0
        existed_edges = {ids: test[ids]['source_edges'] for ids in test.ids}
        id2node = {
            node['osmid']: node
            for ids in test.ids for node in test[ids]['nodes']
        }
        best_f1, best_th = 0, 0
        for _, th in enumerate(thresholds):
            for i in range(index, len(Y)):
                if Y[i]['score'] < math.log(th):
                    index = i
                    break
                if is_valid({
                        'start': Y[i]['start'],
                        'end': Y[i]['end']
                }, existed_edges[Y[i]['id']], id2node):
                    existed_edges[Y[i]['id']].append({
                        'start': Y[i]['start'],
                        'end': Y[i]['end']
                    })
                    if Y[i]['target'] == 1:
                        right += 1
                    else:
                        wrong += 1
            p = 1.0 * right / (right + wrong + 1e-9)
            r = 1.0 * right / positive
            f1 = 2 * p * r / (p + r + 1e-9)
            if best_f1 < f1:
                best_f1 = f1
                best_th = th
                print(p, r, best_f1, best_th)
        return best_f1, best_th

    test = DataLoader(
        'E:/python-workspace/CityRoadPrediction/data_20200610/test/')
    test.load_all_datas()
    result = load_model_result(model.lower(), data_dir)
    y = []
    for city in result:
        for index, v in result[city].items():
            for sample in v:
                y.append({
                    'id': index,
                    'start': sample['start'],
                    'end': sample['end'],
                    'score': sample['score'],
                    'target': int(sample['target'])
                })
    del result
    y = sorted(y, key=lambda e: e['score'], reverse=True)
    f1, th = metrics(y)
    print(f1, th)
Example #7
def main(args):
    set_gpu_growth()
    dataset = args.dataset  # 'A' or 'B'
    cfg.init_path(dataset)  # initialize path names
    print(cfg.WEIGHT_PATH)
    # load the data generators
    train_data_gen = DataLoader(cfg.TRAIN_PATH,
                                cfg.TRAIN_GT_PATH,
                                batch_size=cfg.TRAIN_BATCH_SIZE,
                                shuffle=True,
                                gt_downsample=True,
                                mean=cfg.MEAN,
                                std=cfg.STD)
    val_data_gen = DataLoader(cfg.VAL_PATH,
                              cfg.VAL_GT_PATH,
                              batch_size=cfg.VAL_BATCH_SIZE,
                              shuffle=False,
                              gt_downsample=True,
                              mean=cfg.MEAN,
                              std=cfg.STD)

    # define the model
    input_shape = (None, None, 1)
    model = MCNN(input_shape)
    adam = Adam(lr=1e-4)
    model.compile(loss='mse', optimizer=adam, metrics=[mae, mse])
    # load pre-trained weights (if provided)
    if args.weight_path is not None:
        model.load_weights(args.weight_path, by_name=True)

    # define callbacks
    checkpoint = ModelCheckpoint(filepath=cfg.WEIGHT_PATH,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=False,
                                 save_weights_only=True,
                                 mode='min',
                                 period=5)
    callback_list = [checkpoint]

    # train
    print('Training Part_{} ...'.format(dataset))
    model.fit_generator(train_data_gen,
                        validation_data=val_data_gen,
                        epochs=cfg.EPOCHS,
                        initial_epoch=args.init_epoch,
                        callbacks=callback_list,
                        use_multiprocessing=True,
                        workers=4,
                        verbose=1)
    model.save(cfg.WEIGHT_PATH)
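A possible command-line entry point for this training main(); the flag names simply mirror the attributes used above (dataset, weight_path, init_epoch) and are an assumption, not the project's actual CLI.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset", choices=["A", "B"], default="A",
                        help="dataset part to train on")
    parser.add_argument("--weight_path", default=None,
                        help="optional weights to resume from")
    parser.add_argument("--init_epoch", type=int, default=0)
    main(parser.parse_args())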
Example #8
    def execute(self):
        # self.data_path_clean = './data/ml100k/ml100k_train.dat'
        # self.data_path_attacked = './results/data_attacked/ml100k/ml100k_AUSH_0.data'
        #
        path_test = self.data_path_clean.replace('train', 'test')
        # load real profile matrix
        dataset_class_real = DataLoader(self.data_path_clean, path_test)
        train_data_df_real, _, n_users_real, n_items_real = dataset_class_real.load_file_as_dataFrame()
        train_matrix_real, _ = dataset_class_real.dataFrame_to_matrix(train_data_df_real, n_users_real, n_items_real)
        train_matrix_real = train_matrix_real.toarray()

        # load fake profile matrix
        dataset_class_attacked = DataLoader(self.data_path_attacked, path_test)
        train_data_df_attacked, _, n_users_attacked, n_items_attacked = dataset_class_attacked.load_file_as_dataFrame()
        train_matrix_attacked, _ = dataset_class_attacked.dataFrame_to_matrix(train_data_df_attacked, n_users_attacked,
                                                                              n_items_attacked)
        train_matrix_fake = train_matrix_attacked.toarray()[n_users_real:, :]

        # calculate item distributions
        real_item_distribution = self.get_item_distribution(train_matrix_real)
        fake_item_distribution = self.get_item_distribution(train_matrix_fake)
        #
        TVD_distance = self.get_TVD_distance(real_item_distribution, fake_item_distribution)
        JS_distance = self.get_JS_distance(real_item_distribution, fake_item_distribution)
        #
        res_str = 'TVD:%.4f\tJS:%.4f' % (TVD_distance, JS_distance)
        print('result begin', res_str, 'result end')
        return TVD_distance, JS_distance
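The distribution helpers are not shown in this snippet; assuming both distributions are normalized 1-D numpy arrays, the two distances could be computed with a sketch like the following (not the project's actual implementation).

import numpy as np
from scipy.spatial.distance import jensenshannon

def tvd_distance(p, q):
    # total variation distance: half the L1 difference between the distributions
    return 0.5 * np.abs(np.asarray(p) - np.asarray(q)).sum()

def js_distance(p, q):
    # Jensen-Shannon distance (square root of the JS divergence), as returned by scipy
    return jensenshannon(p, q)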
Example #9
 def __init__(self, gen_network: Callable, dis_network: Callable,
              dataloader_args: Dict):
     tf.enable_eager_execution()
     self.data_loader = DataLoader(
         **dataloader_args)  # util to load dataset from dir
     self.dataset = self.data_loader.get_dataset()
     self.generator = gen_network()
     self.discriminator = dis_network(self.data_loader.output_shape, (1, ))
     noise = Input(shape=LATENT_SHAPE)
     gen_img = self.generator(noise)
     disc_output = self.discriminator(gen_img)
     self.combined = Model(noise, disc_output)
     self.combined.layers[2].trainable = False  # freeze the discriminator so the combined model trains only the generator
     self.fixed_noise = tf.random.normal([128] + list(LATENT_SHAPE))
Example #10
def main(args):
    dataset = args.dataset  # 'A' or 'B'
    output_dir = os.path.join(cfg.HM_GT_PATH, 'Part_{}'.format(dataset))

    for _dir in [cfg.HM_GT_PATH, output_dir]:
        if not os.path.exists(_dir):
            os.mkdir(_dir)

    test_path = cfg.TEST_PATH.format(dataset)
    test_gt_path = cfg.TEST_GT_PATH.format(dataset)
    # load data
    data_loader = DataLoader(test_path,
                             test_gt_path,
                             shuffle=False,
                             gt_downsample=True)
    # data_loader = ImageDataLoader(test_path, test_gt_path, shuffle=False, gt_downsample=True,pre_load=True)

    # create heatmaps
    print('Creating heatmaps for Part_{} ...'.format(dataset))
    for i, (img, den) in enumerate(data_loader):
        data = img
        gt = den
        img_name = data_loader.filename_list[i]
        gt = np.squeeze(gt)  # shape(1, h, w, 1) -> shape(h, w)
        save_heatmap(gt, data, img_name, output_dir, gt=True)
    print('All Done.')
Example #11
    def __init__(self,
                 path=None,
                 json_file=None,
                 yaml_file=None,
                 split=0.1,
                 nb_timesteps=6):
        self.split = split

        # Search json and yaml in path if path not None
        # else use directly json_file and yaml_file
        if path:
            path = path.rstrip('/')
            json_file = [file for file in glob("{}/*.json".format(path))]
            yaml_file = [file for file in glob("{}/*.yaml".format(path))]
            if len(json_file) != 1:
                print("No json or more than one file in the specified path.")
                exit(1)
            if len(yaml_file) != 1:
                print("No yaml or more than one file in the specified path.")
                exit(1)
            json_file = json_file[0]
            yaml_file = yaml_file[0]

        self.path = path
        self.json_file = json_file
        self.yaml_file = yaml_file

        self.data_formater = DataFormater(nb_timesteps=nb_timesteps)
        self.data_loader = DataLoader(json_file=json_file,
                                      yaml_file=yaml_file,
                                      nb_gates_nominal=2,
                                      nb_gates_ir=1)
        data = np.array(self.data_loader.data)
        self.init(data)
Example #12
def prepareData(path):
    try:
        # embedder
        embedder = FeatureExtractor(config.EMBEDDING_MODEL_PATH)
        # face detector
        faceDetector = FaceDetector(config.FACE_DECTOR_PATH)
        # image paths
        imagePaths = DataLoader(config.DATASET_PATH)
        names = []
        embeddedVectors = []
        for imagePath in imagePaths:
            name = imagePath.split(os.path.sep)[1]
            img = cv2.imread(imagePath)
            H, W = img.shape[:2]
            # face detection
            detectedFaces = faceDetector.detect(img, .3)
            for detectedFace in detectedFaces:
                #grab bounding box
                bbox = detectedFace.bounding_box.flatten() * np.array(
                    [W, H, W, H])
                xmin, ymin, xmax, ymax = bbox.astype('int')
                # grab ROI from image
                roi = img[ymin:ymax, xmin:xmax]
                vector = embedder.run(roi)
                embeddedVectors.append(vector)
                names.append(name)

        data = {'data': embeddedVectors, 'names': names}

        with open(path, 'wb') as f:
            f.write(pickle.dumps(data))

    except Exception as e:
        raise e
Example #13
def main_process():
    # load the data
    data = DataLoader()
    data_frame = get_dataframe(data_conn=data)

    # custom filtering and cleaning of the dataframe
    data_frame = custom_dataframe_handler(df=data_frame)
Example #14
def main(train_flag=False, test_flag=False, use_encoder=False):
    # before train
    config = Config()
    data_loader = DataLoader(TRAIN_DATA_PATH)
    data_loader.process_raw_data()
    sent_models = SentMatching(data_loader)
    model = sent_models.model
    encoder = sent_models.encoder

    if train_flag:
        ranking_model = sent_models.get_ranking_model(
            data_loader.y_valid.shape[0])
        model, encoder = train(model, encoder, ranking_model, data_loader,
                               config)
    else:
        model.load_weights(MODEL_PATH)
        encoder = Model(inputs=model.input,
                        outputs=model.get_layer(index=3).output)

    if test_flag:
        # minimum test-file sample size: 11
        test_data, test_vec = evaluate_(encoder, TEST_DATA_PATH, data_loader,
                                        sent_models)

        # test_data, x_test, y_test, test_id2g = data_loader.process_test_data(TEST_DATA_PATH)
        # test_vec = encoder.predict(x_test,
        #                            verbose=True,
        #                            batch_size=1000)  # encoder computes sentence vectors

        while True:
            input_sent = input()
            predict(encoder, data_loader, test_data, test_vec, input_sent)

    # todo
    if use_encoder:
        tmp_data, x_tmp, y_tmp, tmp_id2g = data_loader.process_test_data(
            PATH_FOR_ENCODER)
        tmp_vec = encoder.predict(x_tmp, verbose=True,
                                  batch_size=1000)  # encoder computes sentence vectors
        print(tmp_vec.shape)
        print(tmp_vec[0].shape)
        sims = np.dot(tmp_vec, tmp_vec[0])
        for i in sims.argsort()[-1:][::-1]:
            print(tmp_data.iloc[i][1], sims[i])

        y_pred = KMeans(n_clusters=3, random_state=42).fit_predict(tmp_vec)
        print(y_pred)
Example #15
    def execute(self):
        # temp file path
        cur_time = time.time()
        label_file_path = './label_%f.tmp' % cur_time
        conf_file_path = './conf_%f.tmp' % cur_time

        args = {
            'ratings': self.data_path_attacked,
            'ratings.setup': '-columns 0 1 2',
            'label': label_file_path,
            'methodName': 'FAP',
            'evaluation.setup': '-ap 0.000001',
            'seedUser': 5,
            'topKSpam': 50,
            'output.setup': 'on -dir ./',
        }

        # write conf file
        with open(conf_file_path, 'w') as fout:
            fout.write('\n'.join(['%s=%s' % i for i in args.items()]))

        # write label file
        _, _, n_users_real, _ = DataLoader(self.data_path_clean,
                                           self.data_path_clean.replace('train', 'test'),
                                           verbose=False).load_file_as_dataFrame()
        _, _, n_users_attacked, _ = DataLoader(self.data_path_attacked,
                                               self.data_path_clean.replace('train', 'test'),
                                               verbose=False).load_file_as_dataFrame()

        uids, labels = np.arange(n_users_attacked), np.zeros(n_users_attacked)
        labels[n_users_real:] = 1

        with open(label_file_path, 'w') as fout:
            fout.write('\n'.join(["%d\t%d" % i for i in list(zip(uids, labels))]))

        sd = SDLib(Config(conf_file_path))
        result = sd.execute()
        res_str = "pre:%.4f\trecall:%.4f" % tuple(result)
        print('result begin', res_str, 'result end')
        #
        os.remove(label_file_path)
        os.remove(conf_file_path)
        #
        pass
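For illustration, the temporary conf file written above ends up as plain key=value lines, roughly like the following; the ratings path and label timestamp depend on the instance attributes and current time.

# ratings=./results/data_attacked/ml100k/ml100k_AUSH_0.data
# ratings.setup=-columns 0 1 2
# label=./label_1600000000.000000.tmp
# methodName=FAP
# evaluation.setup=-ap 0.000001
# seedUser=5
# topKSpam=50
# output.setup=on -dir ./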
Example #16
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    dataset = args.dataset  # 'A' or 'B'
    output_dir = args.output_dir
    weight_path = args.weight_path
    cfg.init_path(dataset)

    heatmaps_dir = os.path.join(output_dir, 'heatmaps')  # directory to save heatmap
    results_txt = os.path.join(heatmaps_dir, 'results.txt')  # file to save predicted results
    for _dir in [output_dir, heatmaps_dir]:
        if not os.path.exists(_dir):
            os.mkdir(_dir)

    # load test set
    data_loader = DataLoader(cfg.TEST_PATH,
                             cfg.TEST_GT_PATH,
                             shuffle=False,
                             gt_downsample=True)

    # data_loader = ImageDataLoader(cfg.TEST_PATH, cfg.TEST_GT_PATH, shuffle=False, gt_downsample=True, pre_load=True)
    # load model
    model = MCNN(input_shape=(None, None, 1))
    model.load_weights(weight_path, by_name=True)

    # test
    print('Testing Part_{} ...'.format(dataset))
    mae = 0.0
    mse = 0.0
    print(model.input_shape)
    for idx, (img, g) in enumerate(data_loader):
        if idx == len(data_loader.filename_list):
            break
        print(idx)
        gt = g
        data = img
        filename = data_loader.filename_list[idx]
        pred = model.predict(data)
        pred *= cfg.STD
        pred += cfg.MEAN
        gt_count = np.sum(gt)
        pred_count = np.sum(pred)
        mae += abs(gt_count - pred_count)
        mse += ((gt_count - pred_count) * (gt_count - pred_count))
        # create and save heatmap
        pred = np.squeeze(pred)  # shape(1, h, w, 1) -> shape(h, w)
        # save_heatmap(pred, img, filename, heatmaps_dir)
        # save results
        with open(results_txt, 'a') as f:
            line = '<{}> {:.2f} -- {:.2f}\n'.format(filename, gt_count, pred_count)
            f.write(line)

    mae = mae / len(data_loader)
    mse = np.sqrt(mse / len(data_loader))
    print('MAE: %0.2f, MSE: %0.2f' % (mae, mse))
    with open(results_txt, 'a') as f:
        f.write('MAE: %0.2f, MSE: %0.2f' % (mae, mse))
Example #17
def train(model, model_name):

    loader = DataLoader()
    train_data, train_labels, train_names = loader.load_train_datasets()
    test_data, test_labels, test_names = loader.load_test_datasets()
    model.fit(train_data,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=TRAIN_EPOCHS,
              validation_data=(test_data, test_labels),
              shuffle=True)

    # save trained model
    deep_utils.create_directory("../models")
    model_filename = "../models/base_" + model_name + ".h5"
    model.save(model_filename)

    scores = model.evaluate(test_data, test_labels, verbose=1)
    return scores
Example #18
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    dataset = args.dataset  # 'A' or 'B'
    if dataset == 'A':
        model_path = './trained_models/mcnn_A_train.hdf5'
    else:
        model_path = './trained_models/mcnn_B_train.hdf5'

    output_dir = './output_{}/'.format(dataset)
    heatmaps_dir = os.path.join(output_dir,
                                'heatmaps')  # directory to save heatmap
    results_txt = os.path.join(output_dir,
                               'results.txt')  # file to save predicted results
    for _dir in [output_dir, heatmaps_dir]:
        if not os.path.exists(_dir):
            os.mkdir(_dir)

    test_path = cfg.TEST_PATH.format(dataset)
    test_gt_path = cfg.TEST_GT_PATH.format(dataset)
    # load test set
    print('Loading data, wait a moment...')
    data_loader = DataLoader(test_path,
                             test_gt_path,
                             shuffle=False,
                             gt_downsample=True)
    # load model
    model = load_model(model_path)

    # test
    print('Testing Part_{} ...'.format(dataset))
    mae = 0.0
    mse = 0.0
    for blob in data_loader:
        img = blob['data']
        gt = blob['gt']
        pred = model.predict(np.expand_dims(img, axis=0))
        gt_count = np.sum(gt)
        pred_count = np.sum(pred)
        mae += abs(gt_count - pred_count)
        mse += ((gt_count - pred_count) * (gt_count - pred_count))
        # create and save heatmap
        pred = np.squeeze(pred)  # shape(1, h, w, 1) -> shape(h, w)
        save_heatmap(pred, blob, test_path, heatmaps_dir)
        # save results
        with open(results_txt, 'a') as f:
            line = '<{}> {:.2f} -- {:.2f}\n'.format(
                blob['fname'].split('.')[0], gt_count, pred_count)
            f.write(line)

    mae = mae / data_loader.num_samples
    mse = np.sqrt(mse / data_loader.num_samples)
    print('MAE: %0.2f, MSE: %0.2f' % (mae, mse))
    with open(results_txt, 'a') as f:
        f.write('MAE: %0.2f, MSE: %0.2f' % (mae, mse))
Example #19
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = "1"
    dataset = args.dataset  # 'A' or 'B'
    output_dir = args.output_dir
    weight_path = args.weight_path
    cfg.init_path(dataset)

    heatmaps_dir = os.path.join(output_dir,
                                'heatmaps')  # directory to save heatmap
    results_txt = os.path.join(output_dir,
                               'results.txt')  # file to save predicted results
    for _dir in [output_dir, heatmaps_dir]:
        if not os.path.exists(_dir):
            os.mkdir(_dir)

    # load test set
    data_loader = DataLoader(cfg.TEST_PATH, cfg.TEST_GT_PATH, shuffle=False)
    # load model
    print('[INFO] Load model ...')
    model = CMTL(input_shape=(None, None, 1))
    model.load_weights(weight_path, by_name=True)

    # test
    print('[INFO] Testing Part_{} ...'.format(dataset))
    mae = 0.0
    mse = 0.0
    acc = 0.0
    for blob in data_loader.blob_list:
        img = blob['data']
        gt_den = blob['gt_den']
        gt_cls = np.argmax(blob['gt_class'])
        pred_den, pred_cls = model.predict(img[np.newaxis, ...])
        if np.argmax(pred_cls[0]) == gt_cls:
            acc += 1
        gt_count = np.sum(gt_den)
        pred_count = np.sum(pred_den)
        mae += abs(gt_count - pred_count)
        mse += ((gt_count - pred_count) * (gt_count - pred_count))
        # # create and save heatmap
        # pred = np.squeeze(pred)  # shape(1, h, w, 1) -> shape(h, w)
        # save_heatmap(pred, blob, test_path, heatmaps_dir)
        # save results
        with open(results_txt, 'a') as f:
            line = '<{}> {:.2f}--{:.2f}\t{}--{}\n'.format(
                blob['fname'].split('.')[0], gt_count, pred_count, gt_cls,
                np.argmax(pred_cls[0]))
            f.write(line)

    mae = mae / data_loader.num_samples
    mse = np.sqrt(mse / data_loader.num_samples)
    acc = acc / data_loader.num_samples
    print('[RESULT] MAE: %0.2f, MSE: %0.2f, Acc: %0.2f' % (mae, mse, acc))
    with open(results_txt, 'a') as f:
        f.write('MAE: %0.2f, MSE: %0.2f, Acc: %0.2f' % (mae, mse, acc))
Example #20
    def prepare_data(self):

        self.path_train = './data/%s/%s_train.dat' % (self.data_set, self.data_set)
        path_test = './data/%s/%s_test.dat' % (self.data_set, self.data_set)

        dataset_class = DataLoader(self.path_train, path_test)
        self.train_data_df, self.test_data_df, self.n_users, self.n_items = dataset_class.load_file_as_dataFrame()
        train_matrix, _ = dataset_class.dataFrame_to_matrix(self.train_data_df, self.n_users, self.n_items)
        test_matrix, _ = dataset_class.dataFrame_to_matrix(self.test_data_df, self.n_users, self.n_items)
        self.train_array, self.test_array = train_matrix.toarray(), test_matrix.toarray()
 
        self.data_loader = torch.utils.data.DataLoader(dataset=torch.from_numpy(self.train_array).type(torch.float32),
                                                       batch_size=self.batch_size_D, shuffle=True, drop_last=True)
 
        self.target_users = np.where(self.train_array[:, self.target_id] == 0)[0]

        attack_target = np.zeros((len(self.target_users), self.n_items))
        attack_target[:, self.target_id] = 1.0
        self.attack_target = torch.from_numpy(attack_target).type(torch.float32).to(self.device)
        pass
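A hedged sketch of how the torch DataLoader built here is typically consumed later in the attack training loop; the loop body is an assumption, not part of the original example.

# for real_profiles in self.data_loader:          # (batch_size_D, n_items) float32 batches
#     real_profiles = real_profiles.to(self.device)
#     # ... update the discriminator/generator against self.attack_target ...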
Example #21
def main(args):
    dataset = args.dataset  # 'A' or 'B'
    cfg.init_path(dataset)  # initialize path names
    # load the data generator
    train_data_gen = DataLoader(cfg.TRAIN_PATH,
                                cfg.TRAIN_GT_PATH,
                                batch_size=cfg.TRAIN_BATCH_SIZE,
                                shuffle=True,
                                gt_downsample=True)
    dens = [np.ravel(den) for im, den in train_data_gen]
    dens = np.concatenate(dens, axis=0)
    print("mean:{},std:{}".format(np.mean(dens), np.std(dens)))
Example #22
def main(args):
    dataset = args.dataset
    neighborhood_size = args.neighborhood_size
    recommended_list_size = args.recommended_list_size

    data_loader = DataLoader(dataset)
    data_loader.load_data()
    user_number, item_number = data_loader.get_dataset_info()
    train, test = data_loader.train_test_split()
    recommender = RecommenderSystem()
    rating_predictions = recommender.predict_topk_nobias(train,
                                                         k=neighborhood_size)

    evaluator = RecommenderEvaluator()
    print("RMSE={}".format(evaluator.rmse(rating_predictions, test)))
    print("MAE={}".format(evaluator.mae(rating_predictions, test)))
    mean_test = np.true_divide(test.sum(1), (test != 0).sum(1))
    precisions, recalls = evaluator.precision_recall_at_k(
        rating_predictions, test, mean_test, user_number,
        recommended_list_size)
    precision = sum(prec for prec in precisions.values()) / len(precisions)
    recall = sum(rec for rec in recalls.values()) / len(recalls)
    f1 = evaluator.f1(precision, recall)
    print("Precision({})={}".format(recommended_list_size, precision))
    print("Recall({})={}".format(recommended_list_size, recall))
    print("F1({})={}".format(recommended_list_size, f1))
Example #23
def train_model(args):
    # Data
    data_loader = DataLoader(args.batch_size)
    train_ds, test_ds = data_loader.make_dataset()

    # Prior and Plot objects
    prior_factory = PriorFactory(
        args.n_classes, gm_x_stddev=args.gm_x_stddev, gm_y_stddev=args.gm_y_stddev
    )
    plot_factory = PlotFactory(
        prior_factory,
        args.results_dir,
        args.prior_type,
        args.n_classes,
        data_loader.img_size_x,
        data_loader.img_size_y,
    )

    # Model
    gan = Gan(image_dim=data_loader.img_size_x * data_loader.img_size_y)

    # Optimizers
    optimizers_dict = {
        "encoder": tf.optimizers.Adam(learning_rate=args.learning_rate),
        "discriminator": tf.optimizers.Adam(learning_rate=args.learning_rate / 5),
        "gan": tf.optimizers.Adam(learning_rate=args.learning_rate),
    }

    # Training
    train_all_steps(
        gan,
        optimizers_dict,
        train_ds,
        args.n_epochs,
        args.prior_type,
        args.n_classes,
        data_loader,
        plot_factory,
        args.log_dir,
    )
Example #24
 def __init__(self,
              to_pred,
              model_file,
              feature_map,
              out_name="greedy_pred"):
     self._probs = {}
     self._model = pickle.load(open(model_file, "rb"))
     ftr_builders = [
         TransitionFtr(out_dim=LEN_FTR),
         EmmisionFtr(out_dim=LEN_FTR),
         SuffixPrefix(out_dim=LEN_FTR),
         CombinationsWordsPos(out_dim=LEN_FTR),
         CostumeFtr()
     ]
     self._dl = DataLoader(to_pred, feature_map, ftr_builders)
     self._label_list = self._dl.label_list + [START]
     self._label_to_idx = {
         label: i
         for i, label in enumerate(self._label_list)
     }
     self._tagger = ViterbiAlg(self._label_list, self._prob_func)
     self._init_probs()
Example #25
    def run(self):
        start = datetime.now()
        thread_id = start.strftime('%Y%m%d%H%M%S')
        logging.info("Thread %s - %s started" % (thread_id, self.file_path))

        sftp_reader = SFTPReader(self.host, self.port, self.username, self.password,
                                 self.ssh_key_path, self.sftp_max_retry)
        byte_io = sftp_reader.load_file(self.file_path)

        sftp_reader.close()
        step = datetime.now()
        logging.info("Thread %s - %s loaded data - Time: %d" % (thread_id, self.file_path, (step - start).seconds))

        if self.try_send_data:
            data_loader = DataLoader()

            processed_df = data_loader.load(byte_io, self.columns_seletion, fill_na_dict=self.fill_na_dict,
                                            concat_dict=self.concat_dict, rename_dict=self.rename_dict)
            step = datetime.now()
            logging.info("Thread %s - %s parsed data - Time: %d" % (thread_id, self.file_path, (step - start).seconds))

            event_sender = EventSender(self.connection_string, self.eventhub_name, self.max_event_per_batch,
                                       self.eventhub_max_retry, self.metadata, self.zvelo_helper)
            event_sender.send(processed_df)
            event_sender.close()

            step = datetime.now()
            logging.info("Thread %s - %s sent data - Time: %d" % (thread_id, self.file_path, (step - start).seconds))

        # Copy raw data to ADLS
        if (self.blob_name != False) or (self.blob_key != False):
            blob_helper = BlobHelper(self.blob_name, self.blob_key)
            file_name = self.file_path[self.file_path.rindex("/") + 1 : ]
            blob_path = "%s/%s" % (self.blob_path, file_name)
            byte_io.seek(0)
            blob_helper.upload_data(byte_io, self.blob_container, blob_path, overwrite=True)

        step = datetime.now()
        logging.info("Thread %s - %s stopped - Time: %d" % (thread_id, self.file_path, (step - start).seconds))
Example #26
def prepare_real_samples():
    """
    prepare_real_samples loads the data provider and sets up the
    training and testing datasets.

    :return: X_train
    """
    # loading real data
    (X_train, y_train), (X_test, y_test) = DataLoader.load_data()
    # convert from int to float and [0,255] to [-1,1] scaling
    X_train = (X_train.astype(np.float32) - 127.5) / 127.5
    X_train = X_train[:, :, :, None]
    X_test = X_test[:, :, :, None]
    # X_train = X_train.reshape((X_train.shape, 1) + X_train.shape[1:])
    return X_train
Example #27
    def prepare_real_samples(self):
        """
        prepare_real_samples loads the data provider and sets up the
        training and testing datasets.

        :return: X
        """
        # loading real data
        (x_train, _), (_, _) = DataLoader.load_data()
        # adding channels to expand to 3d
        X = expand_dims(x_train, axis=-1)
        # convert from int to float and [0,255] to [-1,1] scaling
        X = X.astype('float32')
        X = (X - 127.5) / 127.5
        return X
Example #28
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    dataset = args.dataset  # 'A' or 'B'

    train_path = cfg.TRAIN_PATH.format(dataset)
    train_gt_path = cfg.TRAIN_GT_PATH.format(dataset)
    val_path = cfg.VAL_PATH.format(dataset)
    val_gt_path = cfg.VAL_GT_PATH.format(dataset)
    # load the data
    print('Loading data, wait a moment...')
    train_data_gen = DataLoader(train_path,
                                train_gt_path,
                                shuffle=True,
                                gt_downsample=True)
    val_data_gen = DataLoader(val_path,
                              val_gt_path,
                              shuffle=False,
                              gt_downsample=True)

    # define the model
    input_shape = (None, None, 1)
    model = MCNN(input_shape)
    # compile the model
    adam = Adam(lr=1e-4)
    model.compile(loss='mse', optimizer=adam, metrics=[mae, mse])

    # define callbacks
    checkpointer_best_train = ModelCheckpoint(filepath=os.path.join(
        cfg.MODEL_DIR, 'mcnn_' + dataset + '_train.hdf5'),
                                              monitor='loss',
                                              verbose=1,
                                              save_best_only=True,
                                              mode='min')
    callback_list = [checkpointer_best_train]

    # train
    print('Training Part_{} ...'.format(dataset))
    model.fit_generator(
        train_data_gen.flow(cfg.TRAIN_BATCH_SIZE),
        steps_per_epoch=train_data_gen.num_samples // cfg.TRAIN_BATCH_SIZE,
        validation_data=val_data_gen.flow(cfg.VAL_BATCH_SIZE),
        validation_steps=val_data_gen.num_samples // cfg.VAL_BATCH_SIZE,
        epochs=cfg.EPOCHS,
        callbacks=callback_list,
        verbose=1)
Example #29
def main(args):
    dataset = args.dataset  # 'A' or 'B'
    output_dir = os.path.join(cfg.HM_GT_PATH, 'Part_{}'.format(dataset))

    for _dir in [cfg.HM_GT_PATH, output_dir]:
        if not os.path.exists(_dir):
            os.mkdir(_dir)

    test_path = cfg.TEST_PATH.format(dataset)
    test_gt_path = cfg.TEST_GT_PATH.format(dataset)
    # load data
    data_loader = DataLoader(test_path,
                             test_gt_path,
                             shuffle=False,
                             gt_downsample=True)

    # create heatmaps
    print('Creating heatmaps for Part_{} ...'.format(dataset))
    for blob in data_loader:
        gt = blob['gt']
        # create and save heatmap
        gt = np.squeeze(gt)  # shape(1, h, w, 1) -> shape(h, w)
        save_heatmap(gt, blob, test_path, output_dir, gt=True)
    print('All Done.')
Example #30
    def prepare_train_embedding(self, data_dir):
        data = copy.deepcopy(self.train_loader.data[self.city])
        data.update(self.tester.test_loader.data[self.city])
        keys = sorted(list(data.keys()))
        embeds = {}
        for i in range(0, len(keys), 40):
            print(self.city, i, len(keys))
            nodes, edges = [], []
            for index in keys[i: i + 40]:
                nodes += data[index]['nodes']
                edges += data[index]['source_edges']
            G = DataLoader.build_graph(nodes, edges)
            self.vec_model.build_model(G)
            embeds.update(self.vec_model.train(embed_size=self.embed_dim))
        for index in self.train_loader.data[self.city]:
            positive, negative = [], []
            sample = self.train_loader.data[self.city][index]
            for i, n1 in enumerate(sample['nodes']):
                for j, n2 in enumerate(sample['nodes'][i + 1:]):
                    if {'start': n1['osmid'], 'end': n2['osmid']} in sample['target_edges'] or \
                            {'start': n2['osmid'], 'end': n1['osmid']} in sample['target_edges']:
                        positive.append([n1['osmid'], n2['osmid'], 1])
                    elif {'start': n1['osmid'], 'end': n2['osmid']} not in sample['source_edges'] and \
                            {'start': n2['osmid'], 'end': n1['osmid']} not in sample['source_edges']:
                        negative.append([n1['osmid'], n2['osmid'], 0])
            samples = positive + negative
            for (start, end, target) in samples:
                self.embedding.append({
                    'start_id': str(start),
                    'end_id': str(end),
                    'start_embedding': embeds[str(start)] if str(start) in embeds else np.zeros(self.embed_dim),
                    'end_embedding': embeds[str(end)] if str(end) in embeds else np.zeros(self.embed_dim),
                    'target': target,
                })
        pickle.dump(self.embedding,
                    open(data_dir + 'train/' + self.city + '_embedding.pkl', 'wb'))

        test_embedding = {}
        for index in self.tester.test_loader.data[self.city]:
            positive, negative = [], []
            sample = self.tester.test_loader.data[self.city][index]
            for i, n1 in enumerate(sample['nodes']):
                for j, n2 in enumerate(sample['nodes'][i + 1:]):
                    if {'start': n1['osmid'], 'end': n2['osmid']} in sample['target_edges'] or \
                            {'start': n2['osmid'], 'end': n1['osmid']} in sample['target_edges']:
                        positive.append([n1['osmid'], n2['osmid'], 1])
                    elif {'start': n1['osmid'], 'end': n2['osmid']} not in sample['source_edges'] and \
                            {'start': n2['osmid'], 'end': n1['osmid']} not in sample['source_edges']:
                        negative.append([n1['osmid'], n2['osmid'], 0])
            samples = positive + negative
            test_embedding[index] = []
            for (start, end, target) in samples:
                test_embedding[index].append({
                    'start_id': str(start),
                    'end_id': str(end),
                    'start_embedding': embeds[str(start)] if str(start) in embeds else np.zeros(self.embed_dim),
                    'end_embedding': embeds[str(end)] if str(end) in embeds else np.zeros(self.embed_dim),
                    'target': target,
                })
        print(self.city, len(self.embedding), len(test_embedding))
        pickle.dump(test_embedding,
                    open(data_dir + 'test/' + self.city + '_embedding.pkl', 'wb'))
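A small sketch of reading the embeddings pickled above back in; data_dir and the city name are placeholders mirroring the arguments and attributes used by prepare_train_embedding.

import pickle

data_dir = 'E:/python-workspace/CityRoadPrediction/data_20200610/'  # placeholder
city = 'example_city'                                               # placeholder
with open(data_dir + 'train/' + city + '_embedding.pkl', 'rb') as f:
    train_embedding = pickle.load(f)
print(len(train_embedding), 'node-pair samples loaded')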