Code example #1
File: run_gin.py  Project: zonghua94/euler
def main(_):
    flags_obj = tf.flags.FLAGS
    euler_graph = tf_euler.dataset.get_dataset(flags_obj.dataset)
    euler_graph.load_graph()

    dims = [flags_obj.hidden_dim] * (flags_obj.layers + 1)
    if flags_obj.run_mode == 'train':
        metapath = [euler_graph.train_edge_type] * flags_obj.layers
    else:
        metapath = [euler_graph.all_edge_type] * flags_obj.layers
    num_steps = int((euler_graph.total_size + 1) // flags_obj.batch_size *
                    flags_obj.num_epochs)

    model = GIN(dims,
                metapath,
                euler_graph.num_classes,
                euler_graph.sparse_fea_idx,
                euler_graph.sparse_fea_max_id,
                eps=flags_obj.eps,
                train_eps=flags_obj.train_eps)

    params = {
        'num_classes': euler_graph.num_classes,
        'optimizer': flags_obj.optimizer,
        'learning_rate': flags_obj.learning_rate,
        'log_steps': flags_obj.log_steps,
        'train_rate': euler_graph.train_rate,
        'id_file': euler_graph.id_file,
        'label': ['label'],
        'model_dir': flags_obj.model_dir,
        'total_size': euler_graph.total_size,
        'infer_dir': flags_obj.model_dir,
        'batch_size': flags_obj.batch_size,
        'total_step': num_steps
    }

    config = tf.estimator.RunConfig(log_step_count_steps=None)
    model_estimator = GraphEstimator(model, params, config)

    if flags_obj.run_mode == 'train':
        model_estimator.train()
    elif flags_obj.run_mode == 'evaluate':
        model_estimator.evaluate()
    elif flags_obj.run_mode == 'infer':
        model_estimator.infer()
    else:
        raise ValueError('Run mode does not exist!')
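Note: the snippet above reads many values from tf.flags.FLAGS, but the flag definitions and the program entry point are not shown. A rough, hypothetical sketch of what they could look like is given below; the flag names mirror the attributes accessed in main(), while the defaults and descriptions are assumptions rather than the euler project's actual code.

import tensorflow as tf

# Hypothetical flag definitions matching the flags_obj attributes used in main().
tf.flags.DEFINE_string('dataset', 'cora', 'Name of the Euler dataset to load')
tf.flags.DEFINE_string('run_mode', 'train', "One of 'train', 'evaluate', 'infer'")
tf.flags.DEFINE_integer('hidden_dim', 32, 'Hidden dimension of each GIN layer')
tf.flags.DEFINE_integer('layers', 2, 'Number of GIN layers')
tf.flags.DEFINE_integer('batch_size', 512, 'Mini-batch size')
tf.flags.DEFINE_integer('num_epochs', 10, 'Number of training epochs')
tf.flags.DEFINE_float('learning_rate', 0.01, 'Learning rate')
tf.flags.DEFINE_float('eps', 0.0, 'Initial epsilon of the GIN aggregator')
tf.flags.DEFINE_boolean('train_eps', False, 'Whether epsilon is learnable')
tf.flags.DEFINE_string('optimizer', 'adam', 'Optimizer name')
tf.flags.DEFINE_integer('log_steps', 20, 'Steps between log messages')
tf.flags.DEFINE_string('model_dir', 'ckpt', 'Checkpoint and inference directory')

if __name__ == '__main__':
    tf.app.run(main)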
Code example #2
File: train.py  Project: czkkkkkk/ragdoll
def run(rank, world_size, args):
    print('Running DDP on rank', rank)
    setup(rank, world_size, args)
    ragdoll.init()
    dev_id = ragdoll.device_id()
    if len(args.input_graph) > 0 or len(args.cached_dir) > 0:
        data = SynDataset(rank == 0, args)
    else:
        data = Dataset(rank == 0, args)

    feat_size = args.feat_size
    n_classes = args.n_classes

    torch.cuda.set_device(dev_id)
    features = torch.FloatTensor(data.features).cuda()
    labels = torch.LongTensor(data.labels).cuda()
    labels = torch.LongTensor([0]).cuda()  # single dummy graph-level label; this run measures timing, not accuracy
    train_mask = torch.BoolTensor(data.train_mask).cuda()
    val_mask = torch.BoolTensor(data.val_mask).cuda()
    test_mask = torch.BoolTensor(data.test_mask).cuda()

    n_classes = args.n_classes
    n_nodes = data.n_nodes
    local_n_nodes = data.local_n_nodes

    model = GIN(
        args.num_layers, args.num_mlp_layers,
        feat_size, args.hidden_dim, n_classes,
        args.final_dropout, args.learn_eps,
        args.graph_pooling_type, args.neighbor_pooling_type, n_nodes, local_n_nodes)

    model.cuda()
    model = DDP(model, device_ids=[dev_id])
    loss_fcn = torch.nn.CrossEntropyLoss()
    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr)
    optimizer.zero_grad()

    print("Start training")
    dur = []
    for epoch in range(args.epochs):
        model.train()
        torch.distributed.barrier()
        if epoch >= 3:
            t0 = time.time()
        # with profiler.profile(record_shapes=True, use_cuda=True) as prof:
        logits = model(data.graph, features)
        loss = loss_fcn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.current_stream().synchronize()
        # print(prof.key_averages().table(sort_by="cuda_time_total"))
        if epoch >= 3:
            dur.append(time.time() - t0)
        print('Peak memory is {} GB'.format(torch.cuda.max_memory_allocated(dev_id) / 1e9))

        print('acc is {}, loss is {}, epoch time {} s, avg time {} s.'.format(
            0, loss.item(), dur[-1] if epoch >= 3 else 0, np.mean(dur) if dur else 0))


    cleanup()
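The setup() and cleanup() helpers called at the start and end of run() are not shown in the snippet. A minimal sketch of the usual torch.distributed pattern they follow is below; the rendezvous environment variables and the NCCL backend are assumptions, not necessarily what ragdoll's train.py does.

import os
import torch.distributed as dist

def setup(rank, world_size, args):
    # Every worker must agree on the same rendezvous address before init.
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

def cleanup():
    # Tear down the process group once training finishes.
    dist.destroy_process_group()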
Code example #3
def main(args):

    # set up seeds, args.seed supported
    torch.manual_seed(seed=args.seed)
    np.random.seed(seed=args.seed)

    is_cuda = not args.disable_cuda and torch.cuda.is_available()

    if is_cuda:
        args.device = torch.device("cuda:" + str(args.device))
        torch.cuda.manual_seed_all(seed=args.seed)
    else:
        args.device = torch.device("cpu")

    dataset = GINDataset(args.dataset, not args.learn_eps,
                         args.degree_as_nlabel)
    trainloader, validloader = GINDataLoader(
        dataset,
        batch_size=args.batch_size,
        device=args.device,
        seed=args.seed,
        shuffle=True,
        split_name='fold10',
        fold_idx=args.fold_idx).train_valid_loader()
    # or split_name='rand', split_ratio=0.7

    model = GIN(args.num_layers, args.num_mlp_layers, dataset.dim_nfeats,
                args.hidden_dim, dataset.gclasses, args.final_dropout,
                args.learn_eps, args.graph_pooling_type,
                args.neighbor_pooling_type).to(args.device)

    criterion = nn.CrossEntropyLoss()  # the default reduction is 'mean'
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    # it's not cost-effective to handle the cursor and init 0
    # https://stackoverflow.com/a/23121189
    tbar = tqdm(range(args.epochs),
                unit="epoch",
                position=3,
                ncols=0,
                file=sys.stdout)
    vbar = tqdm(range(args.epochs),
                unit="epoch",
                position=4,
                ncols=0,
                file=sys.stdout)
    lrbar = tqdm(range(args.epochs),
                 unit="epoch",
                 position=5,
                 ncols=0,
                 file=sys.stdout)

    for epoch, _, _ in zip(tbar, vbar, lrbar):

        train(args, model, trainloader, optimizer, criterion, epoch)
        scheduler.step()

        train_loss, train_acc = eval_net(args, model, trainloader, criterion)
        tbar.set_description(
            'train set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                train_loss, 100. * train_acc))

        valid_loss, valid_acc = eval_net(args, model, validloader, criterion)
        vbar.set_description(
            'valid set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                valid_loss, 100. * valid_acc))

        if not args.filename == "":
            with open(args.filename, 'a') as f:
                f.write(
                    '%s %s %s %s %s' %
                    (args.dataset, args.learn_eps, args.neighbor_pooling_type,
                     args.graph_pooling_type, epoch))
                f.write("\n")
                f.write("%f %f %f %f" %
                        (train_loss, train_acc, valid_loss, valid_acc))
                f.write("\n")

        lrbar.set_description("Learning eps with learn_eps={}: {}".format(
            args.learn_eps,
            [layer.eps.data.item() for layer in model.ginlayers]))

    tbar.close()
    vbar.close()
    lrbar.close()
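The train() function called inside the epoch loop is imported from the project and not reproduced here. As a point of reference, a single training pass over a DGL GINDataLoader usually looks like the sketch below; the argument names come from the call site, but the body itself is an assumption.

import torch

def train(args, model, trainloader, optimizer, criterion, epoch):
    model.train()
    for batched_graph, labels in trainloader:
        batched_graph = batched_graph.to(args.device)
        labels = labels.to(args.device)
        # GINDataset stores node features under the 'attr' key.
        feat = batched_graph.ndata['attr'].float()
        logits = model(batched_graph, feat)
        loss = criterion(logits, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()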
Code example #4
def main(args):

    # set up seeds (hard-coded to 0 here)
    mx.random.seed(0)
    np.random.seed(seed=0)

    if args.device >= 0:
        args.device = mx.gpu(args.device)
    else:
        args.device = mx.cpu()

    dataset = GINDataset(args.dataset, not args.learn_eps)

    trainloader, validloader = GraphDataLoader(
        dataset,
        batch_size=args.batch_size,
        collate_fn=collate,
        seed=args.seed,
        shuffle=True,
        split_name='fold10',
        fold_idx=args.fold_idx).train_valid_loader()
    # or split_name='rand', split_ratio=0.7

    model = GIN(args.num_layers, args.num_mlp_layers, dataset.dim_nfeats,
                args.hidden_dim, dataset.gclasses, args.final_dropout,
                args.learn_eps, args.graph_pooling_type,
                args.neighbor_pooling_type)
    model.initialize(ctx=args.device)

    criterion = gluon.loss.SoftmaxCELoss()

    print(model.collect_params())
    lr_scheduler = mx.lr_scheduler.FactorScheduler(50, 0.5)
    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'lr_scheduler': lr_scheduler})

    # it's not cost-effective to handle the cursor and init 0
    # https://stackoverflow.com/a/23121189
    tbar = tqdm(range(args.epochs),
                unit="epoch",
                position=3,
                ncols=0,
                file=sys.stdout)
    vbar = tqdm(range(args.epochs),
                unit="epoch",
                position=4,
                ncols=0,
                file=sys.stdout)
    lrbar = tqdm(range(args.epochs),
                 unit="epoch",
                 position=5,
                 ncols=0,
                 file=sys.stdout)

    for epoch, _, _ in zip(tbar, vbar, lrbar):
        train(args, model, trainloader, trainer, criterion, epoch)

        train_loss, train_acc = eval_net(args, model, trainloader, criterion)
        tbar.set_description(
            'train set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                train_loss, 100. * train_acc))

        valid_loss, valid_acc = eval_net(args, model, validloader, criterion)
        vbar.set_description(
            'valid set - average loss: {:.4f}, accuracy: {:.0f}%'.format(
                valid_loss, 100. * valid_acc))

        if not args.filename == "":
            with open(args.filename, 'a') as f:
                f.write('%s %s %s %s' %
                        (args.dataset, args.learn_eps,
                         args.neighbor_pooling_type, args.graph_pooling_type))
                f.write("\n")
                f.write("%f %f %f %f" %
                        (train_loss, train_acc, valid_loss, valid_acc))
                f.write("\n")

        lrbar.set_description("Learning eps with learn_eps={}: {}".format(
            args.learn_eps, [
                layer.eps.data(args.device).asscalar()
                for layer in model.ginlayers
            ]))

    tbar.close()
    vbar.close()
    lrbar.close()
Code example #5
def main(args):
    path = os.path.join(args.dataDir, args.dataset + ".npz")
    data = custom_dataset(path, args.dim, args.classes, load_from_txt=False)
    g = data.g

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True

    g = g.int().to(args.gpu)

    features = data.x
    labels = data.y
    in_feats = features.size(1)
    n_classes = data.num_classes

    # symmetric normalization: per-node factor deg^(-1/2)
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    if args.model == 'gcn':
        model = GCN(g,
                    in_feats=in_feats,
                    n_hidden=args.hidden,
                    n_classes=n_classes,
                    n_layers=2)
    else:
        model = GIN(g,
                    input_dim=in_feats,
                    hidden_dim=64,
                    output_dim=n_classes,
                    num_layers=5)

    if cuda: model.cuda()

    loss_fcn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-2,
                                 weight_decay=5e-4)

    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in tqdm(range(args.n_epochs)):
        model.train()

        logits = model(features)
        loss = loss_fcn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    torch.cuda.synchronize()
    dur = time.perf_counter() - start

    if args.model == 'gcn':
        print("DGL GCN (L2-H16) Time: (ms) {:.3f}".format(dur * 1e3 /
                                                          args.n_epochs))
    else:
        print("DGL GIN (L5-H64) Time: (ms) {:.3f}".format(dur * 1e3 /
                                                          args.n_epochs))
    print()
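The benchmark reads args.dataDir, args.dataset, args.dim, args.classes, args.gpu, args.model, args.hidden and args.n_epochs, but the argument parser is not shown. A hypothetical parser along the following lines would make the snippet self-contained; the defaults are assumptions.

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='DGL GCN/GIN timing benchmark')
    parser.add_argument('--dataDir', type=str, default='./data', help='directory containing the .npz graph file')
    parser.add_argument('--dataset', type=str, default='cora')
    parser.add_argument('--dim', type=int, default=16, help='input feature dimension')
    parser.add_argument('--classes', type=int, default=7, help='number of output classes')
    parser.add_argument('--gpu', type=int, default=0, help='GPU id, or -1 for CPU')
    parser.add_argument('--model', type=str, default='gcn', choices=['gcn', 'gin'])
    parser.add_argument('--hidden', type=int, default=16, help='hidden size of the GCN baseline')
    parser.add_argument('--n_epochs', type=int, default=200)
    return parser.parse_args()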
Code example #6
def main():
    args = parse_args()
    if args.augment.lower() == 'none':
        args.augment = None
    device = to_device(args.gpu)

    args.seed = args.seed + args.fold
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    data = load_data(args.data)
    num_features = data.num_features
    num_classes = data.num_classes

    trn_graphs, test_graphs = load_data_fold(args.data, args.fold)
    trn_loader = DataLoader(trn_graphs, batch_size=256)
    test_loader = DataLoader(test_graphs, batch_size=256)

    if args.iters == 'auto':
        args.iters = math.ceil(len(trn_graphs) / args.batch_size)
    else:
        args.iters = int(args.iters)

    model = GIN(num_features, num_classes, args.units, args.layers,
                args.dropout)
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
    loss_func = SoftCELoss()

    augment = Augment(trn_graphs, args.augment, aug_size=args.aug_size)

    if args.verbose > 0:
        print(' epochs\t   loss\ttrn_acc\ttest_acc')

    out_list = dict(trn_loss=[], trn_acc=[], test_loss=[], test_acc=[])
    for epoch in range(args.epochs):
        model.train()
        loss_sum = 0
        for _ in range(args.iters):
            idx = torch.randperm(len(trn_graphs))[:args.batch_size]
            data = augment(idx).to(device)
            output = model(data.x, data.edge_index, data.batch)
            loss = loss_func(output, data.y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()

        if args.schedule:
            scheduler.step(epoch)

        trn_loss = loss_sum / args.iters
        trn_acc = eval_acc(model, trn_loader, device)
        test_loss = eval_loss(model, loss_func, test_loader, device)
        test_acc = eval_acc(model, test_loader, device)

        out_list['trn_loss'].append(trn_loss)
        out_list['trn_acc'].append(trn_acc)
        out_list['test_loss'].append(test_loss)
        out_list['test_acc'].append(test_acc)

        if args.verbose > 0 and (epoch + 1) % args.verbose == 0:
            print(
                f'{epoch + 1:7d}\t{trn_loss:7.4f}\t{trn_acc:7.4f}\t{test_acc:7.4f}'
            )

    if args.print_all:
        out = {arg: getattr(args, arg) for arg in vars(args)}
        out['all'] = out_list
        print(json.dumps(out))
    else:
        print(f'Training accuracy: {out_list["trn_acc"][-1]}')
        print(f'Test accuracy: {out_list["test_acc"][-1]}')
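SoftCELoss is imported from the project and not shown. A plausible definition, assuming it is a cross-entropy that also accepts soft (probability) labels produced by augmentation, is sketched below; the real helper may differ.

import torch.nn as nn
import torch.nn.functional as F

class SoftCELoss(nn.Module):
    def forward(self, logits, target):
        if target.dim() == 1:
            # Hard integer labels: ordinary cross-entropy.
            return F.cross_entropy(logits, target)
        # Soft labels: expected negative log-likelihood under the target distribution.
        log_prob = F.log_softmax(logits, dim=-1)
        return -(target * log_prob).sum(dim=-1).mean()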
Code example #7
'''
    PFN internship 2019 coding task
    machine learning
    task-4
    Issei NAKASONE
'''


import datasets as D
import mlp 
from gin import GIN, TrainGIN 
from iterator import Iterator


dirpath = '../datasets/train/'
predict = '../datasets/test/'
batch_size = 256

train = D.get_dataset(dirpath)
train_iter = Iterator(train, batch_size)

model = GIN()
optimizer = mlp.Adam()
optimizer.setup(model)
trainer = TrainGIN(optimizer, train_iter)
trainer.start(epoch=100)

pred = D.GraphDataset(predict)
trainer.predict(pred)

Code example #8
def main(args, run_config, house_name, csv_raw=None, shuffle=False):

    # set up seeds, args.seed supported
    torch.manual_seed(seed=args.seed)
    np.random.seed(seed=args.seed)

    is_cuda = not args.disable_cuda and torch.cuda.is_available()
    is_cuda = False  # force CPU even when CUDA is available

    if is_cuda:
        args.device = torch.device("cuda:" + str(args.device))
        torch.cuda.manual_seed_all(seed=args.seed)
    else:
        args.device = torch.device("cpu")

    uniqueIndex = getUniqueStartIndex(csv_raw)

    total_num_iteration_for_LOOCV = 0
    total_acc_for_LOOCV = []
    total_f1_for_LOOCV = []
    total_per_class_accuracy = []
    total_confusion_matrix = []
    total_num_iteration_for_LOOCV = 1
    loo = LeaveOneOut()
    total_embeddings = pd.DataFrame()
    for train_index, test_index in loo.split(uniqueIndex):
        args.save_embeddings = False
        print('\n\n split: ', total_num_iteration_for_LOOCV)
        total_num_iteration_for_LOOCV += 1
        path = 'checkpoint_' + house_name + '.pth'
        # initialize the early_stopping object
        early_stopping = EarlyStopping(patience=15, verbose=True, path=path)

        model = GIN(args.num_layers, args.num_mlp_layers, args.input_features,
                    args.hidden_dim, args.nb_classes, args.final_dropout,
                    args.learn_eps, args.graph_pooling_type,
                    args.neighbor_pooling_type,
                    args.save_embeddings).to(args.device)

        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=1e-5)

        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=50,
                                              gamma=0.5)

        file_names = ['ordonezB', 'houseB', 'houseC', 'houseA', 'ordonezA']

        if run_config == 'raw':
            graph_path = os.path.join('../../../data/', house_name,
                                      house_name + '.bin')

        graphs = []
        labels = []

        if not os.path.exists(graph_path):
            for file_name in file_names:
                print('\n\n\n\n')
                print(
                    '*******************************************************************'
                )
                print('\t\t\t\t\t' + file_name + '\t\t\t\t\t\t\t')
                print(
                    '*******************************************************************'
                )
                print('\n\n\n\n')
                if run_config == 'ob':
                    house = pd.read_csv('../../../data/' + file_name + '/ob_' +
                                        file_name + '.csv')
                    lastChangeTimeInMinutes = pd.read_csv(
                        '../../../data/' + file_name + '/' + 'ob-house' +
                        '-sensorChangeTime.csv')
                elif run_config == 'raw':
                    house = pd.read_csv('../../../data/' + file_name + '/' +
                                        file_name + '.csv')
                    lastChangeTimeInMinutes = pd.read_csv(
                        '../../../data/' + file_name + '/' + 'house' +
                        '-sensorChangeTime.csv')

                nodes = pd.read_csv('../../../data/' + file_name +
                                    '/nodes.csv')
                edges = pd.read_csv('../../../data/' + file_name +
                                    '/bidrectional_edges.csv')

                u = edges['Src']
                v = edges['Dst']

                # Create one graph per row of the house CSV

                # Node features per node: [value, place_in_house, type, last_change_time_in_minutes]
                for i in range(len(house)):
                    # for i in range(5000):
                    feature = []
                    flag = 0
                    prev_node_value = 0
                    prev_node_change_time = 0
                    # Define Graph
                    g = dgl.graph((u, v))
                    node_num = 0
                    total_nodes = len(nodes)
                    # Add Features
                    for j in range(total_nodes - 1):
                        if nodes.loc[j, 'Type'] == 1:
                            node_value = -1
                            node_place_in_house = nodes.loc[j,
                                                            'place_in_house']
                            node_type = nodes.loc[j, 'Type']
                            feature.append([
                                node_value, node_place_in_house, node_type, -1
                            ])
                            node_num += 1
                            continue

                        if flag == 0:
                            node_value = house.iloc[i, 4 + j - node_num]
                            last_change_time_in_minutes = lastChangeTimeInMinutes.iloc[
                                i, 4 + j - node_num]
                            node_place_in_house = nodes.loc[j,
                                                            'place_in_house']
                            node_type = nodes.loc[j, 'Type']
                            feature.append([
                                node_value, node_place_in_house, node_type,
                                last_change_time_in_minutes
                            ])
                            if nodes.loc[j, 'Object'] == nodes.loc[j + 1,
                                                                   'Object']:
                                prev_node_value = node_value
                                prev_node_change_time = last_change_time_in_minutes
                                flag = 1
                        else:
                            node_num += 1
                            node_place_in_house = nodes.loc[j,
                                                            'place_in_house']
                            node_type = nodes.loc[j, 'Type']
                            feature.append([
                                prev_node_value, node_place_in_house,
                                node_type, prev_node_change_time
                            ])
                            if nodes.loc[j, 'Object'] != nodes.loc[j + 1,
                                                                   'Object']:
                                flag = 0

                    feature.append(
                        [house.loc[i, 'time_of_the_day'], -1, -1, -1])
                    g.ndata['attr'] = torch.tensor(feature)

                    # Give Label
                    try:
                        mappedActivity = config['merging_activties'][
                            house.iloc[i, 2]]
                        labels.append(
                            getIDFromClassName(mappedActivity, config))
                    except:  # activity has no entry in merging_activties; fall back to the raw label
                        activity = house.iloc[i, 2]
                        labels.append(getIDFromClassName(activity, config))

                    graphs.append(g)

                graph_labels = {"glabel": torch.tensor(labels)}

                save_graphs(graph_path, graphs, graph_labels)

        else:
            graphs, labels = load_graphs(graph_path)
            labels = list(labels['glabel'].numpy())
        # print(np.unique(labels))
        print(len(graphs))

        if run_config == 'ob':
            config["house_start_end_dict"] = [{
                'ordonezB': (0, 2487)
            }, {
                'houseB': (2487, 4636)
            }, {
                'houseC': (4636, 6954)
            }, {
                'houseA': (6954, 7989)
            }, {
                'ordonezA': (7989, 8557)
            }]
        elif run_config == 'raw':
            config["house_start_end_dict"] = [{
                'ordonezB': (0, 30470)
            }, {
                'houseB': (30470, 51052)
            }, {
                'houseC': (51052, 77539)
            }, {
                'houseA': (77539, 114626)
            }, {
                'ordonezA': (114626, 134501)
            }]

        start, end = getStartAndEndIndex(csv_raw, uniqueIndex[test_index])

        test_graphs = graphs[start:end]
        test_labels = labels[start:end]

        train_idx = list(
            set(np.arange(len(graphs))) - set(np.arange(start, end)))
        valid_idx = train_idx[:int(0.2 * len(train_idx))]
        train_idx = train_idx[int(0.2 * len(train_idx)):]

        train_graphs = [graphs[i] for i in train_idx]
        train_labels = [labels[i] for i in train_idx]

        val_graphs = [graphs[i] for i in valid_idx]
        val_labels = [labels[i] for i in valid_idx]

        trainDataset = GraphHouseDataset(train_graphs, train_labels)
        valDataset = GraphHouseDataset(val_graphs, val_labels)
        testDataset = GraphHouseDataset(test_graphs, test_labels)

        trainloader = GraphDataLoader(
            trainDataset,
            batch_size=args.batch_size,
            device=args.device,
            collate_fn=collate,
            seed=args.seed,
            shuffle=shuffle,
            split_name='fold10',
            fold_idx=args.fold_idx,
            save_embeddings=args.save_embeddings).train_valid_loader()

        validloader = GraphDataLoader(
            valDataset,
            batch_size=args.batch_size,
            device=args.device,
            collate_fn=collate,
            seed=args.seed,
            shuffle=shuffle,
            split_name='fold10',
            fold_idx=args.fold_idx,
            save_embeddings=args.save_embeddings).train_valid_loader()

        testloader = GraphDataLoader(
            testDataset,
            batch_size=args.batch_size,
            device=args.device,
            collate_fn=collate,
            seed=args.seed,
            shuffle=shuffle,
            split_name='fold10',
            fold_idx=args.fold_idx,
            save_embeddings=args.save_embeddings).train_valid_loader()

        # or split_name='rand', split_ratio=0.7
        criterion = nn.CrossEntropyLoss()  # default reduce is true

        for epoch in range(args.epochs):
            train(args, model, trainloader, optimizer, criterion, epoch)
            scheduler.step()

            # early_stopping needs the F1 score to check if it has increased,
            # and if it has, it will make a checkpoint of the current model

            if epoch % 10 == 0:
                print('epoch: ', epoch)
                train_loss, train_acc, train_f1_score, train_per_class_accuracy, _, _ = eval_net(
                    args, model, trainloader, criterion, run_config,
                    house_name)

                print(
                    'train set - average loss: {:.4f}, accuracy: {:.0f}%  train_f1_score: {:.4f} '
                    .format(train_loss, 100. * train_acc, train_f1_score))

                # print('train per_class accuracy', test_per_class_accuracy)

                valid_loss, valid_acc, val_f1_score, val_per_class_accuracy, _, _ = eval_net(
                    args,
                    model,
                    validloader,
                    criterion,
                    run_config,
                    house_name,
                    text='val')

                print(
                    'valid set - average loss: {:.4f}, accuracy: {:.0f}% val_f1_score {:.4f}:  '
                    .format(valid_loss, 100. * valid_acc, val_f1_score))

                test_loss, test_acc, test_f1_score, test_per_class_accuracy, _, _ = eval_net(
                    args, model, testloader, criterion, run_config, house_name)

                print(
                    'test set - average loss: {:.4f}, accuracy: {:.0f}%  test_f1_score: {:.4f} '
                    .format(test_loss, 100. * test_acc, test_f1_score))

                # print('val per_class accuracy', val_per_class_accuracy)

                # early_stopping tracks the validation F1 score to check if it has increased,
                # and if it has, it makes a checkpoint of the current model
                early_stopping(val_f1_score, model)

                if early_stopping.early_stop:
                    print("Early stopping")
                    break

        args.save_embeddings = True
        model = GIN(args.num_layers, args.num_mlp_layers, args.input_features,
                    args.hidden_dim, args.nb_classes, args.final_dropout,
                    args.learn_eps, args.graph_pooling_type,
                    args.neighbor_pooling_type,
                    args.save_embeddings).to(args.device)
        model.eval()
        # Rebuild the test loader here: the weighted sampler is enabled elsewhere but
        # disabled for testing, because the embeddings must be produced in order.
        testDataset = GraphHouseDataset(test_graphs, test_labels)
        testloader = GraphDataLoader(
            testDataset,
            batch_size=args.batch_size,
            device=args.device,
            collate_fn=collate,
            seed=args.seed,
            shuffle=shuffle,
            split_name='fold10',
            fold_idx=args.fold_idx,
            save_embeddings=args.save_embeddings).train_valid_loader()

        if args.save_embeddings:
            if os.path.exists(path):
                print('loading saved checkpoint')
                state = torch.load(path)
                model.load_state_dict(state)
                # model.load_state_dict(state['state_dict'])
                # optimizer.load_state_dict(state['optimizer'])
        test_loss, test_acc, test_f1_score, test_per_class_accuracy, confusion_matrix, embedding = eval_net(
            args, model, testloader, criterion, run_config, house_name)

        total_embeddings = total_embeddings.append(embedding)
        print('embedding is', total_embeddings)
        total_acc_for_LOOCV.append(test_acc)
        total_f1_for_LOOCV.append(test_f1_score)
        total_per_class_accuracy.append(test_per_class_accuracy)
        total_confusion_matrix.append(confusion_matrix)

    house_results_dictionary = {}
    print(house_name + '\n \n', 'test_acc:\t',
          np.mean(total_acc_for_LOOCV), '\t test f1 score',
          np.mean(total_f1_for_LOOCV), '\t test_per_class_accuracy: \n',
          dict(pd.DataFrame(total_per_class_accuracy).mean()))

    house_results_dictionary['accuracy'] = np.mean(total_acc_for_LOOCV)

    house_results_dictionary['f1_score'] = np.mean(total_f1_for_LOOCV)

    house_results_dictionary[
        'total_test_per_class_accuracy'] = total_per_class_accuracy

    house_results_dictionary['test_per_class_accuracy'] = dict(
        pd.DataFrame(total_per_class_accuracy).mean())

    house_results_dictionary['confusion_matrix'] = total_confusion_matrix

    # print('test set - average loss: {:.4f}, accuracy: {:.0f}%  test_f1_score: {:.4f} '
    #       .format(test_loss, 100. * test_acc, test_f1_score))

    return house_results_dictionary, total_embeddings
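The EarlyStopping helper above is constructed with a patience and a checkpoint path and is called with the validation F1 score, so improvement means the score goes up. A minimal sketch compatible with that usage follows; it is an assumption about the helper, not the project's exact implementation.

import torch

class EarlyStopping:
    def __init__(self, patience=15, verbose=False, path='checkpoint.pth'):
        self.patience = patience
        self.verbose = verbose
        self.path = path
        self.best_score = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, score, model):
        if self.best_score is None or score > self.best_score:
            # Improvement: reset the counter and checkpoint the current model.
            self.best_score = score
            self.counter = 0
            torch.save(model.state_dict(), self.path)
            if self.verbose:
                print('Validation score improved to {:.4f}; model saved.'.format(score))
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True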
Code example #9
def main(args,
         shuffle=True,
         decompressed_csv_path=None,
         ob_csv_file_path=None):
    file_names = ['ordonezB', 'houseB', 'houseC', 'houseA', 'ordonezA']
    file_names = ['ordonezB']  # restrict this run to a single house
    # run_time_configs = ['ob_data_compressed', 'raw_data', 'ob_data_Decompressed']
    run_time_configs = ['raw_data']
    for run_configuration in run_time_configs:
        results_list = []
        print('\n\n\n\n Running configuration', run_configuration, '\n\n\n\n')
        for file_name in file_names:
            print('house is: ', file_name)
            if run_configuration == 'raw_data':
                config['ob_data_compressed'] = False
                config['ob_data_Decompressed'] = False
                config['raw_data'] = True

            elif run_configuration == 'ob_data_compressed':
                config['ob_data_compressed'] = True
                config['ob_data_Decompressed'] = False
                config['raw_data'] = False

            elif run_configuration == 'ob_data_Decompressed':
                config['ob_data_compressed'] = False
                config['ob_data_Decompressed'] = True
                config['raw_data'] = False

            if config['ob_data_compressed']:
                ob_csv_file_path = os.path.join(os.getcwd(), '../../../',
                                                'data', file_name,
                                                'ob_' + file_name + '.csv')
                decompressed_csv_path = os.path.join(
                    os.getcwd(), '../../../', 'data', file_name,
                    'ob_' + file_name + '.csv')

            elif config['raw_data']:
                ob_csv_file_path = os.path.join(os.getcwd(), '../../../',
                                                'data', file_name,
                                                file_name + '.csv')
                decompressed_csv_path = os.path.join(os.getcwd(), '../../../',
                                                     'data', file_name,
                                                     file_name + '.csv')

            elif config['ob_data_Decompressed']:
                ob_csv_file_path = os.path.join(os.getcwd(), '../../../',
                                                'data', file_name,
                                                'ob_' + file_name + '.csv')
                decompressed_csv_path = os.path.join(
                    os.getcwd(), '../../../', 'data', file_name,
                    'ob_decompressed_' + file_name + '.csv')

            # # set up seeds, args.seed supported
            # torch.manual_seed(seed=args.seed)
            # np.random.seed(seed=args.seed)

            is_cuda = not args.disable_cuda and torch.cuda.is_available()
            is_cuda = False  # force CPU even when CUDA is available

            if is_cuda:
                args.device = torch.device("cuda:" + str(args.device))
                torch.cuda.manual_seed_all(seed=args.seed)
            else:
                args.device = torch.device("cpu")

            if config['raw_data']:
                graph_path = os.path.join('../../../data', file_name,
                                          file_name + '.bin')
                # graph_path = os.path.join('../../../data/all_houses/all_houses_raw.bin')
            elif config['ob_data_compressed']:
                graph_path = os.path.join('../../../data', file_name,
                                          'ob_' + file_name + '.bin')
            elif config['ob_data_Decompressed']:
                decompressedGraphPath = os.path.join('../../../data',
                                                     file_name,
                                                     file_name + '.bin')
                graph_path = os.path.join('../../../data', file_name,
                                          'ob_' + file_name + '.bin')

            graphs = []
            labels = []

            if not os.path.exists(graph_path):
                print('\n\n\n\n')
                print(
                    '*******************************************************************'
                )
                print('\t\t\t\t\t' + file_name + '\t\t\t\t\t\t\t')
                print(
                    '*******************************************************************'
                )
                print('\n\n\n\n')

                nodes = pd.read_csv('../../../data/' + file_name +
                                    '/nodes.csv')
                edges = pd.read_csv('../../../data/' + file_name +
                                    '/bidrectional_edges.csv')

                if config['ob_data_compressed']:
                    house = pd.read_csv('../../../data/' + file_name + '/ob_' +
                                        file_name + '.csv')
                    lastChangeTimeInMinutes = pd.read_csv(
                        '../../../data/' + file_name + '/' + 'ob-house' +
                        '-sensorChangeTime.csv')
                elif config['raw_data']:
                    house = pd.read_csv('../../../data/' + file_name + '/' +
                                        file_name + '.csv')
                    lastChangeTimeInMinutes = pd.read_csv(
                        '../../../data/' + file_name + '/' + 'house' +
                        '-sensorChangeTime.csv')

                u = edges['Src']
                v = edges['Dst']

                # Create one graph per row of the house CSV

                # Node features per node: [value, place_in_house, type, last_change_time_in_minutes]
                for i in range(len(house)):
                    # for i in range(5000):
                    feature = []
                    flag = 0
                    prev_node_value = 0
                    prev_node_change_time = 0
                    # Define Graph
                    g = dgl.graph((u, v))
                    node_num = 0
                    total_nodes = len(nodes)
                    # Add Features
                    for j in range(total_nodes - 1):
                        if nodes.loc[j, 'Type'] == 1:
                            node_value = -1
                            node_place_in_house = nodes.loc[j,
                                                            'place_in_house']
                            node_type = nodes.loc[j, 'Type']
                            feature.append([
                                node_value, node_place_in_house, node_type, -1
                            ])
                            node_num += 1
                            continue

                        if flag == 0:
                            node_value = house.iloc[i, 4 + j - node_num]
                            last_change_time_in_minutes = lastChangeTimeInMinutes.iloc[
                                i, 4 + j - node_num]
                            node_place_in_house = nodes.loc[j,
                                                            'place_in_house']
                            node_type = nodes.loc[j, 'Type']
                            feature.append([
                                node_value, node_place_in_house, node_type,
                                last_change_time_in_minutes
                            ])
                            if nodes.loc[j, 'Object'] == nodes.loc[j + 1,
                                                                   'Object']:
                                prev_node_value = node_value
                                prev_node_change_time = last_change_time_in_minutes
                                flag = 1
                        else:
                            node_num += 1
                            node_place_in_house = nodes.loc[j,
                                                            'place_in_house']
                            node_type = nodes.loc[j, 'Type']
                            feature.append([
                                prev_node_value, node_place_in_house,
                                node_type, prev_node_change_time
                            ])
                            if nodes.loc[j, 'Object'] != nodes.loc[j + 1,
                                                                   'Object']:
                                flag = 0

                    feature.append(
                        [house.loc[i, 'time_of_the_day'], -1, -1, -1])
                    g.ndata['attr'] = torch.tensor(feature)

                    # Give Label
                    try:
                        mappedActivity = config['merging_activties'][
                            house.iloc[i, 2]]
                        labels.append(
                            getIDFromClassName(mappedActivity, config))
                    except:  # activity has no entry in merging_activties; fall back to the raw label
                        activity = house.iloc[i, 2]
                        labels.append(getIDFromClassName(activity, config))

                    graphs.append(g)

                graph_labels = {"glabel": torch.tensor(labels)}

                save_graphs(graph_path, graphs, graph_labels)

            else:
                graphs, labels = load_graphs(graph_path)
                labels = list(labels['glabel'].numpy())
                if config['ob_data_Decompressed']:
                    DecompressedGraphs, DecompressedLabels = load_graphs(
                        decompressedGraphPath)
                    DecompressedLabels = list(
                        DecompressedLabels['glabel'].numpy())

            print(len(graphs))

            total_num_iteration_for_LOOCV = 0
            total_acc_for_LOOCV = []
            total_f1_for_LOOCV = []
            total_per_class_accuracy = []
            total_confusion_matrix = []
            score = 0
            accuracy = 0

            df = None
            # read csv Files
            house_name, all_test_loss, all_test_acc, all_test_f1_score, all_test_per_class_accuracy, all_test_confusion_matrix = [], [], [], [], [], []
            house_name_list = [
                'ordonezB', 'houseB', 'houseC', 'houseA', 'ordonezA'
            ]

            decompressed_csv = pd.read_csv(decompressed_csv_path)

            compressed_csv = pd.read_csv(ob_csv_file_path)

            uniqueIndex = getUniqueStartIndex(compressed_csv)

            # Required for the ob-decompressed configuration, where the test index is
            # taken from the decompressed CSV rather than from the OB CSV
            uniqueIndex_decompressed = getUniqueStartIndex(decompressed_csv)

            # Activities are mapped via the config to generalize labels that are not present in every CSV

            loo = LeaveOneOut()

            for train_index, test_index in loo.split(uniqueIndex):

                model = GIN(args.num_layers, args.num_mlp_layers,
                            args.input_features, args.hidden_dim,
                            args.nb_classes, args.final_dropout,
                            args.learn_eps, args.graph_pooling_type,
                            args.neighbor_pooling_type,
                            args.save_embeddings).to(args.device)
                optimizer = optim.Adam(model.parameters(), lr=args.lr)

                scheduler = optim.lr_scheduler.StepLR(optimizer,
                                                      step_size=50,
                                                      gamma=0.5)
                # initialize the early_stopping object
                early_stopping = EarlyStopping(patience=10, verbose=True)

                print(
                    '----------------------------------------------------------------------------------------------'
                )
                print('\n\n split: ', total_num_iteration_for_LOOCV)
                total_num_iteration_for_LOOCV += 1

                # Get start and end of test dataset
                start, end = getStartAndEndIndex(compressed_csv,
                                                 uniqueIndex[test_index])
                # Build the training set: skip everything between the test start and end; the rest is training data.

                train_graphs = graphs[:start] + graphs[end:]
                train_labels = labels[:start] + labels[end:]

                # Divide train, test and val dataframe
                val_graphs = train_graphs[:int(
                    len(train_graphs) * args.split_ratio)]
                val_labels = train_labels[:int(
                    len(train_labels) * args.split_ratio)]

                train_graphs = train_graphs[
                    int(len(train_graphs) * args.split_ratio):]
                train_labels = train_labels[
                    int(len(train_labels) * args.split_ratio):]

                # The test indices are taken from the decompressed data, because
                # decompression is only applied at evaluation time
                if config['ob_data_Decompressed']:
                    start, end = getStartAndEndIndex(
                        decompressed_csv, uniqueIndex_decompressed[test_index])
                    test_graphs = DecompressedGraphs[start:end]
                    test_labels = DecompressedLabels[start:end]
                else:
                    test_graphs = graphs[start:end]
                    test_labels = labels[start:end]

                # If start == end this is the last split and the test slice is empty;
                # skip it, because an empty test set breaks the code downstream.
                if start == end:
                    continue

                trainDataset = GraphHouseDataset(train_graphs, train_labels)
                valDataset = GraphHouseDataset(val_graphs, val_labels)
                testDataset = GraphHouseDataset(test_graphs, test_labels)

                trainloader = GraphDataLoader(
                    trainDataset,
                    batch_size=args.batch_size,
                    device=args.device,
                    collate_fn=collate,
                    seed=args.seed,
                    shuffle=shuffle,
                    split_name='fold10',
                    fold_idx=args.fold_idx,
                    save_embeddings=args.save_embeddings).train_valid_loader()

                validloader = GraphDataLoader(
                    valDataset,
                    batch_size=args.batch_size,
                    device=args.device,
                    collate_fn=collate,
                    seed=args.seed,
                    shuffle=shuffle,
                    split_name='fold10',
                    fold_idx=args.fold_idx,
                    save_embeddings=args.save_embeddings).train_valid_loader()

                testloader = GraphDataLoader(
                    testDataset,
                    batch_size=args.batch_size,
                    device=args.device,
                    collate_fn=collate,
                    seed=args.seed,
                    shuffle=shuffle,
                    split_name='fold10',
                    fold_idx=args.fold_idx,
                    save_embeddings=args.save_embeddings).train_valid_loader()

                criterion = nn.CrossEntropyLoss()  # default reduce is true

                # Training
                training(model, trainloader, validloader, optimizer, criterion,
                         scheduler, early_stopping)

                # Load Best Model from early stopping
                path = './checkpoint.pth'
                if os.path.isfile(path):
                    print("=> loading checkpoint '{}'".format(path))
                    checkpoint = torch.load(path,
                                            map_location=torch.device('cpu'))
                    model.load_state_dict(checkpoint)
                    # optimizer.load_state_dict(checkpoint['optimizer'])
                    print("=> loaded checkpoint '{}'".format(path))
                else:
                    print("=> no checkpoint found at '{}'".format(path))

                if len(testloader) != 0:
                    test_loss, test_acc, test_f1_score, test_per_class_accuracy, test_confusion_matrix = eval_net(
                        args, model, testloader, criterion, text='test')

                    total_acc_for_LOOCV.append(test_acc)
                    total_f1_for_LOOCV.append(test_f1_score)
                    total_per_class_accuracy.append(test_per_class_accuracy)
                    total_confusion_matrix.append(test_confusion_matrix)

                    print(
                        'test set - average loss: {:.4f}, accuracy: {:.0f}%  test_f1_score: {:.4f} '
                        .format(test_loss, 100. * test_acc, test_f1_score))

            house_results_dictionary = {}

            print(file_name + '\n \n', 'test_acc:\t',
                  np.mean(total_acc_for_LOOCV), '\t test f1 score',
                  np.mean(total_f1_for_LOOCV),
                  '\t test_per_class_accuracy: \n',
                  dict(pd.DataFrame(total_per_class_accuracy).mean()))

            house_results_dictionary['accuracy'] = np.mean(total_acc_for_LOOCV)

            house_results_dictionary['f1_score'] = np.mean(total_f1_for_LOOCV)

            house_results_dictionary['test_per_class_accuracy'] = dict(
                pd.DataFrame(total_per_class_accuracy).mean())

            house_results_dictionary[
                'confusion_matrix'] = total_confusion_matrix

            house_results_dictionary['house_name'] = file_name

            results_list.append(house_results_dictionary)

            if not os.path.exists(
                    os.path.join('../../../logs',
                                 'singleHouseGraphClassification')):
                os.mkdir(
                    os.path.join('../../../logs',
                                 'singleHouseGraphClassification'))

            print('\n\n\n\n\n\n Finished house', file_name, '\n\n\n\n')

        if config['ob_data_compressed']:
            print('saved')
            np.save(
                os.path.join('../../../logs/singleHouseGraphClassification',
                             'ob_compressed.npy'), results_list)
        elif config['ob_data_Decompressed']:
            print('saved')
            np.save(
                os.path.join('../../../logs/singleHouseGraphClassification',
                             'ob_decompressed.npy'), results_list)
        elif config['raw_data']:
            print('saved')
            np.save(
                os.path.join('../../../logs/singleHouseGraphClassification',
                             'raw.npy'), results_list)