Example 1
 print("Tasks len:", len(raw_tasks))
 if os.path.exists(DATA_DIR + "/" + datasource +
                   "/id_index/processed_dataset_{}.txt".format(
                       retweet_user_size)):
     with open(
             DATA_DIR + "/" + datasource +
             "/id_index/processed_dataset_{}.txt".format(
                 retweet_user_size), "r") as f:
         dataset = f.read()
         dataset = eval(dataset)
     vocab_size = get_vocab_size(datasource)
 else:
     data_builder = DatasetBuilder(datasource,
                                   time_cutoff=None,
                                   only_binary=True)
     dataset = data_builder.create_dataset(dataset_type="id_index",
                                           standardize_features=True)
     vocab_size = data_builder.get_vocab_size()
     np.set_printoptions(threshold=1e6)
     with open(
             DATA_DIR + "/" + datasource +
             "/id_index/processed_dataset_{}.txt".format(
                 retweet_user_size), "w") as f:
         f.write(str(dataset))
 print("dataset size: {}".format(len(dataset)))
 # print("task ids shape:\n{}".format(tasks_ids.shape))
 # split dataset for training and testing
 idxs = np.arange(0, len(raw_tasks))
 train_idxs, test_idxs = split_dataset(idxs, topic_split_rate[2], seed)
 val_idxs = train_idxs[-int(len(idxs) * topic_split_rate[1]):]
 train_idxs = train_idxs[:-int(len(idxs) * topic_split_rate[1])]
 # train_idxs, val_idxs, test_idxs = split_dataset(idxs, topic_split_rate, topic_task_nums, seed)
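The split above holds out the test topics first and then carves the validation topics off the tail of the shuffled training indices. split_dataset is project code that is not shown in this excerpt; the sketch below is only a guess at its shape, using a hypothetical shuffle-and-hold-out helper, and assumes topic_split_rate is a (train, val, test) tuple of fractions.

import numpy as np

def split_dataset_sketch(idxs, test_rate, seed):
    # Hypothetical stand-in for the project's split_dataset:
    # shuffle the indices and hold out the last `test_rate` fraction.
    rng = np.random.RandomState(seed)
    shuffled = rng.permutation(idxs)
    n_test = int(len(shuffled) * test_rate)
    return shuffled[:-n_test], shuffled[-n_test:]

idxs = np.arange(10)
topic_split_rate = (0.6, 0.2, 0.2)  # assumed (train, val, test) fractions
train_idxs, test_idxs = split_dataset_sketch(idxs, topic_split_rate[2], seed=42)
val_idxs = train_idxs[-int(len(idxs) * topic_split_rate[1]):]
train_idxs = train_idxs[:-int(len(idxs) * topic_split_rate[1])]
print(len(train_idxs), len(val_idxs), len(test_idxs))  # 6 2 2
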
Example 2
            except ValueError:
                pass
        if (i + 1) % 5 == 0:
            time_spent = time.time() - start
            progress = 100. * (i + 1) / total
            print(
                f"{progress:.2f} % DONE, in {time_spent:.2f} seconds. Total would be {time_spent * 100 / (progress * 60):.2f} mins")

    df = pd.DataFrame(data=df_data, columns=df_columns)
    df.label = df.label.astype('category')
    df.to_csv(f"seiz_dataset_{name}.csv", index=False)


if __name__ == "__main__":
    dataset_selected = 'twitter16'
    # Building a SEIZ dataset
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False, time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw", standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')

    dataset_selected = 'twitter15'
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False, time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw", standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')
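As a side note on the progress message in the loop at the top of this example: the ETA term scales the elapsed seconds up to 100 % of the items and converts to minutes. A tiny standalone check of that arithmetic:

# Toy check of the ETA formula used in the progress print above.
time_spent = 30.0   # seconds elapsed after processing 25 % of the items
progress = 25.0     # percent done
estimated_total_min = time_spent * 100 / (progress * 60)
print(f"Total would be {estimated_total_min:.2f} mins")  # Total would be 2.00 mins
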
Example 3
        retweet_user_size,
        seed
    )
    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)

    # ----------------------------------------------
    # Load dataset
    # ----------------------------------------------
    # load raw dataset
    raw_dataset = load_raw_dataset(DATA_ROOT_PATH)

    # build dataset with preprocessing
    # parameter setting
    data_builder = DatasetBuilder(raw_dataset, retweet_user_size)
    dataset, topic_index = data_builder.create_dataset()
    print('Topics in dataset: {}'.format(topic_index.keys()))
    print('Dataset size: {}'.format(len(dataset)))
    # raw_tasks: [{0: [t_ids], 1: [t_ids]}, ...], one dict per topic mapping label to tweet-id lists
    raw_tasks = [topic_index[topic] for topic in topic_index.keys()]
    task_sizes = []
    print("scaled task distribution:")
    for task in raw_tasks:
        label_counts = [len(task[key]) for key in task.keys()]
        print(label_counts)
        task_sizes.append(sum(label_counts))
    task_sizes = np.array(task_sizes)

    # split tasks in the dataset for training and testing
    idxs = np.arange(0, len(raw_tasks))

    train_idxs, test_idxs = split_dataset(idxs, topic_split_rate[2], seed)
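topic_index comes from the project's DatasetBuilder and its exact contents are not shown here. The toy structure below only mirrors the shape described by the comment above, a per-topic dict mapping label 0/1 to tweet-id lists, to make the size bookkeeping concrete; the topic names and ids are made up.

import numpy as np

# Toy topic_index with the assumed shape {topic: {label: [tweet ids]}}.
topic_index = {
    "topic_a": {0: ["t1", "t2"], 1: ["t3"]},
    "topic_b": {0: ["t4"], 1: ["t5", "t6", "t7"]},
}
raw_tasks = [topic_index[topic] for topic in topic_index.keys()]
task_sizes = np.array(
    [sum(len(task[key]) for key in task.keys()) for task in raw_tasks])
print(task_sizes)  # [3 4]
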
Example 4
def train(dataset, args):

    on_gpu = torch.cuda.is_available()
    if on_gpu:
        print("Using gpu")

    # Loading dataset

    time_cutoff = None if args.time_cutoff == "None" else int(args.time_cutoff)
    dataset_builder = DatasetBuilder(dataset,
                                     only_binary=args.only_binary,
                                     features_to_consider=args.features,
                                     time_cutoff=time_cutoff,
                                     seed=args.seed)
    datasets = dataset_builder.create_dataset(
        standardize_features=args.standardize,
        on_gpu=on_gpu,
        oversampling_ratio=args.oversampling_ratio)
    train_data_loader = torch_geometric.data.DataLoader(
        datasets["train"], batch_size=args.batch_size, shuffle=True)
    val_data_loader = torch_geometric.data.DataLoader(
        datasets["val"], batch_size=args.batch_size, shuffle=True)
    test_data_loader = torch_geometric.data.DataLoader(
        datasets["test"], batch_size=args.batch_size, shuffle=True)

    print("Number of node features", dataset_builder.num_node_features)
    print("Dimension of hidden space", args.hidden_dim)

    # Setting up model
    model = GNNStack(dataset_builder.num_node_features, args.hidden_dim,
                     dataset_builder.num_classes, args)
    # model = GNNStack(dataset.num_node_features, 32, dataset.num_classes, args)
    if on_gpu:
        model.cuda()

    # Tensorboard logging
    log_dir = os.path.join("logs", args.exp_name)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    train_writer = SummaryWriter(os.path.join(log_dir, "train"))
    val_writer = SummaryWriter(os.path.join(log_dir, "val"))
    test_writer = SummaryWriter(os.path.join(log_dir, "test"))

    # CSV logging
    csv_logging = []

    # Checkpoints
    checkpoint_dir = os.path.join("checkpoints", args.exp_name)
    checkpoint_path = os.path.join(checkpoint_dir, "model.pt")
    if args.exp_name == "default" or not os.path.isfile(checkpoint_path):
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        epoch_ckp = 0
        global_step = 0
        best_val_acc = 0
    else:
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model_state_dict"])
        epoch_ckp = checkpoint["epoch"]
        global_step = checkpoint["global_step"]
        best_val_acc = checkpoint["best_val_acc"]
        print("Restoring previous model at epoch", epoch_ckp)

    # Training phase
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=5e-4)
    for epoch in range(epoch_ckp, epoch_ckp + args.num_epochs):
        model.train()
        epoch_loss = 0
        for batch in train_data_loader:
            # print(batch)
            # import pdb; pdb.set_trace()
            optimizer.zero_grad()
            out = model(batch)
            loss = F.nll_loss(out, batch.y)
            epoch_loss += loss.sum().item()

            # Optimization
            loss.backward()
            optimizer.step()

            # TFBoard logging
            train_writer.add_scalar("loss", loss.mean(), global_step)
            global_step += 1

        print("epoch", epoch, "loss:", epoch_loss / len(train_data_loader))
        if epoch % 1 == 0:  # evaluate every epoch; raise the modulus to evaluate less often
            # Evaluation on the training set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in train_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            train_acc = correct / n_samples
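            # Per-class diagnostics: correct / actual count is class recall,
            # while correct / predicted count is class precision; the variable
            # names below simply mirror the TensorBoard tags they feed.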
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            train_writer.add_scalar("Accuracy", train_acc, epoch)
            for i in range(dataset_builder.num_classes):
                train_writer.add_scalar("Accuracy_{}".format(i),
                                        acc_per_label[i], epoch)
                train_writer.add_scalar("Recall_{}".format(i),
                                        rec_per_label[i], epoch)
            print('Training accuracy: {:.4f}'.format(train_acc))

            # Evaluation on the validation set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in val_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            val_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            val_writer.add_scalar("Accuracy", val_acc, epoch)
            for i in range(dataset_builder.num_classes):
                val_writer.add_scalar("Accuracy_{}".format(i),
                                      acc_per_label[i], epoch)
                val_writer.add_scalar("Recall_{}".format(i), rec_per_label[i],
                                      epoch)
            print('Validation accuracy: {:.4f}'.format(val_acc))

            # Evaluation on the test set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in test_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            test_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            test_writer.add_scalar("Accuracy", test_acc, epoch)
            for i in range(dataset_builder.num_classes):
                test_writer.add_scalar("Accuracy_{}".format(i),
                                       acc_per_label[i], epoch)
                test_writer.add_scalar("Recall_{}".format(i), rec_per_label[i],
                                       epoch)
            print('Test accuracy: {:.4f}'.format(test_acc))

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                # Saving model if model is better
                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "epoch_loss": epoch_loss / len(train_data_loader),
                    "global_step": global_step,
                    "best_val_acc": best_val_acc
                }
                torch.save(checkpoint, checkpoint_path)

                dict_logging = vars(args).copy()
                dict_logging["train_acc"] = train_acc
                dict_logging["val_acc"] = val_acc
                dict_logging["test_acc"] = test_acc
                csv_logging.append(dict_logging)

    # Append results only if at least one improved checkpoint was logged;
    # otherwise dict_logging would be undefined here.
    if csv_logging:
        csv_exists = os.path.exists("results.csv")
        header = csv_logging[0].keys()

        with open("results.csv", "a") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=header)
            if not csv_exists:
                writer.writeheader()
            for dict_ in csv_logging:
                writer.writerow(dict_)
    return
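For context, one plausible way to call train. The attribute names match what the function body reads, but the values are placeholders rather than the repository's defaults, and the dataset name follows the 'twitter15'/'twitter16' convention seen elsewhere in these examples.

from argparse import Namespace

# Placeholder hyper-parameters; only the attribute names are taken from what
# train() reads, the values themselves are illustrative guesses.
args = Namespace(
    time_cutoff="None",        # parsed inside train(); "None" disables the cutoff
    only_binary=True,
    features=None,             # placeholder; valid values depend on DatasetBuilder
    seed=0,
    standardize=True,
    oversampling_ratio=1,
    batch_size=32,
    hidden_dim=64,
    exp_name="default",
    lr=1e-3,
    num_epochs=50,
)
train("twitter15", args)
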
Example 5
                # }

        # torch.save(checkpoint, checkpoint_path)
        print("epoch", epoch, "loss:", epoch_loss / len(train_loader))

    return max_running_mean


if __name__ == "__main__":
    args = parser.parse_args()
    # Loading dataset

    dataset_builder = DatasetBuilder(args.dataset,
                                     only_binary=True,
                                     time_cutoff=1500)
    full_dataset = dataset_builder.create_dataset(dataset_type="sequential",
                                                  standardize_features=False)
    val_dataset = full_dataset['val']

    if args.debug:
        train_dataset = val_dataset
    else:
        train_dataset = full_dataset['train']

    train_dataset = seq_data_to_dataset(train_dataset,
                                        cap_len=args.cap_len,
                                        num_features=11,
                                        standardize=True)
    val_dataset = seq_data_to_dataset(val_dataset,
                                      cap_len=args.cap_len,
                                      num_features=11,
                                      standardize=True)
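seq_data_to_dataset is project code whose internals are not shown in this excerpt. As a rough illustration of what a cap_len of this kind usually implies, the sketch below truncates or zero-pads each cascade of 11-dimensional feature vectors to a fixed length; this is an assumption, not the repository's implementation.

import numpy as np

def pad_or_truncate(seq, cap_len, num_features):
    # Assumed behaviour: clip sequences longer than cap_len and zero-pad
    # shorter ones so every example ends up with shape (cap_len, num_features).
    seq = np.asarray(seq, dtype=np.float32)[:cap_len]
    out = np.zeros((cap_len, num_features), dtype=np.float32)
    out[:len(seq)] = seq
    return out

cascade = np.random.rand(7, 11)   # e.g. 7 retweets, 11 features each
print(pad_or_truncate(cascade, cap_len=20, num_features=11).shape)  # (20, 11)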