Example #1
    def init_dataset(cls, goal_object):
        """Create the dataset builder used to record runs.

        :return: a ``DatasetBuilder`` pre-configured with the run variables,
            coordinates and the ``goal_object`` attribute.
        """
        return DatasetBuilder(
            {
                "name": (),
                "initial_position": ("axis",),
                "initial_angle": (),
                "goal_position": ("axis",),
                "goal_angle": (),
                "position": ("axis",),
                "angle": (),
                "wheel_target_speeds": ("wheel",),
                "scanner_distances": ("scanner_angle",),
                "scanner_image": ("scanner_angle", "channel"),
                "goal_reached": (),
                "goal_position_distance": (),
                "goal_angle_distance": ()
            },
            coords={
                # TODO: run and step might be converted to a MultiIndex, making it
                #       possible to directly use them for indexing
                "run": (),
                "step": (),
                "axis": (..., ["x", "y"]),
                "channel": (..., ["r", "g", "b"]),
                "wheel": (..., ["l", "r"]),
                "scanner_angle": (..., np.linspace(-np.pi, np.pi, 180))
            },
            attrs={"goal_object": goal_object})
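The TODO above could be addressed once the builder produces an xarray.Dataset (the dims/coords/attrs layout suggests something xarray-like): the per-sample run and step coordinates can be combined into a MultiIndex and then used directly for label-based selection. A minimal sketch, where the sample dimension name and the toy data are purely illustrative, not part of the builder's API:

import numpy as np
import xarray as xr

# Toy dataset: six samples spread over two runs of three steps each.
ds = xr.Dataset(
    {"angle": ("sample", np.zeros(6))},
    coords={"run": ("sample", [0, 0, 0, 1, 1, 1]),
            "step": ("sample", [0, 1, 2, 0, 1, 2])})

# Combine run and step into a MultiIndex over the sample dimension.
ds = ds.set_index(sample=["run", "step"])

first_run = ds.sel(run=0)          # all steps of run 0
one_step = ds.sel(run=1, step=2)   # a single (run, step) pair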
def get_training_dataset(audio_params, audio_adapter, audio_path):
    """ Builds training dataset.

    :param audio_params: Audio parameters.
    :param audio_adapter: Adapter to load audio from.
    :param audio_path: Path of directory containing audio.
    :returns: Built dataset.
    """
    builder = DatasetBuilder(audio_params,
                             audio_adapter,
                             audio_path,
                             chunk_duration=audio_params.get(
                                 'chunk_duration', 20.0),
                             random_seed=audio_params.get('random_seed', 0))
    return builder.build(audio_params.get('train_csv'),
                         cache_directory=audio_params.get('training_cache'),
                         batch_size=audio_params.get('batch_size'),
                         n_chunks_per_song=audio_params.get(
                             'n_chunks_per_song', 1),
                         random_data_augmentation=False,
                         convert_to_uint=True,
                         wait_for_cache=False)
def get_validation_dataset(audio_params, audio_adapter, audio_path):
    """ Builds validation dataset.

    :param audio_params: Audio parameters.
    :param audio_adapter: Adapter to load audio from.
    :param audio_path: Path of directory containing audio.
    :returns: Built dataset.
    """
    builder = DatasetBuilder(audio_params,
                             audio_adapter,
                             audio_path,
                             chunk_duration=20.0)
    return builder.build(
        audio_params.get('validation_csv'),
        batch_size=100,
        cache_directory=audio_params.get('validation_cache'),
        convert_to_uint=True,
        infinite_generator=False,
        n_chunks_per_song=1,
        # should not perform data augmentation for eval:
        random_data_augmentation=False,
        random_time_crop=False,
        shuffle=False,
    )
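A minimal sketch of how these two helpers might be wired together. The configuration file path, the audio_path key and the adapter factory below are placeholders assumed only so the example is self-contained:

import json

with open("config/train_config.json") as stream:   # hypothetical config file
    audio_params = json.load(stream)

audio_adapter = make_audio_adapter()                # assumed adapter factory
audio_path = audio_params.get("audio_path", "data/audio")

train_dataset = get_training_dataset(audio_params, audio_adapter, audio_path)
validation_dataset = get_validation_dataset(audio_params, audio_adapter, audio_path)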
Example #4
    def _buildItems(self):
        DatasetBuilder().buildDataset(configuration.itemdir)
Example #5
    def _buildDialogs(self):
        DatasetBuilder().buildDataset(configuration.dialogdir)
Example #6
    def _buildScripts(self):
        DatasetBuilder().buildDataset(configuration.scriptdir)
Example #7
    def _buildTilesets(self):
        DatasetBuilder().buildDataset(configuration.tilesetdir)
Example #8
    def _buildSpritesets(self):
        DatasetBuilder().buildDataset(configuration.spritesetdir)
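Examples #4 through #8 repeat the same one-line pattern for different resource directories. A hedged sketch of how the five calls could be folded into a single loop; the helper name is invented, while the configuration attributes and the DatasetBuilder class are exactly the ones referenced above:

def build_all_resources(configuration):
    # Illustrative only: rebuild every resource directory in turn; the
    # original code keeps one method per directory.
    for directory in (configuration.itemdir,
                      configuration.dialogdir,
                      configuration.scriptdir,
                      configuration.tilesetdir,
                      configuration.spritesetdir):
        DatasetBuilder().buildDataset(directory)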
Example #9
import pandas as pd
import numpy as np
from dataset import DatasetBuilder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import seaborn as sns
from matplotlib import pyplot as plt
import json

# Predict price variations of an altcoin based on BTC and some other altcoins.
if __name__ == '__main__':
    db = DatasetBuilder()
    target = 'ETH'
    symbols = {
        'ADA': "ADA",
        'BCH': "BCH",
        'BNB': "BNB",
        'BTC': "BTC",
        'BTG': "BTG",
        'DASH': "DASH",
        'DOGE': "DOGE",
        'EOS': "EOS",
        'ETC': "ETC",
        'ETH': "ETH",
        'IOT': "MIOTA",
        'LINK': "LINK",
        'LTC': "LTC",
        'NEO': "NEO",
        'QTUM': "QTUM",
        'TRX': "TRX",
        'USDT': "USDT",
Example #10
                    help='Image width, this parameter will affect the output '
                    'shape of the model, default is 100, so this model '
                    'can only predict up to 24 characters.')
parser.add_argument('-b',
                    '--batch_size',
                    type=int,
                    default=256,
                    help='Batch size.')
parser.add_argument('-m',
                    '--model',
                    type=str,
                    required=True,
                    help='The saved model.')
parser.add_argument('--img_channels',
                    type=int,
                    default=1,
                    help='0: Use the number of channels in the image, '
                    '1: Grayscale image, 3: RGB image')
parser.add_argument('--ignore_case',
                    action='store_true',
                    help='Whether to ignore case (default: false).')
args = parser.parse_args()

dataset_builder = DatasetBuilder(args.table_path, args.img_width,
                                 args.img_channels, args.ignore_case)
eval_ds, size = dataset_builder.build(args.ann_paths, False, args.batch_size)
print('Num of eval samples: {}'.format(size))

model = keras.models.load_model(args.model, compile=False)
model.compile(loss=CTCLoss(), metrics=[WordAccuracy()])
model.evaluate(eval_ds)
Example #11
def train(dataset, args):

    on_gpu = torch.cuda.is_available()
    if on_gpu:
        print("Using gpu")

    # Loading dataset

    time_cutoff = None if args.time_cutoff == "None" else int(args.time_cutoff)
    dataset_builder = DatasetBuilder(dataset,
                                     only_binary=args.only_binary,
                                     features_to_consider=args.features,
                                     time_cutoff=time_cutoff,
                                     seed=args.seed)
    datasets = dataset_builder.create_dataset(
        standardize_features=args.standardize,
        on_gpu=on_gpu,
        oversampling_ratio=args.oversampling_ratio)
    train_data_loader = torch_geometric.data.DataLoader(
        datasets["train"], batch_size=args.batch_size, shuffle=True)
    val_data_loader = torch_geometric.data.DataLoader(
        datasets["val"], batch_size=args.batch_size, shuffle=True)
    test_data_loader = torch_geometric.data.DataLoader(
        datasets["test"], batch_size=args.batch_size, shuffle=True)

    print("Number of node features", dataset_builder.num_node_features)
    print("Dimension of hidden space", args.hidden_dim)

    # Setting up model
    model = GNNStack(dataset_builder.num_node_features, args.hidden_dim,
                     dataset_builder.num_classes, args)
    # model = GNNStack(dataset.num_node_features, 32, dataset.num_classes, args)
    if on_gpu:
        model.cuda()

    # Tensorboard logging
    log_dir = os.path.join("logs", args.exp_name)
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)
    train_writer = SummaryWriter(os.path.join(log_dir, "train"))
    val_writer = SummaryWriter(os.path.join(log_dir, "val"))
    test_writer = SummaryWriter(os.path.join(log_dir, "test"))

    # CSV logging
    csv_logging = []

    # Checkpoints
    checkpoint_dir = os.path.join("checkpoints", args.exp_name)
    checkpoint_path = os.path.join(checkpoint_dir, "model.pt")
    if args.exp_name == "default" or not os.path.isfile(checkpoint_path):
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        epoch_ckp = 0
        global_step = 0
        best_val_acc = 0
    else:
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model_state_dict"])
        epoch_ckp = checkpoint["epoch"]
        global_step = checkpoint["global_step"]
        best_val_acc = checkpoint["best_val_acc"]
        print("Restoring previous model at epoch", epoch_ckp)

    # Training phase
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=5e-4)
    for epoch in range(epoch_ckp, epoch_ckp + args.num_epochs):
        model.train()
        epoch_loss = 0
        for batch in train_data_loader:
            # print(batch)
            # import pdb; pdb.set_trace()
            optimizer.zero_grad()
            out = model(batch)
            loss = F.nll_loss(out, batch.y)
            epoch_loss += loss.sum().item()

            # Optimization
            loss.backward()
            optimizer.step()

            # TFBoard logging
            train_writer.add_scalar("loss", loss.mean(), global_step)
            global_step += 1

        print("epoch", epoch, "loss:", epoch_loss / len(train_data_loader))
        if epoch % 1 == 0:
            # Evaluation on the training set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in train_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            train_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            train_writer.add_scalar("Accuracy", train_acc, epoch)
            for i in range(dataset_builder.num_classes):
                train_writer.add_scalar("Accuracy_{}".format(i),
                                        acc_per_label[i], epoch)
                train_writer.add_scalar("Recall_{}".format(i),
                                        rec_per_label[i], epoch)
            print('Training accuracy: {:.4f}'.format(train_acc))

            # Evaluation on the validation set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in val_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            val_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            val_writer.add_scalar("Accuracy", val_acc, epoch)
            for i in range(dataset_builder.num_classes):
                val_writer.add_scalar("Accuracy_{}".format(i),
                                      acc_per_label[i], epoch)
                val_writer.add_scalar("Recall_{}".format(i), rec_per_label[i],
                                      epoch)
            print('Validation accuracy: {:.4f}'.format(val_acc))

            # Evaluation on the test set
            model.eval()
            correct = 0
            n_samples = 0
            samples_per_label = np.zeros(dataset_builder.num_classes)
            pred_per_label = np.zeros(dataset_builder.num_classes)
            correct_per_label = np.zeros(dataset_builder.num_classes)
            with torch.no_grad():
                for batch in test_data_loader:
                    _, pred = model(batch).max(dim=1)
                    correct += float(pred.eq(batch.y).sum().item())
                    for i in range(dataset_builder.num_classes):
                        batch_i = batch.y.eq(i)
                        pred_i = pred.eq(i)
                        samples_per_label[i] += batch_i.sum().item()
                        pred_per_label[i] += pred_i.sum().item()
                        correct_per_label[i] += (batch_i * pred_i).sum().item()
                    n_samples += len(batch.y)
            test_acc = correct / n_samples
            acc_per_label = correct_per_label / samples_per_label
            rec_per_label = correct_per_label / pred_per_label
            test_writer.add_scalar("Accuracy", test_acc, epoch)
            for i in range(dataset_builder.num_classes):
                test_writer.add_scalar("Accuracy_{}".format(i),
                                       acc_per_label[i], epoch)
                test_writer.add_scalar("Recall_{}".format(i), rec_per_label[i],
                                       epoch)
            print('Test accuracy: {:.4f}'.format(test_acc))

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                # Saving model if model is better
                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "epoch_loss": epoch_loss / len(train_data_loader),
                    "global_step": global_step,
                    "best_val_acc": best_val_acc
                }
                torch.save(checkpoint, checkpoint_path)

                dict_logging = vars(args).copy()
                dict_logging["train_acc"] = train_acc
                dict_logging["val_acc"] = val_acc
                dict_logging["test_acc"] = test_acc
                csv_logging.append(dict_logging)

    # Only write results if at least one improved checkpoint was logged;
    # otherwise there is no row to write.
    if csv_logging:
        csv_exists = os.path.exists("results.csv")
        header = csv_logging[0].keys()

        with open("results.csv", "a") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=header)
            if not csv_exists:
                writer.writeheader()
            for dict_ in csv_logging:
                writer.writerow(dict_)
    return
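Since train() only reads plain attributes from args, it can also be driven without a CLI, assuming the call runs in the same module. A minimal sketch in which every field mirrors an attribute the function actually accesses; the values themselves, and the dataset name, are illustrative guesses:

from types import SimpleNamespace

args = SimpleNamespace(
    only_binary=True,          # DatasetBuilder option
    features="all",            # features_to_consider (value is a guess)
    time_cutoff="1500",        # parsed to int inside train()
    seed=0,
    standardize=True,
    oversampling_ratio=1.0,
    batch_size=32,
    hidden_dim=64,             # dimension of the GNN hidden space
    exp_name="default",        # keeps checkpointing in its fresh-start branch
    lr=1e-3,
    num_epochs=10,
)
train("twitter15", args)       # dataset name borrowed from the later examples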
Example #12
        dst_root_folder,
        retweet_user_size,
        seed
    )
    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)

    # ----------------------------------------------
    # Load dataset
    # ----------------------------------------------
    # load raw dataset
    raw_dataset = load_raw_dataset(DATA_ROOT_PATH)

    # build dataset with preprocessing
    # parameter setting
    data_builder = DatasetBuilder(raw_dataset, retweet_user_size)
    dataset, topic_index = data_builder.create_dataset()
    print('Topics in dataset: {}'.format(topic_index.keys()))
    print('Dataset size: {}'.format(len(dataset)))
    # raw_tasks: [{0: [t_ids], 1: [t_ids]}, ...]
    raw_tasks = [topic_index[topic] for topic in topic_index.keys()]
    task_sizes = []
    print("scaled task distribution:")
    for task in raw_tasks:
        print([len(task[key]) for key in task.keys()])
        task_sizes.append(sum([len(task[key]) for key in task.keys()]))
    task_sizes = np.array(task_sizes)

    # split tasks in the dataset for training and testing
    idxs = np.arange(0, len(raw_tasks))
Example #13
                #     "epoch_loss": args.batch_size * epoch_loss / len(train_loader),
                #     "global_step": global_step
                # }

        # torch.save(checkpoint, checkpoint_path)
        print("epoch", epoch, "loss:", epoch_loss / len(train_loader))

    return max_running_mean


if __name__ == "__main__":
    args = parser.parse_args()
    # Loading dataset

    dataset_builder = DatasetBuilder(args.dataset,
                                     only_binary=True,
                                     time_cutoff=1500)
    full_dataset = dataset_builder.create_dataset(dataset_type="sequential",
                                                  standardize_features=False)
    val_dataset = full_dataset['val']

    if args.debug:
        train_dataset = val_dataset
    else:
        train_dataset = full_dataset['train']

    train_dataset = seq_data_to_dataset(train_dataset,
                                        cap_len=args.cap_len,
                                        num_features=11,
                                        standardize=True)
    val_dataset = seq_data_to_dataset(val_dataset,
Example #14
 #     task = sample_task_from_raw_task(raw_task, support_shots, query_shots)
 #     tasks.append(task)
 print("Tasks len:", len(raw_tasks))
 if os.path.exists(DATA_DIR + "/" + datasource +
                   "/id_index/processed_dataset_{}.txt".format(
                       retweet_user_size)):
     with open(
             DATA_DIR + "/" + datasource +
             "/id_index/processed_dataset_{}.txt".format(
                 retweet_user_size), "r") as f:
         dataset = f.read()
         dataset = eval(dataset)
     vocab_size = get_vocab_size(datasource)
 else:
     data_builder = DatasetBuilder(datasource,
                                   time_cutoff=None,
                                   only_binary=True)
     dataset = data_builder.create_dataset(dataset_type="id_index",
                                           standardize_features=True)
     vocab_size = data_builder.get_vocab_size()
     np.set_printoptions(threshold=1e6)
     with open(
             DATA_DIR + "/" + datasource +
             "/id_index/processed_dataset_{}.txt".format(
                 retweet_user_size), "w") as f:
         f.write(str(dataset))
 print("dataset size: {}".format(len(dataset)))
 # print("task ids shape:\n{}".format(tasks_ids.shape))
 # split dataset for training and testing
 idxs = np.arange(0, len(raw_tasks))
 train_idxs, test_idxs = split_dataset(idxs, topic_split_rate[2], seed)
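split_dataset itself is not shown in these snippets. A hedged sketch of a helper matching the call above, assuming it shuffles indices with a fixed seed and carves off a test fraction; the real implementation may differ:

import numpy as np

def split_dataset(idxs, test_ratio, seed):
    # Shuffle deterministically, then split off the last test_ratio share.
    rng = np.random.default_rng(seed)
    shuffled = rng.permutation(idxs)
    n_test = int(len(shuffled) * test_ratio)
    return shuffled[n_test:], shuffled[:n_test]   # (train_idxs, test_idxs)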
Example #15
            except ValueError:
                pass
        if (i + 1) % 5 == 0:
            time_spent = time.time() - start
            progress = 100. * (i + 1) / total
            print(
                f"{progress:.2f} % DONE, in {time_spent:.2f} seconds. Total would be {time_spent * 100 / (progress * 60):.2f} mins")

    df = pd.DataFrame(data=df_data, columns=df_columns)
    df.label = df.label.astype('category')
    df.to_csv(f"seiz_dataset_{name}.csv", index=False)


if __name__ == "__main__":
    dataset_selected = 'twitter16'
    # Building a SEIZ dataset
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False, time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw", standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')

    dataset_selected = 'twitter15'
    dataset_builder = DatasetBuilder(dataset_selected, only_binary=False, time_cutoff=10000)
    full_dataset = dataset_builder.create_dataset(dataset_type="raw", standardize_features=False)
    train_set = full_dataset['train']
    dump_seiz_dataset(train_set, name=dataset_selected)
    dump_seiz_dataset(full_dataset['val'], name=dataset_selected + '_val')
    dump_seiz_dataset(full_dataset['test'], name=dataset_selected + '_test')
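As a hypothetical follow-up, the CSVs written above can be read back with pandas; the filename follows the f"seiz_dataset_{name}.csv" pattern used in dump_seiz_dataset and the categorical cast mirrors the one applied before saving:

import pandas as pd

df = pd.read_csv("seiz_dataset_twitter16.csv")
df["label"] = df["label"].astype("category")   # restore the categorical dtype
print(df["label"].value_counts())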
Example #16
    def on_batch_begin(self, batch, logs=None):
        lr = cosine_decay_with_warmup(
            global_step=self.global_step,
            learning_rate_base=self.learning_rate_base,
            total_steps=self.total_steps,
            warmup_learning_rate=self.warmup_learning_rate,
            warmup_steps=self.warmup_steps,
            hold_base_rate_steps=self.hold_base_rate_steps)
        K.set_value(self.model.optimizer.lr, lr)
        if self.verbose > 0:
            print('\nBatch %05d: setting learning '
                  'rate to %s.' % (self.global_step + 1, lr))


dataset_builder = DatasetBuilder(args.charset, args.img_width,
                                 args.img_channels, args.ignore_case)
train_ds, train_size = dataset_builder.build(args.train_ann_paths, True,
                                             args.batch_size)

print('Num of training samples: {}'.format(train_size))
print("num of label", dataset_builder.num_classes)
saved_model_prefix = '{epoch:03d}_{word_accuracy:.4f}'
if args.val_ann_paths:
    val_ds, val_size = dataset_builder.build(args.val_ann_paths, False,
                                             args.batch_size)
    print('Num of val samples: {}'.format(val_size))
    saved_model_prefix = saved_model_prefix + '_{val_word_accuracy:.4f}'
else:
    val_ds = None
saved_model_path = ('saved_models/{}/'.format(localtime) + saved_model_prefix +
                    '.h5')
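The formatted saved_model_path above is the kind of template tf.keras.callbacks.ModelCheckpoint expands from the training logs (epoch, word_accuracy and, when a validation set exists, val_word_accuracy). A hedged sketch of how it might be hooked up; the callback configuration is an assumption, not part of the snippet:

from tensorflow import keras

checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=saved_model_path,   # '{epoch:03d}_{word_accuracy:.4f}...' template
    monitor='val_word_accuracy' if val_ds is not None else 'word_accuracy',
    save_best_only=True)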
Example #17
train_files = files[0:int(sample_len * 0.9)]
train_labels = labels[0:int(sample_len * 0.9)]

val_files = files[int(sample_len * 0.9):]
val_labels = labels[int(sample_len * 0.9):]


def preprocess(x, y):
    img = tf.io.read_file(x)
    img = tf.io.decode_jpeg(img, channels=1)
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, (32, 100))
    return img, y


dataset_builder = DatasetBuilder('./table_path.txt', 100, 1, ignore_case=False)
train_ds, train_size = dataset_builder.build(train_files, train_labels, True,
                                             batch_size)
val_ds, val_size = dataset_builder.build(val_files, val_labels, True,
                                         batch_size)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, clipnorm=5)

Epochs = 60
model = build_model(11, channels=1)
model.summary()
model.compile(optimizer=optimizer,
              loss=CTCLoss(),
              metrics=[WordAccuracy()])

localtime = time.strftime("%Y%m%d%H%M%S", time.localtime())
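A hypothetical continuation of the script above: fit the compiled model on the datasets that were just built, reusing the Epochs constant and the localtime stamp. The choice of callback is an assumption, not shown in the snippet:

callbacks = [keras.callbacks.TensorBoard(log_dir='logs/{}'.format(localtime))]
model.fit(train_ds,
          validation_data=val_ds,
          epochs=Epochs,
          callbacks=callbacks)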