Example #1
    def transform(self, X_df, y_df=None):
        """Transform the columns."""
        assert self._fit_called, 'You will need to first call ' \
                                 'preprocessor.fit_transform before calling ' \
                                 'preprocessor.transform.'
        text_features = []
        categorical_features = []
        numerical_features = []
        for col_name in self._text_feature_names:
            col_value = X_df[col_name]
            col_type = self._column_types[col_name]
            if col_type == _C.TEXT or col_type == _C.CATEGORICAL:
                processed_data = col_value.apply(lambda ele: ''
                                                 if ele is None else str(ele))
            elif col_type == _C.NUMERICAL:
                processed_data = pd.to_numeric(col_value).apply(
                    '{:.3f}'.format)
            else:
                raise NotImplementedError
            processed_data = parallel_transform(
                df=processed_data,
                chunk_processor=functools.partial(tokenize_data,
                                                  tokenizer=self._tokenizer))
            text_features.append(processed_data)

        for col_name, num_category in zip(self._categorical_feature_names,
                                          self._categorical_num_categories):
            col_value = X_df[col_name]
            processed_data = col_value.astype('category')
            generator = self._feature_generators[col_name]
            processed_data = generator.transform(
                pd.DataFrame({col_name: processed_data}))[col_name] \
                .cat.codes.to_numpy(np.int32, copy=True)
            processed_data[processed_data < 0] = num_category - 1
            categorical_features.append(processed_data)

        for col_name in self._numerical_feature_names:
            generator = self._feature_generators[col_name]
            col_value = pd.to_numeric(X_df[col_name]).to_numpy()
            processed_data = generator.transform(
                np.expand_dims(col_value, axis=-1))[:, 0]
            numerical_features.append(processed_data.astype(np.float32))
        if len(numerical_features) > 0:
            numerical_features = [np.stack(numerical_features, axis=-1)]
        if y_df is not None:
            if self.label_type == _C.CATEGORICAL:
                y = self.label_generator.transform(y_df)
            elif self.label_type == _C.NUMERICAL:
                y = pd.to_numeric(y_df).to_numpy()
                y = self.label_scaler.transform(np.expand_dims(
                    y, axis=-1))[:, 0].astype(np.float32)
            else:
                raise NotImplementedError
            all_data = text_features + categorical_features + numerical_features + [
                y
            ]
            return ArrayDataset(*all_data)
        else:
            all_data = text_features + categorical_features + numerical_features
            return ArrayDataset(*all_data)
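The transform above returns an ArrayDataset that zips one array per feature group with the labels. A minimal, self-contained sketch of that zipping pattern with synthetic arrays (the names, sizes, and feature layout are illustrative assumptions, not taken from the original preprocessor):

import numpy as np
from mxnet.gluon.data import ArrayDataset, DataLoader

# One categorical column (integer codes), a block of three numerical columns,
# and binary labels -- each with the same number of rows.
categorical_codes = np.random.randint(0, 5, size=(100,)).astype(np.int32)
numerical_block = np.random.rand(100, 3).astype(np.float32)
labels = np.random.randint(0, 2, size=(100,)).astype(np.int32)

dataset = ArrayDataset(categorical_codes, numerical_block, labels)
loader = DataLoader(dataset, batch_size=16, shuffle=True)
for cat_batch, num_batch, y_batch in loader:
    print(cat_batch.shape, num_batch.shape, y_batch.shape)  # (16,) (16, 3) (16,)
    break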
Example #2
    def _load_datasets(self):

        # load images for one of the four domains in OfficeHome
        domain_path = self.get_path(self.subtype)
        print('Loading ' + domain_path)

        # get the class folders
        _, dirnames, _ = next(os.walk(domain_path, (None, None, [])))

        # class index/name dictionaries
        self.class_to_index = dict(zip(dirnames, range(len(dirnames))))
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        print(self.index_to_class)

        all_images = None
        all_labels = np.zeros((0, ), dtype='float32')

        # Read image files in the domain
        for label in self.index_to_class.keys():
            class_path = os.path.join(domain_path, self.index_to_class[label])
            _, _, filenames = next(os.walk(class_path, (None, None, [])))

            # initialize temporary variables
            new_labels = label * np.ones((len(filenames), ), dtype='int32')
            new_images = np.zeros((len(filenames), ) + self.final_image_shape,
                                  dtype='float32')

            print('reading class ', label)

            # Read images of the current class label
            for i, fn in enumerate(filenames):
                image_path = os.path.join(class_path, fn)
                image = mx.image.imread(image_path)  # RGB image data
                image = self.transform(image)
                new_images[i, :, :, :] = np.moveaxis(
                    image, [3], [1])  # rotate color axis 'iyxc->icyx'

            print('images size', new_images.shape)

            # Extract features (e.g., ResNet-50 activations) if an extractor network was given
            if (self.extractor is not None):
                print('extracting features')
                new_images = self.extractor(nd.array(new_images)).asnumpy()

            if (all_images is not None):
                all_images = np.vstack((all_images, new_images))
            else:
                all_images = new_images
            all_labels = np.concatenate((all_labels, new_labels))

            print('all images', all_images.shape)
            print('all labels', all_labels.shape)

        # Note: OfficeHome is a domain adaptation dataset and has no train/test split within a domain
        self.train = ArrayDataset(all_images, all_labels)
        self.test = ArrayDataset(all_images, all_labels)
Example #3
def transform(df, params):
    # Define the data transformation interface
    # raw_data --> batch_data
    if 'n_sample' in df:
        n = getattr(params, "n_neg", 1)
        negs = list(zip(*df["sample"].values.tolist()))[:n]
        dataset = ArrayDataset(df["user_id"], df["item_id"], df["score"],
                               df["n_sample"], *negs)
    else:
        dataset = ArrayDataset(df["user_id"], df["item_id"], df["score"])

    return dataset
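A toy usage sketch for the function above (the synthetic DataFrame and the SimpleNamespace standing in for params are assumptions; "sample" is taken to hold a list of negative item ids per row):

import pandas as pd
from types import SimpleNamespace

df = pd.DataFrame({
    "user_id": [0, 1, 2],
    "item_id": [10, 11, 12],
    "score": [1.0, 0.0, 1.0],
    "n_sample": [2, 2, 2],
    "sample": [[5, 6], [7, 8], [9, 4]],
})
dataset = transform(df, SimpleNamespace(n_neg=2))
print(dataset[0])  # (user_id, item_id, score, n_sample, neg_1, neg_2) of the first row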
Example #4
def _addDataset(name, dat, idx, batch_size, asdataloader):
    fl = dat.files
    train = ArrayDataset(dat[fl[idx]], dat[fl[idx + 1]])
    test = ArrayDataset(dat[fl[idx + 2]], dat[fl[idx + 3]])

    dat_set = DatasetGroup(name)
    if (asdataloader):
        dat_set.makeDomainDatasetLoader(train, test, batch_size)
    else:
        dat_set.train = train
        dat_set.test = test

    return dat_set
Example #5
def train(model, x_train, y_train, x_dev, y_dev, x_test, y_test, epochs):
    model.initialize()
    x_train = x_train[0] if isinstance(x_train, (list, tuple)) else x_train
    x_dev = x_dev[0] if isinstance(x_dev, (list, tuple)) else x_dev
    x_test = x_test[0] if isinstance(x_test, (list, tuple)) else x_test
    x_train = nd.array(x_train)
    y_train = nd.array(y_train)
    x_dev = nd.array(x_dev)
    y_dev = nd.array(y_dev)
    x_test = nd.array(x_test)
    y_test = nd.array(y_test)

    train_loader = DataLoader(ArrayDataset(x_train, y_train), batch_size=32)
    dev_loader = DataLoader(ArrayDataset(x_dev, y_dev), batch_size=32)
    test_loader = DataLoader(ArrayDataset(x_test, y_test), batch_size=32)

    criterion = mxnet.gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=True)
    trainer = mxnet.gluon.Trainer(model.collect_params(), 'sgd',
                                  {'learning_rate': .1})

    for t in range(epochs):
        model._vivisect["epoch"] += 1
        model._vivisect["mode"] = "train"
        train_loss, dev_loss, test_loss = 0.0, 0.0, 0.0
        for x, y in train_loader:
            with mxnet.autograd.record():
                y_pred = model(x)
                loss = criterion(y_pred, y)
            loss.backward()
            trainer.step(y_train.shape[0])
            train_loss += mxnet.nd.sum(loss).asscalar()

        model._vivisect["mode"] = "dev"
        for x, y in dev_loader:
            with mxnet.autograd.record():
                y_pred = model(x)
                loss = criterion(y_pred, y)
            dev_loss += mxnet.nd.sum(loss).asscalar()

        model._vivisect["mode"] = "test"
        for x, y in test_loader:
            with mxnet.autograd.record():
                y_pred = model(x)
                loss = criterion(y_pred, y)
            test_loss += mxnet.nd.sum(loss).asscalar()

        logging.info(
            "Iteration {} train/dev/test loss: {:.4f}/{:.4f}/{:.4f}".format(
                t + 1, train_loss, dev_loss, test_loss))
Example #6
    def __init__(self,
                 w,
                 b,
                 num_examples,
                 std_x,
                 noise,
                 hold_out=None,
                 seed=None,
                 context=None):
        """
        :param w: Task's weights vector.
        :param b: Task's bias.
        :param num_examples: Total number of examples per task.
        :param std_x: The covariates are sampled from a zero mean normal distribution with
            standard deviation equal to std_x.
        :param noise: Noise level passed to the data-generating function.
        :param hold_out: Number of examples to hold out for validation.
        :param seed: Seed for the random generator.
        :param context: MXNet context to use; defaults to CPU when None.
        """

        self.w = w
        self.b = b
        self.num_examples = num_examples
        self.seed = seed

        if context is None:
            context = mxnet.cpu()
        self.context = context

        if seed:
            random.seed(seed)
        if hold_out and hold_out < num_examples:
            Xtr, Ytr = self._real_fn(
                std_x * mxnet.nd.random_normal(
                    shape=(num_examples - hold_out, len(w)), ctx=context),
                noise)
            train_dataset = ArrayDataset(Xtr, Ytr)
            Xval, Yval = self._real_fn(
                std_x *
                mxnet.nd.random_normal(shape=(hold_out, len(w)), ctx=context),
                noise)
            val_dataset = ArrayDataset(Xval, Yval)
        else:
            Xtr, Ytr = self._real_fn(
                std_x * mxnet.nd.random_normal(shape=(num_examples, len(w)),
                                               ctx=context), noise)
            train_dataset = ArrayDataset(Xtr, Ytr)
            val_dataset = None

        super().__init__(train_dataset, val_dataset, context=context)
Example #7
def etl(data_x, data_y, cfg: Configuration):
    batch_size = cfg.batch_size
    dataset = ArrayDataset(
        mx.nd.array(data_x),
        mx.nd.array(data_y)
    )
    return DataLoader(dataset, batch_size=batch_size)
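A small usage sketch for etl (synthetic arrays; a SimpleNamespace stands in for the Configuration object, since only its batch_size attribute is used here):

import numpy as np
from types import SimpleNamespace

loader = etl(np.random.rand(64, 4).astype('float32'),
             np.random.rand(64).astype('float32'),
             SimpleNamespace(batch_size=16))
for x_batch, y_batch in loader:
    print(x_batch.shape, y_batch.shape)  # (16, 4) (16,)
    break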
Example #8
 def update(self, epochs=100):
     if self.option == 1:
         data_numpy = np.array(self.data)
         X = np.array(data_numpy[:, :-1])
         y = np.array(data_numpy[:, -1])
         # np.mean()
         # self.null_score_diff_threshold = np.median(data_numpy[:,0])
         self.null_score_diff_threshold = np.mean(
             data_numpy[:, 0]) + np.std(data_numpy[:, 0]) * .05
     elif self.option == 2:
         data_numpy = np.array(self.data)
         X = nd.array(data_numpy[:, :-1])
         y = nd.array(data_numpy[:, -1])
         train_dataset = ArrayDataset(X, y)
         train_dataloader = DataLoader(train_dataset,
                                       batch_size=self.batch_size,
                                       shuffle=True)
         for e in range(epochs):
             cumulative_train_loss = 0
             for i, (data, label) in enumerate(train_dataloader):
                 data = data.as_in_context(self.ctx)
                 label = label.as_in_context(self.ctx)
                 with autograd.record():
                     # Do forward pass on a batch of training data
                     output = self.classifier(data)
                     # Calculate loss for the training data batch
                     loss_result = self.loss(output, label)
                 # Calculate gradients
                 loss_result.backward()
                 # Update parameters of the network
                 self.trainer.step(len(data))
     self.data = list()
Example #9
def toy(ctx, nsamples=10000, seed=0xdeadbeef):
    rs = np.random.RandomState(seed)
    label = rs.uniform(-1., 1., size=nsamples).astype(np.float32)
    signal = np.stack([label, np.zeros(nsamples).astype(np.float32)], axis=-1)
    distractor = rs.multivariate_normal([0, 0], [[1, 1], [1, 1]],
                                        size=nsamples).astype(np.float32)
    data = signal + distractor
    return ArrayDataset(data, label)
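A brief usage sketch: wrap the toy dataset in a DataLoader and draw one mini-batch (the batch size is an arbitrary choice).

import mxnet as mx
from mxnet.gluon.data import DataLoader

dataset = toy(mx.cpu(), nsamples=1000)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
data_batch, label_batch = next(iter(loader))
print(data_batch.shape, label_batch.shape)  # (32, 2) (32,)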
Example #10
def load_cached_dataset(prefix):
    cached_file_path = os.path.join(_C.CACHE_PATH, prefix + '.npz')
    if os.path.exists(cached_file_path):
        print('Load cached data from {}'.format(cached_file_path))
        dat = np.load(cached_file_path)
        return ArrayDataset(np.array(dat['src_data']), np.array(dat['tgt_data']))
    else:
        return None
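A complementary sketch of how such a cache could be written so that load_cached_dataset finds it (save_cached_dataset is a hypothetical helper; _C.CACHE_PATH is the same constant used above):

import os
import numpy as np

def save_cached_dataset(prefix, src_data, tgt_data):
    # Store the arrays under the keys that load_cached_dataset reads back.
    os.makedirs(_C.CACHE_PATH, exist_ok=True)
    np.savez(os.path.join(_C.CACHE_PATH, prefix + '.npz'),
             src_data=src_data, tgt_data=tgt_data)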
Example #11
    def run(self,
            infr_executor,
            data,
            param_dict,
            ctx,
            optimizer='adam',
            learning_rate=1e-3,
            max_iter=1000,
            verbose=False):
        """
        :param infr_executor: The MXNet function that computes the training objective.
        :type infr_executor: MXNet Gluon Block
        :param data: a list of observed variables
        :type data: [mxnet.ndarray]
        :param param_dict: The MXNet ParameterDict for Gradient-based optimization
        :type param_dict: mxnet.gluon.ParameterDict
        :param ctx: MXNet context
        :type ctx: mxnet.cpu or mxnet.gpu
        :param optimizer: the choice of optimizer (default: 'adam')
        :type optimizer: str
        :param learning_rate: the learning rate of the gradient optimizer (default: 0.001)
        :type learning_rate: float
        :param max_iter: the maximum number of iterations of gradient optimization
        :type max_iter: int
        :param verbose: whether to print per-iteration messages.
        :type verbose: boolean
        """

        if isinstance(data, mx.gluon.data.DataLoader):
            data_loader = data
        else:
            data_loader = mx.gluon.data.DataLoader(ArrayDataset(*data),
                                                   batch_size=self.batch_size,
                                                   shuffle=True,
                                                   last_batch='rollover')
        trainer = mx.gluon.Trainer(
            param_dict,
            optimizer=optimizer,
            optimizer_params={'learning_rate': learning_rate})
        for e in range(max_iter):
            L_e = 0
            n_batches = 0
            for i, data_batch in enumerate(data_loader):
                with mx.autograd.record():
                    loss, loss_for_gradient = infr_executor(
                        mx.nd.zeros(1, ctx=ctx), *data_batch)
                    loss_for_gradient.backward()
                if verbose:
                    print('\repoch {} Iteration {} loss: {}\t\t\t'.format(
                        e + 1, i + 1,
                        loss.asscalar() / self.batch_size),
                          end='')
                trainer.step(batch_size=self.batch_size,
                             ignore_stale_grad=True)
                L_e += loss.asscalar() / self.batch_size
                n_batches += 1
            if verbose:
                print('epoch-loss: {} '.format(L_e / n_batches))
Example #12
 def __init__(self):
     data_set = [
         ArrayDataset(list(range(10))),
         ArrayDataset(list(range(20, 40)))
     ]
     ratio = [3, 4]
     self.dataset_len = 0
     self.data_loader_list = []
     self.dataloader_iter_list = []
     for d, r in zip(data_set, ratio):
         self.data_loader_list.append(
             DataLoader(dataset=d,
                        batch_size=r,
                        last_batch='rollover',
                        shuffle=True))
         self.dataset_len += len(d)
     for s in self.data_loader_list:
         self.dataloader_iter_list.append(iter(s))
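The listing ends before showing how batches are drawn from the two loaders; a hedged sketch of one possible next-batch method for this class, assuming exhausted iterators are simply restarted (the method name is hypothetical):

 def get_interleaved_batch(self):
     batches = []
     for i, data_iter in enumerate(self.dataloader_iter_list):
         try:
             batches.append(next(data_iter))
         except StopIteration:
             # Restart the exhausted loader and pull a fresh batch from it.
             self.dataloader_iter_list[i] = iter(self.data_loader_list[i])
             batches.append(next(self.dataloader_iter_list[i]))
     return batches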
Example #13
def get_array_data(train_array, val_array, data_shape, batch_size, num_workers=os.cpu_count()):
    train_dataset = ArrayDataset(train_array)
    val_dataset = ArrayDataset(val_array)

    train_transformer = gluon.data.vision.transforms.Compose([
        transforms.RandomFlipLeftRight(),
        transforms.RandomResizedCrop(data_shape, scale=(0.5, 1.0)),
        transforms.RandomBrightness(0.5),
        transforms.RandomHue(0.1),
        transforms.Resize(data_shape),
        transforms.ToTensor()
    ])
    val_transformer = gluon.data.vision.transforms.Compose([
        transforms.Resize(data_shape),
        transforms.ToTensor()
    ])

    train_dataloader = data.DataLoader(train_dataset.transform_first(train_transformer),
                                       batch_size=batch_size, shuffle=True, last_batch='rollover',
                                       num_workers=num_workers)
    val_dataloader = data.DataLoader(val_dataset.transform_first(val_transformer),
                                     batch_size=batch_size, shuffle=True, last_batch='rollover',
                                     num_workers=num_workers)

    return train_dataloader, val_dataloader
Example #14
def dataiter_all_sensors_seq2seq(flow, scaler, setting, shuffle=True):
    dataset = setting['dataset']
    training = setting['training']

    mask = np.sum(flow, axis=(1, 2)) > 5000

    flow = scaler.transform(flow)

    n_timestamp, num_nodes, _ = flow.shape

    timespan = (np.arange(n_timestamp) % 24) / 24
    timespan = np.tile(timespan, (1, num_nodes, 1)).T
    flow = np.concatenate((flow, timespan), axis=2)

    geo_feature = get_geo_feature(dataset)

    input_len = dataset['input_len']
    output_len = dataset['output_len']
    feature, data, label = [], [], []
    for i in range(n_timestamp - input_len - output_len + 1):
        if mask[i + input_len:i + input_len + output_len].sum() != output_len:
            continue

        data.append(flow[i:i + input_len])
        label.append(flow[i + input_len:i + input_len + output_len])
        feature.append(geo_feature)

        if i % 1000 == 0:
            logging.info('Processing %d timestamps', i)
            # if i > 0: break

    data = mx.nd.array(np.stack(data))  # [B, T, N, D]
    label = mx.nd.array(np.stack(label))  # [B, T, N, D]
    feature = mx.nd.array(np.stack(feature))  # [B, N, D]

    logging.info('shape of feature: %s', feature.shape)
    logging.info('shape of data: %s', data.shape)
    logging.info('shape of label: %s', label.shape)

    from mxnet.gluon.data import ArrayDataset, DataLoader
    return DataLoader(
        ArrayDataset(feature, data, label),
        shuffle=shuffle,
        batch_size=training['batch_size'],
        num_workers=4,
        last_batch='rollover',
    )
Example #15
def prepare_pretrain_text_dataset(filenames, tokenizer, max_seq_length,
                                  short_seq_prob, cached_file_path):
    """Create dataset based on the raw text files"""
    if not isinstance(filenames, (list, tuple)):
        filenames = [filenames]
    if cached_file_path:
        # generate a cache filename based on the input filename, ensuring names do not clash.
        # filename example: urlsf_subset00-130_data.txt
        suffix = re.split(r'\.|/', filenames[0])[-2]
        output_file = os.path.join(cached_file_path,
                                   "{}-pretrain-record.npz".format(suffix))
    else:
        output_file = None
    np_features = get_all_features(
        (filenames, output_file, tokenizer, max_seq_length, short_seq_prob))

    return ArrayDataset(*np_features)
Example #16
def display_multi():
    transform = transforms.ToTensor()
    dataset = []
    for rec in args.inputs:
        dataset.append(ImageRecordDataset(rec).transform_first(transform))
    dataset = ArrayDataset(*dataset)
    loader = DataLoader(dataset,
                        batch_size=args.batch_size,
                        shuffle=args.shuffle,
                        last_batch='keep',
                        num_workers=args.num_workers,
                        pin_memory=False)
    for idx, batch_data in enumerate(loader):
        batch_img = []
        for (img, _) in batch_data:
            batch_img.append(img)
        batch_img = mx.nd.concat(*batch_img, dim=0)
        show_images(batch_img, ncols=min(8, args.batch_size))
        input('Press Enter...')
Example #17
def dataset_no_bucketing(source_data_path: str, target_data_path: str,
                         max_seq_len_source: int, max_seq_len_target: int,
                         vocab_source: Vocab,
                         vocab_target: Vocab) -> ArrayDataset:
    source_sentences = []  # List[int]
    target_sentences = []  # List[int]
    for words_source, words_target in BilingualTextReader(
            path_source=source_data_path,
            path_target=target_data_path,
            max_seq_len_source=max_seq_len_source,
            max_seq_len_target=max_seq_len_target):
        source_sentences.append(
            seq2integer(words_source, vocab_source, max_seq_len_source))
        target_sentences.append(
            seq2integer(words_target, vocab_target, max_seq_len_target))
    print(len(target_sentences))
    print(len(source_sentences))
    assert len(source_sentences) == len(target_sentences)
    return ArrayDataset(source_sentences, target_sentences)
Example #18
def train(args: argparse.Namespace) -> HybridBlock:
    session = boto3.session.Session()

    client = session.client(service_name="secretsmanager",
                            region_name="us-east-1")
    mlflow_secret = client.get_secret_value(SecretId=args.mlflow_secret)
    mlflowdb_conf = json.loads(mlflow_secret["SecretString"])

    converters.encoders[np.float64] = converters.escape_float
    converters.conversions = converters.encoders.copy()
    converters.conversions.update(converters.decoders)

    mlflow.set_tracking_uri(
        f"mysql+pymysql://{mlflowdb_conf['username']}:{mlflowdb_conf['password']}@{mlflowdb_conf['host']}/mlflow"
    )

    if mlflow.get_experiment_by_name(args.mlflow_experiment) is None:
        mlflow.create_experiment(args.mlflow_experiment,
                                 args.mlflow_artifacts_location)
    mlflow.set_experiment(args.mlflow_experiment)

    col_names = ["target"] + [f"kinematic_{i}" for i in range(1, 22)]

    train_df = pd.read_csv(f"{args.train_channel}/train.csv.gz",
                           header=None,
                           names=col_names)

    val_df = pd.read_csv(f"{args.validation_channel}/val.csv.gz",
                         header=None,
                         names=col_names)

    train_X = train_df.drop("target", axis=1)
    train_y = train_df["target"]
    train_dataset = ArrayDataset(train_X.to_numpy(dtype="float32"),
                                 train_y.to_numpy(dtype="float32"))
    train = DataLoader(train_dataset, batch_size=args.batch_size)

    val_X = val_df.drop("target", axis=1)
    val_y = val_df["target"]
    val_dataset = ArrayDataset(val_X.to_numpy(dtype="float32"),
                               val_y.to_numpy(dtype="float32"))
    validation = DataLoader(val_dataset, batch_size=args.batch_size)

    ctx = [gpu(i) for i in range(args.gpus)] if args.gpus > 0 else cpu()

    mlflow.gluon.autolog()

    with mlflow.start_run():
        net = HybridSequential()
        with net.name_scope():
            net.add(Dense(256))
            net.add(Dropout(.2))
            net.add(Dense(64))
            net.add(Dropout(.1))
            net.add(Dense(16))
            net.add(Dense(2))

        net.initialize(Xavier(magnitude=2.24), ctx=ctx)
        net.hybridize()

        trainer = Trainer(net.collect_params(), "sgd",
                          {"learning_rate": args.learning_rate})
        est = estimator.Estimator(net=net,
                                  loss=SoftmaxCrossEntropyLoss(),
                                  trainer=trainer,
                                  train_metrics=Accuracy(),
                                  context=ctx)
        est.fit(train, epochs=args.epochs, val_data=validation)

    return net
Example #19
def split_to_sequences(x, y, n_prev=10):
    docX, docY = [], []
    for i in range(len(x) - n_prev):
        docX.append(x[i:i + n_prev])
        docY.append(y[i + n_prev])

    return np.array(docX).astype('float32'), np.array(docY).astype('float32')


data_x, data_y = split_to_sequences(data_x,
                                    data_y,
                                    n_prev=args.sequence_length)
ntr = int(len(data_x) * (1 - args.test_split))

# --- dataloader
train_dataset = ArrayDataset(data_x[:ntr], data_y[:ntr])
train_dataloader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              last_batch='discard',
                              shuffle=True)

test_dataset = ArrayDataset(data_x[ntr:], data_y[ntr:])
test_dataloader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             last_batch='discard',
                             shuffle=True)

logger.info('datasets: train=%d samples, validation=%d samples' %
            (len(train_dataset), len(test_dataset)))

# --- run
Example #20
def transform(x, y, batch_size, **params):
    dataset = ArrayDataset(x.astype("float32"), y.astype("float32"))
    return DataLoader(dataset, batch_size=batch_size, **params)
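A small usage sketch with synthetic arrays; the extra keyword arguments are forwarded unchanged to DataLoader.

import numpy as np

x = np.random.rand(256, 10)
y = np.random.randint(0, 2, size=(256,))
loader = transform(x, y, batch_size=32, shuffle=True, last_batch='discard')
for x_batch, y_batch in loader:
    print(x_batch.shape, y_batch.shape)  # (32, 10) (32,)
    break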
Example #21
def main(model_saved_path, model_name):
    ne_cate_dic = Configuer.ne_cate_dic
    word_path = Configuer.word_path
    label_path = Configuer.label_path
    nature_path = Configuer.nature_path

    X_path = Configuer.X_path
    y_path = Configuer.y_path
    nature_py_path = Configuer.nature_py_path
    word_vocab_path = Configuer.word_vocab_path
    label_vocab_path = Configuer.label_vocab_path
    nature_vocab_path = Configuer.nature_vocab_path

    max_seq_len = Configuer.MAX_SEQ_LEN
    pad = Configuer.PAD
    pad_nature = Configuer.PAD_NATURE
    unk = Configuer.UNK
    not_ne = Configuer.NOT

    # Load the data from local files
    if os.path.exists(word_vocab_path) and os.path.exists(label_vocab_path)\
            and os.path.exists(nature_vocab_path) and os.path.exists(X_path)\
            and os.path.exists(y_path) and os.path.exists(nature_py_path):
        print('Loading existed data...')
        with open(word_vocab_path,
                  'rb') as f1, open(label_vocab_path,
                                    'rb') as f2, open(nature_vocab_path,
                                                      'rb') as f3:
            word_vocab = pickle.load(f1)
            label_vocab = pickle.load(f2)
            nature_vocab = pickle.load(f3)
        data_x, data_y, data_nature = np.load(X_path), np.load(
            y_path), np.load(nature_py_path)
        print('Loading end!')
    else:
        # Convert the raw text data into numpy arrays and pickle files
        print('Converting data from scratch...')
        word_vocab, label_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(
            word_path, label_path, nature_path, max_seq_len, pad, not_ne,
            pad_nature, unk)
        data_x, data_y, data_nature = convert_txt_data(
            X_path, y_path, nature_py_path, input_seqs, output_seqs,
            nature_seqs, word_vocab, label_vocab, nature_vocab, max_seq_len,
            unk)
        with open(word_vocab_path,
                  'wb') as fw1, open(label_vocab_path, 'wb') as fw2, open(
                      nature_vocab_path, 'wb') as fw3:
            pickle.dump(word_vocab, fw1)
            pickle.dump(label_vocab, fw2)
            pickle.dump(nature_vocab, fw3)
        np.save(X_path, data_x)
        np.save(y_path, data_y)
        np.save(nature_py_path, data_nature)
        print('Converting end!')

    # Split into training and validation sets
    X_train, X_valid, Y_train, Y_valid, nature_train, nature_valid = train_test_split(
        data_x, data_y, data_nature, test_size=0.1, random_state=33)
    print(X_train.shape, X_valid.shape)
    # X_train = X_train[0:512]
    # nature_train = nature_train[0:512]
    # Y_train = Y_train[0:512]
    # X_valid = X_valid[0:512]
    # nature_valid = nature_valid[0:512]
    # Y_valid = Y_valid[0:512]
    dataset_train = ArrayDataset(nd.array(X_train, ctx=CTX),
                                 nd.array(nature_train, ctx=CTX),
                                 nd.array(Y_train, ctx=CTX))
    data_iter_train = DataLoader(dataset_train,
                                 batch_size=256,
                                 shuffle=True,
                                 last_batch='rollover')
    dataset_valid = ArrayDataset(nd.array(X_valid, ctx=CTX),
                                 nd.array(nature_valid, ctx=CTX),
                                 nd.array(Y_valid, ctx=CTX))
    data_iter_valid = DataLoader(dataset_valid, batch_size=256, shuffle=False)

    # Configure the model according to the parameters
    model, loss = None, None
    word_vocab_size, word_vec_size = len(word_vocab), 300
    nature_vocab_size, nature_vec_size = len(nature_vocab), 50
    drop_prob = 0.3
    num_epochs = 20
    lr = 0.0001

    if model_name == 'lstm_crf':
        print('train lstm_crf model')
        hidden_dim = 128
        num_layers = 2
        tag2idx = label_vocab.token_to_idx
        model = LSTM_CRF(word_vocab_size, word_vec_size, nature_vocab_size,
                         nature_vec_size, hidden_dim, num_layers, tag2idx,
                         drop_prob)
        model.initialize(init=init.Xavier(), ctx=CTX)
        loss = model.crf.neg_log_likelihood
    elif model_name == 'cnn_crf':
        pass
    elif model_name == 'cnn':
        pass

    trainer = gluon.Trainer(model.collect_params(), 'adam',
                            {'learning_rate': lr})

    # Start training
    print('waiting...')
    print(model)
    th.train(data_iter_train, data_iter_valid, model, loss, trainer, CTX,
             num_epochs, word_vocab, label_vocab, max_seq_len, ne_cate_dic)

    # Save the model parameters
    model.save_parameters(model_saved_path)
    print(model_name + ' model parameters have been saved to:',
          os.path.abspath(model_saved_path))
Example #22
def main():
    # importing libraries
    import pandas as pd
    import mxnet as mx
    from mxnet import nd, autograd, gluon
    from mxnet.gluon.data import ArrayDataset
    from mxnet.gluon.data import DataLoader
    import numpy as np
    import random

    # creating variables
    extension = '.csv'

    # Load Data
    categories = ['Excellent', 'Very_good', 'Good', 'Average', 'Poor']

    # Load the data in memory
    MAX_ITEMS_PER_CATEGORY = 80000

    # Loading data from file if exist
    try:
        data = pd.read_pickle('pickleddata.pkl')
    except:
        data = None

    if data is None:
        data = pd.DataFrame(data={'X': [], 'Y': []})
        for index, category in enumerate(categories):
            df = pd.read_csv(category + extension, encoding='utf8')
            df = pd.DataFrame(data={
                'X': (df['Review'])[:MAX_ITEMS_PER_CATEGORY],
                'Y': index
            })
            data = data.append(df)
            print('{}:{} reviews'.format(category, len(df)))

        # Shuffle the samples
        data = data.sample(frac=1)
        data.reset_index(drop=True, inplace=True)
        # Saving the data in a pickled file
        pd.to_pickle(data, 'pickleddata.pkl')

    print('Value counts:\n', data['Y'].value_counts())
    for i, cat in enumerate(categories):
        print(i, cat)
    data.head()

    # Creating the dataset
    ALPHABET = list(
        "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+ =<>()[]{}"
    )  # The 69 characters as specified in the paper
    ALPHABET_INDEX = {letter: index
                      for index, letter in enumerate(ALPHABET)
                      }  # { a: 0, b: 1, etc}
    FEATURE_LEN = 1014  # max-length in characters for one document
    NUM_WORKERS = 0  # number of workers used in the data loading
    BATCH_SIZE = 128  # number of documents per batch

    def encode(text):
        encoded = np.zeros([len(ALPHABET), FEATURE_LEN], dtype='float32')
        review = text.lower()[:FEATURE_LEN - 1:-1]
        i = 0
        for letter in text:
            if i >= FEATURE_LEN:
                break
            if letter in ALPHABET_INDEX:
                encoded[ALPHABET_INDEX[letter]][i] = 1
            i += 1
        return encoded

    def transform(x, y):
        return encode(x), y

    split = 0.8
    split_index = int(split * len(data))
    train_data_X = data['X'][:split_index].as_matrix()
    train_data_Y = data['Y'][:split_index].as_matrix()
    test_data_X = data['X'][split_index:].as_matrix()
    test_data_Y = data['Y'][split_index:].as_matrix()
    train_dataset = ArrayDataset(train_data_X,
                                 train_data_Y).transform(transform)
    test_dataset = ArrayDataset(test_data_X, test_data_Y).transform(transform)

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=BATCH_SIZE,
                                  num_workers=NUM_WORKERS,
                                  last_batch='rollover')
    test_dataloader = DataLoader(test_dataset,
                                 shuffle=False,
                                 batch_size=BATCH_SIZE,
                                 num_workers=NUM_WORKERS,
                                 last_batch='rollover')

    ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu()

    NUM_FILTERS = 256  # number of convolutional filters per convolutional layer
    NUM_OUTPUTS = len(categories)  # number of classes
    FULLY_CONNECTED = 1024  # number of unit in the fully connected dense layer
    DROPOUT_RATE = 0.5  # probability of node drop out
    LEARNING_RATE = 0.0001  # learning rate of the gradient
    MOMENTUM = 0.9  # momentum of the gradient
    WDECAY = 0.00001  # regularization term to limit size of weights

    net = gluon.nn.HybridSequential()
    with net.name_scope():
        net.add(
            gluon.nn.Conv1D(channels=NUM_FILTERS,
                            kernel_size=7,
                            activation='relu'))
        net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
        net.add(
            gluon.nn.Conv1D(channels=NUM_FILTERS,
                            kernel_size=7,
                            activation='relu'))
        net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
        net.add(
            gluon.nn.Conv1D(channels=NUM_FILTERS,
                            kernel_size=3,
                            activation='relu'))
        net.add(
            gluon.nn.Conv1D(channels=NUM_FILTERS,
                            kernel_size=3,
                            activation='relu'))
        net.add(
            gluon.nn.Conv1D(channels=NUM_FILTERS,
                            kernel_size=3,
                            activation='relu'))
        net.add(
            gluon.nn.Conv1D(channels=NUM_FILTERS,
                            kernel_size=3,
                            activation='relu'))
        net.add(gluon.nn.MaxPool1D(pool_size=3, strides=3))
        net.add(gluon.nn.Flatten())
        net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
        net.add(gluon.nn.Dropout(DROPOUT_RATE))
        net.add(gluon.nn.Dense(FULLY_CONNECTED, activation='relu'))
        net.add(gluon.nn.Dropout(DROPOUT_RATE))
        net.add(gluon.nn.Dense(NUM_OUTPUTS))
    print(net)

    hybridize = True  # compile the network for speed; in-depth debugging is then no longer possible
    # load_params = True  # Load pre-trained model

    net.initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)

    if hybridize:
        net.hybridize(static_alloc=True, static_shape=True)

    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()

    trainer = gluon.Trainer(net.collect_params(), 'sgd', {
        'learning_rate': LEARNING_RATE,
        'wd': WDECAY,
        'momentum': MOMENTUM
    })

    def evaluate_accuracy(data_iterator, net):
        acc = mx.metric.Accuracy()
        for i, (data, label) in enumerate(data_iterator):
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            output = net(data)
            prediction = nd.argmax(output, axis=1)
            acc.update(preds=prediction, labels=label)
        return acc.get()[1]

    start_epoch = 6
    number_epochs = 7
    smoothing_constant = .01
    for e in range(start_epoch, number_epochs):
        for i, (review, label) in enumerate(train_dataloader):
            review = review.as_in_context(ctx)
            label = label.as_in_context(ctx)
            with autograd.record():
                output = net(review)
                loss = softmax_cross_entropy(output, label)
            loss.backward()
            trainer.step(review.shape[0])

            # moving average of the loss
            curr_loss = nd.mean(loss)
            moving_loss = (curr_loss if (i == 0) else
                           (1 - smoothing_constant) * moving_loss +
                           (smoothing_constant) * curr_loss)

            if (i % 200 == 0):
                print(
                    'Batch {}: Instant loss {:.4f}, Moving loss {:.4f}'.format(
                        i, curr_loss.asscalar(), moving_loss.asscalar()))

        test_accuracy = evaluate_accuracy(test_dataloader, net)
        # Save the model using the gluon params format
        net.save_parameters('crepe_epoch_{}_test_acc_{}.params'.format(
            e,
            int(test_accuracy * 10000) / 100))
        print("Epoch {}. Loss: {:.4f}, Test_acc {:.4f}".format(
            e, moving_loss.asscalar(), test_accuracy))

    net.export('crepe', epoch=number_epochs)

    for i in range(50):
        index = random.randint(1, len(data))
        review = data['X'][index]
        label = categories[int(data['Y'][index])]
        print(review)
        print('\nCategory: {}\n'.format(label))
        encoded = nd.array([encode(review)], ctx=ctx)
        output = net(encoded)
        predicted = categories[np.argmax(output[0].asnumpy())]
        if predicted == label:
            print('Correctly predicted the right category')
        else:
            print('Incorrectly predicted {}'.format(predicted))

    review_title = "Good stuff"
    review = "This course is definitely better than the previous one"

    print(review_title)
    print(review + '\n')
    encoded = nd.array([encode(review + " | " + review_title)], ctx=ctx)
    output = net(encoded)
    softmax = nd.exp(output) / nd.sum(nd.exp(output))[0]
    predicted = categories[np.argmax(output[0].asnumpy())]
    print('Predicted: {}\n'.format(predicted))
    for i, val in enumerate(categories):
        print(val, float(int(softmax[0][i].asnumpy() * 1000) / 10), '%')
Example #23
    def fit_transform(self, X, y):
        """Fit and Transform the dataframe

        Parameters
        ----------
        X
            The feature dataframe
        y
            The label series

        Returns
        -------
        processed_X
            The processed X data
        (processed_y)
            The processed Y data
        """
        if self._fit_called:
            raise RuntimeError(
                'Fit has been called. Please create a new preprocessor and call '
                'fit again!')
        self._fit_called = True
        text_features = []
        categorical_features = []
        numerical_features = []
        for col_name in sorted(X.columns):
            col_type = self._column_types[col_name]
            logger.log(10, f'Process col "{col_name}" with type "{col_type}"')
            col_value = X[col_name]
            if col_type == _C.NULL:
                self._ignore_columns_set.add(col_name)
                continue
            elif col_type == _C.TEXT:
                col_value = col_value.apply(lambda ele: ''
                                            if ele is None else str(ele))
                processed_col_value = parallel_transform(
                    df=col_value,
                    chunk_processor=functools.partial(
                        tokenize_data, tokenizer=self._tokenizer))
                text_features.append(processed_col_value)
                self._text_feature_names.append(col_name)
            elif col_type == _C.CATEGORICAL:
                if self.cfg.categorical.convert_to_text:
                    # Convert categorical column as text column
                    processed_data = col_value.apply(
                        lambda ele: '' if ele is None else str(ele))
                    if len(np.unique(processed_data)) == 1:
                        self._ignore_columns_set.add(col_name)
                        continue
                    processed_data = parallel_transform(
                        df=processed_data,
                        chunk_processor=functools.partial(
                            tokenize_data, tokenizer=self._tokenizer))
                    text_features.append(processed_data)
                    self._text_feature_names.append(col_name)
                else:
                    processed_data = col_value.astype('category')
                    generator = self._feature_generators[col_name]
                    processed_data = generator.fit_transform(
                        pd.DataFrame({col_name: processed_data}))[col_name]\
                        .cat.codes.to_numpy(np.int32, copy=True)
                    if len(np.unique(processed_data)) == 1:
                        self._ignore_columns_set.add(col_name)
                        continue
                    num_categories = len(generator.category_map[col_name])
                    processed_data[processed_data < 0] = num_categories
                    self._categorical_num_categories.append(num_categories + 1)
                    categorical_features.append(processed_data)
                    self._categorical_feature_names.append(col_name)
            elif col_type == _C.NUMERICAL:
                processed_data = pd.to_numeric(col_value)
                if len(processed_data.unique()) == 1:
                    self._ignore_columns_set.add(col_name)
                    continue
                if self.cfg.numerical.convert_to_text:
                    processed_data = processed_data.apply('{:.3f}'.format)
                    processed_data = parallel_transform(
                        df=processed_data,
                        chunk_processor=functools.partial(
                            tokenize_data, tokenizer=self._tokenizer))
                    text_features.append(processed_data)
                    self._text_feature_names.append(col_name)
                else:
                    generator = self._feature_generators[col_name]
                    processed_data = generator.fit_transform(
                        np.expand_dims(processed_data.to_numpy(), axis=-1))[:,
                                                                            0]
                    numerical_features.append(processed_data.astype(
                        np.float32))
                    self._numerical_feature_names.append(col_name)
            else:
                raise NotImplementedError(
                    f'Type of the column is not supported currently. '
                    f'Received {col_name}={col_type}.')
        if len(numerical_features) > 0:
            numerical_features = [np.stack(numerical_features, axis=-1)]
        if self.label_type == _C.CATEGORICAL:
            if self._label_generator is None:
                self._label_generator = LabelEncoder()
                y = self._label_generator.fit_transform(y)
            else:
                y = self._label_generator.transform(y)
        elif self.label_type == _C.NUMERICAL:
            y = pd.to_numeric(y).to_numpy()
            y = self._label_scaler.fit_transform(np.expand_dims(
                y, axis=-1))[:, 0].astype(np.float32)
        else:
            raise NotImplementedError(
                f'Type of label column is not supported. '
                f'Label column type={self._label_column}')
        # Wrap the processed features and labels into a training dataset
        all_data = text_features + categorical_features + numerical_features + [
            y
        ]
        dataset = ArrayDataset(*all_data)
        return dataset
Example #24
                 [3.22600627, 0.]])

y = mx.nd.array([
    1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
    1., 1., 1., 1., 1., 1., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
    1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1.,
    0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0.,
    1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.,
    0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1.,
    1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.
])

val_x = mx.nd.array([[4., 0.], [2., 0.], [-2., 0.]])
val_ground_truth_class = mx.nd.array([0., 1., 1.])

train_dataset = ArrayDataset(X, y)
train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              shuffle=True)

val_dataset = ArrayDataset(val_x, val_ground_truth_class)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

classifier = nn.HybridSequential()
with classifier.name_scope():
    classifier.add(nn.Dense(units=10, activation='relu'))  # input layer
    classifier.add(nn.Dense(units=10, activation='relu'))  # inner layer 1
    classifier.add(nn.Dense(units=10, activation='relu'))  # inner layer 2
    classifier.add(
        nn.Dense(units=1))  # output layer: notice, it must have only 1 neuron
classifier.initialize(mx.init.Xavier())
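The listing stops after initialization; a hedged sketch of one possible training loop for this binary classifier (the loss and optimizer choices are assumptions consistent with the single-output-unit network above):

from mxnet import autograd, gluon

loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss()
trainer = gluon.Trainer(classifier.collect_params(), 'adam', {'learning_rate': 0.01})

for epoch in range(10):
    cumulative_loss = 0.0
    for data, label in train_dataloader:
        with autograd.record():
            output = classifier(data)
            loss = loss_fn(output, label)
        loss.backward()
        trainer.step(data.shape[0])
        cumulative_loss += loss.mean().asscalar()
    print('epoch {}: mean batch loss {:.4f}'.format(epoch, cumulative_loss / len(train_dataloader)))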
Example #25
 X_train = load_data('./dataset/task8_train_input.csv')
 y_train = load_data('./dataset/task8_train_output.csv')
 X_test = load_data('./dataset/task8_test_input.csv')
 y_test = load_data('./dataset/task8_test_output.csv')
 X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                   y_train,
                                                   train_size=0.9,
                                                   random_state=0)
 print('TrainSet Shape:{}'.format(X_train.shape))
 print('TestSet Shape:{}'.format(X_test.shape))
 build_dataloader = partial(DataLoader,
                            batch_size=32,
                            shuffle=False,
                            last_batch='keep',
                            batchify_fn=_default_batchify_fn)
 train_dataloader = build_dataloader(dataset=ArrayDataset(X_train, y_train))
 test_dataloader = build_dataloader(dataset=ArrayDataset(X_test, y_test))
 val_dataloader = build_dataloader(dataset=ArrayDataset(X_val, y_val))
 '''start training'''
 sess.run(tf.global_variables_initializer())
 train_loss, train_acc = _MMetric(), _MMetric()
 print_freq = 50
 for step, (x, y) in enumerate(
         tqdm(train_dataloader, desc='Training', position=0)):
     sos_input = np.ones(shape=(len(y), 1), dtype=np.int32) * sos_token
     t = np.random.rand() < use_teacher_forcing_ratio
     d = sos_input if not t else np.concatenate(
         (sos_input, y[:, 1:]), axis=1)
     feed_dict = {
         encoder_input: x,
         decoder_input: d,
Example #26
    transforms.ToTensor(),
])

# %%

emnist_train_data, emnist_train_labels = extract_training_samples('balanced')
emnist_test_data, emnist_test_labels = extract_test_samples('balanced')

emnist_train_data = nd.array(255 - emnist_train_data[:, :, :, None])
emnist_test_data = nd.array(255 - emnist_test_data[:, :, :, None])

# %%
BS = 64

emnist_train_dataset = ArrayDataset(
    SimpleDataset(emnist_train_data).transform(transform_train_emnist),
    emnist_train_labels)
emnist_train_loader = DataLoader(emnist_train_dataset,
                                 shuffle=True,
                                 batch_size=BS)

emnist_test_dataset = ArrayDataset(
    SimpleDataset(emnist_test_data).transform(transform_test),
    emnist_test_labels)
emnist_test_loader = DataLoader(emnist_test_dataset, batch_size=BS)

# with SummaryWriter(logdir='./logs') as sw:
#    sw.add_histogram('emnist_classes', mx.nd.array([c for (f,c) in emnist_train_dataset]), bins=np.arange(-0.5, len(classes)+1))
#    sw.add_histogram('emnist_classes', mx.nd.array([c for (f,c) in emnist_test_dataset]), bins=np.arange(-0.5, len(classes)+1))

# %%
Example #27
from tmnt.bert_handling import get_bert_datasets
from mxnet.gluon.data import ArrayDataset

data, y = fetch_20newsgroups(shuffle=True, random_state=1,
                              remove=('headers', 'footers', 'quotes'),
                              return_X_y=True)
train_data = data[:2000]
dev_data   = data[-2000:]
train_y    = y[:2000]
dev_y      = y[-2000:]
model_name = 'bert_12_768_12'
dataset = 'book_corpus_wiki_en_uncased'
batch_size = 32
seq_len = 64
pad = True
tr_ds = ArrayDataset(train_data, train_y)
dev_ds = ArrayDataset(dev_data, dev_y)

vectorizer = TMNTVectorizer(vocab_size=2000)
vectorizer.fit_transform(train_data)

ctx = mx.cpu() ## or mx.gpu(N) if using GPU device=N

tr_dataset, dev_dataset, num_examples, bert_base, _ = get_bert_datasets(None, vectorizer,
                                                                        tr_ds, dev_ds, batch_size, seq_len,
                                                                        bert_model_name=model_name,
                                                                        bert_dataset=dataset,
                                                                        pad=False, ctx=ctx)
num_classes = int(np.max(y) + 1)

estimator = SeqBowEstimator(bert_base, bert_model_name = model_name, bert_data_name = dataset,
Example #28
    return encoded

"""The MXNet Dataset and DataLoader APIs let you create multiple workers that pre-fetch the data and encode it the way you want, in order to prevent your GPU from starving"""

def transform(x, y):
    return encode(x), y

"""We split our data into a training and a testing dataset"""

split = 0.8
split_index = int(split*len(data))
train_data_X = data['X'][:split_index].as_matrix()
train_data_Y = data['Y'][:split_index].as_matrix()
test_data_X = data['X'][split_index:].as_matrix()
test_data_Y = data['Y'][split_index:].as_matrix()
train_dataset = ArrayDataset(train_data_X, train_data_Y).transform(transform)
test_dataset = ArrayDataset(test_data_X, test_data_Y).transform(transform)

"""Creating the training and testing dataloaders, with NUM_WORKERS set to the number of CPU cores"""

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, last_batch='rollover')

test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, last_batch='rollover')

"""## Creation of the network

The context will define where the training takes place, on the CPU or on the GPU
"""

ctx = mx.gpu() if mx.context.num_gpus() else mx.cpu()
Example #29
def dataiter_all_sensors_seq2seq(aqi, scaler, setting, shuffle=True, offset=0):
    dataset = setting['dataset']
    training = setting['training']

    aqi_fill = utils.fill_missing(aqi)
    aqi_fill = scaler.transform(aqi_fill)  #[T, N, D]

    n_timestamp, num_nodes, _ = aqi_fill.shape

    timestamp = np.arange(n_timestamp) + offset

    timespan = (timestamp % 24) / 24
    timespan = np.tile(timespan, (1, num_nodes, 1)).T
    aqi_fill = np.concatenate((aqi_fill, timespan),
                              axis=2)  #[T, N, D]  add time of day

    timespan = ((timestamp // 24) % 7) / 7
    timespan = np.tile(timespan, (1, num_nodes, 1)).T
    aqi_fill = np.concatenate((aqi_fill, timespan),
                              axis=2)  #[T, N, D]  add day of week

    timespan = ((timestamp // (24 * 31)) % 12) / 12
    timespan = np.tile(timespan, (1, num_nodes, 1)).T
    aqi_fill = np.concatenate((aqi_fill, timespan),
                              axis=2)  #[T, N, D]  add month of year

    geo_feature, _ = get_geo_feature(
        dataset)  #[num_station, num_geo_feature (26)]

    data_fill = np.concatenate((aqi_fill,
                                np.tile(np.expand_dims(geo_feature, axis=0),
                                        (n_timestamp, 1, 1))),
                               axis=2)

    input_len = dataset['input_len']
    output_len = dataset['output_len']
    feature, data, mask, label = [], [], [], []
    for i in range(n_timestamp - input_len - output_len + 1):
        data.append(data_fill[i:i + input_len])

        mask.append(1.0 - np.isnan(aqi[i + input_len:i + input_len +
                                       output_len, :, 0]).astype(float))

        label.append(data_fill[i + input_len:i + input_len + output_len])

        feature.append(geo_feature)

        if i % 1000 == 0:
            logging.info('Processing %d timestamps', i)
            # if i > 0: break

    data = mx.nd.array(np.stack(data))  # [B, T, N, D(35)]
    label = mx.nd.array(np.stack(label))  # [B, T, N, D]
    mask = mx.nd.array(np.expand_dims(np.stack(mask), axis=3))  # [B, T, N, 1]
    feature = mx.nd.array(np.stack(feature))  # [B, N, D]

    logging.info('shape of feature: %s', feature.shape)
    logging.info('shape of data: %s', data.shape)
    logging.info('shape of mask: %s', mask.shape)
    logging.info('shape of label: %s', label.shape)

    from mxnet.gluon.data import ArrayDataset, DataLoader
    return DataLoader(
        ArrayDataset(feature, data, label, mask),
        shuffle=shuffle,
        batch_size=training['batch_size'],
        num_workers=4,
        last_batch='rollover',
    )
Example #30
def dataiter_all_sensors_seq2seq(df, scaler, setting, shuffle=True):
    dataset = setting['dataset']
    training = setting['training']

    df_fill = utils.fill_missing(df)
    df_fill = scaler.transform(df_fill)

    n_timestamp = df_fill.shape[0]
    data_list = [np.expand_dims(df_fill.values, axis=-1)]

    # time in day
    time_idx = (df_fill.index.values -
                df_fill.index.values.astype('datetime64[D]')) / np.timedelta64(
                    1, 'D')
    time_in_day = np.tile(time_idx, [1, NUM_NODES, 1]).transpose((2, 1, 0))
    data_list.append(time_in_day)

    # day in week
    day_in_week = np.zeros(shape=(n_timestamp, NUM_NODES, 7))
    day_in_week[np.arange(n_timestamp), :, df_fill.index.dayofweek] = 1
    data_list.append(day_in_week)

    # temporal feature
    temporal_feature = np.concatenate(data_list, axis=-1)

    geo_feature, _ = get_geo_feature(dataset)

    input_len = dataset['input_len']
    output_len = dataset['output_len']
    feature, data, mask, label = [], [], [], []
    for i in range(n_timestamp - input_len - output_len + 1):
        data.append(temporal_feature[i:i + input_len])

        _mask = np.array(
            df.iloc[i + input_len:i + input_len + output_len] > 1e-5,
            dtype=np.float32)
        mask.append(_mask)

        label.append(temporal_feature[i + input_len:i + input_len +
                                      output_len])

        feature.append(geo_feature)

        if i % 1000 == 0:
            logging.info('Processing %d timestamps', i)
            # if i > 0: break

    data = mx.nd.array(np.stack(data))
    label = mx.nd.array(np.stack(label))
    mask = mx.nd.array(np.expand_dims(np.stack(mask), axis=3))
    feature = mx.nd.array(np.stack(feature))

    logging.info('shape of feature: %s', feature.shape)
    logging.info('shape of data: %s', data.shape)
    logging.info('shape of mask: %s', mask.shape)
    logging.info('shape of label: %s', label.shape)

    from mxnet.gluon.data import ArrayDataset, DataLoader
    return DataLoader(
        ArrayDataset(feature, data, label, mask),
        shuffle=shuffle,
        batch_size=training['batch_size'],
        num_workers=4,
        last_batch='rollover',
    )