Exemple #1
0
def download(dataset='uea'):
    """Download and unpack one of the supported archives below ``DATA_DIR/raw``.

    Parameters
    ----------
    dataset: str, 'uea'
        Name of the archive to fetch: 'uea', 'ucr' or 'tsr'.

    Raises
    ------
    AssertionError
        If ``DATA_DIR/raw`` is not an existing directory.
    ValueError
        If ``dataset`` is not one of the supported names.

    Nothing is downloaded when the target folder already exists; a message
    is printed instead and the function returns.
    """
    raw_dir = DATA_DIR + '/raw'
    assert os.path.isdir(raw_dir), "No directory exists at data/raw. Please make one to continue."

    # (dataset name, archive url, folder under raw/, zip file name)
    archives = (
        ('uea',
         'http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_arff.zip',
         'UEA', 'uea.zip'),
        ('ucr',
         'http://www.timeseriesclassification.com/Downloads/Archives/Univariate2018_arff.zip',
         'UCR', 'ucr.zip'),
        ('tsr',
         'https://zenodo.org/record/3902651/files/Monash_UEA_UCR_Regression_Archive.zip?download=1',
         'TSR', 'tsr.zip'),
    )
    for name, url, folder, zip_file in archives:
        if dataset == name:
            save_dir = DATA_DIR + '/raw/' + folder
            zipname = save_dir + '/' + zip_file
            break
    else:
        raise ValueError('Can only download uea, ucr or tsr. Was asked for {}.'.format(dataset))

    # An existing folder is treated as "already downloaded".
    if os.path.exists(save_dir):
        print('Path already exists at {}. If you wish to re-download you must delete this folder.'.format(save_dir))
        return

    mkdir_if_not_exists(save_dir)

    # Only fetch into an empty folder.
    if not os.listdir(save_dir):
        download_url(url, zipname)
        unzip(zipname, save_dir)
def load_results(filename):
    """Load a JSON results file and split it into labels and assessments.

    Every top-level entry of the JSON document is sorted, passed through
    ``unzip`` and materialised as a list.  The labels are ``first`` of the
    first processed entry; the assessments are ``second`` of every entry.

    Returns
    -------
    (labels, assessments)
    """
    with open(filename) as handle:
        raw_entries = json.load(handle)

    columns = [list(unzip(sorted(pairs))) for pairs in raw_entries]
    labels = first(columns[0])
    assessments = [second(column) for column in columns]
    return labels, assessments
def multi_func(x):
    """Process the tile at position ``x`` of ``good_index``.

    Selects the footprints of ``footprints_county`` flagged as contained in
    the tile, downloads and unzips the tile's DSM and DTM rasters, runs
    ``get_height`` on every selected footprint, removes the downloaded
    files again and returns the collected rows restricted to ``keep_cols``.

    Relies on module-level names: ``good_index``, ``footprints_county``,
    ``dsm_gcp_path``, ``dsm_dir``, ``dtm_gcp_path``, ``dtm_dir``,
    ``keep_pts``, ``pool_pts`` and ``keep_cols``.

    Parameters
    ----------
    x: int
        Positional index of the tile row in ``good_index``.

    Returns
    -------
    The accumulated ``get_height`` results limited to ``keep_cols``, or
    ``None`` when the tile is a placeholder, has no selected footprints,
    or a download raised ``HTTPError``.

    Side effects
    ------------
    Resets the global ``tile_times`` to a fresh list on every call and
    appends the elapsed processing time for a processed tile.
    """
    global tile_times
    tile_times = []
    section_time = time.time()

    # A float in the TILE column is treated as a placeholder entry.
    if isinstance(good_index.iloc[x]['TILE'], float):
        print('Skipping placeholder tile index ...')
        return

    row = good_index.iloc[[x]]
    # NOTE(review): label lookup with a positional value; presumably the
    # index labels of good_index equal the row positions - confirm.
    tile_base = row['TILE'][x]
    print(f'Working on tile {tile_base} ...')

    # Intersect tile and footprints to determine if any need processed
    tile = good_index[good_index['TILE'] == tile_base]
    # this method clips footprint to part that's inside of tile
    # subset = gpd.overlay(tile, footprints_county, how='intersection')
    # this method only selects footprints completely within the tile
    subset = footprints_county.copy()
    subset['test'] = footprints_county.apply(
        lambda footprint: tile.geometry.contains(footprint.geometry), axis=1)
    subset = subset[subset['test']]

    if subset.shape[0] == 0:
        print('    No overlapping footprints in tile, moving on ...')
        return

    # Create DSM and DTM download and local file paths
    dsm_path = dsm_gcp_path + str(tile_base) + '.zip'
    dsm_file = os.path.join(dsm_dir, str(tile_base) + '.img')
    dsm_zip = os.path.join(dsm_dir, str(tile_base) + '.zip')
    # DTM names are derived from the tile name by swapping 'hh' for 'bh'.
    dtm_path = dtm_gcp_path + str(tile_base) + '.zip'
    dtm_path = dtm_path.replace('hh', 'bh')
    dtm_file = os.path.join(dtm_dir, str(tile_base) + '.img')
    dtm_file = dtm_file.replace('hh', 'bh')

    # Download the DSM unless its zip is already present locally
    if not os.path.isfile(dsm_zip):
        try:
            dsm = wget.download(dsm_path, dsm_dir)
        except HTTPError:
            print('Encountered HTTPError, skipping to next tile ...')
            return
    else:
        dsm = dsm_zip
        print(f'{dsm} already exists ...')

    # NOTE(review): unlike the DSM, the DTM is downloaded unconditionally -
    # confirm whether an "already exists" check is wanted here as well.
    try:
        dtm = wget.download(dtm_path, dtm_dir)
    except HTTPError:
        print('Encountered HTTPError, skipping to next tile ...')
        return

    print(f'Downloaded {dsm} and {dtm} ...')

    # Unzip DSM and DTM
    unzip(dsm_dir, dsm)
    unzip(dtm_dir, dtm)

    # Iterate over footprints in the tile
    # NOTE(review): DataFrame.append was removed in pandas 2.0; switch to a
    # concat of the collected pieces once the pandas alias used by this
    # file is confirmed.
    for j in np.arange(subset.shape[0]):
        temp = subset.iloc[[j]]
        updated = get_height(temp, dsm_file, dtm_file, keep_pts, pool_pts)
        if j == 0:
            subset_final = updated
        else:
            subset_final = subset_final.append(updated, ignore_index=True)

    # Delete DSM and DTM files, zipped files, and all .xmls
    os.remove(dsm_file)
    os.remove(dsm)
    os.remove(dtm_file)
    os.remove(dtm)
    for item in os.listdir(dsm_dir):
        if item.endswith(".xml"):
            os.remove(os.path.join(dsm_dir, item))
    for item in os.listdir(dtm_dir):
        if item.endswith(".xml"):
            os.remove(os.path.join(dtm_dir, item))

    tile_times.append(time.time() - section_time)

    subset_final = subset_final[keep_cols]
    return subset_final
def heatmap(d, scale, vmin=None, vmax=None, cmap=None, ax=None,
            scientific=False, style='triangular', colorbar=True):
    """
    Plots heatmap of given color values.

    Parameters
    ----------
    d: dictionary
        A dictionary mapping the i, j polygon to the heatmap color, where
        i + j + k = scale.
    scale: Integer
        The scale used to partition the simplex.
    vmin: float, None
        The minimum color value, used to normalize colors. Computed from
        the values of ``d`` only when None; an explicit 0 is honoured.
    vmax: float, None
        The maximum color value, used to normalize colors. Computed from
        the values of ``d`` only when None; an explicit 0 is honoured.
    cmap: String or matplotlib.colors.Colormap, None
        The name of the Matplotlib colormap to use.
    ax: Matplotlib AxesSubplot, None
        The subplot to draw on. A new figure and axes are created when None.
    scientific: Bool, False
        Whether to use scientific notation for colorbar numbers.
    style: String, "triangular"
        The style of the heatmap, "triangular" or "hexagonal". Only the
        first letter (case-insensitive) is inspected.
    colorbar: bool, True
        Show colorbar.

    Returns
    -------
    ax: The matplotlib axis

    Raises
    ------
    ValueError
        If ``style`` does not start with "t" or "h".
    """

    # Compare against None explicitly: a falsy-but-valid argument such as
    # vmin=0 must not be replaced by a computed default.
    if ax is None:
        _, ax = pyplot.subplots()
    cmap = get_cmap(cmap)
    if vmin is None:
        vmin = min(d.values())
    if vmax is None:
        vmax = max(d.values())
    style = style.lower()[0]
    if style not in ["t", "h"]:
        raise ValueError("Heatmap style must be 'triangular' or 'hexagonal'")
    if style == "h":
        mapping_functions = [(hexagon_coordinates, d.items())]
    else:
        # The triangular style draws two sets of triangles: one from the
        # data itself and one from the values of alt_value_iterator.
        mapping_functions = [(triangle_coordinates, d.items()),
                             (alt_triangle_coordinates, alt_value_iterator(d))]

    # Color data triangles or hexagons
    for vertex_function, iterator in mapping_functions:
        for key, value in iterator:
            if value is None:
                continue
            i, j = key
            k = scale - i - j
            vertices = vertex_function(i, j, k)
            color = colormapper(value, vmin, vmax, cmap=cmap)
            # Matplotlib wants a list of xs and a list of ys
            xs, ys = unzip(vertices)
            ax.fill(xs, ys, facecolor=color, edgecolor=color)

    if colorbar:
        colorbar_hack(ax, vmin, vmax, cmap, scientific=scientific)
    return ax
Exemple #5
0
def train_lstm(
    dim_proj=128,  # word embeding dimension and LSTM number of hidden units.
    patience=10,  # Number of epoch to wait before early stop if no progress
    max_epochs=5000,  # The maximum number of epoch to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available, sgd very hard to use, not recommanded (probably need momentum and decaying learning rate).
    encoder='lstm',  # TODO: can be removed must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of update.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequence longer then this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameter for extra option
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worst test error
                       # This frequently need a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test example.
):

    # Model options
    model_options = locals().copy()
    print "model options", model_options

    load_data, prepare_data = get_dataset(dataset)

    print 'Loading data'
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep random
        # size example.  So we must select a random selection of the
        # examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print 'Building model'
    # This create the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This create Theano Shared Variable from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams have different copy of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = T.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = T.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])

    history_errs = []
    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of update done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t]for t in train_index]

                # Get the data in numpy.ndarray format
                # This swap the axis!
                # Return something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Done'

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (uidx == 0 or
                        valid_err <= numpy.array(history_errs)[:,
                                                               0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print ('Train ', train_err, 'Valid ', valid_err,
                           'Test ', test_err)

                    if (len(history_errs) > patience and
                        valid_err >= numpy.array(history_errs)[:-patience,
                                                               0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err
    if saveto:
        numpy.savez(saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' %
                          (end_time - start_time))
    return train_err, valid_err, test_err
def heatmap(data, scale, vmin=None, vmax=None, cmap=None, ax=None,
            scientific=False, style='triangular', colorbar=True,
            permutation=None, colormap=True):
    """
    Plots heatmap of given color values.

    Parameters
    ----------
    data: dictionary
        A dictionary mapping the i, j polygon to the heatmap color, where
        i + j + k = scale. When ``colormap`` is False the values are
        replaced IN PLACE by numpy arrays.
    scale: Integer
        The scale used to partition the simplex.
    vmin: float, None
        The minimum color value, used to normalize colors. Computed from
        the values of ``data`` only when None; an explicit 0 is honoured.
    vmax: float, None
        The maximum color value, used to normalize colors. Computed from
        the values of ``data`` only when None; an explicit 0 is honoured.
    cmap: String or matplotlib.colors.Colormap, None
        The name of the Matplotlib colormap to use.
    ax: Matplotlib AxesSubplot, None
        The subplot to draw on. A new figure and axes are created when None.
    scientific: Bool, False
        Whether to use scientific notation for colorbar numbers.
    style: String, "triangular"
        The style of the heatmap, "triangular", "dual-triangular" or
        "hexagonal". Only the first letter (case-insensitive) is inspected.
    colorbar: bool, True
        Show colorbar. Ignored when ``colormap`` is False.
    permutation: string, None
        A permutation of the coordinates
    colormap: bool, True
        When True the values are mapped to colors through ``cmap``; when
        False each value is used directly as the fill color.

    Returns
    -------
    ax: The matplotlib axis

    Raises
    ------
    ValueError
        If ``style`` does not start with "t", "h" or "d".
    """

    # Compare against None explicitly: a falsy-but-valid argument such as
    # vmin=0 must not be replaced by a computed default.
    if ax is None:
        _, ax = pyplot.subplots()
    # If not colormap, then make the RGBA values numpy arrays so that they can
    # be averaged.
    if not colormap:
        # Only existing keys are reassigned, so mutating while iterating
        # over items() is safe here.
        for k, v in data.items():
            data[k] = numpy.array(v)
    else:
        cmap = get_cmap(cmap)
        if vmin is None:
            vmin = min(data.values())
        if vmax is None:
            vmax = max(data.values())
    style = style.lower()[0]
    if style not in ["t", "h", 'd']:
        raise ValueError("Heatmap style must be 'triangular', 'dual-triangular', or 'hexagonal'")

    vertices_values = polygon_generator(data, scale, style,
                                       permutation=permutation)

    # Draw the polygons and color them
    for vertices, value in vertices_values:
        if value is None:
            continue
        if colormap:
            color = colormapper(value, vmin, vmax, cmap=cmap)
        else:
            color = value  # rgba tuple (r,g,b,a) all in [0,1]
        # Matplotlib wants a list of xs and a list of ys
        xs, ys = unzip(vertices)
        ax.fill(xs, ys, facecolor=color, edgecolor=color)

    if colorbar and colormap:
        colorbar_hack(ax, vmin, vmax, cmap, scientific=scientific)
    return ax
Exemple #7
0
def heatmap(data,
            scale,
            vmin=None,
            vmax=None,
            cmap=None,
            ax=None,
            scientific=False,
            style='triangular',
            colorbar=True,
            permutation=None):
    """
    Plots heatmap of given color values.

    Parameters
    ----------
    data: dictionary
        A dictionary mapping the i, j polygon to the heatmap color, where
        i + j + k = scale.
    scale: Integer
        The scale used to partition the simplex.
    vmin: float, None
        The minimum color value, used to normalize colors. Computed from
        the values of ``data`` only when None; an explicit 0 is honoured.
    vmax: float, None
        The maximum color value, used to normalize colors. Computed from
        the values of ``data`` only when None; an explicit 0 is honoured.
    cmap: String or matplotlib.colors.Colormap, None
        The name of the Matplotlib colormap to use.
    ax: Matplotlib AxesSubplot, None
        The subplot to draw on. A new figure and axes are created when None.
    scientific: Bool, False
        Whether to use scientific notation for colorbar numbers.
    style: String, "triangular"
        The style of the heatmap, "triangular", "dual-triangular" or
        "hexagonal". Only the first letter (case-insensitive) is inspected.
    colorbar: bool, True
        Show colorbar.
    permutation: string, None
        A permutation of the coordinates

    Returns
    -------
    ax: The matplotlib axis

    Raises
    ------
    ValueError
        If ``style`` does not start with "t", "h" or "d".
    """

    # Compare against None explicitly: a falsy-but-valid argument such as
    # vmin=0 must not be replaced by a computed default.
    if ax is None:
        _, ax = pyplot.subplots()
    cmap = get_cmap(cmap)
    if vmin is None:
        vmin = min(data.values())
    if vmax is None:
        vmax = max(data.values())
    style = style.lower()[0]
    if style not in ["t", "h", 'd']:
        raise ValueError(
            "Heatmap style must be 'triangular', 'dual-triangular', or 'hexagonal'"
        )

    vertices_values = polygon_generator(data,
                                        scale,
                                        style,
                                        permutation=permutation)

    # Draw the polygons and color them
    for vertices, value in vertices_values:
        if value is None:
            continue
        color = colormapper(value, vmin, vmax, cmap=cmap)
        # Matplotlib wants a list of xs and a list of ys
        xs, ys = unzip(vertices)
        ax.fill(xs, ys, facecolor=color, edgecolor=color)

    if colorbar:
        colorbar_hack(ax, vmin, vmax, cmap, scientific=scientific)
    return ax