def download(dataset='uea'):
    """ Downloads the chosen archive ('uea', 'ucr' or 'tsr') to 'data/raw/<DATASET>'. """
    raw_dir = DATA_DIR + '/raw'
    assert os.path.isdir(raw_dir), "No directory exists at data/raw. Please make one to continue."

    if dataset == 'uea':
        url = 'http://www.timeseriesclassification.com/Downloads/Archives/Multivariate2018_arff.zip'
        save_dir = DATA_DIR + '/raw/UEA'
        zipname = save_dir + '/uea.zip'
    elif dataset == 'ucr':
        url = 'http://www.timeseriesclassification.com/Downloads/Archives/Univariate2018_arff.zip'
        save_dir = DATA_DIR + '/raw/UCR'
        zipname = save_dir + '/ucr.zip'
    elif dataset == 'tsr':
        url = 'https://zenodo.org/record/3902651/files/Monash_UEA_UCR_Regression_Archive.zip?download=1'
        save_dir = DATA_DIR + '/raw/TSR'
        zipname = save_dir + '/tsr.zip'
    else:
        raise ValueError('Can only download uea, ucr or tsr. Was asked for {}.'.format(dataset))

    if os.path.exists(save_dir):
        print('Path already exists at {}. If you wish to re-download you must delete this folder.'.format(save_dir))
        return

    mkdir_if_not_exists(save_dir)
    if len(os.listdir(save_dir)) == 0:
        download_url(url, zipname)
        unzip(zipname, save_dir)
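
# Illustrative usage sketch (not part of the original module): assumes DATA_DIR,
# mkdir_if_not_exists, download_url and unzip are defined alongside download().
def _example_download():
    for name in ('uea', 'ucr', 'tsr'):
        download(name)  # each archive unpacks into DATA_DIR + '/raw/<NAME>'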
def load_results(filename):
    """Load a JSON file containing a list of runs, each a list of (label, value)
    pairs. Returns the sorted labels (taken from the first run) and one sorted
    list of values per run."""
    with open(filename) as f:
        data = json.load(f)
    results = list(map(lambda xy_tuples: list(unzip(sorted(xy_tuples))), data))
    labels = first(results[0])
    assessments = list(map(second, results))
    return labels, assessments
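
# Illustrative usage sketch: assumes 'results.json' follows the layout described
# above and that every run shares the same labels; the filename is hypothetical.
def _example_load_results():
    labels, assessments = load_results('results.json')
    for label, values in zip(labels, zip(*assessments)):
        print(label, values)  # one value per run for this label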
def multi_func(x):
    global tile_times
    tile_times = []
    section_time = time.time()

    if isinstance(good_index.iloc[x]['TILE'], float):
        print('Skipping placeholder tile index ...')
        return

    row = good_index.iloc[[x]]
    tile_base = row['TILE'][x]
    print(f'Working on tile {tile_base} ...')

    # Intersect tile and footprints to determine if any need processing
    tile = good_index[good_index['TILE'] == tile_base]
    # this method clips footprints to the part that falls inside the tile
    # subset = gpd.overlay(tile, footprints_county, how='intersection')
    # this method only selects footprints completely within the tile
    subset = footprints_county.copy()
    subset['test'] = footprints_county.apply(
        lambda fp: tile.geometry.contains(fp.geometry).any(), axis=1)
    subset = subset[subset['test']]

    if subset.shape[0] != 0:
        # Create DSM and DTM download and local file paths
        dsm_path = dsm_gcp_path + str(tile_base) + '.zip'
        dsm_file = os.path.join(dsm_dir, str(tile_base) + '.img')
        dsm_zip = os.path.join(dsm_dir, str(tile_base) + '.zip')
        dtm_path = dtm_gcp_path + str(tile_base) + '.zip'
        dtm_path = dtm_path.replace('hh', 'bh')
        dtm_file = os.path.join(dtm_dir, str(tile_base) + '.img')
        dtm_file = dtm_file.replace('hh', 'bh')

        # Select a tile and download DSM and DTM from Google Cloud Storage
        if not os.path.isfile(dsm_zip):
            try:
                dsm = wget.download(dsm_path, dsm_dir)
            except HTTPError:
                print('Encountered HTTPError, skipping to next tile ...')
                return
        else:
            dsm = dsm_zip
            print(f'{dsm} already exists ...')
        try:
            dtm = wget.download(dtm_path, dtm_dir)
        except HTTPError:
            print('Encountered HTTPError, skipping to next tile ...')
            return
        print(f'Downloaded {dsm} and {dtm} ...')

        # Unzip DSM and DTM
        unzip(dsm_dir, dsm)
        unzip(dtm_dir, dtm)

        # Iterate over footprints in the tile
        for j in np.arange(subset.shape[0]):
            temp = subset.iloc[[j]]
            updated = get_height(temp, dsm_file, dtm_file, keep_pts, pool_pts)
            if j == 0:
                subset_final = updated
            else:
                subset_final = subset_final.append(updated, ignore_index=True)

        # Delete DSM and DTM files, zipped files, and all .xmls
        os.remove(dsm_file)
        os.remove(dsm)
        os.remove(dtm_file)
        os.remove(dtm)
        for item in os.listdir(dsm_dir):
            if item.endswith(".xml"):
                os.remove(os.path.join(dsm_dir, item))
        for item in os.listdir(dtm_dir):
            if item.endswith(".xml"):
                os.remove(os.path.join(dtm_dir, item))

        # print("Time elapsed for tile subset: {:.2f}s".format(time.time() - section_time))
        tile_times.append(time.time() - section_time)
        section_time = time.time()

        subset_final = subset_final[keep_cols]
        # if x == 0 or all_footprints is None:
        #     all_footprints = subset_final
        # else:
        #     all_footprints = all_footprints.append(subset_final, ignore_index=True)
        return subset_final
    else:
        del subset
        print(' No overlapping footprints in tile, moving on ...')
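
# Hypothetical driver sketch (not in the original script): multi_func reads
# module-level globals (good_index, footprints_county, keep_cols, ...) and takes
# a row index, so it can be mapped over tiles with a process pool. The pool size
# and the pandas alias `pd` are assumptions for illustration.
def _example_process_all_tiles():
    import pandas as pd
    from multiprocessing import Pool
    with Pool(processes=4) as pool:
        results = pool.map(multi_func, range(len(good_index)))
    # Keep only tiles that produced footprint heights and combine them
    frames = [r for r in results if r is not None]
    return pd.concat(frames, ignore_index=True) if frames else None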
def heatmap(d, scale, vmin=None, vmax=None, cmap=None, ax=None,
            scientific=False, style='triangular', colorbar=True):
    """
    Plots heatmap of given color values.

    Parameters
    ----------
    d: dictionary
        A dictionary mapping the i, j polygon to the heatmap color, where
        i + j + k = scale.
    scale: Integer
        The scale used to partition the simplex.
    vmin: float, None
        The minimum color value, used to normalize colors. Computed if absent.
    vmax: float, None
        The maximum color value, used to normalize colors. Computed if absent.
    cmap: String or matplotlib.colors.Colormap, None
        The name of the Matplotlib colormap to use.
    ax: Matplotlib AxesSubplot, None
        The subplot to draw on.
    scientific: Bool, False
        Whether to use scientific notation for colorbar numbers.
    style: String, "triangular"
        The style of the heatmap, "triangular" or "hexagonal".
    colorbar: bool, True
        Show colorbar.

    Returns
    -------
    ax: The matplotlib axis
    """

    if not ax:
        fig, ax = pyplot.subplots()
    cmap = get_cmap(cmap)
    if vmin is None:
        vmin = min(d.values())
    if vmax is None:
        vmax = max(d.values())

    style = style.lower()[0]
    if style not in ["t", "h"]:
        raise ValueError("Heatmap style must be 'triangular' or 'hexagonal'")
    if style == "h":
        mapping_functions = [(hexagon_coordinates, d.items())]
    else:
        mapping_functions = [(triangle_coordinates, d.items()),
                             (alt_triangle_coordinates, alt_value_iterator(d))]

    # Color data triangles or hexagons
    for vertex_function, iterator in mapping_functions:
        for key, value in iterator:
            if value is not None:
                i, j = key
                k = scale - i - j
                vertices = vertex_function(i, j, k)
                color = colormapper(value, vmin, vmax, cmap=cmap)
                # Matplotlib wants a list of xs and a list of ys
                xs, ys = unzip(vertices)
                ax.fill(xs, ys, facecolor=color, edgecolor=color)
    if colorbar:
        colorbar_hack(ax, vmin, vmax, cmap, scientific=scientific)

    return ax
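
# Illustrative usage sketch: assumes `pyplot` and the coordinate helpers used by
# heatmap() are importable from the surrounding module. Keys are (i, j) pairs
# with i + j <= scale, matching the docstring; the data values are arbitrary.
def _example_heatmap(scale=10):
    d = {(i, j): i * j for i in range(scale + 1) for j in range(scale + 1 - i)}
    ax = heatmap(d, scale, style='triangular', colorbar=True)
    pyplot.show()
    return ax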
def train_lstm(
    dim_proj=128,  # word embedding dimension and LSTM number of hidden units.
    patience=10,  # Number of epochs to wait before early stop if no progress
    max_epochs=5000,  # The maximum number of epochs to run
    dispFreq=10,  # Display to stdout the training progress every N updates
    decay_c=0.,  # Weight decay for the classifier applied to the U weights.
    lrate=0.0001,  # Learning rate for sgd (not used for adadelta and rmsprop)
    n_words=10000,  # Vocabulary size
    optimizer=adadelta,  # sgd, adadelta and rmsprop available; sgd is very hard to use and not recommended (probably needs momentum and a decaying learning rate).
    encoder='lstm',  # TODO: can be removed, must be lstm.
    saveto='lstm_model.npz',  # The best model will be saved there
    validFreq=370,  # Compute the validation error after this number of updates.
    saveFreq=1110,  # Save the parameters after every saveFreq updates
    maxlen=100,  # Sequences longer than this get ignored
    batch_size=16,  # The batch size during training.
    valid_batch_size=64,  # The batch size used for validation/test set.
    dataset='imdb',

    # Parameters for extra options
    noise_std=0.,
    use_dropout=True,  # if False slightly faster, but worse test error
                       # This frequently needs a bigger model.
    reload_model=None,  # Path to a saved model we want to start from.
    test_size=-1,  # If >0, we keep only this number of test examples.
):

    # Model options
    model_options = locals().copy()
    print "model options", model_options

    load_data, prepare_data = get_dataset(dataset)

    print 'Loading data'
    train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                                   maxlen=maxlen)
    if test_size > 0:
        # The test set is sorted by size, but we want to keep examples of
        # random sizes, so we must select a random subset of the examples.
        idx = numpy.arange(len(test[0]))
        numpy.random.shuffle(idx)
        idx = idx[:test_size]
        test = ([test[0][n] for n in idx], [test[1][n] for n in idx])

    ydim = numpy.max(train[1]) + 1

    model_options['ydim'] = ydim

    print 'Building model'
    # This creates the initial parameters as numpy ndarrays.
    # Dict name (string) -> numpy ndarray
    params = init_params(model_options)

    if reload_model:
        load_params('lstm_model.npz', params)

    # This creates Theano Shared Variables from the parameters.
    # Dict name (string) -> Theano Tensor Shared Variable
    # params and tparams hold different copies of the weights.
    tparams = init_tparams(params)

    # use_noise is for dropout
    (use_noise, x, mask,
     y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)

    if decay_c > 0.:
        decay_c = theano.shared(numpy_floatX(decay_c), name='decay_c')
        weight_decay = 0.
        weight_decay += (tparams['U'] ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    f_cost = theano.function([x, mask, y], cost, name='f_cost')

    grads = T.grad(cost, wrt=tparams.values())
    f_grad = theano.function([x, mask, y], grads, name='f_grad')

    lr = T.scalar(name='lr')
    f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                        x, mask, y, cost)

    print 'Optimization'

    kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
    kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

    print "%d train examples" % len(train[0])
    print "%d valid examples" % len(valid[0])
    print "%d test examples" % len(test[0])

    history_errs = []
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    uidx = 0  # the number of updates done
    estop = False  # early stop
    start_time = time.time()
    try:
        for eidx in xrange(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)

            for _, train_index in kf:
                uidx += 1
                use_noise.set_value(1.)

                # Select the random examples for this minibatch
                y = [train[1][t] for t in train_index]
                x = [train[0][t] for t in train_index]

                # Get the data in numpy.ndarray format.
                # This swaps the axes!
                # Returns something of shape (minibatch maxlen, n samples)
                x, mask, y = prepare_data(x, y)
                n_samples += x.shape[1]

                cost = f_grad_shared(x, mask, y)
                f_update(lrate)

                if numpy.isnan(cost) or numpy.isinf(cost):
                    print 'NaN detected'
                    return 1., 1., 1.

                if numpy.mod(uidx, dispFreq) == 0:
                    print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost

                if saveto and numpy.mod(uidx, saveFreq) == 0:
                    print 'Saving...',

                    if best_p is not None:
                        params = best_p
                    else:
                        params = unzip(tparams)
                    numpy.savez(saveto, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'), -1)
                    print 'Done'

                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (uidx == 0 or
                        valid_err <= numpy.array(history_errs)[:, 0].min()):

                        best_p = unzip(tparams)
                        bad_counter = 0

                    print ('Train ', train_err, 'Valid ', valid_err,
                           'Test ', test_err)

                    if (len(history_errs) > patience and
                        valid_err >= numpy.array(history_errs)[:-patience,
                                                               0].min()):
                        bad_counter += 1
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

            print 'Seen %d samples' % n_samples

            if estop:
                break

    except KeyboardInterrupt:
        print "Training interrupted"

    end_time = time.time()
    if best_p is not None:
        zipp(best_p, tparams)
    else:
        best_p = unzip(tparams)

    use_noise.set_value(0.)
    kf_train_sorted = get_minibatches_idx(len(train[0]), batch_size)
    train_err = pred_error(f_pred, prepare_data, train, kf_train_sorted)
    valid_err = pred_error(f_pred, prepare_data, valid, kf_valid)
    test_err = pred_error(f_pred, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    if saveto:
        numpy.savez(saveto, train_err=train_err,
                    valid_err=valid_err, test_err=test_err,
                    history_errs=history_errs, **best_p)
    print 'The code run for %d epochs, with %f sec/epochs' % (
        (eidx + 1), (end_time - start_time) / (1. * (eidx + 1)))
    print >> sys.stderr, ('Training took %.1fs' %
                          (end_time - start_time))
    return train_err, valid_err, test_err
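
# Illustrative call (an assumption, following how a training script would
# typically invoke this function); the argument values here are arbitrary.
def _example_train_lstm():
    # See the keyword defaults above for the full set of options.
    train_err, valid_err, test_err = train_lstm(max_epochs=100, test_size=500)
    return train_err, valid_err, test_err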
def heatmap(data, scale, vmin=None, vmax=None, cmap=None, ax=None,
            scientific=False, style='triangular', colorbar=True,
            permutation=None, colormap=True):
    """
    Plots heatmap of given color values.

    Parameters
    ----------
    data: dictionary
        A dictionary mapping the i, j polygon to the heatmap color, where
        i + j + k = scale.
    scale: Integer
        The scale used to partition the simplex.
    vmin: float, None
        The minimum color value, used to normalize colors. Computed if absent.
    vmax: float, None
        The maximum color value, used to normalize colors. Computed if absent.
    cmap: String or matplotlib.colors.Colormap, None
        The name of the Matplotlib colormap to use.
    ax: Matplotlib AxesSubplot, None
        The subplot to draw on.
    scientific: Bool, False
        Whether to use scientific notation for colorbar numbers.
    style: String, "triangular"
        The style of the heatmap, "triangular", "dual-triangular" or "hexagonal".
    colorbar: bool, True
        Show colorbar.
    permutation: string, None
        A permutation of the coordinates.
    colormap: bool, True
        Map values through cmap; if False, values are treated as RGBA tuples.

    Returns
    -------
    ax: The matplotlib axis
    """

    if not ax:
        fig, ax = pyplot.subplots()
    # If not colormap, then make the RGBA values numpy arrays so that they can
    # be averaged.
    if not colormap:
        for k, v in data.items():
            data[k] = numpy.array(v)
    else:
        cmap = get_cmap(cmap)
        if vmin is None:
            vmin = min(data.values())
        if vmax is None:
            vmax = max(data.values())
    style = style.lower()[0]
    if style not in ["t", "h", 'd']:
        raise ValueError("Heatmap style must be 'triangular', 'dual-triangular', or 'hexagonal'")

    vertices_values = polygon_generator(data, scale, style,
                                        permutation=permutation)

    # Draw the polygons and color them
    for vertices, value in vertices_values:
        if value is None:
            continue
        if colormap:
            color = colormapper(value, vmin, vmax, cmap=cmap)
        else:
            color = value  # rgba tuple (r, g, b, a), all in [0, 1]
        # Matplotlib wants a list of xs and a list of ys
        xs, ys = unzip(vertices)
        ax.fill(xs, ys, facecolor=color, edgecolor=color)
    if colorbar and colormap:
        colorbar_hack(ax, vmin, vmax, cmap, scientific=scientific)
    return ax
def heatmap(data, scale, vmin=None, vmax=None, cmap=None, ax=None,
            scientific=False, style='triangular', colorbar=True,
            permutation=None):
    """
    Plots heatmap of given color values.

    Parameters
    ----------
    data: dictionary
        A dictionary mapping the i, j polygon to the heatmap color, where
        i + j + k = scale.
    scale: Integer
        The scale used to partition the simplex.
    vmin: float, None
        The minimum color value, used to normalize colors. Computed if absent.
    vmax: float, None
        The maximum color value, used to normalize colors. Computed if absent.
    cmap: String or matplotlib.colors.Colormap, None
        The name of the Matplotlib colormap to use.
    ax: Matplotlib AxesSubplot, None
        The subplot to draw on.
    scientific: Bool, False
        Whether to use scientific notation for colorbar numbers.
    style: String, "triangular"
        The style of the heatmap, "triangular", "dual-triangular" or "hexagonal".
    colorbar: bool, True
        Show colorbar.
    permutation: string, None
        A permutation of the coordinates.

    Returns
    -------
    ax: The matplotlib axis
    """

    if not ax:
        fig, ax = pyplot.subplots()
    cmap = get_cmap(cmap)
    if vmin is None:
        vmin = min(data.values())
    if vmax is None:
        vmax = max(data.values())

    style = style.lower()[0]
    if style not in ["t", "h", 'd']:
        raise ValueError(
            "Heatmap style must be 'triangular', 'dual-triangular', or 'hexagonal'"
        )

    vertices_values = polygon_generator(data, scale, style,
                                        permutation=permutation)

    # Draw the polygons and color them
    for vertices, value in vertices_values:
        if value is None:
            continue
        color = colormapper(value, vmin, vmax, cmap=cmap)
        # Matplotlib wants a list of xs and a list of ys
        xs, ys = unzip(vertices)
        ax.fill(xs, ys, facecolor=color, edgecolor=color)
    if colorbar:
        colorbar_hack(ax, vmin, vmax, cmap, scientific=scientific)
    return ax