def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'utf-8')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    dtype = kwargs.get('dtype', 'float32')
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    dump_file = os.path.join(data_home, 'dump.pkl')
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file...')
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

    dl = DataLoader(data_home=data_home,
                    bucket_size=bucket_size,
                    encoding=encoding,
                    celebrity_threshold=celebrity_threshold,
                    one_hot_labels=one_hot_label,
                    mindf=mindf,
                    token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    dl.tfidf()
    vocab = dl.vectorizer.vocabulary_
    logging.info('saving vocab in {}'.format(vocab_file))
    dump_obj(vocab, vocab_file)
    logging.info('vocab dumped successfully!')

    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()

    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph,
                              nodelist=range(len(U_train + U_dev + U_test)),
                              weight='w')
    adj.setdiag(0)
    # selfloop_value = np.asarray(adj.sum(axis=1)).reshape(-1,)
    selfloop_value = 1
    adj.setdiag(selfloop_value)

    # symmetric normalisation of the adjacency with self-loops: D^-1/2 (A + I) D^-1/2
    n, m = adj.shape
    diags = adj.sum(axis=1).flatten()
    with np.errstate(divide='ignore'):
        diags_sqrt = 1.0 / np.sqrt(diags)
    diags_sqrt[np.isinf(diags_sqrt)] = 0
    D_pow_neghalf = sp.sparse.spdiags(diags_sqrt, [0], m, n, format='csr')
    A = D_pow_neghalf * adj * D_pow_neghalf
    A = A.astype(dtype)
    logging.info('adjacency matrix created.')

    X_train = dl.X_train
    X_dev = dl.X_dev
    X_test = dl.X_test
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}

    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]

    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]

    data = (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test,
            U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')
    return data
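# Illustrative sketch (not part of the original script): the normalisation above is the
# standard GCN form A_hat = D^-1/2 (A + I) D^-1/2. The 3-node toy graph and the
# scipy/numpy calls below are assumptions chosen only to mirror preprocess_data().
import numpy as np
import scipy.sparse as sparse

toy = sparse.lil_matrix((3, 3))
toy[0, 1] = toy[1, 0] = 1.0    # one undirected edge; node 2 stays isolated
toy.setdiag(1)                 # add self-loops, as preprocess_data does
toy = toy.tocsr()
deg = np.asarray(toy.sum(axis=1)).flatten()
with np.errstate(divide='ignore'):
    d_inv_sqrt = 1.0 / np.sqrt(deg)
d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0
D_inv_sqrt = sparse.diags(d_inv_sqrt)
A_hat = D_inv_sqrt @ toy @ D_inv_sqrt   # rows and columns rescaled by node degree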
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, \
        classLatMedian, classLonMedian, userLocation = data

    logging.info('stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype(dtypeint)
    X = X.astype(dtype)
    A = A.astype(dtype)

    if args.vis:
        from deepcca import draw_representations
        draw_representations(A.dot(X), Y, filename='gconv1.pdf')
        draw_representations(A.dot(A.dot(X)), Y, filename='gconv2.pdf')

    input_size = X.shape[1]
    output_size = np.max(Y) + 1
    verbose = not args.silent
    fractions = args.lblfraction
    stratified = False
    all_train_indices = np.asarray(range(0, X_train.shape[0])).astype(dtypeint)

    logging.info('running mlp with graph conv...')
    clf = GraphConv(input_size=input_size,
                    output_size=output_size,
                    hid_size_list=hidden_size,
                    regul_coef=regul,
                    drop_out=dropout,
                    batchnorm=args.batchnorm,
                    highway=model_args.highway)
    clf.build_model(A, use_text=args.notxt, use_labels=args.lp, seed=model_args.seed)

    for percentile in fractions:
        logging.info('***********percentile %f ******************' % percentile)
        model_file = './data/model-{}-{}.pkl'.format(A.shape[0], percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size = int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices, size=selection_size,
                                              replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen)
        else:
            selection_size = min(int(percentile * X.shape[0]), all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices, size=selection_size,
                                             replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        # train_indices = np.asarray(range(0, int(percentile * X_train.shape[0]))).astype(dtypeint)
        dev_indices = np.asarray(
            range(X_train.shape[0], X_train.shape[0] + X_dev.shape[0])).astype(dtypeint)
        test_indices = np.asarray(
            range(X_train.shape[0] + X_dev.shape[0],
                  X_train.shape[0] + X_dev.shape[0] + X_test.shape[0])).astype(dtypeint)

        # do not train, load a previously saved model
        if args.load:
            report_results = False
            clf.load(load_obj, model_file)
        else:
            # reset the network parameters if already fitted with another data
            if clf.fitted:
                clf.reset()
            clf.fit(X, A, Y,
                    train_indices=train_indices,
                    val_indices=dev_indices,
                    n_epochs=10000,
                    batch_size=batch_size,
                    max_down=args.maxdown,
                    verbose=verbose,
                    seed=model_args.seed)
            if args.save:
                clf.save(dump_obj, model_file)

        logging.info('dev results:')
        y_pred, _ = clf.predict(X, A, dev_indices)
        mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(
            Y_dev, y_pred, U_dev, classLatMedian, classLonMedian, userLocation)
        with open('gcn_{}_percent_pred_{}.pkl'.format(percentile, output_size), 'wb') as fout:
            pickle.dump((distances, latlon_true, latlon_pred), fout)

        logging.info('test results:')
        y_pred, _ = clf.predict(X, A, test_indices)
        geo_eval(Y_test, y_pred, U_test, classLatMedian, classLonMedian, userLocation)

        if args.feature_report:
            vocab_file = os.path.join(args.dir, 'vocab.pkl')
            if not os.path.exists(vocab_file):
                logging.error('vocab file {} not found'.format(vocab_file))
                return
            else:
                vocab = load_obj(vocab_file)
                logging.info('{} vocab loaded from file'.format(len(vocab)))
            train_vocab = set([
                term for term, count in Counter(np.nonzero(X[train_indices])[1]).items()
                if count >= 10
            ])
            dev_vocab = set(np.nonzero(X[dev_indices].sum(axis=0))[1])
            X_onehot = sp.sparse.diags([1] * len(vocab), dtype=dtype)
            A_onehot = X_onehot
            feature_report(clf, vocab, X_onehot, A_onehot,
                           classLatMedian, classLonMedian,
                           train_vocab, dev_vocab, topk=200, dtypeint=dtypeint)
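# Sketch (assumption, not from the original file): after sp.sparse.vstack the row blocks
# of X are ordered [train | dev | test], so the range-based index arrays built in main()
# are equivalent to the arange version below; n_train, n_dev, n_test are hypothetical sizes.
import numpy as np
n_train, n_dev, n_test = 5, 2, 3
dev_indices = np.arange(n_train, n_train + n_dev, dtype='int32')
test_indices = np.arange(n_train + n_dev, n_train + n_dev + n_test, dtype='int32')
assert dev_indices.tolist() == [5, 6]
assert test_indices.tolist() == [7, 8, 9]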
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    H, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, \
        classLatMedian, classLonMedian, userLocation = data
    Y_dev = Y_dev.astype(dtypeint)
    Y_test = Y_test.astype(dtypeint)

    logging.info('stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype(dtypeint)
    X = X.astype(dtype)
    H = H.astype(dtype)
    input_size = X.shape[1]
    output_size = np.max(Y) + 1

    train_indices = np.asarray(range(0, X_train.shape[0])).astype(dtypeint)
    dev_indices = np.asarray(
        range(X_train.shape[0], X_train.shape[0] + X_dev.shape[0])).astype(dtypeint)
    test_indices = np.asarray(
        range(X_train.shape[0] + X_dev.shape[0],
              X_train.shape[0] + X_dev.shape[0] + X_test.shape[0])).astype(dtypeint)
    batch_size = min(batch_size, train_indices.shape[0])

    if args.dcca:
        logging.info('running deepcca...')
        deepcca = DeepCCA()
        deepcca.build(X.shape[1], H.shape[1],
                      architecture=args.dccahid,
                      regul_coef=args.dccareg,
                      dropout=dropout,
                      lr=args.dccalr,
                      batchnorm=args.dccabatchnorm,
                      seed=model_args.seed)
        if args.dccareload:
            # for the big dataset use hickle instead of cPickle
            if X.shape[0] > 1000000:
                loaded_args, params1, params2 = load_obj(args.dccareload, serializer=hickle)
            else:
                loaded_args, params1, params2 = load_obj(args.dccareload)
            logging.info(loaded_args)
            deepcca.set_params(params1, params2)
        else:
            deepcca.fit(V1=X, V2=H,
                        train_indices=train_indices,
                        val_indices=dev_indices,
                        test_indices=test_indices,
                        n_epochs=500,
                        early_stopping_max_down=args.maxdown,
                        batch_size=train_indices.shape[0])
        V1_cca, V2_cca, l_cca = deepcca.f_predict(X, H)
        should_run_cca_on_outputs = True
        if should_run_cca_on_outputs:
            # run linear cca on the outputs of the two mlps
            A, B, mean1, mean2 = linear_cca(V1_cca, V2_cca, outdim_size=args.dccasize)
            V1_cca = V1_cca - mean1
            V2_cca = V2_cca - mean2
            V1_cca = np.dot(V1_cca, A)
            V2_cca = np.dot(V2_cca, B)
        X_cca = np.hstack((V1_cca, V2_cca)).astype(dtype)
    else:
        logging.info('No shared deepcca representation, just concatenation!')
        X_cca = sp.sparse.hstack([X, H]).astype(dtype).tocsr()

    stratified = False
    all_train_indices = train_indices
    fractions = args.lblfraction
    clf = MLPDense(input_sparse=sp.sparse.issparse(X_cca),
                   in_size=X_cca.shape[1],
                   out_size=output_size,
                   architecture=hidden_size,
                   regul=regul,
                   dropout=dropout,
                   lr=args.mlplr,
                   batchnorm=args.mlpbatchnorm)
    clf.build(seed=model_args.seed)

    for percentile in fractions:
        logging.info('***********percentile %f ******************' % percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size = int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices, size=selection_size,
                                              replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen)
        else:
            selection_size = min(int(percentile * X.shape[0]), all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices, size=selection_size,
                                             replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))

        X_train = X_cca[train_indices, :]
        Y_train_chosen = Y_train[train_indices].astype(dtypeint)
        X_dev = X_cca[dev_indices, :]
        X_test = X_cca[test_indices, :]
        if args.vis:
            draw_representations(X_train, Y_train_chosen, k=4, do_pca=True, filename=args.vis)
        if clf.fitted:
            clf.reset()
        clf.fit(X_train, Y_train_chosen, X_dev, Y_dev,
                n_epochs=1000,
                early_stopping_max_down=args.maxdown,
                verbose=not args.silent,
                batch_size=min(batch_size, train_indices.shape[0]),
                seed=model_args.seed)
        dev_pred = clf.predict(X_dev)
        test_pred = clf.predict(X_test)

        logging.info('Dev predictions')
        mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(
            Y_dev, dev_pred, U_dev, classLatMedian, classLonMedian, userLocation)
        pred_file = ('dcca_{}_percent_pred_{}.pkl'.format(percentile, output_size)
                     if args.dcca else
                     'concat_{}_percent_pred_{}.pkl'.format(percentile, output_size))
        with open(pred_file, 'wb') as fout:
            pickle.dump((distances, latlon_true, latlon_pred), fout)

        logging.info('Test predictions')
        geo_eval(Y_test, test_pred, U_test, classLatMedian, classLonMedian, userLocation)
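# Sketch (assumption): how the linear-CCA outputs are combined above. Given projection
# matrices and view means returned by linear_cca, the shared feature matrix fed to
# MLPDense is the horizontal concatenation of the two centered, projected views.
# The matrices below are random stand-ins with hypothetical shapes, not real outputs.
import numpy as np
V1, V2 = np.random.randn(10, 8), np.random.randn(10, 6)
A_proj, B_proj = np.random.randn(8, 4), np.random.randn(6, 4)   # stand-ins for linear_cca's A, B
mean1, mean2 = V1.mean(axis=0), V2.mean(axis=0)                 # stand-ins for linear_cca's means
X_cca_toy = np.hstack(((V1 - mean1) @ A_proj, (V2 - mean2) @ B_proj)).astype('float32')
assert X_cca_toy.shape == (10, 8)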
def get_geo_data(raw_dir, name):
    filename = osp.join(raw_dir, name)
    # print(raw_dir, name)
    geo_data = load_obj(filename)
    # geo_data unpacks as:
    # A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test,
    # classLatMedian, classLonMedian, userLocation
    return geo_data
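# Example usage (assumption): unpack the tuple produced by preprocess_data() and saved
# with dump_obj; the field order follows the comment above. './data' and 'dump.pkl'
# are hypothetical paths.
geo_data = get_geo_data('./data', 'dump.pkl')
(A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test,
 U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation) = geo_data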
# initiate the parser with a description
parser = argparse.ArgumentParser(description=text)
parser.add_argument("-kmers", "--kmers_file",
                    help="Path of the first k-mers file", required=True)
parser.add_argument("-kmers2", "--kmers_file2",
                    help="Path of the second k-mers file", required=True)
parser.add_argument("-k", help="K value", required=False, default=15)
args = parser.parse_args()

logging.debug("Getting the path of k-mer file")
kmers_0 = data.load_obj(args.kmers_file)
print("Loaded kmers_0")

logging.debug("Getting the path of the second k-mer file")
kmers_1 = data.load_obj(args.kmers_file2)
print("Loaded kmers_1")

logging.debug("Getting the k value")
k = int(args.k)

if kmers_1 and kmers_0:
    logging.debug("Creating the Snp objects")
    snp_0 = Snp(kmers_0=kmers_0, kmers_1=kmers_1)
    logging.debug("Get the SNPs from the first k-mers file")
    print("Extracting the SNPs, please wait.")
    kmers_0_snp = snp_0.snp_with_cdf(snp_0.mean_coverage())
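# Example invocation (assumption; the script and file names are hypothetical):
#   python snp_from_kmers.py -kmers kmers_sample0.pkl -kmers2 kmers_sample1.pkl -k 15
# Both k-mer files are expected to be objects readable by data.load_obj.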
    return heading + e


def gyro_measurement_sim(yaw_rate, R):
    """ yaw rate in rad/s """
    e = np.sqrt(R) * np.random.randn(1)
    return yaw_rate + e


if __name__ == '__main__':
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_path = os.path.join(dir_path, 'data')
    car_path = os.path.join(data_path, 'autonomous-car')

    # Simulate open loop sequence
    sim_def = data.load_obj(car_path + '/sim/sim_definition')
    x0 = sim_def["x0"]
    u = sim_def["u"]
    T = sim_def["T"]
    N = sim_def["N"]
    t = sim_def["t"]
    params = {"tire_model_func": tire_model}
    x = simulate.open_loop_sim(t, u, x0, vehicle_dynamics, params)

    # Reference location in ECEF
    lat0 = 37.4276
    lon0 = -122.1670
    h0 = 0
    p_ref_ECEF = utils.lla2ecef(np.array([lat0, lon0, h0]))

    # Load some data to get satellite positions
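# Standalone example (assumption, separate from the truncated main block above): draw one
# noisy yaw-rate measurement with gyro_measurement_sim. The true rate and noise variance
# R are illustrative values, not taken from the simulation definition.
true_yaw_rate = 0.1      # rad/s
R_gyro = 1e-4            # measurement noise variance
z_gyro = gyro_measurement_sim(true_yaw_rate, R_gyro)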