Example #1
def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'utf-8')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    dtype = kwargs.get('dtype', 'float32')
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    dump_file = os.path.join(data_home, 'dump.pkl')
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file...')
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

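    # the token_pattern below keeps ordinary words and hashtags (optional leading '#')
    # of at least two characters; the (?<![@]) negative lookbehind skips @-mentions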
    dl = DataLoader(data_home=data_home,
                    bucket_size=bucket_size,
                    encoding=encoding,
                    celebrity_threshold=celebrity_threshold,
                    one_hot_labels=one_hot_label,
                    mindf=mindf,
                    token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    dl.tfidf()
    vocab = dl.vectorizer.vocabulary_
    logging.info('saving vocab in {}'.format(vocab_file))
    dump_obj(vocab, vocab_file)
    logging.info('vocab dumped successfully!')
    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()

    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph,
                              nodelist=range(len(U_train + U_dev + U_test)),
                              weight='w')

    adj.setdiag(0)
    #selfloop_value = np.asarray(adj.sum(axis=1)).reshape(-1,)
    selfloop_value = 1
    adj.setdiag(selfloop_value)
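    # symmetric renormalization as used in graph convolutional networks:
    # A_hat = D^(-1/2) (A + I) D^(-1/2), with D the diagonal degree matrix
    # (the self-loops were added by setdiag above)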
    n, m = adj.shape
    diags = adj.sum(axis=1).flatten()
    with np.errstate(divide='ignore'):
        diags_sqrt = 1.0 / np.sqrt(diags)
    diags_sqrt[np.isinf(diags_sqrt)] = 0
    D_pow_neghalf = sp.sparse.spdiags(diags_sqrt, [0], m, n, format='csr')
    A = D_pow_neghalf * adj * D_pow_neghalf
    A = A.astype(dtype)
    logging.info('adjacency matrix created.')

    X_train = dl.X_train
    X_dev = dl.X_dev
    X_test = dl.X_test
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {
        str(c): dl.cluster_median[c][0]
        for c in dl.cluster_median
    }
    classLonMedian = {
        str(c): dl.cluster_median[c][1]
        for c in dl.cluster_median
    }

    P_test = [
        str(a[0]) + ',' + str(a[1])
        for a in dl.df_test[['lat', 'lon']].values.tolist()
    ]
    P_train = [
        str(a[0]) + ',' + str(a[1])
        for a in dl.df_train[['lat', 'lon']].values.tolist()
    ]
    P_dev = [
        str(a[0]) + ',' + str(a[1])
        for a in dl.df_dev[['lat', 'lon']].values.tolist()
    ]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]

    data = (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev,
            U_test, classLatMedian, classLonMedian, userLocation)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')

    return data
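
A minimal usage sketch, assuming the module-level helpers referenced above (load_obj, dump_obj, model_args) are in scope and that a hypothetical preprocessed corpus lives under ./data/cmu:

    data = preprocess_data('./data/cmu', bucket=300, mindf=10, celebrity=5, onehot=False)
    A, X_train, Y_train = data[0], data[1], data[2]
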
Example #2
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation = data
    logging.info(
        'stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
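    # rows of X (and entries of Y) are stacked train, then dev, then test;
    # the dev_indices/test_indices ranges computed below rely on this ordering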
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype(dtypeint)
    X = X.astype(dtype)
    A = A.astype(dtype)
    if args.vis:
        from deepcca import draw_representations
        draw_representations(A.dot(X), Y, filename='gconv1.pdf')
        draw_representations(A.dot(A.dot(X)), Y, filename='gconv2.pdf')
    input_size = X.shape[1]
    output_size = np.max(Y) + 1
    verbose = not args.silent
    fractions = args.lblfraction
    stratified = False
    all_train_indices = np.asarray(range(0, X_train.shape[0])).astype(dtypeint)
    logging.info('running mlp with graph conv...')
    clf = GraphConv(input_size=input_size,
                    output_size=output_size,
                    hid_size_list=hidden_size,
                    regul_coef=regul,
                    drop_out=dropout,
                    batchnorm=args.batchnorm,
                    highway=model_args.highway)
    clf.build_model(A,
                    use_text=args.notxt,
                    use_labels=args.lp,
                    seed=model_args.seed)

    for percentile in fractions:
        logging.info('***********percentile %f ******************' %
                     percentile)
        model_file = './data/model-{}-{}.pkl'.format(A.shape[0], percentile)
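        # choose the labelled fraction either per class (stratified) or
        # uniformly at random over all training nodes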
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size = int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices,
                                              size=selection_size,
                                              replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen)
        else:
            selection_size = min(int(percentile * X.shape[0]),
                                 all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices,
                                             size=selection_size,
                                             replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        #train_indices = np.asarray(range(0, int(percentile * X_train.shape[0]))).astype(dtypeint)
        dev_indices = np.asarray(
            range(X_train.shape[0],
                  X_train.shape[0] + X_dev.shape[0])).astype(dtypeint)
        test_indices = np.asarray(
            range(X_train.shape[0] + X_dev.shape[0], X_train.shape[0] +
                  X_dev.shape[0] + X_test.shape[0])).astype(dtypeint)
        # do not train, load
        if args.load:
            report_results = False
            clf.load(load_obj, model_file)
        else:
            # reset the network parameters if the model was already fitted on other data
            if clf.fitted:
                clf.reset()
            clf.fit(X,
                    A,
                    Y,
                    train_indices=train_indices,
                    val_indices=dev_indices,
                    n_epochs=10000,
                    batch_size=batch_size,
                    max_down=args.maxdown,
                    verbose=verbose,
                    seed=model_args.seed)
            if args.save:
                clf.save(dump_obj, model_file)

            logging.info('dev results:')
            y_pred, _ = clf.predict(X, A, dev_indices)
            mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(
                Y_dev, y_pred, U_dev, classLatMedian, classLonMedian,
                userLocation)
            with open(
                    'gcn_{}_percent_pred_{}.pkl'.format(
                        percentile, output_size), 'wb') as fout:
                pickle.dump((distances, latlon_true, latlon_pred), fout)
            logging.info('test results:')
            y_pred, _ = clf.predict(X, A, test_indices)
            geo_eval(Y_test, y_pred, U_test, classLatMedian, classLonMedian,
                     userLocation)

    if args.feature_report:
        vocab_file = os.path.join(args.dir, 'vocab.pkl')
        if not os.path.exists(vocab_file):
            logging.error('vocab file {} not found'.format(vocab_file))
            return
        else:
            vocab = load_obj(vocab_file)
        logging.info('{} vocab loaded from file'.format(len(vocab)))
        train_vocab = set([
            term for term, count in Counter(np.nonzero(X[train_indices])
                                            [1]).items() if count >= 10
        ])
        dev_vocab = set(np.nonzero(X[dev_indices].sum(axis=0))[1])
        X_onehot = sp.sparse.diags([1] * len(vocab), dtype=dtype)
        A_onehot = X_onehot
        feature_report(clf,
                       vocab,
                       X_onehot,
                       A_onehot,
                       classLatMedian,
                       classLonMedian,
                       train_vocab,
                       dev_vocab,
                       topk=200,
                       dtypeint=dtypeint)
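
A minimal invocation sketch (hypothetical values; args is assumed to be the parsed command-line namespace carrying the attributes referenced above, e.g. lblfraction, batchnorm, maxdown):

    data = preprocess_data('./data/cmu')
    main(data, args, batch=500, hidden=[300, 300], dropout=0.5, regularization=1e-6)
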
Example #3
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    H, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation = data
    Y_dev = Y_dev.astype(dtypeint)
    Y_test = Y_test.astype(dtypeint)
    logging.info(
        'stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype('int32')
    X = X.astype(dtype)
    H = H.astype(dtype)
    input_size = X.shape[1]
    output_size = np.max(Y) + 1

    train_indices = np.asarray(range(0, X_train.shape[0])).astype('int32')
    dev_indices = np.asarray(
        range(X_train.shape[0],
              X_train.shape[0] + X_dev.shape[0])).astype('int32')
    test_indices = np.asarray(
        range(X_train.shape[0] + X_dev.shape[0], X_train.shape[0] +
              X_dev.shape[0] + X_test.shape[0])).astype('int32')
    batch_size = min(batch_size, train_indices.shape[0])
    if args.dcca:
        logging.info('running deepcca...')
        deepcca = DeepCCA()
        deepcca.build(X.shape[1],
                      H.shape[1],
                      architecture=args.dccahid,
                      regul_coef=args.dccareg,
                      dropout=dropout,
                      lr=args.dccalr,
                      batchnorm=args.dccabatchnorm,
                      seed=model_args.seed)
        if args.dccareload:
            # for very large datasets, load with the hickle serializer instead of the default pickle
            if X.shape[0] > 1000000:
                loaded_args, params1, params2 = load_obj(args.dccareload,
                                                         serializer=hickle)
            else:
                loaded_args, params1, params2 = load_obj(args.dccareload)
            logging.info(loaded_args)
            deepcca.set_params(params1, params2)

        else:
            deepcca.fit(V1=X,
                        V2=H,
                        train_indices=train_indices,
                        val_indices=dev_indices,
                        test_indices=test_indices,
                        n_epochs=500,
                        early_stopping_max_down=args.maxdown,
                        batch_size=train_indices.shape[0])
        V1_cca, V2_cca, l_cca = deepcca.f_predict(X, H)
        should_run_cca_on_outputs = True
        if should_run_cca_on_outputs:
            # run linear CCA on the outputs of the MLP
            A, B, mean1, mean2 = linear_cca(V1_cca,
                                            V2_cca,
                                            outdim_size=args.dccasize)
            V1_cca = V1_cca - mean1
            V2_cca = V2_cca - mean2
            V1_cca = np.dot(V1_cca, A)
            V2_cca = np.dot(V2_cca, B)
        X_cca = np.hstack((V1_cca, V2_cca)).astype(dtype)
    else:
        logging.info('No shared deepcca representation, just concatenation!')
        X_cca = sp.sparse.hstack([X, H]).astype(dtype).tocsr()
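    # X_cca is either the DCCA-derived shared representation or the plain
    # concatenation of the text (X) and network (H) views, and feeds the MLP below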
    stratified = False
    all_train_indices = train_indices
    fractions = args.lblfraction
    clf = MLPDense(input_sparse=sp.sparse.issparse(X_cca),
                   in_size=X_cca.shape[1],
                   out_size=output_size,
                   architecture=hidden_size,
                   regul=regul,
                   dropout=dropout,
                   lr=args.mlplr,
                   batchnorm=args.mlpbatchnorm)
    clf.build(seed=model_args.seed)

    for percentile in fractions:
        logging.info('***********percentile %f ******************' %
                     percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size = int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices,
                                              size=selection_size,
                                              replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen)
        else:
            selection_size = min(int(percentile * X.shape[0]),
                                 all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices,
                                             size=selection_size,
                                             replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        X_train = X_cca[train_indices, :]
        Y_train_chosen = Y_train[train_indices].astype('int32')
        X_dev = X_cca[dev_indices, :]
        X_test = X_cca[test_indices, :]
        if args.vis:
            draw_representations(X_train,
                                 Y_train_chosen,
                                 k=4,
                                 do_pca=True,
                                 filename=args.vis)
        if clf.fitted:
            clf.reset()
        clf.fit(X_train,
                Y_train_chosen,
                X_dev,
                Y_dev,
                n_epochs=1000,
                early_stopping_max_down=args.maxdown,
                verbose=not args.silent,
                batch_size=min(batch_size, train_indices.shape[0]),
                seed=model_args.seed)
        dev_pred = clf.predict(X_dev)
        test_pred = clf.predict(X_test)
        logging.info('Dev predictions')
        mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(
            Y_dev, dev_pred, U_dev, classLatMedian, classLonMedian,
            userLocation)
        with open(
                'dcca_{}_percent_pred_{}.pkl'.format(percentile, output_size)
                if args.dcca else 'concat_{}_percent_pred_{}.pkl'.format(
                    percentile, output_size), 'wb') as fout:
            pickle.dump((distances, latlon_true, latlon_pred), fout)
        logging.info('Test predictions')
        geo_eval(Y_test, test_pred, U_test, classLatMedian, classLonMedian,
                 userLocation)
Example #4
def get_geo_data(raw_dir, name):
    filename = osp.join(raw_dir, name)
    #print(raw_dir, name)
    geo_data = load_obj(filename)
    #A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation = geo_data
    return geo_data
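
A minimal usage sketch (hypothetical directory and file name; load_obj is assumed to deserialize the tuple produced by preprocess_data in Example #1):

    geo_data = get_geo_data('./data/cmu', 'dump.pkl')
    A, X_train, Y_train = geo_data[0], geo_data[1], geo_data[2]
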
Example #5
    # initialize the parser with a description
    parser = argparse.ArgumentParser(description=text)
    parser.add_argument(
        "-kmers", "--kmers_file", help="Path of the first k-mers file", required=True
    )
    parser.add_argument(
        "-kmers2", "--kmers_file2", help="Path of the second k-mers file", required=True
    )

    parser.add_argument("-k", help="K value", required=False, default=15)

    args = parser.parse_args()

    logging.debug("Getting the path of k-mer file")
    kmers_0 = data.load_obj(args.kmers_file)
    print("Loaded kmers_0")

    logging.debug("Getting the path of the second  k-mer file")
    kmers_1 = data.load_obj(args.kmers_file2)
    print("Loaded kmers_1")

    logging.debug("Getting the k value")
    k = int(args.k)

    if kmers_1 and kmers_0:
        logging.debug("Creating the Snp objects")
        snp_0 = Snp(kmers_0=kmers_0, kmers_1=kmers_1)
        logging.debug("Get the SNPs from the first k-mers file")
        print("Extracting the SNPs please wait.")
        kmers_0_snp = snp_0.snp_with_cdf(snp_0.mean_coverage())
Example #6
    return heading + e


def gyro_measurement_sim(yaw_rate, R):
    """ yaw rate in rad/s """
    e = np.sqrt(R) * np.random.randn(1)
    return yaw_rate + e
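
# example with hypothetical values: one noisy yaw-rate sample around 0.1 rad/s
# with measurement noise variance R = 1e-4
#   z = gyro_measurement_sim(0.1, 1e-4)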


if __name__ == '__main__':
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    data_path = os.path.join(dir_path, 'data')
    car_path = os.path.join(data_path, 'autonomous-car')

    # Simulate open loop sequence
    sim_def = data.load_obj(car_path + '/sim/sim_definition')
    x0 = sim_def["x0"]
    u = sim_def["u"]
    T = sim_def["T"]
    N = sim_def["N"]
    t = sim_def["t"]
    params = {"tire_model_func": tire_model}
    x = simulate.open_loop_sim(t, u, x0, vehicle_dynamics, params)

    # Reference location in ECEF
    lat0 = 37.4276
    lon0 = -122.1670
    h0 = 0
    p_ref_ECEF = utils.lla2ecef(np.array([lat0, lon0, h0]))

    # Load some data to get satellite positions