Example #1
def preprocess_data(data_home, args, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'iso-8859-1')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    d2v = kwargs.get('d2v', False)
    adj_d2v = args.adj_d2v

    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    if d2v:
        dump_name = 'doc2vec_win_' + str(args.d2vwindow) + '_dm_' + str(args.d2vdm) + '_adj_d2v_' + str(adj_d2v*1) + '_dump.pkl'
    else:
        dump_name = 'tfidf_dump.pkl'
    dump_file = os.path.join(data_home, dump_name)
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file ' + dump_name)
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

    dl = DataLoader(data_home=data_home, bucket_size=bucket_size, encoding=encoding,
                    celebrity_threshold=celebrity_threshold, one_hot_labels=one_hot_label, mindf=mindf,
                    token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')

    dl.load_data()
    dl.assignClasses()
    if d2v:
        dl.doc2vec(args=args)
        X_train = dl.X_train_doc2vec
        X_test = dl.X_test_doc2vec
        X_dev = dl.X_dev_doc2vec
    else:
        dl.tfidf()
        X_train = dl.X_train
        X_dev = dl.X_dev
        X_test = dl.X_test
        vocab = dl.vectorizer.vocabulary_
        logging.info('saving vocab in {}'.format(vocab_file))
        dump_obj(vocab, vocab_file)
        logging.info('vocab dumped successfully!')

    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()

    if adj_d2v and args.doc2vec:
        adj = dl.adj_doc2vec
        G = nx.from_numpy_matrix(adj, parallel_edges=False, create_using=None)
    else:
        dl.get_graph()
        logging.info('creating adjacency matrix...')
        adj = nx.adjacency_matrix(dl.graph, nodelist=range(len(U_train + U_dev + U_test)), weight='w')
        G = dl.graph

    # converting the edges index to pytorch format
    edges = np.array(list(G.edges()))
    edges = edges[np.lexsort(np.fliplr(edges).T)]
    wadj = args.weighted_adjacency  # whether to weight the adjacency matrix
    if wadj:
        logging.info('multiplying weights...')
        w_adj_s = dl.adj_weight_d2v * adj
    else:
        w_adj_s = 0

    logging.info('adjacency matrix created.')
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}

    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]

    total_users = X_train.shape[0] + X_dev.shape[0] + X_test.shape[0]

    data = (adj, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation, w_adj_s, edges, total_users)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')

    return data
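
All of these examples cache their output through load_obj and dump_obj helpers that are not shown on this page. A minimal sketch of what they could look like, assuming plain pickle serialization (Example 3 hints that very large objects may go through hickle instead, so the real helpers likely differ):

import pickle

def dump_obj(obj, filename, protocol=pickle.HIGHEST_PROTOCOL):
    # serialize obj to filename with pickle
    with open(filename, 'wb') as fout:
        pickle.dump(obj, fout, protocol=protocol)

def load_obj(filename):
    # load a previously pickled object from filename
    with open(filename, 'rb') as fin:
        return pickle.load(fin)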
Example #2
def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'utf-8')
    celebrity_threshold = kwargs.get('celebrity', 10)  
    mindf = kwargs.get('mindf', 10)
    dtype = kwargs.get('dtype', 'float32')
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    dump_file = os.path.join(data_home, 'dump.pkl')
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file...')
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

    dl = DataLoader(data_home=data_home, bucket_size=bucket_size, encoding=encoding, 
                    celebrity_threshold=celebrity_threshold, one_hot_labels=one_hot_label, mindf=mindf, token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    dl.tfidf()
    vocab = dl.vectorizer.vocabulary_
    logging.info('saving vocab in {}'.format(vocab_file))
    dump_obj(vocab, vocab_file)
    logging.info('vocab dumped successfully!')
    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()    

    dl.get_graph()  
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph, nodelist=range(len(U_train + U_dev + U_test)), weight='w')
    
    # symmetric normalization: unit self-loops, then D^-1/2 (A + I) D^-1/2
    adj.setdiag(0)
    #selfloop_value = np.asarray(adj.sum(axis=1)).reshape(-1,)
    selfloop_value = 1
    adj.setdiag(selfloop_value)
    n, m = adj.shape
    diags = adj.sum(axis=1).flatten()
    with np.errstate(divide='ignore'):
        diags_sqrt = 1.0 / np.sqrt(diags)
    diags_sqrt[np.isinf(diags_sqrt)] = 0
    D_pow_neghalf = sp.sparse.spdiags(diags_sqrt, [0], m, n, format='csr')
    A = D_pow_neghalf * adj * D_pow_neghalf
    A = A.astype(dtype)
    logging.info('adjacency matrix created.')

    X_train = dl.X_train
    X_dev = dl.X_dev
    X_test = dl.X_test
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}

    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]
    
    data = (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')

    return data
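
The block in the middle of this example builds the renormalized adjacency used by the graph convolution: unit self-loops are set on the diagonal and the result is scaled as D^-1/2 (A + I) D^-1/2. A self-contained sketch of that step on a toy graph (the function name is illustrative, not part of the original code):

import numpy as np
import scipy.sparse as sparse

def normalize_adjacency(adj, dtype='float32'):
    # unit self-loops, then symmetric normalization D^-1/2 (A + I) D^-1/2
    adj = sparse.csr_matrix(adj)
    adj.setdiag(1)
    degrees = np.asarray(adj.sum(axis=1)).flatten()
    with np.errstate(divide='ignore'):
        d_inv_sqrt = 1.0 / np.sqrt(degrees)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0
    n, m = adj.shape
    D_pow_neghalf = sparse.spdiags(d_inv_sqrt, [0], m, n, format='csr')
    return (D_pow_neghalf * adj * D_pow_neghalf).astype(dtype)

# toy 3-node chain graph: 0-1-2
A = sparse.csr_matrix(np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]]))
print(normalize_adjacency(A).toarray())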
Example #3
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    H, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation = data
    Y_dev = Y_dev.astype(dtypeint)
    Y_test = Y_test.astype(dtypeint)
    logging.info('stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype('int32')
    X = X.astype(dtype)
    H = H.astype(dtype)
    input_size = X.shape[1]
    output_size = np.max(Y) + 1
    

    train_indices = np.asarray(range(0, X_train.shape[0])).astype('int32')
    dev_indices = np.asarray(range(X_train.shape[0], X_train.shape[0] + X_dev.shape[0])).astype('int32')
    test_indices = np.asarray(range(X_train.shape[0] + X_dev.shape[0], X_train.shape[0] + X_dev.shape[0] + X_test.shape[0])).astype('int32')
    batch_size = min(batch_size, train_indices.shape[0])
    if args.dcca:
        logging.info('running deepcca...')
        deepcca = DeepCCA()
        deepcca.build(X.shape[1], H.shape[1], architecture=args.dccahid, regul_coef=args.dccareg, dropout=dropout, lr=args.dccalr, batchnorm=args.dccabatchnorm, seed=model_args.seed)
        if args.dccareload:
            #for the big dataset use pickle instead of cPickle
            if X.shape[0] > 1000000:
                loaded_args, params1, params2 = load_obj(args.dccareload, serializer=hickle)
            else:
                loaded_args, params1, params2 = load_obj(args.dccareload)
            logging.info(loaded_args)
            deepcca.set_params(params1, params2)
            
        else:
            deepcca.fit(V1=X, V2=H, train_indices=train_indices, val_indices=dev_indices, test_indices=test_indices, n_epochs=500, 
                        early_stopping_max_down=args.maxdown, batch_size=train_indices.shape[0])
        V1_cca, V2_cca, l_cca = deepcca.f_predict(X, H)
        should_run_cca_on_outputs = True
        if should_run_cca_on_outputs:
            #run linear cca on the outputs of mlp
            A, B, mean1, mean2 = linear_cca(V1_cca, V2_cca, outdim_size=args.dccasize)
            V1_cca = V1_cca - mean1
            V2_cca = V2_cca - mean2
            V1_cca = np.dot(V1_cca, A)
            V2_cca = np.dot(V2_cca, B)
        X_cca = np.hstack((V1_cca, V2_cca)).astype(dtype)
    else:
        logging.info('No shared deepcca representation, just concatenation!')
        X_cca = sp.sparse.hstack([X, H]).astype(dtype).tocsr()
    stratified = False
    all_train_indices = train_indices
    fractions = args.lblfraction
    clf = MLPDense(input_sparse=sp.sparse.issparse(X_cca), in_size=X_cca.shape[1], out_size=output_size,
                   architecture=hidden_size, regul=regul, dropout=dropout, lr=args.mlplr,
                   batchnorm=args.mlpbatchnorm)
    clf.build(seed=model_args.seed)

    for percentile in fractions:
        logging.info('***********percentile %f ******************' %percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1): 
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size =  int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices, size=selection_size, replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen) 
        else:
            selection_size = min(int(percentile * X.shape[0]), all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices, size=selection_size, replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        X_train = X_cca[train_indices, :]
        Y_train_chosen = Y_train[train_indices].astype('int32')
        X_dev = X_cca[dev_indices, :]
        X_test = X_cca[test_indices, :]
        if args.vis:
            draw_representations(X_train, Y_train_chosen, k=4, do_pca=True, filename=args.vis)
        if clf.fitted:
            clf.reset()
        clf.fit(X_train, Y_train_chosen, X_dev, Y_dev, n_epochs=1000, early_stopping_max_down=args.maxdown, verbose=not args.silent, batch_size=min(batch_size, train_indices.shape[0]), seed=model_args.seed)
        dev_pred = clf.predict(X_dev)
        test_pred = clf.predict(X_test)
        logging.info('Dev predictions')
        mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(Y_dev, dev_pred, U_dev, classLatMedian, classLonMedian, userLocation) 
        pred_prefix = 'dcca' if args.dcca else 'concat'
        pred_file = '{}_{}_percent_pred_{}.pkl'.format(pred_prefix, percentile, output_size)
        with open(pred_file, 'wb') as fout:
            pickle.dump((distances, latlon_true, latlon_pred), fout)
        logging.info('Test predictions')
        geo_eval(Y_test, test_pred, U_test, classLatMedian, classLonMedian, userLocation)
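
The percentile loop above subsamples the training portion of the stacked feature matrix: for each fraction it draws at most that share of the total rows from the training indices, without replacement, and never more than the number of training rows available. A small standalone illustration (the function name and toy sizes are illustrative only):

import numpy as np

def sample_train_indices(n_train, n_total, fraction):
    # draw min(fraction * n_total, n_train) training rows without replacement
    all_train_indices = np.arange(n_train, dtype='int32')
    selection_size = min(int(fraction * n_total), n_train)
    return np.random.choice(all_train_indices, size=selection_size, replace=False).astype('int32')

# e.g. 5000 training rows in a 10000-row stacked matrix at a 1% label fraction
print(sample_train_indices(5000, 10000, 0.01).shape)   # -> (100,)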
Example #4
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation = data
    logging.info('stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype(dtypeint)
    X = X.astype(dtype)
    A = A.astype(dtype)
    if args.vis:
        from deepcca import draw_representations
        draw_representations(A.dot(X), Y, filename='gconv1.pdf')
        draw_representations(A.dot(A.dot(X)), Y, filename='gconv2.pdf')
    input_size = X.shape[1]
    output_size = np.max(Y) + 1
    verbose = not args.silent
    fractions = args.lblfraction
    stratified = False
    all_train_indices = np.asarray(range(0, X_train.shape[0])).astype(dtypeint)
    logging.info('running mlp with graph conv...')
    clf = GraphConv(input_size=input_size, output_size=output_size, hid_size_list=hidden_size, regul_coef=regul, drop_out=dropout, batchnorm=args.batchnorm, highway=model_args.highway)
    clf.build_model(A, use_text=args.notxt, use_labels=args.lp, seed=model_args.seed)

    for percentile in fractions:
        logging.info('***********percentile %f ******************' %percentile)
        model_file = './data/model-{}-{}.pkl'.format(A.shape[0], percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size =  int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices, size=selection_size, replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen) 
        else:
            selection_size = min(int(percentile * X.shape[0]), all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices, size=selection_size, replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        #train_indices = np.asarray(range(0, int(percentile * X_train.shape[0]))).astype(dtypeint)
        dev_indices = np.asarray(range(X_train.shape[0], X_train.shape[0] + X_dev.shape[0])).astype(dtypeint)
        test_indices = np.asarray(range(X_train.shape[0] + X_dev.shape[0], X_train.shape[0] + X_dev.shape[0] + X_test.shape[0])).astype(dtypeint)
        # do not train, load
        if args.load:
            report_results = False
            clf.load(load_obj, model_file)
        else:
            #reset the network parameters if already fitted with another data
            if clf.fitted:
                clf.reset()
            clf.fit(X, A, Y, train_indices=train_indices, val_indices=dev_indices, n_epochs=10000, batch_size=batch_size, max_down=args.maxdown, verbose=verbose, seed=model_args.seed)
            if args.save:
                clf.save(dump_obj, model_file)

            logging.info('dev results:')
            y_pred, _ = clf.predict(X, A, dev_indices)
            mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(Y_dev, y_pred, U_dev, classLatMedian, classLonMedian, userLocation)
            with open('gcn_{}_percent_pred_{}.pkl'.format(percentile, output_size), 'wb') as fout:
                pickle.dump((distances, latlon_true, latlon_pred), fout)
            logging.info('test results:')
            y_pred, _ = clf.predict(X, A, test_indices)
            geo_eval(Y_test, y_pred, U_test, classLatMedian, classLonMedian, userLocation)

    if args.feature_report:
        vocab_file = os.path.join(args.dir, 'vocab.pkl')
        if not os.path.exists(vocab_file):
            logging.error('vocab file {} not found'.format(vocab_file))
            return
        else:
            vocab = load_obj(vocab_file)
        logging.info('{} vocab loaded from file'.format(len(vocab)))
        train_vocab = set([term for term, count in Counter(np.nonzero(X[train_indices])[1]).items() if count >= 10])
        dev_vocab = set(np.nonzero(X[dev_indices].sum(axis=0))[1])
        X_onehot = sp.sparse.diags([1] * len(vocab), dtype=dtype)
        A_onehot = X_onehot
        feature_report(clf, vocab, X_onehot, A_onehot, classLatMedian, classLonMedian, train_vocab, dev_vocab, topk=200, dtypeint=dtypeint)
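
The feature_report branch above filters the vocabulary by usage: column indices that are non-zero in at least 10 training rows become train_vocab, and every column touched by a dev row becomes dev_vocab. A self-contained sketch of that filtering on toy sparse data (split_vocab is an illustrative name, not part of the original code):

import numpy as np
import scipy.sparse as sparse
from collections import Counter

def split_vocab(X, train_indices, dev_indices, min_count=10):
    # vocab ids that occur in at least min_count training rows
    train_counts = Counter(np.nonzero(X[train_indices])[1])
    train_vocab = {term for term, count in train_counts.items() if count >= min_count}
    # every vocab id touched by at least one dev row
    dev_vocab = set(np.nonzero(X[dev_indices].sum(axis=0))[1])
    return train_vocab, dev_vocab

X = sparse.random(200, 50, density=0.3, format='csr', random_state=0)
train_vocab, dev_vocab = split_vocab(X, np.arange(0, 150), np.arange(150, 200))
print(len(train_vocab), len(dev_vocab))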
Example #5
def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'iso-8859-1')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    d2v = kwargs.get('d2v', False)

    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    if d2v:
        dump_name = 'doc2vec_dump.pkl'
    else:
        dump_name = 'dump.pkl'
    dump_file = os.path.join(data_home, dump_name)
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file ' + dump_name)
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

    dl = DataLoader(data_home=data_home,
                    bucket_size=bucket_size,
                    encoding=encoding,
                    celebrity_threshold=celebrity_threshold,
                    one_hot_labels=one_hot_label,
                    mindf=mindf,
                    token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')

    dl.load_data()
    dl.assignClasses()
    if d2v:
        dl.doc2vec()
        X_train = dl.X_train_doc2vec
        X_test = dl.X_test_doc2vec
        X_dev = dl.X_dev_doc2vec
    else:
        dl.tfidf()
        X_train = dl.X_train
        X_dev = dl.X_dev
        X_test = dl.X_test
        vocab = dl.vectorizer.vocabulary_
        logging.info('saving vocab in {}'.format(vocab_file))
        dump_obj(vocab, vocab_file)
        logging.info('vocab dumped successfully!')

    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()

    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph,
                              nodelist=range(len(U_train + U_dev + U_test)),
                              weight='w')
    G = dl.graph
    # converting the edges index to pytorch format

    edges = list(G.edges)
    edges_test_hash = set(edges)  # used to make search faster

    # add the reverse direction for every edge that is not already present
    for item in list(edges):
        swapped = (item[1], item[0])
        if swapped not in edges_test_hash:
            edges.append(swapped)

    edges = sorted(edges)
    edges = np.array(edges)

    logging.info('adjacency matrix created.')

    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {
        str(c): dl.cluster_median[c][0]
        for c in dl.cluster_median
    }
    classLonMedian = {
        str(c): dl.cluster_median[c][1]
        for c in dl.cluster_median
    }

    P_test = [
        str(a[0]) + ',' + str(a[1])
        for a in dl.df_test[['lat', 'lon']].values.tolist()
    ]
    P_train = [
        str(a[0]) + ',' + str(a[1])
        for a in dl.df_train[['lat', 'lon']].values.tolist()
    ]
    P_dev = [
        str(a[0]) + ',' + str(a[1])
        for a in dl.df_dev[['lat', 'lon']].values.tolist()
    ]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]

    data = (adj, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train,
            U_dev, U_test, classLatMedian, classLonMedian, userLocation, edges)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')

    return data
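
The edge handling in this example makes the edge list symmetric before sorting it, and the inline comment suggests the resulting array is later turned into a PyTorch edge index. A compact, self-contained sketch of the same steps on a toy graph (the torch conversion is only an assumption based on that comment):

import numpy as np
import networkx as nx

def symmetric_edge_array(G):
    # both directions of every undirected edge, sorted, as an (E, 2) array
    edges = set(G.edges())
    edges |= {(v, u) for u, v in edges}
    return np.array(sorted(edges))

G = nx.path_graph(4)   # toy graph: 0-1-2-3
edges = symmetric_edge_array(G)
print(edges)
# edge_index = torch.as_tensor(edges.T, dtype=torch.long)   # if torch is available (assumption)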