def preprocess_data(data_home, args, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'iso-8859-1')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    d2v = kwargs.get('d2v', False)
    adj_d2v = args.adj_d2v
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    if d2v:
        dump_name = ('doc2vec_win_' + str(args.d2vwindow) + '_dm_' + str(args.d2vdm)
                     + 'adj_d2v_' + str(adj_d2v * 1) + '_dump.pkl')
    else:
        dump_name = 'tfidf_dump.pkl'
    dump_file = os.path.join(data_home, dump_name)
    # model_args is a module-level namespace set when the script is run
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file ' + dump_name)
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data
    dl = DataLoader(data_home=data_home, bucket_size=bucket_size, encoding=encoding,
                    celebrity_threshold=celebrity_threshold, one_hot_labels=one_hot_label,
                    mindf=mindf, token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    if d2v:
        dl.doc2vec(args=args)
        X_train = dl.X_train_doc2vec
        X_test = dl.X_test_doc2vec
        X_dev = dl.X_dev_doc2vec
    else:
        dl.tfidf()
        X_train = dl.X_train
        X_dev = dl.X_dev
        X_test = dl.X_test
        vocab = dl.vectorizer.vocabulary_
        logging.info('saving vocab in {}'.format(vocab_file))
        dump_obj(vocab, vocab_file)
        logging.info('vocab dumped successfully!')
    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()
    if adj_d2v and args.doc2vec:
        adj = dl.adj_doc2vec
        G = nx.from_numpy_matrix(adj, parallel_edges=False, create_using=None)
    else:
        dl.get_graph()
        logging.info('creating adjacency matrix...')
        adj = nx.adjacency_matrix(dl.graph, nodelist=range(len(U_train + U_dev + U_test)), weight='w')
        G = dl.graph
    # converting the edge index to pytorch format:
    # an array of (source, target) pairs sorted by source, then by target
    edges = np.array(list(G.edges()))
    edges = edges[np.lexsort(np.fliplr(edges).T)]
    wadj = args.weighted_adjacency  # whether to weight the adjacency matrix
    if wadj:
        logging.info('multiplying weights...')
        w_adj_s = dl.adj_weight_d2v * adj
    else:
        w_adj_s = 0
    logging.info('adjacency matrix created.')
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}
    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]
    total_users = X_train.shape[0] + X_dev.shape[0] + X_test.shape[0]
    data = (adj, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test,
            classLatMedian, classLonMedian, userLocation, w_adj_s, edges, total_users)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')
    return data
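# A minimal, self-contained sketch of the edge-sorting step used above, for reference.
# np.lexsort treats the *last* key as the primary sort key, so flipping the
# (source, target) columns first makes the sort primary-by-source, secondary-by-target.
# The toy array below is purely illustrative.
import numpy as np

_edges = np.array([[3, 1], [0, 2], [3, 0], [0, 1]])
_sorted = _edges[np.lexsort(np.fliplr(_edges).T)]
# _sorted is [[0, 1], [0, 2], [3, 0], [3, 1]]:
# rows ordered by source node first, then by target node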
def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'utf-8')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    dtype = kwargs.get('dtype', 'float32')
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    dump_file = os.path.join(data_home, 'dump.pkl')
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file...')
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data
    dl = DataLoader(data_home=data_home, bucket_size=bucket_size, encoding=encoding,
                    celebrity_threshold=celebrity_threshold, one_hot_labels=one_hot_label,
                    mindf=mindf, token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    dl.tfidf()
    vocab = dl.vectorizer.vocabulary_
    logging.info('saving vocab in {}'.format(vocab_file))
    dump_obj(vocab, vocab_file)
    logging.info('vocab dumped successfully!')
    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()
    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph, nodelist=range(len(U_train + U_dev + U_test)), weight='w')
    # add unit self-loops, then apply symmetric renormalization: A = D^{-1/2} (adj + I) D^{-1/2}
    adj.setdiag(0)
    # selfloop_value = np.asarray(adj.sum(axis=1)).reshape(-1,)
    selfloop_value = 1
    adj.setdiag(selfloop_value)
    n, m = adj.shape
    diags = adj.sum(axis=1).flatten()
    with np.errstate(divide='ignore'):
        diags_sqrt = 1.0 / np.sqrt(diags)
    diags_sqrt[np.isinf(diags_sqrt)] = 0
    D_pow_neghalf = sp.sparse.spdiags(diags_sqrt, [0], m, n, format='csr')
    A = D_pow_neghalf * adj * D_pow_neghalf
    A = A.astype(dtype)
    logging.info('adjacency matrix created.')
    X_train = dl.X_train
    X_dev = dl.X_dev
    X_test = dl.X_test
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}
    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]
    data = (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test,
            classLatMedian, classLonMedian, userLocation)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')
    return data
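# A minimal standalone sketch of the symmetric renormalization performed above,
# i.e. A_norm = D^{-1/2} (A + I) D^{-1/2} with self-loop weight 1.
# The helper name and the toy matrix are illustrative, not part of the pipeline.
import numpy as np
import scipy.sparse as sparse

def _normalize_adjacency(adj, dtype='float32'):
    adj = sparse.csr_matrix(adj, dtype='float64')
    adj.setdiag(1)                                   # unit self-loops, as above
    degrees = np.asarray(adj.sum(axis=1)).flatten()  # row degrees including self-loop
    with np.errstate(divide='ignore'):
        d_inv_sqrt = 1.0 / np.sqrt(degrees)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0             # isolated nodes get weight 0
    D_pow_neghalf = sparse.spdiags(d_inv_sqrt, [0], adj.shape[0], adj.shape[1], format='csr')
    return (D_pow_neghalf * adj * D_pow_neghalf).astype(dtype)

# e.g. for a 3-node path graph:
# _A = _normalize_adjacency([[0, 1, 0], [1, 0, 1], [0, 1, 0]])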
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    (H, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test,
     classLatMedian, classLonMedian, userLocation) = data
    Y_dev = Y_dev.astype(dtypeint)
    Y_test = Y_test.astype(dtypeint)
    logging.info('stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype('int32')
    X = X.astype(dtype)
    H = H.astype(dtype)
    input_size = X.shape[1]
    output_size = np.max(Y) + 1
    train_indices = np.asarray(range(0, X_train.shape[0])).astype('int32')
    dev_indices = np.asarray(range(X_train.shape[0], X_train.shape[0] + X_dev.shape[0])).astype('int32')
    test_indices = np.asarray(range(X_train.shape[0] + X_dev.shape[0],
                                    X_train.shape[0] + X_dev.shape[0] + X_test.shape[0])).astype('int32')
    batch_size = min(batch_size, train_indices.shape[0])
    if args.dcca:
        logging.info('running deepcca...')
        deepcca = DeepCCA()
        deepcca.build(X.shape[1], H.shape[1], architecture=args.dccahid, regul_coef=args.dccareg,
                      dropout=dropout, lr=args.dccalr, batchnorm=args.dccabatchnorm, seed=model_args.seed)
        if args.dccareload:
            # for the big dataset use hickle as the serializer instead of cPickle
            if X.shape[0] > 1000000:
                loaded_args, params1, params2 = load_obj(args.dccareload, serializer=hickle)
            else:
                loaded_args, params1, params2 = load_obj(args.dccareload)
            logging.info(loaded_args)
            deepcca.set_params(params1, params2)
        else:
            deepcca.fit(V1=X, V2=H, train_indices=train_indices, val_indices=dev_indices,
                        test_indices=test_indices, n_epochs=500,
                        early_stopping_max_down=args.maxdown, batch_size=train_indices.shape[0])
        V1_cca, V2_cca, l_cca = deepcca.f_predict(X, H)
        should_run_cca_on_outputs = True
        if should_run_cca_on_outputs:
            # run linear CCA on the outputs of the two view MLPs
            A, B, mean1, mean2 = linear_cca(V1_cca, V2_cca, outdim_size=args.dccasize)
            V1_cca = V1_cca - mean1
            V2_cca = V2_cca - mean2
            V1_cca = np.dot(V1_cca, A)
            V2_cca = np.dot(V2_cca, B)
        X_cca = np.hstack((V1_cca, V2_cca)).astype(dtype)
    else:
        logging.info('No shared deepcca representation, just concatenation!')
        X_cca = sp.sparse.hstack([X, H]).astype(dtype).tocsr()
    stratified = False
    all_train_indices = train_indices
    fractions = args.lblfraction
    clf = MLPDense(input_sparse=sp.sparse.issparse(X_cca), in_size=X_cca.shape[1],
                   out_size=output_size, architecture=hidden_size, regul=regul,
                   dropout=dropout, lr=args.mlplr, batchnorm=args.mlpbatchnorm)
    clf.build(seed=model_args.seed)
    for percentile in fractions:
        logging.info('***********percentile %f ******************' % percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size = int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices, size=selection_size, replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen)
        else:
            selection_size = min(int(percentile * X.shape[0]), all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices, size=selection_size, replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        X_train = X_cca[train_indices, :]
        Y_train_chosen = Y_train[train_indices].astype('int32')
        X_dev = X_cca[dev_indices, :]
        X_test = X_cca[test_indices, :]
        if args.vis:
            draw_representations(X_train, Y_train_chosen, k=4, do_pca=True, filename=args.vis)
        if clf.fitted:
            # reset the network parameters if already fitted on another fraction
            clf.reset()
        clf.fit(X_train, Y_train_chosen, X_dev, Y_dev, n_epochs=1000,
                early_stopping_max_down=args.maxdown, verbose=not args.silent,
                batch_size=min(batch_size, train_indices.shape[0]), seed=model_args.seed)
        dev_pred = clf.predict(X_dev)
        test_pred = clf.predict(X_test)
        logging.info('Dev predictions')
        mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(
            Y_dev, dev_pred, U_dev, classLatMedian, classLonMedian, userLocation)
        pred_file = ('dcca_{}_percent_pred_{}.pkl'.format(percentile, output_size) if args.dcca
                     else 'concat_{}_percent_pred_{}.pkl'.format(percentile, output_size))
        with open(pred_file, 'wb') as fout:
            pickle.dump((distances, latlon_true, latlon_pred), fout)
        logging.info('Test predictions')
        geo_eval(Y_test, test_pred, U_test, classLatMedian, classLonMedian, userLocation)
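# A minimal sketch of the label-fraction sampling used in the training loops here and
# below: for each requested fraction, pick a random subset of the training rows,
# optionally stratified per class. The helper name is illustrative; seeding and dtype
# handling are simplified relative to the code above.
import numpy as np

def _sample_train_indices(all_train_indices, y_train, fraction, total_rows, stratified=False):
    if stratified:
        chosen = []
        for lbl in range(int(np.max(y_train)) + 1):
            lbl_indices = all_train_indices[y_train == lbl]
            size = int(fraction * len(lbl_indices)) + 1  # at least one sample per class
            chosen.append(np.random.choice(lbl_indices, size=size, replace=False))
        return np.hstack(chosen).astype('int32')
    size = min(int(fraction * total_rows), all_train_indices.shape[0])
    return np.random.choice(all_train_indices, size=size, replace=False).astype('int32')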
def main(data, args, **kwargs):
    batch_size = kwargs.get('batch', 500)
    hidden_size = kwargs.get('hidden', [100])
    dropout = kwargs.get('dropout', 0.0)
    regul = kwargs.get('regularization', 1e-6)
    dtype = 'float32'
    dtypeint = 'int32'
    check_percentiles = kwargs.get('percent', False)
    (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test,
     classLatMedian, classLonMedian, userLocation) = data
    logging.info('stacking training, dev and test features and creating indices...')
    X = sp.sparse.vstack([X_train, X_dev, X_test])
    if len(Y_train.shape) == 1:
        Y = np.hstack((Y_train, Y_dev, Y_test))
    else:
        Y = np.vstack((Y_train, Y_dev, Y_test))
    Y = Y.astype(dtypeint)
    X = X.astype(dtype)
    A = A.astype(dtype)
    if args.vis:
        from deepcca import draw_representations
        draw_representations(A.dot(X), Y, filename='gconv1.pdf')
        draw_representations(A.dot(A.dot(X)), Y, filename='gconv2.pdf')
    input_size = X.shape[1]
    output_size = np.max(Y) + 1
    verbose = not args.silent
    fractions = args.lblfraction
    stratified = False
    all_train_indices = np.asarray(range(0, X_train.shape[0])).astype(dtypeint)
    logging.info('running mlp with graph conv...')
    clf = GraphConv(input_size=input_size, output_size=output_size, hid_size_list=hidden_size,
                    regul_coef=regul, drop_out=dropout, batchnorm=args.batchnorm,
                    highway=model_args.highway)
    clf.build_model(A, use_text=args.notxt, use_labels=args.lp, seed=model_args.seed)
    for percentile in fractions:
        logging.info('***********percentile %f ******************' % percentile)
        model_file = './data/model-{}-{}.pkl'.format(A.shape[0], percentile)
        if stratified:
            all_chosen = []
            for lbl in range(0, np.max(Y_train) + 1):
                lbl_indices = all_train_indices[Y_train == lbl]
                selection_size = int(percentile * len(lbl_indices)) + 1
                lbl_chosen = np.random.choice(lbl_indices, size=selection_size, replace=False).astype(dtypeint)
                all_chosen.append(lbl_chosen)
            train_indices = np.hstack(all_chosen)
        else:
            selection_size = min(int(percentile * X.shape[0]), all_train_indices.shape[0])
            train_indices = np.random.choice(all_train_indices, size=selection_size, replace=False).astype(dtypeint)
        num_training_samples = train_indices.shape[0]
        logging.info('{} training samples'.format(num_training_samples))
        # train_indices = np.asarray(range(0, int(percentile * X_train.shape[0]))).astype(dtypeint)
        dev_indices = np.asarray(range(X_train.shape[0], X_train.shape[0] + X_dev.shape[0])).astype(dtypeint)
        test_indices = np.asarray(range(X_train.shape[0] + X_dev.shape[0],
                                        X_train.shape[0] + X_dev.shape[0] + X_test.shape[0])).astype(dtypeint)
        if args.load:
            # do not train, load a previously saved model
            report_results = False
            clf.load(load_obj, model_file)
        else:
            # reset the network parameters if already fitted with another data
            if clf.fitted:
                clf.reset()
            clf.fit(X, A, Y, train_indices=train_indices, val_indices=dev_indices, n_epochs=10000,
                    batch_size=batch_size, max_down=args.maxdown, verbose=verbose, seed=model_args.seed)
            if args.save:
                clf.save(dump_obj, model_file)
        logging.info('dev results:')
        y_pred, _ = clf.predict(X, A, dev_indices)
        mean, median, acc, distances, latlon_true, latlon_pred = geo_eval(
            Y_dev, y_pred, U_dev, classLatMedian, classLonMedian, userLocation)
        with open('gcn_{}_percent_pred_{}.pkl'.format(percentile, output_size), 'wb') as fout:
            pickle.dump((distances, latlon_true, latlon_pred), fout)
        logging.info('test results:')
        y_pred, _ = clf.predict(X, A, test_indices)
        geo_eval(Y_test, y_pred, U_test, classLatMedian, classLonMedian, userLocation)
        if args.feature_report:
            vocab_file = os.path.join(args.dir, 'vocab.pkl')
            if not os.path.exists(vocab_file):
                logging.error('vocab file {} not found'.format(vocab_file))
                return
            else:
                vocab = load_obj(vocab_file)
                logging.info('{} vocab loaded from file'.format(len(vocab)))
            train_vocab = set([term for term, count in
                               Counter(np.nonzero(X[train_indices])[1]).items() if count >= 10])
            dev_vocab = set(np.nonzero(X[dev_indices].sum(axis=0))[1])
            X_onehot = sp.sparse.diags([1] * len(vocab), dtype=dtype)
            A_onehot = X_onehot
            feature_report(clf, vocab, X_onehot, A_onehot, classLatMedian, classLonMedian,
                           train_vocab, dev_vocab, topk=200, dtypeint=dtypeint)
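# GraphConv's internals are not shown in this file; the sketch below illustrates the
# standard propagation rule its layers are assumed to follow, H' = relu(A_norm @ H @ W),
# where A_norm is the renormalized adjacency built in preprocess_data. The function
# name and arguments are illustrative only.
import numpy as np
import scipy.sparse as sparse

def _graph_conv_layer(A_norm, H, W):
    # A_norm: sparse (n, n); H: node features (n, d_in); W: dense weights (d_in, d_out)
    Z = A_norm.dot(H).dot(W)            # aggregate neighbours, then project
    return np.maximum(np.asarray(Z), 0.0)  # ReLU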
def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'iso-8859-1')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    d2v = kwargs.get('d2v', False)
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    if d2v:
        dump_name = 'doc2vec_dump.pkl'
    else:
        dump_name = 'dump.pkl'
    dump_file = os.path.join(data_home, dump_name)
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file ' + dump_name)
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data
    dl = DataLoader(data_home=data_home, bucket_size=bucket_size, encoding=encoding,
                    celebrity_threshold=celebrity_threshold, one_hot_labels=one_hot_label,
                    mindf=mindf, token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    if d2v:
        dl.doc2vec()
        X_train = dl.X_train_doc2vec
        X_test = dl.X_test_doc2vec
        X_dev = dl.X_dev_doc2vec
    else:
        dl.tfidf()
        X_train = dl.X_train
        X_dev = dl.X_dev
        X_test = dl.X_test
        vocab = dl.vectorizer.vocabulary_
        logging.info('saving vocab in {}'.format(vocab_file))
        dump_obj(vocab, vocab_file)
        logging.info('vocab dumped successfully!')
    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()
    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph, nodelist=range(len(U_train + U_dev + U_test)), weight='w')
    G = dl.graph
    # converting the edge index to pytorch format:
    # make the edge list symmetric (both directions present) and sort it
    edges = list(G.edges)
    edges_test_hash = set(edges)  # set membership makes the reverse-edge lookup fast
    for item in list(edges):      # iterate over a copy so appended edges are not revisited
        swapped = (item[1], item[0])
        if swapped not in edges_test_hash:
            edges.append(swapped)
    edges = sorted(edges)
    edges = np.array(edges)
    logging.info('adjacency matrix created.')
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}
    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]
    data = (adj, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev, U_test,
            classLatMedian, classLonMedian, userLocation, edges)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')
    return data
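# The comment above mentions "pytorch format", but the downstream PyTorch model is not
# shown here. A common convention (e.g. PyTorch Geometric's edge_index) is a
# (2, num_edges) LongTensor of [sources; targets]; the sketch below assumes that format
# and the helper name is illustrative.
import numpy as np
import torch

def _edges_to_edge_index(edges):
    # edges: (num_edges, 2) array of (source, target) pairs, already symmetrized and sorted
    return torch.from_numpy(np.asarray(edges, dtype=np.int64)).t().contiguous()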