def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'utf-8')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    dtype = kwargs.get('dtype', 'float32')
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    dump_file = os.path.join(data_home, 'dump.pkl')
    # reuse a previously dumped dataset unless a rebuild was requested
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file...')
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

    dl = DataLoader(data_home=data_home, bucket_size=bucket_size, encoding=encoding,
                    celebrity_threshold=celebrity_threshold, one_hot_labels=one_hot_label,
                    mindf=mindf, token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    dl.tfidf()
    vocab = dl.vectorizer.vocabulary_
    logging.info('saving vocab in {}'.format(vocab_file))
    dump_obj(vocab, vocab_file)
    logging.info('vocab dumped successfully!')

    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()

    # build the user graph and its symmetrically normalised adjacency matrix
    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph,
                              nodelist=xrange(len(U_train + U_dev + U_test)),
                              weight='w')
    adj.setdiag(0)
    #selfloop_value = np.asarray(adj.sum(axis=1)).reshape(-1,)
    selfloop_value = 1
    adj.setdiag(selfloop_value)
    n, m = adj.shape
    diags = adj.sum(axis=1).flatten()
    with sp.errstate(divide='ignore'):
        diags_sqrt = 1.0 / sp.sqrt(diags)
    diags_sqrt[sp.isinf(diags_sqrt)] = 0
    D_pow_neghalf = sp.sparse.spdiags(diags_sqrt, [0], m, n, format='csr')
    # A = D^{-1/2} (adj + I) D^{-1/2}
    A = D_pow_neghalf * adj * D_pow_neghalf
    A = A.astype(dtype)
    logging.info('adjacency matrix created.')

    X_train = dl.X_train
    X_dev = dl.X_dev
    X_test = dl.X_test
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {str(c): dl.cluster_median[c][0] for c in dl.cluster_median}
    classLonMedian = {str(c): dl.cluster_median[c][1] for c in dl.cluster_median}

    # 'lat,lon' strings per user, keyed by user id in userLocation
    P_test = [str(a[0]) + ',' + str(a[1]) for a in dl.df_test[['lat', 'lon']].values.tolist()]
    P_train = [str(a[0]) + ',' + str(a[1]) for a in dl.df_train[['lat', 'lon']].values.tolist()]
    P_dev = [str(a[0]) + ',' + str(a[1]) for a in dl.df_dev[['lat', 'lon']].values.tolist()]
    userLocation = {}
    for i, u in enumerate(U_train):
        userLocation[u] = P_train[i]
    for i, u in enumerate(U_test):
        userLocation[u] = P_test[i]
    for i, u in enumerate(U_dev):
        userLocation[u] = P_dev[i]

    data = (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test,
            U_train, U_dev, U_test, classLatMedian, classLonMedian, userLocation)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')
    return data
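# The normalisation above computes D^{-1/2} (adj + I) D^{-1/2}. Below is a minimal,
# self-contained sketch of that single step for reference; the helper name
# `_normalize_adjacency_example` and the use of scipy.sparse.diags are illustrative
# choices, not part of the original pipeline. It assumes the module-level
# numpy (np) and scipy (sp) imports used elsewhere in this file.
def _normalize_adjacency_example(adj, dtype='float32'):
    """Symmetrically normalise a scipy.sparse adjacency matrix with unit self-loops."""
    adj = adj.tolil()
    adj.setdiag(1)  # unit self-loops, as in preprocess_data above
    adj = adj.tocsr()
    degrees = np.asarray(adj.sum(axis=1)).flatten()
    with np.errstate(divide='ignore'):
        d_inv_sqrt = 1.0 / np.sqrt(degrees)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.0
    D_pow_neghalf = sp.sparse.diags(d_inv_sqrt)
    return (D_pow_neghalf * adj * D_pow_neghalf).astype(dtype)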
def fit(self, V1, V2, train_indices, val_indices, test_indices,
        n_epochs=10, early_stopping_max_down=3, batch_size=1000):
    best_params1 = None
    best_params2 = None
    best_val_loss = sys.maxint
    n_validation_down = 0
    V1_train = V1[train_indices, :]
    V2_train = V2[train_indices, :]
    V1_dev = V1[val_indices, :]
    V2_dev = V2[val_indices, :]
    logging.info('training with batch size {}'.format(batch_size))
    for n in xrange(n_epochs):
        l_train = []
        for batch in iterate_minibatches(V1_train, V2_train, batch_size, shuffle=False):
            l_train.append(self.f_train(batch[0], batch[1]))
        l_train = np.mean(l_train)
        l_val = self.f_val(V1_dev, V2_dev).item()
        # after k iterations the improvement should be higher than `improvement`
        k = 100
        improvement = 1.0
        if (l_val < best_val_loss and n < k) or (l_val < best_val_loss - improvement):
            best_val_loss = l_val
            best_params1 = lasagne.layers.get_all_param_values(self.l_out_view1)
            best_params2 = lasagne.layers.get_all_param_values(self.l_out_view2)
            n_validation_down = 0
        else:
            # early stopping counter
            n_validation_down += 1
        logging.info('epoch {} train loss {:.2f} val loss {:.2f} numdown {}'.format(
            n, l_train, l_val, n_validation_down))
        if n_validation_down > early_stopping_max_down:
            logging.info('validation results went down. early stopping ...')
            break
    # restore the parameters that achieved the best validation loss
    lasagne.layers.set_all_param_values(self.l_out_view1, best_params1)
    lasagne.layers.set_all_param_values(self.l_out_view2, best_params2)
    logging.info('***************** final results based on best validation **************')
    V1_test, V2_test, l_test = self.f_predict(V1[test_indices], V2[test_indices])
    logging.info('test loss:{}'.format(l_test))
    filename = 'deepcca-{}-{}'.format(train_indices.shape[0], str(datetime.now()))
    logging.info('dumping deepcca params in {} '.format(filename))
    if V1.shape[0] > 1000000:
        dump_obj((str(model_args), best_params1, best_params2), filename, serializer=hickle)
    else:
        dump_obj((model_args, best_params1, best_params2), filename)
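# `fit` above relies on an `iterate_minibatches` helper defined elsewhere in the
# repository. A minimal sketch of such a helper is given below for reference; the
# name `_example_iterate_minibatches` is hypothetical and the body is assumed, not
# taken from the original code. It yields aligned slices of the two views,
# optionally in shuffled order, dropping any trailing partial batch. It assumes
# the module-level numpy (np) import used by the functions above.
def _example_iterate_minibatches(view1, view2, batchsize, shuffle=False):
    assert view1.shape[0] == view2.shape[0]
    indices = np.arange(view1.shape[0])
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, view1.shape[0] - batchsize + 1, batchsize):
        excerpt = indices[start:start + batchsize]
        yield view1[excerpt], view2[excerpt]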