Example #1
import os
import logging

import numpy as np
import networkx as nx
import scipy as sp
import scipy.sparse  # make sp.sparse available

# load_obj, dump_obj, DataLoader, and model_args are project-level
# helpers/globals defined elsewhere in the repository.


def preprocess_data(data_home, **kwargs):
    bucket_size = kwargs.get('bucket', 300)
    encoding = kwargs.get('encoding', 'utf-8')
    celebrity_threshold = kwargs.get('celebrity', 10)
    mindf = kwargs.get('mindf', 10)
    dtype = kwargs.get('dtype', 'float32')
    one_hot_label = kwargs.get('onehot', False)
    vocab_file = os.path.join(data_home, 'vocab.pkl')
    dump_file = os.path.join(data_home, 'dump.pkl')
    if os.path.exists(dump_file) and not model_args.builddata:
        logging.info('loading data from dumped file...')
        data = load_obj(dump_file)
        logging.info('loading data finished!')
        return data

    dl = DataLoader(data_home=data_home,
                    bucket_size=bucket_size,
                    encoding=encoding,
                    celebrity_threshold=celebrity_threshold,
                    one_hot_labels=one_hot_label,
                    mindf=mindf,
                    token_pattern=r'(?u)(?<![@])#?\b\w\w+\b')
    dl.load_data()
    dl.assignClasses()
    dl.tfidf()
    vocab = dl.vectorizer.vocabulary_
    logging.info('saving vocab in {}'.format(vocab_file))
    dump_obj(vocab, vocab_file)
    logging.info('vocab dumped successfully!')
    U_test = dl.df_test.index.tolist()
    U_dev = dl.df_dev.index.tolist()
    U_train = dl.df_train.index.tolist()

    dl.get_graph()
    logging.info('creating adjacency matrix...')
    adj = nx.adjacency_matrix(dl.graph,
                              nodelist=range(len(U_train + U_dev + U_test)),
                              weight='w')

    # Zero out any existing diagonal, then add unit self-loops.
    adj.setdiag(0)
    #selfloop_value = np.asarray(adj.sum(axis=1)).reshape(-1,)
    selfloop_value = 1
    adj.setdiag(selfloop_value)
    n, m = adj.shape
    # Symmetric normalization: A = D^-1/2 * adj * D^-1/2, where D is the
    # diagonal degree matrix (zero-degree rows are mapped to zero).
    diags = adj.sum(axis=1).flatten()
    with np.errstate(divide='ignore'):
        diags_sqrt = 1.0 / np.sqrt(diags)
    diags_sqrt[np.isinf(diags_sqrt)] = 0
    D_pow_neghalf = sp.sparse.spdiags(diags_sqrt, [0], m, n, format='csr')
    A = D_pow_neghalf * adj * D_pow_neghalf
    A = A.astype(dtype)
    logging.info('adjacency matrix created.')

    X_train = dl.X_train
    X_dev = dl.X_dev
    X_test = dl.X_test
    Y_test = dl.test_classes
    Y_train = dl.train_classes
    Y_dev = dl.dev_classes
    classLatMedian = {
        str(c): dl.cluster_median[c][0]
        for c in dl.cluster_median
    }
    classLonMedian = {
        str(c): dl.cluster_median[c][1]
        for c in dl.cluster_median
    }

    P_test = [
        '{},{}'.format(lat, lon)
        for lat, lon in dl.df_test[['lat', 'lon']].values.tolist()
    ]
    P_train = [
        '{},{}'.format(lat, lon)
        for lat, lon in dl.df_train[['lat', 'lon']].values.tolist()
    ]
    P_dev = [
        '{},{}'.format(lat, lon)
        for lat, lon in dl.df_dev[['lat', 'lon']].values.tolist()
    ]
    # Map each user to their 'lat,lon' string.
    userLocation = {}
    userLocation.update(zip(U_train, P_train))
    userLocation.update(zip(U_test, P_test))
    userLocation.update(zip(U_dev, P_dev))

    data = (A, X_train, Y_train, X_dev, Y_dev, X_test, Y_test, U_train, U_dev,
            U_test, classLatMedian, classLonMedian, userLocation)
    if not model_args.builddata:
        logging.info('dumping data in {} ...'.format(str(dump_file)))
        dump_obj(data, dump_file)
        logging.info('data dump finished!')

    return data
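
The block that builds `A` above is the standard GCN-style symmetric normalization, A = D^-1/2 (adj + I) D^-1/2. As a sanity check, here is a minimal, self-contained sketch of the same computation on a toy 3-node graph (all names below are illustrative, not part of the original code):

import numpy as np
import scipy.sparse

# Toy path graph 0-1-2, a stand-in for dl.graph above.
adj = scipy.sparse.csr_matrix(
    np.array([[0., 1., 0.],
              [1., 0., 1.],
              [0., 1., 0.]]))
adj.setdiag(1)  # unit self-loops, as in preprocess_data

degrees = np.asarray(adj.sum(axis=1)).flatten()
with np.errstate(divide='ignore'):
    d_inv_sqrt = 1.0 / np.sqrt(degrees)
d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0  # guard isolated nodes
D_pow_neghalf = scipy.sparse.diags(d_inv_sqrt, format='csr')

A = D_pow_neghalf @ adj @ D_pow_neghalf
# Each entry is adj[i, j] / sqrt(deg(i) * deg(j)); e.g.
# A[0, 1] == 1 / sqrt(2 * 3) ~= 0.408.
print(A.toarray())
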
Example #2
    # Method of a deep-CCA model class; assumes numpy as np, lasagne,
    # hickle, logging, datetime, and the project-level helpers
    # (model_args, dump_obj, iterate_minibatches) are in scope.
    def fit(self,
            V1,
            V2,
            train_indices,
            val_indices,
            test_indices,
            n_epochs=10,
            early_stopping_max_down=3,
            batch_size=1000):
        best_params1 = None
        best_params2 = None
        best_val_loss = float('inf')
        n_validation_down = 0
        V1_train = V1[train_indices, :]
        V2_train = V2[train_indices, :]
        V1_dev = V1[val_indices, :]
        V2_dev = V2[val_indices, :]
        logging.info('training with batch size {}'.format(batch_size))
        for n in range(n_epochs):
            l_train = []
            for batch in iterate_minibatches(V1_train,
                                             V2_train,
                                             batch_size,
                                             shuffle=False):
                l_train.append(self.f_train(batch[0], batch[1]))
            l_train = np.mean(l_train)
            l_val = self.f_val(V1_dev, V2_dev).item()
            #during the first k epochs any improvement counts; afterwards the
            #validation loss must drop by more than `improvement` to reset
            #early stopping
            k = 100
            improvement = 1.0
            if (l_val < best_val_loss
                    and n < k) or (l_val < best_val_loss - improvement):
                best_val_loss = l_val
                best_params1 = lasagne.layers.get_all_param_values(
                    self.l_out_view1)
                best_params2 = lasagne.layers.get_all_param_values(
                    self.l_out_view2)
                n_validation_down = 0
            else:
                #early stopping
                n_validation_down += 1
            logging.info(
                'epoch {} train loss {:.2f} val loss {:.2f} numdown {}'.format(
                    n, l_train, l_val, n_validation_down))
            if n_validation_down > early_stopping_max_down:
                logging.info(
                    'validation results went down. early stopping ...')
                break

        lasagne.layers.set_all_param_values(self.l_out_view1, best_params1)
        lasagne.layers.set_all_param_values(self.l_out_view2, best_params2)

        logging.info(
            '***************** final results based on best validation **************'
        )
        V1_test, V2_test, l_test = self.f_predict(V1[test_indices],
                                                  V2[test_indices])
        logging.info('test loss:{}'.format(l_test))
        filename = 'deepcca-{}-{}'.format(train_indices.shape[0],
                                          str(datetime.now()))
        logging.info('dumping deepcca params in {} '.format(filename))
        # Very large inputs are serialized with hickle (HDF5-based)
        # instead of the default pickle-based serializer.
        if V1.shape[0] > 1000000:
            dump_obj((str(model_args), best_params1, best_params2),
                     filename,
                     serializer=hickle)
        else:
            dump_obj((model_args, best_params1, best_params2), filename)
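
`fit` depends on an `iterate_minibatches` helper that is not shown in this example. The Lasagne tutorials use a generator of roughly the following shape; this is a hedged reconstruction under that assumption, not the repository's actual implementation:

import numpy as np

def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    # Yield aligned (inputs, targets) slices of `batchsize` rows each.
    # The trailing partial batch is dropped, per the usual Lasagne
    # tutorial convention.
    assert inputs.shape[0] == targets.shape[0]
    indices = np.arange(inputs.shape[0])
    if shuffle:
        np.random.shuffle(indices)
    for start in range(0, inputs.shape[0] - batchsize + 1, batchsize):
        excerpt = indices[start:start + batchsize]
        yield inputs[excerpt], targets[excerpt]

With shuffle=False, as in fit above, each epoch visits rows in a fixed order, and applying the same index slice to both views keeps V1 and V2 aligned.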