def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        metric=args.metric,
        use_label_persistence=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:

        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)

    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'use_subtree_features', 'metric'
    ]
    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]
    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf',
                  RandomForestClassifier(
                      class_weight='balanced' if args.balanced else None,
                      random_state=42))], )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }
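
            # FeatureSelector is not shown in this snippet; judging from how
            # it is constructed and tuned above, it presumably uses
            # num_columns_per_iteration to restrict the feature matrix to the
            # columns produced by the first fs__num_iterations rounds of
            # Weisfeiler-Lehman iteration, so the grid search effectively also
            # tunes the number of iterations to use.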

            clf = GridSearchCV(pipeline,
                               grid_params,
                               cv=StratifiedKFold(n_splits=5, shuffle=True),
                               iid=False,
                               scoring='accuracy',
                               n_jobs=4)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
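
            # Note: the two scalers above are applied back to back; the first
            # standardises every column (zero mean, unit variance, implicitly
            # assuming roughly Gaussian features, cf. the TODO above), and the
            # second then rescales the standardised values to [0, 1] based on
            # the *training* minima and maxima, so test values can fall
            # slightly outside that range.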

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            for param, param_val in clf.best_params_.items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                clf.best_params_))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))
    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)
    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))
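
    # Note: if args.result_file already exists, the fold-level results below
    # are appended without a header row, which assumes that the column order
    # matches the header written on the first run.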

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
Example #2
def main(args, logger):

    # Read all graphs and labels; there is no direct way of checking
    # that the labels are 'correct' for the graphs, but at least the
    # code will check that they have the same cardinality.
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Simple pre-processing to ensure that all graphs are set up
    # equally.
    #
    # TODO: make this into a shared function?
    for graph in graphs:
        # Set the label to be uniform over all graphs in case no labels are
        # available. This essentially changes our iteration to degree-based
        # checks.
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        # Reset edge weights if they already exist
        if 'weight' in graph.es.attributes():
            graph.es['weight'] = [0] * len(graph.es)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    # Replace selected metric if necessary; this only applies to the
    # uniform metric shortcut.
    if args.use_uniform_metric:
        args.metric = 'uniform'

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        metric=args.metric,
        p=args.power,
        smooth=args.smooth)

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    # Ensures that labels are encoded correctly, regardless of whether
    # they are numerical or not.
    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:

        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):

        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None,
                random_state=42)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)], )

                grid_params = {
                    'fs__num_iterations': np.arange(0,
                                                    args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                clf = GridSearchCV(pipeline,
                                   grid_params,
                                   cv=StratifiedKFold(n_splits=10,
                                                      shuffle=True),
                                   iid=False,
                                   scoring='accuracy',
                                   n_jobs=4)
            else:
                clf = rf_clf

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))
Example #3
from features import FeatureSelector
import pickle
import matplotlib.pyplot as plt

states = pickle.load(open('featureset.pt', 'rb'))

for state in states:
    selector = FeatureSelector(state)
    selector.agent_at_border()
    print(selector.features)

    plt.figure()
    vis_field = state['field'].copy()
    x, y = state['self'][3]
    vis_field[x, y] = 4

    #for bomb in state['bombs']:
    #    bx, by = bomb[0]
    #    if bomb[1] != 0:
    #        vis_field[bx, by] = 2

    for coin in state['coins']:
        cx, cy = coin
        vis_field[cx, cy] = 2

    for agent in state['others']:
        ox, oy = agent[3]
        vis_field[ox, oy] = 3
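
    # Colour coding of the visualisation: the agent's own position is marked
    # with 4, coins with 2 and other agents with 3 on top of the raw field
    # values.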

    plt.imshow(vis_field)
    plt.show()
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Stores *all* vertex labels of the given graph in order to
    # determine the conversion factor for persistence diagrams.
    vertex_labels = set()

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        vertex_labels.update(graph.vs['label'])

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        store_persistence_diagrams=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    persistence_diagrams = pwl._persistence_diagrams
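
    # _persistence_diagrams is assumed to map each WL iteration to a list of
    # per-graph persistence diagrams whose entries are (creation, destruction,
    # ...) tuples; the plotting loop below only uses the destruction values.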

    fig, ax = plt.subplots(args.num_iterations + 1)

    for iteration in persistence_diagrams.keys():
        M = collections.defaultdict(list)

        for index, diagram in enumerate(persistence_diagrams[iteration]):
            label = y[index]
            for _, d, _ in diagram:
                M[label].append(d)

        d_min = sys.float_info.max
        d_max = -d_min

        for hist in M.values():
            d_min = min(d_min, min(hist))
            d_max = max(d_max, max(hist))

        bins = np.linspace(d_min, d_max, 10)

        for label, hist in M.items():
            sns.distplot(hist,
                         bins=bins,
                         rug=True,
                         kde=True,
                         hist=False,
                         ax=ax[iteration])

    plt.show()

    L = len(vertex_labels)
    assert L > 0

    original_labels = pwl._original_labels

    # Will store *all* persistence diagrams in the form of a probability
    # distribution.
    M = np.zeros((len(graphs), (args.num_iterations + 1) * L))

    # Will store *all* pairwise distances according to the
    # Jensen--Shannon divergence (JS),  or, alternatively,
    # the Kullback--Leibler divergence (KL).
    D_KL = np.zeros((len(graphs), len(graphs)))
    D_JS = np.zeros((len(graphs), len(graphs)))
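
    # For reference, a minimal sketch of the divergences mentioned above for
    # two discrete probability distributions p and q. This is an illustration
    # only; make_kernel_matrices below is the project's own helper and its
    # actual implementation is not shown here.
    def _kl_divergence(p, q, eps=1e-12):
        # Kullback--Leibler divergence KL(p || q); eps avoids log(0) and
        # division by zero.
        p = np.asarray(p, dtype=float) + eps
        q = np.asarray(q, dtype=float) + eps
        return np.sum(p * np.log(p / q))

    def _js_divergence(p, q):
        # Jensen--Shannon divergence: symmetrised KL against the mixture.
        m = 0.5 * (np.asarray(p, dtype=float) + np.asarray(q, dtype=float))
        return 0.5 * _kl_divergence(p, m) + 0.5 * _kl_divergence(q, m)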

    D = np.zeros((len(graphs), len(graphs)))

    for iteration in persistence_diagrams.keys():

        M, D_KL, D_JS = make_kernel_matrices(
            persistence_diagrams[iteration],
            original_labels,  # notice that they do *not* change
            L)

        D += D_JS

    D = -D

    fig, ax = plt.subplots(len(set(y)))

    for label in sorted(set(y)):
        ax[label].matshow(M[y == label], aspect='auto', vmin=0, vmax=1)

    plt.show()

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):

        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)], )

                grid_params = {
                    'fs__num_iterations': np.arange(0,
                                                    args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                clf = GridSearchCV(pipeline,
                                   grid_params,
                                   cv=StratifiedKFold(n_splits=10,
                                                      shuffle=True),
                                   iid=False,
                                   scoring='accuracy',
                                   n_jobs=4)
            else:
                clf = rf_clf

            clf = SVC(kernel='precomputed')
            clf.fit(D, y)
            y_test = y
            y_pred = clf.predict(D)
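
            # Note that the block above replaces the classifier chosen in the
            # if/else branch and fits an SVM on the *full* precomputed
            # distance matrix D, bypassing the train/test split, so the
            # accuracy computed below is effectively a training accuracy.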

            #X_train, X_test = X[train_index], X[test_index]
            #y_train, y_test = y[train_index], y[test_index]

            ## TODO: need to discuss whether this is 'allowed' or smart
            ## to do; this assumes normality of the attributes.
            #scaler = StandardScaler()
            #X_train = scaler.fit_transform(X_train)
            #X_test = scaler.transform(X_test)

            #scaler = MinMaxScaler()
            #X_train = scaler.fit_transform(X_train)
            #X_test = scaler.transform(X_test)

            #clf.fit(X_train, y_train)
            #y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))
Example #5
]]

test = pd.read_csv('data/validation_data.csv', parse_dates=['date'])
X_test = test.merge(employee, on='employee id').loc[:, [
    'date',
    'category',
    'pre-tax amount',
    'role',
]]

model = Pipeline([
    ('features', FeatureUnion([

        # weekend?
        ('weekend', Pipeline([
            ('selector', FeatureSelector('date')),
            ('transform', IsWeekendTransformer()),
        ])),

        # category
        ('category', Pipeline([
            ('selector', FeatureSelector('category')),
            ('encode', PipelineLabelBinarizer()),
        ])),

        # role
        ('role', Pipeline([
            ('selector', FeatureSelector('role')),
            ('encode', PipelineLabelBinarizer()),
        ])),
Example #6
y_train = training.loc[:, ['category']].values.ravel()

X_val = validation.loc[:, ['expense description', 'pre-tax amount']]
y_val = validation.loc[:, ['category']].values.ravel()
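
# The pipeline below selects individual DataFrame columns via FeatureSelector,
# which is imported elsewhere and not shown in this snippet.  Purely for
# illustration, a minimal column-selecting transformer could look like the
# sketch below; the actual FeatureSelector may differ.  The wrap flag mirrors
# its usage further down and returns a 2D frame so that downstream scalers
# receive the expected input shape.
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column, wrap=False):
        self.column = column
        self.wrap = wrap

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        values = X[self.column]
        return values.to_frame() if self.wrap else values
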

# build data pipeline!
pipeline = Pipeline([
    (
        'features',
        FeatureUnion(
            [

                # expense description feature
                ('description',
                 Pipeline([
                     ('selector', FeatureSelector('expense description')),
                     ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                 ])),

                # pretax amount feature
                ('pretax',
                 Pipeline([
                     ('selector', FeatureSelector('pre-tax amount',
                                                  wrap=True)),
                     ('scaler', StandardScaler()),
                 ]))
            ],
            transformer_weights={
                'description': 1.0,
                'pretax': 1.0,
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys

from preprocess import Preprocessor
from features import FeatureSelector
from bayes import BayesClassifier

if __name__ == '__main__':
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    pr = Preprocessor()
    pr.build_vocabulary_and_categories(train_file)

    fs = FeatureSelector(train_file, ck=500)
    fs.select_features()

    bc = BayesClassifier(train_file, test_file, model='bernoulli')
    bc.train()
    bc.test()
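
# Usage sketch (file names below are placeholders; the script expects a
# training file and a test file in whatever format Preprocessor and
# BayesClassifier understand, neither of which is shown here):
#
#   python main.py train.txt test.txt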
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl_list = []
    for p in [1, 2]:
        pwl = PersistentWeisfeilerLehman(
            use_cycle_persistence=args.use_cycle_persistence,
            use_original_features=args.use_original_features,
            metric=args.metric,
            use_label_persistence=True,
            p=p)

        X, num_columns_per_iteration = pwl.transform(graphs,
                                                     args.num_iterations)
        pwl_list.append({'p': p, 'X': X})

        logger.info(
            f'Finished persistent Weisfeiler-Lehman transformation '
            f'for p={p}')
        logger.info('Obtained ({} x {}) feature matrix'.format(
            X.shape[0], X.shape[1]))

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    np.random.seed(42)
    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'metric'
    ]
    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]
    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]
            y_train = y[train_index]
            y_test = y[test_index]

            # Override current full matrices
            for pwl_dict in pwl_list:

                scaler = StandardScaler()
                X_train = scaler.fit_transform(pwl_dict['X'][train_index])
                X_test = scaler.transform(pwl_dict['X'][test_index])

                scaler = MinMaxScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                pwl_dict['X_train'] = X_train
                pwl_dict['X_test'] = X_test

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf',
                  RandomForestClassifier(
                      class_weight='balanced' if args.balanced else None,
                      random_state=42,
                      n_jobs=4))], )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }
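
            # custom_grid_search_cv is not shown in this snippet; from its
            # usage below, it is expected to evaluate the pipeline and grid on
            # every feature matrix in pwl_list and to return the refitted best
            # estimator together with a dict containing 'pwl_idx' (the index
            # of the winning feature matrix, i.e. the winning p) and 'params'
            # (the winning grid parameters).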

            clf, best_params = custom_grid_search_cv(pipeline, grid_params,
                                                     pwl_list, y_train)

            X_test = pwl_list[best_params['pwl_idx']]['X_test']
            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            best_params['params']['p'] = best_params['pwl_idx'] + 1
            for param, param_val in best_params['params'].items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                best_params['params']))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))
    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)
    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=None)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)