Example no. 1
import argparse
import os

import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC


def main():
    # Isolation-kernel parameters (psi: sample size per partitioning,
    # t: number of partitionings); assumed meaning, inferred from the
    # `compute_wl_embeddings_continuous_by_IK` call below.
    psi = 256
    t = 100
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str, default='ENZYMES')
    parser.add_argument('--crossvalidation',
                        default=False,
                        action='store_true',
                        help='Enable a 10-fold crossvalidation')
    parser.add_argument('--h',
                        type=int,
                        required=False,
                        default=5,
                        help="(Max) number of WL iterations")

    args = parser.parse_args()
    dataset = args.dataset
    h = args.h
    data_path = os.path.join('./data', dataset)
    output_path = os.path.join('output', dataset)
    results_path = os.path.join('results', dataset)

    for path in [output_path, results_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    label_file = os.path.join(data_path, 'Labels.txt')
    y = np.array(read_labels(label_file))

    cv = StratifiedKFold(n_splits=10, shuffle=True)

    accuracy_scores = []
    label_sequences = compute_wl_embeddings_continuous_by_IK(
        data_path, h, t, psi)
    label_sequences = np.array(label_sequences)
    for train_index, test_index in cv.split(label_sequences, y):
        X_train = label_sequences[train_index]
        X_test = label_sequences[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Average the node embeddings per graph, then classify with
        # a linear SVM (liblinear backend).
        X_train = average_graph(X_train, t)
        X_test = average_graph(X_test, t)
        gs = LinearSVC().fit(X_train, y_train)
        y_pred = gs.predict(X_test)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        print(accuracy_scores)
        if not args.crossvalidation:
            break

    if args.crossvalidation:
        print('Mean 10-fold accuracy: {:2.2f} +- {:2.2f} %'.format(
            np.mean(accuracy_scores) * 100,
            np.std(accuracy_scores) * 100))
    else:
        print('Final accuracy: {:2.3f} %'.format(
            np.mean(accuracy_scores) * 100))
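
The helpers `read_labels`, `average_graph`, and
`compute_wl_embeddings_continuous_by_IK` are project-specific and not shown
in this listing. A minimal sketch of the first two, under the assumption
that labels are stored one per line and that each graph is represented by
the mean of its node embeddings (the sketch is illustrative, not the
project's actual code):

import numpy as np


def read_labels(filename):
    """Read one label per line from a plain-text file (assumed format)."""
    with open(filename) as f:
        return [line.strip() for line in f if line.strip()]


def average_graph(embeddings, t):
    """Collapse each graph's (n_nodes x t) node-embedding matrix into one
    t-dimensional vector by averaging over the nodes (assumed semantics;
    `t` is kept only for signature compatibility)."""
    return np.array([np.mean(graph_embedding, axis=0)
                     for graph_embedding in embeddings])
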
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=args.use_persistence_features,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    X = StandardScaler().fit_transform(X)
    X = MinMaxScaler().fit_transform(X)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    num_classes = len(np.bincount(y))

    fig, ax = plt.subplots(nrows=num_classes,
                           ncols=2,
                           sharex=True,
                           sharey=False,
                           squeeze=False)

    for index in range(num_classes):
        ax[index][0].matshow(X[y == index], aspect='auto')
        ax[index][0].set_title(f'Class {index} (features)')

        ax[index][1].matshow(np.mean(X[y == index], axis=0).reshape(1, -1),
                             aspect='auto')
        ax[index][1].set_title(f'Class {index} (mean)')

    plt.show()
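
The `main(args, logger)` functions in these examples are invoked from an
entry point that the listing omits. A minimal sketch of such a harness,
assuming positional graph FILES plus the --labels and --num-iterations
options that appear elsewhere in these snippets:

import argparse
import logging

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('FILES', nargs='+', help='Input graph files')
    parser.add_argument('-l', '--labels', type=str, required=True,
                        help='Labels file')
    parser.add_argument('-n', '--num-iterations', default=3, type=int,
                        help='Number of Weisfeiler-Lehman iterations')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('P-WL')

    main(args, logger)
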
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        metric=args.metric,
        use_label_persistence=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:

        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)

    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'use_subtree_features', 'metric'
    ]
    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]
    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf',
                  RandomForestClassifier(
                      class_weight='balanced' if args.balanced else None,
                      random_state=42))], )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }

            # Note: GridSearchCV's 'iid' parameter was removed in
            # scikit-learn 0.24 and is omitted here.
            clf = GridSearchCV(pipeline,
                               grid_params,
                               cv=StratifiedKFold(n_splits=5, shuffle=True),
                               scoring='accuracy',
                               n_jobs=4)

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            for param, param_val in clf.best_params_.items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                clf.best_params_))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))
    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)
    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=False)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
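
`FeatureSelector` is referenced throughout these pipelines but never
defined in the listing. Judging from its grid-searched `num_iterations`
parameter, it plausibly keeps only the feature columns produced up to a
given WL iteration. A hedged sketch, assuming `num_columns_per_iteration`
maps each iteration to the number of columns it contributed:

from sklearn.base import BaseEstimator, TransformerMixin


class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, num_columns_per_iteration, num_iterations=0):
        self.num_columns_per_iteration = num_columns_per_iteration
        self.num_iterations = num_iterations

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Keep every column that belongs to iterations 0..num_iterations;
        # the WL transformation emits the columns in iteration order.
        n = sum(count
                for iteration, count in sorted(
                    self.num_columns_per_iteration.items())
                if iteration <= self.num_iterations)
        return X[:, :n]
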
                        type=str,
                        help='Labels file',
                        required=True)
    parser.add_argument('-n',
                        '--num-graphs',
                        type=int,
                        required=True,
                        help='Sample size')
    parser.add_argument('-o',
                        '--out-dir',
                        type=str,
                        required=True,
                        help='Output directory')

    args = parser.parse_args()
    labels = read_labels(args.labels)
    y = LabelEncoder().fit_transform(labels)
    n = len(y)

    sss = StratifiedShuffleSplit(n_splits=1,
                                 random_state=23,
                                 train_size=args.num_graphs)

    for train_index, _ in sss.split(range(n), y):
        train_index = sorted(train_index)

        files = np.array(args.FILES)
        files = files[train_index]

        try:
            os.makedirs(args.out_dir)
        except FileExistsError:
            # The output directory may already exist; ignore that case.
            pass
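
The sampling script breaks off inside the `try` block above. A plausible
continuation, sketched as a hypothetical helper that simply copies the
stratified sample of graph files into the output directory:

import shutil


def copy_sample(files, out_dir):
    """Copy the sampled graph files into `out_dir` (hypothetical helper;
    the original continuation of the script is not shown)."""
    for filename in files:
        shutil.copy(filename, out_dir)
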
Example no. 5
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    # Calculate graph kernel
    gram_matrix = gk.CalculateEdgeHistKernel(graphs)

    y = LabelEncoder().fit_transform(labels)

    #np.random.seed(42)
    mean_accuracies = []

    params = ['balanced']
    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]
    entry['baseline'] = 'edge hist kernel'
    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for n, indices in enumerate(cv.split(graphs, y)):

            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]

            pipeline = Pipeline(
                [('clf',
                  SVC(class_weight='balanced' if args.balanced else None,
                      random_state=42,
                      kernel='precomputed'))], )

            grid_params = {'clf__C': [1e-1, 1e0, 1e1]}

            X_train = gram_matrix[train_index][:, train_index]
            X_test = gram_matrix[test_index][:, train_index]
            y_train, y_test = y[train_index], y[test_index]

            kgscv = KernelGridSearchCV(
                pipeline,
                param_grid=grid_params,
                cv=cv,
            )
            kgscv.fit(X_train, y_train)
            clf = kgscv._best_estimator
            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            for param, param_val in kgscv._best_params.items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                kgscv._best_params))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))
    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)
    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=False)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
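
`KernelGridSearchCV` (with its `_best_params`, `_best_score`, and
`_best_estimator` attributes) is not part of scikit-learn. A hedged sketch
of what such a class might look like for precomputed kernels, where each
inner fold slices the corresponding sub-matrix out of the Gram matrix:

import numpy as np

from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid, StratifiedKFold


class KernelGridSearchCV:
    def __init__(self, estimator, param_grid, cv=None):
        self.estimator = estimator
        self.param_grid = param_grid
        self.cv = cv if cv is not None else StratifiedKFold(n_splits=5)
        self._best_params = None
        self._best_score = -1.0
        self._best_estimator = None

    def fit(self, K, y):
        for params in ParameterGrid(self.param_grid):
            scores = []
            for train, test in self.cv.split(K, y):
                clf = clone(self.estimator).set_params(**params)
                clf.fit(K[train][:, train], y[train])
                scores.append(
                    accuracy_score(y[test], clf.predict(K[test][:, train])))
            score = np.mean(scores)
            if score > self._best_score:
                self._best_score = score
                self._best_params = params
                self._best_estimator = \
                    clone(self.estimator).set_params(**params)
        return self

Note that Example 5 passes the outer cross-validation splitter into this
search, so the inner model selection reuses the outer fold structure.
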
Example no. 6
File: main.py Project: wibruce/P-WL
def main(args, logger):

    # Read all graphs and labels; there is no direct way of checking
    # that the labels are 'correct' for the graphs, but at least the
    # code will check that they have the same cardinality.
    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Simple pre-processing to ensure that all graphs are set up
    # equally.
    #
    # TODO: make this into a shared function?
    for graph in graphs:
        # Set the label to be uniform over all graphs in case no labels are
        # available. This essentially changes our iteration to degree-based
        # checks.
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        # Reset edge weights if they already exist
        if 'weight' in graph.es.attributes():
            graph.es['weight'] = [0] * len(graph.es)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    # Replace selected metric if necessary; this only applies to the
    # uniform metric shortcut.
    if args.use_uniform_metric:
        args.metric = 'uniform'

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        metric=args.metric,
        p=args.power,
        smooth=args.smooth)

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    # Ensures that labels are encoded correctly, regardless of whether
    # they are numerical or not.
    y = LabelEncoder().fit_transform(labels)

    # This ignores *all* other feature generation methods and falls back
    # to the original Weisfeiler--Lehman subtree kernel.
    if args.use_subtree_features:

        logger.info('Using original subtree features')

        wl_subtree = WeisfeilerLehmanSubtree()
        X, num_columns_per_iteration = \
            wl_subtree.transform(graphs, args.num_iterations)
    else:
        X, num_columns_per_iteration = \
            pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):

        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None,
                random_state=42)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)], )

                grid_params = {
                    'fs__num_iterations': np.arange(0,
                                                    args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                # Note: GridSearchCV's 'iid' parameter was removed in
                # scikit-learn 0.24 and is omitted here.
                clf = GridSearchCV(pipeline,
                                   grid_params,
                                   cv=StratifiedKFold(n_splits=10,
                                                      shuffle=True),
                                   scoring='accuracy',
                                   n_jobs=4)
            else:
                clf = rf_clf

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))
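
Several examples fall back to the "original Weisfeiler--Lehman subtree
kernel" via `WeisfeilerLehmanSubtree`, whose implementation the listing
omits. A minimal, self-contained sketch of subtree feature extraction
(counting compressed WL labels per graph and iteration); this illustrates
the technique only and is not the repository's code:

import collections

import numpy as np


def wl_subtree_features(graphs, num_iterations):
    # Start from the raw vertex labels, represented as strings.
    labels = [[str(l) for l in g.vs['label']] for g in graphs]
    feature_counts = [collections.Counter() for _ in graphs]

    for iteration in range(num_iterations + 1):
        # Compress the current labels into consecutive integers and
        # count their occurrences per graph.
        alphabet = {}
        for i, g in enumerate(graphs):
            for label in labels[i]:
                compressed = alphabet.setdefault(label, len(alphabet))
                feature_counts[i][(iteration, compressed)] += 1

        # Relabel: every vertex concatenates its own label with the
        # sorted multiset of its neighbours' labels.
        labels = [[
            labels[i][v] + '|' + ','.join(
                sorted(labels[i][u] for u in g.neighbors(v)))
            for v in range(len(g.vs))
        ] for i, g in enumerate(graphs)]

    # Assemble a dense feature matrix over all (iteration, label) pairs.
    columns = sorted(set().union(*(c.keys() for c in feature_counts)))
    index = {c: j for j, c in enumerate(columns)}
    X = np.zeros((len(graphs), len(columns)))
    for i, counts in enumerate(feature_counts):
        for c, v in counts.items():
            X[i, index[c]] = v
    return X
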
                        action='store_true',
                        help='Make random forest classifier balanced')
    parser.add_argument('-l',
                        '--labels',
                        type=str,
                        help='Labels file',
                        required=True)
    parser.add_argument('-n',
                        '--num-iterations',
                        default=3,
                        type=int,
                        help='Number of Weisfeiler-Lehman iterations')

    args = parser.parse_args()
    graphs = [ig.read(filename) for filename in args.FILES]
    y = np.array(read_labels(args.labels))

    wl = WeisfeilerLehman()
    label_dicts = wl.fit_transform(graphs, args.num_iterations)

    # Each entry in the list represents the label sequence of a single
    # graph. The label sequence contains the vertices in its rows, and
    # the individual iterations in its columns.
    #
    # Hence, (i, j) will contain the label of vertex i at iteration j.
    label_sequences = [
        np.full((len(graph.vs), args.num_iterations + 1), np.nan)
        for graph in graphs
    ]

    for iteration in sorted(label_dicts.keys()):
        for graph_index, graph in enumerate(graphs):
            # Hedged completion: the loop body is cut off in the source;
            # presumably each graph's (raw, compressed) labels for this
            # iteration are written into the corresponding column.
            labels_raw, labels_compressed = label_dicts[iteration][graph_index]
            label_sequences[graph_index][:, iteration] = labels_compressed
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Stores *all* vertex labels of the given graph in order to
    # determine the conversion factor for persistence diagrams.
    vertex_labels = set()

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

        vertex_labels.update(graph.vs['label'])

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_cycle_persistence=args.use_cycle_persistence,
        use_original_features=args.use_original_features,
        use_label_persistence=True,
        store_persistence_diagrams=True,
    )

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    persistence_diagrams = pwl._persistence_diagrams

    fig, ax = plt.subplots(args.num_iterations + 1)

    for iteration in persistence_diagrams.keys():
        M = collections.defaultdict(list)

        # `diagram` instead of `pd` avoids shadowing the usual pandas alias.
        for index, diagram in enumerate(persistence_diagrams[iteration]):
            label = y[index]
            for _, d, _ in diagram:
                M[label].append(d)

        d_min = sys.float_info.max
        d_max = -d_min

        for hist in M.values():
            d_min = min(d_min, min(hist))
            d_max = max(d_max, max(hist))

        bins = np.linspace(d_min, d_max, 10)

        for label, hist in M.items():
            # Note: `sns.distplot` is deprecated in recent seaborn
            # releases; `sns.kdeplot` plus `sns.rugplot` is the modern
            # equivalent of this rug-and-KDE call.
            sns.distplot(hist,
                         bins=bins,
                         rug=True,
                         kde=True,
                         hist=False,
                         ax=ax[iteration])

    plt.show()

    L = len(vertex_labels)
    assert L > 0

    original_labels = pwl._original_labels

    # Will store *all* persistence diagrams in the form of a probability
    # distribution.
    M = np.zeros((len(graphs), (args.num_iterations + 1) * L))

    # Will store *all* pairwise distances according to the
    # Jensen--Shannon divergence (JS),  or, alternatively,
    # the Kullback--Leibler divergence (KL).
    D_KL = np.zeros((len(graphs), len(graphs)))
    D_JS = np.zeros((len(graphs), len(graphs)))

    D = np.zeros((len(graphs), len(graphs)))

    for iteration in persistence_diagrams.keys():

        M, D_KL, D_JS = make_kernel_matrices(
            persistence_diagrams[iteration],
            original_labels,  # notice that they do *not* change
            L)

        D += D_JS

    # Negate the summed divergences so that more similar graphs obtain
    # larger values; note that this does not by itself guarantee
    # a positive semi-definite kernel.
    D = -D

    fig, ax = plt.subplots(len(set(y)))

    for label in sorted(set(y)):
        ax[label].matshow(M[y == label], aspect='auto', vmin=0, vmax=1)

    plt.show()

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(
        X.shape[0], X.shape[1]))

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    mean_accuracies = []

    for i in range(10):

        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            rf_clf = RandomForestClassifier(
                n_estimators=50,
                class_weight='balanced' if args.balanced else None)

            if args.grid_search:
                pipeline = Pipeline(
                    [('fs', FeatureSelector(num_columns_per_iteration)),
                     ('clf', rf_clf)], )

                grid_params = {
                    'fs__num_iterations': np.arange(0,
                                                    args.num_iterations + 1),
                    'clf__n_estimators': [10, 20, 50, 100, 150, 200],
                }

                # Note: GridSearchCV's 'iid' parameter was removed in
                # scikit-learn 0.24 and is omitted here.
                clf = GridSearchCV(pipeline,
                                   grid_params,
                                   cv=StratifiedKFold(n_splits=10,
                                                      shuffle=True),
                                   scoring='accuracy',
                                   n_jobs=4)
            else:
                clf = rf_clf

            # NOTE: this block overrides the classifier selected above and
            # both fits and evaluates on the *full* distance matrix D, so
            # train and test sets coincide; the resulting accuracy is not
            # an out-of-sample estimate.
            clf = SVC(kernel='precomputed')
            clf.fit(D, y)
            y_test = y
            y_pred = clf.predict(D)

            #X_train, X_test = X[train_index], X[test_index]
            #y_train, y_test = y[train_index], y[test_index]

            ## TODO: need to discuss whether this is 'allowed' or smart
            ## to do; this assumes normality of the attributes.
            #scaler = StandardScaler()
            #X_train = scaler.fit_transform(X_train)
            #X_test = scaler.transform(X_test)

            #scaler = MinMaxScaler()
            #X_train = scaler.fit_transform(X_train)
            #X_test = scaler.transform(X_test)

            #clf.fit(X_train, y_train)
            #y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))

            if args.grid_search:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.best_params_))
            else:
                logger.debug('Best parameters for this fold: {}'.format(
                    clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))
import os

import six

import utilities

# Decide read/write mode based on python version
read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')

# Set the path to your consolidated files
path = '/Users/chrysovalantis/Documents/UCY/EPL451/Project'
os.chdir(path)

# File names
ftrain = 'train_consolidation.txt'
ftest = 'test_consolidation.txt'
flabel = 'trainLabels.csv'
fsubmission = 'submission.csv'

labels = utilities.read_labels(flabel)

# Dimensions for train set
ntrain = 10868
nfeature = 16 ** 2 + 1 + 1  # For two_byte_codes, no_que_marks, label
train = utilities.read_train(ntrain, nfeature, labels, ftrain)

X = train[:, :-1]
y = train[:,  -1]

del labels
del train

# Parameters for trees
random_state = 5342
n_jobs = 8
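
The snippet stops right after defining the tree parameters. A minimal
hypothetical continuation that fits a forest with exactly those
parameters (the estimator choice and n_estimators value are assumptions):

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100,
                             random_state=random_state,
                             n_jobs=n_jobs)
clf.fit(X, y)
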
Example no. 10
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    pwl = PersistentWeisfeilerLehman(
        use_label_persistence=True,
        store_persistence_diagrams=False,  # TODO: might need this later on?
    )

    y = LabelEncoder().fit_transform(labels)
    X, num_columns_per_iteration = pwl.transform(graphs, args.num_iterations)

    logger.info('Finished persistent Weisfeiler-Lehman transformation')
    logger.info('Obtained ({} x {}) feature matrix'.format(X.shape[0], X.shape[1]))

    X = to_probability_distribution(X, num_columns_per_iteration)

    np.random.seed(42)
    cv = StratifiedKFold(n_splits=3, shuffle=True)
    mean_accuracies = []

    def product_kernel(X, Y, k):

        # Index of the first column belonging to the current iteration.
        start_index = 0
        K = np.zeros((X.shape[0], Y.shape[0]))

        for iteration in sorted(num_columns_per_iteration.keys()):
            # Assuming `num_columns_per_iteration[iteration]` stores the
            # *number* of columns contributed by this iteration, the
            # block of columns for it is [start_index, end_index); the
            # original index arithmetic mixed counts and offsets.
            end_index = start_index + num_columns_per_iteration[iteration]

            P = X[:, start_index:end_index]
            Q = Y[:, start_index:end_index]

            # TODO: can this be made more efficient?
            K_iteration = np.array(
                [k(p, q) for p, q in itertools.product(P, Q)]).reshape(
                X.shape[0], Y.shape[0]
            )

            K += K_iteration

            start_index = end_index

        return K

    def jensen_shannon_kernel(X, Y):
        return product_kernel(X, Y, jensen_shannon)

    def kullback_leibler_kernel(X, Y):
        return -product_kernel(X, Y, kullback_leibler)

    for i in range(3):

        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []

        for train_index, test_index in cv.split(X, y):
            clf = SVC(
                kernel=jensen_shannon_kernel,
            )

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # TODO: need to discuss whether this is 'allowed' or smart
            # to do; this assumes normality of the attributes. Note also
            # that standardizing destroys the probability-distribution
            # structure that the Jensen-Shannon kernel expects.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            accuracy_scores.append(accuracy_score(y_test, y_pred))

            logger.debug('Best classifier for this fold: {}'.format(clf))
            logger.debug('Best parameters for this fold: {}'.format(clf.get_params()))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 3-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))

    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))
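
The divergences `jensen_shannon` and `kullback_leibler` used by the
product kernels above are not defined in the listing. Assumed
implementations on discrete probability vectors (note that plugging raw
divergences into an SVC kernel, as the snippet does, does not guarantee a
positive semi-definite kernel):

import numpy as np


def kullback_leibler(p, q, eps=1e-12):
    """KL divergence between two discrete distributions; `eps` avoids
    division by zero and log of zero."""
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    return float(np.sum(p * np.log(p / q)))


def jensen_shannon(p, q):
    """Jensen-Shannon divergence: symmetric and bounded, unlike KL."""
    m = 0.5 * (np.asarray(p, dtype=float) + np.asarray(q, dtype=float))
    return 0.5 * kullback_leibler(p, m) + 0.5 * kullback_leibler(q, m)
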
Example no. 11
import argparse
import os

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--dataset', type=str,
                        help='Provide the dataset name (MUTAG or ENZYMES)',
                        choices=['MUTAG', 'ENZYMES'])
    parser.add_argument('--crossvalidation', default=False,
                        action='store_true',
                        help='Enable a 10-fold crossvalidation')
    parser.add_argument('--gridsearch', default=False, action='store_true',
                        help='Enable grid search')
    parser.add_argument('--sinkhorn', default=False, action='store_true',
                        help='Use Sinkhorn approximation')
    parser.add_argument('--h', type=int, required=False, default=2,
                        help='(Max) number of WL iterations')

    args = parser.parse_args()
    dataset = args.dataset
    h = args.h
    sinkhorn = args.sinkhorn
    print(f'Generating results for {dataset}...')
    #---------------------------------
    # Setup
    #---------------------------------
    # Start by making directories for intermediate and final files
    data_path = os.path.join('../data', dataset)
    output_path = os.path.join('output', dataset)
    results_path = os.path.join('results', dataset)
    
    for path in [output_path, results_path]:
        if not os.path.exists(path):
            os.makedirs(path)

    #---------------------------------
    # Embeddings
    #---------------------------------
    # Load the data and generate the embeddings 
    embedding_type = 'continuous' if dataset == 'ENZYMES' else 'discrete'
    print(f'Generating {embedding_type} embeddings for {dataset}.')
    if dataset == 'ENZYMES':
        label_sequences = compute_wl_embeddings_continuous(data_path, h)
    else:
        label_sequences = compute_wl_embeddings_discrete(data_path, h)

    # Save embeddings to output folder
    out_name = f'{dataset}_wl_{embedding_type}_embeddings_h{h}.npy'
    np.save(os.path.join(output_path, out_name), label_sequences)
    print(f'Embeddings for {dataset} computed, saved to {os.path.join(output_path, out_name)}.')
    print()

    #---------------------------------
    # Wasserstein & Kernel computations
    #---------------------------------
    # Run Wasserstein distance computation
    print('Computing the Wasserstein distances...')
    wasserstein_distances = compute_wasserstein_distance(
        label_sequences, h, sinkhorn=sinkhorn, discrete=(dataset == 'MUTAG'))

    # Save Wasserstein distance matrices
    for i, D_w in enumerate(wasserstein_distances):
        filext = 'wasserstein_distance_matrix'
        if sinkhorn:
            filext += '_sinkhorn'
        filext += f'_it{i}.npy'
        np.save(os.path.join(output_path, filext), D_w)
    print('Wasserstein distances computation done. Saved to file.')
    print()

    # Transform to Kernel
    # Here the flags come into play
    if args.gridsearch:
        # Gammas in exp(-gamma * M):
        gammas = np.logspace(-4, 1, num=6)
        # Iterate over the WL iterations too
        hs = range(h)
        param_grid = [
            {'C': np.logspace(-3, 3, num=7)}
        ]
    else:
        gammas = [0.001]
        hs = [h]

    kernel_matrices = []
    kernel_params = []
    for current_h in hs:
        # Generate the full list of kernel matrices from which to select
        M = wasserstein_distances[current_h]
        for g in gammas:
            K = np.exp(-g * M)
            kernel_matrices.append(K)
            kernel_params.append((current_h, g))

    # Check for no hyperparam:
    if not args.gridsearch:
        assert len(kernel_matrices) == 1
    print('Kernel matrices computed.')
    print()

    #---------------------------------
    # Classification
    #---------------------------------
    # Run hyperparameter search if needed
    print(f'Running SVMs, crossvalidation: {args.crossvalidation}, gridsearch: {args.gridsearch}.')
    # Load labels
    label_file = os.path.join(data_path, 'Labels.txt')
    y = np.array(read_labels(label_file))

    # Contains accuracy scores for each cross validation step; the
    # means of this list will be used later on.
    accuracy_scores = []
    np.random.seed(42)
    
    cv = StratifiedKFold(n_splits=10, shuffle=True)
    # Hyperparam logging
    best_C = []
    best_h = []
    best_gamma = []

    for train_index, test_index in cv.split(kernel_matrices[0], y):
        K_train = [K[train_index][:, train_index] for K in kernel_matrices]
        K_test  = [K[test_index][:, train_index] for K in kernel_matrices]
        y_train, y_test = y[train_index], y[test_index]

        # Gridsearch
        if args.gridsearch:
            gs, best_params = custom_grid_search_cv(SVC(kernel='precomputed'), 
                    param_grid, K_train, y_train, cv=5)
            # Store best params
            C_ = best_params['params']['C']
            h_, gamma_ = kernel_params[best_params['K_idx']]
            y_pred = gs.predict(K_test[best_params['K_idx']])
        else:
            gs = SVC(C=100, kernel='precomputed').fit(K_train[0], y_train)
            y_pred = gs.predict(K_test[0])
            h_, gamma_, C_ = h, gammas[0], 100 
        best_C.append(C_)
        best_h.append(h_)
        best_gamma.append(gamma_)

        accuracy_scores.append(accuracy_score(y_test, y_pred))
        if not args.crossvalidation:
            break
    
    #---------------------------------
    # Printing and logging
    #---------------------------------
    if args.crossvalidation:
        print('Mean 10-fold accuracy: {:2.2f} +- {:2.2f} %'.format(
                    np.mean(accuracy_scores) * 100,  
                    np.std(accuracy_scores) * 100))
    else:
        print('Final accuracy: {:2.3f} %'.format(np.mean(accuracy_scores)*100))

    # Save to file
    if args.crossvalidation or args.gridsearch:
        extension = ''
        if args.crossvalidation:
            extension += '_crossvalidation'
        if args.gridsearch:
            extension += '_gridsearch'
        results_filename = os.path.join(results_path, f'results_{dataset}'+extension+'.csv')
        n_splits = 10 if args.crossvalidation else 1
        pd.DataFrame(np.array([best_h, best_C, best_gamma, accuracy_scores]).T,
                columns=['h', 'C', 'gamma', 'accuracy'],
                index=['fold_id{}'.format(i) for i in range(n_splits)]).to_csv(results_filename)
        print(f'Results saved in {results_filename}.')
    else:
        print('No results saved to file as --crossvalidation or --gridsearch were not selected.')
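
`custom_grid_search_cv` selects jointly over SVM hyperparameters and the
list of precomputed kernel matrices (one per (h, gamma) pair). A hedged
sketch matching its use in this example; the repository's actual
implementation may differ, and the next example calls a variant that takes
a dict of matrices and reports an 'h' key instead:

import numpy as np

from sklearn.base import clone
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid, StratifiedKFold


def custom_grid_search_cv(estimator, param_grid, K_list, y, cv=5):
    """Return the refitted best model and a dict with 'params' and
    'K_idx', chosen by inner cross-validation (illustrative sketch)."""
    splitter = StratifiedKFold(n_splits=cv, shuffle=True)
    best = {'score': -1.0, 'params': None, 'K_idx': None}

    for k_idx, K in enumerate(K_list):
        for params in ParameterGrid(param_grid):
            scores = []
            for train, test in splitter.split(K, y):
                clf = clone(estimator).set_params(**params)
                clf.fit(K[train][:, train], y[train])
                scores.append(accuracy_score(
                    y[test], clf.predict(K[test][:, train])))
            score = np.mean(scores)
            if score > best['score']:
                best.update(score=score, params=params, K_idx=k_idx)

    # Refit the winning configuration on the full training kernel.
    gs = clone(estimator).set_params(**best['params'])
    gs.fit(K_list[best['K_idx']], y)
    return gs, best
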
def main(args, logger):

    labels = read_labels(args.labels)
    # Load matrices
    matrices = np.load(args.MATRICES)

    print(
        f"Loaded {len(list(matrices.keys()))} matrices, with shape {matrices['0'].shape}"
    )

    matrix_dict = {}
    for h in matrices.keys():
        matrix_dict[int(h)] = {'gram': matrices[h]}

    y = LabelEncoder().fit_transform(labels)

    np.random.seed(42)
    mean_accuracies = []

    kernel_params = np.array(basename(args.MATRICES)[:-4].split('_'))[1:]

    params = ['dataset', 'max_h', 'sigma']
    cv_results = []
    entry = {}
    for i, param in enumerate(params):
        entry[param] = kernel_params[i]

    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for n, indices in enumerate(cv.split(matrix_dict[0]['gram'], y)):
            entry_fold = copy.copy(entry)

            train_index = indices[0]
            test_index = indices[1]
            y_train = y[train_index]
            y_test = y[test_index]

            # Override current full matrices
            for h, m_dict in matrix_dict.items():

                X_train = m_dict['gram'][train_index][:, train_index]
                X_test = m_dict['gram'][test_index][:, train_index]

                m_dict['X_train'] = X_train
                m_dict['X_test'] = X_test

            pipeline = Pipeline(
                [('clf',
                  SVC(class_weight='balanced' if args.balanced else None,
                      random_state=42,
                      kernel='precomputed'))], )

            grid_params = {
                'clf__C': [1e-1, 1e0, 1e1],
            }

            clf, best_params = custom_grid_search_cv(pipeline, grid_params,
                                                     matrix_dict, y_train)

            X_test = np.zeros(shape=matrix_dict[0]['X_test'].shape)
            for h in range(best_params['h'] + 1):
                X_test += matrix_dict[h]['X_test']

            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            best_params['params']['h'] = best_params['h']
            for param, param_val in best_params['params'].items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                best_params['params']))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))
    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)
    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=False)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)
Example no. 13
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    for graph in graphs:

        # Make sure that no label information exists as a graph
        # attribute already.
        assert 'label' not in graph.vs.attributes()

        graph.vs['degree'] = graph.vs.degree()

    logger.info('Read {} graphs and {} labels'.format(len(graphs), len(labels)))

    assert len(graphs) == len(labels)

    prop = WeisfeilerLehmanAttributePropagation()
    attributes_per_iteration = prop.transform(
        graphs,
        'degree',
        args.num_iterations
    )

    use_vertex_weights = args.vertex_weights

    # Stores *all* persistence diagrams because they will be used to
    # represent the data set later on.
    persistence_diagrams_per_iteration = collections.defaultdict(list)

    for iteration in sorted(attributes_per_iteration.keys()):

        # Determine maximum attribute value over *all* graphs and their
        # respective filtrations.
        max_attribute = max(
            np.max(attributes_per_iteration[iteration][index])
            for index, _ in enumerate(graphs)
        )

        unpaired_value = 2 * max_attribute

        if use_vertex_weights:
            pdc = PersistenceDiagramCalculator(
                unpaired_value=unpaired_value,
                vertex_attribute='degree',
            )
        else:
            pdc = PersistenceDiagramCalculator(
                unpaired_value=unpaired_value
            )

        for index, graph in enumerate(graphs):
            attributes = attributes_per_iteration[iteration][index]

            graph.vs['degree'] = attributes

            weighted_graph = assign_filtration_values(
                graph,
                attributes,
                normalize=args.normalize
            )

            # Note: `weighted_graph` is computed but `graph` is the one
            # passed on; the two are only equivalent if
            # `assign_filtration_values` updates the graph in place.
            # `diagram` instead of `pd` avoids shadowing the pandas alias.
            diagram, edge_indices_cycles = pdc.fit_transform(graph)

            # Store the persistence diagram as a 2D array in order to
            # facilitate the subsequent kernel calculations.
            persistence_diagrams_per_iteration[iteration].append(
                np.array([(c, d) for c, d, _ in diagram])
            )

            np.savetxt(
                '/tmp/{:04d}_d0_h{:d}.txt'.format(index, iteration),
                np.array([(c, d) for c, d, _ in diagram]),
                fmt='%.f'
            )
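
`assign_filtration_values` is not shown in the listing. A hedged sketch of
one plausible implementation: each edge receives the larger attribute
value of its two endpoints, mimicking a sublevel-set filtration, with
optional rescaling to [0, 1]:

import numpy as np


def assign_filtration_values(graph, attributes, normalize=False):
    values = np.asarray(attributes, dtype=float)
    if normalize and values.max() > 0:
        values = values / values.max()

    # Vertex values define the filtration; every edge appears as soon as
    # both of its endpoints are present.
    graph.vs['filtration'] = list(values)
    graph.es['weight'] = [
        max(values[e.source], values[e.target]) for e in graph.es
    ]
    return graph

Under this in-place sketch, passing `graph` or `weighted_graph` to the
diagram calculator would be equivalent, which may explain their
interchangeable use above.
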
def main(args, logger):

    graphs = [ig.read(filename) for filename in args.FILES]
    labels = read_labels(args.labels)

    # Set the label to be uniform over all graphs in case no labels are
    # available. This essentially changes our iteration to degree-based
    # checks.
    for graph in graphs:
        if 'label' not in graph.vs.attributes():
            graph.vs['label'] = [0] * len(graph.vs)

    logger.info('Read {} graphs and {} labels'.format(len(graphs),
                                                      len(labels)))

    assert len(graphs) == len(labels)

    pwl_list = []
    for p in [1, 2]:
        pwl = PersistentWeisfeilerLehman(
            use_cycle_persistence=args.use_cycle_persistence,
            use_original_features=args.use_original_features,
            metric=args.metric,
            use_label_persistence=True,
            p=p)

        X, num_columns_per_iteration = pwl.transform(graphs,
                                                     args.num_iterations)
        pwl_list.append({'p': p, 'X': X})

        logger.info(
            f'Finished persistent Weisfeiler-Lehman transformation '
            f'for p={p}')
        logger.info('Obtained ({} x {}) feature matrix'.format(
            X.shape[0], X.shape[1]))

    if args.use_cycle_persistence:
        logger.info('Using cycle persistence')

    y = LabelEncoder().fit_transform(labels)

    np.random.seed(42)
    mean_accuracies = []

    params = [
        'balanced', 'num_iterations', 'filtration', 'use_cycle_persistence',
        'use_original_features', 'metric'
    ]
    cv_results = []
    entry = {}
    for param in params:
        entry[param] = args.__dict__[param]
    entry['dataset'] = dirname(args.FILES[0]).split('/')[1]
    for i in range(10):
        # Contains accuracy scores for each cross validation step; the
        # means of this list will be used later on.
        accuracy_scores = []
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
        for n, indices in enumerate(cv.split(X, y)):
            entry_fold = copy.copy(entry)
            train_index = indices[0]
            test_index = indices[1]
            y_train = y[train_index]
            y_test = y[test_index]

            # Override current full matrices
            for pwl_dict in pwl_list:

                scaler = StandardScaler()
                X_train = scaler.fit_transform(pwl_dict['X'][train_index])
                X_test = scaler.transform(pwl_dict['X'][test_index])

                scaler = MinMaxScaler()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

                pwl_dict['X_train'] = X_train
                pwl_dict['X_test'] = X_test

            pipeline = Pipeline(
                [('fs', FeatureSelector(num_columns_per_iteration)),
                 ('clf',
                  RandomForestClassifier(
                      class_weight='balanced' if args.balanced else None,
                      random_state=42,
                      n_jobs=4))], )

            grid_params = {
                'fs__num_iterations': np.arange(0, args.num_iterations + 1),
                'clf__n_estimators': [25, 50, 100],
            }

            clf, best_params = custom_grid_search_cv(pipeline, grid_params,
                                                     pwl_list, y_train)

            X_test = pwl_list[best_params['pwl_idx']]['X_test']
            y_pred = clf.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            accuracy_scores.append(acc)

            best_params['params']['p'] = best_params['pwl_idx'] + 1
            for param, param_val in best_params['params'].items():
                entry_fold[param] = param_val
                entry[param] = ''
            entry_fold['fold'] = n + 1
            entry_fold['it'] = i
            entry_fold['acc'] = acc * 100
            entry_fold['std'] = 0.0
            cv_results.append(entry_fold)

            logger.info('Best classifier for this fold: {}'.format(
                best_params['params']))

        mean_accuracies.append(np.mean(accuracy_scores))
        logger.info(
            '  - Mean 10-fold accuracy: {:2.2f} [running mean over all folds: {:2.2f}]'
            .format(mean_accuracies[-1] * 100,
                    np.mean(mean_accuracies) * 100))
    entry['fold'] = 'all'
    entry['it'] = 'all'
    entry['acc'] = np.mean(mean_accuracies) * 100
    entry['std'] = np.std(mean_accuracies) * 100
    cv_results.append(entry)
    logger.info('Accuracy: {:2.2f} +- {:2.2f}'.format(
        np.mean(mean_accuracies) * 100,
        np.std(mean_accuracies) * 100))

    if exists(args.result_file):
        with open(args.result_file, 'a') as f:
            pd.DataFrame(cv_results).to_csv(f, index=False, header=False)
    else:
        pd.DataFrame(cv_results).to_csv(args.result_file, index=False)