Example #1
0
def benchmark(posf, negf, minsup, topk):
    """
    Run a top-k confident subgraph mining benchmark on one data set.

    ``posf`` and ``negf`` are file names resolved relative to
    ``../statement/data/``. Both files are loaded into a single graph
    database and gSpan is run on a FrequentPositiveGraphs task built from
    ``minsup`` and ``topk``.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    data_dir = "../statement/data/"
    pos_path = data_dir + posf
    neg_path = data_dir + negf

    # Stop at the first input file that cannot be found (positive file first).
    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    # One database holds both classes; read_graphs hands back the ids of the
    # graphs it has just added.
    graph_database = GraphDatabase()
    pos_ids = graph_database.read_graphs(pos_path)
    neg_ids = graph_database.read_graphs(neg_path)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    task = FrequentPositiveGraphs(minsup, graph_database, [pos_ids, neg_ids],
                                  topk)
    gSpan(task).run()
Example #2
0
def task2(database_file_name_pos, database_file_name_neg, k, minsup, nfolds):
    """
    Cross-validate a pattern-based classifier on two graph files.

    Both graph files are loaded into one database. For every fold, the ids
    are split into positive/negative training and test subsets and handed to
    ``train_and_evaluate`` together with ``minsup`` and ``k``.

    With ``nfolds < 2`` the full data set is used as both training and test
    set (not a sound evaluation, but useful as a smoke test).

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Degenerate case: train and test on exactly the same graphs.
    if nfolds < 2:
        print('fold {}'.format(1))
        train_and_evaluate(minsup, graph_database,
                           [pos_ids, pos_ids, neg_ids, neg_ids], k)
        return

    pos_fold_size = len(pos_ids) // nfolds
    neg_fold_size = len(neg_ids) // nfolds
    for fold in range(nfolds):
        # Boundaries of the slice that serves as test set in this fold.
        pos_lo, pos_hi = fold * pos_fold_size, (fold + 1) * pos_fold_size
        neg_lo, neg_hi = fold * neg_fold_size, (fold + 1) * neg_fold_size

        # Order expected downstream: pos train, pos test, neg train, neg test.
        # Ids left over by the integer division never land in a test slice.
        subsets = [
            numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),
            pos_ids[pos_lo:pos_hi],
            numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),
            neg_ids[neg_lo:neg_hi],
        ]
        print('fold {}'.format(fold + 1))
        train_and_evaluate(minsup, graph_database, subsets, k)
def task2():
    """
    Command-line entry point: cross-validate a pattern-based classifier.

    Reads from ``sys.argv``:
        1. path to the positive class file
        2. path to the negative class file
        3. k (the top-k parameter forwarded to ``train_and_evaluate``)
        4. minimum frequency
        5. number of folds for the cross-validation

    Fold headers are written to ``./results/task2.txt``, and the same file
    handle is passed to ``train_and_evaluate``.

    With fewer than two folds the full data set is used as both training and
    test set (not a sound evaluation).

    If an input file is missing, a message is printed and the process exits
    through ``sys.exit()`` *before* the results file is opened, so an earlier
    results file is not truncated by a run that cannot start.
    """
    args = sys.argv
    database_file_name_pos = args[1]
    database_file_name_neg = args[2]
    k = int(args[3])
    min_frequency = int(args[4])
    nfolds = int(args[5])

    # Validate inputs first: opening the results file in 'w' mode truncates it.
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    with open('./results/task2.txt', 'w') as dataset:
        if nfolds < 2:
            subsets = [
                pos_ids,  # Positive training set
                pos_ids,  # Positive test set
                neg_ids,  # Negative training set
                neg_ids,  # Negative test set
            ]
            # The fold header goes to the results file, like in the k-fold branch.
            print('fold {}'.format(1), file=dataset)
            train_and_evaluate(min_frequency, graph_database, subsets, k, dataset)
            return

        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            pos_lo, pos_hi = i * pos_fold_size, (i + 1) * pos_fold_size
            neg_lo, neg_hi = i * neg_fold_size, (i + 1) * neg_fold_size
            # Current fold is the test set; everything else is training data.
            subsets = [
                numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),  # Positive training set
                pos_ids[pos_lo:pos_hi],  # Positive test set
                numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),  # Negative training set
                neg_ids[neg_lo:neg_hi],  # Negative test set
            ]
            print('fold {}'.format(i + 1), file=dataset)
            train_and_evaluate(min_frequency, graph_database, subsets, k, dataset)
Example #4
0
def task1(database_file_name_pos, database_file_name_neg, k, minsup):
    """
    Mine the top-k subgraph patterns and print them.

    Loads both graph files into one database, runs gSpan on a
    FrequentPositiveGraphs task and prints one line per entry of
    ``task.patterns`` as ``<pattern[2]> <confidence> <total support>``,
    where confidence is ``pattern[0]`` and total support is ``pattern[1]``.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    task = FrequentPositiveGraphs(graph_database, [pos_ids, neg_ids], minsup, k)
    gSpan(task).run()

    for entry in task.patterns:
        print('{} {} {}'.format(entry[2], entry[0], entry[1]))
Example #5
0
def benchmark(posf, negf, nf, minsup, top_K, classifiers):
    """
    Cross-validate classifiers on one benchmark data set and collect accuracies.

    ``posf`` and ``negf`` are file names resolved relative to
    ``../statement/data/``. The graphs are split into ``nf`` folds; for each
    fold ``tae`` is called with the four id subsets, ``minsup``, ``top_K`` and
    ``cl=classifiers``, and its ``(test, train)`` result pair is recorded.

    Returns a dict ``{'test': {fold: value}, 'train': {fold: value}}`` keyed
    by zero-based fold index.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    data_dir = "../statement/data/"
    pos_path = data_dir + posf
    neg_path = data_dir + negf

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(pos_path)
    neg_ids = graph_database.read_graphs(neg_path)

    pos_fold_size = len(pos_ids) // nf
    neg_fold_size = len(neg_ids) // nf

    test_scores = {}
    train_scores = {}
    for fold in range(nf):
        # Boundaries of the slice that serves as test set in this fold.
        pos_lo, pos_hi = fold * pos_fold_size, (fold + 1) * pos_fold_size
        neg_lo, neg_hi = fold * neg_fold_size, (fold + 1) * neg_fold_size

        # Order: positive train, positive test, negative train, negative test.
        subsets = [
            numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),
            pos_ids[pos_lo:pos_hi],
            numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),
            neg_ids[neg_lo:neg_hi],
        ]
        test_scores[fold], train_scores[fold] = tae(
            minsup, graph_database, subsets, top_K, cl=classifiers)

    return {'test': test_scores, 'train': train_scores}
Example #6
0
def example1():
    """
    Mine confident subgraph patterns and print them with their scores.

    A hard-wired debug switch selects where the settings come from. It is
    currently off, so the bundled ``data/molecules-small`` files are used with
    ``k = 5`` and ``minsup = 5``; when on, the two paths, ``k`` and ``minsup``
    are taken from ``sys.argv[1:5]``.

    After running gSpan on a ConfidencePositiveGraphs task, ``task.patterns``
    is walked key by key. Each key is read as ``(confidence, support)`` and
    each stored entry as a ``(pattern, extra)`` pair; the pattern line is
    printed followed by ``extra`` on its own line.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    read_settings_from_argv = False  # debug switch, deliberately off
    if read_settings_from_argv:
        cli = sys.argv
        database_file_name_pos = cli[1]  # path to positive class file
        database_file_name_neg = cli[2]  # path to negative class file
        k = int(cli[3])
        minsup = int(cli[4])
    else:
        database_file_name_pos = 'data/molecules-small.pos'
        database_file_name_neg = 'data/molecules-small.neg'
        k = 5
        minsup = 5

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    subsets = [pos_ids, neg_ids]
    print(subsets)

    task = ConfidencePositiveGraphs(k, minsup, graph_database, subsets)
    gSpan(task).run()

    for score in task.patterns.keys():
        for pattern, extra in task.patterns[score]:
            print('{} {} {}'.format(pattern, score[0], score[1]))
            print(extra)
def top_k():
    """
    Command-line entry point: print the k best (confidence, support) groups.

    Reads from ``sys.argv``:
        1. path to the positive class file
        2. path to the negative class file
        3. k
        4. minimum support

    After mining, the patterns are sorted by ``confidence`` then ``support``
    in descending order. Patterns sharing the same (confidence, support) pair
    form one group; patterns of the first k groups are printed as
    ``<code> <confidence> <support>``. When a further group is reached, a
    single line containing one space is printed and the output stops.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    cli = sys.argv
    database_file_name_pos = cli[1]
    database_file_name_neg = cli[2]
    groups_left = int(cli[3])
    minsup = int(cli[4])

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    task = FrequentPositiveGraphs(minsup, groups_left, graph_database,
                                  [pos_ids, neg_ids])
    gSpan(task).run()

    by_score = sorted(task.patterns,
                      key=attrgetter('confidence', 'support'),
                      reverse=True)

    # Score pair of the group currently being printed; -1 matches no real score.
    group_conf = -1
    group_supp = -1
    for found in by_score:
        confidence = found.confidence
        support = found.support
        dfs_code = found.code
        if not (confidence == group_conf and support == group_supp):
            # A new group starts here: consume one of the k slots.
            group_conf, group_supp = confidence, support
            groups_left -= 1
            if groups_left == -1:
                print(" ")
                break
        print('{} {} {}'.format(dfs_code, confidence, support))
def task4():
    """
    Command-line entry point: cross-validate the task-4 classifier.

    Reads from ``sys.argv``:
        1. path to the positive class file
        2. path to the negative class file
        5. number of folds for the cross-validation

    Arguments 3 and 4 are not read. Instead, the minimum frequency is set to
    one eighth of the total number of graphs (integer division) and ``k`` is
    set to that same value.

    Fold headers are written to ``./results/task4.txt``, and the same file
    handle is passed to ``train_and_evaluate_task4`` for every fold.

    With fewer than two folds the full data set is used as both training and
    test set (not a sound evaluation).

    If an input file is missing, a message is printed and the process exits
    through ``sys.exit()`` *before* the results file is opened, so an earlier
    results file is not truncated by a run that cannot start.
    """
    args = sys.argv
    database_file_name_pos = args[1]
    database_file_name_neg = args[2]
    nfolds = int(args[5])

    # Validate inputs first: opening the results file in 'w' mode truncates it.
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    min_frequency = (len(pos_ids) + len(neg_ids)) // 8
    k = min_frequency

    with open('./results/task4.txt', 'w') as dataset:
        if nfolds < 2:
            subsets = [
                pos_ids,  # Positive training set
                pos_ids,  # Positive test set
                neg_ids,  # Negative training set
                neg_ids,  # Negative test set
            ]
            print('fold {}'.format(1), file=dataset)
            # Same evaluator and output file as the k-fold branch, so the
            # single-fold results end up in the results file as well.
            train_and_evaluate_task4(min_frequency, graph_database, subsets, k, dataset)
            return

        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            pos_lo, pos_hi = i * pos_fold_size, (i + 1) * pos_fold_size
            neg_lo, neg_hi = i * neg_fold_size, (i + 1) * neg_fold_size
            # Current fold is the test set; everything else is training data.
            subsets = [
                numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),  # Positive training set
                pos_ids[pos_lo:pos_hi],  # Positive test set
                numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),  # Negative training set
                neg_ids[neg_lo:neg_hi],  # Negative test set
            ]
            print('fold {}'.format(i + 1), file=dataset)
            train_and_evaluate_task4(min_frequency, graph_database, subsets, k, dataset)
def task1():
    """
    Command-line entry point: mine the k most confident frequent sub-graphs.

    Reads from ``sys.argv``:
        1. path to the positive class file
        2. path to the negative class file
        3. k
        4. minimum frequency

    Runs gSpan on a K_MostConfidentAndFrequentPositiveSubGraphs task and
    writes the selected patterns to ``./results/task1.txt``, one per line as
    ``<pattern>_<confidence>_<frequency>`` with no trailing newline.

    Patterns are emitted by decreasing confidence level. A pattern is kept if
    its confidence is above ``task.minConfidence``, or equal to it with a
    frequency of at least
    ``task.orderedListOfFrequencyValuesForMinConfidence[0]``.

    If either input file is missing, a message is printed and the process
    exits through ``sys.exit()``.
    """
    args = sys.argv
    database_file_name_pos = args[1]
    database_file_name_neg = args[2]
    k = int(args[3])  # was read from args[4], i.e. the minimum frequency slot
    min_frequency = int(args[4])

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    task = K_MostConfidentAndFrequentPositiveSubGraphs(
        min_frequency, graph_database, [pos_ids, neg_ids], k, False)

    gSpan(task).run()  # Running gSpan

    lines = []
    for confidence_level in reversed(task.orderedListOfConfidenceValues):
        for pattern, _gid_subsets, confidence, frequency, _, _, _ in task.patterns:
            if confidence != confidence_level:
                continue
            if confidence > task.minConfidence:
                keep = True
            elif confidence == task.minConfidence:
                keep = frequency >= task.orderedListOfFrequencyValuesForMinConfidence[0]
            else:
                keep = False
            if keep:
                lines.append('{}_{}_{}'.format(pattern, confidence, frequency))

    # The file is opened only once the result is complete, so a failure
    # during selection does not leave a truncated results file behind.
    with open('./results/task1.txt', 'w') as dataset:
        print('\n'.join(lines), end='', file=dataset)
Example #10
0
def example1():
    """
    Mine frequent subgraphs and print those with the k best scores.

    Relies on names defined outside this function:
    ``database_file_name_pos``, ``database_file_name_neg``, ``minsup``
    and ``k``.

    For every mined pattern, the confidence is the positive support divided
    by the total support (positive + negative). Distinct
    ``(confidence, total support)`` pairs are ranked in descending order, and
    every pattern whose pair is among the first ``k`` ranks is printed once as
    ``<pattern> <confidence> <total support>``, best score first.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    subsets = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(minsup, graph_database, subsets)

    gSpan(task).run()  # Running gSpan

    # Score every pattern first; ranking needs the complete list.
    result = []
    for pattern, gid_subsets in task.patterns:
        pos_support = len(gid_subsets[0])
        neg_support = len(gid_subsets[1])
        total_support = pos_support + neg_support
        result.append((pattern, pos_support / total_support, total_support))

    # Rank the distinct (confidence, support) pairs, best first. Sorting on
    # the whole pair makes ties on confidence resolve by support instead of
    # by set iteration order.
    distinct_scores = sorted({(conf, supp) for _, conf, supp in result},
                             reverse=True)
    rank_of = {score: rank for rank, score in enumerate(distinct_scores)}

    ranked = [entry for entry in result
              if rank_of[(entry[1], entry[2])] < k]
    ranked.sort(key=lambda entry: (entry[1], entry[2]), reverse=True)

    # Printed once, after the loop, rather than once per mined pattern.
    for pattern, confidence, total_support in ranked:
        print('{} {} {}'.format(pattern, confidence, total_support))
def topK(database_file_name_pos, database_file_name_neg, k, minsup, nfolds):
    """
    Cross-validate a top-k pattern classifier and return its mean accuracy.

    Both graph files are loaded into one database. For every fold the ids are
    split into positive/negative training and test subsets and passed to
    ``train_and_evaluate`` with ``minsup`` and ``k``; the value it returns is
    recorded as that fold's accuracy.

    With ``nfolds < 2`` a single evaluation is run on the full data set, used
    as both training and test set (not a sound evaluation), and its accuracy
    is what gets returned.

    Returns the mean of the recorded accuracies (``numpy.mean``).

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids,  # Negative test set
        ]
        print('fold {}'.format(1))
        # Exactly one slot, whatever nfolds is (0 or negative included), and
        # the measured accuracy is actually stored in it.
        accuracy = numpy.zeros(1)
        accuracy[0] = train_and_evaluate(minsup, graph_database, subsets, k)
        return numpy.mean(accuracy)

    accuracy = numpy.zeros(nfolds)
    pos_fold_size = len(pos_ids) // nfolds
    neg_fold_size = len(neg_ids) // nfolds
    for i in range(nfolds):
        pos_lo, pos_hi = i * pos_fold_size, (i + 1) * pos_fold_size
        neg_lo, neg_hi = i * neg_fold_size, (i + 1) * neg_fold_size
        # Current fold is the test set; everything else is training data.
        subsets = [
            numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),  # Positive training set
            pos_ids[pos_lo:pos_hi],  # Positive test set
            numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),  # Negative training set
            neg_ids[neg_lo:neg_hi],  # Negative test set
        ]
        accuracy[i] = train_and_evaluate(minsup, graph_database, subsets, k)
    return numpy.mean(accuracy)
def example1():
    """
    Command-line entry point: print frequent subgraphs with their positive support.

    Reads from ``sys.argv``:
        1. path to the positive class file
        2. path to the negative class file
        3. minimum support

    After running gSpan on a FrequentPositiveGraphs task, every entry of
    ``task.patterns`` is printed as ``<pattern> <positive support>``, where
    the positive support is the size of the entry's first id subset.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    cli = sys.argv
    database_file_name_pos = cli[1]
    database_file_name_neg = cli[2]
    minsup = int(cli[3])

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    task = FrequentPositiveGraphs(minsup, graph_database, [pos_ids, neg_ids])
    gSpan(task).run()

    for pattern, gid_subsets in task.patterns:
        # Only the positive class is reported here.
        print('{} {}'.format(pattern, len(gid_subsets[0])))
def find_subgraphs():
    """
    Command-line entry point: mine subgraph patterns and print them with scores.

    Positional command-line arguments, parsed with argparse:
    ``positive_file``, ``negative_file``, ``top_k`` (int), ``min_supp`` (int).

    After running gSpan on a FrequentPositiveGraphs task, every entry of
    ``task.patterns`` is unpacked as ``((confidence, frequency), dfs_code)``
    and printed as ``<dfs_code> <confidence> <frequency>``.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Find subgraphs")
    for arg_name, arg_type in (
        ("positive_file", str),
        ("negative_file", str),
        ("top_k", int),
        ("min_supp", int),
    ):
        parser.add_argument(arg_name, type=arg_type)
    options = parser.parse_args()

    for path in (options.positive_file, options.negative_file):
        if not os.path.exists(path):
            print("{} does not exist.".format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(options.positive_file)
    neg_ids = graph_database.read_graphs(options.negative_file)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    task = FrequentPositiveGraphs(options.min_supp, graph_database,
                                  [pos_ids, neg_ids], options.top_k)
    gSpan(task).run()

    for scores, dfs_code in task.patterns:
        confidence, frequency = scores
        print("{} {} {}".format(dfs_code, confidence, frequency))
Example #14
0
def example1():
    """
    Command-line entry point: mine the top-K confident subgraphs and print them.

    Reads from ``sys.argv``:
        1. path to the positive class file
        2. path to the negative class file
        3. K
        4. minimum total frequency

    After running gSpan on a FrequentPositiveGraphs task, every entry of
    ``task.patterns`` is printed as
    ``<entry[2]> <confidence> <total support>``, where confidence is
    ``entry[0]`` and total support is ``entry[1]``.

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    cli = sys.argv
    database_file_name_pos = cli[1]
    database_file_name_neg = cli[2]
    top_K = int(cli[3])
    total_min_freq = int(cli[4])

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Subset 0 = positive graph ids, subset 1 = negative graph ids.
    task = FrequentPositiveGraphs(total_min_freq, graph_database,
                                  [pos_ids, neg_ids], top_K)
    gSpan(task).run()

    for entry in task.patterns:
        print('{} {} {}'.format(entry[2], entry[0], entry[1]))
Example #15
0
def example2(posf=None, negf=None, nf=5):
    """
    Cross-validate a pattern-based classifier with automatically chosen settings.

    When both ``posf`` and ``negf`` are given they are used as the input
    paths and ``nf`` as the number of folds. Otherwise the two paths and the
    fold count are taken from ``sys.argv[1:4]``.

    The minimum support is derived from the data:
    ``max(5, len(pos_ids) * (nfolds - 1) / 2 / nfolds)``, and the number of
    patterns is fixed at 50. For each fold the four id subsets are passed to
    ``train_and_evaluate``.

    With fewer than two folds the full data set is used as both training and
    test set (not a sound evaluation).

    If either file is missing, a message is printed and the process exits
    through ``sys.exit()``.
    """
    if posf is not None and negf is not None:
        database_file_name_pos = posf
        database_file_name_neg = negf
        nfolds = nf
    else:
        cli = sys.argv
        database_file_name_pos = cli[1]  # path to positive class file
        database_file_name_neg = cli[2]  # path to negative class file
        nfolds = int(cli[3])  # number of folds

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    top_K = 50
    minsup = max(5, len(pos_ids) * (nfolds - 1) / 2 / nfolds)

    # Degenerate case: train and test on exactly the same graphs.
    if nfolds < 2:
        print('fold {}'.format(1))
        train_and_evaluate(minsup, graph_database,
                           [pos_ids, pos_ids, neg_ids, neg_ids], top_K)
        return

    pos_fold_size = len(pos_ids) // nfolds
    neg_fold_size = len(neg_ids) // nfolds
    for fold in range(nfolds):
        # Boundaries of the slice that serves as test set in this fold.
        pos_lo, pos_hi = fold * pos_fold_size, (fold + 1) * pos_fold_size
        neg_lo, neg_hi = fold * neg_fold_size, (fold + 1) * neg_fold_size

        # Order: positive train, positive test, negative train, negative test.
        subsets = [
            numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),
            pos_ids[pos_lo:pos_hi],
            numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),
            neg_ids[neg_lo:neg_hi],
        ]
        print('fold {}'.format(fold + 1))
        train_and_evaluate(minsup, graph_database, subsets, top_K)
Example #16
0
def example3():
    """
    Command-line entry point that runs Sequential_Covering on a positive and a negative graph file.

    Expects five command-line arguments, in this order: path to the positive class file, path to the negative
    class file, k, minsup and nfolds. A sample set of values is 'data/molecules-small.pos',
    'data/molecules-small.neg', 5, 5 and 4.

    Both files are read into one GraphDatabase; the ids returned for each class are split into training and test
    subsets and Sequential_Covering is called once per fold, after printing the fold number.
    With fewer than two folds, the whole sets are used both as training and as test set (note this is not an
    accurate way to evaluate the performances!).

    Prints a message and leaves through sys.exit() when arguments are missing or when a file does not exist.
    """
    args = sys.argv
    if len(args) < 6:
        # Without this guard a missing argument surfaces as a bare IndexError
        print('expected 5 arguments: <positive_file> <negative_file> <k> <minsup> <nfolds>')
        sys.exit()

    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    k = int(args[3])  # Third parameter: k, forwarded unchanged to Sequential_Covering
    minsup = int(args[4])  # Fourth parameter: minimum support
    nfolds = int(args[5])  # Fifth parameter: number of folds to use in the k-fold cross-validation.

    # Positive file is checked first, so its message wins when both files are missing
    for file_name in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(file_name):
            print('{} does not exist.'.format(file_name))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs of each class, adding them to the database and getting their ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # If less than two folds: using the same set as training and test set
    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids  # Negative test set
        ]
        print('fold {}'.format(1))
        # Same algorithm as the k-fold branch; the earlier call to train_and_evaluate ignored k
        Sequential_Covering(k, minsup, graph_database, subsets)
        return

    # Otherwise: performs k-fold cross-validation.
    # Fold sizes are floored, so ids left over at the end of a class are never part of a test fold;
    # they end up in the training set of every fold.
    pos_fold_size = len(pos_ids) // nfolds
    neg_fold_size = len(neg_ids) // nfolds
    for i in range(nfolds):
        # Use fold i as test set, the others as training set for each class.
        # Resulting order: positive training, positive test, negative training, negative test.
        subsets = []
        for ids, fold_size in ((pos_ids, pos_fold_size), (neg_ids, neg_fold_size)):
            start = i * fold_size
            stop = (i + 1) * fold_size
            subsets.append(numpy.concatenate((ids[:start], ids[stop:])))  # Training set
            subsets.append(ids[start:stop])  # Test set
        print('fold {}'.format(i + 1))
        Sequential_Covering(k, minsup, graph_database, subsets)
def train_evaluate_decision_tree():
    """
    Command-line entry point: parses the arguments, loads both graph files and calls train_and_evaluate per fold.

    Positional arguments, in order: positive_file, negative_file, top_k, min_supp and n_folds; the optional
    -b/--benchmark flag is not read here, the parsed namespace is simply handed to train_and_evaluate.

    Both files are read into one GraphDatabase; the ids returned for each class are split into training and test
    subsets and train_and_evaluate is called once per fold, after printing the fold number.
    With fewer than two folds, the whole sets are used both as training and as test set (note this is not an
    accurate way to evaluate the performances!).

    Prints a message and leaves through sys.exit() when one of the files does not exist.

    NOTE(review): the pattern mining and the classifier live in train_and_evaluate, which is not visible from
    here. The function name suggests a decision tree while the previous docstring mentioned a naive Bayes
    classifier - confirm which one is actually used.
    """
    from argparse import ArgumentParser

    # The first positional parameter of ArgumentParser is `prog`; passing the text positionally
    # replaced the program name in the usage line, so it is given as the description instead.
    parser = ArgumentParser(description="Find subgraphs")
    parser.add_argument("positive_file", type=str)
    parser.add_argument("negative_file", type=str)
    parser.add_argument("top_k", type=int)
    parser.add_argument("min_supp", type=int)
    parser.add_argument("n_folds", type=int)
    parser.add_argument("-b", "--benchmark", action="store_true")
    args = parser.parse_args()

    # Positive file is checked first, so its message wins when both files are missing
    for file_name in (args.positive_file, args.negative_file):
        if not os.path.exists(file_name):
            print("{} does not exist.".format(file_name))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs of each class, adding them to the database and getting their ids
    pos_ids = graph_database.read_graphs(args.positive_file)
    neg_ids = graph_database.read_graphs(args.negative_file)

    # If less than two folds: using the same set as training and test set
    if args.n_folds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids,  # Negative test set
        ]
        print("fold {}".format(1))
        train_and_evaluate(args.min_supp, graph_database, subsets, args.top_k, args)
        return

    # Otherwise: performs k-fold cross-validation.
    # Fold sizes are floored, so ids left over at the end of a class are never part of a test fold;
    # they end up in the training set of every fold.
    pos_fold_size = len(pos_ids) // args.n_folds
    neg_fold_size = len(neg_ids) // args.n_folds
    for i in range(args.n_folds):
        # Use fold i as test set, the others as training set for each class.
        # Resulting order: positive training, positive test, negative training, negative test.
        subsets = []
        for ids, fold_size in ((pos_ids, pos_fold_size), (neg_ids, neg_fold_size)):
            start = i * fold_size
            stop = (i + 1) * fold_size
            subsets.append(numpy.concatenate((ids[:start], ids[stop:])))  # Training set
            subsets.append(ids[start:stop])  # Test set
        print("fold {}".format(i + 1))
        train_and_evaluate(args.min_supp, graph_database, subsets, args.top_k, args)