def benchmark(posf, negf, minsup, topk, prefix="../statement/data/"):
    """
    Runs gSpan on the given positive and negative graph files with a
    FrequentPositiveGraphs task built from minsup and topk.

    Nothing is printed or returned on success: the mined patterns stay on the
    task object, so this function is only useful for exercising/timing the
    mining step.

    :param posf: name of the positive class file, appended to prefix
    :param negf: name of the negative class file, appended to prefix
    :param minsup: minimum support handed to the mining task
    :param topk: k handed to the mining task
    :param prefix: string prepended verbatim to posf and negf (it must end
        with a path separator); defaults to the historical data directory
    :raises SystemExit: when one of the two files does not exist
    """
    database_file_name_pos = prefix + posf
    database_file_name_neg = prefix + negf

    # Positive file is checked first, so it is the one reported if both are missing.
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # The ids for the positive and negative labelled graphs in the database
    subsets = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(minsup, graph_database, subsets, topk)
    gSpan(task).run()
def task2(database_file_name_pos, database_file_name_neg, k, minsup, nfolds):
    """
    Mines patterns with gSpan and evaluates a classifier through
    train_and_evaluate, once per cross-validation fold.

    With fewer than two folds the complete positive/negative sets are used
    both for training and for testing (not a sound performance estimate).
    Otherwise fold i of each class is the test set and the remaining graphs
    of that class form the training set. Fold sizes use integer division, so
    the last len % nfolds graphs of a class are never part of a test set.

    Exits through sys.exit() when one of the input files is missing.
    """
    for candidate in (database_file_name_pos, database_file_name_neg):
        if os.path.exists(candidate):
            continue
        print('{} does not exist.'.format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    if nfolds < 2:
        # Same graphs on both sides: [pos train, pos test, neg train, neg test]
        whole_sets = [pos_ids, pos_ids, neg_ids, neg_ids]
        print('fold {}'.format(1))
        train_and_evaluate(minsup, graph_database, whole_sets, k)
        return

    pos_fold_size = len(pos_ids) // nfolds
    neg_fold_size = len(neg_ids) // nfolds
    for fold in range(nfolds):
        pos_lo, pos_hi = fold * pos_fold_size, (fold + 1) * pos_fold_size
        neg_lo, neg_hi = fold * neg_fold_size, (fold + 1) * neg_fold_size
        subsets = [
            numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),  # Positive training set
            pos_ids[pos_lo:pos_hi],  # Positive test set
            numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),  # Negative training set
            neg_ids[neg_lo:neg_hi],  # Negative test set
        ]
        print('fold {}'.format(fold + 1))
        train_and_evaluate(minsup, graph_database, subsets, k)
def task2():
    """
    Command-line entry point: mines patterns with gSpan, then trains and
    evaluates a classifier through train_and_evaluate for every
    cross-validation fold, writing the report to './results/task2.txt'.

    Expected sys.argv layout:
        1: path to the positive class file
        2: path to the negative class file
        3: k (forwarded to train_and_evaluate as its fourth argument)
        4: minimum frequency / support
        5: number of folds

    With fewer than two folds the full sets serve as both training and test
    set (not an accurate way to evaluate the performances). Fold sizes use
    integer division, so the trailing len % nfolds graphs of each class are
    never used as test data.

    Exits through sys.exit() when an input file is missing.
    """
    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    k = int(args[3])  # Third parameter: k
    minFrequency = int(args[4])  # Fourth parameter: minimum frequency
    nfolds = int(args[5])  # Fifth parameter: number of folds

    # Validate and load the inputs BEFORE opening the report, so a bad path
    # does not truncate the results of a previous run.
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    with open('./results/task2.txt', 'w') as dataset:
        if nfolds < 2:
            subsets = [
                pos_ids,  # Positive training set
                pos_ids,  # Positive test set
                neg_ids,  # Negative training set
                neg_ids,  # Negative test set
            ]
            # The fold header goes to the report, like in the k-fold branch.
            print('fold {}'.format(1), file=dataset)
            train_and_evaluate(minFrequency, graph_database, subsets, k, dataset)
        else:
            pos_fold_size = len(pos_ids) // nfolds
            neg_fold_size = len(neg_ids) // nfolds
            for i in range(nfolds):
                pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
                neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
                # Fold i is the test set, everything else is the training set.
                subsets = [
                    numpy.concatenate((pos_ids[:pos_start], pos_ids[pos_stop:])),  # Positive training set
                    pos_ids[pos_start:pos_stop],  # Positive test set
                    numpy.concatenate((neg_ids[:neg_start], neg_ids[neg_stop:])),  # Negative training set
                    neg_ids[neg_start:neg_stop],  # Negative test set
                ]
                print('fold {}'.format(i + 1), file=dataset)
                train_and_evaluate(minFrequency, graph_database, subsets, k, dataset)
def task1(database_file_name_pos, database_file_name_neg, k, minsup):
    """
    Loads both graph files, runs gSpan with a FrequentPositiveGraphs task and
    prints one line per entry of task.patterns.

    Each printed line is '<entry[2]> <entry[0]> <entry[1]>'; the code treats
    entry[0] as the confidence and entry[1] as the total support.

    Exits through sys.exit() when one of the input files is missing.
    """
    for candidate in (database_file_name_pos, database_file_name_neg):
        if os.path.exists(candidate):
            continue
        print('{} does not exist.'.format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Ids of the positive and negative labelled graphs, in that order
    labelled_ids = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(graph_database, labelled_ids, minsup, k)
    gSpan(task).run()

    for entry in task.patterns:
        confidence = entry[0]
        total_support = entry[1]
        print('{} {} {}'.format(entry[2], confidence, total_support))
def benchmark(posf, negf, nf, minsup, top_K, classifiers, prefix="../statement/data/"):
    """
    Runs a k-fold cross-validation: for every fold, calls tae() on the
    training/test split and records the two accuracies it returns.

    Fold i of each class is the test set and the remaining graphs of that
    class form the training set. Fold sizes use integer division, so the
    trailing len % nf graphs of each class never end up in a test set.

    :param posf: name of the positive class file, appended to prefix
    :param negf: name of the negative class file, appended to prefix
    :param nf: number of folds
    :param minsup: minimum support forwarded to tae()
    :param top_K: k forwarded to tae()
    :param classifiers: forwarded to tae() as its cl keyword argument
    :param prefix: string prepended verbatim to posf and negf (it must end
        with a path separator); defaults to the historical data directory
    :return: {'test': {fold: accuracy}, 'train': {fold: accuracy}} with
        0-based fold indices
    :raises SystemExit: when one of the two files does not exist
    """
    database_file_name_pos = prefix + posf
    database_file_name_neg = prefix + negf
    nfolds = nf

    # Positive file is checked first, so it is the one reported if both are missing.
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    pos_fold_size = len(pos_ids) // nfolds
    neg_fold_size = len(neg_ids) // nfolds

    accuracy = {'test': {}, 'train': {}}
    for i in range(nfolds):
        pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
        neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
        subsets = [
            numpy.concatenate((pos_ids[:pos_start], pos_ids[pos_stop:])),  # Positive training set
            pos_ids[pos_start:pos_stop],  # Positive test set
            numpy.concatenate((neg_ids[:neg_start], neg_ids[neg_stop:])),  # Negative training set
            neg_ids[neg_start:neg_stop],  # Negative test set
        ]
        testacc, trainacc = tae(minsup, graph_database, subsets, top_K, cl=classifiers)
        accuracy['test'][i] = testacc
        accuracy['train'][i] = trainacc
    return accuracy
def example1():
    """
    Runs gSpan with a ConfidencePositiveGraphs task and prints every mined
    pattern as '<pattern> <confidence> <support>'.

    Inputs come from the command line when all four parameters are supplied
    (positive file, negative file, k, minimum support); otherwise the bundled
    small molecules dataset is used with k = 5 and minsup = 5. This replaces
    a hard-coded debug switch that made the command-line branch unreachable.

    task.patterns is read as a mapping whose keys are indexed as
    (confidence, support) and whose values are iterables of 2-item entries;
    only the first item (the pattern) is printed.

    Exits through sys.exit() when an input file is missing.
    """
    args = sys.argv
    if len(args) >= 5:
        database_file_name_pos = args[1]  # First parameter: path to positive class file
        database_file_name_neg = args[2]  # Second parameter: path to negative class file
        k = int(args[3])  # Third parameter: k
        minsup = int(args[4])  # Fourth parameter: minimum support
    else:
        database_file_name_pos = 'data/molecules-small.pos'
        database_file_name_neg = 'data/molecules-small.neg'
        k = 5
        minsup = 5

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # The ids for the positive and negative labelled graphs in the database
    subsets = [pos_ids, neg_ids]
    task = ConfidencePositiveGraphs(k, minsup, graph_database, subsets)
    gSpan(task).run()

    for key in task.patterns.keys():
        confidence = key[0]
        support = key[1]
        # The second item of each entry was only used by a leftover debug print.
        for pattern, _ in task.patterns[key]:
            print('{} {} {}'.format(pattern, confidence, support))
def top_k():
    """
    Command-line entry point: runs gSpan with a FrequentPositiveGraphs task
    and prints the patterns belonging to the k best distinct
    (confidence, support) levels as '<code> <confidence> <support>'.

    Expected sys.argv layout: positive file, negative file, k, minimum
    support. Patterns are sorted by confidence, then support, both
    descending. When more than k distinct levels exist, a line holding a
    single space is printed and the output stops.

    Exits through sys.exit() when an input file is missing.
    """
    cli = sys.argv
    database_file_name_pos = cli[1]  # First parameter: path to positive class file
    database_file_name_neg = cli[2]  # Second parameter: path to negative class file
    k = int(cli[3])  # Third parameter: k
    minsup = int(cli[4])  # Fourth parameter: minimum support

    for candidate in (database_file_name_pos, database_file_name_neg):
        if os.path.exists(candidate):
            continue
        print('{} does not exist.'.format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    labelled_ids = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(minsup, k, graph_database, labelled_ids)
    gSpan(task).run()

    ordered = sorted(task.patterns,
                     key=attrgetter('confidence', 'support'),
                     reverse=True)

    levels_left = k
    last_conf = -1
    last_supp = -1
    for entry in ordered:
        conf = entry.confidence
        supp = entry.support
        code = entry.code
        entering_new_level = conf != last_conf or supp != last_supp
        if entering_new_level:
            last_conf, last_supp = conf, supp
            levels_left -= 1
            if levels_left == -1:
                # One level too many: emit the terminator line and stop.
                print(" ")
                break
        print('{} {} {}'.format(code, conf, supp))
def task4():
    """
    Command-line entry point: runs train_and_evaluate_task4 for every
    cross-validation fold and writes the report to './results/task4.txt'.

    Expected sys.argv layout:
        1: path to the positive class file
        2: path to the negative class file
        5: number of folds (positions 3 and 4 are not read by this function)

    The minimum frequency is derived from the data as one eighth (integer
    division) of the total number of graphs, and k is set to the same value.

    With fewer than two folds the full sets serve as both training and test
    set (not an accurate way to evaluate the performances). Fold sizes use
    integer division, so the trailing len % nfolds graphs of each class are
    never used as test data.

    Exits through sys.exit() when an input file is missing.
    """
    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    nfolds = int(args[5])  # Fifth parameter: number of folds

    # Validate and load the inputs BEFORE opening the report, so a bad path
    # does not truncate the results of a previous run.
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    minFrequency = (len(pos_ids) + len(neg_ids)) // 8
    k = minFrequency

    with open('./results/task4.txt', 'w') as dataset:
        if nfolds < 2:
            subsets = [
                pos_ids,  # Positive training set
                pos_ids,  # Positive test set
                neg_ids,  # Negative training set
                neg_ids,  # Negative test set
            ]
            print('fold {}'.format(1), file=dataset)
            # Same evaluation routine (and report file) as the k-fold branch.
            train_and_evaluate_task4(minFrequency, graph_database, subsets, k, dataset)
        else:
            pos_fold_size = len(pos_ids) // nfolds
            neg_fold_size = len(neg_ids) // nfolds
            for i in range(nfolds):
                pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
                neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
                # Fold i is the test set, everything else is the training set.
                subsets = [
                    numpy.concatenate((pos_ids[:pos_start], pos_ids[pos_stop:])),  # Positive training set
                    pos_ids[pos_start:pos_stop],  # Positive test set
                    numpy.concatenate((neg_ids[:neg_start], neg_ids[neg_stop:])),  # Negative training set
                    neg_ids[neg_start:neg_stop],  # Negative test set
                ]
                print('fold {}'.format(i + 1), file=dataset)
                train_and_evaluate_task4(minFrequency, graph_database, subsets, k, dataset)
def task1():
    """
    Command-line entry point: runs gSpan with a
    K_MostConfidentAndFrequentPositiveSubGraphs task and writes the selected
    patterns to './results/task1.txt', one '<pattern>_<confidence>_<frequency>'
    entry per line with no trailing newline.

    Expected sys.argv layout:
        1: path to the positive class file
        2: path to the negative class file
        3: k
        4: minimum frequency

    Patterns are emitted by decreasing confidence level (the reverse of
    task.orderedListOfConfidenceValues). A pattern is kept when its
    confidence is above task.minConfidence, or equal to it with a frequency
    of at least task.orderedListOfFrequencyValuesForMinConfidence[0].

    Exits through sys.exit() when an input file is missing.
    """
    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    k = int(args[3])  # Third parameter: k (was wrongly read from args[4])
    minFrequency = int(args[4])  # Fourth parameter: minimum frequency

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    task = K_MostConfidentAndFrequentPositiveSubGraphs(
        minFrequency, graph_database, [pos_ids, neg_ids], k, False)
    gSpan(task).run()

    lines = []
    for confidenceLevel in reversed(task.orderedListOfConfidenceValues):
        for pattern, _, confidence, frequency, _, _, _ in task.patterns:
            if confidence != confidenceLevel:
                continue
            # The frequency threshold is only consulted on the boundary level.
            keep = confidence > task.minConfidence or (
                confidence == task.minConfidence
                and frequency >= task.orderedListOfFrequencyValuesForMinConfidence[0]
            )
            if keep:
                lines.append('{}_{}_{}'.format(pattern, confidence, frequency))

    with open('./results/task1.txt', 'w') as dataset:
        print('\n'.join(lines), end='', file=dataset)
def example1():
    """
    Runs gSpan with a FrequentPositiveGraphs task and prints the patterns
    whose (confidence, total support) pair is among the k best distinct
    pairs, as '<pattern> <confidence> <total support>'.

    This function takes no arguments: database_file_name_pos,
    database_file_name_neg, minsup and k are read from the enclosing
    (module-level) scope and must be defined before it is called.

    Confidence is pos_support / (pos_support + neg_support). Distinct pairs
    are ranked by confidence, then total support, both descending, so the
    top-k cut inside a confidence tie is deterministic and favours the
    higher support. Output follows the same order; patterns sharing a pair
    keep their mining order.

    Exits through sys.exit() when an input file is missing.
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # The ids for the positive and negative labelled graphs in the database
    subsets = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(minsup, graph_database, subsets)
    gSpan(task).run()

    result = []
    for pattern, gid_subsets in task.patterns:
        pos_support = len(gid_subsets[0])
        neg_support = len(gid_subsets[1])
        total_support = pos_support + neg_support
        confidence = pos_support / total_support
        result.append((pattern, confidence, total_support))

    # Rank every distinct (confidence, support) pair once; a dict lookup
    # replaces the former quadratic list.index() scan.
    distinct_pairs = sorted({(conf, supp) for _, conf, supp in result}, reverse=True)
    rank_of = {pair: rank for rank, pair in enumerate(distinct_pairs)}

    ranked = [entry for entry in result if rank_of[(entry[1], entry[2])] < k]
    ranked.sort(key=lambda entry: (entry[1], entry[2]), reverse=True)

    for pattern, confidence, total_support in ranked:
        print('{} {} {}'.format(pattern, confidence, total_support))
def topK(database_file_name_pos, database_file_name_neg, k, minsup, nfolds):
    """
    Cross-validates train_and_evaluate and returns the mean of the values it
    returns (one per fold).

    With fewer than two folds the full sets serve as both training and test
    set (not an accurate way to evaluate the performances), 'fold 1' is
    printed, and the single accuracy obtained is returned. Otherwise fold i
    of each class is the test set and the remaining graphs of that class
    form the training set. Fold sizes use integer division, so the trailing
    len % nfolds graphs of each class are never used as test data.

    :param database_file_name_pos: path to the positive class file
    :param database_file_name_neg: path to the negative class file
    :param k: forwarded to train_and_evaluate as its fourth argument
    :param minsup: minimum support forwarded to train_and_evaluate
    :param nfolds: number of folds
    :return: numpy.mean over the per-fold accuracies
    :raises SystemExit: when one of the two files does not exist
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    accuracies = []
    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids,  # Negative test set
        ]
        print('fold {}'.format(1))
        # Pass k and keep the result: previously k was dropped and the
        # accuracy discarded, so the function returned the mean of zeros.
        accuracies.append(train_and_evaluate(minsup, graph_database, subsets, k))
    else:
        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
            neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
            subsets = [
                numpy.concatenate((pos_ids[:pos_start], pos_ids[pos_stop:])),  # Positive training set
                pos_ids[pos_start:pos_stop],  # Positive test set
                numpy.concatenate((neg_ids[:neg_start], neg_ids[neg_stop:])),  # Negative training set
                neg_ids[neg_start:neg_stop],  # Negative test set
            ]
            accuracies.append(train_and_evaluate(minsup, graph_database, subsets, k))
    return numpy.mean(accuracies)
def example1():
    """
    Command-line entry point: runs gSpan with a FrequentPositiveGraphs task
    and prints every mined pattern followed by its positive support, i.e.
    the number of ids in the first subset attached to the pattern.

    Expected sys.argv layout: positive file, negative file, minimum support.

    Exits through sys.exit() when an input file is missing.
    """
    cli = sys.argv
    database_file_name_pos = cli[1]  # First parameter: path to positive class file
    database_file_name_neg = cli[2]  # Second parameter: path to negative class file
    minsup = int(cli[3])  # Third parameter: minimum support

    for candidate in (database_file_name_pos, database_file_name_neg):
        if os.path.exists(candidate):
            continue
        print('{} does not exist.'.format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Ids of the positive and negative labelled graphs, in that order
    labelled_ids = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(minsup, graph_database, labelled_ids)
    gSpan(task).run()

    for pattern, gid_subsets in task.patterns:
        # Still to be replaced by the confidence and support on both classes
        positive_hits = len(gid_subsets[0])
        print('{} {}'.format(pattern, positive_hits))
def find_subgraphs():
    """
    Command-line entry point: parses positive_file, negative_file, top_k and
    min_supp, runs gSpan with a FrequentPositiveGraphs task and prints each
    entry of task.patterns as '<dfs_code> <confidence> <frequency>'.

    Exits through sys.exit() when an input file is missing.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Find subgraphs")
    positionals = (
        ("positive_file", str),
        ("negative_file", str),
        ("top_k", int),
        ("min_supp", int),
    )
    for arg_name, arg_type in positionals:
        parser.add_argument(arg_name, type=arg_type)
    args = parser.parse_args()

    for candidate in (args.positive_file, args.negative_file):
        if os.path.exists(candidate):
            continue
        print("{} does not exist.".format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(args.positive_file)
    neg_ids = graph_database.read_graphs(args.negative_file)

    # Ids of the positive and negative labelled graphs, in that order
    labelled_ids = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(args.min_supp, graph_database, labelled_ids, args.top_k)
    gSpan(task).run()

    # Every entry unpacks as ((confidence, frequency), dfs_code)
    for score, dfs_code in task.patterns:
        confidence, frequency = score
        print("{} {} {}".format(dfs_code, confidence, frequency))
def example1():
    """
    Command-line entry point: runs gSpan with a FrequentPositiveGraphs task
    and prints one line per entry of task.patterns.

    Expected sys.argv layout: positive file, negative file, k, minimum total
    frequency. Each printed line is '<entry[2]> <entry[0]> <entry[1]>'; the
    code treats entry[0] as the confidence and entry[1] as the total support.

    Exits through sys.exit() when an input file is missing.
    """
    cli = sys.argv
    database_file_name_pos = cli[1]  # First parameter: path to positive class file
    database_file_name_neg = cli[2]  # Second parameter: path to negative class file
    top_K = int(cli[3])  # Third parameter: k
    total_min_freq = int(cli[4])  # Fourth parameter: minimum total frequency

    for candidate in (database_file_name_pos, database_file_name_neg):
        if os.path.exists(candidate):
            continue
        print('{} does not exist.'.format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Ids of the positive and negative labelled graphs, in that order
    labelled_ids = [pos_ids, neg_ids]
    task = FrequentPositiveGraphs(total_min_freq, graph_database, labelled_ids, top_K)
    gSpan(task).run()

    for entry in task.patterns:
        confidence = entry[0]
        total_support = entry[1]
        print('{} {} {}'.format(entry[2], confidence, total_support))
def example2(posf=None, negf=None, nf=5):
    """
    Runs train_and_evaluate once per cross-validation fold, with a minimum
    support derived from the data and a fixed top-k of 50.

    When posf or negf is None, the inputs are taken from the command line
    instead (sys.argv[1]: positive file, sys.argv[2]: negative file,
    sys.argv[3]: number of folds) and nf is ignored.

    The minimum support is max(5, half of the positive training share),
    where the training share is len(pos_ids) * (nfolds - 1) / nfolds. Fold
    counts below one are treated as one fold in that formula so that a fold
    count of 0 no longer raises ZeroDivisionError.

    With fewer than two folds the full sets serve as both training and test
    set (not an accurate way to evaluate the performances). Fold sizes use
    integer division, so the trailing len % nfolds graphs of each class are
    never used as test data.

    :param posf: path to the positive class file, or None to use sys.argv
    :param negf: path to the negative class file, or None to use sys.argv
    :param nf: number of folds when posf and negf are given
    :raises SystemExit: when one of the two files does not exist
    """
    if posf is None or negf is None:
        args = sys.argv
        database_file_name_pos = args[1]  # First parameter: path to positive class file
        database_file_name_neg = args[2]  # Second parameter: path to negative class file
        nfolds = int(args[3])  # Third parameter: number of folds
    else:
        database_file_name_pos = posf
        database_file_name_neg = negf
        nfolds = nf

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    # Clamp the divisor: nfolds == 0 is a legal "no cross-validation" request.
    divisor = max(nfolds, 1)
    minsup = max(5, len(pos_ids) * (divisor - 1) / 2 / divisor)
    top_K = 50

    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids,  # Negative test set
        ]
        print('fold {}'.format(1))
        train_and_evaluate(minsup, graph_database, subsets, top_K)
    else:
        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
            neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
            # Fold i is the test set, everything else is the training set.
            subsets = [
                numpy.concatenate((pos_ids[:pos_start], pos_ids[pos_stop:])),  # Positive training set
                pos_ids[pos_start:pos_stop],  # Positive test set
                numpy.concatenate((neg_ids[:neg_start], neg_ids[neg_stop:])),  # Negative training set
                neg_ids[neg_start:neg_stop],  # Negative test set
            ]
            print('fold {}'.format(i + 1))
            train_and_evaluate(minsup, graph_database, subsets, top_K)
def example3():
    """
    Runs Sequential_Covering once per cross-validation fold.

    Inputs come from the command line when all five parameters are supplied
    (positive file, negative file, k, minimum support, number of folds);
    otherwise the bundled small molecules dataset is used with k = 5,
    minsup = 5 and 4 folds. This replaces a hard-coded debug switch that
    made the fallback unreachable.

    With fewer than two folds the full sets serve as both training and test
    set (not an accurate way to evaluate the performances). Fold sizes use
    integer division, so the trailing len % nfolds graphs of each class are
    never used as test data.

    Exits through sys.exit() when an input file is missing.
    """
    args = sys.argv
    if len(args) >= 6:
        database_file_name_pos = args[1]  # First parameter: path to positive class file
        database_file_name_neg = args[2]  # Second parameter: path to negative class file
        k = int(args[3])  # Third parameter: k
        minsup = int(args[4])  # Fourth parameter: minimum support
        nfolds = int(args[5])  # Fifth parameter: number of folds
    else:
        database_file_name_pos = 'data/molecules-small.pos'
        database_file_name_neg = 'data/molecules-small.neg'
        k = 5
        minsup = 5
        nfolds = 4

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()  # Graph database object
    # Reading the graphs adds them to the database and yields their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    if nfolds < 2:
        subsets = [
            pos_ids,  # Positive training set
            pos_ids,  # Positive test set
            neg_ids,  # Negative training set
            neg_ids,  # Negative test set
        ]
        print('fold {}'.format(1))
        # Same routine as the k-fold branch (this branch used to call
        # train_and_evaluate without k).
        Sequential_Covering(k, minsup, graph_database, subsets)
    else:
        pos_fold_size = len(pos_ids) // nfolds
        neg_fold_size = len(neg_ids) // nfolds
        for i in range(nfolds):
            pos_start, pos_stop = i * pos_fold_size, (i + 1) * pos_fold_size
            neg_start, neg_stop = i * neg_fold_size, (i + 1) * neg_fold_size
            # Fold i is the test set, everything else is the training set.
            subsets = [
                numpy.concatenate((pos_ids[:pos_start], pos_ids[pos_stop:])),  # Positive training set
                pos_ids[pos_start:pos_stop],  # Positive test set
                numpy.concatenate((neg_ids[:neg_start], neg_ids[neg_stop:])),  # Negative training set
                neg_ids[neg_start:neg_stop],  # Negative test set
            ]
            print('fold {}'.format(i + 1))
            Sequential_Covering(k, minsup, graph_database, subsets)
def train_evaluate_decision_tree():
    """
    Command-line entry point: parses positive_file, negative_file, top_k,
    min_supp, n_folds and the optional -b/--benchmark flag, then calls
    train_and_evaluate once per cross-validation fold, forwarding the parsed
    arguments object as the last argument.

    With fewer than two folds the complete sets are used both for training
    and for testing (not a sound performance estimate). Otherwise fold i of
    each class is the test set and the remaining graphs of that class form
    the training set. Fold sizes use integer division, so the last
    len % n_folds graphs of a class are never part of a test set.

    Exits through sys.exit() when an input file is missing.
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Find subgraphs")
    positionals = (
        ("positive_file", str),
        ("negative_file", str),
        ("top_k", int),
        ("min_supp", int),
        ("n_folds", int),
    )
    for arg_name, arg_type in positionals:
        parser.add_argument(arg_name, type=arg_type)
    parser.add_argument("-b", "--benchmark", action="store_true")
    args = parser.parse_args()

    for candidate in (args.positive_file, args.negative_file):
        if os.path.exists(candidate):
            continue
        print("{} does not exist.".format(candidate))
        sys.exit()

    graph_database = GraphDatabase()
    # read_graphs registers the graphs in the database and hands back their ids
    pos_ids = graph_database.read_graphs(args.positive_file)
    neg_ids = graph_database.read_graphs(args.negative_file)

    if args.n_folds < 2:
        # Same graphs on both sides: [pos train, pos test, neg train, neg test]
        whole_sets = [pos_ids, pos_ids, neg_ids, neg_ids]
        print("fold {}".format(1))
        train_and_evaluate(args.min_supp, graph_database, whole_sets, args.top_k, args)
        return

    pos_fold_size = len(pos_ids) // args.n_folds
    neg_fold_size = len(neg_ids) // args.n_folds
    for fold in range(args.n_folds):
        pos_lo, pos_hi = fold * pos_fold_size, (fold + 1) * pos_fold_size
        neg_lo, neg_hi = fold * neg_fold_size, (fold + 1) * neg_fold_size
        subsets = [
            numpy.concatenate((pos_ids[:pos_lo], pos_ids[pos_hi:])),  # Positive training set
            pos_ids[pos_lo:pos_hi],  # Positive test set
            numpy.concatenate((neg_ids[:neg_lo], neg_ids[neg_hi:])),  # Negative training set
            neg_ids[neg_lo:neg_hi],  # Negative test set
        ]
        print("fold {}".format(fold + 1))
        train_and_evaluate(args.min_supp, graph_database, subsets, args.top_k, args)