def benchmark(posf, negf, minsup, topk):
    """
    Run gSpan on two graph files located under ``../statement/data/``.

    Both files are loaded into one GraphDatabase, a FrequentPositiveGraphs
    task is created from them and gSpan is run on that task.  Nothing is
    printed or returned on success.

    :param posf: name of the positive graph file, relative to ``../statement/data/``
    :param negf: name of the negative graph file, relative to ``../statement/data/``
    :param minsup: first argument given to FrequentPositiveGraphs
        (presumably the minimum support -- confirm against that class)
    :param topk: last argument given to FrequentPositiveGraphs
        (presumably the number of best patterns to keep -- confirm)
    :raises SystemExit: when one of the two files does not exist
    """
    data_dir = "../statement/data/"
    pos_path = data_dir + posf
    neg_path = data_dir + negf

    # Stop at the first missing input; the positive file is checked first.
    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print(f'{path} does not exist.')
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(pos_path)
    negative_ids = db.read_graphs(neg_path)

    job = FrequentPositiveGraphs(minsup, db, [positive_ids, negative_ids], topk)
    gSpan(job).run()
def task1(database_file_name_pos, database_file_name_neg, k, minsup):
    """
    Mine the two graph files with gSpan and print the patterns kept by the task.

    One line is printed per entry of ``task.patterns``, formatted as
    ``<entry[2]> <entry[0]> <entry[1]>`` (the original code names entry[0]
    the confidence and entry[1] the total support).

    :param database_file_name_pos: path to the positive class file
    :param database_file_name_neg: path to the negative class file
    :param k: last argument given to FrequentPositiveGraphs
    :param minsup: third argument given to FrequentPositiveGraphs
    :raises SystemExit: when one of the two files does not exist
    """
    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print(f'{path} does not exist.')
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(database_file_name_pos)
    negative_ids = db.read_graphs(database_file_name_neg)

    job = FrequentPositiveGraphs(db, [positive_ids, negative_ids], minsup, k)
    gSpan(job).run()

    for entry in job.patterns:
        print(f'{entry[2]} {entry[0]} {entry[1]}')
def top_k():
    """
    Command-line entry point: mine patterns and print the k best score groups.

    Reads from ``sys.argv``:
      1. path to the positive class file
      2. path to the negative class file
      3. k, the number of distinct (confidence, support) pairs to print
      4. minsup, the minimum support

    The mined patterns are ordered by decreasing (confidence, support).  All
    patterns sharing one of the first k distinct pairs are printed as
    ``<code> <confidence> <support>``; when a (k+1)-th pair is met a line
    holding a single space is printed and the listing stops.

    :raises SystemExit: when one of the two files does not exist
    """
    argv = sys.argv
    pos_path = argv[1]
    neg_path = argv[2]
    k = int(argv[3])
    minsup = int(argv[4])

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print(f'{path} does not exist.')
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(pos_path)
    negative_ids = db.read_graphs(neg_path)

    job = FrequentPositiveGraphs(minsup, k, db, [positive_ids, negative_ids])
    gSpan(job).run()

    ranked = sorted(job.patterns,
                    key=attrgetter('confidence', 'support'),
                    reverse=True)

    # Score of the group currently being printed; -1/-1 matches no real pattern.
    group_conf, group_supp = -1, -1
    for found in ranked:
        confidence = found.confidence
        support = found.support
        dfs_code = found.code
        if not (confidence == group_conf and support == group_supp):
            # A new distinct score starts here and uses up one of the k slots.
            group_conf, group_supp = confidence, support
            k -= 1
            if k == -1:
                print(" ")
                break
        print(f'{dfs_code} {confidence} {support}')
def example1():
    """
    Run gSpan with a ConfidencePositiveGraphs task on the small molecule data set.

    The command-line branch is switched off: the function always uses
    ``data/molecules-small.pos`` / ``data/molecules-small.neg`` with k = 5 and
    minsup = 5.  It prints the id subsets, then for every key of
    ``task.patterns`` and every ``(pattern, extra)`` pair stored under it the
    line ``<pattern> <key[0]> <key[1]>`` followed by ``extra`` on its own line.

    :raises SystemExit: when one of the two files does not exist
    """
    # Debug toggle kept from the original code; it is never enabled.
    read_command_line = False
    if read_command_line:
        argv = sys.argv
        pos_path = argv[1]  # first parameter: path to positive class file
        neg_path = argv[2]  # second parameter: path to negative class file
        k = int(argv[3])
        minsup = int(argv[4])
    else:
        pos_path = 'data/molecules-small.pos'
        neg_path = 'data/molecules-small.neg'
        k = 5
        minsup = 5

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print(f'{path} does not exist.')
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(pos_path)
    negative_ids = db.read_graphs(neg_path)
    id_groups = [positive_ids, negative_ids]
    print(id_groups)

    job = ConfidencePositiveGraphs(k, minsup, db, id_groups)
    gSpan(job).run()

    # Keys are indexed as (confidence, support) pairs.
    for score in job.patterns.keys():
        confidence = score[0]
        support = score[1]
        for pattern, extra in job.patterns[score]:
            print(f'{pattern} {confidence} {support}')
            print(extra)
def task1():
    """
    Command-line entry point: mine the k most confident and frequent patterns
    and write them to ``./results/task1.txt``.

    Reads from ``sys.argv``:
      1. path to the positive class file
      2. path to the negative class file
      3. k
      4. minimum frequency

    Each selected pattern is written as ``<pattern>_<confidence>_<frequency>``,
    one per line and without a trailing newline, grouped by decreasing
    confidence level.

    :raises SystemExit: when one of the two files does not exist
    :raises ValueError: when k or the minimum frequency is not an integer
    """
    args = sys.argv
    database_file_name_pos = args[1]  # First parameter: path to positive class file
    database_file_name_neg = args[2]  # Second parameter: path to negative class file
    # k used to be read from args[4] as well, which silently made it equal to
    # the minimum frequency.
    k = int(args[3])  # Third parameter: k
    minFrequency = int(args[4])  # Fourth parameter: minimum frequency

    for path in (database_file_name_pos, database_file_name_neg):
        if not os.path.exists(path):
            print('{} does not exist.'.format(path))
            sys.exit()

    graph_database = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    pos_ids = graph_database.read_graphs(database_file_name_pos)
    neg_ids = graph_database.read_graphs(database_file_name_neg)

    task = K_MostConfidentAndFrequentPositiveSubGraphs(
        minFrequency, graph_database, [pos_ids, neg_ids], k, False)
    gSpan(task).run()

    lines = []
    for confidenceLevel in reversed(task.orderedListOfConfidenceValues):
        for pattern, _gid_subsets, confidence, frequency, _, _, _ in task.patterns:
            if confidence != confidenceLevel:
                continue
            # Above the confidence threshold everything is kept; exactly at the
            # threshold the frequency must reach the first recorded frequency
            # value (only looked up in that case).
            keep = confidence > task.minConfidence or (
                confidence == task.minConfidence
                and frequency >= task.orderedListOfFrequencyValuesForMinConfidence[0])
            if keep:
                lines.append('{}_{}_{}'.format(pattern, confidence, frequency))

    # Do not lose a whole mining run because the output directory is missing.
    os.makedirs('./results', exist_ok=True)
    with open('./results/task1.txt', 'w') as dataset:
        print('\n'.join(lines), end='', file=dataset)
def example1():
    """
    Mine frequent patterns and print those ranked in the top k by confidence.

    NOTE(review): ``database_file_name_pos``, ``database_file_name_neg``,
    ``minsup`` and ``k`` are not defined in this function; they are expected
    to exist as module-level names -- confirm.

    For every mined pattern the confidence ``pos / (pos + neg)`` and the total
    support ``pos + neg`` are computed.  Distinct (confidence, support) pairs
    are ordered by decreasing confidence, and patterns whose pair sits at a
    position below k are printed as ``<pattern> <confidence> <support>``,
    highest confidence first.

    :raises SystemExit: when one of the two files does not exist
    """
    if not os.path.exists(database_file_name_pos):
        print(f'{database_file_name_pos} does not exist.')
        sys.exit()
    if not os.path.exists(database_file_name_neg):
        print(f'{database_file_name_neg} does not exist.')
        sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(database_file_name_pos)
    negative_ids = db.read_graphs(database_file_name_neg)

    job = FrequentPositiveGraphs(minsup, db, [positive_ids, negative_ids])
    gSpan(job).run()

    rows = []    # (pattern, confidence, total support)
    scores = []  # (confidence, total support), aligned with rows
    for pattern, gid_subsets in job.patterns:
        n_pos = len(gid_subsets[0])
        n_neg = len(gid_subsets[1])
        confidence = n_pos / (n_pos + n_neg)
        scores.append((confidence, n_pos + n_neg))
        rows.append((pattern, confidence, n_pos + n_neg))

    # Position of every distinct score once ordered by decreasing confidence.
    distinct = sorted(list(set(scores)), key=lambda pair: pair[0], reverse=True)
    position_of = {pair: position for position, pair in enumerate(distinct)}

    selected = [row for row, score in zip(rows, scores) if position_of[score] < k]
    selected.sort(key=lambda row: row[1], reverse=True)

    for pattern, confidence, support in selected:
        print(f'{pattern} {confidence} {support}')
def train_and_evaluate(minsup, database, subsets, top_k, args=None):
    """
    Mine patterns on one fold, train a decision tree on them and print results.

    ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.

    Printed output: one ``<dfs_code> <confidence> <frequency>`` line per
    pattern, the predicted test labels, optionally the train accuracy, the
    test accuracy and a blank line marking the end of the fold.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: id subsets given to FrequentPositiveGraphs
    :param top_k: last argument given to FrequentPositiveGraphs
    :param args: optional object; when truthy and ``args.benchmark`` is truthy
        the train accuracy is printed as well
    """
    job = FrequentPositiveGraphs(minsup, database, subsets, top_k)
    gSpan(job).run()

    def signed_labels(positives, negatives):
        # +1 for every positive row followed by -1 for every negative row.
        return numpy.concatenate((
            numpy.full(len(positives), 1, dtype=int),
            numpy.full(len(negatives), -1, dtype=int),
        ))

    matrices = job.get_feature_matrices()
    x_train = numpy.concatenate((matrices[0], matrices[2]))
    y_train = signed_labels(matrices[0], matrices[2])
    x_test = numpy.concatenate((matrices[1], matrices[3]))
    y_test = signed_labels(matrices[1], matrices[3])

    model = DecisionTreeClassifier(random_state=1)
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predicted)

    for (confidence, frequency), dfs_code, _ in job.patterns:
        print(f"{dfs_code} {confidence} {frequency}")

    print(predicted.tolist())

    if args and args.benchmark:
        train_accuracy = metrics.accuracy_score(y_train, model.predict(x_train))
        print(f"train accuracy: {train_accuracy}")

    print(f"accuracy: {accuracy}")
    print()  # Blank line to indicate end of fold.
def train_and_evaluate(minsup, database, subsets, top_K, ret=False):
    """
    Mine patterns on one fold and evaluate a decision tree trained on them.

    ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: id subsets given to FrequentPositiveGraphs
    :param top_K: last argument given to FrequentPositiveGraphs
    :param ret: when true nothing is printed and the accuracies are returned
    :return: ``(test accuracy, train accuracy)`` when ``ret`` is true,
        otherwise None after printing the patterns, the predicted test labels,
        the test accuracy and a blank line
    """
    job = FrequentPositiveGraphs(minsup, database, subsets, top_K)
    gSpan(job).run()

    matrices = job.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = numpy.concatenate((pos_train, neg_train))
    y_train = numpy.concatenate((numpy.full(len(pos_train), 1, dtype=int),
                                 numpy.full(len(neg_train), -1, dtype=int)))
    x_test = numpy.concatenate((pos_test, neg_test))
    y_test = numpy.concatenate((numpy.full(len(pos_test), 1, dtype=int),
                                numpy.full(len(neg_test), -1, dtype=int)))

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(x_train, y_train)

    test_prediction = model.predict(x_test)
    test_accuracy = metrics.accuracy_score(y_test, test_prediction)

    if ret:
        train_prediction = model.predict(x_train)
        train_accuracy = metrics.accuracy_score(y_train, train_prediction)
        return test_accuracy, train_accuracy

    # Each entry is indexed as (confidence, total support, pattern, ...).
    for entry in job.patterns:
        print(f'{entry[2]} {entry[0]} {entry[1]}')
    print(test_prediction.tolist())
    print(f'accuracy: {test_accuracy}')
    print()  # Blank line to indicate end of fold.
def subgraph_is_isomorphic(graph, subgraph):
    """
    determines whether main graph contains a subgraph which is isomorphic to input subgraph

    Both graphs are serialised with ``networkx_to_gspan`` into one temporary
    input file, gSpan is run on it with its standard output captured in a
    second temporary file, and every subgraph parsed from that output is
    compared with ``subgraph``.  Nodes are matched on their ``name``
    attribute and edges on their ``weight`` attribute.

    :param graph: main graph
    :param subgraph: a subgraph to be searched in main graph
    :return: boolean
    """
    # Local import so the block does not depend on the file's import section.
    from contextlib import redirect_stdout

    graph_gspan = networkx_to_gspan(graph, 0)
    subgraph_gspan = networkx_to_gspan(subgraph, 1)

    # create temporary files during gspan processing
    input_fd, input_filename = tempfile.mkstemp()
    output_fd, output_filename = tempfile.mkstemp()
    try:
        with os.fdopen(output_fd, 'w', encoding='utf-8') as output_handler:
            with os.fdopen(input_fd, 'w', encoding='utf-8') as input_handler:
                input_handler.write(graph_gspan + subgraph_gspan)

            # redirect_stdout restores sys.stdout even when gSpan raises.
            with redirect_stdout(output_handler):
                subgraph_miner = gSpan(input_filename, 2, where=True)
                subgraph_miner.run()

        # The output file is closed (and therefore flushed) before parsing.
        mined_subgraphs = parse_mined_gspan_file(output_filename)
    finally:
        # remove temporary files, also on failure
        os.remove(input_filename)
        os.remove(output_filename)

    em = iso.numerical_edge_match('weight', 0)
    nm = iso.categorical_node_match('name', None)

    return any(
        iso.GraphMatcher(mined_subgraph, subgraph,
                         node_match=nm, edge_match=em).is_isomorphic()
        for mined_subgraph in mined_subgraphs
    )
def train_and_evaluate(minsup, k, database, subsets):
    """
    Mine patterns on one fold, train a decision tree on them and print results.

    The patterns come from ``get_output(task, k)`` and the feature matrices
    from ``get_feature_matrices(task, patterns)``, indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.

    For every pattern the line ``<pattern> <confidence> <total>`` is printed,
    where ``total`` is ``len(gid_subsets[0]) + len(gid_subsets[2])`` and
    ``confidence`` is the share of ``gid_subsets[0]`` in it (0 when the total
    is 0).  The predicted test labels, the accuracy and a blank line follow.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param k: second argument given to FrequentPositiveGraphs and to get_output
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: id subsets given to FrequentPositiveGraphs
    """
    job = FrequentPositiveGraphs(minsup, k, database, subsets)
    gSpan(job).run()
    selected = get_output(job, k)

    matrices = get_feature_matrices(job, selected)
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = np.concatenate((pos_train, neg_train))
    y_train = np.concatenate((np.full(len(pos_train), 1, dtype=int),
                              np.full(len(neg_train), -1, dtype=int)))
    x_test = np.concatenate((pos_test, neg_test))
    y_test = np.concatenate((np.full(len(pos_test), 1, dtype=int),
                             np.full(len(neg_test), -1, dtype=int)))

    model = DecisionTreeClassifier(random_state=1)
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predicted)

    for pattern, gid_subsets in selected:
        n_pos = len(gid_subsets[0])
        n_neg = len(gid_subsets[2])
        total = n_pos + n_neg
        confidence = n_pos / total if total != 0 else 0
        print(f'{pattern} {confidence} {total}')

    print(predicted.tolist())
    print(f'accuracy: {accuracy}')
    print()  # Blank line to indicate end of fold.
def train_and_evaluate(minFrequency, database, subsets, k, dataset):
    """
    Mine patterns on one fold, train a decision tree and write results to a file.

    ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.

    Written to ``dataset``: the selected patterns as
    ``<pattern>_<confidence>_<frequency>`` lines grouped by decreasing
    confidence level, the predicted test labels, the accuracy and an empty
    line marking the end of the fold.

    :param minFrequency: first argument given to the mining task
    :param database: graph database given to the mining task
    :param subsets: id subsets given to the mining task
    :param k: fourth argument given to the mining task
    :param dataset: writable text file object receiving all output
    """
    job = K_MostConfidentAndFrequentPositiveSubGraphs(
        minFrequency, database, subsets, k, False)
    gSpan(job).run()

    matrices = job.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = numpy.concatenate((pos_train, neg_train))
    y_train = numpy.concatenate((numpy.full(len(pos_train), 1, dtype=int),
                                 numpy.full(len(neg_train), -1, dtype=int)))
    x_test = numpy.concatenate((pos_test, neg_test))
    y_test = numpy.concatenate((numpy.full(len(pos_test), 1, dtype=int),
                                numpy.full(len(neg_test), -1, dtype=int)))

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predicted)

    selected_lines = []
    for level in reversed(job.orderedListOfConfidenceValues):
        for pattern, _gid_subsets, confidence, frequency, _, _, _ in job.patterns:
            if confidence != level:
                continue
            # Above the threshold everything is kept; exactly at the threshold
            # the frequency must reach the first recorded frequency value.
            if confidence > job.minConfidence:
                keep = True
            elif confidence == job.minConfidence:
                keep = frequency >= job.orderedListOfFrequencyValuesForMinConfidence[0]
            else:
                keep = False
            if keep:
                selected_lines.append(f'{pattern}_{confidence}_{frequency}')

    print('\n'.join(selected_lines), file=dataset)
    print(predicted, file=dataset)
    print(f'accuracy: {accuracy}', file=dataset)
    print("", file=dataset)  # Blank line to indicate end of fold.
def train_and_evaluate(k, minsup, database, subsets):
    """
    Mine patterns on one fold, train a decision tree on them and print results.

    ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.

    ``task.patterns`` is read as a mapping whose keys are indexed as
    (confidence, support) and whose values hold ``(pattern, extra)`` pairs.
    One ``<pattern> <confidence> <support>`` line is printed per pair, then
    the predicted test labels, the accuracy and a blank line.

    :param k: first argument given to ConfidencePositiveGraphs2
    :param minsup: second argument given to ConfidencePositiveGraphs2
    :param database: graph database given to ConfidencePositiveGraphs2
    :param subsets: id subsets given to ConfidencePositiveGraphs2
    """
    job = ConfidencePositiveGraphs2(k, minsup, database, subsets)
    gSpan(job).run()

    matrices = job.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = numpy.concatenate((pos_train, neg_train))
    y_train = numpy.concatenate((numpy.full(len(pos_train), 1, dtype=int),
                                 numpy.full(len(neg_train), -1, dtype=int)))
    x_test = numpy.concatenate((pos_test, neg_test))
    y_test = numpy.concatenate((numpy.full(len(pos_test), 1, dtype=int),
                                numpy.full(len(neg_test), -1, dtype=int)))

    model = tree.DecisionTreeClassifier(random_state=1)
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predicted)

    for score in job.patterns.keys():
        confidence = score[0]
        support = score[1]
        for pattern, _extra in job.patterns[score]:
            print(f'{pattern} {confidence} {support}')

    print(predicted)
    print(f'accuracy: {accuracy}')
    print()  # Blank line to indicate end of fold.
def example1():
    """
    Command-line entry point: mine patterns with gSpan and print them.

    Reads from ``sys.argv``:
      1. path to the positive class file
      2. path to the negative class file
      3. top_K, given last to FrequentPositiveGraphs
      4. the minimum frequency, given first to FrequentPositiveGraphs

    One line is printed per entry of ``task.patterns``, formatted as
    ``<entry[2]> <entry[0]> <entry[1]>`` (the original code names entry[0]
    the confidence and entry[1] the total support).

    :raises SystemExit: when one of the two files does not exist
    """
    argv = sys.argv
    pos_path = argv[1]
    neg_path = argv[2]
    top_k_count = int(argv[3])
    min_frequency = int(argv[4])

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print(f'{path} does not exist.')
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(pos_path)
    negative_ids = db.read_graphs(neg_path)

    job = FrequentPositiveGraphs(min_frequency, db,
                                 [positive_ids, negative_ids], top_k_count)
    gSpan(job).run()

    for entry in job.patterns:
        print(f'{entry[2]} {entry[0]} {entry[1]}')
def find_subgraphs():
    """
    Command-line entry point: mine patterns with gSpan and print them.

    Positional command-line arguments, in order: ``positive_file``,
    ``negative_file``, ``top_k`` (int) and ``min_supp`` (int).

    One ``<dfs_code> <confidence> <frequency>`` line is printed per entry of
    ``task.patterns``, each entry being unpacked as
    ``((confidence, frequency), dfs_code)``.

    :raises SystemExit: when the arguments cannot be parsed or when one of
        the two files does not exist
    """
    from argparse import ArgumentParser

    parser = ArgumentParser("Find subgraphs")
    for name, kind in (("positive_file", str), ("negative_file", str),
                       ("top_k", int), ("min_supp", int)):
        parser.add_argument(name, type=kind)
    options = parser.parse_args()

    for path in (options.positive_file, options.negative_file):
        if not os.path.exists(path):
            print(f"{path} does not exist.")
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(options.positive_file)
    negative_ids = db.read_graphs(options.negative_file)

    job = FrequentPositiveGraphs(options.min_supp, db,
                                 [positive_ids, negative_ids], options.top_k)
    gSpan(job).run()

    for (confidence, frequency), dfs_code in job.patterns:
        print(f"{dfs_code} {confidence} {frequency}")
def example1():
    """
    Command-line entry point: mine frequent patterns and print their positive support.

    Reads from ``sys.argv``:
      1. path to the positive class file
      2. path to the negative class file
      3. minsup, the minimum support

    One ``<pattern> <positive support>`` line is printed per entry of
    ``task.patterns``, the positive support being ``len(gid_subsets[0])``.

    :raises SystemExit: when one of the two files does not exist
    """
    argv = sys.argv
    pos_path = argv[1]
    neg_path = argv[2]
    minsup = int(argv[3])

    for path in (pos_path, neg_path):
        if not os.path.exists(path):
            print(f'{path} does not exist.')
            sys.exit()

    db = GraphDatabase()
    # read_graphs adds the graphs of a file to the database and returns their ids.
    positive_ids = db.read_graphs(pos_path)
    negative_ids = db.read_graphs(neg_path)

    job = FrequentPositiveGraphs(minsup, db, [positive_ids, negative_ids])
    gSpan(job).run()

    for pattern, gid_subsets in job.patterns:
        print(f'{pattern} {len(gid_subsets[0])}')
def tae(minsup, database, subsets, top_K, cl):
    """
    Mine patterns on one fold and score several classifiers trained on them.

    ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.  Every classifier of ``cl`` is fitted on
    the training data, in order.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: id subsets given to FrequentPositiveGraphs
    :param top_K: last argument given to FrequentPositiveGraphs
    :param cl: iterable of classifier objects offering ``fit`` and ``predict``
    :return: ``(test accuracies, train accuracies)``, two lists aligned with ``cl``
    """
    job = FrequentPositiveGraphs(minsup, database, subsets, top_K)
    gSpan(job).run()

    matrices = job.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = numpy.concatenate((pos_train, neg_train))
    y_train = numpy.concatenate((numpy.full(len(pos_train), 1, dtype=int),
                                 numpy.full(len(neg_train), -1, dtype=int)))
    x_test = numpy.concatenate((pos_test, neg_test))
    y_test = numpy.concatenate((numpy.full(len(pos_test), 1, dtype=int),
                                numpy.full(len(neg_test), -1, dtype=int)))

    test_scores = []
    train_scores = []
    for model in cl:
        model.fit(x_train, y_train)
        test_scores.append(metrics.accuracy_score(y_test, model.predict(x_test)))
        train_scores.append(metrics.accuracy_score(y_train, model.predict(x_train)))
    return test_scores, train_scores
def train_and_evaluate(minsup, database, subsets):
    """
    Mine frequent patterns on one fold, train Gaussian naive Bayes and print results.

    ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.

    Printed output: one ``<pattern> <positive support>`` line per pattern
    (the support being ``len(gid_subsets[0])``), the predicted test labels,
    the accuracy and a blank line marking the end of the fold.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: id subsets given to FrequentPositiveGraphs
    """
    job = FrequentPositiveGraphs(minsup, database, subsets)
    gSpan(job).run()

    matrices = job.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = numpy.concatenate((pos_train, neg_train))
    y_train = numpy.concatenate((numpy.full(len(pos_train), 1, dtype=int),
                                 numpy.full(len(neg_train), -1, dtype=int)))
    x_test = numpy.concatenate((pos_test, neg_test))
    y_test = numpy.concatenate((numpy.full(len(pos_test), 1, dtype=int),
                                numpy.full(len(neg_test), -1, dtype=int)))

    model = naive_bayes.GaussianNB()
    model.fit(x_train, y_train)
    predicted = model.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, predicted)

    for pattern, gid_subsets in job.patterns:
        print(f'{pattern} {len(gid_subsets[0])}')

    print(predicted)
    print(f'accuracy: {accuracy}')
    print()  # Blank line to indicate end of fold.
def train_and_evaluate(minsup, database, subsets, k):
    """
    Mine patterns on one fold and return the accuracy of a k-NN classifier.

    After gSpan has run, ``task.sortPatters()`` is called before the feature
    matrices are requested.  ``task.get_feature_matrices()`` is indexed as
    [positive train, positive test, negative train, negative test]; positives
    are labelled 1 and negatives -1.  Nothing is printed.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: id subsets given to FrequentPositiveGraphs
    :param k: last argument given to FrequentPositiveGraphs
    :return: accuracy of the classifier on the testing data
    """
    job = FrequentPositiveGraphs(minsup, database, subsets, k)
    gSpan(job).run()
    job.sortPatters()

    matrices = job.get_feature_matrices()
    pos_train, pos_test = matrices[0], matrices[1]
    neg_train, neg_test = matrices[2], matrices[3]

    x_train = numpy.concatenate((pos_train, neg_train))
    y_train = numpy.concatenate((numpy.full(len(pos_train), 1, dtype=int),
                                 numpy.full(len(neg_train), -1, dtype=int)))
    x_test = numpy.concatenate((pos_test, neg_test))
    y_test = numpy.concatenate((numpy.full(len(pos_test), 1, dtype=int),
                                numpy.full(len(neg_test), -1, dtype=int)))

    # KNeighborsClassifier is used with its default settings.
    model = KNeighborsClassifier()
    model.fit(x_train, y_train)
    return metrics.accuracy_score(y_test, model.predict(x_test))
def run_gspan(graph_file, output_file, min_support=10, min_num_vertices=1, where=True, **kwargs):
    """Run gSpan algorithm from https://github.com/betterenvi/gSpan

    Everything gSpan prints to standard output while it is built and run is
    captured and written to ``output_file`` afterwards.

    Args:
      graph_file: graph file formatted as follows
        ```
        t # 0
        v 0 Oct4-Sox2/match=medium/imp=high
        v 1 Oct4-Sox2/match=high/imp=high
        v 2 Oct4-Sox2/match=high/imp=high
        v 3 Oct4-Sox2-deg/match=medium/imp=high
        e 0 1 10-50
        e 1 0 10-50
        t # 1
        v 0 Oct4-Sox2/match=medium/imp=high
        v 1 Nanog/match=medium/imp=high
        ...
        t # -1
        ```
      output_file: output file path
      min_support: minimal required support in order to display the output count
      min_num_vertices: minimal number of vertices in the graph
      where: forwarded to gSpan as its ``where`` argument
      **kwargs: any further keyword arguments forwarded to gSpan
    """
    import io
    from contextlib import redirect_stdout
    from gspan_mining import gSpan

    captured = io.StringIO()
    with redirect_stdout(captured):
        # Construction happens inside the redirection as well.
        miner = gSpan(graph_file,
                      min_support=min_support,
                      min_num_vertices=min_num_vertices,
                      where=where,
                      **kwargs)
        miner.run()

    report = captured.getvalue()
    with open(output_file, 'w') as sink:
        sink.write(report)
def tae(minsup, database, subsets, k):
    """
    Sequential covering on one fold; returns the test and train accuracies.

    Up to ``k`` rounds are run.  In each round a FrequentPositiveGraphs task
    (top-1) is mined on the examples not covered yet, the pattern with the
    smallest ``[pattern[2], pattern[0], pattern[1], pattern[3]]`` is chosen,
    every example it covers receives that pattern's label (``pattern[4]``,
    truthy meaning positive) and is removed.  Examples left over at the end
    are labelled positive unless fewer positive than negative training
    examples remain.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: [positive train, positive test, negative train,
        negative test] graph ids; lists or objects offering ``tolist()``
    :param k: maximum number of rules to learn
    :return: ``(test accuracy, train accuracy)``
    :raises ZeroDivisionError: when the test or the train part is empty
    """
    def accuracy(predictions, positives, negatives):
        # Share of (gid, predicted_positive) pairs agreeing with the true class.
        correct = 0
        for gid, predicted_positive in predictions:
            if gid in positives and predicted_positive:
                correct += 1
            if gid in negatives and not predicted_positive:
                correct += 1
        return correct / len(predictions)

    # Snapshot the true classes before any subset gets filtered.
    train_pos, test_pos = set(subsets[0]), set(subsets[1])
    train_neg, test_neg = set(subsets[2]), set(subsets[3])

    remaining = [subset if isinstance(subset, list) else subset.tolist()
                 for subset in subsets]

    test_predictions = []
    train_predictions = []
    for _ in range(k):
        task = FrequentPositiveGraphs(minsup, database, remaining, 1)
        gSpan(task).run()
        candidates = list(task.patterns)
        if not candidates:
            # Mining the same examples again cannot produce anything new.
            break

        best = min(candidates, key=lambda p: [p[2], p[0], p[1], p[3]])
        covered = best[3]
        # The label must come from the chosen pattern; it used to be read
        # from whichever pattern happened to be last in task.patterns.
        label = best[4]

        test_predictions.extend([gid, label] for gid in covered[1] + covered[3])
        train_predictions.extend([gid, label] for gid in covered[0] + covered[2])

        remaining = [
            [gid for gid in pool if gid not in removed]
            for removed, pool in zip(map(set, covered), remaining)
        ]

    # Default class for everything no rule covered.
    default_positive = len(remaining[0]) >= len(remaining[2])
    test_predictions.extend([gid, default_positive]
                            for gid in remaining[1] + remaining[3])
    train_predictions.extend([gid, default_positive]
                             for gid in remaining[0] + remaining[2])

    testaccuracy = accuracy(test_predictions, test_pos, test_neg)
    trainaccuracy = accuracy(train_predictions, train_pos, train_neg)
    return testaccuracy, trainaccuracy
def Sequential_Covering(k, minsup, database, subsets):
    """
    Sequential covering on one fold; prints the rules, predictions and accuracy.

    ``k`` rounds are run.  In each round a ConfidencePositiveGraphs3 task
    (top-1) is mined on the examples not covered yet.  For every key of
    ``task.patterns`` the entry with the smallest first element is chosen;
    the test examples it covers receive its label (``entry[2]``) and the
    covered examples are removed with ``RemoveX1FromX2``.  Test examples left
    over receive ``'pos'`` unless fewer positive than negative training
    examples remain, in which case they receive ``'neg'``.

    Printed output: one ``<pattern> <key[0]> <key[1]>`` line per rule, the
    predictions (1 for ``'pos'``, -1 otherwise) ordered by example id, the
    accuracy and a blank line marking the end of the fold.

    :param k: number of covering rounds
    :param minsup: second argument given to ConfidencePositiveGraphs3
    :param database: graph database given to ConfidencePositiveGraphs3
    :param subsets: [positive train, positive test, negative train,
        negative test] graph ids; lists or objects offering ``tolist()``
    """
    # True classes of the test examples, kept apart from later filtering.
    true_test = copy.deepcopy([subsets[1], subsets[3]])

    remaining = [
        subset.tolist() if type(subset) != type([]) else subset
        for subset in subsets
    ]

    rules = {}        # pattern -> (label, key of task.patterns)
    predictions = {}  # test example id -> 'pos' / 'neg'
    for _ in range(k):
        task = ConfidencePositiveGraphs3(1, minsup, database, remaining)
        gSpan(task).run()
        mined = task.patterns
        for score in mined.keys():
            # First entry with the smallest leading element.
            chosen = min(mined[score], key=lambda entry: entry[0])
            rules[chosen[0]] = (chosen[2], score)
            covered = chosen[1]
            for gid in covered[1] + covered[3]:
                predictions[gid] = chosen[2]
            remaining = RemoveX1FromX2(covered, remaining)

    # Default class for the test examples no rule covered.
    uncovered_test = remaining[1] + remaining[3]
    default = 'pos' if len(remaining[0]) >= len(remaining[2]) else 'neg'
    for gid in uncovered_test:
        predictions[gid] = default

    # print patterns
    for pattern, (_label, score) in rules.items():
        print(f'{pattern} {score[0]} {score[1]}')

    # print prediction
    print([1 if predictions[gid] == 'pos' else -1 for gid in sorted(predictions)])

    # print accuracy
    hits = 0
    for gid, label in predictions.items():
        if gid in true_test[0]:
            if label == 'pos':
                hits += 1
        if gid in true_test[1]:
            if label == 'neg':
                hits += 1
    accuracy = hits / len(predictions)
    print(f'accuracy: {accuracy}')
    print()  # Blank line to indicate end of fold.
def train_and_evaluate(minsup, database, subsets, top_k, args=None):
    """
    Sequential covering on one fold; prints the rules, predictions and accuracy.

    Up to ``top_k`` rounds are run.  In each round a FrequentPositiveGraphs
    task (top-1) is mined on the examples not covered yet; when it holds
    patterns they are sorted in place by (dfs code, confidence, frequency,
    gid subsets) and the first one becomes a rule.  The examples it covers
    receive its label and are removed with ``remove``.  Leftover examples
    receive 1 unless fewer positive than negative training examples remain,
    in which case they receive -1.

    Printed output: one ``<dfs_code> <confidence> <frequency>`` line per
    rule, the predicted test labels ordered by example id, optionally the
    train accuracy, the test accuracy and a blank line.

    :param minsup: first argument given to FrequentPositiveGraphs
    :param database: graph database given to FrequentPositiveGraphs
    :param subsets: [positive train, positive test, negative train,
        negative test] graph ids; lists or objects offering ``tolist()``
    :param top_k: maximum number of rules to learn
    :param args: optional object; when truthy and ``args.benchmark`` is truthy
        the train accuracy is printed as well
    """
    def labelled(pos_index, neg_index):
        # Sorted (id, true label) pairs of one positive and one negative subset.
        pairs = [(gid, 1) for gid in subsets[pos_index]]
        pairs += [(gid, -1) for gid in subsets[neg_index]]
        pairs.sort()
        return pairs

    y_test = labelled(1, 3)
    y_train = labelled(0, 2)

    remaining = [
        subset.copy() if type(subset) == list else subset.tolist()
        for subset in subsets
    ]

    rules = []
    y_test_predicted = []
    y_train_predicted = []
    for _ in range(top_k):
        task = FrequentPositiveGraphs(minsup, database, remaining, 1)
        gSpan(task).run()
        if not task.patterns:
            continue
        task.patterns.sort(key=lambda x: (x[1], *x[0], x[2]))
        best = task.patterns[0]
        _score, _code, covered, label = best
        rules.append(best)
        # insort keeps the predictions ordered by id, like y_test / y_train.
        for gid in covered[1] + covered[3]:
            insort(y_test_predicted, (gid, label))
        for gid in covered[0] + covered[2]:
            insort(y_train_predicted, (gid, label))
        remaining = remove(covered, remaining)

    default_label = -1 if len(remaining[0]) < len(remaining[2]) else 1
    for gid in remaining[1] + remaining[3]:
        insort(y_test_predicted, (gid, default_label))
    for gid in remaining[0] + remaining[2]:
        insort(y_train_predicted, (gid, default_label))

    for (confidence, frequency), dfs_code, _, _ in rules:
        print(f"{dfs_code} {confidence} {frequency}")

    print([label for _, label in y_test_predicted])

    test_hits = sum(truth == guess for truth, guess in zip(y_test, y_test_predicted))
    accuracy = test_hits / len(y_test_predicted)

    if args and args.benchmark:
        train_hits = sum(truth == guess
                         for truth, guess in zip(y_train, y_train_predicted))
        train_accuracy = train_hits / len(y_train_predicted)
        print(f"train accuracy: {train_accuracy}")

    print(f"accuracy: {accuracy}")
    print()
def train_and_evaluate_task4(minFrequency, database, subsets, k, dataset):
    """
    Learns up to k rules by sequential covering, classifies the test ids with
    them and writes the rules, the predictions and the accuracy to dataset.

    Each round mines a K_MostConfidentAndFrequentPositiveSubGraphs task with
    gSpan, keeps the smallest pattern (by "<") as a rule and drops the ids
    that rule covers before the next round. A test id is labelled by the first
    rule whose covered ids contain it, otherwise by the default class.

    N.B. rule format:
    (dfs_code, gid_subsets, confidence, frequency, p_test, n_test, isPositivePattern)

    :param minFrequency: minimum frequency forwarded to the mining task.
    :param database: graph database forwarded to the mining task.
    :param subsets: four id collections used as [train positive, test
        positive, train negative, test negative].
    :param k: maximum number of rules to learn.
    :param dataset: writable text stream receiving all output (passed as
        file= to print).
    """
    rules = []
    # remaining[i] holds the ids of subsets[i] not yet covered by any rule.
    # Tracking it from the start keeps the default class well defined even
    # when no round completes (k == 0, or nothing is mined in round one).
    remaining = [subsets[0], subsets[1], subsets[2], subsets[3]]
    task = K_MostConfidentAndFrequentPositiveSubGraphs(minFrequency, database, subsets, 5, True, True)

    for _ in range(k):
        gSpan(task).run()  # Running gSpan
        if len(task.patterns) == 0:
            break
        patterns = sortList(task.patterns)
        if len(patterns) == 0:
            break

        # min() keeps the first smallest element, exactly like the manual
        # "replace when strictly smaller" scan it stands in for.
        pattern = min(patterns)
        rules.append(pattern)

        covered = pattern[1]  # == gid_subsets
        remaining = [
            [int(transaction) for transaction in ids
             if int(transaction) not in covered[index]]
            for index, ids in enumerate(remaining)
        ]

        # Stop once every training id is covered.
        if len(remaining[0]) == 0 and len(remaining[2]) == 0:
            break
        task = K_MostConfidentAndFrequentPositiveSubGraphs(minFrequency, database, remaining, 5, True, True)

    # default class is positive if there are at least as many remaining
    # positive training examples as negative ones (including none of either)
    isDefaultPositive = len(remaining[0]) >= len(remaining[2])

    def isPredictedPositive(transaction, subsetIndex):
        # The first rule covering the transaction decides; else the default.
        for rule in rules:
            if transaction in rule[1][subsetIndex]:
                return bool(rule[6])
        return isDefaultPositive

    # classification: positives first, then negatives, in input order
    predicted = []
    correctPredictions = 0
    testPositive = subsets[1]
    testNegative = subsets[3]
    for transaction in testPositive:
        if isPredictedPositive(transaction, 1):
            predicted.append(1)
            correctPredictions += 1
        else:
            predicted.append(-1)
    for transaction in testNegative:
        if isPredictedPositive(transaction, 3):
            predicted.append(1)
        else:
            predicted.append(-1)
            correctPredictions += 1

    # With an empty test set there is nothing to divide by; report 0.0.
    totalTests = len(testPositive) + len(testNegative)
    accuracy = correctPredictions / totalTests if totalTests else 0.0

    # Printing the learned rules, one "dfs_code_confidence_frequency" per line:
    result = '\n'.join(
        '{}_{}_{}'.format(rule[0], rule[2], rule[3]) for rule in rules
    )
    print(result, file= dataset)

    # printing classification results:
    print(predicted, file= dataset)
    print('accuracy: {}'.format(accuracy), file= dataset)
    print("", file= dataset)  # Blank line to indicate end of fold.
def train_and_evaluate(minsup, database, subsets, k):
    """
    Learns up to k rules by sequential covering and prints the rules, the
    test predictions and the test accuracy.

    Each round mines a FrequentPositiveGraphs task with gSpan, keeps one
    pattern as a rule, labels the test ids it covers with that pattern's own
    label and removes every covered id before the next round. Test ids never
    covered receive a default label.

    Pattern layout read here: pattern[0] and pattern[1] are printed after the
    code (confidence and frequency, judging by sibling code -- confirm),
    pattern[2] is the code used for tie-breaking, pattern[3] the four covered
    id lists, pattern[4] a truthy/falsy "is positive" label.

    :param minsup: minimum support forwarded to the mining task.
    :param database: graph database forwarded to the mining task.
    :param subsets: four id collections (lists, or objects exposing
        .tolist()). Indices 1 and 3 are the positive / negative test ids;
        indices 0 and 2 decide the default label.
    :param k: number of mining rounds to run.
    """
    # True test labels, snapshotted before any covering takes place.
    pos_ids = copy.deepcopy(subsets[1])
    neg_ids = copy.deepcopy(subsets[3])

    list_subsets = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    result = []  # (code, pattern[0], pattern[1]) per learned rule
    temp_conf = []  # sorted [test id, is_positive] predictions

    for _ in range(k):
        task = FrequentPositiveGraphs(minsup, database, list_subsets, 1)
        gSpan(task).run()
        if len(task.patterns) > 0:
            # Same choice as sorting [p[2], p[0], p[1], p[3]] and taking [0].
            best = min(
                task.patterns, key=lambda p: (p[2], p[0], p[1], p[3])
            )
            result.append((best[2], best[0], best[1]))
            subsets_list = best[3]
            # The label must come from the chosen pattern itself, not from
            # whichever pattern happened to be mined last.
            is_positive = best[4]
            test_list = subsets_list[1] + subsets_list[3]
            list_subsets = [
                [x for x in b if x not in a]
                for a, b in zip(subsets_list, list_subsets)
            ]
            for item in test_list:
                insort(temp_conf, [item, is_positive])

    # Default label: positive unless negatives outnumber positives among the
    # ids left at indices 0 and 2.
    pos_conf = len(list_subsets[0]) >= len(list_subsets[2])
    for item in list_subsets[1] + list_subsets[3]:
        insort(temp_conf, [item, pos_conf])

    for pattern in result:
        print('{} {} {}'.format(pattern[0], pattern[1], pattern[2]))

    pred_result = [1 if pred[1] else -1 for pred in temp_conf]
    print(pred_result)

    counter = 0
    for gid, predicted_positive in temp_conf:
        if gid in pos_ids and predicted_positive:
            counter += 1
        if gid in neg_ids and not predicted_positive:
            counter += 1
    # With no test predictions there is nothing to divide by; report 0.0.
    accuracy = counter / len(temp_conf) if temp_conf else 0.0
    print('accuracy: {}'.format(accuracy))
    print()
def train_and_evaluate(minsup, database, subsets, top_K):
    """
    Learns up to top_K rules by sequential covering and prints the rules, the
    test predictions and the test accuracy.

    Each round mines a FrequentPositiveGraphs task with gSpan, keeps one
    pattern as a rule, labels the test ids it covers with that pattern's own
    label and removes every covered id (via remove) before the next round.
    Test ids never covered receive a default label.

    Pattern layout read here: pattern[0] and pattern[1] are printed after the
    code (confidence and frequency, judging by sibling code -- confirm),
    pattern[2] is the code used for tie-breaking, pattern[3] the four covered
    id lists, pattern[4] a truthy/falsy "is positive" label.

    :param minsup: minimum support forwarded to the mining task.
    :param database: graph database forwarded to the mining task.
    :param subsets: four id collections (lists, or objects exposing
        .tolist()). Indices 1 and 3 are the positive / negative test ids;
        indices 0 and 2 decide the default label.
    :param top_K: number of mining rounds to run.
    """
    # True test labels, snapshotted before any covering takes place.
    pos_ids = copy.deepcopy(subsets[1])
    neg_ids = copy.deepcopy(subsets[3])

    new_subsets = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    result = []  # (code, pattern[0], pattern[1]) per learned rule
    test_is_pos = []  # sorted [test id, is_positive] predictions

    for _ in range(top_K):
        task = FrequentPositiveGraphs(minsup, database, new_subsets, 1)
        gSpan(task).run()
        if len(task.patterns) > 0:
            # Same choice as sorting [p[2], p[0], p[1], p[3]] and taking [0].
            best = min(
                task.patterns, key=lambda p: (p[2], p[0], p[1], p[3])
            )
            result.append((best[2], best[0], best[1]))
            subsets_list = best[3]
            # The label must come from the chosen pattern itself, not from
            # whichever pattern happened to be mined last.
            rule_is_pos = best[4]
            for item in subsets_list[1] + subsets_list[3]:
                insort(test_is_pos, [item, rule_is_pos])
            new_subsets = remove(subsets_list, new_subsets)

    # Default label: positive when the ids left at index 0 are at least as
    # many as those left at index 2.
    default_is_pos = len(new_subsets[0]) >= len(new_subsets[2])
    for item in new_subsets[1] + new_subsets[3]:
        insort(test_is_pos, [item, default_is_pos])

    for pattern in result:
        print('{} {} {}'.format(pattern[0], pattern[1], pattern[2]))

    pred_result = [1 if pred[1] else -1 for pred in test_is_pos]
    print(pred_result)

    counter = 0
    for gid, predicted_positive in test_is_pos:
        if gid in pos_ids and predicted_positive:
            counter += 1
        if gid in neg_ids and not predicted_positive:
            counter += 1
    # With no test predictions there is nothing to divide by; report 0.0.
    accuracy = counter / len(test_is_pos) if test_is_pos else 0.0
    print('accuracy: {}'.format(accuracy))
    print()
def tae(minsup, database, subsets, top_K):
    """
    Learns up to top_K rules by sequential covering and returns the test and
    train accuracy of the resulting classifier (nothing is printed).

    Each round mines a FrequentPositiveGraphs task with gSpan, keeps one
    pattern as a rule, labels every id it covers with that pattern's own label
    and removes the covered ids (via remove) before the next round. Ids never
    covered receive a default label.

    Pattern layout read here: pattern[2] is the code used for tie-breaking
    (then pattern[0], pattern[1]), pattern[3] the four covered id lists,
    pattern[4] a truthy/falsy "is positive" label.

    :param minsup: minimum support forwarded to the mining task.
    :param database: graph database forwarded to the mining task.
    :param subsets: four id collections (lists, or objects exposing
        .tolist()), used as [train positive, test positive, train negative,
        test negative].
    :param top_K: number of mining rounds to run.
    :return: (testaccuracy, trainaccuracy); an accuracy is 0.0 when there is
        no prediction to score.
    """
    # True labels, snapshotted before any covering takes place.
    pos_ids = copy.deepcopy(subsets[1])
    neg_ids = copy.deepcopy(subsets[3])
    pos_ids2 = copy.deepcopy(subsets[0])
    neg_ids2 = copy.deepcopy(subsets[2])

    new_subsets = [
        subset if isinstance(subset, list) else subset.tolist()
        for subset in subsets
    ]

    test_is_pos = []  # sorted [test id, is_positive] predictions
    train_is_pos = []  # sorted [train id, is_positive] predictions

    for _ in range(top_K):
        task = FrequentPositiveGraphs(minsup, database, new_subsets, 1)
        gSpan(task).run()
        if len(task.patterns) > 0:
            # Same choice as sorting [p[2], p[0], p[1], p[3]] and taking [0].
            best = min(
                task.patterns, key=lambda p: (p[2], p[0], p[1], p[3])
            )
            subsets_list = best[3]
            # The label must come from the chosen pattern itself, not from
            # whichever pattern happened to be mined last.
            rule_is_pos = best[4]
            for item in subsets_list[1] + subsets_list[3]:
                insort(test_is_pos, [item, rule_is_pos])
            for item in subsets_list[0] + subsets_list[2]:
                insort(train_is_pos, [item, rule_is_pos])
            new_subsets = remove(subsets_list, new_subsets)

    # Default label: positive when the remaining training positives are at
    # least as many as the remaining training negatives.
    default_is_pos = len(new_subsets[0]) >= len(new_subsets[2])
    for item in new_subsets[1] + new_subsets[3]:
        insort(test_is_pos, [item, default_is_pos])
    for item in new_subsets[0] + new_subsets[2]:
        insort(train_is_pos, [item, default_is_pos])

    def accuracy_of(predictions, positives, negatives):
        # Fraction of predictions agreeing with the true labels.
        if not predictions:
            return 0.0
        counter = 0
        for gid, predicted_positive in predictions:
            if gid in positives and predicted_positive:
                counter += 1
            if gid in negatives and not predicted_positive:
                counter += 1
        return counter / len(predictions)

    testaccuracy = accuracy_of(test_is_pos, pos_ids, neg_ids)
    trainaccuracy = accuracy_of(train_is_pos, pos_ids2, neg_ids2)
    return testaccuracy, trainaccuracy