def _parse_selected_seq_id(line):
    """Return the integer that starts at column 19 of an IDP output line.

    The column offset is taken over from the previous single-character
    lookup (``int(line[19])``). The digits that follow that column are now
    consumed as well, so ids of 10 and above are no longer truncated to
    their first digit.

    NOTE(review): the IDP output layout is not visible in this file; confirm
    that the id always starts at column 19.

    Raises:
        ValueError: if no run of digits starts at that column.
    """
    start = 19
    end = start
    while end < len(line) and line[end].isdigit():
        end += 1
    return int(line[start:end])


def sequence_idp(params, patterns):
    """Return the ids of ``patterns`` minus those IDP reports as selected.

    The patterns are grouped with ``make_grouping_by_support``. Groups that
    hold exactly one pattern are printed and skipped. For every other group,
    each pattern with a non-empty result from ``get_attribute_intersection``
    is collected into one mapping, an IDP program is generated for that
    mapping and run, and every output line containing ``"selected_seq"``
    contributes one id to the set of non-closed ids.

    Args:
        params: mapping read for the keys ``"dominance"``, ``"type"`` and
            ``"data"``; also passed to ``IDPGenerator``.
        patterns: re-iterable collection of objects exposing ``.id``.

    Returns:
        set: ids of all patterns except the ones reported by IDP.

    Raises:
        ValueError: if ``params["dominance"]`` is not ``"closed"`` (no support
            grouping exists for other values, which previously surfaced as
            an ``AttributeError`` on ``None``), or if a ``selected_seq`` line
            carries no integer at the expected column.
    """
    # Fail fast with a clear message instead of crashing later on
    # ``None.items()`` after the generator has already been created.
    if params["dominance"] != "closed":
        raise ValueError(
            "sequence_idp only supports params['dominance'] == 'closed', "
            "got %r" % (params["dominance"],)
        )

    for p in patterns:
        print(p)

    indices = set(seq.id for seq in patterns)
    nonclosed_indices = set()

    # closed pattern mining by generated IDP code
    idp_gen = IDPGenerator(params)
    filename = os.path.basename(params["data"])
    idp_program_name = "{0}_{1}_{2}".format(
        params["dominance"], params["type"], filename.split(".")[0]
    )

    support_mapping = make_grouping_by_support(patterns)
    attribute_mapping = make_attribute_mapping(patterns)

    for group in support_mapping.values():
        if len(group) == 1:
            # Nothing to compare a lone pattern against.
            print(group)
            continue

        check_mapping = defaultdict(set)
        for seq in group:
            patterns_to_check = get_attribute_intersection(
                seq, attribute_mapping, support_mapping
            )
            if patterns_to_check:
                check_mapping[seq] = patterns_to_check

        if not check_mapping:
            continue

        idp_gen.gen_IDP_code_group(check_mapping, idp_program_name)
        idp_output = idp_gen.run_IDP(idp_program_name)
        for line in idp_output.split("\n"):
            if "selected_seq" in line:
                nonclosed_indices.add(_parse_selected_seq_id(line))

    indices = indices - nonclosed_indices
    print(indices)
    return indices
def subsumption_lattice_check_sequence(self, patterns,params): print('\n Starting dominance check for sequences...') if params['dominance'] == "maximal": pattern_to_parent, pattern_to_set_of_children = self.extract_parental_tree_itemset(patterns) skip_set = self.prune_initial_tree(patterns, pattern_to_parent, pattern_to_set_of_children, params) else: skip_set = set() set_of_patterns = set(patterns) - skip_set is_free = params['dominance'] == "free" if params['dominance'] == "closed" or is_free: support_mapping = make_grouping_by_support(set_of_patterns) print("initial skip set len", len(skip_set)) skip_set = set() #init again ordered_sequences = sorted(set_of_patterns, key=lambda x: x.get_pattern_len(),reverse=True) all_candidate_sizes = [] for seq in tqdm(ordered_sequences): # maximal are not subsumed by anything if seq in skip_set: continue if params['dominance'] == "closed" or is_free: candidates = support_mapping[seq.get_support()] - skip_set elif params['dominance'] == "maximal": candidates = set_of_patterns - skip_set candidates = get_smaller_patterns(seq.get_pattern_len(), candidates) candidates = get_attribute_subset(seq, candidates) all_candidate_sizes.append(len(candidates)) for candidate in candidates: if candidate.is_subsequence_of(seq): if (params['dominance'] == "maximal" or params['dominance'] == 'closed'): skip_set.add(candidate) if params['dominance'] == 'free': skip_set.add(seq) break print('dominance check done') if len(all_candidate_sizes) != 0: print 'AVG candidate size:', float(sum(all_candidate_sizes))/float(len(all_candidate_sizes)) return set_of_patterns - set(skip_set)
def itemset_idp_new(params, patterns):
    """Set up the IDP inputs for itemsets; currently always returns ``[]``.

    The function builds an ``IDPGenerator``, derives the program name from
    ``params["dominance"]``, ``params["type"]`` and the data file's base
    name, and computes the support grouping (only for ``"closed"``) and the
    attribute mapping. None of these values is used yet.

    Returns:
        list: a new, empty list.
    """
    # closed pattern mining by generated IDP code
    idp_gen = IDPGenerator(params)
    data_dir, data_file = os.path.split(params["data"])
    idp_program_name = "{0}_{1}_{2}".format(
        params["dominance"],
        params["type"],
        data_file.split(".")[0],
    )
    support_mapping = (
        make_grouping_by_support(patterns)
        if params["dominance"] == "closed"
        else None
    )
    mapping = make_attribute_mapping(patterns)
    return []
def subsumption_lattice_check_graph(self, patterns, params):
    """Drop the graph patterns that lose the dominance check.

    An initial skip set is obtained from ``create_initial_parent_tree`` and
    ``initialize_skip_set_with_parent_info`` and removed from the working
    set. The remaining graphs are sorted with ``pareto_front_pair`` applied
    to their ``get_pattern_len()`` values (reversed) and visited in that
    order. For each graph the candidates are:

    * ``"closed"`` / ``"free"``: the patterns sharing the graph's support;
    * ``"maximal"``: all of ``patterns``;

    each minus the patterns already discarded, then narrowed to those for
    which ``pareto_front_pair(candidate_len, graph_len) < 0`` and filtered
    by ``get_attribute_subset`` and ``get_combined_subset``. A candidate for
    which ``is_subgraph_of(graph)`` holds is discarded (closed/maximal), or
    causes ``graph`` itself to be discarded (free).

    Args:
        patterns: re-iterable collection of hashable graph patterns.
        params: mapping read for the key ``'dominance'``; also forwarded to
            ``initialize_skip_set_with_parent_info``.

    Returns:
        set: working-set patterns that were not discarded.
    """
    print('\n Starting dominance check for graphs...\n')
    initial_subsumption_tree, initial_subsumed_by_tree = \
        self.create_initial_parent_tree(patterns)
    skip_set = self.initialize_skip_set_with_parent_info(
        patterns, initial_subsumption_tree, initial_subsumed_by_tree, params)
    print("initial skip set", len(skip_set))

    dominance = params['dominance']
    is_free = dominance == "free"
    is_maximal = dominance == "maximal"
    uses_support_groups = dominance == "closed" or is_free

    # Built once; previously rebuilt on every iteration of the maximal case.
    all_patterns = set(patterns)
    set_of_patterns = all_patterns - skip_set
    # Python 2 cmp-style sort, kept as in the rest of this file.
    sorted_graphs = sorted(
        set_of_patterns,
        cmp=lambda x, y: self.pareto_front_pair(
            x.get_pattern_len(), y.get_pattern_len()),
        reverse=True)
    if uses_support_groups:
        support_mapping = make_grouping_by_support(set_of_patterns)

    # The initial skip set is already excluded from the working set; start
    # over for the graphs discarded by the pairwise check.
    skip_set = set()
    all_candidate_sizes = []
    for graph in tqdm(sorted_graphs):
        if graph in skip_set:
            continue

        if uses_support_groups:
            candidates = support_mapping[graph.get_support()] - skip_set
        if is_maximal:
            candidates = all_patterns - skip_set

        graph_len = graph.get_pattern_len()
        candidates = [
            c for c in candidates
            if self.pareto_front_pair(c.get_pattern_len(), graph_len) < 0
        ]
        candidates = get_attribute_subset(graph, candidates)
        candidates = get_combined_subset(graph, candidates)
        all_candidate_sizes.append(len(candidates))

        for candidate in candidates:
            if candidate.is_subgraph_of(graph):
                if is_maximal or dominance == 'closed':
                    skip_set.add(candidate)
                if is_free:
                    # One matching candidate is enough to discard ``graph``.
                    skip_set.add(graph)
                    break

    print('done dominance check')
    # Guard against an empty run (no graphs visited): the unguarded average
    # raised ZeroDivisionError. The sequence and itemset variants of this
    # check already guard the same way.
    if all_candidate_sizes:
        average = float(sum(all_candidate_sizes)) / float(len(all_candidate_sizes))
        print('AVG candidate size: %s' % average)
    return set_of_patterns - skip_set
def sequence_idp_multiple(params, patterns):
    """Return the ids of ``patterns`` minus those ``async_mapping`` reports.

    The patterns are grouped with ``make_grouping_by_support``. Groups that
    hold exactly one pattern are printed and skipped. For every other group,
    each pattern whose ``get_attribute_intersection`` result has more than
    one element is collected into a mapping; all non-empty mappings are
    handed to ``async_mapping`` together with the IDP generator and program
    name, and its result is subtracted from the set of all pattern ids.

    Args:
        params: mapping read for the keys ``"dominance"``, ``"type"`` and
            ``"data"``; also passed to ``IDPGenerator``.
        patterns: re-iterable collection of objects exposing ``.id``.

    Returns:
        set: ids of all patterns except the ones returned by
        ``async_mapping``.

    Raises:
        ValueError: if ``params["dominance"]`` is not ``"closed"``. No
            support grouping exists for other values, which previously
            surfaced as an ``AttributeError`` on ``None``.
    """
    # Fail fast with a clear message instead of crashing later on
    # ``None.values()`` after the generator has already been created.
    if params["dominance"] != "closed":
        raise ValueError(
            "sequence_idp_multiple only supports "
            "params['dominance'] == 'closed', got %r" % (params["dominance"],)
        )

    indices = set(seq.id for seq in patterns)

    # closed pattern mining by generated IDP code
    idp_gen = IDPGenerator(params)
    filename = os.path.basename(params["data"])
    idp_program_name = "{0}_{1}_{2}".format(
        params["dominance"], params["type"], filename.split(".")[0]
    )

    support_mapping = make_grouping_by_support(patterns)
    attribute_mapping = make_attribute_mapping(patterns)

    # group testing
    mapping_groups = []
    for group in support_mapping.values():
        if len(group) == 1:
            # Nothing to compare a lone pattern against.
            print(group)
            continue

        check_mapping = defaultdict(set)
        for seq in group:
            patterns_to_check = get_attribute_intersection(
                seq, attribute_mapping, support_mapping
            )
            if len(patterns_to_check) > 1:
                check_mapping[seq] = patterns_to_check
        if check_mapping:
            mapping_groups.append(check_mapping)

    nonclosed_indices = async_mapping(mapping_groups, idp_gen, idp_program_name)

    indices = indices - nonclosed_indices
    print(indices)
    return indices
def subsumption_lattice_check_itemset(self, patterns, params):
    """Drop the itemset patterns that lose the dominance check.

    Two hard-coded switches select the strategy:

    * ``is_3a_enabled`` (off): prune an initial set via
      ``extract_parental_tree_itemset`` / ``prune_initial_tree``.
    * ``is_3b_enabled`` (on): restrict candidates by support group
      (closed/free) and by ``check_bounds_and_size`` (closed/maximal) or
      ``check_larger_and_out_bounds`` (free). When off, every other
      not-yet-discarded pattern is a candidate.

    Itemsets are visited by ``get_pattern_len()``, descending unless the
    dominance is ``"free"``. For closed/maximal, a candidate whose
    ``itemset`` is a subset of the current one is discarded. For free, a
    candidate whose ``itemset`` is a superset of the current one is
    discarded.

    Args:
        patterns: re-iterable collection of hashable itemset patterns.
        params: mapping read for the key ``'dominance'``.

    Returns:
        set: working-set patterns that were not discarded.
    """
    print('\nStarting dominance check for itemsets...')
    is_3a_enabled = False
    is_3b_enabled = True
    all_candidate_sizes = []

    dominance = params['dominance']
    is_free = dominance == "free"
    is_closed = dominance == "closed"
    is_maximal = dominance == "maximal"

    if is_3a_enabled:
        pattern_to_parent, pattern_to_set_of_children = \
            self.extract_parental_tree_itemset(patterns)
        skip_set = self.prune_initial_tree(
            patterns, pattern_to_parent, pattern_to_set_of_children, params)
        set_of_patterns = set(patterns) - skip_set
        print('initial skip set size', len(skip_set))
    else:
        set_of_patterns = set(patterns)

    if is_3b_enabled and (is_closed or is_free):
        support_mapping = make_grouping_by_support(set_of_patterns)

    sorted_itemsets = sorted(
        set_of_patterns,
        key=lambda x: x.get_pattern_len(),
        reverse=(not is_free))

    skip_set = set()
    for itemset in tqdm(sorted_itemsets):
        if itemset in skip_set:
            continue

        if is_3b_enabled:
            if is_closed or is_free:
                candidates = support_mapping[itemset.get_support()] - skip_set
            if is_maximal:
                candidates = set_of_patterns - skip_set
            pattern_len = itemset.get_pattern_len()
            if is_closed or is_maximal:
                candidates = check_bounds_and_size(
                    pattern_len, itemset.min_val, itemset.max_val, candidates)
            if is_free:
                candidates = check_larger_and_out_bounds(
                    pattern_len, itemset.min_val, itemset.max_val, candidates)
        else:
            # Exclude the itemset being examined from its own candidates.
            # This previously referenced the undefined name ``pattern`` and
            # would have raised NameError as soon as the switch was off.
            candidates = (set(patterns) - skip_set) - set([itemset])

        all_candidate_sizes.append(len(candidates))

        for candidate in candidates:
            if is_closed:
                # Without support grouping the support equality required
                # for closedness has to be checked per candidate.
                if (not is_3b_enabled
                        and itemset.get_support() != candidate.get_support()):
                    continue
                if candidate.itemset.issubset(itemset.itemset):
                    skip_set.add(candidate)
            elif is_maximal:
                if candidate.itemset.issubset(itemset.itemset):
                    skip_set.add(candidate)
            elif is_free:
                if itemset.itemset.issubset(candidate.itemset):
                    skip_set.add(candidate)

    print('Dominance check done...')
    if all_candidate_sizes:
        average = float(sum(all_candidate_sizes)) / float(len(all_candidate_sizes))
        print('AVG candidate size: %s' % average)
    return set_of_patterns - skip_set