Example #1
0
def sequence_idp(params, patterns):
    """Return the ids of the sequences that IDP does not flag as non-closed.

    Patterns are grouped by support; for every group with more than one
    member an IDP program is generated and run, and each id reported on a
    ``selected_seq`` output line is removed from the result.

    Args:
        params: dict read for the keys "data", "dominance" and "type"; it is
            also handed to ``IDPGenerator``.
        patterns: iterable of pattern objects exposing an ``id`` attribute.

    Returns:
        set of ``seq.id`` values minus the ids parsed from the IDP output.

    Raises:
        ValueError: if ``params["dominance"]`` is not "closed" (no support
            grouping is built in that case, so there is nothing to iterate).
    """
    for p in patterns:
        print(p)
    indices = {seq.id for seq in patterns}
    nonclosed_indices = set()

    # closed pattern mining by generated IDP code
    idp_gen = IDPGenerator(params)
    filename = os.path.basename(params["data"])
    idp_program_name = "{0}_{1}_{2}".format(
        params["dominance"], params["type"], filename.split(".")[0])

    if params["dominance"] != "closed":
        # Previously this path set support_mapping to None and then crashed
        # with an AttributeError when iterating it; fail with a clear message.
        raise ValueError(
            "sequence_idp requires params['dominance'] == 'closed', got %r"
            % (params["dominance"],))
    support_mapping = make_grouping_by_support(patterns)

    attribute_mapping = make_attribute_mapping(patterns)

    for group in support_mapping.values():
        if len(group) == 1:
            # A pattern alone in its support group has nothing to be checked against.
            print(group)
            continue

        check_mapping = defaultdict(set)
        for seq in group:
            patterns_to_check = get_attribute_intersection(seq, attribute_mapping, support_mapping)
            if patterns_to_check:
                check_mapping[seq] = patterns_to_check

        if not check_mapping:
            # Nothing to verify for this group: do not run IDP, and do not
            # parse output left over from a previous group (or an unbound name).
            continue

        idp_gen.gen_IDP_code_group(check_mapping, idp_program_name)
        idp_output = idp_gen.run_IDP(idp_program_name)

        for line in idp_output.split("\n"):
            if "selected_seq" in line:
                # NOTE(review): only the single character at column 19 is
                # parsed, so a multi-digit id would be truncated if the id
                # starts at that column - confirm against the IDP output format.
                nonclosed_indices.add(int(line[19]))

    indices = indices - nonclosed_indices
    print(indices)

    return indices
  def subsumption_lattice_check_sequence(self, patterns, params):
    """Run the dominance check over sequence patterns and return the survivors.

    ``params['dominance']`` selects the mode:

    * "maximal": an initial skip set is computed from the parental tree and
      removed up front; afterwards every candidate that is a subsequence of
      a visited sequence is discarded.
    * "closed": same discarding rule, but candidates are taken only from the
      group of patterns with the same support as the visited sequence.
    * "free": candidates also come from the same-support group, but the
      visited sequence itself is discarded as soon as one candidate is a
      subsequence of it.

    Args:
        patterns: iterable of hashable pattern objects providing
            ``get_pattern_len()``, ``get_support()`` and ``is_subsequence_of()``.
        params: dict read for the key 'dominance'; also passed to
            ``prune_initial_tree`` in maximal mode.

    Returns:
        set of patterns that were neither pruned initially nor discarded.

    Raises:
        ValueError: if a pattern has to be checked and the dominance mode is
            not one of "closed", "free" or "maximal".
    """
    print('\n Starting dominance check for sequences...')

    dominance = params['dominance']
    is_free = dominance == "free"
    is_maximal = dominance == "maximal"
    uses_support_groups = dominance == "closed" or is_free

    if is_maximal:
      pattern_to_parent, pattern_to_set_of_children = self.extract_parental_tree_itemset(patterns)
      skip_set = self.prune_initial_tree(patterns, pattern_to_parent, pattern_to_set_of_children, params)
    else:
      skip_set = set()

    set_of_patterns = set(patterns) - skip_set

    if uses_support_groups:
      support_mapping = make_grouping_by_support(set_of_patterns)

    # Single pre-formatted argument so Python 2 and 3 print the same text.
    print("initial skip set len %s" % len(skip_set))

    # Start over: the initial skip set is already excluded from set_of_patterns.
    skip_set = set()
    # Longest patterns are visited first.
    ordered_sequences = sorted(set_of_patterns, key=lambda x: x.get_pattern_len(), reverse=True)

    all_candidate_sizes = []
    for seq in tqdm(ordered_sequences):
      if seq in skip_set:
        continue

      if uses_support_groups:
        candidates = support_mapping[seq.get_support()] - skip_set
      elif is_maximal:
        candidates = set_of_patterns - skip_set
      else:
        # Previously this fell through and hit an unbound 'candidates'.
        raise ValueError("unsupported dominance: %r" % (dominance,))

      # Narrow the candidates with the cheap filters before the
      # (presumably more expensive) subsequence test - TODO confirm cost.
      candidates = get_smaller_patterns(seq.get_pattern_len(), candidates)
      candidates = get_attribute_subset(seq, candidates)

      all_candidate_sizes.append(len(candidates))

      for candidate in candidates:
        if not candidate.is_subsequence_of(seq):
          continue
        if is_free:
          # One contained candidate is enough to discard seq itself.
          skip_set.add(seq)
          break
        # closed / maximal: the contained candidate is discarded.
        skip_set.add(candidate)

    print('dominance check done')
    if all_candidate_sizes:
      average = float(sum(all_candidate_sizes)) / float(len(all_candidate_sizes))
      print('AVG candidate size: %s' % average)
    return set_of_patterns - skip_set
Example #3
0
def itemset_idp_new(params, patterns):
    """Prepare the IDP generator and pattern mappings for itemsets.

    The function currently only performs the setup steps (generator
    construction, program-name derivation, support grouping for "closed"
    dominance, attribute mapping) and does not use their results.

    Args:
        params: dict read for the keys "data", "dominance" and "type"; it is
            also handed to ``IDPGenerator``.
        patterns: patterns passed to the grouping / mapping helpers.

    Returns:
        An empty list.
    """
    # closed pattern mining by generated IDP code
    idp_gen = IDPGenerator(params)

    data_dir, data_file = os.path.split(params["data"])
    dataset_stem = data_file.split(".")[0]
    idp_program_name = "{0}_{1}_{2}".format(params["dominance"], params["type"], dataset_stem)

    # Support grouping is only built for closed dominance.
    support_mapping = (
        make_grouping_by_support(patterns)
        if params["dominance"] == "closed"
        else None
    )

    mapping = make_attribute_mapping(patterns)

    return []
  def subsumption_lattice_check_graph(self, patterns, params):
    """Run the dominance check over graph patterns and return the survivors.

    An initial skip set is derived from the parent tree and removed up
    front. The remaining graphs are ordered with ``self.pareto_front_pair``
    applied to their pattern lengths (reversed) and visited one by one:

    * "closed" / "free": candidates come from the group of patterns with the
      same support as the visited graph.
    * "maximal": candidates come from all input patterns.

    Candidates are narrowed by ``pareto_front_pair(...) < 0``,
    ``get_attribute_subset`` and ``get_combined_subset``. For "closed" and
    "maximal" every candidate that is a subgraph of the visited graph is
    discarded; for "free" the visited graph itself is discarded on the first
    such candidate.

    Args:
        patterns: iterable of hashable pattern objects providing
            ``get_pattern_len()``, ``get_support()`` and ``is_subgraph_of()``.
        params: dict read for the key 'dominance'; also passed to
            ``initialize_skip_set_with_parent_info``.

    Returns:
        set of patterns that were neither skipped initially nor discarded.

    Raises:
        ValueError: if a graph has to be checked and the dominance mode is
            not one of "closed", "free" or "maximal".
    """
    # Local import keeps this method self-contained; cmp_to_key replaces the
    # Python-2-only ``cmp=`` argument of sorted() with identical ordering.
    from functools import cmp_to_key

    print('\n Starting dominance check for graphs...\n')
    #self.mapping_by_len = group_by_len(patterns)
    initial_subsumption_tree, initial_subsumed_by_tree = self.create_initial_parent_tree(patterns)
    skip_set = self.initialize_skip_set_with_parent_info(patterns, initial_subsumption_tree, initial_subsumed_by_tree, params)
    # Single pre-formatted argument so Python 2 and 3 print the same text.
    print("initial skip set %s" % len(skip_set))

    dominance = params['dominance']
    is_free = dominance == "free"
    is_maximal = dominance == "maximal"
    uses_support_groups = dominance == "closed" or is_free

    # Built once instead of on every loop iteration.
    all_patterns = set(patterns)
    set_of_patterns = all_patterns - skip_set

    def compare_by_len(x, y):
      return self.pareto_front_pair(x.get_pattern_len(), y.get_pattern_len())

    sorted_graphs = sorted(set_of_patterns, key=cmp_to_key(compare_by_len), reverse=True)

    if uses_support_groups:
      support_mapping = make_grouping_by_support(set_of_patterns)

    # Start over: the initial skip set is already excluded from set_of_patterns.
    skip_set = set()
    all_candidate_sizes = []
    for graph in tqdm(sorted_graphs):
      if graph in skip_set:
        continue

      if uses_support_groups:
        candidates = support_mapping[graph.get_support()] - skip_set
      elif is_maximal:
        # Maximal mode draws from all input patterns, including the ones
        # skipped initially (kept as in the original behaviour).
        candidates = all_patterns - skip_set
      else:
        # Previously this fell through and hit an unbound 'candidates'.
        raise ValueError("unsupported dominance: %r" % (dominance,))

      graph_len = graph.get_pattern_len()
      # List (not a lazy filter object) so len() below works on Python 3 too.
      candidates = [c for c in candidates
                    if self.pareto_front_pair(c.get_pattern_len(), graph_len) < 0]
      candidates = get_attribute_subset(graph, candidates)
      candidates = get_combined_subset(graph, candidates)

      all_candidate_sizes.append(len(candidates))

      for candidate in candidates:
        if not candidate.is_subgraph_of(graph):
          continue
        if is_free:
          # One contained candidate is enough to discard the graph itself.
          skip_set.add(graph)
          break
        # closed / maximal: the contained candidate is discarded.
        skip_set.add(candidate)

    print('done dominance check')
    # Guard against division by zero when no graph was checked.
    if all_candidate_sizes:
      average = float(sum(all_candidate_sizes)) / float(len(all_candidate_sizes))
      print('AVG candidate size: %s' % average)
    return set_of_patterns - skip_set
Example #5
0
def sequence_idp_multiple(params, patterns):
    """Return the ids of the sequences not reported as non-closed, checking groups in bulk.

    Patterns are grouped by support; for every group with more than one
    member a check mapping (pattern -> patterns to check against) is built,
    and all non-empty mappings are handed to ``async_mapping`` in one call.
    The ids it returns are removed from the result.

    Args:
        params: dict read for the keys "data", "dominance" and "type"; it is
            also handed to ``IDPGenerator``.
        patterns: iterable of pattern objects exposing an ``id`` attribute.

    Returns:
        set of ``seq.id`` values minus the ids returned by ``async_mapping``.

    Raises:
        ValueError: if ``params["dominance"]`` is not "closed" (no support
            grouping is built in that case, so there is nothing to iterate).
    """
    indices = {seq.id for seq in patterns}

    # closed pattern mining by generated IDP code
    idp_gen = IDPGenerator(params)
    filename = os.path.basename(params["data"])
    idp_program_name = "{0}_{1}_{2}".format(
        params["dominance"], params["type"], filename.split(".")[0])

    if params["dominance"] != "closed":
        # Previously this path set support_mapping to None and then crashed
        # with an AttributeError when iterating it; fail with a clear message.
        raise ValueError(
            "sequence_idp_multiple requires params['dominance'] == 'closed', got %r"
            % (params["dominance"],))
    support_mapping = make_grouping_by_support(patterns)

    attribute_mapping = make_attribute_mapping(patterns)

    # group testing
    mapping_groups = []
    for group in support_mapping.values():
        if len(group) == 1:
            # A pattern alone in its support group has nothing to be checked against.
            print(group)
            continue

        check_mapping = defaultdict(set)
        for seq in group:
            patterns_to_check = get_attribute_intersection(seq, attribute_mapping, support_mapping)
            # Presumably the result contains seq itself, hence "> 1" - TODO confirm.
            if len(patterns_to_check) > 1:
                check_mapping[seq] = patterns_to_check

        if check_mapping:
            mapping_groups.append(check_mapping)

    nonclosed_indices = async_mapping(mapping_groups, idp_gen, idp_program_name)
    # nonclosed_indices = async_mapping_withoutLock(mapping_groups, idp_gen, idp_program_name)

    indices = indices - nonclosed_indices
    print(indices)

    return indices
  def subsumption_lattice_check_itemset(self, patterns, params):
    """Run the dominance check over itemset patterns and return the survivors.

    ``params['dominance']`` selects the mode:

    * "closed" / "maximal": itemsets are visited longest first and every
      candidate whose ``itemset`` is a subset of the visited one is discarded.
    * "free": itemsets are visited shortest first and every candidate whose
      ``itemset`` is a superset of the visited one is discarded.

    Two hard-coded switches control the optimisations: ``is_3a_enabled``
    (initial pruning via the parental tree, currently off) and
    ``is_3b_enabled`` (candidate selection via support groups and the
    bound/size filters, currently on).

    Args:
        patterns: iterable of hashable pattern objects providing
            ``get_pattern_len()``, ``get_support()``, ``itemset``, ``min_val``
            and ``max_val``.
        params: dict read for the key 'dominance'; also passed to
            ``prune_initial_tree`` when the initial pruning is enabled.

    Returns:
        set of patterns that were neither pruned initially nor discarded.

    Raises:
        ValueError: if an itemset has to be checked with the support-group
            optimisation on and the dominance mode is not one of "closed",
            "free" or "maximal".
    """
    print('\nStarting dominance check for itemsets...')

    is_3a_enabled = False
    is_3b_enabled = True

    all_candidate_sizes = []

    dominance = params['dominance']
    is_free = dominance == "free"
    is_closed = dominance == "closed"
    is_maximal = dominance == "maximal"
    uses_support_groups = is_closed or is_free

    if is_3a_enabled:
      pattern_to_parent, pattern_to_set_of_children = self.extract_parental_tree_itemset(patterns)
      skip_set = self.prune_initial_tree(patterns, pattern_to_parent, pattern_to_set_of_children, params)

      set_of_patterns = set(patterns) - skip_set

      # Single pre-formatted argument so Python 2 and 3 print the same text.
      print('initial skip set size %s' % len(skip_set))
    else:
      set_of_patterns = set(patterns)

    if is_3b_enabled and uses_support_groups:
      support_mapping = make_grouping_by_support(set_of_patterns)

    # Free mode walks from short to long itemsets, the other modes from long to short.
    sorted_itemsets = sorted(set_of_patterns, key=lambda x: x.get_pattern_len(), reverse=(not is_free))
    skip_set = set()
    for itemset in tqdm(sorted_itemsets):
      if itemset in skip_set:
        continue

      if is_3b_enabled:
        if uses_support_groups:
          candidates = support_mapping[itemset.get_support()] - skip_set
        elif is_maximal:
          candidates = set_of_patterns - skip_set
        else:
          # Previously this fell through and hit an unbound 'candidates'.
          raise ValueError("unsupported dominance: %r" % (dominance,))

        length = itemset.get_pattern_len()
        if is_free:
          candidates = check_larger_and_out_bounds(length, itemset.min_val, itemset.max_val, candidates)
        else:
          candidates = check_bounds_and_size(length, itemset.min_val, itemset.max_val, candidates)
      else:
        # Fixed: this used the undefined name 'pattern' instead of 'itemset'.
        candidates = (set(patterns) - skip_set) - set([itemset])

      all_candidate_sizes.append(len(candidates))
      for candidate in candidates:
        # Without support groups the candidates are not pre-filtered by
        # support, so closed and free checks must compare supports explicitly.
        if (not is_3b_enabled and uses_support_groups
            and itemset.get_support() != candidate.get_support()):
          continue

        if is_free:
          if itemset.itemset.issubset(candidate.itemset):
            skip_set.add(candidate)
        elif is_closed or is_maximal:
          if candidate.itemset.issubset(itemset.itemset):
            skip_set.add(candidate)

    print('Dominance check done...')
    if all_candidate_sizes:
      average = float(sum(all_candidate_sizes)) / float(len(all_candidate_sizes))
      print('AVG candidate size: %s' % average)

    return set_of_patterns - skip_set