def get_trueFIs(ds_stats, res_filename, min_freq, delta, gap=0.0, use_additional_knowledge=False): """ Compute the True Frequent Itemsets using the VC method we present in the paper. The parameter 'use_additional_knowledge' can be used to incorporate additional knowledge about the data generation process. 'gap' controls how close to the optimal solution we ask the CPLEX solver to go. The right way to implement this would be to use a user-defined function in CPLEX. Returns a pair (trueFIs, stats). 'trueFIs' is a dict whose keys are itemsets (frozensets) and values are frequencies. This collection of itemsets contains only TFIs with probability at least 1 - delta. 'stats' is a dict containing various statistics used in computing the collection of itemsets.""" stats = dict() # One may want to play with giving different values for the different error # probabilities, but there isn't really much point in it. lower_delta = 1.0 - math.sqrt(1 - delta) # Compute the maximum frequency of an itemset in the dataset with open(res_filename, 'rt') as FILE: size_line = FILE.readline() try: size_str = size_line.split("(")[1].split(")")[0] except IndexError: error_exit( "Cannot compute size of the dataset: '{}' is not in the recognized format\n" .format(size_line)) try: size = int(size_str) except ValueError: error_exit( "Cannot compute size of the dataset: '{}' is not a number\n". format(size_line.split("(")[1].split(")")[0])) max_freq_line = FILE.readline() if max_freq_line.find("(") > -1: tokens = max_freq_line.split("(") itemset = frozenset(map(int, tokens[0].split())) try: support = int(tokens[1][:-2]) except ValueError: error_exit( "Cannot compute the maximum frequency: '{}' is not a number\n" .format(tokens[1][:-2])) max_freq = support / size else: error_exit( "Cannot compute the maximum frequency: '{}' is not in the recognized format\n" .format(max_freq_line)) # Compute the first epsilon using results from the paper (Riondato and # Upfal 2014) # Incorporate or not 'previous knowledge' about generative process in # computation of the VC-dimension, depending on the option passed on the # command line (eps_vc_dim, eps_shatter, returned) = epsilon.epsilon_dataset(lower_delta, ds_stats, use_additional_knowledge, max_freq) stats['epsilon_1'] = min(eps_vc_dim, eps_shatter) items = ds_stats['items'] items_num = len(items) lengths_dict = ds_stats['lengths'] lengths = sorted(lengths_dict.keys(), reverse=True) # Extract the first (and largest) set of itemsets with frequency at least # min_freq - stats['epsilon_1'] lower_bound_freq = min_freq - stats['epsilon_1'] - (1 / ds_stats['size']) freq_itemsets_1_dict = utils.create_results(res_filename, lower_bound_freq) freq_itemsets_1_set = frozenset(freq_itemsets_1_dict.keys()) freq_itemsets_1_sorted = sorted(freq_itemsets_1_set, key=lambda x: freq_itemsets_1_dict[x]) freq_items_1 = set() for itemset in freq_itemsets_1_set: if len(itemset) == 1: freq_items_1 |= itemset freq_items_1_num = len(freq_items_1) sys.stderr.write("First set of FI's: {} itemsets\n".format( len(freq_itemsets_1_set))) sys.stderr.flush() constr_start_str = "cplex.SparsePair(ind = [" constr_end_str = "], val = vals)" # Compute the "base set" (terrible name), that is the set of # itemsets with frequency < min_freq + epsilon_1 (but greater than min_freq # - stats['epsilon_1']. In the paper we call it \mathcal{G}. sys.stderr.write("Creating base set...") sys.stderr.flush() base_set = dict() # We use the maximum frequency in the base set to compute the epislon max_freq_base_set = 0 for itemset in freq_itemsets_1_sorted: if freq_itemsets_1_dict[itemset] < min_freq + stats['epsilon_1']: base_set[itemset] = freq_itemsets_1_dict[itemset] if freq_itemsets_1_dict[itemset] > max_freq_base_set: max_freq_base_set = freq_itemsets_1_dict[itemset] else: break stats['base_set'] = len(base_set) sys.stderr.write("done: {} itemsets\n".format(stats['base_set'])) sys.stderr.flush() # Compute Closed Itemsets. We need them to compute the maximal. sys.stderr.write("Computing closed itemsets...") sys.stderr.flush() closed_itemsets = utils.get_closed_itemsets(base_set) closed_itemsets_len = len(closed_itemsets) sys.stderr.write( "done. Found {} closed itemsets\n".format(closed_itemsets_len)) sys.stderr.flush() # Compute maximal itemsets. We will use them to compute the negative # border. An itemset is maximal frequent if none of its immediate supersets # is frequent. sys.stderr.write("Computing maximal itemsets...") sys.stderr.flush() maximal_itemsets_dict = utils.get_maximal_itemsets(closed_itemsets) maximal_itemsets = list(maximal_itemsets_dict.keys()) stats['maximal_itemsets'] = len(maximal_itemsets) sys.stderr.write("done. Found {} maximal itemsets\n".format( stats['maximal_itemsets'])) sys.stderr.flush() # Compute the negative border sys.stderr.write("Computing negative border...") sys.stderr.flush() negative_border = set() negative_border_items = set() # The idea is to look for "children" of maximal itemsets, and for # "siblings" of maximal itemsets for maximal in maximal_itemsets: for item_to_remove_from_maximal in maximal: reduced_maximal = maximal - frozenset([ item_to_remove_from_maximal, ]) for item in freq_items_1: if item in maximal: continue # Create sibling candidate = reduced_maximal | frozenset([item]) if candidate in freq_itemsets_1_set: continue if candidate in negative_border: continue to_add = True for item_to_remove in candidate: subset = candidate - frozenset([item_to_remove]) if subset not in freq_itemsets_1_set: to_add = False break if to_add: negative_border.add(candidate) negative_border_items |= candidate if not to_add: # if we added the sibling, there's no way we can add the # child candidate2 = maximal | frozenset([item]) # create child if candidate2 in negative_border: continue to_add = True for item_to_remove in candidate2: subset = candidate2 - frozenset([item_to_remove]) if subset not in freq_itemsets_1_set: to_add = False break if to_add: negative_border.add(candidate2) negative_border_items |= candidate # We don't need to add the non-frequent-items because none of them (or # their supersets) will ever be included in the output, so at most we lose # some statistical power, but it's not a problem of avoiding false # positives. # for item in non_freq_items_1: # negative_border.add(frozenset([item])) # negative_border_items.add(item) original_negative_border_len = len(negative_border) sys.stderr.write( "done. Length now: {}\n".format(original_negative_border_len)) sys.stderr.flush() # Add the "base set" to negative_border, so that it becomes a superset of # the "true" negative border (with some caveats about non-frequent single # items and their supersets, see comment above) sys.stderr.write("Adding base set...") sys.stderr.flush() for itemset in base_set: negative_border.add(itemset) negative_border_items |= itemset sys.stderr.write("done. Length now: {}\n".format(len(negative_border))) sys.stderr.flush() negative_border = sorted(negative_border, key=len, reverse=True) stats['negative_border'] = len(negative_border) negative_border_items_sorted = sorted(negative_border_items) # Create the graph that we will use to compute the chain constraints. # The nodes are the itemsets in negative_border. There is an edge between # two nodes if one is contained in the other or vice-versa. # Cliques on this graph are chains. sys.stderr.write("Creating graph...") sys.stderr.flush() graph = nx.Graph() graph.add_nodes_from(negative_border) sys.stderr.write("added nodes...adding edges...") sys.stderr.flush() negative_border_items_in_sets_dict = dict() negative_border_itemset_index = 0 itemset_indexes_dict = dict() for first_itemset_index in range(stats['negative_border']): first_itemset = negative_border[first_itemset_index] for second_itemset_index in range(first_itemset_index + 1, stats['negative_border']): second_itemset = negative_border[second_itemset_index] if first_itemset < second_itemset or \ second_itemset < first_itemset: graph.add_edge(first_itemset, second_itemset) for item in first_itemset: if item in negative_border_items_in_sets_dict: negative_border_items_in_sets_dict[item].append( negative_border_itemset_index) else: negative_border_items_in_sets_dict[item] = \ [negative_border_itemset_index, ] itemset_indexes_dict[first_itemset] = negative_border_itemset_index negative_border_itemset_index += 1 sys.stderr.write("done\n") sys.stderr.flush() capacity = freq_items_1_num - 1 if use_additional_knowledge and 2 * ds_stats['maxlen'] < capacity: sys.stderr.write("Lowering capacity={} to {}\n".format( capacity, 2 * ds_stats['maxlen'])) sys.stderr.flush() capacity = 2 * ds_stats['maxlen'] vars_num = stats['negative_border'] + len(negative_border_items) constr_names = [] (tmpfile_handle, tmpfile_name) = tempfile.mkstemp(prefix="cplx", dir=os.environ['PWD'], text=True) os.close(tmpfile_handle) with open(tmpfile_name, 'wt') as cplex_script: cplex_script.write("capacity = {}\n".format(capacity)) cplex_script.write("import cplex, os, sys\n") cplex_script.write("from cplex.exceptions import CplexError\n") cplex_script.write("\n") cplex_script.write("\n") cplex_script.write(" ".join( ("os.environ[\"ILOG_LICENSE_FILE\"] =", "\"/local/projects/cplex/ilm/site.access.ilm\"\n"))) cplex_script.write("vals = [-1.0, 1.0]\n") cplex_script.write("sets_num = {}\n".format(stats['negative_border'])) cplex_script.write("items_num = {}\n".format( len(negative_border_items))) cplex_script.write("vars_num = {}\n".format(vars_num)) cplex_script.write("my_ub = [1.0] * vars_num\n") cplex_script.write( "my_types = \"\".join(\"I\" for i in range(vars_num))\n") cplex_script.write( "my_obj = ([1.0] * sets_num) + ([0.0] * items_num)\n") cplex_script.write(" ".join( ("my_colnames =", "[\"set{0}\".format(i) for i in range(sets_num)] +", "[\"item{0}\".format(j) for j in range(items_num)]\n"))) cplex_script.write("rows = [ ") sys.stderr.write("Writing knapsack constraints...") sys.stderr.flush() constr_num = 0 for item_index in range(len(negative_border_items)): try: for itemset_index in negative_border_items_in_sets_dict[ negative_border_items_sorted[item_index]]: constr_str = "".join( (constr_start_str, "\"set{}\",\"item{}\"".format( itemset_index, item_index), constr_end_str)) cplex_script.write("{},".format(constr_str)) constr_num += 1 name = "s{}i{}".format(item_index, itemset_index) constr_names.append(name) except KeyError: sys.stderr.write(" ".join( ("item_index={}".format(item_index), "neg_border_items_sorted[item_index]={}\n".format( negative_border_items_sorted[item_index])))) sys.stderr.write("{} in items: {}\n".format( negative_border_items_sorted[item_index], negative_border_items_sorted[item_index] in items)) sys.stderr.write("{} in freq_items_1: {}\n".format( negative_border_items_sorted[item_index], negative_border_items_sorted[item_index] in freq_items_1)) non_freq_items_1 = items - freq_items_1 sys.stderr.write("{} in non_freq_items_1: {}\n".format( negative_border_items_sorted[item_index], negative_border_items_sorted[item_index] in non_freq_items_1)) in_pos_border = False pos_border_itemset = frozenset() for itemset in maximal_itemsets: if negative_border_items_sorted[item_index] in itemset: in_pos_border = True pos_border_itemset = itemset break sys.stderr.write( "{} in maximal_itemsets: {}. Itemset: {}\n".format( negative_border_items_sorted[item_index], in_pos_border, pos_border_itemset)) in_neg_border = False neg_border_itemset = frozenset() for itemset in negative_border: if negative_border_items_sorted[item_index] in itemset: in_neg_border = True neg_border_itemset = itemset break sys.stderr.write( "{} in negative_border: {}. Itemset: {}\n".format( negative_border_items_sorted[item_index], in_neg_border, neg_border_itemset)) sys.exit(1) # Create capacity constraints and write it to script constr_str = "".join( (constr_start_str, ",".join("\"item{}\"".format(j) for j in range(len(negative_border_items))), "], val=[", ",".join("1.0" for j in range(len(negative_border_items))), "])")) cplex_script.write(constr_str) last_tell = cplex_script.tell() cplex_script.write(",") cap_constr_name = "capacity" constr_names.append(cap_constr_name) sys.stderr.write("done\n") sys.stderr.flush() # Create chain constraints and write them to script sys.stderr.write("Writing chain constraints...") sys.stderr.flush() chains_index = 0 for clique in nx.find_cliques(graph): if len(clique) == 1: continue constr_str = "".join( (constr_start_str, ",".join( "\"set{}\"".format(j) for j in map(lambda x: itemset_indexes_dict[x], clique)), "], val=[1.0] * {}".format(len(clique)), ")")) cplex_script.write(constr_str) last_tell = cplex_script.tell() cplex_script.write(",") name = "chain{}".format(chains_index) constr_names.append(name) chains_index += 1 sys.stderr.write("done\n") sys.stderr.flush() sys.stderr.write(" ".join( ("Optimization problem: capacity={}".format(capacity), "vars_num={}".format(vars_num), "negative_border_size={}".format(stats['negative_border']), "negative_border_items_num={}".format(len(negative_border_items)), "constr_num={}".format(constr_num), "chains_index={}\n".format(chains_index)))) sys.stderr.flush() # Go back one character to remove last comma "," cplex_script.seek(last_tell) cplex_script.write("]\n") cplex_script.write("my_rownames = {}\n".format(constr_names)) cplex_script.write("constr_num = {}\n".format(constr_num)) cplex_script.write("chain_constr_num = {}\n".format(chains_index)) cplex_script.write(" ".join( ("my_senses = [\"G\"] * constr_num +", "[\"L\"] + [\"L\"] * chain_constr_num\n"))) cplex_script.write(" ".join( ("my_rhs = [0.0] * constr_num + [capacity] +", "[1.0] * chain_constr_num\n"))) cplex_script.write("\n") cplex_script.write("try:\n") cplex_script.write(" prob = cplex.Cplex()\n") cplex_script.write(" prob.set_error_stream(sys.stderr)\n") cplex_script.write(" prob.set_log_stream(sys.stderr)\n") cplex_script.write(" prob.set_results_stream(sys.stderr)\n") cplex_script.write(" prob.set_warning_stream(sys.stderr)\n") # cplex_script.write(" prob.parameters.mip.strategy.file.set(2)\n") cplex_script.write( " prob.parameters.mip.tolerances.mipgap.set({})\n".format(gap)) cplex_script.write( " prob.parameters.timelimit.set({})\n".format(600)) # cplex_script.write(" # prob.parameters.mip.strategy.variableselect.set(3) # strong # branching\n") cplex_script.write( " prob.objective.set_sense(prob.objective.sense.maximize)\n") cplex_script.write(" ".join( (" prob.variables.add(obj = my_obj, ub = my_ub,", "types = my_types, names = my_colnames)\n"))) cplex_script.write(" ".join( (" prob.linear_constraints.add(lin_expr = rows,", "senses = my_senses, rhs = my_rhs, names = my_rownames)\n"))) cplex_script.write(" ".join( (" prob.MIP_starts.add(cplex.SparsePair(", "ind = [i for i in range(vars_num)],", "val = [1.0] * vars_num),", "prob.MIP_starts.effort_level.auto)\n"))) cplex_script.write(" prob.solve()\n") cplex_script.write("".join( (" print (prob.solution.get_status(),", "prob.solution.status[prob.solution.get_status()],", "prob.solution.MIP.get_best_objective(),", "prob.solution.MIP.get_mip_relative_gap())\n"))) cplex_script.write("except CplexError, exc:\n") cplex_script.write(" print exc\n") # Run script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = \ "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"]) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format( err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode( locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[-1 if len(cplex_output_lines[-1] ) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit( "Error evaluating the CPLEX solution line: {}\n".format( cplex_solution_line)) sys.stderr.write("cplex_solution={}\n".format(cplex_solution)) sys.stderr.flush() # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], cplex_solution[2])) # This is also an upper bound to the size of the true negative border optimal_sol_upp_bound = int( math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) # Compute non-empirical VC-dimension and first candidate to epsilon_2 stats['not_emp_vc_dim'] = int(math.floor( math.log2(optimal_sol_upp_bound))) + 1 if stats['not_emp_vc_dim'] > math.log2(len(negative_border)): sys.stderr.write( "Lowering non_empirical VC-dimension to maximum value\n") stats['not_emp_vc_dim'] = int( math.floor(math.log2(len(negative_border)))) not_emp_epsilon_2 = epsilon.get_eps_vc_dim(lower_delta, ds_stats['size'], stats['not_emp_vc_dim']) sys.stderr.write(" ".join( ("items_num-1={}".format(items_num - 1), "optimal_sol_upp_bound={}".format(optimal_sol_upp_bound), "not_emp_vc_dim={}".format(stats['not_emp_vc_dim']), "not_emp_e2={}\n".format(not_emp_epsilon_2)))) sys.stderr.flush() # Loop to compute empirical VC-dimension using lengths distribution items_num_str_len = len(str(len(negative_border_items) - 1)) longer_equal = 0 for i in range(len(lengths)): cand_len = lengths[i] if cand_len == items_num: continue longer_equal += lengths_dict[cand_len] # No need to include tests to check whether cand_len is lower than # 2*ds_stats['maxlen'] if use_additional_knowledge is True: it is # always true given that cand_len <= ds_stats['maxlen'] if cand_len >= len(negative_border_items): cand_len = len(negative_border_items) - 1 # Modify the script to use the new capacity. with open(tmpfile_name, 'r+t') as cplex_script: cplex_script.seek(0) cplex_script.write("capacity = {}\n".format( str(cand_len).ljust(items_num_str_len))) # Run the script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = \ "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"]) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format( err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode( locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[ -1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit( "Error evaluating the CPLEX solution line: {}\n".format( cplex_solution_line)) sys.stderr.write("{}\n".format(cplex_solution)) # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], # cplex_solution[2])) # if cplex_solution[0] == 102: optimal_sol_upp_bound_emp = int( math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) # else: # optimal_sol_upp_bound_emp = cplex_solution[0] stats['emp_vc_dim'] = int( math.floor(math.log2(optimal_sol_upp_bound_emp))) + 1 if stats['emp_vc_dim'] > math.log2(len(negative_border)): sys.stderr.write("Lowering VC-dimension to maximum value\n") stats['emp_vc_dim'] = int( math.floor(math.log2(len(negative_border)))) sys.stderr.write(" ".join( ("cand_len={}".format(cand_len), "longer_equal={}".format(longer_equal), "emp_vc_dim={}".format(stats['emp_vc_dim']), "optimal_sol_upp_bound_emp={}\n".format(optimal_sol_upp_bound_emp) ))) sys.stderr.flush() # If stopping condition is satisfied, exit. if stats['emp_vc_dim'] <= longer_equal: break # sys.stderr.write("{} {} {}\n".format(vc_dim_cand, vc_dim_cand2, # vc_dim_cand3)) os.remove(tmpfile_name) # Compute the bound to the shatter coefficient, which we use to compute # epsilon bound = min((math.log(optimal_sol_upp_bound), stats['emp_vc_dim'] * math.log(math.e * ds_stats['size'] / stats['emp_vc_dim']))) sys.stderr.write( "bound to shatter coeff: log_of_range_size={}, log_using_vc_dim={}\n". format( math.log(optimal_sol_upp_bound), stats['emp_vc_dim'] * math.log(math.e * ds_stats['size'] / stats['emp_vc_dim']))) sys.stderr.flush() # The following assert is to check that we are better than another bound to # the shatter coefficient which used the number of closed itemsets in the # base set and the size of the negative border of the base set. # Intuitively, the assert should not fail. =) assert (optimal_sol_upp_bound <= original_negative_border_len + closed_itemsets_len) # Compute second candidate to epsilon_2 emp_epsilon_2 = epsilon.get_eps_shattercoeff_bound(lower_delta, ds_stats['size'], bound, max_freq_base_set) sys.stderr.write( "cand_len={} opt_sol_upp_bound_emp={} emp_vc_dim={} bound={} max_freq_base_set={} emp_e2={}\n" .format(cand_len, optimal_sol_upp_bound_emp, stats['emp_vc_dim'], bound, max_freq_base_set, emp_epsilon_2)) sys.stderr.flush() sys.stderr.write("not_emp_e2={}, emp_e2={}\n".format( not_emp_epsilon_2, emp_epsilon_2)) sys.stderr.flush() stats['epsilon_2'] = min(emp_epsilon_2, not_emp_epsilon_2) # Extract TFIs using epsilon_2 sys.stderr.write("Extracting TFIs using epsilon_2...") sys.stderr.flush() trueFIs = dict() for itemset in reversed(freq_itemsets_1_sorted): if freq_itemsets_1_dict[itemset] >= min_freq + stats['epsilon_2']: trueFIs[itemset] = freq_itemsets_1_dict[itemset] else: break sys.stderr.write("done ({} TFIS)\n".format(len(trueFIs))) sys.stderr.flush() return (trueFIs, stats)
def get_trueFIs(ds_stats, res_filename, min_freq, delta, gap=0.0, use_additional_knowledge=False): """ Compute the True Frequent Itemsets using the VC method we present in the paper. The parameter 'use_additional_knowledge' can be used to incorporate additional knowledge about the data generation process. 'gap' controls how close to the optimal solution we ask the CPLEX solver to go. The right way to implement this would be to use a user-defined function in CPLEX. Returns a pair (trueFIs, stats). 'trueFIs' is a dict whose keys are itemsets (frozensets) and values are frequencies. This collection of itemsets contains only TFIs with probability at least 1 - delta. 'stats' is a dict containing various statistics used in computing the collection of itemsets.""" stats = dict() # One may want to play with giving different values for the different error # probabilities, but there isn't really much point in it. lower_delta = 1.0 - math.sqrt(1 - delta) # Compute the maximum frequency of an itemset in the dataset with open(res_filename, 'rt') as FILE: size_line = FILE.readline() try: size_str = size_line.split("(")[1].split(")")[0] except IndexError: error_exit("Cannot compute size of the dataset: '{}' is not in the recognized format\n".format(size_line)) try: size = int(size_str) except ValueError: error_exit("Cannot compute size of the dataset: '{}' is not a number\n".format(size_line.split("(")[1].split(")")[0])) max_freq_line = FILE.readline() if max_freq_line.find("(") > -1: tokens = max_freq_line.split("(") itemset = frozenset(map(int, tokens[0].split())) try: support = int(tokens[1][:-2]) except ValueError: error_exit("Cannot compute the maximum frequency: '{}' is not a number\n".format(tokens[1][:-2])) max_freq = support / size else: error_exit("Cannot compute the maximum frequency: '{}' is not in the recognized format\n".format(max_freq_line)) # Compute the first epsilon using results from the paper (Riondato and # Upfal 2014) # Incorporate or not 'previous knowledge' about generative process in # computation of the VC-dimension, depending on the option passed on the # command line (eps_vc_dim, eps_shatter, returned) = epsilon.epsilon_dataset( lower_delta, ds_stats, use_additional_knowledge, max_freq) stats['epsilon_1'] = min(eps_vc_dim, eps_shatter) items = ds_stats['items'] items_num = len(items) lengths_dict = ds_stats['lengths'] lengths = sorted(lengths_dict.keys(), reverse=True) # Extract the first (and largest) set of itemsets with frequency at least # min_freq - stats['epsilon_1'] lower_bound_freq = min_freq - stats['epsilon_1'] - (1 / ds_stats['size']) freq_itemsets_1_dict = utils.create_results(res_filename, lower_bound_freq) freq_itemsets_1_set = frozenset(freq_itemsets_1_dict.keys()) freq_itemsets_1_sorted = sorted(freq_itemsets_1_set, key=lambda x: freq_itemsets_1_dict[x]) freq_items_1 = set() for itemset in freq_itemsets_1_set: if len(itemset) == 1: freq_items_1 |= itemset freq_items_1_num = len(freq_items_1) sys.stderr.write("First set of FI's: {} itemsets\n".format( len(freq_itemsets_1_set))) sys.stderr.flush() constr_start_str = "cplex.SparsePair(ind = [" constr_end_str = "], val = vals)" # Compute the "base set" (terrible name), that is the set of # itemsets with frequency < min_freq + epsilon_1 (but greater than min_freq # - stats['epsilon_1']. In the paper we call it \mathcal{G}. sys.stderr.write("Creating base set...") sys.stderr.flush() base_set = dict() # We use the maximum frequency in the base set to compute the epislon max_freq_base_set = 0 for itemset in freq_itemsets_1_sorted: if freq_itemsets_1_dict[itemset] < min_freq + stats['epsilon_1']: base_set[itemset] = freq_itemsets_1_dict[itemset] if freq_itemsets_1_dict[itemset] > max_freq_base_set: max_freq_base_set = freq_itemsets_1_dict[itemset] else: break stats['base_set'] = len(base_set) sys.stderr.write("done: {} itemsets\n".format(stats['base_set'])) sys.stderr.flush() # Compute Closed Itemsets. We need them to compute the maximal. sys.stderr.write("Computing closed itemsets...") sys.stderr.flush() closed_itemsets = utils.get_closed_itemsets(base_set) closed_itemsets_len = len(closed_itemsets) sys.stderr.write("done. Found {} closed itemsets\n".format( closed_itemsets_len)) sys.stderr.flush() # Compute maximal itemsets. We will use them to compute the negative # border. An itemset is maximal frequent if none of its immediate supersets # is frequent. sys.stderr.write("Computing maximal itemsets...") sys.stderr.flush() maximal_itemsets_dict = utils.get_maximal_itemsets(closed_itemsets) maximal_itemsets = list(maximal_itemsets_dict.keys()) stats['maximal_itemsets'] = len(maximal_itemsets) sys.stderr.write("done. Found {} maximal itemsets\n".format( stats['maximal_itemsets'])) sys.stderr.flush() # Compute the negative border sys.stderr.write("Computing negative border...") sys.stderr.flush() negative_border = set() negative_border_items = set() # The idea is to look for "children" of maximal itemsets, and for # "siblings" of maximal itemsets for maximal in maximal_itemsets: for item_to_remove_from_maximal in maximal: reduced_maximal = maximal - frozenset( [item_to_remove_from_maximal, ]) for item in freq_items_1: if item in maximal: continue # Create sibling candidate = reduced_maximal | frozenset([item]) if candidate in freq_itemsets_1_set: continue if candidate in negative_border: continue to_add = True for item_to_remove in candidate: subset = candidate - frozenset([item_to_remove]) if subset not in freq_itemsets_1_set: to_add = False break if to_add: negative_border.add(candidate) negative_border_items |= candidate if not to_add: # if we added the sibling, there's no way we can add the # child candidate2 = maximal | frozenset([item]) # create child if candidate2 in negative_border: continue to_add = True for item_to_remove in candidate2: subset = candidate2 - frozenset([item_to_remove]) if subset not in freq_itemsets_1_set: to_add = False break if to_add: negative_border.add(candidate2) negative_border_items |= candidate # We don't need to add the non-frequent-items because none of them (or # their supersets) will ever be included in the output, so at most we lose # some statistical power, but it's not a problem of avoiding false # positives. # for item in non_freq_items_1: # negative_border.add(frozenset([item])) # negative_border_items.add(item) original_negative_border_len = len(negative_border) sys.stderr.write("done. Length now: {}\n".format( original_negative_border_len)) sys.stderr.flush() # Add the "base set" to negative_border, so that it becomes a superset of # the "true" negative border (with some caveats about non-frequent single # items and their supersets, see comment above) sys.stderr.write("Adding base set...") sys.stderr.flush() for itemset in base_set: negative_border.add(itemset) negative_border_items |= itemset sys.stderr.write("done. Length now: {}\n".format(len(negative_border))) sys.stderr.flush() negative_border = sorted(negative_border, key=len, reverse=True) stats['negative_border'] = len(negative_border) negative_border_items_sorted = sorted(negative_border_items) # Create the graph that we will use to compute the chain constraints. # The nodes are the itemsets in negative_border. There is an edge between # two nodes if one is contained in the other or vice-versa. # Cliques on this graph are chains. sys.stderr.write("Creating graph...") sys.stderr.flush() graph = nx.Graph() graph.add_nodes_from(negative_border) sys.stderr.write("added nodes...adding edges...") sys.stderr.flush() negative_border_items_in_sets_dict = dict() negative_border_itemset_index = 0 itemset_indexes_dict = dict() for first_itemset_index in range(stats['negative_border']): first_itemset = negative_border[first_itemset_index] for second_itemset_index in range(first_itemset_index + 1, stats['negative_border']): second_itemset = negative_border[second_itemset_index] if first_itemset < second_itemset or \ second_itemset < first_itemset: graph.add_edge(first_itemset, second_itemset) for item in first_itemset: if item in negative_border_items_in_sets_dict: negative_border_items_in_sets_dict[item].append( negative_border_itemset_index) else: negative_border_items_in_sets_dict[item] = \ [negative_border_itemset_index, ] itemset_indexes_dict[first_itemset] = negative_border_itemset_index negative_border_itemset_index += 1 sys.stderr.write("done\n") sys.stderr.flush() capacity = freq_items_1_num - 1 if use_additional_knowledge and 2 * ds_stats['maxlen'] < capacity: sys.stderr.write("Lowering capacity={} to {}\n".format( capacity, 2 * ds_stats['maxlen'])) sys.stderr.flush() capacity = 2 * ds_stats['maxlen'] vars_num = stats['negative_border'] + len(negative_border_items) constr_names = [] (tmpfile_handle, tmpfile_name) = tempfile.mkstemp( prefix="cplx", dir=os.environ['PWD'], text=True) os.close(tmpfile_handle) with open(tmpfile_name, 'wt') as cplex_script: cplex_script.write("capacity = {}\n".format(capacity)) cplex_script.write("import cplex, os, sys\n") cplex_script.write("from cplex.exceptions import CplexError\n") cplex_script.write("\n") cplex_script.write("\n") cplex_script.write( " ".join( ("os.environ[\"ILOG_LICENSE_FILE\"] =", "\"/local/projects/cplex/ilm/site.access.ilm\"\n"))) cplex_script.write("vals = [-1.0, 1.0]\n") cplex_script.write("sets_num = {}\n".format(stats['negative_border'])) cplex_script.write("items_num = {}\n".format( len(negative_border_items))) cplex_script.write("vars_num = {}\n".format(vars_num)) cplex_script.write("my_ub = [1.0] * vars_num\n") cplex_script.write( "my_types = \"\".join(\"I\" for i in range(vars_num))\n") cplex_script.write( "my_obj = ([1.0] * sets_num) + ([0.0] * items_num)\n") cplex_script.write( " ".join( ("my_colnames =", "[\"set{0}\".format(i) for i in range(sets_num)] +", "[\"item{0}\".format(j) for j in range(items_num)]\n"))) cplex_script.write("rows = [ ") sys.stderr.write("Writing knapsack constraints...") sys.stderr.flush() constr_num = 0 for item_index in range(len(negative_border_items)): try: for itemset_index in negative_border_items_in_sets_dict[ negative_border_items_sorted[item_index]]: constr_str = "".join( (constr_start_str, "\"set{}\",\"item{}\"".format( itemset_index, item_index), constr_end_str)) cplex_script.write("{},".format(constr_str)) constr_num += 1 name = "s{}i{}".format(item_index, itemset_index) constr_names.append(name) except KeyError: sys.stderr.write( " ".join( ("item_index={}".format(item_index), "neg_border_items_sorted[item_index]={}\n".format( negative_border_items_sorted[item_index])))) sys.stderr.write("{} in items: {}\n".format( negative_border_items_sorted[item_index], negative_border_items_sorted[item_index] in items)) sys.stderr.write("{} in freq_items_1: {}\n".format( negative_border_items_sorted[item_index], negative_border_items_sorted[item_index] in freq_items_1)) non_freq_items_1 = items - freq_items_1 sys.stderr.write("{} in non_freq_items_1: {}\n".format( negative_border_items_sorted[item_index], negative_border_items_sorted[item_index] in non_freq_items_1)) in_pos_border = False pos_border_itemset = frozenset() for itemset in maximal_itemsets: if negative_border_items_sorted[item_index] in itemset: in_pos_border = True pos_border_itemset = itemset break sys.stderr.write( "{} in maximal_itemsets: {}. Itemset: {}\n".format( negative_border_items_sorted[item_index], in_pos_border, pos_border_itemset)) in_neg_border = False neg_border_itemset = frozenset() for itemset in negative_border: if negative_border_items_sorted[item_index] in itemset: in_neg_border = True neg_border_itemset = itemset break sys.stderr.write( "{} in negative_border: {}. Itemset: {}\n".format( negative_border_items_sorted[item_index], in_neg_border, neg_border_itemset)) sys.exit(1) # Create capacity constraints and write it to script constr_str = "".join( (constr_start_str, ",".join( "\"item{}\"".format(j) for j in range( len(negative_border_items))), "], val=[", ",".join( "1.0" for j in range(len(negative_border_items))), "])")) cplex_script.write(constr_str) last_tell = cplex_script.tell() cplex_script.write(",") cap_constr_name = "capacity" constr_names.append(cap_constr_name) sys.stderr.write("done\n") sys.stderr.flush() # Create chain constraints and write them to script sys.stderr.write("Writing chain constraints...") sys.stderr.flush() chains_index = 0 for clique in nx.find_cliques(graph): if len(clique) == 1: continue constr_str = "".join( (constr_start_str, ",".join("\"set{}\"".format(j) for j in map( lambda x: itemset_indexes_dict[x], clique)), "], val=[1.0] * {}".format(len(clique)), ")")) cplex_script.write(constr_str) last_tell = cplex_script.tell() cplex_script.write(",") name = "chain{}".format(chains_index) constr_names.append(name) chains_index += 1 sys.stderr.write("done\n") sys.stderr.flush() sys.stderr.write( " ".join( ("Optimization problem: capacity={}".format(capacity), "vars_num={}".format(vars_num), "negative_border_size={}".format(stats['negative_border']), "negative_border_items_num={}".format( len(negative_border_items)), "constr_num={}".format(constr_num), "chains_index={}\n".format(chains_index)))) sys.stderr.flush() # Go back one character to remove last comma "," cplex_script.seek(last_tell) cplex_script.write("]\n") cplex_script.write("my_rownames = {}\n".format(constr_names)) cplex_script.write("constr_num = {}\n".format(constr_num)) cplex_script.write("chain_constr_num = {}\n".format(chains_index)) cplex_script.write( " ".join( ("my_senses = [\"G\"] * constr_num +", "[\"L\"] + [\"L\"] * chain_constr_num\n"))) cplex_script.write( " ".join( ("my_rhs = [0.0] * constr_num + [capacity] +", "[1.0] * chain_constr_num\n"))) cplex_script.write("\n") cplex_script.write("try:\n") cplex_script.write(" prob = cplex.Cplex()\n") cplex_script.write(" prob.set_error_stream(sys.stderr)\n") cplex_script.write(" prob.set_log_stream(sys.stderr)\n") cplex_script.write(" prob.set_results_stream(sys.stderr)\n") cplex_script.write(" prob.set_warning_stream(sys.stderr)\n") # cplex_script.write(" prob.parameters.mip.strategy.file.set(2)\n") cplex_script.write( " prob.parameters.mip.tolerances.mipgap.set({})\n".format(gap)) cplex_script.write( " prob.parameters.timelimit.set({})\n".format(600)) # cplex_script.write(" # prob.parameters.mip.strategy.variableselect.set(3) # strong # branching\n") cplex_script.write( " prob.objective.set_sense(prob.objective.sense.maximize)\n") cplex_script.write( " ".join( (" prob.variables.add(obj = my_obj, ub = my_ub,", "types = my_types, names = my_colnames)\n"))) cplex_script.write( " ".join( (" prob.linear_constraints.add(lin_expr = rows,", "senses = my_senses, rhs = my_rhs, names = my_rownames)\n"))) cplex_script.write( " ".join( (" prob.MIP_starts.add(cplex.SparsePair(", "ind = [i for i in range(vars_num)],", "val = [1.0] * vars_num),", "prob.MIP_starts.effort_level.auto)\n"))) cplex_script.write(" prob.solve()\n") cplex_script.write( "".join( (" print (prob.solution.get_status(),", "prob.solution.status[prob.solution.get_status()],", "prob.solution.MIP.get_best_objective(),", "prob.solution.MIP.get_mip_relative_gap())\n"))) cplex_script.write("except CplexError, exc:\n") cplex_script.write(" print exc\n") # Run script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = \ "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"]) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format( err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode( locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[ -1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit( "Error evaluating the CPLEX solution line: {}\n".format( cplex_solution_line)) sys.stderr.write("cplex_solution={}\n".format(cplex_solution)) sys.stderr.flush() # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], cplex_solution[2])) # This is also an upper bound to the size of the true negative border optimal_sol_upp_bound = int( math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) # Compute non-empirical VC-dimension and first candidate to epsilon_2 stats['not_emp_vc_dim'] = int( math.floor(math.log2(optimal_sol_upp_bound))) + 1 if stats['not_emp_vc_dim'] > math.log2(len(negative_border)): sys.stderr.write( "Lowering non_empirical VC-dimension to maximum value\n") stats['not_emp_vc_dim'] = int( math.floor(math.log2(len(negative_border)))) not_emp_epsilon_2 = epsilon.get_eps_vc_dim( lower_delta, ds_stats['size'], stats['not_emp_vc_dim']) sys.stderr.write( " ".join( ("items_num-1={}".format(items_num - 1), "optimal_sol_upp_bound={}".format(optimal_sol_upp_bound), "not_emp_vc_dim={}".format(stats['not_emp_vc_dim']), "not_emp_e2={}\n".format(not_emp_epsilon_2)))) sys.stderr.flush() # Loop to compute empirical VC-dimension using lengths distribution items_num_str_len = len(str(len(negative_border_items) - 1)) longer_equal = 0 for i in range(len(lengths)): cand_len = lengths[i] if cand_len == items_num: continue longer_equal += lengths_dict[cand_len] # No need to include tests to check whether cand_len is lower than # 2*ds_stats['maxlen'] if use_additional_knowledge is True: it is # always true given that cand_len <= ds_stats['maxlen'] if cand_len >= len(negative_border_items): cand_len = len(negative_border_items) - 1 # Modify the script to use the new capacity. with open(tmpfile_name, 'r+t') as cplex_script: cplex_script.seek(0) cplex_script.write("capacity = {}\n".format( str(cand_len).ljust(items_num_str_len))) # Run the script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = \ "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"]) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format( err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode( locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[ -1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit( "Error evaluating the CPLEX solution line: {}\n".format( cplex_solution_line)) sys.stderr.write("{}\n".format(cplex_solution)) # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], # cplex_solution[2])) # if cplex_solution[0] == 102: optimal_sol_upp_bound_emp = int( math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) # else: # optimal_sol_upp_bound_emp = cplex_solution[0] stats['emp_vc_dim'] = int( math.floor(math.log2(optimal_sol_upp_bound_emp))) + 1 if stats['emp_vc_dim'] > math.log2(len(negative_border)): sys.stderr.write("Lowering VC-dimension to maximum value\n") stats['emp_vc_dim'] = int( math.floor(math.log2(len(negative_border)))) sys.stderr.write( " ".join( ("cand_len={}".format(cand_len), "longer_equal={}".format(longer_equal), "emp_vc_dim={}".format(stats['emp_vc_dim']), "optimal_sol_upp_bound_emp={}\n".format(optimal_sol_upp_bound_emp)))) sys.stderr.flush() # If stopping condition is satisfied, exit. if stats['emp_vc_dim'] <= longer_equal: break # sys.stderr.write("{} {} {}\n".format(vc_dim_cand, vc_dim_cand2, # vc_dim_cand3)) os.remove(tmpfile_name) # Compute the bound to the shatter coefficient, which we use to compute # epsilon bound = min((math.log(optimal_sol_upp_bound), stats['emp_vc_dim'] * math.log(math.e * ds_stats['size'] / stats['emp_vc_dim']))) sys.stderr.write("bound to shatter coeff: log_of_range_size={}, log_using_vc_dim={}\n".format( math.log(optimal_sol_upp_bound), stats['emp_vc_dim'] * math.log(math.e * ds_stats['size'] / stats['emp_vc_dim']))) sys.stderr.flush() # The following assert is to check that we are better than another bound to # the shatter coefficient which used the number of closed itemsets in the # base set and the size of the negative border of the base set. # Intuitively, the assert should not fail. =) assert(optimal_sol_upp_bound <= original_negative_border_len + closed_itemsets_len) # Compute second candidate to epsilon_2 emp_epsilon_2 = epsilon.get_eps_shattercoeff_bound(lower_delta, ds_stats['size'], bound, max_freq_base_set) sys.stderr.write( "cand_len={} opt_sol_upp_bound_emp={} emp_vc_dim={} bound={} max_freq_base_set={} emp_e2={}\n".format( cand_len, optimal_sol_upp_bound_emp, stats['emp_vc_dim'], bound, max_freq_base_set, emp_epsilon_2)) sys.stderr.flush() sys.stderr.write("not_emp_e2={}, emp_e2={}\n".format( not_emp_epsilon_2, emp_epsilon_2)) sys.stderr.flush() stats['epsilon_2'] = min(emp_epsilon_2, not_emp_epsilon_2) # Extract TFIs using epsilon_2 sys.stderr.write("Extracting TFIs using epsilon_2...") sys.stderr.flush() trueFIs = dict() for itemset in reversed(freq_itemsets_1_sorted): if freq_itemsets_1_dict[itemset] >= min_freq + stats['epsilon_2']: trueFIs[itemset] = freq_itemsets_1_dict[itemset] else: break sys.stderr.write("done ({} TFIS)\n".format(len(trueFIs))) sys.stderr.flush() return (trueFIs, stats)
def get_trueFIs(exp_res_filename, eval_res_filename, min_freq, delta, gap=0.0, first_epsilon=1.0, vcdim=-1): """ Compute the True Frequent Itemsets using the 'holdout-VC' method. TODO Add more details.""" stats = dict() with open(exp_res_filename) as FILE: size_line = FILE.readline() try: size_str = size_line.split("(")[1].split(")")[0] except IndexError: utils.error_exit(" ".join( ("Cannot compute size of the explore dataset:", "'{}' is not in the recognized format\n".format(size_line)))) try: stats['exp_size'] = int(size_str) except ValueError: utils.error_exit(" ".join( ("Cannot compute size of the explore dataset:", "{} is not a number\n".format(size_str)))) with open(eval_res_filename) as FILE: size_line = FILE.readline() try: size_str = size_line.split("(")[1].split(")")[0] except IndexError: utils.error_exit(" ".join( ("Cannot compute size of the eval dataset:", "'{}' is not in the recognized format\n".format(size_line)))) try: stats['eval_size'] = int(size_str) except ValueError: utils.error_exit(" ".join( ("Cannot compute size of the eval dataset:", "'{}' is not a number\n".format(size_str)))) stats['orig_size'] = stats['exp_size'] + stats['eval_size'] exp_res = utils.create_results(exp_res_filename, min_freq) stats['exp_res'] = len(exp_res) exp_res_set = set(exp_res.keys()) eval_res = utils.create_results(eval_res_filename, min_freq) stats['eval_res'] = len(eval_res) eval_res_set = set(eval_res.keys()) intersection = exp_res_set & eval_res_set stats['holdout_intersection'] = len(intersection) stats['holdout_false_negatives'] = len(exp_res_set - eval_res_set) stats['holdout_false_positives'] = len(eval_res_set - exp_res_set) stats['holdout_jaccard'] = len(intersection) / \ len(exp_res_set | eval_res_set) # One may want to play with giving different values for the different error # probabilities, but there isn't really much point in it. lower_delta = 1.0 - math.sqrt(1 - delta) stats['epsilon_1'] = first_epsilon sys.stderr.write("Computing candidates...") sys.stderr.flush() freq_bound = min_freq + stats['epsilon_1'] candidates = [] candidates_items = set() trueFIs = dict() for itemset in exp_res: if exp_res[itemset] < freq_bound: candidates.append(itemset) candidates_items |= itemset else: # Add itemsets with frequency at last freq_bound to the TFIs trueFIs[itemset] = exp_res[itemset] sys.stderr.write("done: {} candidates ({} items)\n".format( len(candidates), len(candidates_items))) sys.stderr.flush() if len(candidates ) > 0 and vcdim > -1 and len(candidates_items) - 1 > vcdim: sys.stderr.write("Using additional knowledge\n") candidates_items_sorted = sorted(candidates_items) candidates_items_in_sets_dict = dict() candidates_itemset_index = 0 itemset_indexes_dict = dict() for first_itemset_index in range(len(candidates)): first_itemset = candidates[first_itemset_index] for item in first_itemset: if item in candidates_items_in_sets_dict: candidates_items_in_sets_dict[item].append( candidates_itemset_index) else: candidates_items_in_sets_dict[item] = \ [candidates_itemset_index, ] itemset_indexes_dict[first_itemset] = candidates_itemset_index candidates_itemset_index += 1 # Compute an upper-bound to the VC-dimension of the set of candidates. constr_start_str = "cplex.SparsePair(ind = [" constr_end_str = "], val = vals)" vars_num = len(candidates) + len(candidates_items) constr_names = [] capacity = vcdim (tmpfile_handle, tmpfile_name) = tempfile.mkstemp(prefix="cplx", dir=os.environ['PWD'], text=True) os.close(tmpfile_handle) with open(tmpfile_name, 'wt') as cplex_script: cplex_script.write("capacity = {}\n".format(capacity)) cplex_script.write("import cplex, os, sys\n") cplex_script.write("from cplex.exceptions import CplexError\n") cplex_script.write("\n") cplex_script.write("\n") cplex_script.write(" ".join( ("os.environ[\"ILOG_LICENSE_FILE\"] =" "\"/local/projects/cplex/ilm/site.access.ilm\"\n"))) cplex_script.write("vals = [-1.0, 1.0]\n") cplex_script.write("sets_num = {}\n".format(len(candidates))) cplex_script.write("items_num = {}\n".format( len(candidates_items))) cplex_script.write("vars_num = {}\n".format(vars_num)) cplex_script.write("my_ub = [1.0] * vars_num\n") cplex_script.write( "my_types = \"\".join(\"I\" for i in range(vars_num))\n") cplex_script.write( "my_obj = ([1.0] * sets_num) + ([0.0] * items_num)\n") cplex_script.write(" ".join( ("my_colnames =" "[\"set{0}\".format(i) for i in range(sets_num)]", "+ [\"item{0}\".format(j) for j in range(items_num)]\n"))) cplex_script.write("rows = [ ") sys.stderr.write("Writing knapsack constraints...") sys.stderr.flush() constr_num = 0 for item_index in range(len(candidates_items)): try: for itemset_index in \ candidates_items_in_sets_dict[ candidates_items_sorted[item_index]]: constr_str = "".join( (constr_start_str, "\"set{}\",\"item{}\"".format( itemset_index, item_index), constr_end_str)) cplex_script.write("{},".format(constr_str)) constr_num += 1 name = "s{}i{}".format(item_index, itemset_index) constr_names.append(name) except KeyError: sys.stderr.write(" ".join( ("item_index={}".format(item_index), "candidates_items_sorted[item_index]={}\n".format( candidates_items_sorted[item_index])))) in_candidates = False candidates_itemset = frozenset() for itemset in candidates: if candidates_items_sorted[item_index] in itemset: in_candidates = True candidates_itemset = itemset break sys.stderr.write( "{} in negative_border: {}. Itemset: {}\n".format( candidates_items_sorted[item_index], in_candidates, candidates_itemset)) sys.exit(1) # Create capacity constraints and write it to script constr_str = "".join( (constr_start_str, ",".join("\"item{}\"".format(j) for j in range(len(candidates_items))), "], val=[", ",".join("1.0" for j in range(len(candidates_items))), "])")) cplex_script.write(constr_str) cplex_script.write("]\n") cap_constr_name = "capacity" constr_names.append(cap_constr_name) sys.stderr.write("done\n") sys.stderr.flush() sys.stderr.write(" ".join( ("Optimization problem: capacity={}".format(capacity), "vars_num={}".format(vars_num), "candidates={}".format(len(candidates)), "candidates_items_num={}".format(len(candidates_items)), "constr_num={}\n".format(constr_num)))) sys.stderr.flush() cplex_script.write("my_rownames = {}\n".format(constr_names)) cplex_script.write("constr_num = {}\n".format(constr_num)) cplex_script.write("my_senses = [\"G\"] * constr_num + [\"L\"]\n") cplex_script.write("my_rhs = [0.0] * constr_num + [capacity]\n") cplex_script.write("\n") cplex_script.write("try:\n") cplex_script.write(" prob = cplex.Cplex()\n") cplex_script.write(" prob.set_error_stream(sys.stderr)\n") cplex_script.write(" prob.set_log_stream(sys.stderr)\n") cplex_script.write(" prob.set_results_stream(sys.stderr)\n") cplex_script.write(" prob.set_warning_stream(sys.stderr)\n") # cplex_script.write(" # prob.parameters.mip.strategy.file.set(2)\n") cplex_script.write( " prob.parameters.mip.tolerances.mipgap.set({})\n".format( gap)) cplex_script.write( " prob.parameters.timelimit.set({})\n".format(600)) # cplex_script.write(" # prob.parameters.mip.strategy.variableselect.set(3) # strong # branching\n") cplex_script.write("".join((" prob.objective.set_sense(", "prob.objective.sense.maximize)\n"))) cplex_script.write(" ".join( (" prob.variables.add(obj = my_obj, ub = my_ub,", "types = my_types, names = my_colnames)\n"))) cplex_script.write(" ".join( (" prob.linear_constraints.add(lin_expr = rows,", "senses = my_senses, rhs = my_rhs,", "names = my_rownames)\n"))) cplex_script.write(" ".join( (" prob.MIP_starts.add(cplex.SparsePair(ind =", "[i for i in range(vars_num)], val = [1.0] * vars_num),", "prob.MIP_starts.effort_level.auto)\n"))) cplex_script.write(" prob.solve()\n") cplex_script.write(",".join( (" print (prob.solution.get_status()", "prob.solution.status[prob.solution.get_status()]", "prob.solution.MIP.get_best_objective()" "prob.solution.MIP.get_mip_relative_gap())\n"))) cplex_script.write("except CplexError, exc:\n") cplex_script.write(" print exc\n") # Run script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = \ "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"]) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format( err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode( locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[ -1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit( "Error evaluating the CPLEX solution line: {}\n".format( cplex_solution_line)) sys.stderr.write("cplex_solution={}\n".format(cplex_solution)) sys.stderr.flush() # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], # cplex_solution[2])) optimal_sol_upp_bound = int( math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) stats['vcdim'] = int(math.floor(math.log2(optimal_sol_upp_bound))) + 1 if stats['vcdim'] > math.log2(len(candidates)): sys.stderr.write("Lowering VC-dimension to maximum value\n") sys.stderr.flush() stats['vcdim'] = int(math.floor(math.log2(len(candidates)))) stats['epsilon_2_vc'] = epsilon.get_eps_vc_dim(lower_delta, stats['orig_size'], stats['vcdim']) elif len(candidates ) > 0 and vcdim > -1 and len(candidates_items) - 1 <= vcdim: sys.stderr.write("Additional knowledge is useless\n") sys.stderr.flush() stats['vcdim'] = int(math.floor(math.log2(len(candidates)))) stats['epsilon_2_vc'] = epsilon.get_eps_vc_dim(lower_delta, stats['orig_size'], stats['vcdim']) elif len(candidates) > 0 and vcdim == -1: sys.stderr.write("Not using additional knowledge\n") sys.stderr.flush() stats['vcdim'] = int(math.floor(math.log2(len(candidates)))) stats['epsilon_2_vc'] = epsilon.get_eps_vc_dim(lower_delta, stats['orig_size'], stats['vcdim']) else: sys.stderr.write("There are no candidates\n") sys.stderr.flush() stats['vcdim'] = 0 stats['epsilon_2_vc'] = 0 # Loop to compute empirical VC-dimension using lengths distribution capacity_str_len = len(str(capacity)) longer_equal = 0 lengths_dict = ds_stats['lengths'] lengths = sorted(lengths_dict.keys(), reverse=True) start_len_idx = 0 while start_len_idx < len(lengths): if lengths[start_len_idx] > len(candidates_items) - 1: longer_equal += lengths_dict[start_len_idx] start_len_idx += 1 else: break for i in range(start_len_idx, len(lengths)): cand_len = lengths[i] longer_equal += lengths_dict[cand_len] # Modify the script to use the new capacity. with open(tmpfile_name, 'r+t') as cplex_script: cplex_script.seek(0) cplex_script.write("capacity = {}\n".format( str(cand_len).ljust(capacity_str_len))) # Run the script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = \ "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"]) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format( err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode( locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[ -1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit( "Error evaluating the CPLEX solution line: {}\n".format( cplex_solution_line)) sys.stderr.write("{}\n".format(cplex_solution)) # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], # cplex_solution[2])) # if cplex_solution[0] == 102: optimal_sol_upp_bound_emp = int( math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) # else: # optimal_sol_upp_bound_emp = cplex_solution[0] stats['emp_vc_dim'] = int( math.floor(math.log2(optimal_sol_upp_bound_emp))) + 1 if stats['emp_vc_dim'] > math.log2(len(negative_border)): sys.stderr.write("Lowering VC-dimension to maximum value\n") stats['emp_vc_dim'] = int( math.floor(math.log2(len(negative_border)))) sys.stderr.write(" ".join( ("cand_len={}".format(cand_len), "longer_equal={}".format(longer_equal), "emp_vc_dim={}".format(stats['emp_vc_dim']), "optimal_sol_upp_bound_emp={}\n".format(optimal_sol_upp_bound_emp) ))) sys.stderr.flush() # If stopping condition is satisfied, exit. if stats['emp_vc_dim'] <= longer_equal: break os.remove(tmpfile_name) # Compute the bound to the shatter coefficient, which we use to compute # epsilon bound = min((math.log(len(candidates)), stats['emp_vc_dim'] * math.log(math.e * stats['eval_size'] / stats['emp_vc_dim']))) # Compute second candidate to epsilon_2 emp_epsilon_2 = epsilon.get_eps_shattercoeff_bound(lower_delta, stats['eval_size'], bound, max_freq_base_set) sys.stderr.write( "cand_len={} opt_sol_upp_bound_emp={} emp_vc_dim={} bound={} max_freq_base_set={} emp_e2={}\n" .format(cand_len, optimal_sol_upp_bound_emp, stats['emp_vc_dim'], bound, max_freq_base_set, emp_epsilon_2)) sys.stderr.flush() sys.stderr.write("not_emp_e2={}, emp_e2={}\n".format( stats['epsilon_2_vc'], emp_epsilon_2)) sys.stderr.flush() stats['epsilon_2'] = min(emp_epsilon_2, stats['epsilon_2_vc']) if len(candidates) > 0: sys.stderr.write("Computing the candidates that are TFIs...") sys.stderr.flush() freq_bound = min_freq + stats['epsilon_2'] eval_res_itemsets = frozenset(eval_res.keys()) for itemset in sorted(frozenset(candidates) & eval_res_itemsets, key=lambda x: eval_res[x], reverse=True): if eval_res[itemset] >= freq_bound: trueFIs[itemset] = eval_res[itemset] sys.stderr.write("done\n") sys.stderr.flush() return (trueFIs, stats)
def get_trueFIs(exp_res_filename, eval_res_filename, min_freq, delta, gap=0.0, first_epsilon=1.0, vcdim=-1): """ Compute the True Frequent Itemsets using the 'holdout-VC' method. TODO Add more details.""" stats = dict() with open(exp_res_filename) as FILE: size_line = FILE.readline() try: size_str = size_line.split("(")[1].split(")")[0] except IndexError: utils.error_exit( " ".join( ( "Cannot compute size of the explore dataset:", "'{}' is not in the recognized format\n".format(size_line), ) ) ) try: stats["exp_size"] = int(size_str) except ValueError: utils.error_exit( " ".join(("Cannot compute size of the explore dataset:", "{} is not a number\n".format(size_str))) ) with open(eval_res_filename) as FILE: size_line = FILE.readline() try: size_str = size_line.split("(")[1].split(")")[0] except IndexError: utils.error_exit( " ".join( ( "Cannot compute size of the eval dataset:", "'{}' is not in the recognized format\n".format(size_line), ) ) ) try: stats["eval_size"] = int(size_str) except ValueError: utils.error_exit( " ".join(("Cannot compute size of the eval dataset:", "'{}' is not a number\n".format(size_str))) ) stats["orig_size"] = stats["exp_size"] + stats["eval_size"] exp_res = utils.create_results(exp_res_filename, min_freq) stats["exp_res"] = len(exp_res) exp_res_set = set(exp_res.keys()) eval_res = utils.create_results(eval_res_filename, min_freq) stats["eval_res"] = len(eval_res) eval_res_set = set(eval_res.keys()) intersection = exp_res_set & eval_res_set stats["holdout_intersection"] = len(intersection) stats["holdout_false_negatives"] = len(exp_res_set - eval_res_set) stats["holdout_false_positives"] = len(eval_res_set - exp_res_set) stats["holdout_jaccard"] = len(intersection) / len(exp_res_set | eval_res_set) # One may want to play with giving different values for the different error # probabilities, but there isn't really much point in it. lower_delta = 1.0 - math.sqrt(1 - delta) stats["epsilon_1"] = first_epsilon sys.stderr.write("Computing candidates...") sys.stderr.flush() freq_bound = min_freq + stats["epsilon_1"] candidates = [] candidates_items = set() trueFIs = dict() for itemset in exp_res: if exp_res[itemset] < freq_bound: candidates.append(itemset) candidates_items |= itemset else: # Add itemsets with frequency at last freq_bound to the TFIs trueFIs[itemset] = exp_res[itemset] sys.stderr.write("done: {} candidates ({} items)\n".format(len(candidates), len(candidates_items))) sys.stderr.flush() if len(candidates) > 0 and vcdim > -1 and len(candidates_items) - 1 > vcdim: sys.stderr.write("Using additional knowledge\n") candidates_items_sorted = sorted(candidates_items) candidates_items_in_sets_dict = dict() candidates_itemset_index = 0 itemset_indexes_dict = dict() for first_itemset_index in range(len(candidates)): first_itemset = candidates[first_itemset_index] for item in first_itemset: if item in candidates_items_in_sets_dict: candidates_items_in_sets_dict[item].append(candidates_itemset_index) else: candidates_items_in_sets_dict[item] = [candidates_itemset_index] itemset_indexes_dict[first_itemset] = candidates_itemset_index candidates_itemset_index += 1 # Compute an upper-bound to the VC-dimension of the set of candidates. constr_start_str = "cplex.SparsePair(ind = [" constr_end_str = "], val = vals)" vars_num = len(candidates) + len(candidates_items) constr_names = [] capacity = vcdim (tmpfile_handle, tmpfile_name) = tempfile.mkstemp(prefix="cplx", dir=os.environ["PWD"], text=True) os.close(tmpfile_handle) with open(tmpfile_name, "wt") as cplex_script: cplex_script.write("capacity = {}\n".format(capacity)) cplex_script.write("import cplex, os, sys\n") cplex_script.write("from cplex.exceptions import CplexError\n") cplex_script.write("\n") cplex_script.write("\n") cplex_script.write( " ".join(('os.environ["ILOG_LICENSE_FILE"] =' '"/local/projects/cplex/ilm/site.access.ilm"\n')) ) cplex_script.write("vals = [-1.0, 1.0]\n") cplex_script.write("sets_num = {}\n".format(len(candidates))) cplex_script.write("items_num = {}\n".format(len(candidates_items))) cplex_script.write("vars_num = {}\n".format(vars_num)) cplex_script.write("my_ub = [1.0] * vars_num\n") cplex_script.write('my_types = "".join("I" for i in range(vars_num))\n') cplex_script.write("my_obj = ([1.0] * sets_num) + ([0.0] * items_num)\n") cplex_script.write( " ".join( ( "my_colnames =" '["set{0}".format(i) for i in range(sets_num)]', '+ ["item{0}".format(j) for j in range(items_num)]\n', ) ) ) cplex_script.write("rows = [ ") sys.stderr.write("Writing knapsack constraints...") sys.stderr.flush() constr_num = 0 for item_index in range(len(candidates_items)): try: for itemset_index in candidates_items_in_sets_dict[candidates_items_sorted[item_index]]: constr_str = "".join( (constr_start_str, '"set{}","item{}"'.format(itemset_index, item_index), constr_end_str) ) cplex_script.write("{},".format(constr_str)) constr_num += 1 name = "s{}i{}".format(item_index, itemset_index) constr_names.append(name) except KeyError: sys.stderr.write( " ".join( ( "item_index={}".format(item_index), "candidates_items_sorted[item_index]={}\n".format(candidates_items_sorted[item_index]), ) ) ) in_candidates = False candidates_itemset = frozenset() for itemset in candidates: if candidates_items_sorted[item_index] in itemset: in_candidates = True candidates_itemset = itemset break sys.stderr.write( "{} in negative_border: {}. Itemset: {}\n".format( candidates_items_sorted[item_index], in_candidates, candidates_itemset ) ) sys.exit(1) # Create capacity constraints and write it to script constr_str = "".join( ( constr_start_str, ",".join('"item{}"'.format(j) for j in range(len(candidates_items))), "], val=[", ",".join("1.0" for j in range(len(candidates_items))), "])", ) ) cplex_script.write(constr_str) cplex_script.write("]\n") cap_constr_name = "capacity" constr_names.append(cap_constr_name) sys.stderr.write("done\n") sys.stderr.flush() sys.stderr.write( " ".join( ( "Optimization problem: capacity={}".format(capacity), "vars_num={}".format(vars_num), "candidates={}".format(len(candidates)), "candidates_items_num={}".format(len(candidates_items)), "constr_num={}\n".format(constr_num), ) ) ) sys.stderr.flush() cplex_script.write("my_rownames = {}\n".format(constr_names)) cplex_script.write("constr_num = {}\n".format(constr_num)) cplex_script.write('my_senses = ["G"] * constr_num + ["L"]\n') cplex_script.write("my_rhs = [0.0] * constr_num + [capacity]\n") cplex_script.write("\n") cplex_script.write("try:\n") cplex_script.write(" prob = cplex.Cplex()\n") cplex_script.write(" prob.set_error_stream(sys.stderr)\n") cplex_script.write(" prob.set_log_stream(sys.stderr)\n") cplex_script.write(" prob.set_results_stream(sys.stderr)\n") cplex_script.write(" prob.set_warning_stream(sys.stderr)\n") # cplex_script.write(" # prob.parameters.mip.strategy.file.set(2)\n") cplex_script.write(" prob.parameters.mip.tolerances.mipgap.set({})\n".format(gap)) cplex_script.write(" prob.parameters.timelimit.set({})\n".format(600)) # cplex_script.write(" # prob.parameters.mip.strategy.variableselect.set(3) # strong # branching\n") cplex_script.write("".join((" prob.objective.set_sense(", "prob.objective.sense.maximize)\n"))) cplex_script.write( " ".join( (" prob.variables.add(obj = my_obj, ub = my_ub,", "types = my_types, names = my_colnames)\n") ) ) cplex_script.write( " ".join( ( " prob.linear_constraints.add(lin_expr = rows,", "senses = my_senses, rhs = my_rhs,", "names = my_rownames)\n", ) ) ) cplex_script.write( " ".join( ( " prob.MIP_starts.add(cplex.SparsePair(ind =", "[i for i in range(vars_num)], val = [1.0] * vars_num),", "prob.MIP_starts.effort_level.auto)\n", ) ) ) cplex_script.write(" prob.solve()\n") cplex_script.write( ",".join( ( " print (prob.solution.get_status()", "prob.solution.status[prob.solution.get_status()]", "prob.solution.MIP.get_best_objective()" "prob.solution.MIP.get_mip_relative_gap())\n", ) ) ) cplex_script.write("except CplexError, exc:\n") cplex_script.write(" print exc\n") # Run script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"] ) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format(err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode(locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[-1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit("Error evaluating the CPLEX solution line: {}\n".format(cplex_solution_line)) sys.stderr.write("cplex_solution={}\n".format(cplex_solution)) sys.stderr.flush() # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], # cplex_solution[2])) optimal_sol_upp_bound = int(math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) stats["vcdim"] = int(math.floor(math.log2(optimal_sol_upp_bound))) + 1 if stats["vcdim"] > math.log2(len(candidates)): sys.stderr.write("Lowering VC-dimension to maximum value\n") sys.stderr.flush() stats["vcdim"] = int(math.floor(math.log2(len(candidates)))) stats["epsilon_2_vc"] = epsilon.get_eps_vc_dim(lower_delta, stats["orig_size"], stats["vcdim"]) elif len(candidates) > 0 and vcdim > -1 and len(candidates_items) - 1 <= vcdim: sys.stderr.write("Additional knowledge is useless\n") sys.stderr.flush() stats["vcdim"] = int(math.floor(math.log2(len(candidates)))) stats["epsilon_2_vc"] = epsilon.get_eps_vc_dim(lower_delta, stats["orig_size"], stats["vcdim"]) elif len(candidates) > 0 and vcdim == -1: sys.stderr.write("Not using additional knowledge\n") sys.stderr.flush() stats["vcdim"] = int(math.floor(math.log2(len(candidates)))) stats["epsilon_2_vc"] = epsilon.get_eps_vc_dim(lower_delta, stats["orig_size"], stats["vcdim"]) else: sys.stderr.write("There are no candidates\n") sys.stderr.flush() stats["vcdim"] = 0 stats["epsilon_2_vc"] = 0 # Loop to compute empirical VC-dimension using lengths distribution capacity_str_len = len(str(capacity)) longer_equal = 0 lengths_dict = ds_stats["lengths"] lengths = sorted(lengths_dict.keys(), reverse=True) start_len_idx = 0 while start_len_idx < len(lengths): if lengths[start_len_idx] > len(candidates_items) - 1: longer_equal += lengths_dict[start_len_idx] start_len_idx += 1 else: break for i in range(start_len_idx, len(lengths)): cand_len = lengths[i] longer_equal += lengths_dict[cand_len] # Modify the script to use the new capacity. with open(tmpfile_name, "r+t") as cplex_script: cplex_script.seek(0) cplex_script.write("capacity = {}\n".format(str(cand_len).ljust(capacity_str_len))) # Run the script, solve optimization problem, extract solution my_environ = os.environ if "ILOG_LICENSE_FILE" not in my_environ: my_environ["ILOG_LICENSE_FILE"] = "/local/projects/cplex/ilm/site.access.ilm" try: cplex_output_binary_str = subprocess.check_output( ["python2.6", tmpfile_name], env=my_environ, cwd=os.environ["PWD"] ) except subprocess.CalledProcessError as err: os.remove(tmpfile_name) utils.error_exit("CPLEX exited with error code {}: {}\n".format(err.returncode, err.output)) # finally: # os.remove(tmpfile_name) cplex_output = cplex_output_binary_str.decode(locale.getpreferredencoding()) cplex_output_lines = cplex_output.split("\n") cplex_solution_line = cplex_output_lines[-1 if len(cplex_output_lines[-1]) > 0 else -2] try: cplex_solution = eval(cplex_solution_line) except Exception: utils.error_exit("Error evaluating the CPLEX solution line: {}\n".format(cplex_solution_line)) sys.stderr.write("{}\n".format(cplex_solution)) # if cplex_solution[0] not in (1, 101, 102): # utils.error_exit("CPLEX didn't find the optimal solution: {} {} # {}\n".format(cplex_solution[0], cplex_solution[1], # cplex_solution[2])) # if cplex_solution[0] == 102: optimal_sol_upp_bound_emp = int(math.floor(cplex_solution[2] * (1 + cplex_solution[3]))) # else: # optimal_sol_upp_bound_emp = cplex_solution[0] stats["emp_vc_dim"] = int(math.floor(math.log2(optimal_sol_upp_bound_emp))) + 1 if stats["emp_vc_dim"] > math.log2(len(negative_border)): sys.stderr.write("Lowering VC-dimension to maximum value\n") stats["emp_vc_dim"] = int(math.floor(math.log2(len(negative_border)))) sys.stderr.write( " ".join( ( "cand_len={}".format(cand_len), "longer_equal={}".format(longer_equal), "emp_vc_dim={}".format(stats["emp_vc_dim"]), "optimal_sol_upp_bound_emp={}\n".format(optimal_sol_upp_bound_emp), ) ) ) sys.stderr.flush() # If stopping condition is satisfied, exit. if stats["emp_vc_dim"] <= longer_equal: break os.remove(tmpfile_name) # Compute the bound to the shatter coefficient, which we use to compute # epsilon bound = min( (math.log(len(candidates)), stats["emp_vc_dim"] * math.log(math.e * stats["eval_size"] / stats["emp_vc_dim"])) ) # Compute second candidate to epsilon_2 emp_epsilon_2 = epsilon.get_eps_shattercoeff_bound(lower_delta, stats["eval_size"], bound, max_freq_base_set) sys.stderr.write( "cand_len={} opt_sol_upp_bound_emp={} emp_vc_dim={} bound={} max_freq_base_set={} emp_e2={}\n".format( cand_len, optimal_sol_upp_bound_emp, stats["emp_vc_dim"], bound, max_freq_base_set, emp_epsilon_2 ) ) sys.stderr.flush() sys.stderr.write("not_emp_e2={}, emp_e2={}\n".format(stats["epsilon_2_vc"], emp_epsilon_2)) sys.stderr.flush() stats["epsilon_2"] = min(emp_epsilon_2, stats["epsilon_2_vc"]) if len(candidates) > 0: sys.stderr.write("Computing the candidates that are TFIs...") sys.stderr.flush() freq_bound = min_freq + stats["epsilon_2"] eval_res_itemsets = frozenset(eval_res.keys()) for itemset in sorted(frozenset(candidates) & eval_res_itemsets, key=lambda x: eval_res[x], reverse=True): if eval_res[itemset] >= freq_bound: trueFIs[itemset] = eval_res[itemset] sys.stderr.write("done\n") sys.stderr.flush() return (trueFIs, stats)