def returnRuleWithMinSupport(binary_data_path, label, ruleList, minSupport, minConfidence, maxLength, file_path, length):
    """Filter candidate rules by support and log the confident ones to a file.

    For every rule, the items' indicator tables (``binary_data[item] == 1``)
    are AND-ed together. The number of ``True`` cells is the rule's support
    count ``sup``; the sum of ``label`` over those same cells is ``ret``.

    Args:
        binary_data_path: Passed to ``load_daily_data_simple`` to load the
            indicator data, which is then indexed by rule item.
        label: Label table; must support ``.copy()`` and boolean-mask
            assignment with the combined rule mask.
        ruleList: Iterable of rules, each an indexable, non-empty sequence of
            keys into the loaded data.
        minSupport: Minimum ``sup / maxLength`` for a rule to be written out.
        minConfidence: Minimum ``ret / sup`` for a rule to be written out.
        maxLength: Denominator used to turn ``sup`` into a support ratio.
        file_path: File that qualifying rules are appended to (mode ``'a+'``).
        length: Current rule length; when it equals 5 the threshold for
            *returning* a rule is raised by 0.005.

    Returns:
        The rules whose support ratio reaches the (possibly raised) return
        threshold. If an exception occurs while processing, the traceback is
        printed and the rules collected up to that point are returned.
    """
    binary_data = load_daily_data_simple(binary_data_path)

    # Rules of length 5 must clear a slightly higher bar to be returned.
    minSupport_return = minSupport + 0.005 if length == 5 else minSupport

    # Bound before the ``try`` so the final ``return`` is always valid, even
    # when opening the output file fails.
    returnList = []
    try:
        # ``with`` guarantees the handle is closed if a rule raises mid-loop.
        with open(file_path, 'a+') as f:
            for rule in ruleList:
                # Cells where every item of the rule is active.
                df = (binary_data[rule[0]] == 1)
                for item in rule[1:]:
                    df = df & (binary_data[item] == 1)

                # Two-level sum collapses the mask to one scalar count.
                sup = np.sum(np.sum(df))
                support = sup / maxLength
                if support >= minSupport_return:
                    returnList.append(rule)

                # NOTE(review): the confidence check below is applied to every
                # rule, independently of the stricter return threshold above;
                # confirm this matches the intended nesting.
                label_tem = label.copy()
                label_tem[~df] = np.nan  # keep labels only where the rule fires
                ret = np.sum(np.sum(label_tem))
                confidence = ret / sup
                if confidence >= minConfidence and support >= minSupport:
                    f.write('%s with sup %.6lf and conf %.6lf\n' % (str(rule), support, confidence))
                    f.flush()
    except Exception:
        # Best effort: report the failure and return what was collected.
        print(traceback.format_exc())

    del binary_data
    return returnList
def adjust_key(args, result_data, key, key_rule, label, accuracy=0.002, epochs=10):
    """Try to shift the boundaries of one bin to raise the rule confidence.

    ``key`` is a feature name followed by a three-digit bin index. The bin's
    original boundaries come from the JSON file at ``args.splitPointPath``.
    The raw feature values are loaded from ``args.dataPath``, flattened,
    filtered to finite values and sorted. In each epoch a boundary is moved
    ``int(len(values) * accuracy)`` positions along that sorted list and the
    candidate interval is scored with ``get_interval_conf``.

    A candidate is accepted when its confidence beats ``conf_baseline`` (for
    interior bins the choice is delegated to ``compare_conf``). An accepted
    candidate's data is written into ``result_data[key]``, so ``result_data``
    is mutated in place.

    Args:
        args: Object exposing ``splitPointPath`` and ``dataPath``.
        result_data: Mapping of key -> data; updated in place on acceptance.
        key: Feature name plus a 3-character numeric bin suffix.
        key_rule: The rule being tuned; if empty, nothing is adjusted.
        label: Label data forwarded to the confidence helpers.
        accuracy: Step size as a fraction of the number of finite values.
        epochs: Maximum number of adjustment rounds.

    Returns:
        ``[[orig_left, orig_right], [adj_left, adj_right]]``. Open-ended bins
        use ``-np.inf`` / ``np.inf`` for the missing side.
    """
    # NOTE(review): the baseline is computed once and never refreshed after a
    # move is accepted, so every epoch compares against the initial value.
    conf_baseline = get_conf(result_data, key_rule, label)
    with open(args.splitPointPath, 'r') as f:
        split_point_dict = json.load(fp=f)

    key_name = key[:-3]
    key_num = int(key[-3:])
    data = load_daily_data_simple(args.dataPath, ['%s' % key_name])
    data = data[key_name]

    # Flatten to 1-D, drop NaN/inf, and sort so boundaries map to positions.
    array = data.values.reshape(len(data) * len(data.columns))
    array = array[np.isfinite(array)]
    array = sorted(array)
    accuracy_length = int(len(array) * accuracy)

    # NOTE(review): the index arithmetic below is unguarded.
    # ``idx + accuracy_length`` can run past the end (IndexError) and
    # ``idx - accuracy_length`` can go negative, which silently wraps to the
    # tail of the list. Left unchanged; confirm the intended clamping.

    if key_num == 0:
        # First bin: only the right boundary can move.
        left_point = -np.inf
        right_point = split_point_dict[key_name][key_num]
        if len(key_rule) == 0:
            return [[-np.inf, right_point], [-np.inf, right_point]]
        for _ in range(epochs):
            try:
                right_idx = array.index(right_point)
            except ValueError:
                # Boundary value not present in the data: fall back to a
                # position derived from the bin number (presumably 10%-wide
                # bins, TODO confirm) and give up if it is not close.
                right_idx = int(len(array) * (key_num + 1) * 0.1)
                if int(array[right_idx]) != int(right_point):
                    return [[-np.inf, right_point], [-np.inf, right_point]]
            # right -> right (widen)
            right_point = array[right_idx + accuracy_length]
            conf_right, data_tem_right_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            if conf_right > conf_baseline:
                # right_point already holds the accepted candidate.
                result_data[key] = data_tem_right_1
                continue
            # right -> left (narrow)
            right_point = array[right_idx - accuracy_length]
            conf_right, data_tem_right_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            if conf_right > conf_baseline:
                result_data[key] = data_tem_right_2
            else:
                # Neither direction helped: restore the boundary and stop.
                right_point = array[right_idx]
                break
        return [[-np.inf, split_point_dict[key_name][key_num]],
                [-np.inf, right_point]]

    elif key_num == len(split_point_dict[key_name]):
        # Last bin: only the left boundary can move.
        left_point = split_point_dict[key_name][key_num - 1]
        right_point = np.inf
        if len(key_rule) == 0:
            return [[left_point, np.inf], [left_point, np.inf]]
        for _ in range(epochs):
            try:
                left_idx = array.index(left_point)
            except ValueError:
                left_idx = int(len(array) * (key_num) * 0.1)
                if int(array[left_idx]) != int(left_point):
                    return [[left_point, np.inf], [left_point, np.inf]]
            # left -> left (widen)
            left_point = array[left_idx - accuracy_length]
            conf_left, data_tem_left_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            if conf_left > conf_baseline:
                # left_point already holds the accepted candidate.
                result_data[key] = data_tem_left_1
                continue
            # left -> right (narrow)
            left_point = array[left_idx + accuracy_length]
            conf_left, data_tem_left_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            if conf_left > conf_baseline:
                result_data[key] = data_tem_left_2
            else:
                left_point = array[left_idx]
                break
        return [[split_point_dict[key_name][key_num - 1], np.inf],
                [left_point, np.inf]]

    else:
        # Interior bin: both boundaries can move.
        left_point = split_point_dict[key_name][key_num - 1]
        right_point = split_point_dict[key_name][key_num]
        if len(key_rule) == 0:
            return [[left_point, right_point], [left_point, right_point]]
        for _ in range(epochs):
            try:
                left_idx = array.index(left_point)
                right_idx = array.index(right_point)
            except ValueError:
                # If either lookup fails, both positions are re-derived.
                left_idx = int(len(array) * (key_num) * 0.1)
                right_idx = int(len(array) * (key_num + 1) * 0.1)
                if not (int(array[left_idx]) == int(left_point)
                        and int(array[right_idx]) == int(right_point)):
                    return [[left_point, right_point], [left_point, right_point]]

            # Phase 1: try widening on either side.
            # left -> left
            left_point = array[left_idx - accuracy_length]
            right_point = array[right_idx]
            conf_left, data_tem_left_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            # right -> right
            left_point = array[left_idx]
            right_point = array[right_idx + accuracy_length]
            conf_right, data_tem_right_1 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            # compare_conf is defined elsewhere; as used here, 1 selects the
            # left candidate and 2 selects the right candidate.
            flag = compare_conf(conf_left, conf_right, conf_baseline)
            if flag == 1:
                result_data[key] = data_tem_left_1
                left_point = array[left_idx - accuracy_length]
                right_point = array[right_idx]
                continue
            elif flag == 2:
                result_data[key] = data_tem_right_1
                left_point = array[left_idx]
                right_point = array[right_idx + accuracy_length]
                continue

            # Phase 2: widening did not help, try narrowing on either side.
            # left -> right
            left_point = array[left_idx + accuracy_length]
            right_point = array[right_idx]
            conf_left, data_tem_left_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            # right -> left
            left_point = array[left_idx]
            right_point = array[right_idx - accuracy_length]
            conf_right, data_tem_right_2 = get_interval_conf(
                result_data, data, key, key_rule, label, left_point, right_point)
            flag = compare_conf(conf_left, conf_right, conf_baseline)
            if flag == 0:
                # No candidate accepted: restore both boundaries and stop.
                left_point = array[left_idx]
                right_point = array[right_idx]
                break
            elif flag == 1:
                result_data[key] = data_tem_left_2
                left_point = array[left_idx + accuracy_length]
                right_point = array[right_idx]
            elif flag == 2:
                result_data[key] = data_tem_right_2
                left_point = array[left_idx]
                right_point = array[right_idx - accuracy_length]
        return [[
            split_point_dict[key_name][key_num - 1],
            split_point_dict[key_name][key_num]
        ], [left_point, right_point]]
adjust_interval(args, binary_data, label, ruleList, elogger) for key in binary_data.keys(): binary_data[key].to_csv('./binary_data_adjusted/%s.csv' % str(key)) binary_data_path = './binary_data_adjusted' length += 1 elogger.log('Time spent in filtering ruleList: %d seconds' % (end - start).seconds) candidateList = get_candidate_List(ruleList, elogger) elogger.log("The number of candidateList filtered is %d" % len(candidateList)) if __name__ == '__main__': data = load_daily_data_simple(args.dataPath) label = load_daily_data_simple(args.labelPath, ['label']) label = label['label'] label = label.loc[label.index <= '20171231'] minSup = args.sup minConf = args.conf elogger = Logger(args.logger) elogger.log(str(os.getpid())) elogger.log(str(args._get_kwargs())) with open('./EquiDepth_Label000_pct5.json', 'r') as f: split_point_dict = json.load(fp=f) binary_data = GetBinaryBySplitPoint(data, split_point_dict) attr_set = get_attr()
result = {} split_point_list = {} for key in data.keys(): print("Processing feature %s." % key) if key == 'label': continue key_type = get_type(data[key]) print("Processing feature %s. It's type %d" % (key, key_type)) if key_type == 0: result[key] = data[key] elif key_type == 1: result_tem, split_point_tem = categorical_to_binary(data[key], key) result.update(result_tem) split_point_list[key] = split_point_tem elif key_type == 2: result_tem, split_point_tem = quantitative_to_binary(data[key], data['label'], key) result.update(result_tem) split_point_list[key] = split_point_tem with open(config['split_point_save_path'], 'w') as f: json.dump(split_point_list, f) return result if __name__ == '__main__': data = load_daily_data_simple(config['path_data']) label = data['label'].loc[data['label'].index <= '20171231'] for key in data.keys(): data[key] = data[key].reindex(label.index) data['label'] = label result = get_binary(data) print(len(result.keys()))