def make_pattern(self, item_list, selected_item): """ make criticize pattern :param item_list: :param selected_item: :return: """ # attributes = self.get_attributes() attributes = self.__rules.keys() pt_keys = [] length = 0 pt_judged = {} pt_counter = Counter() def judge_pattern(t_value, b_value): _ptn = 0 if t_value and b_value: if t_value > b_value: _ptn = 1 elif t_value < b_value: _ptn = -1 return _ptn for a in attributes: name = a selected_value = vector_utils.to_vector(name, [selected_item])[0] # don't use None or empty value to create criticize pattern if selected_value is None: continue attribute_values = vector_utils.to_vector(name, item_list) pt_keys.append(name) if length == 0: length = len(item_list) pt_judged.update({name: [judge_pattern(a_v, selected_value) for a_v in attribute_values]}) # make pattern for p_index in range(0, length): # single pattern & multiple(combination of two attribute) pattern for cnt in [1, 2]: for combi in itertools.combinations(pt_keys, cnt): ptn = ",".join(combi) p_ptn = "".join(["X" if pt_judged[a_k][p_index] == 1 else "" for a_k in combi]) n_ptn = "".join(["X" if pt_judged[a_k][p_index] == -1 else "" for a_k in combi]) if len(p_ptn) == cnt: pt_counter["+:" + ptn] += 1 if len(n_ptn) == cnt: pt_counter["-:" + ptn] += 1 # order by support rate, and define pattern by at least two count patterns = filter(lambda p: p[1] > 1, pt_counter.items()) patterns = [self.pattern_type(item[0], item[1] / length) for item in patterns] patterns = sorted(patterns, key=lambda p: p.score) return patterns
def __assert_vector_integrity(self, obj_list, target_attribute, is_print=True):
    """
    Assert that vectorizing ``target_attribute`` over ``obj_list`` yields
    exactly one value per object, and that each vectorized value equals
    the converted raw attribute of the corresponding object.

    :param obj_list: objects whose attribute is vectorized
    :param target_attribute: attribute name passed to the vectorizer
    :param is_print: when True, print each vectorized value for debugging
    """
    attributes = vector_manager.to_vector(target_attribute, obj_list)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual instead.
    self.assertEqual(len(obj_list), len(attributes))
    for index, value in enumerate(attributes):
        if is_print:
            print(value)
        self.assertEqual(vector_manager.to_value(getattr(obj_list[index], target_attribute)), value)
def calc_near_is_better(cls, attr_name, item_list, selected_item):
    """
    Score items so that values closer to the selected item's attribute
    value score higher (1 means identical, smaller means further away).

    :param attr_name: attribute to evaluate
    :param item_list: items to score
    :param selected_item: reference item supplying the target value
    :return: list of closeness scores; None entries are preserved
    :raises NotCalculatable: if the selected item lacks the attribute value
    """
    # Guard clause: without a reference value there is nothing to compare to.
    if not (selected_item and getattr(selected_item, attr_name)):
        raise NotCalculatable("selected item's " + attr_name + " is None")
    target = getattr(selected_item, attr_name)
    raw_values = vector_utils.to_vector(attr_name, item_list)
    scaled = cls.__normalize(raw_values, prop=target)
    # Normalized distance from the target becomes a closeness score.
    return [None if s is None else 1 - abs(s) for s in scaled]
def test_calc_more_is_better(self):
    """Verify calc_more_is_better matches a hand-computed min-max scaling."""
    data = self.__create_test_data()
    raw = vector_manager.to_vector("reviews", data)
    present = list(filter(None, raw))
    lo, hi = min(present), max(present)
    span = hi - lo
    expected = [(v - lo) / span for v in raw if v]
    actual = ItemEvaluator.calc_more_is_better("reviews", data)
    tolerance = 1e-5
    for i, e in enumerate(expected):
        self.assertLess(abs(e - actual[i]), tolerance)
        print(e)
def test_calc_near_is_better(self):
    """Verify calc_near_is_better matches hand-computed closeness scores."""
    data = self.__create_test_data()
    raw = vector_manager.to_vector("bpm", data)
    present = list(filter(None, raw))
    lo, hi = min(present), max(present)
    selected = EvaluateItem().set_params(100, 10, datetime(2010, 4, 1, 0, 0))
    expected = [1 - abs(v - selected.bpm) / (hi - lo) for v in raw if v]
    actual = ItemEvaluator.calc_near_is_better("bpm", data, selected)
    tolerance = 1e-5
    for i, e in enumerate(expected):
        self.assertLess(abs(e - actual[i]), tolerance)
        print(e)
def calc_text_token_distance(cls, attr_name, item_list, selected_item):
    """
    Score items by textual closeness of ``attr_name`` to the selected item.

    Item tokens are clustered, every item and the selected item are mapped
    to cluster vectors, and each vector distance d is converted to a
    similarity via 1 - log(d) (with 4 substituted when d == 0), then
    normalized.

    :param attr_name: text attribute to evaluate
    :param item_list: items to score
    :param selected_item: reference item supplying the target text
    :return: normalized similarity scores
    :raises NotCalculatable: if the selected item lacks the attribute value
    """
    if not (selected_item and getattr(selected_item, attr_name)):
        raise NotCalculatable("selected item's " + attr_name + " is None")
    corpus_tokens = vector_utils.to_vector(attr_name, item_list)
    selected_tokens = vector_utils.to_value(getattr(selected_item, attr_name))
    clusters, item_vectors = vector_utils.make_text_clusters(corpus_tokens)
    selected_vector = vector_utils.classify_text_tokens(selected_tokens, clusters)
    scores = []
    for v in item_vectors:
        d = vector_utils.calc_vector_distance(selected_vector, v)
        # 4 is large enough as a stand-in for f(0) in f(x) = 1 - log(x)
        scores.append(4 if d == 0 else 1 - math.log(d))
    # NOTE(review): sibling calc_* methods call cls.__normalize; confirm
    # the public cls.normalize is the intended variant here.
    return cls.normalize(scores)
def calc_less_is_better(cls, attr_name, item_list, selected_item=None):
    """
    Score items so that smaller attribute values score higher.

    :param attr_name: attribute to evaluate
    :param item_list: items to score
    :param selected_item: unused; kept for a uniform calc_* signature
    :return: max-normalized scores for the attribute values
    """
    raw_values = vector_utils.to_vector(attr_name, item_list)
    normalized = cls.__normalize(raw_values, normalize_value_type="max")
    return normalized