def transform_one(self, obs, target, id):
    """obs is a list of attributes"""
    cnt = 0
    for lst in obs:
        if not lst[0].startswith("bullet"):
            cnt += 1
    return np_utils._try_divide(cnt, len(obs))
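# Every snippet in this section delegates safe division to np_utils._try_divide.
# A minimal sketch of what such a helper plausibly looks like, assuming it falls
# back to a default value when the denominator is zero (the actual np_utils
# implementation may differ):
def _try_divide(x, y, val=0.0):
    """Divide x by y, returning val instead of raising when y is zero."""
    if y != 0.0:
        val = float(x) / y
    return val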
def _get_importance(self, text1, text2):
    len_prev_1 = len(text1.split(" "))
    len_prev_2 = len(text2.split(" "))
    len1 = len(self._get_valid_word_list(text1))
    len2 = len(self._get_valid_word_list(text2))
    imp = np_utils._try_divide(len1 + len2, len_prev_1 + len_prev_2)
    return imp
def _dice_dist(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return np_utils._try_divide(2.0 * float(len(A.intersection(B))), (len(A) + len(B)))
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    K = self.k1 * (1 - self.b + self.b * np_utils._try_divide(len(target_ngrams), self.avg_ngram_doc_len))
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
        bm25 = s * self._get_idf(w1) * np_utils._try_divide(1 + self.k1, s + K)
        val_list.append(bm25)
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list
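# The per-ngram weight above follows the BM25 term-frequency normalization
# s * idf * (k1 + 1) / (s + K) with K = k1 * (1 - b + b * doc_len / avg_doc_len),
# where s is the soft-match count. A toy numeric check; the parameter values and
# idf below are made up for illustration:
k1, b = 1.6, 0.75                   # assumed BM25 parameters
doc_len, avg_doc_len = 10, 8.0      # target ngram count vs. corpus average
s, idf = 2.0, 2.5                   # soft-match count and a hypothetical idf
K = k1 * (1 - b + b * doc_len / avg_doc_len)
bm25 = s * idf * (1 + k1) / (s + K)
print(round(K, 4), round(bm25, 4))  # 1.9 3.3333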
def _longest_match_ratio(str1, str2):
    """Find the longest matching block between str1 and str2, then divide
    its length by the length of the shorter string.
    """
    sq = SequenceMatcher(lambda x: x == " ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return np_utils._try_divide(match.size, min(len(str1), len(str2)))
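# A quick, self-contained check of the longest-match ratio above; SequenceMatcher
# comes from difflib in the standard library, and the zero-denominator guard is
# inlined here instead of going through np_utils (an assumption about its behavior):
from difflib import SequenceMatcher

str1, str2 = "cordless drill", "cordless power drill"
sq = SequenceMatcher(lambda x: x == " ", str1, str2)
match = sq.find_longest_match(0, len(str1), 0, len(str2))
denom = min(len(str1), len(str2))
ratio = match.size / denom if denom else 0.0
print(match.size, ratio)  # length of the longest common block and its ratio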
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    return np_utils._try_divide(
        self._get_match_count(obs_ngrams, target_ngrams, self.idx), len(target_ngrams))
def _inter_norm_pos_list(obs, target):
    """
    ex:
        _inter_norm_pos_list([1,2,3,4,5,6,9,1,1,1,1,1,1], [1])
        = [0.07692307692307693, 0.6153846153846154, 0.6923076923076923,
           0.7692307692307693, 0.8461538461538461, 0.9230769230769231, 1.0]
    """
    pos_list = _inter_pos_list(obs, target)
    N = len(obs)
    return [np_utils._try_divide(i, N) for i in pos_list]
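# _inter_pos_list is not shown in this section. Working backwards from the docstring
# example above (the positions appear to be 1-based indices of obs elements that also
# occur in target), a plausible sketch is the following; the original helper may differ:
def _inter_pos_list(obs, target):
    """1-based positions in obs whose element also occurs in target."""
    target_set = set(target)
    return [i + 1 for i, o in enumerate(obs) if o in target_set]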
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    s = 0.
    for w1 in obs_ngrams:
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
    return np_utils._try_divide(s, len(obs_ngrams) * len(target_ngrams))
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_nterms = ngram_utils._nterms(obs_tokens, self.nterm)
    target_nterms = ngram_utils._nterms(target_tokens, self.nterm)
    s = 0.
    for w1 in obs_nterms:
        for w2 in target_nterms:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
    return np_utils._try_divide(s, len(obs_nterms) * len(target_nterms))
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
    val_list = []
    for w1 in obs_ngrams:
        s = 0.
        for w2 in target_ngrams:
            if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                s += 1.
        val_list.append(np_utils._try_divide(s, len(target_ngrams)))
    if len(val_list) == 0:
        val_list = [config.MISSING_VALUE_NUMERIC]
    return val_list
def _count_stats(s1, s2):
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)
    # set
    s1_set = set(s1)
    s2_set = set(s2)
    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique), (l1_unique + l2_unique) / 2.)
    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)
    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)
    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)
    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))
    # overall features
    f = [
        l1, l2, len_diff,
        l1_unique, l2_unique, len_diff_unique,
        r1_unique, r2_unique,
        li, lu, jaccard_coef, dice_coef,
        common_, common_ratio_avg, common_ratio_max, common_ratio_min,
    ]
    return np.array(f, dtype=np.float32)
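# _common_num is not defined in this section; the sanity check below assumes it counts
# the tokens of s1 that also appear in s2, and recomputes a few of the _count_stats
# quantities by hand for toy token lists (assumptions, not the original helpers):
s1 = ["cordless", "drill", "battery"]
s2 = ["cordless", "power", "drill", "drill"]
s1_set, s2_set = set(s1), set(s2)
li = len(s1_set & s2_set)                     # intersection size -> 2
lu = len(s1_set | s2_set)                     # union size -> 4
jaccard_coef = li / lu                        # 0.5
dice_coef = li / (len(s1_set) + len(s2_set))  # 2 / 6 = 0.333...
common_ = sum(1 for t in s1 if t in s2_set)   # assumed _common_num behavior -> 2
print(jaccard_coef, dice_coef, common_)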
def _compression_dist(x, y, l_x=None, l_y=None):
    """Estimate the distance between two strings by comparing the lzma-compressed
    sizes of each string and of their concatenation (normalized compression distance).
    """
    if x == y:
        return 0
    x_b = x.encode("utf-8")
    y_b = y.encode("utf-8")
    if l_x is None:
        l_x = len(lzma.compress(x_b))
        l_y = len(lzma.compress(y_b))
    l_xy = len(lzma.compress(x_b + y_b))
    l_yx = len(lzma.compress(y_b + x_b))
    dist = np_utils._try_divide(min(l_xy, l_yx) - min(l_x, l_y), max(l_x, l_y))
    return dist
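# A self-contained run of the compression distance above, using lzma from the standard
# library and inlining the safe division (an assumption about np_utils._try_divide):
import lzma

x = "cordless drill with two batteries"
y = "cordless power drill kit"
l_x = len(lzma.compress(x.encode("utf-8")))
l_y = len(lzma.compress(y.encode("utf-8")))
l_xy = len(lzma.compress((x + y).encode("utf-8")))
l_yx = len(lzma.compress((y + x).encode("utf-8")))
denom = max(l_x, l_y)
dist = (min(l_xy, l_yx) - min(l_x, l_y)) / denom if denom else 0.0
print(dist)  # closer to 0 for similar strings, closer to 1 for unrelated ones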
def _jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return np_utils._try_divide(float(len(A.intersection(B))), len(A.union(B)))
def transform_one(self, obs, target, id):
    lo = len(obs.split(" "))
    lt = len([t[0] for t in target if not t[0].startswith("bullet")])
    return np_utils._try_divide(super().transform_one(obs, target, id), lo * lt)
def transform_one(self, obs, target, id):
    return np_utils._try_divide(super().transform_one(obs, target, id), len(target.split(" ")))
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
    return np_utils._try_divide(len(set(obs_ngrams)), len(obs_ngrams))
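# ngram_utils._ngrams is not shown here; a plausible sketch that joins consecutive
# tokens into n-gram strings (the signature and join character are assumptions),
# followed by the unique-ngram ratio computed above:
def _ngrams(tokens, n, join_string="_"):
    if n == 1:
        return list(tokens)
    return [join_string.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

tokens = ["drill", "bit", "drill", "bit", "drill", "bit"]
bigrams = _ngrams(tokens, 2)  # ['drill_bit', 'bit_drill', 'drill_bit', 'bit_drill', 'drill_bit']
print(len(set(bigrams)) / len(bigrams) if bigrams else 0.0)  # unique-bigram ratio -> 0.4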
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    digits = re.findall(r"\d", obs)
    return np_utils._try_divide(len(digits), len(obs_tokens))
def transform_one(self, obs, target, id):
    obs_tokens = nlp_utils._tokenize(obs, token_pattern)
    target_tokens = nlp_utils._tokenize(target, token_pattern)
    return abs(np_utils._try_divide(len(obs_tokens), len(target_tokens)) - 1)