Code Example #1
 def transform_one(self, obs, target, id):
     """obs is a list of attributes"""
     cnt = 0
     for lst in obs:
         if not lst[0].startswith("bullet"):
             cnt += 1
     return np_utils._try_divide(cnt, len(obs))
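Every snippet on this page funnels its ratio through np_utils._try_divide, whose definition is not shown here. A minimal sketch of a zero-safe division helper consistent with how it is called (the signature and the 0.0 default are assumptions, not the projects' actual code):

def _try_divide(x, y, val=0.0):
    """Divide x by y; fall back to val (assumed default 0.0) when y is zero."""
    if y != 0.0:
        val = float(x) / y
    return val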
Code Example #2
 def _get_importance(self, text1, text2):
     len_prev_1 = len(text1.split(" "))
     len_prev_2 = len(text2.split(" "))
     len1 = len(self._get_valid_word_list(text1))
     len2 = len(self._get_valid_word_list(text2))
     imp = np_utils._try_divide(len1+len2, len_prev_1+len_prev_2)
     return imp
Code Example #3
File: feature_word2vec.py Project: qianteng/JQXXXX
 def _get_importance(self, text1, text2):
     len_prev_1 = len(text1.split(" "))
     len_prev_2 = len(text2.split(" "))
     len1 = len(self._get_valid_word_list(text1))
     len2 = len(self._get_valid_word_list(text2))
     imp = np_utils._try_divide(len1 + len2, len_prev_1 + len_prev_2)
     return imp
Code Example #4
File: dist_utils.py Project: qianteng/Quora_HD
def _dice_dist(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return np_utils._try_divide(2. * float(len(A.intersection(B))),
                                (len(A) + len(B)))
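A quick sanity check of the Dice coefficient computed above, using plain division in place of the zero-safe helper and two made-up token sets:

A = {"solid", "wood", "table"}
B = {"wood", "table", "lamp"}
# |A ∩ B| = 2 and |A| + |B| = 6, so the coefficient is 2*2/6 ≈ 0.667
print(2. * len(A & B) / (len(A) + len(B)))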
Code Example #5
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
     K = self.k1 * (1 - self.b + self.b * np_utils._try_divide(len(target_ngrams), self.avg_ngram_doc_len))
     val_list = []
     for w1 in obs_ngrams:
         s = 0.
         for w2 in target_ngrams:
             if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                 s += 1.
         bm25 = s * self._get_idf(w1) * np_utils._try_divide(1 + self.k1, s + K)
         val_list.append(bm25)
     if len(val_list) == 0:
         val_list = [config.MISSING_VALUE_NUMERIC]
     return val_list
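The loop above computes a soft-match BM25 weight per observation n-gram: s is the fuzzy term frequency of w1 in the target, K = k1 * (1 - b + b * len(target_ngrams) / avg_ngram_doc_len) is the length normalizer, and the weight is idf(w1) * s * (1 + k1) / (s + K). A toy calculation with assumed parameter values:

k1, b = 1.6, 0.75              # assumed BM25 parameters
doc_len, avg_doc_len = 10, 8   # target has 10 n-grams; corpus average is 8
s, idf = 2.0, 1.2              # w1 soft-matches twice; idf value is made up

K = k1 * (1 - b + b * doc_len / avg_doc_len)  # 1.6 * 1.1875 = 1.9
bm25 = s * idf * (1 + k1) / (s + K)           # 2.4 * 2.6 / 3.9 = 1.6
print(K, bm25)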
Code Example #6
def _longest_match_ratio(str1, str2):
    """
    Find the longest matching block between str1 and str2,
    then divide its length by the length of the shorter string.
    """
    sq = SequenceMatcher(lambda x: x == " ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return np_utils._try_divide(match.size, min(len(str1), len(str2)))
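As a quick check of what the ratio measures (the junk callable means spaces can only extend a match, not anchor one); the strings are made up:

from difflib import SequenceMatcher

a, b = "wood table", "wooden table"
m = SequenceMatcher(lambda x: x == " ", a, b).find_longest_match(0, len(a), 0, len(b))
print(m.size, m.size / min(len(a), len(b)))  # 6 (" table") and 0.6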
Code Example #7
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
     return np_utils._try_divide(
         self._get_match_count(obs_ngrams, target_ngrams, self.idx),
         len(target_ngrams))
Code Example #8
def _inter_norm_pos_list(obs, target):
    """
    ex:
        _inter_norm_pos_list([1,2,3,4,5,6,9,1,1,1,1,1,1], [1])
        = [0.07692307692307693, 0.6153846153846154, 0.6923076923076923, 0.7692307692307693, 0.8461538461538461, 0.9230769230769231, 1.0]
    """
    pos_list = _inter_pos_list(obs, target)
    N = len(obs)
    return [np_utils._try_divide(i, N) for i in pos_list]
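_inter_pos_list itself is not shown on this page. Here is a sketch that reproduces the docstring's expected output (1-based positions in obs of elements that also occur in target); this is an assumption, not the projects' actual helper:

def _inter_pos_list(obs, target):
    """Assumed helper: 1-based positions in obs whose element appears in target."""
    return [i for i, o in enumerate(obs, start=1) if o in target]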
Code Example #9
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
     s = 0.
     for w1 in obs_ngrams:
         for w2 in target_ngrams:
             if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                 s += 1.
     return np_utils._try_divide(s, len(obs_ngrams)*len(target_ngrams))
Code Example #10
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     obs_nterms = ngram_utils._nterms(obs_tokens, self.nterm)
     target_nterms = ngram_utils._nterms(target_tokens, self.nterm)
     s = 0.
     for w1 in obs_nterms:
         for w2 in target_nterms:
             if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                 s += 1.
     return np_utils._try_divide(s, len(obs_nterms) * len(target_nterms))
Code Example #11
File: dist_utils.py Project: pjpan/Practice
def _compression_dist(x, y, l_x=None, l_y=None):
    if x == y:
        return 0
    x_b = x.encode("utf-8")
    y_b = y.encode("utf-8")
    if l_x is None:
        l_x = len(lzma.compress(x_b))
        l_y = len(lzma.compress(y_b))
    l_xy = len(lzma.compress(x_b + y_b))
    l_yx = len(lzma.compress(y_b + x_b))
    dist = np_utils._try_divide(min(l_xy, l_yx) - min(l_x, l_y), max(l_x, l_y))
    return dist
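This is the normalized compression distance: the closer the compressed size of the concatenation is to that of the smaller string alone, the more the two strings are assumed to share; identical strings return 0 and unrelated strings approach 1 (ignoring compressor overhead). A quick usage check with made-up strings, using the same standard-library lzma module the snippet already relies on:

import lzma

print(_compression_dist("angle bracket", "angle bracket"))    # identical strings -> 0
print(_compression_dist("angle bracket", "galvanized pipe"))  # dissimilar strings -> larger value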
Code Example #12
File: dist_utils.py Project: qianteng/Quora_HD
def _compression_dist(x, y, l_x=None, l_y=None):
    if x == y:
        return 0
    x_b = x.encode('utf-8')
    y_b = y.encode('utf-8')
    if l_x is None:
        l_x = len(lzma.compress(x_b))
        l_y = len(lzma.compress(y_b))
    l_xy = len(lzma.compress(x_b + y_b))
    l_yx = len(lzma.compress(y_b + x_b))
    dist = np_utils._try_divide(min(l_xy, l_yx) - min(l_x, l_y), max(l_x, l_y))
    return dist
Code Example #13
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
     val_list = []
     for w1 in obs_ngrams:
         s = 0.
         for w2 in target_ngrams:
             if dist_utils._is_str_match(w1, w2, self.str_match_threshold):
                 s += 1.
         val_list.append(np_utils._try_divide(s, len(target_ngrams)))
     if len(val_list) == 0:
         val_list = [config.MISSING_VALUE_NUMERIC]
     return val_list
Code Example #14
File: dist_utils.py Project: y12uc231/BERT-1
def _count_stats(s1, s2):
    # length
    l1 = len(s1)
    l2 = len(s2)
    len_diff = np_utils._try_divide(np.abs(l1 - l2), (l1 + l2) / 2.)

    # set
    s1_set = set(s1)
    s2_set = set(s2)

    # unique length
    l1_unique = len(s1_set)
    l2_unique = len(s2_set)
    len_diff_unique = np_utils._try_divide(np.abs(l1_unique - l2_unique),
                                           (l1_unique + l2_unique) / 2.)

    # unique ratio
    r1_unique = np_utils._try_divide(l1_unique, l1)
    r2_unique = np_utils._try_divide(l2_unique, l2)

    # jaccard coef
    li = len(s1_set.intersection(s2_set))
    lu = len(s1_set.union(s2_set))
    jaccard_coef = np_utils._try_divide(li, lu)

    # dice coef
    dice_coef = np_utils._try_divide(li, l1_unique + l2_unique)

    # common number
    common_ = _common_num(s1, s2)
    common_ratio_avg = np_utils._try_divide(common_, (l1 + l2) / 2.)
    common_ratio_max = np_utils._try_divide(common_, min(l1, l2))
    common_ratio_min = np_utils._try_divide(common_, max(l1, l2))

    # over all features
    f = [
        l1, l2, len_diff, l1_unique, l2_unique, len_diff_unique, r1_unique,
        r2_unique, li, lu, jaccard_coef, dice_coef, common_, common_ratio_avg,
        common_ratio_max, common_ratio_min
    ]
    return np.array(f, dtype=np.float32)
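_common_num is another helper that is not shown here; from the way its result is normalized by average, min, and max length, it presumably counts shared elements. A hypothetical sketch:

def _common_num(s1, s2):
    """Assumed helper: number of elements of s1 that also appear in s2."""
    return sum(1 for w in s1 if w in s2)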
Code Example #15
def _compression_dist(x, y, l_x=None, l_y=None):
    """
    Compress each string (as UTF-8 bytes) and derive a distance between the two strings from the compressed lengths (normalized compression distance).
    """
    if x == y:
        return 0
    x_b = x.encode('utf-8')
    y_b = y.encode('utf-8')
    if l_x is None:
        l_x = len(lzma.compress(x_b))
        l_y = len(lzma.compress(y_b))
    l_xy = len(lzma.compress(x_b + y_b))
    l_yx = len(lzma.compress(y_b + x_b))
    dist = np_utils._try_divide(min(l_xy, l_yx) - min(l_x, l_y), max(l_x, l_y))
    return dist
Code Example #16
File: dist_utils.py Project: qianteng/Quora_HD
def _longest_match_ratio(str1, str2):
    sq = SequenceMatcher(lambda x: x == " ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return np_utils._try_divide(match.size, min(len(str1), len(str2)))
Code Example #17
File: dist_utils.py Project: pjpan/Practice
def _longest_match_ratio(str1, str2):
    sq = SequenceMatcher(lambda x: x == " ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return np_utils._try_divide(match.size, min(len(str1), len(str2)))
Code Example #18
File: dist_utils.py Project: pjpan/Practice
def _jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return np_utils._try_divide(float(len(A.intersection(B))), len(A.union(B)))
Code Example #19
 def transform_one(self, obs, target, id):
     lo = len(obs.split(" "))
     lt = len([t[0] for t in target if not t[0].startswith("bullet")])
     return np_utils._try_divide(super().transform_one(obs, target, id),
                                 lo * lt)
Code Example #20
def _inter_norm_pos_list(obs, target):
    pos_list = _inter_pos_list(obs, target)
    N = len(obs)
    return [np_utils._try_divide(i, N) for i in pos_list]
Code Example #21
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     target_ngrams = ngram_utils._ngrams(target_tokens, self.ngram)
     return np_utils._try_divide(self._get_match_count(obs_ngrams, target_ngrams, self.idx), len(target_ngrams))
Code Example #22
 def transform_one(self, obs, target, id):
     lo = len(obs.split(" "))
     lt = len([t[0] for t in target if not t[0].startswith("bullet")])
     return np_utils._try_divide(super().transform_one(obs, target, id), lo*lt)
Code Example #23
 def transform_one(self, obs, target, id):
     return np_utils._try_divide(super().transform_one(obs, target, id), len(target.split(" ")))
Code Example #24
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     return np_utils._try_divide(len(set(obs_ngrams)), len(obs_ngrams))
Code Example #25
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     return np_utils._try_divide(len(re.findall(r"\d", obs)), len(obs_tokens))
Code Example #26
File: feature_basic.py Project: ebernhardson/l2r
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     digits = re.findall(r"\d", obs)
     return np_utils._try_divide(len(digits), len(obs_tokens))
Code Example #27
File: feature_basic.py Project: ebernhardson/l2r
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     obs_ngrams = ngram_utils._ngrams(obs_tokens, self.ngram)
     return np_utils._try_divide(len(set(obs_ngrams)), len(obs_ngrams))
Code Example #28
File: dist_utils.py Project: pjpan/Practice
def _dice_dist(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return np_utils._try_divide(2.0 * float(len(A.intersection(B))), (len(A) + len(B)))
Code Example #29
 def transform_one(self, obs, target, id):
     return np_utils._try_divide(super().transform_one(obs, target, id),
                                 len(target.split(" ")))
Code Example #30
File: dist_utils.py Project: qianteng/Quora_HD
def _jaccard_coef(A, B):
    if not isinstance(A, set):
        A = set(A)
    if not isinstance(B, set):
        B = set(B)
    return np_utils._try_divide(float(len(A.intersection(B))), len(A.union(B)))
Code Example #31
 def transform_one(self, obs, target, id):
     obs_tokens = nlp_utils._tokenize(obs, token_pattern)
     target_tokens = nlp_utils._tokenize(target, token_pattern)
     return abs(np_utils._try_divide(len(obs_tokens), len(target_tokens)) - 1)