Example #1
def TFIDFWordMatchShare(data_list, tf_idf):
    # Shared-word count weighted by TF-IDF; feature count = 1
    tfidf_WordMatch = []
    for data in data_list:
        q1words = {}
        q2words = {}
        for word in data[0].split():
            q1words[word] = q1words.get(word, 0) + 1
        for word in data[1].split():
            q2words[word] = q2words.get(word, 0) + 1
        sum_shared_word_in_q1 = sum(
            [q1words[w] * tf_idf.get(w, 0) for w in q1words if w in q2words])
        sum_shared_word_in_q2 = sum(
            [q2words[w] * tf_idf.get(w, 0) for w in q2words if w in q1words])
        sum_tol = sum(q1words[w] * tf_idf.get(w, 0)
                      for w in q1words) + sum(q2words[w] * tf_idf.get(w, 0)
                                              for w in q2words)
        if 1e-6 > sum_tol:
            tfidf_WordMatch.append([0.])
        else:
            tfidf_WordMatch.append([
                1.0 * (sum_shared_word_in_q1 + sum_shared_word_in_q2) / sum_tol
            ])
    LogUtil.log(
        "INFO",
        "词共现的基础上乘以tf-idf 特征数量1 TFIDFWordMatchShare features, len(tfidf_WordMatch)=%d"
        % len(tfidf_WordMatch))
    return tfidf_WordMatch
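A worked sketch of the ratio computed above, on one hypothetical sentence pair (tokens and weights are invented for illustration; in the pipeline the tf_idf dict comes from generate_idf in Example #14):

q1 = "how to open huabei".split()        # hypothetical sentence 1 tokens
q2 = "steps to open huabei".split()      # hypothetical sentence 2 tokens
tf_idf = {"how": 1.0, "to": 0.2, "open": 1.5, "huabei": 2.0, "steps": 1.2}  # hypothetical weights

shared = set(q1) & set(q2)               # words present in both sentences
numerator = sum(tf_idf[w] for w in q1 if w in shared) + sum(tf_idf[w] for w in q2 if w in shared)
denominator = sum(tf_idf[w] for w in q1) + sum(tf_idf[w] for w in q2)
print(numerator / denominator)           # weighted share of co-occurring words, in [0, 1]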
Example #2
def getLDA_features(stop_word, all_file_with_label, features_num, features_file):
    # Extract LDA features; LDA is highly sensitive to tokenization quality
    all_data_frame = pd.read_csv(open(all_file_with_label, mode='r', encoding="UTF-8"), sep="\t", header=None)
    label = all_data_frame[2]
    data_list = []
    for index, line in tqdm(all_data_frame.iterrows()):
        data_list.append(line[0])
        data_list.append(line[1])
    LogUtil.log("INFO", "加载数据总量 = %d, file done " % (len(data_list) / 2))

    cntVector = CountVectorizer(stop_words=stop_word)
    cntTf = cntVector.fit_transform(data_list)
    # print(cntVector.vocabulary_)
    LogUtil.log("INFO", "开始训练LDA模型 ")
    lda = LatentDirichletAllocation(n_components=features_num, learning_method='online', batch_size=128, learning_offset=20., random_state=0, max_iter=10)
    docres = lda.fit_transform(cntTf).tolist()
    data_features_list = []
    for index in tqdm(range(int(len(docres)/2))):
        line_features = docres[index*2] + docres[index*2+1]
        data_features_list.append(line_features)
    LogUtil.log("INFO", "特征长度=%d, file done " % (len(data_features_list[0])))
    LogUtil.log("INFO", "数据总理=%d, file done " % (len(data_features_list)))

    log_reg = LogisticRegression(class_weight='balanced')
    log_reg.fit(data_features_list[0:238766], label[0:238766])
    predict_y = log_reg.predict(data_features_list[238766: 238766 + 8802])
    print(classification_report(label[238766: 238766 + 8802], predict_y))
    LDA_features_file_object = open(features_file, mode='wb')
    pk.dump(data_features_list, LDA_features_file_object)
    LDA_features_file_object.close()
    LogUtil.log("INFO", "Writer LDA features into file done ")
Example #3
def WordMatchShare(data_list, stop_word_list):
    # Count words shared between sentence 1 and sentence 2 as a feature; feature count = 1
    static_WordMatchShare = []
    for line in data_list:
        q1words = {}
        q2words = {}
        for word in line[0].split():
            if word not in stop_word_list:
                q1words[word] = q1words.get(word, 0) + 1
        for word in line[1].split():
            if word not in stop_word_list:
                q2words[word] = q2words.get(word, 0) + 1
        n_shared_word_in_q1 = sum(
            [q1words[w] for w in q1words if w in q2words])
        n_shared_word_in_q2 = sum(
            [q2words[w] for w in q2words if w in q1words])
        n_tol = sum(q1words.values()) + sum(q2words.values())
        if 1e-6 > n_tol:
            static_WordMatchShare.append([0.])
        else:
            static_WordMatchShare.append(
                [1.0 * (n_shared_word_in_q1 + n_shared_word_in_q2) / n_tol])
    LogUtil.log(
        "INFO",
        "句子1和句子2的词相同词的数量 特征数量1 WordMatchShare features, len(static_WordMatchShare)=%d"
        % len(static_WordMatchShare))
    return static_WordMatchShare
Example #4
def ProcessPOSTag(stanford_core_nlp, data_list_no_token, stanford_token_file, stanford_postag_file):
    # Change this to the directory where your stanford-corenlp is installed
    nlp = StanfordCoreNLP(stanford_core_nlp, lang='zh')
    stanford_token = []
    stanford_postag = []
    for data in tqdm(data_list_no_token):
        sentence_1_token = nlp.word_tokenize(data[0])
        sentence_1_POS = nlp.pos_tag(data[0])
        simply_POS_1 = []
        for POS in sentence_1_POS:
            simply_POS_1.append(POS[1])
        sentence_2_token = nlp.word_tokenize(data[1])
        sentence_2_POS = nlp.pos_tag(data[1])
        simply_POS_2 = []
        for POS in sentence_2_POS:
            simply_POS_2.append(POS[1])
        stanford_token.append([" ".join(sentence_1_token), " ".join(sentence_2_token)])
        stanford_postag.append([" ".join(simply_POS_1), " ".join(simply_POS_2)])
    nlp.close()
    assert len(data_list_no_token) == len(stanford_token), "token count mismatch"
    assert len(data_list_no_token) == len(stanford_postag), "POS tag count mismatch"
    stanford_token_file_object = open(stanford_token_file, mode='w', encoding="UTF-8")
    for line in stanford_token:
        stanford_token_file_object.write(line[0] + "\t" + line[1] + "\n")
    stanford_token_file_object.close()
    LogUtil.log("INFO",
                "Stanford Token Writer to stanford_token_file done, len(stanford_token)=%d" % len(stanford_token))
    stanford_postag_file_object = open(stanford_postag_file, mode='w', encoding="UTF-8")
    for line in stanford_postag:
        stanford_postag_file_object.write(line[0] + "\t" + line[1] + "\n")
    stanford_postag_file_object.close()
    LogUtil.log("INFO",
                "Stanford POStag Writer to stanford_postag_file done, len(stanford_postag)=%d" % len(stanford_postag))
Example #5
 def init_powerful_word_dside(pword, thresh_num, thresh_rate):
     pword_dside = []
     pword = filter(lambda x: x[1][0] * x[1][5] >= thresh_num, pword)
     pword_sort = sorted(pword, key=lambda d: d[1][6], reverse=True)
     pword_dside.extend(
         map(lambda x: x[0],
             filter(lambda x: x[1][6] >= thresh_rate, pword_sort)))
     LogUtil.log(
         'INFO', 'Double side power words(%d): %s' %
         (len(pword_dside), str(pword_dside)))
     return pword_dside
Example #6
 def init_tfidf(token_data_file, stops_list):
     all_data_frame = pd.read_csv(open(token_data_file, encoding="UTF-8"),
                                  error_bad_lines=False,
                                  sep="\t",
                                  header=None)
     tfidf = TfidfVectorizer(stop_words=stops_list, ngram_range=(1, 1))
     tfidf_txt = pd.Series(all_data_frame[0].tolist() +
                           all_data_frame[1].tolist()).astype(str)
     tfidf.fit_transform(tfidf_txt)
     LogUtil.log("INFO", "init tfidf done ")
     return tfidf
Example #7
def LengthDiff(data_list):
    # Sentence length difference, get_feature_num = 1
    length_diff = []
    for data in data_list:
        length_diff.append([abs(len(data[0]) - len(data[1]))])
    LogUtil.log(
        "INFO",
        "句子长度差 get_feature_num = 1, LengthDiff features, len(length_diff)=%d" %
        len(length_diff))
    min_max_scaler = preprocessing.MinMaxScaler((0, 1))
    length_diff = min_max_scaler.fit_transform(length_diff)
    return length_diff.tolist()
Example #8
def LengthDiffRate(data_list):
    # Ratio of the two sentence lengths, get_feature_num = 1
    length_diff_rate_list = []
    for data in data_list:
        len_q1 = len(data[0])
        len_q2 = len(data[1])
        if max(len_q1, len_q2) < 1e-6:
            length_diff_rate_list.append([0.0])
        else:
            length_diff_rate_list.append(
                [1.0 * min(len_q1, len_q2) / max(len_q1, len_q2)])
    LogUtil.log(
        "INFO",
        "两个句子长度比 get_feature_num = 1, LengthDiffRate features, len(length_diff_rate_list)=%d"
        % len(length_diff_rate_list))
    return length_diff_rate_list
Example #9
def NgramDiceDistance(data_list):
    # Dice distance over n-grams, get_feature_num = 4
    all_DiceDistance = []
    for data in data_list:
        q1_words = data[0].split()
        q2_words = data[1].split()
        fs = list()
        for n in range(1, 4):
            q1_ngrams = NgramUtil.ngrams(q1_words, n)
            q2_ngrams = NgramUtil.ngrams(q2_words, n)
            fs.append(DistanceUtil.dice_dist(q1_ngrams, q2_ngrams))
        all_DiceDistance.append(fs)
    LogUtil.log(
        "INFO",
        "DiceDistance距离 NgramDiceDistance get_feature_num = 4, len(all_DiceDistance)=%d"
        % len(all_DiceDistance))
    return all_DiceDistance
Example #10
def NgramJaccardCoef(data_list):
    # N-gram Jaccard coefficient features, get_feature_num = 4
    all_jaccard = []
    for data in data_list:
        q1_words = data[0].split()
        q2_words = data[1].split()
        fs = list()
        for n in range(1, 4):
            q1_ngrams = NgramUtil.ngrams(q1_words, n)
            q2_ngrams = NgramUtil.ngrams(q2_words, n)
            fs.append(DistanceUtil.jaccard_coef(q1_ngrams, q2_ngrams))
        all_jaccard.append(fs)
    LogUtil.log(
        "INFO",
        "n-gram jaccard系数特征 NgramJaccardCoef get_feature_num = 4, len(all_jaccard)=%d"
        % len(all_jaccard))
    return all_jaccard
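For reference, the standard Jaccard coefficient on n-gram sets is |A ∩ B| / |A ∪ B|. DistanceUtil.jaccard_coef is not shown in these examples, so the snippet below is only a plain-set approximation of what it is assumed to compute:

def jaccard_coef_sketch(a, b):
    # Standard Jaccard coefficient over two n-gram collections, treated as sets.
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

print(jaccard_coef_sketch(["open huabei", "huabei limit"], ["open huabei", "huabei query"]))  # 1/3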
Example #11
def Length(data_list):
    # Sentence lengths: character length and token length of each sentence, get_feature_num = 4
    sentence_length_list = []
    for data in data_list:
        fs = list()
        fs.append(len(data[0]))
        fs.append(len(data[1]))
        fs.append(len(data[0].split()))
        fs.append(len(data[1].split()))
        sentence_length_list.append(fs)
    LogUtil.log(
        "INFO",
        "句子长度 句子字符长度和句子分词长度 get_feature_num = 4, Length features, len(sentence_length_list)=%d"
        % len(sentence_length_list))
    min_max_scaler = preprocessing.MinMaxScaler((0, 1))
    sentence_length_list = min_max_scaler.fit_transform(sentence_length_list)
    return sentence_length_list.tolist()
Example #12
 def extract_all_features(self, data_list):
     all_data_tags = []
     for data in data_list:
         tags = []
         q1_words = data[0].split()
         q2_words = data[1].split()
         for word in self.pword_dside:
             if (word in q1_words) and (word in q2_words):
                 tags.append(1.0)
             else:
                 tags.append(0.0)
         all_data_tags.append(tags)
     LogUtil.log(
         "INFO",
         "PowerfulWordDoubleSide get_feature_num = %d, len(all_data_tags)=%d"
         % (len(self.pword_dside), len(all_data_tags)))
     return all_data_tags
Example #13
 def extract_features(self, data_list):
     tf_idf_statics_features = []
     for data in tqdm(data_list):
         q1_tf_idf = self.tfidf.transform([data[0]])
         q2_tf_idf = self.tfidf.transform([data[1]])
         fs = list()
         fs.append(np.sum(q1_tf_idf.data))
         fs.append(np.sum(q2_tf_idf.data))
         fs.append(len(q1_tf_idf.data))
         fs.append(len(q2_tf_idf.data))
         cosine_similarities = linear_kernel(q1_tf_idf, q2_tf_idf).flatten()
         fs.append(cosine_similarities[0])
         tf_idf_statics_features.append(fs)
     LogUtil.log(
         "INFO",
         "tf-idf值的求和特征,并不是纯tf-idf特征 TFIDF get_feature_num = 6, len(tf_idf_statics_features)=%d"
         % len(tf_idf_statics_features))
     return tf_idf_statics_features
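A self-contained sketch of the statistics gathered above, using a toy corpus in place of the vectorizer built by init_tfidf (Example #6). Because sklearn's TfidfVectorizer L2-normalizes rows by default, linear_kernel on two rows equals their cosine similarity:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np

corpus = ["how to open huabei", "steps to open huabei", "check my credit limit"]  # toy corpus, illustrative only
tfidf = TfidfVectorizer().fit(corpus)

q1_tf_idf = tfidf.transform(["how to open huabei"])
q2_tf_idf = tfidf.transform(["steps to open huabei"])
print(np.sum(q1_tf_idf.data), len(q1_tf_idf.data))       # sum and count of non-zero tf-idf weights
print(linear_kernel(q1_tf_idf, q2_tf_idf).flatten()[0])  # cosine similarity of the two sentences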
Example #14
def generate_idf(data_list):
    idf = {}
    q_set = set()
    for data in data_list:
        if data[0] not in q_set:
            q_set.add(data[0])
            words = data[0].split()
            for word in words:
                idf[word] = idf.get(word, 0) + 1
        if data[1] not in q_set:
            q_set.add(data[1])
            words = data[1].split()
            for word in words:
                idf[word] = idf.get(word, 0) + 1
    num_docs = len(data_list)
    for word in idf:
        idf[word] = math.log(num_docs / (idf[word] + 1.)) / math.log(2.)
    LogUtil.log("INFO", "idf calculation done, len(idf)=%d" % len(idf))
    return idf
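The weight assigned above is a smoothed, base-2 IDF: idf(w) = log2(N / (df(w) + 1)), where N is the number of pairs in data_list and df(w) is the number of distinct sentences containing w. A quick sanity check with illustrative numbers:

import math

num_docs = 8   # illustrative corpus size
doc_freq = 3   # illustrative document frequency of a word
idf = math.log(num_docs / (doc_freq + 1.)) / math.log(2.)  # log2(N / (df + 1))
print(idf)     # 1.0 for these numbers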
Example #15
def LevenshteinDistance(data_list):
    # Edit-distance-based fuzzy ratios, 4 features
    all_levenshtein_feature = []
    for data in tqdm(data_list):
        every_levenshtein = []
        levenshtein_ratio = fuzz.ratio(data[0], data[1]) / 100
        levenshtein_partial_ratio = fuzz.partial_ratio(data[0], data[1]) / 100
        levenshtein_token_sort_ratio = fuzz.token_sort_ratio(data[0],
                                                             data[1]) / 100
        levenshtein_set_ratio = fuzz.token_set_ratio(data[0], data[1]) / 100
        every_levenshtein.append(levenshtein_ratio)
        every_levenshtein.append(levenshtein_partial_ratio)
        every_levenshtein.append(levenshtein_token_sort_ratio)
        every_levenshtein.append(levenshtein_set_ratio)
        all_levenshtein_feature.append(every_levenshtein)
    LogUtil.log(
        "INFO",
        "LevenshteinDistance距离 get_feature_num = 4, len(all_levenshtein_feature)=%d"
        % len(all_levenshtein_feature))
    return all_levenshtein_feature
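fuzzywuzzy's fuzz.* helpers return integer scores in [0, 100], which is why each value above is divided by 100 to land in [0, 1]. A minimal check (English strings chosen purely for illustration):

from fuzzywuzzy import fuzz

print(fuzz.ratio("how to open huabei", "steps to open huabei") / 100)        # plain ratio
print(fuzz.token_sort_ratio("open huabei how", "how to open huabei") / 100)  # order-insensitive ratio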
Example #16
 def extract_all_features(self, data_list):
     all_data_rate = []
     num_least = 300
     for data in data_list:
         rate = [1.0]
         q1_words = set(data[0].split())
         q2_words = set(data[1].split())
         share_words = list(q1_words.intersection(q2_words))
         for word in share_words:
             if word not in self.pword_dict:
                 continue
             if self.pword_dict[word][0] * self.pword_dict[word][
                     5] < num_least:
                 continue
             rate[0] *= (1.0 - self.pword_dict[word][6])
         rate = [1 - num for num in rate]
         all_data_rate.append(rate)
     LogUtil.log(
         "INFO",
         "PowerfulWordDoubleSideRate get_feature_num = 1, len(all_data_rate)=%d"
         % len(all_data_rate))
     return all_data_rate
Example #17
    def extract_features(self, POStag_features_file):
        all_POStag_features = []
        for pos_data in self.all_POStag:
            POSTag_Sentence = pos_data.strip().split("\t")
            q1_vec = len(self.postag) * [0]
            q1_postag = POSTag_Sentence[0].split(" ")
            for s in q1_postag:
                postag_id = self.postag[s]
                q1_vec[postag_id] += 1
            q2_vec = len(self.postag) * [0]
            q2_postag = POSTag_Sentence[1].split(" ")
            for s in q2_postag:
                postag_id = self.postag[s]
                q2_vec[postag_id] += 1

            q1_vec = np.array(q1_vec)
            q2_vec = np.array(q2_vec)
            sum_vec = q1_vec + q2_vec
            sub_vec = abs(q1_vec - q2_vec)
            dot_vec = q1_vec.dot(q2_vec)
            q1_len = np.sqrt(q1_vec.dot(q1_vec))
            q2_len = np.sqrt(q2_vec.dot(q2_vec))
            cos_sim = 0.
            if q1_len * q2_len > 1e-6:
                cos_sim = dot_vec / q1_len / q2_len
            all_POStag_features.append(
                list(q1_vec) + list(q2_vec) + list(sum_vec) + list(sub_vec) + [np.sqrt(dot_vec), q1_len, q2_len,
                                                                               cos_sim])
        LogUtil.log("INFO",
                    "all_data_file POStag features Writer to stanford_postag_features_file done, len(all_POStag_features)=%d" % len(
                        all_POStag_features))
        assert len(all_POStag_features) == len(self.all_POStag), "POS tag feature count does not match the original data"
        # for line in all_POStag_features[300:405]:
        #     print(line)
        POStag_features_file_object = open(POStag_features_file, mode='wb')
        pk.dump(all_POStag_features, POStag_features_file_object)
        POStag_features_file_object.close()
        LogUtil.log("INFO",
                    "POStag features num=%d, POStag feature Writer file done " % (len(self.postag) * 4 + 4))  # 140个特征
Example #18
 def extract_all_features(self, data_list):
     all_data_rate = []
     num_least = 300
     for data in data_list:
         rate = [1.0]
         q1_words = data[0].split()
         q2_words = data[1].split()
         q1_diff = list(set(q1_words).difference(set(q2_words)))
         q2_diff = list(set(q2_words).difference(set(q1_words)))
         all_diff = set(q1_diff + q2_diff)
         for word in all_diff:
             if word not in self.pword_dict:
                 continue
             if self.pword_dict[word][0] * self.pword_dict[word][
                     3] < num_least:
                 continue
             rate[0] *= (1.0 - self.pword_dict[word][4])
         rate = [1 - num for num in rate]
         all_data_rate.append(rate)
     LogUtil.log(
         "INFO",
         "PowerfulWordOneSideRate get_feature_num = 1, len(all_data_rate)=%d"
         % len(all_data_rate))
     return all_data_rate
Example #19
def NgramDistance(data_list):
    # get_feature_num = 4*5 = 20
    distance_func = getattr(DistanceUtil, 'edit_dist')
    all_NgramDistance = []
    for data in tqdm(data_list):
        q1_words = data[0].split()
        q2_words = data[1].split()
        fs = list()
        aggregation_modes_outer = ["mean", "max", "min", "median"]
        aggregation_modes_inner = ["mean", "std", "max", "min", "median"]
        for n_ngram in range(1, 4):
            q1_ngrams = NgramUtil.ngrams(q1_words, n_ngram)
            q2_ngrams = NgramUtil.ngrams(q2_words, n_ngram)
            val_list = list()
            for w1 in q1_ngrams:
                _val_list = list()
                for w2 in q2_ngrams:
                    s = distance_func(w1, w2)
                    _val_list.append(s)
                if len(_val_list) == 0:
                    _val_list = [MISSING_VALUE_NUMERIC]
                val_list.append(_val_list)
            if len(val_list) == 0:
                val_list = [[MISSING_VALUE_NUMERIC]]

            for mode_inner in aggregation_modes_inner:
                tmp = list()
                for l in val_list:
                    tmp.append(MathUtil.aggregate(l, mode_inner))
                fs.extend(MathUtil.aggregate(tmp, aggregation_modes_outer))
        all_NgramDistance.append(fs)
    LogUtil.log(
        "INFO",
        "NgramDistance距离 NgramDistance get_feature_num = 4*5, len(all_NgramDistance)=%d"
        % len(all_NgramDistance))
    return all_NgramDistance
Example #20
    def generate_powerful_word(train_data_file):
        """
        计算数据中词语的影响力,格式如下:
            词语 --> [0. 出现语句对数量,1. 出现语句对比例,2. 正确语句对比例,3. 单侧语句对比例,4. 单侧语句对正确比例,5. 双侧语句对比例,6. 双侧语句对正确比例]
        """
        train_data = open(train_data_file, encoding="UTF-8",
                          mode='r').readlines()
        # Uses the pre-tokenized, labeled training data
        words_power = {}
        for data in train_data:
            # print(data)
            sens = data.strip().split("\t")
            label = int(sens[2])
            q1_words = sens[0].split()
            q2_words = sens[1].split()
            all_words = set(q1_words + q2_words)
            q1_words = set(q1_words)
            q2_words = set(q2_words)
            for word in all_words:
                if word not in words_power:
                    words_power[word] = [0. for i in range(7)]
                # Count pairs containing the word
                words_power[word][0] += 1.
                words_power[word][1] += 1.

                if ((word in q1_words) and
                    (word not in q2_words)) or ((word not in q1_words) and
                                                (word in q2_words)):
                    # Count one-side pairs (word appears in only one sentence)
                    words_power[word][3] += 1.
                    if 0 == label:
                        # Count correctly labeled pairs
                        words_power[word][2] += 1.
                        # Count correct one-side pairs
                        words_power[word][4] += 1.
                if (word in q1_words) and (word in q2_words):
                    # Count double-side pairs (word appears in both sentences)
                    words_power[word][5] += 1.
                    if 1 == label:
                        # Count correctly labeled pairs
                        words_power[word][2] += 1.
                        # Count correct double-side pairs
                        words_power[word][6] += 1.
        for word in words_power:
            # Ratio of pairs containing the word
            words_power[word][1] /= len(train_data)
            # Ratio of correctly labeled pairs
            words_power[word][2] /= words_power[word][0]
            # Correct ratio among one-side pairs
            if words_power[word][3] > 1e-6:
                words_power[word][4] /= words_power[word][3]
            # Ratio of one-side pairs
            words_power[word][3] /= words_power[word][0]
            # Correct ratio among double-side pairs
            if words_power[word][5] > 1e-6:
                words_power[word][6] /= words_power[word][5]
            # Ratio of double-side pairs
            words_power[word][5] /= words_power[word][0]
        sorted_words_power = sorted(words_power.items(),
                                    key=lambda d: d[1][0],
                                    reverse=True)
        LogUtil.log(
            "INFO", "power words calculation done, len(words_power)=%d" %
            len(sorted_words_power))
        return sorted_words_power
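A hedged sketch of how one entry of the returned list is read downstream; the word and numbers are invented, but the index layout follows the docstring above and the filters in init_powerful_word_dside (Example #5):

# Illustrative entry only: (word, [count, pair ratio, correct ratio, one-side ratio,
#                                  one-side correct ratio, double-side ratio, double-side correct ratio])
words_power = [("huabei", [500., 0.02, 0.7, 0.1, 0.4, 0.8, 0.9])]
word, stats = words_power[0]
# Example #5 keeps a word when count * double-side ratio >= thresh_num
# and its double-side correct ratio >= thresh_rate.
is_double_side_power = (stats[0] * stats[5] >= 300) and (stats[6] >= 0.8)
print(word, is_double_side_power)  # huabei True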