Example #1
 def __init__(self,
              rule,
              alone,
              max_df,
              min_df,
              max_features,
              window=None,
              use_idf=False):
     self.rule = rule
     self.alone = alone
     self.path = os.path.join(PATH, rule)
     if not self.alone:
         self.data = load_data(
             os.path.join(PATH, rule, rule + "_agent_tokens.csv"))
     else:
         self.data = load_data(
             os.path.join(PATH, rule, rule + "_tokens.csv"))
     self.tokens = "transData.sentenceList"  # name of the token column
     self.seed = 2018  # random seed used for sampling and shuffling
     self.Counter = None  # vectorizer, set later (e.g. in get_weight)
     self.max_df = max_df
     self.min_df = min_df
     self.max_features = max_features
     self.use_idf = use_idf
     self.window = window
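
Note: every example on this page calls a load_data helper that is not shown here. A minimal sketch of what it plausibly does, assuming it is a thin wrapper around pandas.read_csv (the exact signature and options are an assumption):

import pandas as pd

def load_data(path):
    # Hypothetical helper: the examples only require that it return a
    # DataFrame with a 'UUID' column and, for token files, a
    # 'transData.sentenceList' column of space-separated tokens.
    return pd.read_csv(path, sep=',', encoding='utf-8')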
Example #2
 def load_all_data(self):
     data1 = load_data(PATH1)
     print(data1.shape)
     data2 = load_data(PATH2)
     print(data2.shape)
     # merge the two DataFrames
     data = pd.concat([data1, data2])
     print(data.shape)
     del data1, data2
     data.drop_duplicates(['UUID'], inplace=True)
     data.reset_index(inplace=True)
     print(data.shape)
     data[['UUID', '']]
Example #3
    def makeToken(self):
        # thu1 = thulac.thulac(seg_only=True)  # word segmentation only, no POS tagging
        # thu1.cut_f("input.txt", "output.txt")  # segment input.txt and write the result to output.txt
        # jieba.load_userdict('setting/userdict1.txt')

        _content_prepath = os.path.join(self.path,
                                        self.content)  # ../../data/Content
        _files = os.listdir(_content_prepath)
        _files = [_ for _ in _files]  # all files
        _labels = [os.path.splitext(_)[0] for _ in _files]  # all violation labels

        for i, _file in enumerate(_files):
            print(i + 1, _labels[i])
            # if not os.path.exists(_token_prepath):
            #     os.makedirs(_token_prepath)
            file_name = _labels[i] + "_{}_sentences.csv".format(self.alone)
            token_name = _labels[i] + "_{}_tokens.csv".format(self.alone)
            data = load_data(
                os.path.join(_content_prepath, _labels[i], file_name))
            data['transData.sentenceList'] = data['transData.sentenceList'].\
                apply(lambda x: ' '.join([word for word in jieba.cut(x) if word not in [' ']]))
            data.to_csv(os.path.join(_content_prepath, _labels[i], token_name),
                        sep=',',
                        encoding="utf-8",
                        index=False)
        print('Make Tokens of all files completed')
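
Note: Example #3 relies on jieba.cut for Chinese word segmentation and joins the tokens with spaces so that the downstream vectorizer (Example #12) can split on whitespace. A stand-alone sketch of that step (the sample sentence is made up):

import jieba

sentence = "您好,请问有什么可以帮您?"
# jieba.cut returns a generator of tokens; bare spaces are dropped as in the example
tokens = [word for word in jieba.cut(sentence) if word not in [' ']]
print(' '.join(tokens))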
Example #4
 def makeContents(self):
     _files = os.listdir(os.path.join(self.path, self.content))
     _files = [_ + '.csv' for _ in _files]  # all files
     _labels = [os.path.splitext(_)[0] for _ in _files]  # all violation labels
     print(_files)
     print(_labels)
     for i, _file in enumerate(_files):
         print(i + 1, _labels[i])
         prepath = os.path.join(self.path, self.content,
                                _labels[i])  # ../../data/Content/XXX
         _file_df = load_data(os.path.join(
             prepath, _file))  # ../../data/Content/XXX/XXX.csv
         if 'transData.sentenceList' not in _file_df.columns:
             continue
         _file_df['transData.sentenceList'] = _file_df['transData.sentenceList'].apply(eval)\
             .apply(lambda x: get_sentences(x, self.alone))
         if not self.alone:
             file_name = _labels[i] + "_agent_sentences.csv"  # XXX_agent_sentences.csv
         else:
             file_name = _labels[i] + "_sentences.csv"  # XXX_sentences.csv
         _file_df.to_csv(os.path.join(prepath, file_name),
                         sep=',',
                         encoding="utf-8",
                         index=False)
         del _file_df
     print('save all contents completed!')
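
Note: Examples #4 and #10 apply a get_sentences helper after eval-ing the stored sentence list. The helper is not shown; judging from the commented-out code at the end of Example #10, each element is a dict with 'role' and 'content' keys, so a plausible, purely hypothetical reconstruction is:

def get_sentences(sentence_list, alone):
    # Hypothetical reconstruction: keep only AGENT utterances when alone is
    # True, otherwise keep every turn prefixed with its speaker role.
    # Joined with spaces here so the result stays a single text field.
    if alone:
        parts = [s['content'] for s in sentence_list if s['role'] == 'AGENT']
    else:
        parts = ['{}:{}'.format(s['role'], s['content']) for s in sentence_list]
    return ' '.join(parts)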
Example #5
    def load_test(self, test_file):
        test_uuid = pd.read_csv(os.path.join('../../data/Sample',
                                             test_file + ".txt"),
                                header=None)
        rules = os.listdir(PATH)
        rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
        if not self.alone:
            suffix = "_agent_tokens.csv"
        else:
            suffix = "_tokens.csv"
        test_data = pd.DataFrame()
        for rule in rules:
            _ = load_data(os.path.join(PATH, rule, rule + suffix))
            test_data = pd.concat([test_data, _], axis=0)

        # test-set sample space
        test_data.drop_duplicates(['UUID'], inplace=True)
        test_data.reset_index(inplace=True)
        print(len(test_data))
        self.data = test_data[test_data['UUID'].isin(test_uuid.values[:, 0])]
        self.data.reset_index(drop=True, inplace=True)
        if self.window:
            print("window:", self.window)
            key_words = []
            with open(os.path.join('../setting', self.rule + ".txt"),
                      'r',
                      encoding='utf-8') as f:
                for line in f.readlines():
                    key_words.append(line.strip())
            self.data[self.tokens] = self.data[self.tokens].apply(
                lambda x: get_window_words(x, key_words, windows=self.window))
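
Note: Examples #5, #6 and #12 post-process the token column with get_window_words, keeping only tokens near the keywords listed in ../setting/<rule>.txt. The helper is not shown on this page; the following is a hedged sketch of one plausible implementation (the parameter name follows the calls above, the behaviour is an assumption):

def get_window_words(tokens, key_words, windows=5):
    # Hypothetical: `tokens` is a space-separated string (see Example #3);
    # keep every token within `windows` positions of any keyword hit.
    words = tokens.split()
    keep = set()
    for i, w in enumerate(words):
        if w in key_words:
            keep.update(range(max(0, i - windows), min(len(words), i + windows + 1)))
    return ' '.join(words[i] for i in sorted(keep))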
Example #6
    def load_train(self, test_file, only):
        """
        采样训练集数据,首先将出现在测试集中的数据去除
        然后使用剩余数据的所有正样本,采样相同数量的负样本
        将训练集 UUID 保存在 self.path 路径中
        :param test_file: 测试集UUID文件
        :param only: only 为 True,负样本仅从不出现任何违规的数据中提取
        """
        # 已经提取过训练集,直接加载返回
        if only:
            file_name = self.rule + "_train_only_" + test_file + ".csv"
        else:
            file_name = self.rule + "_train_" + test_file + ".csv"
        if os.path.exists(os.path.join(self.path, test_file[:-1], file_name)):
            self.data = load_data(
                os.path.join(self.path, test_file[:-1], file_name))
            return

        print("sample train data...")
        test_uuid = pd.read_csv(os.path.join('../../data/Sample',
                                             test_file + ".txt"),
                                header=None)
        self.data = self.sample(test_uuid, only=only)
        self.data.reset_index(drop=True, inplace=True)
        if self.window:
            print("window:", self.window)
            key_words = []
            with open(os.path.join('../setting', self.rule + ".txt"),
                      'r',
                      encoding='utf-8') as f:
                for line in f.readlines():
                    key_words.append(line.strip())
            self.data[self.tokens] = self.data[self.tokens].apply(
                lambda x: get_window_words(x, key_words, windows=self.window))
        print(len(self.data))
Example #7
    def sample(self, test_uuid, only=True):
        """
        采样训练集数据,首先将出现在测试集中的数据去除
        然后使用剩余数据的所有正样本,采样相同数量的负样本
        将训练集 UUID 保存在 self.path 路径中
        :param test_uuid: 测试集数据 UUID
        :param only: only 为 True,负样本仅从不出现任何违规的数据中提取
        :return: 训练集
        """
        self.data = self.data[~self.data['UUID'].isin(test_uuid.values[:, 0])]
        print("pos data:", len(self.data))
        print("是否只从不出现任何违规的数据中采集负样本: " + str(only))
        # 负样本从不出现任何违规的数据中提取
        if only:
            if not self.alone:
                file_name = os.path.join(PATH, "不违规", "不违规_agent_tokens.csv")
            else:
                file_name = os.path.join(PATH, "不违规", "不违规_tokens.csv")
            neg_data = load_data(file_name)
        else:
            rules = os.listdir(PATH)
            rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
            rules.remove(self.rule)
            if not self.alone:
                suffix = "_agent_tokens.csv"
            else:
                suffix = "_tokens.csv"
            neg_data = pd.DataFrame()
            for rule in rules:
                _ = load_data(os.path.join(PATH, rule, rule + suffix))
                neg_data = pd.concat([neg_data, _], axis=0)

        # negative-sample space
        neg_data.drop_duplicates(['UUID'], inplace=True)
        neg_data = neg_data[~neg_data['UUID'].isin(test_uuid.values[:, 0])]
        neg_data = neg_data[~neg_data['UUID'].isin(self.data['UUID'])]
        print("neg data:", len(neg_data))

        train_data = pd.concat([self.data, neg_data], axis=0)
        # train_data = pd.concat([self.data, neg_data.sample(n=len(self.data) * 2, random_state=self.seed)], axis=0)
        train_data = train_data.sample(frac=1, random_state=self.seed)
        return train_data
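
Note: the docstring of sample promises an equal number of negative samples, but as written the method concatenates the whole negative pool (the downsampling line is commented out). A minimal sketch of the balanced variant the docstring describes, assuming plain pandas sampling:

import pandas as pd

def balance(pos_df, neg_df, seed=2018):
    # Downsample the negatives to the size of the positive set, then shuffle.
    neg_sample = neg_df.sample(n=min(len(pos_df), len(neg_df)), random_state=seed)
    train = pd.concat([pos_df, neg_sample], axis=0)
    return train.sample(frac=1, random_state=seed).reset_index(drop=True)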
Example #8
def load_all_data(alone=False):
    rules = os.listdir(PATH)
    rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
    if not alone:
        suffix = "_agent_tokens.csv"
    else:
        suffix = "_tokens.csv"
    all_data = pd.DataFrame()
    for rule in rules:
        _ = load_data(os.path.join(PATH, rule, rule + suffix))
        all_data = pd.concat([all_data, _], axis=0)

    # sample space
    all_data.drop_duplicates(['UUID'], inplace=True)
    all_data.reset_index(inplace=True)
    return all_data
Example #9
    def load_corpus(self):
        rules = os.listdir(PATH)
        rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
        if not self.alone:
            suffix = "_agent_tokens.csv"
        else:
            suffix = "_tokens.csv"
        data = pd.DataFrame()
        for rule in rules:
            _ = load_data(os.path.join(PATH, rule, rule + suffix))
            data = pd.concat([data, _], axis=0)

        # sample space
        data.drop_duplicates(['UUID'], inplace=True)
        data.reset_index(inplace=True)
        print("All corpus size:", len(data))
        self.data = data
Example #10
 def makeContents(self):
     _files = os.listdir(os.path.join(self.path, self.content))
     _files = [_ + '.csv' for _ in _files]  # all files
     _labels = [os.path.splitext(_)[0] for _ in _files]  # all violation labels
     print(_files)
     print(_labels)
     for i, _file in enumerate(_files):
         print(i + 1, _labels[i])
         prepath = os.path.join(self.path, self.content,
                                _labels[i])  # ../../data/Content/XXX
         _file_df = load_data(os.path.join(
             prepath, _file))  # ../../data/Content/XXX/XXX.csv
         if 'transData.sentenceList' not in _file_df.columns:
             continue
         _file_df['transData.sentenceList'] = _file_df['transData.sentenceList'].apply(eval)\
             .apply(lambda x: get_sentences(x, self.alone))
         if not self.alone:
             file_name = _labels[i] + "_agent_sentences.csv"  # XXX_agent_sentences.csv
         else:
             file_name = _labels[i] + "_sentences.csv"  # XXX_sentences.csv
         _file_df.to_csv(os.path.join(prepath, file_name),
                         sep=',',
                         encoding="utf-8",
                         index=False)
         del _file_df
         # for _id in range(len(_file_df)):
         #     uuid = _file_df['UUID'][_id]
         #     sentenceList = _file_df['transData.sentenceList'][_id]
         #     if not self.alone:
         #         _contents = ['{}:{}'.format(_['role'], _['content']) for _ in sentenceList]
         #     else:
         #         _contents = ['{}'.format(_['content']) for _ in sentenceList if _['role'] == 'AGENT']
         #     contens = '\n'.join(_contents)
         #     save_file(contens, os.path.join(prepath, '{}-{}.txt'.format(uuid, _labels[i])))
     print('save all contents completed!')
Example #11
                all_rules[l].append(i)
    for key, value in all_rules.items():
        indices.extend(
            random.sample(value,
                          int(float(len(value) * random_rate)) + 1))
    indices = sorted(indices)
    print(len(indices))
    uuid = data['UUID'][indices]
    uuid.to_csv(os.path.join(PATH, file_name),
                sep=',',
                encoding="utf-8",
                index=False)


if __name__ == "__main__":
    data1 = load_data(PATH1)
    print(data1.shape)
    data2 = load_data(PATH2)
    print(data2.shape)
    data3 = load_data(PATH3)
    print(data3.shape)
    data4 = load_data(PATH4)
    print(data4.shape)
    data = pd.concat([data1, data2, data3, data4])
    print(data.shape)
    del data1, data2, data3, data4
    data.drop_duplicates(['UUID'], inplace=True)
    data.reset_index(inplace=True)
    print(data.shape)

    for i in range(5):
Example #12
    def get_weight(self, test_file, only, total=False, train=True):
        # if total and not os.path.exists(os.path.join(self.path, test_file[:-1], "Vectorizer_total_ngram_1_2.pkl")):
        if total and not os.path.exists(
                os.path.join("../../data", "Vectorizer_total_ngram_1_2.pkl")):
            print("generate vocabulary...")
            rules = os.listdir(PATH)
            rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
            if not self.alone:
                suffix = "_agent_tokens.csv"
            else:
                suffix = "_tokens.csv"
            total_data = pd.DataFrame()
            for rule in rules:
                _ = load_data(os.path.join(PATH, rule, rule + suffix))
                total_data = pd.concat([total_data, _], axis=0)

            # sample space
            total_data.drop_duplicates(['UUID'], inplace=True)
            total_data.reset_index(inplace=True)

            if self.window:
                print("window:", self.window)
                key_words = []
                with open(os.path.join('../setting', self.rule + ".txt"),
                          'r',
                          encoding='utf-8') as f:
                    for line in f.readlines():
                        key_words.append(line.strip())
                    total_data[self.tokens] = total_data[self.tokens].apply(
                        lambda x: get_window_words(
                            x, key_words, windows=self.window))
            print("fitting in data: ", total_data.shape)
            self.Counter = TfidfVectorizer(max_df=self.max_df,
                                           min_df=self.min_df,
                                           use_idf=True,
                                           max_features=self.max_features,
                                           ngram_range=(1, 2))
            # self.Counter = CountVectorizer(max_df=self.max_df, min_df=self.min_df,
            #                                max_features=self.max_features)
            self.Counter.fit(total_data[self.tokens])
            if not os.path.exists(os.path.join(self.path, test_file[:-1])):
                os.mkdir(os.path.join(self.path, test_file[:-1]))
            if not os.path.exists(os.path.join(self.path,
                                               'sample_proportion')):
                os.mkdir(os.path.join(self.path, 'sample_proportion'))
            pickle.dump(
                self.Counter,
                open(
                    os.path.join(self.path, test_file[:-1],
                                 "Vectorizer_total_ngram_1_2.pkl"), 'wb'))
            pickle.dump(
                self.Counter,
                open(
                    os.path.join(self.path, 'sample_proportion',
                                 "Vectorizer_total_ngram_1_2.pkl"), 'wb'))

        if train:
            if not os.path.exists(os.path.join(self.path, test_file[:-1])):
                os.makedirs(os.path.join(self.path, test_file[:-1]))
            print("load train data...")
            # build the names of the files to save
            if only:
                file_name = self.rule + "_train_weight_only_" + test_file + ".pkl"
                label_name = self.rule + "_train_label_only_" + test_file + ".npy"
                if not total:
                    pickle_file = "CountVectorizer_" + test_file + "_only" + ".pkl"
                else:
                    pickle_file = "Vectorizer_total_ngram_1_2.pkl"
            else:
                file_name = self.rule + "_train_weight_" + test_file + ".pkl"
                label_name = self.rule + "_train_label_" + test_file + ".npy"
                if not total:
                    pickle_file = "CountVectorizer_" + test_file + ".pkl"
                else:
                    pickle_file = "Vectorizer_total_ngram_1_2.pkl"

            # generate the feature file if it does not exist yet
            # ../../data/Sample/rule/sample/label_name
            if not os.path.exists(
                    os.path.join(self.path, test_file[:-1], label_name)):
                self.load_train(test_file, only)

                # if os.path.exists(os.path.join(self.path, test_file[:-1], pickle_file)):
                if os.path.exists(os.path.join("../../data", pickle_file)):
                    print("load counter_vectorizer...")
                    self.Counter = pickle.load(
                        open(os.path.join("../../data", pickle_file), 'rb'))
                    # self.Counter = pickle.load(open(os.path.join(self.path, test_file[:-1], pickle_file), 'rb'))
                else:
                    print("fitting in data: ", self.data.shape)
                    self.Counter = TfidfVectorizer(
                        max_df=self.max_df,
                        min_df=self.min_df,
                        use_idf=True,
                        max_features=self.max_features,
                        ngram_range=(1, 1))
                    # self.Counter = CountVectorizer(max_df=self.max_df, min_df=self.min_df,
                    #                                max_features=self.max_features)
                    self.Counter.fit(self.data[self.tokens])
                    pickle.dump(
                        self.Counter,
                        open(
                            os.path.join(self.path, test_file[:-1],
                                         pickle_file), 'wb'))

                print("get label...")
                self.get_label(
                    os.path.join(self.path, test_file[:-1], label_name))
                print("get weight...")
                token_counter = self.Counter.transform(
                    self.data['transData.sentenceList'].values)
                print(len(self.Counter.vocabulary_.items()))
                weight = token_counter.toarray()
                print(weight.shape)
                pickle.dump(
                    token_counter,
                    open(os.path.join(self.path, test_file[:-1], file_name),
                         'wb'))

        # test-set features
        else:
            print("load test data...")
            # build the names of the files to save
            if only:
                file_name = "test_weight_only_" + test_file + ".pkl"
                label_name = "test_label_only_" + test_file + ".npy"
                if not total:
                    pickle_file = "CountVectorizer_" + test_file + "_only" + ".pkl"
                else:
                    pickle_file = "Vectorizer_total_ngram_1_2.pkl"
                    file_name = "test_weight_" + test_file + ".pkl"
                    label_name = "test_label_" + test_file + ".npy"
            else:
                file_name = "test_weight_" + test_file + ".pkl"
                label_name = "test_label_" + test_file + ".npy"
                if not total:
                    pickle_file = "CountVectorizer_" + test_file + ".pkl"
                else:
                    pickle_file = "Vectorizer_total_ngram_1_2.pkl"

            if not os.path.exists(os.path.join(TEST_PATH, self.rule)):
                os.mkdir(os.path.join(TEST_PATH, self.rule))
            # generate the test-set feature file if it does not exist yet
            if not os.path.exists(
                    os.path.join(TEST_PATH, self.rule, label_name)):
                self.Counter = pickle.load(
                    open(os.path.join("../../data", pickle_file), 'rb'))
                # self.Counter = pickle.load(open(os.path.join(self.path, test_file[:-1], pickle_file), 'rb'))
                self.load_test(test_file)

                print("get label...")
                self.get_label(os.path.join(TEST_PATH, self.rule, label_name))

                print("get weight...")
                token_counter = self.Counter.transform(
                    self.data['transData.sentenceList'].values)
                weight = token_counter.toarray()
                print(weight.shape)
                pickle.dump(
                    token_counter,
                    open(os.path.join(TEST_PATH, self.rule, file_name), 'wb'))
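
Note: behind all the bookkeeping, Example #12 boils down to fitting a scikit-learn TfidfVectorizer on the token column, pickling it, and reusing it to transform train and test data. A condensed, self-contained sketch of that round-trip (the documents and the file name are placeholders):

import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = pd.Series(["打 开 会员 服务", "取消 会员 服务", "查询 订单 状态"])

vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, use_idf=True,
                             max_features=5000, ngram_range=(1, 2))
vectorizer.fit(docs)

# persist and reload the fitted vectorizer, as the example does with pickle
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

weight = vectorizer.transform(docs)  # sparse document-term matrix
print(len(vectorizer.vocabulary_), weight.shape)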
Example #13
    not_data = data.drop(index)
    del data
    if mode == 'train':
        not_data = not_data.sample(len(_data.index))
    labels.extend([0] * len(not_data.index))
    data = pd.concat([_data, not_data])
    data['label'] = labels
    return data


if not os.path.exists('sample'):
    os.mkdir('sample')
_path = '../../data/Content'
dirs = os.listdir(_path)
for i in dirs:
    _trainpath = os.path.join(_path, i, '{}_train.csv'.format(i))
    _testpath = os.path.join(_path, i, '{}_test.csv'.format(i))
    train = load_data(_trainpath)
    tempnum = train.shape[0]
    train = set_label(train, i)
    assert train.shape[0] == tempnum
    if os.path.exists(_testpath):
        test = load_data(_testpath)
        test.drop(['content', 'mark_tag'], axis=1, inplace=True)
        test['label'] = 0
        train = pd.concat((train, test)).drop_duplicates('UUID')
        train.reset_index(inplace=True)
        train.drop('index', axis=1, inplace=True)
    uuids = np.array(train.index)
    for j in range(3):
        np.random.shuffle(uuids)
    train.iloc[uuids][['UUID', 'label']].\
        to_csv('sample/{}.csv'.format(i), index=False, encoding='utf-8')