def __init__(self, rule, alone, max_df, min_df, max_features, window=None, use_idf=False):
    self.rule = rule
    self.alone = alone
    self.path = os.path.join(PATH, rule)
    if not self.alone:
        self.data = load_data(
            os.path.join(PATH, rule, rule + "_agent_tokens.csv"))
    else:
        self.data = load_data(
            os.path.join(PATH, rule, rule + "_tokens.csv"))
    self.tokens = "transData.sentenceList"
    self.seed = 2018
    self.Counter = None
    self.max_df = max_df
    self.min_df = min_df
    self.max_features = max_features
    self.use_idf = use_idf
    self.window = window
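# Usage sketch: the class name `TokenDataSet` and the argument values below are hypothetical,
# only illustrating how this constructor is meant to be called. max_df / min_df / max_features
# are later handed to the TfidfVectorizer in get_weight, and `window` restricts tokens to a
# window around keyword hits before vectorising.
# dataset = TokenDataSet(rule="某违规", alone=False, max_df=0.8, min_df=2,
#                        max_features=20000, window=5)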
def load_all_data(self):
    data1 = load_data(PATH1)
    print(data1.shape)
    data2 = load_data(PATH2)
    print(data2.shape)
    # merge the two data sources
    data = pd.concat([data1, data2])
    print(data.shape)
    del (data2, data1)
    data.drop_duplicates(['UUID'], inplace=True)
    data.reset_index(inplace=True)
    print(data.shape)
    data[['UUID', '']]
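# load_data is used throughout this module but defined elsewhere. A minimal sketch of what it
# presumably does, assuming it is a thin wrapper around pandas.read_csv for the UTF-8 CSVs
# written by makeToken / makeContents (pd is the module's pandas import):
def load_data(path):
    """Read one of the intermediate CSV files into a DataFrame."""
    return pd.read_csv(path, sep=',', encoding='utf-8')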
def makeToken(self):
    # thu1 = thulac.thulac(seg_only=True)  # segmentation only, no POS tagging
    # thu1.cut_f("input.txt", "output.txt")  # segment input.txt and write the result to output.txt
    # jieba.load_userdict('setting/userdict1.txt')
    _content_prepath = os.path.join(self.path, self.content)  # ../../data/Content
    _files = os.listdir(_content_prepath)
    _files = [_ for _ in _files]  # all files
    _labels = [os.path.splitext(_)[0] for _ in _files]  # all violation labels
    for i, _file in enumerate(_files):
        print(i + 1, _labels[i])
        # if not os.path.exists(_token_prepath):
        #     os.makedirs(_token_prepath)
        file_name = _labels[i] + "_{}_sentences.csv".format(self.alone)
        token_name = _labels[i] + "_{}_tokens.csv".format(self.alone)
        data = load_data(
            os.path.join(_content_prepath, _labels[i], file_name))
        data['transData.sentenceList'] = data['transData.sentenceList'].apply(
            lambda x: ' '.join([word for word in jieba.cut(x) if word not in [' ']]))
        data.to_csv(os.path.join(_content_prepath, _labels[i], token_name),
                    sep=',', encoding="utf-8", index=False)
    print('Make Tokens of all files completed')
def makeContents(self):
    _files = os.listdir(os.path.join(self.path, self.content))
    _files = [_ + '.csv' for _ in _files]  # all files
    _labels = [os.path.splitext(_)[0] for _ in _files]  # all violation labels
    print(_files)
    print(_labels)
    for i, _file in enumerate(_files):
        print(i + 1, _labels[i])
        prepath = os.path.join(self.path, self.content, _labels[i])  # ../../data/Content/XXX
        _file_df = load_data(os.path.join(prepath, _file))  # ../../data/Content/XXX/XXX.csv
        if 'transData.sentenceList' not in _file_df.columns:
            continue
        _file_df['transData.sentenceList'] = _file_df['transData.sentenceList'].apply(eval)\
            .apply(lambda x: get_sentences(x, self.alone))
        if not self.alone:
            file_name = _labels[i] + "_agent_sentences.csv"  # XXX_agent_sentences.csv
        else:
            file_name = _labels[i] + "_sentences.csv"  # XXX_sentences.csv
        _file_df.to_csv(os.path.join(prepath, file_name),
                        sep=',', encoding="utf-8", index=False)
        del _file_df
    print('save all contents completed!')
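# get_sentences is not defined in this section. Based on the commented-out variant kept in the
# second makeContents further below (role:content pairs, AGENT-only when `alone` is True), a
# sketch of its presumed behaviour; the exact separator and formatting are assumptions:
def get_sentences(sentence_list, alone):
    """Flatten a parsed sentenceList (list of {'role', 'content'} dicts) into one string."""
    if not alone:
        parts = ['{}:{}'.format(s['role'], s['content']) for s in sentence_list]
    else:
        parts = [s['content'] for s in sentence_list if s['role'] == 'AGENT']
    return ' '.join(parts)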
def load_test(self, test_file):
    test_uuid = pd.read_csv(os.path.join('../../data/Sample', test_file + ".txt"),
                            header=None)
    rules = os.listdir(PATH)
    rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
    if not self.alone:
        suffix = "_agent_tokens.csv"
    else:
        suffix = "_tokens.csv"
    test_data = pd.DataFrame()
    for rule in rules:
        _ = load_data(os.path.join(PATH, rule, rule + suffix))
        test_data = pd.concat([test_data, _], axis=0)  # test-set sample space
    test_data.drop_duplicates(['UUID'], inplace=True)
    test_data.reset_index(inplace=True)
    print(len(test_data))
    self.data = test_data[test_data['UUID'].isin(test_uuid.values[:, 0])]
    self.data.reset_index(drop=True, inplace=True)
    if self.window:
        print("window:", self.window)
        key_words = []
        with open(os.path.join('../setting', self.rule + ".txt"), 'r', encoding='utf-8') as f:
            for line in f.readlines():
                key_words.append(line.strip())
        self.data[self.tokens] = self.data[self.tokens].apply(
            lambda x: get_window_words(x, key_words, windows=self.window))
def load_train(self, test_file, only):
    """
    Sample the training data: first remove records that appear in the test set,
    then keep all remaining positive samples and sample the same number of
    negative samples. The training-set UUIDs are saved under self.path.
    :param test_file: file of test-set UUIDs
    :param only: if True, negative samples are drawn only from records with no violation at all
    """
    # if the training set has already been extracted, load it directly and return
    if only:
        file_name = self.rule + "_train_only_" + test_file + ".csv"
    else:
        file_name = self.rule + "_train_" + test_file + ".csv"
    if os.path.exists(os.path.join(self.path, test_file[:-1], file_name)):
        self.data = load_data(
            os.path.join(self.path, test_file[:-1], file_name))
        return
    print("sample train data...")
    test_uuid = pd.read_csv(os.path.join('../../data/Sample', test_file + ".txt"),
                            header=None)
    self.data = self.sample(test_uuid, only=only)
    self.data.reset_index(drop=True, inplace=True)
    if self.window:
        print("window:", self.window)
        key_words = []
        with open(os.path.join('../setting', self.rule + ".txt"), 'r', encoding='utf-8') as f:
            for line in f.readlines():
                key_words.append(line.strip())
        self.data[self.tokens] = self.data[self.tokens].apply(
            lambda x: get_window_words(x, key_words, windows=self.window))
    print(len(self.data))
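# get_window_words is referenced above (and again in get_weight) but not defined in this
# section. A minimal sketch of what it presumably does, assuming tokens arrive as a
# whitespace-separated string and that only words within `windows` positions of a keyword
# hit are kept; the signature mirrors the call sites, the body is an illustrative assumption:
def get_window_words(tokens, key_words, windows=5):
    """Keep only the tokens that fall inside a window around any keyword occurrence."""
    words = tokens.split()
    keep = set()
    for idx, word in enumerate(words):
        if word in key_words:
            for j in range(max(0, idx - windows), min(len(words), idx + windows + 1)):
                keep.add(j)
    return ' '.join(words[j] for j in sorted(keep))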
def sample(self, test_uuid, only=True):
    """
    Sample the training data: first remove records that appear in the test set,
    then keep all remaining positive samples and sample the same number of
    negative samples. The training-set UUIDs are saved under self.path.
    :param test_uuid: UUIDs of the test-set records
    :param only: if True, negative samples are drawn only from records with no violation at all
    :return: the training set
    """
    self.data = self.data[~self.data['UUID'].isin(test_uuid.values[:, 0])]
    print("pos data:", len(self.data))
    print("Sample negatives only from violation-free records: " + str(only))
    # draw negative samples only from the "不违规" (no-violation) data
    if only:
        if not self.alone:
            file_name = os.path.join(PATH, "不违规", "不违规_agent_tokens.csv")
        else:
            file_name = os.path.join(PATH, "不违规", "不违规_tokens.csv")
        neg_data = load_data(file_name)
    else:
        rules = os.listdir(PATH)
        rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
        rules.remove(self.rule)
        if not self.alone:
            suffix = "_agent_tokens.csv"
        else:
            suffix = "_tokens.csv"
        neg_data = pd.DataFrame()
        for rule in rules:
            _ = load_data(os.path.join(PATH, rule, rule + suffix))
            neg_data = pd.concat([neg_data, _], axis=0)  # negative sample space
        neg_data.drop_duplicates(['UUID'], inplace=True)
        neg_data = neg_data[~neg_data['UUID'].isin(test_uuid.values[:, 0])]
        neg_data = neg_data[~neg_data['UUID'].isin(self.data['UUID'])]
    print("neg data:", len(neg_data))
    train_data = pd.concat([self.data, neg_data], axis=0)
    # train_data = pd.concat([self.data, neg_data.sample(n=len(self.data) * 2, random_state=self.seed)], axis=0)
    train_data = train_data.sample(frac=1, random_state=self.seed)
    return train_data
def load_all_data(alone=False):
    rules = os.listdir(PATH)
    rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
    if not alone:
        suffix = "_agent_tokens.csv"
    else:
        suffix = "_tokens.csv"
    all_data = pd.DataFrame()
    for rule in rules:
        _ = load_data(os.path.join(PATH, rule, rule + suffix))
        all_data = pd.concat([all_data, _], axis=0)  # full sample space
    all_data.drop_duplicates(['UUID'], inplace=True)
    all_data.reset_index(inplace=True)
    return all_data
def load_corpus(self):
    rules = os.listdir(PATH)
    rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
    if not self.alone:
        suffix = "_agent_tokens.csv"
    else:
        suffix = "_tokens.csv"
    data = pd.DataFrame()
    for rule in rules:
        _ = load_data(os.path.join(PATH, rule, rule + suffix))
        data = pd.concat([data, _], axis=0)  # full sample space
    data.drop_duplicates(['UUID'], inplace=True)
    data.reset_index(inplace=True)
    print("All corpus size:", len(data))
    self.data = data
def makeContents(self):
    _files = os.listdir(os.path.join(self.path, self.content))
    _files = [_ + '.csv' for _ in _files]  # all files
    _labels = [os.path.splitext(_)[0] for _ in _files]  # all violation labels
    print(_files)
    print(_labels)
    for i, _file in enumerate(_files):
        print(i + 1, _labels[i])
        prepath = os.path.join(self.path, self.content, _labels[i])  # ../../data/Content/XXX
        _file_df = load_data(os.path.join(prepath, _file))  # ../../data/Content/XXX/XXX.csv
        if 'transData.sentenceList' not in _file_df.columns:
            continue
        _file_df['transData.sentenceList'] = _file_df['transData.sentenceList'].apply(eval)\
            .apply(lambda x: get_sentences(x, self.alone))
        if not self.alone:
            file_name = _labels[i] + "_agent_sentences.csv"  # XXX_agent_sentences.csv
        else:
            file_name = _labels[i] + "_sentences.csv"  # XXX_sentences.csv
        _file_df.to_csv(os.path.join(prepath, file_name),
                        sep=',', encoding="utf-8", index=False)
        del _file_df
        # earlier variant that dumped each conversation into a per-UUID text file:
        # for _id in range(len(_file_df)):
        #     uuid = _file_df['UUID'][_id]
        #     sentenceList = _file_df['transData.sentenceList'][_id]
        #     if not self.alone:
        #         _contents = ['{}:{}'.format(_['role'], _['content']) for _ in sentenceList]
        #     else:
        #         _contents = ['{}'.format(_['content']) for _ in sentenceList if _['role'] == 'AGENT']
        #     contens = '\n'.join(_contents)
        #     save_file(contens, os.path.join(prepath, '{}-{}.txt'.format(uuid, _labels[i])))
    print('save all contents completed!')
        all_rules[l].append(i)
    for key, value in all_rules.items():
        indices.extend(
            random.sample(value, int(float(len(value) * random_rate)) + 1))
    indices = sorted(indices)
    print(len(indices))
    uuid = data['UUID'][indices]
    uuid.to_csv(os.path.join(PATH, file_name), sep=',', encoding="utf-8", index=False)


if __name__ == "__main__":
    data1 = load_data(PATH1)
    print(data1.shape)
    data2 = load_data(PATH2)
    print(data2.shape)
    data3 = load_data(PATH3)
    print(data3.shape)
    data4 = load_data(PATH4)
    print(data4.shape)
    data = pd.concat([data1, data2, data3, data4])
    print(data.shape)
    del (data1, data2, data3, data4)
    data.drop_duplicates(['UUID'], inplace=True)
    data.reset_index(inplace=True)
    print(data.shape)
    for i in range(5):
def get_weight(self, test_file, only, total=False, train=True):
    # if total and not os.path.exists(os.path.join(self.path, test_file[:-1], "Vectorizer_total_ngram_1_2.pkl")):
    if total and not os.path.exists(
            os.path.join("../../data", "Vectorizer_total_ngram_1_2.pkl")):
        print("generate vocabulary...")
        rules = os.listdir(PATH)
        rules = [os.path.splitext(_)[0] for _ in rules]  # all violation labels
        if not self.alone:
            suffix = "_agent_tokens.csv"
        else:
            suffix = "_tokens.csv"
        total_data = pd.DataFrame()
        for rule in rules:
            _ = load_data(os.path.join(PATH, rule, rule + suffix))
            total_data = pd.concat([total_data, _], axis=0)  # full sample space
        total_data.drop_duplicates(['UUID'], inplace=True)
        total_data.reset_index(inplace=True)
        if self.window:
            print("window:", self.window)
            key_words = []
            with open(os.path.join('../setting', self.rule + ".txt"), 'r', encoding='utf-8') as f:
                for line in f.readlines():
                    key_words.append(line.strip())
            total_data[self.tokens] = total_data[self.tokens].apply(
                lambda x: get_window_words(x, key_words, windows=self.window))
        print("fitting in data: ", total_data.shape)
        self.Counter = TfidfVectorizer(max_df=self.max_df,
                                       min_df=self.min_df,
                                       use_idf=True,
                                       max_features=self.max_features,
                                       ngram_range=(1, 2))
        # self.Counter = CountVectorizer(max_df=self.max_df, min_df=self.min_df,
        #                                max_features=self.max_features)
        self.Counter.fit(total_data[self.tokens])
        if not os.path.exists(os.path.join(self.path, test_file[:-1])):
            os.mkdir(os.path.join(self.path, test_file[:-1]))
        if not os.path.exists(os.path.join(self.path, 'sample_proportion')):
            os.mkdir(os.path.join(self.path, 'sample_proportion'))
        pickle.dump(
            self.Counter,
            open(os.path.join(self.path, test_file[:-1],
                              "Vectorizer_total_ngram_1_2.pkl"), 'wb'))
        pickle.dump(
            self.Counter,
            open(os.path.join(self.path, 'sample_proportion',
                              "Vectorizer_total_ngram_1_2.pkl"), 'wb'))
    if train:
        if not os.path.exists(os.path.join(self.path, test_file[:-1])):
            os.makedirs(os.path.join(self.path, test_file[:-1]))
        print("load train data...")
        # build the output file names
        if only:
            file_name = self.rule + "_train_weight_only_" + test_file + ".pkl"
            label_name = self.rule + "_train_label_only_" + test_file + ".npy"
            if not total:
                pickle_file = "CountVectorizer_" + test_file + "_only" + ".pkl"
            else:
                pickle_file = "Vectorizer_total_ngram_1_2.pkl"
        else:
            file_name = self.rule + "_train_weight_" + test_file + ".pkl"
            label_name = self.rule + "_train_label_" + test_file + ".npy"
            if not total:
                pickle_file = "CountVectorizer_" + test_file + ".pkl"
            else:
                pickle_file = "Vectorizer_total_ngram_1_2.pkl"
        # feature file does not exist yet, so generate it
        # ../../data/Sample/rule/sample/label_name
        if not os.path.exists(
                os.path.join(self.path, test_file[:-1], label_name)):
            self.load_train(test_file, only)
            # if os.path.exists(os.path.join(self.path, test_file[:-1], pickle_file)):
            if os.path.exists(os.path.join("../../data", pickle_file)):
                print("load counter_vectorizer...")
                self.Counter = pickle.load(
                    open(os.path.join("../../data", pickle_file), 'rb'))
                # self.Counter = pickle.load(open(os.path.join(self.path, test_file[:-1], pickle_file), 'rb'))
            else:
                print("fitting in data: ", self.data.shape)
                self.Counter = TfidfVectorizer(max_df=self.max_df,
                                               min_df=self.min_df,
                                               use_idf=True,
                                               max_features=self.max_features,
                                               ngram_range=(1, 1))
                # self.Counter = CountVectorizer(max_df=self.max_df, min_df=self.min_df,
                #                                max_features=self.max_features)
                self.Counter.fit(self.data[self.tokens])
                pickle.dump(
                    self.Counter,
                    open(os.path.join(self.path, test_file[:-1], pickle_file), 'wb'))
            print("get label...")
            self.get_label(os.path.join(self.path, test_file[:-1], label_name))
            print("get weight...")
            token_counter = self.Counter.transform(
                self.data['transData.sentenceList'].values)
            print(len(self.Counter.vocabulary_.items()))
            weight = token_counter.toarray()
            print(weight.shape)
            pickle.dump(
                token_counter,
                open(os.path.join(self.path, test_file[:-1], file_name), 'wb'))
    # test-set features
    else:
        print("load test data...")
        # build the output file names
        if only:
            file_name = "test_weight_only_" + test_file + ".pkl"
            label_name = "test_label_only_" + test_file + ".npy"
            if not total:
                pickle_file = "CountVectorizer_" + test_file + "_only" + ".pkl"
            else:
                pickle_file = "Vectorizer_total_ngram_1_2.pkl"
            file_name = "test_weight_" + test_file + ".pkl"
            label_name = "test_label_" + test_file + ".npy"
        else:
            file_name = "test_weight_" + test_file + ".pkl"
            label_name = "test_label_" + test_file + ".npy"
            if not total:
                pickle_file = "CountVectorizer_" + test_file + ".pkl"
            else:
                pickle_file = "Vectorizer_total_ngram_1_2.pkl"
        if not os.path.exists(os.path.join(TEST_PATH, self.rule)):
            os.mkdir(os.path.join(TEST_PATH, self.rule))
        # test-set feature file does not exist yet, so generate it
        if not os.path.exists(
                os.path.join(TEST_PATH, self.rule, label_name)):
            self.Counter = pickle.load(
                open(os.path.join("../../data", pickle_file), 'rb'))
            # self.Counter = pickle.load(open(os.path.join(self.path, test_file[:-1], pickle_file), 'rb'))
            self.load_test(test_file)
            print("get label...")
            self.get_label(os.path.join(TEST_PATH, self.rule, label_name))
            print("get weight...")
            token_counter = self.Counter.transform(
                self.data['transData.sentenceList'].values)
            weight = token_counter.toarray()
            print(weight.shape)
            pickle.dump(
                token_counter,
                open(os.path.join(TEST_PATH, self.rule, file_name), 'wb'))
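# Downstream usage sketch: how the artefacts written by get_weight would typically be read
# back before training a classifier. The file names below are hypothetical examples of the
# naming scheme above, and np.load assumes get_label saves its label array with numpy.save:
import pickle
import numpy as np

weight_file = "rule_train_weight_sample1.pkl"  # sparse matrix from TfidfVectorizer.transform
label_file = "rule_train_label_sample1.npy"    # label array written by get_label
X_train = pickle.load(open(weight_file, 'rb'))
y_train = np.load(label_file)
print(X_train.shape, y_train.shape)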
    not_data = data.drop(index)
    del (data)
    if mode == 'train':
        not_data = not_data.sample(len(_data.index))
    labels.extend([0] * len(not_data.index))
    data = pd.concat([_data, not_data])
    data['label'] = labels
    return data


if not os.path.exists('sample'):
    os.mkdir('sample')

_path = '../../data/Content'
dirs = os.listdir(_path)
for i in dirs:
    _trainpath = os.path.join(_path, i, '{}_train.csv'.format(i))
    _testpath = os.path.join(_path, i, '{}_test.csv'.format(i))
    train = load_data(_trainpath)
    tempnum = train.shape[0]
    train = set_label(train, i)
    assert train.shape[0] == tempnum
    if os.path.exists(_testpath):
        test = load_data(_testpath)
        test.drop(['content', 'mark_tag'], axis=1, inplace=True)
        test['label'] = 0
        train = pd.concat((train, test)).drop_duplicates('UUID')
    train.reset_index(inplace=True)
    train.drop('index', axis=1, inplace=True)
    uuids = np.array(train.index)
    for j in range(3):
        np.random.shuffle(uuids)
    train.iloc[uuids][['UUID', 'label']].\
        to_csv('sample/{}.csv'.format(i), index=False, encoding='utf-8')