class MLData(object):
    """Dataset wrapper for classic ML models.

    Loads train/test CSVs, segments the text, removes stop words, maps
    label names to integer ids, and exposes feature matrices built from
    tfidf, word2vec, or fasttext embeddings.
    """

    def __init__(self, debug_mode=False, train_mode=True):
        """Initialize the dataset.

        Args:
            debug_mode: if True, preprocessing keeps only a small sample
                of the data for faster iteration.
            train_mode: if False, only load embeddings and skip data
                preprocessing.
        """
        self.debug_mode = debug_mode
        self.em = Embedding()
        self.em.load()
        if train_mode:
            self.preprocessor()

    def preprocessor(self):
        """Load raw data, segment, drop stop words, map labels to ids."""
        logger.info('load data ... ')
        self.train = pd.read_csv(config.root_path + '/data/train.csv',
                                 sep=',', names=['label', 'text']).dropna()
        self.dev = pd.read_csv(config.root_path + '/data/test.csv',
                               sep=',', names=['label', 'text']).dropna()
        if self.debug_mode:
            self.train = self.train.sample(n=1000).reset_index(drop=True)
            self.dev = self.dev.sample(n=100).reset_index(drop=True)
        # Word segmentation.
        self.train['queryCut'] = self.train['text'].apply(query_cut)
        self.dev['queryCut'] = self.dev['text'].apply(query_cut)
        # Remove stop words.
        self.train['queryCutRMStopWord'] = self.train['queryCut'].apply(
            lambda x: [word for word in x if word not in self.em.stopWords])
        self.dev['queryCutRMStopWord'] = self.dev['queryCut'].apply(
            lambda x: [word for word in x if word not in self.em.stopWords])
        # Load the label -> id mapping if it exists, otherwise build and
        # persist it from the labels seen in the training set.
        label_path = config.root_path + '/data/label2id.json'
        if os.path.exists(label_path):
            # BUGFIX: use a context manager instead of json.load(open(...)),
            # which leaked the file handle.
            with open(label_path, encoding='utf-8') as f:
                labelNameToIndex = json.load(f)
        else:
            labelName = self.train['label'].unique()
            labelIndex = list(range(len(labelName)))
            labelNameToIndex = dict(zip(labelName, labelIndex))
            with open(label_path, 'w', encoding='utf-8') as f:
                json.dump({k: v for k, v in labelNameToIndex.items()}, f)
        self.train['labelIndex'] = self.train['label'].map(labelNameToIndex)
        self.dev['labelIndex'] = self.dev['label'].map(labelNameToIndex)

    def process_data(self, method='word2vec'):
        """Build the feature/label splits used by sklearn models.

        Args:
            method: one of 'tfidf', 'word2vec', 'fasttext'.

        Returns:
            (x_train, x_test, y_train, y_test).
        """
        x_train = self.get_features(self.train, method)
        x_test = self.get_features(self.dev, method)
        y_train = self.train['labelIndex']
        y_test = self.dev['labelIndex']
        return x_train, x_test, y_train, y_test

    def get_features(self, data, method='word2vec'):
        """Build a feature matrix from preprocessed data.

        Args:
            data: DataFrame with a 'queryCutRMStopWord' column of token lists.
            method: one of 'tfidf', 'word2vec', 'fasttext'.

        Returns:
            tfidf sparse matrix, or a stacked ndarray of sentence vectors.

        Raises:
            NotImplementedError: for any unknown method.
        """
        if method == 'tfidf':
            data = [' '.join(query) for query in data['queryCutRMStopWord']]
            return self.em.tfidf.transform(data)
        elif method == 'word2vec':
            return np.vstack(data['queryCutRMStopWord'].apply(
                lambda x: wam(x, self.em.w2v)[0]))
        elif method == 'fasttext':
            return np.vstack(data['queryCutRMStopWord'].apply(
                lambda x: wam(x, self.em.fast)[0]))
        else:
            # BUGFIX: the original instantiated NotImplementedError without
            # raising it, silently returning None for unknown methods.
            raise NotImplementedError(method)
def __init__(self, debug_mode=False):
    """Set up the ML dataset: load embeddings, then preprocess the data.

    Args:
        debug_mode: when True, downstream preprocessing keeps only a
            small sample of the data.
    """
    self.debug_mode = debug_mode
    self.em = Embedding()
    self.em.load()
    self.preprocessor()
class MLData(object):
    """Dataset wrapper for classic ML models (tsv variant).

    Loads train/dev TSVs, concatenates title and description into a text
    column, segments it, removes stop words, maps label names to integer
    ids, and exposes feature matrices (tfidf / word2vec / fasttext).
    """

    def __init__(self, debug_mode=False, train_mode=True):
        """Initialize the ML dataset.

        Args:
            debug_mode: if True, preprocessing keeps only a small sample
                of the data.
            train_mode: if False, only load embeddings and skip the data
                preprocessing step.
        """
        self.debug_mode = debug_mode
        self.em = Embedding()
        self.em.load()
        if train_mode:
            self.preprocessor()

    def preprocessor(self):
        """Preprocess data: segment text, transform labels to ids."""
        logger.info('load data')
        self.train = pd.read_csv(config.root_path + '/data/train.tsv',
                                 sep='\t').dropna()
        self.dev = pd.read_csv(config.root_path + '/data/dev.tsv',
                               sep='\t').dropna()
        if self.debug_mode:
            self.train = self.train.sample(n=1000).reset_index(drop=True)
            self.dev = self.dev.sample(n=100).reset_index(drop=True)
        # Concatenate title and description into a single text field.
        self.train["text"] = self.train['title'] + self.train['desc']
        self.dev["text"] = self.dev['title'] + self.dev['desc']
        # Word segmentation.
        self.train["queryCut"] = self.train["text"].apply(query_cut)
        self.dev["queryCut"] = self.dev["text"].apply(query_cut)
        # Filter out stop words.
        self.train["queryCutRMStopWord"] = self.train["queryCut"].apply(
            lambda x: [word for word in x if word not in self.em.stopWords])
        self.dev["queryCutRMStopWord"] = self.dev["queryCut"].apply(
            lambda x: [word for word in x if word not in self.em.stopWords])
        # Load the label -> id mapping if it was persisted earlier,
        # otherwise build it from the training labels and save it.
        label_path = config.root_path + '/data/label2id.json'
        if os.path.exists(label_path):
            # BUGFIX: use a context manager instead of json.load(open(...)),
            # which leaked the file handle.
            with open(label_path, encoding='utf-8') as f:
                labelNameToIndex = json.load(f)
        else:
            labelName = self.train['label'].unique()  # all label names
            labelIndex = list(range(len(labelName)))  # all label ids
            labelNameToIndex = dict(zip(labelName, labelIndex))
            with open(label_path, 'w', encoding='utf-8') as f:
                json.dump({k: v for k, v in labelNameToIndex.items()}, f)
        # Map label names to ids for both splits.
        self.train["labelIndex"] = self.train['label'].map(labelNameToIndex)
        self.dev["labelIndex"] = self.dev['label'].map(labelNameToIndex)

    def process_data(self, method='word2vec'):
        """Generate data used for sklearn.

        Args:
            method: three options - word2vec, fasttext, tfidf.

        Returns:
            X_train: features of the train set.
            X_test: features of the test set.
            y_train: labels of the train set.
            y_test: labels of the test set.
        """
        X_train = self.get_feature(self.train, method)
        X_test = self.get_feature(self.dev, method)
        y_train = self.train["labelIndex"]
        y_test = self.dev["labelIndex"]
        return X_train, X_test, y_train, y_test

    def get_feature(self, data, method='word2vec'):
        """Generate a feature matrix.

        Args:
            data: DataFrame with a 'queryCutRMStopWord' column of token lists.
            method: three options - word2vec, fasttext, tfidf.

        Returns:
            The corresponding feature matrix.

        Raises:
            NotImplementedError: for any unknown method.
        """
        if method == 'tfidf':
            data = [' '.join(query) for query in data["queryCutRMStopWord"]]
            return self.em.tfidf.transform(data)
        elif method == 'word2vec':
            return np.vstack(data['queryCutRMStopWord'].apply(
                lambda x: wam(x, self.em.w2v)[0]))
        elif method == 'fasttext':
            return np.vstack(data['queryCutRMStopWord'].apply(
                lambda x: wam(x, self.em.fast)[0]))
        else:
            # BUGFIX: the original instantiated NotImplementedError without
            # raising it, silently returning None for unknown methods.
            raise NotImplementedError(method)
def __init__(self, debug_mode=False, train_mode=True):
    """Initialise the embeddings; run preprocessing only in train mode.

    Args:
        debug_mode: when True, downstream preprocessing keeps only a
            small sample of the data.
        train_mode: when False, skip data preprocessing entirely.
    """
    self.debug_mode = debug_mode
    self.em = Embedding()
    self.em.load()
    if train_mode:
        self.preprocessor()