def encode_label():
    """Re-encode raw uids as integer labels and pickle the uid/pid tables.

    Loads the raw train/test data, fits a single ``LabelEncoder`` on the
    uids (column 0) of both splits, and pairs the encoded uids with the
    values of column 1 (stored under the key ``'pid'``).

    Side effects:
        Pickles the two resulting frames under
        ``setting.processed_data_dir`` as ``uid&pid_train`` and
        ``uid&pid_test``.

    Returns:
        Tuple ``(x, y)``: the train and test DataFrames, each with the
        columns ``'uid'`` and ``'pid'``.
    """
    raw_train, raw_test = access_data.load_raw_data()
    train_uids = raw_train[0].values
    test_uids = raw_test[0].values

    # Fit on both splits together so a uid maps to the same code in each.
    encoder = LabelEncoder()
    encoder.fit(list(train_uids) + list(test_uids))

    x = pd.DataFrame({
        'uid': encoder.transform(train_uids),
        'pid': raw_train[1].values,
    })
    y = pd.DataFrame({
        'uid': encoder.transform(test_uids),
        'pid': raw_test[1].values,
    })

    x.to_pickle(setting.processed_data_dir + 'uid&pid_train')
    y.to_pickle(setting.processed_data_dir + 'uid&pid_test')
    return x, y
def clean_corpus():
    """Clean the raw corpus text and extract per-symbol count features.

    The corpus is taken from column 6 of the raw train data and column 3
    of the raw test data. Each regex in the pattern list is replaced by a
    single space, and the number of replacements made in each row is kept
    as a feature column named after the pattern string itself.

    Returns:
        Tuple ``(train, test)`` of DataFrames, each holding the cleaned
        ``'corpus'`` column followed by one count column per pattern.
    """
    raw_train, raw_test = access_data.load_raw_data()
    train = pd.DataFrame({'corpus': raw_train[6]})
    test = pd.DataFrame({'corpus': raw_test[3]})

    # Patterns run in list order on the already-cleaned text, so the URL
    # and '//@' patterns see their '@' / '#' before the bare-symbol
    # patterns can consume them.
    patterns = [
        r'http[0-9a-zA-Z?:=._@%/\-#&\+|]+',
        r'//@',
        r'@',
        r'#',
        r'【',
        r'《',
        r'\[',
    ]
    for pattern in patterns:
        reg = re.compile(pattern)
        for frame in (train, test):
            # subn returns (new_text, n_substitutions) for every row.
            replaced = frame['corpus'].map(lambda text: reg.subn(' ', text))
            frame['corpus'] = replaced.map(lambda pair: pair[0])
            frame[pattern] = replaced.map(lambda pair: pair[1])
        logging.info('finished cleaning symbol %s' % pattern)
    return train, test
def encode_label():
    """Map the raw uids to integer codes and save the uid/pid pairs.

    One ``LabelEncoder`` is fitted on the uids (column 0) of the train
    and test data combined. The encoded uids are then paired with
    column 1 of the same split under the keys ``'uid'`` and ``'pid'``.

    Side effects:
        Writes both frames with ``to_pickle`` into
        ``setting.processed_data_dir`` (``uid&pid_train`` and
        ``uid&pid_test``).

    Returns:
        Tuple ``(x, y)`` with the train frame first and the test frame
        second.
    """
    splits = access_data.load_raw_data()
    train, test = splits

    # Train uids first, then test uids, all fed to one shared encoder.
    encoder = LabelEncoder()
    encoder.fit([uid for split in (train, test) for uid in split[0].values])

    x, y = (
        pd.DataFrame({
            'uid': encoder.transform(split[0].values),
            'pid': split[1].values,
        })
        for split in (train, test)
    )

    train_path = setting.processed_data_dir + 'uid&pid_train'
    test_path = setting.processed_data_dir + 'uid&pid_test'
    x.to_pickle(train_path)
    y.to_pickle(test_path)
    return x, y
def clean_corpus():
    """Remove URLs and marker symbols from the corpus, counting each kind.

    Uses column 6 of the raw train data and column 3 of the raw test data
    as the ``'corpus'`` text. For every pattern, matches are replaced by
    a space and the per-row match count is stored in a column whose name
    is the pattern string.

    Returns:
        Tuple ``(train, test)`` of DataFrames with the cleaned
        ``'corpus'`` column plus one count column per pattern.
    """

    def _strip_and_count(frame, column, reg):
        # Run subn once per row, then split the (text, count) tuples.
        results = frame['corpus'].map(lambda text: reg.subn(' ', text))
        frame['corpus'] = results.map(lambda res: res[0])
        frame[column] = results.map(lambda res: res[1])

    train, test = access_data.load_raw_data()
    train = pd.DataFrame({'corpus': train[6]})
    test = pd.DataFrame({'corpus': test[3]})

    # Applied sequentially; each pattern works on the output of the
    # previous one.
    keyword = [
        r'http[0-9a-zA-Z?:=._@%/\-#&\+|]+',
        r'//@',
        r'@',
        r'#',
        r'【',
        r'《',
        r'\[',
    ]
    for symbol in keyword:
        compiled = re.compile(symbol)
        _strip_and_count(train, symbol, compiled)
        _strip_and_count(test, symbol, compiled)
        logging.info('finished cleaning symbol %s' % symbol)
    return train, test