def x_2func(traindir, testdir):
    # traindir/testdir are unused here; the one-off chi-square computation
    # below is commented out because its result is already cached on disk.
    '''
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    category_tokens = get_category_tokens()
    token_x = x_2(train, sta, category_tokens)
    nb.write(token_x, 'token_x')
    '''
    category = nb.read('category_nb_eventmodel')
    tokens_x = nb.read('token_x')
    category_convert = nb.convert(category)
    tokens_all_x = set()
    x_category = {}
    for i in range(10):
        # Rank this category's tokens by chi-square score, descending,
        # and keep the top 100.
        tokens = sorted(tokens_x[i], key=tokens_x[i].get, reverse=True)
        x_category[i] = tokens[:100]
        # Alternative: keep every token whose chi-square score exceeds 10.83,
        # the critical value at significance level 0.001 (1 degree of freedom):
        '''
        x_category[i] = []
        for word in tokens:
            if tokens_x[i][word] > 10.83:
                x_category[i].append(word)
            else:
                break
        '''
        print(len(x_category[i]))
        tokens_all_x = tokens_all_x | set(x_category[i])
        print(len(tokens_all_x))
    nb.write(x_category, 'x_category')
    nb.write(tokens_all_x, 'tokens_all_x')
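
# For reference, a minimal sketch of how a per-(term, category) chi-square
# score could be computed from document counts. This is NOT the project's
# x_2() implementation (defined elsewhere); the contingency-table argument
# names (n11, n10, n01, n00) are assumptions for illustration only.
def chi_square_sketch(n11, n10, n01, n00):
    # n11: docs in category c containing term t
    # n10: docs outside c containing t
    # n01: docs in c not containing t
    # n00: docs outside c not containing t
    n = n11 + n10 + n01 + n00
    num = n * (n11 * n00 - n10 * n01) ** 2
    den = (n11 + n10) * (n01 + n00) * (n11 + n01) * (n10 + n00)
    return float(num) / den if den else 0.0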

def dffunc():
    # Document frequency first: for each term t, count in how many documents
    # of a given category t appears. The one-off computation below is
    # commented out because its result is already cached on disk.
    '''
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    all_tokens = get_all_tokens()
    tokens_df = df(train, sta, all_tokens)
    nb.write(tokens_df, 'tokens_df')
    '''
    category = nb.read('category_nb_eventmodel')
    tokens_df = nb.read('tokens_df')
    category_convert = nb.convert(category)
    tokens_all_df = set()
    df_category = {}
    for i in range(10):
        # Note: tokens_df is a single global map (not indexed by category),
        # so this sort is invariant across iterations and every category
        # receives the same top-200 list.
        tokens = sorted(tokens_df, key=tokens_df.get, reverse=True)
        df_category[i] = tokens[:200]
        tokens_all_df = tokens_all_df | set(df_category[i])
        print(tokens_all_df)
    nb.write(df_category, 'df_category')
    nb.write(tokens_all_df, 'tokens_all_df')
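
# A minimal sketch of document-frequency counting, assuming each training
# document is available as a list of tokens grouped by category. The df()
# helper used above is defined elsewhere; `docs_by_category` here is a
# hypothetical {category: [token_list, ...]} structure.
def df_sketch(docs_by_category):
    from collections import defaultdict
    tokens_df = defaultdict(int)
    for docs in docs_by_category.values():
        for doc in docs:
            # Count each term at most once per document.
            for t in set(doc):
                tokens_df[t] += 1
    return dict(tokens_df)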

def mi_func():
    # The one-off mutual-information computation below is commented out
    # because its result is already cached on disk.
    '''
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    category_tokens = get_category_tokens()
    token_mi = mi(train, sta, category_tokens)
    nb.write(token_mi, 'token_mi')
    '''
    category = nb.read('category_nb_eventmodel')
    token_mi = nb.read('token_mi')
    category_convert = nb.convert(category)
    tokens_all_mi = set()
    mi_category = {}
    for i in range(10):
        # Rank this category's tokens by mutual information, descending,
        # and keep the top 500.
        tokens = sorted(token_mi[i], key=token_mi[i].get, reverse=True)
        mi_category[i] = tokens[:500]
        tokens_all_mi = tokens_all_mi | set(mi_category[i])
        print(len(tokens_all_mi))
    nb.write(mi_category, 'mi_category')
    nb.write(tokens_all_mi, 'tokens_all_mi')
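
# A minimal sketch of (pointwise) mutual information between term t and
# category c, computed from the same contingency counts as the chi-square
# sketch above. This is an assumption about what mi() computes, not its
# actual implementation.
import math

def mi_sketch(n11, n10, n01, n00):
    n = n11 + n10 + n01 + n00
    if n11 == 0:
        return 0.0
    # MI(t, c) = log( P(t, c) / (P(t) * P(c)) )
    return math.log(float(n11) * n / ((n11 + n10) * (n11 + n01)))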

def gifunc():
    # One-off computation of per-token gi scores, written to disk and then
    # read back (the read mirrors the cached-read pattern of the functions
    # above, though it is redundant right after the write).
    train = nb.read('train_nb_eventmodel')
    sta = nb.sta_count(train)
    all_tokens = get_all_tokens()
    tokens_gi = gi(train, sta, all_tokens)
    nb.write(tokens_gi, 'tokens_gi')
    category = nb.read('category_nb_eventmodel')
    tokens_gi = nb.read('tokens_gi')
    category_convert = nb.convert(category)
    tokens_all_gi = set()
    gi_category = {}
    for i in range(10):
        # Note: tokens_gi is a single global map, so this sort is invariant
        # across iterations and every category gets the same top-100 list.
        tokens = sorted(tokens_gi, key=tokens_gi.get, reverse=True)
        gi_category[i] = tokens[:100]
        tokens_all_gi = tokens_all_gi | set(gi_category[i])
        print(tokens_all_gi)
    nb.write(gi_category, 'gi_category')
    nb.write(tokens_all_gi, 'tokens_all_gi')
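
# The gi() scorer is defined elsewhere; assuming it stands for information
# gain, a minimal sketch over per-category document counts might look like
# this. Both argument names are hypothetical: `n_c` maps category -> total
# docs, `n_tc` maps category -> docs containing term t.
def info_gain_sketch(n_c, n_tc):
    n = float(sum(n_c.values()))
    n_t = float(sum(n_tc.values()))

    def entropy(counts, total):
        h = 0.0
        for cnt in counts:
            if cnt > 0 and total > 0:
                p = cnt / total
                h -= p * math.log(p)
        return h

    # IG(t) = H(C) - P(t) * H(C|t) - P(not t) * H(C|not t)
    h_c = entropy(n_c.values(), n)
    h_c_t = entropy(n_tc.values(), n_t)
    h_c_not_t = entropy([n_c[c] - n_tc.get(c, 0) for c in n_c], n - n_t)
    return h_c - (n_t / n) * h_c_t - ((n - n_t) / n) * h_c_not_t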

def preprocess():
    traindir = './data/training'
    testdir = './data/test'
    # Selected vocabulary from the chi-square step; read here but not
    # used below.
    tokens_all_x = nb.read('tokens_all_x')
    train_x, train_y, category = nb.func2(traindir)
    train_x = np.array(train_x)
    train_y = np.array(train_y)
    result, test_x, test_file = nb.func3(testdir, category)
    test_x = np.array(test_x)
    test_file = np.array(test_file)
    nb.write(train_x, 'train_x')
    nb.write(train_y, 'train_y')
    nb.write(category, 'category')
    nb.write(result, 'result')
    nb.write(test_x, 'test_x')
    nb.write(test_file, 'test_file')
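
# A hedged sketch of how these steps could be chained together; the call
# order is inferred from the cached reads/writes above, and the final
# classifier entry points (nb.train / nb.predict) are hypothetical names,
# not confirmed parts of the nb module.
if __name__ == '__main__':
    x_2func('./data/training', './data/test')  # select features via chi-square
    preprocess()                               # vectorize train/test sets
    train_x = nb.read('train_x')
    train_y = nb.read('train_y')
    test_x = nb.read('test_x')
    # model = nb.train(train_x, train_y)       # hypothetical classifier calls
    # predictions = nb.predict(model, test_x)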