def gen_train_samps(self, train_samp_params):
    fname_samps_train = train_samp_params['samp_dir'] + os.sep + 'train.samp'
    fname_term_set_train = train_samp_params['term_set_dir'] + os.sep + 'term.set'
    doc_class_list_train, doc_str_token_train, doc_terms_list_train, term_set_train = \
        self.gen_doc_terms_list(train_samp_params, train_opt=1)
    pytc.save_term_set(term_set_train, fname_term_set_train)
    # unigram tokens, passed separately to build the other rule features
    doc_uni_token_train = pytc.gen_N_gram(doc_str_token_train, 'uni')
    term_dict = dict(zip(term_set_train, range(1, len(term_set_train) + 1)))
    class_dict = dict(zip(train_samp_params['class_name'],
                          range(1, 1 + len(train_samp_params['class_name']))))
    if self.term_weight == 'TFIDF':
        doc_num_train = len(doc_class_list_train)
        df_term_train = pytc.stat_df_term(term_set_train, doc_terms_list_train)
        idf_term_train = pytc.stat_idf_term(doc_num_train, df_term_train)
    else:
        idf_term_train = []
    train_embeddings = []
    print "building training samps......"
    samp_list_train, class_list_train = pytc.build_samps(
        term_dict, class_dict, doc_class_list_train, doc_terms_list_train,
        doc_uni_token_train, self.term_weight, self.rule_feature,
        idf_term_train, train_embeddings)
    print "saving training samps......"
    pytc.save_samps(samp_list_train, class_list_train, fname_samps_train)
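# Example usage, a minimal sketch: the exact key set of train_samp_params is an
# assumption pieced together from the lookups above and in gen_doc_terms_list
# (samp_dir, term_set_dir, raw_dir, token, class_name), and `builder` stands in
# for an instance of the enclosing class:
#
#   train_samp_params = {
#       'samp_dir': 'samps',                  # train.samp is written here
#       'term_set_dir': 'samps',              # term.set is written here
#       'raw_dir': 'corpus' + os.sep + 'train_data',
#       'token': ['pos_fenci', 'neg_fenci'],  # tokenized input files
#       'class_name': ['neg', 'pos'],
#   }
#   builder.gen_train_samps(train_samp_params)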
def gen_test_samps(self, test_samp_params):
    fname_term_set = test_samp_params['term_set_dir'] + os.sep + 'term.set'
    fname_samps_test = test_samp_params['samp_dir'] + os.sep + 'test.samp'
    # if not os.path.isfile(fname_term_set):
    #     print "cant find term set file."
    #     return
    if not os.path.exists(test_samp_params['raw_dir']):
        print "test dir does not exist."
        os.mkdir(test_samp_params['raw_dir'])
    doc_class_list_test, doc_str_token_test, doc_terms_list_test, term_set_test = \
        self.gen_doc_terms_list(test_samp_params, train_opt=0)
    # unigram tokens, used to build the other rule features
    doc_uni_token_test = pytc.gen_N_gram(doc_str_token_test, 'uni')
    term_set_train = pytc.load_term_set(fname_term_set)
    term_dict = dict(zip(term_set_train, range(1, len(term_set_train) + 1)))
    class_dict = dict(zip(test_samp_params['class_name'],
                          range(1, 1 + len(test_samp_params['class_name']))))
    class_dict['test'] = 0  # docs labeled 'test' (unlabeled) map to class id 0
    if self.term_weight == 'TFIDF':
        doc_num_test = len(doc_class_list_test)
        df_term_test = pytc.stat_df_term(term_set_test, doc_terms_list_test)
        idf_term_test = pytc.stat_idf_term(doc_num_test, df_term_test)
    else:
        idf_term_test = []
    test_embeddings = []
    print "building testing samps......"
    samp_list_test, class_list_test = pytc.build_samps(
        term_dict, class_dict, doc_class_list_test, doc_terms_list_test,
        doc_uni_token_test, self.term_weight, self.rule_feature,
        idf_term_test, test_embeddings)
    print "saving testing samps......"
    pytc.save_samps(samp_list_test, class_list_test, fname_samps_test)
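# Example usage, a sketch under the same assumptions as the training example
# above; 'term_set_dir' must point at the directory holding the term.set saved
# by gen_train_samps, so that test feature indices line up with training:
#
#   test_samp_params = dict(train_samp_params,
#                           raw_dir='corpus' + os.sep + 'test_data',
#                           token=['test_fenci'])
#   builder.gen_test_samps(test_samp_params)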
input_dir = 'corpus' + os.sep + 'unlabeled_data'
# input_dir = 'corpus' + os.sep + 'book2'
pos_fenci_lines = [re.sub('#.*?#', '', x.strip())
                   for x in open(input_dir + os.sep + 'pos_fenci').readlines()]
neg_fenci_lines = [re.sub('#.*?#', '', x.strip())
                   for x in open(input_dir + os.sep + 'neg_fenci').readlines()]
# balance the two classes: shuffle the positive side and truncate it to the
# size of the negative side
random.shuffle(pos_fenci_lines)
pos_fenci_lines = pos_fenci_lines[:len(neg_fenci_lines)]
for ngram in ngram_list:
    pos_terms_list = pytc.gen_N_gram(pos_fenci_lines, ngram)
    neg_terms_list = pytc.gen_N_gram(neg_fenci_lines, ngram)
    term_set = gen_term_set(pos_terms_list + neg_terms_list)
    term_pos_freq = {}.fromkeys(term_set, 0)  # term frequency (or document frequency) of each term in the positive class
    term_neg_freq = {}.fromkeys(term_set, 0)  # term frequency (or document frequency) of each term in the negative class
    pos_df = len(pos_terms_list)
    neg_df = len(neg_terms_list)
    print "pos_df=", pos_df
    print "neg_df=", neg_df
    for lst in pos_terms_list:
        if metric == 'DF':
            # for DF, count each term at most once per document
            lst = list(set(lst))
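# The re.sub('#.*?#', '', ...) calls above strip inline #...# annotation spans
# non-greedily before any n-grams are generated. A quick illustration with a
# made-up line:
#
#   >>> re.sub('#.*?#', '', 'good #tag1# movie #tag2# !')
#   'good  movie  !'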
def gen_doc_terms_list(self, samp_params, train_opt=0):
    doc_class_list = []
    # per-gram-size n-gram lists and term sets
    doc_token_list, token_set = {}, {}
    doc_pos_list, pos_set = {}, {}
    doc_tag_list, tag_set = {}, {}
    doc_character_list, character_set = {}, {}
    doc_str_token, doc_class_list = pytc.read_annotated_data(
        [samp_params['raw_dir'] + os.sep + x for x in samp_params['token']],
        samp_params['class_name'])
    if len(self.pos_gram.keys()) > 0:
        doc_str_pos, doc_class_list = pytc.read_annotated_data(
            [samp_params['raw_dir'] + os.sep + x for x in samp_params['pos']],
            samp_params['class_name'])
    if len(self.tag_gram.keys()) > 0:
        doc_str_tag, doc_class_list = pytc.read_annotated_data(
            [samp_params['raw_dir'] + os.sep + x for x in samp_params['tag']],
            samp_params['class_name'])
    for gram_key in ['uni', 'bis', 'tri', 'quat', 'five', 'six']:
        if self.token_gram.has_key(gram_key):
            doc_token_list[gram_key] = pytc.gen_N_gram(doc_str_token, gram_key)
            token_set[gram_key] = pytc.get_term_set(doc_token_list[gram_key])
            params = self.token_gram[gram_key]
            # feature selection is only applied in the training procedure
            if train_opt == 1 and len(doc_token_list[gram_key]) > 0:
                print "token " + gram_key, "len(token_set)=", len(token_set[gram_key])
                token_set[gram_key] = pytc.feature_selection_all(
                    doc_token_list[gram_key], doc_class_list,
                    samp_params['class_name'], token_set[gram_key],
                    params['fs_opt'], params['df'],
                    params['fs_method'], params['fs_num'])
                print "token " + gram_key, "after feature selection, len(token_set) =", len(token_set[gram_key])
        if self.pos_gram.has_key(gram_key):
            doc_pos_list[gram_key] = pytc.gen_N_gram(doc_str_pos, gram_key)
            pos_set[gram_key] = pytc.get_term_set(doc_pos_list[gram_key])
            params = self.pos_gram[gram_key]
            # only in the training procedure
            if train_opt == 1 and len(doc_pos_list[gram_key]) > 0:
                print "pos " + gram_key, "len(pos_set)=", len(pos_set[gram_key])
                pos_set[gram_key] = pytc.feature_selection_all(
                    doc_pos_list[gram_key], doc_class_list,
                    samp_params['class_name'], pos_set[gram_key],
                    params['fs_opt'], params['df'],
                    params['fs_method'], params['fs_num'])
                print "pos " + gram_key, "after feature selection, len(pos_set) =", len(pos_set[gram_key])
        if self.tag_gram.has_key(gram_key):
            doc_tag_list[gram_key] = pytc.gen_N_gram(doc_str_tag, gram_key)
            tag_set[gram_key] = pytc.get_term_set(doc_tag_list[gram_key])
            params = self.tag_gram[gram_key]
            # only in the training procedure
            if train_opt == 1 and len(doc_tag_list[gram_key]) > 0:
                print "tag " + gram_key, "len(tag_set)=", len(tag_set[gram_key])
                tag_set[gram_key] = pytc.feature_selection_all(
                    doc_tag_list[gram_key], doc_class_list,
                    samp_params['class_name'], tag_set[gram_key],
                    params['fs_opt'], params['df'],
                    params['fs_method'], params['fs_num'])
                print "tag " + gram_key, "after feature selection, len(tag_set) =", len(tag_set[gram_key])
        if self.character_gram.has_key(gram_key):
            doc_character_list[gram_key] = pytc.gen_character_ngram_list(doc_str_token, gram_key)
            character_set[gram_key] = pytc.get_term_set(doc_character_list[gram_key])
            params = self.character_gram[gram_key]
            # only in the training procedure
            if train_opt == 1 and len(doc_character_list[gram_key]) > 0:
                print "character " + gram_key, "len(character_set)=", len(character_set[gram_key])
                character_set[gram_key] = pytc.feature_selection_all(
                    doc_character_list[gram_key], doc_class_list,
                    samp_params['class_name'], character_set[gram_key],
                    params['fs_opt'], params['df'],
                    params['fs_method'], params['fs_num'])
                print "character " + gram_key, "after feature selection, len(character_set) =", len(character_set[gram_key])
    doc_terms_list, term_set = [], []
    for gram_key in ['uni', 'bis', 'tri', 'quat', 'five', 'six']:
        if self.token_gram.has_key(gram_key):
            term_set += token_set[gram_key]
            if len(doc_terms_list) == 0:
                doc_terms_list = doc_token_list[gram_key]
            else:
                pytc.get_joint_sets(doc_terms_list, doc_token_list[gram_key])
        if self.pos_gram.has_key(gram_key):
            term_set += pos_set[gram_key]
            if len(doc_terms_list) == 0:
                doc_terms_list = doc_pos_list[gram_key]
            else:
                pytc.get_joint_sets(doc_terms_list, doc_pos_list[gram_key])
        if self.tag_gram.has_key(gram_key):
            term_set += tag_set[gram_key]
            if len(doc_terms_list) == 0:
                doc_terms_list = doc_tag_list[gram_key]
            else:
                pytc.get_joint_sets(doc_terms_list, doc_tag_list[gram_key])
        if self.character_gram.has_key(gram_key):
            term_set += character_set[gram_key]
            if len(doc_terms_list) == 0:
                doc_terms_list = doc_character_list[gram_key]
            else:
                pytc.get_joint_sets(doc_terms_list, doc_character_list[gram_key])
    return doc_class_list, doc_str_token, doc_terms_list, term_set
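# pytc.get_joint_sets is assumed here to merge the new per-document term lists
# into doc_terms_list in place (which is why its return value is ignored above).
# A sketch of the presumed behavior, not pytc's actual source:
#
#   def get_joint_sets(doc_terms_list, doc_terms_list_new):
#       for k in range(len(doc_terms_list)):
#           doc_terms_list[k] += doc_terms_list_new[k]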
for test_data_name in test_data_list:
    test_dir = base_dir + os.sep + test_data_name
    output_dir = test_dir + os.sep + 'rule_distant'
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    fenci_fname = 'test_fenci'
    rule_score_fname = 'rule_score.txt'
    rule_result_fname = 'rule_result.txt'
    test_fenci_lines = [x.strip() for x in open(test_dir + os.sep + fenci_fname).readlines()]
    print str(len(test_fenci_lines)) + " documents in total......"
    final_score = [0.0] * len(test_fenci_lines)
    terms_list = pytc.gen_N_gram(test_fenci_lines, 'uni')
    for j in range(len(terms_list)):
        score = distant_dict_score(terms_list[j], senti_dict)
        final_score[j] += score
    tools.write_score_file(final_score, output_dir + os.sep + rule_score_fname)
    num = 0  # threshold passed to classify_2_way
    tools.classify_2_way(output_dir + os.sep + rule_score_fname,
                         output_dir + os.sep + rule_result_fname, num)
    # performance
    result = [x.strip() for x in open(output_dir + os.sep + rule_result_fname).readlines()]
    label = [x.strip() for x in open(test_dir + os.sep + 'test_label').readlines()]
    class_dict = {'1': 'neg', '2': 'pos'}
    result_dict = performance.demo_performance(result, label, class_dict)
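# distant_dict_score is assumed to sum sentiment-dictionary polarities over a
# document's unigram terms, so final_score[j] > 0 leans positive. A sketch of
# the presumed behavior, not the real implementation:
#
#   def distant_dict_score(terms, senti_dict):
#       return sum(senti_dict.get(term, 0.0) for term in terms)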