import os

# `train_test_split` is assumed to be scikit-learn's; Preprocesser,
# FeatureSelector, TermWeight, compute_aprf, read_word_rule and
# write_result are project-local helpers imported elsewhere in this repo.
from sklearn.model_selection import train_test_split


def test_clf(self, clf, clf_name):
    # Single-split evaluation: rank features on the training portion,
    # keep the top `count`, weight both portions, then fit and score `clf`.
    pre_train = Preprocesser(self.pos_file, self.neg_file)
    ngrams = ('unigram', )  # 'bigram'
    selector_methods = ('df', )
    weight_methods = ('tf_idf', )
    for ngram in ngrams:
        data, target = pre_train.get_ngram(ngram)
        train_data, test_data, train_target, test_target = train_test_split(
            data, target, test_size=0.70)
        selector = FeatureSelector(train_data, train_target)
        for selector_method in selector_methods:
            all_features = selector.select(selector_method,
                                           selector.all_features_size)
            for count in self.feature_range:
                features = all_features[:count]
                train_vectorizer = TermWeight(train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(
                        weight_method)
                    clf.fit(train_weighted_data, train_target)
                    test_result = clf.predict(test_weighted_data)
                    score = compute_aprf(test_target, test_result)
                    self.write_result(clf_name, ngram, selector_method,
                                      count, weight_method, score)

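# Usage sketch (hypothetical, not part of the original file): test_clf
# accepts any estimator exposing the scikit-learn fit/predict interface.
# The harness class name `Tester` and the data paths below are assumptions
# for illustration only.
#
#     from sklearn.naive_bayes import MultinomialNB
#     tester = Tester(pos_file='../data/pos.txt', neg_file='../data/neg.txt')
#     tester.test_clf(MultinomialNB(), 'nb')
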
# Variant of test_clf that evaluates on a separate held-out corpus
# (pos_test/neg_test) instead of a random split, and treats
# self.feature_range as fractions of the ranked feature list.
def test_clf(self, clf, clf_name):
    pre_train = Preprocesser(self.pos_file, self.neg_file)
    pre_test = Preprocesser(self.pos_test, self.neg_test)
    ngrams = ('unigram', )  # 'unigram', 'bigram', 'dictWords'
    selector_methods = ('df', 'mi')
    weight_methods = ('tf_idf', )
    for ngram in ngrams:
        train_data, train_target = pre_train.get_ngram(ngram)
        test_data, test_target = pre_test.get_ngram(ngram)
        selector = FeatureSelector(train_data, train_target)
        for selector_method in selector_methods:
            all_features = selector.select(selector_method,
                                           selector.all_features_size)
            print(len(all_features))
            for count in self.feature_range:
                # `count` is a fraction here: keep the top share of the
                # ranked feature list.
                print(int(len(all_features) * count))
                features = all_features[:int(len(all_features) * count)]
                train_vectorizer = TermWeight(train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(
                        weight_method)
                    clf.fit(train_weighted_data, train_target)
                    test_result = clf.predict(test_weighted_data)
                    score = compute_aprf(test_target, test_result)
                    self.write_result(clf_name, ngram, selector_method,
                                      len(features), weight_method, score)

def test_unigram(clf, clf_name, pos_file, neg_file, data_type):
    # Baseline: use the full unigram vocabulary with no feature selection.
    pre = Preprocesser(pos_file, neg_file)
    ngrams = ('unigram', )
    weight_methods = ('tf_idf', )
    for ngram in ngrams:
        data, target = pre.get_ngram(ngram)
        train_data, test_data, train_target, test_target = train_test_split(
            data, target)
        selector = FeatureSelector(train_data, train_target)
        all_features_count = selector.all_features_size
        features = selector.all_features
        train_vectorizer = TermWeight(train_data, train_target, features)
        for weight_method in weight_methods:
            train_weighted_data = train_vectorizer.weight(weight_method)
            test_vectorizer = TermWeight(test_data, test_target, features)
            test_weighted_data = test_vectorizer.weight(weight_method)
            clf.fit(train_weighted_data, train_target)
            score = clf.score(test_weighted_data, test_target)
            result_dir = '../data/result/'
            if not os.path.exists(result_dir):
                os.mkdir(result_dir)
            with open(os.path.join(result_dir, clf_name + '_unigram.txt'),
                      'at', encoding='utf-8') as f:
                # The format indices were off by one (1-4 for four
                # arguments, which raises IndexError); use 0-3.
                f.write(
                    'clf={0}\t data_type={1}\t count={2}\t score={3:.2f}\n'
                    .format(clf_name, data_type, all_features_count,
                            score * 100))

def test_clf_by_percent(self, clf, clf_name,
                        feature_count_range=[
                            1000, 2000, 3000, 4000, 5000, 6000, 7000,
                            8000, 9000, 10000
                        ],
                        bigram_range=[0.1, 0.3, 0.5, 0.7, 0.9]):
    pre = Preprocesser(self.pos_file, self.neg_file)
    selector_methods = ('df', )
    weight_methods = ('tf_idf', )
    unigram_data, unigram_target = pre.get_unigram(is_shuffle=False)
    bigram_data, bigram_target = pre.get_bigram(is_shuffle=False)
    train_unigram, test_unigram, train_bigram, test_bigram, train_target, \
        test_target = self.split_data(unigram_data, bigram_data,
                                      unigram_target)
    unigram_selector = FeatureSelector(train_unigram, train_target)
    bigram_selector = FeatureSelector(train_bigram, train_target)
    train_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(train_unigram, train_bigram)
    ]
    test_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(test_unigram, test_bigram)
    ]
    for selector_method in selector_methods:
        unigram_features = unigram_selector.select(
            selector_method, unigram_selector.all_features_size)
        bigram_features = bigram_selector.select(
            selector_method, bigram_selector.all_features_size)
        for count in feature_count_range:
            for bigram_size in bigram_range:
                bigram_count = int(count * bigram_size)
                bigram_selected_features = bigram_features[:bigram_count]
                unigram_selected_features = \
                    unigram_features[:(count - bigram_count)]
                features = (unigram_selected_features +
                            bigram_selected_features)
                train_vectorizer = TermWeight(train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(
                        weight_method)
                    clf.fit(train_weighted_data, train_target)
                    test_result = clf.predict(test_weighted_data)
                    score = compute_aprf(test_target, test_result)
                    self.write_result(clf_name + '_uni_bi',
                                      'unigram+bigram', selector_method,
                                      str(count) + ':' + str(bigram_size),
                                      weight_method, score)

def test_rule(clf, clf_name, train_unigram, train_bigram, train_target,
              test_unigram, test_bigram, test_target, data_type):
    feature_count_range = [
        500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 5000, 6000
    ]
    bigram_range = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
    selector_methods = ('mi', 'df', 'ig')
    weight_methods = ('tf_idf', )
    unigram_selector = FeatureSelector(train_unigram, train_target)
    bigram_selector = FeatureSelector(train_bigram, train_target)
    train_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(train_unigram, train_bigram)
    ]
    test_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(test_unigram, test_bigram)
    ]
    for selector_method in selector_methods:
        unigram_features = unigram_selector.select(
            selector_method, unigram_selector.all_features_size)
        bigram_features = bigram_selector.select(
            selector_method, bigram_selector.all_features_size)
        for count in feature_count_range:
            for bigram_size in bigram_range:
                bigram_count = int(count * bigram_size)
                bigram_selected_features = bigram_features[:bigram_count]
                unigram_selected_features = \
                    unigram_features[:(count - bigram_count)]
                features = (unigram_selected_features +
                            bigram_selected_features)
                train_vectorizer = TermWeight(train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(
                        weight_method)
                    clf.fit(train_weighted_data, train_target)
                    score = clf.score(test_weighted_data, test_target)
                    result_dir = '../data/result/'
                    if not os.path.exists(result_dir):
                        os.mkdir(result_dir)
                    out_path = os.path.join(
                        result_dir, clf_name + '_unigram_and_rule.txt')
                    with open(out_path, 'at', encoding='utf-8') as f:
                        # The format indices were off by one (1-5 for five
                        # arguments, which raises IndexError); use 0-4.
                        f.write(
                            'clf={0}\t data_type={1}\t count={2}\t '
                            'rule_size={3}\t score={4:.2f}\n'.format(
                                clf_name, data_type, count, bigram_size,
                                score * 100))

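# Worked example of the feature-budget split in test_rule (illustration
# only): with count=2000 and bigram_size=0.3, bigram_count =
# int(2000 * 0.3) = 600, so the top 600 bigram features plus the top
# 2000 - 600 = 1400 unigram features are kept; at bigram_size=1 the
# feature set is bigrams only.
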
def test_clf_uni_bi(self, clf, clf_name,
                    feature_count_range=[
                        100, 300, 500, 700, 1000, 1500, 2000, 2500, 3000,
                        4000, 5000
                    ],
                    uni_bi_range=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,
                                  0.9, 1]):
    # pre = Preprocesser(self.pos_file, self.neg_file)
    pre_train = Preprocesser(self.pos_file, self.neg_file)
    pre_test = Preprocesser(self.pos_test, self.neg_test)
    selector_methods = ('df', 'ig', 'mi', 'chi')
    weight_methods = ('tf_idf', )
    train_unigram, train_target = pre_train.get_unigram(is_shuffle=False)
    train_bigram, train_target = pre_train.get_bigram(is_shuffle=False)
    test_unigram, test_target = pre_test.get_unigram(is_shuffle=False)
    test_bigram, test_target = pre_test.get_bigram(is_shuffle=False)
    unigram_selector = FeatureSelector(train_unigram, train_target)
    bigram_selector = FeatureSelector(train_bigram, train_target)
    train_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(train_unigram, train_bigram)
    ]
    test_data = [
        unigram_line + bigram_line
        for unigram_line, bigram_line in zip(test_unigram, test_bigram)
    ]
    for selector_method in selector_methods:
        unigram_features = unigram_selector.select(
            selector_method, unigram_selector.all_features_size)
        bigram_features = bigram_selector.select(
            selector_method, bigram_selector.all_features_size)
        for count in feature_count_range:
            for uni_bi_size in uni_bi_range:
                bi_size = int(count * uni_bi_size)
                bigram_selected_features = bigram_features[:bi_size]
                unigram_selected_features = \
                    unigram_features[:(count - bi_size)]
                features = (unigram_selected_features +
                            bigram_selected_features)
                train_vectorizer = TermWeight(train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(
                        weight_method)
                    clf.fit(train_weighted_data, train_target)
                    test_result = clf.predict(test_weighted_data)
                    score = compute_aprf(test_target, test_result)
                    self.write_result(clf_name + '_uni_bi',
                                      'unigram+bigram', selector_method,
                                      str(count) + ':' + str(uni_bi_size),
                                      weight_method, score)

def test_unigram_rule(pos_file, neg_file, clf, clf_name, data_type):
    selector_methods = ('df', )
    weight_methods = ('tf_idf', )
    feature_count_range = [
        1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000
    ]
    rule_range = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1)
    pos_data, rule_pos_data = read_word_rule(pos_file)
    neg_data, rule_neg_data = read_word_rule(neg_file)
    # Hold out the first 20% of each class as the test set.
    test_size = 0.2
    pos_test_index = int(len(pos_data) * test_size)
    neg_test_index = int(len(neg_data) * test_size)
    pos_test_data = pos_data[:pos_test_index]
    pos_train_data = pos_data[pos_test_index:]
    neg_test_data = neg_data[:neg_test_index]
    neg_train_data = neg_data[neg_test_index:]
    train_data = pos_train_data + neg_train_data
    test_data = pos_test_data + neg_test_data
    train_target = [1] * len(pos_train_data) + [0] * len(neg_train_data)
    test_target = [1] * len(pos_test_data) + [0] * len(neg_test_data)
    rule_pos_test_data = rule_pos_data[:pos_test_index]
    rule_pos_train_data = rule_pos_data[pos_test_index:]
    rule_neg_test_data = rule_neg_data[:neg_test_index]
    rule_neg_train_data = rule_neg_data[neg_test_index:]
    rule_train_data = rule_pos_train_data + rule_neg_train_data
    rule_test_data = rule_pos_test_data + rule_neg_test_data

    def merge_data(data_a, data_b):
        # Concatenate the word features and rule features of each document.
        return [a + b for a, b in zip(data_a, data_b)]

    all_train_data = merge_data(train_data, rule_train_data)
    all_test_data = merge_data(test_data, rule_test_data)
    unigram_selector = FeatureSelector(train_data, train_target)
    rule_selector = FeatureSelector(rule_train_data, train_target)
    for selector_method in selector_methods:
        unigram_features = unigram_selector.select(
            selector_method, unigram_selector.all_features_size)
        rule_features = rule_selector.select(
            selector_method, rule_selector.all_features_size)
        for count in feature_count_range:
            for rule_size in rule_range:
                rule_count = int(count * rule_size)
                rule_selected_features = rule_features[:rule_count]
                unigram_selected_features = \
                    unigram_features[:(count - rule_count)]
                features = (unigram_selected_features +
                            rule_selected_features)
                train_vectorizer = TermWeight(all_train_data, train_target,
                                              features)
                for weight_method in weight_methods:
                    train_weighted_data = train_vectorizer.weight(
                        weight_method)
                    test_vectorizer = TermWeight(all_test_data, test_target,
                                                 features)
                    test_weighted_data = test_vectorizer.weight(
                        weight_method)
                    clf.fit(train_weighted_data, train_target)
                    test_result = clf.predict(test_weighted_data)
                    score = compute_aprf(test_target, test_result)
                    write_result(clf_name, data_type, selector_method,
                                 count, rule_size, score)

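# Usage sketch (hypothetical): driving the rule-augmented experiment with a
# linear SVM. The file paths and the 'hotel' data_type label are
# placeholders; read_word_rule is assumed to return (word_docs, rule_docs)
# for the given file, as used above.
#
#     from sklearn.svm import LinearSVC
#     test_unigram_rule('../data/pos_rule.txt', '../data/neg_rule.txt',
#                       LinearSVC(), 'svm', 'hotel')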