def corpus_to_feature_and_label_mat(self, corpus_path, result_path):
    if Util.is_file(result_path):
        Util.log_tool.log.debug("loading data")
        return Util.get_libsvm_data(result_path)
    data = codecs.open(corpus_path, 'rb', FilePathConfig.file_encodeing, 'ignore')
    sparse_mat = codecs.open(result_path, 'wb', FilePathConfig.file_encodeing, 'ignore')
    count = 0
    for line in data:
        count += 1
        if count % 10000 == 0:
            Util.log_tool.log.debug("add " + str(count))
        document = Document(line)
        label_id = self.category_dic[document.label]
        content_words = document.get_filtered_content_words_feature()
        doc_len = len(content_words)
        words = self.lexicon.convert_document(content_words)
        terms = self.test_vector_builder.build(words, True, doc_len)
        sparse_mat.write(str(label_id))
        # Sort the id:weight pairs by term id in ascending order
        terms.sort(cmp=lambda x, y: cmp(x.term_id, y.term_id))
        for term in terms:
            sparse_mat.write(" " + str(term.term_id) + ":" + str(term.weight))
        sparse_mat.write("\n")
    data.close()
    sparse_mat.close()
    return Util.get_libsvm_data(result_path)
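# Illustration only: a minimal reader for the LIBSVM-style lines written by
# corpus_to_feature_and_label_mat, assuming the format produced above,
# "<label_id> <term_id>:<weight> ...". This is a sketch of what
# Util.get_libsvm_data is expected to consume, not its actual implementation.
def parse_libsvm_line(line):
    parts = line.strip().split(" ")
    label_id = int(parts[0])
    features = []
    for pair in parts[1:]:
        term_id, weight = pair.split(":")
        features.append((int(term_id), float(weight)))
    return label_id, features

# e.g. parse_libsvm_line("3 12:0.241 87:0.078") -> (3, [(12, 0.241), (87, 0.078)])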
def data_to_feature(self, data):
    row = list()
    col = list()
    weight = list()
    row_num = 0
    for line in data:
        print row_num
        document = Document(line)
        # If the document content needs filtering, attach the word filters here
        # if not ClassifierConfig.is_use_bigram:
        #     for feature_filter in self.filters:
        #         document.add_filter(feature_filter)
        content_words = document.get_content_words_feature()
        doc_len = len(content_words)
        words = self.lexicon.convert_document(content_words)
        terms = self.test_vector_builder.build(words, True, doc_len)
        terms.sort(cmp=lambda x, y: cmp(x.term_id, y.term_id))
        for term in terms:
            row.append(row_num)
            col.append(term.term_id)
            weight.append(term.weight)
        row_num += 1
    sparse_mat = csr_matrix(
        (np.array(weight), (np.array(row), np.array(col))),
        shape=(row_num, ClassifierConfig.max_num_features))
    return sparse_mat
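# Standalone sketch of the sparse construction used in data_to_feature:
# scipy's csr_matrix accepts (data, (row, col)) triplets, so every term
# contributes one (row_num, term_id, weight) entry. The ids, weights and
# shape below are made-up illustration values, not real features.
import numpy as np
from scipy.sparse import csr_matrix

row = [0, 0, 1]                  # document indices
col = [12, 87, 12]               # term ids (feature column indices)
weight = [0.241, 0.078, 0.301]   # term weights

mat = csr_matrix((np.array(weight), (np.array(row), np.array(col))),
                 shape=(2, 100))  # 2 documents, 100-feature vocabulary
print mat[0, 12]  # -> 0.241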
def add_document(self, raw_document):
    # Convert the raw data into a well-formed Document
    document = Document(raw_document)
    # Check whether the category label is valid
    if document.label not in self.category_dic:
        Util.log_tool.log.error("Error: unknown category label")
    # Open the cache file if it is not open yet
    if self.cache_file is None:
        Util.log_tool.log.debug("open file")
        self.cache_file = codecs.open(FilePathConfig.cache_file_path, 'wb',
                                      FilePathConfig.file_encodeing, 'ignore')
    # If the document content needs filtering, attach the word filters
    if not ClassifierConfig.is_use_bigram:
        for feature_filter in self.filters:
            document.add_filter(feature_filter)
    # Extract the features we need from the document
    content_words = document.get_filtered_content_words_feature()
    self.lexicon.add_document(content_words)
    words = self.lexicon.convert_document(content_words)
    terms = self.training_vector_builder.build(words, False, 0)
    try:
        if len(terms) > self.longest_length_doc:
            self.longest_length_doc = len(terms)
        line_result = str(self.category_dic[document.label]) + FilePathConfig.tab
        for term in terms:
            line_result += (str(term.term_id) + FilePathConfig.colon + str(term.weight))
            line_result += FilePathConfig.space
        self.cache_file.write(line_result.strip() + '\n')
    except:
        Util.log_tool.log.error("Error writing cache when adding document")
    self.num_doc += 1
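# Format note (an assumption: FilePathConfig.tab, .colon and .space are "\t",
# ":" and " " -- the real values live in the config). Each cached line then
# looks like
#
#   <label_id>\t<term_id>:<weight> <term_id>:<weight> ...
#
# which differs from the LIBSVM file above only in the tab after the label.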
import codecs
import sys

from config.config import FilePathConfig
from feature_extractor.entity.document import Document

reload(sys)
sys.setdefaultencoding('UTF-8')

title_label_dic = {}
data = codecs.open(FilePathConfig.raw_news_path, 'r', 'utf-8', 'ignore')
labels = codecs.open(FilePathConfig.file_root_path + "label.txt", 'r', 'utf-8', 'ignore')
match_result = codecs.open(FilePathConfig.file_root_path + "match_result.txt", 'w', 'utf-8', 'ignore')

# Build a title -> label mapping from the label file
for line in labels:
    title = line.split('\t')[0]
    label = line.split('\t')[1].strip()
    title_label_dic[title] = label

# Keep the news items whose titles have no label match
count = 0
for line in data:
    document = Document(line)
    count += 1
    print count
    if document.title not in title_label_dic:
        match_result.write(line.strip() + '\n')

match_result.close()
data_path = "../file/"
data = codecs.open(data_path + "match_result.txt", 'r', 'utf-8', 'ignore')
match_result = codecs.open(data_path + "new_corpus.txt", 'w', 'utf-8', 'ignore')

common_filter = CommonFilter()
stop_words_filter = StopWordFilter()
speech_filter = SpeechFilter()

count = 0
for line in data:
    print count
    count += 1
    document = Document(line)
    document.add_filter(common_filter).add_filter(stop_words_filter).add_filter(speech_filter)
    keywords = document.get_filtered_content_words_feature()
    if keywords is None:
        continue
    raw_content = document.get_raw_content()
    content = ""
    for keyword in keywords:
        content = content + keyword + ","
    # Remove the trailing comma
    content = content[:-1]
    # Write the segmented content words and the raw text back out to the new file
    match_result.write(document.json + '\t' + content + '\t' + raw_content + '\t' + document.label + '\n')

match_result.close()
data.close()
# coding=UTF-8
import codecs
import sys

sys.path.append("../")
reload(sys)
sys.setdefaultencoding('UTF-8')

from feature_extractor.entity.document import Document

gongyi_data = codecs.open("../file/type_8.txt", 'r', 'utf-8', 'ignore')
filtered_gongyi_data = codecs.open("../file/type_8_filtered.txt", 'w', 'utf-8', 'ignore')

count = 0
for line in gongyi_data:
    label = 0
    document = Document(line)
    title = document.title
    words = document.get_filtered_content_words_feature()
    if len(words) > 6:
        count += 1
        filtered_gongyi_data.write(line)

filtered_gongyi_data.close()
gongyi_data.close()
print count