Example 1
    def corpus_to_feature_and_label_mat(self, corpus_path, result_path):
        # Reuse the cached LIBSVM file if it has already been generated
        if Util.is_file(result_path):
            Util.log_tool.log.debug("loading data")
            return Util.get_libsvm_data(result_path)
        data = codecs.open(corpus_path, 'rb', FilePathConfig.file_encodeing,
                           'ignore')
        sparse_mat = codecs.open(result_path, 'wb',
                                 FilePathConfig.file_encodeing, 'ignore')
        count = 0
        for line in data:
            count += 1
            if count % 10000 == 0:
                Util.log_tool.log.debug("added " + str(count))
            document = Document(line)
            label_id = self.category_dic[document.label]
            content_words = document.get_filtered_content_words_feature()
            doc_len = len(content_words)

            words = self.lexicon.convert_document(content_words)
            terms = self.test_vector_builder.build(words, True, doc_len)

            sparse_mat.write(str(label_id))
            # Sort the id:weight pairs by term id in ascending order
            terms.sort(key=lambda term: term.term_id)
            for term in terms:
                sparse_mat.write(" " + str(term.term_id) + ":" +
                                 str(term.weight))
            sparse_mat.write("\n")

        data.close()
        sparse_mat.close()
        return Util.get_libsvm_data(result_path)
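The file written above is in LIBSVM sparse format: a label id followed by space-separated id:weight pairs, sorted by feature id. A minimal sketch of that line format (the helper name and the label/feature values are illustrative, not part of the project):

def to_libsvm_line(label_id, id_weight_pairs):
    # The loader expects the pairs sorted by feature id, ascending
    parts = [str(label_id)]
    for term_id, weight in sorted(id_weight_pairs):
        parts.append(str(term_id) + ":" + str(weight))
    return " ".join(parts)

print(to_libsvm_line(3, [(17, 0.25), (4, 1.0)]))  # -> "3 4:1.0 17:0.25"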
Example 2
    def data_to_feature(self, data):
        # COO-style triplets: row indices, column (term) ids and weights
        row = list()
        col = list()
        weight = list()
        row_num = 0

        for line in data:
            print row_num
            document = Document(line)
            # If the article content needs filtering, add the word filters
            # if not ClassifierConfig.is_use_bigram:
            #     for feature_filter in self.filters:
            #         document.add_filter(feature_filter)
            content_words = document.get_content_words_feature()
            doc_len = len(content_words)
            words = self.lexicon.convert_document(content_words)
            terms = self.test_vector_builder.build(words, True, doc_len)
            terms.sort(key=lambda term: term.term_id)
            for term in terms:
                row.append(row_num)
                col.append(term.term_id)
                weight.append(term.weight)
            row_num += 1
        sparse_mat = csr_matrix(
            (np.array(weight), (np.array(row), np.array(col))),
            shape=(row_num, ClassifierConfig.max_num_features))
        return sparse_mat
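The csr_matrix((data, (row, col)), shape=...) call above builds the matrix from COO-style triplets; the excerpt assumes module-level imports of numpy and scipy.sparse. A self-contained sketch with dummy triplets (two documents in a hypothetical 5-feature space):

import numpy as np
from scipy.sparse import csr_matrix

row = np.array([0, 0, 1])            # document indices
col = np.array([1, 4, 2])            # term ids
weight = np.array([0.5, 1.0, 0.75])  # term weights

mat = csr_matrix((weight, (row, col)), shape=(2, 5))
print(mat.toarray())
# [[0.    0.5   0.    0.    1.  ]
#  [0.    0.    0.75  0.    0.  ]]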
Example 3
    def add_document(self, raw_document):
        # Convert the raw data into a well-formed document
        document = Document(raw_document)

        # Check that the category is valid
        if document.label not in self.category_dic:
            Util.log_tool.log.error("Error: unknown category")

        # Open the cache file if it is not open yet
        if self.cache_file is None:
            Util.log_tool.log.debug("open file")
            self.cache_file = codecs.open(FilePathConfig.cache_file_path, 'wb',
                                          FilePathConfig.file_encodeing,
                                          'ignore')

        # If the article content needs filtering, add the word filters
        if not ClassifierConfig.is_use_bigram:
            for feature_filter in self.filters:
                document.add_filter(feature_filter)

        # Extract the features we need from the document
        content_words = document.get_filtered_content_words_feature()
        self.lexicon.add_document(content_words)
        words = self.lexicon.convert_document(content_words)
        terms = self.training_vector_builder.build(words, False, 0)
        try:
            if len(terms) > self.longest_length_doc:
                self.longest_length_doc = len(terms)

            line_result = str(
                self.category_dic[document.label]) + FilePathConfig.tab
            for term in terms:
                line_result += (str(term.term_id) + FilePathConfig.colon +
                                str(term.weight))
                line_result += FilePathConfig.space
            self.cache_file.write(line_result.strip() + '\n')
        except Exception:
            Util.log_tool.log.error(
                "Error: failed to write cache when adding document")

        self.num_doc += 1
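Assuming FilePathConfig.tab, colon and space hold the literal '\t', ':' and ' ' characters their names suggest, each cache line is label<tab>id:weight id:weight .... A hypothetical parser for one such line:

def parse_cache_line(line):
    # Split off the label id, then the space-separated id:weight pairs
    label_part, _, terms_part = line.strip().partition('\t')
    pairs = []
    for pair in terms_part.split():
        term_id, _, weight = pair.partition(':')
        pairs.append((int(term_id), float(weight)))
    return int(label_part), pairs

print(parse_cache_line(u"2\t4:1.0 17:0.25"))  # -> (2, [(4, 1.0), (17, 0.25)])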
Example 4
import codecs
import sys

from config.config import FilePathConfig
from feature_extractor.entity.document import Document

reload(sys)
sys.setdefaultencoding('UTF-8')

title_label_dic = {}

data = codecs.open(FilePathConfig.raw_news_path, 'r', 'utf-8', 'ignore')
labels = codecs.open(FilePathConfig.file_root_path + "label.txt", 'r', 'utf-8',
                     'ignore')
match_result = codecs.open(FilePathConfig.file_root_path + "match_result.txt",
                           'w', 'utf-8', 'ignore')

for line in labels:
    title = line.split('\t')[0]
    label = line.split('\t')[1].strip()
    title_label_dic[title] = label

count = 0
for line in data:
    document = Document(line)
    count += 1
    print count
    if document.title not in title_label_dic:
        match_result.write(line.strip() + '\n')

data.close()
labels.close()
match_result.close()
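The matching above keys strictly on the document title, and a malformed label line (one without a tab) would raise an IndexError while building title_label_dic. A hypothetical, slightly more defensive parse:

def parse_label_line(line):
    # Expect "title<tab>label"; return None for malformed lines
    parts = line.rstrip('\n').split('\t')
    if len(parts) < 2:
        return None
    return parts[0], parts[1].strip()

print(parse_label_line(u"Some title\tsports\n"))  # -> (u'Some title', u'sports')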
Example 5
# coding=UTF-8
import codecs
import sys

sys.path.append("../")
reload(sys)
sys.setdefaultencoding('UTF-8')
from feature_extractor.entity.document import Document
# CommonFilter, StopWordFilter and SpeechFilter come from the project's
# filter module; the exact import path is not shown in this excerpt.

data_path = "../file/"

data = codecs.open(data_path + "match_result.txt", 'r', 'utf-8', 'ignore')

match_result = codecs.open(data_path + "new_corpus.txt", 'w', 'utf-8',
                           'ignore')

common_filter = CommonFilter()
stop_words_filter = StopWordFilter()
speech_filter = SpeechFilter()
count = 0
for line in data:
    print count
    count += 1
    document = Document(line)
    document.add_filter(common_filter).add_filter(
        stop_words_filter).add_filter(speech_filter)
    keywords = document.get_filtered_content_words_feature()
    if keywords is None:
        continue
    raw_content = document.get_raw_content()
    # Join the keywords into a comma-separated string
    content = ",".join(keywords)
    # Write out the segmented content words together with the original text
    match_result.write(document.json + '\t' + content + '\t' + raw_content +
                       '\t' + document.label + '\n')

data.close()
match_result.close()
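Each line of new_corpus.txt therefore carries four tab-separated fields: the document JSON, the comma-joined keywords, the raw content, and the label. A hypothetical reader for that format (it assumes none of the fields contain a tab):

def read_new_corpus_line(line):
    doc_json, keywords, raw_content, label = line.rstrip('\n').split('\t')
    return doc_json, keywords.split(','), raw_content, label

fields = read_new_corpus_line(u'{"id": 1}\tword1,word2\traw text\tsports\n')
print(fields[1])  # -> [u'word1', u'word2']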
Example 6

# coding=UTF-8
import codecs
import sys

sys.path.append("../")
reload(sys)
sys.setdefaultencoding('UTF-8')
from feature_extractor.entity.document import Document

gongyi_data = codecs.open("../file/type_8.txt", 'r', 'utf-8', 'ignore')
filtered_gongyi_data = codecs.open("../file/type_8_filtered.txt", 'w', 'utf-8',
                                   'ignore')

count = 0
for line in gongyi_data:
    label = 0
    document = Document(line)
    title = document.title
    words = document.get_filtered_content_words_feature()

    # Skip documents whose filtered word list is missing or too short
    if words is not None and len(words) > 6:
        count += 1
        filtered_gongyi_data.write(line)

filtered_gongyi_data.close()
gongyi_data.close()
print count