def test_get_paras():
    """Interactively dump paragraph/sentence splitting for one sample file.

    For every line of the GBK-encoded file ``data/select_sample/<site_id>``:

    * the sixth tab-separated field is taken as the raw chapter content and
      passed to ``ChapterContentFilter.get_paras``;
    * ``count_chinese`` of the raw content is compared with the sum of
      ``count_chinese`` over each paragraph's ``fmt_content``; on a mismatch
      both counts and the line (minus its last tab-separated field) are
      printed;
    * every paragraph is printed, followed by the sentences in its
      ``[sentence_start_index, sentence_end_index)`` range unless its
      ``para_index`` is -1;
    * execution then blocks on ``raw_input`` until the operator presses
      Enter.

    Nothing is asserted; the output is meant to be read by a person.

    NOTE(review): this function was recovered from text whose line breaks
    and indentation had been stripped.  The nesting used here (only the two
    diagnostic prints live under the mismatch check; the sentence dump and
    the pause run for every line) is an assumption -- confirm against the
    original layout.
    """
    site_id = '1'
    chapter_title = ''
    content_filter = ChapterContentFilter()
    sample_path = 'data/select_sample/' + site_id

    # Single-argument print(...) is used throughout; under Python 2 it emits
    # exactly what the equivalent print statement would.
    with codecs.open(sample_path, encoding='gbk') as sample_file:
        for line in sample_file:
            fields = line.strip().split('\t')
            raw_content = fields[5]
            paragraphs = content_filter.get_paras(
                raw_content, site_id, chapter_title)

            raw_total = count_chinese(raw_content)
            fmt_total = sum(
                count_chinese(paragraph.fmt_content)
                for paragraph in paragraphs)
            if raw_total != fmt_total:
                print('not equal {0} vs {1}'.format(raw_total, fmt_total))
                # Echo the offending line without its final field.
                cut = line.rfind('\t')
                print(line[:cut].encode('gbk'))

            sentences = content_filter.get_sentences(paragraphs)
            for position, paragraph in enumerate(paragraphs):
                encoded_para = paragraph.fmt_content.encode('gbk')
                print('{0} {1}'.format(position, encoded_para))
                # Paragraphs flagged with para_index == -1 get no sentence dump.
                if paragraph.para_index != -1:
                    first = paragraph.sentence_start_index
                    stop = paragraph.sentence_end_index
                    for sentence_no in range(first, stop):
                        sentence = sentences[sentence_no]
                        print(sentence.fmt_content.encode('gbk'))

            # Pause so the operator can read the output before the next line.
            raw_input('next_sentences')