def test_get_paras():
    chapter_content_filter = ChapterContentFilter()
    site_id = '1'
    chapter_title = ''
    with codecs.open('data/select_sample/' + site_id, encoding='gbk') as sample_file:
        for line in sample_file:
            raw_chapter_content = line.strip().split('\t')[5]
            para_list = chapter_content_filter.get_paras(raw_chapter_content, site_id, chapter_title)
            raw_cn_count = count_chinese(raw_chapter_content)
            fmt_cn_count = 0
            for para in para_list:
                fmt_cn_count += count_chinese(para.fmt_content)

            if raw_cn_count != fmt_cn_count:
                print 'not equal {0} vs {1}'.format(raw_cn_count, fmt_cn_count)

                last_index = line.rfind('\t')
                print line[:last_index].encode('gbk')

            sentences = chapter_content_filter.get_sentences(para_list)
            for para_index, para in enumerate(para_list):
                print para_index, para.fmt_content.encode('gbk')
                if para.para_index == -1:
                    continue

                for index in range(para.sentence_start_index, para.sentence_end_index):
                    print sentences[index].fmt_content.encode('gbk')
                    raw_input('next_sentences')