Ejemplo n.º 1
0
    def sample_cid(self):
        """
        Sample (rid, align_id) pairs from the top-100k book list.

        Reads book ids from '../rid.txt.100k', samples a subset of books, and
        for each sampled book samples up to 20 chapters from its aggregated
        chapter directory. Every sampled chapter is appended to
        'data/sample_cid' as one "<rid><cur_delimiter><align_id>" record.

        Up to 2000 books are drawn as candidates (1 in debug mode), but only
        the first 1000 books that yield at least one sampled chapter are
        written; books with an empty sample are skipped and do not count.

        :return: None
        """
        if not os.path.isdir('data/sample/'):
            os.makedirs('data/sample/')

        # Read the ids of the top-100k books, one per line, skipping blank lines.
        with codecs.open('../rid.txt.100k') as top_100k_file:
            stripped_lines = (line.strip() for line in top_100k_file)
            rid_list = [rid for rid in stripped_lines if rid != '']

        sample_book_num = 1 if self.debug else 2000

        # Sample candidate books from the list.
        # NOTE(review): presumably sep_num splits the list into a head (first
        # 10000 rids) and a tail, with head_num drawn from the head and the
        # rest from the tail -- confirm against SampleChapter.sample_list.
        sample_rid_list = SampleChapter.sample_list(
            rid_list, sample_num=sample_book_num, sep_num=10000, head_num=100)

        sample_size = 1000
        sample_count = 0
        for rid in sample_rid_list:
            rid = int(rid)

            # Fetch the aggregated chapter directory of this book.
            # NOTE(review): a new ChapterDBModule is built for every book; it
            # could be hoisted out of the loop if the object is reusable.
            chapter_db = ChapterDBModule()
            agg_dir_list = chapter_db.get_novelaggregationdir_list(rid)

            # The separator sits either 100 chapters before the end or at 80%
            # of the directory, whichever comes first, so the tail is the
            # last 100 chapters or the last 20%.
            # Floor division keeps the separator an int on Python 3 as well
            # (plain "/" would yield a float there).
            # NOTE(review): for books with fewer than 100 chapters this value
            # is negative; how sample_list treats that is not visible here.
            chapter_total = len(agg_dir_list)
            sample_seq_num = min(chapter_total - 100, chapter_total * 4 // 5)
            sample_agg_dir_list = SampleChapter.sample_list(
                agg_dir_list, sample_num=20, sep_num=sample_seq_num, head_num=10)

            # Books that produce no sample are skipped without counting.
            if len(sample_agg_dir_list) == 0:
                continue

            # Stop once sample_size books have been written.
            sample_count += 1
            if sample_count > sample_size:
                break

            self.logger.info('rid: {0}, sample_num/chapter_sum: {1}/{2}'.format(
                rid, len(sample_agg_dir_list), chapter_total))

            # Append one record per sampled chapter; only align_id is used
            # from each directory entry.
            with codecs.open('data/sample_cid', 'a', encoding='gbk') as sample_cid_file:
                for (align_id, chapter_index, chapter_url, chapter_status) in sample_agg_dir_list:
                    sample_cid_file.write(str(rid) + cur_delimiter + str(align_id) + cur_linesep)
Ejemplo n.º 2
0
    def sample_chapter_rid(self, rid):
        """
        Sample chapters of one book and dump their candidate chapters per site.

        Samples up to 20 entries from the book's aggregated chapter directory.
        For each sampled chapter the candidate chapters are collected and
        generated; when there are three or more candidates they are also
        passed through basic_chapter_filter. Chapters with no candidates are
        skipped. For every remaining chapter:

        * "<rid><cur_delimiter><align_id>" is appended to 'data/sample_cid';
        * each candidate is appended as one record to
          'data/sample/<site_id>', encoded as gbk with unencodable
          characters dropped.

        This method does not create the 'data/sample/' directory itself.

        :param rid: book id; anything accepted by int().
        :return: None
        """
        rid = int(rid)

        # Fetch the aggregated chapter directory of this book.
        chapter_db = ChapterDBModule()
        agg_dir_list = chapter_db.get_novelaggregationdir_list(rid)

        # The separator sits either 100 chapters before the end or at 80% of
        # the directory, whichever comes first, so the tail is the last 100
        # chapters or the last 20%.
        # Floor division keeps the separator an int on Python 3 as well
        # (plain "/" would yield a float there).
        # NOTE(review): for books with fewer than 100 chapters this value is
        # negative; how sample_list treats that is not visible here.
        chapter_total = len(agg_dir_list)
        sample_seq_num = min(chapter_total - 100, chapter_total * 4 // 5)
        sample_agg_dir_list = SampleChapter.sample_list(
            agg_dir_list, sample_num=20, sep_num=sample_seq_num, head_num=10)

        chapter_module = ChapterOptimizeModule()
        for (align_id, chapter_index, chapter_url, chapter_status) in sample_agg_dir_list:
            self.logger.info('rid: {0}, sample_index: {1}/{2}, align_id: {3}, chapter_status: {4}'.format(
                rid, chapter_index, len(sample_agg_dir_list), align_id, chapter_status))

            # Collect every candidate chapter for this (rid, align_id).
            total_candidate_chapter_list = chapter_module.candidate_chapter_collecion(rid, align_id)
            self.logger.info('total_candidate_chapter_length: {0}'.format(len(total_candidate_chapter_list)))

            candidate_chapter_list = chapter_module.candidate_chapter_generate(
                rid, align_id, total_candidate_chapter_list)
            if len(candidate_chapter_list) == 0:
                continue

            # The basic filter is only applied when there are at least three
            # candidates.
            if len(candidate_chapter_list) >= 3:
                candidate_chapter_list = chapter_module.basic_chapter_filter(candidate_chapter_list)

            self.logger.info('selected_candidate_chapter_length: {0}'.format(len(candidate_chapter_list)))

            # Record which chapter was sampled.
            with codecs.open('data/sample_cid', 'a', encoding='gbk') as sample_cid_file:
                sample_cid_file.write(str(rid) + cur_delimiter + str(align_id) + cur_linesep)

            # Append every candidate to the sample file of its site.
            for candidate_chapter in candidate_chapter_list:
                record_fields = [
                    str(candidate_chapter.rid),
                    str(candidate_chapter.align_id),
                    str(candidate_chapter.chapter_id),
                    str(candidate_chapter.site_id),
                    str(candidate_chapter.site_status),
                    candidate_chapter.chapter_content,
                ]
                with codecs.open('data/sample/' + str(candidate_chapter.site_id), 'a',
                                 encoding='gbk', errors='ignore') as sample_file:
                    sample_file.write(cur_delimiter.join(record_fields) + cur_linesep)