Esempio n. 1
0
 def add_chapter_to_db(self, fieldset_id, detail_page_url, args):
     """
     Build an ``ItemBankInit`` record and hand it to ``self.db_connect.add``.

     The record is created with ``is_finish = 0`` and is tagged with
     ``self.from_code`` and ``self.chapter_id``.

     :param fieldset_id: stored under ``fieldset_id``
     :param detail_page_url: stored under ``detail_page_url``
     :param args: mapping read with ``.get`` for the keys ``url``
         (stored as ``ques_url``), ``item_style_code`` and ``library_id``;
         missing keys are stored as ``None``
     :return: None
     """
     # NOTE: 'chaper_id' (sic) is the keyword the model is constructed with
     # throughout this file; do not "fix" the spelling here alone.
     item_bank_init = {
         'fieldset_id': fieldset_id,
         'detail_page_url': detail_page_url,
         'ques_url': args.get('url'),
         'from_code': self.from_code,
         'item_style_code': args.get('item_style_code'),
         'library_id': args.get('library_id'),
         'chaper_id': self.chapter_id,
         'is_finish': 0,
     }
     mutex.acquire()
     try:
         self.db_connect.add(ItemBankInit(**item_bank_init))
     finally:
         # Always release, otherwise a failed insert would leave the lock
         # held and block every later caller.
         mutex.release()
Esempio n. 2
0
 def get_item_bank_init_url(self, chapter_id, subject_code):
     """
     Yield the search-URL descriptors used to crawl the question bank.

     Queries the ``LibraryChapter`` rows whose ``id`` equals ``chapter_id``
     and builds one descriptor dict per row.  Before a row is handled, the
     row handled in the previous iteration is marked ``is_finish = 1`` and
     committed when more than one ``ItemBankInit`` row references it.

     :param chapter_id: value compared against ``LibraryChapter.id``
     :param subject_code: substituted for ``{subject}`` in the search URL
     :return: generator; every step yields the *same* accumulating dict,
         mapping chapter id -> descriptor dict (keys ``subject``,
         ``library_id``, ``chaper_id``, ``pk``, ``item_style_code``,
         ``field_code``, ``from_code``, ``url``)
     """
     re_dict = dict()
     query = self.db_session.query(LibraryChapter).filter(
         LibraryChapter.id == chapter_id)
     url_str = 'http://www.jyeoo.com/{subject}/ques/search?f=0&q={pk}&so={from_code}'
     last_data = None
     # Iterate over the matching chapters.
     for item in query:
         # The finish check runs one iteration late: it inspects the
         # previous row, so the final row is never checked inside this loop.
         if last_data:
             is_ok_count = self.db_session.query(ItemBankInit).filter(
                 ItemBankInit.chaper_id == last_data.id).count()
             if is_ok_count > 1:
                 last_data.is_finish = 1
                 mutex.acquire()
                 try:
                     self.db_session.commit()
                 finally:
                     # Release even when the commit fails so the lock is
                     # never left held.
                     mutex.release()
         last_data = item
         temp_dict = dict()
         # Subject
         temp_dict['subject'] = subject_code
         # Textbook ID
         temp_dict['library_id'] = item.library_id
         # Chapter ID
         temp_dict['chaper_id'] = item.id
         # Chapter key used directly in the search URL
         temp_dict['pk'] = item.pk
         # Question type
         temp_dict['item_style_code'] = ''
         # Question category
         temp_dict['field_code'] = ''
         # Source
         temp_dict['from_code'] = self.from_code
         # str.format ignores the extra keys; only subject/pk/from_code
         # appear in the template.
         temp_dict['url'] = url_str.format(**temp_dict)
         re_dict[item.id] = temp_dict
         yield re_dict
Esempio n. 3
0
    def library_chapter(self):
        """
        Crawl the chapter tree of the current textbook and store it.

        Steps:

        1. Load the chapter page and wait (up to 30 s) for the edition
           label; on timeout a message is emitted, a screenshot is written
           to ``./error.png`` and the method returns.
        2. Check that the edition / grade shown on the page match
           ``self.teaching_name`` / ``self.level_name``; otherwise a warning
           box is requested and the method returns.
        3. Parse the top-level ``<li>`` nodes of the chapter tree, emitting
           ``crawler_chapter_progress`` after each one.
        4. If chapters for this textbook already exist, reuse their ids for
           nodes with the same ``pk``, remap ``parent_id`` accordingly and
           delete the old rows; then insert the new rows.

        :return: None
        """
        start_url = self.get_chapter_url()
        try:
            self.driver.get(start_url)
            WebDriverWait(self.driver, 30).until(
                ec.visibility_of_element_located(
                    (By.XPATH,
                     '//div[@class="tree-head"]/span[@id="spanEdition"]')))
        except TimeoutException as e:
            self.sinOut.emit('超时!!! %s' % str(e))
            self.driver.get_screenshot_as_file('./error.png')
            return
        teaching = self.driver.find_element_by_xpath(
            '//div[@class="tree-head"]/span[@id="spanEdition"]').text
        level_name = self.driver.find_element_by_xpath(
            '//div[@class="tree-head"]/span[@id="spanGrade"]').text
        # Strip both the ASCII colon and the full-width colon (U+FF1A).
        # The original chained two identical ASCII replaces, which made the
        # second call a no-op; presumably the full-width form was intended.
        teaching = teaching.replace(':', '').replace('\uff1a', '')
        self.sinOut.emit('进行爬取章节!')
        if self.teaching_name != teaching or self.level_name != level_name:
            self.message_box.emit('警告', "没有数据!")
            return
        et = etree.HTML(self.driver.page_source)
        library_id = self.teaching
        sub_obj = et.xpath('//ul[@id="JYE_POINT_TREE_HOLDER"]/li')
        chapters_list = list()

        total = len(sub_obj)
        current_count = 0
        for item in sub_obj:
            lc_item = dict()
            # Fresh id for this top-level node; children are built against it.
            lc_item['id'] = str(uuid.uuid1())
            pk = item.attrib.get('pk')
            nm = item.attrib.get('nm')
            child = utils.recursive_get_li(lc_item['id'], library_id, item)
            lc_item['pk'] = pk
            lc_item['parent_id'] = ''
            lc_item['library_id'] = library_id
            lc_item['name'] = nm
            lc_item['child'] = child
            chapters_list.append(lc_item)
            current_count += 1
            self.crawler_chapter_progress.emit(current_count, total)
        self.sinOut.emit('正在解析入库')

        if chapters_list:
            mutex.acquire()
            try:
                chapters = self.db_connect.session.query(
                    LibraryChapter.name, LibraryChapter.id,
                    LibraryChapter.pk).filter(
                        LibraryChapter.library_id == library_id)
                new_list = utils.split_list(chapters_list)
                if chapters.count() > 0:
                    # Chapters already exist: carry their ids over to the
                    # freshly crawled nodes before replacing the rows.
                    relational_dict = dict()
                    for item in chapters:
                        for item2 in new_list:
                            if item2.get('pk') == item.pk:
                                # Remember new id -> stored id, then adopt
                                # the stored id.
                                relational_dict[item2['id']] = item.id
                                item2['id'] = item.id
                                break
                        for item3 in new_list:
                            # Point children at the adopted parent id.
                            if item3.get('parent_id') and relational_dict.get(
                                    item3['parent_id']):
                                item3['parent_id'] = relational_dict.get(
                                    item3['parent_id'])
                    chapters.delete()
                    self.db_connect.session.commit()
            finally:
                # Release even if a query/delete/commit raised, so later DB
                # users are not blocked forever.
                mutex.release()

            # Insert the new rows.
            for item in new_list:
                mutex.acquire()
                try:
                    # 'child' is only a parsing artefact and is dropped
                    # before the dict is used as constructor kwargs.
                    if 'child' in item:
                        del item['child']
                    self.db_connect.add(LibraryChapter(**item))
                finally:
                    mutex.release()
        self.sinOut.emit('章节爬取完成,重新加载查看')
Esempio n. 4
0
    def item_bank_details(self):
        """
        Crawl every detail page of the current chapter and store the result.

        For each entry returned by ``self.get_details_url()`` the page is
        loaded, the year/area, question text, metadata labels and knowledge
        points are extracted into a dict, and the dict is passed to
        ``self.item_bank_deails_and_point_db``.  ``details_progress`` is
        emitted after each page.  When ``self.chapter_id`` is falsy only an
        error message is emitted.

        :return: None
        :raises IndexError: if the fieldset ``<div>`` or its
            ``fieldtip-left`` child is missing from a page
        """

        def label_value(text):
            # Labels look like "<name>:<value>".  Normalise the full-width
            # colon (U+FF1A) to ASCII first: the original replace(":", ":")
            # was a no-op, so full-width labels raised IndexError on [1].
            parts = text.replace('\uff1a', ':').split(':')
            return parts[1] if len(parts) > 1 else None

        if not self.chapter_id:
            self.sinOut.emit('错误!章节获取失败,可能未选择章节!')
            return

        current_count = 0
        start_urls = self.get_details_url()
        for item in start_urls:
            current_count += 1
            bank_item = dict()
            self.sinOut.emit('正在获取详情页 %s' % item.get('detail_page_url'))
            self.driver.get(item.get('detail_page_url'))
            et = etree.HTML(self.driver.page_source)
            year_html = et.xpath('.//div[@class="pt1"]/a/text()')
            if year_html:
                year_area = utils.txt_wrap_by('(', ')', year_html[0])
                if not year_area:
                    # Fall back to full-width parentheses (U+FF08/U+FF09);
                    # the original fallback repeated the ASCII call verbatim
                    # and could never succeed.
                    year_area = utils.txt_wrap_by('\uff08', '\uff09',
                                                  year_html[0])
                if year_area:
                    bank_item['year_code'] = year_area.split('•')[0]
                bank_item['year_area'] = year_area
            else:
                bank_item['year_area'] = ''
            bank_item['used_times'] = ''
            bank_item['exam_times'] = ''
            fieldset_xpath = '//div[@id="{fieldset_id}"]'.format(
                fieldset_id=item.get('fieldset_id'))
            detail_data = et.xpath(fieldset_xpath)
            # Question text (stored as the str() of the text-node list).
            bank_item['context'] = str(
                detail_data[0].xpath('.//div[@class="pt1"]/text()'))
            # 'anwser' (sic) receives the whole page source.
            bank_item['anwser'] = self.driver.page_source
            fieldtip_left = detail_data[0].xpath(
                './/div[@class="fieldtip-left"]')
            # Metadata labels live in the 1st..4th <span> of fieldtip-left.
            label_spans = (
                ('record_time', './/span[1]/text()'),
                ('used_times', './/span[2]/text()'),
                ('exam_times', './/span[3]/text()'),
                ('difficult_code', './/span[4]/text()'),
            )
            for key, span_xpath in label_spans:
                texts = fieldtip_left[0].xpath(span_xpath)
                if not texts:
                    continue
                value = label_value(texts[0])
                if value is not None:
                    bank_item[key] = value
            bank_item['from_code'] = self.from_code
            bank_item['url'] = item.get('detail_page_url')
            bank_item['chaper_id'] = item.get('chaper_id')
            bank_item['library_id'] = item.get('library_id')
            bank_item['item_style_code'] = item.get('item_style_code')
            point_list = self.get_pointcard(item.get('fieldset_id'),
                                            bank_item, et)
            bank_item['points'] = point_list
            # Save to the database.
            mutex.acquire()
            try:
                self.item_bank_deails_and_point_db(bank_item)
            finally:
                # Never leave the lock held if the save fails.
                mutex.release()
            # Report crawl progress.
            self.details_progress.emit(current_count,
                                       int(self.crawl_maximum))

        return