Exemple #1
0
 def refresh_level(self):
     """
     Reload the school-level combo box from the ``ItemStyle`` table.

     Clears ``comboBox_level`` and adds one entry per row of the query
     (grouped by level name, ordered by level code descending). The level
     name is the display text and the level code is the item data.

     :return: None
     """
     self.comboBox_level.clear()
     mutex.acquire()
     try:
         # A Query object is lazy: no SQL is emitted until rows are fetched.
         # Fetch with .all() here so the database access really happens
         # while the lock is held, not later in the loop below.
         levels = self.db_connect.session.query(
             ItemStyle.level_name, ItemStyle.level_code).group_by(
                 ItemStyle.level_name).order_by(
                     ItemStyle.level_code.desc()).all()
     finally:
         # Release even when the query fails, otherwise every other user of
         # the lock would block forever.
         mutex.release()
     for level_name, level_code in levels:
         self.comboBox_level.addItem(level_name, level_code)
 def add_chapter_to_db(self, fieldset_id, detail_page_url, args):
     """
     Store one ``ItemBankInit`` row for the current chapter.

     :param fieldset_id: value stored in the ``fieldset_id`` field
     :param detail_page_url: value stored in the ``detail_page_url`` field
     :param args: mapping read with ``.get()`` for the keys ``'url'``,
         ``'item_style_code'`` and ``'library_id'``
     :return: None
     """
     item_bank_init = {
         'fieldset_id': fieldset_id,
         'detail_page_url': detail_page_url,
         'ques_url': args.get('url'),
         'from_code': self.from_code,
         'item_style_code': args.get('item_style_code'),
         'library_id': args.get('library_id'),
         # The spelling "chaper_id" is kept on purpose: the key is passed as
         # a keyword argument to ItemBankInit.
         'chaper_id': self.chapter_id,
         # New rows always start as not finished.
         'is_finish': 0,
     }
     mutex.acquire()
     try:
         self.db_connect.add(ItemBankInit(**item_bank_init))
     finally:
         # Release even when the insert fails so the lock is never leaked.
         mutex.release()
Exemple #3
0
 def refresh_grade(self):
     """
     Reload the grade combo box for the currently selected school level.

     Clears ``comboBox_grade`` and adds one entry per ``LevelGradeRef`` row
     whose ``level_code`` equals the current data of ``comboBox_level``.
     The grade name is the display text and the grade code is the item data.

     :return: None
     """
     self.comboBox_grade.clear()
     level_code = self.comboBox_level.currentData()
     mutex.acquire()
     try:
         # Queries are lazy; .all() forces the fetch to run under the lock.
         grades = self.db_connect.session.query(
             LevelGradeRef.grade_name, LevelGradeRef.grade_code).filter(
                 LevelGradeRef.level_code == level_code).all()
     finally:
         # Never leave the lock held when the query raises.
         mutex.release()
     for grade_name, grade_code in grades:
         self.comboBox_grade.addItem(grade_name, grade_code)
Exemple #4
0
 def refresh_subject(self):
     """
     Reload the subject combo box for the currently selected school level.

     Clears ``comboBox_subject`` and adds one entry per ``LevelSubjectsRef``
     row of the selected level. The subject name is the display text. The
     item data is the subject code, with the level value appended unless
     the level is 1.

     :return: None
     """
     self.comboBox_subject.clear()
     level_data = self.comboBox_level.currentData()
     mutex.acquire()
     try:
         # Queries are lazy; .all() forces the fetch to run under the lock.
         subjects = self.db_connect.session.query(
             LevelSubjectsRef.subject_name,
             LevelSubjectsRef.subject_code).filter(
                 LevelSubjectsRef.level_code == level_data).all()
     finally:
         # Never leave the lock held when the query raises.
         mutex.release()
     if not subjects:
         # Nothing to add. Returning here also keeps int(level_data) from
         # being evaluated when no level is selected, as before.
         return
     # The suffix is the same for every row, so compute it once.
     level_suffix = '' if int(level_data) == 1 else level_data
     for subject_name, subject_code in subjects:
         self.comboBox_subject.addItem(subject_name,
                                       subject_code + level_suffix)
Exemple #5
0
 def refresh_from(self):
     """
     Reload the source combo box for the currently selected school level.

     Clears ``comboBox_from``, adds the catch-all entry ``'全部'`` with an
     empty string as data, then adds one entry per ``ItemFrom`` row of the
     selected level (source name as text, source code as data).

     :return: None
     """
     self.comboBox_from.clear()
     level_code = self.comboBox_level.currentData()
     mutex.acquire()
     try:
         # Queries are lazy; .all() forces the fetch to run under the lock.
         sources = self.db_connect.session.query(
             ItemFrom.from_name,
             ItemFrom.from_code).filter(
                 ItemFrom.level_code == level_code).all()
     finally:
         # Never leave the lock held when the query raises.
         mutex.release()
     # The first entry stands for "all sources" and is the default.
     self.comboBox_from.addItem('全部', '')
     for from_name, from_code in sources:
         self.comboBox_from.addItem(from_name, from_code)
Exemple #6
0
 def refresh_teaching(self):
     """
     Reload the textbook combo box for the selected grade and subject.

     Clears ``comboBox_teaching`` and adds one entry per ``LibraryEntry``
     row matching the selected grade code and subject code (style name as
     text, row id as data).

     :return: None
     """
     self.comboBox_teaching.clear()
     grade = self.comboBox_grade.currentData()
     subject = self.comboBox_subject.currentData()
     # A trailing digit on the subject data is dropped before querying
     # (presumably the level suffix added when the subject list is filled;
     # confirm). The truthiness check avoids indexing None or '' when no
     # subject is selected.
     if subject and subject[-1].isdigit():
         subject = subject[:-1]
     mutex.acquire()
     try:
         # Queries are lazy; .all() forces the fetch to run under the lock.
         textbooks = self.db_connect.session.query(
             LibraryEntry.style_name,
             LibraryEntry.id).filter(
                 LibraryEntry.grade_code == grade,
                 LibraryEntry.subject_code == subject).all()
     finally:
         # Never leave the lock held when the query raises.
         mutex.release()
     for style_name, library_id in textbooks:
         self.comboBox_teaching.addItem(style_name, library_id)
Exemple #7
0
    def refresh_chapter(self):
        """
        Reload the chapter combo box and chapter tree for the selected textbook.

        Every ``LibraryChapter`` row of the selected library is added to
        ``comboBox_chapter`` (name as text, id as data) and gets a tree item
        whose columns 0, 1 and 2 hold the name, id and pk. Rows with an
        empty ``parent_id`` become top-level items; the others are attached
        to their parent afterwards. If a parent is missing, the user is
        asked whether to crawl the chapters again and the linking stops.
        Finally the first top-level item is selected.

        :return: None
        """
        self.comboBox_chapter.clear()
        self.treeWidget_chapter.clear()
        self.treeWidget_chapter.setColumnCount(1)
        library_id = self.comboBox_teaching.currentData()
        mutex.acquire()
        try:
            # Fetch all rows up front so the lock only covers the database
            # access and not the widget construction below.
            chapters = self.db_connect.session.query(
                LibraryChapter.name, LibraryChapter.id,
                LibraryChapter.parent_id, LibraryChapter.pk).filter(
                    LibraryChapter.library_id == library_id).all()
        finally:
            # Never leave the lock held when the query raises.
            mutex.release()

        # chapter id -> {'item': tree item, 'parent_id': parent chapter id}
        tree_dict = dict()
        for name, chapter_id, parent_id, pk in chapters:
            self.comboBox_chapter.addItem(name, chapter_id)
            if '' == parent_id:
                # Root chapter: created directly under the tree widget.
                tree_item = QTreeWidgetItem(self.treeWidget_chapter)
            else:
                # Child chapter: created detached, linked in the pass below
                # once every item exists.
                tree_item = QTreeWidgetItem()
            tree_item.setText(0, name)
            tree_item.setText(1, chapter_id)
            tree_item.setText(2, pk)
            tree_dict[chapter_id] = {'item': tree_item, 'parent_id': parent_id}

        for node in tree_dict.values():
            parent_id = node.get('parent_id')
            if parent_id:
                if not tree_dict.get(parent_id):
                    # The stored tree is inconsistent; offer to re-crawl.
                    result = self.message_box_choice('章节获取错误', '请重新获取此章节')
                    if result == QMessageBox.Ok:
                        # Crawl the chapters again.
                        self.start_chapter()
                    break
                tree_dict[parent_id]['item'].addChild(node.get('item'))
        # Select the first top-level item by default.
        item = self.treeWidget_chapter.topLevelItem(0)
        self.treeWidget_chapter.setCurrentItem(item)
 def get_item_bank_init_url(self, chapter_id, subject_code):
     """
     Generate the question-bank search URLs used for crawling a chapter.

     For every ``LibraryChapter`` row whose id equals ``chapter_id`` a
     parameter dict (including the formatted ``'url'``) is stored in a
     shared result dict under the chapter id, and that result dict is
     yielded. The same dict object is yielded every time and keeps growing.

     Before a row is processed, the previously processed row is marked
     ``is_finish = 1`` and committed when more than one ``ItemBankInit``
     row already exists for it.

     :param chapter_id: id of the chapter to build URLs for
     :param subject_code: value substituted for ``{subject}`` in the URL
     :return: generator yielding the accumulated
         ``{chapter id: parameter dict}`` mapping
     """
     re_dict = dict()
     query = self.db_session.query(LibraryChapter).filter(
         LibraryChapter.id == chapter_id)
     url_str = 'http://www.jyeoo.com/{subject}/ques/search?f=0&q={pk}&so={from_code}'
     last_data = None
     # Walk over the matching chapters.
     for item in query:
         if last_data:
             is_ok_count = self.db_session.query(ItemBankInit).filter(
                 ItemBankInit.chaper_id == last_data.id).count()
             if is_ok_count > 1:
                 last_data.is_finish = 1
                 mutex.acquire()
                 try:
                     self.db_session.commit()
                 finally:
                     # Release even when the commit fails so the lock is
                     # never leaked.
                     mutex.release()
         last_data = item
         temp_dict = {
             # Subject
             'subject': subject_code,
             # Textbook id
             'library_id': item.library_id,
             # Chapter id (key spelling kept as used elsewhere)
             'chaper_id': item.id,
             # Chapter key used in the search URL
             'pk': item.pk,
             # Question type (left empty here)
             'item_style_code': '',
             # Question category (left empty here)
             'field_code': '',
             # Source
             'from_code': self.from_code,
         }
         # Only {subject}, {pk} and {from_code} appear in the template; the
         # other keys are ignored by format().
         temp_dict['url'] = url_str.format(**temp_dict)
         re_dict[item.id] = temp_dict
         yield re_dict
    def library_chapter(self):
        """
        Crawl the chapter tree of the selected textbook and store it.

        Steps:

        1. Open the chapter page and wait up to 30 seconds for the edition
           label. On timeout a message is emitted, a screenshot is saved to
           ``./error.png`` and the method returns.
        2. Compare the edition and grade shown on the page with
           ``self.teaching_name`` / ``self.level_name``; on mismatch a
           warning is emitted and the method returns.
        3. Build one dict per top-level ``<li>`` of the chapter tree, with
           its children collected by ``utils.recursive_get_li``.
        4. If chapters already exist for this library, reuse their ids for
           entries with the same ``pk``, rewrite the affected parent ids,
           and delete the old rows. Then insert every entry.

        :return: None
        """
        edition_xpath = '//div[@class="tree-head"]/span[@id="spanEdition"]'
        grade_xpath = '//div[@class="tree-head"]/span[@id="spanGrade"]'

        start_url = self.get_chapter_url()
        try:
            self.driver.get(start_url)
            WebDriverWait(self.driver, 30).until(
                ec.visibility_of_element_located((By.XPATH, edition_xpath)))
        except TimeoutException as e:
            self.sinOut.emit('超时!!! %s' % str(e))
            self.driver.get_screenshot_as_file('./error.png')
            return
        # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
        # was removed in Selenium 4.
        teaching = self.driver.find_element(By.XPATH, edition_xpath).text
        level_name = self.driver.find_element(By.XPATH, grade_xpath).text
        # Strip both the ASCII and the full-width colon from the label. The
        # second replace used to repeat the ASCII colon, which did nothing.
        teaching = teaching.replace(':', '').replace(':', '')
        self.sinOut.emit('进行爬取章节!')
        if self.teaching_name != teaching or self.level_name != level_name:
            self.message_box.emit('警告', "没有数据!")
            return
        et = etree.HTML(self.driver.page_source)
        library_id = self.teaching
        sub_obj = et.xpath('//ul[@id="JYE_POINT_TREE_HOLDER"]/li')
        chapters_list = list()

        total = len(sub_obj)
        for current_count, node in enumerate(sub_obj, 1):
            chapter_uuid = str(uuid.uuid1())
            child = utils.recursive_get_li(chapter_uuid, library_id, node)
            chapters_list.append({
                'id': chapter_uuid,
                'pk': node.attrib.get('pk'),
                # Top-level chapters have no parent.
                'parent_id': '',
                'library_id': library_id,
                'name': node.attrib.get('nm'),
                'child': child,
            })
            self.crawler_chapter_progress.emit(current_count, total)
        self.sinOut.emit('正在解析入库')

        if chapters_list:
            mutex.acquire()
            try:
                chapters = self.db_connect.session.query(
                    LibraryChapter.name, LibraryChapter.id,
                    LibraryChapter.pk).filter(
                        LibraryChapter.library_id == library_id)
                # NOTE(review): split_list presumably flattens the nested
                # 'child' lists into one flat list of dicts; confirm in utils.
                new_list = utils.split_list(chapters_list)
                if chapters.count() > 0:
                    # Chapters already exist: carry their ids over.
                    # Maps freshly generated id -> id of the existing row.
                    relational_dict = dict()
                    for existing in chapters:
                        for entry in new_list:
                            if entry.get('pk') == existing.pk:
                                relational_dict[entry['id']] = existing.id
                                entry['id'] = existing.id
                                break
                        # Point children at the ids that were just reused.
                        for entry in new_list:
                            if entry.get('parent_id') and relational_dict.get(
                                    entry['parent_id']):
                                entry['parent_id'] = relational_dict.get(
                                    entry['parent_id'])
                    chapters.delete()
                    self.db_connect.session.commit()
            finally:
                # Release even when a query or the commit fails so the lock
                # is never leaked.
                mutex.release()

            # Insert the new rows.
            for entry in new_list:
                mutex.acquire()
                try:
                    # 'child' is crawl-time structure and is removed before
                    # the dict is passed to the model constructor.
                    if 'child' in entry:
                        del entry['child']
                    self.db_connect.add(LibraryChapter(**entry))
                finally:
                    mutex.release()
        self.sinOut.emit('章节爬取完成,重新加载查看')
    def item_bank_details(self):
        """
        Crawl the detail page of every pending question and store the result.

        For each entry returned by ``self.get_details_url()`` the detail
        page is loaded and parsed, a ``bank_item`` dict is filled (year
        information, question text, page source, usage figures, knowledge
        points), and the dict is handed to
        ``self.item_bank_deails_and_point_db``. Progress is reported through
        ``details_progress`` after each entry.

        If no chapter is selected an error message is emitted and nothing
        is crawled. Entries whose question container is missing from the
        page are reported and skipped instead of raising ``IndexError``.

        :return: None
        """
        if not self.chapter_id:
            self.sinOut.emit('错误!章节获取失败,可能未选择章节!')
            return

        def _tip_value(tip, position):
            """Return the text after the colon of the n-th <span>, or None."""
            if tip is None:
                return None
            nodes = tip.xpath('.//span[%d]/text()' % position)
            if not nodes:
                return None
            # Normalise the full-width colon first. The replace call used to
            # map the ASCII colon onto itself, so labels written with a
            # full-width colon made split(':')[1] raise IndexError.
            parts = nodes[0].replace(':', ':').split(':')
            return parts[1] if len(parts) > 1 else None

        for current_count, item in enumerate(self.get_details_url(), 1):
            detail_page_url = item.get('detail_page_url')
            bank_item = dict()
            self.sinOut.emit('正在获取详情页 %s' % detail_page_url)
            self.driver.get(detail_page_url)
            # Read the page once so the parsed tree and the stored source
            # are guaranteed to be the same document.
            page_source = self.driver.page_source
            et = etree.HTML(page_source)

            year_html = et.xpath('.//div[@class="pt1"]/a/text()')
            if year_html:
                year_area = utils.txt_wrap_by('(', ')', year_html[0])
                if not year_area:
                    # Fall back to full-width parentheses. The fallback used
                    # to repeat the ASCII lookup and could never succeed.
                    year_area = utils.txt_wrap_by('(', ')', year_html[0])
                if year_area:
                    bank_item['year_code'] = year_area.split('•')[0]
                bank_item['year_area'] = year_area
            else:
                bank_item['year_area'] = ''
            bank_item['used_times'] = ''
            bank_item['exam_times'] = ''

            fieldset_xpath = '//div[@id="{fieldset_id}"]'.format(
                fieldset_id=item.get('fieldset_id'))
            detail_data = et.xpath(fieldset_xpath)
            if not detail_data:
                # The question container is missing, so there is nothing to
                # parse. Report it and move on.
                self.sinOut.emit('未找到题目内容,已跳过 %s' % detail_page_url)
                self.details_progress.emit(current_count,
                                           int(self.crawl_maximum))
                continue
            fieldset = detail_data[0]

            # Question text (stored as the str() of the list of text nodes).
            bank_item['context'] = str(
                fieldset.xpath('.//div[@class="pt1"]/text()'))
            bank_item['anwser'] = page_source

            fieldtip_left = fieldset.xpath('.//div[@class="fieldtip-left"]')
            # Compare with "is None" later: an lxml element without children
            # is falsy, so truthiness must not be used on the element itself.
            tip = fieldtip_left[0] if fieldtip_left else None
            span_fields = (
                ('record_time', 1),
                ('used_times', 2),
                ('exam_times', 3),
                ('difficult_code', 4),
            )
            for field_name, position in span_fields:
                value = _tip_value(tip, position)
                if value is not None:
                    bank_item[field_name] = value

            bank_item['from_code'] = self.from_code
            bank_item['url'] = detail_page_url
            bank_item['chaper_id'] = item.get('chaper_id')
            bank_item['library_id'] = item.get('library_id')
            bank_item['item_style_code'] = item.get('item_style_code')
            bank_item['points'] = self.get_pointcard(item.get('fieldset_id'),
                                                     bank_item, et)
            # Store in the database.
            mutex.acquire()
            try:
                self.item_bank_deails_and_point_db(bank_item)
            finally:
                # Release even when storing fails so the lock is never
                # leaked.
                mutex.release()
            # Report crawl progress.
            self.details_progress.emit(current_count,
                                       int(self.crawl_maximum))