class ItemBankPipeLine(object):
    """Pipeline that stores items produced by the ``item_bank`` spider.

    For every entry in the item's ``point`` list a ``ChaperPoint`` row and an
    ``ItemPoint`` row are added to the session; the remaining item fields are
    then added as a single ``ItemBank`` row.
    """

    # One session wrapper is created at class-definition time and shared by
    # every instance of this pipeline.
    session = DBSession()

    def process_item(self, item, spider):
        """Add the item and its points to the session, then return the item.

        :param item: mapping of scraped fields; must contain a ``point`` key
            holding an iterable of mappings.
        :param spider: the spider that produced ``item``; only items from the
            spider named ``item_bank`` are handled.
        :return: ``item`` itself, always.
        """
        if spider.name != "item_bank":
            return item

        fields = dict(**item)
        # ``point`` is removed from the copy so it is not passed to ItemBank.
        points = fields.pop('point')

        for entry in points:
            self.session.add(ChaperPoint(
                chaper_id=entry.get('chaper_id'),
                title=entry.get('title'),
                code=entry.get('point_code'),
                url=entry.get('url'),
            ))
            self.session.add(ItemPoint(
                item_id=entry.get('item_id'),
                point_code=entry.get('point_code'),
            ))

        try:
            self.session.add(ItemBank(**fields))
        except Exception as e:
            # Best effort: report the failure and keep the crawl running.
            print(e,fields)
        return item
class LibraryChapterPipeLine(object):
    """Pipeline that stores items produced by the ``library_chapter`` spider."""

    # One session wrapper is created at class-definition time and shared by
    # every instance of this pipeline.
    session = DBSession()

    def process_item(self, item, spider):
        """Add the item as a ``LibraryChapter`` row and return it.

        :param item: mapping whose keys are passed to ``LibraryChapter`` as
            keyword arguments.
        :param spider: the spider that produced ``item``; items from any
            spider other than ``library_chapter`` pass through untouched.
        :return: ``item`` itself, always.
        """
        if spider.name != "library_chapter":
            return item

        chapter_row = LibraryChapter(**item)
        self.session.add(chapter_row)
        return item
def get_item_bank_init_url():
    """
    Get the list of item-bank URLs used for crawling data.

    NOTE(review): the body visible here only creates an empty dict and a
    ``DBSession``; nothing is queried or returned, so the function may be
    truncated -- confirm against the full file.

    :return:
    """
    # Imported locally rather than at module level.
    from jyeoo.mysql_model import DBSession, LibraryChapter, LibraryEntry, ItemStyle, ItemBankInit
    # re_list = list()
    re_dict = dict()
    session = DBSession()
class ChapterPointPipeLine(object):
    """Pipeline that fills in ``ChaperPoint.content`` for ``chapter_point`` items."""

    # One session wrapper is created at class-definition time and shared by
    # every instance of this pipeline.
    session = DBSession()

    def process_item(self, item, spider):
        """Write the scraped content onto the matching ``ChaperPoint`` row.

        :param item: mapping providing ``id`` (the row to update) and
            ``content`` (the value to store).
        :param spider: the spider that produced ``item``; items from any
            spider other than ``chapter_point`` pass through untouched.
        :return: ``item`` itself, always.
        """
        if spider.name != "chapter_point":
            return item

        db = self.session.session
        matching = db.query(ChaperPoint).filter(ChaperPoint.id == item.get('id'))
        # ``one()`` is used, so exactly one matching row is expected.
        row = matching.one()
        row.content = item.get('content')
        db.commit()
        return item
def get_item_bank_url():
    """Yield the detail-page URL of every unfinished ``ItemBankInit`` row.

    A row is marked finished (``is_finish = 1``) and committed only once the
    consumer asks for the *next* URL, i.e. after it has taken the previous
    one. When the query is exhausted the last yielded row is marked finished
    in the same way.

    :return: generator of ``detail_page_url`` values.
    """
    from jyeoo.mysql_model import DBSession, ItemBankInit

    db = DBSession()
    pending = db.session.query(ItemBankInit).filter(ItemBankInit.is_finish == 0)

    previous = None
    for row in pending:
        if previous is not None:
            # The consumer came back for another URL, so the previous one
            # has been handed over; record it as done.
            previous.is_finish = 1
            db.session.commit()
        previous = row
        yield row.detail_page_url

    # Fix: the original never marked the final row, so the last (or only)
    # unfinished row stayed at is_finish == 0 and was yielded again on every
    # later run. Reaching this point means the consumer requested a URL past
    # the last one, which is the same trigger used for all earlier rows.
    if previous is not None:
        previous.is_finish = 1
        db.session.commit()
def get_chapter_url():
    """Build the question-search URL for every ``LibraryEntry`` row.

    :return: dict mapping each entry's ``id`` to its search URL.
    """
    from jyeoo.mysql_model import DBSession, LibraryEntry

    url_template = 'http://www.jyeoo.com/{subject}/ques/search?f=0&q={id}'

    def subject_segment(entry):
        # Entries whose level code is greater than 1 get the level code
        # appended to the subject code; others use the subject code alone.
        if int(entry.level_code) > 1:
            return entry.subject_code + entry.level_code
        return entry.subject_code

    db = DBSession()
    entries = db.session.query(LibraryEntry).all()
    return {
        entry.id: url_template.format(subject=subject_segment(entry), id=entry.id)
        for entry in entries
    }
class ItemBankInitPipeLine(object):
    """Pipeline that stores ``item_bank_init`` items, skipping known fieldsets."""

    # One session wrapper is created at class-definition time and shared by
    # every instance of this pipeline.
    session = DBSession()

    def process_item(self, item, spider):
        """Add the item as an ``ItemBankInit`` row unless its fieldset exists.

        :param item: mapping whose keys are passed to ``ItemBankInit`` as
            keyword arguments; ``fieldset_id`` is used for the duplicate check.
        :param spider: the spider that produced ``item``; items from any
            spider other than ``item_bank_init`` pass through untouched.
        :return: ``item`` itself, always.
        """
        if spider.name != "item_bank_init":
            return item

        wanted_id = item.get('fieldset_id')
        existing = (
            self.session.session.query(ItemBankInit)
            .filter(ItemBankInit.fieldset_id == wanted_id)
            .count()
        )
        if existing == 0:
            try:
                self.session.add(ItemBankInit(**item))
            except Exception as e:
                # Best effort: report the failure and keep the crawl running.
                print(e)
        return item
def get_chapter_point_url():
    """
    Get the list of knowledge-point URLs.

    Only ``ChaperPoint`` rows whose ``content`` is NULL are included.

    :return: list of dicts, each with the keys ``url`` and ``id``.
    """
    from jyeoo.mysql_model import DBSession, ChaperPoint

    db = DBSession()
    without_content = db.session.query(ChaperPoint).filter(ChaperPoint.content.is_(None))
    return [{'url': row.url, 'id': row.id} for row in without_content]
def get_valid_cookie(re_type=DICT):
    """
    Get a valid cookie.

    The cookie of the first ``CookieInfo`` row with ``is_valid == 1`` is used.

    :param re_type: selects the return format by the type of the value
        passed: a dict instance gives a dict, a str instance gives the raw
        cookie string, a list instance gives a list.
    :return: the cookie in the requested format, or ``None`` when
        ``re_type`` is none of the above.
    """
    from jyeoo.mysql_model import DBSession, CookieInfo

    db = DBSession()
    valid_cookies = db.session.query(CookieInfo).filter(CookieInfo.is_valid == 1)
    # Indexing with [0] means an empty result raises instead of returning None.
    raw_cookie = valid_cookies[0].cookie

    if isinstance(re_type, dict):
        result = cookie_str_to_dict(raw_cookie)
    elif isinstance(re_type, str):
        result = raw_cookie
    elif isinstance(re_type, list):
        result = cookie_str_to_list(raw_cookie)
    else:
        result = None
    return result