def save_shop_cate(session, shop_prof_dir):
    """Extract category tags from shop profile files and store them.

    The ``sid`` of every existing ``ShopTags`` row is collected first and
    handed to ``read_file`` as the already-parsed set (presumably so those
    shops are skipped -- confirm against ``read_file``).  For each
    ``(sid, content)`` pair that ``read_file`` yields, the breadcrumb markup
    is pulled out with ``_cate_progs`` and every distinct label found by
    ``_cate_field_progs`` (except the separator) becomes one
    ``ShopTags(sid, tag)`` row.  All rows are added and committed together
    at the end.

    :param session: database session; ``query``, ``add_all`` and ``commit``
        are called on it.
    :param shop_prof_dir: directory passed through to ``read_file``.
    :return: None
    """
    parsed = {row.sid for row in session.query(ShopTags).distinct().all()}
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print('{} shop category parsed'.format(len(parsed)))

    data = []
    # The callback drops the last five characters of a file name
    # (presumably an extension such as ".html" -- verify against read_file).
    for sid, c in read_file(shop_prof_dir, parsed, lambda fn: fn[:-5]):
        # Pass the shop id for this file; the previous code passed the
        # builtin ``id`` function here by mistake.
        # ``or ''`` guards against a falsy result from ``parse`` so that
        # ``findall`` always receives a string (assumes ``parse`` may return
        # None when nothing matches -- TODO confirm).
        text = parse(_cate_progs, c, sid, 'shop cate') or ''
        tags = set(_cate_field_progs.findall(text)) - {'»'}
        data.extend(ShopTags(sid, tag) for tag in tags)

    session.add_all(data)
    session.commit()
def start(self, urls):
    """
    Run the crawl loop.

    The URL manager is seeded with ``urls``.  While it reports a pending
    URL, the next one is taken, downloaded and parsed; links found on the
    page are queued back into the manager and extracted results are handed
    to the processor.  A download that returns ``None`` is counted but not
    parsed.

    :param urls: seed URLs
    :return: number of URLs taken from the manager
    """
    self._manager.append_new_urls(urls)
    count = 0
    while self._manager.has_new_url():
        count += 1
        url = self._manager.get_new_url()
        print('开始下载第{:03}个URL:{}'.format(count, url))
        page = download(url)
        if page is not None:
            links, results = parse(page, url)
            # Only touch the manager / processor when there is something
            # to hand over.
            if len(links):
                self._manager.append_new_urls(links)
            if len(results):
                self._processor.process(results)
    return count
def addr(c, sid):
    """Run ``parse`` with the ``addr_ptns`` patterns, labelled 'shop addr'.

    :param c: content to search; passed straight to ``parse``
    :param sid: shop id; forwarded to ``parse`` unchanged
    :return: whatever ``parse`` returns for these patterns
    """
    result = parse(addr_ptns, c, sid, 'shop addr')
    return result
def name(c, sid):
    """Run ``parse`` with the ``name_ptns`` patterns, labelled 'shop name'.

    :param c: content to search; passed straight to ``parse``
    :param sid: shop id; forwarded to ``parse`` unchanged
    :return: whatever ``parse`` returns for these patterns
    """
    result = parse(name_ptns, c, sid, 'shop name')
    return result
def star(c, sid):
    """Run ``parse`` with ``star_ptns`` (label 'shop star') and return an int.

    ``parse`` is called with ``default=0``; its result is converted with
    ``int``.

    :param c: content to search; passed straight to ``parse``
    :param sid: shop id; forwarded to ``parse`` unchanged
    :return: ``int`` of the value ``parse`` returned
    """
    raw = parse(star_ptns, c, sid, 'shop star', default=0)
    return int(raw)
re.compile(r'class="user-info">\s*<a.*?href="/member/(\d+)".*?>(.*?)</a>', re.DOTALL), re.compile(r'<p class="name">\s*<a.*?href="/member/(\d+)".*?>(.*?)</a>', re.DOTALL), ] _rev_star_ptns = [ re.compile(r'-str(\d+)'), re.compile(r'-star(\d+)'), ] _cate_progs = [ re.compile(r'<div class="breadcrumb">(.*?)</div>', re.DOTALL), ] _cate_field_progs = re.compile(r'>\s*([^<>]+?)\s*(?:</a>|</span>)', re.DOTALL) rev_entry = lambda c, id: parse(_rev_entry_ptns, c, id, 'review entry') or '' rev_rec = lambda c, id: parse(_rev_recommend_ptns, c, id, 'review recommend', log_not_match=False) or '' rev_user = lambda c, id: parse(_rev_user_ptns, c, id, 'review user') rev_star = lambda c, id: int(parse(_rev_star_ptns, c, id, 'review star') or 0) def star(c, sid): return int(parse(star_ptns, c, sid, 'shop star', default=0)) def name(c, sid): return parse(name_ptns, c, sid, 'shop name') def addr(c, sid): return parse(addr_ptns, c, sid, 'shop addr')