Exemple #1
0
def save_shop_cate(session, shop_prof_dir):
    """Parse category tags out of shop profile files and persist them.

    The sids already present in the ``ShopTags`` table are collected first
    and handed to ``read_file`` (presumably so those shops are skipped --
    confirm against ``read_file``).  For every ``(sid, content)`` pair that
    ``read_file`` yields, the breadcrumb section is extracted and one
    ``ShopTags(sid, tag)`` row is created per distinct tag.  All rows are
    added and committed in a single transaction at the end.

    :param session: database session exposing ``query``, ``add_all`` and
        ``commit``
    :param shop_prof_dir: directory passed through to ``read_file``
    """
    parsed = {row.sid for row in session.query(ShopTags).distinct().all()}
    # Parenthesised single-argument form prints identically on Python 2 and 3.
    print('{} shop category parsed'.format(len(parsed)))

    data = []

    # The name mapper drops the last five characters of each file name
    # (presumably a '.html' suffix -- TODO confirm).
    for sid, c in read_file(shop_prof_dir, parsed, lambda fn: fn[:-5]):
        # Pass the shop id, not the builtin ``id`` function, so parse gets
        # the real identifier.  ``or ''`` guards against a falsy result
        # (e.g. no breadcrumb matched), which would make ``findall`` raise.
        text = parse(_cate_progs, c, sid, 'shop cate') or ''
        # The '»' breadcrumb separator is not a category tag.
        tags = set(_cate_field_progs.findall(text)) - {'»'}
        data.extend(ShopTags(sid, tag) for tag in tags)

    session.add_all(data)
    session.commit()
Exemple #2
0
 def start(self, urls):
     """
     Run the crawl loop, seeded with the given URLs.

     :param urls: seed URLs handed to the URL manager
     :return: number of URLs taken from the manager (a URL whose download
              returned None is still counted)
     """
     self._manager.append_new_urls(urls)
     fetched = 0
     while self._manager.has_new_url():
         fetched += 1
         url = self._manager.get_new_url()
         print('开始下载第{:03}个URL:{}'.format(fetched, url))
         page = download(url)
         if page is not None:
             # Only a successful download is parsed; newly found links go
             # back to the manager and results go to the processor.
             found_links, items = parse(page, url)
             if len(found_links) != 0:
                 self._manager.append_new_urls(found_links)
             if len(items) != 0:
                 self._processor.process(items)
     return fetched
Exemple #3
0
def addr(c, sid):
    """Run ``parse`` over ``c`` with the ``addr_ptns`` patterns.

    ``sid`` and the label 'shop addr' are forwarded to ``parse``
    (presumably for its diagnostics -- confirm against ``parse``).
    """
    address = parse(addr_ptns, c, sid, 'shop addr')
    return address
Exemple #4
0
def name(c, sid):
    """Run ``parse`` over ``c`` with the ``name_ptns`` patterns.

    ``sid`` and the label 'shop name' are forwarded to ``parse``
    (presumably for its diagnostics -- confirm against ``parse``).
    """
    shop_name = parse(name_ptns, c, sid, 'shop name')
    return shop_name
Exemple #5
0
def star(c, sid):
    """Return the ``star_ptns`` parse result for ``c`` converted to ``int``.

    ``parse`` is called with ``default=0`` and the label 'shop star'.
    """
    raw = parse(star_ptns, c, sid, 'shop star', default=0)
    return int(raw)
Exemple #6
0
    re.compile(r'class="user-info">\s*<a.*?href="/member/(\d+)".*?>(.*?)</a>', re.DOTALL),
    re.compile(r'<p class="name">\s*<a.*?href="/member/(\d+)".*?>(.*?)</a>', re.DOTALL),
    ]

# Patterns for a review's star value: each captures the digits that follow
# a '-str' or '-star' marker.
_rev_star_ptns = list(map(re.compile, (
    r'-str(\d+)',
    r'-star(\d+)',
)))

# Captures everything inside the breadcrumb <div>; DOTALL lets the capture
# span line breaks.
_cate_progs = [re.compile(r'<div class="breadcrumb">(.*?)</div>', flags=re.S)]
# Captures each text fragment that sits directly before a closing </a> or
# </span>, leaving surrounding whitespace outside the group.
_cate_field_progs = re.compile(
    r'>\s*([^<>]+?)\s*(?:</a>|</span>)',
    flags=re.S,
)


# NOTE: the second parameter is named ``id`` (shadowing the builtin) to keep
# the existing call signature unchanged for keyword callers.
def rev_entry(c, id):
    """Parse ``c`` with ``_rev_entry_ptns``; a falsy result becomes ''."""
    return parse(_rev_entry_ptns, c, id, 'review entry') or ''


def rev_rec(c, id):
    """Parse ``c`` with ``_rev_recommend_ptns`` (``log_not_match=False``); a falsy result becomes ''."""
    return parse(_rev_recommend_ptns, c, id, 'review recommend', log_not_match=False) or ''


def rev_user(c, id):
    """Parse ``c`` with ``_rev_user_ptns`` and return the result unchanged."""
    return parse(_rev_user_ptns, c, id, 'review user')


def rev_star(c, id):
    """Parse ``c`` with ``_rev_star_ptns`` and return it as ``int``; a falsy result becomes 0."""
    return int(parse(_rev_star_ptns, c, id, 'review star') or 0)


def star(c, sid):
    """Return the ``star_ptns`` parse result for ``c`` converted to ``int``.

    ``parse`` is called with ``default=0`` and the label 'shop star'.
    """
    raw = parse(star_ptns, c, sid, 'shop star', default=0)
    return int(raw)


def name(c, sid):
    """Run ``parse`` over ``c`` with the ``name_ptns`` patterns.

    ``sid`` and the label 'shop name' are forwarded to ``parse``
    (presumably for its diagnostics -- confirm against ``parse``).
    """
    shop_name = parse(name_ptns, c, sid, 'shop name')
    return shop_name


def addr(c, sid):
    """Run ``parse`` over ``c`` with the ``addr_ptns`` patterns.

    ``sid`` and the label 'shop addr' are forwarded to ``parse``
    (presumably for its diagnostics -- confirm against ``parse``).
    """
    address = parse(addr_ptns, c, sid, 'shop addr')
    return address