def get_fanyi_content(fanyi_id):
    url = 'https://so.gushiwen.org/shiwen2017/ajaxfanyi.aspx'
    params = {'id': fanyi_id}
    time.sleep(10)
    client = HttpClient()
    page_content = client.get(url, params=params)
    # page_content = open("fanyi.html").read()
    fanyi = ''
    if page_content:
        page_content = unicode(page_content, 'utf-8')
        dom = fromstring(page_content)
        elements = dom.xpath("//div[@class='contyishang']/p")
        for element in elements:
            for sub in element:
                # Reset per node so text from a previous node is never reused
                # and unknown tags are skipped safely.
                tmp = None
                tag = sub.tag
                if tag == 'strong':
                    continue
                elif tag == 'a':
                    fanyi = fanyi[:-1]
                    tmp = sub.text
                elif tag == 'br':
                    tmp = sub.tail
                    if tmp is None:
                        continue
                    tmp += '\n'
                if tmp:
                    tmp = tmp.replace(u"▲", "")
                    fanyi += tmp
    else:
        LOG.info("down page error: %s, params: %s", url, params)
    return fanyi
def update_main():
    page = 20
    count = 2000
    more_data = True
    pool = ThreadPool(16)
    goods_obj = TbkGoods()
    last_id = ''
    while more_data:
        # more_data = False
        if last_id:
            cond = {'_id': {'$gt': last_id}}
        else:
            cond = {}
        goods_list = goods_obj.find_goods_by_cond(
            cond, page, count, ['title', 'num_id', 'update_time'])
        last_id = ''
        for goods in goods_list:
            last_id = goods['_id']
        if not last_id:
            print("done")
            break
        # goods_list = list(goods_list)
        # if len(goods_list) < count:
        #     more_data = False
        #     break
        # else:
        #     more_data = True
        LOG.info("page: %s ok", page)
        # pool.apply_async(update_worker, (goods_list, page))
        page += 1
    pool.close()
    pool.join()
def crawler(keyword, page, count, cat_list=''):
    if cat_list and isinstance(cat_list, list):
        cat = ','.join(cat_list)
    else:
        cat = ''
    goods_list = _crawler(keyword=keyword, page=page, count=count, cat=cat)
    if goods_list is None:
        return []
    result = []
    for goods in goods_list:
        tmp = _ship_goods_supers(goods)
        if not tmp:
            continue
        tmp.update({'table': 'goods'})
        cat_obj = Category(id=tmp['category_id'], name=tmp['category_name'])
        cat_obj.save_category()
        if tmp.get("sub_category_id"):
            cat_obj = SubCategory(id=tmp['sub_category_id'],
                                  name=tmp.get('sub_category_name', ''),
                                  parent=tmp['category_id'])
            cat_obj.save_category()
        source = keyword if keyword else 'crawler'
        tmp.update({'source': source})
        goods_instance = TbkGoods(**tmp)
        if goods_instance.check_save():
            goods_info = goods_instance.find_goods_by_id()
            if not goods_info:
                similar_ids = crawler_similar(tmp['num_id'])
                goods_instance.similar_goods = similar_ids
                result.append(tmp)
            ret = goods_instance.save()
            searcher.update_index(tmp)
            LOG.debug(ret)
    return result
def crawler_one_page(link, table, mid):
    parse_ret = urlparse(link)
    domain = parse_ret.netloc
    config = DATA_FIELD.get(domain)
    if not config:
        LOG.info("domain: %s not config", domain)
        return
    res_data_field = config.get("res_data")
    id_field = config.get("id")
    start = time.time()
    client = HttpClient()
    res = client.get(link)
    goods_list = res.get(res_data_field, [])
    for goods in goods_list:
        num_id = goods.get(id_field)
        tmp = _ship_goods(num_id)
        if not tmp:
            continue
        tmp.update({'mid': mid})
        if isinstance(table, unicode):
            table = table.encode("utf-8")
        tmp.update({'table': table})
        searcher.update_index(tmp)
        goods_obj = TbkGoods(**tmp)
        goods_obj.__table__ = table
        goods_obj.save()
    LOG.info("link: %s takes: %s", link, time.time() - start)
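# crawler_one_page looks up per-domain field names in DATA_FIELD, which is
# defined elsewhere in this project. Based only on the config.get() calls
# above, an entry is expected to look roughly like the sketch below; the
# domain and field names here are illustrative assumptions, not the actual
# configuration:
#
#     DATA_FIELD = {
#         'example-api.com': {'res_data': 'data', 'id': 'num_iid'},
#     }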
def crawler_worker(keyword):
    total_page = 10
    count = 0
    start = time.time()
    for i in range(total_page):
        saved_list = crawler(keyword, i + 1, 100)
        count += len(saved_list)
    LOG.info("keyword: %s, crawler: %s, takes: %s",
             keyword, count, time.time() - start)
def _ship_goods_supers(item):
    pattern = re.compile(COUPON_PATTERN)
    goods_data = {}
    try:
        coupon_share_url = item.get('coupon_share_url')
        if not coupon_share_url:
            coupon_share_url = item['url']
        if coupon_share_url and not coupon_share_url.startswith(
                ('http', 'https')):
            coupon_share_url = 'https:' + coupon_share_url
        goods_data['category_id'] = item['level_one_category_id']
        goods_data['sub_category_id'] = item['category_id']
        goods_data['small_images'] = item.get("small_images",
                                              {}).get("string", [])
        # goods_data['big_images'] = goods_data['small_images']
        goods_data['is_tmall'] = item['user_type']
        goods_data['coupon_id'] = item['coupon_id']
        goods_data['coupon_share_url'] = coupon_share_url
        goods_data['sales'] = int(item['volume'])
        goods_data['coupon_info'] = item['coupon_info']
        coupon_data = re.search(pattern, item['coupon_info'])
        if coupon_data:
            goods_data['coupon_start'] = float(
                coupon_data.group("coupon_start"))
            goods_data['coupon_amount'] = float(
                coupon_data.group("coupon_amount"))
        else:
            goods_data['coupon_start'] = 0
            goods_data['coupon_amount'] = 0
        goods_data['commssion_rate'] = float(item['commission_rate']) / 10000.0
        goods_data['coupon_total_count'] = int(item['coupon_total_count'])
        goods_data['shop_id'] = item.get("seller_id", 0)
        goods_data['shop_title'] = item.get("nick", '')
        goods_data['category_name'] = item.get('level_one_category_name', '')
        # goods_data['sub_category_name'] = item.get('category_name')
        goods_data['end'] = item.get('coupon_end_time', '')
        goods_data['start'] = item.get('coupon_start_time', '')
        goods_data['price'] = round(float(item['zk_final_price']), 2)
        goods_data['coupon_fee'] = (goods_data['price'] -
                                    goods_data['coupon_amount'])
        goods_data['num_id'] = item['num_iid']
        goods_data['created'] = int(time.time() * 1000)
        goods_data['update_time'] = int(time.time() * 1000)
        goods_data['pic_url'] = item['pict_url']
        goods_data['title'] = item['title']
        goods_data['coupon_remain'] = int(item['coupon_remain_count'])
    except Exception as ex:
        LOG.error("crawler item: %s, error: %s", item, ex, exc_info=True)
        goods_data = {}
    if item.get("category_id"):
        goods_data.update({"sub_category_id": item.get("category_id")})
    if item.get("category_name"):
        goods_data.update({"sub_category_name": item.get("category_name")})
    return goods_data
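# Note: COUPON_PATTERN is defined elsewhere in this module; _ship_goods_supers
# only relies on it exposing the named groups "coupon_start" and
# "coupon_amount". Taobao coupon_info strings usually look like
# u"满100元减20元", so a pattern of roughly the following shape would satisfy
# that contract (an illustrative assumption, not the module's actual
# definition):
#
#     COUPON_PATTERN = u'满(?P<coupon_start>\d+(?:\.\d+)?)元减(?P<coupon_amount>\d+(?:\.\d+)?)元'
#
#     >>> re.search(COUPON_PATTERN, u'满100元减20元').group('coupon_amount')
#     u'20'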
def _crawler(**kwargs):
    keys = ("keyword", "page", "count", "platform", "is_overseas",
            "is_tmall", "sort", "has_coupon", "need_free_shipment", "cat")
    sp = SearchParams()
    for key in keys:
        if kwargs.get(key):
            sp[key] = kwargs[key]
    try:
        res = client.super_search(sp)
        return res['tbk_dg_material_optional_response']['result_list'][
            'map_data']
    except Exception as ex:
        LOG.error("ex: %s", ex, exc_info=True)
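# Usage sketch for _crawler (assuming the module-level `client` is the
# configured Taobao union client whose super_search() returns the
# tbk.dg.material.optional response). Only the keys listed above are forwarded
# into SearchParams; on any API error the function returns None:
#
#     items = _crawler(keyword=u'连衣裙', page=1, count=100, has_coupon=True)
#     for item in items or []:
#         print(item.get('title'))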
def crawler_author_poetry(author_id=None):
    page = 1
    count = 100
    author_obj = Author()
    while True:
        if author_id is None:
            authors = author_obj.find_authors({"id": {">": 1229}}, page, count)
        else:
            authors = author_obj.find_authors({'id': {'=': author_id}},
                                              page, count)
        LOGGER.info("type: %s, len: %s", type(authors), len(authors))
        if not authors:
            break
        for author in authors:
            try:
                LOGGER.info("start crawler author: %s", author['name'])
                crawler_author_record(author)
                LOGGER.info(author)
            except Exception as ex:
                LOGGER.error("author: %s, ex: %s", author['name'], ex,
                             exc_info=True)
            # time.sleep(60)
        page += 1
def check():
    page = 1
    count = 100
    author_obj = Author()
    while True:
        authors = author_obj.find_authors({}, page, count)
        LOGGER.info("type: %s, len: %s", type(authors), len(authors))
        if not authors:
            break
        for author in authors:
            _id = author['id']
            ps = Poetry(author_id=_id)
            ret = ps.find_poetry_by_author_id(1, 1)
            if len(ret) == 0:
                # print("_id: %s not found" % _id)
                crawler_author_poetry(_id)
        page += 1
def get_shangxi_content(shangxi_id):
    url = 'https://so.gushiwen.org/shiwen2017/ajaxshangxi.aspx'
    params = {'id': shangxi_id}
    time.sleep(10)
    client = HttpClient()
    page_content = client.get(url, params=params)
    shangxi = ''
    if page_content:
        page_content = unicode(page_content, 'utf-8')
        dom = fromstring(page_content)
        elements = dom.xpath("//div[@class='contyishang']/p")
        for element in elements:
            tmp = element.xpath("string(.)")
            tmp = tmp.replace(u"▲", "")
            shangxi += tmp
            shangxi += '\n'
    else:
        LOG.debug("down page error: %s, params: %s", url, params)
    return shangxi
def update_one_by_one(table):
    page = 1
    count = 1000
    have_data = True
    update_count = 0
    goods_obj = TbkGoods()
    goods_obj.__table__ = table
    LOG.info(table)
    while have_data:
        have_data = False
        goods_list = goods_obj.find_goods_by_cond({}, page, count)
        now = int(time.time() * 1000)
        for goods in goods_list:
            have_data = True
            update_time = goods.get('update_time')
            # Skip goods refreshed within the last hour.
            if update_time and now - update_time < 3600000:
                continue
            update_goods(goods['title'], goods['num_id'], table)
            update_count += 1
        page += 1
        LOG.info("page: %s", page)
    print(update_count)
def update_worker(goods_list, page):
    start = time.time()
    LOG.info("page: %s, start: %s", page, start)
    for goods in goods_list:
        now = time.time() * 1000
        update_time = goods.get("update_time")
        if update_time and now - update_time < 3600000:
            continue
        title = goods['title']
        _id = goods['num_id']
        sp = SearchParams()
        sp.page = 1
        sp.count = 100
        sp.keyword = title
        data = _super_search(sp)
        ok = 0
        for g in data:
            goods_data = _ship_goods_supers(g)
            # _ship_goods_supers may return a dict without num_id on error,
            # so use .get() to avoid a KeyError here.
            if goods_data.get('num_id') == _id:
                ok = 1
                goods_obj = TbkGoods(**goods_data)
                goods_obj.save()
                break
        if not ok:
            goods_obj = TbkGoods(num_id=_id)
            goods_obj.delete()
            LOG.info("delete id: %s", _id)
    del goods_list
    LOG.info("page: %s process ok: %s", page, time.time() - start)
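# update_worker is meant to run inside the ThreadPool created by update_main()
# (see the commented-out pool.apply_async call there). A call of roughly this
# shape would dispatch one page of goods to the pool:
#
#     pool.apply_async(update_worker, (list(goods_list), page))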
def get_detail_url(detail_url, author_id):
    client = HttpClient()
    page_content = client.get(detail_url)
    if page_content:
        dom = fromstring(page_content)
        cont_xpath = '//div[@class="main3"]/div[@class="left"]/'\
            'div[@class="sons"][1]'
        title = dom.xpath("//h1/text()")
        dynasty = dom.xpath(cont_xpath + '/div[@class="cont"]/p/a[1]/text()')
        author = dom.xpath(cont_xpath + '/div[@class="cont"]/p/a[2]/text()')
        content = dom.xpath(cont_xpath +
                            '/div[@class="cont"]/div[@class="contson"]')
        content = split_content(content[0])
        keywords = dom.xpath(cont_xpath + '/div[@class="tag"]/a/text()')
        keywords = '&'.join(keywords)
        likes = dom.xpath(cont_xpath + '//div[@class="good"]/a/span/text()')
        if len(likes) >= 1:
            likes = match_number(likes[0])
        else:
            likes = 0
        fanyi = dom.xpath("//div[starts-with(@id, 'fanyi')][1]/@id")
        if fanyi:
            fanyi_id = match_number(fanyi[0])
            fanyi_con = get_fanyi_content(fanyi_id)
        else:
            fanyi_xpath = ("//div[@class='left']/div[@class='sons'][2]"
                           "/div[@class='contyishang']/p/text()")
            fanyi_con = dom.xpath(fanyi_xpath)
            if fanyi_con:
                fanyi_con = '\n'.join(fanyi_con)
            else:
                fanyi_con = ''
        shangxi = dom.xpath("//div[starts-with(@id, 'shangxi')][1]/@id")
        if shangxi:
            shangxi_id = match_number(shangxi[0])
            shangxi_con = get_shangxi_content(shangxi_id)
        else:
            shangxi_con = ''
        if not shangxi_con:
            LOG.info("url: %s no shangxi", detail_url)
        if not fanyi_con:
            LOG.info("url: %s no fanyi", detail_url)
        poetry_data = {
            'title': title[0],
            'dynasty': dynasty[0],
            'author': author[0],
            'content': content,
            'tags': keywords,
            'likes': likes,
            'author_id': author_id,
            'translate': fanyi_con,
            'shangxi': shangxi_con,
            'plink': detail_url
        }
        # print(poetry_data)
        return poetry_data
    else:
        LOG.error("download url: %s, error", detail_url)
        return {}
def save_centence(centence, source, c, t):
    pattern = re.compile(u"(?P<author>.*)《(?P<title>.*)》")
    match = pattern.search(source)
    if not match:
        LOG.info("cent: %s, source: %s error", centence, source)
        return
    author = match.group("author")
    title = match.group("title")
    poetry_obj = Poetry(title=title, author=author)
    poetry = poetry_obj.find_poetry_by_title()
    if not poetry:
        LOG.error("title: %s, author: %s found error", title, author)
        poetry = {}
    centence_data = {
        "title": title,
        "content": centence,
        "tags": '&'.join([c, t]),
        "author_id": poetry.get('author_id', 0),
        "author": author,
        "poetry_id": poetry.get('id', 0)
    }
    sentence_obj = Sentence(**centence_data)
    sentence_obj.save()
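# Example of the `source` format save_centence() expects: the author name
# followed by the poem title wrapped in《》. The argument values below are
# illustrative only; a call such as
#
#     save_centence(u'会当凌绝顶，一览众山小。', u'杜甫《望岳》', u'古诗文', u'励志')
#
# parses author=u'杜甫' and title=u'望岳' before looking the poem up, and joins
# the last two arguments into the sentence's tags.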
def get_one_goods(cat=None):
    if cat is None:
        cat_obj = Category(recommend=1)
        cats = cat_obj.all_category()
        cat_list = []
        for cat_item in cats:
            cat_list.append(int(cat_item['id']))
        # cat_list = [1801, 16, 30, 50002766, 50006843, 122952001]
        cat_id = random.choice(cat_list)
    else:
        cat_id = cat
    # Only consider goods created within the last 8 days.
    start = time.time() - 8 * 86400
    cond = {
        "coupon_amount": {'$gt': 5},
        "created": {'$gt': start * 1000},
        "sales": {'$gt': 3000},
        'category_id': cat_id,
        "sended": {'$exists': False},
        "coupon_expire": 0
    }
    LOG.debug(cond)
    goods_obj = TbkGoods()
    goods = goods_obj.find_goods_by_cond(cond, 1, count=20)
    goods_list = list(goods)
    length = len(goods_list)
    if length == 0:
        return {}
    index = random.randint(0, length - 1)
    return goods_list[index]
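# Usage sketch: pick one random, recently crawled coupon goods either from a
# random recommended category or from an explicit category id (the id below is
# illustrative only):
#
#     goods = get_one_goods()          # random recommended category
#     goods = get_one_goods(cat=1801)  # a specific category id
#     if goods:
#         print(goods['title'])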
def _do_send_template(user):
    poetry_data = get_recommend_poetry(user['openid'])
    # poetry_id = user.pop("poetry_id", 1)
    # poetry_obj = Poetry(id=poetry_id)
    # poetry_data = poetry_obj.find_poetry_by_id()
    if not poetry_data:
        LOG.error("recommend failed: %s", user['openid'])
        return
    res = None
    try:
        res = send_template_poetry(user, poetry_data)
    except Exception as ex:
        LOG.error("openid: %s, ex: %s", user['openid'], ex)
    if res is None:
        LOG.error("openid: %s, send failed", user['openid'])
    else:
        LOG.info("openid: %s, res: %s", user['openid'], res)
def crawler_poetry_record(link, author_id):
    try:
        poetry_data = get_detail_url(link, author_id)
        poetry_id = save_crawled_poetry(poetry_data)
        if poetry_id:
            LOGGER.info("link: %s, author: %s ok", link, author_id)
        else:
            LOGGER.info("link: %s, not saved", link)
    except Exception as ex:
        LOGGER.error("link: %s, ex: %s", link, ex, exc_info=True)
def crawler_author_record(author):
    next_page = author['poetry_link']
    author_id = author['id']
    count = 0
    while next_page:
        detail_links, next_page = detail_crawler(next_page)
        for poetry_link in detail_links:
            try:
                poetry_data = get_detail_url(poetry_link, author_id)
                poetry_id = save_crawled_poetry(poetry_data)
                if poetry_id:
                    count += 1
                    LOGGER.debug("save poetry: %s, authorid: %s",
                                 poetry_id, author_id)
            except Exception as ex:
                LOGGER.error("link: %s, ex: %s", poetry_link, ex,
                             exc_info=True)
            # time.sleep(random.randint(6, 10))
        LOGGER.info("page: %s, save: %s", next_page, count)
        count = 0
def _save(goods_info):
    goods_obj = TbkGoods(**goods_info)
    goods_obj.source = 'search'
    ret = goods_obj.save()
    LOG.info("save goods: %s, ret: %s", goods_info['num_id'], ret)
parser = argparse.ArgumentParser()
parser.add_argument('--type', '-t', type=str, required=True,
                    help='the type of the timed task')
parser.add_argument('--keyword', '-k', type=str, required=False,
                    help='crawler keyword')
args = parser.parse_args()
start = time.time()
type_ = args.type
if type_ == 'update':
    LOG.info("update start: %s", start)
    # update_by_category()
    tables = ['haitao', 'jiukjiu', 'juhuasuan', 'xiaoliangbang', 'goods']
    pool = ThreadPool(len(tables))
    for table in tables:
        pool.apply_async(update_one_by_one, (table, ))
    pool.close()
    pool.join()
    # update_one_by_one()
    LOG.info("update takes: %ss", time.time() - start)
elif type_ == 'crawler':
    LOG.info("crawler start: %s", start)
    crawler_main()
    LOG.info("crawler takes: %s", time.time() - start)
elif type_ == 'word':
    if args.keyword: