def main():
    connection.close()
    print_log('[asins_spider.py] Get ASINs from stores... ')
    pool = multiprocessing.Pool(processes=2)  # fewer processes than criticals_spider
    pool.map(get_asins_from_page, get_stores())
    pool.close()
    connection.close()
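# connection.close() is called before the Pool is created so that the forked workers
# do not inherit and share the parent's Django database connection; each process then
# opens its own connection lazily on its first ORM query. A minimal, generic sketch of
# the same pattern, assuming a configured Django project (run_in_pool and handle_item
# are placeholder names, not part of this project):

import multiprocessing

from django.db import connection


def handle_item(item):
    # Each forked worker reconnects to the database on demand inside here.
    pass


def run_in_pool(items):
    connection.close()  # drop the inherited connection before forking
    pool = multiprocessing.Pool(processes=2)
    pool.map(handle_item, items)  # workers open their own connections as needed
    pool.close()
    pool.join()
    connection.close()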
def save_email(self):
    # Save the user's email address
    email = self.Content
    print_log('[raven.py] email looks like: {}'.format(email))
    self.user.email = email
    self.user.save()
    print_log('[raven.py] email has been saved.')
    return self.reply(
        '您的邮箱地址 {} 已经储存。请确保正确,如欲更换,只需重新回复一次正确邮箱。'.format(email))
def get_critical_from_asin(asin):
    '''
    asin is an Asin object, not a string.
    '''
    def get_few_review(soup):  # usually just one
        # base_url already points at the critical-review query page, so any review
        # found below is a critical one; each carries the data-hook="review" attribute.
        reviews = soup.find_all('div', attrs={'data-hook': 'review'})  # find_all (with underscore) returns [] when nothing matches
        return len(reviews)  # number of critical reviews, may be 0

    url = urljoin(domains[asin.country], BASE_URL.format(asin.value))
    response, proxy = utils.get_response(url)
    if type(response) != int:
        soup = BeautifulSoup(response, 'lxml')
        # This element only exists when there are critical reviews
        critical_review_list = soup.find(id='cm_cr-review_list')
        if critical_review_list:  # there are critical reviews
            nums = []  # pre-initialize so a parse failure below does not raise NameError later
            try:
                raw_nums = critical_review_list.find('span', class_='a-size-base')
                text = raw_nums.text.replace(',', '').replace('.', '')
                nums = re.findall(r'\d+', text)
            except AttributeError:
                with open('none.txt', 'a', encoding='utf-8') as f:
                    f.write(str(soup))
                print_log('[criticals_spider.py] none.txt captured.')
            try:
                num = max(int(n) for n in nums)  # compare as ints; the largest number is the total count
                print(asin, 'get {} review.'.format(num))
                save_critical(num, asin)
            except ValueError:  # there are reviews, but no critical ones
                print(asin, 'get 0 critical review.')
                save_critical(0, asin)
        elif 'Correios.DoNotSend' in str(soup):
            proxy.fail += 1
            proxy.set_rate()
            proxy.set_stamp()
            proxy.save()
            print(asin, 'is busted.')  # not using print_log
        else:
            # No reviews at all
            print(asin, 'get no review at all.')
            save_critical(0, asin)
    else:
        print('ERR CODE:', response, asin)
        if response == 404:  # the ASIN is no longer valid
            asin.valid = False
            asin.save()
        elif response == 503:
            proxy.fail += 1
            proxy.set_rate()
            proxy.set_stamp()
            proxy.save()
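# A minimal sketch of the count extraction above: the a-size-base span typically
# holds text such as "1-10 of 1,234 reviews" (the exact wording is an assumption),
# and the largest number in it is the total review count.

import re

raw = '1-10 of 1,234 reviews'  # illustrative only
nums = re.findall(r'\d+', raw.replace(',', '').replace('.', ''))
total = max(int(n) for n in nums)  # -> 1234; converting to int avoids lexicographic string comparison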
def send_alerts():
    '''
    Walk through each user's ASINs, build the comparison report, then send it by email.
    '''
    for user in User.objects.filter(subscribe=True):
        content_text, content_html = make_report(user)
        if content_html:  # non-empty
            if user.email:
                print_log('[criticals_spider.py] Sending alerts to {}...'.format(user.email))
                send_email(user.email, content_text, content_html,
                           subject="矩阵数据提醒您:亚马逊店铺有新的差评,请及时处理")
                Email(address=user.email, content_html=content_html).save()
            else:
                print_log("[criticals_spider.py] {} hasn't set an email.".format(user))
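# make_report() and send_email() are defined elsewhere in the project. A minimal
# sketch of what a send_email helper could look like, assuming plain SMTP over SSL
# (host, port, sender address, and password below are placeholders, not the
# project's actual values):

import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText


def send_email(address, content_text, content_html, subject=''):
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    msg['From'] = 'alerts@example.com'
    msg['To'] = address
    msg.attach(MIMEText(content_text, 'plain', 'utf-8'))
    msg.attach(MIMEText(content_html, 'html', 'utf-8'))
    with smtplib.SMTP_SSL('smtp.example.com', 465) as server:
        server.login('alerts@example.com', 'password')
        server.sendmail(msg['From'], [address], msg.as_string())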
def run_spiders():
    '''
    Start all spiders.
    1. Crawl store ASINs every 48 hours.
    2. Crawl critical reviews every hour. If there are too many ASINs in the
       future, a dedicated server will crawl them continuously.
    '''
    while True:
        asins_spider.main()  # automatically handles only new stores and stores not updated for a long time
        Asin.objects.all().update(flag=False)
        # set_all_costs()  # check once for each user
        criticals_spider.main()
        interval = set_interval()
        print_log('[core.py] Wait for {} hour ... '.format(interval))
        time.sleep(interval * 3600 * random.uniform(0.9, 1.2))  # add some uncertainty
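# set_interval() is defined elsewhere; a hypothetical sketch that is merely
# consistent with the roughly-hourly schedule mentioned in the docstring (the
# night-time slowdown below is an assumption, not the project's actual logic):

import datetime


def set_interval():
    hour = datetime.datetime.now().hour
    return 3 if 1 <= hour <= 6 else 1  # hours to wait before the next crawl round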
def main():
    connection.close()
    print_log('[criticals_spider.py] Get criticals from ASINs...')
    # Tested: the server can barely handle 10 processes; anything larger has not
    # been tried carefully, and 30 processes crashed it outright.
    pool = multiprocessing.Pool(processes=3)
    while Asin.objects.filter(flag=False):  # as long as it's not done, it will start another poll
        pool.map(get_critical_from_asin, get_asins())
    pool.close()
    # print_log('[criticals_spider.py] Entering alert.send_alerts()...')
    alert.send_alerts()  # run once after every full crawl
    print_log('[criticals_spider.py] All done.')
    connection.close()
def get_asins_from_page(url):
    '''
    0. How it works: a function can hold two yields; once the first one is
       exhausted, iteration continues with the second. Concretely, the ASINs on
       the current page are produced first, then the link to the next page; the
       outer caller sees a link, calls the inner function again, and repeats
       until no next-page link can be found.
    1. The store is crawled each time. An ASIN is saved if it has not been
       saved before, and ignored otherwise.
    2. If a previously saved ASIN is not found in this crawl, nothing is done.
    3. Because in criticals_spider a 302 or 404 may occur (untested).
    4. On a 404 the ASIN is simply marked valid=False and never processed again.
    5. A 302 may redirect to a variant that is still in stock (untested).
    '''
    response, proxy = utils.get_response(url)
    country = utils.get_country_from_url(url)
    if type(response) != int and 'Correios.DoNotSend' not in response:
        soup = BeautifulSoup(response, 'lxml')
        asin_strs = []
        lis = soup.find_all('li')
        for li in lis:
            if li.has_attr('data-asin'):
                asin_str = li['data-asin']
                if asin_str not in asin_strs:
                    asin_strs.append(asin_str)
                    # Duplicates are rare, but this keeps things safe; may be removed later
        for asin_str in asin_strs:
            if not Asin.objects.filter(value=asin_str, country=country):
                asin = Asin(value=asin_str,
                            country=utils.get_country_from_url(url),
                            store=utils.get_store_by_url(url))
                asin.save()
                # This asin is an Asin object, not the same thing as the asin_str string above
                print_log('[asins_spider.py]', asin, 'is newly added.')
            else:
                print('[asins_spider.py]',
                      asin_str + '-' + utils.get_country_from_url(url),
                      'exists already.')
        try:
            time.sleep(5)  # play it safe; this runs at a very low frequency anyway
            next_page = urljoin(url, soup.find(id='pagnNextLink')['href'])
            get_asins_from_page(next_page)  # recurse into the next page
        except TypeError:  # last page
            print_log('[asins_spider.py]', url, 'is done.')
    else:
        print('[asins_spider.py]', 'Busted or other error!')
        proxy.fail += 1
        proxy.set_rate()
        proxy.set_stamp()
        proxy.save()
        get_asins_from_page(url)  # crawl again with a different proxy
def save_store(self):
    # Save the store URL
    url = self.Content
    print_log('[raven.py] URL looks like: {}'.format(url))
    me = utils.get_me(url)
    print_log('[raven.py] me looks like: {}'.format(me))
    if not Store.objects.filter(user=self.user, url=url):  # do not save duplicate submissions
        store = Store(user=self.user, url=url, me=me, last_update=0)
        # store.last_update is 0 at this point
        store.save()
        print_log('[raven.py] Store saved.')
    return self.reply(
        '您提交的店铺已经保存。回复「店铺」即可查看已经保存的全部店铺。回复「邮箱」可以进一步查看邮箱设置。')
def is_store_url(url):
    if url.startswith('https'):
        if 'marketplaceID' in url:
            if 'merchant' in url:
                print_log('[raven.py] Store URL captured.')
                return True
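# Illustrative calls for the check above (the URLs are made-up placeholders, not real stores):
# is_store_url('https://www.amazon.com/s?merchant=A1XXXXXXXX&marketplaceID=ATVPDKIKX0DER')  -> True
# is_store_url('https://www.amazon.com/dp/B000000000')  -> falls through, returns None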
def save_msg(self):
    print_log(str(self.dict))
    return self.reply('您提交的图片我们已收到。我们会尽快进行人工审核。')