def __init__(self, crawl_request=None):
    self.crawl_request = crawl_request
    if self.crawl_request is not None:
        self.sitemap_urls = crawl_request.get('sitemap_urls', None)
    self.logger.info("Crawl request: %s", self.crawl_request)
    super(MySpider, self).__init__()
    logger.info("initialized the jobdiva spider")
def main():
    app = JobScrapper()
    http_server = tornado.httpserver.HTTPServer(app)
    http_server.listen(8888)
    crawler_handler.startCrawl()
    logger.info('started the crawl; HTTP server listening on port 8888')
    tornado.ioloop.IOLoop.instance().start()
def crawl():
    while True:
        try:
            rules, sleep_time = sche.tick()
            yield runner.crawl(CommonSpider, rules=rules)
            logger.info('sleeping for {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)
        except Exception:
            logger.exception('crawl loop failed; stopping the reactor.')
            time.sleep(3)
            reactor.stop()
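# NOTE: time.sleep() above blocks the Twisted reactor between crawls. If
# crawl() is an @defer.inlineCallbacks coroutine (the `yield runner.crawl(...)`
# suggests it is), a non-blocking delay keeps the reactor responsive. A
# minimal sketch, assuming the same `sche`, `runner`, and `CommonSpider`
# objects; named crawl_nonblocking here to avoid clobbering the original:
from twisted.internet import defer, reactor, task

@defer.inlineCallbacks
def crawl_nonblocking():
    while True:
        rules, sleep_time = sche.tick()
        yield runner.crawl(CommonSpider, rules=rules)
        # deferLater yields control back to the reactor instead of blocking it.
        yield task.deferLater(reactor, sleep_time, lambda: None)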
def parse(self, response):
    logger.info('dicespider|url in parse %s', response.url)
    self.crawler.stats.inc_value('completed_url', 1)
    self.crawler.stats.set_value('spider', 'Dice')
    response_value = -2
    item = parse_fields(self.crawl_request, response, response_value)
    if item:  # `len(item) is not 0` relied on int identity; use truthiness instead
        yield item
    for link in LxmlLinkExtractor(
            allow_domains=self.allowed_domains).extract_links(response):
        url = response.urljoin(link.url)
        # Follow only links matching the configured URL pattern.
        if str(url).find(self.crawl_request['urlPattern'][0]) >= 0:
            yield scrapy.Request(url=url, callback=self.parse)
def update_sche(self):
    logger.info('begin updating schedule.')
    cursor = None
    if self.need_init_sche:
        logger.info('initializing schedule for the first time.')
        current_time = datetime.utcnow()
        cursor = self.db.rule.find(
            {
                'updatedAt': {'$lte': current_time},
                'status': 'RUN',
            },
            projection=self.need_rule_fields)
        self.need_init_sche = False
        self.last_time_pull_rules_from_db = current_time
    else:
        # Overlap the window by two minutes so updates near the boundary
        # are not missed between pulls.
        easy_time = self.last_time_pull_rules_from_db - timedelta(minutes=2)
        cursor = self.db.rule.find(
            {'updatedAt': {'$gt': easy_time}},
            projection=self.need_rule_fields)
        self.last_time_pull_rules_from_db = datetime.utcnow()
    logger.info('begin to init or update rules.')
    changed_rules = []
    balanced_rules_cnt = 0
    for rule in cursor:
        self.str_dict_object(rule)
        if rule['spider'] in self.cron_balanced_spiders:
            rule['balance_cron'] = CronIter.balance_cron(rule['_id'], rule['cron'])
            balanced_rules_cnt += 1
        changed_rules.append(rule)
    logger.info('{} rules have been updated during this interval.'.format(
        len(changed_rules)))
    logger.debug('{} rules have been balanced during this interval.'.format(
        balanced_rules_cnt))
    if changed_rules:
        self.sync_sche_to_heap(changed_rules)
    logger.info('finish updating schedule.')
def tick(self):
    rules_to_be_returned = []
    delayed_rules_cnt = 0
    while True:
        try:
            self.run()
            entry = self._heap.pop()
            logger.info('length: {}'.format(self._heap.length()))
            if not entry:
                logger.info('no rules in heap now.')
                return rules_to_be_returned, self.pull_sche_interval
            if not isinstance(entry, heap_rule_t):
                logger.warning(
                    'detected an entry that does not fit the heap_rule_t struct: {}'.format(entry))
                continue
            rule = self.schedule.get(entry.id)
            if not rule:
                logger.info(
                    'detected a rule that has been removed from the schedule, discarding, ruleId[{}].'.format(entry.id))
                continue
            if entry.cron_updatedAt == rule['cron_updatedAt']:
                delay = self.__when(entry.next_call_time)
                if delay < 0:
                    rules_to_be_returned.append(rule)
                    self.__populate(rule)
                    delayed_rules_cnt += 1
                else:
                    self._heap.push(entry)
                    logger.info('{} rules were delayed since last tick.'.format(
                        delayed_rules_cnt))
                    return rules_to_be_returned, delay
            else:
                # The entry is stale: currently this can only mean the cron
                # field was updated, so this rule is void; the latest version
                # has already been pushed onto the heap.
                logger.info('detected an outdated rule, discarding, ruleId[{}].'.format(entry.id))
        except Exception as e:
            logger.exception('tick caught an exception. Msg: {}'.format(e))
            return [], self.pull_sche_interval
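# NOTE: heap_rule_t is not defined in this section. Given that tick() reads
# entry.next_call_time, entry.id, and entry.cron_updatedAt, and that
# sync_sche_to_heap() constructs it positionally in that order, a plausible
# definition is a namedtuple -- a sketch, not the project's actual code:
from collections import namedtuple

# Field order matters: tuples compare element-wise, so a min-heap of these
# entries pops the rule with the earliest next_call_time first.
heap_rule_t = namedtuple('heap_rule_t', ['next_call_time', 'id', 'cron_updatedAt'])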
def process_item(self, item, spider):
    logger.info(item)
    product_code = item['productCode']
    seller_flag = item['sellerFlag']
    # current_price = item['price']
    total_price = item['totalPrice']
    conn = self.mariadb_connection
    times = 1
    while not conn.is_connected():
        logger.error('database connection lost, reconnect attempt #%s...' % times)
        self.open_spider(spider)
        conn = self.mariadb_connection
        times += 1
        if times == 4:
            logger.error('database connection is broken, stopping the spider...')
            # spider.crawler.signals.disconnect_all(signal=signals.spider_closed)
            return
    if times > 1:
        logger.info('database reconnected successfully...')
    cursor = conn.cursor()
    # Product code not seen before, or product code exists but with a
    # different seller flag: insert a new product row and a price row.
    if not self._query_exists_product(product_code, seller_flag):
        self._insert_t_product(cursor, item)
        self._insert_t_product_price(cursor, item)
    # Both product code and seller flag already exist.
    else:
        price = self._query_price(product_code, seller_flag)
        history_min_price = price[0]
        history_avg_price = price[1]
        # If the current scraped price (points deducted) is below the
        # historical minimum or the historical average, update the product.
        if total_price != u'None':
            if float(total_price) < history_min_price:
                item['minPrice'] = total_price
                item['avgPrice'] = unicode(history_avg_price)
                mail_item_dict[product_code] = item
                self._update_t_product(cursor, item)
            else:
                item['avgPrice'] = unicode(history_avg_price)
                if float(total_price) < history_avg_price:
                    item['minPrice'] = unicode(history_min_price)
                    mail_item_dict[product_code] = item
                    self._update_t_product(cursor, item, flag=2)
        self._insert_t_product_price(cursor, item)
    conn.commit()
    cursor.close()
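# NOTE: the is_connected() and cursor(buffered=True) calls suggest the driver
# is mysql-connector-python. If so, the manual reconnect loop above could be
# replaced with the driver's own ping(), which reconnects transparently -- a
# sketch under that assumption (the _ensure_connection name is hypothetical):
def _ensure_connection(self, attempts=3, delay=2):
    try:
        # ping(reconnect=True) retries the connection and raises
        # InterfaceError only if every attempt fails.
        self.mariadb_connection.ping(reconnect=True, attempts=attempts, delay=delay)
    except mariadb.InterfaceError:
        logger.error('database connection could not be restored')
        raise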
def openlink(self, url, data):
    """
    Retry wrapper around requests.post (works around urlopen error 10060).
    :param url: URL to request
    :param data: POST payload
    :return: server response, or None if every attempt fails
    """
    maxTryNum = 15
    for tries in range(maxTryNum):
        try:
            logger.info("requesting %s, %s" % (url, data))
            req = requests.post(url, data=data, timeout=13)
            return req
        except requests.RequestException:
            if tries < (maxTryNum - 1):
                continue
            else:
                logger.info("failed to connect to %s after %d attempts!" % (url, maxTryNum))
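# NOTE: the same retry behavior can be delegated to urllib3 instead of a
# hand-rolled loop -- a sketch, assuming plain POSTs with the same 13-second
# timeout (on urllib3 < 1.26 the parameter is method_whitelist, not
# allowed_methods):
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(retries=15, backoff=0.5):
    # Retry connection errors and 5xx responses; POST must be allowed
    # explicitly because it is not idempotent.
    retry = Retry(total=retries, backoff_factor=backoff,
                  status_forcelist=(500, 502, 503, 504),
                  allowed_methods=frozenset(['POST']))
    session = requests.Session()
    session.mount('http://', HTTPAdapter(max_retries=retry))
    session.mount('https://', HTTPAdapter(max_retries=retry))
    return session

# usage: make_session().post(url, data=data, timeout=13)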
def _query_price(self, product_code, seller_flag):
    """
    Query the historical minimum price and historical average price for the
    given product code and seller flag.
    :param product_code: product code
    :param seller_flag: seller flag
    :return: (minimum price, average price)
    """
    if product_code is not None and seller_flag is not None:
        query_min_price = "SELECT minPrice FROM t_product WHERE productCode = %s AND sellerFlag = %s"
        query_avg_price = "SELECT AVG(price) FROM t_product_price WHERE productCode = %s AND sellerFlag = %s"
        cursor = self.mariadb_connection.cursor(buffered=True)
        cursor.execute(query_min_price, (product_code, seller_flag))
        min_price = cursor.fetchall()[0][0]
        cursor.execute(query_avg_price, (product_code, seller_flag))
        avg_price = cursor.fetchall()[0][0]
        logger.info('historical minimum price: %s, historical average price: %s' % (min_price, avg_price))
        return min_price, avg_price
    else:
        raise ValueError('product code or seller flag is empty.')
def parse(self, response):
    logger.info('job_scrapper|url in parse: %s', response.url)
    self.crawler.stats.inc_value('completed_url', 1)
    self.crawler.stats.set_value('spider', 'indeed')
    response_value = -2
    parse_response = parse_links(self.crawl_request, response, response_value)
    logger.debug(parse_response)  # was a bare print()
    if parse_response is not None:
        if parse_response['type'] == 'links':
            links = parse_response.get('content')
            for link in links:
                url = response.urljoin(link)
                yield scrapy.Request(url=url, callback=self.parse)
        else:
            item = parse_response.get('content')
            if item:  # `len(item) is not 0` relied on int identity
                yield item
def _update_t_product(self, cursor, item, flag=1):
    """
    Update the minimum price and average price of an existing product.
    :param item: the item object
    :return:
    """
    try:
        if flag == 1:
            update_t_product = ("UPDATE t_product SET state = %s, chineseName = %s, "
                                "imgUrl = %s, minPrice = %s, avgPrice = %s "
                                "WHERE productCode = %s AND sellerFlag = %s")
            logger.info('updating minimum and average price of product [%s]...' % item['chineseName'])
            cursor.execute(
                update_t_product,
                (item['state'], item['chineseName'], item['imgUrl'],
                 item['minPrice'], item['avgPrice'], item['productCode'],
                 item['sellerFlag']))
        else:
            update_t_product = ("UPDATE t_product SET state = %s, chineseName = %s, "
                                "imgUrl = %s, avgPrice = %s "
                                "WHERE productCode = %s AND sellerFlag = %s")
            logger.info('updating average price of product [%s]...' % item['chineseName'])
            cursor.execute(
                update_t_product,
                (item['state'], item['chineseName'], item['imgUrl'],
                 item['avgPrice'], item['productCode'], item['sellerFlag']))
        # The commit is issued by the caller (process_item).
        logger.info('t_product updated successfully...')
    except mariadb.Error as err:
        logger.error('failed to update minimum/average price: %s', err)
def _insert_t_product(self, cursor, item):
    """
    Insert a row into the t_product table.
    :param item: the item object
    :return:
    """
    if item['japaneseName'] != u'None':
        try:
            insert_t_product = ("INSERT INTO t_product(productCode, productUrl, state, "
                                "chineseName, japaneseName, imgUrl, sellerFlag) "
                                "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            logger.info('inserting new product [%s]...' % item['chineseName'])
            cursor.execute(
                insert_t_product,
                (item['productCode'], item['productUrl'], item['state'],
                 item['chineseName'], item['japaneseName'], item['imgUrl'],
                 item['sellerFlag']))
            # The commit is issued by the caller (process_item).
            logger.info('inserted into t_product successfully...')
        except mariadb.Error as err:
            logger.error('failed to insert into t_product: %s' % err)
            self.mariadb_connection.rollback()
def _insert(self, item, out):
    if isinstance(item, (BilibiliItem, ListItem)):
        self.b_post.insert(ItemAdapter(item).asdict())
        # After inserting, build a danmaku URL for each episode and push it to
        # Redis so the danmaku spider can pick it up.
        # if isinstance(item, BilibiliItem):
        #     # A BilibiliItem is a video uploaded by a regular uploader;
        #     # episode info lives in item['pages'].
        #     pages = item['pages']
        # else:
        #     # Otherwise the video was uploaded by Bilibili itself;
        #     # episode info lives in item['ep_list'].
        #     pages = item['ep_list']
        # for page in pages:
        #     # Build each episode's danmu_url and push it onto a Redis list
        #     # for the danmaku spider to consume.
        #     danmu_url = spider.settings['DANMU_URL'].format(oid=page['cid'], pid=page['aid'])
        #     self.redis_client.rpush(spider.settings.get('REDIS_DANMAKU_KEY'), danmu_url)
    elif isinstance(item, PageItem):
        logger.info('_insert %s', item)  # string concatenation raised TypeError on non-str items
        adapter = ItemAdapter(item)
        rank_date = adapter.pop('rank_date')
        rank_type = adapter.pop('rank_type')
        self.b_post.update_many(
            {
                'aid': item['aid'],
                'rank_date': rank_date,
                'rank_type': rank_type
            },
            {'$push': {'pages': adapter.asdict()}})
    elif isinstance(item, EpItem):
        adapter = ItemAdapter(item)
        rank_date = adapter.pop('rank_date')
        rank_type = adapter.pop('rank_type')
        self.b_post.update_many(
            {
                'season_id': item['season_id'],
                'rank_date': rank_date,
                'rank_type': rank_type
            },
            {'$push': {'ep_list': adapter.asdict()}})
    reactor.callFromThread(out.callback, item)
def _insert_t_product_price(self, cursor, item):
    """
    Insert a row into the t_product_price table.
    :param item: the item object
    :return:
    """
    # A price of 'None' means the product is currently out of stock, so there
    # is nothing to insert into the price table.
    if item['price'] != u'None':
        try:
            insert_t_product_price = ("INSERT INTO t_product_price(productCode, price, "
                                      "sellerFlag, pointFlag, points, promotionFlag, "
                                      "totalPrice, extractTime) "
                                      "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
            logger.info('inserting newly scraped price for product [%s]...' % item['chineseName'])
            cursor.execute(
                insert_t_product_price,
                (item['productCode'], item['price'], item['sellerFlag'],
                 item['pointFlag'], item['points'], item['promotionFlag'],
                 item['totalPrice'], item['extractTime']))
            # The commit is issued by the caller (process_item).
            logger.info('inserted into t_product_price successfully...')
        except mariadb.Error as err:
            logger.error('failed to insert into t_product_price: %s' % err)
            self.mariadb_connection.rollback()
def parse(self, response):
    logger.info('jobdiva|url in parse %s', response.url)
    self.crawler.stats.inc_value('completed_url', 1)
    self.crawler.stats.set_value('spider', 'jobdiva')
    response_value = -2
    temp = {'urls': []}
    tags = ['span', 'td']
    item = parse_fields(self.crawl_request, response, response_value, tags)
    # Follow iframes that point back into the allowed domains.
    iframe_urls = response.css('iframe::attr(src)').extract()
    for url in iframe_urls:
        for allowed_domain in self.allowed_domains:
            if url.find(allowed_domain) >= 0:
                yield scrapy.Request(url=url, callback=self.parse)
    if item:  # `len(item) is not 0` relied on int identity
        yield item
    for link in LxmlLinkExtractor(
            allow_domains=self.allowed_domains).extract_links(response):
        url = response.urljoin(link.url)
        temp['urls'].append(url)
        yield scrapy.Request(url=url, callback=self.parse)
def _send_email(self, mail_item_list):
    html_content = self._gen_html_content(mail_item_list)
    logger.info(html_content)
    smtp_host = get_project_settings().get('SMTP_HOST')
    mail_from = get_project_settings().get('MAIL_FROM')
    mail_to_list = get_project_settings().get('MAIL_TO')
    smtp_user = get_project_settings().get('SMTP_USER')
    smtp_pass = get_project_settings().get('SMTP_PASS')
    smtp_port = get_project_settings().get('SMTP_PORT')
    subject = 'Wow, new lowest prices -- check them out!'
    msg = MIMEText(html_content, _subtype='html', _charset='utf8')
    msg['Subject'] = subject
    msg['From'] = mail_from
    msg['To'] = ",".join(mail_to_list)
    try:
        # Use SMTP over SSL; for STARTTLS instead, create smtplib.SMTP() and
        # call s.starttls() after the first ehlo().
        s = smtplib.SMTP_SSL(host=smtp_host, port=smtp_port)
        s.ehlo()
        s.login(smtp_user, smtp_pass)
        # sendmail() needs a list of recipients; the comma-joined string is
        # only for the display header.
        s.sendmail(mail_from, mail_to_list, msg.as_string())
        s.close()
        logger.info('email sent successfully...')
        return True
    except Exception as e:
        logger.error('failed to send email: %s' % e)
        is_success = False
        times = 1
        while not is_success:
            logger.info('retrying email, attempt #%s...' % times)
            # Sleep three seconds before opening a new connection.
            time.sleep(3)
            # Retry with the same list (previously this re-read the global
            # mail_item_dict instead of the argument).
            is_success = self._send_email(mail_item_list)
            times += 1
            if times == 4:
                logger.error('email could not be sent...')
                break
def __init__(self, crawl_request=None):
    self.crawl_request = crawl_request
    if self.crawl_request is not None:
        self.start_urls = crawl_request['start_urls']
        self.allowed_domains = crawl_request['allowed_domains']
    logger.info("initialized the job_scraper spider")
def sync_sche_to_heap(self, changed_rules):
    rules_to_heap_cnt = 0
    rules_delete_from_sche = 0
    logger.info('begin to process changed rules.')
    for rules_processed_cnt, rule in enumerate(changed_rules, 1):
        if rule['status'].lower() == 'run':
            old_rule = self.schedule.get(rule['_id'])
            # A new rule, or an existing rule whose cron field has changed.
            if not old_rule or old_rule['cron'] != rule['cron']:
                rule['cron_updatedAt'] = datetime.utcnow()
                next_call_time = CronIter.get_next_cron_time(
                    rule.get('balance_cron') or rule.get('cron'),
                    datetime.now())
                if not next_call_time:
                    continue
                rule_t = heap_rule_t(next_call_time, rule['_id'],
                                     rule['cron_updatedAt'])
                self._heap.push(rule_t)
                rules_to_heap_cnt += 1
                self.schedule[rule['_id']] = rule
            else:
                # TODO: FIX THIS BUG
                # self.schedule[rule['_id']].update(rule)
                # On Python < 3.6 this triggers https://bugs.python.org/issue6766,
                # so update the schedule manually; currently only the
                # cron_updatedAt field can actually change.
                rule['cron_updatedAt'] = old_rule['cron_updatedAt']
                self.schedule[rule['_id']] = rule
        else:
            if rule['_id'] in self.schedule:
                del self.schedule[rule['_id']]
                rules_delete_from_sche += 1
        if rules_processed_cnt % 10000 == 0:
            logger.info('10,000 rules have been processed. {} left.'.format(
                len(changed_rules) - rules_processed_cnt))
    logger.info('finished processing changed rules.')
    logger.info('{} rules have been pushed to the heap in this interval.'.format(
        rules_to_heap_cnt))
    logger.info('{} rules have been removed from the schedule in this interval.'.format(
        rules_delete_from_sche))
    logger.info('Currently {} rules in the schedule.'.format(len(self.schedule)))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging, logger

from spiders.moer_spider import MoerSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    user_set = set()
    runner = CrawlerRunner()
    runner.settings.set(
        'ITEM_PIPELINES',
        {'moer.pipelines.MoerStorePipeline': 200,
         'moer.pipelines.ArticleStorePipeline': 300})
    runner.crawl(MoerSpider, runner, user_set)
    d = runner.join()
    # Stop the reactor once every crawl has finished.
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
    logger.info('All tasks have been finished!')
class CtripPriceSpider(RedisSpider):
    name = 'CtripPrice'
    allowed_domains = ['hotels.ctrip.com', "*"]
    redis_key = 'CtripPriceSpider:start_urls'
    logger.info("Ctrip price consumer started!")

    def __init__(self, *args, **kwargs):
        super(CtripPriceSpider, self).__init__(*args, **kwargs)
        self.site_id = kwargs["site_id"]
        self.__class__.KinesisQueue = kwargs.get("KinesisQueue", "")
        self.cf = kwargs["cf"]
        # self.CHECK_POINT = kwargs["CHECK_POINT"]
        self.__class__.download_delay = kwargs.get("DOWNLOAD_DELAY", 1.5)
        self.dateTime = json.loads(kwargs.get("GlobalParams", "")).get("dateTime", 3)

    def parse(self, response):
        item = response.meta.get("item", "")
        eleven_dict = json.loads(response.body.decode())
        eleven = eleven_dict.get("ELEVEN", "")
        if eleven:
            check_out_list = getEveryDayTuple(self.dateTime)
            for check_in, check_out in check_out_list:
                header = {
                    "Referer": "http://hotels.ctrip.com/hotel/{hotelId}.html?isFull=F".format(
                        hotelId=item["ctrip_hotel_id"]),
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"
                }
                ctrip_price_url = ("http://hotels.ctrip.com/Domestic/tool/AjaxHote1RoomListForDetai1.aspx"
                                   "?MasterHotelID={hotelId}&hotel={hotelId}&EDM=F&showspothotel=T"
                                   "&IsDecoupleSpotHotelAndGroup=F&startDate={check_in}&depDate={check_out}"
                                   "&RequestTravelMoney=F&contyped=0&priceInfo=-1&TmFromList=F"
                                   "&eleven={eleven}").format(
                    hotelId=item["ctrip_hotel_id"],
                    check_in=check_in,
                    check_out=check_out,
                    eleven=eleven)
                item["CHECKIN_DATE"] = check_in
                item["CHECKOUT_DATE"] = check_out
                yield Request(url=ctrip_price_url,
                              headers=header,
                              callback=self.get_price,
                              priority=10,
                              meta={
                                  "item": copy.deepcopy(item),
                                  "is_need_proxy": True
                              })
        else:
            logger.warning("no ELEVEN token in response: %s", eleven_dict)

    def get_price(self, response):
        item = response.meta.get("item")
        BLOCK = extract_re(""""html":(.*?)isFullHouse""", response.body.decode())
        RECORDS = re.findall("""room_unfold(.*?)class='clicked hidden""", BLOCK)
        for RECORD in RECORDS:
            # Room type name.
            RoomName = extract_re(r"""RoomName\\":\\"(.*?)\\""", RECORD)
            item["ROOM_TYPE"] = RoomName
            RECORDS2 = re.findall(
                r"""data-hotelInvoice(.*?class=\\"hotel_room_last\\">.*?<\\/div>)""",
                RECORD)
            for RECORD2 in RECORDS2:
                itemValue = copy.deepcopy(item)
                # Product name.
                itemValue["PRODUCT_TYPE"] = extract_re(
                    r"""(room_type_name\\".*?background-image:url\(|room_type_name\\".*?)([^>"]*?)(<br\\/>[^']|\)\\"><|\\/span>|<\\/[es])""",
                    RECORD2, group_num=2)
                # Payment method (the mapped values match the page's Chinese labels).
                pay_type = extract_re(r"""payment_txt\\".*?>(.*?)<""", RECORD2)
                map_pay_type = classify(
                    {
                        "0": "(在线付)",
                        "2": "(担保)",
                        "1": "(到店付)"
                    }, pay_type)
                itemValue["PAYMENT_TYPE"] = map_pay_type if map_pay_type else "null"
                # Agent flag.
                daili = extract_re(r"""data-role=\\"title\\">(.*?)<\\/span>""", RECORD2)
                itemValue["IS_NOT_AGENT"] = daili if daili else "true"
                # Booking status.
                pay_status = extract_re(r"""btns_base22_main\\">(.*?)<""", RECORD2)
                itemValue["AVAILABLE_ROOM_SITUATION"] = classify(
                    {
                        "可预订": "(预订)",
                        "满房": "(订完)"
                    }, pay_status)
                # Breakfast.
                BREAKFAST = extract_re(r"""col4'>(.*?)<""", RECORD2)
                itemValue["BREAKFAST"] = BREAKFAST if BREAKFAST else "null"
                # Original price.
                itemValue["ORIGINAL_PRICE"] = extract_re(
                    r"""data-price='(\d+)'""", RECORD2)
                # Package price.
                taocan_price = extract_re(
                    r"""rt_origin_price\\"><dfn>¥<\\/dfn>(.*?)<""", RECORD2)
                itemValue["DISCOUNT_PRICE"] = taocan_price if taocan_price else "0"
                # Cashback.
                fanjian = extract_re(r"""span>返现(.*?)<""", RECORD2)
                itemValue["DISCOUNT"] = fanjian if fanjian else "0"
                logger.debug(itemValue)  # was a bare print()
                yield itemValue
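# NOTE: extract_re and classify are helpers not shown in this section. From
# their call sites -- extract_re(pattern, text, group_num=...) returning a
# single capture group, and classify(mapping, text) turning scraped text into
# a code -- one plausible sketch follows. This is hypothetical; the project's
# real helpers may behave differently:
import re

def extract_re(pattern, text, group_num=1):
    # Return the requested capture group of the first match, or None.
    match = re.search(pattern, text or "", re.S)
    return match.group(group_num) if match else None

def classify(mapping, text):
    # Return the key whose associated label text contains the scraped value,
    # e.g. classify({"0": "(在线付)"}, "在线付") -> "0".
    if text:
        for key, label in mapping.items():
            if text in label:
                return key
    return None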
def close_spider(self, spider):
    self.mariadb_connection.close()
    # If mail_item_dict is non-empty, send the notification email.
    if mail_item_dict:
        logger.info('New lowest prices found, sending email...')
        self._send_email(mail_item_dict.values())
if __name__ == '__main__':
    rules, sleep = CronJobScheduler().tick()
    logger.info('rules: {}, sleep: {}'.format(rules, sleep))