def loop():
    """Background poller that auto-toggles the camera from device presence.

    Runs forever.  While ``auto_detect_status`` is set in redis, fetches the
    list of connected human-owned devices and flips the ``home`` and
    ``camera_status`` redis flags accordingly; otherwise idles for a minute
    between checks.
    """
    while True:
        if utils.redis_get('auto_detect_status'):
            try:
                connected_humans = get_connected_humans()
            except Exception:
                # Was a bare ``except:`` logging the placeholder "message";
                # catch only real errors and log the traceback with a useful
                # message, then retry shortly.
                LOGGER.exception('Error while fetching connected_humans')
                time.sleep(5)
                continue
            if connected_humans:
                LOGGER.info('%s are connected. Turning off camera',
                            connected_humans)
                utils.redis_set('home', True)
                utils.redis_set('camera_status', False)
                # Someone is home; no need to re-check for a while.
                time.sleep(60 * 5)
            else:
                LOGGER.info('No humans are connected.')
                utils.redis_set('home', False)
                utils.redis_set('camera_status', True)
                # newly connected devices take ~ 30 seconds to show up
                time.sleep(30)
        else:
            time.sleep(60)
def parse_list_v1(page=None, page_num=None, data=None, refer=None, proxy=None):
    """Parse one Zhilian list page (layout v1, div-based) and dispatch each
    new row to ``parse_list_columnV1``.

    :param page: raw utf-8 encoded response body of the list page
    :param page_num: page number, used in logging and passed through
    :param data: search payload, passed through to the column parser
    :param refer: referer URL, passed through and used in log lines
    :param proxy: proxy handed to the column parser for detail fetches
    :return: dict with ``status`` (False only when an unexpected exception
             occurred), ``detail_count`` (rows dispatched), and
             ``info_length`` == 0 when no row container was found
    """
    logger = utils.get_logger()
    logger.info('parse_list_v1 当前解析第 %s 页 %s ' % (page_num, data))
    result = {}
    flg = True
    detail_count = 0
    document = etree.HTML(page.decode('utf-8'))
    try:
        # Rows live in one of two container classes (plain / "bg" variant);
        # note the trailing spaces in the class attributes are intentional.
        containers = document.xpath(
            '//div[@class="main-left main_current_items"]//div[@class="details_container " or @class="details_container bg_container "]'
        )
        if containers:
            page_index = 0
            for container in containers:
                # Build "yy-mm-dd" for the day before yesterday; a row whose
                # serialized HTML contains that date is past the crawl
                # window, so stop this page.
                time_ = datetime.datetime.now() - datetime.timedelta(days=2)
                time_str_ = time_.strftime('%y') + '-' + time_.strftime(
                    '%m') + '-' + time_.strftime('%d')
                if time_str_ in etree.tostring(container, encoding='utf-8'):
                    logger.info('当前行的发布时间为 %s ,超过时间限制,不再进行抓取 %s ' %
                                (time_str_, refer))
                    flg = True
                    break
                # Redis-backed dedupe keyed on the vacancy id.
                vacancyid = container.xpath('input[@name="vacancyid"]/@value')
                if vacancyid:
                    key = 'zhilian_' + vacancyid[0]
                    if utils.redis_get(key):
                        logger.info('parse_list_v1已经抓取,跳过 %s-%s' % (key, refer))
                        continue
                    else:
                        logger.info('parse_list_v1当前id不重复,进行抓取 %s ' % key)
                        utils.redis_set(key, '1')
                    page_index += 1
                    detail_count += 1
                    # Hand the serialized row to the column parser, which
                    # presumably fetches/stores the detail record.
                    parse_list_columnV1(content=etree.tostring(
                        container, encoding='utf-8'),
                                        page_num=page_num,
                                        page_index=page_index,
                                        data=data,
                                        refer=refer,
                                        proxy=proxy)
        else:
            logger.error('parse_list_v1 没有找到details_container 元素 %s' % data)
            # flg = True
            result['info_length'] = 0
    except Exception as e:
        logger.error('parse_list_v1 出错了 %s %s' % (data, traceback.format_exc()))
        flg = False
    result['status'] = flg
    result['detail_count'] = detail_count
    logger.info('parse_list_v1 %s 第 %s 页执行完毕 %s ' % (refer, page_num, result))
    return result
def parse_list_html(list_html, track_id, page_num):
    """Parse a 51job list page into per-job dicts.

    :param list_html: raw HTML of the list page (str)
    :param track_id: crawl trace id (not used inside; kept for callers)
    :param page_num: page number recorded on each extracted row
    :return: list of dicts with keys job_url / job_name / layout / page_num /
             page_index / urgentFlag / hotFlag, or the sentinel string
             'none_jd' when no (new) jobs were found
    """
    if '对不起,没有找到符合你条件的职位' in list_html:
        return 'none_jd'
    # One '<div class="el">...</div>' fragment per job row.
    findall = re.findall('<div class="el">[\\s\\S]*?</div>', list_html)
    # lines = tree.xpath('//div[@id="resultList"]')
    if findall:
        info_list = []
        now_time = (datetime.datetime.now()).strftime("%m-%d")
        yestday_time = (datetime.datetime.now() +
                        datetime.timedelta(-1)).strftime("%m-%d")
        for index, line in enumerate(findall):
            line = trim(line)
            tree = xmlh.document_fromstring(line)
            a_ = tree.xpath('//p/span/a')
            # Robustness fix: rows without a "t5" publish-date span used to
            # raise IndexError on [0] and abort the whole page; skip them.
            layout_txts = tree.xpath('//span[@class="t5"]/text()')
            if not layout_txts:
                continue
            layout_txt = layout_txts[0]
            hot_flag = 0
            urgent_flag = 0
            # Hot / urgent badges are distinguished by the icon file name.
            if tree.xpath('.//p/span/img'):
                img_name = tree.xpath('.//p/span/img')[0].attrib['src'].split(
                    '/')[-1]
                if img_name == 'tag_hot.jpg':
                    hot_flag = 1
                elif img_name == 'tag_qk.jpg':
                    urgent_flag = 1
            # Only rows published today or yesterday ("mm-dd") are kept.
            if now_time == layout_txt or yestday_time == layout_txt:
                if a_:
                    job_url = a_[0].attrib.get('href')
                    # Redis-backed dedupe keyed on the job URL.
                    if utils.redis_get(job_url):
                        continue
                    one_info = {
                        'job_url': job_url,
                        'job_name': a_[0].text.encode(),
                        'layout': layout_txt,
                        'page_num': page_num,
                        'page_index': index + 1,
                        'urgentFlag': urgent_flag,
                        'hotFlag': hot_flag,
                    }
                    info_list.append(one_info)
                    utils.redis_set(job_url, '1')
        if info_list:
            return info_list
    return 'none_jd'
def parse_list_v2(page=None, page_num=None, data=None, refer=None, proxy=None):
    """Parse one Zhilian list page (layout v2, table-based) and dispatch each
    new row to ``parse_list_columnV2``.

    :param page: raw utf-8 encoded response body of the list page
    :param page_num: page number, used in logging and passed through
    :param data: search payload, passed through to the column parser
    :param refer: referer URL, passed through and used in log lines
    :param proxy: proxy handed to the column parser for detail fetches
    :return: dict with ``status`` (False only when an unexpected exception
             occurred), ``detail_count`` (rows dispatched), and
             ``info_length`` == 0 when nothing crawlable was found
    """
    logger = utils.get_logger()
    logger.info('parse_list_v2 当前解析第 %s 页 %s ' % (page_num, data))
    document = etree.HTML(page.decode('utf-8'))
    flg = True
    result = {}
    detail_count = 0
    try:
        div = document.xpath('//div[@id="newlist_list_content_table"]')
        if not div:
            # No result container at all: report success with zero rows.
            logger.info('newlist_list_content_table 没有符合条件的职位 %s' %
                        json.dumps(data))
            result['info_length'] = 0
            result['status'] = True
            return result
        tables = div[0].xpath('//table')
        if tables:
            page_index = 0
            for table in tables:
                table_str_ = etree.tostring(table, encoding='utf-8')
                # A "recommended jobs" filler section or a posting dated
                # "前天" (two days ago) marks the end of the crawl window.
                if '以下职位也很不错' in table_str_ or '前天' in table_str_:
                    logger.info('当前好行搜索到 推荐位置,或者是发布时间为前天,程序结束 %s ' % refer)
                    result['info_length'] = 0
                    result['status'] = True
                    break
                # Skip the header table carrying the column captions.
                if '职位名称' not in table_str_ and '公司名称' not in table_str_:
                    logger.info('parse_list_v2 解析第 %s 条 ' % page_index)
                    page_index += 1
                    tr = table.xpath('tr')
                    if tr:
                        # Redis-backed dedupe keyed on the vacancy id.
                        vacancyid = tr[0].xpath(
                            'td/input[@name="vacancyid"]/@value')
                        if vacancyid:
                            key = 'zhilian_' + vacancyid[0]
                            if utils.redis_get(key):
                                logger.info('parse_list_v2已经抓取,跳过 %s-%s' %
                                            (key, refer))
                                continue
                            else:
                                logger.info('parse_list_v2当前id不重复,进行抓取 %s ' %
                                            key)
                                utils.redis_set(key, '1')
                            detail_count += 1
                            pay_data = {}
                            # Pinned-on-top badge (置顶)
                            on_top = table.xpath(
                                './/tr[1]/td[1]/div/a[2]/img[1]')
                            if on_top:
                                pay_data['onTopFlag'] = 1
                            # Urgent badge (加急)
                            urgent_flag = table.xpath(
                                './/tr[1]/td[1]/div/a[2]/img[2]')
                            if urgent_flag:
                                pay_data['urgentFlag'] = 1
                            # Paid membership badge (会员服务)
                            member = table.xpath('.//tr[1]/td[3]/a[2]/img')
                            if member:
                                pay_data['memberFlag'] = 1
                            logger.info('付费选项 %s' % pay_data)
                            parse_list_columnV2(content=etree.tostring(
                                tr[0], encoding='utf-8'),
                                                page_index=page_index,
                                                page_num=page_num,
                                                data=data,
                                                refer=refer,
                                                proxy=proxy,
                                                pay_data=pay_data)
                    else:
                        logger.error('parse_list_v2 页面不包含 tr 标签 %s ' % data)
        else:
            logger.error('parse_list_v2 没有找到table元素 %s ' % data)
            result['info_length'] = 0
    except Exception as e:
        logger.error('parse_list_v2 出错了 %s ' % data)
        logger.error(traceback.format_exc())
        flg = False
    result['status'] = flg
    result['detail_count'] = detail_count
    logger.info('parse_list_v2 %s 第 %s 页执行完毕 %s ' % (refer, page_num, result))
    return result
def parse_list_html(list_html, url, trackId, proxy_ip):
    """Parse a 58.com list page into per-job dicts plus the next-page link.

    :param list_html: raw HTML of the list page
    :param url: list-page URL; used as Referer and to extract the page number
    :param trackId: crawl trace id used in log lines
    :param proxy_ip: proxy used for the follow-up brand/wltStats API call
    :return: None on an abnormal page (empty body, missing title, captcha /
             access-denied / error title); the sentinel 'none_jd' when there
             are no (more) jobs; otherwise a dict with 'info_list' and
             'next_url'.
    """
    if not list_html:
        logger.error("没有获取到页面%s" % trackId)
        return None
    tree = xmlh.document_fromstring(list_html)
    none_jd = tree.xpath('//*[@id="infolist"]/dl[1]/dt/text()')
    # Bug fix: none_jd is a *list* of text nodes, so the previous
    # `"..." in none_jd` was list membership (exact node equality) and
    # missed nodes with surrounding whitespace; test the node text instead.
    if none_jd and "没有符合条件的信息" in none_jd[0]:
        return 'none_jd'
    title_txt = tree.xpath('/html/head/title/text()')
    if not title_txt:
        logger.error("搜索到的页面异常:%s" % trackId)
        return None
    # Captcha / access-denied / error pages are treated as abnormal.
    if spider_utils.find(
            '.*?请输入验证码.*?|.*?Denied Access Policy.*?|.*?ERROR.*?',
            str(title_txt[0])):
        logger.error("搜索到的页面异常:%s" % trackId)
        return None
    lis = tree.xpath('//*[@id="list_con"]/li[@class="job_item clearfix"]')
    if not lis:
        logger.info('没有jd了-- %s' % trackId)
        return 'none_jd'
    list_dis = {}
    info_list = []
    # page_num_text = tree.xpath('//div[@class="pagerout"]/div/strong/span/text()')
    now_page_num = find('.*?pn(\\d+).*?', url)
    key_set = set()
    user_ids = set([])
    for index, item in enumerate(lis):
        # One job row; __addition="0" filters out injected ad rows.
        div = item.xpath('./div/div[@__addition="0"]')
        if not div:
            continue
        result_dist = {
            'onTopFlag': 0,
            'authentication': {},
        }
        result_dist['pageNum'] = now_page_num
        result_dist['pageIndex'] = index + 1
        link = item.xpath('./div/div/a')[0].attrib.get('href')
        # Dedupe key is the entinfo id embedded in the detail link.
        redis_key = utils.find('.*?entinfo=(\\d+)_.*?', link)
        if not redis_key or redis_key in key_set:
            # Skip duplicates within this page.
            continue
        else:
            redis_key = 'five_eight:' + redis_key
            redis_get_link = utils.redis_get(redis_key)
            if redis_get_link:
                key_set.add(redis_key)
                continue
            else:
                utils.redis_set(redis_key, '1')
                key_set.add(redis_key)
        # The "sign" badge carries either an ad tag or the publish time.
        layout_list = item.xpath('./a[@class="sign"]/text()')
        layout = None
        if layout_list:
            layout = layout_list[0]
        result_dist['info_url'] = link
        if layout:
            if '精准' in layout:
                # "精准" (precision ad): no publish time available.
                result_dist['jdLayoutTime'] = None
                result_dist['adTag'] = '1'
                result_dist['onTopFlag'] = 1
            elif '置顶' in layout:
                # "置顶" (pinned on top).
                result_dist['jdLayoutTime'] = None
                result_dist['adTag'] = '2'
                result_dist['onTopFlag'] = 1
            else:
                result_dist['jdLayoutTime'] = layout
                result_dist['adTag'] = None
        else:
            result_dist['jdLayoutTime'] = None
            result_dist['adTag'] = None
        if item.xpath('.//i[@class="comp_icons mingqi"]'):
            result_dist['authentication']['58_mingqi'] = 1
        uids = item.xpath('.//input[@name="uid"]')
        if uids and uids[0].attrib.get('uid', ''):
            user_ids.add(uids[0].attrib['uid'])
            result_dist['uid'] = uids[0].attrib['uid']
        info_list.append(result_dist)
    brand = {}
    wltStats = {}
    if user_ids:
        # Batch-query the brand / wltStats API for all uids on this page.
        get_brand_headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Host': 'zp.service.58.com',
            'Referer': url,
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        }
        params = {'userIds': '|'.join(user_ids)}
        # Encoding fix: "&params=" had been mojibake'd into "¶ms="
        # ("&para" rendered as the pilcrow sign), which broke the query
        # string so the API never received the userIds parameter.
        get_brand_url = (
            'http://zp.service.58.com/api?returnType=1&action=wltStats,brand'
            '&callback=jQuery110207922296067085253_%s&params=%s&_=%s') % (
                str(int(time.time() * 1000)),
                urllib.quote(json.dumps(params)),
                str(int(time.time() * 1000)))
        get_brand_response = utils.download(url=get_brand_url,
                                            headers=get_brand_headers,
                                            proxy=proxy_ip,
                                            allow_redirects=False,
                                            is_json=False,
                                            retry_time=3)
        if not get_brand_response['code']:
            # Strip the JSONP wrapper: callback( ...json... )
            get_brand_json_list = re.findall('[^\\(]*\\((.*)\\)[^\\)]*',
                                             get_brand_response['data'])
            if get_brand_json_list and get_brand_json_list[0]:
                get_brand_json = json.loads(get_brand_json_list[0])
                brand = get_brand_json.get('brand', {}).get('data', {})
                wltStats = get_brand_json.get('wltStats', {}).get('data', {})
        for i in info_list:
            if i.get('uid', ''):
                if i['uid'] in brand and brand[i['uid']]:
                    i['authentication']['58_renzheng'] = 1
                # "wltNN" encodes the paid membership level NN.
                if i['uid'] in wltStats and wltStats[i['uid']].startswith(
                        'wlt') and wltStats[i['uid']][3:].isdigit():
                    i['memberFlag'] = int(wltStats[i['uid']][3:])
    list_dis['info_list'] = info_list
    next_url = None
    if len(info_list) >= 1:
        # 每页总数55 (55 rows per full page)
        next_url = tree.xpath('//div[@class="pagesout"]/a[@class="next"]')
        if next_url:
            list_dis['next_url'] = next_url[0].attrib.get('href')
            return list_dis
        else:
            list_dis['next_url'] = None
            return list_dis
    # NOTE(review): when every row was deduped away, the original fell off
    # the end and returned None (abnormal-page sentinel) — preserved here.
    return None
def run(self):
    """Main camera-thread loop.

    While ``camera_status`` is enabled in redis: stream frames, classify
    each as occupied/unoccupied, keep a rolling motion window, periodically
    save the latest image, optionally persist training pickles, and post a
    slack alert when sustained motion is detected.  When ``camera_status``
    is cleared, stored data is flushed and the stream is closed; the outer
    loop then resumes polling.
    """
    while True:
        if utils.redis_get('camera_status'):
            stream_iterator = self.stream()
            for frame, frame_delta, contours in stream_iterator:
                timestamp = datetime.now()
                ts = timestamp.strftime(self.ts_format_2)
                # Classify latest frame as occupied or not
                occupied = self.model.classify(
                    frame, contours, self.pir_values)
                # Rolling 0/1 window of the last motion_store_cnt results.
                self.motion_counter.append(1 if occupied else 0)
                self.motion_counter = self.motion_counter[
                    -1*self.motion_store_cnt:]
                # Save latest image if enough time has elapsed since last save
                # NOTE(review): ``.seconds`` ignores the days component of
                # the timedelta (wraps every 24h) — presumably intervals here
                # are always short; confirm or use ``.total_seconds()``.
                last_save = (timestamp - self.last_save).seconds
                if last_save >= self.min_save_seconds:
                    LOGGER.debug('Saving latest image')
                    self.save_last_image(frame, timestamp, 'latest', True)
                    self.last_save = timestamp
                # Save for backtesting & training
                if not occupied and self.train:
                    self.save_pickle(
                        self.frames, frame_delta, self.avg, contours,
                        self.pir_values, ts, classification=False
                    )
                # Determine whether to notify in slack
                last_notified = (timestamp - self.last_notified).seconds
                notify_time_check = last_notified >= self.min_notify_seconds
                notifications_on = utils.redis_get('camera_notifications')
                # Alert only when enough of the recent window was occupied.
                enough_motion = np.mean(self.motion_counter) \
                    >= self.min_occupied_fraction
                if notifications_on and notify_time_check and enough_motion:
                    LOGGER.info('Sending slack alert!')
                    fpath = self.save_last_image(frame, timestamp, ts)
                    self.last_notified = timestamp
                    response = utils.slack_upload(
                        fpath, title=os.path.basename(fpath))
                    # The uploaded copy is transient; remove the local file.
                    os.remove(fpath)
                    # Save for backtesting & training
                    if self.train:
                        utils.slack_post_interactive(response)
                        self.save_pickle(
                            self.frames, frame_delta, self.avg, contours,
                            self.pir_values, ts, classification=True
                        )
                # Camera was switched off mid-stream: clean up and stop.
                if not utils.redis_get('camera_status'):
                    LOGGER.info('Clearing stored data')
                    self.clear_stored_data()
                    LOGGER.info('Stopping camera thread')
                    stream_iterator.close()
                    break
        else:
            time.sleep(2)
def parse_list_page(page_num=None, page=None, list_url=None, data=None,
                    refer=None, session=None, proxy=None):
    """Parse one Ganji result-list page and dispatch each unseen row to
    ``parse_detail_page``.

    :param page_num: page number, used in logging and passed through
    :param page: raw utf-8 encoded response body of the list page
    :param list_url: URL of the list page, used in log lines
    :param data: search payload, passed through to the detail parser
    :param refer: referer URL, passed through and used in log lines
    :param session: HTTP session handed to the detail parser
    :param proxy: proxy handed to the detail parser
    :return: dict with ``status`` (False on unexpected exception) and, when
             nothing crawlable was found, ``info_length`` == 0
    """
    logger = utils.get_logger()
    logger.info('当前解析第 %s 页面列表 %s ' % (page_num, list_url))
    result = {'status': False}
    if '没有找到相关的信息' in page:
        logger.info('没有符合条件的职位 %s' % json.dumps(data))
        result['status'] = True
        result['info_length'] = 0
    else:
        try:
            document = etree.HTML(page.decode('utf-8'))
            dl_list = document.xpath(
                '//dl[@class="list-noimg job-list clearfix new-dl"]')
            if dl_list and len(dl_list) > 0:
                logger.info('当前页有 %s 条数据' % len(dl_list))
                page_index = 1
                for dl in dl_list:
                    logger.info("解析第 %s 条" % page_index)
                    # Redis-backed dedupe keyed on the row's puid attribute.
                    puid = re.search('puid="(\\d+)"',
                                     etree.tostring(dl, encoding='utf-8'))
                    if puid:
                        puid_ = puid.group(1)
                        if not puid_:
                            continue
                        key = 'ganji_' + puid_
                        if utils.redis_get(key):
                            logger.info('不进行重复抓取 %s %s ' % (key, page_index))
                            continue
                        else:
                            logger.info('进行抓取 %s %s ' % (key, page_index))
                            # Consistency fix: store the string '1' like the
                            # sibling spiders (was the bare int 1).
                            utils.redis_set(key, '1')
                    page_index += 1
                    jd_layout_time = dl.xpath(
                        'dd[@class="pub-time"]/span/text()')[0]
                    logger.info('发布时间: %s' % jd_layout_time)
                    # Build "mm-dd" for the day before yesterday; a row at or
                    # beyond that age ends the crawl window for this search.
                    time_ = datetime.datetime.now() - datetime.timedelta(
                        days=2)
                    time_str_ = time_.strftime('%m') + '-' + time_.strftime(
                        '%d')
                    if time_str_ in jd_layout_time:
                        logger.info(' %s 当前行 发布时间为 %s ,或者更前,程序退出 ' %
                                    (refer, time_str_))
                        result['info_length'] = 0
                        result['status'] = True
                        break
                    detail_url = dl.xpath('dt/a/@href')
                    logger.info('详情页面URL为:%s' % detail_url)
                    if detail_url:
                        pay_data = {}
                        # "帮帮" membership level badge
                        bb_count = dl.xpath(
                            './/span[@class="ico-bang-new"]/text()')
                        if bb_count:
                            pay_data['memberFlag'] = bb_count[0]
                        # Corporate email verified badge
                        ganji_email = dl.xpath(
                            './/span[@class="s-mailbox01"]/text()')
                        if ganji_email:
                            pay_data['ganji_email'] = 1
                        # "Hot position" badge
                        ganji_hot = dl.xpath(
                            './/span[@class="ico-hot"]/text()')
                        if ganji_hot:
                            pay_data['hotFlag'] = 1
                        # Safety/"驴招" badge
                        ganji_lv = dl.xpath('.//span[@class="icon-safety"]')
                        if ganji_lv:
                            pay_data['ganji_lv'] = 1
                        # Pinned-on-top badge.  Bug fix: contains() arguments
                        # were reversed — contains("class", "new-top-icon")
                        # tests the literal string "class" and can never
                        # match; the @class attribute must be the haystack.
                        ganji_top = dl.xpath(
                            './/span[contains(@class,"new-top-icon") or contains(@class,"ico-stick-yellow")]'
                        )
                        if ganji_top:
                            pay_data['onTopFlag'] = 1
                        # Brand badge
                        ganji_branch = dl.xpath('.//span[@class="icon-pp"]')
                        if ganji_branch:
                            pay_data['ganji_branch'] = 1
                        logger.info('付费的数据 %s ' % pay_data)
                        parse_detail_page(detail_url=detail_url[0],
                                          list_url=list_url,
                                          data=data,
                                          jd_layout_time=jd_layout_time,
                                          page_num=page_num,
                                          page_index=page_index,
                                          refer=refer,
                                          session=session,
                                          proxy=proxy,
                                          pay_data=pay_data)
            else:
                logger.error('没有解析到 new-dl 元素 %s' % list_url)
                result['info_length'] = 0
                result['status'] = True
        except Exception as e:
            logger.error('parse_list_page 列表页解析异常 %s %s ' %
                         (list_url, traceback.format_exc()))
            result['status'] = False
    logger.info('parse_list_page %s 第 %s 页 执行完毕 %s' %
                (list_url, page_num, result))
    return result