Example #1
def loop():
    while True:
        if utils.redis_get('auto_detect_status'):

            try:
                connected_humans = get_connected_humans()
            except Exception:
                LOGGER.exception('Error while fetching connected_humans')
                time.sleep(5)
                continue

            if connected_humans:
                LOGGER.info('%s are connected. Turning off camera', 
                            connected_humans)
                utils.redis_set('home', True)
                utils.redis_set('camera_status', False)
                time.sleep(60*5)
            else:
                LOGGER.info('No humans are connected.')
                utils.redis_set('home', False)
                utils.redis_set('camera_status', True)
                # newly connected devices take ~ 30 seconds to show up
                time.sleep(30)

        else:
            time.sleep(60)
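
All of these examples lean on utils.redis_get and utils.redis_set without showing them. Below is a minimal sketch of what such wrappers might look like on top of redis-py; the module layout, JSON encoding, and localhost connection are assumptions, not taken from the examples.

# Hypothetical sketch of the utils.redis_get / utils.redis_set helpers the
# examples assume; the JSON encoding and connection defaults are guesses.
import json

import redis

_CLIENT = redis.StrictRedis(host='localhost', port=6379, db=0)


def redis_get(key):
    """Return the decoded value stored under key, or None if it is unset."""
    raw = _CLIENT.get(key)
    return json.loads(raw) if raw is not None else None


def redis_set(key, value):
    """JSON-encode value and store it under key."""
    _CLIENT.set(key, json.dumps(value))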
Example #2
def parse_list_v1(page=None, page_num=None, data=None, refer=None, proxy=None):
    logger = utils.get_logger()
    logger.info('parse_list_v1 parsing page %s %s' % (page_num, data))
    result = {}
    flg = True
    detail_count = 0
    document = etree.HTML(page.decode('utf-8'))
    try:
        containers = document.xpath(
            '//div[@class="main-left main_current_items"]//div[@class="details_container  " or  @class="details_container bg_container "]'
        )
        if containers:
            page_index = 0
            for container in containers:
                # date string (yy-mm-dd) for two days ago; posts that old stop the crawl
                time_ = datetime.datetime.now() - datetime.timedelta(days=2)
                time_str_ = time_.strftime('%y-%m-%d')
                if time_str_ in etree.tostring(container, encoding='utf-8'):
                    logger.info('Row published on %s, past the time cutoff; '
                                'stopping the crawl %s' % (time_str_, refer))
                    flg = True
                    break
                vacancyid = container.xpath('input[@name="vacancyid"]/@value')
                if vacancyid:
                    key = 'zhilian_' + vacancyid[0]
                    if utils.redis_get(key):
                        logger.info('parse_list_v1 already crawled, '
                                    'skipping %s-%s' % (key, refer))
                        continue
                    else:
                        logger.info('parse_list_v1 id is new, crawling %s' % key)
                        utils.redis_set(key, '1')
                page_index += 1
                detail_count += 1
                parse_list_columnV1(content=etree.tostring(container,
                                                           encoding='utf-8'),
                                    page_num=page_num,
                                    page_index=page_index,
                                    data=data,
                                    refer=refer,
                                    proxy=proxy)
        else:
            logger.error('parse_list_v1 found no details_container element %s' % data)
            # flg = True
            result['info_length'] = 0

    except Exception:
        logger.error('parse_list_v1 error %s %s' %
                     (data, traceback.format_exc()))
        flg = False
    result['status'] = flg
    result['detail_count'] = detail_count
    logger.info('parse_list_v1 %s page %s finished %s' % (refer, page_num, result))
    return result
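
Examples #2 through #5 (and #7) all dedupe with the same two-step pattern: utils.redis_get(key) to test whether an id was seen, then utils.redis_set(key, '1') to mark it. Because the test and the mark are separate round trips, two concurrent crawlers can both claim the same key, and the keys never expire. A hedged alternative, sketched here against a plain redis-py client rather than the unshown utils module, collapses both steps into one atomic command with a TTL:

# Sketch: atomic mark-if-new with expiry, replacing the separate
# redis_get / redis_set pair. The 30-day TTL is an arbitrary example.
import redis

client = redis.StrictRedis()


def mark_if_new(key, ttl_seconds=30 * 24 * 3600):
    """Return True the first time key is seen, False on repeats.

    SET with nx=True and ex=ttl_seconds is a single atomic command,
    so two concurrent crawlers cannot both claim the same key.
    """
    return bool(client.set(key, '1', nx=True, ex=ttl_seconds))

With a helper like this, the get/set pair around each vacancyid would reduce to a single "if not mark_if_new(key): continue".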
Example #3
def parse_list_html(list_html, track_id, page_num):
    """
    Parse the list page.
    :return: list of dicts with keys job_url, job_name, layout
    """

    if '对不起,没有找到符合你条件的职位' in list_html:  # "Sorry, no matching jobs"
        return 'none_jd'
    findall = re.findall('<div class="el">[\\s\\S]*?</div>', list_html)

    # lines = tree.xpath('//div[@id="resultList"]')
    if findall:
        info_list = []
        now_time = datetime.datetime.now().strftime("%m-%d")
        yesterday_time = (datetime.datetime.now() +
                          datetime.timedelta(-1)).strftime("%m-%d")
        for index, line in enumerate(findall):
            line = trim(line)
            tree = xmlh.document_fromstring(line)
            a_ = tree.xpath('//p/span/a')
            layout_txt = tree.xpath('//span[@class="t5"]/text()')[0]
            hot_flag = 0
            urgent_flag = 0
            if tree.xpath('.//p/span/img'):
                img_name = tree.xpath('.//p/span/img')[0].attrib['src'].split(
                    '/')[-1]
                if img_name == 'tag_hot.jpg':
                    hot_flag = 1
                elif img_name == 'tag_qk.jpg':
                    urgent_flag = 1
            if now_time == layout_txt or yesterday_time == layout_txt:
                if a_:
                    job_url = a_[0].attrib.get('href')
                    if utils.redis_get(job_url):
                        continue
                    one_info = {
                        'job_url': job_url,
                        'job_name': a_[0].text.encode(),
                        'layout': layout_txt,
                        'page_num': page_num,
                        'page_index': index + 1,
                        'urgentFlag': urgent_flag,
                        'hotFlag': hot_flag,
                    }
                    info_list.append(one_info)
                    utils.redis_set(job_url, '1')
        if info_list:
            return info_list
    return 'none_jd'
Example #4
def parse_list_v2(page=None, page_num=None, data=None, refer=None, proxy=None):
    logger = utils.get_logger()
    logger.info('parse_list_v2 parsing page %s %s' % (page_num, data))
    document = etree.HTML(page.decode('utf-8'))
    flg = True
    result = {}
    detail_count = 0
    try:
        div = document.xpath('//div[@id="newlist_list_content_table"]')
        if not div:
            logger.info('newlist_list_content_table: no matching jobs %s' %
                        json.dumps(data))
            result['info_length'] = 0
            result['status'] = True
            return result
        tables = div[0].xpath('//table')
        if tables:
            page_index = 0
            for table in tables:
                table_str_ = etree.tostring(table, encoding='utf-8')
                if '以下职位也很不错' in table_str_ or '前天' in table_str_:
                    # hit the "these jobs are also good" recommendation block,
                    # or a post from two days ago ("前天"); stop here
                    logger.info('Recommendation block or two-day-old post '
                                'reached; stopping %s' % refer)
                    result['info_length'] = 0
                    result['status'] = True
                    break

                if '职位名称' not in table_str_ and '公司名称' not in table_str_:
                    # skip the header table ("job title" / "company name")
                    logger.info('parse_list_v2 parsing item %s' % page_index)
                    page_index += 1
                    tr = table.xpath('tr')
                    if tr:
                        vacancyid = tr[0].xpath(
                            'td/input[@name="vacancyid"]/@value')
                        if vacancyid:
                            key = 'zhilian_' + vacancyid[0]
                            if utils.redis_get(key):
                                logger.info('parse_list_v2 already crawled, '
                                            'skipping %s-%s' % (key, refer))
                                continue
                            else:
                                logger.info('parse_list_v2 id is new, '
                                            'crawling %s' % key)
                                utils.redis_set(key, '1')
                        detail_count += 1
                        pay_data = {}
                        # pinned to top
                        on_top = table.xpath('.//tr[1]/td[1]/div/a[2]/img[1]')
                        if on_top:
                            pay_data['onTopFlag'] = 1
                        # urgent
                        urgent_flag = table.xpath(
                            './/tr[1]/td[1]/div/a[2]/img[2]')
                        if urgent_flag:
                            pay_data['urgentFlag'] = 1
                        # paid member service
                        member = table.xpath('.//tr[1]/td[3]/a[2]/img')
                        if member:
                            pay_data['memberFlag'] = 1
                        logger.info('Paid options %s' % pay_data)
                        parse_list_columnV2(content=etree.tostring(
                            tr[0], encoding='utf-8'),
                                            page_index=page_index,
                                            page_num=page_num,
                                            data=data,
                                            refer=refer,
                                            proxy=proxy,
                                            pay_data=pay_data)
                    else:
                        logger.error('parse_list_v2 page has no tr tags %s' % data)
        else:
            logger.error('parse_list_v2 no table element found %s' % data)
            result['info_length'] = 0
    except Exception:
        logger.error('parse_list_v2 error %s' % data)
        logger.error(traceback.format_exc())
        flg = False
    result['status'] = flg
    result['detail_count'] = detail_count
    logger.info('parse_list_v2 %s page %s finished %s' % (refer, page_num, result))
    return result
Example #5
def parse_list_html(list_html, url, trackId, proxy_ip):
    """
    Parse the fields from a list page.
    :return: dict with the next list-page URL and a list of dicts
             (detail-page URL and publish date).
             Abnormal pages return None; 'none_jd' means no jobs remain.
    """
    if not list_html:
        logger.error("没有获取到页面%s" % trackId)
        return None

    tree = xmlh.document_fromstring(list_html)

    none_jd = tree.xpath('//*[@id="infolist"]/dl[1]/dt/text()')
    if none_jd and "没有符合条件的信息" in none_jd:  # "no matching listings"
        return 'none_jd'
    title_txt = tree.xpath('/html/head/title/text()')
    if not title_txt:
        logger.error("搜索到的页面异常:%s" % trackId)
        return None
    if spider_utils.find('.*?请输入验证码.*?|.*?Denied Access Policy.*?|.*?ERROR.*?',
                         str(title_txt[0])):  # captcha / access-denied titles
        logger.error("Abnormal search result page: %s" % trackId)
        return None
    lis = tree.xpath('//*[@id="list_con"]/li[@class="job_item clearfix"]')

    if not lis:
        logger.info('No more JDs -- %s' % trackId)
        return 'none_jd'
    list_dis = {}
    info_list = []
    # page_num_text = tree.xpath('//div[@class="pagerout"]/div/strong/span/text()')

    now_page_num = find('.*?pn(\\d+).*?', url)
    key_set = set()
    user_ids = set([])
    for index, item in enumerate(lis):
        # item is a single job posting
        div = item.xpath('./div/div[@__addition="0"]')
        if not div:
            continue
        result_dist = {
            'onTopFlag': 0,
            'authentication': {},
        }
        result_dist['pageNum'] = now_page_num
        result_dist['pageIndex'] = index + 1
        link = item.xpath('./div/div/a')[0].attrib.get('href')
        redis_key = utils.find('.*?entinfo=(\d+)_.*?', link)
        if not redis_key or redis_key in key_set:  # dedupe links
            continue
        else:
            redis_key = 'five_eight:' + redis_key
            redis_get_link = utils.redis_get(redis_key)
            if redis_get_link:
                key_set.add(redis_key)
                continue
            else:
                utils.redis_set(redis_key, '1')
                key_set.add(redis_key)
        # link is the detail-page URL
        layout_list = item.xpath('./a[@class="sign"]/text()')
        layout = None
        if layout_list:
            layout = layout_list[0]
            result_dist['info_url'] = link  # detail-page URL
        if layout:
            if '精准' in layout:  # "precision" ad slot
                result_dist['jdLayoutTime'] = None
                result_dist['adTag'] = '1'
                result_dist['onTopFlag'] = 1
            elif '置顶' in layout:  # pinned to top
                result_dist['jdLayoutTime'] = None
                result_dist['adTag'] = '2'
                result_dist['onTopFlag'] = 1
            else:
                result_dist['jdLayoutTime'] = layout
                result_dist['adTag'] = None
        else:
            result_dist['jdLayoutTime'] = None
            result_dist['adTag'] = None
        if item.xpath('.//i[@class="comp_icons mingqi"]'):
            result_dist['authentication']['58_mingqi'] = 1
        uids = item.xpath('.//input[@name="uid"]')
        if uids and uids[0].attrib.get('uid', ''):
            user_ids.add(uids[0].attrib['uid'])
            result_dist['uid'] = uids[0].attrib['uid']

        info_list.append(result_dist)

    brand = {}
    wltStats = {}
    if user_ids:
        get_brand_headers = {
            'Accept':
            '*/*',
            'Accept-Encoding':
            'gzip, deflate',
            'Accept-Language':
            'zh-CN,zh;q=0.8',
            'Host':
            'zp.service.58.com',
            'Referer':
            url,
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36',
        }
        params = {'userIds': '|'.join(user_ids)}
        get_brand_url = 'http://zp.service.58.com/api?returnType=1&action=wltStats,brand&callback=jQuery110207922296067085253_%s&params=%s&_=%s' % (
            str(int(time.time() * 1000)), urllib.quote(
                json.dumps(params)), str(int(time.time() * 1000)))
        get_brand_response = utils.download(url=get_brand_url,
                                            headers=get_brand_headers,
                                            proxy=proxy_ip,
                                            allow_redirects=False,
                                            is_json=False,
                                            retry_time=3)
        if not get_brand_response['code']:
            get_brand_json_list = re.findall('[^\(]*\((.*)\)[^\)]*',
                                             get_brand_response['data'])

            if get_brand_json_list and get_brand_json_list[0]:
                get_brand_json = json.loads(get_brand_json_list[0])
                brand = get_brand_json.get('brand', {}).get('data', {})
                wltStats = get_brand_json.get('wltStats', {}).get('data', {})

    for i in info_list:
        if i.get('uid', ''):
            if i['uid'] in brand and brand[i['uid']]:
                i['authentication']['58_renzheng'] = 1
            if i['uid'] in wltStats and wltStats[i['uid']].startswith(
                    'wlt') and wltStats[i['uid']][3:].isdigit():
                i['memberFlag'] = int(wltStats[i['uid']][3:])

    list_dis['info_list'] = info_list
    next_url = None
    if len(info_list) >= 1:  # a full page holds 55 items
        next_url = tree.xpath('//div[@class="pagesout"]/a[@class="next"]')

    if next_url:
        list_dis['next_url'] = next_url[0].attrib.get('href')
        return list_dis
    else:
        list_dis['next_url'] = None
        return list_dis
Example #6
    def run(self):
        while True:
            if utils.redis_get('camera_status'):
                stream_iterator = self.stream()

                for frame, frame_delta, contours in stream_iterator:
                    timestamp = datetime.now()
                    ts = timestamp.strftime(self.ts_format_2)

                    # Classify latest frame as occupied or not
                    occupied = self.model.classify(
                        frame, contours, self.pir_values)

                    self.motion_counter.append(1 if occupied else 0)
                    self.motion_counter = self.motion_counter[
                        -1*self.motion_store_cnt:]

                    # Save latest image if enough time has elapsed since last save
                    last_save = (timestamp - self.last_save).seconds
                    if last_save >= self.min_save_seconds:
                        LOGGER.debug('Saving latest image')
                        self.save_last_image(frame, timestamp, 'latest', True)
                        self.last_save = timestamp

                        # Save for backtesting & training
                        if not occupied and self.train:
                            self.save_pickle(
                                self.frames, frame_delta, self.avg, contours,
                                self.pir_values, ts, classification=False
                            )

                    # Determine whether to notify in slack
                    last_notified = (timestamp - self.last_notified).seconds
                    notify_time_check = last_notified >= self.min_notify_seconds
                    notifications_on = utils.redis_get('camera_notifications')
                    enough_motion = np.mean(self.motion_counter) \
                        >= self.min_occupied_fraction

                    if notifications_on and notify_time_check and enough_motion:
                        LOGGER.info('Sending slack alert!')
                        fpath = self.save_last_image(frame, timestamp, ts)
                        self.last_notified = timestamp
                        response = utils.slack_upload(
                            fpath, title=os.path.basename(fpath))
                        os.remove(fpath)

                        # Save for backtesting & training
                        if self.train:
                            utils.slack_post_interactive(response)
                            self.save_pickle(
                                self.frames, frame_delta, self.avg, contours,
                                self.pir_values, ts, classification=True
                            )


                    if not utils.redis_get('camera_status'):
                        LOGGER.info('Clearing stored data')
                        self.clear_stored_data()
                        LOGGER.info('Stopping camera thread')
                        stream_iterator.close()
                        break
            else:
                time.sleep(2)
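
Examples #1 and #6 treat Redis keys (auto_detect_status, camera_status, camera_notifications) as on/off switches that the loops poll once per iteration. A small sketch of flipping those flags from another process, assuming the hypothetical utils wrappers sketched after Example #1:

# Sketch: toggling the flags that Example #6 polls, from a separate process.
# Assumes the hypothetical utils.redis_get / redis_set wrappers above.
import utils


def set_camera(enabled):
    """Flip the flag the camera thread checks on every loop iteration."""
    utils.redis_set('camera_status', bool(enabled))


def set_notifications(enabled):
    """Enable or disable the Slack alerts gated inside run()."""
    utils.redis_set('camera_notifications', bool(enabled))


if __name__ == '__main__':
    set_camera(False)  # run() sees the change, closes the stream, and breaks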
Example #7
def parse_list_page(page_num=None,
                    page=None,
                    list_url=None,
                    data=None,
                    refer=None,
                    session=None,
                    proxy=None):
    logger = utils.get_logger()
    logger.info('Parsing list page %s: %s' % (page_num, list_url))
    result = {'status': False}
    if '没有找到相关的信息' in page:  # "no related listings found"
        logger.info('No matching jobs %s' % json.dumps(data))
        result['status'] = True
        result['info_length'] = 0
    else:
        try:
            document = etree.HTML(page.decode('utf-8'))
            dl_list = document.xpath(
                '//dl[@class="list-noimg job-list clearfix new-dl"]')
            if dl_list:
                logger.info('Current page has %s rows' % len(dl_list))
                page_index = 1
                for dl in dl_list:
                    logger.info("解析第 %s 条" % page_index)
                    puid = re.search('puid="(\\d+)"',
                                     etree.tostring(dl, encoding='utf-8'))
                    if puid:
                        puid_ = puid.group(1)
                        if not puid_:
                            continue
                        key = 'ganji_' + puid_
                        if utils.redis_get(key):
                            logger.info('Skipping duplicate %s %s' % (key, page_index))
                            continue
                        else:
                            logger.info('Crawling %s %s' % (key, page_index))
                            utils.redis_set(key, 1)
                    page_index += 1
                    jd_layout_time = dl.xpath(
                        'dd[@class="pub-time"]/span/text()')[0]
                    logger.info('Publish time: %s' % jd_layout_time)
                    # date string (mm-dd) for two days ago
                    time_ = datetime.datetime.now() - datetime.timedelta(
                        days=2)
                    time_str_ = time_.strftime('%m-%d')
                    if time_str_ in jd_layout_time:
                        logger.info('%s row published on %s or earlier; '
                                    'stopping' % (refer, time_str_))
                        result['info_length'] = 0
                        result['status'] = True
                        break
                    detail_url = dl.xpath('dt/a/@href')
                    logger.info('Detail page URL: %s' % detail_url)
                    if detail_url:
                        pay_data = {}
                        # Ganji "BangBang" membership
                        bb_count = dl.xpath(
                            './/span[@class="ico-bang-new"]/text()')
                        if bb_count:
                            pay_data['memberFlag'] = bb_count[0]
                        # corporate email verification
                        ganji_email = dl.xpath(
                            './/span[@class="s-mailbox01"]/text()')
                        if ganji_email:
                            pay_data['ganji_email'] = 1
                        # hot job
                        ganji_hot = dl.xpath(
                            './/span[@class="ico-hot"]/text()')
                        if ganji_hot:
                            pay_data['hotFlag'] = 1
                        # Ganji "Lvzhao" safety badge
                        ganji_lv = dl.xpath('.//span[@class="icon-safety"]')
                        if ganji_lv:
                            pay_data['ganji_lv'] = 1
                        # pinned to top
                        ganji_top = dl.xpath(
                            './/span[contains(@class,"new-top-icon") or contains(@class,"ico-stick-yellow")]'
                        )
                        if ganji_top:
                            pay_data['onTopFlag'] = 1
                        # brand
                        ganji_branch = dl.xpath('.//span[@class="icon-pp"]')
                        if ganji_branch:
                            pay_data['ganji_branch'] = 1
                        logger.info('Paid-feature data %s' % pay_data)
                        parse_detail_page(detail_url=detail_url[0],
                                          list_url=list_url,
                                          data=data,
                                          jd_layout_time=jd_layout_time,
                                          page_num=page_num,
                                          page_index=page_index,
                                          refer=refer,
                                          session=session,
                                          proxy=proxy,
                                          pay_data=pay_data)
            else:
                logger.error('No new-dl elements parsed %s' % list_url)
                result['info_length'] = 0
            result['status'] = True
        except Exception:
            logger.error('parse_list_page list-page parse error %s %s' %
                         (list_url, traceback.format_exc()))
            result['status'] = False
    logger.info('parse_list_page %s page %s finished %s' %
                (list_url, page_num, result))
    return result
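
parse_list_page reports progress through its result dict: 'status' flags parse success, and 'info_length' == 0 signals the date cutoff or an empty page. A paging driver consistent with that contract might look like the sketch below; fetch_page and the page-number query parameter are assumptions made for illustration.

# Hypothetical paging driver built around parse_list_page's result dict;
# fetch_page and the URL scheme are assumptions, not from the example.
def crawl_all_pages(base_url, data, refer, session=None, proxy=None):
    page_num = 1
    while True:
        list_url = '%s?page=%d' % (base_url, page_num)
        page = fetch_page(list_url, session=session, proxy=proxy)  # assumed helper
        result = parse_list_page(page_num=page_num, page=page,
                                 list_url=list_url, data=data, refer=refer,
                                 session=session, proxy=proxy)
        if not result['status']:
            break  # parse error: stop instead of looping forever
        if result.get('info_length') == 0:
            break  # date cutoff reached or no rows on this page
        page_num += 1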