Exemple #1
0
    def _single_by_key_city(self):
        for key in self.keys:
            if not key:
                continue
            for cid in PLUS_CITYS:
                if not cid:
                    continue
                if not isinstance(cid, int):
                    cid = int(cid)

                city_info = self.citys_list.get(cid)
                if not city_info:
                    LOG.info('@@@@@: %s is not have city information, exit...' % cid)
                    sys.exit(1)

                city_full_name = city_info.get('full_name')
                min_page, max_page, max_pagination, max_range = self.tyc_client.get_pagination(key, _type='city',
                                                                                               city_id=cid, cityes=self.citys_list)

                self._is_not_max_range_die(max_range)
                LOG.info('[%s][%s][%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                         % (RUN_MODE, key, city_full_name, min_page, max_page, max_pagination, max_range))
                if not isinstance(max_pagination, int):
                    max_pagination = int(max_pagination)
                for i in range(0, max_range, 1):
                    max_page = PAGINATION + min_page
                    if max_page > max_pagination:
                        max_page = max_pagination
                    self._print_info('[%s][%s][%s]%s ~ %s' % (RUN_MODE, key, city_full_name, min_page, max_page))
                    _res = self.tyc_client.work_by_key(key, min_page, max_page, cid=cid, city_info=city_info)
                    self.to_store(key, min_page, max_page, datas=_res)
                    min_page = max_page + 1
Exemple #2
0
 def _wrapper(*args, **kwargs):
     """Call the wrapped ``fn`` and log how many whole seconds it took.

     All positional and keyword arguments are forwarded unchanged and the
     return value of ``fn`` is returned. Nothing is logged when ``fn``
     raises, because the exception propagates before the log call.
     """
     start = datetime.now()
     res = fn(*args, **kwargs)
     elapsed = datetime.now() - start
     # timedelta.seconds is only the seconds *component* (0..86399) and
     # silently drops whole days; total_seconds() covers the full duration.
     LOG.info('@timeer %s is run: %s' % (fn.__name__,
                                         int(elapsed.total_seconds())))
     return res
Exemple #3
0
def login_in():
    """Render the login page or process a login submission.

    GET: renders ``index.html`` when ``check_login()`` reports success,
    otherwise the login page with an empty message.

    POST: reads ``login_user`` / ``login_password`` from the form, checks
    them in order (missing id, missing password, unknown account, wrong
    password) and re-renders the login page with the matching message on
    the first failure. On success the submitted id is stored in
    ``session['user_id']`` and ``index.html`` is rendered.

    Any other method renders the login page with an empty message.
    """
    def _login_page(message=""):
        return render_template("login.html", login_message=message)

    method = request.method

    if method == 'GET':
        is_ok, _message = check_login()
        if is_ok:
            g.menuf = 'index'
            g.menusub = 'index'
            return render_template("index.html")
        return _login_page()

    if method != 'POST':
        return _login_page()

    form = request.form
    user_id = form.get('login_user')
    user_pwd = form.get('login_password')
    if not user_id:
        return _login_page(u'请输入用户信息(ID、电话、邮箱)')
    if not user_pwd:
        return _login_page(u'请输入账号密码')

    if not SysUserService().get_user_by_params(user_id):
        return _login_page(u'账户未注册')

    # Login is accepted by user id, phone or email.
    if not SysUserService().check_user(user_id, user_pwd):
        return _login_page(u'账号密码不匹配')

    session['user_id'] = user_id
    g.menuf = 'index'
    g.menusub = 'index'
    LOG.info('%s login in ==========' % user_id)
    return render_template("index.html")
    def adds(self, datas):
        """Insert a batch of enterprise records, skipping duplicates.

        A record is counted as a failure when it has no ``credit_code``,
        when ``self.enterprise_bo.get_by_code`` already finds that code,
        when a string record cannot be decoded into a dict, or when
        ``add_model`` raises. Falsy records are skipped silently.

        :param datas: iterable of record dicts; string records are decoded
            first (see the note in the loop)
        :return: ``(success_list, failure_list)`` holding the record names
            (or the raw string for undecodable records); ``None`` when
            ``datas`` is empty
        """
        if not datas:
            LOG.error('DB: data is null.')
            return

        import json  # local import: only needed for string records

        # Attributes copied verbatim from the record onto the new model.
        copied_fields = (
            'name', 'phone', 'email', 'tyc_url', 'company_url', 'address',
            'register_funds', 'paidin_funds', 'establish_date', 'status',
            'credit_code', 'company_type', 'industry', 'business_term',
            'resume', 'business_scope', 'key', 'city', 'sub_city',
        )

        failure_list = list()
        success_list = list()
        for data in datas:
            if not data:
                continue

            if isinstance(data, str):
                # The previous ``dict(data)`` raised ValueError for every
                # non-empty string. NOTE(review): string records are assumed
                # to be JSON-encoded objects — confirm against the producers.
                raw = data
                try:
                    data = json.loads(raw)
                except ValueError:
                    data = None
                if not isinstance(data, dict):
                    LOG.error('DB add error invalid record: %s' % raw)
                    failure_list.append(raw)
                    continue

            credit_code = data.get('credit_code')
            name = data.get('name')
            if not credit_code:
                failure_list.append(name)
                continue
            if self.enterprise_bo.get_by_code(credit_code):
                # A model with this credit code already exists.
                failure_list.append(name)
                continue

            new_model = self.enterprise_bo.new_mode()
            for field in copied_fields:
                setattr(new_model, field, data.get(field))
            new_model.create_time = get_now()
            try:
                self.enterprise_bo.add_model(new_model)
                success_list.append(name)
            except Exception as e:
                LOG.error('DB add error %s: %s' % (e, str(data)))
                failure_list.append(name)

        # The loop never breaks, so the former ``for ... else`` always ran.
        if success_list:
            LOG.info('success list:【%s】' % len(success_list))
        if failure_list:
            LOG.info('failure list:【%s】' % len(failure_list))
        return success_list, failure_list
Exemple #5
0
 def register_blueprint(self, obj_n, obj):
     """
     Register a view blueprint on ``self.app``.

     :param obj_n: blueprint name, used only in the log message
     :param obj: blueprint object; nothing is registered when it is falsy
     :return: None
     """
     if not obj:
         return
     LOG.info('Blueprint %s is register' % obj_n)
     self.app.register_blueprint(obj)
    def get_pagination(self, key):
        """Return the ``(min_page, max_page)`` range to crawl for ``key``.

        The defaults ``(0, 5)`` are returned when ``key`` is empty or when
        ``API_MODE`` is anything other than ``'pro'``. In ``'pro'`` mode the
        first search page is fetched and the text of the second-to-last
        anchor inside the ``search-pagination`` div is taken as the page
        count.

        :param key: search keyword
        :return: tuple ``(min_page, max_page)``; ``max_page`` is always an int
        """
        min_page = 0
        max_page = 5
        if not key:
            return min_page, max_page

        if API_MODE != 'pro':
            # 'tyc' mode uses the fixed default range. An unrecognised mode
            # used to fall through and return None, which breaks callers
            # that unpack the result; it now gets the defaults as well.
            return min_page, max_page

        url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, '0', parse.quote(key))

        def fetch_pagination():
            # Fetch the search page and extract its pagination container(s).
            ok, resp = api_get(url=url,
                               headers=self.headers,
                               data={},
                               resptype='text')
            soup = BeautifulSoup(resp or '', 'lxml')
            return ok, soup.find_all('div', class_='search-pagination')

        is_ok, search_pagination = fetch_pagination()

        # Manual-verification loop: while the request fails or the page has
        # no pagination block, log the URL, wait and retry.
        while not (is_ok and search_pagination):
            LOG.critical('验证############### %s ###############' % url)
            random_sleep(20, 25)
            is_ok, search_pagination = fetch_pagination()

        anchors = search_pagination[0].find_all('a')
        if len(anchors) >= 2:
            # The second-to-last anchor carries the last page number,
            # optionally prefixed with '...'.
            text = (anchors[-2].string or '').strip()
            if '...' in text:
                text = text.split('...')[1]
            try:
                # Previously only the '...' form was converted, so a plain
                # number leaked out as a str.
                max_page = int(text)
            except ValueError:
                LOG.error('[%s] pagination text is not a number: %r'
                          % (key, text))
        LOG.info('[%s] pagination max: %s' % (key, max_page))
        return min_page, max_page
Exemple #7
0
    def _process_by_key_only_sub_city(self):
        """Crawl every (keyword, sub-city) combination with a process pool.

        For each non-empty keyword in ``self.keys`` and each non-empty id in
        ``PLUS_CITYS`` (looked up in ``self.sub_citys_dict``), the page range
        is split into windows of ``PAGINATION`` pages and each window is
        submitted to the pool as a ``self.tyc_client.work_by_key`` task.
        After the pool has finished, everything found in ``self.q`` is moved
        into ``self.ret_res_list`` and stored with ``self.to_store``.

        The process exits with status 1 when a sub-city or its parent city
        cannot be resolved.
        """
        from queue import Empty  # local import: raised by get_nowait()

        workers = MAX_CPU - 1 if MAX_CPU > 2 else 1
        pool = multiprocessing.Pool(processes=workers)
        LOG.info('Main process: %s, run cpu count: %s' % (os.getpid(), workers))
        # Async result handles are kept but never inspected, so exceptions
        # raised inside workers are not surfaced here.
        process = list()

        for key in self.keys:
            if not key:
                continue

            for sub_cid in PLUS_CITYS:
                if not sub_cid:
                    continue
                if not isinstance(sub_cid, int):
                    sub_cid = int(sub_cid)

                sub_city_info = self.sub_citys_dict.get(sub_cid)
                if not sub_city_info:
                    LOG.info('@@@@@: %s is not have sub_city information, exit...' % sub_cid)
                    sys.exit(1)
                sub_city_pid = sub_city_info.get('parent_id')
                city_info = self.citys_list.get(sub_city_pid)
                if not city_info:
                    LOG.info('@@@@@: %s is not have parent city information, exit...' % sub_cid)
                    sys.exit(1)

                city_id = city_info.get('id')
                city_full_name = city_info.get('full_name')
                sub_city_id = sub_city_info.get('id')
                min_page, max_page, max_pagination, max_range = self.tyc_client.get_pagination(key, _type='sub_city',
                                                                                               city_id=city_id,
                                                                                               sub_city_id=sub_city_id,
                                                                                               cityes=self.citys_list,
                                                                                               sub_city_info=sub_city_info)
                max_pagination = int(max_pagination)
                LOG.info('[%s][%s][%s-%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                         % (RUN_MODE, key, city_full_name, sub_city_info.get('full_name'), min_page, max_page, max_pagination, max_range))
                self._is_not_max_range_die(max_range)

                for i in range(0, max_range, 1):
                    max_page = min_page + PAGINATION
                    if max_page > max_pagination:
                        max_page = max_pagination
                    # NOTE(review): 'city' is passed although this method
                    # crawls sub-cities — confirm against work_by_key.
                    process.append(
                        pool.apply_async(self.tyc_client.work_by_key, args=(key, min_page, max_page, 'city',
                                                                            self.q, city_id, sub_city_id, city_info, sub_city_info))
                    )
                    min_page = max_page + 1

        pool.close()
        pool.join()

        # Drain the result queue. Only the "queue is empty" race is
        # tolerated; the former bare ``except: pass`` swallowed every error
        # (including KeyboardInterrupt) and could spin forever.
        while not self.q.empty():
            try:
                self.ret_res_list.append(self.q.get_nowait())
            except Empty:
                continue

        self.to_store(self.keys, MIN_PAGE, MAX_PAGE)
Exemple #8
0
def upload_image():
    """Handle the avatar/profile upload request.

    Takes the ``avatar`` file and the request form, delegates to
    ``SetterService().upload_info`` and returns its result. If that call
    (or reading the form) raises, the error is logged and a ``Status`` 101
    failure payload is returned instead. The acting user id is logged in
    both cases.
    """
    avatar = request.files.get('avatar')
    g.menuf, g.menusub = 'setter', 'user'
    try:
        result = SetterService().upload_info(avatar, request.form)
    except Exception as exc:
        LOG.error("setter>upload_info is error: %s" % exc)
        failure = Status(101, 'failure', u'Server发生错误,获取失败', {})
        result = failure.json()
    LOG.info('%s update information' % get_user_id())
    return result
Exemple #9
0
 def to_store(self, keys, min_page, max_page, datas=None, excelname=None):
     """Persist crawl results to Excel and/or the database.

     Which sinks are used is decided by the module flags ``STORE_EXCEL``
     and ``STORE_DB``.

     :param keys: a keyword or a list of keywords; joined with ``_`` for
         file names and log messages
     :param min_page: first page of the stored range
     :param max_page: last page of the stored range; when either bound is
         falsy the output is labelled ``ALL``
     :param datas: results to store; ``self.ret_res_list`` is used when
         this is empty or None
     :param excelname: accepted but not used by this method
     :return: None
     """
     _keys = keys if isinstance(keys, list) else [keys]
     _data = datas if datas else self.ret_res_list
     has_range = bool(min_page and max_page)

     if STORE_EXCEL:
         if has_range:
             to_excel_name = os.path.join(get_excel_folder(), '%s[%s]-%s[%s~%s].xls'
                                          % (get_now(), API_MODE, '_'.join(_keys), min_page, max_page))
         else:
             to_excel_name = os.path.join(get_excel_folder(), '%s[%s]-%s[ALL].xls' % (get_now(), API_MODE, '_'.join(_keys)))
         self.excel_client.to_excel(_data, ATTRS_DICT, to_excel_name)
         LOG.info("Excel is finished[%s ~ %s]: %s" % (min_page, max_page, to_excel_name))

     if STORE_DB:
         self.enterprise_service.adds(_data)
         # The range/ALL choice now follows the arguments, like the Excel
         # branch; it used to test the MIN_PAGE/MAX_PAGE module constants
         # while printing the argument values.
         if has_range:
             LOG.info('DB is finished[%s ~ %s]: %s' % (min_page, max_page, '_'.join(_keys)))
         else:
             LOG.info('DB is finished[ALL]: %s' % ('_'.join(_keys)))
Exemple #10
0
    def _single_by_key(self):
        for key in self.keys:
            if not key:
                continue

            min_page, max_page, max_pagination, max_range = self.tyc_client.get_pagination(key)
            self._is_not_max_range_die(max_range)
            LOG.info('[%s][%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                     % (RUN_MODE, key, min_page, max_page, max_pagination, max_range))

            if not isinstance(max_pagination, int):
                max_pagination = int(max_pagination)
            for i in range(0, max_range, 1):
                max_page = PAGINATION + min_page
                if max_page > max_pagination:
                    max_page = max_pagination
                self._print_info('[%s][%s]%s ~ %s' % (RUN_MODE, key, min_page, max_page))
                _res = self.tyc_client.work_by_key(key, min_page, max_page)
                self.to_store(key, min_page, max_page, datas=_res)
                min_page = max_page + 1
Exemple #11
0
    def _process_by_key(self):
        """Crawl every keyword with a process pool.

        For each non-empty keyword in ``self.keys`` the page range is split
        into windows of ``PAGINATION`` pages and each window is submitted to
        the pool as a ``self.tyc_client.work_by_key`` task. After the pool
        has finished, everything found in ``self.q`` is moved into
        ``self.ret_res_list`` and stored with ``self.to_store``.

        The process exits via ``sys.exit()`` when a keyword has no
        ``max_range``.
        """
        from queue import Empty  # local import: raised by get_nowait()

        workers = MAX_CPU - 1 if MAX_CPU > 2 else 1
        pool = multiprocessing.Pool(processes=workers)
        LOG.info('Main process: %s, run cpu count: %s' % (os.getpid(), workers))
        # Async result handles are kept but never inspected, so exceptions
        # raised inside workers are not surfaced here.
        process = list()

        for key in self.keys:
            if not key:
                continue

            min_page, max_page, max_pagination, max_range = self.tyc_client.get_pagination(key)
            LOG.info('[%s][%s]spider page: %s ~ %s ||| max_pagination: %s ||| max range: %s'
                     % (RUN_MODE, key, min_page, max_page, max_pagination, max_range))
            if not max_range:
                LOG.error("It's not have max range")
                sys.exit()

            # The sibling crawl methods coerce max_pagination to int before
            # comparing; without it a str value makes the ``>`` below raise
            # TypeError.
            if not isinstance(max_pagination, int):
                max_pagination = int(max_pagination)

            for i in range(0, max_range, 1):
                max_page = min_page + PAGINATION
                if max_page > max_pagination:
                    max_page = max_pagination
                process.append(
                    pool.apply_async(self.tyc_client.work_by_key, args=(key, min_page, max_page,
                                                                        self.q, self.citys_list, self.sub_citys_mapping))
                )
                min_page = max_page + 1

        pool.close()
        pool.join()

        # Drain the result queue. Only the "queue is empty" race is
        # tolerated; the former bare ``except: pass`` swallowed every error
        # (including KeyboardInterrupt) and could spin forever.
        while not self.q.empty():
            try:
                self.ret_res_list.append(self.q.get_nowait())
            except Empty:
                continue

        self.to_store(self.keys, MIN_PAGE, MAX_PAGE)
Exemple #12
0
    def detail_by_url(self, comp_url: str, obj_id: str):
        print(self.count, comp_url, obj_id, '$' * 80)
        detail_res = dict()
        if not comp_url:
            return detail_res

        is_ok, search_resp = api_get(url=comp_url,
                                     headers=self.headers,
                                     data={},
                                     resptype='text')
        if not is_ok:
            return detail_res

        soup = BeautifulSoup(search_resp, 'lxml')

        # header: 详情页 公司名称
        title_list = soup.find_all('div', class_="header")
        et2 = etree.HTML(search_resp)
        # if not title_list:
        #     return -1
        try:
            company_name = (title_list[0].find_all(
                'h1', class_="name"))[0].get_text()
        except:
            name = et2.xpath(
                '//*[@id="company_web_top"]/div[2]/div[3]/div[1]/h1/text()')
            company_name = ''.join(name)
        detail_res['company_name'] = company_name

        # 电话 更多联系方式
        # print(et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()'), 'OK '*80)
        origin_phone = et2.xpath(
            '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()'
        )

        # 邮箱 更多邮箱
        # print(et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()'), 'EMAIL '*80)
        origin_email = et2.xpath(
            '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()'
        )

        if origin_phone and origin_email:
            year_list = [i.get('showSource') for i in eval(origin_phone[0])]
            phone_item_vals = [
                i.get('phoneNumber') for i in eval(origin_phone[0])
            ]
            email_list = eval(origin_email[0])
            contact_item = {}
            for contact in zip(year_list, phone_item_vals, email_list):
                contact_item['c_id'] = obj_id
                contact_item['company_name'] = detail_res.get(
                    'company_name', '')
                contact_item['report_year'] = contact[0]
                contact_item['phone'] = contact[1]
                contact_item['email'] = contact[-1]
                contact_item['date_time'] = datetime.now()
                bixao_phone_emial.find_one_and_update({'c_id': obj_id},
                                                      {'$set': contact_item},
                                                      upsert=True)

        # detail: 电话 邮箱 公司官网 地址 简介
        detail_div = soup.find_all('div', class_="detail")

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url,
                                                 headers=self.headers,
                                                 data={},
                                                 resptype='text')
            return sub_is_ok, sub_search_resp

        # 添加手动验证功能
        if not detail_div:
            while 1:
                if is_ok and detail_div:
                    break
                else:
                    LOG.critical('验证############### %s ###############' %
                                 comp_url)
                    random_sleep(20, 25)
                    self.headers['Cookie'] = cookies_get()
                    is_ok, search_resp = while_req(comp_url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    detail_div = soup.find_all('div', class_="detail")

        for div in detail_div[0].find_all('div'):
            if not div:
                continue

            # f0 电话 && 邮箱
            if div.get('class') == ['f0']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['phone'] = child.get_text().strip(
                                ) or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['email'] = child.get_text().strip(
                                ) or '-'
                                break
                    else:
                        break
            # 公司官网 && 地址
            elif div.get('class') == ['f0', 'clearfix']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['company_url'] = child.get_text(
                                ).strip() or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                for small_index, small_child in enumerate(
                                        child.children):
                                    if small_index == 0:
                                        detail_res[
                                            'address'] = small_child.get_text(
                                            ).strip() or '-'
                                        break
                                break
                    else:
                        break
            # 简介
            elif div.get('class') == ['summary']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        resume = big_child.string
                        if resume:
                            resume = resume.strip()
                        detail_res['resume'] = resume or '-'
                        break
                    else:
                        break
            else:
                continue

        # detail-list:
        detail_list_div = soup.find_all('div', class_="detail-list")
        if not detail_list_div:
            return detail_res

        detail_res['c_id'] = obj_id
        etc = etree.HTML(search_resp)
        for div in detail_list_div[0].find_all('div'):
            if not div:
                continue

            if div.get('tyc-event-ch'
                       ) == 'CompangyDetail.gongshangxinxin':  # 工商信息
                for index_1, child_1 in enumerate(
                        div.find_all('div', recursive=False)):
                    if index_1 == 1:
                        for index_1_1, child_1_1 in enumerate(child_1):
                            if index_1_1 == 2:
                                for index_tr, tr in enumerate(
                                        child_1_1.find_all('tr')):
                                    if index_tr == 0:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 注册资本
                                                detail_res[
                                                    'register_funds'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 实缴资金
                                                detail_res[
                                                    'paidin_funds'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 1:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 成立日期
                                                detail_res[
                                                    'establish_date'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 经营状态
                                                detail_res[
                                                    'status'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 2:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 统一社会信用代码
                                                detail_res[
                                                    'credit_code'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 工商注册号
                                                detail_res[
                                                    'registration_number'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 3:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 纳税人识别号
                                                detail_res[
                                                    'identification_number'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 组织机构代码
                                                detail_res[
                                                    'organization_code'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 4:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 公司类型
                                                detail_res[
                                                    'company_type'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 行业
                                                detail_res[
                                                    'industry'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 6:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 营业期限
                                                detail_res[
                                                    'business_term'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 纳税人资质
                                                detail_res[
                                                    'taxpayer_qualification'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 7:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 人员规模
                                                detail_res[
                                                    'personnel_size'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # 参保人数
                                                detail_res[
                                                    'insured_num'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 9:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 注册地址
                                                detail_res[
                                                    'registered_address'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 10:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # 经营范围
                                                detail_res[
                                                    'business_scope'] = td.get_text(
                                                    ).strip() or '-'

                        break
                continue

            elif div.get(
                    'tyc-event-ch') == 'CompangyDetail.zhuyaorenyuan':  # 主要人员
                people_item = {}
                people_item['c_id'] = obj_id
                people_item['company_name'] = detail_res.get(
                    'company_name', '')
                # 姓名
                people_item['name'] = etc.xpath(
                    '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()'
                )[0]
                # 职位
                people_item['position'] = etc.xpath(
                    '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[3]/span/text()'
                )[0]
                bixiao_people.find_one_and_update({'c_id': obj_id},
                                                  {'$set': people_item},
                                                  upsert=True)
                print(people_item)
                for people_vals in people_item:
                    if not people_item[people_vals]:
                        LOG.info(f'主要人员数据匹配异常:{people_item}, 请求地址:{comp_url}')

            elif div.get(
                    'tyc-event-ch') == 'CompangyDetail.gudongxinxi':  # 股东信息
                capital_item = {}
                capital_item['c_id'] = obj_id
                capital_item['company_name'] = detail_res.get(
                    'company_name', '')
                # 股东名称
                title = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()'
                )
                # 标签
                label = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/div/span/text()'
                )
                # 持股比例
                has_rates = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[3]/div/div/span/text()'
                )
                # 认缴出资额
                subscribed_capital = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[4]/div/span/text()'
                )

                capital_item['title'] = ''.join(title)
                capital_item['label'] = ''.join(label)
                capital_item['has_rates'] = ''.join(has_rates)
                capital_item['subscribed_capital'] = ''.join(
                    subscribed_capital)
                bixiao_shareholder.find_one_and_update({'c_id': obj_id},
                                                       {'$set': capital_item},
                                                       upsert=True)
                print(capital_item, 'C' * 80)

            elif div.get(
                    'tyc-event-ch') == 'CompangyDetail.findNewsCount':  # 新闻舆情
                news_item = {}
                news_item['c_id'] = obj_id
                news_item['company_name'] = detail_res.get('company_name', '')
                # 标题
                news_item['title'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/text()'
                )[0]
                # 内容地址
                news_item['info_url'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/@href'
                )[0]
                # 来源
                news_item['source'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[1]/text()'
                )[0]
                # 发布时间
                news_item['date_doc'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[2]/text()'
                )[0]
                print(news_item)
                bixiao_news.update({'c_id': obj_id}, {'$set': news_item},
                                   upsert=True)
                for news_vals in news_item:
                    if not news_item[news_vals]:
                        LOG.info(f'新闻舆情数据匹配异常:{news_item}, 请求地址:{comp_url}')

            elif div.get('tyc-event-ch') == 'CompangyDetail.chanpin':  # 产品信息
                product_item = {}
                product_item['c_id'] = obj_id
                product_item['company_name'] = detail_res.get(
                    'company_name', '')
                # 产品名称
                product_item['name'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[2]/table'
                    '/tbody/tr/td[2]/span/text()')[0]
                # 产品简称
                product_item['short_name'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[3]'
                    '/span/text()')[0]
                # 产品分类
                product_item['type'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[4]/span'
                    '/text()')[0]
                # 领域
                product_item['domain'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[5]'
                    '/span/text()')[0]
                print(product_item)
                bixiao_product.find_one_and_update({'c_id': obj_id},
                                                   {'$set': product_item},
                                                   upsert=True)
                for product_vals in product_item:
                    if not product_item[product_vals]:
                        LOG.info(f'产品信息数据匹配异常:{product_item}, 请求地址:{comp_url}')

            elif div.get('tyc-event-ch') == 'CompangyDetail.zhaopin':  # 招聘信息
                recruit_item = {}
                recruit_item['c_id'] = obj_id
                recruit_item['company_name'] = detail_res.get(
                    'company_name', '')
                recruit_item['opd_date'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[2]'
                    '/text()')[0]
                recruit_item['position_'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[3]'
                    '/text()')[0]
                recruit_item['month_salary'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[4]'
                    '/text()')[0]
                recruit_item['education'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[5]'
                    '/text()')[0]
                recruit_item['work_experience'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[6]'
                    '/text()')[0]
                recruit_item['address'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[7]'
                    '/text()')[0]
                print(recruit_item, 'P' * 80)
                bixiao_recruit.find_one_and_update({'c_id': obj_id},
                                                   {'$set': recruit_item},
                                                   upsert=True)
                for recruit_vals in recruit_item:
                    if not recruit_item[recruit_vals]:
                        LOG.info(f'招聘信息数据匹配异常:{recruit_item}, 请求地址:{comp_url}')

            elif div.get('tyc-event-ch'
                         ) == 'CompangyDetail.lishiwangzhanbeian':  # ICP备案
                record_item = {}
                record_item['c_id'] = obj_id
                record_item['company_name'] = detail_res.get(
                    'company_name', '')
                record_item['opd_date'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[2]'
                    '/span/text()')[0]
                record_item['web_name'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[3]'
                    '/span/text()')[0]
                record_item['index_url'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[4]/div/'
                    'a/@href')[0]
                record_item['domain_name'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[5]'
                    '/text()')[0]
                record_item['website_filing'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[6]/'
                    'span/text()')[0]
                print(record_item, 'M' * 80)
                bixiao_record_icp.find_one_and_update({'c_id': obj_id},
                                                      {'$set': record_item},
                                                      upsert=True)
                for record_vals in record_item:
                    if not record_item[record_vals]:
                        LOG.info(f'ICP备案数据匹配异常:{record_item}, 请求地址:{comp_url}')

        print(detail_res, '%' * 80)
        bixiao_business.find_one_and_update({'c_id': obj_id},
                                            {'$set': detail_res},
                                            upsert=True)
        return detail_res
Exemple #13
0
    def add_or_edit_empl(self, args):
        """
        Add a new employee or edit an existing one from submitted form data.

        Each key/value pair of ``args`` is validated in turn: the key must be
        listed in ``self.request_add_attrs``; keys listed in
        ``self.request_not_need_attrs`` are stored as ``str(value)`` without
        an emptiness check; every other key must carry a truthy value.

        :param args: form parameters; must contain ``is_add`` (``'1'`` or
            ``1`` selects add mode, any other value selects edit mode)
        :return: ``Status(...).json()`` with code 202 (illegal parameter),
            203 (required value is empty), 204 (add requested but the card id
            already exists), 100 (added) or 110 (edited)
        """
        new_args = dict()
        for k, v in args.items():
            if isinstance(k, unicode):
                k = k.encode('utf-8')
            if v and isinstance(v, unicode):
                v = v.encode('utf-8')
            if k not in self.request_add_attrs:
                return Status(202, 'failure', u'%s参数不合法' % k, {}).json()

            # Optional attributes are accepted even when empty.
            if k in self.request_not_need_attrs:
                new_args[k] = str(v)
                continue

            if k and not v:
                attr_name = self.employee_attrs_dict.get(k)
                return Status(203, 'failure', u'%s内容需要进行填写' % attr_name,
                              {}).json()

            new_args[k] = v

        card_id = args.get('card_id')
        china_name = args.get('china_name')
        if isinstance(card_id, unicode):
            card_id = card_id.encode('utf-8')
        exist_empl_mode = self.employee_bo.get_empl_by_card_id(card_id)

        # Decide add-vs-edit exactly once. The mode used to be tested as
        # `in ['1', 1]` in some places and `== '1'` in others, so an integer
        # 1 passed the duplicate check as an "add" but was then handled as an
        # "edit" of a record that does not exist.
        is_new = new_args['is_add'] in ('1', 1)

        if is_new and exist_empl_mode:
            return Status(204, 'failure', u'%s用户已存在,无需重新建立信息档案' % china_name,
                          {}).json()

        # NOTE(review): in edit mode this is whatever the lookup returned; if
        # no record matches card_id the assignments below presumably fail --
        # confirm what get_empl_by_card_id returns for a miss.
        empl_mode = self.employee_bo.new_mode() if is_new else exist_empl_mode

        # Attributes copied verbatim from the validated form onto the model.
        direct_attrs = (
            'english_name', 'email', 'phone', 'entry_date', 'sex', 'nation',
            'birth_date', 'political_status', 'nationality',
            'residence_type', 'education', 'marriage', 'card_type',
            'card_id', 'card_deadline', 'card_place', 'current_address',
            'bank_type', 'bank_country', 'bank_city', 'bank_id', 'bank_name',
        )

        # submit
        for attr in self.request_add_attrs:
            if attr == 'china_name':
                # The only attribute that tolerates being absent.
                empl_mode.china_name = new_args.get(attr)
            elif attr in direct_attrs:
                setattr(empl_mode, attr, new_args[attr])
            elif attr == 'status':
                # An empty status falls back to '1'.
                empl_mode.status = new_args[attr] or '1'

        # record who created / last touched the entry, then persist it
        if is_new:
            empl_mode.entry_submit_rtx = get_user_id()
            empl_mode.entry_submit_time = get_now()
            self.employee_bo.add_model(empl_mode)
            LOG.info("%s add employee is success" % card_id)
            return Status(100, 'success', u'新增%s成功' % china_name, {}).json()

        empl_mode.last_update_rtx = get_user_id()
        empl_mode.last_update_time = get_now()
        self.employee_bo.merge_model(empl_mode)
        LOG.info("%s edit employee is success" % card_id)
        return Status(110, 'success', u'%s信息编辑成功' % china_name, {}).json()
Exemple #14
0
    def get_all(self, args):
        """
        Return one page of employees as a ``Status(...).json()`` payload.

        :param args: query parameters; every key must be listed in
            ``self.request_attrs``. ``start`` is converted to ``int`` and
            also seeds the running ``id`` of the rows; ``search`` is wrapped
            in ``%`` wildcards; all other keys are passed through unchanged.
        :return: code 202 for an illegal parameter, 101 with an empty list
            when nothing matched, otherwise 100 with ``totalCount`` and
            ``datalist``
        """
        offset = 0
        query = dict()
        for name, value in args.items():
            if name not in self.request_attrs:
                return Status(202, 'failure', u'%s参数不合法' % name,
                              dict()).json()
            if name == 'start':
                offset = int(value)
                query[name] = offset
                continue
            if name == 'search':
                if isinstance(value, unicode):
                    value = value.encode('utf-8')
                value = "%" + str(value) + "%"
            query[name] = value

        # status: employment state, 1 = employed, 2 = resigned
        employees, total = self.employee_bo.get_all(query, status=1)
        LOG.info('employee>api_list: %s' % total)

        payload = dict()
        if not employees:
            payload['totalCount'] = 0
            payload['datalist'] = []
            return Status(101, 'failure', u'成功,但数据为空', payload).json()

        # How each displayed attribute is rendered.
        plain_attrs = ('china_name', 'english_name', 'email', 'phone',
                       'card_id')
        enum_attrs = ('nationality', 'sex')
        date_attrs = ('birth_date', 'entry_date')

        rows = list()
        for empl in employees:
            if not empl:
                continue

            # Row ids continue from the requested start offset.
            offset += 1
            row = dict()
            for attr in self.employee_show_attrs:
                if attr == 'id':
                    row[attr] = offset
                elif attr in plain_attrs:
                    row[attr] = getattr(empl, attr)
                elif attr in enum_attrs:
                    # Stored enum ids are translated to their display names;
                    # empty values are returned untouched.
                    raw = getattr(empl, attr)
                    if raw:
                        raw = self.enums_bo.get_enumname_by_params(
                            {'enum_type': attr, 'enum_subid': raw})
                    row[attr] = raw
                elif attr in date_attrs:
                    moment = getattr(empl, attr)
                    row[attr] = d2s(moment, fmt="%Y-%m-%d") if moment else ''
            rows.append(row)

        payload['totalCount'] = total
        payload['datalist'] = rows
        return Status(100, 'success', u'成功', payload).json()
Exemple #15
0
def login_out():
    """Clear the session and redirect to the ``manage.index`` endpoint.

    The user id is read before the session is cleared so the logout can be
    logged when a user id is present.
    """
    current_user_id = get_user_id()
    if current_user_id:
        LOG.info('%s login out ==========' % current_user_id)

    session.clear()
    landing_page = url_for('manage.index')
    return redirect(landing_page)
Exemple #16
0
    def work_by_key(self,
                    key,
                    min_page,
                    max_page,
                    type='default',
                    queue=None,
                    cid=None,
                    sub_cid=None,
                    city_info=None,
                    sub_city_info=None):
        """
        Crawl search-result pages for ``key`` and collect company details.

        Pages ``min_page`` .. ``max_page`` (inclusive) are requested one by
        one. Every company link found on a page is resolved through
        ``detail_by_url`` / ``detail_pro_by_url`` (chosen by ``API_MODE``)
        and merged into one result dict per company.

        :param key: search keyword; a falsy key logs an error and returns []
        :param min_page: first page number to request
        :param max_page: last page number to request (inclusive)
        :param type: 'default', 'city' or 'sub_city'; selects the URL shape
        :param queue: optional queue; each result dict is also ``put`` on it
        :param cid: city id, compared against ``ZXS_CITY_IDS`` for 'sub_city'
        :param sub_cid: accepted for interface compatibility, not used here
        :param city_info: dict providing 'name' / 'full_name' of the city
        :param sub_city_info: dict providing 'name' / 'code' / 'full_name'
        :return: list of result dicts
        """
        ret_res = list()
        if not key:
            LOG.error("【%s】key is null, no work." % RUN_MODE)
            return ret_res

        def _search_url(page):
            """Build the search URL for ``page``; None if mode/type is unsupported."""
            if API_MODE == 'tyc':
                search_api = TYC_SEARCH_API
            elif API_MODE == 'pro':
                search_api = TYC_PRO_SEARCH_API
            else:
                return None

            base_url = '%s/p%s?key=%s' % (search_api, page, parse.quote(key))
            if type == 'default':
                return base_url
            if type == 'city':
                return '%s&base=%s' % (base_url, city_info.get('name'))
            if type == 'sub_city':
                if cid not in ZXS_CITY_IDS:
                    return '%s&base=%s' % (base_url,
                                           sub_city_info.get('name'))
                if API_MODE == 'tyc':
                    return '%s&base=%s&areaCode=%s' % (
                        base_url, sub_city_info.get('name'),
                        sub_city_info.get('code'))
                return '%s&base=%s&areaCode=%s&baseArea=%s' % (
                    base_url, city_info.get('name'),
                    sub_city_info.get('code'),
                    parse.quote(sub_city_info.get('name')))
            return None

        def _fetch(url):
            """Request ``url`` with the client's headers; returns (is_ok, text)."""
            return api_get(url=url,
                           headers=self.headers,
                           data={},
                           resptype='text')

        def _company_links(html):
            """Return the company anchor tags of a search-result page."""
            soup = BeautifulSoup(html, 'lxml')
            return soup.find_all(
                'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

        # page
        for page in range(min_page, max_page + 1, 1):
            url = _search_url(page)
            if url is None:
                LOG.critical('====== API_MODE is not in [tyc, pro] ======')
                sys.exit(1)
            LOG.info('%s[%s]%s' % (key, API_MODE, url))

            self.headers['Referer'] = url
            is_ok, search_resp = _fetch(url)
            if not is_ok:
                continue
            if self.check_no(url, _type='page'):
                continue

            tags = _company_links(search_resp)

            # Manual verification support: a page without company links is
            # retried after a pause until links show up (presumably while an
            # operator clears a verification challenge -- confirm). A failed
            # retry is not parsed; it simply triggers another attempt.
            while not tags:
                LOG.critical('验证############### %s ###############' %
                             url)
                random_sleep(20, 25)
                is_ok, search_resp = _fetch(url)
                tags = _company_links(search_resp) if is_ok else []

            for tag in tags:
                if not tag or not tag.attrs.get('href'):
                    continue

                res_dict = dict()
                if API_MODE == 'tyc':
                    tyc_url = tag.get('href').strip()
                elif API_MODE == 'pro':
                    tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API,
                                                   tag.get('href').strip())
                else:
                    tyc_url = ''
                res_dict['tyc_url'] = tyc_url
                res_dict['name'] = tag.get_text().strip()
                res_dict['key'] = key
                res_dict['is_send_email'] = False
                res_dict['city'] = city_info.get(
                    'full_name') if city_info else '-'
                res_dict['sub_city'] = sub_city_info.get(
                    'full_name') if sub_city_info else '-'

                detail_res = list()
                if API_MODE == 'tyc':
                    detail_res = self.detail_by_url(res_dict.get('tyc_url'))
                elif API_MODE == 'pro':
                    detail_res = self.detail_pro_by_url(
                        res_dict.get('tyc_url'))
                res_dict.update(detail_res)
                print('%s[%s] %s' %
                      (res_dict['name'], str(bool(res_dict)),
                       res_dict['tyc_url']))
                ret_res.append(res_dict)
                if queue:
                    queue.put(res_dict)
                random_sleep(3.5, 4.5)
                # Test runs stop after the first company of the first page.
                if IS_TEST_BREAK:
                    break
            if IS_TEST_BREAK:
                break
        return ret_res
Exemple #17
0
 def init_run(self):
     """Register the blueprints via ``_autoinit_register_blueprint``.

     A debug line is logged before the registration and an info line after.
     """
     LOG.debug('Server is initializing......')

     # Registration happens between the two log lines.
     self._autoinit_register_blueprint()

     LOG.info('Web server is running......')
Exemple #18
0
    def __init__(self, app):
        """
        Initialize the WebFlaskServer instance and configure the Flask app.

        Sets the template/static folders relative to this file, the secret
        key, the static URL rule, and registers the request hooks, error
        handlers, template context processor and favicon route.

        :param app: the Flask application to configure; when falsy the
            failure is logged and the process exits with status 1
        """
        self.app = app
        if not self.app:
            LOG.info('Web server initialize is failure......')
            sys.exit(1)

        _realpath = os.path.dirname(os.path.realpath(__file__))
        self.app.template_folder = _realpath + '/templates/'
        self.app.secret_key = SECRET_KEY or 'python'
        self.app.static_folder = _realpath + '/static'
        self.app.static_url_path = '/static'
        self.app.add_url_rule(self.app.static_url_path + '/<path:filename>',
                              endpoint='static',
                              view_func=self.app.send_static_file)

        super(WebFlaskServer, self).__init__()

        @self.app.before_request
        def before_request():
            """Redirect to the login page unless the request is exempt."""
            if get_user_id():
                return
            # api: rest apis
            # manage: login apis
            if request.blueprint in ['api', 'manage', None]:
                return
            # special api for blueprints
            if request.endpoint.endswith(('ForApi', 'for_api')):
                return

            return redirect(url_for('manage.index'))

        @self.app.before_first_request
        def before_first_request():
            """Store a session obtained from ``get_session`` on ``g``."""
            g._session = get_session()

        @self.app.errorhandler(404)
        def not_found_error(error):
            """Log the missing URL and render the 404 page."""
            LOG.error("%s is not found 404" % request.url)
            return render_template('errors/404.html'), 404

        @self.app.errorhandler(500)
        def server_error(error):
            """Log the failing URL and render the 500 page."""
            LOG.error("%s is server error 500" % request.url)
            return render_template('errors/500.html'), 500

        @self.app.context_processor
        def default_context_processor():
            """Expose current_user, sysversion and menu to every template."""
            user_id = session.get('user_id')
            # Two separate dicts: a chained `menu = current_user = dict()`
            # would alias them, leaking the menu keys into current_user
            # whenever no user is logged in.
            menu = dict()
            current_user = dict()
            if hasattr(g, 'menuf'):
                menu['f'] = g.menuf or 'index'
            if hasattr(g, 'menusub'):
                menu['sub'] = g.menusub or 'index'
            if user_id:
                current_user = SysUserService().get_user_by_params(user_id)

            return {
                'current_user': current_user,
                'sysversion': VERSION,
                'menu': menu
            }

        # set favicon
        @self.app.route('/favicon.ico')
        def get_defaule_favicon():
            """Serve the favicon from the static folder."""
            return self.app.send_static_file('images/favicon.ico')
    def work_by_key(self, key, min_page=0, max_page=5, queue=None):
        """
        Crawl search-result pages for ``key`` and collect company details.

        Every company link found on a page is resolved through
        ``detail_by_url`` / ``detail_pro_by_url`` (chosen by ``API_MODE``)
        and merged into one result dict per company.

        :param key: search keyword; a falsy key logs an error and returns []
        :param min_page: first page; a falsy value uses ``self.MIN_PAGE``
        :param max_page: page bound; a falsy value uses ``self.MAX_PAGE``
        :param queue: optional queue; each result dict is also ``put`` on it
        :return: list of result dicts
        """
        ret_res = list()
        if not key:
            LOG.error("【%s】key is null, no work." % RUN_MODE)
            return ret_res

        if not min_page:
            min_page = self.MIN_PAGE
        if not max_page:
            max_page = self.MAX_PAGE

        LOG.info('%s[%s ~ %s]' % (key, min_page, max_page))

        def _fetch(url):
            """Request ``url`` with the client's headers; returns (is_ok, text)."""
            return api_get(url=url,
                           headers=self.headers,
                           data={},
                           resptype='text')

        def _company_links(html):
            """Return the company anchor tags of a search-result page."""
            soup = BeautifulSoup(html, 'lxml')
            return soup.find_all(
                'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

        # page
        # NOTE(review): the range excludes max_page itself -- confirm that
        # callers expect an exclusive upper bound.
        for page in range(min_page, max_page, 1):
            if API_MODE == 'tyc':
                search_api = TYC_SEARCH_API
            elif API_MODE == 'pro':
                search_api = TYC_PRO_SEARCH_API
            else:
                LOG.critical('====== API_MODE is not in [tyc, pro] ======')
                sys.exit(1)
            url = '%s/p%s?key=%s' % (search_api, page, parse.quote(key))
            LOG.info('%s[%s]%s' % (key, API_MODE, url))

            is_ok, search_resp = _fetch(url)
            if not is_ok:
                continue

            tags = _company_links(search_resp)

            # Manual verification support: a page without company links is
            # retried after a pause until links show up (presumably while an
            # operator clears a verification challenge -- confirm). A failed
            # retry is not parsed; it simply triggers another attempt.
            while not tags:
                LOG.critical('验证############### %s ###############' %
                             url)
                random_sleep(20, 25)
                is_ok, search_resp = _fetch(url)
                tags = _company_links(search_resp) if is_ok else []

            for tag in tags:
                if not tag or not tag.attrs.get('href'):
                    continue

                res_dict = dict()
                if API_MODE == 'tyc':
                    tyc_url = tag.get('href').strip()
                elif API_MODE == 'pro':
                    tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API,
                                                   tag.get('href').strip())
                else:
                    tyc_url = ''
                res_dict['tyc_url'] = tyc_url
                res_dict['name'] = tag.get_text().strip()
                res_dict['key'] = key

                detail_res = list()
                if API_MODE == 'tyc':
                    detail_res = self.detail_by_url(res_dict.get('tyc_url'))
                elif API_MODE == 'pro':
                    detail_res = self.detail_pro_by_url(
                        res_dict.get('tyc_url'))
                res_dict.update(detail_res)
                print('%s[%s] %s' %
                      (res_dict['name'], str(bool(res_dict)),
                       res_dict['tyc_url']))
                ret_res.append(res_dict)
                if queue:
                    queue.put(res_dict)
                random_sleep(3.2, 4.5)
                # Test runs stop after the first company of the first page.
                if IS_TEST_BREAK:
                    break
            if IS_TEST_BREAK:
                break
        return ret_res
Exemple #20
0
def start():
    """Run the spider once, logging a line before and after the run.

    Both log lines include ``NAME`` and the ``IS_TEST_BREAK`` flag.
    """
    LOG.info('%s run start [IS TEST RUN: %s]......' % (NAME, IS_TEST_BREAK))

    spider = SpiderTYCClass()
    spider.init_run()

    LOG.info('%s run end [IS TEST RUN: %s]......' % (NAME, IS_TEST_BREAK))
Exemple #21
0
 def _print_info(self, message):
     """Log ``message`` at info level, framed by 20 '=' on each side."""
     rule = '=' * 20
     LOG.info(rule + message + rule)