Code Example #1
File: resume_fenjianli.py Project: logonmy/Spider-1
def test():
    task = {
        "area_name": u"天津",
        "job_code": "918",
        "area_code": "120000",
        "model_name": "zhilian",
        "job_name": u"其他"
    }
    cookie = {
        'code': 0,
        'cookie': {
            'SERVERID':
            '70356234b78238645df699ef52f30d81|1522806241|1522806241',
            'JSESSIONID': 'A3CF43A5891596C98EDE1C2D5E5BE76A'
        }
    }
    cookie_str = 'SERVERID=' + cookie.get('cookie').get(
        'SERVERID') + ';JSESSIONID=' + cookie.get('cookie').get('JSESSIONID')
    # awake_one_task(task)
    print get_list(cookie=cookie_str,
                   page_numner=1,
                   task=task,
                   proxy=settings.get_proxy())
    # print get_resume('1015973541', cookie, 'zhilian', {'http': 'http://47.93.115.141:3128', 'https': 'http://47.93.115.141:3128'})
    account = {'username': '******', 'passwd': 'jinqian4611', 'id': 805}
    set_unavaliable_account(account)
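
The Cookie header assembled inline above is the same 'SERVERID=...;JSESSIONID=...' string the other snippets expect. As a minimal sketch, the concatenation could be wrapped in a small helper that takes the cookie payload shape shown in test(); build_cookie_header is a hypothetical name, not part of the project.

def build_cookie_header(cookie_payload):
    # Assumes the {'code': ..., 'cookie': {...}} payload used in test();
    # missing keys simply become empty strings.
    inner = cookie_payload.get('cookie') or {}
    return 'SERVERID=%s;JSESSIONID=%s' % (inner.get('SERVERID', ''),
                                          inner.get('JSESSIONID', ''))

# cookie_str = build_cookie_header(cookie)
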
Code Example #2
def login(username, password):
    # 3
    login_url = 'http://www.jianlika.com/Index/login.html'
    post_data = {'username': username, 'password': password, 'remember': 'no'}
    login_header = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie': 'think_language=zh-CN; user_auth_sign=8rc593c82ogfc55938595vsc14; rememberUsername=18629947965; gift_hide_timeout=1',
        'Host': 'www.jianlika.com',
        'Origin': 'http://www.jianlika.com',
        'Referer': 'http://www.jianlika.com/',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    response = requests.post(url=login_url,
                             data=post_data,
                             headers=login_header,
                             proxies=settings.get_proxy())
    # After login, the response body is: '{"info":"","status":1,"url":"\\/Search"}'
    # print response.proxies
    print response.content
    print 'cookie:===》》》 %s ' % response.headers.get('Set-Cookie')
    pass
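
login() reads the session cookie back out of the Set-Cookie header by hand. A requests.Session would keep the cookie jar and resend it on later calls automatically; the sketch below is an alternative under that assumption, reusing the same endpoint and form fields as login(), not the project's actual approach.

import requests

def login_with_session(username, password, proxies=None):
    # Same endpoint and form fields as login() above.
    session = requests.Session()
    response = session.post('http://www.jianlika.com/Index/login.html',
                            data={'username': username,
                                  'password': password,
                                  'remember': 'no'},
                            proxies=proxies)
    # session.cookies now holds whatever the server set, so later
    # session.get()/session.post() calls send it without manual parsing.
    return session, response
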
Code Example #3
def get_list():
    # task = {"area_name": u"天津", "job_code": "918", "area_code": "120000", "model_name": "zhilian", "job_name": u"其他"}
    task = {
        "area_name": u"北京",
        "job_code": "228",
        "area_code": "110000",
        "model_name": "zhilian",
        "job_name": u"销售主管"
    }
    print resume_fenjianli.get_list(cookie=cookie_str,
                                    page_numner=1,
                                    params=task,
                                    proxy=settings.get_proxy())
Code Example #4
def main():
    logger = utils.get_logger()
    redis_client = redis.Redis(host=common_settings.REDIS_IP,
                               port=common_settings.REDIS_PORT,
                               db=1)
    delete_before_account_inredis(redis_client)
    cookie_result = get_chinahr_cookie_all()
    proxy = settings.get_proxy()
    start_count = 0
    if cookie_result['code']:
        logger.info('no cookies were returned.')
        return
    for cookie in cookie_result['cookie']:
        if start_count:
            start_count -= 1
            continue
        username = cookie.get('userName', '')
        password = cookie.get('password', '')
        if not cookie.get('cookie', ''):
            logger.info('no cookie content for account:' + str(cookie))
            url = 'http://172.16.25.41:8002/acc/invalidCookie.json?userName=%s&password=%s' % (
                username, password)
            requests.get(url)
            continue
        #continue
        logger.info('start to deal with :' + username)
        try:
            cookie_dict = json.loads(cookie.get('cookie', '{}'))
        except Exception, e:
            logger.info('error when json cookie:' + username + ' ' + password)
            continue
        uid_list = uid_re.findall(cookie_dict.get('bps', ''))
        device_id = json.loads(cookie.get('extraContent',
                                          '{}')).get('device_id', '')
        if not uid_list:
            logger.info('not get uid in cookie:' + str(cookie))
            url = 'http://172.16.25.41:8002/acc/invalidCookie.json?userName=%s&password=%s' % (
                username, password)
            requests.get(url)
            #break
            continue
        uid = uid_list[0]
        vote_result = vote(uid, proxy, device_id, cookie_dict)
        if not vote_result:
            logger.info('vote failed:' + str(cookie))
        redis_client.set(
            time.strftime("%Y-%m-%d") + '_' + username + '_chinahr_0', 1000)
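
main() reports an unusable cookie to the same internal endpoint from two places with the same string formatting. A small wrapper keeps that URL template in one spot; report_invalid_cookie is a hypothetical helper sketched from the calls above, not an existing function.

import requests

def report_invalid_cookie(username, password):
    # Same internal endpoint main() calls when a cookie is unusable.
    url = ('http://172.16.25.41:8002/acc/invalidCookie.json'
           '?userName=%s&password=%s' % (username, password))
    try:
        requests.get(url, timeout=10)
    except requests.RequestException:
        # Best effort only; main() does not check the response either.
        pass
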
Code Example #5
def buy_thread():
    logger = utils.get_logger()
    logger.info(
        '====================================================\nstart buy thread!!!'
    )
    global emoji_pattern
    kafka_producer = None
    mns_client = None
    redis_client = None

    mysql_pool = PersistentDB(MySQLdb,
                              host=common_settings.MYSQL_HOST,
                              user=common_settings.MYSQL_USER,
                              passwd=common_settings.MYSQL_PASSWD,
                              db=common_settings.MYSQL_DOWNLOAD_DB,
                              port=common_settings.MYSQL_PORT,
                              charset='utf8')
    mysql_conn = mysql_pool.connection()
    mysql_cursor = mysql_conn.cursor()

    mysql_cursor.execute(
        'update download_record set valid=0 where valid=1 and source=6')
    mysql_conn.commit()
    # proxy = {'http': 'http://*****:*****@proxy.abuyun.com:9020', 'https': 'http://*****:*****@proxy.abuyun.com:9020'}
    proxy = settings.get_proxy()

    task_list = []

    while True:
        try:
            while not task_list:
                task_number = mysql_cursor.execute(
                    'select * from download_record where valid=0 and source=6 order by updateTime desc limit 30'
                )
                if not task_number:
                    logger.info(
                        'there is no available task in mysql, sleep!!!')
                    time.sleep(600)
                    mysql_conn = mysql_pool.connection()
                    mysql_cursor = mysql_conn.cursor()
                    continue
                task_list = list(mysql_cursor.fetchall())
                break
            task = task_list.pop()
            if task[8]:
                try:
                    extend_content = json.loads(task[8])
                    extend_content['emailJobType'] = extend_content.get(
                        'emailJobType', '')
                    extend_content['emailCity'] = extend_content.get(
                        'emailCity', '')
                except Exception, e:
                    logger.info('not find extend_content in task:' + str(task))
                    extend_content = {"emailJobType": "", "emailCity": ""}
            else:
                extend_content = {"emailJobType": "", "emailCity": ""}

            get_null_text = 3
            for i in xrange(get_null_text):
                get_account_tag = True
                while get_account_tag:
                    account = get_account_from_redis()
                    if account['code']:
                        logger.info('get error account:' + str(account))
                        continue
                    cookie = account['cookie']
                    uid = account['uid']
                    device_id = account['device_id']
                    get_download_bean_result = util.get_download_beans(
                        cookie, uid, proxy, device_id)
                    if (not get_download_bean_result['code']
                            and not get_download_bean_result['coin_number']):
                        logger.info('got an account whose bean number is 0:' +
                                    account['username'] + ' result:' +
                                    str(get_download_bean_result))
                        release_account_from_redis(account['account_key'], 0)
                    else:
                        get_account_tag = False

                resume_result = util.buy_resume(cookie, uid, str(task[1]),
                                                settings.get_proxy(),
                                                device_id)
                if resume_result['code'] in [0, '0']:
                    resume_uuid = uuid.uuid1()
                    try:
                        content = json.dumps(resume_result, ensure_ascii=False)
                        content = emoji_pattern.sub(r'', content)
                        mysql_cursor.execute(
                            u'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity) values ("CH_HR", %s, "python", %s, now(), %s, %s, %s)',
                            (content, resume_uuid, account['username'],
                             extend_content['emailJobType'],
                             extend_content['emailCity']))
                        mysql_conn.commit()
                        mysql_cursor.execute('select last_insert_id()')
                        save_mysql_ids = mysql_cursor.fetchall()
                        if not save_mysql_ids or not save_mysql_ids[0]:
                            # `sql`/`sql_value` are not defined in this
                            # function; log the trackId to avoid a NameError.
                            logger.info('insert into mysql error!!!: trackId=' +
                                        str(resume_uuid))
                            raise Exception
                        save_mysql_id = save_mysql_ids[0][0]
                    except Exception, e:
                        logger.info('mysql error:' +
                                    str(traceback.format_exc()))
                        time.sleep(60)
                        continue

                    kafka_data = {
                        "channelType": "APP",
                        "content": {
                            "content": content,
                            "id": save_mysql_id,
                            "createBy": "python",
                            "createTime": int(time.time() * 1000),
                            "ip": '',
                            "resumeSubmitTime": '',
                            "resumeUpdateTime": '',
                            "source": "CH_HR",
                            "trackId": str(resume_uuid),
                            "avatarUrl": '',
                            "email": account['username'],
                            'emailJobType': extend_content['emailJobType'],
                            'emailCity': extend_content['emailCity'],
                        },
                        "interfaceType": "PARSE",
                        "resourceDataType": "RAW",
                        "resourceType": "RESUME_INBOX",
                        "externalInfo": "BUY",
                        "source": "CH_HR",
                        "trackId": str(resume_uuid),
                    }
                    if common_settings.SAVE_TYPE:
                        try:
                            if common_settings.SAVE_TYPE == 'kafka':
                                kafka_producer.produce(json.dumps(kafka_data))
                            elif common_settings.SAVE_TYPE == 'mns':
                                buf = StringIO()
                                f = gzip.GzipFile(mode='wb', fileobj=buf)
                                f.write(json.dumps(kafka_data))
                                f.close()
                                msg_body = base64.b64encode(buf.getvalue())
                                msg = Message(msg_body)
                                for send_message_count in range(
                                        common_settings.MNS_SAVE_RETRY_TIME):
                                    try:
                                        mns_client = get_mns_client()
                                        mns_client.send_message(msg)
                                        break
                                    except Exception, e:
                                        logger.info(
                                            'error when mns send message, time:'
                                            + str(send_message_count) + ':' +
                                            str(e))
                                else:
                                    raise Exception
                            else:
                                logger.info('did not support save type:' +
                                            common_settings.SAVE_TYPE)
                        except Exception, e:
                            logger.info('get error when produce data to ' +
                                        common_settings.SAVE_TYPE +
                                        ', exit!!!' +
                                        str(traceback.format_exc()))
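
The mns branch above gzips the JSON document, base64-encodes it, and wraps it in a Message before sending. A minimal sketch of just that packing step, assuming Python 2's StringIO as in the surrounding code; pack_mns_message is a hypothetical name.

import base64
import gzip
import json
from StringIO import StringIO  # Python 2, matching the snippet above

def pack_mns_message(payload):
    # gzip the JSON body in memory, then base64 it so it travels as text.
    buf = StringIO()
    gz = gzip.GzipFile(mode='wb', fileobj=buf)
    gz.write(json.dumps(payload))
    gz.close()
    return base64.b64encode(buf.getvalue())

# msg = Message(pack_mns_message(kafka_data))  # Message comes from the MNS SDK
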
Code Example #6
def awake_one_task(task):
    logger = utils.get_logger()
    global citys_dict, emoji_pattern
    has_find_count = 0
    not_find_count = 0
    redis_client = get_redis_client()
    kafka_client = None
    kafka_producer = None
    mns_client = None
    result = {'code': 0}
    user_now = {}
    logger.info('start to get data ' + str(task))
    mysql_error_time = 10
    list_page = int(task.get('page_now', 0))
    # Start processing the task
    while list_page > 0:
        try:
            # Fetch an account; if none can be obtained, retry until one is available
            while not user_now:
                user_now = get_one_account_from_api()
                if not user_now['code']:
                    break
                else:
                    logger.info('get account failed:' + str(user_now))
                    user_now = {}
        except Exception, e:
            logger.info(str(traceback.format_exc()))
            continue

        time.sleep(1)
        # Start downloading the list pages
        download_day_str = str(time.localtime().tm_year) + '-' + str(
            time.localtime().tm_mon) + '-' + str(time.localtime().tm_mday)
        download_day = datetime.datetime.today()
        while list_page >= 0:
            list_result = util.get_list(user_now, task, settings.get_proxy(),
                                        list_page)
            if list_result['code'] == 1:
                logger.info('get 800 code, to change accounts!!!')
                user_now = {}
                break
            elif list_result['code'] == 3:
                logger.info('get 100 code, to change accounts!!!')
                set_account_invalid(user_now['username'], user_now['passwd'])
                user_now = {}
                break
            elif list_result['code']:
                logger.info('get error list ,continue!!!' + str(list_result))
                user_now = {}
                break
            logger.info('fetched list page ' + str(list_page))
            resume_list = list_result.get('data', {}).get('cvList', [])

            if len(resume_list) == 0 and list_page == 1:
                logger.info('not get resume in city:' + task['zone'] +
                            ' keyword:' + task['keyword'])

            # Loop over the detail pages in this list page
            for resume in resume_list:
                resume_key = 'chinahr_resume_' + resume.get('cvid', '')
                # Check in redis whether it was already downloaded within RESUME_DELAY_DAYS days
                try:
                    resume_download_time = redis_client.get(resume_key)
                    if resume_download_time:
                        datetime_last_download = datetime.datetime.strptime(
                            resume_download_time, '%Y-%m-%d')
                        if (download_day - datetime_last_download
                            ).days <= common_settings.RESUME_DELAY_DAYS:
                            logger.info('has find %s in redis' %
                                        (resume_key, ) + ' and the city is:' +
                                        task['zone'])
                            has_find_count += 1
                            continue
                        else:
                            redis_client.set(resume_key, download_day_str)
                            logger.info('has find %s in redis, update ' %
                                        (resume_key, ) + ' and the city is:' +
                                        task['zone'])
                    else:
                        redis_client.set(resume_key, download_day_str)
                        logger.info('not find %s in redis' % (resume_key, ) +
                                    ' and the city is:' + task['zone'])
                except Exception as e:
                    logger.exception('get error when use redis.' + str(e))
                    # redis_client.set(resume_key, download_day_str)
                    raise e

                # Start downloading the resume detail page
                time.sleep(1)
                resume_result = util.get_resume(user_now,
                                                str(resume.get('cvid', '')),
                                                settings.get_proxy())
                # Save the detail page to mns and mysql
                if resume_result['code'] in [0, '0']:
                    resume_uuid = uuid.uuid1()
                    try:
                        content = json.dumps(resume_result, ensure_ascii=False)
                        content = emoji_pattern.sub(r'', content)
                        sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity) values ("CH_HR", %s, "python", %s, now(), %s, %s, %s)'
                        sql_value = (content, resume_uuid,
                                     user_now['username'], task['keyword'],
                                     task['zone'])

                        kafka_data = {
                            "channelType": "APP",
                            "content": {
                                "content": content,
                                "createBy": "python",
                                "createTime": int(time.time() * 1000),
                                "ip": '',
                                "resumeSubmitTime": '',
                                "resumeUpdateTime": resume.get('refDate', ''),
                                "source": "CH_HR",
                                "trackId": str(resume_uuid),
                                "avatarUrl": '',
                                "email": user_now['username'],
                                'emailJobType': task['keyword'],
                                'emailCity': task['zone'],
                            },
                            "interfaceType": "PARSE",
                            "resourceDataType": "RAW",
                            "resourceType": "RESUME_SEARCH",
                            "source": "CH_HR",
                            "trackId": str(resume_uuid),
                            "callSystemID": common_settings.PROJECT_NAME,
                            "traceID": str(resume_uuid),
                        }
                        utils.save_data(sql, sql_value, kafka_data)
                        logger.info('the cvid is:' + resume.get('cvid', '') +
                                    ' the length of data is:' +
                                    str(len(kafka_data['content']['content'])))
                    except Exception, e:
                        logger.info('mysql error ' + str(mysql_error_time) +
                                    ' time:' + str(traceback.format_exc()))
                        mysql_error_time -= 1
                        if not mysql_error_time:
                            # return
                            logger.info('there has no mysql_error_time')
                        continue
                elif resume_result['code'] == 6:
                    user_now = {}
                    # list_page = -1
                    break
                elif resume_result['code'] == 8:
                    set_account_invalid(user_now['username'], user_now['passwd'])
                    user_now = {}
                    # list_page = -1
                    break
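
The redis check above skips a resume when its last recorded download date is within RESUME_DELAY_DAYS. The same decision can be isolated for readability; should_download_resume is a hypothetical helper that mirrors the logic above, assuming the '%Y-%m-%d' value format written to redis.

import datetime

def should_download_resume(redis_client, resume_key, today, delay_days):
    # `today` is a datetime.datetime; redis stores the last download day
    # as a '%Y-%m-%d' string, exactly as the loop above writes it.
    last = redis_client.get(resume_key)
    if not last:
        return True
    last_day = datetime.datetime.strptime(last, '%Y-%m-%d')
    return (today - last_day).days > delay_days
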
Code Example #7
def download_thread():
    logger = utils.get_logger()
    logger.info('=' * 50 + '\nstart main!!!')
    global numbers_left
    global sleep_tag
    global get_task_queue

    redis_client = get_redis_client()

    mysql_pool = PersistentDB(MySQLdb,
                              host=common_settings.MYSQL_HOST,
                              user=common_settings.MYSQL_USER,
                              passwd=common_settings.MYSQL_PASSWD,
                              db=common_settings.MYSQL_DOWNLOAD_DB,
                              port=common_settings.MYSQL_PORT,
                              charset='utf8')
    mysql_conn = mysql_pool.connection()
    mysql_cursor = mysql_conn.cursor()
    # proxy = {'http': 'http://*****:*****@proxy.abuyun.com:9020', 'https': 'http://*****:*****@proxy.abuyun.com:9020'}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    task_list = []
    mysql_cursor.execute(
        'select count(*) from download_record where updateTime>date(now()) and valid=2 and source=24'
    )
    number_today = mysql_cursor.fetchall()[0][0]
    numbers_left = 0 if numbers_left < number_today else numbers_left - number_today

    # Start processing tasks
    while True:
        task = None
        # If resumes cannot be downloaded at this time of day, sleep (currently only used for zhaopingou, not here; sleep_tag is updated by the heartbeat thread)
        while not sleep_tag:
            logger.info('not the correct time to buy resume, wait.')
            time.sleep(3600)
        # If today's resume download count has reached the daily limit, sleep
        if not numbers_left:
            logger.info('the number of today not left, sleep')
            time.sleep(1800)
            continue
        logger.info('the number left today is:' + str(numbers_left))
        # Get a task from the in-memory queue; if none is available, sleep for a minute
        try:
            task = get_task_queue.get(timeout=10)
            if not task:
                logger.info('get None task, sleep!')
                time.sleep(600)
                continue
        except Exception, e:
            logger.info('not get task from queue, sleep.')
            time.sleep(60)
            continue

        # Parse the city and job information out of the task
        if task[8]:
            try:
                extend_content = json.loads(task[8])
                extend_content['emailJobType'] = extend_content.get(
                    'emailJobType', '')
                extend_content['emailCity'] = extend_content.get(
                    'emailCity', '')
            except Exception, e:
                logger.info('not find extend_content in task:' + str(task))
                extend_content = {"emailJobType": "", "emailCity": ""}
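
Code Examples #5, #7 and #10 all decode task[8] the same way: json.loads it and backfill empty emailJobType/emailCity defaults. A minimal sketch of that parsing as a single helper; parse_extend_content is a hypothetical name, not an existing function.

import json

def parse_extend_content(raw):
    # Decode the task's extra JSON column, falling back to empty defaults.
    defaults = {"emailJobType": "", "emailCity": ""}
    if not raw:
        return defaults
    try:
        extend_content = json.loads(raw)
    except (ValueError, TypeError):
        return defaults
    for key in defaults:
        extend_content.setdefault(key, '')
    return extend_content
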
Code Example #8
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start to awake one task')
    global random_ids
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()

    logger.info('deal with:' + str(task))

    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(
        time.localtime().tm_mday)
    datetime_now = datetime.datetime.now()

    # Start processing the task
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # Use a separate account for each list page
        account, cookie = get_one_account()
        # Start downloading the list page
        list_result = get_list(account['username'], page_now, task, proxy)
        if list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        # If the list page has no resumes, treat it as the last page
        if not list_result['data']:
            list_result['has_next_page'] = False
        logger.info('page number of now is ' + str(page_now))

        # Process the detail pages one by one
        for resume_one in list_result['data']:
            resume, resume_update_time = resume_one

            datetime_update = datetime.datetime.strptime(
                resume_update_time, '%Y.%m.%d')
            if (datetime_now - datetime_update).days > 7:
                logger.info('resume was updated more than 7 days ago, skip. ' +
                            resume_update_time)
                list_result['has_next_page'] = False
                continue

            # Check in redis whether it was already downloaded today
            has_find_in_redis = False
            resume_key = 'youzi_resume_' + str(resume)
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value == download_day:
                    has_find_in_redis = True
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)

            # Download the resume detail page; try three times and skip this resume if all three attempts fail
            for x in xrange(3):
                account, cookie = get_one_account()
                resume_result = get_resume(resume,
                                           account['username'],
                                           proxy=proxy)

                if resume_result['code']:
                    logger.info('get error resume:' + str(resume_result))
                    continue
                redis_client.set(resume_key, download_day)
                break
            else:
                continue

            # Save the resume detail page
            resume_uuid = uuid.uuid1()
            try:
                content = json.dumps(
                    {
                        'name': '',
                        'email': '',
                        'phone': '',
                        'html': resume_result['data']
                    },
                    ensure_ascii=False)
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
                sql_value = (common_settings.SOURCE, content, resume_uuid,
                             str(account['username']), task['jobTitle'],
                             task['locationname'], str(resume))

                resume_update_time = ''
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['jobTitle'],
                        'emailCity': task['locationname'],
                        'subject': str(resume)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' +
                            str(traceback.format_exc()))
            time.sleep(1)
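
The detail download above leans on Python's for/else: the loop tries up to three accounts, a successful attempt breaks out, and the else branch (which fires only when no break happened) skips the resume. A stripped-down sketch of that idiom, with fetch() standing in for get_resume():

def fetch_with_retry(fetch, attempts=3):
    result = None
    for _ in range(attempts):
        result = fetch()
        if result.get('code'):  # non-zero code means this attempt failed
            continue
        break                   # success: leave the loop early
    else:
        return None             # every attempt failed, caller should skip
    return result
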
Code Example #9
def get_detail():
    # list_result = {'total': 169, 'code': 0, 'data': [],
    #                'ids': [['1022347197', '2018-04-03'], ['1032771612', '2018-04-03'], ['1006918947', '2018-04-03'],
    #                        ['1045864491', '2018-04-03'], ['1047088655', '2018-04-03'], ['1025484341', '2018-04-03'],
    #                        ['1069702121', '2018-04-03'], ['1033655918', '2018-04-03'], ['1030018035', '2018-04-03'],
    #                        ['1003596638', '2018-04-03'], ['1033708572', '2018-04-02'], ['1026380481', '2018-04-02'],
    #                        ['1022050585', '2018-04-02'], ['1002563812', '2018-04-02'], ['1025279315', '2018-04-02'],
    #                        ['1035975978', '2018-04-02'], ['1031484197', '2018-04-02'], ['1070907357', '2018-04-02'],
    #                        ['1066977155', '2018-04-02'], ['1022591047', '2018-04-02'], ['1030550073', '2018-04-02'],
    #                        ['1033854943', '2018-04-02'], ['1031640132', '2018-04-02'], ['1070885063', '2018-04-02'],
    #                        ['1070729932', '2018-04-02'], ['1045212527', '2018-04-02'], ['1028664287', '2018-04-02'],
    #                        ['1070885595', '2018-04-02'], ['1025872345', '2018-04-02'], ['1067914985', '2018-04-02']]}
    list_result = {
        'total': 321,
        'code': 0,
        'data': [],
        'ids': [[u'1026367068', u'2018-04-03'], [u'1033060991', u'2018-04-03'],
                [u'1038441431', u'2018-04-03'], [u'1001363022', u'2018-04-03'],
                [u'1019659221', u'2018-04-03'], [u'1002660342', u'2018-04-03'],
                [u'1035984395', u'2018-04-03'], [u'1014826155', u'2018-04-03'],
                [u'1028142888', u'2018-04-03'], [u'1022642061', u'2018-04-03'],
                [u'1043983298', u'2018-04-03'], [u'1029731078', u'2018-04-03'],
                [u'1002851632', u'2018-04-03'], [u'1002354915', u'2018-04-03'],
                [u'1027887893', u'2018-04-03'], [u'1032848553', u'2018-04-03'],
                [u'1070807725', u'2018-04-03'], [u'1031349867', u'2018-04-03'],
                [u'1027071847', u'2018-04-03'], [u'1021747472', u'2018-04-03'],
                [u'1000155591', u'2018-04-03'], [u'1029013102', u'2018-04-03'],
                [u'1070737095', u'2018-04-03'], [u'1020231833', u'2018-04-03'],
                [u'1037316081', u'2018-04-03'], [u'1019132225', u'2018-04-03'],
                [u'1026149838', u'2018-04-03'], [u'1070834895', u'2018-04-03'],
                [u'1029253681', u'2018-04-03'], [u'1058461095', u'2018-04-02']]
    }

    for id in list_result.get('ids'):
        detail_data = resume_fenjianli.get_resume(
            resume_id=id[0],
            cookie=
            'JSESSIONID=%s;  huodong=fenjianli; hdflag=active; SERVERID=%s' %
            (cookie.get('cookie').get('JSESSIONID'),
             cookie.get('cookie').get('SERVERID')),
            model_name='zhilian',
            proxy=settings.get_proxy())
        # logger.info(json.dumps(detail_data, ensure_ascii=False))
        # resume_result = get_resume(resume[0], cookie, task['model_name'], proxy=proxy)
        resume_uuid = uuid.uuid1()
        try:
            sql = 'insert into spider_search.resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
            sql_value = ('RESUME_FEN',
                         json.dumps(detail_data['json'], ensure_ascii=False),
                         resume_uuid, '18629947965', '其他', '天津', str(id[0]))

            resume_update_time = detail_data['json']['updateDate']
            kafka_data = {
                "channelType": "WEB",
                "content": {
                    "content": json.dumps(detail_data['json'],
                                          ensure_ascii=False),
                    "id": '',
                    "createBy": "python",
                    "createTime": int(time.time() * 1000),
                    "ip": '',
                    "resumeSubmitTime": '',
                    "resumeUpdateTime": resume_update_time,
                    "source": 'RESUME_FEN',
                    "trackId": str(resume_uuid),
                    "avatarUrl": '',
                    "email": '18629947965',
                    'emailJobType': '其他',
                    'emailCity': '天津',
                    'subject': str(id[0])
                },
                "interfaceType": "PARSE",
                "resourceDataType": "RAW",
                "resourceType": "RESUME_SEARCH",
                "source": 'RESUME_FEN',
                "trackId": str(resume_uuid),
                'traceID': str(resume_uuid),
                'callSystemID': 'python',
            }
            utils.save_data(sql, sql_value, kafka_data)
        except Exception, e:
            logger.info('get error when write mns, exit!!!' +
                        str(traceback.format_exc()))
            # return
        time.sleep(1)
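
get_detail() relies on module-level names (cookie, logger, settings, utils, resume_fenjianli) that the snippet does not show. A minimal sketch of the assumed setup, reusing the cookie payload shape from Code Example #1; the concrete values are placeholders, not real session ids.

import logging

logger = logging.getLogger(__name__)  # stands in for utils.get_logger()
cookie = {
    'code': 0,
    'cookie': {
        'SERVERID': '<serverid-from-login>',      # placeholder
        'JSESSIONID': '<jsessionid-from-login>',  # placeholder
    }
}
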
Code Example #10
def download_thread():
    logger = utils.get_logger()
    logger.info('=' * 50 + '\nstart main!!!')
    global numbers_left
    global sleep_tag

    redis_client = get_redis_client()

    mysql_pool = PersistentDB(MySQLdb,
                              host=common_settings.MYSQL_HOST,
                              user=common_settings.MYSQL_USER,
                              passwd=common_settings.MYSQL_PASSWD,
                              db=common_settings.MYSQL_DOWNLOAD_DB,
                              port=common_settings.MYSQL_PORT,
                              charset='utf8')
    mysql_conn = mysql_pool.connection()
    mysql_cursor = mysql_conn.cursor()
    mysql_cursor.execute(
        'update download_record set valid=0 where valid=1 and source=24')
    mysql_conn.commit()
    # proxy = {'http': 'http://*****:*****@proxy.abuyun.com:9020', 'https': 'http://*****:*****@proxy.abuyun.com:9020'}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    # sqlite_conn = sqlite3.connect('zhaopingou.db')
    # sqlite_cursor = sqlite_conn.cursor()
    task_list = []
    mysql_cursor.execute(
        'select count(*) from download_record where updateTime>date(now()) and valid=2 and source=24'
    )
    number_today = mysql_cursor.fetchall()[0][0]
    numbers_left -= number_today
    numbers_left = 0 if numbers_left < 0 else numbers_left

    while True:
        while not sleep_tag:
            logger.info('not the correct time to buy resume, wait.')
            time.sleep(3600)
        if not numbers_left:
            logger.info('the number of today not left, sleep')
            time.sleep(1800)
            continue
        logger.info('the number left today is:' + str(numbers_left))
        while not task_list:
            while not sleep_tag:
                logger.info('not the correct time to buy resume, wait.')
                time.sleep(3600)
            task_number = mysql_cursor.execute(
                'select * from download_record where valid=0 and source=24 order by updateTime desc limit 1'
            )
            if not task_number:
                logger.info('there is no available task in mysql, sleep!!!')
                time.sleep(300)
                continue
            task_list = list(mysql_cursor.fetchall())
            break
        task = task_list.pop()
        if task[8]:
            try:
                extend_content = json.loads(task[8])
                extend_content['emailJobType'] = extend_content.get(
                    'emailJobType', '')
                extend_content['emailCity'] = extend_content.get(
                    'emailCity', '')
            except Exception, e:
                logger.info('not find extend_content in task:' + str(task))
                extend_content = {"emailJobType": "", "emailCity": ""}
        else:
            extend_content = {"emailJobType": "", "emailCity": ""}

        logger.info('start to deal with task:' + task[1])
        # download

        get_null_text_count = 0
        for charge_count in xrange(1):
            if task[6]:
                account, cookie = get_one_account_with_download_by(task[6])
            else:
                account, cookie = get_one_account(download=True)
            charge_result = charge_resume(task[1], cookie, proxy)
            # if charge_result['code'] == 9:
            #     set_forbidden_account(account)
            #     continue
            # if charge_result['code'] == 10:
            #     get_null_text_count += 1
            #     continue
            if charge_result['code']:
                set_unavaliable_account(account)
                continue
            break
        else:
            logger.info('got an error after 1 attempt to charge the resume')
            # if get_null_text_count == 1:
            try:
                mysql_cursor.execute(
                    'update download_record set valid=3 where id=%s' %
                    str(task[0]))
                mysql_conn.commit()
                logger.info('get 3 null text of resume:' + task[1])
            except Exception, e:
                logger.info(str(traceback.format_exc()))
            continue
Code Example #11
File: resume_fenjianli.py Project: logonmy/Spider-1
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start to awake one task')
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()

    account, cookie = get_one_account()

    logger.info(str(cookie))
    logger.info('deal with:' + str(task))

    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(
        time.localtime().tm_mday)
    download_day_datetime = datetime.datetime.today()
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # if not account:
        account, cookie = get_one_account()
        list_result = get_list(cookie, page_now, task, proxy)
        # time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            account = None
            continue
        elif list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        logger.info('page number of now is ' + str(page_now) +
                    ' all number is:' + str(list_result['total']))
        for resume in list_result['ids']:
            # logger.info('sleep 5')
            # time.sleep(5)
            list_resume_update_time = datetime.datetime.strptime(
                resume[1], '%Y-%m-%d')
            if (download_day_datetime - list_resume_update_time).days > 7:
                logger.info("find resume who's update time before 3 days.")
                page_now = -1
                continue
            if not account:
                account, cookie = get_one_account()
            has_find_in_redis = False
            resume_key = 'fenjianli_resume_' + str(resume[0])
            try:
                resume_download_time = redis_client.get(resume_key)
                if resume_download_time == download_day:
                    has_find_in_redis = True
                else:
                    redis_client.set(resume_key, download_day)
            except Exception, e:
                redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)

            resume_result = get_resume(resume[0],
                                       cookie,
                                       task['model_name'],
                                       proxy=proxy)
            if resume_result['code'] == 1:
                set_unavaliable_account(account)
                account = None
                continue
            if resume_result['code'] == 3:
                logger.info('need set valid=0 of account:' + str(account))
            if resume_result['code']:
                logger.info('get error resume:' + str(resume_result))
                continue

            resume_uuid = uuid.uuid1()
            try:
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
                sql_value = (common_settings.SOURCE,
                             json.dumps(resume_result['json'],
                                        ensure_ascii=False), resume_uuid,
                             str(account['username']), task['job_name'],
                             task['area_name'], str(resume[0]))

                resume_update_time = resume_result['json']['updateDate']
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content":
                        json.dumps(resume_result['json'], ensure_ascii=False),
                        "id":
                        '',
                        "createBy":
                        "python",
                        "createTime":
                        int(time.time() * 1000),
                        "ip":
                        '',
                        "resumeSubmitTime":
                        '',
                        "resumeUpdateTime":
                        resume_update_time,
                        "source":
                        common_settings.SOURCE,
                        "trackId":
                        str(resume_uuid),
                        "avatarUrl":
                        '',
                        "email":
                        str(account['username']),
                        'emailJobType':
                        task['job_name'],
                        'emailCity':
                        task['area_name'],
                        'subject':
                        str(resume[0])
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' +
                            str(traceback.format_exc()))
                # return
            time.sleep(1)
Code Example #12
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start to awake one task')
    global random_ids
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()

    logger.info('deal with:' + str(task))

    page_now = 1
    download_day = str(time.localtime().tm_mon) + '-' + str(
        time.localtime().tm_mday)
    datetime_now = datetime.datetime.now()
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
        # if not account:
        account, cookie = get_one_account(download=True)
        # list_result = get_list_with_keyword(account['username'], page_now,
        #                                     task, proxy)
        # Use the cookie instead of random_id
        list_result = get_list_with_keyword(cookie, page_now, task, proxy)
        # time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            continue
        elif list_result['code'] == 6:
            set_forbidden_account(account)
            continue
        if list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        if not list_result['data']:
            list_result['has_next_page'] = False
        logger.info('page number of now is ' + str(page_now))
        for resume_one in list_result['data']:
            resume, resume_update_time = resume_one

            datetime_update = datetime.datetime.strptime(
                resume_update_time, '%Y.%m.%d')
            if (datetime_now - datetime_update).days > 7:
                logger.info('resume is older than the 7-day limit, skip. ' + resume_update_time)
                list_result['has_next_page'] = False
                continue

            has_find_in_redis = False
            resume_key = 'youzi_all_resume_' + str(resume)
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value:
                    # if resume_redis_value == download_day:
                    has_find_in_redis = True
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)

            for x in xrange(3):
                account, cookie = get_one_account(download=True)
                # resume_result = get_resume('20319214', account['username'], proxy=proxy)
                # resume_result = get_resume(resume, account['username'],
                #                            proxy=proxy)
                resume_result = get_resume(resume, cookie, proxy=proxy)
                # update_refresh_score(account)
                # if resume_result['code'] == 1:
                #     set_unavaliable_account(account)
                #     account = None
                #     # redis_client.delete(resume_key)
                #     continue
                # if resume_result['code'] == 7:
                #     set_forbidden_account(account)
                #     account = None
                #     # redis_client.delete(resume_key)
                #     continue
                if resume_result['code']:
                    logger.info('get error resume:' + str(resume_result))
                    # redis_client.delete(resume_key)
                    continue
                redis_client.set(resume_key, download_day)
                break
            else:
                continue

            resume_uuid = uuid.uuid1()
            try:
                content = json.dumps(
                    {
                        'name': '',
                        'email': '',
                        'phone': '',
                        'html': resume_result['data']
                    },
                    ensure_ascii=False)
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject) values (%s, %s, "python", %s, now(), %s, %s, %s, %s)'
                sql_value = (common_settings.SOURCE, content, resume_uuid,
                             str(account['username']), task['jobTitle'],
                             task['locationname'], str(resume))

                resume_update_time = ''
                # resume_update_time =  resume_result['json']['updateDate']
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time() * 1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['jobTitle'],
                        'emailCity': task['locationname'],
                        'subject': str(resume)
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_SEARCH",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!' +
                            str(traceback.format_exc()))
                # return
            time.sleep(1)
Code Example #13
File: resume_rencaia.py Project: logonmy/Spider-1
def awake_one_task(task):
    logger = utils.get_logger()
    logger.info('start to awake one task')
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam':task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()

    logger.info('deal with:'+str(task))

    page_now = 1
    download_day = str(time.localtime().tm_mon)+'-'+str(time.localtime().tm_mday)
    while page_now != -1:
        logger.info('start download page:'+str(page_now))
        # Switch to a new account for each list page
        account, cookie = get_one_account()
        # Fetch the list page
        list_result = get_list(cookie, page_now, task, proxy)
        #time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            continue
        elif list_result['code'] == 6:
            set_forbidden_account(account)
            continue
        elif list_result['code']:
            logger.info('get error list result:'+str(list_result))
            page_now = -1
            continue
        logger.info('page number of now is '+str(page_now))

        # Process the resumes one by one
        for resume_one in list_result['data']:
            resume, thirdid, srcid, list_content = resume_one
            has_find_in_redis = False
            resume_key = 'rencaia_resume_'+str(resume)
            real_rid = ''
            real_thirdid = ''
            # Check in redis whether it was already downloaded today; if so, skip it
            try:
                resume_redis_value=redis_client.get(resume_key)
                if resume_redis_value:
                    resume_redis_value_list = resume_redis_value.split('_')
                    if resume_redis_value_list[0] == download_day:
                        has_find_in_redis = True
                    real_rid = resume_redis_value_list[1]
                    real_thirdid = resume_redis_value_list[2]
                # else:
                #     pass
                # if resume_redis_value_list and resume_redis_value_list[0] == download_day:
                #     has_find_in_redis=True
                # else:
                #     real_rid = resume_redis_value_list[1]
                #     read_thirdid = resume_redis_value_list[2]
                    # redis_client.set(resume_key, download_day)
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                logger.info('has find %s in redis' % resume_key)
                continue
            else:
                logger.info('not find %s in redis' % resume_key)

            # Fetch the resume; try three times and skip it if all three attempts fail
            for x in xrange(3):
                account, cookie = get_one_account()
                resume_result = get_resume(resume, thirdid, srcid, cookie, proxy=proxy)
                update_refresh_score(account)
                if resume_result['code'] == 1:
                    set_unavaliable_account(account)
                    account = None
                    continue
                if resume_result['code'] == 7:
                    set_forbidden_account(account)
                    account = None
                    continue
                if resume_result['code']:
                    logger.info('get error resume:'+str(resume_result))
                    continue
                
                # if u'存在被盗用的风险' in resume_result['data']:
                #     logger.info(u'find 存在被盗用的风险 in page:'+str(account))
                #     set_forbidden_account(account)
                #     account = None
                #     redis_client.delete(resume_key)
                #     continue
                # if u'该用户暂无求职意向,已在外网设置简历不公开' in resume_result['data']:
                #     # logger.info('un publish resume:'+str(resume)+ 'account:'+account['username'])
                #     # redis_client.delete(resume_key)
                #     logger.info('get not open resumed:'+str(account))
                #     # set_forbidden_account(account)
                #     continue
                redis_client.set(resume_key, '_'.join([download_day, str(resume_result['real_rid'] or real_rid), thirdid, srcid]))
                break
            else:
                continue


            # Save the data to mysql and mns
            resume_uuid = uuid.uuid1()
            try:
                content = {'html':list_content}
                content['email'] = resume_result['charge_json'].get('mail')
                content['phone'] = resume_result['charge_json'].get('phone')
                content['name'] = resume_result.get('name')
                content = json.dumps(content, ensure_ascii=False)
                sql = 'insert into resume_raw (source, content, createBy, trackId, createtime, email, emailJobType, emailCity, subject, reason) values (%s, %s, "python", %s, now(), %s, %s, %s, %s, "list")'
                sql_value = (common_settings.SOURCE, content, resume_uuid, str(account['username']), task['function_id_name'], task['residence_name'], str(resume))

                resume_update_time = ''
                # resume_update_time =  resume_result['json']['updateDate'] 
                kafka_data = {
                    "channelType": "WEB",
                    "content": {
                        "content": content,
                        "id": '',
                        "createBy": "python",
                        "createTime": int(time.time()*1000),
                        "ip": '',
                        "resumeSubmitTime": '',
                        "resumeUpdateTime": resume_update_time,
                        "source": common_settings.SOURCE,
                        "trackId": str(resume_uuid),
                        "avatarUrl": '',
                        "email": str(account['username']),
                        'emailJobType': task['function_id_name'],
                        'emailCity': task['residence_name'],
                        'subject': str(resume),
                        'reason': 'list',
                    },
                    "interfaceType": "PARSE",
                    "resourceDataType": "RAW",
                    "resourceType": "RESUME_INBOX",
                    "source": common_settings.SOURCE,
                    "trackId": str(resume_uuid),
                    'traceID': str(resume_uuid),
                    'callSystemID': common_settings.CALLSYSTEMID,
                }
                utils.save_data(sql, sql_value, kafka_data)
            except Exception, e:
                logger.info('get error when write mns, exit!!!'+str(traceback.format_exc()))
                # return
            time.sleep(1)
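
The redis value written above packs the download day, the real resume id and the third-party ids into one underscore-joined string, and the read path splits it back apart. A minimal pack/unpack sketch of that convention; the function names are hypothetical.

def pack_resume_value(download_day, real_rid, thirdid, srcid):
    # 'M-D_rid_thirdid_srcid', exactly as awake_one_task() writes it.
    return '_'.join([download_day, str(real_rid), str(thirdid), str(srcid)])

def unpack_resume_value(value):
    parts = value.split('_')
    # day, rid, thirdid (srcid is stored but unused on the read path above)
    return parts[0], parts[1], parts[2]
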
Code Example #14
def deal_task():
    logger = utils.get_logger()
    logger.info('='*50 + '\nstart deal thread')
    task_list = []
    conn, cur = get_mysql_client()
    change_account_number = 10
    account = {}

    while True:
        task = get_task()
        proxy = settings.get_proxy()
        
        # task = redis_client.get(task_list.pop())
        task_split = task[12].split('_')

        for x in xrange(5):
            if not change_account_number or not account:
                account, cookie = get_one_account()
            start_time = time.time()
            resume_result = get_resume(task_split[0], task_split[1], task_split[2], cookie, proxy=proxy)
            end_time = time.time()
            logger.info('once time cost:'+ str(int(end_time-start_time)))
            # update_refresh_score(account)
            if resume_result['code'] == 1:
                set_unavaliable_account(account)
                account = None
                # redis_client.delete(resume_key)
                continue
            if resume_result['code'] == 7:
                set_forbidden_account(account)
                account = None
                # redis_client.delete(resume_key)
                continue
            if resume_result['code']:
                logger.info('get error resume:'+str(resume_result))
                # redis_client.delete(resume_key)
                account = None
                continue
            
            # if u'存在被盗用的风险' in resume_result['data']:
            #     logger.info(u'find 存在被盗用的风险 in page:'+str(account))
            #     set_forbidden_account(account)
            #     account = None
            #     redis_client.delete(resume_key)
            #     continue
            if 'data' not in resume_result:
                logger.info('not get data in resume result.')
                continue
            if u'该用户暂无求职意向,已在外网设置简历不公开' in resume_result['data']:
                # logger.info('un publish resume:'+str(resume)+ 'account:'+account['username'])
                # redis_client.delete(resume_key)
                logger.info('get not open resumed:'+str(account))
                # set_forbidden_account(account)
                continue
            

            resume_uuid = uuid.uuid1()
            try:
                sql = 'update rencaia_all_resume set status=2, info_content=%s, resumeid=%s, url=%s,  phone=%s, email=%s, job_now=%s, ip=%s, account=%s where id=%s'
                sql_value = (resume_result['data'], resume_result['real_rid'], resume_result.get('info_url'), resume_result.get('phone'), resume_result.get('email'),resume_result.get('job_now'), proxy['http'], account['username'], task[0])

                cur.execute(sql, sql_value)
                conn.commit()

                logger.info('save resume success: %s' % task[0])
                break

            except Exception, e:
                logger.info('get error when write mysql, exit!!!'+str(traceback.format_exc()))
                # return
            # time.sleep(1)
            #return
        else:
            logger.info('did not success after 5 times get resume of '+str(task[0]))
            sql = 'update rencaia_all_resume set status=3 where id=%s'
            sql_value = (task[0], )
            cur.execute(sql, sql_value)
            conn.commit()
コード例 #15
0
def awake_one_task(task):
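    """Walk the search list pages for one city/job task.

    Every new resume on a list page is inserted into rencaia_all_resume with
    status 0 (for deal_task to download later) and deduplicated through Redis
    keys of the form 'rencaia_all_resume_<resumeid>'; the current page number
    is checkpointed in the page_record file every 10 pages.
    """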
    logger = utils.get_logger()
    logger.info('start awake one task')
    relogin_time = 3
    redis_client = get_redis_client()
    result = {'code': 0, 'executeParam': task}
    proxy = None
    if common_settings.USE_PROXY:
        proxy = settings.get_proxy()
    conn, cur = get_mysql_client()

    # account, cookie = get_one_account()

    # logger.info(str(cookie))
    logger.info('deal with:' + str(task))

    page_now = task['page_number']
    download_day = str(time.localtime().tm_mon) + '-' + str(
        time.localtime().tm_mday)
    while page_now != -1:
        logger.info('start download page:' + str(page_now))
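        # every 10 pages, checkpoint the current page for this residence in the
        # page_record file so an interrupted run can resume from here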
        if not page_now % 10:
            f = open('page_record', 'r')
            lines = f.readlines()
            f.close()
            start_info = json.loads(lines[0])
            start_info[task['residence_ids']] = page_now

            f = open('page_record', 'w')
            # f.write(str(task['residence_ids'])+'\n')
            # f.write(str(page_now))
            f.write(json.dumps(start_info, ensure_ascii=False))
            f.close()
        # if not account:
        proxy = settings.get_proxy()
        account, cookie = get_one_account()
        start_list_time = time.time()

        list_result = get_list(cookie, page_now, task, proxy)
        end_list_time = time.time()
        logger.info('list cost:' + str(int(end_list_time - start_list_time)) +
                    ' ' + str(len(list_result.get('data', []))))  # 'data' may be absent on error codes
        #time.sleep(2)
        if list_result['code'] == 5:
            set_unavaliable_account(account)
            logger.info('fail page : %s %s' %
                        (task['residence_name'], page_now))
            #page_now -= 1
            continue
        elif list_result['code'] == 6:
            set_forbidden_account(account)
            logger.info('fail page : %s %s' %
                        (task['residence_name'], page_now))
            #page_now -= 1
            continue
        elif list_result['code']:
            logger.info('get error list result:' + str(list_result))
            page_now = -1
            continue
        logger.info('page number of now is ' + str(page_now))
        # continue
        has_find_count = 0
        not_find_count = 0
        for resume_one in list_result['data']:
            resume, thirdid, srcid, list_content = resume_one
            #     # logger.info('sleep 5')
            #     # time.sleep(5)
            #     # if not account:
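            # skip resumes already recorded in Redis under
            # 'rencaia_all_resume_<resumeid>' (set when the row was first queued)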
            has_find_in_redis = False
            resume_key = 'rencaia_all_resume_' + str(resume)
            try:
                resume_redis_value = redis_client.get(resume_key)
                if resume_redis_value:
                    has_find_in_redis = True
                # else:
                #     pass
                # if resume_redis_value_list and resume_redis_value_list[0] == download_day:
                #     has_find_in_redis=True
                # else:
                #     real_rid = resume_redis_value_list[1]
                #     read_thirdid = resume_redis_value_list[2]
                # redis_client.set(resume_key, download_day)
            except Exception, e:
                logger.info(str(traceback.format_exc()))
                # redis_client.set(resume_key, download_day)
            if has_find_in_redis:
                has_find_count += 1
                # logger.info('has find %s in redis' % resume_key)
                continue
            else:
                not_find_count += 1
                # logger.info('not find %s in redis' % resume_key)
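            # queue the new resume with status 0 for deal_task to download later
            # and record it in Redis so the same id is not queued twice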
            sql = 'insert into rencaia_all_resume (list_content, search_city, status, list_param, pageNo) values (%s, %s, %s, %s, %s)'
            sql_value = (list_content, task['residence_name'], '0',
                         '_'.join([resume, thirdid, srcid]), str(page_now))
            cur.execute(sql, sql_value)
            conn.commit()
            redis_client.set(resume_key,
                             '_'.join([download_day, resume, thirdid, srcid]))

        #     for x in xrange(15):
        #         #account, cookie = get_one_account()
        #         start_time = time.time()
        #         resume_result = get_resume(resume, thirdid, srcid, cookie, proxy=proxy)
        #         end_time = time.time()
        #         logger.info('once time cost:'+ str(int(end_time-start_time)))
        #         # update_refresh_score(account)
        #         if resume_result['code'] == 1:
        #             set_unavaliable_account(account)
        #             account = None
        #             # redis_client.delete(resume_key)
        #             continue
        #         if resume_result['code'] == 7:
        #             set_forbidden_account(account)
        #             account = None
        #             # redis_client.delete(resume_key)
        #             continue
        #         if resume_result['code']:
        #             logger.info('get error resume:'+str(resume_result))
        #             # redis_client.delete(resume_key)
        #             continue

        #         # if u'存在被盗用的风险' in resume_result['data']:
        #         #     logger.info(u'find 存在被盗用的风险 in page:'+str(account))
        #         #     set_forbidden_account(account)
        #         #     account = None
        #         #     redis_client.delete(resume_key)
        #         #     continue
        #         if 'data' not in resume_result:
        #             logger.info('not get data in resume result.')
        #             continue
        #         if u'该用户暂无求职意向,已在外网设置简历不公开' in resume_result['data']:
        #             # logger.info('un publish resume:'+str(resume)+ 'account:'+account['username'])
        #             # redis_client.delete(resume_key)
        #             logger.info('get not open resumed:'+str(account))
        #             # set_forbidden_account(account)
        #             continue
        #         break
        #     else:
        #         continue

        #     resume_uuid = uuid.uuid1()
        #     try:
        #         sql = 'insert into rencaia_all_resume (list_content, info_content, resumeid, url, search_city, search_job, phone, email, job_now, ip, account) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
        #         sql_value = (list_content, resume_result['data'], resume, resume_result.get('info_url'), task['residence_name'], task['function_id_name'], resume_result.get('phone'), resume_result.get('email'),resume_result.get('job_now'), proxy['http'], account['username'])

        #         cur.execute(sql, sql_value)
        #         conn.commit()

        #         cur.execute('select last_insert_id()')
        #         save_mysql_ids = cur.fetchall()
        #         if not save_mysql_ids or not save_mysql_ids[0]:
        #             logger.info('insert into mysql error!!!:' + sql + '    ' + str(sql_value))
        #             continue
        #         save_mysql_id = save_mysql_ids[0][0]
        #         logger.info('save resume success: %s' % save_mysql_id)

        #         redis_client.set(resume_key, '_'.join([download_day, str(resume_result['real_rid'] or real_rid), thirdid, srcid]))
        #     except Exception, e:
        #         logger.info('get error when write mysql, exit!!!'+str(traceback.format_exc()))
        #         # return
        #     time.sleep(1)
        #     #return

        page_now = page_now + 1 if list_result['has_next_page'] else -1
        logger.info(
            'city : %s page : %s redis check : %s / %s ' %
            (task['residence_name'], page_now, has_find_count, not_find_count))
コード例 #16
0
ファイル: dfcf.py プロジェクト: edwardlg/CrawlMan
def redis_seed():
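    """Crawl Eastmoney Guba list pages for every code stored in Redis.

    The page number saved under each code is the starting point; every post row
    is upserted into MongoDB keyed by its URL, and crawling of a code stops once
    the sampled post's publish time falls before END_DATE.
    """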
    keys = r.keys()
    for code in keys:
        print('current code {}'.format(code))
        start_page = r.get(code)
        stop_flag = False
        # fall back to the first page if no progress has been saved for this code
        start_page = int(start_page) if start_page else 1
        while 1:
            if stop_flag:
                print('完成一个')
                break

            # fetch the list page, retrying with a fresh proxy on each failure
            start = 1
            while start < RETRY:
                proxy = get_proxy()

                try:
                    response = requests.get('http://guba.eastmoney.com/list,{},f_{}.html'.format(code, start_page),
                                            proxies=proxy, headers=headers, cookies=cookies, verify=False,
                                            timeout=10)
                except Exception as e:
                    print(e)
                    start += 1
                else:
                    break

            if start == RETRY:
                # all attempts failed; restart the outer loop on the same page
                continue

            text = response.text
            resp = Selector(text=text)

            # each row of the article list carries read count, comment count,
            # title, author, last-update time and the post URL
            detail = resp.xpath('//div[@id="articlelistnew"]/div[@class="articleh normal_post" or @class="articleh normal_post odd"]')
            # print('page {}'.format(start_page))
            c = 0

            for item in detail:
                c = c + 1
                read_count = item.xpath('.//span[1]/text()').extract_first()
                comment_count = item.xpath('.//span[2]/text()').extract_first()
                title = item.xpath('.//span[3]/a/@title').extract_first()
                author = item.xpath('.//span[4]/a/font/text()').extract_first()
                last_update = item.xpath('.//span[5]/text()').extract_first()
                next_url = 'http://guba.eastmoney.com' + item.xpath('.//span[3]/a/@href').extract_first()

                d = {}
                d['code'] = code
                d['title'] = title
                d['page'] = start_page
                d['count'] = c
                d['read_count'] = read_count
                d['author'] = author
                d['comment_count'] = comment_count
                d['last_update'] = last_update
                d['next_url'] = next_url
                d['crawltime'] = datetime.datetime.now()

                try:
                    doc.update_one({'next_url':next_url},{'$set':d},True,True)
                except Exception as e:
                    print(e)

            start_ = 0
            start_page += 1
            r.set(code, start_page)

            # next_url still points at the last post of the page; only that
            # post's detail page is fetched to read its publish time
            while start_ < RETRY:

                try:
                    proxy = get_proxy()
                    response_detail = requests.get(next_url, headers=headers,
                                        cookies=cookies, verify=False, proxies=proxy, timeout=10)

                except Exception as e:
                    print(e)
                    start_ += 1
                    continue

                else:
                    break

            if start_ == RETRY:
                continue

            resp_detail = response_detail.text
            detail_resp = Selector(text=resp_detail)
            zwfb_time = detail_resp.xpath('//div[@class="zwfbtime"]/text()').extract_first()
            if isinstance(zwfb_time, str):
                zwfb_pattern = re.search('发表于 (.*?) ', zwfb_time)
            else:
                continue

            if zwfb_pattern:
                zwfb_time = zwfb_pattern.group(1)
            else:
                print('未找到时间')
                zwfb_time = None

            if zwfb_time is not None and zwfb_time < END_DATE:
                # stop this code once the sampled post is older than END_DATE
                stop_flag = True