Example 1
def get_from_unuseless_phonenum():
    r = set_redis(2)
    # for i in r.hkeys("unuseless_cookies_car168"):
    #     r.lpush("cookies_car168_list", i)
    for k, v in r.hgetall("unuseless_cookies_car168").items():

        print(k, v)
        r.hset("cookies_car168", k, v)
Example 2
def clicklogin(driver, phone_num, projectid, token, matchrule):
    # Phone number page
    phone_input = driver.find_element_by_id("uname")
    phone_input.send_keys(phone_num)
    # Click to request the verification code
    sendcode = driver.find_element_by_id("sendCode")
    sendcode.click()
    # Receive the verification code; handle phone numbers that never get the SMS
    code = get_code(projectid,
                    phonenum=phone_num,
                    token=token,
                    matchrule=matchrule)
    if code:
        # Enter the verification code
        code_input = driver.find_element_by_id("code")
        code_input.send_keys(code)

        login = driver.find_element_by_id("button")
        login.click()
        sleep(2)
        driver.refresh()
        tbCookies = driver.get_cookies()
        # driver.quit()
        cookies = {}
        for item in tbCookies:
            if item['name'] == "DEVICE_ID" or item['name'] == "U":
                cookies[item['name']] = item['value']
        print(cookies)
        r = set_redis(2)

        # mapping = {json.dumps(cookies): int(phone_num)}
        # print(mapping)
        # r.zadd("cookies_car168", mapping)
        # Store the cookie in both a Redis list and a hash
        dump_cookies = json.dumps(cookies)
        r.lpush("cookies_car168_list", dump_cookies)
        r.hset("cookies_car168", dump_cookies, phone_num)
        print("成功存入redis")
        driver.quit()
    else:
        print("将无用注册手机号存入redis")
        set_redis(2).sadd("unuseless_car168_phonenum", phone_num)
        print("没有接收到验证码")
Example 3
def get_phone():
    phone_num = set_redis(2).spop("car168_phonenum")
    # phone_num_list = []#"17169479357""16535511249","16574980930","16531165344","16533431172",]
    # phone_num = phone_num_list[0]
    print("从列表中取得手机号:", phone_num)
    if phone_num:
        phone_num = phone_num.decode()
    # token = get_token('maxfire', "ma123456")
    # phone_num_from = get_phonenum(projectid, token=token, phone=phone_num, loop=2)
    token, phone_num_from = build_phonenum(projectid, loop=2, phone=phone_num)
    print("解码平台手机号:", phone_num_from)
    return token, phone_num_from, phone_num
Example 4
class MysqlPipeline(object):
    save_url_r = set_redis(db=1)

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=HOST,
            port=3306,
            db=MYDB,
            user=USER,
            password=PASSWD,
            charset=charset,
        )
        self.cur = self.conn.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()

    def process_item(self, item, spider):

        selflog = SelfLog(spider.name)
        selflog_error = SelfLog("error")
        keys = [
            "brand", "type", "year", "style", "guide_price", "displacement",
            "configuration", "version", "status"
        ]
        values = [item[i] for i in keys]
        # Before inserting a car style, check whether it already exists in the database; if it does, update it when the update conditions are met, otherwise insert it
        sql_search_style = "select id, brand, type from car_style where brand=%s and type=%s and year=%s and style=%s and guide_price=%s"
        search_db_result = Mysqlpython().readall(sql_search_style, [
            item["brand"], item['type'], item['year'], item['style'],
            item['guide_price']
        ])

        if search_db_result:
            commit_id = search_db_result[0][0]
            # selflog.logger.info("出现的重复车型:%s", search_db_result[0])
            update_set = "update car_style set status=null "
            update_where = " where id=%s "
            sqlparam = []  # SET-clause values; commit_id for the WHERE id=%s is appended at execute time

            if item['guide_price'] and item['guide_price'] != "None":
                update_set += " , guide_price=%s"
                update_where += " and (guide_price is null or guide_price='None')"
                sqlparam.append(item['guide_price'])
            if item['displacement'] and item['displacement'] != "None":
                update_set += " , displacement=%s"
                update_where += " and (displacement is null or displacement='None')"
                sqlparam.append(item['displacement'])
            if item['version'] and item['version'] != "None":
                update_set += " , version=%s"
                update_where += " and (version is null or version='None')"
                sqlparam.append(item['version'])
            # If update_set is unchanged, skip the update
            if update_set == "update car_style set status=null ":
                pass
            else:
                sql_update = update_set + update_where
                # selflog.logger.info(
                #     "Executing update SQL:{sql_update}, values:{values}, id:{commit_id}".format(
                #         sql_update=sql_update, values=sqlparam, commit_id=commit_id))
                try:
                    self.cur.execute(sql_update, sqlparam + [commit_id])
                    self.conn.commit()
                except Exception as e:
                    selflog_error.logger.error(
                        "Update SQL failed:{sql_update}, values:{values}, id:{commit_id}, e:{e}"
                        .format(sql_update=sql_update,
                                values=sqlparam,
                                commit_id=commit_id,
                                e=e))
        # If no match is found, insert the car style and then its price/update-time details
        else:
            # Insert into the car_style table
            sql = "insert into `{}` ({}) values ({})".format(
                item.table_name,
                ','.join(keys),
                # Use %s placeholders so non-string values are handled correctly
                ','.join(['%s'] * len(values)),
            )
            try:
                self.cur.execute(sql, values)
                self.conn.commit()
            except Exception as e:
                selflog_error.logger.info(
                    "{spidername} failed to insert into car_style, e:{e}, sql:{sql}, --values:{values}".format(
                        spidername=spider.name, e=e, sql=sql, values=values))
                commit_id = None
            else:
                commit_id = self.cur.lastrowid

        # Skip the detail insert when there is neither a price nor a volume
        if commit_id and (item['price'] or item['volume']):

            # Check Redis for this detail page in the hash: detail_url -> "updatetime##saved_db_id".
            # If it exists and the update time differs, insert the new data and refresh the Redis entry;
            # if it does not exist, write to both Redis and the database.
            requesturl = item['detail_url']
            rediskey = item['rediskey']
            # Split the stored value into the update time and the saved id
            hash_value = self.save_url_r.hget(rediskey, requesturl)
            if hash_value:
                hash_value = hash_value.decode()
                updatetime = hash_value.split('##')[0]
                save_id = hash_value.split('##')[1]

                if updatetime == item['updatetime']:
                    pass
                # Otherwise perform the insert
                else:
                    self.insert_detaildata(spider, commit_id, item,
                                           selflog_error, rediskey)
                #     sql_update_detail = "update car_detail set updatetime=%s, price=%s, volume=%s where id=%s".format(
                #         updatetime=item['updatetime'], price=item['price'],volume=item['volume'], save_id=save_id)
                #     sql_insert_detail = "insert "
                #     try:
                #         self.cur.execute(sql_insert_detail, [item['updatetime'], item['price'],item['volume'], save_id])
                #         # Update the Redis value
                #         self.save_url_r.hset(rediskey, requesturl, item['updatetime'] + "##" + str(self.cur.lastrowid))
                #         print("{spidername}--updated scraped price data:{updata}".format(spidername=spider.name, updata=item['updatetime'] + "##" + save_id))
                #
                #         # selflog.logger.info("updated scraped price data:{updata}".format(updata=item['updatetime'] + "##" + save_id))
                #     except Exception as e:
                #         selflog_error.logger.error("{spidername} failed to update price detail, e:{e}__sql:{sql}".format(spidername=spider.name, e=e, sql=sql_update_detail))
            # Not found in Redis: write straight to the database and add detail_url -> "updatetime##save_id" to Redis
            else:
                self.insert_detaildata(spider, commit_id, item, selflog_error,
                                       rediskey)
                # selflog.logger.info("写入爬取价格数据:{updata}".format(updata=item['updatetime'] + "##" + str(self.cur.lastrowid)))

                # self.save_url_r.sadd(item['rediskey'], requesturl)

        else:
            selflog.logger.info(
                "No price or volume data; skipping insert into car_detail, keys:%s, --values:%s" %
                (keys, values))
        return item

    def insert_detaildata(self, spider, commit_id, item, selflog_error,
                          rediskey):
        table_name = "car_detail"
        second_key = [
            "platform", "vehicleType", "price", "volume", "updatetime",
            "detail_url"
        ]
        second_values = [
            item['platform'], commit_id, item['price'],
            str(item['volume']), item['updatetime'], item['detail_url']
        ]

        # Insert into the per-platform price detail table (car_detail)
        second_sql = "insert into `{}` ({}) values ({})".format(
            table_name, ','.join(second_key),
            ','.join(['%s'] * len(second_values)))
        try:
            self.cur.execute(second_sql, second_values)
            self.conn.commit()
            cardetail_id = self.cur.lastrowid
        except Exception as e:
            selflog_error.logger.info(
                "{spidername} failed to insert price detail, error:{e} -- car_style id:{id} -- sql:{second_sql}, --values:{second_values}"
                .format(spidername=spider.name,
                        e=e,
                        id=commit_id,
                        second_sql=second_sql,
                        second_values=second_values))
        else:
            self.save_url_r.hset(
                rediskey, item['detail_url'],
                item['updatetime'] + "##" + str(self.cur.lastrowid))
Example 5
import datetime
import json
import re

import requests
import scrapy
from fake_useragent import UserAgent
from scrapy import Request

from cars.CONSTANT import China, USA, Canada, European, Mexico
from cars.items import CarStyleItem
from cars.log_utils import SelfLog
from cars.utils import deal_style, deal_year, deal_displacement, deal_guideprice, sav_item, set_redis, Mysqlpython, \
    deal_updatetime

cookie_r = set_redis(2)
set_url_r = set_redis(4)
# type_r = set_redis()
dbhelper = Mysqlpython()

cookies_chezhen = "cookies_chezhen"
unuseless_cookies_chezhen = "unuseless_cookies_chezhen"
url_redis = "chezhen"
url_redis_chezhen = "chezhen_urls"


class ChezhenSpider(scrapy.Spider):
    name = 'chezhen'
    selflog = SelfLog(name)

    cookies = {
Example 6
def signup_car168(driver):
    try:
        # Wait briefly to see whether the skip screen appears
        if WebDriverWait(driver, 2).until(lambda x: x.find_element_by_xpath(
                "//android.widget.Button[@resource-id='com.zjw.chehang168:id/itemButton']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.Button[@resource-id='com.zjw.chehang168:id/itemButton']"
            ).click()
    except Exception as e:
        print(e)

    # Token and phone number obtained from the SMS-code platform API
    token, phone_nume = build_phonenum(projectid, loop=2, filter='')
    try:
        # Enter the phone number
        if WebDriverWait(driver, 3).until(lambda x: x.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/unameEdit']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/unameEdit']"
            ).send_keys(phone_nume)
            # Tap the "get verification code" button
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/loginButton']"
            ).click()
    except Exception as e:
        print(e, "输入手机号点击获取验证码出错")

    try:
        # Security check (slider captcha)
        if WebDriverWait(driver, 15).until(lambda x: x.find_element_by_xpath(
                "//android.view.View[@resource-id='nc_1_n1t']")):
            inter = driver.find_element_by_xpath(
                "//android.view.View[@resource-id='nc_1_n1t']")
            wrapper = driver.find_element_by_xpath(
                "//android.view.View[@text='请向右滑动验证']")
            print(inter.location, inter.size)

            start = [
                inter.location['x'] + inter.size["width"] // 2,
                inter.location['y'] + inter.size["height"] // 2
            ]
            end = [
                wrapper.location['x'] + wrapper.size["width"] -
                inter.size["width"] // 2,
                inter.location['y'] + inter.size["width"] // 2
            ]
            # end = [670, 0]
            print(start, end)
            touch_test(driver=driver, start=start, end=end, el=inter)
    except Exception as e:
        print(e, "滑动验证模块出错")

    # Request the verification code from the SMS platform API
    print("Phone number", phone_nume)
    code = get_code(projectid=projectid,
                    phonenum=phone_nume,
                    token=token,
                    matchrule=matchrule)
    # code = "1234"
    print("验证码:***{code}***".format(code=code))
    # relase_phonenum(projectid, phone_nume, token)
    if not code:
        print("没有收到验证码,直接退出")
        driver.quit()
        return None
    try:
        # Enter the verification code digit by digit
        if WebDriverWait(driver, 15).until(lambda x: x.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_0']"
        )):
            x1 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_0']"
            )
            x1.click()
            x1.send_keys(code[0])
            x2 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_1']"
            )
            x2.click()
            x2.send_keys(code[1])
            x3 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_2']"
            )
            x3.click()
            x3.send_keys(code[2])
            x4 = driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/tv_3']"
            )
            x4.click()
            x4.send_keys(code[3])
    except Exception as e:
        print(e)

    # If an error dialog pops up
    try:
        if WebDriverWait(driver, 1).until(lambda x: x.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/btn2']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/btn2']"
            ).click()
            print("验证码错误")
    except Exception as e:
        print("验证通过:", e)

    # Fill in personal details to complete registration
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/nameEdit']"
        )):
            # Real name
            driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/nameEdit']"
            ).send_keys(random.choice(name_list))
            # Login password
            driver.find_element_by_xpath(
                "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/pwdEdit']"
            ).send_keys("ma123456")
            # Work location: tap to choose
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/areaText']"
            ).click()
            choice_driver = driver
            choice_city(choice_driver)
            # Fill in the company name
            try:
                if WebDriverWait(
                        driver, 5
                ).until(lambda x: x.find_element_by_xpath(
                        "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/companyEdit']"
                )):
                    driver.find_element_by_xpath(
                        "//android.widget.EditText[@resource-id='com.zjw.chehang168:id/companyEdit']"
                    ).send_keys(random.choice(carnames))
                    # Choose the company type
                    driver.find_element_by_xpath(
                        "//android.widget.RadioButton[@text='其他']").click()
                    # Tick the agree-to-policy checkbox
                    driver.find_element_by_xpath(
                        "//android.widget.ImageView[@resource-id='com.zjw.chehang168:id/itemCheckImg']"
                    ).click()
                    # Submit the registration
                    driver.find_element_by_xpath(
                        "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/submitButton']"
                    ).click()
            except Exception as e:
                print("选择城市后没有进行页面跳转", e)
    except Exception as e:
        print("选择公司前出错,", e)
    # 点击我的
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.RadioButton[@resource-id='com.zjw.chehang168:id/radio_button4']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.RadioButton[@resource-id='com.zjw.chehang168:id/radio_button4']"
            ).click()
    except Exception as e:
        print("没有我", e)
    # 点击设置
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.ImageView[@resource-id='com.zjw.chehang168:id/rightImg']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.ImageView[@resource-id='com.zjw.chehang168:id/rightImg']"
            ).click()
    except Exception as e:
        print("没有设置", e)

    # Tap "log out"
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/tv_content' and @text='退出登录']"
        )):
            driver.find_element_by_xpath(
                "//android.widget.TextView[@resource-id='com.zjw.chehang168:id/tv_content' and @text='退出登录']"
            ).click()
    except Exception as e:
        print("没有退出登录", e)
    # 点击确认
    try:
        if WebDriverWait(driver, 5).until(lambda x: x.find_element_by_xpath(
                "//android.widget.Button[@resource-id='android:id/button1']")):
            driver.find_element_by_xpath(
                "//android.widget.Button[@resource-id='android:id/button1']"
            ).click()
    except Exception as e:
        print("没有确认按钮", e)

    # On success (this "else" belongs to the confirm-click "try" above), store the phone number in Redis
    else:
        result = set_redis(2).sadd("car168_phonenum", phone_nume)
        print("phone_num{}写入redis中:{}".format(phone_nume, result))
        driver.quit()
Example 7
        # mapping = {json.dumps(cookies): int(phone_num)}
        # print(mapping)
        # r.zadd("cookies_car168", mapping)
        # Store the cookie in both a Redis list and a hash
        dump_cookies = json.dumps(cookies)
        r.lpush("cookies_car168_list", dump_cookies)
        r.hset("cookies_car168", dump_cookies, phone_num)
        print("成功存入redis")
        driver.quit()
    else:
        print("将无用注册手机号存入redis")
        set_redis(2).sadd("unuseless_car168_phonenum", phone_num)
        print("没有接收到验证码")


if __name__ == '__main__':
    while set_redis(2).scard("car168_phonenum"):
        token, phone_num, phone_num_from_set = get_phone()
        if not phone_num:
            print("将无用注册手机号存入redis")
            set_redis(2).sadd("unuseless_car168_phonenum", phone_num_from_set)
            continue
        driver = get_driver()
        sleep(1)
        mouseclick()
        clicklogin(driver,
                   phone_num,
                   projectid,
                   token=token,
                   matchrule=matchrule)
Example 8
import json
import random
import re
import time

import scrapy
from fake_useragent import UserAgent
from scrapy import Request

from cars.CONSTANT import China, ChinaImport, USA, Canada, Mexico, European
from cars.items import CarStyleItem
from cars.log_utils import SelfLog
from cars.utils import Mysqlpython, set_redis, deal_year, deal_style, deal_displacement, deal_updatetime, \
    deal_guideprice

cookie_r = set_redis(2)
type_r = set_redis()
dbhelper = Mysqlpython()
set_url_r = set_redis(4)

cookies_car168 = "cookies_car168"
unuseless_cookies_car168 = "unuseless_cookies_car168"
url_redis = "car168"
url_redis_car168 = "car168_urls"

class Car168Spider(scrapy.Spider):
    name = 'car168'
    allowed_domains = ['www.chehang168.com']
    start_urls = [
        'http://www.chehang168.com/',
        'http://www.chehang168.com/index.php?c=index&m=allBrands',
        'http://www.chehang168.com/index.php?c=index&m=Cardata',
    ]
    selflog = SelfLog(name)
    
Example 9
url = "http://www.niuniuqiche.com/v2/sell_cars?brand_name=%E5%A5%A5%E8%BF%AA&car_model_name=%E5%A5%A5%E8%BF%AAA3&firm_name=%E4%B8%80%E6%B1%BD-%E5%A4%A7%E4%BC%97%E5%A5%A5%E8%BF%AA"
# url = "http://www.chehang168.com/index.php?c=index&m=index"
# result = requests.get(url, headers=headers)
# tree = etree.HTML(result.text)
# print("****"+tree.xpath('//div[@class="section-pagination"]//span[@class="page current"]/text()')[0].strip()+"*****")
# print("****"+tree.xpath('//div[@class="section-pagination"]//span[@class="last"]/a/@href')[0].split('page=')[1]+"*****")

# for i in r.zscan("test")[1]:
#     mapping = {}
#     mapping[i[0]] = i[1]
#     r.zadd("cookies_car168", mapping)

# dictss = {
#         "CzcyAutoLogin":"******",
#     }

r = set_redis(2)
# for i in r.zscan("unuseless_cookies_car168")[1]:
#     # mapping = {}
#     # mapping[i[0]] = i[1]
#     # print(mapping)
#     phone_num = str(int(i[1]))
#     print(phone_num)
#     r.sadd("car168_phonenum", phone_num)

if __name__ == '__main__':
    strs = "奥迪 A8L 17款 6.3TSFI W12 旗"
    print(
        re.search(
            "款\s.*?(\d+\.\d+i|\d+i|\d+\.\d+L[a-z]{0,1}|\d+L[a-z]{0,1}|\d+\.\d+T[A-Z]{0,3}|\d+T[A-Z]{0,3}|\d+ T[A-Z]{0,3})",
            strs).group(1))
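
The regex in the __main__ block pulls out the displacement token that follows the model-year marker 款 (for the sample string it yields 6.3TSFI). Below is a small wrapper for illustration only; the function name is hypothetical, and the project's own cars.utils module already exports a deal_displacement helper that these snippets import but never show:

import re

# Same pattern as the snippet above, precompiled; split across two raw strings for readability.
DISPLACEMENT_RE = re.compile(
    r"款\s.*?(\d+\.\d+i|\d+i|\d+\.\d+L[a-z]{0,1}|\d+L[a-z]{0,1}"
    r"|\d+\.\d+T[A-Z]{0,3}|\d+T[A-Z]{0,3}|\d+ T[A-Z]{0,3})")


def extract_displacement(style_text):
    # Return the displacement token (e.g. "6.3TSFI"), or None when nothing matches.
    match = DISPLACEMENT_RE.search(style_text)
    return match.group(1) if match else None


print(extract_displacement("奥迪 A8L 17款 6.3TSFI W12 旗"))  # -> 6.3TSFI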
Example 10
File: nnqc.py Project: ma-qing/cars
import random
import re

import scrapy
from fake_useragent import UserAgent
from scrapy import Request

# Account: 17061084088
# Password: ma123456
from cars.CONSTANT import USA, Canada, Mexico, European, China, ChinaImport
from cars.items import CarStyleItem, CarDetailItem
from cars.log_utils import SelfLog
from cars.settings import BASE_DIR
from cars.utils import Mysqlpython, set_redis, deal_style, deal_year, deal_displacement, deal_guideprice

r_zet_cookie = set_redis(2)
set_url_r = set_redis(4)
dbhelper = Mysqlpython()
cookies_nnqc = "cookies_nnqc"
unuseless_cookies_nnqc = "unuseless_cookies_nnqc"
url_redis = "nnqc"
url_redis_nnqc = "nnqc_urls"


class NnqcSpider(scrapy.Spider):
    name = 'nnqc'
    allowed_domains = ['www.niuniuqiche.com']
    start_urls = [
        'http://www.niuniuqiche.com/', 'http://www.niuniuqiche.com/v2/brands'
    ]
    i = 0
Example 11
class DealDataMiddleware(object):
    add_url_r = set_redis(db=1)
    cookies_deal_r = set_redis(db=2)
    set_url_r = set_redis(4)

    # Pick a proxy
    def get_proxy(self):
        with open(os.path.join(BASE_DIR, "proxies.txt"), "r") as f:
            date = f.read().splitlines()
            proxy = random.choice(date)
            return proxy

    # # Build request headers
    # def get_headers(self, request, spider):
    #     # sql = "select ua from useragent order by rand() limit 1;"
    #     cookie_dict = self.get_cookies(request, spider)
    #     cookie = ""
    #     for k, v in cookie_dict.items():
    #         cookie = cookie + k + "=" + v +";"
    #     print("得到的cookie值", cookie)
    #
    #     headers = {
    #         # "User-Agent": dbhelper.readall(sql)[0][0],
    #         "User-Agent": UserAgent().random,
    #         "Content-Type": "text/html; charset=utf-8",
    #         "Host": "www.niuniuqiche.com",
    #         "Cookie": cookie
    #     }
    #
    #     return headers

    def process_request(self, request, spider):
        selflog = SelfLog(spider.name)
        # Handle proxy and cookie setup per spider
        if spider.name == "nnqc":
            request.meta['http_proxy'] = self.get_proxy()
            print("{}使用的代理为{},请求url:{}".format(spider.name,
                                               request.meta['http_proxy'],
                                               request.url))
        elif spider.name == "chezhen":
            request.meta['http_proxy'] = self.get_proxy()
            print("{}使用的代理为{},请求url:{}".format(spider.name,
                                               request.meta['http_proxy'],
                                               request.url))
        elif spider.name == "car168":
            pass

        # # Attach cookies
        request.cookies = self.get_cookies(request, spider)
        # cookie_dict = self.get_cookies(request, spider)
        # cookie = ""
        # for k, v in cookie_dict.items():
        #     cookie = cookie + k + "=" + v +";"
        # request.headers["User-Agent"] = UserAgent().random
        # request.headers["Content-Type"] = "text/html; charset=utf-8"
        # request.headers["Cookie"] = cookie

        # url = request.url
        # redis_key = request.meta['url_redis']
        # if self.add_url_r.sismember(redis_key, url):
        #     spider.logger.info("该url已经爬取,舍弃:%s"%url)
        #     raise IgnoreRequest
        return None

    def process_response(self, request, response, spider):
        selferrorlog = SelfLog('error')
        selfinfolog = SelfLog(spider.name)
        # Move the cookie from the usable store to the unusable one
        cookie_redis_key_hash = request.meta['cookies_redis']
        cookie_redis_key = request.meta['cookies_redis'] + "_list"
        unuse_cookie_redis_key = request.meta['useless_cookies']
        if response.status == 302:
            print(response.text)
            selferrorlog.logger.error(
                "{spidername} - blocked, 302 redirect to the login page, cookie: {cookie}".format(
                    spidername=spider.name, cookie=request.cookies))
            request = self.dealcookie(request, response, spider)
            return request
        elif "c=com&m=limitPage" in response.text:
            selferrorlog.logger.error(
                "{spidername} - redirected to the rate-limit page, cookie: {cookie}".format(
                    spidername=spider.name, cookie=request.cookies))
            request = self.dealcookie(request, response, spider)
            return request
        elif "请重新登录" in response.text:
            selferrorlog.logger.error(
                "{spidername} - cookie {cookies} expired or IP mismatch, landed on the login page".format(
                    spidername=spider.name, cookies=request.cookies))
            request = self.dealcookie(request, response, spider)
            return request
        selfinfolog.logger.info("请求url:{url}使用的cookie:{cookie}".format(
            url=response.url, cookie=request.cookies))
        return response

    # Handle expired or banned cookies
    def dealcookie(self, request, response, spider):
        selflog = SelfLog('error')
        cookie_redis_key_hash = request.meta['cookies_redis']
        cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
        unuse_cookie_redis_key = request.meta['useless_cookies']

        redis_member = json.dumps(request.cookies)

        # Look up the phone number stored in the hash
        zset_phone = self.cookies_deal_r.hget(cookie_redis_key_hash,
                                              redis_member)
        # Remove the entry from the list and the usable hash, then add it to the unusable hash
        self.cookies_deal_r.lrem(cookie_redis_key_list, 0, redis_member)
        self.cookies_deal_r.hdel(cookie_redis_key_hash, redis_member)
        self.cookies_deal_r.hset(unuse_cookie_redis_key, redis_member,
                                 zset_phone)
        # Pop another cookie from Redis and rebuild the request with it
        try:
            popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
            self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)

        except Exception as e:
            selflog.logger.error("{spidername}--cookie 耗尽请补充, 错误信息:{e}".format(
                spidername=spider.name, e=e))
            # 发送邮件通知,并且最好处理能关闭爬虫
            sendEmail(content="{cookname}cookie耗尽,请尽快处理".format(
                cookname=cookie_redis_key_list))
            spider.crawler.engine.close_spider(
                spider, "{cookname}cookie耗尽,关闭爬虫".format(
                    cookname=cookie_redis_key_list))

        else:
            request.cookies = json.loads(popcookie)
            return request

    def get_cookies(self, request, spider):
        selflog = SelfLog('error')
        cookie_redis_key_list = request.meta['cookies_redis'] + "_list"
        cookie_redis_key_hash = request.meta['cookies_redis']
        unuse_cookie_redis_key = request.meta['useless_cookies']
        try:
            # cookies_dict = random.choice(self.cookies_deal_r.zscan(cookie_redis_key)[1])
            # Pop a cookie from the head of the list and push it back to the tail (round-robin)
            popcookie = self.cookies_deal_r.lpop(cookie_redis_key_list)
            self.cookies_deal_r.rpush(cookie_redis_key_list, popcookie)
        except Exception as e:
            selflog.logger.error(
                "spidername: {spidername} cookies exhausted, please replenish, error: {e}".format(
                    spidername=spider.name, e=e))
            # Send an email notification; ideally this should also shut the spider down
            sendEmail(content="{cookname} cookies exhausted, please handle it ASAP".format(
                cookname=cookie_redis_key_list))
            spider.crawler.engine.close_spider(
                spider, "{cookname} cookies exhausted, closing spider".format(
                    cookname=cookie_redis_key_list))
        else:
            dicts = json.loads(popcookie)
            phonenum = self.cookies_deal_r.hget(cookie_redis_key_hash,
                                                popcookie)
            print("{cookie_redis}--手机号:{phonenum}--cookie:{cookie}".format(
                cookie_redis=cookie_redis_key_hash,
                phonenum=phonenum,
                cookie=dicts))
            return dicts
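
DealDataMiddleware reads the Redis key names from request.meta ('cookies_redis', 'useless_cookies'), but none of the snippets show where that meta is attached or how the middleware is enabled. The following is only a hedged sketch of both pieces, reusing the car168 constants from Example 8; the middleware module path, the priority value, and the start_requests override are assumptions:

# settings.py (assumed registration; "cars.middlewares" is not confirmed by the snippets)
DOWNLOADER_MIDDLEWARES = {
    "cars.middlewares.DealDataMiddleware": 543,
}

# Hypothetical start_requests inside Car168Spider, attaching the meta the middleware expects.
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url,
            meta={
                "cookies_redis": cookies_car168,              # hash: cookie json -> phone number
                "useless_cookies": unuseless_cookies_car168,  # hash of burned cookies
            },
            callback=self.parse,
        )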
Example 12
import json
import os
import random
import re
import time
from multiprocessing import Process, Pool

import requests
from fake_useragent import UserAgent
from requests.utils import get_encoding_from_headers

from cars.login import signup, get_auth
from cars.settings import BASE_DIR
from cars.utils import set_redis

cookie_r = set_redis(2)
username = "******"
password = "******"
# Niuniu Auto (nnqc) signup project id
projectid_nnqc = "17883"
cookies_nnqc = "cookies_nnqc"

name_list = [
    "裴玉",
    "陈英",
    "赵兵",
    "9442",
    "戴国强",
    "陶洪万",
    "朱洪纯",
    "徐亚玲",
Example 13
import requests

from cars.utils import Mysqlpython, set_redis

r = set_redis()
url = "http://www.chehang168.com/index.php?c=index&m=Cardata"
headers = {
        "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
        "Host": "www.chehang168.com",
        # "Cookie": "DEVICE_ID=77cd83d79707a7fb386f712f2bef8db0; _uab_collina=158894954478649638062081; soucheAnalytics_usertag=RI8C9Ol6w8; U=1769515_8fb8933d8591166ea0b616db963ba427"
        "Cookie":"DEVICE_ID=77cd83d79707a7fb386f712f2bef8db0; _uab_collina=158894954478649638062081; soucheAnalytics_usertag=RI8C9Ol6w8; U=1769515_8fb8933d8591166ea0b616db963ba427"

    }
# cookeis = {
# "DEVICE_ID":"77cd83d79707a7fb386f712f2bef8db0",
# "_uab_collina":"158894954478649638062081",
# "soucheAnalytics_usertag":"RI8C9Ol6w8",
# "U":"1769515_8fb8933d8591166ea0b616db963ba427",
#
# }
dbhelper = Mysqlpython()


def set_data_db():
    result = requests.get(url=url, headers=headers)
    dicts = eval(result.text[14:])  # skip the first 14 characters of the response body, then eval the remaining dict literal
    for brand_encode, v in dicts.items():
        print("编码品牌名", brand_encode)
        for index,(i,m) in enumerate(v.items()):
            # 第一层记录了名字
            if index == 0: