Esempio n. 1
0
    def __init__(self, user):
        """Prepare the HTTP/Redis helpers and build the Baidu lookup URL.

        The URL targets ``sp0.baidu.com`` with the person's ID-card number,
        name and area spliced in as query parameters (the fixed ``query``
        parameter is the percent-encoded form of "失信被执行人名单").

        :param user: mapping-like request object; the keys ``name``,
            ``idcard``, ``area``, ``token``, ``userid`` and ``result`` are
            read and each falls back to ``""`` when missing.
        """
        # Collaborators and crawl bookkeeping.
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        # The search fields are percent-quoted because they are embedded
        # verbatim in the query string below.
        fetch = user.get
        quote = urllib.quote
        self.username = quote(fetch("name", ""))
        self.idcard = quote(fetch("idcard", ""))
        self.area = quote(fetch("area", ""))
        self.token = fetch("token", "")
        self.userid = fetch("userid", "")

        def millis():
            # Current wall-clock time in whole milliseconds, as text.
            return str(int(round(time.time() * 1000)))

        # The clock is read three separate times (t, cb suffix, _), exactly
        # one read per placeholder, in left-to-right order.
        self.LoginUrl = "".join([
            "https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?",
            "resource_id=6899&query=%E5%A4%B1%E4%BF%A1%E8%A2%AB%E6%89%A7%E8%A1%8C%E4%BA%BA%E5%90%8D%E5%8D%95&cardNum=",
            self.idcard,
            "&iname=",
            self.username,
            "&areaName=",
            self.area,
            "&ie=utf-8&oe=utf-8&format=json&t=",
            millis(),
            "&cb=jQuery110207690611877233657_",
            millis(),
            "&_=",
            millis(),
        ])

        # Result holders filled in by later crawl steps.
        self.result = fetch("result", "")
        self.GJJInfo = []
        self.PerInfo = {}
        self.PayRecord = {}
Esempio n. 2
0
    def __init__(self, user):
        """Collect the search filters for a query against newhouse.cnnbfdc.com.

        :param user: mapping-like request object. ``title``, ``project``,
            ``date_filter_min``, ``date_filter_max``, ``token``, ``userid``
            and ``result`` fall back to ``""`` when missing;
            ``project_district`` falls back to ``None``.
        """
        # Collaborators and crawl bookkeeping.
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        fetch = user.get

        # The title is UTF-8 encoded first and then percent-quoted; the other
        # filters are stored exactly as received.
        self.title = urllib.quote(fetch("title", "").encode('utf8'))
        self.project_district = fetch("project_district")
        self.project_developer_name_value = fetch("project", "")
        self.date_filter_min = fetch("date_filter_min", "")
        self.date_filter_max = fetch("date_filter_max", "")

        self.token = fetch("token", "")
        self.userid = fetch("userid", "")

        # Site root used as the starting URL.
        self.LoginUrl = "https://newhouse.cnnbfdc.com"

        # Result holders filled in by later crawl steps.
        self.result = fetch("result", "")
        self.GJJInfo = []
        self.bild = []
Esempio n. 3
0
    def __init__(self, user):
        """Store the invoice fields and build the verification-code URL.

        The first four characters of ``fpdm`` select a base URL from the
        module-level ``AREA`` table (``""`` when there is no entry); the
        ``/WebQuery/yzmQuery`` endpoint is appended to that base.

        :param user: mapping-like request object; ``fpjy``, ``fpdm``,
            ``fphm``, ``kprq``, ``fpje``, ``token``, ``userid`` and
            ``result`` are read and each falls back to ``""`` when missing.
        """
        # Collaborators and crawl bookkeeping.
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        fetch = user.get
        self.fpjy = fetch("fpjy", "")
        self.fpdm = fetch("fpdm", "")
        self.fphm = fetch("fphm", "")
        self.kprq = fetch("kprq", "")
        self.fpje = fetch("fpje", "")
        self.token = fetch("token", "")
        self.userid = fetch("userid", "")

        def millis():
            # Current wall-clock time in whole milliseconds, as text.
            return str(int(round(time.time() * 1000)))

        # Leading four characters of the invoice code pick the base URL.
        self.fpdm_area = self.fpdm[:4]
        self.fpdm_url = AREA.get(self.fpdm_area, "")
        self.suiji = millis()

        # Pieces are evaluated left to right: one random draw, then two
        # fresh clock reads (nowtime and the trailing "_" parameter).
        self.codeUrl = "".join([
            self.fpdm_url,
            '/WebQuery/yzmQuery?callback=jQuery110204713398352365614_',
            self.suiji,
            '&fpdm=',
            self.fpdm,
            '&r=',
            '%.16f' % random.random(),
            '&v=V1.0.04_001',
            '&nowtime=',
            millis(),
            '&publickey=B8EE27C2CFEABABBD1DB92F4D84E4EA3&_=',
            millis(),
        ])

        # Result holders filled in by later crawl steps.
        self.result = fetch("result", "")
        self.GJJInfo = []
        self.PerInfo = {}
Esempio n. 4
0
 def __init__(self):
     """
     Initialise the worker settings.

     Copies the module-level thread-pool settings onto the instance and
     creates the Redis helper. Takes no arguments and returns nothing.
     """
     # Thread count and request-queue size, taken from module constants.
     self.thread_num, self.thread_q_size = THREAD_NUM, THREAD_Q_SIZE
     # Redis helper object.
     self.redisUtils = RedisUtils()
Esempio n. 5
0
    def __init__(self, user):
        """Translate the request's search filters for esf.nb.fang.com.

        Each filter (age, floor, layout) is mapped through a module-level
        lookup table. Two request keys are accepted per filter; the second
        is consulted only when the first yields a falsy result.

        :param user: mapping-like request object; reads ``keyword``,
            ``age``/``year``, ``flower``/``floor``, ``hu_type``/``housetype``,
            ``token`` and ``result``, each falling back to ``""``.
        """
        def translate(table, primary, alias):
            # Look up the primary request field; fall back to its alias only
            # when the primary lookup produced nothing truthy.
            return (table.get(user.get(primary, ''), '')
                    or table.get(user.get(alias, ''), ''))

        # Collaborators and crawl bookkeeping.
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""

        self.keyword = user.get("keyword", "")
        self.age = translate(FANGAGE, 'age', 'year')
        self.token = user.get("token", "")
        self.flower = translate(LOUCENG, 'flower', 'floor')
        self.hu_type = translate(HUXING, 'hu_type', 'housetype')

        # Search endpoint and site root.
        self.startUrl = "http://esf.nb.fang.com/NewSecond/sale_info/searchlist_new2014.aspx"
        self.hostUrl = "http://esf.nb.fang.com/"

        # Result holders filled in by later crawl steps.
        self.result = user.get("result", "")
        self.GJJInfo = []
Esempio n. 6
0
def threadWork(t):
    """
    Process-level worker: hand every pending spider task to a thread pool.

    Task keys are read from Redis via ``RedisUtils.getCons()``. For each key
    matching ``spider_<name>:task`` one ``imptask(<name>)`` request is queued.
    When nothing is pending the function sleeps for one second and returns.

    Changes from the previous version:
      * a key that does not match the pattern is logged and skipped instead
        of aborting the whole batch;
      * requests already queued are always waited for, even if a later key
        fails;
      * an empty or ``None`` task list is treated as "nothing to do";
      * the pool (and its worker threads) is only created when there is work.

    :param t: unused by this function; kept for the caller's signature.
    :return: None
    """
    redisUtils = RedisUtils()
    dict_json = redisUtils.getCons()
    logger.debug("Now have tasks -> " + str(dict_json))

    if not dict_json:
        # Nothing pending: back off briefly so the caller does not spin.
        time.sleep(1)
        return

    thread_pool = ThreadPool(THREAD_NUM * 10, q_size=THREAD_Q_SIZE)
    try:
        try:
            for task_key in dict_json:
                found = re.search('spider_(.*):task', task_key)
                if found is None:
                    # Not a spider task key; skip it rather than lose the
                    # rest of the batch.
                    logger.error("Unrecognized task key -> %r" % (task_key,))
                    continue
                # Local name deliberately avoids shadowing a `requests`
                # module that the file may import.
                work_requests = makeRequests(imptask, [found.group(1)])
                thread_pool.putRequest(work_requests[0])
        finally:
            # Block until everything that was queued has been processed.
            thread_pool.wait()
    except Exception as e:
        # Top-level boundary of the worker: log and return normally.
        logger.error(e)
Esempio n. 7
0
    def __init__(self, dict_json, key=None, verifycode_type='png'):
        """Set up shared state: Redis helper, logger, and verify-code file path.

        :param dict_json: task payload; its ``'token'`` entry is read
            directly, so a missing entry raises.
        :param key: value forwarded to ``self.mkUserdir``. Defaults to a new
            empty list per call (previously a single list object shared by
            every call that omitted the argument).
        :param verifycode_type: file extension used in the verify-code image
            file name.
        :raises KeyError: when ``dict_json`` is a dict without ``'token'``.
        """
        # A list literal in the signature is created once at definition time
        # and would be shared (and possibly mutated) across instances.
        if key is None:
            key = []

        self.redisUtils = RedisUtils()
        self.damatuWeb = damatuWeb
        self.PROXYADDR = PROXYADDR
        self.dict_json = dict_json
        self.token = self.dict_json['token']
        self.verifycode_type = verifycode_type
        self.status = None
        self.desc = None

        # Callable returning the current time in whole milliseconds as text;
        # the value at construction time is kept as the start time.
        self.current_milli_time = lambda: str(int(round(time.time() * 1000)))
        self.startTime = self.current_milli_time()

        # The verify-code image lives under "verifycode/" next to this source
        # file and is named after the start time.
        self.realpath = os.path.split(os.path.realpath(__file__))[0]
        filename = 'verifycode/%s_verifycode.%s' % (self.startTime,
                                                    verifycode_type)
        self.code_path = os.path.join(self.realpath, filename)

        # NOTE: this path is relative, so it resolves against the current
        # working directory, not against this file's directory.
        logging.config.fileConfig('unicom/logging.config')
        self.logger = logging.getLogger('flow')

        # Whether to keep the user directory; by default it is kept only when
        # the crawl fails, but the flag can also be changed manually.
        self.rmuserdirFlag = False
        self.mkUserdir(key)
Esempio n. 8
0
    def __init__(self, user):
        """Set up the crawler state and start a Chrome webdriver session.

        Builds the area-name -> gsxt.gov.cn portal URL table, picks the login
        URL for the requested area, creates the HTTP session and Redis helper,
        obtains a driver from ``self.get_webdriver("chrome")``, attaches helper
        methods to it and configures its timeouts.

        :param user: mapping-like request object; reads ``area``, ``idCard``,
            ``token`` and ``result``, each falling back to ``""``.
        """
        # Area name -> entry URL of that area's portal.
        self.url_area = {
            "北京": "http://bj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml",
            "天津": "http://tj.gsxt.gov.cn/index.html",
            "河北": "http://he.gsxt.gov.cn/notice/",
            "山西": "http://sx.gsxt.gov.cn/index.jspx",
            "内蒙古": "http://nm.gsxt.gov.cn:58888/",
            "辽宁": "http://ln.gsxt.gov.cn/saicpub/",
            "吉林": "http://jl.gsxt.gov.cn/",
            "黑龙江": "http://hl.gsxt.gov.cn/index.jspx",
            "上海": "http://sh.gsxt.gov.cn/notice",
            "江苏": "http://www.jsgsj.gov.cn:58888/province/",
            "浙江": "http://zj.gsxt.gov.cn/client/entsearch/toEntSearch",
            "安徽": "http://ah.gsxt.gov.cn/index.jspx",
            "福建": "http://fj.gsxt.gov.cn/notice",
            "江西": "http://jx.gsxt.gov.cn/",
            "山东": "http://sd.gsxt.gov.cn/",
            "广东": "http://gd.gsxt.gov.cn/",
            "广西": "http://gx.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml",
            "海南": "http://hi.gsxt.gov.cn/index.jspx",
            "河南": "http://ha.gsxt.gov.cn/index.jspx",
            "湖北": "http://hb.gsxt.gov.cn/index.jspx",
            "湖南": "http://hn.gsxt.gov.cn/notice/",
            "重庆": "http://cq.gsxt.gov.cn/",
            "四川": "http://sc.gsxt.gov.cn/notice/",
            "贵州": "http://gz.gsxt.gov.cn/",
            "云南": "http://yn.gsxt.gov.cn/notice/",
            "西藏": "http://xz.gsxt.gov.cn/index.jspx",
            "陕西": "http://sn.gsxt.gov.cn/ztxy.do?method=index&random=",
            "甘肃": "http://gs.gsxt.gov.cn/gsxygs/",
            "青海": "http://qh.gsxt.gov.cn/index.jspx",
            "宁夏": "http://nx.gsxt.gov.cn/",
            "新疆": "http://xj.gsxt.gov.cn/sydq/loginSydqAction!sydq.dhtml"
        }

        # Collaborators and crawl bookkeeping.
        self.session = requests.session()
        self.redisUtils = RedisUtils()
        self.PROXYADDR = PROXYADDR
        self.bcode = NOT_NEED_BCODE
        self.status = CRAWL_READY
        self.desc = ""
        self.area = user.get("area", "")

        # The search keyword is taken from the request's "idCard" field.
        self.keyword = user.get("idCard", "")
        self.token = user.get("token", "")
        # No area given -> national index page. An area that is given but is
        # not a key of url_area yields an empty LoginUrl.
        # NOTE(review): confirm downstream code handles the empty URL.
        self.LoginUrl = self.url_area.get(
            self.area,
            "") if self.area else "http://www.gsxt.gov.cn/index.html"
        self.result = user.get("result", "")
        self.GJJInfo = []
        self.br = self.get_webdriver("chrome")

        # Attach a DriverClean object to the driver, built from the current
        # time, the driver service's process id and the driver itself.
        # NOTE(review): presumably used to clean up stale driver processes -
        # confirm against DriverClean.
        self.br.dc = DriverClean(1, time.time(), self.br.service.process.pid,
                                 self.br)
        # Bind the module-level helpers get1 / find_element_by_xpath1 to this
        # driver instance. The three-argument MethodType call is the Python 2
        # form (presumably types.MethodType).
        self.br.get1 = MethodType(get1, self.br, webdriver.Chrome)
        self.br.find_element_by_xpath1 = MethodType(find_element_by_xpath1,
                                                    self.br, webdriver.Chrome)

        # Publish this instance's logger through the module-level
        # `globallogger` name.
        # NOTE(review): self.logger is not assigned anywhere in this method;
        # it must already exist (class attribute, base class, or set inside
        # get_webdriver) - confirm.
        global globallogger
        globallogger = self.logger

        # Disabled window-size and proxy setup, kept for reference:
        # self.br.maximize_window()
        # self.br.set_window_size(1300,900)
        # self.proxy = self._proxy()
        # proxy=webdriver.Proxy()
        # proxy.proxy_type=ProxyType.MANUAL
        # proxy.http_proxy=self.proxy
        # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS

        # proxy.add_to_capabilities(self.dcap)
        # self.br.start_session(self.dcap)
        # self.br.get('http://httpbin.org/ip')
        # print self.br.page_source

        # Explicit wait helper (arguments 10 and 0.5: timeout and poll
        # interval in seconds, per Selenium's WebDriverWait), followed by the
        # driver's page-load, script and implicit-wait timeouts.
        self.wait = WebDriverWait(self.br, 10, 0.5)
        self.br.set_page_load_timeout(10)
        self.br.set_script_timeout(15)
        self.br.implicitly_wait(10)