Example 1
 def __init__(self, info):
     SessionRequests.__init__(self)
     self.info = info
     self.select_user_agent('firefox')
     self._con = None
     self.onl = OnlineOCR(info['prov'])
     port = int(info['prov']) / 10000 + 8000
     #self.onl.server = "http://win.haohaogame.com:%d/codeocr" % port
     self.onl.server = "http://192.168.0.10:%d/codeocr" % port
     self.code = None
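For reference, the port arithmetic above relies on Python 2 integer (floor) division. A minimal sketch of the calculation, assuming a hypothetical six-digit province code such as "330000" (the real values of info['prov'] are not shown in this listing):

port = int("330000") / 10000 + 8000   # floor division in Python 2: 33 + 8000 = 8033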
Example 2
def test_fetch_paper(url):
    rq = SessionRequests()
    con = rq.request_url(url)
    print con.headers
    print con.cookies
    print con.text
    con = rq.request_url(url)
    print con.text
    print con.headers
    print con.cookies
Example 3
def check_yantian_cookies():
    sq = SessionRequests()
    con = sq.request_url(
        'http://www.shenpan.cn/cpws/writopenlist.aspx?typeString=')
    print con.headers
    print con.cookies
    # print con.content
    yt = YantianGenQueries()
    yt.get_form_values()
    yt.show()
Example 4
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self.request = SessionRequests()
     self.view_state = None
     self.event_valid = None
     self.rand = None
     self.loc = "浙江"
     self.data_file = FileSaver("浙江_data.txt")
     self.have_get_url_file = FileSaver("浙江_get_url.txt")
     self.init_already()
     self.login("38037395", "773950")
Example 5
def load_url_with_cookies(url, cookies):
    rq = SessionRequests()
    for c in cookies:
        rq.add_cookie(c['domain'],
                      c['name'],
                      c['value'],
                      path=c['path'],
                      secure=c['secure'])
    con = rq.request_url(url)
    if con:
        print con.text
        print con.cookies
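A minimal usage sketch for the helper above. The cookie dictionaries only need the keys the loop reads (domain, name, value, path, secure); the values below are hypothetical:

cookies = [
    {'domain': '.example.com', 'name': 'sid', 'value': 'abc123',
     'path': '/', 'secure': False},   # made-up cookie for illustration
]
load_url_with_cookies('http://www.example.com/', cookies)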
Example 6
 def init_req(self, proxies={}):
     req = getattr(self._curltls, "req", None)
     if req is None:
         req = SessionRequests()
         url = urls[random.randrange(0, len(urls))]
         while True:
             con_init = req.request_url(url, proxies=proxies)
             if con_init is None or con_init.code != 200:
                 print "...初始化失败..."
             else:
                 break
         setattr(self._curltls, "req", req)
         setattr(self._curltls, "init_url", url)
     return req
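The snippet above assumes a per-instance thread-local container (self._curltls) and a module-level urls list, neither of which appears in this listing. A minimal sketch of what they might look like, with placeholder URLs:

import threading
import random

urls = ["http://www.example.com/a", "http://www.example.com/b"]   # placeholder entry pages

class SomeCrawler(object):   # hypothetical owner of init_req()
    def __init__(self):
        # one SessionRequests per worker thread, created lazily by init_req()
        self._curltls = threading.local()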
Example 7
 def get_session_request(self):
     sr = SessionRequests()
     with self.locker:
         if not isinstance(self.networker.sp_proxies, dict) or len(self.networker.sp_proxies.keys()) == 0:
             return sr
         if self.networker._auto_change_proxy:
             prs = self.networker.sp_proxies.keys()
             for i in range(0, len(prs)):
                 self.networker._cur_proxy_index = (self.networker._cur_proxy_index+1) % len(prs)
                 selproxy = prs[self.networker._cur_proxy_index]
                 if self.networker.sp_proxies.get(selproxy, 0) <= 10:
                     sr.set_proxy(selproxy, index=0, auto_change=False)
                     break
         elif self.networker._cur_proxy_index < 0:
             pass
             # don't auto change proxy, and the index < 0, no proxy is used.
             # but don't report an error.
         else:
             prs = self.networker.sp_proxies.keys()
             selproxy = prs[self.networker._cur_proxy_index % len(prs)]
             sr.set_proxy(selproxy, index=0, auto_change=False)
     return sr
Example 8
def test_post_list():
    ll = []
    with open('jobs', 'r') as f:
        for s in f:
            ll.append(s)
    l = ll[3]
    print l
    v = eval(l.encode('utf-8'))
    if not isinstance(v, dict):
        return
    p = copy.deepcopy(v['param'])
    if not isinstance(p, dict):
        return
    print p.keys()
    # p['__EVENTARGUMENT'] = ''
    p['hdPageIndex'] = '10'
    # p.pop('hdPageIndex')
    # p.pop('__VIEWSTATE')
    if '__LASTFOCUS' in p:
        p.pop('__LASTFOCUS')
    # p.pop('__EVENTVALIDATION')
    p['__EVENTTARGET'] = 'btnNext'

    print len(p)
    rq = SessionRequests()
    # con = rq.request_url(v['url'])
    # if con and con.text:
    #     m = re.search(r'<title.*', con.text)
    #     if m:
    #         print m.group()
    #     else:
    #         print len(con.text)
    data = urlencode(p)
    con = rq.request_url(v['url'], data=p)
    print p
    if con and con.text:
        m = re.search(r'<title.*', con.text)

        print con.text
Example 9
    def test_post(self):
        req = SessionRequests()
        res = req.request_url(self.main_url + 'update',
                              data={
                                  'encrypt':
                                  'LuJAxGaUMqnDOARGzY9zIe0Rd41opkL7',
                                  'key': 'mumas',
                                  'value': '192.168.1.251'
                              })

        print res.headers
        print res.text
        res = req.request_url(self.main_url + 'update',
                              data={
                                  'encrypt':
                                  'LuJAxGaUMqnDOARGzY9zIe0Rd41opkL7',
                                  'key': 'mumaas',
                                  'value': '192.168.1.251'
                              })

        print res.headers
        print res.text
Example 10
class check(object):
    def __init__(self):
        self.sessionReq = SessionRequests()
        self.sessionReq.load_proxy('../_51job/proxy')

    def start(self):
        print 'begin crawler!'
        url = r"http://gaokao.chsi.com.cn/zsjh/search.do?ccdm=&jhxzdm=&kldm=&method=majorList&sySsdm=11&year=2016&yxdm=10001"
        header = {
            "User-Agent":
            r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0",
            "Host": r"gaokao.chsi.com.cn",
            "Referer": r"http://gaokao.chsi.com.cn/zsjh/"
        }
        while True:
            i = random.randint(1, 60)
            print i
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                print "Nothing back!readd job"
                time.sleep(i)
                continue
            if r"302 Found" in con.text:
                print "302 Found!"
                time.sleep(i)
                continue
            if r"403 Forbidden" in con.text:
                print "403 Forbidden!"
                time.sleep(i)
                continue
            if "专业名称" not in con.text or "计划数" not in con.text:
                print "阳光高考还没更新!"
                time.sleep(i)
                continue
            if "专业名称" in con.text and "计划数" in con.text:
                print "阳光高考已经更新!"
                break
        spider.util.sendmail(['*****@*****.**'], '阳光高考可以爬了!', "请迅速去抓该网站!")
Example 11
    def test_session():
        cnt = 10
        random.seed(int(time.time()))
        threads = []
        req = SessionRequests()
        for i in range(0, cnt):
            t = threading.Thread(target=_session_test, args=(req, i))
            threads.append(t)
        for t in threads:
            t.start()

        time.sleep(5)
        for t in threads:
            t.join()
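The threads above target a _session_test worker that is not included in this listing. A minimal sketch of such a worker, sharing one SessionRequests instance across threads (the URL is a placeholder):

def _session_test(req, idx):
    # every thread issues a request through the shared session object
    con = req.request_url("http://www.example.com/")   # placeholder URL
    if con is not None:
        print idx, len(con.text)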
Example 12
def get_code(image_content):
    retry = 3
    while retry >= 0:
        try:
            # Lianzhong account and password
            user = "******"
            pwd = "15004622415"

            # software key, or the author's account
            soft_key = "fpTof8NxP4FTFOTp6Tfi3ik6TxtfEOkE68nE3foK"
            s = SessionRequests()
            lz = Lianzhong(s, user, pwd, soft_key)

            vcode, vid = lz.create(image_content)
            return vcode
        except:
            traceback.print_exc()
            retry -= 1
    # give up only after all retries are exhausted
    raise RuntimeError()
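Example 15 below references a helper like this in a comment, passing it the raw bytes of a downloaded captcha image. A minimal usage sketch, with a placeholder captcha URL:

s = SessionRequests()
res = s.request_url("http://www.example.com/code.ashx")   # placeholder captcha endpoint
if res is not None and res.code == 200:
    print get_code(res.content)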
Example 13
class X315Spider(Spider):
    def init_req(self):
        self.sesnreq.load_proxy("../gongshangju/proxy_all.txt")
        self.sesnreq.select_user_agent("firefox")
        self.sesnreq.request_url("http://www.x315.com/index")
        con = self.sesnreq.request_url(
            "http://s4.cnzz.com/stat.php?id=1256666136&show=pic1")
        jsessionid = con.cookies[0].split("\t")[-1]
        jsscript = "var document={};var window = {};" + con.text + "console.log(document.cookie);"
        f = open("login.js", "w+b")
        f.write(jsscript)
        f.close()
        os.system("nodejs login.js>cookie.txt")
        f = open("cookie.txt", "r+b")
        cookiestr = f.read()
        self.cookiestr = urllib.unquote(
            re.search("(CNZZDATA.*?;)", cookiestr).group(1) + "JSESSIONID=" +
            jsessionid + ";")
        print self.cookiestr

    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        self.sesnreq = SessionRequests()
        self.sesnreq.load_proxy("../gongshangju/proxy1.txt", 0, False)
        self.sesnreq.select_user_agent("firefox")
        # self.init_req()

    def dispatch(self):
        currline = 0
        skipto = 0
        endline = 100000
        with open(os.environ["HOME"] + "/r1.txt", "rb") as f:
            while currline < skipto:
                line = f.readline()
                currline += 1
            while currline < endline:
                line = f.readline().strip()
                key = line.split(" ")[2]
                job = {
                    "type": "t1",
                    "key": key,
                    "line": line,
                    "lineno": currline
                }
                self.add_main_job(job)
                currline += 1
        self.add_main_job(None)

    def jobrunner1(self, job):
        con = self.sesnreq.request_url(
            "http://www.x315.com/",
            headers={"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"})
        # self.sesnreq.request_url("http://s4.cnzz.com/stat.php?id=1256666136&show=pic1")
        url = r"http://www.x315.com/quicksearch?qk=%s&t=1&z=&timestamp=%s" % (
            "富士康", str(time.time()).split(".")[0] +
            str(time.time()).split(".")[1][:3])
        header = {
            "Accept":
            r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Connection": "Keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Referer": "http://www.x315.com/",
            "X-Requested-With": "XMLHttpRequest"
        }
        con = self.sesnreq.request_url(url, headers=header)
        print con.text
        if ur"查询过于频繁" in con.text:
            if not self.re_add_job(job):
                Log.error("readd job failed.==>" + utf8str())
            Log.info("查询过于频繁,sleep 10 s.")
            time.sleep(10)
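The millisecond timestamp built above from two str(time.time()) slices can be written more directly; a small equivalent sketch:

import time
ts_ms = str(int(time.time() * 1000))   # 13-digit millisecond timestamp
url = r"http://www.x315.com/quicksearch?qk=%s&t=1&z=&timestamp=%s" % ("富士康", ts_ms)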
Example 14
 def __init__(self):
     self.sessionReq = SessionRequests()
     self.sessionReq.load_proxy('../_51job/proxy')
Example 15
class GK66(Spider):
    """这是使用自己的多线程框架重构的gk66网站爬取代码,没有针对所有省份进行处理,只能针对特定省份的账号密码进行爬取,验证码也是手动的,未进行处理"""
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.request = SessionRequests()
        self.view_state = None
        self.event_valid = None
        self.rand = None
        self.loc = "浙江"
        self.data_file = FileSaver("浙江_data.txt")
        self.have_get_url_file = FileSaver("浙江_get_url.txt")
        self.init_already()
        self.login("38037395", "773950")

    def init_already(self):
        cnt = 0
        with open("浙江_get_url.txt") as f:
            for line in f:
                line = line.strip()
                have_get_url.add(line)
                cnt += 1
        print "初始化已经爬过的链接 ", cnt

    def login(self, username, password):
        # captcha
        captcha_url = "http://14wj.gk66.cn/ashx/code.ashx"
        rand = None
        while True:
            res = self.request.request_url(captcha_url)
            if res is not None and res.code == 200:
                spider.util.FS.dbg_save_file("captcha.jpeg", res.content)
                rand = raw_input("login ---> 请输入验证码:")  #get_code(res.text) #
                if rand == "retry":
                    continue
                #break

                login_url = "http://14wj.gk66.cn/ashx/login.ashx"
                data = {
                    "username": username,
                    "password": password,
                    "rand": rand,
                    "rempass": "******"
                }
                while True:
                    con = self.request.request_url(login_url, data=data)
                    if con is None or con.code != 200:
                        continue
                    content = con.content
                    r_type = eval(content)[0]["type"]
                    if r_type == "2":
                        print "登陆失败:", spider.util.utf8str(content)
                        if u"验证码错误".encode("gb2312") in content:
                            break
                        continue
                    else:
                        self.view_state, self.event_valid = self.prepare_param(
                        )
                        self.rand = self.prepare_rand()
                        if self.view_state is None or self.event_valid is None or self.rand is None:
                            print "---未获取到view_state event_valid rand ---"
                            continue
                        print "登陆成功..."
                        return
                continue
            else:
                print "login 获取验证码图片失败,", "res is None" if res is None else "res.code = %d" % res.code

        # login_url = "http://14wj.gk66.cn/ashx/login.ashx"
        # data = {"username": username,
        #         "password": password,
        #         "rand": rand,
        #         "rempass": "******"}
        # while True:
        #     con = self.request.request_url(login_url, data=data)
        #     if con is None or con.code != 200:
        #         continue
        #     content = con.content
        #     r_type = eval(content)[0]["type"]
        #     if r_type == "2":
        #         print "登陆失败:", spider.util.utf8str(content)
        #         continue
        #     else:
        #         self.view_state, self.event_valid = self.prepare_param()
        #         self.rand = self.prepare_rand()
        #         if self.view_state is None or self.event_valid is None or self.rand is None:
        #             print "---未获取到view_state event_valid rand ---"
        #             continue
        #         print "登陆成功..."
        #         break

    def prepare_param(self):
        search_url = "http://14wj.gk66.cn/wj/fs.aspx"
        res = self.request.request_url(search_url)
        if res is not None and res.code == 200:
            fs_page = res.content
            soup = BeautifulSoup(fs_page, 'html5lib')
            view_state = soup.find(attrs={'id': "__VIEWSTATE"}).get("value")
            event_valid = soup.find(attrs={
                'id': "__EVENTVALIDATION"
            }).get("value")
            print "view_state=", view_state, "event_valid=", event_valid
            return view_state, event_valid
        return None, None

    def prepare_rand(self):
        global FILE_NAME_1
        captcha_url_2 = "http://14wj.gk66.cn/ashx/codewj.ashx"
        while True:
            con = self.request.request_url(captcha_url_2)
            if con is None or con.code != 200:
                print "prepare_rand 请求错误...", "结果为空" if con is None else "http code = %d" % con.code
                continue
            spider.util.FS.dbg_save_file("captcha2.jpeg", con.content)
            rand = raw_input(
                "prepare_rand ---> 请输入验证码:")  #get_code(con.content)
            if rand == "retry":
                continue
            return rand

    def logout(self):
        logout_url = "http://www.gk66.cn/loginout.aspx"
        self.request.request_url(logout_url)

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty(
            ) or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        nf_list = ["14"]  #["06" , "07", "08", "09", "10", "11"]     # 年份
        wl_list = ["w", "l"]  # 文科 理科
        bz_list = ["b", "z"]  # 本科 专科
        for fs in range(732, 810):
            print "分数:", fs
            for nf in nf_list:
                for wl in wl_list:
                    for bz in bz_list:
                        data = {
                            "fs": fs,
                            "nf": nf,
                            "wl": wl,
                            "bz": bz,
                            "pc": "",
                            "ImageButton1.x": 98,
                            "ImageButton1.y": 13
                        }
                        #print "合成请求数据:", data
                        job = {"data": data}
                        self.add_main_job(job)
                        time.sleep(0.1)
        self.wait_q_breakable()
        self.add_job(None, True)

    def run_job(self, job):
        data = job["data"]
        data["__VIEWSTATE"] = self.view_state
        data["__EVENTVALIDATION"] = self.event_valid
        data["rand"] = self.rand
        retry = 3
        while retry >= 0:
            try:
                self.loop_exec(data)
                break
            except:
                traceback.print_exc()
                print "出错,sleep 1s"
                time.sleep(1)
                retry -= 1
                try:
                    self.logout()
                except:
                    pass
                self.login("38037395", "773950")

    def loop_exec(self, data):
        try:
            while True:
                url = self.build_search_url(data)
                if url != None:
                    break
                else:
                    self.login("38037395", "773950")
        except Exception as e:
            print "build_search_url failure ...", e
            return
        page_break = False
        last_v = {}
        if url in have_get_url:
            print "已经爬取,pass"
            return
        if "http://14wj.gk66.cn/login.aspx?" in url:
            raise RuntimeError()
        for page in range(1, 1000):
            if page_break:
                break
            exec_url = url + "&s=0&page=" + str(page)
            print "执行链接:", exec_url
            datas = self.get_score_data(exec_url, page_break=page_break)
            if len(datas) < 20:
                page_break = True
            for v in datas:
                if v is None:
                    page_break = True
                    break
                v["location"] = self.loc
                v["year"] = data["nf"]
                v["wl"] = data["wl"]
                v["bz"] = data["bz"]
                if (str(last_v) == str(v)):
                    page_break = True
                    break
                last_v = v
                k = {
                    "location": v["location"],
                    "school": v["school"],
                    "spec": v["spec"],
                    "batch": v["batch"],  # 批次
                    "score": v["score"],
                    "year": v["year"],
                    "wl": v["wl"],
                    "bz": v["bz"]
                }
                print v
                self.data_file.append(spider.util.utf8str(v))
                #store_score(k, v)
        self.recorde_spided(url)

    def recorde_spided(self, url):
        self.have_get_url_file.append(url)
        have_get_url.add(url)

    def get_score_data(self, data_url, page_break=False):
        try:
            page_content = None
            while True:
                res = self.request.request_url(data_url)
                if res is not None and res.code == 200:
                    page_content = res.content
                    if u"对不起,请先登录".encode("gb2312") in page_content:
                        self.logout()
                        self.login("38037395", "773950")
                        continue
                    break
                else:
                    print "获取页面出错>..", "res is None" if res is None else "res.code == %d " % res.code
                    continue
            datas = []
            if string.find(page_content, u"相近分数".encode("gb2312")) > 0:
                print "该页面没有数据"
                return datas
            soup = BeautifulSoup(page_content, 'html5lib')
            rows = soup.findAll("tr")
            if rows is not None and len(rows) > 0:
                if len(rows) != 20:
                    page_break = True
                for row in rows:
                    cols = row.findAll("td")
                    if cols is not None and len(cols) == 13:
                        data = {}
                        data["school"] = cols[0].getText()
                        data["spec"] = cols[1].getText()
                        data["rank"] = cols[2].getText()
                        data["score"] = cols[3].getText()
                        data["batch"] = cols[4].getText()
                        data["score_number"] = cols[5].getText()
                        data["spec_number"] = cols[6].getText()
                        data["high_score"] = cols[7].getText()
                        data["high_score_rank"] = cols[8].getText()
                        data["low_score"] = cols[9].getText()
                        data["low_score_rank"] = cols[10].getText()
                        data["average_score"] = cols[11].getText()
                        data["average_score_rank"] = cols[12].getText()
                        datas.append(data)
                return datas
            else:
                print "页面无内容:", page_content
        except Exception as e:
            print "get_score_data 发生异常", e
            return None

    def build_search_url(self, data):
        search_url = "http://14wj.gk66.cn/wj/fs.aspx"
        headers = {
            "User-Agent":
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .N ET4.0C; .NET4.0E)"
        }
        resp = self.request.request_url(search_url, data=data, headers=headers)
        headers = resp.headers
        m = re.search("Location:(.*)\r\n", headers)
        if m:
            location = m.group(1).strip()
            return "http://14wj.gk66.cn" + location
        else:
            return None
        #location = resp.headers["Location"]

    def store_score(self, value):
        print filename + ' being write-->', value
        obj = Score_gk66.objects(
            location=value["location"],
            year=value["year"],
            bz=value["bz"],
            wl=value["wl"],
            school=value['school'],
            spec=value['spec'],
            rank=value['rank'],
            score=value['score'],
            batch=value["batch"],
            score_number=value['score_number'],
            spec_number=value['spec_number'],
            high_score=value['high_score'],
            high_score_rank=value['high_score_rank'],
            low_score=value['low_score'],
            low_score_rank=value['low_score_rank'],
            average_score=value['average_score'],
            average_score_rank=value['average_score_rank']).no_cache().timeout(
                False).first()
        if not obj:
            obj = Score_gk66(location=value["location"],
                             year=value["year"],
                             bz=value["bz"],
                             wl=value["wl"],
                             school=value['school'],
                             spec=value['spec'],
                             rank=value['rank'],
                             score=value['score'],
                             batch=value["batch"],
                             score_number=value['score_number'],
                             spec_number=value['spec_number'],
                             high_score=value['high_score'],
                             high_score_rank=value['high_score_rank'],
                             low_score=value['low_score'],
                             low_score_rank=value['low_score_rank'],
                             average_score=value['average_score'],
                             average_score_rank=value['average_score_rank'])
            obj.save()
            self.num_count += 1
            print "保存成功:", value
        else:
            print u"数据已存在"

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0],
                                 msg)
Example 16
class YggaokaoRetry(Spider):
    def __init__(self, threadcnt):
        super(YggaokaoRetry, self).__init__(threadcnt)
        self.sessionReq = SessionRequests()

    def dispatch(self):
        f = open("prov_list", "r+b")
        currline = 0
        skipto = 0
        endline = 10000

        for line in f:
            currline += 1
            if currline >= skipto:
                sySsdm = line.split(" ")[0].strip()
                job = {
                    "sySsdm": sySsdm,
                    "year": "2014",
                    "start": 0,
                    "type": "u1"
                }
                self.add_main_job(job)
            if currline >= endline:
                break
        self.wait_q()
        self.add_job(None, True)

    def retry(self, con, job):
        if re.search(u'<h1>An error occurred.</h1>', con.text) or re.search(
                u'Tinyproxy was unable to', con.text):
            #should reload this page.
            if int(job["retrycnt"]) < 5:
                job["retrycnt"] = int(job["retrycnt"]) + 1
                self.add_job(job)
                return True
        return False

    def save_sch_list(self, job, res):
        with self.locker:
            fr = open("prov/" + job["sySsdm"] + ".txt", "r+b")
            f = open("prov/" + job["sySsdm"] + ".txt", "a+b")
            schlist = re.findall(
                r'<tr bgcolor="#FFFFFF" onMouseOver="this.style.background=\'#FFFFEE\'" onMouseOut=\"this.'
                r'style.background=\'#ffffff\'">(.*?)</tr>', res, re.S)

            for schinfo in schlist:
                schstr = ""
                tds = re.findall(r"<td.*?>(.*?)</td>", schinfo, re.S)
                if len(tds) == 0:
                    spider.runtime.Log.error(
                        job["sySsdm"] + ", start at " + str(job["start"]) +
                        " match error! No td tag! Readd..\n")
                    self.re_add_job(job)
                    ferr = open(
                        "errors/" + job["sySsdm"] + "_" + str(job["start"]) +
                        ".html", "w+b")
                    ferr.write(res)
                    ferr.close()
                    f.close()
                    return
                schnamelist = re.findall(r'dhtml">(.*?)</a>', tds[0], re.S)
                if len(schnamelist) == 0:
                    schnamelist = []
                    schnamelist.append(tds[0].strip())
                    if schnamelist[0] is "":
                        spider.runtime.Log.error(
                            job["sySsdm"] + ", start at " + str(job["start"]) +
                            " match error! No school name! Readd..\n")
                        self.re_add_job(job)
                        ferr = open(
                            "errors/" + job["sySsdm"] + "_" +
                            str(job["start"]) + ".html", "w+b")
                        ferr.write(res)
                        ferr.close()
                        f.close()
                        return
                schname = schnamelist[0]
                if schname in already_saved:
                    print 'skip...', schname, ", in", job["sySsdm"], ".txt"
                    fr.close()
                    f.close()
                    return
                schstr += schname
                if r'span985">985</span>' in tds[0]:
                    schstr += " 985"
                if r'span211">211</span>' in tds[0]:
                    schstr += " 211"
                if r'spanyan">研</span>' in tds[0]:
                    schstr += " 研"

                for i in range(len(tds))[1:(len(tds) - 1)]:
                    schstr += " " + tds[i]
                stucnt = re.findall(r"doDialog.*?\">(.*?)</a>",
                                    tds[len(tds) - 1], re.S)[0].strip()
                schstr += " " + stucnt
                f.write(schstr + "\n")
                f.flush()
            f.close()

    def save_sch_detail(self, job, res):
        if not os.path.exists("detail/" + job["sySsdm"]):
            os.makedirs("detail/" + job["sySsdm"])
        f = open("detail/" + job["sySsdm"] + "/" + job["yxdm"] + ".html",
                 "w+b")
        f.write(res)
        f.flush()
        f.close()

    def check_should_fetch(self, job):
        if (job["type"] is "u1"):
            return True
        else:
            if os.path.exists("detail/" + job["sySsdm"] + r"/" + job["yxdm"] +
                              ".html"):
                return False
            else:
                return True

    def run_job(self, job):
        if job["type"] is "u1":
            print "searching %s, start at %d" % (job["sySsdm"], job["start"])
            url = "http://gaokao.chsi.com.cn/zsjh/searchZsjh.do?ccdm=&jhxzdm=&kldm=&searchType=1&ssdm=&" \
                  "sySsdm=%s&year=%s&yxmc=&start=%d" % (job["sySsdm"], job["year"], job["start"])
            header = {
                "User-Agent":
                r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36",
                "Referer": r"http://gaokao.chsi.com.cn/zsjh/zsjh2014.jsp",
                "Origin": r"http://gaokao.chsi.com.cn",
                "X-Requested-With": r"XMLHttpRequest",
                "Pragma": r"no-cache"
            }
            con = self.sessionReq.request_url(url, headers=header)

            if con is None or con.text.strip() == "":
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " nothing return ! Readd..\n")
                time.sleep(10)
                self.re_add_job(job)
                return
            res = con.text
            if r"302 Found" in res:
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " 302 Found ! Readd..\n")
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return

            elif re.search(u'无查询结果', res):
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " no data! Readd..\n")
                self.re_add_job(job)
                return
            elif re.search(ur'<H1>错误</H1>', res):
                time.sleep(3)
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " error occur ! Readd..\n")
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            else:
                if int(job["start"]) is 0:
                    m = re.search(r"if \(Num > (\d+)", res)
                    if m:
                        pgcnt = int(m.group(1))
                        while pgcnt > 1:
                            jobb = {
                                "sySsdm": job["sySsdm"],
                                "year": job["year"],
                                "start": (pgcnt - 1) * 20,
                                "type": "u1"
                            }
                            self.add_job(jobb)
                            pgcnt -= 1
                    else:
                        spider.runtime.Log.error(job["sySsdm"] +
                                                 ", start at " +
                                                 str(job["start"]) +
                                                 " no more page! Readd!\n")
                        self.re_add_job(job)
                        return

                yxdms = re.findall(r"doDialog\('(\d+)'", res, re.S)
                if len(yxdms) == 0:
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                             str(job["start"]) +
                                             " no url! Readd.\n")
                    self.re_add_job(job)
                    return
                for yxdm in yxdms:
                    job2 = {
                        "yxdm": yxdm,
                        "sySsdm": job["sySsdm"],
                        "year": "2014",
                        "type": "u2"
                    }
                    if not self.check_should_fetch(job2):
                        print "skip...", job['sySsdm'], "/", job["yxdm"]
                    else:
                        self.add_job(job2)
                self.save_sch_list(job, res)
        elif job["type"] is "u2":
            url = r"http://gaokao.chsi.com.cn/zsjh/search.do?" \
                  r"ccdm=&jhxzdm=&kldm=&method=majorList&sySsdm=%s&year=%s&yxdm=%s" % (job["sySsdm"], job["year"], job["yxdm"])
            header = {
                "User-Agent":
                r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36"
            }
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                print "Nothing back!readd job" + job.__str__()
                time.sleep(10)
                self.re_add_job(job)
                return
            if r"302 Found" in con.text:
                spider.runtime.Log.error(job.__str__() +
                                         " 302 Found ! Readd..\n")
                self.re_add_job(job)
                return
            res = con.text
            self.save_sch_detail(job, res)
Example 17
 def __init__(self, threadcnt):
     Spider.__init__(self, threadcnt)
     self.sesnreq = SessionRequests()
     self.sesnreq.load_proxy("../gongshangju/proxy1.txt", 0, False)
     self.sesnreq.select_user_agent("firefox")
Example 18
 def __init__(self, threadcnt):
     super(Yggaokao, self).__init__(threadcnt)
     self.sessionReq = SessionRequests()
     self.sessionReq.load_proxy('proxy')
Example 19
class Yggaokao(Spider):
    def __init__(self, threadcnt):
        super(Yggaokao, self).__init__(threadcnt)
        self.sessionReq = SessionRequests()
        self.sessionReq.load_proxy('proxy')

    def dispatch(self):
        f = open("prov_list", "r+b")
        currline = 0
        skipto = 0
        endline = 100

        for line in f:
            currline += 1
            if currline >= skipto:
                sySsdm = line.split(" ")[0].strip()
                job = {
                    "sySsdm": sySsdm,
                    "year": "2015",
                    "start": 0,
                    "type": "u1"
                }
                self.add_main_job(job)
            if currline >= endline:
                break
        self.wait_q()
        self.add_job(None, True)

    def check_sch_list(self, url, flag=True):
        if not os.path.exists("prov"):
            os.makedirs("prov")
        if not os.path.exists("prov/check.txt"):
            f = open('prov/check.txt', 'w')
            f.close()
        if (flag):
            with open("prov/check.txt") as file_:
                for line in file_:
                    if url in line:
                        return False
            return True
        f = open("prov/check.txt", "a+")
        f.write(url + "\n")
        f.close()

    def check_sch_detail(self, job):
        if not os.path.exists("detail/" + job["sySsdm"]):
            os.makedirs("detail/" + job["sySsdm"])
            return True
        files = os.listdir("detail/" + job["sySsdm"])
        if files.count(job["yxdm"] + ".html") == 0:
            return True
        return False

    def retry(self, con, job):
        if re.search(u'<h1>An error occurred.</h1>', con.text) or re.search(
                u'Tinyproxy was unable to', con.text):
            #should reload this page.
            if int(job["retrycnt"]) < 5:
                job["retrycnt"] = int(job["retrycnt"]) + 1
                self.add_job(job)
                return True
        return False

    def save_sch_list(self, job, res):
        if not os.path.exists("prov"):
            os.makedirs("prov")
        with self.locker:
            f = open("prov/" + job["sySsdm"] + ".txt", "a+b")
            schlist = re.findall(
                r'<tr bgcolor="#FFFFFF" onMouseOver="this.style.background=\'#FFFFEE\'" onMouseOut=\"this.'
                r'style.background=\'#ffffff\'">(.*?)</tr>', res, re.S)

            for schinfo in schlist:
                schstr = ""
                tds = re.findall(r"<td.*?>(.*?)</td>", schinfo, re.S)
                if len(tds) == 0:
                    spider.runtime.Log.error(
                        job["sySsdm"] + ", start at " + str(job["start"]) +
                        " match error! No td tag! Readd..\n")
                    self.re_add_job(job)
                    ferr = open(
                        "errors/" + job["sySsdm"] + "_" + str(job["start"]) +
                        ".html", "w+b")
                    ferr.write(res)
                    ferr.close()
                    f.close()
                    return
                schnamelist = re.findall(r'dhtml">(.*?)</a>', tds[0], re.S)
                if len(schnamelist) == 0:
                    schnamelist = []
                    schnamelist.append(tds[0].strip())
                    if schnamelist[0] is "":
                        spider.runtime.Log.error(
                            job["sySsdm"] + ", start at " + str(job["start"]) +
                            " match error! No school name! Readd..\n")
                        self.re_add_job(job)
                        ferr = open(
                            "errors/" + job["sySsdm"] + "_" +
                            str(job["start"]) + ".html", "w+b")
                        ferr.write(res)
                        ferr.close()
                        f.close()
                        return
                schname = schnamelist[0]
                schstr += schname
                if r'span985">985</span>' in tds[0]:
                    schstr += " 985"
                if r'span211">211</span>' in tds[0]:
                    schstr += " 211"
                if r'spanyan">研</span>' in tds[0]:
                    schstr += " 研"

                for i in range(len(tds))[1:(len(tds) - 1)]:
                    schstr += " " + tds[i]
                stucnt = re.findall(r"doDialog.*?\">(.*?)</a>",
                                    tds[len(tds) - 1], re.S)[0].strip()
                schstr += " " + stucnt
                f.write(schstr + "\n")
                f.flush()
            f.close()

    def save_sch_detail(self, job, res):
        if not os.path.exists("detail/" + job["sySsdm"]):
            os.makedirs("detail/" + job["sySsdm"])
        f = open("detail/" + job["sySsdm"] + "/" + job["yxdm"] + ".html",
                 "w+b")
        f.write(res)
        f.flush()
        f.close()

    def run_job(self, job):
        i = random.randint(15, 30)
        if job["type"] is "u1":
            print "searching %s, start at %d" % (job["sySsdm"], job["start"])
            url = "http://gaokao.chsi.com.cn/zsjh/searchZsjh--year-%s,searchType-1,sySsdm-%s,start-%d.dhtml" % (
                job["year"], job["sySsdm"], job["start"])
            #url = "http://gaokao.chsi.com.cn/zsjh/searchZsjh.do?ccdm=&jhxzdm=&kldm=&searchType=1&ssdm=&" \
            #      "sySsdm=%s&year=%s&yxmc=&start=%d" % (job["sySsdm"], job["year"], job["start"])
            header = {
                "User-Agent":
                r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0",
                #"Referer":r"http://gaokao.chsi.com.cn/zsjh/",
                #"Origin":r"http://gaokao.chsi.com.cn",
                "Host": r"gaokao.chsi.com.cn",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept - Language": "zh-CN,zh;q=0.5",
                "Accept - Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Cache - Control": "max-age=0"
                #"X-Requested-With":r"XMLHttpRequest",
                #"Pragma":r"no-cache"
            }
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                print "result is None!"
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " nothing return ! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                return
            res = con.text
            if r"302 Found" in res:
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " 302 Found ! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif r"403 " in con.text:
                print "403 Forbidden (操作被拒绝) 列表页"
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         "403 Forbidden ! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif re.search(u'无查询结果', res):
                print "无查询结果!"
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " no data! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif re.search(ur'<H1>错误</H1>', res):
                print "<H1>错误</H1>!"
                time.sleep(i)
                spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                         str(job["start"]) +
                                         " error occur ! Readd..\n")
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            else:
                if int(job["start"]) is 0:
                    m = re.search(r"if \(Num > (\d+)", res)
                    if m:
                        pgcnt = int(m.group(1))
                        while pgcnt > 1:
                            jobb = {
                                "sySsdm": job["sySsdm"],
                                "year": job["year"],
                                "start": (pgcnt - 1) * 20,
                                "type": "u1"
                            }
                            self.add_job(jobb)
                            pgcnt -= 1
                        #else:
                        #    spider.runtime.Log.error(job["sySsdm"]+", start at "+ str(job["start"]) + " no more page! Readd!\n")
                        #    self.re_add_job(job)

                yxdms = re.findall(r"doDialog\('(\d+)'", res, re.S)
                if len(yxdms) == 0:
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " +
                                             str(job["start"]) +
                                             " no url! Readd.\n")
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                for yxdm in yxdms:
                    job2 = {
                        "yxdm": yxdm,
                        "sySsdm": job["sySsdm"],
                        "year": "2015",
                        "type": "u2"
                    }
                    self.add_job(job2)
                if self.check_sch_list(url):
                    self.save_sch_list(job, res)
                    self.check_sch_list(url, False)
                else:
                    print "该列表页已抓取过!"

        elif job["type"] is "u2":
            url = r"http://gaokao.chsi.com.cn/zsjh/search.do?" \
                  r"ccdm=&jhxzdm=&kldm=&method=majorList&sySsdm=%s&year=%s&yxdm=%s" % (job["sySsdm"], job["year"], job["yxdm"])
            header = {
                "User-Agent":
                r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0",
                "Host": "gaokao.chsi.com.cn",
                "Referer": "http://gaokao.chsi.com.cn/zsjh/",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept - Language": "zh-CN,zh;q=0.5",
                "Accept - Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Cache - Control": "max-age=0"
            }
            if self.check_sch_detail(job):
                con = self.sessionReq.request_url(url, headers=header)
                if con is None or con.text.strip() == "":
                    print "Nothing back!readd job" + job.__str__()
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                if r"302 Found" in con.text:
                    print "302 Found!"
                    spider.runtime.Log.error(job.__str__() +
                                             " 302 Found ! Readd..\n")
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                if r"403 " in con.text:
                    print "403 Forbidden (操作被拒绝) 详情页"
                    spider.runtime.Log.error(job.__str__() +
                                             " 403 Forbidden ! Readd..\n")
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                res = con.text
                self.save_sch_detail(job, res)
            else:
                print job["sySsdm"] + ":" + str(
                    job["yxdm"]) + "-----该学校已抓取过!!!"