Example #1
class LiepinSpider(Spider2):
    def __init__(self, thcnt):
        Spider2.__init__(self, thcnt)
        self._name = 'jd_liepin'
        self.bs = BinSaver("liepin.%d.bin" % os.getpid())

    def init_jobs(self):
        self.add_main_job_range({}, 1, 9999999)

    def run_job(self, job):
        print "job is ", job
        #url = "http://m.liepin.com/hjob/%d/" % (job['value'])
        value = job['value']
        url = "http://job.liepin.com/%03d_%d/" % (int(value)/10000, int(value))
        res = self.request_url(url)
        if re.search(u'您访问的页面不存在或已删除', res.text):
            print job, "match nothing"
        elif re.search(u'该职位已结束', res.text):
            print job, "match ending"
        elif re.search(u'您查看的职位已过期', res.text):
            print job, "match timeout"
        else:
            print "saving %d ..." % job['value']
            name = '%s.%d.%d' % (self._name, job['value'], int(time.time()))
            self.bs.append(name, res.text)
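
All of these snippets share the same storage pattern: BinSaver.append(name, value) writes a named record, and BinReader.readone() reads records back one at a time, returning a None name at end of file (Example #3 relies on this). A minimal round-trip sketch, assuming both classes come from the same in-house spider utility package as the code above, with made-up file and record names:

bs = BinSaver("demo.bin")
bs.append("demo.1", "<html>page one</html>")
bs.append("demo.2", "<html>page two</html>")

r = BinReader("demo.bin")
while True:
    (n, v) = r.readone()
    if n is None:
        break
    print n, len(v)

Whether append() flushes to disk immediately is not shown in these examples, so in practice the read pass would normally run as a separate step.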
Example #2
 def __init__(self, tc):
     Spider.__init__(self, tc)
     self._logport = 5556
     # self.channel = 'gsid'
     # self.job_queue = 'gsid'
     self.savebin = BinSaver("gongshang.bin")
     self.faillog = open("fail_list.txt", "w+b")
Example #3
def main():
    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'o:m:pi:')
    except getopt.GetoptError as e:
        showusage()
        return 1

    outfile = None
    matchstr = ''
    printout = False
    index = -1
    for (n, v) in opts:
        if n == '-o':
            outfile = v
        elif n == '-m':
            matchstr = v
        elif n == '-p':
            printout = True
        elif n == '-i':
            index = int(v)

    if len(args) == 0:
        showusage()
        return 1

    if outfile:
        fo = BinSaver(outfile)
        for fn in args:
            r = BinReader(fn)
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if matchstr in v:
                    fo.append(n, v)
    else:
        for fn in args:
            if printout or index != -1:
                r = BinReader(fn)
            else:
                r = BinReader1(fn)
            findex = 0
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if index != -1:
                    if findex == index:
                        if printout:
                            print v
                        else:
                            print n
                    elif findex > index:
                        break
                elif printout:
                    print n, v
                else:
                    print n
                findex += 1
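
main() above is a small dump/filter utility over these .bin files: with -o FILE it copies every record whose value contains the -m string into a new bin file, otherwise it walks the inputs and prints record names (name and value with -p), and -i N limits output to the N-th record. Assuming the script were saved as bindump.py (a hypothetical name), typical invocations would look like:

python bindump.py -m keyword -o filtered.bin liepin.1234.bin
python bindump.py -p -i 5 jobui_job.bin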
Example #4
 def __init__(self):
     self.bs = BinSaver("gsinfo_Guangdong_html.bin")
     self.pic = BinSaver("gsinfo_Guangdong_pic.bin")
     self.fs_QyxyDetail = FileSaver("gsinfo_guangdong_QyxyDetail.txt")
     self.fs_GSpublicityList = FileSaver(
         "gsinfo_guangdong_GSpublicityList.txt")
     self.fs_entityShow = FileSaver("gsinfo_guangdong_entityShow.txt")
     self.fs_guangzhou = FileSaver("gsinfo_guangdong_guangzhou.txt")
Example #5
 def dispatch(self):
     self.bs = BinSaver("jobui_job.bin")
     i = 133002626
     while i > 130000000:
         #131127307  131207901
         job = {"id": i, "retry_none": 0, "retry_500": 0}
         self.add_job(job, True)
         i -= 1
     self.wait_q_breakable()
     self.add_job(None, True)
Example #6
 def __init__(self, thcnt, need_srl=True, qf_name=None):
     Spider2.__init__(self, thcnt)
     #
     self.ce_fs = FileSaver("court_queries/check_error")
     self.docbin_fs = BinSaver("ws_data/ws.%d.bin" % os.getpid())
     self.log_fs = FileSaver("log")
     #
     self.qf_name = qf_name
     self._name = "%s" % self.qf_name.split("/")[1]
     self.srl = {}
     self.need_srl = need_srl
     pass
Example #7
 def __init__(self, channel, dburl=None):
     super(PageStoreBase, self).__init__(channel, dburl)
     self.testmode = False
     opath = self.getopath()
     t = time.localtime()
     folder = "%s/%s/%d" % (opath, self.channel, t.tm_year)
     fnbase = "%s_%d%02d" % (self.channel, t.tm_year, t.tm_mon)
     os.system("mkdir -m 777 -p " + folder)
     self._ofn = "%s/%s.bin" % (folder, fnbase)
     self._ofnlog = "%s/%s_update.log" % (folder, fnbase)
     self.fssaver = BinSaver(self._ofn)
     self._hashcheck = spider.util.LocalHashChecker()
     self._docobjtls = threading.local()
     self.saved_count = 0
Example #8
class CourtParser(CWPParser):
    def __init__(self, channel, dist_file, name, parser):
        CWPParser.__init__(self, channel, name)
        self.bin_writer = BinSaver(dist_file)
        self.parser = parser

    def process_child_item(self, item):
        print 'saving', item['name']
        self.bin_writer.append(item['name'], item['value'])

    def parse_item(self, page):
        res = self.parser.parse(page['indexUrl'], page['content'][1])
        if res:
            return [res]
        return []
Example #9
class Job51Spider(Spider):
    def dispatch(self):
        self.bs = BinSaver("job51.bin")
        for i in range(45000000, 75000000):
            self.add_main_job(i)
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        print "job is ", jobid
        url = "http://search.51job.com/job/%d,c.html" % jobid
        res = self.request_url(url, [])
        if re.search(u'您选择的职位目前已经暂停招聘', res.text):
            print jobid, "match nothing"
        else:
            print "saving %d ..." % jobid
            self.bs.append('51job.%d' % jobid, res.text)
Example #10
 def __init__(self, thcnt):
     Spider.__init__(self, thcnt)
     self._user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:42.0) Gecko/20100101 Firefox/42.0'
     self.baseurl = 'http://www.ewt360.com/LNLQXX/SearchResult?act=mark'
     self.headers = {'Referer': self.baseurl}
     #scores = range(450,750+1) + range(449, 0, -1) + [0]
     scores = range(750, 0, -1) + [0]
     self.possmap = {
         'Years': range(2009, 2014 + 1),
         'WL': ['l', 'w'],
         'BZ': ['b', 'z'],
         'PiCi': 0,
         'Score': scores,
         'ProvinceCode': 0,
         'page': 1
     }
     self.bs = BinSaver("fo.bin")
     self.racer = RaceValueByKey()
Example #11
class CourtCAPParser(CAPParser):
    def __init__(self, channel, dist_file, name, parser):
        CAPParser.__init__(self, channel, name)
        self.bin_writer = BinSaver(dist_file)
        self.parser = parser

    def parse(self, page):
        res = self.parser.parse(page['indexUrl'], page['content'][1])
        if res:
            return [res]
        return []

    def pre_save(self, saver):
        pass

    def on_save(self, items):
        for item in items:
            print 'saving', item['name']
            self.bin_writer.append(item['name'], item['value'])
Example #12
 def __init__(self, *proxyfile):
     threadcnt = self.prepare_proxy(*proxyfile)
     Spider.__init__(self, threadcnt)
     if not os.path.exists("data1"):
         os.makedirs("data1")
     self.namefile = open(
         "data1/corpname." + str(time.time()).split(".")[0] + ".txt", "w+b")
     self.failfile = open(
         "data1/fail." + str(time.time()).split(".")[0] + ".txt", "w+b")
     self.binsaver = BinSaver("data1/gsinfo" +
                              str(time.time()).split(".")[0] + ".bin")
Example #13
 def dispatch(self):
     self.bs = BinSaver('joblagou.bin')
     for query in q:
         try:
             for jobid in self.getIds(query):
                 if isinstance(jobid, int):
                     jobid = str(jobid)
                 self.add_main_job(jobid)
         except Exception as e:
             continue
     self.wait_q()
     self.add_main_job(None)
Example #14
 def dispatch(self):
     self.bs = BinSaver("youzy_job.bin")
     f = open("url_cfraction918-.txt", "r")
     # read one URL per line; stop at the first blank line / end of file
     while True:
         line = f.readline()
         if line.strip():
             job = {"url": line.strip()}
             self.add_job(job, True)
         else:
             break
     f.close()
     self.wait_q_breakable()
     self.add_job(None, True)
Example #15
    def __init__(self, thcnt):
        self.proxy_mode = 2
        # Proxy modes:
        # 0: fixed proxies, proxy count = thread count
        # 1: a single switching ADSL line, custom thread count
        # 2: multiple switching ADSL lines, custom thread count
        # 3: fetch Kuaidaili proxies via the API into a queue; all threads share one proxy, and on proxy failure it is switched out and removed from the queue (proxy switching not implemented yet)
        if self.proxy_mode == 0:
            self.proxies_dict = [{'http': 'http://*****:*****@106.75.134.189:18889', 'https': 'https://*****:*****@106.75.134.189:18889'},
                                 {'http': 'http://*****:*****@106.75.134.190:18889', 'https': 'https://*****:*****@106.75.134.190:18889'},
                                 {'http': 'http://*****:*****@106.75.134.191:18889', 'https': 'https://*****:*****@106.75.134.191:18889'},
                                 {'http': 'http://*****:*****@106.75.134.192:18889', 'https': 'https://*****:*****@106.75.134.192:18889'},
                                 {'http': 'http://*****:*****@106.75.134.193:18889', 'https': 'https://*****:*****@106.75.134.193:18889'},
                                 ]
            Spider.__init__(self, 100)
        elif self.proxy_mode == 1:
            self.proxies = {'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001'}
            #self.proxies = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'}
            #self.proxies = {'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'}
            #self.proxis = {'http': 'http://*****:*****@106.75.134.189:18889', 'https': 'https://*****:*****@106.75.134.189:18889'}
            Spider.__init__(self, 100)
        elif self.proxy_mode == 2:
            self.proxies_dict = [#{'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'},
                                {'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001'},
                                 {'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'},
                                {'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'}]
            Spider.__init__(self, 200)
        elif self.proxy_mode == 3:
            self.proxies_dict = []
            self.proxies = {}
            self.get_kuaidaili()
        elif self.proxy_mode == 4:
            Spider.__init__(self, 1)
            self.proxies_dict = []

        self._curltls = threading.local()
        self.shoudong_img = False  # enter the captcha manually
        self.saver = FileSaver("nacao_traversal_info_l.txt")
        self.already = FileSaver("nacao_traversal_info_already_l.txt")
        self.queries0 = FileSaver("nacao_traversal_queies_0_l.txt")
        self.bin_saver = BinSaver("nacao_captcha_image.bin")
        self.init_already()
        self.time_record = time.time()
        self.scs_record = 0
Example #16
 def __init__(self):
     self.bs = BinSaver("gsinfo_Sichuan_html.bin")
     self.fs = FileSaver("gsinfo_sichuan.txt")
Example #17
 def __init__(self):
     self.bs = BinSaver("gsinfo_Hunan_html.bin")
     self.fs = FileSaver("gsinfo_hunan.txt")
Example #18
 def __init__(self, thcnt):
     Spider2.__init__(self, thcnt)
     self._name = 'jd_liepin'
     self.bs = BinSaver("liepin.%d.bin" % os.getpid())
Example #19
class ZGcpwswSpider2(Spider2):

    def __init__(self, thcnt, need_srl=True, qf_name=None):
        Spider2.__init__(self, thcnt)
        #
        self.ce_fs = FileSaver("court_queries/check_error")
        self.docbin_fs = BinSaver("ws_data/ws.%d.bin" % os.getpid())
        self.log_fs = FileSaver("log")
        #
        self.qf_name = qf_name
        self._name = "%s" % self.qf_name.split("/")[1]
        self.srl = {}
        self.need_srl = need_srl
        pass

    def init_jobs(self):
        with open(self.qf_name) as fs:
            for line in fs:
                job = eval(line.strip())
                count = job.get("count")
                if count > ZGcpwswData.total_max_record:
                    for i in ZGcpwswData.data_order:
                        for j in ZGcpwswData.order_direction:
                            for k in range(ZGcpwswData.page_max_index):
                                copy_job = copy.deepcopy(job)
                                copy_job["jobid"]["data"]["Index"] = k + 1
                                copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                                copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j]
                                copy_job["jobid"]["data"]["Order"] = ZGcpwswData.data_order[i]
                                self.add_job(copy_job)

                elif ZGcpwswData.total_core_record < count <= ZGcpwswData.total_max_record:
                    for j in ZGcpwswData.order_direction:
                        for k in range(ZGcpwswData.page_max_index):
                            copy_job = copy.deepcopy(job)
                            copy_job["jobid"]["data"]["Index"] = k + 1
                            copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                            copy_job["jobid"]["data"]["Direction"] = ZGcpwswData.order_direction[j]
                            self.add_job(copy_job)

                elif 0 < count <= ZGcpwswData.total_core_record:
                    for k in range(ZGcpwswData.page_max_index):
                        copy_job = copy.deepcopy(job)
                        copy_job["jobid"]["data"]["Index"] = k + 1
                        copy_job["jobid"]["data"]["Page"] = ZGcpwswData.page_max_count
                        self.add_job(copy_job)

        print "=======finish loading job======"

    def run_job(self, jobid):
        time.sleep(0.1)
        if isinstance(jobid, dict):
            url = jobid.get("jobid").get("url")
            data = jobid.get("jobid").get("data")
            headers = jobid.get("jobid").get("headers")
            reg_count = int(jobid.get("count"))
            resp = None
            try:
                if self.need_srl:
                    nr = self.srl.get(getattr(self._tls, 'tid', 0))
                else:
                    nr = self.get_session_request()
                    self.set_cookie_passport(nr)
                # since the wenshu site upgrade, two extra requests are needed before each query to obtain the cookie passport
                resp = nr.request_url(url, data=data, headers=headers)
                if isinstance(resp, CurlReq.Response) and resp and resp.content:
                    result_list = json.loads(json.loads(resp.content))
                    if result_list:
                        # for record
                        ZGcpwswData.set_doc_count(data, len(result_list) - 1, self.log_fs)
                        # for record
                        for result in result_list:
                            if result.get("Count"):
                                new_count = int(result.get("Count"))
                                if new_count > reg_count:
                                    jobid["check_count"] = new_count
                                    self.ce_fs.append(json.dumps(jobid, ensure_ascii=False))
                            else:
                                name = '%s.%d' % (result.get(ZGcpwswData.doc_id), int(time.time()))
                                self.docbin_fs.append(name, json.dumps(result, ensure_ascii=False))
                    else:
                        pass
                else:
                    # owing to network, return None, add to job
                    pass
            except Exception as e:
                # print "%s-%s"%(resp.text, data)
                pass

            time.sleep(1)
            self.re_add_job(jobid)
Example #20
 def __init__(self):
     self.bs = BinSaver("gsinfo_Gansu_html.bin")
     self.fs = FileSaver("gsinfo_Gansu.txt")
Example #21
 def dispatch(self):
     self.bs = BinSaver('joblagou.bin')
     for i in xrange(0, 1500000):
         self.add_main_job(str(i))
     self.wait_q()
     self.add_main_job(None)
Example #22
 def __init__(self):
     self.bs = BinSaver("gsinfo_Tianjin_html.bin")
     self.fs = FileSaver("gsinfo_tianjin.txt")
Example #23
 def __init__(self, channel, dist_file, name, parser):
     CAPParser.__init__(self, channel, name)
     self.bin_writer = BinSaver(dist_file)
     self.parser = parser
Example #24
class QycxbSpider(Spider):
    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        self.sqs = {}
        self.binsaver = BinSaver("Qycxb" + str(time.time()).split(".")[0] + ".bin")

    def init_req(self):
        with self.locker:
            threadident = str(threading.currentThread().ident)
            sq = QycxbReq()
            # sq.load_proxy("../../_zhilian/curproxy0")
            # sq.load_proxy("../_zhilian/curproxy")
            # sq.select_user_agent("firefox")
            sq.default_headers = {"Connection": "keep-alive",
                                  "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                                  "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                                  "Accept-Encoding": "gzip, deflate",
                                  "Referer":"http://qiye.qianzhan.com/",
                                  "X-Requested-With":"XMLHttpRequest",
                                  "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:42.0) Gecko/20100101 Firefox/42.0",
                                  "Pragma":"no-cache",
                                  "Cache-Control":"no-cache",
                                  "Content-Type":"application/x-www-form-urlencoded; charset=UTF-8"}
            # con = sq.request_url("http://qiye.qianzhan.com/")
            con1 = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/searchList",
                                  data=r"oc_name=%E5%B9%BF%E5%B7%9E%E5%B8%82%E5%8D%97%E6%B2%99%E5%8D%8E%E5%B7%A5%E7%A0%94%E7%A9%B6%E9%99%A2&oc_area=&sh_searchType=1&od_orderby=0&page=1&pageSize=10")
            self.sqs[threadident] = sq
            setattr(self._curltls, "sq", sq)
            return sq

    def dispatch(self):
        f = open("/home/peiyuan/r1.txt", "rb")
        currline = 0
        skip = 0
        endline = 1000
        while currline < skip:
            line = f.readline()
            currline += 1

        while currline < endline:
            line = f.readline()
            key = line.strip().split(" ")[-1].strip()
            job = {"key": key, "type": "u1", "lineno": currline}
            self.add_main_job(job)
            currline += 1
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, job):
        time.sleep(5)
        threadident = str(threading.currentThread().ident)
        sq = getattr(self._curltls, "sq", None)
        if sq is None:
            sq = self.init_req()
        Log.info("Running job:" + util.utf8str(job.__str__()))
        if job["type"] == "u1":
            Log.info("Searching line %d" % job["lineno"])
            con = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/searchList",
                                 data={"oc_name": job["key"], "od_orderby": 0, "page": 1,
                                       "pageSize": 10, "oc_area": "",
                                       "sh_searchType": 1})
            if con is None or con.text.strip() == "" or con.code != 200:
                Log.error("[u1]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = ""
                try:
                    jsonobj = json.loads(con.text.strip())
                except ValueError as e:
                    Log.error("Json decode error. String is %s" % con.text)
                    return
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error("[u1]Request fail, succ flag is False. JOB=>" + util.utf8str(job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    corplist = jsonobj["lst"]
                    if len(corplist) == 0:
                        Log.error("Search return nothing. %d:%s, no data." % (job["lineno"], job["key"]))
                        return
                    else:
                        for corp in corplist:
                            jobb = {"type": "u2", "orgCode": corp["oc_orgCode"], "name": corp["oc_name"]}
                            self.add_job(jobb)

        if job["type"] == "u2":
            Log.info("Getting detail info about %s" % job["name"])
            timestr = "%f" % time.time()
            con0 = sq.request_url(r"http://qiye.qianzhan.com/orgcompany/GetJsVerfyCode?t=0.%s&_=%s" % (
                timestr.split(".")[1], timestr.split(".")[0]))
            if con0 is None or con0.text.strip() == "" or con0.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
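            # evaluate the anti-crawler JS returned by GetJsVerfyCode with nodejs to compute the mcfCode that SearchItemDtl expects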
            if not os.path.exists(threadident):
                os.mkdir(threadident)
            f = open(threadident + "/qycxb.js", "w+b")
            f.write(r'var window = {document : {cookie :"qznewsite.uid=' + sq.get_cookie(
                    "qznewsite.uid").strip() +'"}};  ' + con0.text + "console.log(window.__qzmcf())")
            f.flush()
            os.system("nodejs " + threadident + "/qycxb.js > " + threadident + "/mcfcode.txt")
            mcfcode = open(threadident + "/mcfcode.txt", "rb").read().strip()
            con1 = sq.request_url("http://qiye.qianzhan.com/orgcompany/SearchItemDtl",
                                  data={"mcfCode": mcfcode, "orgCode": job["orgCode"]})
            if con1 is None or con1.text.strip() == "" or con1.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" + util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = json.loads(con1.text.strip())
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error(
                            "[u2]Request fail, succ flag is False.Check the orgcode and mcfcode. JOB=>" + util.utf8str(
                                    job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." % sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    self.binsaver.append(job["name"] + job["orgCode"], con1.text.strip())
                    Log.info("%s,%s,saved." % (job["name"], job["orgCode"]))
                    return
Example #25
 def __init__(self, threadcnt):
     Spider.__init__(self, threadcnt)
     self.sqs = {}
     self.binsaver = BinSaver("Qycxb" + str(time.time()).split(".")[0] + ".bin")
Example #26
 def dispatch(self):
     self.bs = BinSaver("qichacha.bin")
     self.add_job("墨麟")
     self.add_job("爱拼")
     self.wait_q()
     self.add_job(None, True)
Example #27
class PageStoreBase(PageStoreDB):
    class CurDoc(object):
        def __init__(self, content, getime, jdid, real_url):
            self.cur_content = content
            self.cur_getime = getime
            self.cur_jdid = jdid
            self.cur_url = real_url

    def __init__(self, channel, dburl=None):
        super(PageStoreBase, self).__init__(channel, dburl)
        self.testmode = False
        opath = self.getopath()
        t = time.localtime()
        folder = "%s/%s/%d" % (opath, self.channel, t.tm_year)
        fnbase = "%s_%d%02d" % (self.channel, t.tm_year, t.tm_mon)
        os.system("mkdir -m 777 -p " + folder)
        self._ofn = "%s/%s.bin" % (folder, fnbase)
        self._ofnlog = "%s/%s_update.log" % (folder, fnbase)
        self.fssaver = BinSaver(self._ofn)
        self._hashcheck = spider.util.LocalHashChecker()
        self._docobjtls = threading.local()
        self.saved_count = 0

    def getopath(self):
        dirs = ['/data/crawler/_files3_', '/opt/_test_store_']
        for di in dirs:
            if os.path.isdir(di) and os.access(di, os.W_OK):
                return di
        raise RuntimeError("no dir to write files.")

    def get_cur_doc(self):
        return getattr(self._docobjtls, 'doc', None)

    def set_cur_doc(self, content, getime, jdid, real_url):
        doc = PageStoreBase.CurDoc(content, getime, jdid, real_url)
        setattr(self._docobjtls, 'doc', doc)

    @staticmethod
    def mktime(year=2015, m=1, d=1, hour=0, minute=0, second=0):
        arr = [year, m, d, hour, minute, second, 0, 0, 0]
        for i in range(0, len(arr)):
            arr[i] = int(arr[i])
        return time.mktime(arr)

    def extract_content(self):
        raise NotImplementedError('virtual function called')

    def page_time(self):
        raise NotImplementedError('virtual function called')

    def check_should_fetch(self, jdid):
        indexUrl = "%s://%s" % (self.channel, jdid)
        return not self.find_new(indexUrl)

    def save_time_log(self, indexUrl, cur_tm):
        oldtime = self.get_page_time(indexUrl)
        if oldtime == cur_tm:
            return
        logstr = "%s %ld => %ld\n" % (indexUrl, oldtime, cur_tm)
        cutil.mp_append_log(self._ofnlog, logstr)

    def save(self, getime, jdid, real_url, content, fnpath=None, offset=None):
        global MIN_TIME_MSEC
        if getime > MIN_TIME_MSEC:
            raise RuntimeError("get time muse be in seconds.")
        if self._hashcheck.query(jdid) > 0:
            return True
        self.set_cur_doc(content, getime, jdid, real_url)

        try:
            pageDesc = self.extract_content()
            if not pageDesc:
                print "jdid: %s, pageDesc empty" % self.get_cur_doc().cur_jdid
                return False
            elif self.testmode:
                print pageDesc
            pageTime = self.page_time()
            if pageTime is None or pageTime < MIN_TIME_MSEC:
                raise RuntimeError("page time must be in msec")
            if isinstance(pageTime, float):
                pageTime = int(pageTime)
            if isinstance(pageDesc, unicode):
                pageDesc = pageDesc.encode('utf-8')
            contentSign = hashlib.md5(pageDesc).hexdigest()
            indexUrl = "%s://%s" % (self.channel, jdid)

            self.save_time_log(indexUrl, pageTime)
            # if there is an entry with this contentSign, update it with no need to save webpage in binfile.
            # otherwise update by indexUrl.
            if self.find_item(indexUrl, contentSign):
                Log.warning("%s exists in db, skip" % jdid)
                self.update_time(indexUrl, contentSign,
                                 int(getime) * 1000, pageTime)
                return True
            print "saving", indexUrl
            odoc = {
                'contentSign': contentSign,
                'indexUrl': indexUrl,
                'realUrl': real_url,
                'createTimeFlag': 1,
                'owner': self.channel,
                'createTimeTimeStamp': pageTime,
                'crawlerUpdateTime': int(getime) * 1000,
                'updateTime': pageTime,
                'status': 0,
                'isUpdated': 0,
                'isExpired': 0,
            }
            if self.testmode:
                pprint.pprint(odoc)
                return True
            else:
                if self.do_save(odoc, content, fnpath, offset):
                    print indexUrl, "saved"
                    self.saved_count += 1
                    self._hashcheck.add(jdid)
                    return True
                return False
        except Exception as e:
            print e
            traceback.print_exc()
            Log.error("failed to save %s %s" % (self.channel, jdid))
            time.sleep(5)
            return False

    def do_save(self, odoc, content, fnpath=None, offset=None):
        if isinstance(content, unicode):
            content = content.encode('utf-8')
        filepos = self.fssaver.append(
            "%s.%s.%d" % (self.channel, self.get_cur_doc().cur_jdid,
                          self.get_cur_doc().cur_getime), content)
        odoc.update({'pageContentPath': "binf::%s::%d" % (self._ofn, filepos)})
        return self.upsert_doc(odoc['indexUrl'], odoc)
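
extract_content() and page_time() are the two hooks PageStoreBase.save() calls on the current document, so a concrete store only has to fill those in. A minimal subclass sketch (the channel name and return values are illustrative only, and the base class still expects one of the output directories in getopath() to exist):

class DemoPageStore(PageStoreBase):
    def __init__(self):
        super(DemoPageStore, self).__init__('jd_demo')  # made-up channel name

    def extract_content(self):
        # the text that gets md5-signed for dedup; here simply the whole page
        return self.get_cur_doc().cur_content

    def page_time(self):
        # must be a millisecond timestamp; save() rejects second-resolution values
        return int(time.time() * 1000)

# usage sketch: getime is in seconds, jdid is the channel-local id, content is the raw page
# store = DemoPageStore()
# store.save(int(time.time()), '12345', 'http://example.com/job/12345', page_html)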
Example #28
 def dispatch(self):
     self.bs = BinSaver("jobui_job.bin")
     for i in range(133002626, 133002636, 1):
         self.add_job(i, True)
     self.wait_q()
     self.add_job(None, True)
Example #29
 def dispatch(self):
     self.bs = BinSaver("job51.bin")
     for i in range(45000000, 75000000):
         self.add_main_job(i)
     self.wait_q()
     self.add_main_job(None)
Example #30
class JobuiSpider(Spider):
    def __init__(self):
        self.proxies_dict = []
        self.read_proxy("../spider/proxy/proxy.txt")
        Spider.__init__(self, len(self.proxies_dict))
        self.num_count = 0
        self.__fail_ids = FileSaver("fail_ids.txt")

    def wait_q_breakable(self):
        lt = 0
        while True:
            if (not self.job_queue.empty() or not self.job_queue2.empty()
                    or not self.job_queue3.empty()):
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        self.bs = BinSaver("jobui_job.bin")
        i = 133002626
        while i > 130000000:
            #131127307  131207901
            job = {"id": i, "retry_none": 0, "retry_500": 0}
            self.add_job(job, True)
            i -= 1
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        jobid_int = jobid.get("id")
        url = "http://www.jobui.com/job/%d/" % (jobid_int)
        tid = self.get_tid()
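        # one proxy per worker thread: __init__ sized the spider to len(proxies_dict), so get_tid() indexes it directly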
        proxies = self.proxies_dict[tid]
        #print "this request tid = [ %s ] proxies = [ %s ]" % (tid,proxies)
        res = self.request_url(url, proxies=self.proxies_dict[self.get_tid()])

        self.num_count += 1

        #print "id : %d ------------- response code : %s " % (jobid_int, "Response Is None" if res is None else str(res.code))

        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    tid, proxies)
                self.__fail_ids.append(str(jobid_int))
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (proxies, self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls, 'failcount', 0)

        if res.code == 404:
            print "%d ------ 404" % jobid_int
            return
        elif res.code in (500, 502, 503, 504):
            print "%d ------ %d " % (jobid_int, res.code)
            self.add_job(jobid)
            time.sleep(0.8)
            return
        elif res.code == 200:
            print "%d ------ saving " % jobid_int
            fn = 'jobui_job.%d.%d' % (jobid_int, int(time.time()))
            self.bs.append(fn, res.text)
            if self.bs.getsize() >= 8 * 1024 * 1024 * 1024:
                raise AccountErrors.NoAccountError('file too large')
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            Log.error("unknown error...")
            Log.errorbin("%s" % jobid_int, res.text)
            raise AccountErrors.NoAccountError('fatal error')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0],
                                 msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                if m:
                    prstr = m.group(1)
                    proxies = {
                        'http': 'http://' + prstr + "/",
                        'https': 'https://' + prstr + "/"
                    }
                    self.proxies_dict.append(proxies)
                elif re.match(r'\s*#', line):
                    continue
        print " loaded [ %d ] proxies " % len(self.proxies_dict)