Example #1
    def extract_content(self):
        doc = self.get_cur_doc()

        # Locate the resume body fragment; its text is extracted further below.
        hf = spider.util.htmlfind(doc.cur_content, 'id="resumeContentBody"', 0)

        dom = html.fromstring(doc.cur_content)
        contact_info = self.extract_info(dom)

        name = contact_info.get("name", "")
        email = contact_info.get("email", "")
        telephone = contact_info.get("telephone", "")

        if not (name and (email or telephone)):
            self.log.info("fail id: %s, extract contact infomation fail" %
                          self.get_cur_doc().cur_jdid)
            return None

        try:
            detail = hf.get_text()
        except Exception:
            Log.errorbin("invalid cv content %s" % doc.cur_url,
                         doc.cur_content)
            return None

        return utf8str(contact_info) + utf8str(detail)
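
Both this store and the one in Example #2 below lean on an extract_info helper that is not shown. A plausible shape for it, with purely illustrative XPath expressions (the real field locations are unknown): return a dict of "name", "email", and "telephone", with empty strings for missing fields so the callers' .get(..., "") defaults stay safe. Example #2 calls it through the class, so a staticmethod fits both call sites.

    @staticmethod
    def extract_info(dom):
        # Hypothetical sketch; the XPath expressions are placeholders.
        def first_text(xpath):
            nodes = dom.xpath(xpath)
            return nodes[0].text_content().strip() if nodes else ""
        return {
            "name": first_text("//td[@class='name']"),
            "email": first_text("//td[@class='email']"),
            "telephone": first_text("//td[@class='phone']"),
        }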
Example #2
    def extract_content(self):

        dom = html.fromstring(self.get_cur_doc().cur_content)

        xx = dom.xpath("//td[@id='divInfo']")

        contact_info = CV51DownloadPageStore.extract_info(dom)
        name = contact_info.get("name", "")
        email = contact_info.get("email", "")
        telephone = contact_info.get("telephone", "")
        if not (name and (email or telephone)):
            self.log.info("fail id: %s, extract contact infomation fail" %
                          self.get_cur_doc().cur_jdid)
            return None

        if xx:  # dom.xpath always returns a list; an empty one means no match
            return utf8str(contact_info) + utf8str(xx[0].text_content())

        Log.errorbin(self.get_cur_doc().cur_url,
                     self.get_cur_doc().cur_content)
        Log.error("get cv failed", self.get_cur_doc().cur_url)

        return None
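
Log.errorbin is used in both examples to dump the raw page when extraction fails. Its implementation is not shown; a minimal sketch of the idea, assuming a flat dump directory and md5-keyed filenames (both invented):

    import hashlib
    import os

    def errorbin(tag, content):
        # Persist the raw page bytes for postmortem debugging, keyed by tag
        # (the callers pass the document URL).
        if not os.path.exists("errorbin"):
            os.mkdir("errorbin")
        fname = "errorbin/%s.html" % hashlib.md5(tag).hexdigest()
        with open(fname, "wb") as f:
            f.write(content)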
Example #3
 def jobrunner1(self, job):
     # Warm the session against the site root first so the follow-up request
     # carries its cookies; the response itself is not used.
     self.sesnreq.request_url(
         "http://www.x315.com/",
         headers={"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"})
     # self.sesnreq.request_url("http://s4.cnzz.com/stat.php?id=1256666136&show=pic1")
     url = r"http://www.x315.com/quicksearch?qk=%s&t=1&z=&timestamp=%s" % (
         "富士康", str(time.time()).split(".")[0] +
         str(time.time()).split(".")[1][:3])
     header = {
         "Accept":
         r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
         "Connection": "Keep-alive",
         "Content-Type": "application/x-www-form-urlencoded",
         "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
         "Referer": "http://www.x315.com/",
         "X-Requested-With": "XMLHttpRequest"
     }
     con = self.sesnreq.request_url(url, headers=header)
     print con.text
     if ur"查询过于频繁" in con.text:
         if not self.re_add_job(job):
             Log.error("readd job failed.==>" + utf8str())
         Log.info("查询过于频繁,sleep 10 s.")
         time.sleep(10)
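
re_add_job returns False once a job can no longer be requeued. A common shape for it (hypothetical here; the retry cap of 5 is invented) bounds the retries so a permanently throttled job cannot loop forever:

    def re_add_job(self, job):
        # Requeue with a bounded retry count; give up past the cap.
        job["retry"] = job.get("retry", 0) + 1
        if job["retry"] > 5:
            return False
        self.add_job(job)
        return True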
Example #4
    def run_job(self, job):
        time.sleep(5)
        threadident = str(threading.currentThread().ident)
        sq = getattr(self._curltls, "sq", None)
        if sq is None:
            sq = self.init_req()
        Log.info("Running job:" + util.utf8str(job))
        if job["type"] == "u1":
            Log.info("Searching line %d" % job["lineno"])
            con = sq.request_url(
                r"http://qiye.qianzhan.com/orgcompany/searchList",
                data={
                    "oc_name": job["key"],
                    "od_orderby": 0,
                    "page": 1,
                    "pageSize": 10,
                    "oc_area": "",
                    "sh_searchType": 1
                })
            if con is None or con.text.strip() == "" or con.code != 200:
                Log.error("[u1]Bad connect or empty content return.. JOB=>" +
                          util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = ""
                try:
                    jsonobj = json.loads(con.text.strip())
                except ValueError as e:
                    Log.error("Json decode error. String is %s" % con.text)
                    return
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error("[u1]Request fail, succ flag is False. JOB=>" +
                              util.utf8str(job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." %
                                  sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    corplist = jsonobj["lst"]
                    if len(corplist) == 0:
                        Log.error("Search return nothing. %d:%s, no data." %
                                  (job["lineno"], job["key"]))
                        return
                    else:
                        for corp in corplist:
                            jobb = {
                                "type": "u2",
                                "orgCode": corp["oc_orgCode"],
                                "name": corp["oc_name"]
                            }
                            self.add_job(jobb)

        if job["type"] == "u2":
            Log.info("Getting detail info about %s" % job["name"])
            timestr = "%f" % time.time()
            con0 = sq.request_url(
                r"http://qiye.qianzhan.com/orgcompany/GetJsVerfyCode?t=0.%s&_=%s"
                % (timestr.split(".")[1], timestr.split(".")[0]))
            if con0 is None or con0.text.strip() == "" or con0.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" +
                          util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            if not os.path.exists(threadident):
                os.mkdir(threadident)
            # Wrap the downloaded JS in a fake window object carrying the
            # session cookie, then run it under nodejs to compute mcfCode.
            with open(threadident + "/qycxb.js", "w+b") as f:
                f.write(r'var window = {document : {cookie :"qznewsite.uid=' +
                        sq.get_cookie("qznewsite.uid").strip() + '"}};  ' +
                        con0.text + "console.log(window.__qzmcf())")
            os.system("nodejs " + threadident + "/qycxb.js > " + threadident +
                      "/mcfcode.txt")
            with open(threadident + "/mcfcode.txt", "rb") as f:
                mcfcode = f.read().strip()
            con1 = sq.request_url(
                "http://qiye.qianzhan.com/orgcompany/SearchItemDtl",
                data={
                    "mcfCode": mcfcode,
                    "orgCode": job["orgCode"]
                })
            if con1 is None or con1.text.strip() == "" or con1.code != 200:
                Log.error("[u2]Bad connect or empty content return.. JOB=>" +
                          util.utf8str(job.__str__()))
                self.re_add_job(job)
                return
            else:
                jsonobj = json.loads(con1.text.strip())
                if not jsonobj["succ"]:
                    Log.warning(jsonobj.__str__())
                    time.sleep(1)
                    Log.error(
                        "[u2]Request fail, succ flag is False.Check the orgcode and mcfcode. JOB=>"
                        + util.utf8str(job.__str__()))
                    if 'status' in jsonobj and jsonobj['status'] == '4':
                        Log.error("Remove current proxy...Used %d times....." %
                                  sq._proxy_use_times[sq.curproxy])
                        sq.remove_curproxy()
                    self.re_add_job(job)
                else:
                    #self.binsaver.append(job["name"] + job["orgCode"], con1.text.strip())
                    self.filesaver.append(job["name"] + "|" + job["orgCode"] +
                                          "|" + con1.text.strip())
                    Log.info("%s,%s,saved." % (job["name"], job["orgCode"]))
                    return
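
The nodejs round-trip above shells out with os.system and re-reads a temp file. A tighter variant using subprocess (a sketch under the same assumptions: nodejs on PATH, the same wrapped JS file on disk) captures stdout directly and raises on a non-zero exit instead of silently reading a stale mcfcode.txt:

    import subprocess

    def eval_mcfcode(js_path):
        # Evaluate the verification script under nodejs; its stdout is the code.
        return subprocess.check_output(["nodejs", js_path]).strip()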
Example #5
 def run_job(self, job):
     Log.info("running job:" + utf8str(job))
     if job["type"] == "t1":
         self.jobrunner1(job)
     elif job["type"] == "t2":
         self.jobrunner2(job)
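
As job types accumulate, the if/elif chain can be replaced by a dispatch table; a sketch assuming the runner methods keep this signature:

    def run_job(self, job):
        handlers = {"t1": self.jobrunner1, "t2": self.jobrunner2}
        handler = handlers.get(job["type"])
        if handler is None:
            Log.error("unknown job type: " + utf8str(job))
            return
        Log.info("running job:" + utf8str(job))
        handler(job)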
Example #6
        elif key == u"海南":
            outmap[key] = "4600"
        elif len(cclist[key]) == 1:
            thekey = cclist[key].keys()[0]
            outmap[key] = thekey
        else:
            preflist = []
            for thekey, v in cclist[key].items():
                for code, name in v:
                    if code[-2:] == '00':
                        preflist.append(code)
            if len(preflist) == 0:
                pass  # no prefecture-level code found; leave this name unmapped
            elif len(preflist) == 1:
                outmap[key] = preflist[0][0:4]
            else:
                assert not "nani?"

    fout = dict(outmap)
    for k, v in fulloutmap.items():
        if len(v) == 1:
            fout[k] = v[0]
    return fout


spider.util.use_utf8()
allmap = get_area_code()
print utf8str(allmap)
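
For context on the code[-2:] == '00' test above: in the 6-digit GB/T 2260 administrative division codes, a trailing "00" marks a prefecture-level entry and the first four digits identify the prefecture, which is why outmap stores preflist[0][0:4]. A minimal check:

    assert "440300"[-2:] == "00"    # 440300: Shenzhen, a prefecture-level city
    assert "440300"[0:4] == "4403"  # the 4-digit prefix kept in outmap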
Example #7
 def need_split(self, url, level, isLast):
     # The base class only answers the easy case: an already-seen URL never
     # splits. Any other decision must come from a subclass override.
     tol = util.utf8str(url).strip()
     if tol in self.oldjobs:
         return False
     raise RuntimeError('virtual function called.')
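
Since the base method raises once the dedup short-circuit misses, every concrete spider must override it; a sketch of such an override (the class names and the splitting rule itself are invented):

    class MySpider(BaseSpider):
        def need_split(self, url, level, isLast):
            if util.utf8str(url).strip() in self.oldjobs:
                return False
            return level < 3 and not isLast  # example rule only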
Example #8
 def log_url(self, url):
     # Append a URL to the output store unless it was already recorded in a
     # previous run (oldjobs is the set of previously written URLs).
     tol = util.utf8str(url).strip()
     if tol in self.oldjobs:
         return
     self.fs.append(tol)
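
For the dedup in need_split and log_url to survive restarts, self.oldjobs must be rebuilt from the same append-only store at startup; a minimal sketch, assuming self.fs writes one URL per line to a plain file:

    import os

    def load_old_jobs(self, path):
        # Rebuild the seen-URL set from the append-only log, if present.
        self.oldjobs = set()
        if os.path.exists(path):
            with open(path, "rb") as f:
                for line in f:
                    self.oldjobs.add(line.strip())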