Ejemplo n.º 1
0
 def page_time(self):
     """Format the time block found in the current document.

     A ``div`` with ``class="resume-info"`` is looked up first; when that
     yields nothing, a ``div`` with ``class="tab"`` is tried instead. The
     first element of whichever lookup succeeds is passed to
     ``TimeHandler.fmt_time`` and that result is returned.

     Returns ``None`` when neither lookup finds anything. Any ``Exception``
     raised along the way is swallowed: the current document's ``cur_jdid``
     is handed to ``self._save_not_need_cv`` and ``None`` is returned
     implicitly.
     """
     try:
         # The second selector, class="tab", covers the headhunter page.
         for attrs in ('class="resume-info"', 'class="tab"'):
             found = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                      attrs)
             if found:
                 return TimeHandler.fmt_time(found[0])
         return None
     except Exception:
         self._save_not_need_cv(self.get_cur_doc().cur_jdid)
Ejemplo n.º 2
0
    def page_time(self):
        """Format the publish time found in the current document.

        The document content is encoded to UTF-8 when it is a ``unicode``
        object. Two sources are tried in order:

        1. the ``<dd class="text_dd">`` value following the publish-date
           label matched by the first regular expression;
        2. a date-like token inside the first ``div`` with
           ``class="jtag inbox"``.

        Returns the result of ``TimeHandler.fmt_time`` for the first source
        that matches, or ``None`` when neither does.
        """
        content = self.get_cur_doc().cur_content
        if isinstance(content, unicode):
            content = content.encode('utf-8')

        labelled = re.search(r'发布日期:</dt>.*?<dd class="text_dd">(.*?)</dd>', content, re.S)
        if labelled:
            return TimeHandler.fmt_time(labelled.group(1))

        boxes = htmlfind.findTag(content, 'div', 'class="jtag inbox"')
        if not boxes:
            return None

        stamp = re.search(r'(\d*-?\d+-\d+发布)', boxes[0])
        if not stamp:
            return None
        return TimeHandler.fmt_time(stamp.group(1))
Ejemplo n.º 3
0
 def page_time(self):
     tag = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="jtag inbox"')
     if tag:
         m = re.search(ur'(\d*-?\d+-\d+发布)', tag[0])
         if m:
             t = TimeHandler.fmt_time(m.group(1))
             return t
Ejemplo n.º 4
0
    def real_dispatch(self):
        """Walk every listing page for each ``url`` / ``ind`` combination.

        For each pair, page URLs of the form ``{url}qz{ind}/pn{page}`` are
        built starting at page 1 and passed to ``self.parse_html``; paging
        for that pair stops as soon as ``parse_html`` returns a falsy value.

        When ``self.get_latest`` is truthy a ``postdate`` query is appended
        to each page URL. Its lower bound comes from
        ``TimeHandler.getTimeOfNDayBefore(self.get_latest)`` (divided by 1000
        before conversion with ``time.localtime``) and its upper bound is
        the current local date, both rendered as ``YYYYMMDD``.
        """
        for url in urls:
            for ind in inds:
                page = 1
                while True:
                    page_url = "{}qz{}/pn{}".format(url, ind, page)
                    if self.get_latest:
                        since = time.localtime(
                            TimeHandler.getTimeOfNDayBefore(self.get_latest) / 1000)
                        # The first three struct_time fields are year, month, day.
                        since_str = '%04d%02d%02d' % tuple(since[:3])

                        today = time.localtime(time.time())
                        today_str = '%04d%02d%02d' % tuple(today[:3])

                        page_url += "?postdate={}000000_{}000000".format(
                            since_str, today_str)

                    # self.add_main_job({"urlpart": realUrl,  "type":"loadPage"})
                    if not self.parse_html(page_url):
                        break
                    page += 1
Ejemplo n.º 5
0
    def getIds(self, q):
        """Yield position ids from the paginated position search endpoint.

        ``q`` is the request payload; its ``"pn"`` key is overwritten in
        place with the page number before every request. Pages are requested
        starting at 1 for as long as the page counter stays within the
        ``max_pages`` bound checked at the top of the loop.

        Iteration ends when:

        * a page carries an empty ``result`` list, or
        * an item for which ``TimeHandler.isBeforeNDay(createTimeSort, 2)``
          is true has been yielded. That boundary item is still emitted,
          but no further items or pages are fetched. (Previously only the
          inner loop was left, so every remaining page was still requested
          and contributed one stale id each.)

        A response that lacks the ``content`` / ``positionResult`` /
        ``result`` nesting is skipped and the next page is requested.
        """
        url = "http://www.lagou.com/jobs/positionAjax.json"
        max_pages = 100
        page_index = 0
        while page_index <= max_pages:
            page_index += 1
            q["pn"] = page_index
            res = self.request_url(url, data=q)
            json_resp = json.loads(res.text)

            if not ("content" in json_resp
                    and "positionResult" in json_resp["content"]
                    and "result" in json_resp["content"]["positionResult"]):
                # Unexpected payload shape: move on to the next page.
                continue

            # if page_index == 1:
            #     max_pages = json_resp["content"]["totalPageCount"]

            results = json_resp["content"]["positionResult"]["result"]
            if not results:
                return

            for item in results:
                create_time = item['createTimeSort']
                # Older postings are not wanted: emit the boundary item,
                # then stop paging altogether.
                too_old = TimeHandler.isBeforeNDay(create_time, 2)
                yield item["positionId"]
                if too_old:
                    return
Ejemplo n.º 6
0
    def page_time(self):
        """Format the posted date found in the current document.

        Looks up ``li`` elements carrying
        ``class="posted" itemprop="datePosted"`` and returns the result of
        ``TimeHandler.fmt_time`` for the first one, or ``None`` when the
        lookup finds nothing.
        """
        # TODO
        posted = htmlfind.findTag(self.get_cur_doc().cur_content, 'li',
                                  'class="posted" itemprop="datePosted"')
        if not posted:
            return None
        return TimeHandler.fmt_time(posted[0])
Ejemplo n.º 7
0
    def page_time(self):
        """Format the publish time found in the current document.

        Calls ``spider.util.htmlfind`` on the document content with the
        selector ``class="publish-time"`` and takes ``get_text()`` of the
        result, which is then passed to ``TimeHandler.fmt_time``.

        Returns ``None`` when extracting the text fails; in that case the
        document URL and content are logged through ``Log.errorbin``.
        """
        tag = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="publish-time"', 0)
        try:
            text = tag.get_text()
        except Exception:
            # Best effort: log and give up on this document. Only Exception
            # is caught so KeyboardInterrupt / SystemExit still propagate.
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url, self.get_cur_doc().cur_content)
            return None

        return TimeHandler.fmt_time(text)
Ejemplo n.º 8
0
    def page_time(self):
        """Format the update time found in the current document.

        The document content is encoded to UTF-8 when it is a ``unicode``
        object, then searched for the ``<em>`` element that starts with the
        update-time label. The text after the label is passed to
        ``TimeHandler.fmt_time``.

        Returns ``None`` when the element is not present.
        """
        cur_content = self.get_cur_doc().cur_content
        if isinstance(cur_content, unicode):
            cur_content = cur_content.encode('utf-8')

        # Non-greedy on purpose: with re.S a greedy ".*" would run up to the
        # LAST "</em>" in the whole document instead of the closing tag of
        # this element.
        find = re.search(r'<em>更新时间:(.*?)</em>', cur_content, re.S)
        if find:
            return TimeHandler.fmt_time(find.group(1))

        return None
Ejemplo n.º 9
0
    def page_time(self):
        """Format the update time found in the current document.

        Searches the document content for the text between
        ``class="uptime common-icon"></em>`` and the next ``</dd>`` and
        returns the result of ``TimeHandler.fmt_time`` for it.

        When the pattern is not found, the document URL and content are
        logged through ``Log.errorbin`` and the original error is re-raised.
        """
        # TODO
        # Earlier lookup, kept for reference:
        # tag = spider.util.htmlfind(self.get_cur_doc().cur_content, 'class="publish_time"', 0)
        match = re.search('class="uptime common-icon"></em>(.*?)</dd>',
                          self.get_cur_doc().cur_content)
        try:
            raw_time = match.group(1)
        except:
            # Log for diagnosis, then let the caller see the failure.
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise

        return TimeHandler.fmt_time(raw_time)
Ejemplo n.º 10
0
    def page_time(self):
        """Format the last-modified time found in the current document.

        The document content is encoded to UTF-8 when it is a ``unicode``
        object. The first ``span`` with ``class="last-modified"`` is passed
        through ``htmlfind.remove_tag``; the outcome (again encoded to UTF-8
        if it is ``unicode``) goes to ``TimeHandler.fmt_time``.

        When the span cannot be extracted, the document URL and content are
        logged through ``Log.errorbin`` and the original error is re-raised.
        """
        content = self.get_cur_doc().cur_content
        if isinstance(content, unicode):
            content = content.encode('utf-8')

        spans = htmlfind.findTag(content, 'span', 'class="last-modified"')
        try:
            text = htmlfind.remove_tag(spans[0], 1)
        except:
            # Log for diagnosis, then let the caller see the failure.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise

        if isinstance(text, unicode):
            text = text.encode('utf-8')
        return TimeHandler.fmt_time(text)
Ejemplo n.º 11
0
    def page_time(self):
        """Derive a publish time from the ``headTag`` list of the document.

        The first ``ul`` with ``class="headTag"`` is passed through
        ``htmlfind.remove_tag`` and encoded to UTF-8 when it is ``unicode``.

        * If the text does not contain the "days ago" marker, the current
          time in milliseconds since the epoch is returned.
        * Otherwise the text must contain three runs of digits; the first
          run is passed (as a string) to
          ``TimeHandler.getTimeOfNDayBefore`` and that result is returned.

        Raises ``Exception`` when the marker is present but the digit
        pattern does not match. When the tag cannot be extracted, the
        document URL and content are logged through ``Log.errorbin`` and
        the original error is re-raised.
        """
        head_tags = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                                     'class="headTag"')
        try:
            text = htmlfind.remove_tag(head_tags[0], 1)
        except:
            # Log for diagnosis, then let the caller see the failure.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise
        if isinstance(text, unicode):
            text = text.encode('utf-8')

        # "天前" is the "days ago" marker.
        if "天前" not in text:
            return int(time.time() * 1000)

        found = re.search('(\d+).*?(\d+).*?(\d+)', text, re.S)
        if found:
            return TimeHandler.getTimeOfNDayBefore(found.group(1))

        raise Exception("not copy time pattern: {}".format(text))
Ejemplo n.º 12
0
 def page_time(self):
     """Format the time shown next to the clock icon of the document.

     Searches the document content for the text between
     ``"icons24 icons24-time"></i>`` and the next ``</span>`` and returns
     the result of ``TimeHandler.fmt_time`` for it, or ``None`` when the
     pattern is not found.
     """
     match = re.search(r'"icons24 icons24-time"></i>(.*?)</span>',
                       self.get_cur_doc().cur_content, re.S)
     if not match:
         return None
     return TimeHandler.fmt_time(match.group(1))
Ejemplo n.º 13
0
 def page_time(self):
     """Return ``TimeHandler.fmt_time`` of today's local date.

     The document itself is not inspected; the current local date is
     rendered as ``YYYY-MM-DD`` and formatted.
     """
     today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
     return TimeHandler.fmt_time(today)