Beispiel #1
0
def extract_content(doc):
    """Collect the paragraph text of the company-introduction block.

    ``htmlfind.findTag`` is asked for ``div`` elements matching
    ``class="company-introduction clearfix"`` in ``doc``.  Only the first hit
    is used: the body of every ``<p>`` element in it is passed through
    ``htmlfind.remove_tag`` and followed by a ``#`` separator.

    Returns the concatenated text, UTF-8 encoded when it is a ``unicode``
    object, or an empty string when no matching ``div`` is found.
    """
    intro_divs = htmlfind.findTag(doc, 'div',
                                  'class="company-introduction clearfix"')
    pieces = []
    if intro_divs:
        paragraphs = re.findall(r'<p[^<>]*>(.*?)</p>', intro_divs[0], re.S)
        for paragraph in paragraphs:
            pieces.append(htmlfind.remove_tag(paragraph, True) + "#")
    content = ''.join(pieces)
    return content.encode('utf-8') if isinstance(content, unicode) else content
Beispiel #2
0
 def extract_content(self):
     """Collect the span text of the ``company_intro_text`` block.

     Searches the current document's content for ``div`` elements matching
     ``class="company_intro_text"``.  From the first hit, the body of every
     ``<span>`` element is passed through ``htmlfind.remove_tag`` and
     followed by a ``#`` separator.

     Returns the concatenated text, UTF-8 encoded when it is a ``unicode``
     object, or an empty string when no matching ``div`` is found.
     """
     intro_divs = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                   'class="company_intro_text"')
     if not intro_divs:
         return ''

     span_bodies = re.findall(r'<span[^<>]*>(.*?)</span>', intro_divs[0], re.S)
     content = ''.join(
         htmlfind.remove_tag(body, True) + "#" for body in span_bodies)
     if isinstance(content, unicode):
         return content.encode('utf-8')
     return content
Beispiel #3
0
 def extract_content(self):
     """Gather the paragraph text found inside the ``class="in"`` div.

     Only the first ``div`` returned by ``htmlfind.findTag`` is inspected.
     The body of each ``<p>`` element in it is passed through
     ``htmlfind.remove_tag`` and followed by a ``#`` separator.

     Returns the concatenated text, UTF-8 encoded when it is a ``unicode``
     object, or an empty string when no matching ``div`` is found.
     """
     containers = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                   'class="in"')
     text = ''
     if containers:
         paragraphs = re.findall(r'<p[^<>]*>(.*?)</p>', containers[0], re.S)
         text = ''.join([htmlfind.remove_tag(par, True) + "#"
                         for par in paragraphs])
     return text.encode('utf-8') if isinstance(text, unicode) else text
Beispiel #4
0
    def extract_content(self):
        """Return the tag-stripped text of the first ``posMsg borb`` div.

        The current document's content is searched with ``htmlfind.findTag``
        and the first hit is passed through ``htmlfind.remove_tag``.

        Returns:
            The value produced by ``htmlfind.remove_tag``, or ``None`` when
            the lookup or the stripping fails (for example because nothing
            matched).  Failures are reported through ``Log.errorbin``
            together with the document URL and content.
        """
        matches = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                   'posMsg borb')
        try:
            return htmlfind.remove_tag(matches[0], 1)
        except Exception:
            # Best effort: any ordinary failure yields None.  Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            return None
Beispiel #5
0
 def run_job(self, jobid):
     """Fetch one job page and store it, re-queueing the job on failure.

     Args:
         jobid: identifier substituted into the job page URL.

     Nothing is done when ``self.page_store.check_should_fetch`` rejects the
     job.  Otherwise the page is requested; a ``None`` response re-adds the
     job through ``self.re_add_job`` and logs an error, while any other
     response is saved through ``self.page_store.save``.
     """
     if not self.page_store.check_should_fetch(jobid):
         return
     url = "http://www.lagou.com/jobs/{}.html".format(jobid)
     res = self.speed_control_requests.with_sleep_requests(url, sleep=0.1)
     # The response must be checked before it is touched: reading res.text
     # on a failed request would raise AttributeError and skip the retry.
     if res is None:
         self.re_add_job(jobid)
         Log.error("failed get url", url)
         return
     if htmlfind.findTag(res.text, 'div', 'position_del'):
         # NOTE(review): the page is still saved below even when this marker
         # is present -- confirm that this is intended.
         print("jobid: {} match nothing".format(jobid))
     self.page_store.save(int(time.time()), jobid, url, res.text)
Beispiel #6
0
    def extract_content(self):
        """Flatten the cells of the ``class="i_table"`` table into one string.

        The first table returned by ``htmlfind.findTag`` is split into
        ``<tr>`` rows and each row into ``<td>`` cells.  Every cell body is
        passed through ``htmlfind.remove_tag`` and followed by a ``#``
        separator.

        The result (UTF-8 encoded when it is a ``unicode`` object) is printed
        and then returned; it is an empty string when no table is found.
        """
        tables = htmlfind.findTag(self.get_cur_doc().cur_content, 'table', 'class="i_table"')
        cells = []
        if tables:
            for row in re.findall(r'<tr[^<>]*>(.*?)</tr>', tables[0], re.S):
                for cell in re.findall(r'<td[^<>]*>(.*?)</td>', row, re.S):
                    cells.append(htmlfind.remove_tag(cell, True) + "#")
        content = ''.join(cells)

        if isinstance(content, unicode):
            content = content.encode('utf-8')
        print(content)
        return content
Beispiel #7
0
    def page_time(self):
        """Extract the publication time of the current document.

        The content is UTF-8 encoded first when it is a ``unicode`` object.
        Two layouts are tried in order:

        1. a ``<dd class="text_dd">`` value following the publish-date label;
        2. a date followed by the "published" marker inside the first
           ``class="jtag inbox"`` div.

        Returns the result of ``TimeHandler.fmt_time`` for the first layout
        that matches, or ``None`` when neither does.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        labelled = re.search(r'发布日期:</dt>.*?<dd class="text_dd">(.*?)</dd>', page, re.S)
        if labelled:
            return TimeHandler.fmt_time(labelled.group(1))

        boxes = htmlfind.findTag(page, 'div', 'class="jtag inbox"')
        if not boxes:
            return None
        stamp = re.search(r'(\d*-?\d+-\d+发布)', boxes[0])
        if not stamp:
            return None
        return TimeHandler.fmt_time(stamp.group(1))
Beispiel #8
0
    def page_time(self):
        """Extract the publication time from the ``last-modified`` span.

        The document content is UTF-8 encoded first when it is a ``unicode``
        object.  The first ``span`` matching ``class="last-modified"`` is
        passed through ``htmlfind.remove_tag`` and the resulting text (again
        UTF-8 encoded if needed) is handed to ``TimeHandler.fmt_time``.

        Any failure while locating or stripping the span is logged through
        ``Log.errorbin`` and then re-raised unchanged.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        stamps = htmlfind.findTag(page, 'span', 'class="last-modified"')
        try:
            text = htmlfind.remove_tag(stamps[0], 1)
        except:
            # Log the offending page, then let the original error propagate.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise

        return TimeHandler.fmt_time(
            text.encode('utf-8') if isinstance(text, unicode) else text)
Beispiel #9
0
    def extract_content(self):
        """Build the job text from the ``job_request`` spans and ``job_bt``.

        For every ``dd`` matching ``class="job_request"`` the bodies of its
        ``<span>`` elements are joined with ``#`` and appended to the result.
        The text of the ``class="job_bt"`` node (``get_text()``) is appended
        last.

        Returns:
            The combined text, or ``None`` when appending the ``job_bt``
            text fails; that failure is reported through ``Log.errorbin``.
        """
        content = ''
        jobbt = spider.util.htmlfind(self.get_cur_doc().cur_content,
                                     'class="job_bt"', 0)
        job_request = htmlfind.findTag(self.get_cur_doc().cur_content, 'dd',
                                       'class="job_request"')
        for node in job_request:
            content += '#'.join(re.findall(r'<span[^<>]*>(.*?)</span>', node))
            # NOTE(review): encoding inside the loop means a later iteration
            # may add unicode to an already encoded str -- confirm the
            # input types make that safe.
            if isinstance(content, unicode):
                content = content.encode('utf-8')
        try:
            content += jobbt.get_text()
        except Exception:
            # Best effort: any ordinary failure yields None.  Catching
            # Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit propagate.
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            return None
        return content
Beispiel #10
0
    def extract_content(self):
        """Combine the job tags with the job description text.

        The first part comes from the first ``class="jtag inbox">`` div: every
        ``<span>`` body except the last one is passed through
        ``htmlfind.remove_tag`` and followed by ``#``.  The second part is the
        tag-stripped node located by ``<div class="bmsg job_msg inbox">``.

        Both parts are UTF-8 encoded when they are ``unicode`` objects, and
        their concatenation is returned.
        """
        tag_boxes = htmlfind.findTag(self.get_cur_doc().cur_content, 'div', 'class="jtag inbox">')
        labels = []
        if tag_boxes:
            found = re.findall(r'<span[^<>]*>(.*?)</span>', tag_boxes[0], re.S)
            # The last span is dropped: it holds the update time.
            for label in found[:-1]:
                labels.append(htmlfind.remove_tag(label, True) + "#")
        content = ''.join(labels)

        if isinstance(content, unicode):
            content = content.encode('utf-8')

        finder = htmlfind(self.get_cur_doc().cur_content, '<div class="bmsg job_msg inbox">', 0)
        description = htmlfind.remove_tag(finder.get_node(), 1)

        if isinstance(description, unicode):
            description = description.encode('utf-8')

        return content + description
Beispiel #11
0
    def extract_content(self):

        content = ''
        uls = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                               'class="terminal-ul clearfix"')
        if len(uls):
            strongs = re.findall(r'<strong[^<>]*>(.*?)</strong>', uls[0], re.S)
            for index, strong in enumerate(strongs):
                if 2 == index:  # updateTime 忽略
                    continue
                content += htmlfind.remove_tag(strong, True) + "#"

        m = re.search(ur'''<div class="tab-inner-cont">(.*?)</button>''',
                      self.get_cur_doc().cur_content, re.S)
        if m:
            a = re.sub(ur'<[a-zA-Z/!][^<>]*>', '', m.group(1))
            content += a.strip()
            return content

        Log.error(self.get_cur_doc().cur_url, "no content")
        return None
Beispiel #12
0
    def page_time(self):
        """Derive the publication time from the ``class="headTag"`` list.

        The first matching ``ul`` is passed through ``htmlfind.remove_tag``
        (failures are logged through ``Log.errorbin`` and re-raised) and the
        text is UTF-8 encoded when it is a ``unicode`` object.

        Returns:
            The current time in milliseconds when the text lacks the
            "days ago" marker tested below; otherwise the result of
            ``TimeHandler.getTimeOfNDayBefore`` for the first number found.

        Raises:
            Exception: the marker is present but the text does not contain
                the three numbers the pattern requires.
        """
        head = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                                'class="headTag"')
        try:
            text = htmlfind.remove_tag(head[0], 1)
        except:
            # Log the offending page, then let the original error propagate.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise
        if isinstance(text, unicode):
            text = text.encode('utf-8')

        if "天前" in text:
            numbers = re.search(r'(\d+).*?(\d+).*?(\d+)', text, re.S)
            if numbers:
                # Only the first captured number is used as the day count.
                return TimeHandler.getTimeOfNDayBefore(numbers.group(1))
            raise Exception("not copy time pattern: {}".format(text))

        return int(time.time() * 1000)
Beispiel #13
0
#!/usr/bin/env python
# -*- coding:utf8 -*-

import re
from spider.util import htmlfind, TimeHandler

# Scratch script: runs the tag / description / publish-time extraction steps
# against a locally saved page (test.html) and prints the publish-time match.
# Everything below runs inside the ``with`` block, so the file stays open
# until the script finishes.
with open('test.html', 'rb') as f:
    content = f.read()

    # Tag spans from the first 'jtag inbox' div are appended to ``content``
    # itself, i.e. to the raw page bytes read above.
    divs = htmlfind.findTag(content, 'div', 'class="jtag inbox">')
    if divs:
        spans = re.findall(r'<span[^<>]*>(.*?)</span>', divs[0], re.S)
        if spans:
            spans = spans[:-1]  # ignore the update time
            for span in spans:
                content += htmlfind.remove_tag(span, True) + "#"

    if isinstance(content, unicode):
        content = content.encode('utf-8')

    # Job description node; ``t2`` is computed but not used afterwards.
    hf = htmlfind(content, '<div class="bmsg job_msg inbox">', 0)
    t2 = htmlfind.remove_tag(hf.get_node(), 1)

    # ``find`` is only referenced by the commented-out print below.
    find = re.search(r'tCompany_text">(.*?)</div>', content, re.S)
    # print htmlfind.remove_tag(find.group(1), 1)
    # re.search returns None when nothing matches, in which case the
    # s.group(1) call on the next line raises AttributeError.
    s = re.search(r'(\d*-?\d+-\d+发布)', content, re.S)
    print htmlfind.remove_tag(s.group(1), True)
Beispiel #14
0
 def extract_content(self):
     """Return the tag-stripped text of the job description div.

     Looks for ``div`` elements matching
     ``class="rich-text" itemprop="description"`` in the current document.
     When at least one is found, the first is passed through
     ``htmlfind.remove_tag`` and that result is returned.  Otherwise the
     (empty) value returned by ``htmlfind.findTag`` is handed back unchanged.
     """
     found = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                              'class="rich-text" itemprop="description"')
     if not found or len(found) <= 0:
         # Nothing matched: return findTag's own result as-is.
         return found
     return htmlfind.remove_tag(found[0], 1)