Ejemplo n.º 1
0
    def extract_content(self):
        """Return the text content extracted from the current document.

        If the page contains a ``tCompany_text`` block, that block (passed
        through ``htmlfind.remove_tag``) is returned on its own. Otherwise the
        result is the ``#``-terminated span texts of the first
        ``jtag inbox`` div followed by the text of the
        ``bmsg job_msg inbox`` div. Unicode pieces are encoded to UTF-8
        before they are joined.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        company = re.search(r'tCompany_text">(.*?)</div>', page, re.S)
        if company:
            return htmlfind.remove_tag(company.group(1), True)

        content = ''
        tag_divs = htmlfind.findTag(page, 'div', 'class="jtag inbox">')
        if tag_divs:
            labels = re.findall(r'<span[^<>]*>(.*?)</span>', tag_divs[0], re.S)
            # The last span is skipped: it is the update time.
            content = ''.join(
                htmlfind.remove_tag(label, True) + "#" for label in labels[:-1])

        if isinstance(content, unicode):
            content = content.encode('utf-8')

        # The job message is looked up in the document's own cur_content
        # (not the UTF-8 encoded copy used above).
        finder = htmlfind(self.get_cur_doc().cur_content,
                          '<div class="bmsg job_msg inbox">', 0)
        message = htmlfind.remove_tag(finder.get_node(), 1)
        if isinstance(message, unicode):
            message = message.encode('utf-8')

        return content + message
Ejemplo n.º 2
0
    def extract_content(self):
        """Return the text of the resume table that carries the industry field.

        Scans every ``<table>`` of the current document:

        * the first table containing ``所在行业:`` is passed through
          ``htmlfind.remove_tag`` and becomes the content;
        * a table containing ``Industry:`` marks an English page: the current
          ``cur_jdid`` is handed to ``self._save_not_need_cv`` and scanning
          stops with the content left empty.

        After the scan, pages showing one of the two "not open" notices yield
        ``None``; for the enterprise notice the ``cur_jdid`` is also appended
        to ``self._not_access_by_qiye``.

        Returns:
            The extracted text, ``''`` when no industry table was found, or
            ``None`` for an inaccessible resume.
        """
        cur_content = self.get_cur_doc().cur_content
        if isinstance(cur_content, unicode):
            cur_content = cur_content.encode('utf-8')

        content = ''
        for table in htmlfind.findTag(cur_content, 'table'):
            if r'所在行业:' in table:
                content = htmlfind.remove_tag(table, True)
                break
            # NOTE(review): only one spelling of the English marker is tested;
            # if a variant (e.g. a full-width colon) must also be recognised,
            # add it here.
            if r'Industry:' in table:
                print("Ignore..... is English page!")
                self._save_not_need_cv(self.get_cur_doc().cur_jdid)
                break

        # Notice shown when the resume is closed to headhunters.
        if r'抱歉,该简历已经设置为对猎头顾问不开放!' in cur_content:
            print("Ignore..... can not access by lietou")
            return None
        # Notice shown when the resume is closed to enterprises.
        if r'该简历人才已经设置了对企业不开放简历,可能该人才已经找到工作,或者暂时没有换工作的意向。' in cur_content:
            print("Ignore..... can not access by qiye")
            self._not_access_by_qiye.append(self.get_cur_doc().cur_jdid)
            return None

        return content
Ejemplo n.º 3
0
def extract_content(doc):
    """Collect the paragraph texts of the first ``class="in"`` div of *doc*.

    Every ``<p>`` body found in that div is passed through
    ``htmlfind.remove_tag`` and terminated with ``#``. An empty string is
    returned when no such div exists; a unicode result is encoded to UTF-8.
    """
    blocks = htmlfind.findTag(doc, 'div', 'class="in"')
    if blocks:
        paragraphs = re.findall(r'<p[^<>]*>(.*?)</p>', blocks[0], re.S)
    else:
        paragraphs = []

    content = ''.join(htmlfind.remove_tag(p, True) + "#" for p in paragraphs)
    if isinstance(content, unicode):
        return content.encode('utf-8')
    return content
Ejemplo n.º 4
0
def extract_content(doc):
    """Collect the span texts of the first ``company_intro_text`` div of *doc*.

    Every ``<span>`` body found in that div is passed through
    ``htmlfind.remove_tag`` and terminated with ``#``. An empty string is
    returned when no such div exists; a unicode result is encoded to UTF-8.
    """
    intro_divs = htmlfind.findTag(doc, 'div', 'class="company_intro_text"')
    if not intro_divs:
        pieces = []
    else:
        pieces = [
            htmlfind.remove_tag(body, True) + "#"
            for body in re.findall(r'<span[^<>]*>(.*?)</span>',
                                   intro_divs[0], re.S)
        ]

    content = ''.join(pieces)
    if isinstance(content, unicode):
        content = content.encode('utf-8')
    return content
Ejemplo n.º 5
0
    def extract_content(self):
        """Return the contact list text followed by the job-intention field.

        The first ``contact-list`` ``<ul>`` is mandatory: without it the
        method returns ``None``. The first ``field`` div containing
        ``求职意向`` (job intention) is appended when present.

        Returns:
            The concatenated text (both parts passed through
            ``htmlfind.remove_tag``), or ``None`` when the page has no
            contact list.
        """
        cur_content = self.get_cur_doc().cur_content
        if isinstance(cur_content, unicode):
            cur_content = cur_content.encode('utf-8')

        contact_lists = htmlfind.findTag(cur_content, 'ul', 'contact-list')
        if not contact_lists:
            return None
        contact_text = htmlfind.remove_tag(contact_lists[0], True)

        # Keep the matched text in its own variable: when no field matches,
        # the result must not fall back to the whole list of candidate divs.
        intention_text = ''
        for field in htmlfind.findTag(cur_content, 'div', 'field'):
            if r'求职意向' in field:
                intention_text = htmlfind.remove_tag(field, True)
                break

        return contact_text + intention_text
Ejemplo n.º 6
0
    def extract_content(self):
        """Return the tag texts plus the job message of the current document.

        The result is the ``#``-terminated span texts of the first
        ``jtag inbox`` div (last span excluded) followed by the text of the
        ``bmsg job_msg inbox`` div. Unicode pieces are encoded to UTF-8
        before they are joined.
        """
        tags_text = ''
        tag_divs = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                    'class="jtag inbox">')
        if tag_divs:
            labels = re.findall(r'<span[^<>]*>(.*?)</span>', tag_divs[0], re.S)
            # The last span is skipped: it is the update time.
            for label in labels[:-1]:
                tags_text += htmlfind.remove_tag(label, True) + "#"
        if isinstance(tags_text, unicode):
            tags_text = tags_text.encode('utf-8')

        finder = htmlfind(self.get_cur_doc().cur_content,
                          '<div class="bmsg job_msg inbox">', 0)
        message = htmlfind.remove_tag(finder.get_node(), 1)
        if isinstance(message, unicode):
            message = message.encode('utf-8')

        return tags_text + message
Ejemplo n.º 7
0
    def extract_content(self):
        """Return the text of the first ``posMsg borb`` div of the document.

        Returns:
            The div passed through ``htmlfind.remove_tag``, or ``None`` when
            the div is missing or cannot be processed; in that case the URL
            and the raw content are reported through ``Log.errorbin``.
        """
        matches = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                                   'posMsg borb')
        try:
            return htmlfind.remove_tag(matches[0], 1)
        except Exception:
            # Deliberately broad (any extraction failure means "no content"),
            # but no longer swallows KeyboardInterrupt / SystemExit.
            Log.errorbin("invalid jd content %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            return None
Ejemplo n.º 8
0
    def extract_content(self):
        """Return the job-description text of the current document.

        The description is the ``content content-word`` block that follows
        the ``职位描述:`` label inside the ``job-main main-message`` section.
        Returns ``None`` when the page does not match that layout.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        description = re.search(
            r'"job-main main-message ">.*?职位描述:.*?"content content-word">(.*?)</div>',
            page, re.S)
        if not description:
            return None
        return htmlfind.remove_tag(description.group(1), 1)
Ejemplo n.º 9
0
    def extract_content(self):
        """Return the text of the ``comTinyDes`` table of the current document.

        Any exception raised while reading or cleaning the table text is
        printed together with the document's ``cur_jdid`` and turned into a
        ``None`` result.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        finder = htmlfind(page, '<table class="comTinyDes">', 0)
        try:
            return htmlfind.remove_tag(finder.get_text(), 1)
        except Exception as e:
            print("co_id: %s, exception: %r" % (self.get_cur_doc().cur_jdid, e))
            return None
Ejemplo n.º 10
0
    def extract_content(self):
        """Return the cell texts of the first ``i_table`` table.

        Cells are visited row by row; each ``<td>`` body is passed through
        ``htmlfind.remove_tag`` and terminated with ``#``. The result (UTF-8
        encoded when unicode, empty when the table is missing) is printed
        before it is returned.
        """
        cells = []
        tables = htmlfind.findTag(self.get_cur_doc().cur_content, 'table',
                                  'class="i_table"')
        if tables:
            for row in re.findall(r'<tr[^<>]*>(.*?)</tr>', tables[0], re.S):
                for cell in re.findall(r'<td[^<>]*>(.*?)</td>', row, re.S):
                    cells.append(htmlfind.remove_tag(cell, True) + "#")

        content = ''.join(cells)
        if isinstance(content, unicode):
            content = content.encode('utf-8')
        print(content)
        return content
Ejemplo n.º 11
0
    def extract_content(self):
        """Return the text of the ``inforBase`` div of the current document.

        Any exception raised while reading or cleaning the div text is
        printed together with the document's ``cur_jdid`` and turned into a
        ``None`` result.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        finder = htmlfind(page, '<div class="inforBase">', 0)
        try:
            return htmlfind.remove_tag(finder.get_text(), True)
        except Exception as e:
            print("cv_id: %s, exception: %r" % (self.get_cur_doc().cur_jdid, e))
            return None
Ejemplo n.º 12
0
    def page_time(self):
        """Return the page time taken from the ``last-modified`` span.

        The span text is passed through ``htmlfind.remove_tag``, encoded to
        UTF-8 when unicode, and handed to ``TimeHandler.fmt_time``. If the
        span cannot be extracted, the URL and raw content are reported
        through ``Log.errorbin`` and the original exception propagates.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        spans = htmlfind.findTag(page, 'span', 'class="last-modified"')
        try:
            stamp = htmlfind.remove_tag(spans[0], 1)
        except:
            # Bare except is safe here: the failure is only logged and
            # always re-raised.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise

        if isinstance(stamp, unicode):
            stamp = stamp.encode('utf-8')
        return TimeHandler.fmt_time(stamp)
Ejemplo n.º 13
0
    def extract_content(self):
        """Return the text of the job's ``j-edit`` definition list.

        Pages carrying the "page may have been deleted or is temporarily
        unavailable" notice are reported on stdout and yield ``None``.
        Any exception raised while reading or cleaning the list text is
        printed together with the document's ``cur_jdid`` and also yields
        ``None``.
        """
        page = self.get_cur_doc().cur_content
        if isinstance(page, unicode):
            page = page.encode('utf-8')

        if r'你查找的页面可能已被删除、或暂时不可用!' in page:
            print("jobid: %r, is not valid or deleted" %
                  self.get_cur_doc().cur_jdid)
            return None

        finder = htmlfind(page, '<dl class="j-edit hasVist dlli mb10">', 0)
        try:
            return htmlfind.remove_tag(finder.get_text(), 1)
        except Exception as e:
            print("co_id: %s, exception: %r" % (self.get_cur_doc().cur_jdid, e))
            return None
Ejemplo n.º 14
0
    def extract_content(self):
        """Return the summary fields plus the detail text of the document.

        The ``<strong>`` texts of the first ``terminal-ul clearfix`` list
        are collected (each terminated with ``#``), skipping the third one,
        which is the update time. The text between the ``tab-inner-cont``
        div and the next ``</button>`` is stripped of tags and appended.
        Returns ``None`` (after logging through ``Log.error``) when that
        detail section is missing.
        """
        summary = ''
        lists = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                                 'class="terminal-ul clearfix"')
        if len(lists) > 0:
            fields = re.findall(r'<strong[^<>]*>(.*?)</strong>', lists[0], re.S)
            for position, field in enumerate(fields):
                if position != 2:  # index 2 is the update time
                    summary += htmlfind.remove_tag(field, True) + "#"

        detail = re.search(u'''<div class="tab-inner-cont">(.*?)</button>''',
                           self.get_cur_doc().cur_content, re.S)
        if not detail:
            Log.error(self.get_cur_doc().cur_url, "no content")
            return None

        plain = re.sub(u'<[a-zA-Z/!][^<>]*>', '', detail.group(1))
        summary += plain.strip()
        return summary
Ejemplo n.º 15
0
    def page_time(self):
        """Return the page time derived from the ``headTag`` list.

        The list text is passed through ``htmlfind.remove_tag`` and encoded
        to UTF-8 when unicode. When it does not mention ``天前`` ("days
        ago") the current time in milliseconds is returned. Otherwise the
        first number of the text is passed to
        ``TimeHandler.getTimeOfNDayBefore``.

        Raises:
            Exception: when the text mentions ``天前`` but does not contain
                three numbers. Extraction failures are logged through
                ``Log.errorbin`` and re-raised unchanged.
        """
        tag = htmlfind.findTag(self.get_cur_doc().cur_content, 'ul',
                               'class="headTag"')
        try:
            tag = htmlfind.remove_tag(tag[0], 1)
        except:
            # Bare except is safe here: the failure is only logged and
            # always re-raised.
            Log.errorbin("invalid jd pubtime %s" % self.get_cur_doc().cur_url,
                         self.get_cur_doc().cur_content)
            raise
        if isinstance(tag, unicode):
            tag = tag.encode('utf-8')

        if "天前" not in tag:
            return int(time.time() * 1000)

        # Raw literal: same pattern text, without relying on the invalid
        # "\d" string escape. Three numbers must be present, only the first
        # one (the day count) is used.
        find = re.search(r'(\d+).*?(\d+).*?(\d+)', tag, re.S)
        if find:
            return TimeHandler.getTimeOfNDayBefore(find.group(1))

        raise Exception("not copy time pattern: {}".format(tag))
Ejemplo n.º 16
0
#!/usr/bin/env python
# -*- coding:utf8 -*-

import re
from spider.util import htmlfind, TimeHandler

# Scratch driver: runs the job-page extraction steps against a local
# ``test.html`` and prints the publish-date fragment found in it.
with open('test.html', 'rb') as f:
    content = f.read()

    # NOTE(review): ``content`` holds the raw page and is also the accumulator
    # below, so the extracted span texts are appended to the HTML itself and
    # every later search runs over that combined string -- confirm intended.
    divs = htmlfind.findTag(content, 'div', 'class="jtag inbox">')
    if divs:
        spans = re.findall(r'<span[^<>]*>(.*?)</span>', divs[0], re.S)
        if spans:
            spans = spans[:-1]  # ignore the update time
            for span in spans:
                content += htmlfind.remove_tag(span, True) + "#"

    if isinstance(content, unicode):
        content = content.encode('utf-8')

    # NOTE(review): ``t2`` and ``find`` are computed but not used afterwards.
    hf = htmlfind(content, '<div class="bmsg job_msg inbox">', 0)
    t2 = htmlfind.remove_tag(hf.get_node(), 1)

    find = re.search(r'tCompany_text">(.*?)</div>', content, re.S)
    # print htmlfind.remove_tag(find.group(1), 1)
    # NOTE(review): ``s`` is None when the pattern does not match, in which
    # case ``s.group(1)`` raises AttributeError.
    s = re.search(r'(\d*-?\d+-\d+发布)', content, re.S)
    print htmlfind.remove_tag(s.group(1), True)
Ejemplo n.º 17
0
 def extract_content(self):
     """Return the text of the first ``rich-text`` description div.

     When ``htmlfind.findTag`` yields nothing, its (falsy) result is
     returned unchanged; otherwise the first match is passed through
     ``htmlfind.remove_tag``.
     """
     found = htmlfind.findTag(self.get_cur_doc().cur_content, 'div',
                              'class="rich-text" itemprop="description"')
     if not found:
         return found
     return htmlfind.remove_tag(found[0], 1)