コード例 #1
0
ファイル: parse_xpath.py プロジェクト: haogods/etl_task
def _get_inc_intro(doc):
    """Return the cleaned markup of the first company-introduction block.

    Looks up every ``<div class="tCompany_text_gsjs">`` in ``doc``; only the
    first match is used.  That element is serialized with
    ``etree.tounicode`` and the result is passed through
    ``HtmlFind.remove_tag``, whose return value is returned unchanged.

    Returns an empty string when the page has no such ``<div>``.
    """
    matches = doc.xpath("//div[@class='tCompany_text_gsjs']")
    if not matches:
        return ''

    # Any additional matching blocks are deliberately ignored.
    markup = etree.tounicode(matches[0], pretty_print=True)
    return HtmlFind.remove_tag(markup)
コード例 #2
0
ファイル: parse_with_xpath.py プロジェクト: haogods/etl_task
    def parse(self):
        """Extract the job-posting fields from ``self._doc`` into ``self.result``.

        The page is read in four steps: the header block (``div.cn``), the
        tag box (``div.jtag.inbox``), the job description
        (``div.bmsg.job_msg.inbox``) and the optional company introduction
        (``div.tmsg.inbox``).  ``self.myprint()`` is called at the end.

        Raises:
            Exception: when the header block, the tag box or the job
                description container is missing from the document.
            IndexError: when a mandatory header child (title, location,
                company link, salary) is missing.
            ValueError: when the company line does not split into exactly
                three ``|``-separated parts.
        """
        header_nodes = self._doc.xpath("//div[@class='cn']")
        if not header_nodes:
            raise Exception("find top_info exception")
        header = header_nodes[0]

        # Header: the parenthesised part of the title is removed from the
        # position name.
        title = header.xpath("h1/@title")[0]
        self.result.jobPosition = self.replace_pattern(r'\(.*?\)', '', title)
        self.result.jobWorkLoc = header.xpath("span[@class='lname']")[0].text_content()
        self.result.incName = header.xpath("p[@class='cname']/a/@title")[0]
        self.result.incUrl = header.xpath("p[@class='cname']/a/@href")[0]
        self.result.jobSalary = header.xpath("strong")[0].text_content()

        # Optional company line of the form "type | scale | industry".
        company_line = header.xpath("p[@class='msg ltype']")
        if company_line:
            inc_type, inc_scale, inc_industry = company_line[0].text_content().split('|')
            company_fields = (
                ('incType', inc_type),
                ('incScale', inc_scale),
                ('incIndustry', inc_industry),
            )
            for attr, raw in company_fields:
                setattr(self.result, attr, HtmlFind.remove_tag(raw, 1))

        # Tag box: the first two "sp4" spans are work experience and head
        # count; any further spans are ignored.
        tag_boxes = self._doc.xpath("//div[@class='jtag inbox']")
        if not tag_boxes:
            raise Exception("find mid_info exception")
        tag_box = tag_boxes[0]

        summary_spans = tag_box.xpath("div[@class='t1']/span[@class='sp4']")
        for attr, span in zip(('jobWorkAge', 'jobPersonNumber'), summary_spans):
            setattr(self.result, attr, span.text_content())

        # Welfare tags are concatenated, each followed by a single space
        # (so a non-empty result keeps its trailing space).
        perk_spans = tag_box.xpath("p[@class='t2']/span")
        self.result.jobWelfare = ''.join(
            span.text_content() + ' ' for span in perk_spans)

        desc_nodes = self._doc.xpath("//div[@class='bmsg job_msg inbox']")
        if not desc_nodes:
            raise Exception("get jobdesc exception")

        description = HtmlFind.remove_tag(
            html.tostring(desc_nodes[0], encoding='utf-8'), 1)

        # Everything from the keyword marker onwards is dropped from the
        # stored description.
        keyword_at = description.find('关键字:')
        if keyword_at >= 0:
            description = description[:keyword_at]
        self.result.jobDescription = description

        # The job category is the tail of the (already truncated)
        # description starting at its marker, with the marker removed.
        category_at = description.find('职能类别:')
        if category_at >= 0:
            self.result.jobCate = self.replace_pattern(
                '职能类别:', '', description[category_at:])

        intro_nodes = self._doc.xpath('//div[@class="tmsg inbox"]')
        if intro_nodes:
            self.result.incIntro = HtmlFind.remove_tag(
                html.tostring(intro_nodes[0], encoding='utf-8'), 1)

        self.myprint()
コード例 #3
0
ファイル: parse_xpath.py プロジェクト: haogods/etl_task
# Label text -> attribute of ``ret`` that receives the stripped <dd> text.
_JOB_TEXT_FIELDS = {
    "发布日期:": "pubDate",
    "工作地点:": "jobWorkLoc",
    "招聘人数:": "jobPersonNumber",
    "工作年限:": "jobWorkAge",
    "学历要求:": "jobDiploma",
    "薪资范围:": "jobSalary",
}

# Label text -> (attribute of ``ret``, XPath of the <dd> children whose
# texts are comma-joined into that attribute).
_JOB_LIST_FIELDS = {
    "薪酬福利:": ("jobWelfare", "span"),
    "职能类别:": ("jobCate", "a"),
    "职位标签:": ("jobTags", "a"),
}


def _join_child_texts(node, child_path):
    """Return the stripped texts of ``node``'s ``child_path`` matches, comma-joined."""
    return ",".join(
        child.text_content().strip() for child in node.xpath(child_path))


def _set_job_others(ret, doc):
    """Fill the label/value job fields, description and position on ``ret``.

    Args:
        ret: result object; attributes are assigned on it in place.
        doc: parsed page exposing ``xpath()`` (elements must provide
            ``text_content()``).

    Every ``<dl class="lineDl">`` under ``div.tCompany_basic_job`` is read
    as ``<dt>`` label / ``<dd>`` value pairs.  Recognised labels set the
    matching attribute; unrecognised labels are ignored.  ``<dt>`` and
    ``<dd>`` are paired positionally, and a trailing ``<dt>`` without a
    ``<dd>`` is skipped instead of raising ``IndexError``.

    ``ret.jobDescription`` is always assigned (built from the ``<ul>``
    elements under ``div.tCompany_text`` and passed through
    ``HtmlFind.remove_tag``).  ``ret.jobPosition`` is always assigned and is
    ``''`` when no ``li.tCompany_job_name`` exists.
    """
    rows = doc.xpath('//div[@class="tCompany_basic_job"]/dl[@class="lineDl"]')
    for row in rows:
        # zip() stops at the shorter list, so an unmatched <dt> cannot
        # index past the end of the <dd> list.
        for label_node, value_node in zip(row.xpath("dt"), row.xpath("dd")):
            key = label_node.text_content().strip()
            value = value_node.text_content().strip()

            # Normalise the label to the native ``str`` type so that it
            # compares equal to the literals in the lookup tables: on
            # Python 2 this encodes ``unicode`` to UTF-8 bytes, on Python 3
            # the label already is ``str`` and is left untouched.
            if not isinstance(key, str):
                key = key.encode("utf8")

            attr = _JOB_TEXT_FIELDS.get(key)
            if attr is not None:
                setattr(ret, attr, value)
                continue

            list_spec = _JOB_LIST_FIELDS.get(key)
            if list_spec is not None:
                attr, child_path = list_spec
                setattr(ret, attr, _join_child_texts(value_node, child_path))

    # Job description: concatenated markup of every matching <ul>.
    description_nodes = doc.xpath("//div[@class='tCompany_text']/ul")
    markup = "".join(
        etree.tounicode(node, pretty_print=True) for node in description_nodes)
    ret.jobDescription = HtmlFind.remove_tag(markup)

    # Job position: first matching <li>, or an empty string when absent.
    name_nodes = doc.xpath("//li[@class='tCompany_job_name']")
    ret.jobPosition = name_nodes[0].text_content().strip() if name_nodes else ''