def parse(doc):
    """Build a ``JdRaw`` record from the raw bytes of a page.

    Args:
        doc: Encoded page content; it is decoded as UTF-8 before parsing.

    Returns:
        The ``JdRaw`` instance created here, after the company name, intro
        and URL have been assigned and the two ``_set_*`` helpers have run.
    """
    # ``html`` is presumably lxml.html -- confirm against the file's imports.
    tree = html.fromstring(doc.decode('utf-8'))

    ret = JdRaw()
    ret.incName = _get_inc_name(tree)
    ret.incIntro = _get_inc_intro(tree)
    ret.incUrl = _get_inc_url(tree)

    # The remaining company and job attributes are written onto ``ret``
    # by these helpers, which receive the record and the parsed tree.
    _set_inc_others(ret, tree)
    _set_job_others(ret, tree)
    return ret
def __init__(self, doc):
    """Run ``BaseParse`` initialisation with ``doc`` and attach an empty ``JdRaw``.

    Args:
        doc: Passed through unchanged to ``BaseParse.__init__``.
    """
    # Explicit base-class call kept as written; the enclosing class is not
    # visible here, so no assumption is made that super() would work.
    BaseParse.__init__(self, doc)

    # Fresh record that the parsing methods fill in.
    self.result = JdRaw()
class Jd51jobParse(BaseParse):
    """Fill a ``JdRaw`` record from a job-detail page.

    The page tree is read from ``self._doc`` and the regex helper
    ``self.replace_pattern`` is used; neither is defined in this class, so
    both are presumably provided by ``BaseParse`` (confirm there).
    """

    def __init__(self, doc):
        """Initialise ``BaseParse`` with ``doc`` and start from an empty ``JdRaw``."""
        BaseParse.__init__(self, doc)
        self.result = JdRaw()

    def myprint(self):
        """Print whatever ``self.result.to_json()`` returns."""
        # A parenthesised single argument prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(self.result.to_json())

    def parse(self):
        """Populate ``self.result`` from the page, then print it.

        The page is read in four steps: header block, requirements block,
        job description, company introduction.

        Raises:
            Exception: If the header, requirements or description container
                is not found in the document.
            IndexError: If an expected node inside the header is missing.
            ValueError: If the company facts line does not split into
                exactly three ``|``-separated parts.
        """
        self._extract_header()
        self._extract_requirements()
        self._extract_description()
        self._extract_company_intro()
        self.myprint()

    def _extract_header(self):
        """Read position, location, company link, salary and company facts."""
        headers = self._doc.xpath("//div[@class='cn']")
        if not headers:
            raise Exception("find top_info exception")
        header = headers[0]

        title = header.xpath("h1/@title")[0]
        # The pattern matches parenthesised fragments; they are replaced
        # with an empty string by ``replace_pattern``.
        self.result.jobPosition = self.replace_pattern('\(.*?\)', '', title)
        self.result.jobWorkLoc = header.xpath("span[@class='lname']")[0].text_content()
        self.result.incName = header.xpath("p[@class='cname']/a/@title")[0]
        self.result.incUrl = header.xpath("p[@class='cname']/a/@href")[0]
        self.result.jobSalary = header.xpath("strong")[0].text_content()

        company_facts = header.xpath("p[@class='msg ltype']")
        if company_facts:
            # Unpacking requires exactly three segments; any other count
            # raises ValueError, as it always has.
            kind, scale, industry = company_facts[0].text_content().split('|')
            self.result.incType = HtmlFind.remove_tag(kind, 1)
            self.result.incScale = HtmlFind.remove_tag(scale, 1)
            self.result.incIndustry = HtmlFind.remove_tag(industry, 1)

    def _extract_requirements(self):
        """Read work experience, head count and the welfare tags."""
        boxes = self._doc.xpath("//div[@class='jtag inbox']")
        if not boxes:
            raise Exception("find mid_info exception")
        box = boxes[0]

        spans = box.xpath("div[@class='t1']/span[@class='sp4']")
        # zip() stops at the shorter input, so at most the first two spans
        # are consumed and extra spans are ignored.
        for field, span in zip(('jobWorkAge', 'jobPersonNumber'), spans):
            setattr(self.result, field, span.text_content())

        tags = box.xpath("p[@class='t2']/span")
        # Every tag text is followed by one space, including the last one.
        self.result.jobWelfare = ''.join(tag.text_content() + ' ' for tag in tags)

    def _extract_description(self):
        """Read the job description and, when present, the job category."""
        containers = self._doc.xpath("//div[@class='bmsg job_msg inbox']")
        if not containers:
            raise Exception("get jobdesc exception")

        markup = html.tostring(containers[0], encoding='utf-8')
        description = HtmlFind.remove_tag(markup, 1)

        # Cut the text at the "keywords:" label when the page has one.
        keyword_at = description.find('关键字:')
        if keyword_at != -1:
            description = description[:keyword_at]
        self.result.jobDescription = description

        # Text from the "job function category:" label onwards becomes the
        # category, with the label itself removed via ``replace_pattern``.
        category_at = description.find('职能类别:')
        if category_at != -1:
            category = description[category_at:]
            self.result.jobCate = self.replace_pattern('职能类别:', '', category)

    def _extract_company_intro(self):
        """Read the company introduction; leave it unset when the block is absent."""
        sections = self._doc.xpath('//div[@class="tmsg inbox"]')
        if not sections:
            return
        markup = html.tostring(sections[0], encoding='utf-8')
        self.result.incIntro = HtmlFind.remove_tag(markup, 1)