def construct_jd_raw(item):
    """Build a ``JdRaw`` record from the attributes of ``item``.

    ``jdId``, ``jdFrom`` and ``jdUrl`` are copied across unchanged.  The
    publication date and the populated company/job fields are wrapped in a
    ``DtString`` whose second argument is ``getFlag`` of the same value.
    Fields marked "not needed for quantification" and the whole remedy
    section are filled with empty ``DtString()`` placeholders.

    Args:
        item: Object exposing the ``jd*``, ``pubDate``, ``inc*`` and ``job*``
            attributes read below.

    Returns:
        The populated ``JdRaw`` with ``jdInc``, ``jdJob`` and ``remedyInfo``
        attached.
    """

    def _flagged(value):
        # Pair a source value with the flag getFlag computes for it.
        return DtString(value, getFlag(value))

    result = JdRaw()
    result.jdId = item.jdId
    result.jdFrom = item.jdFrom
    result.jdUrl = item.jdUrl
    result.pubDate = _flagged(item.pubDate)

    # Company information.
    company = JdIncRaw()
    for name in ("incName", "incType", "incIntro", "incIndustry", "incScale"):
        setattr(company, name, _flagged(getattr(item, name)))
    company.incUrl = DtString()  # not needed for quantification

    # Job information.
    job = JdJobRaw()
    copied_job_fields = (
        "jobPosition",
        "jobSalary",
        "jobWorkLoc",
        "jobDiploma",
        "jobWorkAge",
        "jobCate",
    )
    for name in copied_job_fields:
        setattr(job, name, _flagged(getattr(item, name)))
    # These job fields are not needed for quantification; leave them empty.
    for name in ("jobDescription", "jobType", "jobWelfare"):
        setattr(job, name, DtString())

    result.jdJob = job
    result.jdInc = company

    # Remedy section: every field is an empty placeholder.
    remedy = JdRemedyRaw()
    remedy.jdId = ""
    scalar_fields = (
        "age",
        "diploma",
        "endTime",
        "gender",
        "incName",
        "jobPosition",
        "jobWelfare",
        "major",
        "pubTime",
        "salary",
        "workDemand",
        "workLoc",
        "workDuty",
        "workExp",
    )
    for name in scalar_fields:
        setattr(remedy, name, DtString())
    # List-valued fields each get their own one-element list.
    for name in ("cert", "incTags", "jobTags", "skills"):
        setattr(remedy, name, [DtString()])
    result.remedyInfo = remedy

    return result
def parse(content):
    """Parse an HTML job posting into a ``JdRaw`` tagged ``jd_zhilian``.

    The company and job sections are delegated to ``jdinc.parse`` and
    ``jdjob.parse`` on the parsed document.  ``jdUrl`` is set to the empty
    string.  ``pubDate`` is only set when the publication-date XPath matches
    at least one node; the first match's text content is used.

    Args:
        content: HTML markup accepted by ``html.fromstring``.

    Returns:
        The populated ``JdRaw``.
    """
    tree = html.fromstring(content)

    record = JdRaw(jdFrom="jd_zhilian")
    record.jdUrl = ""
    record.jdInc = jdinc.parse(tree)
    record.jdJob = jdjob.parse(tree)

    date_nodes = tree.xpath("//ul[@class='terminal-ul clearfix']/li[3]/strong")
    if not date_nodes:
        # No publication date on the page: leave pubDate untouched.
        return record

    date_text = date_nodes[0].text_content()
    has_value_flag = DataFlag(hasValue=True).toByte()
    record.pubDate = DtString(date_text, has_value_flag)
    return record
def parse(content):
    """Parse an HTML job posting into a ``JdRaw`` tagged ``jd_lagou``.

    Args:
        content: HTML markup accepted by ``html.fromstring``.

    Returns:
        A ``JdRaw`` whose ``jdInc`` and ``jdJob`` come from ``jdinc.parse``
        and ``jdjob.parse`` applied to the parsed document.
    """
    tree = html.fromstring(content)

    record = JdRaw(jdFrom="jd_lagou")
    record.jdInc = jdinc.parse(tree)
    record.jdJob = jdjob.parse(tree)
    return record