def _get_job_desc(doc):
    """Extract the job-description markup from a parsed document.

    Each XPath expression in ``job_desc_patterns`` is tried in order. The
    nodes it matches are serialized with ``etree.tounicode``, concatenated
    and passed through ``content_format``. A candidate is accepted only if
    at least 10 characters remain once the section-heading tokens and
    spaces have been removed from it. When no pattern qualifies, the nodes
    matched by ``job_desc_pattern_other`` are serialized the same way and
    returned without any length check.

    Args:
        doc: Object exposing ``xpath(expr)``; the nodes it yields are
            handed to ``etree.tounicode``.

    Returns:
        A ``DtString`` built from the formatted markup and a
        ``DataFlag(hasValue=True)`` byte.
    """
    # Heading tokens (and spaces) that must not count towards the minimum
    # content length. A plain ``u''`` literal holds the same value as the
    # former ``ur''`` one (the pattern has no backslashes) and is also
    # valid syntax on Python 3.
    heading_tokens = u'岗位要求|岗位职责|工作描述|职位描述| '

    for pattern in job_desc_patterns:
        markup = "".join(
            etree.tounicode(node, pretty_print=True)
            for node in doc.xpath(pattern)
        )
        markup = content_format(markup)
        # Judge the candidate on its real content, not on boilerplate headings.
        if len(re.sub(heading_tokens, '', markup)) < 10:
            continue
        return DtString(markup, DataFlag(hasValue=True).toByte())

    # Fallback pattern: accepted regardless of length.
    # NOTE(review): an unreachable ``return DtString()`` used to follow this
    # return, which suggests an empty-result path may have been intended
    # when the fallback matches nothing -- confirm with the callers.
    fallback = "".join(
        etree.tounicode(node, pretty_print=True)
        for node in doc.xpath(job_desc_pattern_other)
    )
    fallback = content_format(fallback)
    return DtString(fallback, DataFlag(hasValue=True).toByte())
def fetch_content_by_patterns(doc, patterns):
    """Return the formatted text of the first pattern with enough content.

    The XPath expressions in ``patterns`` are evaluated against ``doc`` in
    order. For each one, the ``text_content()`` of every matched node is
    collected (each followed by a newline) and the combined text is passed
    through ``content_format``. The first result that is at least 10
    characters long is returned.

    Args:
        doc: Object exposing ``xpath(expr)``; matched nodes must provide
            ``text_content()``.
        patterns: Iterable of XPath expressions, tried in order.

    Returns:
        The formatted text of the first qualifying pattern, or ``""`` when
        none qualifies.
    """
    for xpath_expr in patterns:
        pieces = [node.text_content() + '\n' for node in doc.xpath(xpath_expr)]
        text = content_format("".join(pieces))
        if len(text) >= 10:
            return text
    return ""
def _get_inc_intro(doc):
    """Extract the company-introduction text from a parsed document.

    Every XPath expression in ``company_patterns`` is tried in order. The
    ``text_content()`` of each matched node is gathered, one per line, and
    the combined text is run through ``content_format``. The first result
    longer than 10 characters wins.

    Args:
        doc: Object exposing ``xpath(expr)``; matched nodes must provide
            ``text_content()``.

    Returns:
        A ``DtString`` built from the formatted text and a
        ``DataFlag(hasValue=True)`` byte, or an empty ``DtString()`` when no
        pattern produces more than 10 characters.
    """
    for xpath_expr in company_patterns:
        lines = [node.text_content() + '\n' for node in doc.xpath(xpath_expr)]
        intro = content_format("".join(lines))
        # Too short to be a real introduction; move on to the next pattern.
        if len(intro) <= 10:
            continue
        return DtString(intro, DataFlag(hasValue=True).toByte())
    return DtString()