def extract_basicinfo(text):
    """Extract basic candidate fields from a block of resume text.

    The raw text is always stored under a key of the form ``ori_text_<n>``,
    where ``n`` is a random integer in [0, 1000].  When the first line starts
    with the self-evaluation heading, the remaining lines are returned as
    ``self_remark`` and nothing else is parsed.  Otherwise every line is
    scanned together with the line that precedes it.

    Args:
        text: Newline-separated text to parse.

    Returns:
        dict containing whichever of these keys were recognised:
        ``self_remark``, ``updated_at``, ``name``, ``contact_email``,
        ``contact_phone``, ``gender``, ``age``, ``birth``, ``contact_tel``,
        ``work_experience``, ``marital``, ``account_str``, ``account``,
        ``address_str``, ``address`` (plus the ``ori_text_<n>`` entry).
        Later lines overwrite values found on earlier lines.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text
    lines = text.split('\n')

    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info

    # Pair every line with its predecessor ("" for the first line).  The
    # builtin zip is used because itertools.izip does not exist on Python 3.
    for line_pre, line in zip([""] + lines, lines):
        ## name update
        m_update = re.search(u"更新时间:\s*(?P<up>\d{4}-\d{2}-\d{2})", line)
        if m_update:
            basic_info["updated_at"] = m_update.group("up") + " 00:00:00"
        # When the line has three or more "|"-separated fields, the preceding
        # line is taken as the name if get_words() yields 2-4 items for it.
        if len(line.split("|")) >= 3:
            candidate = line_pre.strip()
            if len(StringUtils.get_words(candidate)) in [2, 3, 4]:
                basic_info["name"] = candidate

        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email:
            basic_info["contact_email"] = email
        if phone:
            basic_info["contact_phone"] = phone

        ## gender age birth
        # "U" is the matcher's "no gender found" value; a gender is only
        # accepted on lines that also mention age or experience.
        gender = match_basic.match_gender(line)
        if "U" != gender and (u"岁" in line or u"经验" in line):
            basic_info["gender"] = gender
        mage = re.search(u"\d+岁", line)
        if mage:
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date only counts as a birth date when the same line has an age.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")

        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        # The label may be followed by an ASCII or a full-width colon.
        maccount = re.search(
            u"(户口/国籍|户\s*口)(:|\uff1a)(?P<acc>.+)", line)
        if maccount:
            account = maccount.group("acc").strip()
            basic_info["account_str"] = account
            basic_info["account"] = match_region.match_region(account)
        maddress = re.search(
            u"(居住地|现居住)(:|\uff1a)\s*(?P<s>.+?)(\t|\||$)", line)
        if maddress:
            address = maddress.group("s").strip()
            basic_info["address_str"] = address
            basic_info["address"] = match_region.match_region(address)

    return basic_info
def extract_basicinfo(text):
    """Extract basic candidate fields from a block of resume text.

    The raw text is always stored under a key of the form ``ori_text_<n>``,
    where ``n`` is a random integer in [0, 1000].  Every line is then scanned
    together with the line that precedes it.

    Args:
        text: Newline-separated text to parse.

    Returns:
        dict containing whichever of these keys were recognised:
        ``name``, ``updated_at``, ``contact_email``, ``contact_phone``,
        ``gender``, ``age``, ``birth``, ``contact_tel``, ``work_experience``,
        ``marital``, ``account_str``, ``account``, ``address_str``,
        ``address`` (plus the ``ori_text_<n>`` entry).  Later lines overwrite
        values found on earlier lines, except for the two fallback name rules
        which only apply while no name has been stored.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text
    lines = text.split('\n')

    # Pair every line with its predecessor ("" for the first line).  The
    # builtin zip is used because itertools.izip does not exist on Python 3.
    for line_pre, line in zip([""] + lines, lines):
        ## name update
        if re.search(u"(.+)\s*流程状态.+标签.+", line) or re.search(
                u"(.+)\s*标签.+", line) or re.search(u"(.+)\s*ID:\d+", line):
            # The name is the text in front of the first marker present in
            # the line.  The guard above can succeed through a different
            # pattern than the marker selected here (e.g. a line that starts
            # with the marker itself), so a failed search falls through to
            # the next marker instead of calling .group() on None.
            for marker in (u"流程状态", u"标签", u"ID"):
                if marker != u"ID" and marker not in line:
                    continue
                m_name = re.search(u"(.+)\s*" + marker, line)
                if m_name:
                    basic_info["name"] = m_name.group(1).strip()
                    break
        if line_pre.strip().startswith(u"更新时间:"):
            m = re.search("\d{4}-\d{2}-\d{2}", line_pre)
            if m:
                basic_info["updated_at"] = m.group() + " 00:00:00"
        # Fallbacks: the line above an "ID:<digits>" line or above a line that
        # starts with 11 digits is taken as the name when get_words() yields
        # 2-4 items for it.  The ID label may use an ASCII or full-width colon.
        if "name" not in basic_info and re.search(
                u"^ID(:|\uff1a)\d{4,}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre
        if "name" not in basic_info and re.search(u"^\d{11}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre

        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email:
            basic_info["contact_email"] = email
        if phone:
            basic_info["contact_phone"] = phone

        ## gender age birth
        # "U" is the matcher's "no gender found" value; a gender is only
        # accepted on lines that also mention age or experience.
        gender = match_basic.match_gender(line)
        if "U" != gender and (u"岁" in line or u"经验" in line):
            basic_info["gender"] = gender
        mage = re.search(u"\d+岁", line)
        if mage:
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date only counts as a birth date when the same line has an age.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")

        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        maccount = re.search(u"户口/国籍:(.+)", line)
        if maccount:
            account = maccount.group(1).strip()
            basic_info["account_str"] = account
            basic_info["account"] = match_region.match_region(account)
        maddress = re.search(u"现居住?(.+?)(\||$)", line)
        if maddress:
            address = maddress.group(1).strip()
            basic_info["address_str"] = address
            basic_info["address"] = match_region.match_region(address)

    return basic_info
def extract_basicinfo(text):
    """Extract basic candidate fields from a block of resume text.

    The raw text is always stored under a key of the form ``ori_text_<n>``,
    where ``n`` is a random integer in [0, 1000].  When the first line starts
    with the self-evaluation heading, the remaining lines are returned as
    ``self_remark`` and nothing else is parsed.  Otherwise every line is
    scanned together with the line that precedes it.

    Args:
        text: Newline-separated text to parse.

    Returns:
        dict containing whichever of these keys were recognised:
        ``self_remark``, ``updated_at``, ``name``, ``contact_email``,
        ``contact_phone``, ``gender``, ``age``, ``birth``, ``contact_tel``,
        ``work_experience``, ``marital``, ``account_str``, ``account``,
        ``address_str``, ``address``, ``address_province``, ``card``,
        ``overseas`` (plus the ``ori_text_<n>`` entry).
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text
    lines = text.split('\n')

    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info

    # Pair every line with its predecessor ("" for the first line).  The
    # builtin zip is used because itertools.izip does not exist on Python 3.
    for line_pre, line in zip([""] + lines, lines):
        ## update
        # Searched in the previous and current line joined together; the
        # label may be followed by an ASCII or a full-width colon.
        mupdate = re.search(
            u"简历更新时间(:|\uff1a)\s*(?P<t>\d{4}\.\d{2}\.\d{2})",
            line_pre + line)
        if mupdate:
            basic_info["updated_at"] = mupdate.group("t").replace(
                ".", "-") + " 00:00:00"

        ## name
        # Two positional rules, applied only while no non-empty name is
        # stored: the line after an "ID:" line, or the line before a "手机:"
        # line.  A candidate that does not yield 2-4 words stores "" so a
        # later rule can still fill the name in.
        if re.search(u"^ID:", line_pre) and not basic_info.get("name"):
            candidate = line.strip()
            basic_info["name"] = candidate if len(
                StringUtils.get_words(candidate)) in [2, 3, 4] else ""
        if re.search(u"^手机:", line) and not basic_info.get("name"):
            candidate = line_pre.strip()
            basic_info["name"] = candidate if len(
                StringUtils.get_words(candidate)) in [2, 3, 4] else ""

        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        # Drop a parenthesised remark (ASCII or full-width parentheses) and
        # all spaces before reading the digits after the phone label.  The
        # previous pattern "(\(|().+(\)|))" parsed as "a literal '(' or .+",
        # which erased the entire line so this match could never succeed.
        m_ph = re.search(
            u"^手机:(\d+)",
            re.sub(u"[(\uff08].+[)\uff09]", "", line).replace(" ", ""))
        if email:
            basic_info["contact_email"] = email
        if m_ph:
            basic_info["contact_phone"] = m_ph.group(1)
        elif phone and "contact_phone" not in basic_info:
            # The generic matcher never overrides a number already stored.
            basic_info["contact_phone"] = phone

        ## gender age birth
        # "U" is the matcher's "no gender found" value; a gender is only
        # accepted on lines that also mention age or experience.
        gender = match_basic.match_gender(line)
        if "U" != gender and (u"岁" in line or u"经验" in line):
            basic_info["gender"] = gender
        mage = re.search(u"\d+岁", line)
        if mage:
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"(?P<y>\d{4})年\s*(?P<m>\d{1,2})月", line)
        # A year/month only counts as a birth date when the same line has an
        # age; the month is left-padded to two digits.
        if mbirth and mage:
            month = mbirth.group('m')
            if len(month) == 1:
                month = '0' + month
            basic_info["birth"] = mbirth.group('y') + u'年' + month + u'月'

        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"(已|未)婚|Single|Married", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(0).strip())
        maccount = re.search(u"户口:(.+?)(\||$)", line)
        if maccount:
            # account_str keeps the whole match (label and any trailing "|"
            # included); only the captured value is resolved to a region.
            basic_info["account_str"] = maccount.group().strip()
            basic_info["account"] = match_region.match_region(
                maccount.group(1).strip())
        maddress = re.search(u"现居住地:(.+?)(\||$)", line)
        if maddress:
            # address_str likewise keeps the whole match.
            basic_info["address_str"] = maddress.group().strip()
            basic_info["address"] = match_region.match_region(
                maddress.group(1).strip())
            # Fixed placeholder value; original note: "reminder to obtain".
            # NOTE(review): nesting under the address match was inferred from
            # a source whose indentation was lost -- confirm.
            basic_info["address_province"] = "10"
        id_card = re.search(u"[0-9]{17}[xX0-9]", line)
        if id_card:
            basic_info["card"] = id_card.group(0)
        overseas = re.search(u"有海外工作|Overseas Work", line)
        if overseas:
            basic_info["overseas"] = "Y"

    return basic_info