def clean_filetext(filetext):
    """Clean raw text line by line and drop near-empty lines.

    Every line is stripped, passed through
    ``StringUtils.removeShapesSymbols`` and then
    ``StringUtils.removeGeneralPunctuation``.  Lines whose cleaned form is
    shorter than two characters are discarded; the remaining lines are
    joined back together with newlines.
    """
    kept = []
    for raw in filetext.split("\n"):
        cleaned = StringUtils.removeShapesSymbols(raw.strip())
        cleaned = StringUtils.removeGeneralPunctuation(cleaned)
        if len(cleaned) >= 2:
            kept.append(cleaned)
    return "\n".join(kept)
def extract_projectinfo(text):
    """Parse one project-experience block into a project struct.

    A line matching the module-level ``project_reg`` supplies the project
    name (named group ``project``) and the time span.  A project-description
    label switches accumulation to ``describe`` and a
    responsibility-description label switches it to ``responsibilities``;
    the label itself is removed from the line before the line is stored.

    :param text: raw text of a single project block, newline separated.
    :returns: the struct from ``resume_struct.get_project_struct()`` with
        ``ori_text`` and whichever of ``name``, ``start_time``,
        ``end_time``, ``so_far``, ``describe`` and ``responsibilities``
        could be extracted.
    """
    project = resume_struct.get_project_struct()
    project["ori_text"] = text
    in_desc, in_resp = False, False
    # The previous line is never consulted, so iterate the lines directly
    # rather than pairing each one with its predecessor.
    for line in text.split("\n"):
        m_proj = re.search(project_reg, line)
        if m_proj:
            timestamp = match_timestamp.match_timestamp_by_reg(
                project_reg, line)
            project["name"] = m_proj.group("project").strip()
            project["start_time"], project["end_time"], project[
                "so_far"] = StringUtils.transform_timestamp(timestamp)
        if re.search(u"项目描述(:|:)", line):
            line = re.sub(u"项目描述(:|:)", "", line).strip()
            in_desc, in_resp = True, False
        # Tested against the possibly already-trimmed line, so both labels
        # on one line leave the responsibility section active.
        if re.search(u"责任描述(:|:)", line):
            line = re.sub(u"责任描述(:|:)", "", line).strip()
            in_desc, in_resp = False, True
        if in_desc:
            # A newline separator is only inserted between two non-empty
            # pieces of text.
            if project["describe"] and line:
                project["describe"] += '\n' + line
            else:
                project["describe"] += line
        if in_resp:
            if project["responsibilities"] and line:
                project["responsibilities"] += '\n' + line
            else:
                project["responsibilities"] += line
    return project
def extract_traininfo(text):
    """Parse one training-experience block into a training struct.

    A line matching the module-level ``train_reg`` provides the time span
    and the training name (named group ``train``).  When that name consists
    of exactly two tab-separated parts, the first part is stored as
    ``authority`` and the second as ``name``.  Lines that start with one of
    the labelled prefixes fill ``authority``, ``city`` and ``description``.

    :param text: raw text of a single training block, newline separated.
    :returns: the struct from ``resume_struct.get_training_struct()``.
    """
    # (line pattern, struct key), tested in this order on every line.
    labelled_fields = (
        (u"^培训机构:(.+)", "authority"),
        (u"^培训地点:(.+)", "city"),
        (u"^培训描述:(.+)", "description"),
    )
    train = resume_struct.get_training_struct()
    for line in text.split('\n'):
        header = re.search(train_reg, line)
        if header:
            stamp = match_timestamp.match_timestamp_by_reg(train_reg, line)
            start, end, so_far = StringUtils.transform_timestamp(stamp)
            train["start_time"] = start
            train["end_time"] = end
            train["so_far"] = so_far
            title = header.group("train").strip()
            pieces = title.split("\t")
            if len(pieces) == 2:
                train["authority"] = pieces[0]
                train["name"] = pieces[1]
            else:
                train["name"] = title
        for pattern, key in labelled_fields:
            hit = re.search(pattern, line)
            if hit:
                train[key] = hit.group(1).strip()
    return train
def extract_certinfo(text):
    """Parse one certificate block into a certificate struct.

    Every line matching the module-level ``certi_reg`` sets ``name`` (named
    group ``name``) and ``start_time`` (first element of the transformed
    timestamp); a later matching line overwrites an earlier one.

    :param text: raw text of a single certificate block, newline separated.
    :returns: the struct from ``resume_struct.get_certificate_struct()``.
    """
    cert = resume_struct.get_certificate_struct()
    for row in text.split('\n'):
        hit = re.search(certi_reg, row)
        if not hit:
            continue
        stamp = match_timestamp.match_timestamp_by_reg(certi_reg, row)
        cert["name"] = hit.group("name").strip()
        # Only the start of the span is kept for certificates.
        issued, _unused_end, _unused_so_far = StringUtils.transform_timestamp(
            stamp)
        cert["start_time"] = issued
    return cert
def extract_basicinfo(text):
    """Extract basic personal information from one block of resume text.

    If the first line starts with the self-evaluation heading, the rest of
    the block is returned as ``self_remark`` and nothing else is parsed.
    Otherwise every line is scanned, together with the line before it, for
    update date, name, e-mail, phone, gender, age, birth date, home phone,
    years of experience, marital status, household registration and current
    address.

    :param text: raw text of the block, newline separated.
    :returns: a plain ``dict`` containing only the keys that were found,
        plus the raw text under a key of the form ``ori_text_<int>``.
    """
    basic_info = {}
    # The raw text is stored under a randomised key -- presumably so that
    # several blocks merged into one dict do not overwrite each other;
    # TODO confirm against the caller.
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text
    lines = text.split('\n')
    # Self-evaluation block: everything after the heading line is the remark.
    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info
    # Pair every line with its predecessor ("" for the first line).
    # NOTE(review): the source indentation was lost; the statements below are
    # laid out as flat siblings inside the loop -- confirm against history.
    for line_pre, line in izip([""] + lines, lines):
        ## name / update time
        m_update = re.search(u"更新时间:\s*(?P<up>\d{4}-\d{2}-\d{2})", line)
        if m_update:
            basic_info["updated_at"] = m_update.group("up") + " 00:00:00"
        # A line with at least three "|"-separated fields is treated as the
        # summary line; the line above it is taken as the name when it has
        # 2-4 words according to StringUtils.get_words.
        if len(line.split("|")) >= 3:
            if len(StringUtils.get_words(line_pre.strip())) in [2, 3, 4]:
                basic_info["name"] = line_pre.strip()
        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email:
            basic_info["contact_email"] = email
        if phone:
            basic_info["contact_phone"] = phone
        ## gender age birth
        # Gender is only accepted on a line that also mentions age or
        # experience; "U" is the value match_gender returns that is rejected.
        if "U" != match_basic.match_gender(line) and (u"岁" in line or
                                                      u"经验" in line):
            basic_info["gender"] = match_basic.match_gender(line)
        mage = re.search(u"\d+岁", line)
        if mage:
            # Drop the trailing age character before converting.
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date is only stored as the birth date when the same line also
        # carries an age.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")
        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        maccount = re.search(u"(户口/国籍|户\s*口)(:|:)(?P<acc>.+)", line)
        if maccount:
            # Keep both the raw string and the resolved region.
            basic_info["account_str"] = maccount.group("acc").strip()
            basic_info["account"] = match_region.match_region(
                maccount.group("acc").strip())
        maddress = re.search(u"(居住地|现居住)(:|:)\s*(?P<s>.+?)(\t|\||$)", line)
        if maddress:
            basic_info["address_str"] = maddress.group("s").strip()
            basic_info["address"] = match_region.match_region(
                maddress.group("s").strip())
    return basic_info
def extract_eduinfo(expblock):
    """Parse one education block whose data sits on a single header line.

    Every line matching the module-level ``edu_reg`` fills the school name,
    time span, degree and discipline from the named groups ``school``,
    ``degree`` and ``discipline``.

    :param expblock: raw text of a single education block.
    :returns: the struct from ``resume_struct.get_education_struct()``.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock
    for row in expblock.split("\n"):
        hit = re.search(edu_reg, row)
        if not hit:
            continue
        stamp = match_timestamp.match_timestamp_by_reg(edu_reg, row)
        edu["school_name"] = hit.group("school").strip()
        start, end, so_far = StringUtils.transform_timestamp(stamp)
        edu["start_time"] = start
        edu["end_time"] = end
        edu["so_far"] = so_far
        # 99 is passed through as match_degree's second argument.
        edu["degree"] = match_education.match_degree(hit.group('degree'), 99)
        edu["discipline_name"] = hit.group('discipline').strip()
    return edu
def extract_projectinfo(text):
    """Parse one project block where the name follows the time line.

    A line matching the module-level ``project_reg`` provides the time span;
    the very next line is stored verbatim as the project name.  Later lines
    may carry the position and company labels, and three section labels
    (introduction, duties, achievements) that route the following text into
    ``describe``, ``responsibilities`` and ``achivement`` respectively.

    :param text: raw text of a single project block, newline separated.
    :returns: the struct from ``resume_struct.get_project_struct()``.
    """
    # (label pattern, struct key) in the order the labels are tested.
    section_labels = (
        (u"项目简介(:|:)", "describe"),
        (u"项目职责(:|:)", "responsibilities"),
        (u"项目业绩(:|:)", "achivement"),
    )
    project = resume_struct.get_project_struct()
    project["ori_text"] = text
    name_pending = False
    current = None  # key of the section currently being accumulated
    for line in text.split('\n'):
        if re.search(project_reg, line):
            stamp = match_timestamp.match_timestamp_by_reg(project_reg, line)
            start, end, so_far = StringUtils.transform_timestamp(stamp)
            project["start_time"] = start
            project["end_time"] = end
            project["so_far"] = so_far
            name_pending = True
            continue
        if name_pending:
            project["name"] = line
            name_pending = False
            continue
        position = re.search(u"项目职务(:|:)\s*(?P<posi>.+)", line)
        if position:
            project["position_name"] = position.group("posi")
        company = re.search(u"所在公司(:|:)(?P<corp>.+)", line)
        if company:
            project["corporation_name"] = company.group("corp")
        # Each label is looked for in the line as left by the previous one.
        for label, key in section_labels:
            if re.search(label, line):
                line = re.sub(label, "", line).strip()
                current = key
        if current is None:
            continue
        if project[current] and line:
            project[current] += '\n' + line
        else:
            project[current] += line
    return project
def extract_eduinfo(expblock):
    """Parse one education block: header line plus free-text description.

    Lines matching the module-level ``edu_reg`` fill school, discipline,
    degree and time span (named groups ``school``, ``disc``, ``degree``).
    Every other line is stripped and appended to ``discipline_desc``; a
    leading description label is removed from the final text.

    :param expblock: raw text of a single education block.
    :returns: the struct from ``resume_struct.get_education_struct()``.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock
    for row in expblock.split("\n"):
        hit = re.search(edu_reg, row)
        if not hit:
            extra = row.strip()
            if edu["discipline_desc"]:
                edu["discipline_desc"] += "\n" + extra
            else:
                edu["discipline_desc"] += extra
            continue
        edu["school_name"] = hit.group("school").strip()
        edu["discipline_name"] = hit.group("disc").strip()
        degree_text = hit.group("degree").strip()
        edu["degree"] = match_education.match_degree(degree_text, 99)
        stamp = match_timestamp.match_timestamp_by_reg(edu_reg, row)
        start, end, so_far = StringUtils.transform_timestamp(stamp)
        edu["start_time"] = start
        edu["end_time"] = end
        edu["so_far"] = so_far
    without_label = re.sub(u"^专业描述(:|:)", "", edu["discipline_desc"])
    edu["discipline_desc"] = without_label.strip()
    return edu
def extract_workinfo(text):
    """Parse one work-experience block with a positional state machine.

    ``lastline`` records what the previously consumed line supplied:
    "not found company" -> "company name" -> "industry" -> "position".
    Once in the "position" state every line is appended to
    ``responsibilities``.

    :param text: raw text of a single work block, newline separated.
    :returns: the struct from ``resume_struct.get_emplyment_struct()``.
    """
    work = resume_struct.get_emplyment_struct()
    work["ori_text"] = text
    lastline = "not found company"
    for line in text.split('\n'):
        # A job-description label jumps straight to the free-text state,
        # whatever state the parser was in.
        if re.search(u"工作描述(:|:)", line):
            lastline = "position"
        if lastline == "not found company":
            # Header line: time span plus company name (group "company").
            m_company = re.search(work_reg, line)
            if m_company:
                timestamp = match_timestamp.match_timestamp_by_reg(
                    work_reg, line)
                work["corporation_name"] = clean_company_name(
                    m_company.group("company").strip())
                work["start_time"], work["end_time"], work[
                    "so_far"] = StringUtils.transform_timestamp(timestamp)
                lastline = "company name"
            pass
        elif lastline == "company name":
            # Line after the header: first "|"-separated field is the
            # industry.  str.split never returns an empty list, so the
            # length check always passes.
            items = line.split("|")
            if len(items) > 0:
                work["industry_name"] = items[0].strip()
            lastline = "industry"
            continue
        elif lastline == "industry":
            # Next line: "<department> <position>" separated by whitespace.
            items = re.split("\s+", line)
            if len(items) > 1:
                work["architecture_name"] = items[0]
                work["position_name"] = items[1]
            # NOTE(review): source indentation was lost; the state advance is
            # assumed to be unconditional here -- confirm against history.
            lastline = "position"
        elif lastline == "position":
            work["responsibilities"] += '\n' + line if work[
                "responsibilities"] else line
        pass
    # Drop the leading job-description label from the collected text.
    work["responsibilities"] = re.sub(u"^工作描述(:|:)", "",
                                      work["responsibilities"]).strip()
    return work
def extract_workinfo(text):
    """Parse one work-experience block driven by labelled lines.

    The header line (matching the module-level ``work_reg``) gives company
    and time span.  A labelled position/department line and a labelled
    industry line are recognised anywhere in the block.  After the industry
    line, the next line is used as the position when none has been found
    yet, and all further lines are appended to ``responsibilities``.

    :param text: raw text of a single work block, newline separated.
    :returns: the struct from ``resume_struct.get_emplyment_struct()``.
    """
    work = resume_struct.get_emplyment_struct()
    work["ori_text"] = text
    # Becomes True once the industry line has been consumed.
    last_industry = False
    for line in text.split('\n'):
        m_company = re.search(work_reg, line)
        if m_company:
            timestamp = match_timestamp.match_timestamp_by_reg(work_reg, line)
            work["corporation_name"] = m_company.group("company").strip()
            work["corporation_name"] = clean_company_name(
                work["corporation_name"])
            work["start_time"], work["end_time"], work[
                "so_far"] = StringUtils.transform_timestamp(timestamp)
        # "<position label>...<department label>..." on a single line.
        m_position = re.search(u"职位名称(:|:)(?P<pos>.+)部门(:|:)(?P<arc>.+)", line)
        if m_position:
            # The part-time marker is removed from the position name.
            work["position_name"] = m_position.group("pos").replace(
                u"(兼职)", "").strip()
            work["architecture_name"] = m_position.group("arc")
        m_industry = re.search(u"(行业|所属行业)(:|:)\s*(?P<ind>.+?)(\s|$)", line)
        if m_industry:
            work["industry_name"] = m_industry.group("ind").strip()
            last_industry = True
            continue
        # No position yet: lines after the industry line are read as
        # "<department>\t<position>" or as a bare position.
        if not work["position_name"] and last_industry:
            if len(line.split("\t")) == 2:
                work["architecture_name"] = line.split("\t")[0].strip()
                work["position_name"] = line.split("\t")[1].strip()
            if len(line.split("\t")) == 1:
                work["position_name"] = line.split("\t")[0].strip()
            # NOTE(review): source indentation was lost; this ``continue`` is
            # assumed to close the outer ``if`` so the position line is not
            # also added to the responsibilities -- confirm against history.
            continue
        if last_industry:
            work["responsibilities"] += "\n" + line if work[
                "responsibilities"] else line
        pass
    # Drop the leading job-description label from the collected text.
    work["responsibilities"] = re.sub(u"^工作描述(:|:)", "",
                                      work["responsibilities"]).strip()
    return work
def extract_workinfo(text):
    """Parse one work block laid out as time line, company line, position line.

    A line matching the module-level ``work_reg`` supplies the time span; the
    two lines after it are stored verbatim as ``corporation_name`` and
    ``position_name``.  A location label fills ``city``.  From the duties
    label onwards every line is appended to ``responsibilities``.

    :param text: raw text of a single work block, newline separated.
    :returns: the struct from ``resume_struct.get_emplyment_struct()``.
    """
    work = resume_struct.get_emplyment_struct()
    work["ori_text"] = text
    # Struct keys still to be filled from the lines following a time line.
    awaiting = []
    collecting = False
    for line in text.split('\n'):
        if re.search(work_reg, line):
            stamp = match_timestamp.match_timestamp_by_reg(work_reg, line)
            start, end, so_far = StringUtils.transform_timestamp(stamp)
            work["start_time"] = start
            work["end_time"] = end
            work["so_far"] = so_far
            awaiting = ["corporation_name", "position_name"]
            continue
        if awaiting:
            work[awaiting.pop(0)] = line
            continue
        location = re.search(u"所在地区(:|:)(?P<loc>.+)", line)
        if location:
            work["city"] = location.group("loc")
        if re.search(u"职责业绩(:|:)", line):
            # Everything up to and including the label is discarded.
            line = re.sub(u".*职责业绩(:|:)", "", line)
            collecting = True
        if not collecting:
            continue
        if work["responsibilities"] and line:
            work["responsibilities"] += "\n" + line
        else:
            work["responsibilities"] += line
    trimmed = re.sub(u"^工作描述(:|:)", "", work["responsibilities"])
    work["responsibilities"] = trimmed.strip()
    return work
def extract_eduinfo(expblock):
    """Parse one education block with a positional state machine.

    States of ``lastline``: "not found school" -> "school" -> "degree".
    The header line (matching the module-level ``edu_reg``) gives school and
    time span.  The next line is either "<degree>|<discipline>" or a single
    field, which is stored as the degree if ``match_degree`` recognises it
    and as the discipline otherwise.  All remaining lines are collected into
    ``discipline_desc``.

    :param expblock: raw text of a single education block.
    :returns: the struct from ``resume_struct.get_education_struct()``.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock
    lastline = "not found school"
    for line in expblock.split("\n"):
        if lastline == "not found school":
            m = re.search(edu_reg, line)
            if m:
                timestamp = match_timestamp.match_timestamp_by_reg(
                    edu_reg, line)
                # The overseas-experience marker is not part of the name.
                edu["school_name"] = re.sub(u"海外经历", "",
                                            m.group("school")).strip()
                edu["start_time"], edu["end_time"], edu[
                    "so_far"] = StringUtils.transform_timestamp(timestamp)
                lastline = "school"
        elif lastline == "school":
            items = line.split("|")
            if len(items) >= 2:
                edu["degree"] = match_education.match_degree(items[0], 99)
                edu["degree_ori"] = items[0].strip()
                edu["discipline_name"] = items[1].strip()
            else:
                # Fix: test the field itself, not the enclosing list, so the
                # check agrees with the value that is then stored.
                degree = match_education.match_degree(items[0])
                if degree:
                    edu["degree"] = degree
                    edu["degree_ori"] = items[0].strip()
                else:
                    edu["discipline_name"] = items[0].strip()
            lastline = "degree"
        elif lastline == "degree":
            extra = line.strip()
            if edu["discipline_desc"]:
                edu["discipline_desc"] += '\n' + extra
            else:
                edu["discipline_desc"] += extra
    # Drop the leading description label from the collected text.
    edu["discipline_desc"] = re.sub(u"^专业描述(:|:)", "",
                                    edu["discipline_desc"]).strip()
    return edu
def extract_eduinfo(expblock):
    """Parse one education block where the school follows the time line.

    A line matching the module-level ``edu_reg`` supplies the time span and
    is otherwise skipped; the line right after it is stored (stripped) as
    ``school_name``.  That line and every other non-time line is also
    searched for the discipline and degree labels.

    :param expblock: raw text of a single education block.
    :returns: the struct from ``resume_struct.get_education_struct()``.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock
    school_pending = False
    for row in expblock.split("\n"):
        if re.search(edu_reg, row):
            stamp = match_timestamp.match_timestamp_by_reg(edu_reg, row)
            start, end, so_far = StringUtils.transform_timestamp(stamp)
            edu["start_time"] = start
            edu["end_time"] = end
            edu["so_far"] = so_far
            school_pending = True
            continue
        if school_pending:
            school_pending = False
            edu["school_name"] = row.strip()
        discipline = re.search(u"专业(:|:)(?P<dis>.+)", row)
        if discipline:
            edu["discipline_name"] = discipline.group("dis").strip()
        degree = re.search(u"学历(:|:)(?P<deg>.+)", row)
        if degree:
            degree_text = degree.group("deg").strip()
            edu["degree"] = match_education.match_degree(degree_text)
    return edu
def extract_projectinfo(text):
    """Parse one project block: a header line followed by labelled sections.

    Lines are skipped until one matches the module-level ``project_reg``;
    that line gives the name (named group ``project``, with a trailing
    "linked" marker removed) and the time span.  After the header, the
    description and responsibility labels select which of ``describe`` or
    ``responsibilities`` the following lines are appended to.

    :param text: raw text of a single project block, newline separated.
    :returns: the struct from ``resume_struct.get_project_struct()``.
    """
    # (label pattern, struct key) in the order the labels are tested.
    section_labels = (
        (u"项目描述(:|:)", "describe"),
        (u"责任描述(:|:)", "responsibilities"),
    )
    project = resume_struct.get_project_struct()
    project["ori_text"] = text
    header_seen = False
    current = None  # key of the section currently being accumulated
    for line in text.split('\n'):
        if not header_seen:
            hit = re.search(project_reg, line)
            if hit:
                stamp = match_timestamp.match_timestamp_by_reg(
                    project_reg, line)
                raw_name = hit.group("project").strip()
                project["name"] = re.sub(u"已关联$", "", raw_name).strip()
                start, end, so_far = StringUtils.transform_timestamp(stamp)
                project["start_time"] = start
                project["end_time"] = end
                project["so_far"] = so_far
                header_seen = True
            continue
        for label, key in section_labels:
            if re.search(label, line):
                line = re.sub(label, "", line).strip()
                current = key
        if current is None:
            continue
        if project[current]:
            project[current] += '\n' + line
        else:
            project[current] += line
    return project
def extract_basicinfo(text):
    """Extract basic personal information from one block of resume text.

    Every line is scanned, together with the line before it, for name,
    update date, e-mail, phone, gender, age, birth date, home phone, years
    of experience, marital status, household registration and address.

    :param text: raw text of the block, newline separated.
    :returns: a plain ``dict`` containing only the keys that were found,
        plus the raw text under a key of the form ``ori_text_<int>``.
    """
    basic_info = {}
    # The raw text is stored under a randomised key -- presumably so that
    # several blocks merged into one dict do not overwrite each other;
    # TODO confirm against the caller.
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text
    lines = text.split('\n')
    # Pair every line with its predecessor ("" for the first line).
    # NOTE(review): the source indentation was lost; the statements below are
    # laid out as flat siblings inside the loop -- confirm against history.
    for line_pre, line in izip([""] + lines, lines):
        ## name / update time
        # The name is whatever precedes the first marker found on the line:
        # process-status, tag, or "ID:<digits>" (checked in that priority).
        if re.search(u"(.+)\s*流程状态.+标签.+", line) or re.search(
                u"(.+)\s*标签.+", line) or re.search(u"(.+)\s*ID:\d+", line):
            if u"流程状态" in line:
                basic_info["name"] = re.search(u"(.+)\s*流程状态",
                                               line).group(1).strip()
            elif u"标签" in line:
                basic_info["name"] = re.search(u"(.+)\s*标签",
                                               line).group(1).strip()
            else:
                basic_info["name"] = re.search(u"(.+)\s*ID",
                                               line).group(1).strip()
        # The update date is read from the previous line, not the current one.
        if line_pre.strip().startswith(u"更新时间:"):
            m = re.search("\d{4}-\d{2}-\d{2}", line_pre)
            if m:
                basic_info["updated_at"] = m.group() + " 00:00:00"
        # Fallbacks: the line above an ID line or above a line starting with
        # 11 digits is taken as the name when it has 2-4 words.
        if "name" not in basic_info and re.search(u"^ID(:|:)\d{4,}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre
        if "name" not in basic_info and re.search(u"^\d{11}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre
        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email:
            basic_info["contact_email"] = email
        if phone:
            basic_info["contact_phone"] = phone
        ## gender age birth
        # Gender is only accepted on a line that also mentions age or
        # experience; "U" is the value match_gender returns that is rejected.
        if "U" != match_basic.match_gender(line) and (u"岁" in line or
                                                      u"经验" in line):
            basic_info["gender"] = match_basic.match_gender(line)
        mage = re.search(u"\d+岁", line)
        if mage:
            # Drop the trailing age character before converting.
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date is only stored as the birth date when the same line also
        # carries an age.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")
        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        maccount = re.search(u"户口/国籍:(.+)", line)
        if maccount:
            # Keep both the raw string and the resolved region.
            basic_info["account_str"] = maccount.group(1).strip()
            basic_info["account"] = match_region.match_region(
                maccount.group(1).strip())
        # NOTE(review): the "?" makes the last character of the label
        # optional, so the label's tail can end up inside the captured
        # address -- confirm this is intended.
        maddress = re.search(u"现居住?(.+?)(\||$)", line)
        if maddress:
            basic_info["address_str"] = maddress.group(1).strip()
            basic_info["address"] = match_region.match_region(
                maddress.group(1).strip())
    return basic_info
def extract_basicinfo(text):
    """Extract basic personal information from one block of resume text.

    If the first line starts with the self-evaluation heading, the rest of
    the block is returned as ``self_remark`` and nothing else is parsed.
    Otherwise every line is scanned, together with the line before it, for
    update date, name, e-mail, phone, gender, age, birth month, home phone,
    years of experience, marital status, household registration, address,
    ID-card number and an overseas-work flag.

    :param text: raw text of the block, newline separated.
    :returns: a plain ``dict`` containing only the keys that were found,
        plus the raw text under a key of the form ``ori_text_<int>``.
    """
    basic_info = {}
    # The raw text is stored under a randomised key -- presumably so that
    # several blocks merged into one dict do not overwrite each other;
    # TODO confirm against the caller.
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text
    lines = text.split('\n')
    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info
    # Pair every line with its predecessor ("" for the first line).
    for line_pre, line in izip([""] + lines, lines):
        ## update
        # Searched in the two lines joined, so the label and the date may
        # sit on consecutive lines.
        mupdate = re.search(u"简历更新时间(:|:)\s*(?P<t>\d{4}\.\d{2}\.\d{2})",
                            line_pre + line)
        if mupdate:
            basic_info["updated_at"] = mupdate.group("t").replace(
                ".", "-") + " 00:00:00"
        ## name
        # The name is the line after an "ID:" line or the line before a
        # mobile-phone line, accepted only when it has 2-4 words; an
        # unacceptable candidate stores "" so a later line can still fill it.
        mname_id = re.search(u"^ID:", line_pre)
        mname_ph = re.search(u"^手机:", line)
        if mname_id and not basic_info.get("name"):
            candidate = line.strip()
            basic_info["name"] = candidate if len(
                StringUtils.get_words(candidate)) in [2, 3, 4] else ""
        if mname_ph and not basic_info.get("name"):
            candidate = line_pre.strip()
            basic_info["name"] = candidate if len(
                StringUtils.get_words(candidate)) in [2, 3, 4] else ""
        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        # NOTE(review): the substitution pattern below looks as if it once
        # contained full-width parentheses that were lost; it is kept
        # byte-for-byte -- confirm against the original file.
        m_ph = re.search(u"^手机:(\d+)",
                         re.sub(u"(\(|().+(\)|))", "", line).replace(" ", ""))
        if email:
            basic_info["contact_email"] = email
        # An explicitly labelled mobile number always wins; a generic phone
        # match is only used when no phone has been stored yet.
        if m_ph:
            basic_info["contact_phone"] = m_ph.group(1)
        elif phone and "contact_phone" not in basic_info:
            basic_info["contact_phone"] = phone
        ## gender age birth
        # Gender is only accepted on a line that also mentions age or
        # experience; "U" is the value match_gender returns that is rejected.
        if "U" != match_basic.match_gender(line) and (u"岁" in line or
                                                      u"经验" in line):
            basic_info["gender"] = match_basic.match_gender(line)
        mage = re.search(u"\d+岁", line)
        if mage:
            # Drop the trailing age character before converting.
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"(?P<y>\d{4})年\s*(?P<m>\d{1,2})月", line)
        # A year/month is only stored as the birth date when the same line
        # also carries an age; the month is zero-padded to two digits.
        if mbirth and mage:
            month = mbirth.group('m')
            if len(month) == 1:
                month = '0' + month
            basic_info["birth"] = mbirth.group('y') + u'年' + month + u'月'
        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"(已|未)婚|Single|Married", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(0).strip())
        maccount = re.search(u"户口:(.+?)(\||$)", line)
        if maccount:
            # Fix: store only the captured value.  The whole match also
            # contains the label prefix and the trailing "|" delimiter.
            basic_info["account_str"] = maccount.group(1).strip()
            basic_info["account"] = match_region.match_region(
                maccount.group(1).strip())
        maddress = re.search(u"现居住地:(.+?)(\||$)", line)
        if maddress:
            # Same fix as for the household registration above.
            basic_info["address_str"] = maddress.group(1).strip()
            basic_info["address"] = match_region.match_region(
                maddress.group(1).strip())
            # Hard-coded value; the original note read roughly "reminder:
            # fetch this".
            # NOTE(review): source indentation was lost; this assignment is
            # assumed to belong to the address branch -- confirm.
            basic_info["address_province"] = "10"
        id_card = re.search(u"[0-9]{17}[xX0-9]", line)
        if id_card:
            basic_info["card"] = id_card.group(0)
        overseas = re.search(u"有海外工作|Overseas Work", line)
        if overseas:
            basic_info["overseas"] = "Y"
    return basic_info
def _clean_cand_headline(text):
    """Normalise a candidate headline before comparison.

    The text is stripped and upper-cased, then handed to
    ``StringUtils.clean_all_un_chinese`` together with the list
    ``["/", "IT"]``; that helper's result is returned unchanged.
    """
    upper_cased = text.strip().upper()
    return StringUtils.clean_all_un_chinese(upper_cased, ["/", "IT"])
def _clean_text(text):
    """Return *text* cleaned by three StringUtils passes, then lower-cased.

    The passes run in this order: ``removeShapesSymbols``,
    ``removeGeneralPunctuation``, ``removeChSpace``.
    """
    without_shapes = StringUtils.removeShapesSymbols(text)
    without_punct = StringUtils.removeGeneralPunctuation(without_shapes)
    return StringUtils.removeChSpace(without_punct).lower()