Exemple #1
0
def clean_filetext(filetext):
    """Clean raw file text one line at a time.

    Every line is stripped of surrounding whitespace and then passed through
    ``StringUtils.removeShapesSymbols`` followed by
    ``StringUtils.removeGeneralPunctuation``.  Lines that end up shorter than
    two characters are discarded.

    :param filetext: the full text, with lines separated by ``"\\n"``.
    :return: the surviving cleaned lines joined with ``"\\n"``.
    """
    kept = []
    for raw in filetext.split("\n"):
        cleaned = StringUtils.removeGeneralPunctuation(
            StringUtils.removeShapesSymbols(raw.strip()))
        if len(cleaned) >= 2:
            kept.append(cleaned)
    return "\n".join(kept)
Exemple #2
0
def extract_projectinfo(text):
    """Parse one project-experience block into a project struct.

    The struct is obtained from ``resume_struct.get_project_struct()`` and the
    raw block is stored under ``"ori_text"``.  A line matching the
    module-level ``project_reg`` supplies the project name (named group
    ``project``) and the time range, which ``StringUtils.transform_timestamp``
    turns into ``start_time`` / ``end_time`` / ``so_far``.  A line carrying
    the description label switches to the description section, a line
    carrying the responsibility label switches to the responsibilities
    section; the label is removed and that line plus every following line is
    appended to the active section.

    :param text: newline-separated text of a single project entry.
    :return: the populated project struct.
    """
    project = resume_struct.get_project_struct()
    project["ori_text"] = text

    isDesc, isResp = False, False
    # The previous line was never consulted here, so iterate the lines
    # directly instead of pairing them with a shifted copy of the list.
    for line in text.split("\n"):
        m_proj = re.search(project_reg, line)
        if m_proj:
            timestamp = match_timestamp.match_timestamp_by_reg(
                project_reg, line)
            project["name"] = m_proj.group("project").strip()
            project["start_time"], project["end_time"], project[
                "so_far"] = StringUtils.transform_timestamp(timestamp)

        # Section labels: strip the label and remember which section is open.
        m_desc = re.search(u"项目描述(:|:)", line)
        if m_desc:
            line = re.sub(u"项目描述(:|:)", "", line).strip()
            isDesc, isResp = True, False
        m_resp = re.search(u"责任描述(:|:)", line)
        if m_resp:
            line = re.sub(u"责任描述(:|:)", "", line).strip()
            isDesc, isResp = False, True

        # A newline separator is only inserted when both the accumulated text
        # and the current line are non-empty.
        if isDesc:
            project["describe"] += '\n' + line if project[
                "describe"] and line else line
        if isResp:
            project["responsibilities"] += '\n' + line if project[
                "responsibilities"] and line else line
    return project
Exemple #3
0
def extract_traininfo(text):
    """Parse one training-experience block into a training struct.

    A line matching the module-level ``train_reg`` yields the time range and
    the training name (named group ``train``).  When that name consists of
    exactly two tab-separated parts, the first part becomes the authority and
    the second the name.  Lines starting with the authority, location or
    description label fill ``authority``, ``city`` and ``description``.

    :param text: newline-separated text of a single training entry.
    :return: the struct from ``resume_struct.get_training_struct()``, filled
        in with whatever was found.
    """
    train = resume_struct.get_training_struct()

    # (pattern, destination key) for the simple "label: value" lines,
    # checked in this order on every line.
    labelled_fields = (
        (u"^培训机构:(.+)", "authority"),
        (u"^培训地点:(.+)", "city"),
        (u"^培训描述:(.+)", "description"),
    )

    for row in text.split('\n'):
        header = re.search(train_reg, row)
        if header:
            stamp = match_timestamp.match_timestamp_by_reg(train_reg, row)
            begin, end, ongoing = StringUtils.transform_timestamp(stamp)
            train["start_time"] = begin
            train["end_time"] = end
            train["so_far"] = ongoing
            train["name"] = header.group("train").strip()
            pieces = train["name"].split("\t")
            if len(pieces) == 2:
                # "authority<TAB>name" packed into the name group.
                train["authority"] = pieces[0]
                train["name"] = pieces[1]

        for pattern, key in labelled_fields:
            found = re.search(pattern, row)
            if found:
                train[key] = found.group(1).strip()

    return train
Exemple #4
0
def extract_certinfo(text):
    """Parse one certificate block into a certificate struct.

    Every line matching the module-level ``certi_reg`` sets the certificate
    name (named group ``name``) and ``start_time``; only the first element of
    the ``StringUtils.transform_timestamp`` result is kept.

    :param text: newline-separated text of a single certificate entry.
    :return: the struct from ``resume_struct.get_certificate_struct()``.
    """
    cert = resume_struct.get_certificate_struct()

    for row in text.split('\n'):
        found = re.search(certi_reg, row)
        if not found:
            continue
        stamp = match_timestamp.match_timestamp_by_reg(certi_reg, row)
        cert["name"] = found.group("name").strip()
        issued_at, _end, _ongoing = StringUtils.transform_timestamp(stamp)
        cert["start_time"] = issued_at

    return cert
Exemple #5
0
def extract_basicinfo(text):
    """Extract basic personal information from a resume header block.

    The raw text is always stored under a key of the form
    ``"ori_text_<random int 0..1000>"``.  When the first line starts with the
    self-evaluation heading, the remaining lines are returned as
    ``"self_remark"`` and nothing else is parsed.  Otherwise each line is
    scanned (together with the line before it) for update time, name, e-mail,
    phone, gender, age, birth date, home phone, years of experience, marital
    status, registered residence and current address.

    :param text: newline-separated text of the basic-info block.
    :return: a dict holding only the fields that were found.
    """
    info = {}
    info["ori_text_" + str(random.randint(0, 1000))] = text

    rows = text.split('\n')
    if rows[0].startswith(u"自我评价"):
        info["self_remark"] = "\n".join(rows[1:])
        return info

    # Pair every line with its predecessor ("" for the first line).
    for prev_row, row in izip([""] + rows, rows):
        # --- update time and name
        updated = re.search(u"更新时间:\s*(?P<up>\d{4}-\d{2}-\d{2})", row)
        if updated:
            info["updated_at"] = updated.group("up") + " 00:00:00"
        # A line with at least three "|"-separated cells is treated as the
        # summary line; the line above it is the name if it has 2-4 words.
        if len(row.split("|")) >= 3:
            candidate = prev_row.strip()
            if len(StringUtils.get_words(candidate)) in [2, 3, 4]:
                info["name"] = candidate

        # --- e-mail and phone
        email = match_basic.match_email(row)
        phone = match_basic.match_phone(row)
        if email:
            info["contact_email"] = email
        if phone:
            info["contact_phone"] = phone

        # --- gender, age, birth date
        gender = match_basic.match_gender(row)
        if gender != "U" and (u"岁" in row or u"经验" in row):
            info["gender"] = gender
        age = re.search(u"\d+岁", row)
        if age:
            # Drop the trailing age character before converting.
            info["age"] = int(age.group()[:-1])
        birth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", row)
        if birth and age:
            info["birth"] = birth.group().replace(" ", "")

        # --- remaining labelled fields
        home_tel = re.search(u"家庭电话:(.+)", row)
        if home_tel:
            info["contact_tel"] = home_tel.group(1).strip()
        years = re.search(u"(\d+)年工作经验", row)
        if years:
            info["work_experience"] = int(years.group(1))
        marital = re.search(u"婚姻状况:(.+)", row)
        if marital:
            info["marital"] = match_basic.match_marital(
                marital.group(1).strip())
        residence_reg = re.search(u"(户口/国籍|户\s*口)(:|:)(?P<acc>.+)", row)
        if residence_reg:
            account_text = residence_reg.group("acc").strip()
            info["account_str"] = account_text
            info["account"] = match_region.match_region(account_text)
        living = re.search(u"(居住地|现居住)(:|:)\s*(?P<s>.+?)(\t|\||$)", row)
        if living:
            address_text = living.group("s").strip()
            info["address_str"] = address_text
            info["address"] = match_region.match_region(address_text)

    return info
Exemple #6
0
def extract_eduinfo(expblock):
    """Parse one education block whose details all sit on a single line.

    Each line matching the module-level ``edu_reg`` sets the school name
    (group ``school``), the time range, the degree (group ``degree`` mapped
    through ``match_education.match_degree`` with ``99`` as second argument)
    and the discipline name (group ``discipline``).

    :param expblock: newline-separated text of a single education entry.
    :return: the struct from ``resume_struct.get_education_struct()`` with
        ``"ori_text"`` set to *expblock*.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock

    for row in expblock.split("\n"):
        hit = re.search(edu_reg, row)
        if not hit:
            continue
        stamp = match_timestamp.match_timestamp_by_reg(edu_reg, row)
        edu["school_name"] = hit.group("school").strip()
        begin, end, ongoing = StringUtils.transform_timestamp(stamp)
        edu["start_time"] = begin
        edu["end_time"] = end
        edu["so_far"] = ongoing
        edu["degree"] = match_education.match_degree(hit.group('degree'), 99)
        edu["discipline_name"] = hit.group('discipline').strip()
    return edu
Exemple #7
0
def extract_projectinfo(text):
    """Parse a project block where the name follows the time-range line.

    A line matching the module-level ``project_reg`` supplies the time range;
    the very next line is taken verbatim as the project name.  Later lines
    may carry a position label, a company label, or one of three section
    labels (introduction, responsibilities, achievements).  Once a section
    label is seen, the label is removed and that line plus all following
    lines are appended to the corresponding field until another section
    label appears.

    :param text: newline-separated text of a single project entry.
    :return: the struct from ``resume_struct.get_project_struct()`` with
        ``"ori_text"`` set to *text*.
    """
    project = resume_struct.get_project_struct()
    project["ori_text"] = text

    # Section label pattern -> struct field it feeds, checked in this order.
    section_labels = (
        (u"项目简介(:|:)", "describe"),
        (u"项目职责(:|:)", "responsibilities"),
        (u"项目业绩(:|:)", "achivement"),
    )

    name_is_next = False
    open_section = None  # field currently receiving free-text lines
    for row in text.split('\n'):
        if re.search(project_reg, row):
            stamp = match_timestamp.match_timestamp_by_reg(project_reg, row)
            begin, end, ongoing = StringUtils.transform_timestamp(stamp)
            project["start_time"] = begin
            project["end_time"] = end
            project["so_far"] = ongoing
            name_is_next = True
            continue
        if name_is_next:
            project["name"] = row
            name_is_next = False
            continue

        role = re.search(u"项目职务(:|:)\s*(?P<posi>.+)", row)
        if role:
            project["position_name"] = role.group("posi")
        employer = re.search(u"所在公司(:|:)(?P<corp>.+)", row)
        if employer:
            project["corporation_name"] = employer.group("corp")

        for label, field in section_labels:
            if re.search(label, row):
                row = re.sub(label, "", row).strip()
                open_section = field

        if open_section is not None:
            # Separate with a newline only when both sides are non-empty.
            if project[open_section] and row:
                project[open_section] += '\n' + row
            else:
                project[open_section] += row
    return project
Exemple #8
0
def extract_eduinfo(expblock):
    """Parse an education block with a header line plus free-text lines.

    A line matching the module-level ``edu_reg`` fills the school name,
    discipline name (group ``disc``), degree and time range.  Every other
    line is stripped and appended to ``discipline_desc``.  A leading
    discipline-description label is removed from the collected text at the
    end.

    :param expblock: newline-separated text of a single education entry.
    :return: the struct from ``resume_struct.get_education_struct()`` with
        ``"ori_text"`` set to *expblock*.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock

    for row in expblock.split("\n"):
        hit = re.search(edu_reg, row)
        if not hit:
            # Free text: newline-separate it from what is already collected.
            if edu["discipline_desc"]:
                edu["discipline_desc"] += "\n" + row.strip()
            else:
                edu["discipline_desc"] += row.strip()
            continue

        edu["school_name"] = hit.group("school").strip()
        edu["discipline_name"] = hit.group("disc").strip()
        degree_text = hit.group("degree").strip()
        edu["degree"] = match_education.match_degree(degree_text, 99)
        stamp = match_timestamp.match_timestamp_by_reg(edu_reg, row)
        begin, end, ongoing = StringUtils.transform_timestamp(stamp)
        edu["start_time"] = begin
        edu["end_time"] = end
        edu["so_far"] = ongoing

    unlabelled = re.sub(u"^专业描述(:|:)", "", edu["discipline_desc"])
    edu["discipline_desc"] = unlabelled.strip()
    return edu
Exemple #9
0
def extract_workinfo(text):
    """Parse a work-experience block laid out as consecutive header lines.

    The block is read as a small state machine: first a line matching the
    module-level ``work_reg`` (company and time range), then an industry
    line (text before the first ``"|"``), then a line whose first two
    whitespace-separated tokens are the department and the position, and
    finally free-text responsibilities.  A line containing the
    work-description label jumps straight to the responsibilities state.

    :param text: newline-separated text of a single work entry.
    :return: the struct from ``resume_struct.get_emplyment_struct()`` with
        ``"ori_text"`` set to *text*.
    """
    work = resume_struct.get_emplyment_struct()
    work["ori_text"] = text

    stage = "not found company"
    for row in text.split('\n'):
        if re.search(u"工作描述(:|:)", row):
            stage = "position"

        if stage == "not found company":
            company = re.search(work_reg, row)
            if company:
                stamp = match_timestamp.match_timestamp_by_reg(work_reg, row)
                company_text = company.group("company").strip()
                work["corporation_name"] = clean_company_name(company_text)
                begin, end, ongoing = StringUtils.transform_timestamp(stamp)
                work["start_time"] = begin
                work["end_time"] = end
                work["so_far"] = ongoing
                stage = "company name"
        elif stage == "company name":
            # str.split always yields at least one item, so the first cell
            # is always available.
            work["industry_name"] = row.split("|")[0].strip()
            stage = "industry"
        elif stage == "industry":
            tokens = re.split("\s+", row)
            if len(tokens) > 1:
                work["architecture_name"] = tokens[0]
                work["position_name"] = tokens[1]
            stage = "position"
        elif stage == "position":
            if work["responsibilities"]:
                work["responsibilities"] += '\n' + row
            else:
                work["responsibilities"] += row

    unlabelled = re.sub(u"^工作描述(:|:)", "", work["responsibilities"])
    work["responsibilities"] = unlabelled.strip()
    return work
Exemple #10
0
def extract_workinfo(text):
    """Parse a work-experience block that uses labelled position/industry.

    A line matching the module-level ``work_reg`` gives the company (cleaned
    with ``clean_company_name``) and the time range.  A line with both the
    position and department labels fills ``position_name`` and
    ``architecture_name``.  A line with the industry label fills
    ``industry_name``; only after such a line has been seen are further
    lines consumed, first as a tab-separated department/position line (while
    no position is known) and afterwards as responsibilities text.

    :param text: newline-separated text of a single work entry.
    :return: the struct from ``resume_struct.get_emplyment_struct()`` with
        ``"ori_text"`` set to *text*.
    """
    work = resume_struct.get_emplyment_struct()
    work["ori_text"] = text
    industry_seen = False

    for row in text.split('\n'):
        company = re.search(work_reg, row)
        if company:
            stamp = match_timestamp.match_timestamp_by_reg(work_reg, row)
            work["corporation_name"] = company.group("company").strip()
            work["corporation_name"] = clean_company_name(
                work["corporation_name"])
            begin, end, ongoing = StringUtils.transform_timestamp(stamp)
            work["start_time"] = begin
            work["end_time"] = end
            work["so_far"] = ongoing

        titled = re.search(u"职位名称(:|:)(?P<pos>.+)部门(:|:)(?P<arc>.+)", row)
        if titled:
            position_text = titled.group("pos").replace(u"(兼职)", "")
            work["position_name"] = position_text.strip()
            work["architecture_name"] = titled.group("arc")

        industry = re.search(u"(行业|所属行业)(:|:)\s*(?P<ind>.+?)(\s|$)", row)
        if industry:
            work["industry_name"] = industry.group("ind").strip()
            industry_seen = True
            continue

        if not work["position_name"] and industry_seen:
            cells = row.split("\t")
            if len(cells) == 2:
                work["architecture_name"] = cells[0].strip()
                work["position_name"] = cells[1].strip()
            elif len(cells) == 1:
                work["position_name"] = cells[0].strip()
            continue

        if industry_seen:
            if work["responsibilities"]:
                work["responsibilities"] += "\n" + row
            else:
                work["responsibilities"] += row

    unlabelled = re.sub(u"^工作描述(:|:)", "", work["responsibilities"])
    work["responsibilities"] = unlabelled.strip()
    return work
Exemple #11
0
def extract_workinfo(text):
    """Parse a work-experience block laid out as time / company / position.

    A line matching the module-level ``work_reg`` gives the time range; the
    next line is stored verbatim as the company name and the one after it as
    the position name.  Later lines may carry a location label (stored as
    ``city``) and a duties label; from the duties label onward every line is
    appended to ``responsibilities``.

    :param text: newline-separated text of a single work entry.
    :return: the struct from ``resume_struct.get_emplyment_struct()`` with
        ``"ori_text"`` set to *text*.
    """
    work = resume_struct.get_emplyment_struct()
    work["ori_text"] = text

    previous_kind = ""   # what the preceding header line contained
    collecting = False   # True once the duties label has been seen
    for row in text.split('\n'):
        if re.search(work_reg, row):
            stamp = match_timestamp.match_timestamp_by_reg(work_reg, row)
            begin, end, ongoing = StringUtils.transform_timestamp(stamp)
            work["start_time"] = begin
            work["end_time"] = end
            work["so_far"] = ongoing
            previous_kind = "time"
            continue
        if previous_kind == "time":
            work["corporation_name"] = row
            previous_kind = "corp_name"
            continue
        if previous_kind == "corp_name":
            work["position_name"] = row
            previous_kind = ""
            continue

        location = re.search(u"所在地区(:|:)(?P<loc>.+)", row)
        if location:
            work["city"] = location.group("loc")

        if re.search(u"职责业绩(:|:)", row):
            # Drop the label and anything in front of it.
            row = re.sub(u".*职责业绩(:|:)", "", row)
            collecting = True

        if collecting:
            if work["responsibilities"] and row:
                work["responsibilities"] += "\n" + row
            else:
                work["responsibilities"] += row

    unlabelled = re.sub(u"^工作描述(:|:)", "", work["responsibilities"])
    work["responsibilities"] = unlabelled.strip()
    return work
Exemple #12
0
def extract_eduinfo(expblock):
    """Parse an education block laid out as school / degree / description.

    The block is read as a small state machine:

    * until a line matches the module-level ``edu_reg``, lines are skipped;
      the matching line gives the school name (with the overseas-experience
      marker removed) and the time range;
    * the next line is split on ``"|"``: with two or more cells the first is
      the degree and the second the discipline; with a single cell it is the
      degree if ``match_education.match_degree`` recognises it, otherwise
      the discipline;
    * all remaining lines are stripped and appended to ``discipline_desc``.

    A leading discipline-description label is removed from the collected
    text before returning.

    :param expblock: newline-separated text of a single education entry.
    :return: the struct from ``resume_struct.get_education_struct()`` with
        ``"ori_text"`` set to *expblock*.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock

    lastline = "not found school"
    for line in expblock.split("\n"):
        if lastline == "not found school":
            m = re.search(edu_reg, line)
            if m:
                timestamp = match_timestamp.match_timestamp_by_reg(
                    edu_reg, line)
                edu["school_name"] = re.sub(u"海外经历", "",
                                            m.group("school")).strip()
                edu["start_time"], edu["end_time"], edu[
                    "so_far"] = StringUtils.transform_timestamp(timestamp)
                lastline = "school"
        elif lastline == "school":
            items = line.split("|")
            if len(items) >= 2:
                edu["degree"] = match_education.match_degree(items[0], 99)
                edu["degree_ori"] = items[0].strip()
                edu["discipline_name"] = items[1].strip()
            # Probe with the cell text itself; the list was being passed
            # here while every other call hands match_degree a string.
            elif match_education.match_degree(items[0]):
                edu["degree"] = match_education.match_degree(items[0])
                edu["degree_ori"] = items[0].strip()
            else:
                edu["discipline_name"] = items[0].strip()
            lastline = "degree"
        elif lastline == "degree":
            edu["discipline_desc"] += '\n' + line.strip(
            ) if edu["discipline_desc"] else line.strip()

    edu["discipline_desc"] = re.sub(u"^专业描述(:|:)", "",
                                    edu["discipline_desc"]).strip()
    return edu
Exemple #13
0
def extract_eduinfo(expblock):
    """Parse an education block where the school follows the time line.

    A line matching the module-level ``edu_reg`` gives the time range; the
    next line (stripped) is the school name.  That same line and all later
    ones are also checked for a discipline label and a degree label, the
    latter being mapped through ``match_education.match_degree``.

    :param expblock: newline-separated text of a single education entry.
    :return: the struct from ``resume_struct.get_education_struct()`` with
        ``"ori_text"`` set to *expblock*.
    """
    edu = resume_struct.get_education_struct()
    edu["ori_text"] = expblock

    school_is_next = False
    for row in expblock.split("\n"):
        if re.search(edu_reg, row):
            stamp = match_timestamp.match_timestamp_by_reg(edu_reg, row)
            begin, end, ongoing = StringUtils.transform_timestamp(stamp)
            edu["start_time"] = begin
            edu["end_time"] = end
            edu["so_far"] = ongoing
            school_is_next = True
            continue

        if school_is_next:
            # No "continue": the school line is still scanned for labels.
            edu["school_name"] = row.strip()
            school_is_next = False

        discipline = re.search(u"专业(:|:)(?P<dis>.+)", row)
        if discipline:
            edu["discipline_name"] = discipline.group("dis").strip()

        degree = re.search(u"学历(:|:)(?P<deg>.+)", row)
        if degree:
            degree_text = degree.group("deg").strip()
            edu["degree"] = match_education.match_degree(degree_text)
    return edu
Exemple #14
0
def extract_projectinfo(text):
    """Parse a project block whose header must precede any section text.

    Lines are skipped until one matches the module-level ``project_reg``;
    that line gives the project name (group ``project``, with a trailing
    "linked" marker removed) and the time range.  After the header, a
    description label or a responsibility label opens the matching section;
    the label is removed and that line plus every following line is appended
    to the open section.

    :param text: newline-separated text of a single project entry.
    :return: the struct from ``resume_struct.get_project_struct()`` with
        ``"ori_text"`` set to *text*.
    """
    project = resume_struct.get_project_struct()
    project["ori_text"] = text

    # Section label pattern -> struct field it feeds, checked in this order.
    section_labels = (
        (u"项目描述(:|:)", "describe"),
        (u"责任描述(:|:)", "responsibilities"),
    )

    stage = "not found project"
    open_section = None  # field currently receiving free-text lines
    for row in text.split('\n'):
        if stage == "not found project":
            header = re.search(project_reg, row)
            if header:
                stamp = match_timestamp.match_timestamp_by_reg(
                    project_reg, row)
                project["name"] = header.group("project").strip()
                project["name"] = re.sub(u"已关联$", "", project["name"]).strip()
                begin, end, ongoing = StringUtils.transform_timestamp(stamp)
                project["start_time"] = begin
                project["end_time"] = end
                project["so_far"] = ongoing
                stage = "project"
        elif stage == "project":
            for label, field in section_labels:
                if re.search(label, row):
                    row = re.sub(label, "", row).strip()
                    open_section = field

            if open_section is not None:
                if project[open_section]:
                    project[open_section] += '\n' + row
                else:
                    project[open_section] += row
    return project
Exemple #15
0
def extract_basicinfo(text):
    """Extract basic personal information from a resume header block.

    The raw text is stored under a key of the form
    ``"ori_text_<random int 0..1000>"``.  Each line is then scanned, together
    with the line before it, for the candidate name, update time, e-mail,
    phone, gender, age, birth date, home phone, years of experience, marital
    status, registered residence and current address.  Only fields that are
    found are added to the result.

    :param text: newline-separated text of the basic-info block.
    :return: a dict with the extracted fields.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text

    lines = text.split('\n')
    # Pair every line with its predecessor ("" for the first line).
    for line_pre, line in izip([""] + lines, lines):
        # Name: the text in front of a status label, a tag label or an
        # "ID:<digits>" marker on the same line.
        if re.search(u"(.+)\s*流程状态.+标签.+", line) or re.search(
                u"(.+)\s*标签.+", line) or re.search(u"(.+)\s*ID:\d+", line):
            if u"流程状态" in line:
                basic_info["name"] = re.search(u"(.+)\s*流程状态",
                                               line).group(1).strip()
            elif u"标签" in line:
                basic_info["name"] = re.search(u"(.+)\s*标签",
                                               line).group(1).strip()
            else:
                basic_info["name"] = re.search(u"(.+)\s*ID",
                                               line).group(1).strip()
            # Update time: taken from the previous line when that line
            # starts with the update-time label.
            if line_pre.strip().startswith(u"更新时间:"):
                m = re.search("\d{4}-\d{2}-\d{2}", line_pre)
                if m: basic_info["updated_at"] = m.group() + " 00:00:00"
        # Fallback 1: the line above an "ID:<4+ digits>" line is the name if
        # StringUtils.get_words yields 2-4 items for it.
        if "name" not in basic_info and re.search(u"^ID(:|:)\d{4,}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre
        # Fallback 2: same rule for the line above one starting with 11
        # digits.
        if "name" not in basic_info and re.search(u"^\d{11}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre

        # E-mail and phone, as recognised by the match_basic helpers.
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email: basic_info["contact_email"] = email
        if phone: basic_info["contact_phone"] = phone
        # Gender is only accepted on a line that also mentions age or
        # experience; "U" is the helper's no-match value as used here.
        if "U" != match_basic.match_gender(line) and (u"岁" in line
                                                      or u"经验" in line):
            basic_info["gender"] = match_basic.match_gender(line)
        mage = re.search(u"\d+岁", line)
        # Drop the trailing age character before converting to int.
        if mage: basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date is only treated as the birth date when the same line also
        # carries an age.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")
        # Remaining labelled fields.
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel: basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp: basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        maccount = re.search(u"户口/国籍:(.+)", line)
        if maccount:
            # Keep both the raw text and the region resolved from it.
            basic_info["account_str"] = maccount.group(1).strip()
            basic_info["account"] = match_region.match_region(
                maccount.group(1).strip())
        # The last character of the label is optional in this pattern; the
        # value runs up to the next "|" or the end of the line.
        maddress = re.search(u"现居住?(.+?)(\||$)", line)
        if maddress:
            basic_info["address_str"] = maddress.group(1).strip()
            basic_info["address"] = match_region.match_region(
                maddress.group(1).strip())

    return basic_info
Exemple #16
0
def extract_basicinfo(text):
    """Extract basic personal information from a resume header block.

    The raw text is stored under a key of the form
    ``"ori_text_<random int 0..1000>"``.  When the first line starts with the
    self-evaluation heading, the remaining lines are returned as
    ``"self_remark"`` and nothing else is parsed.  Otherwise each line is
    scanned, together with the line before it, for update time, name,
    e-mail, phone, gender, age, birth year/month, home phone, years of
    experience, marital status, registered residence, current address,
    an 18-character ID-card number and an overseas-work flag.

    :param text: newline-separated text of the basic-info block.
    :return: a dict with the extracted fields.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text

    lines = text.split('\n')

    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info

    # Pair every line with its predecessor ("" for the first line).
    for line_pre, line in izip([""] + lines, lines):

        # Update time: searched in the previous and current line joined, so
        # a label and its date split over two lines are still found.
        mupdate = re.search(u"简历更新时间(:|:)\s*(?P<t>\d{4}\.\d{2}\.\d{2})",
                            line_pre + line)
        if mupdate:
            basic_info["updated_at"] = mupdate.group("t").replace(
                ".", "-") + " 00:00:00"

        # Name: either the line after an "ID:" line or the line before a
        # mobile-phone line, accepted only when StringUtils.get_words yields
        # 2-4 items; otherwise "" is stored so a later line can still set it.
        mname_id = re.search(u"^ID:", line_pre)
        mname_ph = re.search(u"^手机:", line)
        if mname_id and ("name" not in basic_info or not basic_info["name"]):
            basic_info["name"] = line.strip() if len(
                StringUtils.get_words(line.strip())) in [2, 3, 4] else ""
        if mname_ph and ("name" not in basic_info or not basic_info["name"]):
            basic_info["name"] = line_pre.strip() if len(
                StringUtils.get_words(line_pre.strip())) in [2, 3, 4] else ""

        # E-mail and phone.  The labelled mobile number is looked up on a
        # copy of the line with spaces removed and the substitution below
        # applied first.
        # NOTE(review): the substitution looks intended to drop a
        # parenthesised remark - confirm the pattern against the original
        # source encoding.
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        m_ph = re.search(u"^手机:(\d+)",
                         re.sub(u"(\(|().+(\)|))", "", line).replace(" ", ""))
        if email: basic_info["contact_email"] = email
        # The labelled number always wins; the generic match only fills the
        # field while it is still unset.
        if m_ph: basic_info["contact_phone"] = m_ph.group(1)
        elif phone and "contact_phone" not in basic_info:
            basic_info["contact_phone"] = phone

        # Gender is only accepted on a line that also mentions age or
        # experience; "U" is the helper's no-match value as used here.
        if "U" != match_basic.match_gender(line) and (u"岁" in line
                                                      or u"经验" in line):
            basic_info["gender"] = match_basic.match_gender(line)
        mage = re.search(u"\d+岁", line)
        # Drop the trailing age character before converting to int.
        if mage: basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"(?P<y>\d{4})年\s*(?P<m>\d{1,2})月", line)
        # Birth is stored as year + zero-padded month, and only when the
        # same line also carries an age.
        if mbirth and mage:
            month = mbirth.group('m')
            if len(mbirth.group("m")) == 1:
                month = '0' + month
            basic_info["birth"] = mbirth.group('y') + u'年' + month + u'月'
        # Remaining labelled fields.
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel: basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp: basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"(已|未)婚|Single|Married", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(0).strip())
        maccount = re.search(u"户口:(.+?)(\||$)", line)
        if maccount:
            # NOTE(review): account_str keeps the whole match (label and any
            # trailing "|" included) while the region lookup uses group 1
            # only - confirm this is intended.
            basic_info["account_str"] = maccount.group().strip()
            basic_info["account"] = match_region.match_region(
                maccount.group(1).strip())
        maddress = re.search(u"现居住地:(.+?)(\||$)", line)
        if maddress:
            # NOTE(review): same whole-match vs. group-1 split as above.
            basic_info["address_str"] = maddress.group().strip()
            basic_info["address"] = match_region.match_region(
                maddress.group(1).strip())
            basic_info["address_province"] = "10"  # hard-coded value; original note reads "reminder: obtain it"
        # Any run of 17 digits followed by a digit or x/X is taken as the
        # ID-card number.
        id_card = re.search(u"[0-9]{17}[xX0-9]", line)
        if id_card:
            basic_info["card"] = id_card.group(0)
        overseas = re.search(u"有海外工作|Overseas Work", line)
        if overseas:
            basic_info["overseas"] = "Y"
    return basic_info
Exemple #17
0
def _clean_cand_headline(text):
    """Normalise a candidate headline.

    The text is stripped, upper-cased and then passed to
    ``StringUtils.clean_all_un_chinese`` with ``["/", "IT"]`` as its second
    argument.

    :param text: the raw headline string.
    :return: whatever ``StringUtils.clean_all_un_chinese`` returns.
    """
    shouted = text.strip().upper()
    return StringUtils.clean_all_un_chinese(shouted, ["/", "IT"])
Exemple #18
0
def _clean_text(text):
    """Run *text* through the StringUtils cleaners and lower-case it.

    The cleaners are applied in order: ``removeShapesSymbols``,
    ``removeGeneralPunctuation``, ``removeChSpace``.

    :param text: the string to clean.
    :return: the cleaned, lower-cased string.
    """
    no_shapes = StringUtils.removeShapesSymbols(text)
    no_punct = StringUtils.removeGeneralPunctuation(no_shapes)
    compact = StringUtils.removeChSpace(no_punct)
    return compact.lower()