Ejemplo n.º 1
0
def extract_basicinfo(text):
    """Extract basic personal details from one block of resume text.

    The block is scanned line by line; every line is examined together with
    the line before it, and each field that is recognised is written into the
    result.  When a field is recognised on several lines, the last one wins.

    Args:
        text: Resume text block (unicode) whose lines are newline-separated.

    Returns:
        dict: Always holds the unmodified ``text`` under a key of the form
        ``ori_text_<n>`` (``n`` is a random integer in 0..1000).  When the
        first line starts with "自我评价" the only other key is
        ``self_remark`` (all remaining lines joined with newlines).
        Otherwise any of ``updated_at``, ``name``, ``contact_email``,
        ``contact_phone``, ``gender``, ``age``, ``birth``, ``contact_tel``,
        ``work_experience``, ``marital``, ``account_str``, ``account``,
        ``address_str`` and ``address`` may be present.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text

    lines = text.split('\n')
    # A block headed by "自我评价" is stored wholesale as the self remark.
    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info

    # Pair each line with its predecessor ("" for the very first line).
    for line_pre, line in izip([""] + lines, lines):
        ## name update
        m_update = re.search(u"更新时间:\s*(?P<up>\d{4}-\d{2}-\d{2})", line)
        if m_update:
            basic_info["updated_at"] = m_update.group("up") + " 00:00:00"
        # A line with at least three "|"-separated fields marks the line above
        # it as the name, provided that line has 2-4 words according to
        # StringUtils.get_words.
        if len(line.split("|")) >= 3:
            candidate = line_pre.strip()
            if len(StringUtils.get_words(candidate)) in [2, 3, 4]:
                basic_info["name"] = candidate

        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email:
            basic_info["contact_email"] = email
        if phone:
            basic_info["contact_phone"] = phone

        ## gender age birth
        # "U" is skipped (presumably "unknown" -- confirm against
        # match_basic); a gender is only accepted on a line that also
        # mentions age ("岁") or experience ("经验").
        gender = match_basic.match_gender(line)
        if "U" != gender and (u"岁" in line or u"经验" in line):
            basic_info["gender"] = gender
        mage = re.search(u"\d+岁", line)
        if mage:
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date only counts as the birth date when an age is on the same line.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")

        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        # The label may be followed by an ASCII colon or a full-width colon
        # (U+FF1A).
        maccount = re.search(u"(户口/国籍|户\s*口)[:\uff1a](?P<acc>.+)", line)
        if maccount:
            account_str = maccount.group("acc").strip()
            basic_info["account_str"] = account_str
            basic_info["account"] = match_region.match_region(account_str)
        # The address value ends at the first tab, "|" or end of line.
        maddress = re.search(
            u"(居住地|现居住)[:\uff1a]\s*(?P<s>.+?)(\t|\||$)", line)
        if maddress:
            address_str = maddress.group("s").strip()
            basic_info["address_str"] = address_str
            basic_info["address"] = match_region.match_region(address_str)

    return basic_info
Ejemplo n.º 2
0
def extract_basicinfo(text):
    """Extract basic personal details from one block of resume text.

    The block is scanned line by line; every line is examined together with
    the line before it, and each field that is recognised is written into the
    result.  Apart from the guarded name fallbacks, a field recognised on
    several lines keeps the value from the last one.

    Args:
        text: Resume text block (unicode) whose lines are newline-separated.

    Returns:
        dict: Always holds the unmodified ``text`` under a key of the form
        ``ori_text_<n>`` (``n`` is a random integer in 0..1000), plus any of
        ``name``, ``updated_at``, ``contact_email``, ``contact_phone``,
        ``gender``, ``age``, ``birth``, ``contact_tel``, ``work_experience``,
        ``marital``, ``account_str``, ``account``, ``address_str`` and
        ``address`` that were found.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text

    lines = text.split('\n')
    # Pair each line with its predecessor ("" for the very first line).
    for line_pre, line in izip([""] + lines, lines):
        ## name update
        is_header = (re.search(u"(.+)\s*流程状态.+标签.+", line)
                     or re.search(u"(.+)\s*标签.+", line)
                     or re.search(u"(.+)\s*ID:\d+", line))
        if is_header:
            # The name is whatever precedes the first keyword present, in the
            # priority order 流程状态 > 标签 > ID.
            if u"流程状态" in line:
                m_name = re.search(u"(.+)\s*流程状态", line)
            elif u"标签" in line:
                m_name = re.search(u"(.+)\s*标签", line)
            else:
                m_name = re.search(u"(.+)\s*ID", line)
            # The keyword can be present without any text in front of it
            # (e.g. a line that starts with "流程状态"); in that case the
            # search finds nothing and the name is left unset.
            if m_name:
                basic_info["name"] = m_name.group(1).strip()
            if line_pre.strip().startswith(u"更新时间:"):
                m = re.search("\d{4}-\d{2}-\d{2}", line_pre)
                if m:
                    basic_info["updated_at"] = m.group() + " 00:00:00"
        # Fallbacks: the line above an "ID:<4+ digits>" line, or above a line
        # starting with 11 digits, is used as the name when it has 2-4 words
        # according to StringUtils.get_words.  The ID label accepts an ASCII
        # or a full-width colon (U+FF1A).
        if "name" not in basic_info and re.search(u"^ID[:\uff1a]\d{4,}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre
        if "name" not in basic_info and re.search(u"^\d{11}", line):
            if len(StringUtils.get_words(line_pre)) in [2, 3, 4]:
                basic_info["name"] = line_pre

        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        if email:
            basic_info["contact_email"] = email
        if phone:
            basic_info["contact_phone"] = phone

        ## gender age birth
        # "U" is skipped (presumably "unknown" -- confirm against
        # match_basic); a gender is only accepted on a line that also
        # mentions age ("岁") or experience ("经验").
        gender = match_basic.match_gender(line)
        if "U" != gender and (u"岁" in line or u"经验" in line):
            basic_info["gender"] = gender
        mage = re.search(u"\d+岁", line)
        if mage:
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"\d{4}年\s?\d{1,2}月\s?\d{1,2}日", line)
        # A date only counts as the birth date when an age is on the same line.
        if mbirth and mage:
            basic_info["birth"] = mbirth.group().replace(" ", "")

        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"婚姻状况:(.+)", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(1).strip())
        maccount = re.search(u"户口/国籍:(.+)", line)
        if maccount:
            account_str = maccount.group(1).strip()
            basic_info["account_str"] = account_str
            basic_info["account"] = match_region.match_region(account_str)
        # Matches "现居" with an optional "住"; the captured value runs up to
        # the first "|" or the end of the line.
        maddress = re.search(u"现居住?(.+?)(\||$)", line)
        if maddress:
            address_str = maddress.group(1).strip()
            basic_info["address_str"] = address_str
            basic_info["address"] = match_region.match_region(address_str)

    return basic_info
Ejemplo n.º 3
0
def extract_basicinfo(text):
    """Extract basic personal details from one block of resume text.

    The block is scanned line by line; every line is examined together with
    the line before it, and each field that is recognised is written into the
    result.  Unless a rule below says otherwise, a field recognised on several
    lines keeps the value from the last one.

    Args:
        text: Resume text block (unicode) whose lines are newline-separated.

    Returns:
        dict: Always holds the unmodified ``text`` under a key of the form
        ``ori_text_<n>`` (``n`` is a random integer in 0..1000).  When the
        first line starts with "自我评价" the only other key is
        ``self_remark`` (all remaining lines joined with newlines).
        Otherwise any of ``updated_at``, ``name``, ``contact_email``,
        ``contact_phone``, ``gender``, ``age``, ``birth``, ``contact_tel``,
        ``work_experience``, ``marital``, ``account_str``, ``account``,
        ``address_str``, ``address``, ``address_province``, ``card`` and
        ``overseas`` may be present.
    """
    basic_info = {}
    basic_info["ori_text_" + str(random.randint(0, 1000))] = text

    lines = text.split('\n')

    # A block headed by "自我评价" is stored wholesale as the self remark.
    if lines[0].startswith(u"自我评价"):
        basic_info["self_remark"] = "\n".join(lines[1:])
        return basic_info

    # Pair each line with its predecessor ("" for the very first line).
    for line_pre, line in izip([""] + lines, lines):

        ## update
        # Searched across the previous and current line joined together, so a
        # label and date split over two lines are still found.  The label
        # accepts an ASCII or a full-width colon (U+FF1A).
        mupdate = re.search(
            u"简历更新时间[:\uff1a]\s*(?P<t>\d{4}\.\d{2}\.\d{2})",
            line_pre + line)
        if mupdate:
            basic_info["updated_at"] = mupdate.group("t").replace(
                ".", "-") + " 00:00:00"

        ## name
        # The name is the line after an "ID:" line or the line before a
        # "手机:" line, accepted only when it has 2-4 words according to
        # StringUtils.get_words.  A rejected candidate is stored as "" so a
        # later candidate can still fill it in.
        mname_id = re.search(u"^ID:", line_pre)
        mname_ph = re.search(u"^手机:", line)
        if mname_id and ("name" not in basic_info or not basic_info["name"]):
            candidate = line.strip()
            basic_info["name"] = candidate if len(
                StringUtils.get_words(candidate)) in [2, 3, 4] else ""
        if mname_ph and ("name" not in basic_info or not basic_info["name"]):
            candidate = line_pre.strip()
            basic_info["name"] = candidate if len(
                StringUtils.get_words(candidate)) in [2, 3, 4] else ""

        ## email phone
        email = match_basic.match_email(line)
        phone = match_basic.match_phone(line)
        # Drop a bracketed annotation (ASCII or full-width brackets) and all
        # spaces before reading the digits that follow the "手机:" label.
        cleaned = re.sub(u"[(\uff08].+[)\uff09]", "", line).replace(" ", "")
        m_ph = re.search(u"^手机:(\d+)", cleaned)
        if email:
            basic_info["contact_email"] = email
        # An explicitly labelled mobile number always wins; the generic
        # matcher only fills the field while it is still empty.
        if m_ph:
            basic_info["contact_phone"] = m_ph.group(1)
        elif phone and "contact_phone" not in basic_info:
            basic_info["contact_phone"] = phone

        ## gender age birth
        # "U" is skipped (presumably "unknown" -- confirm against
        # match_basic); a gender is only accepted on a line that also
        # mentions age ("岁") or experience ("经验").
        gender = match_basic.match_gender(line)
        if "U" != gender and (u"岁" in line or u"经验" in line):
            basic_info["gender"] = gender
        mage = re.search(u"\d+岁", line)
        if mage:
            basic_info["age"] = int(mage.group()[:-1])
        mbirth = re.search(u"(?P<y>\d{4})年\s*(?P<m>\d{1,2})月", line)
        # A year/month only counts as the birth date when an age is on the
        # same line; the month is zero-padded to two digits.
        if mbirth and mage:
            month = mbirth.group("m").zfill(2)
            basic_info["birth"] = mbirth.group("y") + u'年' + month + u'月'

        ## others
        mtel = re.search(u"家庭电话:(.+)", line)
        if mtel:
            basic_info["contact_tel"] = mtel.group(1).strip()
        mexp = re.search(u"(\d+)年工作经验", line)
        if mexp:
            basic_info["work_experience"] = int(mexp.group(1))
        mmary = re.search(u"(已|未)婚|Single|Married", line)
        if mmary:
            basic_info["marital"] = match_basic.match_marital(
                mmary.group(0).strip())
        maccount = re.search(u"户口:(.+?)(\||$)", line)
        if maccount:
            # NOTE(review): account_str keeps the whole match (label and any
            # trailing "|"), while only the captured value is sent to
            # match_region -- confirm this is what consumers expect.
            basic_info["account_str"] = maccount.group().strip()
            basic_info["account"] = match_region.match_region(
                maccount.group(1).strip())
        maddress = re.search(u"现居住地:(.+?)(\||$)", line)
        if maddress:
            # NOTE(review): as with account_str, address_str keeps the whole
            # match including the label.
            basic_info["address_str"] = maddress.group().strip()
            basic_info["address"] = match_region.match_region(
                maddress.group(1).strip())
            # NOTE(review): hard-coded value; the original note reads as a
            # reminder that the real value still has to be obtained.
            basic_info["address_province"] = "10"
        # Any run of 17 digits followed by a digit or x/X is stored as the
        # ID-card number.
        id_card = re.search(u"[0-9]{17}[xX0-9]", line)
        if id_card:
            basic_info["card"] = id_card.group(0)
        overseas = re.search(u"有海外工作|Overseas Work", line)
        if overseas:
            basic_info["overseas"] = "Y"
    return basic_info