Example #1
0
def handler(tag):
    """Build an Employee from the ``<td>`` cells found under ``tag``.

    Args:
        tag: A parsed element exposing ``find_all`` (e.g. a BeautifulSoup
            table-row tag).

    Returns:
        An ``Employee`` whose ``name``, ``title`` and ``departments`` come
        from cells 0, 2 and 3, with ``url`` set when the name cell holds a
        link that carries an ``href``. Returns ``None`` when there are fewer
        than four cells or when the name cell contains the text u'姓名'
        (presumably the table's header row).
    """
    tds = tag.find_all(name='td')
    if not tds:
        print("len(tds) == 0")
        return None
    if len(tds) < 4:
        print("len(tds) = %d" % len(tds))
        return None

    name_tag = tds[0]
    name = name_tag.get_text().strip()
    if name == u'姓名':
        return None

    # Construct the record only after the row has passed every check, so
    # rejected rows never create a throw-away Employee.
    employee = Employee()
    employee.name = name

    # An <a> without an href (e.g. a named anchor) must not abort the row
    # with a KeyError; in that case the url is simply left unset.
    link = name_tag.find('a')
    if link is not None:
        href = link.get('href')
        if href is not None:
            employee.url = href

    employee.title = tds[2].get_text().strip()
    employee.departments = tds[3].get_text().strip()

    return employee
Example #2
0
def handler(tag):
    """Convert the ``<td>`` cells under ``tag`` into an Employee record.

    Args:
        tag: A parsed element that supports ``find_all`` (e.g. a
            BeautifulSoup table-row tag).

    Returns:
        ``None`` when the element has fewer than four cells, or when the
        first cell's stripped text is u'姓名' (presumably the header row).
        Otherwise an ``Employee`` with ``name`` (cell 0), ``title`` (cell 2)
        and ``departments`` (cell 3) filled in, and ``url`` taken from the
        first link in the name cell when that link has an ``href``.
    """
    cells = tag.find_all(name='td')
    if not cells:
        print("len(tds) == 0")
        return None
    if len(cells) < 4:
        print("len(tds) = %d" % len(cells))
        return None

    name_cell = cells[0]
    stripped_name = name_cell.get_text().strip()
    if stripped_name == u'姓名':
        return None

    # The Employee is created only for rows that are actually kept.
    employee = Employee()
    employee.name = stripped_name

    anchors = name_cell.find_all('a')
    if anchors:
        # .get() instead of ['href']: a link without an href attribute
        # leaves url unset rather than raising KeyError.
        href = anchors[0].get('href')
        if href is not None:
            employee.url = href

    employee.title = cells[2].get_text().strip()
    employee.departments = cells[3].get_text().strip()

    return employee
Example #3
0
def profile_handler(doc, name, url, path):
    """Parse one profile page into an Employee and cache the page on disk.

    Args:
        doc: Markup of the profile page; passed to ``BeautifulSoup``.
        name: Employee name; also the base name of the cached file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written if it does not
            exist yet.

    Returns:
        The result of ``ProfileParser.parse()`` run over the page's text
        lines with the pre-filled Employee.
    """
    filename = os.path.join(path, name + ".html")
    # The Employee keeps only the name and homepage URL up front; the
    # profile page itself is saved separately as a file under `path`.
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    div = soup.find(name="div",
                    attrs={"class": "page_right addpage_right"})
    if div is None:
        # No dedicated content container: fall back to the whole document.
        div = soup

    if not os.path.exists(filename):
        # The file is opened in binary mode, so ask prettify() for encoded
        # bytes. Render before opening so that a rendering failure cannot
        # leave an empty file that would block later attempts.
        content = div.prettify(encoding="utf-8")
        with open(filename, 'wb') as fp:
            fp.write(content)

    tds = div.find_all('td')
    if len(tds) == 11:
        # (cell index, Employee attribute) pairs of the 11-cell layout.
        cell_fields = (
            (2, 'departments'),
            (4, 'title'),
            (8, 'email'),
            (10, 'research'),
        )
        for index, attr in cell_fields:
            # Drop all whitespace; only non-empty values are stored.
            value = ''.join(tds[index].get_text().split())
            if value:
                setattr(employee, attr, value)

    text_div = soup.find(name="div", attrs={"class": "text_more"})
    if text_div is not None:
        div = text_div
    # The remaining details are extracted from the plain text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Example #4
0
def _compact_text(cell):
    """Return the text of ``cell`` with every whitespace character removed."""
    return ''.join(cell.get_text().split())


def profile_handler(doc, name, url, path):
    """Extract an Employee from a profile page and save the page to disk.

    Args:
        doc: Markup of the profile page; parsed with ``BeautifulSoup``.
        name: Employee name, reused as the saved file's base name.
        url: Profile URL recorded on the Employee.
        path: Directory that receives ``<name>.html`` when that file is not
            already present.

    Returns:
        Whatever ``ProfileParser.parse()`` returns for the page's text lines
        and the pre-filled Employee.
    """
    filename = os.path.join(path, name + ".html")
    # Only the name and homepage URL are stored on the Employee here; the
    # profile page itself is written to its own file under `path`.
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    matches = soup.find_all(name="div",
                            attrs={"class": "page_right addpage_right"},
                            limit=1)
    # Without the content container, work on the whole document instead.
    div = matches[0] if matches else soup

    if not os.path.exists(filename):
        # prettify() yields text unless an encoding is given, and text cannot
        # be written to a file opened in 'wb'. Rendering happens before the
        # file is created so a failure does not leave an empty file behind.
        content = div.prettify(encoding="utf-8")
        with open(filename, 'wb') as fp:
            fp.write(content)

    tds = div.find_all('td')
    if len(tds) == 11:
        # Values sit at fixed positions of the 11-cell layout; empty values
        # leave the corresponding attribute untouched.
        department = _compact_text(tds[2])
        if department:
            employee.departments = department

        title = _compact_text(tds[4])
        if title:
            employee.title = title

        email = _compact_text(tds[8])
        if email:
            employee.email = email

        research = _compact_text(tds[10])
        if research:
            employee.research = research

    matches = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if matches:
        div = matches[0]
    # The rest of the profile is processed as plain text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()