def handler(tag):
    """Extract an Employee from one table-row tag.

    Expects *tag* to contain at least 4 ``<td>`` cells:
    index 0 holds the name (optionally wrapping an ``<a>`` whose href is
    the personal page URL), index 2 the title, index 3 the departments.

    Returns the populated Employee, or None for an empty row, a row with
    fewer than 4 cells, or the header row (name cell equal to '姓名').
    """
    tds = tag.find_all(name='td')
    if not tds:
        print("len(tds) == 0")
        return None
    if len(tds) < 4:
        print("len(tds) = %d" % len(tds))
        return None
    # Construct the Employee only after the row has passed validation.
    employee = Employee()
    name_tag = tds[0]
    employee.name = name_tag.get_text().strip()
    if employee.name == u'姓名':
        # Header row of the table — not a real employee.
        return None
    anchors = name_tag.find_all('a')
    if anchors:  # an empty result list is falsy; no explicit len() needed
        employee.url = anchors[0]['href']
    employee.title = tds[2].get_text().strip()
    employee.departments = tds[3].get_text().strip()
    return employee
def handler(tag):
    """Build an Employee from a table row.

    The row must provide at least 4 ``<td>`` cells: cell 0 is the name
    (may contain an ``<a>`` linking to the personal page), cell 2 the
    title, cell 3 the departments.

    Returns None when the row is empty, too short, or is the table
    header (name cell equal to '姓名'); otherwise the Employee.
    """
    cells = tag.find_all(name='td')
    if not cells:
        print("len(tds) == 0")
        return None
    if len(cells) < 4:
        print("len(tds) = %d" % len(cells))
        return None
    name_cell = cells[0]
    # Only allocate the Employee once the row shape is known to be valid.
    employee = Employee()
    employee.name = name_cell.get_text().strip()
    if employee.name == u'姓名':
        # Skip the header row.
        return None
    links = name_cell.find_all('a')
    if links:  # truthiness already covers the empty-list case
        employee.url = links[0]['href']
    employee.title = cells[2].get_text().strip()
    employee.departments = cells[3].get_text().strip()
    return employee
def profile_handler(doc, name, url, path):
    """Parse one employee profile page.

    Saves the profile's main <div> as ``<path>/<name>.html`` when the file
    does not already exist, scrapes department / title / email / research
    from the profile table when it has exactly 11 cells, then delegates
    the free-text biography ("text_more" section) to ProfileParser.

    Returns the result of ``ProfileParser.parse()``, or None when the
    page has no "text_more" section.
    """
    filename = os.path.join(path, name + ".html")
    # Only name and homepage URL are stored here; the resume HTML is
    # saved separately into the current directory (see below).
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div",
                         attrs={"class": "page_right addpage_right"},
                         limit=1)
    # Fall back to the whole document when the wrapper div is missing.
    div = divs[0] if divs else soup
    if not os.path.exists(filename):
        # BUG FIX: prettify() returns str, but the file is opened in
        # binary mode, so the text must be encoded explicitly (writing
        # str to a 'wb' file raises TypeError on Python 3). The
        # redundant fp.close() is gone — the with-statement closes it.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify().encode('utf-8'))

    def _cell_text(cells, index):
        # Collapse all internal whitespace in the cell's text;
        # return None when the cell is empty or whitespace-only.
        text = cells[index].get_text()
        if text:
            text = ''.join(text.split())
        return text or None

    tds = div.find_all('td')
    if tds and len(tds) == 11:
        department = _cell_text(tds, 2)
        if department:
            employee.departments = department
        title = _cell_text(tds, 4)
        if title:
            employee.title = title
        email = _cell_text(tds, 8)
        if email:
            employee.email = email
        research = _cell_text(tds, 10)
        if research:
            employee.research = research
    divs = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if divs:
        div = divs[0]
        # Process the biography as plain text lines.
        lines = div.stripped_strings
        # text = div.get_text(strip=True)  # alternative, kept for reference
        parser = ProfileParser(lines=lines, employee=employee,
                               set_attr_hook=set_attr_hook)
        return parser.parse()
def profile_handler(doc, name, url, path):
    """Handle a single employee profile page.

    Writes the profile's main <div> to ``<path>/<name>.html`` unless that
    file already exists, extracts department / title / email / research
    from an 11-cell profile table, and hands the free-text biography
    ("text_more" div) to ProfileParser.

    Returns ``ProfileParser.parse()``'s result, or None when the page
    lacks a "text_more" section.
    """
    filename = os.path.join(path, name + ".html")
    # Keep only name and homepage here; the resume file itself is saved
    # separately in the current directory.
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    matches = soup.find_all(name="div",
                            attrs={"class": "page_right addpage_right"},
                            limit=1)
    # Use the whole document when the expected wrapper is absent.
    container = matches[0] if matches else soup
    if not os.path.exists(filename):
        # BUG FIX: the file is opened in binary mode while prettify()
        # yields str — encode to UTF-8 before writing (str into a 'wb'
        # handle is a TypeError on Python 3). No manual close(): the
        # with-statement handles it.
        with open(filename, 'wb') as fp:
            fp.write(container.prettify().encode('utf-8'))

    def _clean(index):
        # Strip every whitespace run from the cell text; None if empty.
        value = tds[index].get_text()
        if value:
            value = ''.join(value.split())
        return value or None

    tds = container.find_all('td')
    if tds and len(tds) == 11:
        for attr, idx in (('departments', 2), ('title', 4),
                          ('email', 8), ('research', 10)):
            value = _clean(idx)
            if value:
                setattr(employee, attr, value)
    matches = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if matches:
        bio = matches[0]
        # Parse the biography line-by-line as plain text.
        lines = bio.stripped_strings
        parser = ProfileParser(lines=lines, employee=employee,
                               set_attr_hook=set_attr_hook)
        return parser.parse()