def handler(tag):
    """Build an ``Employee`` from one listing element.

    The first ``<a class="orangea">`` under ``tag`` provides the name (its
    text with all whitespace removed) and the profile link (its ``href``).
    The stripped strings of the first ``<a class="black01">`` are handed to
    ``ProfileParser`` together with the employee, and whatever the parser's
    ``parse()`` returns becomes the result.

    Returns the ``Employee`` as constructed when neither anchor is present.
    """
    employee = Employee()

    name_links = tag.find_all('a', class_="orangea")
    if name_links:
        name_link = name_links[0]
        employee.name = ''.join(name_link.get_text().split())
        employee.profile = name_link['href']

    detail_links = tag.find_all('a', class_="black01")
    if not detail_links:
        return employee

    parser = ProfileParser(lines=detail_links[0].stripped_strings,
                           employee=employee)
    return parser.parse()
def handler(tag):
    """Build an ``Employee`` from one listing element.

    The first ``<a class="orangea">`` under ``tag`` provides the name (its
    text with all whitespace removed) and the profile link (its ``href``).
    The stripped strings of the first ``<a class="black01">`` are handed to
    ``ProfileParser`` together with the employee, and whatever the parser's
    ``parse()`` returns becomes the result.

    Returns the ``Employee`` as constructed when neither anchor is present.
    """
    employee = Employee()

    name_links = tag.find_all('a', class_="orangea")
    if name_links:
        name_link = name_links[0]
        employee.name = ''.join(name_link.get_text().split())
        employee.profile = name_link['href']

    detail_links = tag.find_all('a', class_="black01")
    if not detail_links:
        return employee

    parser = ProfileParser(lines=detail_links[0].stripped_strings,
                           employee=employee)
    return parser.parse()
def _table_or_document(soup, cellspacing):
    """Return the first ``width="96%"`` table with the given ``cellspacing``.

    When no such table exists a notice is printed and ``soup`` itself is
    returned so the caller can keep working on the whole document.
    """
    tables = soup.find_all(
        name="table",
        attrs={"width": "96%", "cellspacing": cellspacing},
        limit=1,
    )
    if tables:
        return tables[0]
    print("not found main div")
    return soup


def profile_handler(doc, name, url, path):
    """Parse one profile page, saving its markup to ``<path>/<name>.html``.

    Args:
        doc: markup of the profile page, passed to ``BeautifulSoup``.
        name: employee name; also used as the saved file's base name.
        url: URL stored on the ``Employee``.
        path: directory the ``.html`` copy is written to.

    Returns:
        Whatever ``ProfileParser.parse()`` returns for the page text.
    """
    filename = os.path.join(path, name + ".html")
    # Only the name and personal homepage are kept on the employee; the
    # resume markup itself is saved as a separate file.
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    page = _table_or_document(soup, "0")
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # The file is opened in binary mode, so ask for encoded bytes
            # instead of writing a unicode string (which fails on non-ASCII
            # content under Python 2 and always under Python 3).
            fp.write(page.prettify(encoding="utf-8"))

    detail = _table_or_document(soup, "1")
    links = detail.find_all('a', text="点击此处访问")
    if links:
        employee.profile = links[0]['href']
        print('Got profile:' + employee.profile)

    # Process the remaining content as plain text lines.
    parser = ProfileParser(lines=detail.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256)
    return parser.parse()
def handler(tag):
    """Build an ``Employee`` from one faculty entry.

    Reads the name from the first ``<h3>``, the title from the first
    ``<span class="faculty-title">`` and the contact fields from the
    ``<dt class="faculty-info">`` elements by position.

    Returns:
        The populated ``Employee``, or ``None`` when the entry has no
        ``<h3>`` or fewer than five ``faculty-info`` elements.
    """
    headings = tag.find_all(name='h3')
    if not headings:
        return None

    employee = Employee()
    employee.name = ''.join((headings[0].get_text() or '').split())

    # A missing title span used to raise IndexError; treat it as "no title".
    title_spans = tag.find_all(name="span", class_="faculty-title")
    if title_spans:
        # Title keeps its internal whitespace.
        employee.title = title_spans[0].get_text()

    infos = tag.find_all(name='dt', class_="faculty-info")
    if not infos or len(infos) < 5:
        return None

    # Positional layout: 0 -> email, 1 -> phone, 3 -> homepage, 4 -> research.
    # Index 2 is not read by this handler.
    employee.email = ''.join(infos[0].get_text().split())
    employee.tel = ''.join(infos[1].get_text().split())
    employee.profile = ''.join(infos[3].get_text().split())
    # Research text keeps its internal whitespace.
    employee.research = infos[4].get_text()
    return employee
def handler(tag):
    """Build an ``Employee`` from one faculty entry.

    Reads the name from the first ``<h3>``, the title from the first
    ``<span class="faculty-title">`` and the contact fields from the
    ``<dt class="faculty-info">`` elements by position.

    Returns:
        The populated ``Employee``, or ``None`` when the entry has no
        ``<h3>`` or fewer than five ``faculty-info`` elements.
    """
    headings = tag.find_all(name='h3')
    if not headings:
        return None

    employee = Employee()
    employee.name = ''.join((headings[0].get_text() or '').split())

    # A missing title span used to raise IndexError; treat it as "no title".
    title_spans = tag.find_all(name="span", class_="faculty-title")
    if title_spans:
        # Title keeps its internal whitespace.
        employee.title = title_spans[0].get_text()

    infos = tag.find_all(name='dt', class_="faculty-info")
    if not infos or len(infos) < 5:
        return None

    # Positional layout: 0 -> email, 1 -> phone, 3 -> homepage, 4 -> research.
    # Index 2 is not read by this handler.
    employee.email = ''.join(infos[0].get_text().split())
    employee.tel = ''.join(infos[1].get_text().split())
    employee.profile = ''.join(infos[3].get_text().split())
    # Research text keeps its internal whitespace.
    employee.research = infos[4].get_text()
    return employee
def handler(tag):
    """Build an ``Employee`` from one listing element.

    When ``tag`` contains an ``<a class="dt_text_tit">``, the first such
    anchor supplies the name (its ``.string``) and its ``href`` is stored as
    both ``profile`` and ``url``.  Otherwise the first stripped string of
    ``tag`` is taken as the name.  The stripped-strings iterable is then
    given to ``ProfileParser`` and the result of ``parse()`` is returned.
    """
    employee = Employee()
    lines = tag.stripped_strings

    title_links = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if title_links:
        link = title_links[0]
        employee.name = link.string
        employee.profile = link["href"]
        employee.url = employee.profile
    else:
        # No title anchor: the first text line is the name.  Pulling it here
        # advances ``lines`` when it is a one-shot iterator, so the parser
        # below starts from the following line.
        for first_line in lines:
            employee.name = first_line
            break

    return ProfileParser(lines=lines, employee=employee).parse()
def handler(tag):
    """Build an ``Employee`` from one listing element.

    When ``tag`` contains an ``<a class="dt_text_tit">``, the first such
    anchor supplies the name (its ``.string``) and its ``href`` is stored as
    both ``profile`` and ``url``.  Otherwise the first stripped string of
    ``tag`` is taken as the name.  The stripped-strings iterable is then
    given to ``ProfileParser`` and the result of ``parse()`` is returned.
    """
    employee = Employee()
    lines = tag.stripped_strings

    title_links = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if title_links:
        link = title_links[0]
        employee.name = link.string
        employee.profile = link['href']
        employee.url = employee.profile
    else:
        # No title anchor: the first text line is the name.  Pulling it here
        # advances ``lines`` when it is a one-shot iterator, so the parser
        # below starts from the following line.
        for first_line in lines:
            employee.name = first_line
            break

    return ProfileParser(lines=lines, employee=employee).parse()
def handler(tag):
    """Build an ``Employee`` from an entry made of ``<li>`` items.

    The first four ``<li>`` elements are read by position: name (plus the
    profile link from its anchor), title, phone and e-mail.  For title,
    phone and e-mail the leading label is dropped by cutting as many
    characters as ``u'职务:'`` has, then all whitespace is removed.

    Returns:
        The populated ``Employee``, or ``None`` when ``tag`` has fewer than
        four ``<li>`` elements.
    """
    lis = tag.find_all('li')
    if len(lis) < 4:
        return None

    employee = Employee()
    pre_len = len(u'职务:')

    def _value(li):
        # Drop the fixed-width label, then squeeze out all whitespace.
        return ''.join(li.get_text()[pre_len:].split())

    # NOTE(review): the original sliced the label off the name only when the
    # name was empty (a no-op), so the name text is kept in full here.
    # Confirm whether the first <li> carries a label that should be removed.
    employee.name = ''.join(lis[0].get_text().split())

    # The first <li> may lack an anchor or an href; previously that raised.
    link = lis[0].a
    if link is not None:
        href = link.get('href')
        if href:
            employee.profile = href
    employee.url = employee.url or employee.profile

    employee.title = _value(lis[1])
    employee.tel = _value(lis[2])
    employee.email = _value(lis[3])

    print("name:"+employee.name+",email:"+employee.email)
    return employee
def _table_or_document(soup, cellspacing):
    """Return the first ``width="96%"`` table with the given ``cellspacing``.

    When no such table exists a notice is printed and ``soup`` itself is
    returned so the caller can keep working on the whole document.
    """
    tables = soup.find_all(
        name="table",
        attrs={"width": "96%", "cellspacing": cellspacing},
        limit=1,
    )
    if tables:
        return tables[0]
    print("not found main div")
    return soup


def profile_handler(doc, name, url, path):
    """Parse one profile page, saving its markup to ``<path>/<name>.html``.

    Args:
        doc: markup of the profile page, passed to ``BeautifulSoup``.
        name: employee name; also used as the saved file's base name.
        url: URL stored on the ``Employee``.
        path: directory the ``.html`` copy is written to.

    Returns:
        Whatever ``ProfileParser.parse()`` returns for the page text.
    """
    filename = os.path.join(path, name + ".html")
    # Only the name and personal homepage are kept on the employee; the
    # resume markup itself is saved as a separate file.
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    page = _table_or_document(soup, "0")
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # The file is opened in binary mode, so ask for encoded bytes
            # instead of writing a unicode string (which fails on non-ASCII
            # content under Python 2 and always under Python 3).
            fp.write(page.prettify(encoding="utf-8"))

    detail = _table_or_document(soup, "1")
    links = detail.find_all('a', text="点击此处访问")
    if links:
        employee.profile = links[0]['href']
        print('Got profile:' + employee.profile)

    # Process the remaining content as plain text lines.
    parser = ProfileParser(lines=detail.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256)
    return parser.parse()
def _save_markup(filename, node):
    """Write ``node``'s prettified markup to ``filename`` as UTF-8 bytes."""
    with open(filename, 'wb') as fp:
        # Binary mode needs bytes; a bare prettify() returns unicode text.
        fp.write(node.prettify(encoding='utf-8'))


def profile_handler(doc, name, url, path):
    """Parse one profile page and save its markup to ``path + name + ".html"``.

    Two layouts are handled:

    * HTML with an element whose id is ``sub_main``: the title comes from the
      first ``<h4>`` (text after the first space), and e-mail / homepage /
      research / phone come from the children of the first ``<li>``.
    * Otherwise an XML-like document with a ``<member>`` element: title from
      ``<name>``, research from ``<field>``, homepage from ``<homepage>`` and
      the digits of ``<contact>`` as the phone number.

    Args:
        doc: markup of the profile page, passed to ``BeautifulSoup``.
        name: employee name; also the saved file's base name.
        url: URL stored on the ``Employee``.
        path: prefix for the saved file; it is concatenated as-is, so it must
            already end with a path separator.

    Returns:
        The ``Employee``, filled with whatever fields could be found.
    """
    # Label -> attribute.  Kept as an ordered sequence so the label with the
    # colon is always tried before the bare one; with a dict the order was
    # arbitrary and the colon could end up inside the phone value.
    symbols = (
        (u'个人主页:', 'profile'),
        (u'研究方向:', 'research'),
        (u'电话:', 'tel'),
        (u'电话', 'tel'),
    )
    filename = path + name + ".html"
    # The pages are messy: only the name and homepage are reliable, so the
    # resume markup is saved to a separate file as well.
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)

    divs = soup.find_all(id="sub_main", limit=1)
    if not divs:
        # XML layout.
        members = soup.find_all(name="member", limit=1)
        if not members:
            print("id:main or sub_main not found")
            return employee
        member = members[0]

        # Title: the part of <name> after the first space.  The original
        # guard was contradictory and indexed the ``name`` argument instead
        # of the result set, so this never ran.
        names = member.find_all('name')
        if names:
            full_name = names[0].string
            if full_name:
                idx = full_name.find(' ')
                if idx != -1:
                    # strip() drops the separating space itself.
                    employee.title = full_name[idx:].strip()

        if member.field:
            employee.research = member.field.string or ''
        if member.homepage:
            employee.profile = member.homepage.string or ''

        contact = member.contact
        if contact and contact.string:
            digits = ''.join(c for c in contact.string if c.isdigit())
            if digits:
                # ``or ''`` keeps this working if tel starts out as None.
                employee.tel = (employee.tel or '') + digits

        _save_markup(filename, member)
        return employee

    div = divs[0]
    _save_markup(filename, div)

    # Title: text of the first <h4> after the first space, whitespace removed.
    # The original guard was contradictory and wrote to a misspelled
    # attribute, so the title was never set.
    h4s = div.find_all('h4')
    if h4s:
        heading = h4s[0].string
        if heading:
            idx = heading.find(' ')
            if idx != -1:
                employee.title = ''.join(heading[idx:].split())

    lis = div.find_all("li", limit=8)
    if not lis:
        return employee

    # Parse the detail lines held by the first <li>.
    for node in lis[0].children:
        text = node.string
        if not text:
            continue
        text = ''.join(text.split())
        if '@' in text:
            employee.email = text
            continue
        for symbol, attr in symbols:
            idx = text.find(symbol)
            if idx == -1:
                continue
            value = text[idx + len(symbol):]
            if hasattr(employee, attr):
                setattr(employee, attr, value)
                print(attr + ":" + value)
            else:
                print("no attr %s in employee" % attr)
            break
    return employee