def profile_handler(doc, name, url, path):
    """Parse a staff profile page into an Employee record.

    Saves the profile HTML to <path>/<name>.html; only the name and homepage
    are stored up front, all other fields come from ProfileParser.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    # BUG FIX: the original passed `clasS_=` (typo), which BeautifulSoup
    # treated as a filter on a literal "clasS_" attribute that never exists,
    # so the fallback to the whole page was always taken.  `class_=` is the
    # intended CSS-class filter.
    divs = soup.find_all(name="div", class_="content-wrapper", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    infos_div = div.find_all('div', attrs={"id": "column-1"})
    if infos_div and len(infos_div) != 0:
        div = infos_div[0]
    # Parse the remaining fields from the plain-text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Build an Employee from a physics-department profile page (div#phy-main)."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    # Only name and homepage are stored up front; the raw resume HTML is
    # archived under the given path.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    matches = soup.find_all(name="div", attrs={"id": "phy-main"}, limit=1)
    if not matches:
        return employee
    main_div = matches[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(main_div.prettify())
    # E-mail and phone live in dedicated anchor tags on the full page.
    mail_links = soup.find_all(name='a', class_="phy-mail")
    if mail_links:
        employee.email = mail_links[0].get_text().strip()
    phone_links = soup.find_all(name='a', class_="phy-phone")
    if phone_links:
        employee.tel = phone_links[0].get_text().strip()
    # Parse the remaining fields from the plain-text lines.
    parser = ProfileParser(lines=main_div.stripped_strings,
                           employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Extract an Employee from a page whose content sits in div.s2_right_con."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="div", class_="s2_right_con", limit=1)
    if hits:
        section = hits[0]
    else:
        print("can't find div???")
        section = soup  # fall back to the whole document
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(section.prettify())
    # Parse the plain-text lines of the chosen section.
    parser = ProfileParser(lines=section.stripped_strings,
                           employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Follow the resume link in the fifth <li> and parse the linked page.

    The archived HTML and the parsed fields come from the followed page's
    div#main when available, otherwise from the page itself.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    lis = soup.find_all(name="li")
    # BUG FIX: the original tested `not lis and len(lis) != 5`, which is
    # never True for a non-empty result, so `lis[4]` below could raise
    # IndexError when fewer than five <li> elements are present.
    if not lis or len(lis) < 5:
        div = soup
    else:
        ass = lis[4].find_all('a')
        if len(ass) != 0:
            li_url = ass[0]['href']
            newUrl = urljoin(url, li_url)
            newDoc = get_doc_byUrllib2(newUrl)
            soup = BeautifulSoup(newDoc, Config.SOUP_PARSER)
        mainDiv = soup.find_all('div', attrs={"id": "main"})
        if not mainDiv or len(mainDiv) == 0:
            print("not found main div")
            div = soup
        else:
            div = mainDiv[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    # Parse the plain-text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee profile; the job title is taken from the first <h3>."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    found = soup.find_all(name="div", class_="box_rt01 list", limit=1)
    block = found[0] if found else soup
    # Always (re-)archive the selected HTML fragment.
    with open(filename, 'wb') as fp:
        fp.write(block.prettify())
    headers = block.find_all('h3')
    if headers:
        title = ''.join(headers[0].get_text().split())
        print(title)
        # Accept the heading as a title only if it contains a known rank.
        for known in PROFILE_TITLES:
            if known in title:
                employee.title = title
                print("got => " + title)
                break
    else:
        print("not found h3")
    parser = ProfileParser(lines=block.stripped_strings,
                           employee=employee, force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div.dataName, falling back to div.rightArea."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    found = soup.find_all(name="div", class_="line20 dataName", limit=1)
    if not found:
        # Alternate page layout (note the trailing space in the class string).
        found = soup.find_all(name="div", class_="rightArea clearfix ", limit=1)
    section = found[0] if found else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(section.prettify())
    # Parse the plain-text lines of the chosen section.
    parser = ProfileParser(lines=section.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=999)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from td.bd-content (or the width=79% cell).

    The page layout is messy; only name and homepage are kept up front and
    the raw HTML is archived next to the other resumes.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    cells = soup.find_all(name="td", class_="bd-content", limit=1)
    if not cells:
        cells = soup.find_all(name="td", attrs={"width": "79%"}, limit=1)
        if not cells:
            # Nothing recognizable: dump the raw document and bail out.
            with open(filename, 'wb') as fp:
                fp.write(doc)
            return employee
    cell = cells[0]
    with open(filename, 'wb') as fp:
        fp.write(cell.prettify())
    # Parse the plain-text lines of the cell.
    parser = ProfileParser(lines=cell.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Archive div.right as the HTML snapshot, then parse div.lf0104 for fields."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="right", limit=1)
    div = divs[0] if divs else soup
    with open(filename, 'wb') as fp:
        fp.write(div.prettify())
    # PERF: the original re-parsed `doc` into a second BeautifulSoup tree
    # here; searching the existing tree returns the same nodes without the
    # redundant parse.
    divs = soup.find_all(name="div", class_="lf0104", limit=1)
    div = divs[0] if divs else soup
    # Parse the plain-text lines of the chosen section.
    parser = ProfileParser(lines=div.stripped_strings, employee=employee,
                           force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from a table cell; lines come from individual <td>s."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    cells = soup.find_all(name="td", attrs={"valign": "center"}, limit=1)
    root = cells[0] if cells else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(root.prettify())
    tds = root.find_all('td')
    if len(tds) == 0:
        # No cells: fall back to the raw stripped text lines.
        lines = root.stripped_strings
        print("TDS none!")
    else:
        # One line per cell; very long cells are skipped entirely.
        lines = []
        for td in tds:
            text = td.get_text().strip()
            if len(text) < 128:
                text = ''.join(text.split())
                print(text)
                lines.append(text)
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=profile_set_attr_hook, max_line=256)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from the div with class "main" (page fallback)."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="div", attrs={"class": "main"}, limit=1)
    content_div = hits[0] if hits else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(content_div.prettify())
    # Parse the plain-text lines; e-mail extraction is forced, phone is not.
    parser = ProfileParser(lines=content_div.stripped_strings,
                           employee=employee, set_attr_hook=set_attr_hook,
                           max_line=999, force_email=True, force_tel=False)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a staff profile page into an Employee record.

    Saves the profile HTML to <path>/<name>.html; only the name and homepage
    are stored up front, all other fields come from ProfileParser.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    # BUG FIX: the original passed `clasS_=` (typo), which BeautifulSoup
    # treated as a filter on a literal "clasS_" attribute that never exists,
    # so the fallback to the whole page was always taken.  `class_=` is the
    # intended CSS-class filter.
    divs = soup.find_all(name="div", class_="content-wrapper", limit=1)
    if not divs or len(divs) == 0:
        div = soup
    else:
        div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    infos_div = div.find_all('div', attrs={"id": "column-1"})
    if infos_div and len(infos_div) != 0:
        div = infos_div[0]
    # Parse the remaining fields from the plain-text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from the ASP.NET news-content span."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    containers = soup.find_all(name="div", attrs={"class": "newsContent"},
                               limit=1)
    if not containers:
        return employee
    container = containers[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(container.prettify())
    # The actual resume text sits in a WebForms-generated span.
    spans = soup.find_all(
        name="span",
        attrs={"id": "ctl00_ContentPlaceHolder1_NewsView1_lbl_NewsContent"},
        limit=1)
    if not spans:
        return employee
    parser = ProfileParser(lines=spans[0].stripped_strings,
                           employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee; the title is taken from the first <h3> if recognized."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    boxes = soup.find_all(name="div", class_="box_rt01 list", limit=1)
    box = boxes[0] if boxes else soup
    # Always (re-)archive the selected HTML fragment.
    with open(filename, 'wb') as fp:
        fp.write(box.prettify())
    h3_list = box.find_all('h3')
    if h3_list:
        heading = ''.join(h3_list[0].get_text().split())
        print(heading)
        # Accept the heading as a title only if it contains a known rank.
        for candidate in PROFILE_TITLES:
            if candidate in heading:
                employee.title = heading
                print("got => " + heading)
                break
    else:
        print("not found h3")
    parser = ProfileParser(lines=box.stripped_strings, employee=employee,
                           force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div#right_2; research areas come from two <td>s."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    panels = soup.find_all(name="div", attrs={"id": "right_2"}, limit=1)
    if not panels:
        return employee
    panel = panels[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(panel.prettify())
    researches = [' ', ' ']
    # The layout puts the two research-area texts in cells 3 and 4 of this
    # fixed four-cell selection.
    cells = panel.find_all(name="td",
                           attrs={"bgcolor": "#FFFFFF", "class": "ft12",
                                  "valign": "top"},
                           limit=4)
    if len(cells) == 4:
        researches[0] = cells[2].get_text().strip()
        researches[1] = cells[3].get_text().strip()
        employee.research = researches[0] + ";" + researches[1]
        print("research:" + employee.research)
    # Research is parsed above, so ProfileParser must not overwrite it.
    parser = ProfileParser(lines=panel.stripped_strings, employee=employee,
                           ignore=set(['research']))
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div.right-nr; bail out if it is missing."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    matches = soup.find_all(name="div", attrs={"class": "right-nr"})
    if not matches:
        print("div class=right-nr not found")
        return employee
    body = matches[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(body.prettify())
    # Parse the plain-text lines (no attribute hook for this site).
    parser = ProfileParser(lines=body.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from td.bd-content, or the 79%-width cell.

    Messy layout: only name and homepage are kept up front; the raw HTML is
    archived either way.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="td", class_="bd-content", limit=1)
    if not hits:
        hits = soup.find_all(name="td", attrs={"width": "79%"}, limit=1)
        if not hits:
            # Unrecognized page: dump the raw document and stop.
            with open(filename, "wb") as fp:
                fp.write(doc)
            return employee
    target = hits[0]
    with open(filename, "wb") as fp:
        fp.write(target.prettify())
    parser = ProfileParser(lines=target.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div.NewsArticles (whole page as fallback)."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    articles = soup.find_all(name="div", attrs={"class": "NewsArticles"},
                             limit=1)
    root = articles[0] if articles else soup
    if not os.path.exists(filename):
        with open(filename, "wb") as fp:
            fp.write(root.prettify())
    # Parse the plain-text lines; e-mail extraction is forced, phone is not.
    parser = ProfileParser(lines=root.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=999,
                           force_email=True, force_tel=False)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div#phy-main, ignoring title/research fields."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    panels = soup.find_all(name="div", attrs={"id": "phy-main"}, limit=1)
    panel = panels[0] if panels else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(panel.prettify())
    # Parse the plain-text lines; title and research are excluded because
    # this site's layout produces unreliable values for them.
    parser = ProfileParser(lines=panel.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256,
                           ignore=set(['title', 'research']))
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Archive div.right as the HTML snapshot, then parse div.lf0104 for fields."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="right", limit=1)
    div = divs[0] if divs else soup
    with open(filename, 'wb') as fp:
        fp.write(div.prettify())
    # PERF: the original re-parsed `doc` into a second BeautifulSoup tree
    # here; searching the existing tree returns the same nodes without the
    # redundant parse.
    divs = soup.find_all(name="div", class_="lf0104", limit=1)
    div = divs[0] if divs else soup
    # Parse the plain-text lines of the chosen section.
    parser = ProfileParser(lines=div.stripped_strings, employee=employee,
                           force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div.xinwen-txt_3; fax lines are ignored."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    blocks = soup.find_all(name="div", attrs={"class": "xinwen-txt_3"},
                           limit=1)
    if not blocks:
        return employee
    block = blocks[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(block.prettify())
    # Parse the plain-text lines; 'fax' is excluded from extraction.
    parser = ProfileParser(lines=block.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, ignore=set(['fax']))
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from the first white-background table cell."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    cells = soup.find_all(name="td", attrs={"bgcolor": "#FFFFFF"}, limit=1)
    if not cells:
        # No recognizable cell: archive the raw document and give up.
        with open(filename, 'wb') as fp:
            fp.write(doc)
        return employee
    cell = cells[0]
    with open(filename, 'wb') as fp:
        fp.write(cell.prettify())
    # Parse the plain-text lines of the cell.
    parser = ProfileParser(lines=cell.stripped_strings, employee=employee,
                           set_attr_hook=profile_set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div.right-nr; returns early when absent."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    sections = soup.find_all(name="div", attrs={"class": "right-nr"})
    if not sections:
        print("div class=right-nr not found")
        return employee
    section = sections[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(section.prettify())
    # Plain-text parsing; this site needs no attribute hook.
    parser = ProfileParser(lines=section.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse contact info from the "dnode" section whose header is 联系方式."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    areas = soup.find_all(name="div", attrs={"class": "darea"}, limit=1)
    if not areas:
        return employee
    area = areas[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(area.prettify())
    dnodes = area.find_all(name='div', class_=u"dnode")
    if not dnodes:
        return employee
    # Locate the node whose first two text lines contain the contact header.
    target_node = None
    for node in dnodes:
        for count, line in enumerate(node.stripped_strings):
            if count >= 2:
                break
            if line == u'联系方式':
                print("binggo!")
                target_node = node
                break
        if target_node is not None:
            break
    if not target_node:
        return employee
    # Prefer one line per table row; otherwise use the raw stripped text.
    rows = target_node.find_all('tr')
    if rows:
        lines = []
        for row in rows:
            text = row.get_text()
            if text:
                lines.append(''.join(text.split()))
    else:
        lines = target_node.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from a fixed 11-cell table plus a free-text section."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    pages = soup.find_all(name="div",
                          attrs={"class": "page_right addpage_right"}, limit=1)
    page = pages[0] if pages else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(page.prettify())
    cells = page.find_all('td')
    if cells and len(cells) == 11:
        # Fixed cell positions: 2=department, 4=title, 8=email, 10=research.
        department = cells[2].get_text()
        if department:
            department = ''.join(department.split())
        if department and len(department) != 0:
            employee.departments = department
        title = cells[4].get_text()
        if title:
            title = ''.join(title.split())
        if title and len(title) != 0:
            employee.title = title
        email = cells[8].get_text()
        if email:
            email = ''.join(email.split())
        if email and len(email) != 0:
            employee.email = email
        research = cells[10].get_text()
        if research:
            research = ''.join(research.split())
        if research and len(research) != 0:
            employee.research = research
    # The biography text lives in a separate "text_more" block when present.
    more = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if more:
        page = more[0]
    parser = ProfileParser(lines=page.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse contact details from the dnode block headed by 联系方式."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    containers = soup.find_all(name="div", attrs={"class": "darea"}, limit=1)
    if not containers:
        return employee
    container = containers[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(container.prettify())
    sections = container.find_all(name='div', class_=u"dnode")
    if not sections:
        return employee
    # Scan each section's first two text lines for the contact header.
    contact = None
    for section in sections:
        for idx, text_line in enumerate(section.stripped_strings):
            if idx >= 2:
                break
            if text_line == u'联系方式':
                print("binggo!")
                contact = section
                break
        if contact is not None:
            break
    if not contact:
        return employee
    # One line per table row when a table is present.
    trs = contact.find_all('tr')
    if trs:
        lines = []
        for tr in trs:
            text = tr.get_text()
            if text:
                lines.append(''.join(text.split()))
    else:
        lines = contact.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
def handler(tag):
    """Build an Employee from a teacher card; name comes from .teacher-title."""
    employee = Employee()
    title_divs = tag.find_all("div", class_="teacher-title")
    if title_divs:
        raw_name = title_divs[0].get_text()
        employee.name = ''.join(raw_name.split())
    # Parse the rest of the card as plain-text lines.
    parser = ProfileParser(lines=tag.stripped_strings, employee=employee)
    return parser.parse()
def handler(tag):
    """Create an Employee from a teacher card tag (name from .teacher-title)."""
    employee = Employee()
    headers = tag.find_all("div", class_="teacher-title")
    if headers:
        # Collapse all internal whitespace out of the display name.
        employee.name = ''.join(headers[0].get_text().split())
    parser = ProfileParser(lines=tag.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from an 11-cell table, then the text_more section."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="div",
                         attrs={"class": "page_right addpage_right"}, limit=1)
    region = hits[0] if hits else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(region.prettify())
    tds = region.find_all('td')
    if tds and len(tds) == 11:
        # Cells at fixed offsets: 2=department, 4=title, 8=email, 10=research.
        value = tds[2].get_text()
        if value:
            value = ''.join(value.split())
        if value and len(value) != 0:
            employee.departments = value
        value = tds[4].get_text()
        if value:
            value = ''.join(value.split())
        if value and len(value) != 0:
            employee.title = value
        value = tds[8].get_text()
        if value:
            value = ''.join(value.split())
        if value and len(value) != 0:
            employee.email = value
        value = tds[10].get_text()
        if value:
            value = ''.join(value.split())
        if value and len(value) != 0:
            employee.research = value
    # Switch to the free-text block for the line-based parse when present.
    hits = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if hits:
        region = hits[0]
    parser = ProfileParser(lines=region.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
def handler(tag):
    """Build an Employee from a list row: name/profile link plus detail text."""
    employee = Employee()
    name_links = tag.find_all('a', class_="orangea")
    if name_links:
        employee.name = ''.join(name_links[0].get_text().split())
        employee.profile = name_links[0]['href']
    detail_links = tag.find_all('a', class_="black01")
    if detail_links:
        # Parse the detail anchor's text lines into the remaining fields.
        parser = ProfileParser(lines=detail_links[0].stripped_strings,
                               employee=employee)
        employee = parser.parse()
    return employee
def profile_handler(doc, name, url, path):
    """Parse an Employee from span texts inside div#maincontent / .other."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "maincontent"}, limit=1)
    div = divs[0] if divs else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    others = div.find_all(class_="other")
    div = others[0] if others else soup
    # Collect one whitespace-collapsed line per <span>.
    lines = []
    for span in div.find_all('span'):
        text = span.get_text()
        if text:
            text = ''.join(text.split())
        if not text:
            continue
        lines.append(text)
    if len(lines) == 0:
        # BUG FIX: the original returned the undefined name `emplo`, which
        # raised NameError whenever no span text was found.
        return employee
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
def handler(tag):
    """Create an Employee from a roster row (orangea = name, black01 = details)."""
    employee = Employee()
    anchors = tag.find_all('a', class_="orangea")
    if anchors:
        raw = anchors[0].get_text()
        employee.name = ''.join(raw.split())
        employee.profile = anchors[0]['href']
    anchors = tag.find_all('a', class_="black01")
    if anchors:
        parser = ProfileParser(lines=anchors[0].stripped_strings,
                               employee=employee)
        employee = parser.parse()
    return employee
def profile_handler(doc, name, url, path):
    """Parse an Employee from nested tables; picks up an external profile link."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    # First table (cellspacing=0) is only archived to disk.
    tables = soup.find_all(name="table",
                           attrs={"width": "96%", "cellspacing": "0"}, limit=1)
    if tables:
        section = tables[0]
    else:
        print("not found main div")
        section = soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(section.prettify())
    # Second table (cellspacing=1) carries the parsed fields.
    tables = soup.find_all(name="table",
                           attrs={"width": "96%", "cellspacing": "1"}, limit=1)
    if tables:
        section = tables[0]
    else:
        print("not found main div")
        section = soup
    anchors = section.find_all('a', text="点击此处访问")
    if anchors:
        employee.profile = anchors[0]['href']
        print('Got profile:' + employee.profile)
    parser = ProfileParser(lines=section.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee; email/tel are OCR'd from images via image2text."""
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    panels = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    div = panels[0] if panels else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    item_divs = div.find_all(name="div", attrs={"class": "item_list"})
    ignores = []
    # NOTE(review): the loop variable rebinds `div`, so the ProfileParser
    # below only sees the LAST item_list entry (original behavior,
    # preserved here) -- confirm this is intended.
    for div in item_divs:
        string = div.get_text()
        if string and len(string) != 0:
            if u'邮件' in string and len(employee.email) == 0:
                employee.email = image2text(imageSrc(div),
                                            email_image_filename, 'eng2')
                print(employee.email)
                ignores.append('email')
            elif u'电话' in string and len(employee.tel) == 0:
                employee.tel = image2text(imageSrc(div),
                                          tel_image_filename, 'eng')
                print(employee.tel)
                ignores.append('tel')
    parser = ProfileParser(lines=div.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256,
                           ignore=set(ignores))
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Archive the whole page and parse every stripped text line from it."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    with open(filename, 'wb') as fp:
        fp.write(soup.prettify())
    # No section selection on this site: the full document is parsed.
    parser = ProfileParser(lines=soup.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Save the entire page to disk, then line-parse it into an Employee."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    tree = BeautifulSoup(doc, Config.SOUP_PARSER)
    with open(filename, 'wb') as fp:
        fp.write(tree.prettify())
    # The whole document feeds the line-based parser directly.
    parser = ProfileParser(lines=tree.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee whose email/tel are rendered as images (OCR'd)."""
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    div = hits[0] if hits else soup
    if not os.path.exists(filename):
        with open(filename, "wb") as fp:
            fp.write(div.prettify())
    item_divs = div.find_all(name="div", attrs={"class": "item_list"})
    ignores = []
    # NOTE(review): the loop rebinds `div`, so the parser below only reads
    # the LAST item_list entry (original behavior, preserved) -- confirm.
    for div in item_divs:
        string = div.get_text()
        if string and len(string) != 0:
            if u"邮件" in string and len(employee.email) == 0:
                employee.email = image2text(imageSrc(div),
                                            email_image_filename, "eng2")
                print(employee.email)
                ignores.append("email")
            elif u"电话" in string and len(employee.tel) == 0:
                employee.tel = image2text(imageSrc(div),
                                          tel_image_filename, "eng")
                print(employee.tel)
                ignores.append("tel")
    parser = ProfileParser(lines=div.stripped_strings, employee=employee,
                           set_attr_hook=set_attr_hook, max_line=256,
                           ignore=set(ignores))
    return parser.parse()
def handler(tag):
    """Build an Employee from a directory entry; name from link or first line."""
    employee = Employee()
    lines = tag.stripped_strings
    links = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if links:
        employee.name = links[0].string
        employee.profile = links[0]['href']
        employee.url = employee.profile
    else:
        # No title link: the first stripped line is the name.  This consumes
        # one item from the `lines` generator before parsing (original
        # behavior, preserved).
        for line in lines:
            employee.name = line
            break
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
def handler(tag):
    """Create an Employee from a directory entry tag.

    The name and profile URL come from the dt_text_tit anchor when present;
    otherwise the first stripped text line is used as the name.
    """
    employee = Employee()
    lines = tag.stripped_strings
    anchors = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if anchors:
        employee.name = anchors[0].string
        employee.profile = anchors[0]["href"]
        employee.url = employee.profile
    else:
        # Consumes the first generator item as the name (original behavior).
        for text in lines:
            employee.name = text
            break
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
def handler(tag):
    """Build an Employee from a name <span>; the card id is cut from onclick.

    The onclick looks like:
      toCardDetailAction('10c07e70-3fb6-42af-aa26-bfab26b6ce0406');
    """
    name_spans = tag.find_all(class_="handle")
    if not name_spans:
        return None
    employee = Employee()
    employee.name = ''.join(name_spans[0].get_text().split())
    # Strip the "toCardDetailAction('" prefix and the "');" suffix.
    card_id = name_spans[0]['onclick'][len('toCardDetailAction(\''):-3]
    employee.url = ('http://scse.buaa.edu.cn/buaa-css-web/toCardDetailAction.action'
                    '?firstSelId=CARD_TMPL_OF_FIRST_NAVI_CN%20&%20'
                    'secondSelId=CARD_TMPL_OF_ALL_TEACHER_CN%20&cardId=' + card_id)
    print("card_id=[%s]" % card_id)
    parser = ProfileParser(lines=tag.stripped_strings, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an Employee from div.xq_teacher; research from the 2nd con01_t."""
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="xq_teacher", limit=1)
    if not divs or len(divs) == 0:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    # Research interests live in the second "con01_t" block.
    details = div.find_all("div", class_="con01_t", limit=3)
    if details and len(details) >= 2:
        employee.research = ''.join(details[1].get_text().split())
        # Drop strings too short to carry more than the section caption.
        if len(employee.research) <= (len(u'研究方向') + 1):
            employee.research = ''
        else:
            # BUG FIX: str.replace returns a new string; the original
            # discarded the result, so full-width commas were never
            # normalized to ASCII commas.
            employee.research = employee.research.replace(',', ',')
    # Parse the remaining personal details from the "wz_teacher" block.
    infos = div.find_all("div", class_="wz_teacher", limit=1)
    if infos and len(infos) != 0:
        lines = infos[0].stripped_strings
        parser = ProfileParser(lines=lines, employee=employee,
                               set_attr_hook=set_attr_hook, force_email=True)
        return parser.parse()
    return employee
def profile_handler(doc, name, url, path):
    """Parse a profile page whose content sits in a white-background <td>.

    Saves a prettified copy of the selected subtree to
    ``<path>/<name>.html`` and feeds its plain text to the parser.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    cells = soup.find_all(name="td", attrs={"bgcolor": "#FFFFFF"}, limit=1)
    # Fall back to the whole document when the cell is missing.
    root = cells[0] if cells else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(root.prettify())
    # Plain-text processing of the selected subtree.
    parser = ProfileParser(lines=root.stripped_strings,
                           employee=employee,
                           set_attr_hook=profile_set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a profile page whose content lives in <div id="work">.

    Only the name and homepage are stored directly; a prettified copy
    of the div is saved to ``<path>/<name>.html``.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    work = soup.find_all(name="div", attrs={"id": "work"}, limit=1)
    if not work:
        return employee
    content_div = work[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(content_div.prettify())
    # Hand the subtree's plain text to the generic profile parser.
    parser = ProfileParser(lines=content_div.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse an "xq_teacher" profile page into an Employee.

    Only the name and homepage are stored directly; a prettified copy
    of the profile div is saved to ``<path>/<name>.html``.  Extracts the
    research direction from the second ``con01_t`` div and the remaining
    personal info from the ``wz_teacher`` div.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="xq_teacher", limit=1)
    if not divs:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # BUG FIX: prettify() returns unicode but the file is opened
            # in binary mode; encode so non-ASCII (Chinese) pages do not
            # raise on write.
            fp.write(div.prettify().encode('utf-8'))
    # Research direction lives in the second con01_t section.
    details = div.find_all("div", class_="con01_t", limit=3)
    if details and len(details) >= 2:
        research = ''.join(details[1].get_text().split())
        # Drop strings no longer than the bare section header.
        if len(research) <= (len(u'研究方向') + 1):
            research = ''
        else:
            # BUG FIX: str.replace returns a new string; the original
            # discarded the result, so fullwidth commas were never
            # normalized to ASCII commas.
            research = research.replace(u',', u',')
        employee.research = research
    # Parse the remaining personal info as plain text.
    infos = div.find_all("div", class_="wz_teacher", limit=1)
    if not infos:
        return employee
    lines = infos[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Follow the 5th <li>'s link to a detail page and parse <div id="main">.

    Only the name and homepage are stored directly; a prettified copy of
    the selected subtree is saved to ``<path>/<name>.html``.  Falls back
    to parsing the original document when the link or target div is
    missing.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    div = soup  # fallback: parse the whole document
    lis = soup.find_all(name="li")
    # BUG FIX: the original tested `not lis and len(lis) != 5`, which let
    # pages with 1-4 <li> elements fall through to lis[4] (IndexError),
    # and left `div` unbound (NameError) when the 5th <li> had no anchor.
    if len(lis) >= 5:
        anchors = lis[4].find_all('a')
        if anchors:
            detail_url = urljoin(url, anchors[0]['href'])
            detail_doc = get_doc_byUrllib2(detail_url)
            soup = BeautifulSoup(detail_doc, Config.SOUP_PARSER)
            main_divs = soup.find_all('div', attrs={"id": "main"})
            if main_divs:
                div = main_divs[0]
            else:
                print("not found main div")
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # encode: prettify() returns unicode and the file is binary
            fp.write(div.prettify().encode('utf-8'))
    # Parse the selected subtree as plain text.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee,
                           set_attr_hook=set_attr_hook, force_email=True)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a profile page rooted at <div id="right_2">.

    Only the name and homepage are stored directly; a prettified copy of
    the div is saved to ``<path>/<name>.html``.  The 3rd and 4th matching
    table cells hold the two research-direction strings.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="div", attrs={"id": "right_2"}, limit=1)
    if not hits:
        return employee
    root = hits[0]
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(root.prettify())
    # Two research-direction slots, blank by default.
    slots = [' ', ' ']
    cells = root.find_all(name="td",
                          attrs={
                              "bgcolor": "#FFFFFF",
                              "class": "ft12",
                              "valign": "top"
                          },
                          limit=4)
    if len(cells) == 4:
        slots = [cells[2].get_text().strip(), cells[3].get_text().strip()]
    employee.research = slots[0] + ";" + slots[1]
    print("research:" + employee.research)
    # Plain-text processing; research is already set, so skip it.
    parser = ProfileParser(lines=root.stripped_strings,
                           employee=employee,
                           ignore=set(['research']))
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a profile page rooted at a vertically-centered <td>.

    Collects the text of every nested <td> (whitespace-collapsed when
    reasonably short) and hands the lines to the parser; a prettified
    copy of the subtree is saved to ``<path>/<name>.html``.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    hits = soup.find_all(name="td", attrs={"valign": "center"}, limit=1)
    root = hits[0] if hits else soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(root.prettify())
    cells = root.find_all('td')
    if cells:
        lines = []
        for cell in cells:
            text = cell.get_text().strip()
            # Collapse internal whitespace only for short cells.
            if len(text) < 128:
                text = ''.join(text.split())
            print(text)
            lines.append(text)
    else:
        lines = root.stripped_strings
        print("TDS none!")
    parser = ProfileParser(lines=lines,
                           employee=employee,
                           set_attr_hook=profile_set_attr_hook,
                           max_line=256)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a messy profile page: save the 4th <table> and extract info.

    The page is too unstructured to parse fully, so only the name and
    homepage are stored directly; the resume HTML is saved to
    ``<path>/<name>.html``.  Personal info is read from the
    valign=top/width=577 cell of that table.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table", limit=4)
    # BUG FIX: the original guarded `len(tables) < 2` but then indexed
    # tables[3]; pages with 2 or 3 tables raised IndexError.
    if len(tables) < 4:
        return employee
    table_content = tables[3]
    with open(filename, 'wb') as fp:
        # encode: prettify() returns unicode and the file is binary
        fp.write(table_content.prettify().encode('utf-8'))
    tds = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not tds:
        return employee
    # Extract personal info from the cell's plain text.
    lines = tds[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a profile page built from two 96%-wide tables.

    The cellspacing=0 table is saved to ``<path>/<name>.html``; the
    cellspacing=1 table is parsed for personal info, including an
    optional personal-homepage link.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    # Table with cellspacing=0 is the one we archive to disk.
    saved = soup.find_all(name="table",
                          attrs={"width": "96%", "cellspacing": "0"}, limit=1)
    if saved:
        target = saved[0]
    else:
        print("not found main div")
        target = soup
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            fp.write(target.prettify())
    # Table with cellspacing=1 holds the parseable info.
    tables = soup.find_all(name="table",
                           attrs={"width": "96%", "cellspacing": "1"}, limit=1)
    if tables:
        target = tables[0]
    else:
        print("not found main div")
        target = soup
    # The "click here to visit" link, when present, is the homepage.
    links = target.find_all('a', text="点击此处访问")
    if links:
        employee.profile = links[0]['href']
        print('Got profile:' + employee.profile)
    parser = ProfileParser(lines=target.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a messy profile page: save the 4th <table> and extract info.

    The page is too unstructured to parse fully, so only the name and
    homepage are stored directly; the resume HTML is saved to
    ``<path>/<name>.html``.  Personal info is read from the
    valign=top/width=577 cell of that table.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    tables = soup.find_all(name="table", limit=4)
    # BUG FIX: the original guarded `len(tables) < 2` but then indexed
    # tables[3]; pages with 2 or 3 tables raised IndexError.
    if len(tables) < 4:
        return employee
    table_content = tables[3]
    with open(filename, 'wb') as fp:
        # encode: prettify() returns unicode and the file is binary
        fp.write(table_content.prettify().encode('utf-8'))
    tds = table_content.find_all("td", attrs={"valign": "top", "width": "577"})
    if not tds:
        return employee
    # Extract personal info from the cell's plain text.
    lines = tds[0].stripped_strings
    parser = ProfileParser(lines=lines, employee=employee)
    return parser.parse()