def handler(tag):
    """Extract an Employee from a staff-table row tag.

    Rows with exactly 5 <td> cells start with the name cell; wider rows
    carry one extra leading cell. The header row (u'姓名') is skipped.
    Returns the populated Employee, or None when the row is unusable.
    """
    tds = tag.find_all(name='td')
    if not tds:
        print("len(tds) == 0")
        return None
    employee = Employee()
    if len(tds) < 5:
        print("len(tds) = %d" % (len(tds)))
        return None
    # Name cell position depends on the row width (guaranteed >= 5 here).
    name_tag_idx = 0 if len(tds) == 5 else 1
    name_tag = tds[name_tag_idx]
    employee.name = name_tag.get_text().strip()
    if employee.name == u'姓名':  # table header row
        return None
    anchors = name_tag.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']
    employee.title = tds[name_tag_idx + 1].get_text().strip()
    employee.email = tds[name_tag_idx + 2].get_text().strip()
    employee.tel = tds[name_tag_idx + 3].get_text().strip()
    return employee
def profile_handler(doc, name, url, path):
    """Parse a 'phy-main' profile page into an Employee.

    Archives the main content div to <path>/<name>.html (only once), pulls
    email/tel from the dedicated anchor tags, then feeds the remaining text
    lines to ProfileParser.
    """
    filename = os.path.join(path, name + ".html")
    # Only name and homepage are kept here; the profile HTML is archived locally.
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "phy-main"}, limit=1)
    if not divs:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        # FIX: removed the redundant fp.close() inside the with-block — the
        # context manager already closes the file.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    email_tags = soup.find_all(name='a', class_="phy-mail")
    if email_tags:
        employee.email = email_tags[0].get_text().strip()
    tel_tags = soup.find_all(name='a', class_="phy-phone")
    if tel_tags:
        employee.tel = tel_tags[0].get_text().strip()
    # Process the remaining content as plain text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a 'phy-main' profile page into an Employee.

    Saves the content div to <path>/<name>.html when not already present,
    reads email/tel from a.phy-mail / a.phy-phone, then runs ProfileParser
    over the div's stripped text lines.
    """
    filename = os.path.join(path, name + ".html")
    # Only name and homepage are stored directly; the page is archived locally.
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "phy-main"}, limit=1)
    if not divs:
        return employee
    div = divs[0]
    if not os.path.exists(filename):
        # FIX: dropped the redundant fp.close() — `with` closes the file.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    email_div = soup.find_all(name='a', class_="phy-mail")
    if email_div:
        employee.email = email_div[0].get_text().strip()
    te_div = soup.find_all(name='a', class_="phy-phone")
    if te_div:
        employee.tel = te_div[0].get_text().strip()
    # Hand the remaining content to the plain-text parser.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def handler(tag):
    """Parse one staff-table row into an Employee.

    Handles both 5-cell rows (name first) and wider rows (name second);
    filters out the header row and rows with too few cells.
    """
    tds = tag.find_all(name='td')
    if not tds:
        print("len(tds) == 0")
        return None
    employee = Employee()
    if len(tds) < 5:
        print("len(tds) = %d" % (len(tds)))
        return None
    # Wider rows have one extra leading cell before the name.
    name_tag_idx = 1 if len(tds) > 5 else 0
    name_tag = tds[name_tag_idx]
    employee.name = name_tag.get_text().strip()
    if employee.name == u'姓名':  # header row
        return None
    links = name_tag.find_all('a')
    if links:
        employee.url = links[0]['href']
    title_td, email_td, tel_td = tds[name_tag_idx + 1:name_tag_idx + 4]
    employee.title = title_td.get_text().strip()
    employee.email = email_td.get_text().strip()
    employee.tel = tel_td.get_text().strip()
    return employee
def handler(tag):
    """Build an Employee from a 4-cell row: name, title, tel, email.

    Internal whitespace is collapsed out of every field; empty fields
    leave the Employee defaults untouched.
    """
    tds = tag.find_all("td")
    if not tds or len(tds) != 4:
        return None
    employee = Employee()
    anchors = tag.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']
    # Name is always assigned (even if empty after stripping).
    employee.name = ''.join(tds[0].get_text().strip().split())
    # Remaining cells: column index -> attribute, assigned only when non-empty.
    for idx, attr in ((1, 'title'), (3, 'email'), (2, 'tel')):
        text = tds[idx].get_text()
        if text:
            setattr(employee, attr, ''.join(text.split()))
    return employee
def handler(tag):
    """Build an Employee from a row's anchor tags.

    The first <a> carries the name and homepage URL; the optional second <a>
    carries the email. Returns None when there are no anchors or no name.
    """
    anchors = tag.find_all(name='a')
    if not anchors:
        return None
    employee = Employee()
    if len(anchors) >= 2:
        email = anchors[1].string
        # FIX: only overwrite the email when the anchor actually has text;
        # the original clobbered the default with None otherwise.
        if email:
            employee.email = ''.join(email.split())
    # FIX: dropped the always-true `len(anchors) >= 1` check (guarded above).
    employee.name = anchors[0].string
    if not employee.name:
        return None
    employee.name = employee.name.strip()
    employee.url = anchors[0]['href']
    return employee
def profile_handler(doc, name, url, path):
    """Parse a 'box_detail' profile page into an Employee.

    The page layout is messy, so only name/url live on the Employee directly;
    the detail div is archived to <path><name>.html. Tel/fax come from
    labelled text lines, email from the first two links.
    """
    # Label prefix -> Employee attribute name.
    symbols = {
        u'电话:': 'tel',
        u'联系电话:': 'tel',
        u'传真:': 'fax',
    }
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="box_detail", limit=1)
    if not divs:
        return employee
    div = divs[0]
    # NOTE(review): plain concatenation relies on `path` ending with a
    # separator; os.path.join would be safer — kept for compatibility.
    filename = path + name + ".html"
    # FIX: removed the redundant fp.close() inside the with-block.
    with open(filename, 'wb') as fp:
        fp.write(div.prettify())
    td_left = div.find_all("td", attrs={"style": "line-height: 16px", "align": "left"})
    if not td_left:
        return employee
    # The email is in one of the first two links.
    links = div.find_all("a", limit=2)
    for link in links:
        if link.string and '@' in link.string:
            employee.email = link.string
    # Scan at most the first 16 child nodes for labelled contact lines.
    for count, child in enumerate(td_left[0].children):
        if not child.string:
            continue
        if count > 15:
            break
        text = ''.join(child.string.split())
        if not text:
            continue
        # FIX: loop variable renamed so it no longer shadows the `name`
        # parameter, and a stray no-op `symbols` statement was removed.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx != -1:
                value = text[idx + len(symbol):]
                if hasattr(employee, attr):
                    setattr(employee, attr, value)
                else:
                    print("no attr %s in employee" % attr)
                break
    return employee
def profile_handler(doc, name, url, path):
    """Parse a 'page_right addpage_right' profile page into an Employee.

    Archives the content div, reads department/title/email/research from a
    fixed 11-cell table layout, then runs ProfileParser over the 'text_more'
    free-text section when present (otherwise over the main div).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "page_right addpage_right"}, limit=1)
    # Fall back to the whole document when the expected div is missing.
    div = divs[0] if divs else soup
    if not os.path.exists(filename):
        # FIX: removed the redundant fp.close() inside the with-block.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    tds = div.find_all('td')
    # The structured layout is exactly 11 cells; anything else is skipped.
    if tds and len(tds) == 11:
        # Cell index -> attribute; assign only non-empty collapsed text.
        for idx, attr in ((2, 'departments'), (4, 'title'), (8, 'email'), (10, 'research')):
            text = tds[idx].get_text()
            if text:
                text = ''.join(text.split())
                if text:
                    setattr(employee, attr, text)
    divs = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if divs:
        div = divs[0]
    # Process the remaining content as plain text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a 'box_detail' profile page into an Employee.

    Only name/url go on the Employee directly (the layout is too messy);
    the detail div is archived to <path><name>.html. Email comes from the
    first two links, tel/fax from prefix-labelled text lines.
    """
    # Label prefix -> Employee attribute name.
    symbols = {
        u'电话:': 'tel',
        u'联系电话:': 'tel',
        u'传真:': 'fax',
    }
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="box_detail", limit=1)
    if not divs:
        return employee
    div = divs[0]
    # NOTE(review): concatenation assumes `path` ends with a separator.
    filename = path + name + ".html"
    # FIX: dropped the redundant fp.close() — `with` closes the file.
    with open(filename, 'wb') as fp:
        fp.write(div.prettify())
    td_left = div.find_all("td", attrs={"style": "line-height: 16px", "align": "left"})
    if not td_left:
        return employee
    # Email lives in one of the first two links.
    for link in div.find_all("a", limit=2):
        if link.string and '@' in link.string:
            employee.email = link.string
    # Inspect at most the first 16 child nodes for labelled lines.
    for count, child in enumerate(td_left[0].children):
        if not child.string:
            continue
        if count > 15:
            break
        text = ''.join(child.string.split())
        if not text:
            continue
        # FIX: renamed the loop variable that shadowed the `name` parameter
        # and removed a stray no-op `symbols` expression statement.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx != -1:
                value = text[idx + len(symbol):]
                if hasattr(employee, attr):
                    setattr(employee, attr, value)
                else:
                    print("no attr %s in employee" % attr)
                break
    return employee
def handler(tag):
    """Build an Employee from a 3-cell row: name, title, email.

    The header row (u'姓名') yields None; email is post-processed with
    email_value_strip after whitespace is collapsed.
    """
    tds = tag.find_all(name='td')
    if not tds or len(tds) != 3:
        return None
    employee = Employee()
    employee.name = ''.join((tds[0].get_text() or '').split())
    # Skip the table header row.
    if employee.name == u'姓名':
        return None
    employee.title = ''.join(tds[1].get_text().split())
    employee.email = email_value_strip(''.join(tds[2].get_text().split()))
    return employee
def profile_handler(doc, name, url, path):
    """Parse a 'page_right addpage_right' profile page into an Employee.

    Saves the content div (or the whole document as fallback), extracts
    department/title/email/research from an 11-cell table, then parses the
    'text_more' section (when present) with ProfileParser.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"class": "page_right addpage_right"}, limit=1)
    div = divs[0] if divs else soup
    if not os.path.exists(filename):
        # FIX: removed the redundant fp.close() inside the with-block.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    tds = div.find_all('td')
    # Only the exact 11-cell layout is structured data.
    if tds and len(tds) == 11:
        for idx, attr in ((2, 'departments'), (4, 'title'), (8, 'email'), (10, 'research')):
            text = tds[idx].get_text()
            if text:
                text = ''.join(text.split())
                if text:
                    setattr(employee, attr, text)
    divs = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if divs:
        div = divs[0]
    # Finish with the plain-text line parser.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
def handler(tag):
    """Build an Employee from an <li>-list entry.

    lis[0] = name (+ profile link), lis[1] = title, lis[2] = tel,
    lis[3] = email; every text starts with a 3-character label
    (e.g. u'职务:') that is stripped off.
    """
    # FIX: removed the unused `dd_tables` local dict.
    lis = tag.find_all('li')
    if len(lis) < 4:
        return None
    employee = Employee()
    pre_len = len(u'职务:')  # all labels share this length
    employee.name = lis[0].get_text()
    employee.profile = lis[0].a['href']
    employee.url = employee.url or employee.profile
    # FIX: the guard was inverted (`if not employee.name:`), which sliced
    # only empty/None names (crashing on None) and never stripped the label
    # from real names.
    if employee.name:
        employee.name = ''.join(employee.name[pre_len:].split())
    employee.title = lis[1].get_text()
    if employee.title:
        employee.title = ''.join(employee.title[pre_len:].split())
    employee.tel = lis[2].get_text()
    if employee.tel:
        employee.tel = ''.join(employee.tel[pre_len:].split())
    employee.email = lis[3].get_text()
    if employee.email:
        employee.email = ''.join(employee.email[pre_len:].split())
    print("name:" + employee.name + ",email:" + employee.email)
    return employee
def handler(tag):
    """Build an Employee from a faculty card.

    Name comes from the first <h3>, title from span.faculty-title, and
    email/tel/profile/research from the dt.faculty-info list (>= 5 entries
    required).
    """
    # FIX: removed the unused `dd_tables` local dict.
    h3 = tag.find_all(name='h3')
    if not h3:
        return None
    employee = Employee()
    employee.name = ''.join((h3[0].get_text() or '').split())
    title_spans = tag.find_all(name="span", class_="faculty-title")
    # FIX: guard the index — some cards have no title span.
    if title_spans:
        employee.title = title_spans[0].get_text()
    tds = tag.find_all(name='dt', class_="faculty-info")
    if not tds or len(tds) < 5:
        return None
    employee.email = ''.join(tds[0].get_text().split())
    employee.tel = ''.join(tds[1].get_text().split())
    employee.profile = ''.join(tds[3].get_text().split())
    # research keeps its internal whitespace deliberately.
    employee.research = tds[4].get_text()
    return employee
def handler(tag):
    """Build an Employee from a faculty card.

    The first <h3> holds the name, span.faculty-title the title, and the
    dt.faculty-info entries hold email, tel, profile and research (at
    least 5 entries required).
    """
    # FIX: dropped the unused `dd_tables` local dict.
    h3 = tag.find_all(name='h3')
    if not h3:
        return None
    employee = Employee()
    employee.name = ''.join((h3[0].get_text() or '').split())
    title_spans = tag.find_all(name="span", class_="faculty-title")
    # FIX: guard against cards without a title span (IndexError before).
    if title_spans:
        employee.title = title_spans[0].get_text()
    infos = tag.find_all(name='dt', class_="faculty-info")
    if not infos or len(infos) < 5:
        return None
    employee.email = ''.join(infos[0].get_text().split())
    employee.tel = ''.join(infos[1].get_text().split())
    employee.profile = ''.join(infos[3].get_text().split())
    # research intentionally keeps its internal whitespace.
    employee.research = infos[4].get_text()
    return employee
def create_Employee(emp: createEmployee, db: Session = Depends(get_db)):
    """Persist a new Employee built from the request payload and commit.

    Returns a small status dict on success.
    """
    _employee = Employee()
    _employee.id = emp.id
    _employee.name = emp.name
    _employee.email = emp.email
    _employee.position = emp.position
    # FIX: these two lines used ':' (a no-op annotation statement) instead
    # of '=', so works_on and reporting_manager were never saved.
    _employee.works_on = emp.works_on
    _employee.reporting_manager = emp.reporting_manager
    db.add(_employee)
    db.commit()
    # Background_tasks.add_task(fetch_emp_data, employee.id)
    return {
        "code": "Success",
        # NOTE(review): "messege" is a typo but is part of the response
        # contract — change it together with API consumers.
        "messege": "Employee created the name" + emp.name,
    }
def profile_handler(doc, name, url, path):
    """Parse a 'view_pannel' profile page where email/tel are images.

    Email and phone are OCR'd from images (image2text) and then excluded
    from the later plain-text parse; everything else goes through
    ProfileParser.
    """
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    div = divs[0] if divs else soup
    if not os.path.exists(filename):
        # Redundant fp.close() removed — `with` closes the file.
        with open(filename, 'wb') as fp:
            fp.write(div.prettify())
    # OCR the email/tel images out of the item_list blocks.
    item_divs = div.find_all(name="div", attrs={"class": "item_list"})
    ignores = []
    # FIX: the loop used `div` as its variable, clobbering the main content
    # div, so ProfileParser only ever saw the last item_list block.
    for item in item_divs:
        string = item.get_text()
        if string:
            if u'邮件' in string and len(employee.email) == 0:
                employee.email = image2text(imageSrc(item), email_image_filename, 'eng2')
                print(employee.email)
                ignores.append('email')
            elif u'电话' in string and len(employee.tel) == 0:
                employee.tel = image2text(imageSrc(item), tel_image_filename, 'eng')
                print(employee.tel)
                ignores.append('tel')
    # Process the remaining content as plain text lines.
    lines = div.stripped_strings
    parser = ProfileParser(lines=lines, employee=employee, set_attr_hook=set_attr_hook,
                           max_line=256, ignore=set(ignores))
    return parser.parse()
def profile_handler(doc, name, url, path):
    """Parse a 'view_pannel' profile page whose email/tel are images.

    The email and phone images are OCR'd via image2text and marked as
    ignored for the subsequent ProfileParser text pass.
    """
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    div = divs[0] if divs else soup
    if not os.path.exists(filename):
        # Redundant fp.close() removed — the with-block closes the file.
        with open(filename, "wb") as fp:
            fp.write(div.prettify())
    # OCR the email/tel images found in the item_list blocks.
    item_divs = div.find_all(name="div", attrs={"class": "item_list"})
    ignores = []
    # FIX: the loop variable shadowed `div`, so the final ProfileParser pass
    # parsed only the last item_list block instead of the whole page.
    for item in item_divs:
        string = item.get_text()
        if string:
            if u"邮件" in string and len(employee.email) == 0:
                employee.email = image2text(imageSrc(item), email_image_filename, "eng2")
                print(employee.email)
                ignores.append("email")
            elif u"电话" in string and len(employee.tel) == 0:
                employee.tel = image2text(imageSrc(item), tel_image_filename, "eng")
                print(employee.tel)
                ignores.append("tel")
    # Hand the remaining content to the plain-text parser.
    lines = div.stripped_strings
    parser = ProfileParser(
        lines=lines, employee=employee, set_attr_hook=set_attr_hook,
        max_line=256, ignore=set(ignores)
    )
    return parser.parse()
def create_employee_process():
    """Create an Employee from the submitted form and send a notification email.

    Returns 'Success' on creation; raises when create_employee() fails.
    """
    employee = Employee()
    employee.name = request.form['name']
    employee.phone = request.form['phone']
    employee.email = request.form['email']
    employee.address = {
        'street': request.form['street'],
        'city': request.form['city'],
        'state': request.form['state'],
        'zipcode': request.form['zipcode'],
    }
    if employee.create_employee():
        msg = Message()
        employee.send_email(mail, msg, app)
        return 'Success'
    # FIX: raise with a diagnostic message instead of a bare `raise Exception`.
    raise Exception("failed to create employee %r" % employee.name)
def handler(tag):
    """Build an Employee from a 7-cell row: name/url, email, title, research.

    The header row (first cell == u'姓名') yields None; newlines in the
    research text are replaced with '.'.
    """
    name_symbol = u'姓名'
    tds = tag.find_all('td')
    if len(tds) != 7:
        return None
    # Skip the table header row.
    if tds[0].get_text().strip() == name_symbol:
        return None
    employee = Employee()
    ass = tds[0].find_all('a')
    if len(ass) != 0:
        employee.url = ass[0]['href']
    employee.name = tds[0].get_text().strip()
    employee.email = tds[2].get_text().strip()
    employee.title = tds[3].get_text().strip()
    employee.research = tds[6].get_text().strip()
    # FIX: str.replace returns a new string; the result was discarded.
    employee.research = employee.research.replace('\n', '.')
    # FIX: converted the Python 2 print statement to the print() function
    # used everywhere else in this file.
    print(employee.name, employee.email, employee.title)
    return employee
def handler(tag):
    """Build an Employee from a 7-cell row (name/url, email, title, research).

    Skips the header row; replaces newlines in the research text with '.'.
    """
    name_symbol = u'姓名'
    tds = tag.find_all('td')
    if len(tds) != 7:
        return None
    if tds[0].get_text().strip() == name_symbol:  # header row
        return None
    employee = Employee()
    anchors = tds[0].find_all('a')
    if len(anchors) != 0:
        employee.url = anchors[0]['href']
    employee.name = tds[0].get_text().strip()
    employee.email = tds[2].get_text().strip()
    employee.title = tds[3].get_text().strip()
    employee.research = tds[6].get_text().strip()
    # FIX: the replace() result was discarded (strings are immutable).
    employee.research = employee.research.replace('\n', '.')
    # FIX: Python 2 print statement converted to the print() function.
    print(employee.name, employee.email, employee.title)
    return employee
def profile_handler(doc, name, url, path):
    """Parse a profile page into an Employee.

    Tries the HTML 'sub_main' div first; falls back to an XML <member>
    layout. Raw content is archived to <path><name>.html either way.
    """
    # Label prefix -> Employee attribute name.
    symbols = {
        u'个人主页:': 'profile',
        u'研究方向:': 'research',
        u'电话:': 'tel',
        u'电话': 'tel',
    }
    # NOTE(review): concatenation assumes `path` ends with a separator;
    # os.path.join would be safer — kept for compatibility.
    filename = path + name + ".html"
    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(id="sub_main", limit=1)
    if not divs:
        # Fall back to the XML layout.
        members = soup.find_all(name="member", limit=1)
        if not members:
            print("id:main or sub_main not found")
            return employee
        member = members[0]
        # FIX: the guard `if not names and len(names) != 0` could never be
        # true, and the body then indexed the `name` parameter instead of
        # the `names` result list.
        names = member.find_all('name')
        if names:
            full_name = names[0].string
            if full_name:
                idx = full_name.find(' ')
                if idx != -1:
                    # Text after the first space is the title.
                    employee.title = full_name[idx:]
        if member.field:
            employee.research = member.field.string or ''
        if member.homepage:
            employee.profile = member.homepage.string or ''
        if member.contact and member.contact.string:
            # Keep only the digits of the contact string.
            for c in member.contact.string:
                if c.isdigit():
                    employee.tel += c
        # FIX: removed the redundant fp.close() inside the with-block.
        with open(filename, 'wb') as fp:
            fp.write(member.prettify())
        return employee
    div = divs[0]
    with open(filename, 'wb') as fp:
        fp.write(div.prettify())
    # FIX: same impossible `not h4s and len(h4s) != 0` guard, and
    # `employee.tite` was a typo for `employee.title` (twice).
    h4s = div.find_all('h4')
    if h4s:
        heading = h4s[0].string
        if heading:
            idx = heading.find(' ')
            if idx != -1:
                employee.title = ''.join(heading[idx:].split())
    lis = div.find_all("li", limit=8)
    if not lis:
        return employee
    # FIX: dropped the unused `res = lis[0]` local.
    # Parse the labelled contact lines of the first <li>.
    for child in lis[0].children:
        text = child.string
        if not text:
            continue
        text = ''.join(text.split())
        if not text:
            continue
        if '@' in text:
            employee.email = text
            continue
        # FIX: loop variable renamed so it no longer shadows `name`.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx != -1:
                value = text[idx + len(symbol):]
                if hasattr(employee, attr):
                    setattr(employee, attr, value)
                    print(attr + ":" + value)
                else:
                    print("no attr %s in employee" % attr)
                break
    return employee