Ejemplo n.º 1
0
def handler(tag):
    """Build an Employee from the ``td`` cells found under ``tag``.

    Returns None when there are fewer than five cells (a diagnostic line is
    printed) or when the name cell holds the header text u'姓名'. With exactly
    five cells the name is in the first cell; with more, it is in the second.
    Title, email and telephone are read from the three cells after the name.
    """
    cells = tag.find_all(name='td')
    cell_count = len(cells)
    if cell_count == 0:
        print("len(tds) == 0")
        return None

    employee = Employee()
    if cell_count < 5:
        print("len(tds) = %d" % cell_count)
        return None

    # Wider rows carry one extra leading cell before the name.
    offset = 0 if cell_count == 5 else 1
    name_cell = cells[offset]

    employee.name = name_cell.get_text().strip()
    if employee.name == u'姓名':
        return None

    anchors = name_cell.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']

    title_cell = cells[offset + 1]
    email_cell = cells[offset + 2]
    tel_cell = cells[offset + 3]
    employee.title = title_cell.get_text().strip()
    employee.email = email_cell.get_text().strip()
    employee.tel = tel_cell.get_text().strip()

    return employee
Ejemplo n.º 2
0
def profile_handler(doc, name, url, path):
    """Parse a profile page into an Employee and save a copy of its main div.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the saved ``.html`` file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The Employee carrying only name/url when the page has no
        ``div`` with id ``phy-main``; otherwise the result of
        ``ProfileParser.parse()``.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "phy-main"}, limit=1)
    if not divs:
        return employee
    div = divs[0]

    # The profile markup itself is stored as a separate file, written once.
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # prettify() yields text while the file is binary, so encode
            # explicitly instead of relying on an implicit conversion.
            fp.write(div.prettify().encode('utf-8'))

    # Email and telephone come from dedicated anchors anywhere in the page.
    email_links = soup.find_all(name='a', class_="phy-mail")
    if email_links:
        employee.email = email_links[0].get_text().strip()

    tel_links = soup.find_all(name='a', class_="phy-phone")
    if tel_links:
        employee.tel = tel_links[0].get_text().strip()

    # Everything else is extracted from the div's plain-text lines.
    parser = ProfileParser(lines=div.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Ejemplo n.º 3
0
def profile_handler(doc, name, url, path):
    """Turn a profile page into an Employee, archiving its ``phy-main`` div.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the archived file.
        url: Profile URL stored on the Employee.
        path: Directory that receives ``<name>.html``.

    Returns:
        An Employee with just name/url if no ``div`` with id ``phy-main``
        is found, else whatever ``ProfileParser.parse()`` returns.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    main_div = soup.find(name="div", attrs={"id": "phy-main"})
    if main_div is None:
        return employee

    # Archive the profile markup once; an existing file is left untouched.
    if not os.path.exists(filename):
        # The file is opened in binary mode, so the prettified text must be
        # encoded to bytes before writing.
        markup = main_div.prettify().encode('utf-8')
        with open(filename, 'wb') as fp:
            fp.write(markup)

    # Email anchor
    mail_anchors = soup.find_all(name='a', class_="phy-mail")
    if mail_anchors:
        employee.email = mail_anchors[0].get_text().strip()

    # Telephone anchor
    phone_anchors = soup.find_all(name='a', class_="phy-phone")
    if phone_anchors:
        employee.tel = phone_anchors[0].get_text().strip()

    # The remaining fields are parsed from the div's plain-text lines.
    parser = ProfileParser(lines=main_div.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Ejemplo n.º 4
0
def handler(tag):
    """Convert the ``td`` cells under ``tag`` into an Employee.

    None is returned for a row without cells, a row with fewer than five
    cells (both print a diagnostic), and the header row whose name cell is
    u'姓名'. The name sits at index 0 for five-cell rows and index 1 for
    longer rows; title, email and tel follow it in that order.
    """
    tds = tag.find_all(name='td')
    if len(tds) == 0:
        print("len(tds) == 0")
        return None

    employee = Employee()
    total = len(tds)
    if total < 5:
        print("len(tds) = %d" % (total,))
        return None

    if total == 5:
        base = 0
    else:
        base = 1
    name_td = tds[base]

    stripped_name = name_td.get_text().strip()
    employee.name = stripped_name
    if stripped_name == u'姓名':
        # Header row, not a person.
        return None

    links = name_td.find_all('a')
    if len(links) > 0:
        employee.url = links[0]['href']

    # The three cells after the name hold title, email and telephone.
    for step, field in enumerate(('title', 'email', 'tel'), 1):
        setattr(employee, field, tds[base + step].get_text().strip())

    return employee
Ejemplo n.º 5
0
def handler(tag):
    """Build an Employee from a row that has exactly four ``td`` cells.

    Cells are read as name, title, telephone, email (indices 0-3) with all
    whitespace removed. The URL is the href of the first anchor under
    ``tag``, if any. Rows with any other cell count yield None.
    """
    cells = tag.find_all("td")
    if len(cells) != 4:
        return None

    employee = Employee()
    anchors = tag.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']

    employee.name = ''.join(cells[0].get_text().split())

    # An attribute is only assigned when its cell produced some text.
    for field, cell in (('title', cells[1]),
                        ('email', cells[3]),
                        ('tel', cells[2])):
        raw = cell.get_text()
        if raw:
            setattr(employee, field, ''.join(raw.split()))

    return employee
Ejemplo n.º 6
0
def handler(tag):
    """Build an Employee from the anchors under ``tag``.

    The first anchor supplies the name (its ``.string``) and URL (its
    ``href``); a second anchor, when present, supplies the email with all
    whitespace removed. Returns None when there are no anchors or the first
    anchor has no usable string.
    """
    anchors = tag.find_all(name='a')
    if not anchors:
        return None

    employee = Employee()

    if len(anchors) > 1:
        email_text = anchors[1].string
        # A missing/empty string is stored as-is, matching the raw value.
        employee.email = ''.join(email_text.split()) if email_text else email_text

    first = anchors[0]
    employee.name = first.string
    if not employee.name:
        return None
    employee.name = employee.name.strip()
    employee.url = first['href']
    return employee
Ejemplo n.º 7
0
def handler(tag):
    """Create an Employee from the ``a`` elements found under ``tag``.

    Name and URL come from the first anchor; the email comes from the second
    anchor when there is one (whitespace stripped out). None is returned if
    no anchor exists or the first anchor's ``.string`` is empty/None.
    """
    links = tag.find_all(name='a')
    if len(links) == 0:
        return None

    employee = Employee()
    name_link = links[0]

    for email_link in links[1:2]:
        # Runs at most once: only when a second anchor is present.
        employee.email = email_link.string
        if employee.email:
            employee.email = ''.join(employee.email.split())

    raw_name = name_link.string
    employee.name = raw_name
    if not raw_name:
        return None

    employee.name = raw_name.strip()
    employee.url = name_link['href']
    return employee
Ejemplo n.º 8
0
def profile_handler(doc, name, url, path):
    """Parse a ``box_detail`` profile page into an Employee.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Prefix that is concatenated directly with ``name`` to form the
            output file name (no separator is inserted).

    Returns:
        The Employee. Only name/url are set when the page has no
        ``div.box_detail`` or no matching left-aligned ``td``.
    """
    # Label prefix -> Employee attribute that receives the text after it.
    symbols = {
        u'电话:': 'tel',
        u'联系电话:': 'tel',
        u'传真:': 'fax',
    }

    employee = Employee(name=name, url=url)
    # The page layout is too irregular to parse fully: the detail div is
    # saved to a separate file and only a few fields are extracted here.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", class_="box_detail", limit=1)
    if not divs:
        return employee
    div = divs[0]

    filename = path + name + ".html"
    with open(filename, 'wb') as fp:
        # prettify() yields text while the file is binary: encode explicitly.
        fp.write(div.prettify().encode('utf-8'))

    td_left = div.find_all("td",
                           attrs={
                               "style": "line-height: 16px",
                               "align": "left"
                           })
    if not td_left:
        return employee

    # Email: the last of the first two links whose text contains '@'.
    for link in div.find_all("a", limit=2):
        if link.string and '@' in link.string:
            employee.email = link.string

    # Telephone / fax: scan the children of the first matching cell.
    for count, tag in enumerate(td_left[0].children):
        if not tag.string:
            continue
        if count > 15:
            break

        text = ''.join(tag.string.split())
        if not text:
            continue
        # 'attr' rather than 'name', so the 'name' parameter is not shadowed.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx == -1:
                continue
            value = text[idx + len(symbol):]
            if hasattr(employee, attr):
                setattr(employee, attr, value)
            else:
                print("no attr %s in employee" % attr)
            break
    return employee
Ejemplo n.º 9
0
def profile_handler(doc, name, url, path):
    """Parse a profile page into an Employee and save a copy of its content.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the saved ``.html`` file.
        url: Profile URL stored on the Employee.
        path: Directory in which ``<name>.html`` is written.

    Returns:
        The result of ``ProfileParser.parse()`` run over the text of the
        ``div.text_more`` element, or of the main container when that
        element is absent.
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div",
                         attrs={"class": "page_right addpage_right"},
                         limit=1)
    # Fall back to the whole document when the container div is missing.
    div = divs[0] if divs else soup

    # The profile markup itself is stored as a separate file, written once.
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # prettify() yields text while the file is binary: encode it.
            fp.write(div.prettify().encode('utf-8'))

    # Only the 11-cell layout is understood; value cells sit at fixed
    # positions. An attribute is set only when its cell has visible text.
    tds = div.find_all('td')
    if len(tds) == 11:
        for index, attr in ((2, 'departments'),
                            (4, 'title'),
                            (8, 'email'),
                            (10, 'research')):
            value = ''.join(tds[index].get_text().split())
            if value:
                setattr(employee, attr, value)

    divs = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if divs:
        div = divs[0]

    # The remaining fields are parsed from plain-text lines.
    parser = ProfileParser(lines=div.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook)
    return parser.parse()
Ejemplo n.º 10
0
def profile_handler(doc,name,url,path):
    """Extract email, telephone and fax from a ``box_detail`` profile page.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Prefix concatenated directly with ``name`` (no separator is
            inserted) to form the output file name.

    Returns:
        The Employee; it carries only name/url if ``div.box_detail`` or the
        left-aligned ``td`` cannot be found.
    """
    # Label prefix -> Employee attribute that receives the text after it.
    symbols = {
        u'电话:': 'tel',
        u'联系电话:': 'tel',
        u'传真:': 'fax',
    }

    employee = Employee(name=name, url=url)
    # The page is too messy to parse fully: the detail div is saved to a
    # separate file and only a handful of fields are picked out here.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    detail = soup.find(name="div", class_="box_detail")
    if detail is None:
        return employee

    filename = path + name + ".html"
    # The file is binary, so the prettified text is encoded before writing.
    markup = detail.prettify().encode('utf-8')
    with open(filename, 'wb') as fp:
        fp.write(markup)

    td_left = detail.find_all("td", attrs={"style": "line-height: 16px", "align": "left"})
    if not td_left:
        return employee

    # Email: among the first two links, the last one containing '@' wins.
    for link in detail.find_all("a", limit=2):
        if link.string and '@' in link.string:
            employee.email = link.string

    # Telephone / fax: look for the label prefixes in the cell's children.
    for count, tag in enumerate(td_left[0].children):
        if not tag.string:
            continue
        if count > 15:
            break

        text = ''.join(tag.string.split())
        if not text:
            continue
        # The loop variable is 'attr' so the 'name' parameter stays intact.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx != -1:
                value = text[idx + len(symbol):]
                if hasattr(employee, attr):
                    setattr(employee, attr, value)
                else:
                    print ("no attr %s in employee" % attr)
                break
    return employee
Ejemplo n.º 11
0
def handler(tag):
    """Build an Employee from a row with exactly three ``td`` cells.

    The cells are name, title and email, each with all whitespace removed;
    the email is additionally passed through ``email_value_strip``. Returns
    None for rows of any other width and for the header row (name u'姓名').
    """
    cells = tag.find_all(name='td')
    if len(cells) != 3:
        return None

    def squash(text):
        # Drop every whitespace character, including inner ones.
        return ''.join(text.split())

    name_cell, title_cell, email_cell = cells
    employee = Employee()
    employee.name = squash(name_cell.get_text() or '')

    # Skip the table header.
    if employee.name == u'姓名':
        return None

    employee.title = squash(title_cell.get_text())
    employee.email = email_value_strip(squash(email_cell.get_text()))
    return employee
Ejemplo n.º 12
0
def handler(tag):
    """Turn a three-cell table row into an Employee (name, title, email).

    Whitespace is removed from every value and the email also goes through
    ``email_value_strip``. None is returned unless the row has exactly three
    ``td`` cells, and also for the header row whose name is u'姓名'.
    """
    tds = tag.find_all(name='td')
    if len(tds) != 3:
        return None

    employee = Employee()

    name_text = tds[0].get_text() or ''
    employee.name = ''.join(name_text.split())
    if employee.name == u'姓名':
        # Table header, not a person.
        return None

    title_text = tds[1].get_text()
    employee.title = ''.join(title_text.split())

    email_text = ''.join(tds[2].get_text().split())
    employee.email = email_value_strip(email_text)
    return employee
Ejemplo n.º 13
0
def profile_handler(doc, name, url, path):
    """Turn a profile page into an Employee, archiving its content div.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the archived file.
        url: Profile URL stored on the Employee.
        path: Directory that receives ``<name>.html``.

    Returns:
        Whatever ``ProfileParser.parse()`` returns after scanning the text
        of ``div.text_more`` (or of the main container if that is absent).
    """
    filename = os.path.join(path, name + ".html")
    employee = Employee(name=name, url=url)

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    containers = soup.find_all(name="div", attrs={"class": "page_right addpage_right"}, limit=1)
    # Without the container div, the whole document is used instead.
    div = containers[0] if containers else soup

    # Archive the markup once; an existing file is left untouched.
    if not os.path.exists(filename):
        # The file is binary, so the prettified text is encoded first.
        markup = div.prettify().encode('utf-8')
        with open(filename, 'wb') as fp:
            fp.write(markup)

    # Fixed cell positions of the 11-cell layout -> Employee attribute.
    cell_fields = {2: 'departments', 4: 'title', 8: 'email', 10: 'research'}
    tds = div.find_all('td')
    if len(tds) == 11:
        for index in sorted(cell_fields):
            value = ''.join(tds[index].get_text().split())
            # Cells without visible text leave the attribute untouched.
            if value:
                setattr(employee, cell_fields[index], value)

    more = soup.find_all(name="div", attrs={"class": "text_more"}, limit=1)
    if more:
        div = more[0]

    # The remaining fields are parsed from plain-text lines.
    parser = ProfileParser(lines=div.stripped_strings, employee=employee, set_attr_hook=set_attr_hook)
    return parser.parse()
Ejemplo n.º 14
0
def handler(tag):
    """Build an Employee from the first four ``li`` items under ``tag``.

    The items are read as name, title, telephone and email. Title, telephone
    and email have their first three characters dropped (the length of the
    label u'职务:') and all whitespace removed. The link in the first item,
    when present, becomes the profile and the fallback URL.

    Returns:
        The Employee, or None when fewer than four ``li`` items exist.
    """
    lis = tag.find_all('li')
    if len(lis) < 4:
        return None
    employee = Employee()

    pre_len = len(u'职务:')

    def _labelled_value(li):
        # Drop the fixed-width label, then squeeze out all whitespace.
        return ''.join(li.get_text()[pre_len:].split())

    # NOTE(review): the name is stored exactly as get_text() returns it. The
    # original guarded its label stripping with `if not employee.name`, which
    # only ran on an empty name and so never changed anything; confirm
    # whether the name item carries a label before adding stripping.
    employee.name = lis[0].get_text()

    # The first item may lack a link or an href; leave profile/url untouched
    # in that case instead of raising.
    link = lis[0].a
    href = link.get('href') if link is not None else None
    if href:
        employee.profile = href
        employee.url = employee.url or href

    employee.title = _labelled_value(lis[1])
    employee.tel = _labelled_value(lis[2])
    employee.email = _labelled_value(lis[3])

    print("name:"+employee.name+",email:"+employee.email)
    return employee
Ejemplo n.º 15
0
def handler(tag):
    """Build an Employee from a faculty card.

    The name is the first ``h3`` (whitespace removed), the title is the
    first ``span.faculty-title`` and the other fields come from the
    ``dt.faculty-info`` entries: index 0 email, 1 telephone, 3 profile,
    4 research. Index 2 is not read.

    Returns:
        The Employee, or None when there is no ``h3`` or fewer than five
        ``dt.faculty-info`` entries.
    """
    h3 = tag.find_all(name='h3')
    if not h3:
        return None

    infos = tag.find_all(name='dt', class_="faculty-info")
    if len(infos) < 5:
        return None

    def squash(node):
        # Text of the node with every whitespace character removed.
        return ''.join(node.get_text().split())

    employee = Employee()
    employee.name = squash(h3[0])

    # A card without a title span keeps the Employee's default title
    # instead of raising IndexError.
    title_spans = tag.find_all(name="span", class_="faculty-title")
    if title_spans:
        # Title and research keep their inner whitespace.
        employee.title = title_spans[0].get_text()

    employee.email = squash(infos[0])
    employee.tel = squash(infos[1])
    employee.profile = squash(infos[3])
    employee.research = infos[4].get_text()

    return employee
Ejemplo n.º 16
0
def handler(tag):
    """Convert a faculty card under ``tag`` into an Employee.

    Name: first ``h3`` with whitespace removed. Title: first
    ``span.faculty-title``, text kept verbatim. ``dt.faculty-info`` entries
    0, 1, 3 and 4 supply email, telephone, profile and research; entry 2 is
    skipped.

    Returns:
        The Employee, or None if the card has no ``h3`` or fewer than five
        ``dt.faculty-info`` entries.
    """
    headings = tag.find_all(name='h3')
    if not headings:
        return None

    tds = tag.find_all(name='dt', class_="faculty-info")
    if not tds or len(tds) < 5:
        return None

    employee = Employee()
    employee.name = ''.join((headings[0].get_text() or '').split())

    title_spans = tag.find_all(name="span", class_="faculty-title")
    if len(title_spans) > 0:
        # Guarded: indexing an empty result used to raise IndexError.
        employee.title = title_spans[0].get_text()

    # These three values are stored with all whitespace removed.
    for index, attr in ((0, 'email'), (1, 'tel'), (3, 'profile')):
        setattr(employee, attr, ''.join(tds[index].get_text().split()))

    # Research text keeps its whitespace, as does the title.
    employee.research = tds[4].get_text()

    return employee
Ejemplo n.º 17
0
Archivo: main.py Proyecto: bips1996/POC
def create_Employee(emp : createEmployee, db : Session = Depends(get_db)): 
    """Persist a new Employee built from the payload ``emp``.

    Args:
        emp: Payload providing id, name, email, position, works_on and
            reporting_manager.
        db: Session that receives the new row and is committed.

    Returns:
        A dict with ``"code"`` set to ``"Success"`` and a ``"messege"``
        string that ends with the employee's name.
    """
    _employee = Employee()
    _employee.id = emp.id
    _employee.name = emp.name
    _employee.email = emp.email
    _employee.position = emp.position
    # These two were written with ':' (a bare annotation), which assigns
    # nothing, so both fields were silently left unset.
    _employee.works_on = emp.works_on
    _employee.reporting_manager = emp.reporting_manager
    db.add(_employee)
    db.commit()
    # Background_tasks.add_task(fetch_emp_data,employee.id)

    # The "messege" key is kept as spelled: it is part of the response
    # callers already receive.
    return {
        "code": "Success",
        "messege": "Employee created the name" + emp.name
    }
Ejemplo n.º 18
0
def profile_handler(doc, name, url, path):
    """Parse a profile page whose email/telephone are rendered as images.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; base name of the saved html and image files.
        url: Profile URL stored on the Employee.
        path: Directory for ``<name>.html``, ``<name>_email.png`` and
            ``<name>_tel.png``.

    Returns:
        The result of ``ProfileParser.parse()`` over the panel's text, with
        the fields already recovered from images passed as ``ignore``.
    """
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")

    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    # Fall back to the whole document when the panel div is missing.
    panel = divs[0] if divs else soup

    # The profile markup itself is stored as a separate file, written once.
    if not os.path.exists(filename):
        with open(filename, 'wb') as fp:
            # prettify() yields text while the file is binary: encode it.
            fp.write(panel.prettify().encode('utf-8'))

    # Email / telephone are images inside the item_list divs.
    ignores = []
    # A separate loop variable keeps `panel` bound to the container; reusing
    # one name made the parser below see only the last item div.
    for item in panel.find_all(name="div", attrs={"class": "item_list"}):
        string = item.get_text()
        if not string:
            continue
        if u'邮件' in string and len(employee.email) == 0:
            employee.email = image2text(imageSrc(item),
                                        email_image_filename, 'eng2')
            print(employee.email)
            ignores.append('email')
        elif u'电话' in string and len(employee.tel) == 0:
            employee.tel = image2text(imageSrc(item), tel_image_filename,
                                      'eng')
            print(employee.tel)
            ignores.append('tel')

    # The remaining fields are parsed from the panel's plain-text lines.
    parser = ProfileParser(lines=panel.stripped_strings,
                           employee=employee,
                           set_attr_hook=set_attr_hook,
                           max_line=256,
                           ignore=set(ignores))
    return parser.parse()
Ejemplo n.º 19
0
def profile_handler(doc, name, url, path):
    """Turn an image-based profile page into an Employee.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; base name of the html and image files written.
        url: Profile URL stored on the Employee.
        path: Directory that receives ``<name>.html`` and the
            ``<name>_email.png`` / ``<name>_tel.png`` images.

    Returns:
        Whatever ``ProfileParser.parse()`` returns for the panel text; the
        fields read from images are listed in its ``ignore`` set.
    """
    filename = os.path.join(path, name + ".html")
    email_image_filename = os.path.join(path, name + "_email.png")
    tel_image_filename = os.path.join(path, name + "_tel.png")

    employee = Employee(name=name, url=url)
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    found = soup.find_all(name="div", attrs={"id": "view_pannel"}, limit=1)
    # Without the panel div, the whole document is used instead.
    root = found[0] if found else soup

    # Archive the markup once; an existing file is left untouched.
    if not os.path.exists(filename):
        # The file is binary, so the prettified text is encoded first.
        markup = root.prettify().encode("utf-8")
        with open(filename, "wb") as fp:
            fp.write(markup)

    # Email and telephone are images inside the item_list divs.
    item_divs = root.find_all(name="div", attrs={"class": "item_list"})

    ignores = []
    # `entry` is deliberately distinct from `root`: sharing one name left
    # the parser below with just the last item div instead of the panel.
    for entry in item_divs:
        label = entry.get_text()
        if not label:
            continue
        if u"邮件" in label and len(employee.email) == 0:
            employee.email = image2text(imageSrc(entry), email_image_filename, "eng2")
            print(employee.email)
            ignores.append("email")
        elif u"电话" in label and len(employee.tel) == 0:
            employee.tel = image2text(imageSrc(entry), tel_image_filename, "eng")
            print(employee.tel)
            ignores.append("tel")

    # The remaining fields are parsed from the panel's plain-text lines.
    parser = ProfileParser(
        lines=root.stripped_strings,
        employee=employee,
        set_attr_hook=set_attr_hook,
        max_line=256,
        ignore=set(ignores),
    )
    return parser.parse()
Ejemplo n.º 20
0
def create_employee_process():
    """Create an employee from the submitted form and send an email.

    Reads name, phone, email and the four address fields from
    ``request.form``. Returns 'Success' after ``create_employee()`` succeeds
    and ``send_email`` has been called; raises ``Exception`` otherwise.
    """
    employee = Employee()
    form = request.form

    for field in ('name', 'phone', 'email'):
        setattr(employee, field, form[field])

    # Address parts are grouped into a single dict on the employee.
    employee.address = {
        part: form[part]
        for part in ('street', 'city', 'state', 'zipcode')
    }

    if not employee.create_employee():
        raise Exception

    msg = Message()
    employee.send_email(mail, msg, app)
    return 'Success'
Ejemplo n.º 21
0
def handler(tag):
    """Build an Employee from a row with exactly seven ``td`` cells.

    Cell 0 holds the name (and optionally a link used as the URL), cell 2
    the email, cell 3 the title and cell 6 the research text, whose
    newlines are replaced by '.'.

    Returns:
        The Employee, or None for rows of any other width and for the
        header row whose first cell is u'姓名'.
    """
    name_symbol = u'姓名'
    tds = tag.find_all('td')

    if len(tds) != 7:
        return None
    name = tds[0].get_text().strip()
    if name == name_symbol:
        return None
    employee = Employee()

    ass = tds[0].find_all('a')
    if ass:
        employee.url = ass[0]['href']
    employee.name = name
    employee.email = tds[2].get_text().strip()
    employee.title = tds[3].get_text().strip()
    # replace() returns a new string; its result used to be thrown away, so
    # the newlines were never actually replaced.
    employee.research = tds[6].get_text().strip().replace('\n', '.')
    # A single pre-formatted argument prints the same space-separated line
    # on Python 2 and Python 3.
    print("%s %s %s" % (employee.name, employee.email, employee.title))
    return employee
Ejemplo n.º 22
0
def handler(tag):
    """Convert a seven-cell table row into an Employee.

    The name and optional profile link come from cell 0, the email from
    cell 2, the title from cell 3 and the research text from cell 6 (with
    newlines turned into '.').

    Returns:
        The Employee, or None unless the row has exactly seven ``td``
        cells; the header row (first cell u'姓名') also yields None.
    """
    name_symbol = u'姓名'
    tds = tag.find_all('td')

    if len(tds) != 7:
        return None

    name_cell = tds[0]
    name_text = name_cell.get_text().strip()
    if name_text == name_symbol:
        return None

    employee = Employee()

    links = name_cell.find_all('a')
    if len(links) != 0:
        employee.url = links[0]['href']

    employee.name = name_text
    employee.email = tds[2].get_text().strip()
    employee.title = tds[3].get_text().strip()

    research = tds[6].get_text().strip()
    # Strings are immutable, so the replaced text has to be assigned back;
    # previously the call's result was dropped.
    employee.research = research.replace('\n', '.')

    # Formatted up front so the line is valid (and prints identically) under
    # both Python 2 and Python 3.
    print("%s %s %s" % (employee.name, employee.email, employee.title))
    return employee
Ejemplo n.º 23
0
def profile_handler(doc,name,url,path):
    """Parse a profile page (HTML ``sub_main`` or XML ``member``) into an Employee.

    Args:
        doc: Markup of the profile page, handed to BeautifulSoup.
        name: Employee name; also the base name of the saved file.
        url: Profile URL stored on the Employee.
        path: Prefix concatenated directly with ``name`` (no separator is
            inserted) to form the output file name.

    Returns:
        The Employee with whatever fields could be extracted. Only name/url
        are set when neither a ``sub_main`` element nor a ``member``
        element is found.
    """
    # Label prefix -> Employee attribute. Kept as an ordered tuple so the
    # label with the colon is always tried before the bare u'电话'.
    symbols = (
        (u'个人主页:', 'profile'),
        (u'研究方向:', 'research'),
        (u'电话:', 'tel'),
        (u'电话', 'tel'),
    )
    filename = path+name+".html"

    def _save(node):
        # The file is binary, so the prettified text is encoded explicitly.
        with open(filename, 'wb') as fp:
            fp.write(node.prettify().encode('utf-8'))

    employee = Employee(name=name,url=url)
    # The page is too messy to parse fully: the content is saved to a
    # separate file and only a few fields are picked out here.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(id="sub_main",limit=1)
    if not divs:
        # XML variant of the page
        members = soup.find_all(name="member",limit=1)
        if not members:
            print("id:main or sub_main not found")
            return employee
        member = members[0]

        # Title: whatever follows the first space in the <name> element.
        # (The guard was inverted and indexed the `name` parameter instead
        # of the search result, so this never ran.)
        names = member.find_all('name')
        if names:
            full_name = names[0].string
            if full_name:
                idx = full_name.find(' ')
                if idx != -1:
                    employee.title = full_name[idx:].strip()
        if member.field:
            employee.research = member.field.string or ''
        if member.homepage:
            employee.profile = member.homepage.string or ''
        if member.contact and member.contact.string:
            # Keep only the digits of the contact text.
            employee.tel += ''.join(
                c for c in member.contact.string if c.isdigit())

        _save(member)
        return employee

    div = divs[0]
    _save(div)

    # Title: whatever follows the first space in the first <h4>. (The guard
    # was inverted and the value went to a misspelled attribute, `tite`.)
    h4s = div.find_all('h4')
    if h4s:
        heading = h4s[0].string
        if heading:
            idx = heading.find(' ')
            if idx != -1:
                employee.title = ''.join(heading[idx:].split())

    lis = div.find_all("li",limit=8)
    if not lis:
        return employee

    # Detail fields: scan the children of the first <li>.
    for tag in lis[0].children:
        text = tag.string
        if not text:
            continue
        text = ''.join(text.split())
        if '@' in text:
            employee.email = text
            continue

        # 'attr' rather than 'name', so the 'name' parameter is not shadowed.
        for symbol, attr in symbols:
            idx = text.find(symbol)
            if idx == -1:
                continue
            value = text[idx + len(symbol):]
            if hasattr(employee, attr):
                setattr(employee, attr, value)
                print (attr + ":" + value)
            else:
                print ("no attr %s in employee" % attr)
            break
    return employee