Esempio n. 1
0
def handler(tag):
    """Build an Employee from one table row.

    ``tag`` must expose ``find_all`` (presumably a BeautifulSoup ``<tr>``
    tag -- confirm against the caller).  Cell 0 holds the name and,
    optionally, a link; cell 2 the title; cell 3 the departments.

    Returns the populated Employee, or None when the row has no cells,
    has fewer than four cells, or is the header row (name cell u'姓名').
    """
    cells = tag.find_all(name='td')
    if not cells:
        print("len(tds) == 0")
        return None

    employee = Employee()
    cell_count = len(cells)
    if cell_count < 4:
        print("len(tds) = %d" % (cell_count))
        return None

    name_cell = cells[0]
    employee.name = name_cell.get_text().strip()
    if employee.name == u'姓名':
        # Header row, not a person.
        return None

    links = name_cell.find_all('a')
    if links:
        employee.url = links[0]['href']

    employee.title = cells[2].get_text().strip()
    employee.departments = cells[3].get_text().strip()
    return employee
Esempio n. 2
0
def handler(tag):
    """Build an Employee from a table row with name/title/email/tel cells.

    A row with exactly five cells starts with the name cell; a row with
    more than five cells has one extra leading cell, so the name is at
    index 1.  Title, email and tel follow the name cell in that order.

    Returns the Employee, or None for rows with fewer than five cells and
    for the header row (name cell u'姓名').
    """
    cells = tag.find_all(name='td')
    if not cells:
        print("len(tds) == 0")
        return None

    employee = Employee()
    if len(cells) < 5:
        print("len(tds) = %d"%(len(cells)))
        return None

    # len(cells) >= 5 here, so the only two cases are "== 5" and "> 5".
    offset = 0 if len(cells) == 5 else 1
    name_cell = cells[offset]

    employee.name = name_cell.get_text().strip()
    if employee.name == u'姓名':
        return None

    links = name_cell.find_all('a')
    if links:
        employee.url = links[0]['href']

    employee.title = cells[offset + 1].get_text().strip()
    employee.email = cells[offset + 2].get_text().strip()
    employee.tel = cells[offset + 3].get_text().strip()
    return employee
Esempio n. 3
0
def handler(tag):
    """Build an Employee from a row of exactly four cells.

    Cell order is name, title, tel, email.  All whitespace is removed
    from every value.  The URL comes from the first link anywhere in the
    row.  Returns None when the row does not have exactly four cells.
    """
    cells = tag.find_all("td")
    if not (cells and len(cells) == 4):
        return None

    employee = Employee()
    links = tag.find_all('a')
    if links:
        employee.url = links[0]['href']

    employee.name = ''.join(cells[0].get_text().strip().split())

    # (attribute, cell index) pairs, processed in the original order.
    for attr, index in (('title', 1), ('email', 3), ('tel', 2)):
        text = cells[index].get_text()
        if text:
            setattr(employee, attr, ''.join(text.split()))

    return employee
Esempio n. 4
0
def handler(tag):
    """Turn one table row into an Employee.

    Expects at least four ``td`` cells: name (with an optional link) in
    cell 0, title in cell 2 and departments in cell 3.

    Returns None for empty rows, rows with fewer than four cells, and the
    header row whose name cell reads u'姓名'.
    """
    row_cells = tag.find_all(name='td')
    if not row_cells:
        print("len(tds) == 0")
        return None

    employee = Employee()
    if len(row_cells) < 4:
        print("len(tds) = %d"%(len(row_cells)))
        return None

    first_cell = row_cells[0]
    employee.name = first_cell.get_text().strip()
    if employee.name == u'姓名':
        return None

    anchors = first_cell.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']

    title_cell, departments_cell = row_cells[2], row_cells[3]
    employee.title = title_cell.get_text().strip()
    employee.departments = departments_cell.get_text().strip()
    return employee
Esempio n. 5
0
def handler(tag):
    """Turn one table row into an Employee (name, title, email, tel).

    With exactly five cells the name is the first cell; with more than
    five it is the second.  The three cells after the name cell are
    title, email and tel.

    Returns None for rows with fewer than five cells and for the header
    row (name cell u'姓名').
    """
    row_cells = tag.find_all(name='td')
    if not row_cells:
        print("len(tds) == 0")
        return None

    employee = Employee()
    if len(row_cells) < 5:
        print("len(tds) = %d" % (len(row_cells)))
        return None

    # Rows wider than five cells carry one extra leading cell.
    name_idx = 1 if len(row_cells) > 5 else 0
    name_cell = row_cells[name_idx]

    employee.name = name_cell.get_text().strip()
    if employee.name == u'姓名':
        return None

    anchors = name_cell.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']

    title_cell, email_cell, tel_cell = row_cells[name_idx + 1:name_idx + 4]
    employee.title = title_cell.get_text().strip()
    employee.email = email_cell.get_text().strip()
    employee.tel = tel_cell.get_text().strip()
    return employee
Esempio n. 6
0
def handler(tag):
    """Build an Employee from a teacher block.

    The name is the whitespace-free text of the first
    ``div.teacher-title`` when one exists.  The remaining fields are
    filled by ProfileParser from the block's stripped text strings.
    Returns whatever ``ProfileParser.parse()`` returns.
    """
    employee = Employee()

    title_divs = tag.find_all("div",class_="teacher-title")
    if title_divs:
        raw_name = title_divs[0].get_text()
        employee.name = ''.join(raw_name.split())

    # Hand the block over as plain text lines.
    return ProfileParser(lines=tag.stripped_strings,employee=employee).parse()
Esempio n. 7
0
def handler(tag):
    """Create an Employee from a teacher block via plain-text parsing.

    The first ``div`` with class ``teacher-title`` supplies the name with
    all whitespace removed.  The block's ``stripped_strings`` are then
    passed to ProfileParser, and its ``parse()`` result is returned.
    """
    employee = Employee()

    name_candidates = tag.find_all("div", class_="teacher-title")
    if name_candidates:
        employee.name = ''.join(name_candidates[0].get_text().split())

    # Everything else is extracted from the text lines of the block.
    text_lines = tag.stripped_strings
    return ProfileParser(lines=text_lines, employee=employee).parse()
Esempio n. 8
0
def handler(tag):
    """Build an Employee from a block with ``a.orangea`` / ``a.black01`` links.

    The first ``a.orangea`` link gives the name (whitespace removed) and
    the profile href.  When an ``a.black01`` link exists, its stripped
    strings are run through ProfileParser and the parser's result is
    returned; otherwise the Employee built so far is returned.
    """
    employee = Employee()

    name_links = tag.find_all('a',class_="orangea")
    if name_links:
        name_link = name_links[0]
        employee.name = ''.join(name_link.get_text().split())
        employee.profile = name_link['href']

    detail_links = tag.find_all('a',class_="black01")
    if not detail_links:
        return employee

    parser = ProfileParser(lines=detail_links[0].stripped_strings,employee=employee)
    return parser.parse()
Esempio n. 9
0
def handler(tag):
    """Create an Employee from a name link plus an optional detail link.

    * ``a.orangea`` (first match): name with whitespace removed, and the
      profile href.
    * ``a.black01`` (first match): its text lines are parsed by
      ProfileParser, whose result replaces the Employee.
    """
    employee = Employee()

    orange_links = tag.find_all('a', class_="orangea")
    if orange_links:
        first = orange_links[0]
        employee.name = ''.join(first.get_text().split())
        employee.profile = first['href']

    black_links = tag.find_all('a', class_="black01")
    if black_links:
        text_lines = black_links[0].stripped_strings
        employee = ProfileParser(lines=text_lines, employee=employee).parse()

    return employee
Esempio n. 10
0
def handler(tag):
    """Build an Employee from the links inside ``tag``.

    The first link gives the name (stripped) and the URL; a second link,
    when present, gives the email with all whitespace removed.

    Returns None when there are no links or the first link has no single
    string as its text.
    """
    links = tag.find_all(name='a')
    if not links:
        return None

    employee = Employee()

    if len(links) >= 2:
        email = links[1].string
        # Keep the raw (empty/None) value when there is nothing to clean.
        employee.email = ''.join(email.split()) if email else email

    name_link = links[0]
    name = name_link.string
    employee.name = name
    if not name:
        return None
    employee.name = name.strip()
    employee.url = name_link['href']
    return employee
Esempio n. 11
0
def handler(tag):
    """Build an Employee from a link whose text is ``name`` or ``name:title``.

    The link's ``href`` becomes the URL.  All whitespace is removed from
    the text before it is split on ``:``, e.g. "Liu Quanyong:   Master's
    supervisor" style entries.

    Returns the Employee, or None when the text has more than one colon.
    Raises KeyError when the link has no ``href``.
    """
    employee = Employee(url=tag['href'])

    # get_text() rather than .string: .string is None when the link has
    # more than one child node, which made the .split() call below crash.
    text = ''.join(tag.get_text().split())
    parts = text.split(u':')
    if len(parts) > 2:
        return None

    employee.name = parts[0]
    if len(parts) == 2:
        employee.title = parts[1]

    print("name:"+employee.name )

    return employee
Esempio n. 12
0
def handler(tag):
    """Create an Employee from a link reading ``name`` or ``name:title``.

    Whitespace is removed from the link text, which is then split on
    ``:``.  One piece is a bare name, two pieces are name and title, and
    anything else makes the function return None.  The URL is the link's
    ``href``.
    """
    employee = Employee(url=tag['href'])

    compact = ''.join(tag.get_text().split())
    pieces = compact.split(u':')

    # str.split always yields at least one piece, so only "too many"
    # needs rejecting.
    if len(pieces) > 2:
        return None

    employee.name = pieces[0]
    if len(pieces) == 2:
        employee.title = pieces[1]

    print("name:"+employee.name )
    return employee
Esempio n. 13
0
def handler(tag):
    """Build an Employee from a table row of exactly five cells.

    Cell 0 is the name, cell 3 the title and cell 4 the research area.
    The URL is taken from the first link in the row.

    Returns None when the row does not have five cells, the name cell is
    empty, or the row is the header (name u'姓名').
    """
    cells = tag.find_all(name='td')
    if not cells or len(cells) != 5:
        return None

    raw_name = cells[0].get_text()
    if not raw_name:
        return None

    name = ''.join(raw_name.split())
    if name == u'姓名':
        return None

    employee = Employee()
    employee.name = name

    links = tag.find_all('a')
    if links:
        employee.url = links[0]['href']

    # NOTE(review): the original also called replace(',', ',') on title and
    # research, which replaces a comma with itself.  It was probably a
    # full-width/ASCII comma conversion lost in transcoding; the direction
    # cannot be recovered from here, so the no-op was dropped.
    title = cells[3].get_text()
    if title:
        title = ''.join(title.split())
        employee.title = title
        # Parenthesised form prints the same on Python 2 and Python 3.
        print(title)

    research = cells[4].get_text()
    if research:
        employee.research = research.strip()

    return employee
Esempio n. 14
0
def handler(tag):
    """Create an Employee from a five-cell table row.

    Layout: name in cell 0, title in cell 3, research area in cell 4.
    The first link found in the row provides the URL.

    Returns None for rows without exactly five cells, rows with an empty
    name cell, and the header row (name u'姓名').
    """
    cells = tag.find_all(name='td')
    if not cells or len(cells) != 5:
        return None

    raw_name = cells[0].get_text()
    if not raw_name:
        return None

    name = ''.join(raw_name.split())
    if name == u'姓名':
        # Header row, not a person.
        return None

    employee = Employee()
    employee.name = name

    links = tag.find_all('a')
    if links:
        employee.url = links[0]['href']

    # NOTE(review): the original called replace(',', ',') on both fields,
    # i.e. a comma replaced by itself.  This was probably a full-width
    # comma conversion whose literal got normalised away; confirm the
    # intended direction before reinstating it.
    title = cells[3].get_text()
    if title:
        title = ''.join(title.split())
        employee.title = title
        # Works as a statement on Python 2 and a call on Python 3.
        print(title)

    research = cells[4].get_text()
    if research:
        employee.research = research.strip()

    return employee
Esempio n. 15
0
def handler(tag):
    """Create an Employee from the anchors contained in ``tag``.

    Anchor 0 supplies the name (stripped) and the URL.  Anchor 1, if it
    exists, supplies the email with every whitespace character removed.

    Returns None when ``tag`` has no anchors or anchor 0 has no single
    string as its text.
    """
    anchors = tag.find_all(name='a')
    if not anchors:
        return None

    employee = Employee()

    if len(anchors) >= 2:
        employee.email = anchors[1].string
        if employee.email:
            compact_email = ''.join(employee.email.split())
            employee.email = compact_email

    # `anchors` is non-empty here, so the first anchor always exists.
    first = anchors[0]
    employee.name = first.string
    if not employee.name:
        return None
    employee.name = employee.name.strip()
    employee.url = first['href']
    return employee
Esempio n. 16
0
def profile_handler(doc,name,url,path):
    """Parse a profile page and save its main block as an HTML file.

    Args:
        doc: markup of the profile page, parsed with
            ``BeautifulSoup(doc, Config.SOUP_PARSER)``.
        name: employee name; used for the file name and for the fallback
            Employee.
        url: profile URL; only used for the fallback Employee.
        path: prefix concatenated directly with ``name + ".html"``.

    Returns:
        ``Employee(name=name, url=url)`` when the page has no element
        with id ``s2_right_con``; otherwise a new Employee filled from
        the ``h4`` name and the labelled lines found in that element.
    """
    # Label found on the page -> Employee attribute it fills.
    # Available Employee attributes: url, name, email, tel, title,
    # profile, research, departments, fax, addr.
    symbols = {
        u'Email:'    :'email',
        u'邮箱:'      :'email',
        u'电子邮件:'   :'email',
        u'电子邮箱:'   :'email',
        u'电话:'      :'tel',
        u'联系电话:'   :'tel',
        u'Tel:'       :'tel',
        u'办公电话:'   :'tel',
        u'传真:'      :'fax',
        u'URL:'      :'url',
        u'职称:'      :'title'
    }

    # The page is too messy to parse fully: keep what can be recognised
    # and store the profile block itself in a separate file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(id="s2_right_con",limit=1)
    filename = path+name+".html"
    if not divs:
        return Employee(name=name,url=url)
    div = divs[0]

    # The file is opened in binary mode, so the text must be encoded
    # explicitly.  Writing the text object directly fails on Python 3 and
    # relies on an implicit ASCII encode on Python 2.
    with open(filename,'wb') as fp:
        fp.write(div.prettify().encode('utf-8'))

    employee = Employee()
    # Parse the name.
    name_h4 = div.h4
    if name_h4:
        employee.name = name_h4.string.strip(' \t\n\r')
    else:
        print(name_h4)

    for tag in div.children:
        if not tag.string:
            continue
        text = tag.string.strip(' \t\n\r')
        if not text:
            continue
        # `attr`, not `name`, so the `name` parameter is not shadowed.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx == -1:
                continue
            if hasattr(employee, attr):
                # The value is everything after the label.
                setattr(employee, attr, text[idx + len(symbol):])
            else:
                print ("no attr %s in employee" % attr)
            break
    return  employee
Esempio n. 17
0
def profile_handler(doc, name, url, path):
    """Extract an Employee from a profile page and archive the profile HTML.

    Args:
        doc: page markup handed to ``BeautifulSoup`` together with
            ``Config.SOUP_PARSER``.
        name: employee name, used for the output file name and for the
            fallback result.
        url: profile URL, used only for the fallback result.
        path: string prefix of the output file (``path + name + ".html"``).

    Returns:
        ``Employee(name=name, url=url)`` if no element with id
        ``s2_right_con`` exists, else an Employee populated from that
        element's ``h4`` and its labelled child lines.
    """
    # Page label -> Employee attribute.  Available attributes: url, name,
    # email, tel, title, profile, research, departments, fax, addr.
    symbols = {
        u'Email:': 'email',
        u'邮箱:': 'email',
        u'电子邮件:': 'email',
        u'电子邮箱:': 'email',
        u'电话:': 'tel',
        u'联系电话:': 'tel',
        u'Tel:': 'tel',
        u'办公电话:': 'tel',
        u'传真:': 'fax',
        u'URL:': 'url',
        u'职称:': 'title'
    }

    # Too messy for a full parse: keep the recognisable fields and save
    # the profile block to its own file.
    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all(id="s2_right_con", limit=1)
    filename = path + name + ".html"
    if not divs:
        return Employee(name=name, url=url)
    div = divs[0]

    # Binary file, so encode the text first.  The original wrote the text
    # object itself, which raises TypeError on Python 3.
    with open(filename, 'wb') as fp:
        fp.write(div.prettify().encode('utf-8'))

    employee = Employee()
    # Parse the name.
    name_h4 = div.h4
    if name_h4:
        employee.name = name_h4.string.strip(' \t\n\r')
    else:
        print(name_h4)

    for tag in div.children:
        if not tag.string:
            continue
        text = tag.string.strip(' \t\n\r')
        if not text:
            continue
        # Use `attr` so the `name` parameter is not overwritten.
        for symbol, attr in symbols.items():
            idx = text.find(symbol)
            if idx == -1:
                continue
            if hasattr(employee, attr):
                # Keep only what follows the label.
                setattr(employee, attr, text[idx + len(symbol):])
            else:
                print("no attr %s in employee" % attr)
            break
    return employee
Esempio n. 18
0
def handler(tag):
    """Build an Employee from a block, preferring its ``a.dt_text_tit`` link.

    With such a link, its string is the name and its href is used for
    both ``profile`` and ``url``.  Without one, the first stripped text
    line is the name.  ProfileParser then processes ``lines`` and its
    ``parse()`` result is returned.
    """
    employee = Employee()
    lines = tag.stripped_strings

    title_links = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    if title_links:
        link = title_links[0]
        employee.name = link.string
        employee.profile = link["href"]
        employee.url = employee.profile
    else:
        # The first line is the name.  NOTE(review): if `lines` is a
        # one-shot iterator, the parser below only sees the remaining
        # lines -- confirm that is intended.
        for first_line in lines:
            employee.name = first_line
            break

    return ProfileParser(lines=lines, employee=employee).parse()
Esempio n. 19
0
def handler(tag):
    """Create an Employee from a block with an optional ``a.dt_text_tit`` link.

    * Link present: name is the link's string; ``profile`` and ``url``
      are both set to its href.
    * Link absent: name is the first of the block's stripped strings.

    The same ``lines`` object is then handed to ProfileParser, and the
    parser's result is returned.
    """
    employee = Employee()
    text_lines = tag.stripped_strings

    anchors = tag.find_all(name="a", attrs={"class": "dt_text_tit"})
    has_title_link = bool(anchors) and len(anchors) != 0

    if has_title_link:
        anchor = anchors[0]
        employee.name = anchor.string
        employee.profile = anchor['href']
        employee.url = employee.profile
    else:
        # Take only the first line as the name, then stop iterating.
        for line in text_lines:
            employee.name = line
            break

    parser = ProfileParser(lines=text_lines, employee=employee)
    return parser.parse()
Esempio n. 20
0
def handler(tag):
    """Build an Employee from a card whose name element has class ``handle``.

    The name element looks like
    ``<span class="handle" onclick="toCardDetailAction('<card id>');">name</span>``.
    The card id is cut out of the ``onclick`` attribute and appended to
    the detail-page URL.  The remaining fields come from ProfileParser.

    Returns None when no ``handle`` element exists, else the parser's
    result.
    """
    handles = tag.find_all(class_="handle")
    if not handles:
        return None

    name_span = handles[0]

    employee = Employee()
    employee.name = ''.join(name_span.get_text().split())

    # Strip the leading "toCardDetailAction('" and the trailing "');".
    prefix_len = len('toCardDetailAction(\'')
    onclick = name_span['onclick']
    card_id = onclick[prefix_len:-3]
    employee.url = 'http://scse.buaa.edu.cn/buaa-css-web/toCardDetailAction.action?firstSelId=CARD_TMPL_OF_FIRST_NAVI_CN%20&%20secondSelId=CARD_TMPL_OF_ALL_TEACHER_CN%20&cardId='+card_id
    print ("card_id=[%s]"%card_id)

    return ProfileParser(lines=tag.stripped_strings,employee=employee).parse()
Esempio n. 21
0
def handler(tag):
    """Build an Employee from a three-cell row: name, title, email.

    All whitespace is removed from each value, and the email is passed
    through ``email_value_strip``.

    Returns None when the row does not have exactly three cells or is
    the header row (name u'姓名').
    """
    cells = tag.find_all(name='td')
    if not (cells and len(cells) == 3):
        return None

    employee = Employee()

    def squash(text):
        # Remove every whitespace character.
        return ''.join(text.split())

    employee.name = squash(cells[0].get_text() or '')

    # Skip the table header.
    if employee.name == u'姓名':
        return None

    employee.title = squash(cells[1].get_text())
    employee.email = email_value_strip(squash(cells[2].get_text()))
    return employee
Esempio n. 22
0
def handler(tag):
    """Create an Employee from a table row holding name, title and email.

    Each cell's text has all whitespace removed; the email is then
    cleaned by ``email_value_strip``.

    Returns None unless the row has exactly three cells, and also for
    the header row whose name cell reads u'姓名'.
    """
    row_cells = tag.find_all(name='td')
    if not row_cells or len(row_cells) != 3:
        return None

    name_cell, title_cell, email_cell = row_cells
    employee = Employee()

    raw_name = name_cell.get_text() or ''
    employee.name = ''.join(raw_name.split())

    # Filter out the table header.
    if employee.name == u'姓名':
        return None

    employee.title = ''.join(title_cell.get_text().split())

    compact_email = ''.join(email_cell.get_text().split())
    employee.email = email_value_strip(compact_email)
    return employee
Esempio n. 23
0
def handler(tag):
    """Build an Employee from a list of at least four ``li`` items.

    Item 0 holds the name and a link (used for ``profile``, and for
    ``url`` when that is not already set); items 1-3 hold title, tel and
    email.  Non-empty title/tel/email values lose their first
    ``len(u'职务:')`` characters (the label) and all whitespace.

    Returns None when fewer than four items are present.
    """
    items = tag.find_all('li')
    if len(items) < 4:
        return None
    employee = Employee()

    # Every labelled value is cut at the length of this label.
    pre_len = len(u'职务:')

    def without_label(text):
        # Drop the leading label, then every whitespace character.
        return ''.join(text[pre_len:].split())

    name_item = items[0]
    employee.name = name_item.get_text()
    employee.profile = name_item.a['href']
    employee.url = employee.url or employee.profile
    # NOTE(review): this condition looks inverted compared with the other
    # fields.  As written the name is never trimmed, because the branch
    # only runs for an empty name.  Kept as-is to preserve behavior.
    if not employee.name:
        employee.name = without_label(employee.name)

    for attr, item in (('title', items[1]), ('tel', items[2]), ('email', items[3])):
        value = item.get_text()
        setattr(employee, attr, without_label(value) if value else value)

    print("name:"+employee.name+",email:"+employee.email)
    return employee
Esempio n. 24
0
def handler(tag):
    """Build an Employee from a link whose text reads ``name(title)``.

    Whitespace is removed and closing parentheses are dropped.  The text
    before the first opening parenthesis is the name and the text after
    it, if any, is the title.  Both ASCII and full-width parentheses are
    accepted.  The link's ``href`` becomes the URL.
    """
    employee = Employee()
    text = ''.join(tag.get_text().split())

    # The original replaced ')' twice and '(' with itself, so the
    # full-width variants had evidently been lost.  Handle both forms:
    # the split result is the same whichever one the page uses.
    text = text.replace(u')', u'').replace(u')', u'')
    text = text.replace(u'(', u'(')
    parts = text.split(u'(')

    employee.name = parts[0]
    if len(parts) >= 2:
        employee.title = parts[1]
    employee.url = tag['href']

    # One pre-formatted argument prints the same on Python 2 and 3.
    print("%s %s" % (employee.name, employee.title))
    return employee
Esempio n. 25
0
def handler(tag):
    """Build an Employee from a faculty card.

    The card needs an ``h3`` (name) and at least five
    ``dt.faculty-info`` entries, read as: 0 email, 1 tel, 3 profile,
    4 research (entry 2 is not used).  The title comes from the first
    ``span.faculty-title`` when one exists.

    Returns None when the ``h3`` is missing or fewer than five info
    entries are present.
    """
    headings = tag.find_all(name='h3')
    if not headings:
        return None

    # Validate the info entries before building anything, so malformed
    # cards are rejected without doing partial work.
    infos = tag.find_all(name='dt',class_="faculty-info")
    if not infos or len(infos) < 5:
        return None

    employee = Employee()
    employee.name = ''.join((headings[0].get_text() or '').split())

    # The title span is optional.  Indexing [0] unconditionally raised
    # IndexError on cards without one.
    title_spans = tag.find_all(name="span",class_="faculty-title")
    if title_spans:
        employee.title = title_spans[0].get_text()

    employee.email = ''.join(infos[0].get_text().split())
    employee.tel = ''.join(infos[1].get_text().split())
    employee.profile = ''.join(infos[3].get_text().split())
    # Research text keeps its internal whitespace.
    employee.research = infos[4].get_text()

    return employee
Esempio n. 26
0
def handler(tag):
    """Create an Employee from a faculty card.

    Required: an ``h3`` with the name and five or more
    ``dt.faculty-info`` entries (0 email, 1 tel, 3 profile, 4 research;
    entry 2 is not read).  Optional: a ``span.faculty-title`` with the
    title.

    Returns None if the ``h3`` is missing or there are fewer than five
    info entries.
    """
    name_headings = tag.find_all(name='h3')
    if not name_headings:
        return None

    # Reject incomplete cards up front instead of after partial parsing.
    info_entries = tag.find_all(name='dt', class_="faculty-info")
    if not info_entries or len(info_entries) < 5:
        return None

    employee = Employee()
    raw_name = name_headings[0].get_text() or ''
    employee.name = ''.join(raw_name.split())

    # A card without a title span used to raise IndexError here.
    title_spans = tag.find_all(name="span", class_="faculty-title")
    if title_spans:
        employee.title = title_spans[0].get_text()

    # Whitespace is removed from email, tel and profile only.
    for attr, index in (('email', 0), ('tel', 1), ('profile', 3)):
        setattr(employee, attr, ''.join(info_entries[index].get_text().split()))

    employee.research = info_entries[4].get_text()

    return employee
Esempio n. 27
0
def update_employee(self, employee_id):
    """Update an employee from the submitted form, then redirect.

    The form must carry ``_method=PUT``; otherwise the request is
    aborted with 405.  Fields missing from the form keep their current
    value.  ``is_active`` is True only when the key is in the form.

    Aborts with 422 when the employee cannot be loaded or the update
    fails.  On success a flash message is queued and the client is
    redirected to the ``employees`` endpoint.
    """
    if request.form.get('_method') != 'PUT':
        # `Logger.Info` does not exist; the method is `info`.
        app.logger.info(
            'Cannot perform this action. Please contact administrator')
        abort(405)

    employee = Employee(id=employee_id)

    try:
        employee = employee.list_one_or_none_employee()
    except Exception:
        # Exception, not BaseException, so KeyboardInterrupt and
        # SystemExit are not swallowed.  Keep the traceback in the log.
        app.logger.info(
            'An error occurred. No data with Employee ID'
            f' = {employee_id} could be found!', exc_info=True)
        abort(422)

    # Checked outside the `try` so this abort is not caught and logged a
    # second time by the handler above.
    if employee is None:
        app.logger.info(
            f'No data with Employee ID = {employee_id} could be found!')
        abort(422)

    employee.id = employee_id
    employee.name = request.form.get('name', employee.name)

    # presumably formatted "<id> - <name>"; verify against the form.
    # A missing field keeps the current department instead of crashing
    # on None.split().
    department_name = request.form.get('department_name')
    if department_name:
        employee.department_id = department_name.split(' - ', 2)[0]

    employee.title = request.form.get('title', employee.title)
    employee.emp_number = request.form.get('emp_number', employee.emp_number)
    employee.address = request.form.get('address', employee.address)
    employee.phone = request.form.get('phone', employee.phone)
    employee.wage = request.form.get('wage', employee.wage)
    employee.is_active = 'is_active' in request.form

    try:
        employee.update_employee_in_database()
    except Exception:
        # `app.log` does not exist; the logger is `app.logger`.
        app.logger.info(
            f'An error occurred. Employee {employee_id} '
            'could not be updated!', exc_info=True)
        abort(422)

    flash(
        f'Employee {employee_id} was successfully updated!',
        'success')
    return redirect(url_for('employees'))
Esempio n. 28
0
File: main.py Progetto: bips1996/POC
def create_Employee(emp : createEmployee, db : Session = Depends(get_db)): 
    """Persist a new Employee built from ``emp`` and report success.

    Copies id, name, email, position, works_on and reporting_manager
    from ``emp`` onto a new Employee, adds it to ``db`` and commits.

    Returns a dict with the keys ``code`` and ``messege``.
    """
    _employee = Employee()
    _employee.id = emp.id
    _employee.name = emp.name
    _employee.email = emp.email
    _employee.position = emp.position
    # These two used ":" instead of "=".  That is a bare annotation, not
    # an assignment, so the fields were never stored.
    _employee.works_on = emp.works_on
    _employee.reporting_manager = emp.reporting_manager
    db.add(_employee)
    db.commit()

    # NOTE(review): the key "messege" is misspelled, but clients may
    # depend on it, so the response payload is left unchanged.
    return{
        "code":"Success",
        "messege": "Employee created the name"+emp.name
    }
Esempio n. 29
0
def create_employee_process():
    """Create an employee from the submitted form.

    Reads name, phone, email and the four address fields from
    ``request.form``.  When ``employee.create_employee()`` succeeds,
    calls ``employee.send_email`` and returns ``'Success'``; otherwise
    raises ``Exception``.
    """
    form = request.form
    employee = Employee()

    employee.name = form['name']
    employee.phone = form['phone']
    employee.email = form['email']
    employee.address = {
        field: form[field]
        for field in ('street', 'city', 'state', 'zipcode')
    }

    if not employee.create_employee():
        raise Exception

    msg = Message()
    employee.send_email(mail, msg, app)
    return 'Success'
Esempio n. 30
0
def handler(tag):
    """Build an Employee from a link, skipping pagination links.

    The link text is the name.  If it contains one of the
    ``PROFILE_TITLES`` keywords, the text before the first matching
    keyword becomes the name and the rest becomes the title.

    Returns None when the link has no single text string or its text is
    a pagination label (first / next / previous / last page).
    """
    # Pagination labels.  u'最后页' used to carry a trailing space, so it
    # could never equal the stripped link text.
    symbols = {u'首页', u'第一页', u'下一页', u'最后页', u'上一页'}

    if not tag.string:
        return None
    name = tag.string.strip()
    if name in symbols:
        return None

    employee = Employee(name=name, url=tag['href'])
    # Infer the title from the predefined keywords.
    for keyword in PROFILE_TITLES:
        idx = name.find(keyword)
        if idx != -1:
            employee.name = name[:idx]
            employee.title = name[idx:]
            break

    return employee
Esempio n. 31
0
def handler(tag):
    """Build an Employee from a table row of exactly seven cells.

    Cells used: 0 name (with an optional link), 2 email, 3 title,
    6 research.  Newlines in the research text become '.'.

    Returns None when the row does not have seven cells or is the header
    row (first cell u'姓名').
    """
    name_symbol = u'姓名'
    cells = tag.find_all('td')

    if len(cells) != 7:
        return None
    name = cells[0].get_text().strip()
    if name == name_symbol:
        return None

    employee = Employee()

    links = cells[0].find_all('a')
    if links:
        employee.url = links[0]['href']
    employee.name = name
    employee.email = cells[2].get_text().strip()
    employee.title = cells[3].get_text().strip()
    # str.replace returns a new string.  The original discarded the
    # result, so the newlines were never replaced.
    employee.research = cells[6].get_text().strip().replace('\n', '.')

    # One pre-formatted argument prints the same on Python 2 and 3.
    print("%s %s %s" % (employee.name, employee.email, employee.title))
    return employee
Esempio n. 32
0
def handler(tag):
    """Create an Employee from a seven-cell table row.

    Reads name and optional link from cell 0, email from cell 2, title
    from cell 3 and research from cell 6.  Newlines in the research text
    are turned into '.'.

    Returns None for rows without exactly seven cells and for the header
    row (first cell u'姓名').
    """
    name_symbol = u'姓名'
    row_cells = tag.find_all('td')

    if len(row_cells) != 7:
        return None

    name_cell = row_cells[0]
    name_text = name_cell.get_text().strip()
    if name_text == name_symbol:
        return None

    employee = Employee()

    anchors = name_cell.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']
    employee.name = name_text
    employee.email = row_cells[2].get_text().strip()
    employee.title = row_cells[3].get_text().strip()

    research = row_cells[6].get_text().strip()
    # Strings are immutable: the replaced text must be assigned back,
    # which the original forgot to do.
    employee.research = research.replace('\n','.')

    # Formatted up front so Python 2 and Python 3 print the same line.
    print("%s %s %s" % (employee.name, employee.email, employee.title))
    return employee
Esempio n. 33
0
def handler(tag):
    """Build an Employee from a three-cell row (name, title, ...).

    The name is the part of cell 0 before the first '/'; the title is
    the stripped text of cell 1.  The URL is the href of the first link
    in the row.

    Returns None when the row does not have exactly three cells, the
    name cell is empty, or the row is the header (name u'姓名').
    """
    cells = tag.find_all("td")
    if not cells or len(cells) != 3:
        return None

    name = cells[0].get_text()
    if not name:
        return None
    name = name.strip()
    if name == u'姓名':
        return None

    # Build the Employee only after the row has been accepted.
    employee = Employee()
    employee.name = name.split('/')[0]
    # get_text() rather than .string: .string is None when the cell has
    # several child nodes, which made .strip() crash.
    employee.title = cells[1].get_text().strip()

    links = tag.find_all('a')
    if links:
        employee.url = links[0]['href']
    return employee
Esempio n. 34
0
def handler(tag):
    """Build an Employee from a link whose text is ``name`` or ``name title``.

    Navigation and heading link texts (listed in ``symbols``) are
    skipped.  The link's ``href`` becomes the URL.

    Returns None for empty or filtered link texts.
    """
    symbols = (u'首页', u'尾页', u'师资队伍', u'教师简介', u'教授',
               u'上一页', u'下一页', u'[1]', u'[2]', u'1', u'2')
    name = tag.get_text()
    if not name:
        return None
    name = name.strip()

    # Filter out navigation/heading entries before touching the href.
    if name in symbols:
        return None

    employee = Employee(url=tag['href'])
    names = name.split(' ')

    if len(names) >= 2:
        # NOTE(review): the title is the second token while the name
        # joins everything but the last token, so the two overlap when
        # there are three or more tokens.  Kept as in the original.
        employee.title = names[1]
        employee.name = ''.join(names[:-1])
    else:
        # Text without a space used to produce an empty name, because
        # names[:-1] is empty for a single token.
        employee.name = names[0]

    return employee
Esempio n. 35
0
def handler(tag):
    """Create an Employee from a table row with three cells.

    Cell 0 holds the name (only the part before the first '/' is kept),
    cell 1 the title.  The first link in the row provides the URL.

    Returns None for rows without exactly three cells, rows with an
    empty name cell, and the header row (name u'姓名').
    """
    row_cells = tag.find_all("td")
    if not row_cells or len(row_cells) != 3:
        return None

    raw_name = row_cells[0].get_text()
    if not raw_name:
        return None
    full_name = raw_name.strip()
    if full_name == u'姓名':
        return None

    # The Employee is built only for rows that passed every check.
    employee = Employee()
    employee.name = full_name.split('/')[0]
    # .string is None for cells with more than one child node, which
    # crashed on .strip(); get_text() always returns text.
    employee.title = row_cells[1].get_text().strip()

    anchors = tag.find_all('a')
    if anchors:
        employee.url = anchors[0]['href']
    return employee
Esempio n. 36
0
def handler(tag):
    """Create an Employee from a link reading ``name`` or ``name title``.

    Link texts that are navigation or heading entries (see ``symbols``)
    are ignored.  The URL is the link's ``href``.

    Returns None when the link text is empty or filtered.
    """
    symbols = (
        u'首页', u'尾页', u'师资队伍', u'教师简介', u'教授', u'上一页',
        u'下一页', u'[1]', u'[2]', u'1', u'2'
    )
    text = tag.get_text()
    if not text:
        return None
    text = text.strip()

    # Drop navigation/heading entries before the href is read.
    if text in symbols:
        return None

    employee = Employee(url=tag['href'])
    tokens = text.split(' ')

    if len(tokens) == 1:
        # A bare name: joining tokens[:-1] would give an empty string,
        # which is what the original stored.
        employee.name = tokens[0]
    else:
        # NOTE(review): title is token 1 while the name joins all but
        # the last token; they overlap for three or more tokens.  Left
        # as in the original.
        employee.title = tokens[1]
        employee.name = ''.join(tokens[:-1])

    return employee
Esempio n. 37
0
def profile_handler(doc, name, url, path):
    """Parse a profile page into an Employee and save the profile block.

    Args:
        doc: page markup, parsed with
            ``BeautifulSoup(doc, Config.SOUP_PARSER)``.
        name: employee name; only used to build the output file name.
        url: not used by this handler.
        path: prefix concatenated directly with ``name + ".html"``.

    Returns:
        None when the page has no ``<td valign="top">``; otherwise an
        Employee filled from the block's ``h3`` (name), the first ``dl``
        (title and labelled lines) and the first link inside
        ``div.teachcontent`` (url).
    """
    # Employee attribute -> label that precedes its value on the page.
    # Available attributes: url, name, email, tel, title, profile,
    # research, departments, fax, addr.
    symbols = {
        'email': u'电子邮件:',
        'tel': u'办公电话:',
        'addr': u'办公地址:',
        'research': u'研究方向:'
    }

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all("td", attrs={"valign": "top"}, limit=1)
    if not divs:
        return None

    div = divs[0]
    employee = Employee()

    # Save the profile block.  The file is binary, so the text must be
    # encoded first; writing the text itself fails on Python 3.
    filename = path + name + ".html"
    with open(filename, 'wb') as fp:
        fp.write(div.prettify().encode('utf-8'))

    # Parse the name.
    name_h3 = div.h3
    if name_h3:
        employee.name = name_h3.string.strip(' \t\n\r')
    else:
        print(name_h3)

    # Parse the title.
    dls = soup.dl
    if dls and len(dls) >= 1:
        print(dls)
        if dls.dt:
            employee.title = dls.dt.string
        # Parse every labelled line.
        for tag in dls.children:
            if not tag.string:
                continue
            text = tag.string.strip(' \t\n\r')
            if not text:
                continue
            # `attr`, not `name`, so the parameter is not shadowed.
            for attr, symbol in symbols.items():
                idx = text.find(symbol)
                if idx == -1:
                    continue
                if hasattr(employee, attr):
                    # The value is whatever follows the label.
                    setattr(employee, attr, text[idx + len(symbol):])
                else:
                    print("no attr %s in employee" % attr)
                break

    # Parse the profile link.
    teachcontent = soup.find_all("div", class_="teachcontent", limit=1)
    if teachcontent:
        link = teachcontent[0].a
        if link:
            employee.url = link['href']

    return employee
Esempio n. 38
0
def profile_handler(doc,name,url,path):
    """Extract an Employee from a profile page and archive the profile block.

    Args:
        doc: page markup handed to ``BeautifulSoup`` with
            ``Config.SOUP_PARSER``.
        name: employee name; used only for the output file name.
        url: unused here.
        path: string prefix of the output file (``path + name + ".html"``).

    Returns:
        None if no ``<td valign="top">`` exists.  Otherwise an Employee
        with the name from the block's ``h3``, the title and labelled
        values from the page's first ``dl``, and the url from the first
        link in ``div.teachcontent``.
    """
    # Employee attribute -> page label.  Available attributes: url, name,
    # email, tel, title, profile, research, departments, fax, addr.
    symbols = {
        'email': u'电子邮件:',
        'tel': u'办公电话:',
        'addr': u'办公地址:',
        'research':u'研究方向:'
    }

    soup = BeautifulSoup(doc, Config.SOUP_PARSER)
    divs = soup.find_all("td",attrs={"valign":"top"},limit=1)
    if not divs:
        return None

    div = divs[0]
    employee = Employee()

    # Save the file.  It is opened in binary mode, so encode the text
    # explicitly; the original wrote the text object directly.
    filename = path+name+".html"
    with open(filename,'wb') as fp:
        fp.write(div.prettify().encode('utf-8'))

    # Parse the name.
    name_h3 = div.h3
    if name_h3:
        employee.name = name_h3.string.strip(' \t\n\r')
    else:
        print(name_h3)

    # Parse the title.
    dls = soup.dl
    if dls and len(dls) >= 1:
        print(dls)
        if dls.dt:
            employee.title = dls.dt.string
        # Parse every labelled line.
        for tag in dls.children:
            if not tag.string:
                continue
            text = tag.string.strip(' \t\n\r')
            if not text:
                continue
            # Named `attr` so the `name` parameter stays intact.
            for attr, symbol in symbols.items():
                idx = text.find(symbol)
                if idx == -1:
                    continue
                if hasattr(employee, attr):
                    # Keep only the text after the label.
                    setattr(employee, attr, text[idx + len(symbol):])
                else:
                    print ("no attr %s in employee" % attr)
                break

    # Parse the profile link.
    teachcontent = soup.find_all("div",class_="teachcontent",limit=1)
    if teachcontent:
        link = teachcontent[0].a
        if link:
            employee.url = link['href']

    return employee