Example #1
from bs4 import BeautifulSoup


def get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    # Build the OSHA accident-search URL for the given SIC code and date range.
    search_url = (
        'https://www.osha.gov/pls/imis/AccidentSearch.search?'
        'p_logger=1&acc_description=fall&acc_Abstract=&acc_keyword=&'
        'sic={sic}&naics=&Office=All&officetype=All&'
        'endmonth={e_m}&endday={e_d}&endyear={e_y}&'
        'startmonth={s_m}&startday={s_d}&startyear={s_y}&'
        'InspNr='
    ).format(sic=SICNo, e_m=e_m, e_d=e_d, e_y=e_y, s_m=s_m, s_d=s_d, s_y=s_y)

    brw = BrowserShadow()
    res = brw.open_url(search_url)
    page_content = res.read()
    page_soup = BeautifulSoup(page_content, "html.parser")

    # The result count appears as "... of N" in a right-aligned cell;
    # take everything after 'of' and convert it to an integer.
    count_str_cell = (page_soup.select('#maincontain')[0]
                      .select('.row-fluid')[0]
                      .select('.row-fluid')[0]
                      .select('.span3')[0]
                      .select('.text-right'))
    count_str = count_str_cell[0].get_text()
    record_num = int(count_str[count_str.find('of') + 2:])

    return record_num
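
These examples rely on a project-local BrowserShadow helper whose definition is not shown. Judging from how it is used (open_url() returns a response object with a read() method), a minimal sketch, assuming it is just a urllib wrapper that sends a browser-like User-Agent, might look like this:

import urllib.request

class BrowserShadow:
    # Hypothetical stand-in for the project's BrowserShadow helper.
    def open_url(self, url):
        # Return a file-like response whose read() yields the page bytes.
        req = urllib.request.Request(url,
                                     headers={'User-Agent': 'Mozilla/5.0'})
        return urllib.request.urlopen(req)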
Example #2
from bs4 import BeautifulSoup


def parse_page_obtain_event(page_soup):
    # The accident records sit in the second responsive table on the page,
    # one record per table row.
    event_content = (page_soup.select('#maincontain')[0]
                     .select('.row-fluid')[0]
                     .select('.table-responsive')[1]
                     .select('table')[0]
                     .find_all('tr'))

    # Skip the header row at index 0.
    for index in range(1, len(event_content)):
        item_list = event_content[index].find_all('td')
        SummaryNr = item_list[2].find('a').get_text()
        EventDate = item_list[3].get_text()
        ReportID = item_list[4].get_text()
        # The fatality column is marked with an 'X'; store it as 0/1.
        Fat = item_list[5].get_text()
        if Fat.find('X') > -1:
            Fat = 1
        else:
            Fat = 0

        SIC = item_list[6].find('a').get_text()
        EventDesc = item_list[7].get_text()

        # Follow the summary-number link to the accident detail page.
        accident_url_str = item_list[2].find('a')['href']
        accident_url_str = 'https://www.osha.gov/pls/imis/' + accident_url_str

        brw_d = BrowserShadow()
        res_d = brw_d.open_url(accident_url_str)
        page_content_d = res_d.read()
        page_soup_d = BeautifulSoup(page_content_d, "html.parser")

        # Parse the detailed information about the event.
        abstract_info = {'SummaryNr': SummaryNr, 'EventDate': EventDate,
                         'ReportID': ReportID, 'Fat': str(Fat),
                         'SIC': SIC, 'EventDesc': EventDesc}
        parse_accident_details(page_soup_d, abstract_info)
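
parse_accident_details is likewise project-local and not shown; the comment in Example #5 suggests it writes each record to a MySQL database. A hypothetical stand-in that simply echoes the scraped fields is enough to exercise the code above:

def parse_accident_details(page_soup_d, abstract_info):
    # Hypothetical placeholder; the real function is said to store the
    # record (and details parsed from page_soup_d) in a MySQL database.
    print(', '.join('{}={}'.format(k, v) for k, v in abstract_info.items()))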
Example #3
from bs4 import BeautifulSoup


def get_article_content(article_info_list):
    """Get the content of the articles."""
    for page_info in article_info_list:
        page_url = page_info['article_href']
        brw = BrowserShadow()
        res = brw.open_url(page_url)
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")
        article_content = page_soup.select('#article_content')
        print(article_content)
        break  # debugging aid: stop after the first article
Example #4
from bs4 import BeautifulSoup


def get_articles_list():
    """Get the list of articles on a CSDN blog."""
    blog_name = input('Please input the name of the blog:')
    content_list_page_url = ('http://blog.csdn.net/%(blog_name)s/article/list/1'
                             % {'blog_name': blog_name})

    # page_content is raw bytes; BeautifulSoup handles the decoding.
    brw = BrowserShadow()
    res = brw.open_url(content_list_page_url)
    page_content = res.read()
    page_soup = BeautifulSoup(page_content, "html.parser")
    article_info_list = []

    # Get the total number of pages from the pager text, which reads
    # "共N页" ("N pages in total").
    articles_num = page_soup.select('#papelist > span')
    s_temp = str(articles_num[0])
    total_page_size = int(s_temp[s_temp.find('共') + 1 : s_temp.find('页')])

    next_page = 1
    while True:
        # Collect the article entries on the current page.
        one_list = page_soup.find_all('div', 'list_item article_item')

        for one_info in one_list:
            # Extract the abstract information of each article; strip line
            # breaks, the "[置顶]" ("pinned") marker, and indentation from
            # the title.
            s_temp = one_info.select('.article_title h1 a')
            article_title = (s_temp[0].get_text()
                             .replace('\r\n', '')
                             .replace('\n[置顶]', '')
                             .replace('    ', ''))

            article_href = 'http://blog.csdn.net/' + s_temp[0]['href']
            article_time = one_info.select('.article_manage > .link_postdate')[0].get_text()

            page_info = {'article_title': article_title,
                         'article_href': article_href,
                         'article_time': article_time}
            article_info_list.append(page_info)

        next_page = next_page + 1
        next_page_url = ('http://blog.csdn.net/%(blog_name)s/article/list/%(page_num)d'
                         % {'blog_name': blog_name, 'page_num': next_page})

        # Fetch the soup of the next page, or stop after the last one.
        if next_page > total_page_size:
            break
        else:
            res = brw.open_url(next_page_url)
            page_content = res.read()
            page_soup = BeautifulSoup(page_content, "html.parser")

    return article_info_list
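
Examples #3 and #4 are meant to be used together; a minimal driver, assuming both functions are in scope:

# Fetch the article list for a blog, then pull each article's content.
articles = get_articles_list()
get_article_content(articles)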
Example #5
from bs4 import BeautifulSoup


def get_record_list(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):

    record_num = get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y)

    if record_num <= 0:
        print("No eligible record has been retrieved!")
        return

    # Page through the results p_show records at a time; p_finish is the
    # number of records already consumed.
    p_finish = 0
    p_show = 100
    if record_num < p_show:
        p_show = record_num

    checked_num = 0
    while True:

        # Shrink the final page so we never request past the last record.
        if (p_finish + p_show) > record_num:
            p_show = record_num - p_finish

        search_url = (
            'https://www.osha.gov/pls/imis/accidentsearch.search?'
            'sic={sic}&'
            'sicgroup=&naics=&acc_description=fall&acc_abstract=&acc_keyword=&inspnr=&fatal=&officetype=All&office=All&'
            'startmonth={s_m}&startday={s_d}&startyear={s_y}&'
            'endmonth={e_m}&endday={e_d}&endyear={e_y}&keyword_list=&p_start=&'
            'p_finish={p_finish}&'
            'p_sort=&p_desc=DESC&p_direction=Next&'
            'p_show={p_show}'
        ).format(sic=SICNo, s_m=s_m, s_d=s_d, s_y=s_y,
                 e_m=e_m, e_d=e_d, e_y=e_y,
                 p_finish=p_finish, p_show=p_show)

        brw = BrowserShadow()
        res = brw.open_url(search_url)
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")

        # Parse the records on this page and collect them into the MySQL
        # database.
        parse_page_obtain_event(page_soup)

        checked_num = checked_num + p_show
        if checked_num == record_num:
            break
        else:
            p_finish = p_finish + p_show
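
A hypothetical invocation, using SIC code 1794 (excavation work) and an arbitrary date range; days, months, and years are passed as separate strings:

# Scrape all matching fall-accident records for SIC 1794
# between 2010-01-01 and 2014-12-31.
get_record_list('1794', '01', '01', '2010', '31', '12', '2014')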