def get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    """Query the OSHA accident search and return the total record count.

    Builds the AccidentSearch URL for the given SIC code and date range,
    fetches the result page, and reads the total out of the
    "<start> - <end> of <total>" counter cell.
    """
    search_url = (
        'https://www.osha.gov/pls/imis/AccidentSearch.search?'
        + 'p_logger=1&acc_description=fall&acc_Abstract=&acc_keyword=&'
        + 'sic={sic}&'.format(sic=SICNo)
        + 'naics=&Office=All&officetype=All&'
        + 'endmonth={e_m}&'.format(e_m=e_m)
        + 'endday={e_d}&'.format(e_d=e_d)
        + 'endyear={e_y}&'.format(e_y=e_y)
        + 'startmonth={s_m}&'.format(s_m=s_m)
        + 'startday={s_d}&'.format(s_d=s_d)
        + 'startyear={s_y}&'.format(s_y=s_y)
        + 'InspNr='
    )
    brw = BrowserShadow()
    res = brw.open_url(search_url)
    page_soup = BeautifulSoup(res.read(), "html.parser")
    # Drill down to the right-aligned cell that carries the record counter.
    count_cell = (page_soup.select('#maincontain')[0]
                  .select('.row-fluid')[0]
                  .select('.row-fluid')[0]
                  .select('.span3')[0]
                  .select('.text-right'))
    count_str = count_cell[0].get_text()
    # Everything after the literal "of" is the total number of records.
    return int(count_str[count_str.find('of') + 2:])
def parse_page_obtian_event(page_soup):
    """Walk the result-table rows of a search page, fetch each accident's
    detail page, and forward the combined info to parse_accident_details.
    """
    rows = (page_soup.select('#maincontain')[0]
            .select('.row-fluid')[0]
            .select('.table-responsive')[1]
            .select('table')[0]
            .find_all('tr'))
    # rows[0] is the table header; data starts at index 1.
    for row in rows[1:]:
        cells = row.find_all('td')
        summary_link = cells[2].find('a')
        SummaryNr = summary_link.get_text()
        EventDate = cells[3].get_text()
        ReportID = cells[4].get_text()
        # An 'X' in this column marks a fatality; encode as 1/0.
        Fat = 1 if cells[5].get_text().find('X') > -1 else 0
        SIC = cells[6].find('a').get_text()
        EventDesc = cells[7].get_text()
        detail_url = 'https://www.osha.gov/pls/imis/' + summary_link['href']
        brw_d = BrowserShadow()
        res_d = brw_d.open_url(detail_url)
        detail_soup = BeautifulSoup(res_d.read(), "html.parser")
        # Parse the detail information about the event.
        abstract_info = {'SummaryNr': SummaryNr,
                         'EventDate': EventDate,
                         'ReportID': ReportID,
                         'Fat': str(Fat),
                         'SIC': SIC,
                         'EventDesc': EventDesc}
        parse_accident_details(detail_soup, abstract_info)
def get_article_content(article_info_list):
    """Fetch and print the content of the first article in the list.

    Parameters:
        article_info_list: list of dicts carrying an 'article_href' URL.

    NOTE(review): the loop breaks after the first article, so only one
    page is ever processed — presumably a work-in-progress stub; confirm
    before relying on it for more than one article.
    """
    for page_info in article_info_list:
        page_url = page_info['article_href']
        brw = BrowserShadow()
        res = brw.open_url(page_url)
        page_content = res.read()
        # FIX: name the parser explicitly. Omitting it makes the parser
        # choice depend on what happens to be installed (and emits a
        # warning); the rest of this file consistently uses "html.parser".
        page_soup = BeautifulSoup(page_content, "html.parser")
        article_content = page_soup.select('#article_content')
        print(article_content)
        break
def get_articles_list():
    """Prompt for a CSDN blog name and collect abstract info for every
    article across all pages of the blog's article list.

    Returns:
        list of dicts with keys 'article_title', 'article_href',
        'article_time'.
    """
    blog_name = input('Please input the name of the blog:')
    page_url_tpl = 'http://blog.csdn.net/%(blog_name)s/article/list/%(page_num)d'
    brw = BrowserShadow()
    res = brw.open_url(page_url_tpl % {'blog_name': blog_name, 'page_num': 1})
    page_content = res.read()
    # FIX: name the parser explicitly. Omitting it makes the parser choice
    # depend on what happens to be installed (and emits a warning); the
    # rest of this file consistently uses "html.parser".
    page_soup = BeautifulSoup(page_content, "html.parser")
    article_info_list = []
    # The total page count is embedded as '共N页' in the pager span.
    s_temp = str(page_soup.select('#papelist > span')[0])
    total_page_size = int(s_temp[s_temp.find('共') + 1:s_temp.find('页')])
    next_page = 1
    while 1:
        # Collect the abstract info of every article on the current page.
        for one_info in page_soup.find_all('div', 'list_item article_item'):
            s_temp = one_info.select('.article_title h1 a')
            article_title = (s_temp[0].get_text()
                             .replace('\r\n', '')
                             .replace('\n[置顶]', '')
                             .replace(' ', ''))
            article_href = 'http://blog.csdn.net/' + s_temp[0]['href']
            article_time = one_info.select('.article_manage > .link_postdate')[0].get_text()
            article_info_list.append({'article_title': article_title,
                                      'article_href': article_href,
                                      'article_time': article_time})
        next_page = next_page + 1
        if next_page > total_page_size:
            break
        # Fetch and parse the next list page.
        res = brw.open_url(page_url_tpl % {'blog_name': blog_name, 'page_num': next_page})
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")
    return article_info_list
def get_record_list(SICNo, s_d, s_m, s_y, e_d, e_m, e_y):
    """Page through the OSHA accident search results for a SIC code and
    date range, parsing each page of records via parse_page_obtian_event.

    Fetches up to 100 records per request until the total reported by
    get_record_num() has been processed.
    """
    record_num = get_record_num(SICNo, s_d, s_m, s_y, e_d, e_m, e_y)
    if record_num <= 0:
        print("No Eligible Record has been retrieved!")
        return
    p_finish = 0    # number of records already fetched (paging offset)
    p_show = 100    # page size requested from the server
    if record_num < p_show:
        p_show = record_num
    checked_num = 0
    while 1:
        # Shrink the final page so we never request past the last record.
        if (p_finish + p_show) > record_num:
            p_show = record_num - p_finish
        search_url = 'https://www.osha.gov/pls/imis/accidentsearch.search?'
        search_url += 'sic={sic}&'.format(sic=SICNo)
        search_url += 'sicgroup=&naics=&acc_description=fall&acc_abstract=&acc_keyword=&inspnr=&fatal=&officetype=All&office=All&'
        # BUG FIX: the date window was hard-coded
        # ('startmonth=07&startday=24&startyear=2015&endmonth=07&endday=23&endyear=1984'),
        # silently ignoring this function's date parameters — and with the
        # end year before the start year. Build it from the parameters so
        # the paged query matches the one get_record_num() counted.
        search_url += 'startmonth={s_m}&startday={s_d}&startyear={s_y}&'.format(s_m=s_m, s_d=s_d, s_y=s_y)
        search_url += 'endmonth={e_m}&endday={e_d}&endyear={e_y}&'.format(e_m=e_m, e_d=e_d, e_y=e_y)
        search_url += 'keyword_list=&p_start=&'
        search_url += 'p_finish={p_finish}&'.format(p_finish=p_finish)
        search_url += 'p_sort=&p_desc=DESC&p_direction=Next&'
        search_url += 'p_show={p_show}'.format(p_show=p_show)
        brw = BrowserShadow()
        res = brw.open_url(search_url)
        page_content = res.read()
        page_soup = BeautifulSoup(page_content, "html.parser")
        # Collect this page's records into the mysql database.
        parse_page_obtian_event(page_soup)
        checked_num = checked_num + p_show
        if checked_num == record_num:
            break
        p_finish = p_finish + p_show