def handle_third_page(urls, attrs):
    # Third-level handler; the 'svTitle'/'svAuthor' classes suggest ScienceDirect-style markup.
    for url in urls:
        soup = get_html_str(get_phantomjs_page(url))
        if soup is None:
            logger.info('Failed to fetch third-level page: ' + str(url))
            return None
        link = soup.find('link', attrs={'rel': 'canonical'})
        if link:
            link = link.get('href')
        else:
            logger.info('handle_third_page found no canonical link to follow: ' + str(url))
            return None
        soup = get_html_str(get_phantomjs_page(link))
        if soup is None:  # guard: the canonical page itself may fail to load
            logger.info('Failed to fetch canonical page: ' + str(link))
            return None
        # Collect the paper's descriptive metadata: title, authors, publication date, etc.
        data_dict = copy.deepcopy(attrs)  # deep-copy the shared attribute dict
        data_dict['url'] = link  # store the paper's real (canonical) URL
        h1 = soup.find('h1', class_='svTitle')
        if h1:
            data_dict['title'] = h1.get_text().strip()
        ul = soup.find('ul', class_='authorGroup noCollab svAuthor')
        if ul:
            a_list = ul.find_all_next('a', class_='authorName svAuthor')
            authors_dict = dict()
            for a in a_list:
                affiliation_dict = dict()
                affiliation_dict['affiliation'] = ''
                affiliation_dict['affiliation_name'] = ''
                affiliation_dict['affiliation_country'] = ''
                author_name = a.get_text().strip()
                author_name = re.sub(r'[\._$]', ' ', author_name)
                authors_dict[author_name] = affiliation_dict
            data_dict['author'] = authors_dict
        h2 = soup.find('h2', text=re.compile(r'Abstract'))
        if h2:
            p = h2.find_next_sibling('p')
            if p:  # guard against a missing abstract paragraph
                data_dict['abstract'] = p.get_text()
        h2 = soup.find('h2', text=re.compile(r'Keywords'))
        if h2:
            ul = h2.find_next_sibling('ul')
            if ul:  # guard against a missing keyword list
                keywords_list = ul.find_all_next('li', class_='svKeywords')
                keywords = list()
                for keyword in keywords_list:
                    keywords.append(keyword.get_text().strip())
                data_dict['keywords'] = keywords
        h2 = soup.find('h2', text=re.compile(r'References'))
        if h2:
            li_list = h2.find_all_next('li', class_='title')
            references = list()
            for li in li_list:
                references.append(li.get_text().strip())
            data_dict['reference'] = references
        write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
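# The helpers `get_phantomjs_page` and `get_html_str` are called throughout this file but
# are not defined in this section. Below is a minimal sketch of what they are assumed to
# do, built on selenium's (now-deprecated) PhantomJS driver and BeautifulSoup; the
# project's real implementations may differ.
from bs4 import BeautifulSoup
from selenium import webdriver


def get_phantomjs_page(url):
    """Render `url` in a headless PhantomJS browser and return the page source, or None."""
    try:
        driver = webdriver.PhantomJS()
        driver.get(url)
        html = driver.page_source
        driver.quit()
        return html
    except Exception:
        return None


def get_html_str(html):
    """Parse an HTML string into a BeautifulSoup tree; return None for empty input."""
    if not html:
        return None
    return BeautifulSoup(html, 'html.parser')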
def handle_second_page(urls):
    links = list()
    for url in urls:
        page_content = get_html_str(get_phantomjs_page(url))
        if page_content is None:
            logger.info('Failed to fetch second-level page: ' + str(url))
            return None
        ul = page_content.find('ul', class_='results')
        if ul is not None:
            divs = ul.find_all_next('div', class_='txt')
            for div in divs:
                temp = div.find_next('a', class_='art-abs-url')
                if temp is not None:
                    links.append('http://ieeexplore.ieee.org' + temp.get('href'))
        # Locate the pagination widget, read the total page count, and fetch each extra page
        pagination = page_content.find('div', class_='pagination')
        if pagination is not None:
            a_list = pagination.select('a[aria-label^="Pagination Page"]')
            if a_list:
                pageNumber = a_list[-1].get_text().strip()
                if pageNumber:
                    pageNumber = int(pageNumber)
                    url_list = list()
                    for number in range(2, pageNumber + 1):
                        url_list.append(url + '&pageNumber=' + str(number))
                    for tmp_url in url_list:
                        page_content = get_html_str(get_phantomjs_page(tmp_url))
                        if page_content is None:
                            logger.info('Failed to fetch second-level page: ' + str(tmp_url))
                            return None
                        ul = page_content.find('ul', class_='results')
                        if ul is not None:
                            divs = ul.find_all_next('div', class_='txt')
                            for div in divs:
                                temp = div.find_next('a', class_='art-abs-url')
                                if temp is not None:
                                    links.append('http://ieeexplore.ieee.org' + temp.get('href'))
        else:
            logger.info('No pagination markup found on second-level page: ' + str(url))
        time.sleep(get_random_uniform(begin=5.0, end=10.0))
    handle_third_page(links)  # process all collected third-level (paper detail) links
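# `get_random_uniform` is the crawl-throttling helper used by every handler. It is
# presumably a thin wrapper over random.uniform; a sketch under that assumption:
import random


def get_random_uniform(begin, end):
    """Return a random float in [begin, end], used as a politeness delay in seconds."""
    return random.uniform(begin, end)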
def handle_third_page(url, attrs):
    # Variant of handle_third_page; the 'affiliation__*' classes suggest Springer-style markup.
    soup = get_html_str(get_phantomjs_page(url))
    if soup is None:
        logger.info('soup is None: ' + str(url))
        return None
    # Collect the paper's descriptive metadata: title, authors, publication date, etc.
    data_dict = copy.deepcopy(attrs)  # deep-copy the shared attribute dict
    tmp_list = soup.find_all('li', class_='affiliation')
    affiliation_list = list()
    for tmp in tmp_list:
        affiliation_dict = dict()
        department = tmp.find_next('span', class_='affiliation__department')
        if department is not None:
            affiliation_dict['affiliation_department'] = department.get_text()
        name = tmp.find_next('span', class_='affiliation__name')
        if name is not None:
            affiliation_dict['affiliation_name'] = name.get_text()
        city = tmp.find_next('span', class_='affiliation__city')
        if city is not None:
            affiliation_dict['affiliation_city'] = city.get_text()
        country = tmp.find_next('span', class_='affiliation__country')
        if country is not None:
            affiliation_dict['affiliation_country'] = country.get_text()
        affiliation_list.append(affiliation_dict)
    tmp_list = soup.find_all('span', class_='authors-affiliations__name')
    author_dict = dict()
    for tmp in tmp_list:
        ul = tmp.find_next_sibling('ul')
        if ul is None:  # guard: no affiliation index next to this author
            continue
        idx = ul.find_next('li').get_text()  # 1-based index into affiliation_list
        author = re.sub(r'\xa0', ' ', tmp.get_text()).strip()
        author = re.sub(r'[\._$]', ' ', author)
        author_dict[author] = affiliation_list[int(idx) - 1]
    if author_dict:
        data_dict['author'] = author_dict
    h2 = soup.find('h2', text=re.compile(r'Abstract'))
    if h2:
        p = h2.find_next_sibling('p')
        if p:
            data_dict['abstract'] = p.get_text()
    h3 = soup.find('h3', text=re.compile(r'Keywords'))
    if h3:
        span_list = h3.find_next_siblings('span', class_='Keyword')
        keywords = list()
        for span in span_list:
            keywords.append(span.get_text())
        data_dict['keywords'] = keywords
    ol = soup.find('ol', class_='BibliographyWrapper')
    if ol:
        div_list = ol.find_all_next('div', class_='CitationContent')
        references = list()
        for div in div_list:
            references.append(div.get_text().strip())
        data_dict['references'] = references
    return data_dict
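# Unlike the loop-based handlers, this variant returns its data dict to the caller.
# A hypothetical caller (the name `handle_third_pages`, the delay range, and the use of
# `write_to_database` here are assumptions modeled on the other handlers) might look like:
import time


def handle_third_pages(urls, attrs):
    """Fetch each detail page, persist any parsed record, and throttle between requests."""
    for url in urls:
        data_dict = handle_third_page(url, attrs)
        if data_dict is not None:
            write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=2.0, end=10.0))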
def handle_first_page(url):
    # Fetch the first-level (entry) page
    page_content = get_html_str(get_phantomjs_page(url))
    if page_content is None:
        logger.info('Failed to fetch first-level page: ' + str(url))
        return None
    options = page_content.find('select', id='updatesDate')
    if options is not None:
        # IEEE content-update date, e.g. 20170206
        update_time = options.find_next('option')['value']
        # Read the previous update time from the database
        db = get_database_connect()
        collection = db.update_conf
        setting_dict = collection.find_one({'name': 'setting'})
        if setting_dict is not None:
            last_update_time = setting_dict['last_update_time']
        else:
            last_update_time = '20170101'
            collection.insert_one({
                'name': 'setting',
                'last_update_time': last_update_time,
                'this_update_time': last_update_time
            })
        if update_time > last_update_time:
            # IEEE content has been updated but this batch has not been crawled locally yet
            collection.update_one({'name': 'setting'}, {
                '$set': {
                    'this_update_time': update_time,
                    'status': 'unfinished'
                }
            })
            ul = page_content.find('ul', class_='Browsing')
            if ul is not None:
                lis = ul.find_all_next('li', class_='noAbstract')
                urls = list(
                    map(lambda li: 'http://ieeexplore.ieee.org/xpl/' + li.find_next('a').get('href'),
                        lis))
                handle_second_page(urls)
            # This crawl is complete; persist the new update date to the database
            collection.update_one({'name': 'setting'}, {
                '$set': {
                    'last_update_time': update_time,
                    'status': 'finished'
                }
            })
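# `get_database_connect` is assumed to return a pymongo database handle (the code above
# reads `db.update_conf` from it). A minimal sketch; host, port, and database name are
# placeholders, not the project's real configuration:
from pymongo import MongoClient


def get_database_connect():
    """Connect to a local MongoDB instance and return the crawler's database."""
    client = MongoClient('localhost', 27017)
    return client['paper_spider']  # hypothetical database name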
def handle_third_page(url, attrs):
    # Variant of handle_third_page for IEEE Xplore-style markup ('ng-binding' classes).
    soup = get_html_str(get_phantomjs_page(url))
    if soup is None:
        logger.info('soup is None: ' + str(url))
        return None
    # Collect the paper's descriptive metadata: title, authors, publication date, etc.
    data_dict = copy.deepcopy(attrs)  # deep-copy the shared attribute dict
    tmp_list = soup.select('div[class="authors-info-container"] > span > span > a')
    authors_dict = dict()
    for tmp in tmp_list:
        affiliation_dict = dict()
        author_name = tmp.find_next('span')
        if author_name is not None:
            author_name = re.sub(r'[\._$]', ' ', author_name.get_text())
            institute = tmp.get('qtip-text')
            if institute is not None:
                institute = re.sub(r'amp;', '', institute)  # drop the 'amp;' leftover of an escaped '&amp;'
                data_list = re.split(r',', institute)
                affiliation_dict['affiliation'] = institute
                affiliation_dict['affiliation_name'] = data_list[0]
                affiliation_dict['affiliation_country'] = data_list[-1]
                authors_dict[author_name] = affiliation_dict
            else:
                authors_dict[author_name] = dict()
    data_dict['author'] = authors_dict
    # Collect the paper's abstract
    abstract = soup.find('div', class_='abstract-text ng-binding')
    if abstract is not None:
        data_dict['abstract'] = abstract.get_text()
    # Collect the paper's keywords
    ul = soup.find('ul', class_='doc-all-keywords-list')
    if ul is not None:
        spans = ul.find_all_next('span')
        keywords = list()
        for span in spans:
            temp = span.find_next('a', class_='ng-binding')
            if temp is not None:
                keywords.append(temp.get_text().strip())
        data_dict['keywords'] = keywords
    # Collect the paper's references
    page_content = get_html_str(get_phantomjs_page(url + 'references?ctx=references'))
    if page_content is not None:
        h2 = page_content.find('h2', text='References')
        if h2 is not None:
            divs = h2.find_next_siblings('div', class_='reference-container ng-scope')
            references = list()
            for div in divs:
                div_temp = div.find_next('div', class_='description ng-binding')
                if div_temp:
                    references.append(div_temp.get_text().strip())
            data_dict['references'] = references
    # Collect the papers that cite this one
    page_content = get_html_str(
        get_phantomjs_page(url + 'citations?anchor=anchor-paper-citations-ieee&ctx=citations'))
    if page_content is not None:
        citations = list()
        # Cited in Papers - IEEE
        h2 = page_content.find('h2', text=re.compile(r'Cited in Papers - IEEE'))
        if h2 is not None:
            divs = h2.find_next_siblings('div', class_='ng-scope')
            for div in divs:
                div_temp = div.find_next('div', class_='description ng-binding')
                if div_temp:
                    citations.append(div_temp.get_text().strip())
        # Cited in Papers - Other Publishers
        h2 = page_content.find('h2', text=re.compile(r'Cited in Papers - Other Publishers'))
        if h2 is not None:
            divs = h2.find_next_siblings('div', class_='ng-scope')
            for div in divs:
                div_temp = div.find_next('div', class_='description ng-binding')
                if div_temp:
                    citations.append(div_temp.get_text().strip())
        data_dict['citations'] = citations
    return data_dict  # return the data dict (category, level, author info, etc.)
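# `write_to_database` persists a finished record. A sketch assuming the pymongo handle
# from `get_database_connect` above, upserting on the paper URL so re-crawls do not
# create duplicate documents (the `papers` collection name is a guess):
def write_to_database(data_dict):
    """Upsert one paper record keyed by its URL."""
    db = get_database_connect()
    db.papers.update_one({'url': data_dict.get('url')},
                         {'$set': data_dict},
                         upsert=True)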
def handle_third_page(urls):
    for url in urls:
        data_dict = dict()
        page_content = get_html_str(get_phantomjs_page(url))
        if page_content is None:
            logger.info('Failed to fetch paper detail page! ' + url)
            continue
        # Paper URL
        data_dict['url'] = url
        # Paper type
        data_dict['category'] = 'conference'
        # Date IEEE updated the paper
        data_dict['update_time'] = time.strftime('%Y%m%d', time.localtime())
        # Crawl timestamp
        data_dict['spider_time'] = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
        # Paper title (drop the fixed-length site suffix from the <title> tag)
        if page_content.title is not None:
            data_dict['title'] = page_content.title.string[:-22].strip()
        # Collect the paper's abstract
        abstract = page_content.find('div', class_='abstract-text ng-binding')
        if abstract is not None:
            data_dict['abstract'] = abstract.get_text()
        # Collect the publication date
        date = page_content.find('strong', text='Date of Publication:')
        if date is not None:
            div = date.find_parent('div')
            if div is not None:
                date = re.split(r':', div.get_text())[-1].strip()
                data_dict['publication_date'] = date
        # Collect the paper's keywords
        ul = page_content.find('ul', class_='doc-all-keywords-list')
        if ul is not None:
            spans = ul.find_all_next('span')
            keywords = list()
            for span in spans:
                temp = span.find_next('a', class_='ng-binding')
                if temp is not None:
                    keywords.append(temp.get_text().strip())
            data_dict['keywords'] = keywords
        # Collect the paper's author info
        h2 = page_content.find('h2', text='Authors')
        if h2 is not None:
            div = h2.find_next_sibling('div', class_='ng-scope')
            if div is not None:
                temp = div.select('a[href^="/search/searchresult.jsp?searchWithin="]')
                if temp:  # select() returns a list; check for emptiness, not None
                    authors_dict = dict()  # map each author name to affiliation info
                    for a in temp:
                        affiliation_dict = dict()
                        span = a.find_next('span', class_='ng-binding')
                        if span is not None:
                            author_name = span.get_text().strip()
                            author_name = re.sub(r'[._$]', ' ', author_name)
                            tmp = a.parent.find_next_sibling('div', class_='ng-binding')
                            if tmp is not None:
                                affiliation = tmp.get_text().strip()
                                data_list = re.split(r',', affiliation)
                                affiliation_dict['affiliation'] = affiliation
                                if data_list:  # re.split always returns a list; guard on emptiness
                                    affiliation_dict['affiliation_country'] = data_list[-1]
                            authors_dict[author_name] = affiliation_dict
                    data_dict['author'] = authors_dict
        # Collect the paper's references
        page_content = get_html_str(get_phantomjs_page(url + 'references?ctx=references'))
        if page_content is not None:
            h2 = page_content.find('h2', text='References')
            if h2 is not None:
                divs = h2.find_next_siblings('div', class_='reference-container ng-scope')
                references = list()
                for div in divs:
                    div_temp = div.find_next('div', class_='description ng-binding')
                    if div_temp:
                        references.append(div_temp.get_text().strip())
                data_dict['references'] = references
        # Collect the papers that cite this one
        page_content = get_html_str(
            get_phantomjs_page(url + 'citations?anchor=anchor-paper-citations-ieee&ctx=citations'))
        if page_content is not None:
            citations = list()
            # Cited in Papers - IEEE
            h2 = page_content.find('h2', text=re.compile(r'Cited in Papers - IEEE'))
            if h2 is not None:
                divs = h2.find_next_siblings('div', class_='ng-scope')
                for div in divs:
                    div_temp = div.find_next('div', class_='description ng-binding')
                    if div_temp:
                        citations.append(div_temp.get_text().strip())
            # Cited in Papers - Other Publishers
            h2 = page_content.find('h2', text=re.compile(r'Cited in Papers - Other Publishers'))
            if h2 is not None:
                divs = h2.find_next_siblings('div', class_='ng-scope')
                for div in divs:
                    div_temp = div.find_next('div', class_='description ng-binding')
                    if div_temp:
                        citations.append(div_temp.get_text().strip())
            data_dict['citations'] = citations
        write_to_database(data_dict)
        time.sleep(get_random_uniform(begin=1.0, end=20.0))
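# A hypothetical entry point tying the levels together: the crawl starts from an IEEE
# Xplore browse page, and handle_first_page cascades down through the second- and
# third-level handlers. The URL below is illustrative only, not taken from the project:
if __name__ == '__main__':
    start_url = 'http://ieeexplore.ieee.org/browse/conferences/title/'  # placeholder
    handle_first_page(start_url)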