def handle_third_page(url, attrs):
    soup = get_html_text(url)
    if soup is None:
        logger.info('soup is None: ' + str(url))
        return None
    data_dict = copy.deepcopy(attrs)  # deep-copy the attrs dict
    # Collect the paper's description: title, authors, publication date, keywords, etc.
    paper_id = re.split(r'\.', url)[-1].strip()
    bib_url = 'http://dl.acm.org/exportformats.cfm?id=' + paper_id + '&expformat=bibtex'
    page = get_html_text(bib_url)
    if page:
        temp = page.find('pre')
        if temp:
            content = temp.get_text()
            filepath = root_dir + paper_id
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(content)
                f.flush()
            with open(filepath, 'r') as f:
                for line in f:
                    if 'keywords' in line:
                        temp = re.split(r'[{}]', line)[-2]
                        data_dict['keywords'] = re.split(r',', temp)
    # Extract the download URL of the paper's PDF
    pdf_url = soup.find('a', attrs={'name': 'FullTextPDF'})
    if pdf_url is not None:
        pdf_url = 'http://dl.acm.org/' + pdf_url.get('href').strip()
        data_dict['pdf_url'] = pdf_url
    authors = soup.find_all('a', attrs={'title': 'Author Profile Page'})
    if authors:
        authors_dict = {}
        for tmp in authors:
            temp = tmp.find_next('a', attrs={'title': 'Institutional Profile Page'})
            if temp is not None:
                institute = temp.find_next('small')
                if institute is not None:
                    affiliation_dict = dict()
                    # Keys prefixed with '.', '_' or '$' are reserved in MongoDB
                    author_name = re.sub(r'[\._$]', ' ', tmp.get_text().strip())
                    institute = institute.get_text().strip()
                    data_list = re.split(r',', institute)
                    affiliation_dict['affiliation'] = institute
                    affiliation_dict['affiliation_name'] = data_list[0]
                    if len(data_list) != 1:
                        affiliation_dict['affiliation_country'] = data_list[-1]
                    authors_dict[author_name] = affiliation_dict
        data_dict['author'] = authors_dict
        return data_dict  # return the data dict (category, rank, author info)
    else:
        logger.info('Third-level page: no paper description found: ' + str(url))
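# The crawler functions in this section all rely on a shared get_html_text
# helper, a get_random_uniform throttling helper, a module-level logger, and
# a root_dir output path, none of which are defined here. Below is a minimal
# sketch of what they might look like, assuming requests + BeautifulSoup and
# a parse tree (or None on failure) as the return value; the names and
# defaults are illustrative, not the original implementation.
import copy
import logging
import random
import re
import time

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)
root_dir = './data/'  # hypothetical output directory


def get_html_text(url):
    # Fetch a page and return its parsed tree; the callers in this section
    # rely on None being returned when the request fails.
    try:
        resp = requests.get(url, timeout=30,
                            headers={'User-Agent': 'Mozilla/5.0'})
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return BeautifulSoup(resp.text, 'html.parser')
    except requests.RequestException as exc:
        logger.error('request failed: %s (%s)', url, exc)
        return None


def get_random_uniform(begin, end):
    # Random sleep length used to throttle consecutive requests.
    return random.uniform(begin, end)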
def handle_second_page(url, attrs):
    # Fetch the second-level page
    soup = get_html_text(url)
    if soup is None:
        return None
    # Prefer the DOI link
    raw_links = soup.find_all(text=re.compile(r'electronic edition via DOI'))
    if len(raw_links) == 0:
        # No DOI link found, fall back to the 'electronic edition @' links
        raw_links = soup.find_all(text=re.compile(r'electronic edition @'))
    links = [tmp.find_parent('a') for tmp in raw_links]
    if not links:
        logger.info('Second-level page: no electronic edition link found: ' + str(url))
        return None
    for raw_url in links:
        paper_dict = handle_third_page(raw_url.get('href'), attrs)
        tmp = raw_url.find_parent('li', class_='drop-down')
        if tmp is not None:
            temp = tmp.find_next_sibling('li', class_='drop-down')
            if temp is not None:
                raw_ris = temp.select_one(
                    'div[class="body"] > ul:nth-of-type(1) > li:nth-of-type(2) > a'
                )
                if raw_ris is not None:
                    download_paper_info(raw_ris.get('href'), root_dir, paper_dict)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
def download_paper_info(url, root_dir, attrs):
    filename = re.split(r'/', url)[-1]
    page_content = get_html_text(url)
    if page_content is None:
        logger.error('download_paper_info failed: ' + str(url))
        return None
    data = page_content.get_text()
    if data is not None:
        # Save the data to a local file so it can be re-read and written to the database
        filepath = root_dir + filename
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(data)
            f.flush()
        write_to_database(filepath, attrs)
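# download_paper_info hands the saved file to write_to_database, which is not
# shown in this section. A minimal sketch, assuming the file is the RIS export
# fetched above and the target is a local MongoDB via pymongo (suggested by
# the reserved-key comments in handle_third_page); the database, collection,
# and field mapping are assumptions, not the original code.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)  # assumed local MongoDB instance
collection = client['papers']['records']  # hypothetical db/collection names


def write_to_database(filepath, attrs):
    # Parse 'TAG  - value' lines from the RIS file, merge them with the
    # attributes gathered while crawling, and insert one document per paper.
    doc = dict(attrs)
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split('  - ', 1)
            if len(parts) == 2:
                tag, value = parts[0].strip(), parts[1].strip()
                doc.setdefault(tag, value)
    collection.insert_one(doc)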
def handle_second_page(url, attrs):
    # Fetch the second-level page (alternative version: collects the hrefs
    # and hands the whole list to handle_third_page)
    soup = get_html_text(url)
    if soup is None:
        return None
    # Prefer the DOI link
    raw_links = soup.find_all(text=re.compile(r'electronic edition via DOI'))
    if len(raw_links) == 0:
        # No DOI link found, fall back to the 'electronic edition @' links
        raw_links = soup.find_all(text=re.compile(r'electronic edition @'))
    links = [tmp.find_parent('a').get('href') for tmp in raw_links]
    if not links:
        logger.info('Second-level page: no electronic edition link found: ' + str(url))
        return None
    handle_third_page(links, attrs)
def handle_first_page(url, attrs):
    # Fetch the first-level page
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('Cannot fetch first-level page: ' + str(url))
        return None
    raw_links = page_content.find_all('a', text='[contents]')
    if raw_links:  # conference proceedings
        links = map(lambda raw_link: raw_link.get('href'), raw_links)
    else:  # journals
        raw_links = page_content.find_all('a', text=re.compile(r'Volume'))
        links = map(lambda raw_link: raw_link.get('href'), raw_links)
    for url in links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
def handle_first_page(url, attrs):
    # Fetch the first-level page (alternative version: walks the DBLP journal index)
    page_content = get_html_text(url)
    if page_content is None:
        logger.info('Cannot fetch first-level page: ' + str(url))
        return None
    raw_links = list()
    li_list = page_content.select(
        'a[href^="http://dblp.uni-trier.de/db/journals/"]')
    for li in li_list:
        temp = li.get('href')
        # Skip the bare index URL itself; keep only concrete journal pages
        if temp != 'http://dblp.uni-trier.de/db/journals/':
            raw_links.append(temp)
    for url in raw_links:
        handle_second_page(url, attrs)
        time.sleep(get_random_uniform(begin=2.0, end=60.0))
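# Neither handle_first_page variant shows how a crawl is started. A minimal,
# hypothetical entry point: the attrs dict is deep-copied into every paper
# record by handle_third_page, so it is seeded with the static fields. The
# 'category' and 'rank' keys and the start URL are illustrative guesses based
# on the "(category, rank, author info)" comment, not the original driver.
if __name__ == '__main__':
    attrs = {'category': 'AI', 'rank': 'A'}  # illustrative values only
    handle_first_page('http://dblp.uni-trier.de/db/journals/', attrs)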
def handle_third_page(url, attrs):
    soup = get_html_text(url)
    if soup is None:
        logger.info('soup is None: ' + str(url))
        return None
    # Collect the paper's description: title, authors, publication date, etc.
    data_dict = copy.deepcopy(attrs)  # deep-copy the attrs dict
    tmp = soup.find('div', class_='field-label')
    if tmp is not None:
        tmp = soup.find('div', class_='field-item odd').find_next('p')
    tmp_list = list()
    if tmp is not None:
        i = 0
        for child in tmp.children:
            i += 1
            if (i % 2) != 0:
                tmp_list.append(child)  # author names
            else:
                child = child.get_text().strip()
                tmp_list.append(child.strip(';'))  # institution name
    authors_dict = dict()
    for n in range(0, len(tmp_list), 2):
        affiliation_dict = dict()
        affiliation_dict['affiliation_name'] = tmp_list[n + 1]
        author_list = re.split(r'(?:and|,)\s*', tmp_list[n])[:-1]
        for author in author_list:
            if (author != '') and (author != ','):
                # Keys prefixed with '.', '_' or '$' are reserved in MongoDB
                author = re.sub(r'[\.$_]', ' ', author.strip())
                authors_dict[author] = affiliation_dict
    data_dict['author'] = authors_dict
    div = soup.find('div', text=re.compile(r'Abstract'))
    if div:
        div = div.find_next_sibling('div', class_='field-items')
        if div:
            data_dict['abstract'] = div.get_text()
    return data_dict  # return the data dict (category, rank, author info)
import os

import requests
from bs4 import BeautifulSoup

import common


def get_image_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    imgs = soup('img', class_='card-img-top round-0')
    image_links = []
    for img in imgs:
        # This site keeps the image URL in a custom 'source' attribute
        image_link = img.get('source')
        image_links.append(image_link)
    return image_links


def save_image(image_link):
    content = requests.get(image_link, timeout=10).content
    root = 'g://img//timeroute//'
    if not os.path.exists(root):
        os.makedirs(root)
    path = root + image_link.split('/')[-1]
    if not os.path.exists(path):
        with open(path, 'wb') as f:
            f.write(content)
        print(path + ' saved successfully.')
    else:
        print(path + ' already exists!')


for i in range(0, 805):
    url = 'http://timeroute.cn/desktop/page/%d' % i
    print(url)
    html = common.get_html_text(url)
    image_links = get_image_links(html)
    for image_link in image_links:
        save_image(image_link)
            with open(path, 'wb') as f:
                for content in contents:
                    f.write(bytearray(content, 'utf8'))
            print(path + ' saved successfully!')
        else:
            print(title + '.txt already exists.')
    else:
        # No usable content: save an empty placeholder file instead
        path = root + 'black.txt'
        print(path)
        if not os.path.exists(root):
            os.makedirs(root)
        if not os.path.exists(path):
            with open(path, 'wb') as f:
                f.write(bytearray('', 'utf8'))


html = common.get_html_text('http://cl.b8y.xyz/thread0806.php?fid=21')
article_links = get_article_links(html)
for article_link in article_links:
    print(article_link)
    html_sub = common.get_html_text(article_link)
    save_article(html_sub)
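# get_article_links is called above but never defined in this section; it
# would need to sit before the crawl loop in a real script. A hypothetical
# sketch that collects thread links from the forum listing page: the
# 'htm_data' href prefix and the base-URL handling are guesses about the
# forum's URL scheme, not something confirmed by the original code.
from urllib.parse import urljoin

from bs4 import BeautifulSoup


def get_article_links(html):
    soup = BeautifulSoup(html, 'html.parser')
    links = []
    for a in soup.find_all('a', href=True):
        # Assumed: thread pages live under an 'htm_data/' path
        if a['href'].startswith('htm_data'):
            links.append(urljoin('http://cl.b8y.xyz/', a['href']))
    return links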