def __get_article(line):
    """Fetch and hand off the article a listing line points at.

    Extracts the (href, title) pair from *line*; hrefs that do not
    contain 'htm_data' are not article pages and are skipped silently.
    """
    href, title = __get_href_and_title(line)
    if 'htm_data' not in href:
        return
    page = http.fetch(http.DOMAIN + href)
    __get_content(page, title)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" domain_crawler

Reads district page paths from ../district/district.txt, fetches each
district page, and appends every link found inside the
'shangQuancontain' element to domain.txt (one href per line).
"""
__author__ = 'shiyu.feng'
import common.http_util as http
from bs4 import BeautifulSoup
import re

# Context managers close both files even when a fetch raises.  The
# output file is opened once, instead of being re-opened in 'a+' mode
# (and leaked) on every iteration as before.
with open('../district/district.txt', 'r') as district, \
        open('domain.txt', 'a+') as domain:
    for url_part in district:
        # strip(): lines read from a file keep their trailing newline,
        # which previously ended up embedded in the fetched URL
        url = http.DOMAIN + url_part.strip()
        print('fetching %s' % url)
        html = http.fetch(url)
        soup = BeautifulSoup(html, 'html.parser')
        children = soup.find(id='shangQuancontain').find_all('a')
        try:
            # single join + single write per page, instead of a
            # quadratic '+='-in-a-loop string build
            text = '\n'.join(child.get('href') for child in children) + '\n'
            domain.write(text)
        except Exception as e:
            # narrowed from BaseException, which also swallowed
            # KeyboardInterrupt/SystemExit; log and keep crawling
            print(e)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" article list crawler

Discovers how many listing pages forum fid=20 has, then walks every
page and appends the raw <h3> article entries to articles.txt.
"""
__author__ = 'shiyu.feng'
import common.http_util as http
from bs4 import BeautifulSoup
import re

PORN_HOME_PAGE_URL = http.DOMAIN + 'thread0806.php'

# The element two siblings before the 'last' pager button carries an
# <input> whose value looks like "current/total"; the part after the
# slash is the total page count to crawl.
url = PORN_HOME_PAGE_URL + '?fid=20&search=&page=1'
soup = BeautifulSoup(http.fetch(url), 'html.parser')
page_button = soup.find(
    id='last').find_previous_sibling().find_previous_sibling()
button_value = page_button.input['value']
# str.split suffices for a fixed one-character separator; no regex needed
max_page_number = int(button_value.split('/')[1])

# The context manager guarantees the file is closed exactly once, after
# the whole crawl, even if a fetch raises mid-loop.
with open('articles.txt', 'a+') as articles:
    for index in range(1, max_page_number + 1):
        print('current page is %d' % index)
        url = PORN_HOME_PAGE_URL + '?fid=20&search=&page=' + str(index)
        soup = BeautifulSoup(http.fetch(url), 'html.parser')
        items = soup.find_all('h3')
        # NOTE(review): consecutive pages are appended back-to-back with
        # no separator between them, matching the original output format.
        articles.write('\n'.join(str(tag) for tag in items))
def __get_image(line):
    """Fetch and hand off the post a listing line points at.

    Parses *line* into its (href, title) pair; only hrefs containing
    'htm_data' are real post pages — anything else is ignored.
    """
    href, title = __get_href_and_title(line)
    if 'htm_data' not in href:
        return
    page_html = http.fetch(http.DOMAIN + href)
    __get_content(page_html, title)