# Beispiel #1
# 0
def __get_article(line):
    """Resolve the article link in *line* and hand the page to the parser.

    Only rows whose href points into 'htm_data' are real article links;
    anything else is silently skipped.
    """
    link, name = __get_href_and_title(line)
    if 'htm_data' not in link:
        return
    page = http.fetch(http.DOMAIN + link)
    __get_content(page, name)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" domain_crawler """

__author__ = 'shiyu.feng'

import common.http_util as http
from bs4 import BeautifulSoup
import re

district = open('../district/district.txt', 'r')
for url_part in district.readlines():
    url = http.DOMAIN + url_part
    print 'fetching %s' % url
    html = http.fetch(url)
    soup = BeautifulSoup(html, 'html.parser')
    children = soup.find(id='shangQuancontain').find_all('a')
    try:
        domain = open('domain.txt', 'a+')
        text = ''
        for child in children:
            text += child.get('href') + '\n'
        domain.write(text)
    except BaseException as e:
        print e
# Beispiel #3
# 0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" article list crawler """

__author__ = 'shiyu.feng'

import common.http_util as http
from bs4 import BeautifulSoup
import re

PORN_HOME_PAGE_URL = http.DOMAIN + 'thread0806.php'

url = PORN_HOME_PAGE_URL + '?fid=20&search=&page=1'
soup = BeautifulSoup(http.fetch(url), 'html.parser')
page_button = soup.find(
    id='last').find_previous_sibling().find_previous_sibling()
button_value = page_button.input['value']
max_page_number = int(re.split('/', button_value)[1])

articles = open('articles.txt', 'a+')
for index in range(1, max_page_number + 1):
    print 'current page is %d' % index
    url = PORN_HOME_PAGE_URL + '?fid=20&search=&page=' + str(index)
    soup = BeautifulSoup(http.fetch(url), 'html.parser')
    items = soup.find_all('h3')
    text = '\n'.join(str(tag) for tag in items)
    articles.write(text)

articles.close()
# Beispiel #4
# 0
def __get_image(line):
    """Extract the link from *line* and, if it is an article link
    (href contains 'htm_data'), download the page and pass it on
    to the content parser together with its title.
    """
    anchor, caption = __get_href_and_title(line)
    if 'htm_data' in anchor:
        page_html = http.fetch(http.DOMAIN + anchor)
        __get_content(page_html, caption)