import re
import robotparser
import urlparse

import Throttle


def link_crawler(seed_url, link_regex, max_depth=1, scrape_callback=None):
    """Crawl from seed_url, following links matched by link_regex up to max_depth."""
    crawl_queue = [seed_url]
    seen = {seed_url: 0}    # url -> depth at which it was discovered
    user_agent = 'wswp'

    # Parse robots.txt once, from the root of the seed site
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(seed_url, '/robots.txt'))
    rp.read()

    # Create the throttle once so the delay between requests is actually enforced
    throttle = Throttle.Throttle(5)

    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
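Every crawler in this listing constructs Throttle.Throttle(delay) and calls throttle.wait(url), but the Throttle module itself never appears. Below is a minimal sketch of what such a per-domain rate limiter typically looks like, offered as an assumption rather than the original module: it records when each domain was last requested and sleeps until delay seconds have elapsed.

import time
import urlparse  # urllib.parse in Python 3


class Throttle:
    """Sketch of a per-domain download throttle (assumed, not the original module)."""

    def __init__(self, delay):
        self.delay = delay   # minimum seconds between requests to the same domain
        self.domains = {}    # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()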
Example #2
import re
import robotparser
import urlparse

import Throttle


def link_crawler(seed_url, link_regex):
    """Crawl from the given seed URL, following links matched by link_regex.

    :param seed_url: URL to start crawling from
    :param link_regex: regular expression that followed links must match
    """
    # Read the site's robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()

    # Set the agent's name
    user_agent = "667's Python Spider"

    # Throttle requests with a 5-second delay
    th = Throttle.Throttle(5)

    # Queue of URLs still to crawl
    crawl_queue = [seed_url]
    visited = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            th.wait(url)
            html = download_network_page(url)
            print html
            # Filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    if link not in visited:
                        visited.add(link)
                        crawl_queue.append(link)
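Both link_crawler variants above also call a get_links(html) helper that is not shown anywhere in the listing. A minimal regex-based sketch, assuming the helper only needs to return the href value of every <a> tag in the page:

import re


def get_links(html):
    """Return a list of links (href values) found in the page's <a> tags."""
    webpage_regex = re.compile(r'<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)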
Example #3
 # Initializer of a downloader-style class (the enclosing class is not shown in this excerpt):
 # store the throttle, user agent, proxy list, try count, and catch object
 def __init__(self,
              delay=5,
              user_agent='wswp',
              proxies=None,
              num_tries=1,
              catch=None):
     self.throttle = Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_tries = num_tries
     self.catch = catch
Example #4
import re
import urllib.request
import urllib.parse as urlparse

import Throttle


def linked_download(seed_url,
                    linked_rex=None,
                    user_agent='wswp',
                    proxy=None,
                    max_depth=2,
                    delay=3):
    # Download every linked page whose URL matches the given regular expression

    print("linked_download start")

    # Throttle object used to delay repeated requests
    throttle = Throttle.Throttle(delay)

    # Cache of visited URLs (url -> depth at which it was found)
    searched_urls = {}
    # List of URLs still to visit
    url_list = [seed_url]

    # Set the user-agent and proxy for all requests
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    opener.addheaders = [('User-agent', user_agent)]
    urllib.request.install_opener(opener)

    # Read robots.txt
    rp = get_robots(seed_url)

    # Walk through all queued URLs
    while url_list:
        # Pop the next URL
        url = url_list.pop()

        # Does robots.txt allow this agent to fetch the URL?
        if rp.can_fetch(user_agent, url):
            # Depth at which this URL was discovered (defaults to 0)
            depth = searched_urls.get(url, 0)

            # Keep following links only while the maximum depth has not been reached
            if depth != max_depth:
                # Sleep if this domain needs to be throttled
                throttle.wait(url)

                # Fetch the URL and get its HTML
                html = download(url, user_agent, proxy)

                # Extract all <a> tag links from the HTML
                linked_urls = get_linked_url(html.decode('utf-8'))

                # Queue the links that match the pattern
                for url_item in linked_urls:
                    # Does the link match the given regular expression?
                    if re.search(linked_rex, url_item):
                        # Resolve the link to an absolute URL first, so the
                        # seen-check and the queue use the same form of the URL
                        url_item = urlparse.urljoin(seed_url, url_item)
                        # Skip links that have already been seen
                        if url_item not in searched_urls:
                            # Record the link one level deeper than the current page
                            searched_urls[url_item] = depth + 1
                            # Add it to the list of URLs to visit
                            url_list.append(url_item)
        else:
            # Denied by robots.txt
            print('Blocked by robots.txt: ' + url)
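linked_download also depends on a get_robots(seed_url) helper that is not part of the listing. A minimal sketch consistent with how it is called here; the body is an assumption, not the original helper:

import urllib.parse
import urllib.robotparser


def get_robots(seed_url):
    """Parse the robots.txt file at the root of the seed URL's site."""
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp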
Example #5

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from data_pre import *
from Throttle import *

# In[2]:

data = load_data()

# In[3]:

# Note: this Throttle is a statistical model class from the local Throttle module
# (see the mus/Sigs parameters below), not the crawler rate limiter in the other examples
m = Throttle(data)
m.update_vars(m.data)
m.filter_obs()

# In[4]:

m.update_vars(m.data2)
m.fit_by_batches()

# In[5]:

# Estimated parameters for the first 5 batches
m.mus[:5], m.Sigs[:5]

Example #6
 # Initializer of a downloader-style class (the enclosing class is not shown in this excerpt):
 # store the per-domain throttle, user agent, proxies, retry count, and cache
 def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=2, cache=None):
     self.throttle = Throttle.Throttle(delay)
     self.user_agent = user_agent
     self.proxies = proxies
     self.num_retries = num_retries
     self.cache = cache
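The two __init__ excerpts only store settings (throttle, user agent, proxies, retry count, cache), so the enclosing class presumably exposes a method that ties them together. The __call__ below is a hedged sketch of such a method, not code from the listing: the callable interface, the dict-style cache, and the retry-on-server-error behaviour are assumptions, and it presumes random, urllib2, urlparse, and Throttle are imported at module level.

 def __call__(self, url, num_retries=None):
     # Serve the page from the cache when possible; otherwise throttle, download, and cache it.
     if num_retries is None:
         num_retries = self.num_retries
     result = None
     if self.cache is not None:
         try:
             result = self.cache[url]
         except KeyError:
             pass  # not cached yet
     if result is None:
         self.throttle.wait(url)
         headers = {'User-agent': self.user_agent}
         request = urllib2.Request(url, headers=headers)
         opener = urllib2.build_opener()
         if self.proxies:
             # Route the request through a randomly chosen proxy
             proxy = random.choice(self.proxies)
             opener.add_handler(urllib2.ProxyHandler({urlparse.urlparse(url).scheme: proxy}))
         try:
             result = opener.open(request).read()
         except urllib2.URLError as e:
             print 'Download error:', e.reason
             result = None
             if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
                 # Retry server-side errors a limited number of times
                 result = self.__call__(url, num_retries - 1)
         if self.cache is not None:
             self.cache[url] = result
     return result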
Example #7

# coding=utf-8
import urllib2
import re
import urlparse
import Throttle

# Download rate limit (seconds)
throttle = Throttle.Throttle(5)
data_list = []

def download(url, user_agent='wswp', proxy=None, re_times=2):
    '''Download helper that lets you set the user agent and an optional proxy.'''
    print 'Downloading:', url

    # Limit the download rate
    throttle.wait(url)

    # Set the request headers
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)

    opener = urllib2.build_opener()
    # Add proxy support
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
        # html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Download Error:', e.reason