Example 1
import random
import urllib2
import urlparse

# assumes a Throttle helper class is available (see the sketch after this example)


class Downloader:
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 num_tries=1,
                 catch=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_tries = num_tries
        self.catch = catch

    def __call__(self, url):
        result = None
        if self.catch:
            try:
                result = self.catch[url]
            except KeyError:
                pass
            else:
                if self.num_tries > 0 and 500 <= result['code'] < 600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'
            }
            result = self.download(url, headers, proxy, self.num_tries)
            if self.catch:
                self.catch[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_tries, data=None):
        print 'downloading:', url
        request = urllib2.Request(url, headers=headers)
        opener = urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))

        html, code = None, None
        try:
            response = opener.open(request, timeout=30)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Url error:', e
            if hasattr(e, 'code'):
                code = e.code
                if num_tries > 0 and 500 <= code < 600:
                    # retry server errors (5xx) instead of returning None, so the
                    # caller always receives a {'html', 'code'} dict
                    return self.download(url, headers, proxy, num_tries - 1, data)
        except Exception as e:
            print 'error:', e
        return {'html': html, 'code': code}
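
Every example in this listing relies on a small Throttle helper that spaces out requests to the same domain, but its implementation is never shown. The following is a minimal sketch of what such a class might look like, based only on how the snippets call it (this example uses it as Throttle(delay); others import it as a module and use Throttle.Throttle(delay)):

import time
import urlparse  # urllib.parse in Python 3


class Throttle:
    """Delay downloads so requests to one domain are at least delay seconds apart."""

    def __init__(self, delay):
        self.delay = delay    # minimum seconds between requests to a single domain
        self.domains = {}     # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()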
Example 2
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 num_tries=1,
                 catch=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_tries = num_tries
        self.catch = catch
def link_crawler(seed_url, link_regex, max_depth=1, scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    rp = robotparser.RobotFileParser()
    # create the throttle once, outside the loop, so the per-domain delay is kept
    throttle = Throttle.Throttle(5)
    user_agent = 'wswp'
    while crawl_queue:
        url = crawl_queue.pop()
        # point the parser at the site's robots.txt rather than url + '/robots.txt'
        rp.set_url(urlparse.urljoin(url, '/robots.txt'))
        rp.read()

        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            depth = seen[url]
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urlparse.urljoin(seed_url, link)
                        if link not in seen:
                            seen[link] = depth + 1
                            crawl_queue.append(link)
        else:
            print 'Blocked by robots.txt:', url
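
An illustrative call for this crawler (the seed URL and regex below are assumptions chosen for demonstration, not part of the original snippet):

link_crawler('http://example.webscraping.com', '/(index|view)', max_depth=2)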
Example 4
    def __call__(self, url):
        '''
        First try to load the result for this url from the cache; a cache hit
        needs no download and no throttling.
        If the url is not cached, download it, throttling before the request.
        '''
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # this url is not in the cache yet
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # the cached result recorded a server error, so it is not
                    # usable; with num_retries > 0, download the page again
                    result = None
        if result is None:
            # an actual download happens here (not a cache hit), so throttle
            # first to avoid getting banned
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            self.num_retries = 1
            result = self.download(url,
                                   headers,
                                   proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # store the downloaded page in the cache
                self.cache[url] = result
        return result['html']
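
Because the cache is only read and written with self.cache[url], any mapping object will do; a plain dict gives a simple in-memory cache. Illustrative usage, assuming this __call__ belongs to a Downloader class whose __init__ matches Example 8:

downloader = Downloader(delay=5, user_agent='wswp', num_retries=1, cache={})
html = downloader('http://example.webscraping.com')
html = downloader('http://example.webscraping.com')  # second call is served from the cache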
Example 5
def link_crawler(seed_url, link_regex):
    """
     crawlfrom the given seed URL following links matched by link_regex
     :param seed_url: 
     :param link_regex: 
     :return: 
     """
    # read the robots.txt
    rp = robotparser.RobotFileParser()
    rp.set_url('http://example.webscraping.com/robots.txt')
    rp.read()
    # set the agent's name
    user_agent = "667's Python Spider"
    # set the delay between downloads to 5 seconds
    th = Throttle.Throttle(5)

    # initialise the crawl queue with the seed url
    crawl_queue = [seed_url]
    visited = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            th.wait(url)
            html = download_network_page(url)
            print html
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)

                    if link not in visited:
                        visited.add(link)
                        crawl_queue.append(link)
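
Examples 2 and 5 both call a get_links helper that is not shown. A minimal regex-based sketch of what it might look like, with the name and approach inferred only from how it is called:

import re


def get_links(html):
    """Return every href value found in the page (assumed helper)."""
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)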
Example 6
import re
import urllib.parse
import urllib.request

# helpers assumed to exist elsewhere in this project: Throttle, download,
# get_linked_url and get_robots


def linked_download(seed_url,
                    linked_rex=None,
                    user_agent='wswp',
                    proxy=None,
                    max_depth=2,
                    delay=3):
    # download every linked page whose url matches the given regex

    print("linked_download start")

    # throttle object that enforces a delay between requests
    throttle = Throttle.Throttle(delay)

    # dict of urls already seen, mapped to their crawl depth
    searched_urls = {}
    # queue of urls still to visit
    url_list = [seed_url]

    # set the user-agent and proxy
    opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxy))
    opener.addheaders = [('User-agent', user_agent)]
    urllib.request.install_opener(opener)

    # read robots.txt
    rp = get_robots(seed_url)

    # walk the queue
    while url_list:
        # pop the next url
        url = url_list.pop()

        # does robots.txt allow this agent to fetch the url?
        if rp.can_fetch(user_agent, url):
            # crawl depth recorded for this url (defaults to 0)
            depth = searched_urls.get(url, 0)

            # only continue if the maximum depth has not been reached
            if depth != max_depth:
                # wait if this download needs to be delayed
                throttle.wait(url)

                # fetch the url and get the html
                html = download(url, user_agent, proxy)

                # extract every <a> link from the html
                linked_urls = get_linked_url(html.decode('utf-8'))

                # queue the links that match the pattern
                for url_item in linked_urls:
                    # does the link match the supplied regex?
                    if re.search(linked_rex, url_item):
                        # resolve to an absolute url before the seen-check, so the
                        # recorded key matches the key looked up later
                        url_item = urllib.parse.urljoin(seed_url, url_item)
                        # not crawled yet?
                        if url_item not in searched_urls:
                            # record the link one level deeper than the current page
                            searched_urls[url_item] = depth + 1
                            # add it to the queue
                            url_list.append(url_item)
        else:
            # rejected by robots.txt
            print('Blocked by robots.txt:' + url)
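
This example calls a get_robots helper that is not shown. A minimal Python 3 sketch of what it might do, using the standard urllib.robotparser module (the helper's exact signature is an assumption):

import urllib.parse
import urllib.robotparser


def get_robots(seed_url):
    """Build and return a parser for the site's robots.txt (assumed helper)."""
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(seed_url, '/robots.txt'))
    rp.read()
    return rp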
Example 7
# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from data_pre import *
from Throttle import *

# In[2]:

data = load_data()

# In[3]:

m = Throttle(data)
m.update_vars(m.data)
m.filter_obs()

# In[4]:

m.update_vars(m.data2)
m.fit_by_batches()

# In[5]:

# Estimated parameters for the first 5 batches
m.mus[:5], m.Sigs[:5]

Example 8
    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=2, cache=None):
        self.throttle = Throttle.Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
# coding=utf-8
import urllib2
import re
import urlparse
import Throttle

# throttle downloads (seconds between requests to the same domain)
throttle = Throttle.Throttle(5)
data_list = []


def download(url, user_agent='wswp', proxy=None, re_times=2):
    '''Download a page, with a configurable user agent.'''
    print 'DownLoad....', url

    # rate-limit the download
    throttle.wait(url)

    # set the request headers
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)

    opener = urllib2.build_opener()
    # add proxy support
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    html = None
    try:
        html = opener.open(request).read()
        # html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print "DownLoad Error: ", e.reason
        # retry server errors (5xx) a few times before giving up
        if re_times > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
            return download(url, user_agent, proxy, re_times - 1)
    return html
Example 10
        sleep(0.2)
        self.setLights(kOff)
        sleep(0.2)

if __name__ == "__main__":
    gLog.open()

    #  Start the simulator and controller
    sak = StartAndKill()
    sak.start("simulator")
    sak.start("controller")

    # Create the communication resources for 1 user
    comRes = CommunicationResources(name = 'throttle-test', host = 'localhost', port = 1235, numberOfPackages = 1)
    
    myThrottle = Throttle(name = 'Bill', comPkg = comRes.getNextPackage())
    
    # Tell the throttle to read the layout file
    gLog.print("Main reading layout")
    msg = myThrottle.readLayout("../../runSoftware/Layout.xml")
    sleep(2)

    testing = 9  #<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< set test case
    print ("Testing option == {0}". format(testing))

    if testing == 1:

        # Initialize train 1111
        gLog.print("Main initializing train")
        msg = myThrottle.initTrain(1111, [5, 1])
        gLog.print("physAdd = {0}, physSlot = {1}, virtAdd = {2}, virtSlot = {3}".