Example 1
import random
import socket
import urllib2
import urlparse

from throttle import Throttle  # same import path as in Example 15


class Downloader(object):
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 num_retries=1,
                 cache=None,
                 opener=None,
                 timeout=30):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
        self.opener = opener

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url,
                                   headers,
                                   proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy=None, num_retries=2, data=None):
        print 'downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download Error:', e.reason
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if 500 <= e.code < 600 and num_retries > 0:
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}
Example 2
class LinkCrawlerOtoDom:
    """
    Class downloads links from url_start. 
    Parameter since - timeliness of the offer in days. -1 means all history.
    """
    def __init__(self, since = 1, th_sec = 5):
        if since not in [-1,1,3,7,14]: 
            raise Exception('SinceLevelError: since is out of range [-1,1,3,7,14].')
        self.since = since
        self.thr = Throttle(th_sec)
        self.url_start = 'https://www.otodom.pl/wynajem/mieszkanie/warszawa/?' if self.since == -1 else "https://www.otodom.pl/wynajem/mieszkanie/warszawa/?search%5Bdescription%5D=1&search%5Bcreated_since%5D=" + str(self.since) + "&search%5Bregion_id%5D=7&search%5Bsubregion_id%5D=197&search%5Bcity_id%5D=26" 
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.137 Safari/537.36 OPR/67.0.3575.79'
        self.__get_max_page()
        self.links = Queue() 
        
    def __download_html(self, url):
        self.thr.wait(url) # Wait 
        response = requests.get(url, headers={'User-Agent' : self.user_agent})
        content = response.content
        return BeautifulSoup(content, "html.parser")
        
    def __get_max_page(self):
        soup = self.__download_html(self.url_start)
        try:
            self.max_page = int(soup.find("ul", class_="pager").find_all("li")[-2].text)
        except (AttributeError, IndexError, ValueError):
            raise Exception("ConvertError: can't find max page.")
        
    def __get_links_from_page(self, url):
        links = set()
        for article in self.__download_html(url).find("div", id="listContainer").find_all("article", {'data-featured-name' : "listing_no_promo"}):
            links.add(article.find("a", href = True)['href'])
        return links
    
    def __range_pages(self):
        for page in range(1, self.max_page + 1):
            yield self.url_start + "&page=" + str(page)
            
    def __get_links_from_pages(self):
        for url in self.__range_pages():
            links = self.__get_links_from_page(url)
            for link in links:
                self.links.put(link)
    
    def run(self):
        """
        Get links starting from self.url_start.
        The method fills the links Queue with the collected urls.
        """
        print('Estimated crawling time: ', str(self.thr.mean_delay * self.max_page), 'seconds.')
        print('start...')
        self.__get_links_from_pages()
        print('...end')
        
    def get_link(self):
        while True:
            try:
                yield self.links.get_nowait()
            except Empty:  # queue.Empty, assuming "from queue import Queue, Empty"
                break
Example 3
class Downloader:
    # numTry: number of retries on error; delay: throttle delay; cache; user_agent; proxies: proxy list
    def __init__(self,
                 user_agent='wsap',
                 proxies=None,
                 delay=5,
                 numTry=5,
                 cache=None,
                 timeout=30):
        self.user_agent = user_agent
        self.proxies = proxies
        self.delay = delay
        self.numTry = numTry
        self.cache = cache if cache is not None else RedisCache()
        self.throt = Throttle(delay)
        self.timeOut = timeout

    # __call__ lets an instance be used like a function
    def __call__(self, url):
        print("url is:" + url)
        try:
            html = self.cache[url]
        except KeyError:
            print("KeyError in __call__")
            html = None
        if html is None:
            print("html is None")
            self.throt.wait(url)
            header = {'user-agent': self.user_agent}
            # conditional expression: use a random proxy if any are configured
            proxy = choice(self.proxies) if self.proxies else None
            html = self.download(url, header, proxy)
        self.cache[url] = html
        return html['html']

    # handle downloading the url
    def download(self, url, header, proxy):
        try:
            resp = requests.get(url,
                                headers=header,
                                proxies=proxy,
                                timeout=self.timeOut)
            html = resp.text
            print("status_code:" + str(resp.status_code))
            # status codes below 400 mean success
            if resp.status_code >= 400:
                html = None
                # 5xx errors are retried; 4xx errors are returned immediately
                if 500 <= resp.status_code < 600 and self.numTry:
                    self.numTry -= 1
                    # recursive call implements the retry
                    return self.download(url, header, proxy)
        except requests.exceptions.RequestException:
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxies=None,
                 delay=0.0001,
                 max_depth=999999,
                 max_count=10000):
    """ 
    Recorre los link en profundidad 
    """
    i = 0
    crawl_queue = [start_url]
    result = []
    # Dict storing the visited urls so they are not parsed again
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue and i <= max_count:
        url = crawl_queue.pop()

        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            if i > max_count:
                print('Skipping %s due to exceed limit count' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            i += 1
            print(i)
            # Yields a Scrapy-like item where I store the url and the plain text,
            # and also saves it to a file
            yield WikiItem(html, url)

            # Filter the links to follow
            for link in get_links(html):
                if re.match('#[a-z]*', link):
                    continue
                if re.match(link_regex, link):
                    # A small patch: the local wiki did not include this 'A' when asked
                    # for the links; on an online page removing it caused no problem
                    #abs_link2 = urljoin(start_url, 'A/')
                    # abs_link = urljoin(abs_link2, link)
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen and len(abs_link) < 200:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
def main_link_crawler(start_url,
                      link_regex,
                      robots_url=None,
                      user_agent='bbbbbbb',
                      proxies=None,
                      delay=3,
                      max_depth=4,
                      num_retries=2,
                      cache={}):
    """ Crawl from the given start URL following links matched by link_regex. In the current
        implementation, we do not actually scrapy any information.

        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
            user_agent (str): user agent (default: 'bbbbbbb')
            proxies (str): proxy url, ex 'http://IP' (default: None)
            delay (int): seconds to throttle between requests to one domain (default: 3)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
            scrape_callback (function): function to call after each download (default: None)
    """
    crawl_queue = [start_url]
    # keep track which URL's have seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)

            html = download(url, user_agent=user_agent, proxy=proxies)
            if not html:
                continue
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
    return seen
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='wswp',
                 proxies=None,
                 delay=5,
                 max_depth=5):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.

        args:
            start_url (str): web site to start crawl
            link_regex (str): regex to match for links
        kwargs:
            robots_url (str): url of the site's robots.txt
                              (default: start_url + /robots.txt)
            user_agent (str): user agent (default: wswp)
            proxies (dict): proxy dict w/ keys 'http' and 'https', values
                            are strs (i.e. 'http(s)://IP') (default: None)
            delay (int): seconds to throttle between requests
                         to one domain (default: 5)
            max_depth (int): maximum crawl depth (to avoid traps) (default: 5)
    """
    crawl_queue = [start_url]
    # keep track which URL's have seen before
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
Example 7
class Downloader:
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 cache={},
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            result = None
        if result is None:
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        print('Downloading:', url)
        try:
            resp = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
Example 8
class Downloader:
    def __init__(self,
                 delay=5,
                 user_agent=None,
                 num_retries=1,
                 proxies=None,
                 cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    #server error so ignore result from cache
                    #and re-download
                    result = None

        if result is None:
            #result is not get from cache
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)

            if self.cache:
                #save result to cache
                self.cache[url] = result

        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):

        while num_retries > 0:
            try:
                r = requests.get(url, headers=headers, proxies=proxy)
                return {'html': r.text, 'code': r.status_code}
            except requests.exceptions.RequestException:
                num_retries -= 1
        return {'html': None, 'code': None}
Example 9
def link_crawler(seed_url, link_regex=None, delay=10, headers=None, max_depth=2, max_urls=1, user_agent='wswp',
                 proxy=None, num_retries=1):
    """
    :param seed_url: a list of master url
    :param link_regex: you wanna filter the url
    :return: a list of contain master_url and sub url
    """
    crawl_queue = Queue.deque([seed_url])  # urls still to be crawled; behaves like a list
    seen = {seed_url: 0}  # stores the depth of the seed url (0) and of every child url
    num_urls = 0  # tracks how many urls have been downloaded
    # parse the robots file for the seed url
    rp = get_robots(seed_url)
    # create the throttle object
    throttle = Throttle(delay)
    # request headers dict
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent  # add the custom user agent to the headers

    while crawl_queue:  # loop until crawl_queue has been emptied by pop
        url = crawl_queue.pop()  # pop the most recently appended url; this is the url crawled this iteration
        if rp.can_fetch(user_agent, url):  # check robots.txt; False means the url must not be crawled
            throttle.wait(url)  # throttle the request (delay is 10 seconds by default)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)  # download the page

            links = []  # stores the matched child urls
            depth = seen[url]  # look up the depth of the current url in the seen dict
            if depth != max_depth:  # only enqueue new links while max_depth has not been reached
                if link_regex:
                    links.extend(link for link in get_link(html) if re.match(link_regex, link))  # extend the list with matching child urls

                for link in links:  # iterate over the matched urls
                    link = normalize(seed_url, link)  # join the url fragment with the seed url into an absolute link
                    if link not in seen:  # a new link: its depth is the parent depth + 1
                        seen[link] = depth + 1  # record the link's depth for the next round of downloads
                        if same_domain(seed_url, url):  # only follow links on the same domain (host + port)
                            crawl_queue.append(link)  # add the link to the crawl queue

            # num_urls limits the total number of downloads, i.e. how many urls get crawled
            num_urls += 1  # count this download
            if num_urls == max_urls:  # stop once max_urls downloads have been made
                break
        else:
            print 'Blocked by robots.txt:', url
    print seen
class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = None
        self.cache = cache

    def __call__(self, url, num_retries=2):
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            result = None
        if result is None:
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers={'User-Agent': 'wswp'}, proxies=None):
        print('Downloading:', url)
        # proxies = {'http': 'http://myproxy.net:1234', 'https': 'https://myproxy.net:1234'}
        try:
            resp = requests.get(url, headers=headers, proxies=proxies)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
Example 11
class Downloader:
    # numTry: number of retries on error; delay: throttle delay; cache; user_agent; proxies: proxy list
    def __init__(self, user_agent='wsap', proxies=None, delay=5, numTry=5, cache=None, timeout=30):
        self.user_agent = user_agent
        self.proxies = proxies
        self.delay = delay
        self.numTry = numTry
        self.cache = cache if cache is not None else RedisCache()
        self.throt = Throttle(delay)
        self.timeOut = timeout

    # __call__ lets an instance be used like a function
    def __call__(self, url):
        try:
            html = self.cache[url]
        except KeyError:
            html = None
        if html is None:
            self.throt.wait(url)
            header = {'user-agent': self.user_agent}
            # conditional expression: use a random proxy if any are configured
            proxy = choice(self.proxies) if self.proxies else None
            html = self.download(url, header, proxy)
        self.cache[url] = html
        return html['html']

    # handle downloading the url
    def download(self, url, header, proxy):
        try:
            resp = requests.get(url, headers=header, proxies=proxy, timeout=self.timeOut)
            html = resp.text
            # status codes below 400 mean success
            if resp.status_code >= 400:
                html = None
                # 5xx errors are retried; 4xx errors are returned immediately
                if 500 <= resp.status_code < 600 and self.numTry:
                    self.numTry -= 1
                    # recursive call implements the retry
                    return self.download(url, header, proxy)
        except requests.exceptions.RequestException:
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
Example 12
def crawl_link(seed_url, link_regex, max_depth = 2, delay = 3, scrape_callback = None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        if html is None:
            continue
        
        links = []
        if scrape_callback:
            links.extend(scrape_callback(url, html) or [])  # or []: append an empty list when the callback returns None
        # check whether max depth has been reached
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    # check whether the link has already been downloaded
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
Example 13
class ThrottleTestCase(unittest.TestCase):
    def setUp(self):
        self.mock_time = MockTime()
        self.test_throttle = Throttle(10, time=self.mock_time)

    def test_first_time_should_run_immediately(self):
        self.test_throttle.wait()
        self.assertEqual(self.mock_time.sleep_duration, None)

    def test_should_sleep_minimum_interval(self):
        self.test_throttle.wait()
        self.test_throttle.wait()
        self.assertEqual(self.mock_time.sleep_duration, 10)

    def test_should_sleep_partial_time(self):
        self.mock_time._time = 5
        self.test_throttle.wait()
        self.mock_time._time = 10
        self.test_throttle.wait()
        self.assertEqual(self.mock_time.sleep_duration, 5)
Example 15
from throttle import Throttle
from download import download, search_codes
from string import ascii_lowercase
import re
import csv
import itertools

throttle = Throttle(0)
"""
So, money.rediff.com has the simplest scripcode structure I found. All scrips are categorized by their first letter. In this code I am first finding out how many stocks start with each letter; the counts are recorded in the list 'index' below.
"""
index = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    html = str(download('https://money.rediff.com/companies/{}'.format(x)))
    match = re.search('>Showing 1 - (.*?) of (.*?) ', html)
    index.append(int(match.group(2)))
"""
Once I have all the stocks by letter, I iterate through every page of the structure to match the scripcode regex, which is a 6-digit number. I know the variables look ugly, but the code is functional and will only be run once in a blue moon. I'll improve it later if I get time. Basically, this is an unintelligent iterative crawler.
"""

ctr = 0
b = []
prod = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    for i in itertools.count(1, 200):
        limit = index[ctr]
        if (i > limit):
            break
        b = search_codes('https://money.rediff.com/companies/{}/{}-{}'.format(
class Downloader:
    def __init__(self, delay=1, user_agent='saint_data', proxy=None, cache={}):
        """ __init__ method initializes a Downloader object
            @parameters
                user_agent:     (str)   user agent for request header
                cache:          (dict)  stores all downloaded results
        """

        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = None  # this variable will be set later by request (in __call__ method)
        self.proxy = proxy
        self.cache = cache

    # ---------------------------------------------------------------------------------------------------------------- #

    def __call__(self, url, num_retries=2):
        """ __call__ method downloads urls that are not found in cache or returns urls found in cache
            @parameters
               url:             (string)    web site's url
               num_retries:     (int)       number of retries on 5xx errors
            @returns
               result['html']   (string)    web page's source code
        """

        self.num_retries = num_retries
        try:
            result = self.cache[url]
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache
            # so still need to download
            self.throttle.wait(url)
            result = self.download(url, self.user_agent, self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    # ---------------------------------------------------------------------------------------------------------------- #

    def download(self, url, user_agent, num_tries=2, charset='utf-8'):
        """ This function downloads a website's source code.
            @parameters
                url:        (str)           website's url
                user_agent: (str)           specifies the user_agent string
                num_tries:  (int)           if a download fails due to a problem with the request (4xx) or the server
                                            (5xx) the function calls it self recursively #num_tries times
                charset:    (str)           helps specify the desired codec of the HTTP responses
            @returns
                html_code:  (str or None)   html code of web site or None if no code is returned
        """

        print("Downloading %s ... " % url)
        # construct a Request object
        request = urllib.request.Request(url)
        # set user-agent for this request
        request.add_header('User-Agent', user_agent)
        try:
            if self.proxy:
                proxy_support = urllib.request.ProxyHandler(
                    {'http': self.proxy})
                opener = urllib.request.build_opener(proxy_support)
                urllib.request.install_opener(opener)
            # make a request and get an HTTPResponse object back
            # response is a context manager (.info(), .getcode(), .geturl())
            response = urllib.request.urlopen(request)
            # reading response as string (bytes originally)
            # 'ignore' arg is crucial to avoid errors when decoding bytes with codec different than charset ('utf-8')
            html_code = response.read().decode(charset, 'ignore')
            response_code = response.getcode()
        except (URLError, HTTPError, ContentTooShortError) as e:
            print("Downloading Error:", e.reason)
            html_code = None
            if hasattr(e, 'code'):
                response_code = e.code
            else:
                response_code = None
            if num_tries > 0:
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    # recursively retry 5xx HTTP errors (server errors)
                    return self.download(url, user_agent, num_tries - 1,
                                         charset)

        # Our beloved html_code is UTF-8 STRING or NONE
        # TODO(4) delete statement
        # print("HTML: {0}".format(type(html_code)))

        return {'html': html_code, 'code': response_code}
Example 17
class Downloader(object):
    '''
    classdocs
    '''
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 num_retries=1,
                 cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
        self.opener = None
        '''
        Constructor
        '''

    # called automatically when the instance is invoked like a function
    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                print url + " is not available in cache!"
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
                else:
                    print url + " is available in cache!"
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if result is None:
                return None
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading ', url
        # Python's logical operators short-circuit, so "headers or {}" falls back to an empty dict
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        code = 200
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            #         htmlfile = urllib2.urlopen(request, timeout = 10)
            # timeout: in seconds
            htmlfile = opener.open(request, timeout=15)
            html = htmlfile.read()
        except Exception as e:
            print 'Download error:', str(e)
            html = None
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= e.code < 600:
                    # retry the download when a 5xx error code is encountered
                    return self.download(url, headers, proxy, num_retries - 1,
                                         data)
                else:
                    return None
            else:
                return None
        return {'html': html, 'code': code}
def rotate_log_files(options):
    with request_lock(options['lock_file']) as acquired:
        if not acquired:
            logger.warn('Not rotating, previous job still underway')
            return

        # Check we can send signals to all relevant processes
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        unkillable_processes = set()
        for process_name in options['reopen_file_signals'].keys():
            pids = pids_for_processes[process_name]
            try:
                for pid in pids:
                    kill_if_running(pid, 0)
            except OSError:
                unkillable_processes.add(process_name)
        if unkillable_processes:
            logger.error('Cannot send signal to some processes, aborting: %s' %
                         ', '.join(unkillable_processes))
            return

        files_to_rotate = [
            file for file in os.listdir(options['log_directory'])
            if fnmatch.fnmatch(file, options['filename_filter'])
        ]

        rotation_suffix = datetime.datetime.now().strftime(
            options['timestamp_format'])

        filename_mapping = {
            file: file + rotation_suffix
            for file in files_to_rotate
        }

        # Move all files
        rotated_files = []
        for original_name, rotated_name in filename_mapping.items():
            original_path = os.path.join(options['log_directory'],
                                         original_name)
            rotated_path = os.path.join(options['log_directory'], rotated_name)
            if not os.path.exists(rotated_path):
                os.rename(original_path, rotated_path)
                rotated_files.append(rotated_name)
            else:
                logger.warning(
                    'Did not rotate file. File called %s already existed',
                    rotated_path)

        # Run kick commands
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        for process_name, signal_name in options['reopen_file_signals'].items(
        ):
            signal_id = getattr(signal, 'SIG' + signal_name.upper())
            pids = pids_for_processes[process_name]
            for pid in pids:
                kill_if_running(pid, signal_id)

        throttle_file_checks = Throttle(FILE_OPEN_CHECK_INTERVAL)
        checks_without_closed_files = 0
        s3_store = S3LogStore(options)

        # Get files which have no open handles and process them as soon as we can.
        # Files with open handles wait until next time through the loop. We throttle
        # to avoid checking too often.
        # TODO: Should we also pick up and retry copying any gz files which we could not
        #       copy to s3 last time around?
        open_files = rotated_files
        while open_files:
            throttle_file_checks.wait()
            closed_files, open_files = check_for_open_files(open_files)
            for ready_file in closed_files:
                try:
                    ready_path = os.path.join(options['log_directory'],
                                              ready_file)
                    compressed_path = compress_file(ready_path)
                    s3_store.store_file(compressed_path)
                    os.unlink(compressed_path)
                except Exception:
                    logger.error('Unexpected error processing %s',
                                 ready_file,
                                 exc_info=True)
            if len(closed_files):
                checks_without_closed_files = 0
            else:
                checks_without_closed_files += 1
                if checks_without_closed_files > MAX_CHECKS_WITHOUT_FILE_CLOSED:
                    logger.error(
                        'Gave up waiting for files to close. Open files: %s' %
                        ', '.join(open_files))
                    return
Example 19
ldr_kz_nb = Loader_KZ_NB()
ldr_kz_bai_alfa = Loader_KZ_bai_alfa()
kz_bai_halyk_cash_ldr = Loader_KZ_bai_halyk_cash()
kz_bai_halyk_cards_ldr = Loader_KZ_bai_halyk_cards()
kz_bai_kkb_cash_ldr = Loader_KZ_bai_kkb_cash()
kz_bai_kkb_cards_ldr = Loader_KZ_bai_kkb_cards()

# here is the place for adding an instance into the loaders list
loaders_list = [
    ldr_kz_nb, ldr_kz_bai_alfa, kz_bai_halyk_cash_ldr, kz_bai_halyk_cards_ldr,
    kz_bai_kkb_cash_ldr, kz_bai_kkb_cards_ldr
]

loadedData = ''
# loop in loaders list
for ldr in loaders_list:
    loadedData = ldr.loadDailyData(date_for_load)
    if loadedData:
        parsedData = ldr.parseDailyData(loadedData)
    else:
        logging.error("Empty loaded data")
        parsedData = None

    if parsedData:
        ldr.saveRatesData(parsedData)
    throttle.wait(ldr.get_domain())

# loc = localizator("en-us")

# logging.info(loc.get_translated_labels(["EUR","LBL000002", 12.4,"LBL000001", "LBL000005"]))
Example 20
class Downloader:

    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, 
                num_retries=DEFAULT_RETRIES, cache=DEFAULT_CACHE, 
                proxies=DEFAULT_PROXIES, opener=DEFAULT_OPENER, 
                timeout=DEFAULT_TIMEOUT):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
        self.opener = opener

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'user-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download error:', str(e)
            logging.error(str(e))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    return self.download(url, headers, proxy, num_retries-1, data)
            else:
                code = None
        except (socket.error, httplib.BadStatusLine, httplib.IncompleteRead,
                socket.timeout, ssl.SSLError) as e:
            print 'Download error:', str(e)
            logging.error(str(e))
            html, code = '', None
            if num_retries > 0:
                return self.download(url, headers, proxy, num_retries-1, data)
        return {'html': html, 'code': code}
Example 21
def rotate_log_files(options):
    with request_lock(options['lock_file']) as acquired:
        if not acquired:
            logger.warn('Not rotating, previous job still underway')
            return

        # Check we can send signals to all relevant processes
        pids_for_processes = running_processes_by_name(options['reopen_file_signals'].keys())
        unkillable_processes = set()
        for process_name in options['reopen_file_signals'].keys():
            pids = pids_for_processes[process_name]
            try:
                for pid in pids:
                    kill_if_running(pid, 0)
            except OSError:
                unkillable_processes.add(process_name)
        if unkillable_processes:
            logger.error('Cannot send signal to some processes, aborting: %s' % ', '.join(unkillable_processes))
            return

        files_to_rotate = [
            file for file in os.listdir(options['log_directory'])
            if fnmatch.fnmatch(file, options['filename_filter'])
        ]

        rotation_suffix = datetime.datetime.now().strftime(options['timestamp_format'])

        filename_mapping = {
            file: file + rotation_suffix
            for file in files_to_rotate
        }

        # Move all files
        rotated_files = []
        for original_name, rotated_name in filename_mapping.items():
            original_path = os.path.join(options['log_directory'], original_name)
            rotated_path = os.path.join(options['log_directory'], rotated_name)
            if not os.path.exists(rotated_path):
                os.rename(original_path, rotated_path)
                rotated_files.append(rotated_name)
            else:
                logger.warning('Did not rotate file. File called %s already existed', rotated_path)

        # Run kick commands
        pids_for_processes = running_processes_by_name(options['reopen_file_signals'].keys())
        for process_name, signal_name in options['reopen_file_signals'].items():
            signal_id = getattr(signal, 'SIG' + signal_name.upper())
            pids = pids_for_processes[process_name]
            for pid in pids:
                kill_if_running(pid, signal_id)

        throttle_file_checks = Throttle(FILE_OPEN_CHECK_INTERVAL)
        checks_without_closed_files = 0
        s3_store = S3LogStore(options)

        # Get files which have no open handles and process them as soon as we can.
        # Files with open handles wait until next time through the loop. We throttle
        # to avoid checking too often.
        # TODO: Should we also pick up and retry copying any gz files which we could not
        #       copy to s3 last time around?
        open_files = rotated_files
        while open_files:
            throttle_file_checks.wait()
            closed_files, open_files = check_for_open_files(open_files)
            for ready_file in closed_files:
                try:
                    ready_path = os.path.join(options['log_directory'], ready_file)
                    compressed_path = compress_file(ready_path)
                    s3_store.store_file(compressed_path)
                    os.unlink(compressed_path)
                except Exception:
                    logger.error('Unexpected error processing %s', ready_file, exc_info=True)
            if len(closed_files):
                checks_without_closed_files = 0
            else:
                checks_without_closed_files += 1
                if checks_without_closed_files > MAX_CHECKS_WITHOUT_FILE_CLOSED:
                    logger.error('Gave up waiting for files to close. Open files: %s' % ', '.join(open_files))
                    return
Example 22
class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout
        
        
        LOGIN_URL = 'http://www.jobbole.com/wp-admin/admin-ajax.php'
        LOGIN_EMAIL = 'caicai'
        LOGIN_PASSWORD = '******'
            
            
        postdata = urllib.parse.urlencode({'user_login': LOGIN_EMAIL, 'user_pass': LOGIN_PASSWORD,'action':'user_login'
                ,'remember_me':'1','redirect_url':'http://www.jobbole.com/'}).encode('utf-8')
        req = urllib.request.Request(LOGIN_URL,postdata)
        req.add_header('User-Agent','Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0')
        urllib.request.ProxyHandler(proxies=proxies)
        #create CookieJar
        cjar = http.cookiejar.CookieJar()
        #create opener
        opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
        # install the opener globally
        urllib.request.install_opener(opener)
            
        file = opener.open(req)
        data=file.read()
        file=open('3.html','wb')
        file.write(data)
        file.close() 
            

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download a and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
Example 23
def link_crawler(start_url,
                 link_regex,
                 robots_url=None,
                 user_agent='statista',
                 max_depth=-1,
                 delay=3,
                 proxies=None,
                 num_retries=2,
                 cache=None,
                 scraper_callback=None):

    #: Initialize a crawl queue with a seed url to start the crawl from
    crawl_queue = [start_url]

    #: keep track of seen urls
    seen = {}

    robots = {}

    throttle = Throttle(delay)

    #: start the crawl
    while crawl_queue:
        url = crawl_queue.pop()

        #: robots.txt
        robots_file_present = False
        if 'http' not in url:
            continue

        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)

        #: Get the robot parser for this domain from the robots dictionary
        robot_parser = robots.get(domain)

        #: set a default robots url and a parser for it if there isn't one
        if not robot_parser and domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            robot_parser = get_robots_parser(robots_url)
            if not robot_parser:
                #: continue to crawl even if there are problems finding robots.txt
                #: file
                robots_file_present = True
            # associate each domain with a corresponding parser, whether
            # present or not
            robots[domain] = robot_parser

        elif domain in robots:
            robots_file_present = True

        #: crawl only when url passes robots.txt restrictions
        if robots_file_present or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip link if you have crawled it more than max depth
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)

            #: Get all links from page and filter only those matching given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        # check if link is well formed and correct
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)

                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
class Downloader:

    def __init__(self, delay=1, user_agent='saint_data', proxy=None, cache={}):
        """ __init__ method initializes a Downloader object
            @parameters
                user_agent:     (str)   user agent for request header
                cache:          (dict)  stores all downloaded results
        """

        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = None             # this variable will be set later by request (in __call__ method)
        self.proxy = proxy
        self.cache = cache
    # ---------------------------------------------------------------------------------------------------------------- #

    def __call__(self, url, num_retries=2):
        """ __call__ method downloads urls that are not found in cache or returns urls found in cache
            @parameters
               url:             (string)    web site's url
               num_retries      (int)       number of retries on 5xx errors
            @returns
               result['html']   (string)    web page's source code
        """

        self.num_retries = num_retries
        try:
            result = self.cache[url]
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None

        if result is None:
            # result was not loaded from cache
            # so still need to download
            self.throttle.wait(url)
            result = self.download(url, self.user_agent, num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result["html_code"]

    # ---------------------------------------------------------------------------------------------------------------- #

    def download(self, url, user_agent, num_retries):
        """ This function downloads a website's source code.
            @parameters
                url             (str)       website's url
                user_agent      (str)       specifies the user_agent string
                num_retries     (int)       if a download fails due to a problem with the request (4xx) or the server
                                            (5xx) the function calls it self recursively #num_retries times
            @returns
                html_code   (str or None)   html code of web site or None if no code is returned
        """

        print("Downloading %s ... " % url)
        # set user-agent for this request
        headers = {'User-Agent': user_agent}
        try:
            resp = requests.get(url, headers=headers, proxies=self.proxy)
            # retrieve content
            html_code = resp.text
            # save the request's status code
            code = resp.status_code
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html_code = None
                if num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    print("retry")
                    self.throttle.wait(url)
                    return self.download(url, user_agent, num_retries - 1)
        except RequestException as e:
            print('Download Exception error:', e)
            html_code = None
            code = e.errno

        return {'html_code': html_code, 'code': code}
class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self,
                 delay=5,
                 user_agent='wswp',
                 proxies=None,
                 cache={},
                 timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download a and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}