class Downloader(object):
    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1,
                 cache=None, opener=None, timeout=30):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
        self.opener = opener

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, num_retries=self.num_retries)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy=None, num_retries=2, data=None):
        print 'downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download Error:', e.reason
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if 500 <= e.code < 600 and num_retries > 0:
                    # retry 5xx errors; pass the proxy through and return the retried result
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}
class LinkCrawlerOtoDom:
    """
    Class downloads links from url_start.
    Parameter since - timeliness of the offer in days. -1 means all history.
    """
    def __init__(self, since=1, th_sec=5):
        if since not in [-1, 1, 3, 7, 14]:
            raise Exception('SinceLevelError: since is out of range [-1,1,3,7,14].')
        self.since = since
        self.thr = Throttle(th_sec)
        self.url_start = 'https://www.otodom.pl/wynajem/mieszkanie/warszawa/?' if self.since == -1 \
            else "https://www.otodom.pl/wynajem/mieszkanie/warszawa/?search%5Bdescription%5D=1&search%5Bcreated_since%5D=" \
                 + str(self.since) + "&search%5Bregion_id%5D=7&search%5Bsubregion_id%5D=197&search%5Bcity_id%5D=26"
        self.user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.137 Safari/537.36 OPR/67.0.3575.79'
        self.__get_max_page()
        self.links = Queue()

    def __download_html(self, url):
        self.thr.wait(url)  # wait before requesting the domain again
        response = requests.get(url, headers={'User-Agent': self.user_agent})
        content = response.content
        return BeautifulSoup(content, "html.parser")

    def __get_max_page(self):
        soup = self.__download_html(self.url_start)
        try:
            self.max_page = int(soup.find("ul", class_="pager").find_all("li")[-2].text)
        except (AttributeError, IndexError, ValueError):
            raise Exception("ConvertError: can't find max page.")

    def __get_links_from_page(self, url):
        links = set()
        for article in self.__download_html(url).find("div", id="listContainer") \
                .find_all("article", {'data-featured-name': "listing_no_promo"}):
            links.add(article.find("a", href=True)['href'])
        return links

    def __range_pages(self):
        for page in range(1, self.max_page + 1):
            yield self.url_start + "&page=" + str(page)

    def __get_links_from_pages(self):
        for url in self.__range_pages():
            links = self.__get_links_from_page(url)
            for link in links:
                self.links.put(link)

    def run(self):
        """ Get links starting from self.url_start. Method creates a Queue with urls. """
        print('Estimated crawling time:', str(self.thr.mean_delay * self.max_page), 'seconds.')
        print('start...')
        self.__get_links_from_pages()
        print('...end')

    def get_link(self):
        while True:
            try:
                yield self.links.get_nowait()
            except Exception:
                break
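# A minimal usage sketch (an assumption, not part of the class above): collect links
# to offers published within the last day and iterate over the resulting queue.
if __name__ == '__main__':
    crawler = LinkCrawlerOtoDom(since=1, th_sec=5)
    crawler.run()
    for link in crawler.get_link():
        print(link)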
class Downloader:
    # numTry: retries on error, delay: throttle delay, cache, user_agent, proxies
    def __init__(self, user_agent='wsap', proxies=None, delay=5, numTry=5, cache=None, timeout=30):
        self.user_agent = user_agent
        self.proxies = proxies
        self.delay = delay
        self.numTry = numTry
        self.cache = cache or RedisCache()
        self.throt = Throttle(delay)
        self.timeOut = timeout

    # __call__ lets an instance be used like a function
    def __call__(self, url):
        print("url is:" + url)
        try:
            html = self.cache[url]
        except KeyError:
            print("KeyError in __call__")
            html = None
        if html is None:
            print("html is None")
            self.throt.wait(url)
            header = {'user-agent': self.user_agent}
            # conditional expression: pick a random proxy if any were given
            proxy = choice(self.proxies) if self.proxies else None
            html = self.download(url, header, proxy)
            self.cache[url] = html
        return html['html']

    # handle the actual download
    def download(self, url, header, proxy):
        try:
            resp = requests.get(url, headers=header, proxies=proxy, timeout=self.timeOut)
            html = resp.text
            print("status_code:" + str(resp.status_code))
            # codes below 400 mean success
            if resp.status_code >= 400:
                html = None
                # 5xx errors are worth retrying; 4xx errors can be given up immediately
                if 600 > resp.status_code >= 500 and self.numTry:
                    self.numTry -= 1
                    # recursive call implements the retry
                    return self.download(url, header, proxy)
        except requests.exceptions.RequestException:
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=0.0001, max_depth=999999, max_count=10000):
    """ Crawls the links depth-first """
    i = 0
    crawl_queue = [start_url]
    result = []
    # dict of visited urls so we do not parse them again
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue and i <= max_count:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            if i > max_count:
                print('Skipping %s due to exceeded count limit' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            i += 1
            print(i)
            # Yield a scrapy-like item holding the url and the plain text,
            # which is also saved to a file
            yield WikiItem(html, url)
            # Filter the links to follow
            for link in get_links(html):
                if re.match('#(a-z)*', link):
                    continue
                if re.match(link_regex, link):
                    # Small patch: the local wiki mirror needed an extra 'A/' segment
                    # when joining links; an online page worked fine without it
                    # abs_link2 = urljoin(start_url, 'A/')
                    # abs_link = urljoin(abs_link2, link)
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen and len(abs_link) < 200:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
def main_link_crawler(start_url, link_regex, robots_url=None, user_agent='bbbbbbb',
                      proxies=None, delay=3, max_depth=4, num_retries=2, cache={}):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.

    args:
        start_url (str): web site to start crawl
        link_regex (str): regex to match for links
    kwargs:
        robots_url (str): url of the site's robots.txt
                          (default: start_url + /robots.txt)
        user_agent (str): user agent (default: 'bbbbbbb')
        proxies (str): proxy url, ex 'http://IP' (default: None)
        delay (int): seconds to throttle between requests
                     to one domain (default: 3)
        max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
        num_retries (int): # of retries on 5xx errors (default: 2)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    data = []
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxy=proxies)
            if not html:
                continue
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
    return seen
def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
                 proxies=None, delay=5, max_depth=5):
    """ Crawl from the given start URL following links matched by link_regex.
    In the current implementation, we do not actually scrape any information.

    args:
        start_url (str): web site to start crawl
        link_regex (str): regex to match for links
    kwargs:
        robots_url (str): url of the site's robots.txt
                          (default: start_url + /robots.txt)
        user_agent (str): user agent (default: 'wswp')
        proxies (dict): proxy dict w/ keys 'http' and 'https', values
                        are strs (i.e. 'http(s)://IP') (default: None)
        delay (int): seconds to throttle between requests
                     to one domain (default: 5)
        max_depth (int): maximum crawl depth (to avoid traps) (default: 5)
    """
    crawl_queue = [start_url]
    # keep track of which URLs have been seen before
    seen = {}
    if not robots_url:
        robots_url = '{}/robots.txt'.format(start_url)
    rp = get_robots_parser(robots_url)
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, user_agent=user_agent, proxies=proxies)
            if not html:
                continue
            # TODO: add actual data scraping here
            # filter for links matching our regular expression
            for link in get_links(html):
                if re.match(link_regex, link):
                    abs_link = urljoin(start_url, link)
                    if abs_link not in seen:
                        seen[abs_link] = depth + 1
                        crawl_queue.append(abs_link)
        else:
            print('Blocked by robots.txt:', url)
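# A hedged usage sketch for the crawler above; the start URL and link regex are
# illustrative assumptions, not values taken from the original code.
if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', r'/(index|view)/',
                 user_agent='wswp', delay=5, max_depth=2)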
class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            result = None
        if result is None:
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
class Downloader:
    def __init__(self, delay=5, user_agent=None, num_retries=1, proxies=None, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error, so ignore the result from cache
                    # and re-download
                    result = None
        if result is None:
            # result was not fetched from cache
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        while num_retries > 0:
            try:
                r = requests.get(url, headers=headers, proxies=proxy)
                return {'html': r.text, 'code': r.status_code}
            except requests.exceptions.RequestException:
                num_retries -= 1
        # all retries failed: return an empty result so callers can still index it
        return {'html': None, 'code': None}
def link_crawler(seed_url, link_regex=None, delay=10, headers=None, max_depth=2,
                 max_urls=1, user_agent='wswp', proxy=None, num_retries=1):
    """
    :param seed_url: the master url to start from
    :param link_regex: regex used to filter the urls you want
    :return: prints the dict of the master url and sub urls with their depths
    """
    crawl_queue = Queue.deque([seed_url])  # urls still to be crawled; behaves like a list
    seen = {seed_url: 0}  # stores the depth of the root url (0) and of every sub url
    num_urls = 0  # tracks how many urls have been downloaded
    # robots file parser to decide which urls may be fetched
    rp = get_robots(seed_url)
    # throttle object for the delay between requests
    throttle = Throttle(delay)
    # request headers dict
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent  # add the custom user agent to the headers
    while crawl_queue:  # loop until the queue has been emptied by pop()
        url = crawl_queue.pop()  # take the most recently appended url; this is the url crawled this iteration
        if rp.can_fetch(user_agent, url):  # check whether crawling is allowed; False means it is not
            throttle.wait(url)  # throttle: wait up to `delay` seconds
            html = download(url, headers, proxy=proxy, num_retries=num_retries)  # download the page
            links = []  # stores the matched sub urls
            depth = seen[url]  # depth of the current url, looked up in seen
            if depth != max_depth:  # only queue new links while below the maximum depth
                if link_regex:
                    # extend the list with every link that matches the regex
                    links.extend(link for link in get_link(html) if re.match(link_regex, link))
                for link in links:  # iterate over the matched links
                    link = normalize(seed_url, link)  # join the url fragment with the root url
                    if link not in seen:
                        # a new link: record its depth as the parent's depth + 1;
                        # equal depth would mean a repeat, so new links get depth + 1
                        seen[link] = depth + 1
                        if same_domain(seed_url, url):  # only follow links on the same domain (host + port)
                            crawl_queue.append(link)  # add the sub link to the queue of urls to crawl
            num_urls += 1  # count this download
            if num_urls == max_urls:  # stop after max_urls downloads
                break
        else:
            print 'Blocked by robots.txt:', url
    print seen
class Downloader:
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache=None):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = None
        self.cache = cache

    def __call__(self, url, num_retries=2):
        self.num_retries = num_retries
        result = None
        if self.cache:
            try:
                result = self.cache[url]
                print('Loaded from cache:', url)
            except KeyError:
                result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            result = None
        if result is None:
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers={'User-Agent': 'wswp'}, proxies=None):
        print('Downloading:', url)
        # proxies = {'http': 'http://myproxy.net:1234', 'https': 'https://myproxy.net:1234'}
        try:
            resp = requests.get(url, headers=headers, proxies=proxies)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors, keeping the same headers and proxies
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
class Downloader:
    # numTry: retries on error, delay: throttle delay, cache, user_agent, proxies
    def __init__(self, user_agent='wsap', proxies=None, delay=5, numTry=5, cache=None, timeout=30):
        self.user_agent = user_agent
        self.proxies = proxies
        self.delay = delay
        self.numTry = numTry
        self.cache = cache or RedisCache()
        self.throt = Throttle(delay)
        self.timeOut = timeout

    # __call__ lets an instance be used like a function
    def __call__(self, url):
        try:
            html = self.cache[url]
        except KeyError:
            html = None
        if html is None:
            self.throt.wait(url)
            header = {'user-agent': self.user_agent}
            # conditional expression: pick a random proxy if any were given
            proxy = choice(self.proxies) if self.proxies else None
            html = self.download(url, header, proxy)
            self.cache[url] = html
        return html['html']

    # handle the actual download
    def download(self, url, header, proxy):
        try:
            resp = requests.get(url, headers=header, proxies=proxy, timeout=self.timeOut)
            html = resp.text
            # codes below 400 mean success
            if resp.status_code >= 400:
                html = None
                # 5xx errors are worth retrying; 4xx errors can be given up immediately
                if 600 > resp.status_code >= 500 and self.numTry:
                    self.numTry -= 1
                    # recursive call implements the retry
                    return self.download(url, header, proxy)
        except requests.exceptions.RequestException:
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
def crawl_link(seed_url, link_regex, max_depth=2, delay=3, scrape_callback=None):
    crawl_queue = [seed_url]
    seen = {seed_url: 0}
    throttle = Throttle(delay)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)
        html = download(url)
        if html is None:
            continue
        links = []
        if scrape_callback:
            links.extend(scrape_callback(url, html) or [])  # "or []": append an empty list if the callback returns None
        # check against max depth
        depth = seen[url]
        if depth != max_depth:
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urlparse.urljoin(seed_url, link)
                    # skip links that have already been seen
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
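# A hedged sketch of a scrape_callback compatible with crawl_link above: it is called
# with (url, html), and any links it returns are appended to the crawl queue. The body
# here is a placeholder for illustration only.
def example_scrape_callback(url, html):
    print('scraped', url, len(html or ''), 'characters')
    return []  # return extra links to enqueue, or an empty list / None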
class ThrottleTestCase(unittest.TestCase):

    def setUp(self):
        self.mock_time = MockTime()
        self.test_throttle = Throttle(10, time=self.mock_time)

    def test_first_time_should_run_immediately(self):
        self.test_throttle.wait()
        self.assertEqual(self.mock_time.sleep_duration, None)

    def test_should_sleep_minimum_interval(self):
        self.test_throttle.wait()
        self.test_throttle.wait()
        self.assertEqual(self.mock_time.sleep_duration, 10)

    def test_should_sleep_partial_time(self):
        self.mock_time._time = 5
        self.test_throttle.wait()
        self.mock_time._time = 10
        self.test_throttle.wait()
        self.assertEqual(self.mock_time.sleep_duration, 5)
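# A minimal sketch of the MockTime helper these tests appear to assume: it stands in
# for the time module, records the last requested sleep in `sleep_duration`, and lets
# tests move the clock forward through `_time`. The real helper may differ.
class MockTime:
    def __init__(self):
        self._time = 0
        self.sleep_duration = None

    def time(self):
        return self._time

    def sleep(self, duration):
        self.sleep_duration = duration
        self._time += duration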
from throttle import Throttle
from download import download, search_codes
from string import ascii_lowercase
import re
import csv
import itertools

throttle = Throttle(0)

"""
So, money.rediff.com has the simplest structure of scrip codes I found.
All scrips are categorized by their first letter. In this code I am first finding
out how many stocks there are starting with each letter. The count of the stocks
is recorded in the list 'index' below.
"""
index = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    html = str(download('https://money.rediff.com/companies/{}'.format(x)))
    match = re.search('>Showing 1 - (.*?) of (.*?) ', html)  # renamed to avoid shadowing the built-in len()
    index.append(int(match.group(2)))

"""
Once I have all the stocks by their letter, I iterate through every page in the
structure to find the regex for the scrip code, which is a 6 digit number.
I know the variables look ugly, but the code is functional, and will only be run
once in a blue moon. I'll improve on it later on, if I get time.
Basically, this is an unintelligent iterative crawler.
"""
ctr = 0
b = []
prod = []
for x in ascii_lowercase:
    throttle.wait('https://money.rediff.com')
    for i in itertools.count(1, 200):
        limit = index[ctr]
        if (i > limit):
            break
        b = search_codes('https://money.rediff.com/companies/{}/{}-{}'.format(
class Downloader:

    def __init__(self, delay=1, user_agent='saint_data', proxy=None, cache={}):
        """
        __init__ method initializes a Downloader object
        @parameters
            user_agent: (str) user agent for request header
            cache: (dict) stores all downloaded pages
        """
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = None  # this variable will be set later per request (in __call__ method)
        self.proxy = proxy
        self.cache = cache

    # ---------------------------------------------------------------- #
    def __call__(self, url, num_retries=2):
        """
        __call__ method downloads urls that are not found in cache
        or returns urls found in cache
        @parameters
            url: (string) web site's url
            num_retries: (int) number of retries on 5xx errors
        @returns
            result['html'] (string) web page's source code
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache
            # so still need to download
            self.throttle.wait(url)
            result = self.download(url, self.user_agent)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    # ---------------------------------------------------------------- #
    def download(self, url, user_agent, num_tries=2, charset='utf-8'):
        """
        This function downloads a website's source code.
        @parameters
            url: (str) website's url
            user_agent: (str) specifies the user_agent string
            num_tries: (int) if a download fails due to a problem with the request (4xx)
                       or the server (5xx), the function calls itself recursively
                       num_tries times
            charset: (str) helps specify the desired codec of the HTTP responses
        @returns
            html_code: (str or None) html code of web site or None if no code is returned
        """
        print("Downloading %s ... " % url)
        # construct a Request object
        request = urllib.request.Request(url)
        # set user-agent for this request
        request.add_header('User-Agent', user_agent)
        try:
            if self.proxy:
                proxy_support = urllib.request.ProxyHandler({'http': self.proxy})
                opener = urllib.request.build_opener(proxy_support)
                urllib.request.install_opener(opener)
            # make a request and get an HTTPResponse object back
            # response is a context manager (.info(), .getcode(), .geturl())
            response = urllib.request.urlopen(request)
            # read the response as a string (bytes originally);
            # 'ignore' is crucial to avoid errors when decoding bytes with a codec different than charset ('utf-8')
            html_code = response.read().decode(charset, 'ignore')
            response_code = response.getcode()
        except (URLError, HTTPError, ContentTooShortError) as e:
            print("Downloading Error:", e.reason)
            html_code = None
            if hasattr(e, 'code'):
                response_code = e.code
            else:
                response_code = None
            if num_tries > 0:
                if hasattr(e, 'code') and 500 <= e.code < 600:
                    # recursively retry 5xx HTTP errors (server errors)
                    return self.download(url, user_agent, num_tries - 1, charset)
        # Our beloved html_code is a UTF-8 string or None
        # TODO(4) delete statement
        # print("HTML: {0}".format(type(html_code)))
        return {'html': html_code, 'code': response_code}
class Downloader(object):
    '''
    classdocs
    '''

    def __init__(self, delay=5, user_agent='wswp', proxies=None, num_retries=1, cache=None):
        '''
        Constructor
        '''
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
        self.opener = None

    # called when the instance itself is invoked like a function
    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                print url + " is not available in cache!"
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
                else:
                    print url + " is available in cache!"
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if result is None:
                return None
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading ', url
        # note: Python's `or` returns one of its operands rather than a plain boolean
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        code = 200
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            # htmlfile = urllib2.urlopen(request, timeout=10)  # timeout in seconds
            htmlfile = opener.open(request, timeout=15)
            html = htmlfile.read()
        except Exception as e:
            print 'Download error:', str(e)
            html = None
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= e.code < 600:
                    # retry the download when a 5xx error code is returned
                    return self.download(url, headers, proxy, num_retries - 1, data)
            return None
        return {'html': html, 'code': code}
def rotate_log_files(options):
    with request_lock(options['lock_file']) as acquired:
        if not acquired:
            logger.warn('Not rotating, previous job still underway')
            return

        # Check we can send signals to all relevant processes
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        unkillable_processes = set()
        for process_name in options['reopen_file_signals'].keys():
            pids = pids_for_processes[process_name]
            try:
                for pid in pids:
                    kill_if_running(pid, 0)
            except OSError:
                unkillable_processes.add(process_name)
        if unkillable_processes:
            logger.error('Cannot send signal to some processes, aborting: %s'
                         % ', '.join(unkillable_processes))
            return

        files_to_rotate = [
            file for file in os.listdir(options['log_directory'])
            if fnmatch.fnmatch(file, options['filename_filter'])
        ]
        rotation_suffix = datetime.datetime.now().strftime(
            options['timestamp_format'])
        filename_mapping = {
            file: file + rotation_suffix
            for file in files_to_rotate
        }

        # Move all files
        rotated_files = []
        for original_name, rotated_name in filename_mapping.items():
            original_path = os.path.join(options['log_directory'], original_name)
            rotated_path = os.path.join(options['log_directory'], rotated_name)
            if not os.path.exists(rotated_path):
                os.rename(original_path, rotated_path)
                rotated_files.append(rotated_name)
            else:
                logger.warning(
                    'Did not rotate file. File called %s already existed',
                    rotated_path)

        # Run kick commands
        pids_for_processes = running_processes_by_name(
            options['reopen_file_signals'].keys())
        for process_name, signal_name in options['reopen_file_signals'].items():
            signal_id = getattr(signal, 'SIG' + signal_name.upper())
            pids = pids_for_processes[process_name]
            for pid in pids:
                kill_if_running(pid, signal_id)

        throttle_file_checks = Throttle(FILE_OPEN_CHECK_INTERVAL)
        checks_without_closed_files = 0
        s3_store = S3LogStore(options)

        # Get files which have no open handles and process them as soon as we can.
        # Files with open handles wait until next time through the loop. We throttle
        # to avoid checking too often.
        # TODO: Should we also pick up and retry copying any gz files which we could not
        # copy to s3 last time around?
        open_files = rotated_files
        while open_files:
            throttle_file_checks.wait()
            closed_files, open_files = check_for_open_files(open_files)
            for ready_file in closed_files:
                try:
                    ready_path = os.path.join(options['log_directory'], ready_file)
                    compressed_path = compress_file(ready_path)
                    s3_store.store_file(compressed_path)
                    os.unlink(compressed_path)
                except:
                    logger.error('Unexpected error processing %s', ready_file,
                                 exc_info=True)
            if len(closed_files):
                checks_without_closed_files = 0
            else:
                checks_without_closed_files += 1
            if checks_without_closed_files > MAX_CHECKS_WITHOUT_FILE_CLOSED:
                logger.error('Gave up waiting for files to close. Open files: %s'
                             % ', '.join(open_files))
                return
ldr_kz_nb = Loader_KZ_NB()
ldr_kz_bai_alfa = Loader_KZ_bai_alfa()
kz_bai_halyk_cash_ldr = Loader_KZ_bai_halyk_cash()
kz_bai_halyk_cards_ldr = Loader_KZ_bai_halyk_cards()
kz_bai_kkb_cash_ldr = Loader_KZ_bai_kkb_cash()
kz_bai_kkb_cards_ldr = Loader_KZ_bai_kkb_cards()
# here is the place for adding an instance into the loaders list

loaders_list = [
    ldr_kz_nb,
    ldr_kz_bai_alfa,
    kz_bai_halyk_cash_ldr,
    kz_bai_halyk_cards_ldr,
    kz_bai_kkb_cash_ldr,
    kz_bai_kkb_cards_ldr
]

loadedData = ''

# loop over the loaders list
for ldr in loaders_list:
    loadedData = ldr.loadDailyData(date_for_load)
    if loadedData:
        parsedData = ldr.parseDailyData(loadedData)
    else:
        logging.error("Empty loaded data")
        parsedData = None
    if parsedData:
        ldr.saveRatesData(parsedData)
    throttle.wait(ldr.get_domain())

# loc = localizator("en-us")
# logging.info(loc.get_translated_labels(["EUR","LBL000002", 12.4,"LBL000001", "LBL000005"]))
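# A hedged sketch of the loader interface the loop above relies on; the method names
# are taken from the calls in the loop, the bodies are placeholders, and the real
# Loader_KZ_* classes may define more than this.
class LoaderBase:
    def get_domain(self):
        """Return the domain used for throttling between requests."""
        raise NotImplementedError

    def loadDailyData(self, date_for_load):
        """Download the raw rates data for the given date (or return '' / None)."""
        raise NotImplementedError

    def parseDailyData(self, loaded_data):
        """Parse the raw payload into a structure saveRatesData() can store."""
        raise NotImplementedError

    def saveRatesData(self, parsed_data):
        """Persist the parsed rates."""
        raise NotImplementedError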
class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                 num_retries=DEFAULT_RETRIES, cache=DEFAULT_CACHE,
                 proxies=DEFAULT_PROXIES, opener=DEFAULT_OPENER,
                 timeout=DEFAULT_TIMEOUT):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.cache = cache
        self.opener = opener

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    result = None
        if result is None:
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'user-agent': self.user_agent}
            result = self.download(url, headers, proxy, self.num_retries)
            if self.cache:
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except urllib2.URLError as e:
            print 'Download error:', str(e)
            logging.error(str(e))
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        except (socket.error, httplib.BadStatusLine, httplib.IncompleteRead,
                socket.timeout, ssl.SSLError) as e:
            print 'Download error:', str(e)
            logging.error(str(e))
            html, code = '', None
            if num_retries > 0:
                return self.download(url, headers, proxy, num_retries - 1, data)
        return {'html': html, 'code': code}
class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    # The following login / cookie-jar code sits in the class body, so it runs once
    # at class-definition time.
    LOGIN_URL = 'http://www.jobbole.com/wp-admin/admin-ajax.php'
    LOGIN_EMAIL = 'caicai'
    LOGIN_PASSWORD = '******'
    postdata = urllib.parse.urlencode({'user_login': LOGIN_EMAIL,
                                       'user_pass': LOGIN_PASSWORD,
                                       'action': 'user_login',
                                       'remember_me': '1',
                                       'redirect_url': 'http://www.jobbole.com/'}).encode('utf-8')
    req = urllib.request.Request(LOGIN_URL, postdata)
    req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:55.0) Gecko/20100101 Firefox/55.0')
    # urllib.request.ProxyHandler(proxies=proxies)  # 'proxies' is undefined here and the handler was never installed
    # create CookieJar
    cjar = http.cookiejar.CookieJar()
    # create opener
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cjar))
    # install the opener globally
    urllib.request.install_opener(opener)
    file = opener.open(req)
    data = file.read()
    file = open('3.html', 'wb')
    file.write(data)
    file.close()

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download a URL and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            # html = urllib.request.urlopen(url).read().decode('utf-8')  # redundant second download; resp.text already holds the page
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
def link_crawler(start_url, link_regex, robots_url=None, user_agent='statista',
                 max_depth=-1, delay=3, proxies=None, num_retries=2, cache=None,
                 scraper_callback=None):
    #: Initialize a crawl queue with a seed url to start the crawl from
    crawl_queue = [start_url]
    #: keep track of seen urls
    seen = {}
    robots = {}
    throttle = Throttle(delay)
    #: start the crawl
    while crawl_queue:
        url = crawl_queue.pop()
        #: robots.txt
        robots_file_present = False
        if 'http' not in url:
            continue
        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
        #: Get the robot parser for this domain from the robots dictionary
        robot_parser = robots.get(domain)
        #: set a default robots url and a parser for it if there isn't one
        if not robot_parser and domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            robot_parser = get_robots_parser(robots_url)
            if not robot_parser:
                #: continue to crawl even if there are problems finding the robots.txt file
                robots_file_present = True
            # associate each domain with a corresponding parser, whether present or not
            robots[domain] = robot_parser
        elif domain in robots:
            robots_file_present = True
        #: crawl only when url passes robots.txt restrictions
        if robots_file_present or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip link if you have crawled it more than max depth
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)
            #: Get all links from page and filter only those matching the given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        # check if link is well formed and correct
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
class Downloader:

    def __init__(self, delay=1, user_agent='saint_data', proxy=None, cache={}):
        """
        __init__ method initializes a Downloader object
        @parameters
            user_agent: (str) user agent for request header
            cache: (dict) stores all downloaded pages
        """
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.num_retries = None  # this variable will be set later per request (in __call__ method)
        self.proxy = proxy
        self.cache = cache

    # ---------------------------------------------------------------- #
    def __call__(self, url, num_retries=2):
        """
        __call__ method downloads urls that are not found in cache
        or returns urls found in cache
        @parameters
            url: (string) web site's url
            num_retries: (int) number of retries on 5xx errors
        @returns
            result['html_code'] (string) web page's source code
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache
            # so still need to download
            self.throttle.wait(url)
            result = self.download(url, self.user_agent, num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result["html_code"]

    # ---------------------------------------------------------------- #
    def download(self, url, user_agent, num_retries):
        """
        This function downloads a website's source code.
        @parameters
            url: (str) website's url
            user_agent: (str) specifies the user_agent string
            num_retries: (int) if a download fails due to a problem with the request (4xx)
                         or the server (5xx), the function calls itself recursively
                         num_retries times
        @returns
            html_code: (str or None) html code of web site or None if no code is returned
        """
        print("Downloading %s ... " % url)
        # set user-agent for this request
        headers = {'User-Agent': user_agent}
        try:
            resp = requests.get(url, headers=headers, proxies=self.proxy)
            # retrieve content
            html_code = resp.text
            # save the request's status code
            code = resp.status_code
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html_code = None
                if num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    print("retry")
                    self.throttle.wait(url)
                    return self.download(url, user_agent, num_retries - 1)
        except RequestException as e:
            print('Download Exception error:', e)
            html_code = None
            code = e.errno
        return {'html_code': html_code, 'code': code}
class Downloader:
    """ Downloader class to use cache and requests for downloading pages.
        For constructor, pass:
            delay (int): # of secs delay between requests (default: 5)
            user_agent (str): user agent string (default: 'wswp')
            proxies (list[dict]): list of possible proxies, each
                must be a dict with http / https keys and proxy values
            cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
            timeout (float/int): number of seconds to wait until timeout
    """
    def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, timeout=60):
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.cache = cache
        self.num_retries = None  # we will set this per request
        self.timeout = timeout

    def __call__(self, url, num_retries=2):
        """ Call the downloader class, which will return HTML from cache
            or download it
            args:
                url (str): url to download
            kwargs:
                num_retries (int): # times to retry if 5xx code (default: 2)
        """
        self.num_retries = num_retries
        try:
            result = self.cache[url]
            print('Loaded from cache:', url)
        except KeyError:
            result = None
        if result and self.num_retries and 500 <= result['code'] < 600:
            # server error so ignore result from cache
            # and re-download
            result = None
        if result is None:
            # result was not loaded from cache, need to download
            self.throttle.wait(url)
            proxies = choice(self.proxies) if self.proxies else None
            headers = {'User-Agent': self.user_agent}
            result = self.download(url, headers, proxies)
            self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxies):
        """ Download a URL and return the page content
            args:
                url (str): URL
                headers (dict): dict of headers (like user_agent)
                proxies (dict): proxy dict w/ keys 'http'/'https', values
                    are strs (i.e. 'http(s)://IP') (default: None)
        """
        print('Downloading:', url)
        try:
            resp = requests.get(url, headers=headers, proxies=proxies,
                                timeout=self.timeout)
            html = resp.text
            if resp.status_code >= 400:
                print('Download error:', resp.text)
                html = None
                if self.num_retries and 500 <= resp.status_code < 600:
                    # recursively retry 5xx HTTP errors
                    self.num_retries -= 1
                    return self.download(url, headers, proxies)
        except requests.exceptions.RequestException as e:
            print('Download error:', e)
            return {'html': None, 'code': 500}
        return {'html': html, 'code': resp.status_code}
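# A hedged usage sketch, assuming a plain dict as the cache and no proxies; a second
# call for the same URL should be answered from the cache instead of re-requesting.
if __name__ == '__main__':
    cache = {}
    D = Downloader(delay=5, user_agent='wswp', cache=cache)
    html = D('http://example.com', num_retries=2)
    html_again = D('http://example.com')  # prints 'Loaded from cache: ...'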