def main(max_threads):
    """Run process_crawler from the callback's seed URL with an emptied MongoCache.

    max_threads is forwarded to process_crawler unchanged.
    """
    callback = AlexaCallback()

    # Empty the cache before crawling.
    mongo_cache = MongoCache()
    mongo_cache.clear()

    crawl_options = dict(
        scrape_callback=callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )
    process_crawler(callback.seed_url, **crawl_options)
def main(max_threads):
    """Run process_crawler from the AlexaCallback seed URL using an emptied MongoCache.

    max_threads is forwarded to process_crawler unchanged.
    """
    # Function-scope imports: these modules are only loaded when main() runs.
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback

    callback = AlexaCallback()

    mongo_cache = MongoCache()
    mongo_cache.clear()

    # process_crawler
    process_crawler(
        callback.seed_url,
        scrape_callback=callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=10,
    )
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads.

    seed_url is pushed onto a MongoQueue and worker threads pop URLs from
    that queue until it raises KeyError. Each URL is downloaded, passed to
    scrape_callback (if given), and any links the callback returns are
    normalised against seed_url and queued.

    NOTE(review): delay, cache, user_agent, proxies, num_retries and timeout
    are accepted but never read in this body; the downloader is configured
    from the module-level DEFAULT_* constants instead. Confirm whether that
    is intentional before relying on these arguments.
    """
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                   proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
                   num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        """Worker loop: download queued URLs until the queue has none to hand out."""
        while True:
            # keep track that are processing url
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    print('Error in callback for: {}: {}'.format(url, e))
                else:
                    for link in links:
                        # add this new link to queue
                        crawl_queue.push(normalize(seed_url, link))
            # Look the status code up once; 5xx and -999 put the URL back on
            # the queue, anything else marks it as done.
            code = webpage_cache[url]['code']
            if 500 <= code < 600 or code == -999:
                crawl_queue.reset(url)
            else:
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # Rebuild the list instead of calling remove() while iterating it,
        # which skips the element following each removed thread.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            # set daemon so main thread can exit when receives ctrl-c
            thread.daemon = True
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)
def __init__(self, mongo_host, mongo_port):
    """Build the Last.fm and Spotify services, each with its own lock and cache.

    Both services use the 'music_tour' database on the given MongoDB host
    and port; the caches are created with a timedelta of 24 weeks.
    """
    database = 'music_tour'
    mongo = (mongo_host, mongo_port)

    # Last.fm: lock first, then cache, then the service that uses both.
    last_lock = SimpleMongoServiceLock(*mongo + (database, 'last_lock', 1, 30))
    last_cache = MongoCache(*mongo + (database, 'last_cache', timedelta(weeks=24)))
    self.last_fm = LastFmService(last_cache, last_lock)

    # Spotify: same construction order.
    spotify_lock = SimpleMongoServiceLock(*mongo + (database, 'spotify_lock', 1, 30))
    spotify_cache = MongoCache(*mongo + (database, 'spotify_cache', timedelta(weeks=24)))
    self.spotify = SpotifyMetaService(spotify_cache, spotify_lock)
def main():
    """Run link_crawler from the callback's seed URL with a zero-expiry MongoCache."""
    callback = AlexaCallback()
    # expires is an empty timedelta, i.e. a zero-length duration.
    zero_expiry_cache = MongoCache(expires=timedelta())
    #cache.clear()
    crawl_options = {
        'scrape_callback': callback,
        'cache': zero_expiry_cache,
    }
    link_crawler(callback.seed_url, **crawl_options)
def main(max_threads=5):
    """Crawl pages 1..1188 derived from the seed URL, once per hour inside a time window.

    The crawl only runs when the current hour is before 3 or after 12;
    otherwise the time is printed and the loop sleeps for another hour.
    The loop never terminates on its own.
    """
    callback = AlexaCallback()
    page_cache = MongoCache()
    job_queue = MongoQueue()

    # Drop the last two characters of the seed URL and append "<page>/".
    url_prefix = callback.seed_url[0:-2]
    urls = [url_prefix + str(page) + '/' for page in range(1, 1189, 1)]

    browser_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'

    while True:
        now = datetime.now()
        outside_window = not (now.hour < 3 or now.hour > 12)
        if outside_window:
            print('pass:' + str(now))
        else:
            job_queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=callback,
                cache=page_cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(callback.seed_url).netloc,
                user_agent=browser_agent,
            )
        time.sleep(3600)
def __init__(self, cache=None):
    """Initialise page counters, URLs, request headers and the cache.

    cache: object stored on self.cache. When omitted (or None) a new
        MongoCache is created for this instance. The previous default,
        ``cache=MongoCache()``, was evaluated once when the function was
        defined, so it ran at import time and was shared by every instance.
    """
    self.max_page = 0
    self.base_url = configs.MAIN_PAGE_URL
    self.page_url = configs.EACH_PAGE_URL
    self.headers = self._load_headers()
    self.video_headers = self._load_headers('headers/video_headers')
    self.cache = MongoCache() if cache is None else cache
def main():
    """Run threaded_crawler and print the seconds component of the elapsed time."""
    started_at = datetime.datetime.now()

    callback = AlexaCallback()
    # Constructed as in the original but not passed to the crawler.
    cache = MongoCache()
    #cache.clear()
    threaded_crawler(
        'http://example.webscraping.com',
        callback.seed_url,
        scrape_callback=callback,
    )

    elapsed = datetime.datetime.now() - started_at
    # timedelta.seconds is the seconds field only (days are not included).
    print(elapsed.seconds)
def test_cache_expired(self):
    """An entry written to a zero-expiry cache raises KeyError after 61 seconds."""
    expiring_cache = MongoCache(expires=timedelta())
    # every 60 seconds the cache is purged
    # http://docs.mongodb.org/manual/core/index-ttl/
    expiring_cache[self.url] = self.result
    sleep(61)
    self.assertRaises(KeyError, lambda: expiring_cache[self.url])
def main():
    """Run link_crawler from the callback's seed URL, ignoring robots rules."""
    callback = AlexaCallback()
    mongo_cache = MongoCache()
    # cache.clear()
    crawl_options = dict(
        scrape_callback=callback,
        cache=mongo_cache,
        timeout=10,
        ignore_robots=True,
    )
    link_crawler(callback.seed_url, **crawl_options)
def __call__(self, url, html):
    """Return up to self.max_urls websites from the seed CSV that are not cached.

    The file at self.seed_url is read as CSV rows of two columns; the
    second column is the website. url and html are accepted for the
    callback signature but are not used here.
    """
    urls = []
    cache = MongoCache()
    # Open the CSV in a context manager so the handle is closed even when
    # the loop stops early or a row fails to unpack.
    with open(self.seed_url) as seed_file:
        for _, website in csv.reader(seed_file):
            if website not in cache:
                urls.append(website)
                if len(urls) == self.max_urls:
                    break
    return urls
def main(max_threads):
    """Run threaded_crawler against example.webscraping.com with a MongoCache.

    max_threads is forwarded to threaded_crawler unchanged.
    """
    mongo_cache = MongoCache()
    # cache.clear()

    # link_crawler is called here, and whatever it returns is handed over as
    # the scrape callback.
    callback = link_crawler('http://example.webscraping.com')
    threaded_crawler(
        seed_url='http://example.webscraping.com',
        scrape_callback=callback,
        cache=mongo_cache,
        max_threads=max_threads,
        timeout=0,
    )
def test():
    """Run process_crawler over the Alexa top-sites listing with 8 threads."""
    start_url = 'http://www.alexa.com/topsites/global;0'
    mongo_cache = MongoCache()
    callback = AlaxeCallback(allow_domains=[start_url])
    crawl_options = dict(
        link_regex='/topsites/global;',
        cache=mongo_cache,
        scrape_callback=callback,
        max_threads=8,
        timeout=5,
    )
    process_crawler(start_url, **crawl_options)
def main():
    """Run crawler from the callback's seed URL through a single local proxy."""
    callback = AlexaCallback()
    mongo_cache = MongoCache()
    # cache.clear()
    proxy_list = ['127.0.0.1:8118']
    crawler(
        callback.seed_url,
        proxies=proxy_list,
        scrape_callback=callback,
        cache=mongo_cache,
    )
# Sentinel for "caller did not pass a cache". It keeps an explicit cache=None
# distinguishable from the default, so None is still forwarded to Downloader.
_DEFAULT_CACHE = object()


def threaded_crawler(seed_url, delay=5, cache=_DEFAULT_CACHE,
                     scrape_callback=None, user_agent='wswp', proxies=None,
                     num_retries=1, max_threads=10, timeout=10):
    """Crawl using multiple threads.

    seed_url is pushed onto a freshly cleared MongoQueue. Worker threads pop
    URLs until the queue raises KeyError, download each one, hand it to
    scrape_callback (if given) and queue any links the callback returns.

    cache: forwarded to Downloader. When omitted, a new MongoCache is created
        on each call; the old ``cache=MongoCache()`` default was evaluated
        once at definition time, i.e. on import.
    delay, user_agent, proxies, num_retries, timeout: forwarded to Downloader.
    max_threads: upper bound on simultaneously running worker threads.
    """
    if cache is _DEFAULT_CACHE:
        cache = MongoCache()

    # the queue of url's that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        """Worker loop: download queued URLs until the queue has none to hand out."""
        while True:
            # keep track that are processing url
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            html = D(url)
            if scrape_callback:
                try:
                    links = scrape_callback(url, html) or []
                except Exception as e:
                    # Single-argument print(...) behaves the same on Python 2 and 3.
                    print('Error in callback for; {}:{}'.format(url, e))
                else:
                    for link in links:
                        # add this new link to queue
                        crawl_queue.push(link)
            crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        # Rebuild the list instead of calling remove() while iterating it,
        # which skips the element following each removed thread.
        threads = [thread for thread in threads if thread.is_alive()]
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.daemon = True
            thread.start()
            threads.append(thread)
        # pause for SLEEP_TIME before polling the threads and queue again
        time.sleep(SLEEP_TIME)
def __call__(self, url, html):
    """Extract uncached site URLs from the zipped CSV downloaded for the seed URL.

    Returns None for any url other than self.seed_url. Otherwise html is
    opened as a zip archive, its first member is read as two-column CSV, and
    up to self.max_urls entries of the form 'http://<website>' that are not
    already in the MongoCache are returned.
    """
    if url != self.seed_url:
        return None

    urls = []
    cache = MongoCache()
    with ZipFile(StringIO(html)) as archive:
        first_member = archive.namelist()[0]
        for _, website in csv.reader(archive.open(first_member)):
            candidate = 'http://' + website
            if candidate in cache:
                continue
            urls.append(candidate)
            if len(urls) == self.max_urls:
                break
    return urls
def test(max_threads):
    """Run threaded_crawler over the Alexa top-sites listing.

    max_threads is forwarded to threaded_crawler unchanged.
    """
    start_url = 'http://www.alexa.com/topsites/global;0'
    callback = AlaxeCallback(allow_domains=[start_url])
    mongo_cache = MongoCache()
    # start_url = 'http://www.eastday.com'
    # start_url = 'http://www.qq.com'
    crawl_options = dict(
        link_regex='/topsites/global;',
        cache=mongo_cache,
        scrape_callback=callback,
        max_threads=max_threads,
        timeout=5,
    )
    threaded_crawler(start_url, **crawl_options)
def __call__(self, url, html):
    """Extract uncached site URLs from the zipped CSV downloaded for the seed URL.

    Returns None for any url other than self.seed_url. Otherwise
    html.content is opened as a zip archive, its first member is decoded as
    UTF-8 and read as two-column CSV, and up to self.max_urls entries of the
    form 'http://<website>' that are not already in the MongoCache are
    returned.
    """
    if url != self.seed_url:
        return None

    urls = []
    cache = MongoCache()
    with ZipFile(BytesIO(html.content)) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.open(first_member).read()
        rows = csv.reader(StringIO(raw_bytes.decode('utf-8')))
        for _, website in rows:
            candidate = 'http://' + website
            if candidate in cache:
                continue
            urls.append(candidate)
            if len(urls) == self.max_urls:
                break
    return urls
def com_alexa():
    """Download some popular site URLs from the Alexa top-sites listing."""
    start_url = 'http://www.alexa.com/topsites/global;0'
    # NOTE(review): allow_domains receives a bare string here, not a list;
    # confirm AlaxeCallback accepts that.
    scrape_callback = AlaxeCallback(allow_domains=start_url)
    link_crawler(
        start_url,
        link_regex='/topsites/global;',
        delay=3,
        only_same_host=False,
        save_cache=False,
        max_urls=100,
        cache=MongoCache(),
        scrape_callback=scrape_callback,
        timeout=3,
    )
    # Drop the local reference to the callback once the crawl returns.
    del scrape_callback
def __init__(self, output_dir, start_date, end_date,
             chosen_program=None, use_cache=False):
    """Store the scrape settings and build the Downloader.

    output_dir, start_date, end_date and chosen_program are stored on the
    instance unchanged. When use_cache is true the Downloader is given a
    MongoCache; otherwise it is given None.
    """
    self.output_dir = output_dir
    self.start_date = start_date
    self.end_date = end_date
    self.chosen_program = chosen_program

    self.base_url = 'https://www.byte.fm'
    self.header = ["program", "date", "title", "artist", "album", "label"]
    self.parser = HTMLParser()

    cache = None
    if use_cache:
        # Imported only on this path, so mongo_cache is not needed otherwise.
        from mongo_cache import MongoCache
        cache = MongoCache()
    self.Downloader = Downloader(cache=cache)
def main(max_threads = 5):
    """Build catalogue URLs from the stored book links and crawl them hourly.

    Every document in the ``books`` collection of the local ``cache``
    database contributes one URL, rewritten according to the host of the
    callback's seed URL. The crawl runs only when the current hour is before
    3 or after 12, and the loop sleeps for an hour between checks. Returns
    early if the collection yields no links; otherwise never returns.
    """
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    client = MongoClient('localhost', 27017, connect=False)
    # create collection to store cached webpages,
    # which is the equivalent of a table in a relational database
    db = client.cache

    seed_url = catlog_callback.seed_url
    host = urlparse.urlparse(seed_url).netloc

    urls = []
    # Iterate the cursor directly: pymongo documents that next() may still
    # raise StopIteration while cursor.alive is true, so the previous
    # `while cursor.alive: cursor.next()` loop could crash at the end.
    for book in db.books.find():
        link = book['link']
        if host == 'www.junzige.la':
            link = normalize(seed_url, '/novel' + link[5:-4] + '/')
        elif host == 'www.boluoxs.com':
            # Keep only the file stem between the last '/' and the last '.'.
            stem = link[link.rfind('/') + 1:link.rfind('.')]
            link = 'http://www.boluoxs.com/biquge/0/' + stem + '/'
        print(link)
        urls.append(link)

    if not urls:
        # Nothing to crawl; urls[0] below would raise IndexError.
        print('no book links found')
        return
    print(urls[0])

    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=catlog_callback, cache=cache,
                            max_threads=max_threads, timeout=30, host=host,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36')
            # every time finished, clear the job queue
            queue.clear()
        else:
            print('pass:' + str(now))
        time.sleep(3600)
def setUp(self):
    """Create the MongoCache under test with default_timeout set to 0."""
    cache_options = {'default_timeout': 0}
    self.cache = MongoCache(**cache_options)
else: stop = 1 # 该链表页下的所有详情页为空, 不再增加链表页 if 'top250' in url and stop == 0: page_size += 25 next_link = form_url.format(page_size) if next_link not in seen: seen.add(next_link) crawl_queue.append(next_link) # 等待所有的下载线程结束 threads = [] while threads or crawl_queue: for thread in threads: if not thread.is_alive(): # 移除已经停止的进程 threads.remove(thread) while len(threads) < max_threads and crawl_queue: # 开始更多的线程 thread = threading.Thread(target=process_queue) thread.setDaemon(True) thread.start() threads.append(thread) time.sleep(np.random.randint(6, 12)) if __name__ == '__main__': Scrape_Back = GetDetailInfo Cache = MongoCache() Cache.clear() threaded_crawler(scrape_callback=Scrape_Back, cache=Cache)
class TestCache(unittest.TestCase):
    """Tests for MongoCache get/set/add/delete/inc/dec and the *_many helpers."""

    def setUp(self):
        self.cache = MongoCache(default_timeout=0)

    def tearDown(self):
        # Empty the backing collection so tests do not see each other's keys.
        self.cache.collection.delete_many({})

    def _count_ids(self, keys):
        """Return how many stored documents have an _id among *keys*."""
        # list() matters on Python 3: dict.keys() is a view object there and
        # cannot be BSON-encoded inside the $in filter.
        # NOTE(review): Collection.count is deprecated in newer pymongo
        # releases; switch to count_documents once the minimum supported
        # pymongo version allows it.
        return self.cache.collection.count({'_id': {'$in': list(keys)}})

    def test_get(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_delete_existing(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        self.assertTrue(self.cache.delete('key-1'))

    def test_delete_not_existing(self):
        self.assertFalse(self.cache.delete('key-1'))

    def test_set(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        xc = self.cache.get('key-1')
        self.assertEqual(x, xc)

    def test_add_not_existing(self):
        x = MockData(1)
        added = self.cache.add('key-1', x)
        self.assertTrue(added)

    def test_add_existing(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        y = MockData(2)
        added = self.cache.add('key-1', y)
        self.assertFalse(added)

    def test_clear(self):
        x = MockData(1)
        self.cache.set('key-1', x)
        cleared = self.cache.clear()
        xc = self.cache.get('key-1')
        self.assertTrue(cleared)
        self.assertIsNone(xc)

    def test_set_overwrite(self):
        x1 = MockData(1)
        key = 'key-set-overwrite'
        self.cache.set(key, x1)
        x2 = MockData(2)
        self.cache.set(key, x2)
        # Setting the same key twice must leave exactly one document.
        _filter = {'_id': key}
        count_keys = self.cache.collection.count(_filter)
        self.assertEqual(1, count_keys)

    def test_inc_with_exist_key(self):
        value = 10
        key = 'key-inc-with-exist-key'
        self.cache.set(key, value)
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        result = delta + value
        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_inc_witho_exist_key(self):
        key = 'key-inc-without-exist-key'
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(delta, value_cache)
        self.assertEqual(delta, new_value)

    def test_inc_with_error(self):
        # Incrementing a non-numeric value leaves it untouched and yields None.
        value = MockData(1)
        key = 'key-inc-with-error'
        self.cache.add(key, value)
        delta = 9
        new_value = self.cache.inc(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(value, value_cache)
        self.assertEqual(None, new_value)

    def test_has_with_add_key(self):
        key = 'key-has-with-add-key'
        value = MockData(1)
        self.cache.add(key, value)
        has_key = self.cache.has(key)
        self.assertTrue(has_key)

    def test_has_without_add_key(self):
        key = 'key-has-without-add-key'
        has_key = self.cache.has(key)
        self.assertFalse(has_key)

    def test_dec_with_exist_key(self):
        value = 10
        key = 'key-dec-with-exist-key'
        self.cache.set(key, value)
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        result = value - delta
        self.assertEqual(result, value_cache)
        self.assertEqual(result, new_value)

    def test_dec_witho_exist_key(self):
        key = 'key-dec-without-exist-key'
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(-delta, value_cache)
        self.assertEqual(-delta, new_value)

    def test_dec_with_error(self):
        # Decrementing a non-numeric value leaves it untouched and yields None.
        value = MockData(1)
        key = 'key-dec-with-error'
        self.cache.add(key, value)
        delta = 9
        new_value = self.cache.dec(key, delta)
        value_cache = self.cache.get(key)
        self.assertEqual(value, value_cache)
        self.assertEqual(None, new_value)

    def test_get_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 11)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        values = self.cache.get_many(*key_x_value.keys())
        self.assertEqual(10, len(values))
        # keys() and values() of the same unmodified dict iterate in matching
        # order, so results are compared position by position.
        for _return, _value in zip(values, key_x_value.values()):
            self.assertEqual(_value, _return)

    def test_get_dict(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        results = self.cache.get_dict(*key_x_value.keys())
        self.assertIsInstance(results, dict)
        for key, value in key_x_value.items():
            self.assertIn(key, results)
            self.assertEqual(key_x_value[key], results[key])

    def test_delete_many(self):
        key_x_value = {'key-%s' % i: MockData(i) for i in range(1, 6)}
        for key, value in key_x_value.items():
            self.cache.add(key, value)
        self.assertEqual(5, self._count_ids(key_x_value))
        result = self.cache.delete_many(*key_x_value.keys())
        self.assertTrue(result)
        self.assertEqual(0, self._count_ids(key_x_value))

    def test_set_many(self):
        key_x_value = {'key-set-many-%s' % i: MockData(i) for i in range(1, 6)}
        result = self.cache.set_many(key_x_value)
        self.assertTrue(result)
        self.assertEqual(5, self._count_ids(key_x_value))
import pandas as pd
import re
import numpy as np
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader
from lxml import etree

# Downloader settings used for the module-level downloader below.
DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

crawl_queue = MongoQueue()
webpage_cache = MongoCache()

# The downloader gets its own MongoCache instance, separate from webpage_cache.
D = Downloader(
    delay=DEFAULT_DELAY,
    user_agent=DEFAULT_AGENT,
    proxies=DEFAULT_PROXY_LIST,
    cookies=DEFAULT_COOKIE,
    num_retries=DEFAULT_RETRIES,
    timeout=DEFAULT_TIMEOUT,
    opener=None,
    cache=MongoCache(),
)


def usere(regex, getcontent):
    """Return every match of regex in getcontent, as produced by findall."""
    # helper that applies a regular expression
    return re.findall(re.compile(regex), getcontent)


#Obtain target urls
startdate = '20180414'
enddate = '20180415'
"""Initialize robots parser for this domain """ rp = robotparser.RobotFileParser() rp.set_url(urlparse.urljoin(url, '/robots.txt')) rp.read() return rp def get_links(html): """Return a list of links from html """ # a regular expression to extract all links from the webpage webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE) # list of all links from the webpage return webpage_regex.findall(html) if __name__ == '__main__': link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler') link_crawler('http://example.webscraping.com', '/places/default/view', delay=0, num_retries=1, max_depth=10, user_agent='GoodCrawler', cache=MongoCache(expires=datetime.timedelta()))
def get_links(html):
    """Return a list of links from html.

    A link is the quoted value of the href attribute of any <a ...> tag;
    matching is case-insensitive. Returns an empty list when none match.
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    from mongo_cache import MongoCache

    class CallBack:
        """Scrape callback that appends every crawled URL to a log file."""

        def __init__(self, filename='log.txt'):
            # 'w+' truncates any existing log on start.
            self.file = open(filename, 'w+')

        def __call__(self, url, html):
            self.file.write("{}\n".format(url))

        def close(self):
            """Flush and close the log file."""
            self.file.close()

    cache = MongoCache()
    callback = CallBack()
    try:
        link_crawler('http://example.webscraping.com/places/default',
                     '/places/default/(index|view)', delay=1, num_retries=1,
                     max_depth=3, user_agent='GoodCrawler', cache=cache,
                     scrape_callback=callback)
    finally:
        # The log file was previously never closed; close it even if the
        # crawl raises so buffered URLs reach disk.
        callback.close()
def main():
    """Run link_crawler from the callback's seed URL with an emptied MongoCache."""
    callback = AlexaCallback()

    # Empty the cache before crawling.
    mongo_cache = MongoCache()
    mongo_cache.clear()

    crawl_options = dict(
        scrape_callback=callback,
        cache=mongo_cache,
        user_agent='GoodCrawler',
        ignore_robots=True,
    )
    link_crawler(callback.seed_url, **crawl_options)
# -*- coding: utf-8 -*-
import time
from datetime import timedelta
from pymongo import MongoClient
from mongo_cache import MongoCache

# Round-trip check: store a record in an emptied cache and read it back.
cache = MongoCache()
cache.clear()
url = 'http://example.webscraping.comasdf'
result = {'html': '...'}
cache[url] = result
stored_html = cache[url]['html']
print(stored_html == result['html'])

# Expiry check: store the record in a zero-expiry cache, wait 60 seconds,
# then read it again.
cache = MongoCache(expires=timedelta())
cache[url] = result
time.sleep(60)
print(cache[url])
link = normalize(seed_url,link) if link not in seen: seen[link] = depth+1 if same_domain(link,seed_url): crawl_queue.append(link) #print('crawl_queue=',crawl_queue) num +=1 print('num=',num) num_urls +=1 if num_urls == max_urls: break def get_links(html): webpage_regex=re.compile('<a href="position.php\?(.*?)"',re.IGNORECASE) #print('webpage_regex.findall(html)=',webpage_regex.findall(html)) return webpage_regex.findall(html) def normalize(seed_url,link): link,_=urllib.parse.urldefrag(link) return urllib.parse.urljoin(seed_url,link) def same_domain(url_1,url_2): return urllib.parse.urlparse(url_1).netloc == urllib.parse.urlparse(url_2).netloc link_crawler('https://hr.tencent.com/position.php?keywords=python', 'keywords=python&start=', cache=MongoCache())
from fangjia_thread_crawler import thread_crawler from fangjia_cb import FangjiaCallback from mongo_cache import MongoCache from downloader import Downloader from fangjia2 import get_search from fangjia2 import get_info_list from fangjia2 import download import pandas as pd import cPickle import os if __name__ == '__main__': # get the seed_urls starttime = datetime.datetime.now() seed_urls = [] cache = MongoCache() # cache all pages if os.path.exists('seed_urls.pkl'): with open('seed_urls.pkl', 'rb') as fp: seed_urls = cPickle.load(fp) else: base_url = r'http://cd.fangjia.com/ershoufang/' search_list = [] # 房源信息url列表 tmp_list = [] # 房源信息url缓存列表 layer = -1 # 一级筛选 #D = Downloader(cache=cache) page = download(base_url) search_dict = get_search(page, 'r-') # 二级筛选 for k in search_dict: print u'****************一级抓取:正在抓取【%s】***************' % k
class TestTimeout(unittest.TestCase):
    """Expiry tests for MongoCache, driven by a controllable clock.

    Every test receives ``mock_time`` and sets its ``return_value`` to move
    the clock. NOTE(review): mock_time is presumably injected by a patch
    decorator applied outside the visible code; confirm.
    """

    def setUp(self):
        self.cache = MongoCache()

    def tearDown(self):
        # Empty the backing collection between tests.
        self.cache.collection.delete_many({})

    def test_set(self, mock_time):
        # Stored at t=100 with timeout=300, so 'expires' must be 400.
        mock_time.return_value = 100
        key = 'key-set'
        self.cache.set(key, MockData(1), timeout=300)
        stored = self.cache.collection.find_one({'_id': key})
        self.assertIn('expires', stored)
        self.assertEqual(400, stored['expires'])
        self.assertIn('value', stored)

    def test_get_not_expired(self, mock_time):
        # Stored at t=100 with timeout=100, read at t=150.
        mock_time.return_value = 100
        key = 'key-not-expired'
        self.cache.set(key, MockData(1), timeout=100)
        mock_time.return_value = 150
        fetched = self.cache.get(key)
        self.assertIsNotNone(fetched)
        self.assertEqual(fetched, MockData(1))

    def test_get_timeout_0(self, mock_time):
        # A value stored with timeout=0 is still readable.
        mock_time.return_value = 100
        key = 'key-not-expired'
        self.cache.set(key, MockData(1), timeout=0)
        fetched = self.cache.get(key)
        self.assertIsNotNone(fetched)
        self.assertEqual(fetched, MockData(1))

    def test_get_expired(self, mock_time):
        # Stored at t=100 with timeout=100, read at t=201.
        mock_time.return_value = 100
        key = 'key-get-expired'
        self.cache.set(key, MockData(1), timeout=100)
        mock_time.return_value = 201
        self.assertIsNone(self.cache.get(key))

    def test_get_many_expired(self, mock_time):
        # At t=150 the 1-second entry has expired and the 100-second one has not.
        short_lived = 'key-timeout-1'
        long_lived = 'key-timeout-100'
        mock_time.return_value = 100
        self.cache.set(short_lived, MockData(1), timeout=1)
        self.cache.set(long_lived, MockData(1), timeout=100)
        mock_time.return_value = 150
        first, second = self.cache.get_many(short_lived, long_lived)[:2]
        self.assertIsNone(first)
        self.assertIsNotNone(second)
def setUp(self):
    """Create the MongoCache under test with its default arguments."""
    fresh_cache = MongoCache()
    self.cache = fresh_cache
def link_crawler(seed_url,link_regex_large,link_regex_small,max_depth=2,max_threads=5):
    """Crawl from seed_url with worker threads and record matching links in a CSV.

    Each downloaded page is searched with link_regex_large. A found link that
    also matches link_regex_small (anchored at its start, via re.match) is
    written to the CSV file and printed; any other found link is queued at
    depth + 1. Pages whose queue depth exceeds max_depth are not downloaded.

    seed_url: first URL pushed onto the queue.
    link_regex_large: pattern passed to re.findall on each page.
    link_regex_small: pattern passed to re.match on each found link.
    max_depth: deepest queue depth that is still downloaded.
    max_threads: upper bound on simultaneously running worker threads.
    """
    # Function-scope import: the wait loop below sleeps between polls.
    import time

    # Formatted single-argument print gives the same output on Python 2 and 3.
    print('seed_ur %s' % (seed_url,))
    # No separate "seen" set is kept; per the original author's note,
    # Mongo_Queue takes care of duplicate URLs.
    crawl_queue = Mongo_Queue()
    crawl_queue.push(seed_url)
    depth = crawl_queue.get_item(seed_url)['depth']
    print('seedurldepth: %s' % (depth,))
    cache = MongoCache()
    D = Download(cache=cache)

    csvFile = open('D:/Work/Projects/realestate/app/static/163_money.csv', 'wb')
    try:
        writer = csv.writer(csvFile)

        def process_queue():
            """Worker loop: pop URLs, download them and dispatch their links."""
            while True:
                try:
                    url = crawl_queue.pop()
                except KeyError:
                    # no url in crawl_queue
                    break
                depth = crawl_queue.get_item(url)['depth']
                if depth <= max_depth:
                    html = D(url)
                    for link in re.findall(link_regex_large, html):
                        if re.match(link_regex_small, link):
                            # NOTE(review): writer is shared by all worker
                            # threads without a lock; confirm rows cannot
                            # interleave.
                            writer.writerow((link, ''))
                            print(link)
                        else:
                            crawl_queue.push(link, depth + 1)
                crawl_queue.complete(url)

        threads = []
        while crawl_queue or threads:
            while len(threads) < max_threads and crawl_queue:
                # can start some more threads
                thread = threading.Thread(target=process_queue)
                # daemon must be set before start(); it lets the main thread
                # exit on ctrl-c
                thread.daemon = True
                thread.start()
                threads.append(thread)
            # Rebuild the list instead of calling remove() while iterating it,
            # which skips the element following each removed thread.
            threads = [thread for thread in threads if thread.is_alive()]
            # Short pause so this loop does not spin a CPU core while waiting.
            time.sleep(0.1)
    finally:
        # Close the CSV even if the crawl is interrupted or raises.
        csvFile.close()