def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=60):
    """Crawl using multiple threads."""
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    # note: this variant downloads with the module-level DEFAULT_* settings
    # rather than the arguments above
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                   proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
                   num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
                   opener=None, cache=MongoCache())

    def process_queue():
        while True:
            try:
                # claim the next URL so no other thread processes it
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(normalize(seed_url, link))
                # re-queue on server errors (5xx) or download failures (-999)
                if 500 <= webpage_cache[url]['code'] < 600 or webpage_cache[url]['code'] == -999:
                    crawl_queue.reset(url)
                else:
                    crawl_queue.complete(url)

    # spawn threads and wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon so the main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)

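# threaded_crawler above leans on MongoQueue to coordinate the worker threads.
# Below is a minimal sketch of the interface it assumes (push/pop/peek/
# complete/reset, plus truthiness for "anything left to do?"), modeled on the
# well-known wswp book example; the real implementation may differ.
from datetime import datetime

from pymongo import MongoClient, errors


class MongoQueueSketch:
    # possible states of a queued URL
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, timeout=300):
        self.db = MongoClient('localhost', 27017).cache
        self.timeout = timeout

    def __bool__(self):
        """True while any URL is still outstanding or being processed."""
        record = self.db.crawl_queue.find_one({'status': {'$ne': self.COMPLETE}})
        return bool(record)

    def push(self, url):
        """Add a URL if it has not been seen before."""
        try:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})
        except errors.DuplicateKeyError:
            pass  # already queued at some point; ignore

    def pop(self):
        """Atomically claim an outstanding URL, or raise KeyError."""
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.utcnow()}})
        if record:
            return record['_id']
        raise KeyError()

    def peek(self):
        record = self.db.crawl_queue.find_one({'status': self.OUTSTANDING})
        return record['_id'] if record else None

    def complete(self, url):
        self.db.crawl_queue.update_one({'_id': url},
                                       {'$set': {'status': self.COMPLETE}})

    def reset(self, url):
        """Return a failed URL to the outstanding pool so it is retried."""
        self.db.crawl_queue.update_one({'_id': url},
                                       {'$set': {'status': self.OUTSTANDING}})
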
def __init__(self, mongo_host, mongo_port):
    last_lock = SimpleMongoServiceLock(mongo_host, mongo_port, 'music_tour', 'last_lock', 1, 30)
    self.last_fm = LastFmService(
        MongoCache(mongo_host, mongo_port, 'music_tour', 'last_cache', timedelta(weeks=24)),
        last_lock)
    spotify_lock = SimpleMongoServiceLock(mongo_host, mongo_port, 'music_tour', 'spotify_lock', 1, 30)
    self.spotify = SpotifyMetaService(
        MongoCache(mongo_host, mongo_port, 'music_tour', 'spotify_cache', timedelta(weeks=24)),
        spotify_lock)

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache(expires=timedelta())
    # cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache)

def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    urls = []
    # strip the trailing page number from the seed URL to build a URL template
    template = scrape_callback.seed_url[0:-2]
    for i in range(1, 1189):
        urls.append(template + str(i) + '/')
    while True:
        now = datetime.now()
        # only crawl outside the 03:00-12:00 window
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=scrape_callback, cache=cache,
                            max_threads=max_threads, timeout=30,
                            host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                                       'Chrome/54.0.2840.98 Safari/537.36')
        else:
            print 'pass:' + str(now)
        time.sleep(3600)

def __init__(self, cache=None):
    self.max_page = 0
    self.base_url = configs.MAIN_PAGE_URL
    self.page_url = configs.EACH_PAGE_URL
    self.headers = self._load_headers()
    self.video_headers = self._load_headers('headers/video_headers')
    # avoid a mutable default argument: a `cache=MongoCache()` default would be
    # created once at import time and shared by every instance
    self.cache = cache if cache is not None else MongoCache()

def test_cache_expired(self):
    cache = MongoCache(expires=timedelta())
    # the TTL monitor purges expired records roughly every 60 seconds
    # http://docs.mongodb.org/manual/core/index-ttl/
    cache[self.url] = self.result
    sleep(61)
    with self.assertRaises(KeyError):
        cache[self.url]

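# For reference, a minimal sketch of the MongoCache interface exercised by the
# tests above, modeled on the wswp book example (the real implementation may
# differ): a dict-like wrapper over a MongoDB collection with a TTL index.
from datetime import datetime, timedelta

from pymongo import MongoClient


class MongoCacheSketch:
    def __init__(self, expires=timedelta(days=30)):
        self.db = MongoClient('localhost', 27017).cache
        # MongoDB's TTL monitor removes documents whose `timestamp` is older
        # than expireAfterSeconds; it only runs about once a minute, which is
        # why test_cache_expired sleeps for 61 seconds
        self.db.webpage.create_index('timestamp',
                                     expireAfterSeconds=int(expires.total_seconds()))

    def __contains__(self, url):
        try:
            self[url]
        except KeyError:
            return False
        return True

    def __getitem__(self, url):
        record = self.db.webpage.find_one({'_id': url})
        if record:
            return record['result']
        raise KeyError(url + ' does not exist')

    def __setitem__(self, url, result):
        record = {'result': result, 'timestamp': datetime.utcnow()}
        self.db.webpage.update_one({'_id': url}, {'$set': record}, upsert=True)
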
def main():
    starttime = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache)
    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)

def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                    cache=cache, max_threads=max_threads, timeout=10)

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                 cache=cache, timeout=10, ignore_robots=True)

def __call__(self, url, html):
    urls = []
    cache = MongoCache()
    for _, website in csv.reader(open(self.seed_url)):
        if website not in cache:
            urls.append(website)
            if len(urls) == self.max_urls:
                break
    return urls

def main(max_threads):
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(seed_url='http://example.webscraping.com',
                     cache=cache, max_threads=max_threads, timeout=10)

def test():
    start_url = 'http://www.alexa.com/topsites/global;0'
    cache = MongoCache()
    scrape_callback = AlaxeCallback(allow_domains=[start_url])
    process_crawler(start_url, link_regex='/topsites/global;', cache=cache,
                    scrape_callback=scrape_callback, max_threads=8, timeout=5)

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    crawler(scrape_callback.seed_url,
            proxies=['127.0.0.1:8118'],
            scrape_callback=scrape_callback, cache=cache)

def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,
                     user_agent='wswp', proxies=None, num_retries=1,
                     max_threads=10, timeout=10):
    """Crawl using multiple threads."""
    if cache is None:
        # avoid a mutable default argument: build the cache per call
        cache = MongoCache()
    # the queue of URLs that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache, delay=delay, user_agent=user_agent,
                   proxies=proxies, num_retries=num_retries, timeout=timeout)

    def process_queue():
        while True:
            try:
                # claim the next URL so no other thread processes it
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for {}: {}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to the queue
                            crawl_queue.push(link)
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon so the main thread can exit on ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)  # pause before polling the threads again

def main(max_threads):
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    threaded_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                     cache=cache, max_threads=max_threads, timeout=10)

def __call__(self, url, html):
    if url == self.seed_url:
        urls = []
        cache = MongoCache()
        with ZipFile(StringIO(html)) as zf:
            csv_filename = zf.namelist()[0]
            for _, website in csv.reader(zf.open(csv_filename)):
                if 'http://' + website not in cache:
                    urls.append('http://' + website)
                    if len(urls) == self.max_urls:
                        break
        return urls

def __call__(self, url, html):
    if url == self.seed_url:
        urls = []
        cache = MongoCache()
        with ZipFile(BytesIO(html.content)) as zf:
            csv_filename = zf.namelist()[0]
            data = StringIO(zf.open(csv_filename).read().decode('utf-8'))
            for _, website in csv.reader(data):
                if 'http://' + website not in cache:
                    urls.append('http://' + website)
                    if len(urls) == self.max_urls:
                        break
        return urls

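# For context, a minimal sketch of the class the two __call__ methods above
# belong to, inferred from how self.seed_url and self.max_urls are used; the
# seed URL matches the well-known wswp book example and is an assumption here:
class AlexaCallback:
    def __init__(self, max_urls=1000):
        # limit how many of the top sites are queued per crawl
        self.max_urls = max_urls
        self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
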
def test(max_threads):
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlaxeCallback(allow_domains=[start_url])
    cache = MongoCache()
    # start_url = 'http://www.eastday.com'
    # start_url = 'http://www.qq.com'
    threaded_crawler(start_url, link_regex='/topsites/global;', cache=cache,
                     scrape_callback=scrape_callback, max_threads=max_threads,
                     timeout=5)

def com_alexa():
    """Download some popular site URLs from this page."""
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlaxeCallback(allow_domains=start_url)
    link_crawler(start_url, link_regex='/topsites/global;', delay=3,
                 only_same_host=False, save_cache=False, max_urls=100,
                 cache=MongoCache(), scrape_callback=scrape_callback, timeout=3)
    del scrape_callback

def __init__(self, output_dir, start_date, end_date,
             chosen_program=None, use_cache=False):
    self.output_dir = output_dir
    self.start_date = start_date
    self.end_date = end_date
    self.chosen_program = chosen_program
    self.base_url = 'https://www.byte.fm'
    self.header = ["program", "date", "title", "artist", "album", "label"]
    self.parser = HTMLParser()
    if use_cache:
        from mongo_cache import MongoCache
        cache = MongoCache()
    else:
        cache = None
    self.Downloader = Downloader(cache=cache)

def main(max_threads=5):
    catalog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()
    client = MongoClient('localhost', 27017, connect=False)
    # the cache database stores cached webpages; a collection is the
    # equivalent of a table in a relational database
    db = client.cache
    urls = []
    for record in db.books.find():
        temp = record['link']
        if urlparse.urlparse(catalog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catalog_callback.seed_url, temp)
        elif urlparse.urlparse(catalog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1:temp.rfind('.')] + '/'
        print temp
        urls.append(temp)
    print urls[0]
    while True:
        now = datetime.now()
        # only crawl outside the 03:00-12:00 window
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=catalog_callback, cache=cache,
                            max_threads=max_threads, timeout=30,
                            host=urlparse.urlparse(catalog_callback.seed_url).netloc,
                            user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
                                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                                       'Chrome/54.0.2840.98 Safari/537.36')
            # every time a pass finishes, clear the job queue
            queue.clear()
        else:
            print 'pass:' + str(now)
        time.sleep(3600)

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback,
                 cache=cache, user_agent='GoodCrawler', ignore_robots=True)

def link_crawler(seed_url, link_regex_large, link_regex_small, max_depth=2, max_threads=5):
    """Crawl from the given seed URL following links matched by link_regex."""
    print 'seed_url:', seed_url
    # Mongo_Queue takes care of duplicate URLs, so no separate `seen` dict is needed
    crawl_queue = Mongo_Queue()
    crawl_queue.push(seed_url)
    depth = (crawl_queue.get_item(seed_url))['depth']
    print 'seed url depth:', depth
    cache = MongoCache()
    D = Download(cache=cache)
    csvFile = open('D:/Work/Projects/realestate/app/static/163_money.csv', 'wb')
    writer = csv.writer(csvFile)

    def process_queue():
        """The page-download step as a function, so that every thread can call it."""
        while True:
            try:
                url = crawl_queue.pop()
            except KeyError:
                # no url left in crawl_queue
                break
            else:
                depth = (crawl_queue.get_item(url))['depth']
                if depth <= max_depth:
                    html = D(url)
                    links = re.findall(link_regex_large, html)
                    for link in links:
                        if re.match(link_regex_small, link):
                            writer.writerow((link, ''))
                            print link
                        else:
                            crawl_queue.push(link, depth + 1)
                crawl_queue.complete(url)

    threads = []
    while crawl_queue or threads:
        while len(threads) < max_threads and crawl_queue:
            # can start some more threads; daemon must be set before start()
            # or a RuntimeError is raised, and daemon threads let the main
            # thread exit when it receives ctrl-c
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        for thread in threads:
            if not thread.is_alive():
                # remove the stopped threads
                threads.remove(thread)
        time.sleep(1)  # avoid busy-waiting while worker threads run
    csvFile.close()

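# The csv writer above is shared by all worker threads without any
# synchronization, so rows from different threads can interleave. A minimal
# guard (an addition, not part of the original code) is a module-level lock:
import threading

write_lock = threading.Lock()


def safe_writerow(writer, row):
    # serialize access to the shared csv writer
    with write_lock:
        writer.writerow(row)
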
import re

import numpy as np
import pandas as pd
from lxml import etree

from downloader import Downloader
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from mongo_queue import MongoQueue
from process_crawler import process_crawler

crawl_queue = MongoQueue()
webpage_cache = MongoCache()

DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
               proxies=DEFAULT_PROXY_LIST, cookies=DEFAULT_COOKIE,
               num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT,
               opener=None, cache=MongoCache())


def usere(regex, getcontent):
    """Apply a regular expression and return all matches."""
    pattern = re.compile(regex)
    return re.findall(pattern, getcontent)


# Obtain target urls
startdate = '20180414'
enddate = '20180415'

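# Quick usage example for the usere() helper above (the pattern and input are
# illustrative, not from the original code):
dates = usere(r'<td>(\d+)</td>', '<td>20180414</td><td>20180415</td>')
print(dates)  # ['20180414', '20180415']
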
def test_cache_not_yet_expired(self):
    cache = MongoCache()
    cache[self.url] = self.result
    self.assertIsInstance(cache[self.url], dict)

def get_links(html):
    """Return a list of links from html."""
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    from mongo_cache import MongoCache

    class CallBack:
        def __init__(self, filename='log.txt'):
            self.file = open(filename, 'w+')

        def __call__(self, url, html):
            self.file.write("{}\n".format(url))

    cache = MongoCache()
    link_crawler('http://example.webscraping.com/places/default',
                 '/places/default/(index|view)', delay=1, num_retries=1,
                 max_depth=3, user_agent='GoodCrawler', cache=cache,
                 scrape_callback=CallBack())

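# A quick sanity check of get_links() on a tiny hand-written snippet
# (illustrative HTML, not from the original code); note the regex is
# case-insensitive and accepts either quote style:
sample = ('<a href="/places/default/index">Index</a> '
          "<A class='nav' HREF='/places/default/view/1'>View</A>")
print(get_links(sample))  # ['/places/default/index', '/places/default/view/1']
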
            # (fragment: the tail of link_crawler's main loop, which filters
            #  and queues each link extracted from the downloaded page)
            link = normalize(seed_url, link)
            if link not in seen:
                seen[link] = depth + 1
                if same_domain(link, seed_url):
                    crawl_queue.append(link)
                    num += 1
                    print('num=', num)
        num_urls += 1
        if num_urls == max_urls:
            break


def get_links(html):
    webpage_regex = re.compile('<a href="position.php\?(.*?)"', re.IGNORECASE)
    return webpage_regex.findall(html)


def normalize(seed_url, link):
    # strip the fragment and resolve relative links against the seed URL
    link, _ = urllib.parse.urldefrag(link)
    return urllib.parse.urljoin(seed_url, link)


def same_domain(url_1, url_2):
    return urllib.parse.urlparse(url_1).netloc == urllib.parse.urlparse(url_2).netloc


link_crawler('https://hr.tencent.com/position.php?keywords=python',
             'keywords=python&start=', cache=MongoCache())

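# A quick check of normalize(): it strips fragments and resolves relative
# links against the seed URL (illustrative values):
print(normalize('https://hr.tencent.com/position.php?keywords=python',
                'position.php?keywords=python&start=10#a'))
# -> https://hr.tencent.com/position.php?keywords=python&start=10
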
# -*- coding: utf-8 -*-
from datetime import timedelta

from mongo_cache import MongoCache

cache = MongoCache()
cache.clear()
url = 'http://example.webscraping.comasdf'
result = {'html': '...'}
cache[url] = result
print(cache[url]['html'] == result['html'])

cache = MongoCache(expires=timedelta())
cache[url] = result
import time
time.sleep(60)
# raises KeyError once MongoDB's TTL monitor (which runs about once a minute)
# has purged the expired record
print(cache[url])

import datetime
import os

import cPickle
import pandas as pd

from downloader import Downloader
from fangjia2 import download, get_info_list, get_search
from fangjia_cb import FangjiaCallback
from fangjia_thread_crawler import thread_crawler
from mongo_cache import MongoCache

if __name__ == '__main__':
    # get the seed_urls
    starttime = datetime.datetime.now()
    seed_urls = []
    cache = MongoCache()  # cache all pages
    if os.path.exists('seed_urls.pkl'):
        with open('seed_urls.pkl', 'rb') as fp:
            seed_urls = cPickle.load(fp)
    else:
        base_url = r'http://cd.fangjia.com/ershoufang/'
        search_list = []  # list of listing-page URLs
        tmp_list = []     # temporary buffer of listing-page URLs
        layer = -1
        # first-level filter
        #D = Downloader(cache=cache)
        page = download(base_url)
        search_dict = get_search(page, 'r-')
        # second-level filter
        for k in search_dict:
            print u'*** level-1 crawl: fetching [%s] ***' % k

"""Initialize robots parser for this domain """ rp = robotparser.RobotFileParser() rp.set_url(urlparse.urljoin(url, '/robots.txt')) rp.read() return rp def get_links(html): """Return a list of links from html """ # a regular expression to extract all links from the webpage webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE) # list of all links from the webpage return webpage_regex.findall(html) if __name__ == '__main__': link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler') link_crawler('http://example.webscraping.com', '/places/default/view', delay=0, num_retries=1, max_depth=10, user_agent='GoodCrawler', cache=MongoCache(expires=datetime.timedelta()))