Ejemplos de MongoCache.MongoCache en Python, ejemplos de mongo_cache.MongoCache.MongoCache en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: process_crawler.py Proyecto: evanleungc/spider_test

def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None,\
 user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60):
    """Crawl using multiple threads
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = MongoQueue()
    webpage_cache = MongoCache()
    # crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST, \
            cookies = DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, \
            opener=None, cache=MongoCache())

    def process_queue():
        while True:
            # keep track that are processing url
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print('Error in callback for: {}: {}'.format(url, e))
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(normalize(seed_url, link))
            if (500 <= webpage_cache[url]['code'] <
                    600) | (webpage_cache[url]['code'] == -999):
                crawl_queue.reset(url)
            else:
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            # can start some more threads
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(
                True
            )  # set daemon so main thread can exit when receives ctrl-c
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)

Ejemplo n.º 2

0

Mostrar archivo

    def __init__(self, mongo_host, mongo_port):

        last_lock = SimpleMongoServiceLock(mongo_host, mongo_port,
                                           'music_tour', 'last_lock', 1, 30)
        self.last_fm = LastFmService(
            MongoCache(mongo_host, mongo_port, 'music_tour', 'last_cache',
                       timedelta(weeks=24)), last_lock)
        spotify_lock = SimpleMongoServiceLock(mongo_host, mongo_port,
                                              'music_tour', 'spotify_lock', 1,
                                              30)
        self.spotify = SpotifyMetaService(
            MongoCache(mongo_host, mongo_port, 'music_tour', 'spotify_cache',
                       timedelta(weeks=24)), spotify_lock)

Ejemplo n.º 3

0

Mostrar archivo

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache(expires=timedelta())
    #cache.clear()
    link_crawler(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache)

Ejemplo n.º 4

0

Mostrar archivo

def main(max_threads=5):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()

    urls = []
    temple = scrape_callback.seed_url[0:-2]
    for i in range(1, 1189, 1):
        urls.append(temple + str(i) + '/')

    while True:
        now = datetime.now()
        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(
                urls,
                scrape_callback=scrape_callback,
                cache=cache,
                max_threads=max_threads,
                timeout=30,
                host=urlparse.urlparse(scrape_callback.seed_url).netloc,
                user_agent=
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36'
            )
        else:
            print 'pass:' + str(now)
            pass
        time.sleep(3600)

Ejemplo n.º 5

0

Mostrar archivo

 def __init__(self, cache=MongoCache()):
     self.max_page = 0
     self.base_url = configs.MAIN_PAGE_URL
     self.page_url = configs.EACH_PAGE_URL
     self.headers = self._load_headers()
     self.video_headers = self._load_headers('headers/video_headers')
     self.cache = cache

Ejemplo n.º 6

0

Mostrar archivo

Archivo: test_scrapers.py Proyecto: jackdbd/scrapers

 def test_cache_expired(self):
     cache = MongoCache(expires=timedelta())
     # every 60 seconds the cache is purged
     # http://docs.mongodb.org/manual/core/index-ttl/
     cache[self.url] = self.result
     sleep(61)
     with self.assertRaises(KeyError):
         cache[self.url]

Ejemplo n.º 7

0

Mostrar archivo

def main():
    starttime = datetime.datetime.now()
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    #cache.clear()
    threaded_crawler('http://example.webscraping.com',scrape_callback.seed_url, scrape_callback=scrape_callback)
    endtime = datetime.datetime.now()
    print((endtime - starttime).seconds)

Ejemplo n.º 8

0

Mostrar archivo

Archivo: process_test.py Proyecto: fu477521/amazom_crawler

def main(max_threads):
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    process_crawler(scrape_callback.seed_url,
                    scrape_callback=scrape_callback,
                    cache=cache,
                    max_threads=max_threads,
                    timeout=10)

Ejemplo n.º 9

0

Mostrar archivo

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    link_crawler(scrape_callback.seed_url,
                 scrape_callback=scrape_callback,
                 cache=cache,
                 timeout=10,
                 ignore_robots=True)

Ejemplo n.º 10

0

Mostrar archivo

Archivo: alexa_cb.py Proyecto: CodeMinge/syntax_algorithm_of_python

 def __call__(self, url, html):
     urls = []
     cache = MongoCache()
     for _, website in csv.reader(open(self.seed_url)):
         if website not in cache:
             urls.append(website)
             if len(urls) == self.max_urls:
                 break
     return urls

Ejemplo n.º 11

0

Mostrar archivo

def main(max_threads):

    cache = MongoCache()
    # cache.clear()
    threaded_crawler(
        seed_url='http://example.webscraping.com',
        scrape_callback=link_crawler('http://example.webscraping.com'),
        cache=cache,
        max_threads=max_threads,
        timeout=0)

Ejemplo n.º 12

0

Mostrar archivo

def test():
    start_url = 'http://www.alexa.com/topsites/global;0'
    cache = MongoCache()
    scrape_callback = AlaxeCallback(allow_domains=[start_url])
    process_crawler(start_url,
                    link_regex='/topsites/global;',
                    cache=cache,
                    scrape_callback=scrape_callback,
                    max_threads=8,
                    timeout=5)

Ejemplo n.º 13

0

Mostrar archivo

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()
    crawler(scrape_callback.seed_url,
            proxies=[
                '127.0.0.1:8118',
            ],
            scrape_callback=scrape_callback,
            cache=cache)

Ejemplo n.º 14

0

Mostrar archivo

def threaded_crawler(seed_url,
                     delay=5,
                     cache=MongoCache(),
                     scrape_callback=None,
                     user_agent='wswp',
                     proxies=None,
                     num_retries=1,
                     max_threads=10,
                     timeout=10):
    """Crawl using multiple threads"""
    # the queue of url's that still need to be crawled
    crawl_queue = MongoQueue()
    crawl_queue.clear()
    crawl_queue.push(seed_url)
    D = Downloader(cache=cache,
                   delay=delay,
                   user_agent=user_agent,
                   proxies=proxies,
                   num_retries=num_retries,
                   timeout=timeout)

    def process_queue():
        while True:
            # keep track that are processing url
            try:
                url = crawl_queue.pop()
            except KeyError:
                # currently no urls to process
                break
            else:
                html = D(url)
                if scrape_callback:
                    try:
                        links = scrape_callback(url, html) or []
                    except Exception as e:
                        print 'Error in callback for; {}:{}'.format(url, e)
                    else:
                        for link in links:
                            # add this new link to queue
                            crawl_queue.push(link)
                crawl_queue.complete(url)

    # wait for all download threads to finish
    threads = []
    while threads or crawl_queue:
        for thread in threads:
            if not thread.is_alive():
                threads.remove(thread)
        while len(threads) < max_threads and crawl_queue.peek():
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)
            thread.start()
            threads.append(thread)
        time.sleep(SLEEP_TIME)  # 线程睡眠1s

Ejemplo n.º 15

0

Mostrar archivo

Archivo: threaded_crawler.py Proyecto: houjq18/CsLearning

def main(max_threads):
    from mongo_cache import MongoCache
    from alexa_cb import AlexaCallback
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    # cache.clear()             #############
    threaded_crawler(scrape_callback.seed_url,
                     scrape_callback=scrape_callback,
                     cache=cache,
                     max_threads=max_threads,
                     timeout=10)

Ejemplo n.º 16

0

Mostrar archivo

 def __call__(self, url, html):
     if url == self.seed_url:
         urls = []
         cache = MongoCache()
         with ZipFile(StringIO(html)) as zf:
             csv_filename = zf.namelist()[0]
             for _, website in csv.reader(zf.open(csv_filename)):
                 if 'http://' + website not in cache:
                     urls.append('http://' + website)
                     if len(urls) == self.max_urls:
                         break
         return urls

Ejemplo n.º 17

0

Mostrar archivo

Archivo: alexaCallback.py Proyecto: fjh1997/Learn-Python

 def __call__(self, url, html):
     if url == self.seed_url:
         urls = []
         cache = MongoCache()
         with ZipFile(BytesIO(html.content)) as zf:
             csv_filename = zf.namelist()[0]
             data = StringIO(zf.open(csv_filename).read().decode('utf-8'))
             for _, website in csv.reader(data):
                 if 'http://' + website not in cache:
                     urls.append('http://' + website)
                     if len(urls) == self.max_urls:
                         break
         return urls

Ejemplo n.º 18

0

Mostrar archivo

Archivo: threaded_test.py Proyecto: basicworld/pycrawler

def test(max_threads):
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlaxeCallback(allow_domains=[start_url])
    cache = MongoCache()
    # start_url = 'http://www.eastday.com'
    # start_url = 'http://www.qq.com'

    threaded_crawler(start_url,
                     link_regex='/topsites/global;',
                     cache=cache,
                     scrape_callback=scrape_callback,
                     max_threads=max_threads,
                     timeout=5)

Ejemplo n.º 19

0

Mostrar archivo

def com_alexa():
    """
    从该网址下载一些热门网址
    """
    start_url = 'http://www.alexa.com/topsites/global;0'
    scrape_callback = AlaxeCallback(allow_domains=start_url)
    link_crawler(start_url,
                 link_regex='/topsites/global;',
                 delay=3,
                 only_same_host=False,
                 save_cache=False,
                 max_urls=100,
                 cache=MongoCache(),
                 scrape_callback=scrape_callback,
                 timeout=3)
    del scrape_callback

Ejemplo n.º 20

0

Mostrar archivo

    def __init__(
            self, output_dir, start_date, end_date, chosen_program=None,
            use_cache=False):
        self.output_dir = output_dir
        self.start_date = start_date
        self.end_date = end_date
        self.chosen_program = chosen_program

        self.base_url = 'https://www.byte.fm'
        self.header = ["program", "date", "title", "artist", "album", "label"]
        self.parser = HTMLParser()
        if use_cache:
            from mongo_cache import MongoCache
            cache = MongoCache()
        else:
            cache = None
        self.Downloader = Downloader(cache=cache)

Ejemplo n.º 21

0

Mostrar archivo

Archivo: process_test.py Proyecto: freedomofme/CrawlerNovels

def main(max_threads = 5):
    catlog_callback = AlexaCallback()
    cache = MongoCache()
    queue = MongoQueue()


    client = MongoClient('localhost', 27017, connect=False)
        #create collection to store cached webpages,
        # which is the equivalent of a table in a relational database
    db = client.cache
    cursor = db.books.find()

    urls = []
    while cursor.alive:
        temp = cursor.next()
        temp = temp['link']

        if urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.junzige.la':
            temp = '/novel' + temp[5:-4] + '/'
            temp = normalize(catlog_callback.seed_url, temp)
        elif urlparse.urlparse(catlog_callback.seed_url).netloc == 'www.boluoxs.com':
            temp = 'http://www.boluoxs.com/biquge/0/' + temp[temp.rfind('/') + 1 :temp.rfind('.')] + '/'

        print temp
        urls.append(temp)

    print urls[0]

    while True:
        now = datetime.now()

        if now.hour < 3 or now.hour > 12:
            queue.repairFast()
            process_crawler(urls, scrape_callback=catlog_callback, cache=cache, max_threads=max_threads, timeout=30, host = urlparse.urlparse(catlog_callback.seed_url).netloc, user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36')
            # every time finished, clear the job queue
            queue.clear()
        else:
            print 'pass:' + str(now)
            pass
        time.sleep(3600)

Ejemplo n.º 22

0

Mostrar archivo

Archivo: sequential_test.py Proyecto: Alchemy2011/spider

def main():
    scrape_callback = AlexaCallback()
    cache = MongoCache()
    cache.clear()
    link_crawler(scrape_callback.seed_url, scrape_callback=scrape_callback, cache=cache, user_agent='GoodCrawler', ignore_robots=True)

Ejemplo n.º 23

0

Mostrar archivo

def link_crawler(seed_url,link_regex_large,link_regex_small,max_depth=2,max_threads=5):
	'Crawl from the given seed URL following links matchedly by link_regex'
	print 'seed_ur',seed_url
	#crawl_queue=[seed_url]
	crawl_queue=Mongo_Queue()
	#seen={seed_url:0}#no need this seen for Mongo_Queue will take care of duplicate url
	#crawl_queue=Mongo_Queue.push(seed_url)
	crawl_queue.push(seed_url)
	depth=(crawl_queue.get_item(seed_url))['depth']
	print 'seedurldepth:',depth
	cache=MongoCache()
	D=Download(cache=cache)
	#result_links=set()
	csvFile=open('D:/Work/Projects/realestate/app/static/163_money.csv','wb')
	writer=csv.writer(csvFile)

	def process_queue():
		'extract the page_download part as a function, so that every tread can call it to download page'
		while True:
			try:
				url=crawl_queue.pop()
			except KeyError:
				#no url in crawl_queue
				break
			else:
				depth=(crawl_queue.get_item(url))['depth']
				'depth=128,129'
				#print depth
				if depth<=max_depth:
					html=D(url)
					links=re.findall(link_regex_large,html)

					for link in links:
						if re.match(link_regex_small,link):
							writer.writerow((link,''))
							#writer.writerow((link,''))
							print link
						else:
							crawl_queue.push(link,depth+1)
							#encoding=chardet.detect(link)
							#link=link.decode(encoding).encode('utf-8')
							#crawl_queue.push(link,depth+1)
							#seen[link]=depth+1
				crawl_queue.complete(url)
								
	threads=[]
	while crawl_queue or threads:
		while len(threads)<max_threads and crawl_queue:
			#can start some more threads
			thread=threading.Thread(target=process_queue)
			#daemon's value must be set before start(), or RuntimeError will rarise. set deamon=Ture ,so that main thread can exit when receieve ctrl-c
			thread.setDaemon(True)
			thread.start()
			threads.append(thread)

		for thread in threads:
			if not thread.is_alive():
				#remove the stopped threads
				threads.remove(thread)

	csvFile.close()

Ejemplo n.º 24

0

Mostrar archivo

Archivo: innproject.py Proyecto: evanleungc/spider_test

import pandas as pd
import re
import numpy as np
from process_crawler import process_crawler
from mongo_queue import MongoQueue
from mongo_cache import MongoCache
from mongo_info import MongoInfo
from downloader import Downloader
from lxml import etree

crawl_queue = MongoQueue()
webpage_cache = MongoCache()
DEFAULT_AGENT = {}
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 100
DEFAULT_PROXY_LIST = '/Users/apple/Desktop/connect/proxylist/proxies.csv'
DEFAULT_COOKIE = {}

D = Downloader(delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT, proxies=DEFAULT_PROXY_LIST, \
        cookies = DEFAULT_COOKIE, num_retries=DEFAULT_RETRIES, timeout=DEFAULT_TIMEOUT, \
        opener=None, cache=MongoCache())

def usere(regex, getcontent): #定义使用正则表达式的函数
    pattern = re.compile(regex)
    content = re.findall(pattern, getcontent)
    return content

#Obtain target urls
startdate = '20180414'
enddate = '20180415'

Ejemplo n.º 25

0

Mostrar archivo

Archivo: test_scrapers.py Proyecto: jackdbd/scrapers

 def test_cache_not_yet_expired(self):
     cache = MongoCache()
     cache[self.url] = self.result
     self.assertIsInstance(cache[self.url], dict)

Ejemplo n.º 26

0

Mostrar archivo


def get_links(html):
    """Return a list of links from html 
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    from mongo_cache import MongoCache

    class CallBack:
        def __init__(self, filename='log.txt'):
            self.file = open(filename, 'w+')

        def __call__(self, url, html):
            self.file.write("{}\n".format(url))

    cache = MongoCache()
    link_crawler('http://example.webscraping.com/places/default',
                 '/places/default/(index|view)',
                 delay=1,
                 num_retries=1,
                 max_depth=3,
                 user_agent='GoodCrawler',
                 cache=cache,
                 scrape_callback=CallBack())

Ejemplo n.º 27

0

Mostrar archivo

Archivo: link_crawler.py Proyecto: nightqiuhua/tenxun_crawler

				link = normalize(seed_url,link)
				if link not in seen:
					seen[link] = depth+1
					if same_domain(link,seed_url):
						crawl_queue.append(link)
						#print('crawl_queue=',crawl_queue)
		num +=1
		print('num=',num)
		num_urls +=1
		if num_urls == max_urls:
				break


def get_links(html):
	webpage_regex=re.compile('<a href="position.php\?(.*?)"',re.IGNORECASE)
	#print('webpage_regex.findall(html)=',webpage_regex.findall(html))
	return webpage_regex.findall(html)

def normalize(seed_url,link):
	link,_=urllib.parse.urldefrag(link)
	return urllib.parse.urljoin(seed_url,link)

def same_domain(url_1,url_2):
	return urllib.parse.urlparse(url_1).netloc == urllib.parse.urlparse(url_2).netloc

link_crawler('https://hr.tencent.com/position.php?keywords=python', 'keywords=python&start=', cache=MongoCache())

Ejemplo n.º 28

0

Mostrar archivo

# -*- coding: utf-8 -*-
from datetime import timedelta

from pymongo import MongoClient

from mongo_cache import MongoCache

cache = MongoCache()
cache.clear()
url = 'http://example.webscraping.comasdf'
result = {'html': '...'}
cache[url] = result
print(cache[url]['html'] == result['html'])
cache = MongoCache(expires=timedelta())
cache[url] = result
import time
time.sleep(60)
print(cache[url])

Ejemplo n.º 29

0

Mostrar archivo

Archivo: fangjia_main.py Proyecto: qthinker/web-spider

from fangjia_thread_crawler import thread_crawler
from fangjia_cb import FangjiaCallback
from mongo_cache import MongoCache
from downloader import Downloader
from fangjia2 import get_search
from fangjia2 import get_info_list
from fangjia2 import download
import pandas as pd
import cPickle
import os

if __name__ == '__main__':
    # get the seed_urls
    starttime = datetime.datetime.now()
    seed_urls = []
    cache = MongoCache()  # cache all pages
    if os.path.exists('seed_urls.pkl'):
        with open('seed_urls.pkl', 'rb') as fp:
            seed_urls = cPickle.load(fp)
    else:
        base_url = r'http://cd.fangjia.com/ershoufang/'
        search_list = []  # 房源信息url列表
        tmp_list = []  # 房源信息url缓存列表
        layer = -1
        # 一级筛选
        #D = Downloader(cache=cache)
        page = download(base_url)
        search_dict = get_search(page, 'r-')
        # 二级筛选
        for k in search_dict:
            print u'****************一级抓取：正在抓取【%s】***************' % k

Ejemplo n.º 30

0

Mostrar archivo

Archivo: link_crawler.py Proyecto: czerwonykalafior/scrape_with_penman

    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp


def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com',
                 '/(index|view)',
                 delay=0,
                 num_retries=1,
                 user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com',
                 '/places/default/view',
                 delay=0,
                 num_retries=1,
                 max_depth=10,
                 user_agent='GoodCrawler',
                 cache=MongoCache(expires=datetime.timedelta()))