def test_兩句例句(self):
    """掠詞條 parses an entry whose response has no source field: two
    senses with example sentences plus one description-only sense."""
    response = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.無source有兩句.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    parsed = Spider().掠詞條(response)
    expected_examples = [
        {
            "pronounce": None,
            "sentence": "O 'opo no panay kora ma'araway no mita i potal na Mayaw.",
            "zh_Hant": "在Mayaw家的庭院我們看到的是堆積如山的稻穀。",
            "description": "解釋1:堆集晒過的穀子",
        },
        {
            "pronounce": None,
            "sentence": "'Opo han no mita ko iraay a sito.",
            "zh_Hant": "我們把學生集合起來。",
            "description": "解釋2:集中;集合",
        },
        {
            # The third sense carries no example sentence at all.
            "pronounce": None,
            "sentence": None,
            "zh_Hant": None,
            "description": "解釋3:會議",
        },
    ]
    self.assertEqual(parsed['examples'], expected_examples)
def test_有例句詞條(self):
    """掠詞條 returns the full entry dict (name, source, pronounce,
    frequency, examples) for a response that contains one example."""
    response = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.例句回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    parsed = Spider().掠詞條(response)
    expected = {
        "source": "ahowid",
        "pronounce": "http://e-dictionary.apc.gov.tw/MultiMedia/Audio/ami/aahowiden_{1}.mp3",
        "frequency": "詞頻:★(1)",
        "examples": [
            {
                "pronounce": "http://e-dictionary.apc.gov.tw/MultiMedia/Audio/ami/aahowiden_{1}_@_1.1.mp3",
                "sentence": "O aahowiden no mita ko pasifana'ay a singsi.",
                "zh_Hant": "教導我們的老師是值得我們感謝的。",
                "description": "解釋1:值得去感謝者",
            },
        ],
        "name": "aahowiden",
    }
    # Check the examples list first for a focused failure message,
    # then the whole dict.
    self.assertEqual(parsed['examples'], expected['examples'])
    self.assertEqual(parsed, expected)
def test_系統維護中(self):
    """掠詞條 yields None when the site serves its maintenance page."""
    maintenance_response = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.系統維護中回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    self.assertIsNone(Spider().掠詞條(maintenance_response))
def test_無發音回應(self):
    """掠詞條 leaves 'pronounce' as None when the entry has no audio.

    Fix: use assertIsNone instead of assertEqual(x, None) — it is the
    idiomatic identity check and produces a clearer failure message.
    """
    response = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.無發音回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    parsed = Spider().掠詞條(response)
    self.assertIsNone(parsed['pronounce'])
class SpiderTest(TestCase):
    """Exercises Spider.run() with mocked persistence (UrlDAOMock, MockRedis).

    NOTE(review): these tests fetch live URLs (technovium.nl,
    example.com) — confirm network access is acceptable where they run.

    Fix: setUp now snapshots the Properties limits and tearDown restores
    the snapshot, instead of re-asserting hard-coded defaults (100 / 5)
    that silently drift if the project defaults ever change.
    """

    def setUp(self):
        self.sut = Spider(Url('https://www.technovium.nl', 1),
                          'testSpider', UrlDAOMock(), MockRedis())
        # Snapshot the global limits so tearDown restores exactly what
        # was in effect before the test, whatever that was.
        self._saved_max_pages = Properties.SPIDER_MAX_PAGES
        self._saved_max_depth = Properties.SPIDER_MAX_DEPTH
        Properties.SPIDER_MAX_PAGES = 1

    def tearDown(self):
        Properties.SPIDER_MAX_PAGES = self._saved_max_pages
        Properties.SPIDER_MAX_DEPTH = self._saved_max_depth

    def test_add_links_to_queue(self):
        # With depth 0 the crawl must not queue any further links.
        Properties.SPIDER_MAX_DEPTH = 0
        self.sut.run()
        self.assertGreater(1, len(self.sut.deque))

    def test_http_error(self):
        # example.com at depth 1: the page itself must still be
        # recorded as crawled even though it yields nothing useful.
        Properties.SPIDER_MAX_DEPTH = 1
        self.sut = Spider(Url('http://example.com/', 1),
                          'testSpider', UrlDAOMock(), MockRedis())
        self.sut.run()
        self.assertGreater(len(self.sut.crawled), 0)
def crawl_wikipedia(self):
    """Scrape this show's Wikipedia page for episode data.

    Returns the (possibly empty) list of episode dicts, after dropping
    empty episodes via remove_empty_episodes. Also logs a one-line
    summary (last season / episode id) or an error line.

    Fixes: removed dead commented-out verbose_print call; replaced
    ``len(x) != 0`` with idiomatic truthiness.
    """
    self.spider = Spider.Spider(self.name, self.wikipedia_url)
    episode_data = self.spider.run()
    episode_data = self.remove_empty_episodes(episode_data)
    if episode_data:
        # Summarise using the last entry, which carries the highest
        # season / episode_id seen.
        print("%35s Seasons: %-3d | Episodes: %-3d"
              % (("[%s]" % self.name),
                 episode_data[-1]['season'],
                 episode_data[-1]['episode_id']))
    else:
        print("%35s ERROR, Broken Episode Data..." % ("[%s]" % self.name))
    return episode_data
def test_正常詞條(self):
    """掠詞條 extracts name, source, pronounce URL and frequency from a
    normal entry, and always includes an 'examples' key."""
    response = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.正常回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    parsed = Spider().掠詞條(response)
    self.assertIn('examples', parsed)
    self.assertEqual(parsed['name'], "a:su'")
    self.assertEqual(parsed['source'], "asu'")
    self.assertEqual(
        parsed['pronounce'],
        "http://e-dictionary.apc.gov.tw/MultiMedia/Audio/ais/a:su'_{1}.mp3",
    )
    self.assertEqual(parsed['frequency'], '詞頻:★(1)')
def crawl_imdb(self):
    """Scrape this show's IMDB page; the spider handles results itself."""
    util.verbose_print("\t\t - [%s] Crawling IMDB" % self.name)
    imdb_spider = Spider.Spider(self.name, self.imdb_url)
    self.spider = imdb_spider
    imdb_spider.run()
def work():
    """Worker loop: pull URLs off the shared queue and crawl them.

    Intended to run on a daemon thread (see createWorkers); never
    returns. Blocks on threadQueue.get() until a URL is available.

    Fix: removed dead commented-out debug print; documented the loop.
    """
    while True:
        url = threadQueue.get()
        Spider.crawlPage(threading.current_thread().name, url)
        # task_done() lets a producer blocked in threadQueue.join()
        # proceed once every queued URL has been processed.
        threadQueue.task_done()
# Threaded crawler entry script (Python 2: `Queue` module, reload/
# setdefaultencoding). Spawns daemon worker threads that pull URLs
# from a shared queue and crawl them.
from general import *
import threading
from Queue import Queue
from crawler import Spider
import sys
from config import *

# Python 2 hack to force utf-8 as the default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

# Queue of URLs shared between createJobs (producer) and work (consumers).
threadQueue = Queue()

# Side-effect-only construction: presumably initialises project files /
# state for the crawl — TODO confirm against crawler.Spider.__init__.
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


def createWorkers():
    # Start NUMBER_OF_THREADS daemon workers; daemon=True lets the
    # process exit without joining them.
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


def work():
    # Worker loop: blocks on the queue, crawls each URL, then marks it
    # done so queue.join() in the producer can unblock. Never returns.
    while True:
        url = threadQueue.get()
        # print url + "hello"
        Spider.crawlPage(threading.current_thread().name, url)
        threadQueue.task_done()


def createJobs():
    # NOTE(review): this definition is truncated in the excerpt — the
    # loop body continues beyond what is visible here.
    for link in fileToSet(QUEUE_FILE):
# Flask front-end for a MongoDB-backed crawler: module-level setup
# creates the app, the Mongo connection and a shared Spider instance.
from pymongo import *
from flask import Flask, json, request
from crawler import Spider
from bson import json_util
import sys
from datetime import datetime
from threading import Timer

app = Flask(__name__)

# Local MongoDB; the test collection is active, the production one is
# commented out.
client = MongoClient('localhost', 27017)
db = client.crawler
#collection = db.storage
collection = db.storage_test

# One shared Spider writing into the chosen collection.
# (sic: the project API really is spelled "set_datebase".)
spider = Spider()
spider.set_datebase(collection)

''' Intital '''


@app.before_first_request
def start_crawler():
    # Kick off a crawl of the category entry points the first time any
    # request hits the app.
    spider.get_from_category_entry()
    #daily_crawler()


def daily_crawler():
    # NOTE(review): appears truncated in this excerpt — presumably goes
    # on to schedule the next run (Timer is imported above); confirm.
    spider.get_from_category_entry()
    x = datetime.today()
def test_http_error(self):
    """A crawl of example.com at depth 1 still records crawled URLs."""
    Properties.SPIDER_MAX_DEPTH = 1
    spider = Spider(Url('http://example.com/', 1), 'testSpider',
                    UrlDAOMock(), MockRedis())
    self.sut = spider
    spider.run()
    self.assertGreater(len(spider.crawled), 0)
def setUp(self):
    """Build a fresh Spider with mocked storage and cap the crawl at one page."""
    start_url = Url('https://www.technovium.nl', 1)
    self.sut = Spider(start_url, 'testSpider', UrlDAOMock(), MockRedis())
    Properties.SPIDER_MAX_PAGES = 1