Example no. 1
def test_兩句例句(self):
    rr = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.無source有兩句.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    結果 = Spider().掠詞條(rr)
    答案 = [{
        "pronounce": None,
        "sentence": "O 'opo no panay kora ma'araway no mita i potal na Mayaw.",
        "zh_Hant": "在Mayaw家的庭院我們看到的是堆積如山的稻穀。",
        "description": "解釋1:堆集晒過的穀子"
    }, {
        "pronounce": None,
        "sentence": "'Opo han no mita ko iraay a sito.",
        "zh_Hant": "我們把學生集合起來。",
        "description": "解釋2:集中;集合"
    }, {
        "pronounce": None,
        "sentence": None,
        "zh_Hant": None,
        "description": "解釋3:會議"
    }]
    self.assertEqual(結果['examples'], 答案)
Example no. 2
def test_有例句詞條(self):
    rr = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.例句回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    結果 = Spider().掠詞條(rr)
    答案 = {
        "source": "ahowid",
        "pronounce": "http://e-dictionary.apc.gov.tw/MultiMedia/Audio/ami/aahowiden_{1}.mp3",
        "frequency": "詞頻:★(1)",
        "examples": [{
            "pronounce": "http://e-dictionary.apc.gov.tw/MultiMedia/Audio/ami/aahowiden_{1}_@_1.1.mp3",
            "sentence": "O aahowiden no mita ko pasifana'ay a singsi.",
            "zh_Hant": "教導我們的老師是值得我們感謝的。",
            "description": "解釋1:值得去感謝者"
        }],
        "name": "aahowiden"
    }
    # Compare the examples list first for a narrower diff, then the full record.
    self.assertEqual(結果['examples'], 答案['examples'])
    self.assertEqual(結果, 答案)
Example no. 3
def test_系統維護中(self):
    rr = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.系統維護中回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    self.assertIsNone(Spider().掠詞條(rr))
Example no. 4
def test_無發音回應(self):
    rr = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.無發音回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    結果 = Spider().掠詞條(rr)
    self.assertIsNone(結果['pronounce'])
Example no. 5
class SpiderTest(TestCase):
    def setUp(self):
        self.sut = Spider(Url('https://www.technovium.nl', 1), 'testSpider', UrlDAOMock(), MockRedis())
        Properties.SPIDER_MAX_PAGES = 1

    def tearDown(self):
        Properties.SPIDER_MAX_PAGES = 100
        Properties.SPIDER_MAX_DEPTH = 5

    def test_add_links_to_queue(self):
        # With SPIDER_MAX_DEPTH = 0, run() should leave the work deque empty.
        Properties.SPIDER_MAX_DEPTH = 0
        self.sut.run()
        self.assertGreater(1, len(self.sut.deque))

    def test_http_error(self):
        Properties.SPIDER_MAX_DEPTH = 1
        self.sut = Spider(Url('http://example.com/', 1), 'testSpider', UrlDAOMock(), MockRedis())
        self.sut.run()
        self.assertGreater(len(self.sut.crawled), 0)
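SpiderTest constructs its Spider with UrlDAOMock and MockRedis, neither of which appears in the listing. A minimal sketch of what such in-memory test doubles could look like; the method names here are assumptions about the collaborator interfaces, not the project's actual classes:

class UrlDAOMock:
    """In-memory stand-in for the URL persistence layer (hypothetical interface)."""

    def __init__(self):
        self.saved = []

    def save(self, url):
        # Record instead of persisting, so tests can inspect what was stored.
        self.saved.append(url)


class MockRedis:
    """In-memory stand-in for the Redis de-duplication store (hypothetical interface)."""

    def __init__(self):
        self.store = {}

    def get(self, key):
        return self.store.get(key)

    def set(self, key, value):
        self.store[key] = value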
Example no. 6
def crawl_wikipedia(self):
    # util.verbose_print("\t\t - [%s] Crawling Wikipedia" % self.name)
    self.spider = Spider.Spider(self.name, self.wikipedia_url)
    episode_data = self.spider.run()
    episode_data = self.remove_empty_episodes(episode_data)
    if len(episode_data) != 0:
        print("%35s Seasons: %-3d | Episodes: %-3d" %
              (("[%s]" % self.name), episode_data[-1]['season'],
               episode_data[-1]['episode_id']))
    else:
        print("%35s ERROR, Broken Episode Data..." % ("[%s]" % self.name))
    return episode_data
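remove_empty_episodes is called above but not shown. A plausible sketch, assuming an "empty" episode is a scraped row whose title field is blank; the 'title' key is an assumption about the row format:

def remove_empty_episodes(self, episode_data):
    # Drop rows whose title is missing or blank; keep the rest in order.
    return [episode for episode in episode_data if episode.get('title')]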
Example no. 7
def test_正常詞條(self):
    rr = scrapy.http.TextResponse(
        'http://e-dictionary.apc.gov.tw/ais/Term.htm',
        body=self.正常回應.encode(),
        encoding='utf-8',
        request=self.要求,
    )
    結果 = Spider().掠詞條(rr)
    self.assertIn('examples', 結果)
    self.assertEqual(結果['name'], "a:su'")
    self.assertEqual(結果['source'], "asu'")
    self.assertEqual(
        結果['pronounce'],
        "http://e-dictionary.apc.gov.tw/MultiMedia/Audio/ais/a:su'_{1}.mp3"
    )
    self.assertEqual(結果['frequency'], '詞頻:★(1)')
Example no. 8
def crawl_imdb(self):
    util.verbose_print("\t\t - [%s] Crawling IMDB" % self.name)
    self.spider = Spider.Spider(self.name, self.imdb_url)
    self.spider.run()
Example no. 9
from general import *
import threading
from Queue import Queue
from crawler import Spider
import sys
from config import *

reload(sys)
sys.setdefaultencoding('utf8')

threadQueue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


def createWorkers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


def work():
    while True:
        url = threadQueue.get()
        # print url + "hello"
        Spider.crawlPage(threading.current_thread().name, url)
        threadQueue.task_done()


def createJobs():
    for link in fileToSet(QUEUE_FILE):
        threadQueue.put(link)  # feed each stored link to the worker threads
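This module is Python 2 (the Queue module, reload(sys), sys.setdefaultencoding). A minimal Python 3 sketch of the same worker-pool pattern, assuming the same Spider.crawlPage interface and config constants as above; the default-encoding hack is unnecessary on Python 3:

import threading
from queue import Queue

from crawler import Spider
from config import PROJECT_NAME, HOMEPAGE, DOMAIN_NAME, NUMBER_OF_THREADS

threadQueue = Queue()
# Creating the first Spider presumably initialises shared crawl state,
# as in the Python 2 version above.
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


def work():
    # Daemon workers consume URLs until the main thread exits.
    while True:
        url = threadQueue.get()
        Spider.crawlPage(threading.current_thread().name, url)
        threadQueue.task_done()


def createWorkers():
    for _ in range(NUMBER_OF_THREADS):
        threading.Thread(target=work, daemon=True).start()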
Example no. 10
from pymongo import *
from flask import Flask, json, request
from crawler import Spider
from bson import json_util
import sys
from datetime import datetime
from threading import Timer

app = Flask(__name__)

client = MongoClient('localhost', 27017)
db = client.crawler
#collection = db.storage
collection = db.storage_test

spider = Spider()
spider.set_datebase(collection)
'''
Initial setup
'''


@app.before_first_request
def start_crawler():
    spider.get_from_category_entry()
    #daily_crawler()


def daily_crawler():
    spider.get_from_category_entry()
    x = datetime.today()
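daily_crawler is cut off above just after it reads the current time, and Timer is imported but never used in what is shown. One hypothetical way to finish the job with threading.Timer, reusing the module's spider; the midnight target and the self-rescheduling wrapper are assumptions, not the original code:

from datetime import datetime, timedelta
from threading import Timer


def schedule_daily_crawl():
    # Crawl now, then re-arm a Timer for the same time tomorrow.
    spider.get_from_category_entry()
    now = datetime.today()
    next_run = (now + timedelta(days=1)).replace(hour=0, minute=0,
                                                 second=0, microsecond=0)
    t = Timer((next_run - now).total_seconds(), schedule_daily_crawl)
    t.daemon = True  # don't block interpreter shutdown
    t.start()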