def main():
    """Assemble the processing pipeline and serve it over a socket.

    Chains a Kalman filter, a distance extractor, and a single-series
    plot (y range 0-10, keyed on 'distance') into one task queue, then
    exposes the queue's update step via socketRun on port 8070.
    """
    queue = TaskQueue()
    kalman = KalmanFilter()
    distance = GetDistance()
    plot = DrawSingle(ylim_min=0, ylim_max=10, key='distance')
    # Register the stages in pipeline order.
    for stage in (kalman, distance, plot):
        queue.append(stage)
    socketRun(queue.update, port=8070)
def test_page_crawler_init(self):
    """Test generic page crawler initialization.

    BUG FIX: the original rebound ``url`` for the unicode and
    unicode-query cases but never rebuilt ``page``, so every crawler
    was constructed from the same first Page and the later URLs were
    dead assignments.  Each URL now gets its own Page so crawler
    initialization is actually exercised per-URL.
    """
    queue = TaskQueue()
    keywords = ['nyu', 'poly']
    urls = [
        'http://www.nyu.edu/engineering',
        u'http://www.nyu.edu/engineering',
        u'http://www.google.com/search?q=♥',  # non-ASCII query string
    ]
    for url in urls:
        page = Page(url, depth=1, score=9)
        # fake=True keeps the crawler from doing real network work.
        GenericPageCrawler(page, queue, None, None, keywords, fake=True)
def main():
    """Main routine: load settings, build the queue service and the
    de-duplication cache, then hand control to the dispatcher."""
    settings = Settings()     # argument passing and config file reading
    queue = TaskQueue()       # start queue service
    dedupe = DeDupeCache()    # start de-duplicate hash
    # Kick off the dispatcher with the three collaborators.
    dispatcher = Dispatcher(queue, dedupe, settings)
    dispatcher.run()
def test_simple_enqueue_dequeue(self):
    """Enqueue a single task, verify counters, dequeue and verify fields.

    IDIOM FIX: ``assertTrue(a == b)`` replaced with ``assertEqual(a, b)``
    so failures report the actual and expected values instead of just
    "False is not true".
    """
    q = TaskQueue()
    task = Page('http://www.google.com', 1, 80)
    q.en_queue(task)
    self.assertEqual(q.total_task_cnt, 1)
    self.assertEqual(q.prio_task_cnt[0], 1)
    self.assertEqual(q.prio_task_list[0], [task])
    outtask = q.de_queue()
    self.assertEqual(outtask.depth, 1)
    self.assertEqual(outtask.score, 80)
    self.assertEqual(outtask.url, 'http://www.google.com')
    # Dequeuing the only task must leave the queue empty again.
    self.check_empty_queue(q)
def test_normalize_url(self):
    """Test that normalize_link strips URL fragment identifiers.

    CLEANUP: removed the unused ``page``, ``queue`` and ``keywords``
    locals the original built but never referenced, and switched the
    ``assertTrue(a == b)`` anti-pattern to ``assertEqual`` for
    informative failure messages.
    """
    # Single fragment is dropped.
    url = 'http://www.poly.edu/admission/page.html#tuition'
    self.assertEqual(vc.normalize_link(url),
                     'http://www.poly.edu/admission/page.html')
    # Multiple fragments are dropped too.
    url2 = 'http://www.poly.edu/admission/page.html#tuition#abc'
    self.assertEqual(vc.normalize_link(url2),
                     'http://www.poly.edu/admission/page.html')
def test_simplify_url(self):
    """Test simplify_link: resolves '.'/'..' path segments and drops
    default index pages.

    CLEANUP: removed the unused ``page``, ``queue`` and ``keywords``
    locals, replaced the ``assertTrue(a == b)`` anti-pattern with
    ``assertEqual``, and folded the repetitive cases into a
    table-driven loop.
    """
    cases = [
        # '..' consumes the preceding segment.
        ('http://www.poly.edu/admission/../page.html',
         'http://www.poly.edu/page.html'),
        # '.' is a no-op segment.
        ('http://www.poly.edu/./page.html',
         'http://www.poly.edu/page.html'),
        # '..' at the root cannot climb above the host.
        ('http://www.poly.edu/../../../../page.html',
         'http://www.poly.edu/page.html'),
        ('http://www.poly.edu/aa/bb/cc/../page.html',
         'http://www.poly.edu/aa/bb/page.html'),
        ('http://www.poly.edu/aa/bb/cc/../../../page.html',
         'http://www.poly.edu/page.html'),
        # One more '..' than there are segments still stops at the root.
        ('http://www.poly.edu/aa/bb/cc/../../../../page.html',
         'http://www.poly.edu/page.html'),
        # Mixed '.' and '..' runs.
        ('http://www.poly.edu/./././aa/././././bb/./cc/.././././page.html',
         'http://www.poly.edu/aa/bb/page.html'),
        # '..' applied after simplifying an earlier '..'.
        ('http://www.poly.edu/a/../../b/index.html',
         'http://www.poly.edu/b'),
    ]
    for url, expected in cases:
        self.assertEqual(vc.simplify_link(url), expected)
    # Default index pages of every common flavor are stripped entirely.
    index_urls = [
        'http://www.poly.edu/index.html',
        'http://www.poly.edu/index.htm',
        'http://www.poly.edu/index.jsp',
        'http://www.poly.edu/index.asp',
        'http://www.poly.edu/index.aspx',
        'http://www.poly.edu/index.php',
    ]
    for url in index_urls:
        self.assertEqual(vc.simplify_link(url), 'http://www.poly.edu')
def test_bulk_enqueue_dequeue(self):
    """Enqueue 10000 identical tasks, then drain the queue verifying
    every dequeued task's fields.

    IDIOM FIX: ``assertTrue(a == b)`` replaced with ``assertEqual``
    for informative failures; ``while 1`` modernized to ``while True``;
    the repeated magic count hoisted into one constant.
    """
    count = 10000
    q = TaskQueue()
    for _ in range(count):
        q.en_queue(Page('http://www.nyu.edu/engineering', 2, 60))
    self.assertEqual(q.total_task_cnt, count)
    self.assertEqual(q.prio_task_cnt[0], count)
    self.assertEqual(len(q.prio_task_list[0]), count)
    # Drain: de_queue returns a falsy value once the queue is empty.
    while True:
        outtask = q.de_queue()
        if not outtask:
            break
        self.assertEqual(outtask.url, 'http://www.nyu.edu/engineering')
        self.assertEqual(outtask.depth, 2)
        self.assertEqual(outtask.score, 60)
    self.check_empty_queue(q)
def test_init(self):
    """A freshly constructed TaskQueue must start out empty."""
    queue = TaskQueue()
    self.check_empty_queue(queue)