コード例 #1
0
def main():
    ''' entry point: build the crawler components and run the dispatcher '''

    # Settings takes care of argument passing and config file reading
    settings = Settings()

    # queue service
    task_queue = TaskQueue()

    # de-duplicate hash
    dedupe_cache = DeDupeCache()

    # wire the three pieces into the dispatcher and kick it off
    Dispatcher(task_queue, dedupe_cache, settings).run()
コード例 #2
0
  def test_check_dup(self):
    ''' test de-dupe simple case with 3 URLs

    Every URL is submitted twice.  The test expects the first
    is_url_dup() call to return False and raise url_count by one, and
    the second call to return True while url_count stays unchanged.
    '''
    cc = DeDupeCache()
    urls = (
      'http://www.google.com',
      'http://www.nyu.edu',
      'http://www.nyu.edu/engineering',
    )

    for expected_count, url in enumerate(urls, 1):
      # first sighting: reported as new, and counted
      self.assertFalse(cc.is_url_dup(url), url)
      # assertEqual (not assertTrue(a == b)) so a failure shows both values
      self.assertEqual(cc.url_count, expected_count)

      # second sighting: reported as duplicate, count must not move
      self.assertTrue(cc.is_url_dup(url), url)
      self.assertEqual(cc.url_count, expected_count)
コード例 #3
0
  def test_check_bulk(self):
    ''' test de-dupe with 100k URLs

    Phase 1 inserts 100k distinct URLs; each one is expected to be
    reported as new once and as a duplicate on the two following
    look-ups, with url_count tracking the number of distinct URLs.
    Phase 2 deletes every URL again and expects url_count to drop by
    one per deletion, ending with a cache that passes
    check_empty_cache().
    '''
    cc = DeDupeCache()
    total = 100000
    url_fmt = 'http://www.nyu.edu/engineering/access.aspx?magicnum=%d'

    for idx in range(total):
      url = url_fmt % idx
      seen = idx + 1  # distinct URLs submitted so far, including this one

      self.assertFalse(cc.is_url_dup(url), url)
      # assertEqual (not assertTrue(a == b)) so a failure shows both values
      self.assertEqual(cc.url_count, seen)

      # repeated look-ups must keep reporting a duplicate without counting
      for _ in range(2):
        self.assertTrue(cc.is_url_dup(url), url)
        self.assertEqual(cc.url_count, seen)

    for idx in range(total):
      cc.del_url(url_fmt % idx)
      self.assertEqual(cc.url_count, total - idx - 1)

    self.check_empty_cache(cc)
コード例 #4
0
 def test_init_cache(self):
   ''' a freshly constructed DeDupeCache() must pass the empty-cache checks '''
   self.check_empty_cache(DeDupeCache())