def test_site_craigslist_ads(self):
        all_urls = test_util.get_urls("http://newyork.craigslist.org/search/edu", None)
        print 'urls', len(all_urls), all_urls
        c = cluster_urls(all_urls, 2)
        clusters = {'url_clusters': [] }
        for key, urls in c['clusters'].items():                
            cluster = {
                'regex': key[0],
                'human': key[1],
                'urls': urls 
            }
            clusters['url_clusters'].append(cluster)        
        print clusters
        print '---------------------------------'        
 
        all_urls = test_util.get_urls("http://newyork.craigslist.org/search/eng", None)
        print 'urls', len(all_urls), all_urls
        c = cluster_urls(all_urls, 2)
        clusters = {'url_clusters': [] }
        for key, urls in c['clusters'].items():                
            cluster = {
                'regex': key[0],
                'human': key[1],
                'urls': urls 
            }
            clusters['url_clusters'].append(cluster)        
        print clusters
        print '---------------------------------'        
 def test_single_cluster(self):
     """URLs that differ only in a trailing number form a single cluster."""
     # NOTE(review): another test_single_cluster is defined further down in
     # this file; if both belong to the same class, the later definition
     # replaces this one and this body never runs.
     numbered = ['http://s.com/blah/%d' % n for n in range(1, 20)]
     result = cluster_urls(numbered, 10)
     expected_keys = [
         ('http://s.com/blah/(\\d+)', 'http://s.com/blah/[NUMBER]'),
     ]
     # Nothing may be left over, and the one cluster must hold every URL.
     self.assertEqual(result['unclustered'], [])
     self.assertEqual(sorted(result['clusters'].keys()), sorted(expected_keys))
     self.assertEqual(sorted(result['clusters'].values()), sorted([numbered]))
 def test_mixed(self):
     """Query-string URLs cluster together; two unrelated URLs do not."""
     # NOTE(review): another test_mixed is defined further down in this
     # file; if both belong to the same class, the later definition replaces
     # this one and this body never runs.
     with_id = ['http://s.com/blah/?id=%d' % n for n in range(1, 20)]
     strays = ['http://s.com/asdf', 'http://s.com/a/a/b']
     result = cluster_urls(with_id + strays, 10)
     expected_keys = [
         ('http://s.com/blah/?\\?id=(\\d+)', 'http://s.com/blah?id=[NUMBER]'),
     ]
     # The stray URLs must come back untouched in 'unclustered'.
     self.assertEqual(sorted(result['unclustered']), sorted(strays))
     self.assertEqual(sorted(result['clusters'].keys()), sorted(expected_keys))
     self.assertEqual(sorted(result['clusters'].values()), sorted([with_id]))
 def test_single_cluster(self):
     """Nineteen numbered URLs under one path collapse into one pattern."""
     # NOTE(review): an earlier method in this file has the same name;
     # assuming both sit in the same class, this later definition is the
     # one that actually runs.
     urls = ['http://s.com/blah/%d' % i for i in range(1, 20)]
     outcome = cluster_urls(urls, 10)
     self.assertEqual(outcome['unclustered'], [])

     found = outcome['clusters']
     wanted_key = ('http://s.com/blah/(\\d+)', 'http://s.com/blah/[NUMBER]')
     self.assertEqual(sorted(found.keys()), sorted([wanted_key]))
     # The single cluster's member list must be exactly the input URLs.
     self.assertEqual(sorted(found.values()), sorted([urls]))
 def test_mixed(self):
     """One clusterable URL family plus two leftovers that stay unclustered."""
     # NOTE(review): an earlier method in this file has the same name;
     # assuming both sit in the same class, this later definition is the
     # one that actually runs.
     c_urls = ['http://s.com/blah/?id=%d' % i for i in range(1, 20)]
     u_urls = ['http://s.com/asdf', 'http://s.com/a/a/b']
     outcome = cluster_urls(c_urls + u_urls, 10)
     self.assertEqual(sorted(outcome['unclustered']), sorted(u_urls))

     found = outcome['clusters']
     wanted_key = ('http://s.com/blah/?\\?id=(\\d+)',
                   'http://s.com/blah?id=[NUMBER]')
     self.assertEqual(sorted(found.keys()), sorted([wanted_key]))
     # Only the id-bearing URLs may appear as cluster members.
     self.assertEqual(sorted(found.values()), sorted([c_urls]))
 def test_other(self):
     """Three URL families yield three patterns after improve_patterns."""
     # NOTE(review): another test_other is defined further down in this
     # file; if both belong to the same class, the later definition replaces
     # this one and this body never runs.
     blah = ['http://s.com/blah/%d' % n for n in range(1, 20)]
     nested = ['http://s.com/a/b/aa%dbb' % n for n in range(1, 20)]
     other_host = ['http://b.com/ab/aa%dbb' % n for n in range(1, 50)]
     result = cluster_urls(blah + nested + other_host, 10)
     # The return value is ignored, so improve_patterns presumably rewrites
     # the clusters mapping in place -- confirm against its definition.
     improve_patterns(result['clusters'])
     expected_keys = [
         ('http://b.com/ab/aa([^/]+)bb', 'http://b.com/ab/aa[...]bb'),
         ('http://s.com/blah/(\\d+)', 'http://s.com/blah/[NUMBER]'),
         ('http://s.com/a/b/aa([^/]+)bb', 'http://s.com/a/b/aa[...]bb'),
     ]
     self.assertEqual(result['unclustered'], [])
     self.assertEqual(sorted(result['clusters'].keys()), sorted(expected_keys))
 def test_site_backpage(self):
    all_urls = test_util.get_urls("http://www.backpage.com", None)
    print 'urls', len(all_urls), all_urls
    c = cluster_urls(all_urls, 2)
    clusters = {'url_clusters': [] }
    for key, urls in c['clusters'].items():                
        cluster = {
            'regex': key[0],
            'human': key[1],
            'urls': urls 
        }
        clusters['url_clusters'].append(cluster)        
    print clusters
    print '---------------------------------'
 def test_other(self):
     """Check the cluster keys produced for three distinct URL shapes."""
     # NOTE(review): an earlier method in this file has the same name;
     # assuming both sit in the same class, this later definition is the
     # one that actually runs.
     x_urls = ['http://s.com/blah/%d' % i for i in range(1, 20)]
     y_urls = ['http://s.com/a/b/aa%dbb' % i for i in range(1, 20)]
     z_urls = ['http://b.com/ab/aa%dbb' % i for i in range(1, 50)]
     outcome = cluster_urls(x_urls + y_urls + z_urls, 10)
     # Called for its effect on the mapping; the return value is not used.
     improve_patterns(outcome['clusters'])
     self.assertEqual(outcome['unclustered'], [])

     wanted = [
         ('http://b.com/ab/aa([^/]+)bb', 'http://b.com/ab/aa[...]bb'),
         ('http://s.com/blah/(\\d+)', 'http://s.com/blah/[NUMBER]'),
         ('http://s.com/a/b/aa([^/]+)bb', 'http://s.com/a/b/aa[...]bb'),
     ]
     # Only the keys are checked here, not the member URLs.
     self.assertEqual(sorted(outcome['clusters'].keys()), sorted(wanted))