import csv
import os


def downloadArchivesList(aList, container, extension='.txt.gz', numThreads=5):
    '''Download the archives of every mailing list whose start URL is listed
    in the CSV file <aList> (one URL in the first column of each row).
    Store the files for each list in a subfolder of <container>.
    '''
    # Set up the downloader threads
    queue = initDownloader(numThreads)

    with open(aList, 'r', newline='') as f:
        reader = csv.reader(f)
        for row in reader:
            startURL = row[0]

            # The mailing list name is the second-to-last path component
            mlName = startURL.split('/')[-2]

            spider = Spider(startURL)
            spider.process_page(startURL)

            # Only the links to archive files are interesting:
            # mailing list archive file names end with <extension>
            urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
            if urlList:
                print('%s: %d archives' % (mlName, len(urlList)))
                # Create a folder for the mailing list
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)

                # Queue each archive for download
                addToQ(queue, urlList, store)

    # All downloads queued; stop the worker threads
    stopDownloader(queue, numThreads)
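
# A minimal usage sketch, assuming a hypothetical CSV file 'lists.csv' whose
# first column holds one mailing list start URL per row, e.g.:
#
#   http://lists.example.org/pipermail/dev/
#   http://lists.example.org/pipermail/users/
#
# downloadArchivesList('lists.csv', 'archives')
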
def downloadArchives(startURL, container, lookInsideSubfolders=False, extension='.txt.gz', numThreads=5):
    '''Crawl <startURL> and find all mailing list archives (identified by the
    file name <extension>). Store the files in the folder at the path
    <container>. If <lookInsideSubfolders>, go one level deeper and crawl all
    first-order links as well.
    '''
    # Set up the downloader threads
    queue = initDownloader(numThreads)

    print('Downloading archives from', startURL)
        
    if not lookInsideSubfolders:
        spider = Spider(startURL)
        spider.process_page(startURL)

        # Only the links to archive files are interesting:
        # mailing list archive file names end with <extension>
        urlList = [x for x in sorted(spider.URLs) if x.endswith(extension)]
        print('%d archives' % len(urlList))

        addToQ(queue, urlList, container)
            
    else:
        spider = Spider(startURL)
        spider.process_page(startURL)

        # Crawl each first-order link; every subfolder is one mailing list
        for link in sorted(spider.URLs):
            subspider = Spider(link)
            subspider.process_page(link)

            # The mailing list name is the second-to-last path component
            mlName = link.split('/')[-2]

            # Only the links to archive files are interesting:
            # mailing list archive file names end with <extension>
            urlList = [x for x in sorted(subspider.URLs) if x.endswith(extension)]
            if urlList:
                print('%s: %d archives' % (mlName, len(urlList)))
                # Create a folder for the mailing list
                store = os.path.join(container, mlName)
                if not os.path.isdir(store):
                    os.makedirs(store)

                addToQ(queue, urlList, store)
                    
    # All downloads queued; stop the worker threads
    stopDownloader(queue, numThreads)
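
# The helpers used above (initDownloader, addToQ, stopDownloader, and the
# Spider crawler) are defined elsewhere in this project. Below is a minimal
# sketch of the threaded downloader they imply, assuming a queue.Queue of
# (url, folder) jobs, one None sentinel per thread as the stop signal, and
# urllib.request for the transfer itself; the real implementation may differ.

import queue
import threading
import urllib.request


def _worker(q):
    # Pull (url, folder) jobs off the queue until a None sentinel arrives
    while True:
        job = q.get()
        if job is None:
            q.task_done()
            break
        url, folder = job
        target = os.path.join(folder, url.split('/')[-1])
        try:
            urllib.request.urlretrieve(url, target)
        finally:
            q.task_done()


def initDownloader(numThreads):
    # Start <numThreads> daemon worker threads sharing one job queue
    q = queue.Queue()
    for _ in range(numThreads):
        threading.Thread(target=_worker, args=(q,), daemon=True).start()
    return q


def addToQ(q, urlList, folder):
    # Queue one (url, folder) job per archive file
    for url in urlList:
        q.put((url, folder))


def stopDownloader(q, numThreads):
    # Send one sentinel per thread, then wait for the queue to drain
    for _ in range(numThreads):
        q.put(None)
    q.join()


# Example call (hypothetical URL):
#   downloadArchives('http://lists.example.org/pipermail/', 'archives',
#                    lookInsideSubfolders=True)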