Ejemplo n.º 1
0
            logger.info("Found file " + file_url)
            filename = file_url.split('/')[-2] # splits and gets the word before /download, which is the filename
            try:
                extension = get_extension(filename)
                if extension in archive_types:
                    proj_ext_counter[extension] += 1
            except ValueError: # file without extension
                pass
        else: # is a directory
            file_queue.append("http://sourceforge.net" + file_url)

# Driver: walk every project's SourceForge file listing and record, per
# project, which archive extension occurs most often.
# NOTE(review): 16 is presumably the number of worker threads -- confirm
# against the ThreadPool implementation.
pool = ThreadPool(16)
for project_name in project_names:
    # Both names are rebound at module level for every project.
    # NOTE(review): the worker code elsewhere in this file appends directory
    # URLs to `file_queue` and bumps `proj_ext_counter`, presumably as
    # globals -- so these names must not be renamed or made local.
    proj_ext_counter = Counter()
    file_queue = deque(["http://sourceforge.net/projects/%s/files/" % project_name])
    while file_queue:
        url = file_queue.popleft()
        pool.add_task(visit_project_file, url)
        # Wait only once there is nothing left to hand out. Tasks may refill
        # the queue while they run, so the `while` condition is re-evaluated
        # after the wait; the loop therefore only exits right after a wait
        # that left the queue empty.
        # NOTE(review): assumes wait_completion() blocks until all submitted
        # tasks have finished -- confirm.
        if not file_queue:
            pool.wait_completion()
    # most_common(1) returns an empty list when no known archive file was
    # counted for this project; in that case the project contributes nothing.
    top_entries = proj_ext_counter.most_common(1)
    if top_entries:
        extension = top_entries[0][0]
        ext_counter[extension] += 1

# Parenthesized single-argument form: prints the same under the Python 2
# print statement and is also valid Python 3 syntax.
print(ext_counter)
logger.info("Result: %s" % ext_counter)
error_log.close()
http_log.close()