def downloadFromList(alist, directory=".", timeout=10): """Get files from a list of urls. return : list, contained the failure fetch""" failure = [] for url in alist: print alist.index(url), stream = getStream(url, timeout=timeout) file_name = getFilenameFromURL(url) if not stream or not writeBinFile(stream, file_name, directory): failure.append(url) return failure
def downloadFromQueue(queue, failure, directory='.', timeout=10): """Get files from a list of urls. return : list, contained the failure fetch""" while not queue.empty(): url = queue.get() stream = getStream(url, timeout=timeout) file_name = getFilenameFromURL(url) if stream and writeBinFile(stream, file_name, directory): queue.task_done() print "Fetching", url, 'done.' continue failure.append(url) queue.task_done() return failure
def main(): # 开始准备 prepare() while_n = 0 # 循环计数器 imglist = [] makedir(Config.directory) print 'Generate search url' URL = baseURL() # 下载 ############# # 获取搜索结果数量并与_count比较取其较小值 count = min(searchResult(URL), Config.count) # 没有搜索结果时退出 if not count: print "No search result at current condition." sys.exit(1) # 获得指定数量的url, 存放于list print 'Fetching page', while len(imglist) < count: print while_n, while_n += 1 tmplist = getImageUrlList(URL) imglist = imglist + tmplist URL = nextPage(URL, len(tmplist)) print '' # 换行 count = len(imglist) print "There're %d files to download" % count # 将已有文件从imglist中去除 imglist = [url for url in imglist if not getFilenameFromURL(url) in os.listdir(Config.directory)] print "There's %d files already downloaded." % (count - len(imglist)) # 下载该list print 'Fetching list of %d files' % len(imglist) queue = Queue() for url in imglist: queue.put(url) failure = [] for i in range(Config.thread_count): start_new_thread(downloadFromQueue, ( queue, failure, Config.directory, Config.timeout)) queue.join() print "%d failed to fetch." % len(failure)
count = min(searchResult(searchURL), count) # 没有搜索结果时退出 if not count: print "No search result at current condition." sys.exit(1) # 获得指定数量的url, 存放于list ,one page by one page print 'Fetching page', while len(imglist) < count: print while_n, #mark the times of while while_n += 1 tmplist = getImageUrlList(searchURL) imglist = imglist + tmplist searchURL = nextPage(searchURL, len(tmplist)) print '' # 换行 count = len(imglist) print "There're %d files to download" % count # 将已有文件从imglist中去除 imglist = [url for url in imglist if not getFilenameFromURL(url) in os.listdir(directory)] print "There's %d files already downloaded." % (count - len(imglist)) # 下载该list 使用超时20 10好像小了点 print 'Fetching list of %d files' % len(imglist) failure = threadDownloadFromList(imglist, directory=directory, timeout=20) print "%d failed to fetch." % len(failure) # 清理 # 1.添加后缀 print 'Adding extension ...', for fname in os.listdir(directory): addExtension(directory + os.sep + fname, '.jpg') print 'done.'
def run(self):
    """Fetch self.url and write it into self.directory, recording failures.

    The URL is appended to self.failure when either the fetch or the file
    write does not succeed; self.finished is set to True in every case.
    """
    body = getStream(self.url, timeout=self.timeout)
    target = getFilenameFromURL(self.url)
    ok = bool(body) and writeBinFile(body, target, self.directory)
    if not ok:
        self.failure.append(self.url)
    self.finished = True