filePattern = re.compile( '<a href="/dict_detail\?dict_id=(\d+)">(.*?)</a>') fileList = re.findall(filePattern, data) for id, name in fileList: # print id, name.decode('gbk') fileURL = 'http://dict.qq.pinyin.cn/download?dict_id=' + id filePath = cateDir.decode('utf8') + '/' + name.decode( 'gbk') + '.qpyd' if fileURL in downloaded: continue else: downloaded.add(fileURL) print fileURL, name.decode( 'gbk') + '.qpyd is downloading..........' downloadSingleFile.downloadSingleFile(fileURL, filePath, logFile) print filePath + ' is downloaded!!' for i in visited: print i if __name__ == '__main__': baseDir = '/data/tgz/ThesaurusSpider/QQTheaurusSpider/QQ/' # 路径最后不能添加/ logFile = baseDir + '/' + 'download.log' category = getQQCategory.getCategory() for bigCate in category: for smallCate in category[bigCate]: downloadSingleType(bigCate.encode('utf8'), smallCate.encode('utf8'), baseDir, logFile)
queue.put(pageURL) # 下载当前页面存在的文件 filePattern = re.compile('<a href="/dict_detail\?dict_id=(\d+)">(.*?)</a>') fileList = re.findall(filePattern,data) for id, name in fileList: # print id, name.decode('gbk') fileURL = 'http://dict.qq.pinyin.cn/download?dict_id='+id filePath = cateDir.decode('utf8')+'/'+name.decode('gbk')+'.qpyd' if fileURL in downloaded: continue else: downloaded.add(fileURL) print fileURL,name.decode('gbk')+'.qpyd is downloading..........' downloadSingleFile.downloadSingleFile(fileURL, filePath, logFile) print filePath+' is downloaded!!' for i in visited: print i if __name__ == '__main__': baseDir = '/Users/f/downloads/wx/dicts/qqdata' # 路径最后不能添加/ logFile = baseDir+'/'+'download.log' category = getQQCategory.getCategory() for bigCate in category: for smallCate in category[bigCate]: downloadSingleType(bigCate.encode('utf8'), smallCate.encode('utf8'), baseDir, logFile)