filePattern = re.compile(
            '<a href="/dict_detail\?dict_id=(\d+)">(.*?)</a>')
        fileList = re.findall(filePattern, data)
        for id, name in fileList:
            # print id, name.decode('gbk')
            fileURL = 'http://dict.qq.pinyin.cn/download?dict_id=' + id
            filePath = cateDir.decode('utf8') + '/' + name.decode(
                'gbk') + '.qpyd'

            if fileURL in downloaded:
                continue
            else:
                downloaded.add(fileURL)
                print fileURL, name.decode(
                    'gbk') + '.qpyd is downloading..........'
                downloadSingleFile.downloadSingleFile(fileURL, filePath,
                                                      logFile)
                print filePath + ' is downloaded!!'

    for i in visited:
        print i


if __name__ == '__main__':
    # Root directory handed to downloadSingleType. It must NOT end with '/':
    # a '/' separator is appended when paths are built below, so a trailing
    # slash here would produce a doubled separator ('QQ//download.log').
    baseDir = '/data/tgz/ThesaurusSpider/QQTheaurusSpider/QQ'
    # Log file path passed along to every downloadSingleType call.
    logFile = baseDir + '/' + 'download.log'
    # getCategory() result is iterated for big categories and indexed by each
    # one to obtain its small categories (presumably a dict -- confirm in
    # getQQCategory).
    category = getQQCategory.getCategory()
    for bigCate in category:
        for smallCate in category[bigCate]:
            # Category names are passed on UTF-8 encoded.
            downloadSingleType(bigCate.encode('utf8'),
                               smallCate.encode('utf8'), baseDir, logFile)
Exemple #2
0
            queue.put(pageURL)

        # 下载当前页面存在的文件
        filePattern = re.compile('<a href="/dict_detail\?dict_id=(\d+)">(.*?)</a>')
        fileList = re.findall(filePattern,data)
        for id, name in fileList:
             # print id, name.decode('gbk')
             fileURL = 'http://dict.qq.pinyin.cn/download?dict_id='+id
             filePath = cateDir.decode('utf8')+'/'+name.decode('gbk')+'.qpyd'

             if fileURL in downloaded:
                 continue
             else:
                 downloaded.add(fileURL)
                 print fileURL,name.decode('gbk')+'.qpyd is downloading..........'
                 downloadSingleFile.downloadSingleFile(fileURL, filePath, logFile)
                 print filePath+' is downloaded!!'

    for i in visited:
        print i


if __name__ == '__main__':
    # Download root; the path must not end with '/'.
    baseDir = '/Users/f/downloads/wx/dicts/qqdata'
    # Log file located directly under baseDir; passed to every download call.
    logFile = '/'.join((baseDir, 'download.log'))
    category = getQQCategory.getCategory()
    # Visit every (big category, small category) pair and hand the
    # UTF-8 encoded names to downloadSingleType.
    for majorName in category:
        minorNames = category[majorName]
        for minorName in minorNames:
            majorBytes = majorName.encode('utf8')
            minorBytes = minorName.encode('utf8')
            downloadSingleType(majorBytes, minorBytes, baseDir, logFile)