Exemple #1
0
def main():
    """Download every PDF that is not already present in ``pdf_download``.

    The filename -> URL mapping comes from ``getDownloadUrlAndFilename()``.
    Entries whose filename is already listed by
    ``getAllFilename('./pdf_download')`` are dropped, the rest are queued as
    ``[filename, url]`` pairs, and 100 ``CrawlThread`` workers are started on
    that queue. The function blocks until the queue reports all work done and
    every worker thread has been joined.
    """
    # Create the pdf_download folder (no error if it already exists).
    os.makedirs('pdf_download', exist_ok=True)

    download_dict = getDownloadUrlAndFilename()
    # To retry previously failed downloads instead, build the mapping with
    # getUrlAndFilenameFromTxt('./download_fail.txt').

    pdf_file_list = getAllFilename('./pdf_download')
    print(len(pdf_file_list))

    # Skip files that were already downloaded. pop() with a default is used
    # because the folder may contain files that are not part of this run's
    # mapping; a plain ``del`` would raise KeyError on those.
    for existing_name in pdf_file_list:
        download_dict.pop(existing_name, None)

    download_url_queue = queue.Queue()
    for filename, url in download_dict.items():
        download_url_queue.put([filename, url])

    thread_list = []
    for _ in range(100):
        thread = CrawlThread(download_url_queue)
        thread.start()
        thread_list.append(thread)

    # Wait until every queued item has been marked done.
    # NOTE(review): this relies on CrawlThread calling task_done() per item -
    # confirm against its definition.
    download_url_queue.join()

    for t in thread_list:
        t.join()

    print('MainThread End')
Exemple #2
0
def main():
    """Export data for every file found in ``./pdf_download``.

    Each filename is reduced to the text before its first ``.``; the
    resulting ids are counted, the count is printed, and the list is passed
    to ``getDataByFileurlToExcel``.
    """
    downloaded = getAllFilename('./pdf_download')
    # Keep only the part of each filename that precedes the first dot.
    id_list = [name.split('.', 1)[0] for name in downloaded]
    print(len(id_list))
    getDataByFileurlToExcel(id_list)
def rename():
    """Rename files in ``./pdf_download`` from ``<id>.<ext>`` to ``<uuid>.pdf``.

    The id -> uuid mapping is read from the ``zhiku_data`` table. For each
    file listed by ``getAllFilename('./pdf_download')`` the text before the
    first ``.`` is used as the id. Files with no usable uuid are reported and
    skipped instead of aborting the whole batch.

    If the query fails, the error is printed and nothing is renamed. The
    cursor and connection are closed in every case.
    """
    db, cursor = connectDatabase()
    try:
        id_uuid_dict = {}
        try:
            cursor.execute('SELECT id, uuid FROM zhiku_data')
            for row in cursor.fetchall():
                # Ids are stringified so they compare equal to filename stems.
                id_uuid_dict[str(row[0])] = row[1]
        except Exception as e:
            # Without the mapping no file can be renamed; report and stop
            # rather than failing later with a KeyError on the first file.
            print(e)
            return

        for f in getAllFilename('./pdf_download'):
            key = f.split('.')[0]
            new_stem = id_uuid_dict.get(key)
            if new_stem is None:
                # Unknown id or NULL uuid: leave the file untouched.
                print('skip ' + f + ': no uuid found for id ' + key)
                continue
            new_filename = new_stem + '.pdf'
            print(new_filename)
            os.rename('./pdf_download/' + f, './pdf_download/' + new_filename)
    finally:
        # Release the database resources even if a rename raised.
        cursor.close()
        db.close()