def test_download_midi():
    """Log in, order and pick up one hard-coded track, then log out.

    The track is identified by the comma-separated string
    ``composer_id,work_id,page_id,track_id``. The driver is always closed,
    even when a step raises.
    """
    track_spec = '2013,31394,1,1'
    browser = utils.start_driver('phantomjs', verbose=True)
    try:
        session.login(browser, credential)

        ids = [int(field) for field in track_spec.split(',')]
        composer_id, work_id, page_id, track_id = ids

        job = download.Download(
            browser, composer_id, work_id, page_id, track_id)
        job.order()
        utils.wait(3)

        # cleanup() is only attempted when pickup() returned something truthy.
        fname = job.pickup('midi/{}'.format(composer_id))
        if fname and job.cleanup():
            utils.print_message(
                'successfully downloaded {}'.format(job.track.title))
            utils.print_message('output file: {}'.format(fname))

        utils.wait(3)
        session.logout(browser)
    finally:
        utils.close_driver(browser, verbose=True)
def test_login():
    """Log in, report the login state, and log out again.

    Pauses five seconds after login and again after the state check.
    The driver is closed whether or not any step raises.
    """
    pause_seconds = 5
    browser = utils.start_driver('phantomjs', verbose=True)
    try:
        session.login(browser, credential)
        utils.wait(pause_seconds)

        session.is_login(browser, verbose=True)
        utils.wait(pause_seconds)

        session.logout(browser)
    finally:
        utils.close_driver(browser, verbose=True)
def test_get_composer_data():
    """Collect all works of composer 2062 and write them to ``2062.json``."""
    browser = utils.start_driver('phantomjs', verbose=True)
    try:
        target = composer.Composer(browser, '2062')
        target.get_all_works()
        target.format_json(fname_out='2062.json')
    finally:
        # Release the driver even if extraction fails.
        utils.close_driver(browser, verbose=True)
def get_all_composers(out_json_fname, out_list_fname):
    """Save the full composer listing as JSON plus a sorted ID list.

    Args:
        out_json_fname: Path passed to ``utils.save_json`` for the data
            returned by ``composers.get_all_composers``.
        out_list_fname: Path of a text file that receives one composer ID
            per line, in ascending numeric order.
    """
    driver = utils.start_driver('phantomjs', verbose=True)
    try:
        composer_json = composers.get_all_composers(driver)
        utils.save_json(composer_json, out_json_fname)

        # write composer id list
        # The ID is the last component of each entry's URL with the '.html'
        # suffix removed. The loop variable is deliberately not named
        # ``composer`` so it does not shadow the ``composer`` module used
        # elsewhere in this file.
        composer_id_list = sorted(
            int(entry['url'].split('/')[-1].replace('.html', ''))
            for entry in composer_json)

        # ``with`` closes the file even if one of the writes raises.
        with open(out_list_fname, 'w') as fout:
            utils.print_message('wrinting composer ID list ' + out_list_fname)
            for composer_id in composer_id_list:
                print(composer_id, file=fout)
    finally:
        utils.close_driver(driver, verbose=True)
def get_composer_works(composers_list_fname, skip_exist=True):
    """Extract the works of every listed composer into per-composer JSON.

    Each composer is written to ``<root_dir>/data/composer/<id>.json``.
    Extraction is attempted up to 10 times per composer, waiting three
    seconds after each failure; a composer that still fails is skipped.

    Args:
        composers_list_fname: Text file with one composer ID per line.
        skip_exist: When true, composers whose output JSON already exists
            are not processed again.
    """
    max_retry = 10

    # start virtual display
    display = Display(visible=0, size=(1024, 768))
    display.start()
    try:
        driver = utils.start_driver('chrome', verbose=True)
        try:
            # ``with`` ensures the list file is closed when the loop ends.
            with open(composers_list_fname) as id_file:
                for line in id_file:
                    composer_id = line.rstrip()
                    if not composer_id:
                        # A blank line carries no ID; nothing to extract.
                        continue

                    out_json = '{}/data/composer/{}.json'.format(
                        root_dir, composer_id)
                    if skip_exist and os.path.isfile(out_json):
                        continue

                    for nretry in range(max_retry):
                        try:
                            utils.print_message(
                                'extract works of composer {}'.format(
                                    composer_id))
                            c = composer.Composer(driver, composer_id)
                            c.get_all_works()
                            c.format_json(fname_out=out_json)
                            utils.print_message('\n')
                            break
                        except Exception:
                            # ``Exception`` rather than a bare ``except`` so
                            # that Ctrl-C / SystemExit abort the run instead
                            # of being retried.
                            utils.print_message(
                                '*ERROR* failed to extract works of '
                                'composer {} (#retry={})'.format(
                                    composer_id, nretry))
                            utils.wait(3)
        finally:
            utils.close_driver(driver, verbose=True)
    finally:
        # Stop the virtual display even if the driver could not be started.
        display.stop()
def check_composer_ntrack(composers_list_fname, online_mode):
    """Report composers whose stored track count looks inconsistent.

    For every composer ID in the list, the tracks in
    ``<root_dir>/data/composer/<id>.json`` are counted by walking
    ``work_list`` -> ``page_list`` -> ``track_list``.

    * Local mode: the count is compared with the JSON's own ``ntrack`` field.
    * Online mode: the count is compared with the number parsed from the
      composer's page on the site (up to 10 attempts; if all fail the online
      count stays 0 and the composer is reported unless its local count is
      also 0).

    The mismatching composer IDs are printed at the end.

    Args:
        composers_list_fname: Text file with one composer ID per line.
        online_mode: Truthy to compare against the website, falsy to compare
            against the ``ntrack`` value stored in the JSON.
    """
    host = 'https://www.classicalarchives.com'
    composer_dir = '{}/data/composer'.format(root_dir)
    max_retry = 10
    mismatch_list = []

    if online_mode:
        driver = utils.start_driver('phantomjs', verbose=True)
    try:
        # ``with`` ensures the list file is closed when the loop ends.
        with open(composers_list_fname) as id_file:
            for line in id_file:
                composer_id = line.rstrip()
                if not composer_id:
                    # A blank line carries no ID; nothing to check.
                    continue

                composer_json = '{}/{}.json'.format(composer_dir, composer_id)
                # Named ``composer_data`` so the ``composer`` module used
                # elsewhere in this file is not shadowed.
                composer_data = utils.load_json(composer_json, verbose=False)

                ntrack = sum(
                    len(page['track_list'])
                    for work in composer_data['work_list']
                    for page in work['page_list'])

                if not online_mode:
                    if composer_data['ntrack'] != ntrack:
                        mismatch_list.append(composer_id)
                    continue

                # online mode
                ntrack_online = 0
                for nretry in range(max_retry):
                    try:
                        utils.open_url(
                            driver,
                            host + '/midi/composer/{}.html'.format(
                                composer_id),
                            reopen=True)
                        counts_text = driver.find_element_by_xpath(
                            '//div[@id="wMidi"]//li[@class="counts"]').text
                        # The count is the second whitespace-separated token,
                        # possibly written with thousands separators.
                        ntrack_online = int(
                            counts_text.split()[1].replace(',', ''))
                        break
                    except Exception:
                        # ``Exception`` rather than a bare ``except`` so that
                        # Ctrl-C / SystemExit abort the run instead of being
                        # retried.
                        utils.print_message(
                            '*ERROR* failed to extract #track for '
                            'composer {} (#retry={})'.format(
                                composer_id, nretry))
                        utils.wait(3)

                print('composer={}, #track online={} local={}'.format(
                    composer_id, ntrack_online, ntrack))
                if ntrack_online != ntrack:
                    mismatch_list.append(composer_id)
    finally:
        if online_mode:
            utils.close_driver(driver, verbose=True)

    print('Found {} composers with inconsistent #tracks ({} mode)\n'.format(
        len(mismatch_list), 'online' if online_mode else 'local'))
    for composer_id in mismatch_list:
        print(composer_id)
def _midi_already_downloaded(out_dir, fname_prefix):
    """Return True if ``out_dir`` holds a file named ``<fname_prefix>.*``."""
    return any(local_fname.startswith(fname_prefix + '.')
               for local_fname in os.listdir(out_dir))


def _download_listed_midis(driver, midi_list_fname, ntrack_max):
    """Download tracks listed in ``midi_list_fname`` until a stop condition.

    Stops at the end of the list, at the first download that fails, or once
    ``ntrack_max`` tracks have been fetched in this run. Tracks whose output
    file already exists are skipped and do not count towards the limit.
    """
    ntrack = 0
    # ``with`` ensures the list file is closed when the loop ends.
    with open(midi_list_fname) as midi_list:
        for line in midi_list:
            line = line.rstrip()
            if not line:
                # A blank line has no IDs to parse; int('') would raise.
                continue
            composer_id, work_id, page_id, track_id = map(
                int, line.split(','))

            out_dir = '{}/data/midi/{}'.format(root_dir, composer_id)
            os.makedirs(out_dir, exist_ok=True)
            fname_prefix = 'composer_{}.work_{}.page_{}.track_{}'.format(
                composer_id, work_id, page_id, track_id)

            # check if output midi already exists
            if _midi_already_downloaded(out_dir, fname_prefix):
                continue

            # create a download job
            job = download.Download(
                driver, composer_id, work_id, page_id, track_id)
            job.order()
            utils.wait(5)

            # check if a download is successfully created
            fname = job.pickup(out_dir)
            if not (fname and job.cleanup()):
                break

            # NOTE(review): the message reports progress out of 100 while the
            # loop stops at ntrack_max (103 by default) - confirm which one
            # is intended.
            utils.print_message('successfully downloaded {} ({}/100)'
                                .format(job.track.title, ntrack+1))
            utils.print_message('output file: {}'.format(fname))
            utils.print_message(' ')
            utils.wait(3)

            ntrack += 1
            # if daily limit is reached
            if ntrack == ntrack_max:
                break


def get_composer_midis(midi_list_fname):
    """Log in and download the MIDI files listed in ``midi_list_fname``.

    Each line of the list is ``composer_id,work_id,page_id,track_id``.
    Files are stored under ``<root_dir>/data/midi/<composer_id>/``. The run
    ends at the first failed download or after 103 downloads.

    Logout, driver shutdown and virtual-display shutdown each run in their
    own ``finally`` so that a failure in one does not skip the others.

    Args:
        midi_list_fname: Text file listing the tracks to download.
    """
    utils.print_message(
        '---------- Start Time: {} ----------'.format(time.ctime()))

    ntrack_max = 103  # 100 midis/day

    # start virtual display
    display = Display(visible=0, size=(1024, 768))
    display.start()
    try:
        credential = '{}/data/cma.credential.json'.format(root_dir)
        driver = utils.start_driver('chrome', verbose=True)
        try:
            try:
                session.login(driver, credential)
                utils.wait(3)
                utils.print_message(' ')
                _download_listed_midis(driver, midi_list_fname, ntrack_max)
            finally:
                utils.print_message(' ')
                session.logout(driver)
                utils.wait(3)
        finally:
            utils.close_driver(driver, verbose=True)
    finally:
        display.stop()

    utils.print_message(
        '---------- Finish Time: {} ----------'.format(time.ctime()))