def test_get_filename_from_url(self):
    filename = ut.get_filename_from_url("http://google.com", 0)
    self.assertEqual("0-google.com", filename)
    filename = ut.get_filename_from_url("https://yahoo.com", 99)
    self.assertEqual("99-yahoo.com", filename)
    filename = ut.get_filename_from_url("https://123abc.com/somepath", 999)
    self.assertEqual("999-123abc.com-somepath", filename)
    filename = ut.get_filename_from_url(
        "https://123abc.com/somepath/", 123)
    self.assertEqual("123-123abc.com-somepath-", filename)
    filename = ut.get_filename_from_url(
        "https://123abc.com/somepath/q=query&q2=q2", 234)
    self.assertEqual("234-123abc.com-somepath-q-query-q2-q2", filename)
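A minimal sketch of a two-argument get_filename_from_url that satisfies the assertions above, assuming the intended behaviour is to strip the URL scheme, replace every character that is not alphanumeric, a dot, or a dash with "-", and prepend the numeric prefix; the project's actual implementation may differ in detail.

import re

def get_filename_from_url(url, prefix):
    # Drop the scheme ("http://", "https://"), then flatten the rest of the
    # URL into a filesystem-safe name by dashing out special characters.
    stripped = url.split("://", 1)[-1]
    dashed = re.sub(r"[^A-Za-z0-9._-]", "-", stripped)
    return "%s-%s" % (prefix, dashed)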
def download_art(*, urls, target_dir):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    target_dir = os.path.normpath(target_dir)
    try:
        os.makedirs(target_dir)
        print(f"Created directory : '{target_dir}'")
    except OSError:
        pass
    success = 0
    fail = 0
    for url in urls:
        url = url.strip()
        print(f"{url:<64}", end=" ", flush=True)
        r = requests.get(url)
        if r.status_code == 200:
            path = os.path.join(target_dir, utils.get_filename_from_url(url))
            with open(path, "wb") as f:
                f.write(r.content)
            print(f"{r.status_code} {r.reason} > {path}")
            success += 1
        else:
            print(f"{r.status_code} {r.reason}")
            fail += 1
    exec_time = time.perf_counter() - start
    print(
        f"\nDownloaded {success}/{success + fail} artworks to "
        f"'{target_dir}' in {exec_time:0.02f} seconds.\n"
    )
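download_art and several snippets below call a one-argument get_filename_from_url from their own utils modules. A plausible minimal sketch, assuming this variant simply returns the last path component of the URL (an assumption, not the projects' actual code):

from urllib.parse import urlparse
import posixpath

def get_filename_from_url(url):
    # Assumed behaviour: take the final path segment of the URL,
    # e.g. "https://example.com/files/artwork.jpg" -> "artwork.jpg".
    return posixpath.basename(urlparse(url).path)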
def setup_crawl_dirs(self, test_url=TEST_URL):
    crawl_name = ut.append_timestamp("crawl")
    self.crawl_dir = ut.create_dir(join(cm.TEST_FILES_DIR, crawl_name))
    batch_dir = ut.create_dir(join(self.crawl_dir, str(self.batch_num)))
    self.site_dir = ut.create_dir(
        join(batch_dir, ut.get_filename_from_url(test_url, self.site_num)))
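setup_crawl_dirs leans on two small helpers from ut. Hedged sketches of how they might look, assuming append_timestamp suffixes a name with the current time and create_dir makes the directory if needed and returns its path (the real crawler utilities may format the timestamp differently):

import os
import time

def append_timestamp(prefix):
    # Assumed helper: suffix a name with the current epoch time so that
    # repeated crawls get distinct directories, e.g. "crawl1703152800".
    return "%s%s" % (prefix, int(time.time()))

def create_dir(dir_path):
    # Assumed helper: create the directory if it is missing, return its path.
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)
    return dir_path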
def get_work_metadata(work_url, file_format):
    parser = utils.get_html_parser(work_url)
    title = utils.parse_work_title(parser)
    authors = utils.parse_work_authors(parser)
    part = utils.parse_work_part(parser)
    download_url = utils.parse_work_download_url(parser, file_format)
    filename = utils.get_filename_from_url(download_url)
    return Work(title, authors, part, download_url, filename)
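The return statement suggests Work is a simple five-field record. A hedged sketch as a namedtuple, with the field order taken from the constructor call above (the project might use a dataclass or a full class instead):

from collections import namedtuple

# Field order mirrors the Work(...) call in get_work_metadata.
Work = namedtuple("Work", ["title", "authors", "part", "download_url", "filename"])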
def crawl(self, num_batches=cm.NUM_BATCHES,
          num_instances=cm.NUM_INSTANCES, start_line=0):
    wl_log.info('Crawl configuration: batches %s, instances: %s, '
                'tbb_version %s, no of URLs: %s, crawl dir: %s, '
                'XVFB: %s, screenshot: %s' %
                (num_batches, num_instances, self.tbb_version,
                 len(self.urls), self.crawl_dir, self.xvfb,
                 self.capture_screen))
    # for each batch
    for batch_num in range(num_batches):
        wl_log.info('********** Starting batch %s **********' % batch_num)
        site_num = start_line
        bg_site = None
        batch_dir = ut.create_dir(
            os.path.join(self.crawl_dir, str(batch_num)))
        # init/reset tor process to have a different circuit.
        # make sure that we're not using the same guard node again
        wl_log.info('********** Restarting Tor Before Batch **********')
        self.tor_controller.restart_tor()
        sites_crawled_with_same_proc = 0
        # for each site
        for page_url in self.urls:
            sites_crawled_with_same_proc += 1
            if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                wl_log.info('********** Restarting Tor Process **********')
                self.tor_controller.restart_tor()
                sites_crawled_with_same_proc = 0
            wl_log.info('********** Crawling %s **********' % page_url)
            page_url = page_url[:cm.MAX_FNAME_LENGTH]
            site_dir = ut.create_dir(os.path.join(
                batch_dir, ut.get_filename_from_url(page_url, site_num)))
            for instance_num in range(num_instances):
                wl_log.info('********** Visit #%s to %s **********' %
                            (instance_num, page_url))
                self.visit = None
                try:
                    self.visit = Visit(batch_num, site_num, instance_num,
                                       page_url, site_dir, self.tbb_version,
                                       self.tor_controller, bg_site,
                                       self.xvfb, self.capture_screen)
                    self.visit.get()
                except KeyboardInterrupt:  # CTRL + C
                    raise KeyboardInterrupt
                except (ut.TimeExceededError, TimeoutException) as exc:
                    wl_log.critical('Visit to %s timed out! %s %s' %
                                    (page_url, exc, type(exc)))
                    if self.visit:
                        self.visit.cleanup_visit()
                except Exception:
                    wl_log.critical('Exception crawling %s' % page_url,
                                    exc_info=True)
                    if self.visit:
                        self.visit.cleanup_visit()
            # END - for each visit
            site_num += 1
            time.sleep(cm.PAUSE_BETWEEN_SITES)
def process_urls():
    urls = []
    if not os.path.exists(ZIP_FILES_PATH):
        os.makedirs(ZIP_FILES_PATH)
    for url in URLS:
        file_name = get_filename_from_url(url)
        if os.path.exists(os.path.join(ZIP_FILES_PATH, file_name)):
            continue
        urls.append(url)
    if urls:
        pool.map(download_zip_file, urls)
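process_urls hands the remaining URLs to pool.map(download_zip_file, urls). A sketch of what that worker might look like with requests, assuming ZIP_FILES_PATH and the one-argument get_filename_from_url from the surrounding module; the project's real worker may differ:

import os
import requests

def download_zip_file(url):
    # Hypothetical pool.map worker: stream the archive to disk under
    # ZIP_FILES_PATH using the same filename the existence check used.
    file_name = get_filename_from_url(url)
    target = os.path.join(ZIP_FILES_PATH, file_name)
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(target, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)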
def setup_data():
    pretty_print("SETUP DATA")
    extract_directory = f"{INPUT_DIR}extracts/"
    """
    See if needed files currently exist in input directory.
    If not, see if extract file already exists correctly
    If not, retrieve and extract accordingly
    Move extracted files to correct directory and simplified filename
    Remove extra directory and files
    """
    if (needed_files_exists()):
        pretty_print("Needed Files Already Exist", True)
    else:
        if (extracted_files_exists(extract_directory)):
            pretty_print("Extract Already Exist", True)
        else:
            create_directory(extract_directory)
            pretty_print("Fetching CRDC Data From Public Website (34MB)", True)
            zip_file_name = get_filename_from_url(CRDC_DATA_URL)
            zip_file_name = fetch_file(CRDC_DATA_URL, extract_directory,
                                       zip_file_name)
            pretty_print("Extracting Zip At ", True,
                         extract_directory + zip_file_name)
            unzip(extract_directory + zip_file_name, extract_directory)
        pretty_print("Moving Files In Place", True)
        formatted_files_list = list(
            map(
                lambda x: {
                    "src_path": x["extracted_path"],
                    "dest_path": x["needed_file_name"]
                }, CRDC_FILES))
        rename_files(formatted_files_list, extract_directory, INPUT_DIR)
        pretty_print("Cleaning Up", True)
        remove_directory(extract_directory)
    # create_directory(OUTPUT_DIR, True)
    create_directory(MIGRATION_DIR, True)
def setup_data(input_dir="./input/"):
    print("--- STEP 1: SETUP DATA")
    extract_directory = f"{input_dir}extracts/"
    """
    See if needed files currently exist in input directory.
    If not, see if extract file already exists correctly
    If not, retrieve and extract accordingly
    Move extracted files to correct directory and simplified filename
    Remove extra directory and files
    """
    if (needed_files_exists(input_dir)):
        print(" * Needed Files Already Exist")
    else:
        if (extracted_files_exists(extract_directory)):
            print(" * Extract Already Exist")
        else:
            create_directory(extract_directory)
            print(" * Fetching CRDC Data From Public Website (34MB)")
            zip_file_name = get_filename_from_url(CRDC_DATA_URL)
            zip_file_name = fetch_file(CRDC_DATA_URL, extract_directory,
                                       zip_file_name)
            print(" * Extracting Zip At ", extract_directory + zip_file_name)
            unzip(extract_directory + zip_file_name, extract_directory)
        print(" * Moving Files In Place")
        formatted_files_list = list(
            map(
                lambda x: {
                    "src_path": x["extracted_path"],
                    "dest_path": x["needed_file_name"]
                }, CRDC_FILES))
        rename_files(formatted_files_list, extract_directory, input_dir)
        print(" * Cleaning Up")
        remove_directory(extract_directory)
    print(" * Setup Complete")
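Both setup_data variants lean on fetch_file and unzip. Minimal sketches under the assumption that fetch_file downloads the archive into target_dir and returns the name it was saved under, and unzip extracts everything in place; the project's real helpers may behave differently:

import os
import zipfile
import urllib.request

def fetch_file(url, target_dir, file_name):
    # Assumed helper: download the archive into target_dir and return the
    # filename it was saved under.
    urllib.request.urlretrieve(url, os.path.join(target_dir, file_name))
    return file_name

def unzip(zip_path, target_dir):
    # Assumed helper: extract every member of the archive into target_dir.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(target_dir)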
def crawl(self, num_batches=cm.NUM_BATCHES,
          num_instances=cm.NUM_INSTANCES, start_line=0):
    wl_log.info("Crawl configuration: batches: %s, instances: %s,"
                " tbb_version: %s, experiment: %s, no of URLs: %s, "
                "crawl dir: %s, XVFB: %s, screenshot: %s" %
                (num_batches, num_instances, self.tbb_version,
                 self.experiment, len(self.urls), self.crawl_dir,
                 self.xvfb, self.capture_screen))
    # for each batch
    for batch_num in xrange(num_batches):
        wl_log.info("********** Starting batch %s **********" % batch_num)
        site_num = start_line
        bg_site = None
        batch_dir = ut.create_dir(os.path.join(self.crawl_dir,
                                               str(batch_num)))
        # init/reset tor process to have a different circuit.
        # make sure that we're not using the same guard node again
        wl_log.info("********** Restarting Tor Before Batch **********")
        self.tor_controller.restart_tor()
        sites_crawled_with_same_proc = 0
        # for each site
        for page_url in self.urls:
            sites_crawled_with_same_proc += 1
            if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                wl_log.info("********** Restarting Tor Process **********")
                self.tor_controller.restart_tor()
                sites_crawled_with_same_proc = 0
            wl_log.info("********** Crawling %s **********" % page_url)
            page_url = page_url[:cm.MAX_FNAME_LENGTH]
            site_dir = ut.create_dir(os.path.join(
                batch_dir, ut.get_filename_from_url(page_url, site_num)))
            if self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
                bg_site = choice(self.urls)
            # for each visit
            for instance_num in range(num_instances):
                wl_log.info("********** Visit #%s to %s **********" %
                            (instance_num, page_url))
                self.visit = None
                try:
                    self.visit = Visit(batch_num, site_num, instance_num,
                                       page_url, site_dir,
                                       self.tor_controller, bg_site,
                                       self.experiment, self.xvfb,
                                       self.capture_screen)
                    self.visit.get()
                except KeyboardInterrupt:  # CTRL + C
                    raise KeyboardInterrupt
                except (ut.TimeExceededError, TimeoutException) as exc:
                    wl_log.critical("Visit to %s timed out! %s %s" %
                                    (page_url, exc, type(exc)))
                    if self.visit:
                        self.visit.cleanup_visit()
                except Exception:
                    wl_log.critical("Exception crawling %s" % page_url,
                                    exc_info=True)
                    if self.visit:
                        self.visit.cleanup_visit()
            # END - for each visit
            site_num += 1
            time.sleep(cm.PAUSE_BETWEEN_SITES)
def upload():
    # Get priority
    priority = int(request.form.get('priority', PRIORITY.medium))
    if priority not in PRIORITY.get_values():
        priority = PRIORITY.medium

    # Get output formats
    output_formats = request.form.get('output-formats', '')
    output_formats = list(set(
        filter(
            lambda format: format in app.config['ALLOWED_EXTENSIONS'],
            output_formats.split(';')
        )
    ))
    if not output_formats:
        return jsonify({'Error': 'Must provide valid output formats'}), 400

    # Get file (either directly or via URL)
    file = request.files.get('file')
    allowed_extensions = app.config['ALLOWED_EXTENSIONS']
    if file:
        if allowed_filename(file.filename, allowed_extensions):
            # Keep at most the last FILE_NAME_LIMIT characters; the original
            # indexed a single character with [-FILE_NAME_LIMIT].
            filename = secure_filename(file.filename).strip()[-FILE_NAME_LIMIT:]
            local_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                      timestamp_filename(filename))
            file.save(local_path)
        else:
            return jsonify({'Error': 'File format not allowed'}), 400
    else:
        fileURL = request.form.get('fileURL')
        if fileURL:
            filename = get_filename_from_url(fileURL)
            try:
                local_path = download_url(
                    fileURL, app.config['UPLOAD_FOLDER'], timestamp=True)
            except FileAccessDenied as fad:
                return jsonify({
                    'status': 'error',
                    'code': fad.status_code,
                    'message': fad.message
                }), 500
        else:
            return jsonify({'status': 'error',
                            'message': 'Unable to decode uploaded file'}), 500

    # Upload to remote and remove file from local
    remote_destination = os.path.join(app.config['REMOTE_INPUT_FOLDER'],
                                      get_uuid(), filename)
    upload_to_remote(remote_destination, local_path)
    os.remove(local_path)

    # Register the file for conversions and return docIds
    docIds = Conversion.register_file(filename, remote_destination, g.user,
                                      output_formats, priority)

    # Call request fetcher
    request_fetcher.delay()

    return jsonify({'status': STATUS.introduced, 'doc_ids': docIds})
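The handler assumes allowed_filename and timestamp_filename helpers. Hedged sketches of plausible implementations, purely illustrative and not taken from the project:

import time

def allowed_filename(filename, allowed_extensions):
    # Assumed helper: accept only names with an extension on the allow-list.
    return ('.' in filename and
            filename.rsplit('.', 1)[1].lower() in allowed_extensions)

def timestamp_filename(filename):
    # Assumed helper: prefix the name with the current epoch time so that
    # concurrent uploads of the same file do not collide on disk.
    return "%d_%s" % (int(time.time()), filename)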