Example #1
def test_get_filename_from_url(self):
    filename = ut.get_filename_from_url("http://google.com", 0)
    self.assertEqual("0-google.com", filename)
    filename = ut.get_filename_from_url("https://yahoo.com", 99)
    self.assertEqual("99-yahoo.com", filename)
    filename = ut.get_filename_from_url("https://123abc.com/somepath", 999)
    self.assertEqual("999-123abc.com-somepath", filename)
    filename = ut.get_filename_from_url(
        "https://123abc.com/somepath/", 123)
    self.assertEqual("123-123abc.com-somepath-", filename)
    filename = ut.get_filename_from_url(
        "https://123abc.com/somepath/q=query&q2=q2", 234)
    self.assertEqual("234-123abc.com-somepath-q-query-q2-q2", filename)
Example #2
def download_art(*, urls, target_dir):
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
    target_dir = os.path.normpath(target_dir)
    try:
        os.makedirs(target_dir)
        print(f"Created directory : '{target_dir}'")
    except OSError:
        pass
    success = 0
    fail = 0
    for url in urls:
        url = url.strip()
        print(f"{url:<64}", end=" ", flush=True)
        r = requests.get(url)
        if r.status_code == 200:
            path = os.path.join(target_dir, utils.get_filename_from_url(url))
            with open(path, "wb") as f:
                f.write(r.content)
            print(f"{r.status_code} {r.reason}  > {path}")
            success += 1
        else:
            print(f"{r.status_code} {r.reason}")
            fail += 1
    exec_time = time.perf_counter() - start
    print(
        f"\nDownloaded {success}/{success + fail} artworks to '{target_dir}' in {exec_time:0.02f} seconds.\n"
    )
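Note that download_art calls get_filename_from_url with a single argument, unlike the prefixed two-argument variant tested in Example #1. For a downloader like this, a plausible stand-in (an assumption, not the original utils implementation) simply keeps the last path segment of the URL:

import os
from urllib.parse import urlparse

def get_filename_from_url(url):
    # Hypothetical single-argument variant: return the URL's basename,
    # e.g. "https://example.com/art/piece.jpg" -> "piece.jpg".
    return os.path.basename(urlparse(url).path)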
Example #3
def setup_crawl_dirs(self, test_url=TEST_URL):
    crawl_name = ut.append_timestamp("crawl")
    self.crawl_dir = ut.create_dir(join(cm.TEST_FILES_DIR, crawl_name))
    batch_dir = ut.create_dir(join(self.crawl_dir, str(self.batch_num)))
    self.site_dir = ut.create_dir(
        join(batch_dir, ut.get_filename_from_url(test_url, self.site_num)))
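The ut.create_dir and ut.append_timestamp helpers are not shown in these snippets. Judging from how they are used here and in Examples #5 and #9, create_dir makes the directory if it is missing and returns its path, and append_timestamp suffixes a name with the current time. A rough sketch under those assumptions:

import os
import time

def create_dir(dir_path):
    # Assumed behavior: create the directory (and parents) if missing,
    # then return the path so calls can be nested as above.
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)
    return dir_path

def append_timestamp(name):
    # Assumed behavior and format, e.g. "crawl" -> "crawl_20240101-120000".
    return "%s_%s" % (name, time.strftime("%Y%m%d-%H%M%S"))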
Example #4
def get_work_metadata(work_url, file_format):
    parser = utils.get_html_parser(work_url)
    title = utils.parse_work_title(parser)
    authors = utils.parse_work_authors(parser)
    part = utils.parse_work_part(parser)
    download_url = utils.parse_work_download_url(parser, file_format)
    filename = utils.get_filename_from_url(download_url)
    return Work(title, authors, part, download_url, filename)
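Work is presumably a simple value container for the parsed metadata. A minimal stand-in, assuming only the five positional fields used above:

from collections import namedtuple

# Hypothetical container matching the positional arguments above.
Work = namedtuple("Work", ["title", "authors", "part", "download_url", "filename"])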
Example #5
    def crawl(self, num_batches=cm.NUM_BATCHES, num_instances=cm.NUM_INSTANCES, start_line=0):
        wl_log.info('Crawl configuration: batches %s, instances: %s, tbb_version %s, no of URLs: %s, crawl dir: %s, XVFB: %s, screenshot: %s'
                    % (num_batches, num_instances, self.tbb_version, len(self.urls), self.crawl_dir, self.xvfb, self.capture_screen))

        # for each batch
        for batch_num in range(num_batches):
            wl_log.info('********** Starting batch %s **********' % batch_num)
            site_num = start_line
            bg_site = None
            batch_dir = ut.create_dir(
                os.path.join(self.crawl_dir, str(batch_num)))

            # init/reset tor process to have a different circuit.
            # make sure that we're not using the same guard node again
            wl_log.info('********** Restarting Tor Before Batch **********')
            self.tor_controller.restart_tor()
            sites_crawled_with_same_proc = 0

            # for each site
            for page_url in self.urls:
                sites_crawled_with_same_proc += 1
                if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                    wl_log.info('********** Restarting Tor Process **********')
                    self.tor_controller.restart_tor()
                    sites_crawled_with_same_proc = 0

                wl_log.info('********** Crawling %s **********' % page_url)
                page_url = page_url[:cm.MAX_FNAME_LENGTH]
                site_dir = ut.create_dir(os.path.join(
                    batch_dir, ut.get_filename_from_url(page_url, site_num)))

                for instance_num in range(num_instances):
                    wl_log.info('********** Visit #%s to %s **********' %
                                (instance_num, page_url))
                    self.visit = None
                    try:
                        self.visit = Visit(batch_num, site_num, instance_num, page_url, site_dir,
                                           self.tbb_version, self.tor_controller, bg_site, self.xvfb, self.capture_screen)
                        self.visit.get()
                    except KeyboardInterrupt:  # CTRL + C
                        raise KeyboardInterrupt
                    except (ut.TimeExceededError, TimeoutException) as exc:
                        wl_log.critical('Visit to %s timed out! %s %s' % (
                            page_url, exc, type(exc)))
                        if self.visit:
                            self.visit.cleanup_visit()
                    except Exception:
                        wl_log.critical('Exception crawling %s' %
                                        page_url, exc_info=True)
                        if self.visit:
                            self.visit.cleanup_visit()

                # END - for each visit
                site_num += 1
                time.sleep(cm.PAUSE_BETWEEN_SITES)
Example #6
def process_urls():
    urls = []
    if not os.path.exists(ZIP_FILES_PATH):
        os.makedirs(ZIP_FILES_PATH)
    for url in URLS:
        file_name = get_filename_from_url(url)
        if os.path.exists(os.path.join(ZIP_FILES_PATH, file_name)):
            continue
        urls.append(url)
    if urls:
        pool.map(download_zip_file, urls)
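download_zip_file and pool are defined elsewhere in that module; from the pool.map call, the worker takes a single URL and saves the archive under ZIP_FILES_PATH. A hedged sketch of such a worker (its behavior is assumed, not taken from the original source):

import os
import requests

def download_zip_file(url):
    # Assumed worker for pool.map: fetch one URL and write it under
    # ZIP_FILES_PATH using the name derived from the URL.
    file_name = get_filename_from_url(url)
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(os.path.join(ZIP_FILES_PATH, file_name), "wb") as f:
        f.write(response.content)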
Example #7
def setup_data():
    pretty_print("SETUP DATA")
    extract_directory = f"{INPUT_DIR}extracts/"
    """
    See if needed files currently exist in input directory.
    If not, see if extract file already exists correctly
    If not, retrieve and extract accordingly
    Move extracted files to correct directory and simplified filename
    Remove extra directory and files
    """
    if (needed_files_exists()):
        pretty_print("Needed Files Already Exist", True)
    else:
        if (extracted_files_exists(extract_directory)):
            pretty_print("Extract Already Exist", True)
        else:
            create_directory(extract_directory)
            pretty_print("Fetching CRDC Data From Public Website (34MB)", True)
            zip_file_name = get_filename_from_url(CRDC_DATA_URL)
            zip_file_name = fetch_file(CRDC_DATA_URL, extract_directory,
                                       zip_file_name)

            pretty_print("Extracting Zip At ", True,
                         extract_directory + zip_file_name)
            unzip(extract_directory + zip_file_name, extract_directory)

        pretty_print("Moving Files In Place", True)
        formatted_files_list = list(
            map(
                lambda x: {
                    "src_path": x["extracted_path"],
                    "dest_path": x["needed_file_name"]
                }, CRDC_FILES))
        rename_files(formatted_files_list, extract_directory, INPUT_DIR)

        pretty_print("Cleaning Up", True)
        remove_directory(extract_directory)

    # create_directory(OUTPUT_DIR, True)

    create_directory(MIGRATION_DIR, True)
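fetch_file and unzip are project helpers that are not shown. From their call sites, fetch_file downloads a URL into a directory under a given name and returns that name, and unzip extracts an archive into a destination directory. A sketch under those assumptions:

import os
import zipfile
import requests

def fetch_file(url, directory, file_name):
    # Assumed helper: stream the URL to directory/file_name and return
    # the file name, matching the reassignment in setup_data above.
    response = requests.get(url, stream=True, timeout=120)
    response.raise_for_status()
    with open(os.path.join(directory, file_name), "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    return file_name

def unzip(zip_path, destination):
    # Assumed helper: extract the archive into the destination directory.
    with zipfile.ZipFile(zip_path) as archive:
        archive.extractall(destination)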
Example #8
def setup_data(input_dir="./input/"):
    print("--- STEP 1: SETUP DATA")
    extract_directory = f"{input_dir}extracts/"
    """
    See if needed files currently exist in input directory.
    If not, see if extract file already exists correctly
    If not, retrieve and extract accordingly
    Move extracted files to correct directory and simplified filename
    Remove extra directory and files
    """
    if (needed_files_exists(input_dir)):
        print("    * Needed Files Already Exist")
    else:
        if (extracted_files_exists(extract_directory)):
            print("    * Extract Already Exist")
        else:
            create_directory(extract_directory)
            print("    * Fetching CRDC Data From Public Website (34MB)")
            zip_file_name = get_filename_from_url(CRDC_DATA_URL)
            zip_file_name = fetch_file(CRDC_DATA_URL, extract_directory,
                                       zip_file_name)

            print("    * Extracting Zip At ",
                  extract_directory + zip_file_name)
            unzip(extract_directory + zip_file_name, extract_directory)

        print("    * Moving Files In Place")
        formatted_files_list = list(
            map(
                lambda x: {
                    "src_path": x["extracted_path"],
                    "dest_path": x["needed_file_name"]
                }, CRDC_FILES))
        rename_files(formatted_files_list, extract_directory, input_dir)

        print("    * Cleaning Up")
        remove_directory(extract_directory)

    print("    * Setup Complete")
Example #9
    def crawl(self, num_batches=cm.NUM_BATCHES,
              num_instances=cm.NUM_INSTANCES, start_line=0):
        wl_log.info("Crawl configuration: batches: %s, instances: %s,"
                    " tbb_version: %s, experiment: %s, no of URLs: %s, "
                    "crawl dir: %s, XVFB: %s, screenshot: %s"
                    % (num_batches, num_instances, self.tbb_version,
                       self.experiment, len(self.urls), self.crawl_dir,
                       self.xvfb, self.capture_screen))
        # for each batch
        for batch_num in range(num_batches):
            wl_log.info("********** Starting batch %s **********" % batch_num)
            site_num = start_line
            bg_site = None
            batch_dir = ut.create_dir(os.path.join(self.crawl_dir,
                                                   str(batch_num)))
            # init/reset tor process to have a different circuit.
            # make sure that we're not using the same guard node again
            wl_log.info("********** Restarting Tor Before Batch **********")
            self.tor_controller.restart_tor()
            sites_crawled_with_same_proc = 0

            # for each site
            for page_url in self.urls:
                sites_crawled_with_same_proc += 1
                if sites_crawled_with_same_proc > cm.MAX_SITES_PER_TOR_PROCESS:
                    wl_log.info("********** Restarting Tor Process **********")
                    self.tor_controller.restart_tor()
                    sites_crawled_with_same_proc = 0

                wl_log.info("********** Crawling %s **********" % page_url)
                page_url = page_url[:cm.MAX_FNAME_LENGTH]
                site_dir = ut.create_dir(os.path.join(
                    batch_dir, ut.get_filename_from_url(page_url, site_num)))

                if self.experiment == cm.EXP_TYPE_MULTITAB_ALEXA:
                    bg_site = choice(self.urls)
                # for each visit
                for instance_num in range(num_instances):
                    wl_log.info("********** Visit #%s to %s **********" %
                                (instance_num, page_url))
                    self.visit = None
                    try:
                        self.visit = Visit(batch_num, site_num, instance_num, page_url, site_dir, self.tor_controller,
                                           bg_site, self.experiment, self.xvfb, self.capture_screen)

                        self.visit.get()
                    except KeyboardInterrupt:  # CTRL + C
                        raise KeyboardInterrupt
                    except (ut.TimeExceededError, TimeoutException) as exc:
                        wl_log.critical("Visit to %s timed out! %s %s" %
                                        (page_url, exc, type(exc)))
                        if self.visit:
                            self.visit.cleanup_visit()
                    except Exception:
                        wl_log.critical("Exception crawling %s" % page_url,
                                        exc_info=True)
                        if self.visit:
                            self.visit.cleanup_visit()
                # END - for each visit
                site_num += 1
                time.sleep(cm.PAUSE_BETWEEN_SITES)
Example #10
def upload():
    # Get priority
    priority = int(request.form.get('priority', PRIORITY.medium))
    if priority not in PRIORITY.get_values():
        priority = PRIORITY.medium

    # Get output formats
    output_formats = request.form.get('output-formats', '')
    output_formats = list(set(
        filter(
            lambda format: format in app.config['ALLOWED_EXTENSIONS'],
            output_formats.split(';')
        )
    ))
    if not output_formats:
        return jsonify({'Error': 'Must provide valid output formats'}), 400

    # Get file (either directly or via URL)
    file = request.files.get('file')
    allowed_extensions = app.config['ALLOWED_EXTENSIONS']

    if file:
        if allowed_filename(file.filename, allowed_extensions):
            filename = secure_filename(file.filename).strip()[-FILE_NAME_LIMIT:]  # keep at most FILE_NAME_LIMIT chars
            local_path = os.path.join(app.config['UPLOAD_FOLDER'],
                                      timestamp_filename(filename))
            file.save(local_path)
        else:
            return jsonify({'Error': 'File format not allowed'}), 400
    else:
        fileURL = request.form.get('fileURL')
        if fileURL:
            filename = get_filename_from_url(fileURL)

            try:
                local_path = download_url(
                    fileURL, app.config['UPLOAD_FOLDER'], timestamp=True)
            except FileAccessDenied as fad:
                return jsonify({
                    'status': 'error',
                    'code': fad.status_code,
                    'message': fad.message
                }), 500

        else:
            return jsonify({'status': 'error',
                            'message': 'Unable to decode uploaded file'}), 500

    # Upload to remote and remove file from local
    remote_destination = os.path.join(app.config['REMOTE_INPUT_FOLDER'],
                                      get_uuid(), filename)
    upload_to_remote(remote_destination, local_path)
    os.remove(local_path)

    # Register the file for conversions and return docIds
    docIds = Conversion.register_file(filename, remote_destination,
                                      g.user, output_formats, priority)

    # Call request fetcher
    request_fetcher.delay()

    return jsonify({'status': STATUS.introduced, 'doc_ids': docIds})
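For completeness, a hypothetical client call against this Flask handler. The route path, host, priority value, and output formats are all assumptions; only the form field names come from the code above:

import requests

response = requests.post(
    "http://localhost:5000/upload",
    data={
        "fileURL": "https://example.com/docs/report.pdf",
        "output-formats": "txt;epub",
        "priority": 2,
    },
)
print(response.json())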