Example 1
    def _create_warc_temp_dir(self):
        """
        Create temporary directory for WARC files.

        :return: the directory path
        """
        path = os.path.join(self.working_path, "tmp", safe_string(self.message["id"]))
        if not os.path.exists(path):
            os.makedirs(path)
        return path
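
Since Python 3.2, the check-then-create pair can be collapsed into a single os.makedirs call with exist_ok=True, which also avoids a race when two workers share the working path. A drop-in sketch of the same method (assumes the surrounding harvester class, as above):

    def _create_warc_temp_dir(self):
        """Create temporary directory for WARC files (race-free variant)."""
        path = os.path.join(self.working_path, "tmp", safe_string(self.message["id"]))
        # exist_ok=True makes the call idempotent, so no existence check is needed
        os.makedirs(path, exist_ok=True)
        return path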
Example 2
    def _create_conf_file(self, harvest_id, debug, debug_warcprox, tries):
        # Note that we allow a long time for shutdown;
        # a stream harvester may need to finish processing.
        contents = """[program:{process_group}]
command={python_executable} {script} --debug={debug} --debug-warcprox={debug_warcprox} seed {seed_filepath} {working_path} --streaming --host {mq_host} --username {mq_username} --password {mq_password} --tries {tries}
user={user}
autostart=true
autorestart=unexpected
exitcodes=0,1
stopwaitsecs=900
stderr_logfile={log_path}/{safe_harvest_id}.err.log
stdout_logfile={log_path}/{safe_harvest_id}.out.log
""".format(process_group=self._get_process_group(harvest_id),
           safe_harvest_id=safe_string(harvest_id),
           python_executable=self.python_executable,
           script=self.script,
           seed_filepath=self._get_seed_filepath(harvest_id),
           working_path=self.working_path,
           mq_host=self.mq_host,
           mq_username=self.mq_username,
           mq_password=self.mq_password,
           user=self.process_owner,
           log_path=self.log_path,
           debug=debug,
           debug_warcprox=debug_warcprox,
           tries=tries)

        # Write the file
        conf_filepath = self._get_conf_filepath(harvest_id)
        log.debug("Writing conf to %s: %s", conf_filepath, contents)
        with open(conf_filepath, "wb") as f:
            f.write(contents)
        filestatus = os.stat(conf_filepath)
        # do a chmod +x, and add group write permissions
        os.chmod(conf_filepath, filestatus.st_mode | stat.S_IXUSR |
                 stat.S_IXGRP | stat.S_IXOTH | stat.S_IWGRP)
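
For illustration, assuming a harvest id of "harvest:1" (mapped by safe_string to "harvest_1"), a Python executable at /opt/sfm/bin/python, a script named harvester.py, and a log path of /sfm-data/log (all hypothetical values), the rendered supervisord stanza would look roughly like:

    [program:harvest_1]
    command=/opt/sfm/bin/python harvester.py --debug=False --debug-warcprox=False seed /sfm-data/conf/harvest_1.json /sfm-data --streaming --host mq --username sfm --password *** --tries 3
    user=sfm
    autostart=true
    autorestart=unexpected
    exitcodes=0,1
    stopwaitsecs=900
    stderr_logfile=/sfm-data/log/harvest_1.err.log
    stdout_logfile=/sfm-data/log/harvest_1.out.log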
Example 3
    @staticmethod
    def _get_process_group(harvest_id):
        return safe_string(harvest_id)
Example 4
    def _get_seed_filepath(self, harvest_id):
        return "{}/{}.json".format(self.conf_path, safe_string(harvest_id))
Example 5
    def on_message(self):
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(self.working_path, "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        if os.path.exists(self.result_filepath) or len(self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED, "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop; for other harvesters, this is
        # set after the first pass through the loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        def shutdown(signal_number, stack_frame):
            log.debug("Shutdown triggered")
            self.stop_harvest_loop_event.set()
            # stop_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Set up the restart timer for streams.
        # The restart timer stops and restarts the stream periodically,
        # which makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        self.queue_warc_files_timer = threading.Timer(self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming, set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        with warced(safe_string(self.message["id"]), self.warc_temp_dir, debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug("Too many retries, so giving up on harvesting seeds.")
                        done = True
                        self.result.success = False
                        self.result.errors.append(Msg(CODE_UNKNOWN_ERROR, str(e)))
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])
Example 6
    def facebook_user_ads(self, username, nsid, iso2c, access_token):
        assert username or nsid

        limit_per_page = 500

        if username and not nsid:
            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)

        if nsid and access_token and iso2c:
            # start scraping
            request_url = "https://graph.facebook.com/v5.0/ads_archive"
            request_params = {
                "access_token": access_token,
                "limit": limit_per_page,
                "search_page_ids": str(nsid),
                "ad_active_status": "ALL",
                "ad_reached_countries": iso2c,  # todo
                "fields": "page_name, page_id, funding_entity, ad_creation_time, ad_delivery_start_time, ad_delivery_stop_time, ad_creative_body, ad_creative_link_caption, ad_creative_link_description, ad_creative_link_title, ad_snapshot_url, demographic_distribution, region_distribution, impressions, spend, currency"
            }

            api_result = requests.get(request_url, params=request_params)

            log.debug("Ad archive API response: %s", api_result.text)

            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = "{}-{}-{}-{}".format(safe_string(self.message["id"]),
                                             warcprox.timestamp17(),
                                             serial_no, random_token)

            # write to warc
            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """Converts datetime.datetime items in the result
                    to a format suitable for json.dumps."""
                    if isinstance(o, datetime.datetime):
                        return str(o)

                json_payload = json.dumps(api_result.json(),
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
            time.sleep(1.2)  # sleep to avoid getting blocked by api

        else:
            log.debug(
                "Something went wrong. Is some information missing? Access token is: %s, iso2c is: %s",
                str(access_token), str(iso2c))
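
Note that the request above fetches only the first page of results (up to limit_per_page ads); the Graph API returns a paging.next URL for subsequent pages. A hedged sketch of following it (error handling and rate limiting omitted):

    ads = []
    next_url = request_url
    params = request_params
    while next_url:
        response = requests.get(next_url, params=params).json()
        ads.extend(response.get("data", []))
        next_url = response.get("paging", {}).get("next")
        params = None  # the "next" URL already carries the query string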
Example 7
    def facebook_user_bio(self, username):
        """Scrapes Facebook bio and returns info
        on the information contained on the about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal)
        @param username: Facebook username
        @return: a dictionary of account attributes """

        user_email_fb = self.message['credentials']['user_email_fb']
        user_password_fb = self.message['credentials']['user_password_fb']

        # ensure username is clean and can be accessed
        if username.startswith(
                "https://www.facebook.com/") or username.startswith(
                    "http://www.facebook.com/"):

            username = re.sub(r'^.+facebook\.com\/', '', username)
            # possibly also remove trailing /
            username = re.sub(r'\/$', '', username)

        # created at field
        fb_general = base_fb_url + username
        # bio info
        fb_about = base_fb_url + username + "/about/?ref=page_internal"
        # site transparency (e.g. admins)
        m_fb_general = "http://m.facebook.com/" + username

        # request the html
        r = requests.get(fb_general)
        # ensure no 404's
        if not r:
            log.debug("Couldn't access profile site: %s", fb_general)
            return

        soup = BeautifulSoup(r.content, "html.parser")

        # scrape creation date
        created_at = soup.find('div', {"class": "_3qn7"})
        created_at = created_at.select_one("span").text

        created_at = re.sub(r"(Seite erstellt)", "", created_at)

        created_at = created_at[3:]

        # scrape n of likes
        # find span with like number
        spans = soup.find('span', {"class": "_52id _50f5 _50f7"})
        # isolate likes via regex
        likes = re.search(r'^[\d]+.[^\s]+', spans.text).group()

        bio_dict = {
            "username": fb_general,
            "n_likes": likes,
            "created_at": created_at
        }

        # request about html
        r_about = requests.get(fb_about)

        # ensure no 404's
        if not r_about:
            log.debug("Couldn't access username/about site: %s", fb_about)
            return

        about_soup = BeautifulSoup(r_about.content, "html.parser")
        mission_text = about_soup.find_all('div', {'class': "_4bl9"})

        for divs in mission_text:
            describing_div = divs.find('div', {'class': '_50f4'})
            content_div = divs.find('div', {'class': '_3-8w'})

            if describing_div and content_div:
                bio_dict[describing_div.text] = content_div.text

        # photos
        # Retrieves the profile and cover photo of a public Facebook page by
        # going to the 'about' page, parsing the html, and getting the links
        # to photos from a script tag; these can then be passed to
        # _harvest_media_url. This is not affected by the harvest_media
        # option but always happens.
        all_scripts = about_soup.find_all('script')

        for js in all_scripts:
            for content in js.contents:
                if 'cover_photo' in content:
                    # isolate relevant links
                    links = re.findall(r'https\:\\/\\/scontent[^"]*', content)

                    # remove escaped forward slashes
                    for i, link in enumerate(links):
                        links[i] = re.sub(r'\\', "", link)
                        self._harvest_media_url(links[i])

        if m_fb_general:

            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
            site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"
            site_transparency_detail_id = "u_0_d"

            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('headless')
            chrome_options.add_argument('start-maximized')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1200x800')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument(f"user-agent={user_agent}")

            # this will connect to the selenium container and start scraping;
            # pass the options built above so the headless flags take effect
            driver = webdriver.Remote("http://host.docker.internal:4444/wd/hub",
                                      options=chrome_options)
            driver.get("http://m.facebook.com")
            driver.maximize_window()
            # accept cookies
            cookies = driver.find_element_by_id('accept-cookie-banner-label')
            # a more or less random wait to replicate user behavior and ensure politeness
            time.sleep(random.uniform(3, 9))
            cookies.click()
            # Search & Enter the Email or Phone field & Enter Password
            username_fb = driver.find_element_by_id("m_login_email")
            password_fb = driver.find_element_by_id("m_login_password")
            submit = driver.find_element_by_css_selector("._56b_")
            # send keys and make sure the fields are not prepopulated
            # 2FA has to be deactivated
            username_fb.clear()
            password_fb.clear()
            username_fb.send_keys(user_email_fb)
            password_fb.send_keys(user_password_fb)
            time.sleep(random.uniform(3, 9))
            # click login
            submit.click()
            time.sleep(random.uniform(3, 9))
            # navigate to site
            driver.get(m_fb_general)
            time.sleep(random.uniform(3, 9))
            driver.execute_script("window.scrollTo(0, 800)")
            # site info only loads on scroll
            # use class name and div content (todo)
            time.sleep(random.uniform(20, 25))
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, site_transparency_class_selector)))
            site_transparency = driver.find_elements_by_css_selector(
                site_transparency_class_selector)
            # site transparency should always be below "about"
            site_transparency[1].click()
            time.sleep(random.uniform(15, 20))
            # simply get the whole text of the site's transparency box;
            # the exact info can be extracted ex post
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.ID, site_transparency_detail_id)))
            time.sleep(random.uniform(3, 9))
            site_transparency_text = driver.find_element_by_id(
                site_transparency_detail_id).text
            time.sleep(random.uniform(3, 9))
            driver.close()
            log.info("Finished scraping transparency box")
            bio_dict['transparency_text'] = site_transparency_text

        # only write a WARC if at least one of the pages could be retrieved;
        # otherwise nothing happens
        if r_about or r:
            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = "{}-{}-{}-{}".format(safe_string(self.message["id"]),
                                             warcprox.timestamp17(),
                                             serial_no, random_token)

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """Converts datetime.datetime items in the result
                    to a format suitable for json.dumps."""
                    if isinstance(o, datetime.datetime):
                        return str(o)

                json_payload = json.dumps(bio_dict,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
Example 8
    def facebook_user_timeline(self, seed_id, username, nsid):
        """This function will scrape the user timeline"""
        log.debug("Harvesting user %s with seed_id %s.", username, seed_id)
        # make sure either username or nsid is present to start scraping
        assert username or nsid

        # Possibly look up username
        if username and not nsid:

            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)

        if nsid:
            # report back whether user id was found
            log.info("FB userid %s", nsid)
            # todo - need to add timeout and what to do if blocked
            # todo - post ids will sometimes be empty, account for that for incremental

            incremental = self.message.get("options",
                                           {}).get("incremental", False)
            harvest_media = self.message.get("options",
                                             {}).get("harvest_media", False)

            since_id = None
            if incremental:
                # look up the since_id of the last harvested post
                since_id = self.state_store.get_state(
                    __name__, u"timeline.{}.since_id".format(nsid))

            scrape_result = []

            for post in facebook_scraper.get_posts(nsid,
                                                   pages=self.pages,
                                                   extra_info=True,
                                                   timeout=20):
                scrape_result.append(post)
                self.result.harvest_counter["posts"] += 1
                self.result.increment_stats("posts")

                # the second condition skips posts with no media (empty list)
                if harvest_media and post['images']:
                    log.info("Harvesting media from post")
                    # fetch media links so that the content is caught in the
                    # WARC stream; all FB photos are JPGs, so anything else
                    # (e.g. video) is not harvested here
                    for media_url in post['images']:
                        if 'jpg' in media_url:
                            self._harvest_media_url(media_url)

                if incremental and post["post_id"] == since_id:
                    log.info(
                        "Stopping, found last post that was previously harvested with id: %s",
                        post["post_id"])
                    break

            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = "{}-{}-{}-{}".format(safe_string(self.message["id"]),
                                             warcprox.timestamp17(),
                                             serial_no, random_token)

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """Converts datetime.datetime items in the facebook_scraper
                    result to a format suitable for json.dumps."""
                    if isinstance(o, datetime.datetime):
                        return str(o)

                json_payload = json.dumps(scrape_result,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)

            # write to state store
            key = "timeline.{}.since_id".format(nsid)
            max_post_time = scrape_result[0].get("time")
            max_post_id = scrape_result[0].get("post_id")

            assert max_post_time and max_post_id

            if incremental:
                self.state_store.set_state(__name__, key, max_post_id)
                log.info("Wrote first scraped post to state_store")

        else:
            msg = "NSID not found for user {}".format(username)
            log.warning(msg)
            self.result.warnings.append(
                Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))
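
The incremental option follows a get-state/harvest/set-state cycle: read the id of the newest previously harvested post, stop when it reappears, then record the newest id from this run. A minimal sketch with hypothetical names:

    since_id = state_store.get_state(__name__, "timeline.12345.since_id")  # None on first run
    new_posts = []
    for post in fetch_posts("12345"):  # hypothetical fetcher, newest post first
        if since_id and post["post_id"] == since_id:
            break  # everything older was harvested last time
        new_posts.append(post)
    if new_posts:
        state_store.set_state(__name__, "timeline.12345.since_id",
                              new_posts[0]["post_id"])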
Example 9
    def on_message(self):
        assert self.message

        log.info("Harvesting by message with id %s", self.message["id"])

        self.result_filepath = os.path.join(
            self.working_path,
            "{}_result.json".format(safe_string(self.message["id"])))

        # Create a temp directory for WARCs
        self.warc_temp_dir = self._create_warc_temp_dir()
        self._create_state_store()

        # Possibly resume a harvest
        self.result = HarvestResult()
        self.result.started = datetime_now()

        if os.path.exists(self.result_filepath) or len(
                self._list_warcs(self.warc_temp_dir)) > 0:
            self._load_result()
            self.result.warnings.append(
                Msg(CODE_HARVEST_RESUMED,
                    "Harvest resumed on {}".format(datetime_now())))
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)
            self._queue_warc_files()
        else:
            # Send a status message. This will give immediate indication that harvesting is occurring.
            self._send_status_message(STATUS_RUNNING)

        # stop_harvest_loop_event tells the harvester to stop looping.
        # Only streaming harvesters loop; for other harvesters, this is
        # set after the first pass through the loop.
        self.stop_harvest_loop_event = threading.Event()

        # Supervisor sends a signal, indicating that the harvester should stop.
        # This is a graceful shutdown. Harvesting seeds is stopped and processing
        # is finished. This may take some time.
        def shutdown(signal_number, stack_frame):
            log.info("Shutdown triggered")
            # This is for the consumer.
            self.should_stop = True
            if self.is_pause:
                log.info("This will be a pause of the harvest.")
            self.stop_harvest_loop_event.set()
            # stop_event tells the harvester to stop harvest_seeds.
            # This will allow warcprox to exit.
            self.stop_harvest_seeds_event.set()
            if self.restart_stream_timer:
                self.restart_stream_timer.cancel()
            if self.queue_warc_files_timer:
                self.queue_warc_files_timer.cancel()

        signal.signal(signal.SIGTERM, shutdown)
        signal.signal(signal.SIGINT, shutdown)

        def pause(signal_number, stack_frame):
            self.is_pause = True

        signal.signal(signal.SIGUSR1, pause)

        log.debug("Message is %s" % json.dumps(self.message, indent=4))

        # Set up the restart timer for streams.
        # The restart timer stops and restarts the stream periodically,
        # which makes sure that each HTTP response is limited in size.
        if self.is_streaming:
            self.restart_stream_timer = threading.Timer(
                self.stream_restart_interval_secs, self._restart_stream)
            self.restart_stream_timer.start()

        # Start a queue warc files timer
        self.queue_warc_files_timer = threading.Timer(
            self.queue_warc_files_interval_secs, self._queue_warc_files)
        self.queue_warc_files_timer.start()

        while not self.stop_harvest_loop_event.is_set():
            # Reset the stop_harvest_seeds_event
            self.stop_harvest_seeds_event = threading.Event()

            # If this isn't streaming, set stop_harvest_loop_event so that looping doesn't occur.
            if not self.is_streaming:
                self.stop_harvest_loop_event.set()

            # Here is where the harvesting happens.
            try_count = 0
            done = False
            while not done:
                try_count += 1
                log.debug("Try {} of {}".format(try_count, self.tries))
                try:
                    if self.use_warcprox:
                        with warced(safe_string(self.message["id"]),
                                    self.warc_temp_dir,
                                    debug=self.debug_warcprox,
                                    interrupt=self.is_streaming,
                                    rollover_time=self.warc_rollover_secs
                                    if not self.is_streaming else None):
                            self.harvest_seeds()
                    else:
                        self.harvest_seeds()
                    done = True
                    log.debug("Done harvesting seeds.")
                except Exception as e:
                    log.exception("Unknown error raised during harvest: %s", e)
                    if try_count == self.tries:
                        # Give up trying
                        log.debug(
                            "Too many retries, so giving up on harvesting seeds."
                        )
                        done = True
                        self.result.success = False
                        self.result.errors.append(
                            Msg(CODE_UNKNOWN_ERROR, str(e)))
                        self.stop_harvest_loop_event.set()
                    else:
                        # Retry
                        # Queue any WARC files
                        self._queue_warc_files()
                        # Wait for any WARC files to be processed
                        log.debug("Waiting for processing to complete.")
                        self.warc_processing_queue.join()
                        log.debug("Processing complete.")

            # Queue any WARC files
            self._queue_warc_files()

        # Turn off the restart_stream_timer.
        if self.restart_stream_timer:
            self.restart_stream_timer.cancel()

        # Turn off the queue WARC files timer
        if self.queue_warc_files_timer:
            self.queue_warc_files_timer.cancel()

        # Finish processing
        self._finish_processing()

        # Delete temp dir
        if os.path.exists(self.warc_temp_dir):
            shutil.rmtree(self.warc_temp_dir)

        log.info("Done harvesting by message with id %s", self.message["id"])
Example 10
    def test_safe_string(self):
        self.assertEqual("fooBAR12", safe_string("fooBAR12"))
        self.assertEqual("foo-bar-12", safe_string("foo.bar 12", replace_char="-"))