Example no. 1
    def cache_data(
        self,
        extracted_metadata: dict,
        cache_manager: CacheManager,
        allow_list: dict,
    ):
        for feature, meta_data in extracted_metadata.items():
            # Cache only allowed features whose data was freshly extracted,
            # i.e. not itself served from the cache.
            if (allow_list.get(feature)
                    and Explanation.Cached not in meta_data[EXPLANATION]):
                # Raw values are only worth caching for accessibility;
                # every other feature is cached with an empty value list.
                values = []
                if feature == ACCESSIBILITY:
                    values = meta_data[VALUES]

                data_to_be_cached = {
                    VALUES: values,
                    STAR_CASE: meta_data[STAR_CASE],
                    TIMESTAMP: get_utc_now(),
                    EXPLANATION: meta_data[EXPLANATION],
                }

                create_cache_entry(
                    cache_manager.get_domain(),
                    feature,
                    data_to_be_cached,
                    self._logger,
                )
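A hedged sketch of one call to cache_data, assuming the per-feature metadata shape used above; `manager` and the concrete values are illustrative, only the constants and CacheManager.get_instance() come from the surrounding code:

        manager.cache_data(                      # `manager` is an assumed instance
            extracted_metadata={
                ACCESSIBILITY: {
                    VALUES: [0.98],              # illustrative accessibility score
                    STAR_CASE: 5,
                    EXPLANATION: [],             # freshly extracted, nothing cached
                },
            },
            cache_manager=CacheManager.get_instance(),
            allow_list={ACCESSIBILITY: True},
        )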
Example no. 2
def print_exceptions(maximum_age_in_seconds: int):
    database: Session = ProfilerSession()

    query = database.query(
        db_models.Record.exception,
        db_models.Record.timestamp,
        db_models.Record.url,
    )

    # Fetch rows through the Query API; Session.execute() does not accept
    # a Query object directly.
    meta_rows = query.all()

    failure_urls = []

    print_data = {}
    for exception, timestamp, url in meta_rows:
        # Skip records older than the requested window.
        if timestamp < (get_utc_now() - maximum_age_in_seconds):
            continue

        # Skip records that completed without an exception.
        if exception == "":
            continue

        if "Empty html. Potentially, splash failed." in exception:
            print(f"Splash failed for url: '{url}'")

        failure_urls.append(url)

        # Count occurrences per distinct exception text.
        print_data[exception] = print_data.get(exception, 0) + 1

    print(
        f"----------------- Exceptions found in the last {round(maximum_age_in_seconds / SECONDS_PER_DAY, 2)} days."
    )
    print(f"All urls which caused exceptions: {get_unique_list(failure_urls)}")
    print(f"Total number of found exceptions: {len(print_data.items())}")
    for exception, value in print_data.items():
        if exception != "":
            print(exception, value)
            print("-------")
Example no. 3
    def _processing_values(self, values: list[str], website_data: WebsiteData,
                           before: float) -> dict:
        website_data.values = values

        star_case, explanation = self._decide(website_data=website_data)

        data = {
            self.key: {
                TIME_REQUIRED: get_utc_now() - before,
                VALUES: values,
                STAR_CASE: star_case,
                EXPLANATION: explanation,
            }
        }
        if self.tag_list_last_modified != "":
            data[self.key].update({
                "tag_list_last_modified": self.tag_list_last_modified,
                "tag_list_expires": self.tag_list_expires,
            })
        return data
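The returned dict is keyed by the extractor's self.key, which is exactly the per-feature shape that cache_data in Example no. 1 consumes. A hedged sketch of the assumed aggregation step (the extractor_results name is illustrative):

    extracted_metadata: dict = {}
    for result in extractor_results:        # each item returned by _processing_values
        extracted_metadata.update(result)   # merge the {self.key: {...}} dicts per feature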
Example no. 4
def extract_meta(input_data: Input):
    starting_extraction = get_utc_now()

    allowance = _convert_allow_list_to_dict(input_data.allow_list)

    database_exception = ""
    try:
        create_request_record(
            starting_extraction, input_data=input_data, allowance=allowance
        )
    except OperationalError as err:
        database_exception += (
            "\nDatabase exception: "
            + str(err.args)
            + "".join(traceback.format_exception(None, err, err.__traceback__))
        )

    uuid = app.communicator.send_message(
        {
            MESSAGE_URL: input_data.url,
            MESSAGE_HTML: input_data.html,
            MESSAGE_HEADERS: input_data.headers,
            MESSAGE_HAR: input_data.har,
            MESSAGE_ALLOW_LIST: allowance,
            MESSAGE_SHARED_MEMORY_NAME: shared_status.shm.name,
            MESSAGE_BYPASS_CACHE: input_data.bypass_cache,
        }
    )

    meta_data: dict = app.communicator.get_message(uuid)
    if meta_data:
        extractor_tags = _convert_dict_to_output_model(
            meta_data, input_data.debug
        )

        exception = meta_data.get(MESSAGE_EXCEPTION, "")
    else:
        extractor_tags = None
        exception = f"No response from {METADATA_EXTRACTOR}."

    end_time = get_utc_now()
    out = Output(
        url=input_data.url,
        meta=extractor_tags,
        exception=exception + database_exception,
        time_until_complete=end_time - starting_extraction,
    )
    try:
        create_response_record(
            starting_extraction,
            end_time,
            input_data=input_data,
            allowance=allowance,
            output=out,
        )
    except OperationalError as err:
        response_exception = (
            "\nDatabase exception: "
            + str(err.args)
            + "".join(traceback.format_exception(None, err, err.__traceback__))
        )
        # Append only the new error: `out.exception` already contains any
        # database exception collected while creating the request record.
        out.exception += response_exception

    if exception != "":
        raise HTTPException(
            status_code=400,
            detail={
                MESSAGE_URL: input_data.url,
                "meta": meta_data,
                MESSAGE_EXCEPTION: exception,
                "time_until_complete": end_time - starting_extraction,
            },
        )

    return out
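extract_meta raises HTTPException and returns an Output model, which suggests it backs a FastAPI route; the client in Example no. 6 posts to /extract_meta. A minimal wiring sketch under the assumption that the module's existing `app` object is the FastAPI instance (the endpoint function name is hypothetical):

@app.post("/extract_meta", response_model=Output)
def extract_meta_endpoint(input_data: Input):
    return extract_meta(input_data)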
Example no. 5
def is_cached_value_recent(timestamp: float) -> bool:
    # A cached value is recent if it is younger than the retention window.
    return timestamp >= (get_utc_now() -
                         CACHE_RETENTION_TIME_DAYS * SECONDS_PER_DAY)
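Two illustrative checks of the predicate, assuming CACHE_RETENTION_TIME_DAYS is 30 and SECONDS_PER_DAY is 86400 (both constants live elsewhere in the codebase):

assert is_cached_value_recent(get_utc_now())                             # written just now
assert not is_cached_value_recent(get_utc_now() - 31 * SECONDS_PER_DAY)  # 31 days old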
Example no. 6
def rester():
    allow_list = {
        "advertisement": True,
        "easy_privacy": True,
        "malicious_extensions": True,
        "extracted_links": True,
        "extract_from_files": True,
        "fanboy_annoyance": True,
        "fanboy_notification": True,
        "fanboy_social_media": True,
        "anti_adblock": True,
        "easylist_germany": True,
        "easylist_adult": True,
        "paywall": True,
        "security": True,
        "iframe_embeddable": True,
        "pop_up": True,
        "reg_wall": True,
        "log_in_out": True,
        "accessibility": True,
        "cookies": True,
        "g_d_p_r": True,
        "javascript": True,
    }

    extractor_url = "http://0.0.0.0:5057/extract_meta"

    result = {}

    try:
        os.remove(RESULT_FILE_PATH)
    except FileNotFoundError:
        pass

    logs, file_path = load_file_list()
    for counter, raw in enumerate(load_scraped_data(logs, file_path)):
        before = time.perf_counter()
        print(f"Working file {counter + 1} of {len(logs)}".center(80, "-"))
        print(raw["url"])

        starting_extraction = get_utc_now()

        headers = {"Content-Type": "application/json"}

        payload = {
            MESSAGE_HTML: raw["html"],
            MESSAGE_HEADERS: raw["headers"],
            MESSAGE_URL: raw["url"],
            MESSAGE_ALLOW_LIST: allow_list,
            MESSAGE_HAR: raw["har"],
            "debug": True,
        }
        response = requests.request("POST",
                                    extractor_url,
                                    headers=headers,
                                    data=json.dumps(payload))

        try:
            output = json.loads(response.content)
        except JSONDecodeError as e:
            print(response.content)
            print(f"Exception: {e}, {e.args}")
            output = {}
        output.update(
            {"time_for_extraction": get_utc_now() - starting_extraction})

        result.update({raw["url"]: output})

        # Rewrite the full result file after every url so partial runs survive.
        with open(RESULT_FILE_PATH, "w") as fp:
            json.dump(result, fp)

        print(output)
        after = time.perf_counter()
        print(f"Total time needed in series: {after - before}")
Example no. 7
    def _prepare_start(self) -> tuple[float, WebsiteData]:
        self._logger.info(f"Starting {self.__class__.__name__}.")
        before = get_utc_now()
        website_data = self._prepare_website_data()
        return before, website_data
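A hedged sketch of how _prepare_start and _processing_values (Example no. 3) presumably bracket one extraction run; the _start hook in the middle is an assumption:

    async def start(self) -> dict:
        before, website_data = self._prepare_start()
        values = await self._start(website_data=website_data)  # hypothetical extraction hook
        return self._processing_values(values=values,
                                       website_data=website_data,
                                       before=before)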
Example no. 8
    def start(self, message: dict) -> dict:
        self._logger.debug(
            f"Start metadata_manager at {time.perf_counter() - global_start} since start"
        )

        shared_status = shared_memory.ShareableList(
            name=message[MESSAGE_SHARED_MEMORY_NAME])
        # ShareableList entries have a fixed maximum size; truncate long urls.
        shared_status[1] = message[MESSAGE_URL][:1024]

        website_manager = WebsiteManager.get_instance()
        self._logger.debug(
            f"WebsiteManager initialized at {time.perf_counter() - global_start} since start"
        )
        website_manager.load_website_data(message=message)

        self._logger.debug(
            f"WebsiteManager loaded at {time.perf_counter() - global_start} since start"
        )
        cache_manager = CacheManager.get_instance()
        cache_manager.update_to_current_domain(
            website_manager.website_data.domain,
            bypass=message[MESSAGE_BYPASS_CACHE],
        )

        now = time.perf_counter()
        self._logger.debug(
            f"starting_extraction at {now - global_start} since start")
        starting_extraction = get_utc_now()
        if website_manager.website_data.html == "":
            exception = "Empty html. Potentially, splash failed."
            extracted_meta_data = {MESSAGE_EXCEPTION: exception}
        else:
            try:
                extracted_meta_data = asyncio.run(
                    self._extract_meta_data(
                        allow_list=message[MESSAGE_ALLOW_LIST],
                        cache_manager=cache_manager,
                        shared_memory_name=message[MESSAGE_SHARED_MEMORY_NAME],
                    ))
                self.cache_data(
                    extracted_meta_data,
                    cache_manager,
                    allow_list=message[MESSAGE_ALLOW_LIST],
                )
            except ConnectionError as e:
                exception = f"Connection error extracting metadata: '{e.args}'"
                self._logger.exception(exception)
                extracted_meta_data = {MESSAGE_EXCEPTION: exception}
            except Exception as e:
                exception = (
                    f"Unknown exception from extracting metadata: '{e.args}'. "
                    f"{''.join(traceback.format_exception(None, e, e.__traceback__))}"
                )
                self._logger.exception(exception)
                extracted_meta_data = {MESSAGE_EXCEPTION: exception}

        self._logger.debug(
            f"extracted_meta_data at {time.perf_counter() - global_start} since start"
        )
        extracted_meta_data.update({
            "time_for_extraction": get_utc_now() - starting_extraction,
            **website_manager.get_website_data_to_log(),
        })

        website_manager.reset()
        cache_manager.reset()
        shared_status[1] = ""

        self._logger.debug(
            f"website_manager.reset() at {time.perf_counter() - global_start} since start"
        )
        return extracted_meta_data
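For a local smoke test, the message consumed here is the one extract_meta (Example no. 4) publishes via app.communicator. A hedged, hand-built equivalent; the shared-memory layout, field values, and singleton accessor are assumptions, and slot 1 is pre-sized so it can hold a 1024-character url:

    status = shared_memory.ShareableList(["idle", " " * 1024])
    message = {
        MESSAGE_URL: "https://example.com",
        MESSAGE_HTML: "<html></html>",
        MESSAGE_HEADERS: "{}",
        MESSAGE_HAR: "",
        MESSAGE_ALLOW_LIST: {"accessibility": True},
        MESSAGE_SHARED_MEMORY_NAME: status.shm.name,
        MESSAGE_BYPASS_CACHE: False,
    }
    result = MetadataManager.get_instance().start(message)  # assumed singleton accessor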