Exemple #1
0
def test_word_count_with_lots_of_punctuation():
    """Check that a punctuation-heavy sentence still gets the right count."""
    # the sentence mixes quotes, dashes, parentheses, an accented letter, a
    # percent sign, a semicolon and a slash
    fragments = [
        'Even if "everyone" knows this should still work with a lot ',
        "-- a LOT -- of punctuation (or spécial characters), it's probably ",
        "best not to count 100% on it; that's just foolish/risky.",
    ]
    text = "".join(fragments)
    expected_words = 31

    assert word_count(text) == expected_words
Exemple #2
0
    def get_metadata_from_result(result: ScraperResult) -> Dict[str, Any]:
        """Pull the fields we care about out of an Embedly scrape result.

        Only fields that have a truthy value in ``result.data`` end up in the
        returned dict. The possible keys are ``title``, ``description``,
        ``word_count``, ``published`` and ``authors``.

        Raises:
            ValueError: if ``result.scraper_type`` is not ``ScraperType.EMBEDLY``.
        """
        if result.scraper_type != ScraperType.EMBEDLY:
            raise ValueError("Can't process a result from a different scraper.")

        data = result.data
        metadata = {}

        # title and description are copied over unchanged when they have a value
        for field in ("title", "description"):
            value = data.get(field)
            if value:
                metadata[field] = value

        # the word count is calculated from the text extracted out of the content
        html = data.get("content")
        if html:
            text = extract_text_from_html(html)
            metadata["word_count"] = word_count(text)

        # the published value is in milliseconds, floor-divide to store seconds
        published_ms = data.get("published")
        if published_ms:
            metadata["published"] = published_ms // 1000

        # all-or-nothing: if any author entry has no "name" key, no authors are
        # stored at all
        author_entries = data.get("authors")
        if author_entries:
            try:
                names = [entry["name"] for entry in author_entries]
            except KeyError:
                pass
            else:
                metadata["authors"] = names

        return metadata
    def _generate_text_metadata(topic: Topic) -> Dict[str, Any]:
        """Build the metadata dict (word count and excerpt) for a text topic.

        Both values are derived from the text extracted out of the topic's
        rendered HTML.
        """
        text = extract_text_from_html(topic.rendered_html)

        # the excerpt is the extracted text truncated to 200 characters, with
        # " " passed as the character to truncate at
        short_version = truncate_string(text, length=200, truncate_at_chars=" ")
        num_words = word_count(text)

        return dict(word_count=num_words, excerpt=short_version)
Exemple #4
0
    def _generate_text_metadata(topic: Topic) -> None:
        """Generate metadata for a text topic (word count and excerpt).

        Parses ``topic.rendered_html`` as an HTML fragment, concatenates the
        text of all its elements, simplifies that text, and stores the result
        on ``topic.content_metadata`` as a dict with ``word_count`` and
        ``excerpt`` keys. Nothing is returned; the topic is modified in place.
        """
        html_tree = HTMLParser().parseFragment(topic.rendered_html)

        # extract the text from all of the HTML elements; join() consumes the
        # iterator directly, so no intermediate list is needed
        extracted_text = ''.join(html_tree.itertext())

        # sanitize unicode, remove leading/trailing whitespace, etc.
        extracted_text = simplify_string(extracted_text)

        # create a short excerpt by truncating the simplified string
        excerpt = truncate_string(
            extracted_text,
            length=200,
            truncate_at_chars=' ',
        )

        topic.content_metadata = {
            'word_count': word_count(extracted_text),
            'excerpt': excerpt,
        }
Exemple #5
0
def test_word_count_with_apostrophes():
    """Check that words containing an apostrophe are each counted once."""
    sentence = "It's not always false that apostrophes aren't counted properly."
    expected_words = 9

    assert word_count(sentence) == expected_words
Exemple #6
0
def test_simple_word_count():
    """Check the word count of a plain sentence with minimal punctuation."""
    sentence = "Here is a simple string of words, nothing fancy."
    expected_words = 9

    assert word_count(sentence) == expected_words