# Import custom text_analyzer package import text_analyzer # Create a SocialMedia instance with datacamp_tweets dc_tweets = text_analyzer.SocialMedia(text=datacamp_tweets) # Print the top five most mentioned users print(dc_tweets.mention_counts.most_common(5)) # Plot the most used hashtags text_analyzer.plot_counter(dc_tweets.hashtag_counts)
class Document:
    """A class for text analysis

    :param text: string of text to be analyzed
    :ivar text: string of text to be analyzed; set by `text` parameter
    """

    def __init__(self, text):
        # Store text parameter to the text attribute
        self.text = text
        # Tokenize the document with non-public tokenize method
        self.tokens = self._tokenize()
        # Perform word count with non-public count_words method.
        # BUG FIX: the original assigned the bound method itself
        # (`self._count_words`) instead of calling it, leaving word_counts
        # as a method object rather than a Counter.
        self.word_counts = self._count_words()

    # Assumes tokenize and Counter from other packages are imported
    def _tokenize(self):
        return tokenize(self.text)

    # Non-public method to tally document's word counts with Counter
    def _count_words(self):
        return Counter(self.tokens)

# Can be imported with: from .document import Document

# Using the functionality ------------------------------------------------------
# Import custom text_analyzer package
import text_analyzer

# Create an instance of Document with datacamp_tweet
my_document = text_analyzer.Document(text=datacamp_tweet)

# Print the text attribute of the Document instance
print(my_document.text)

# Print the first 5 tokens from datacamp_doc
print(datacamp_doc.tokens[:5])

# Print the top 5 most used words in datacamp_doc
print(datacamp_doc.word_counts.most_common(5))

# Class inheritance ------------------------------------------------------------
# A child inherits all attributes from the parent, plus can add extensions.

# Define a SocialMedia class that is a child of the Document class
class SocialMedia(Document):
    """Analyze text data from social media.

    :param text: social-media text to be analyzed
    :ivar hashtag_counts: word counts filtered to words starting with '#'
    :ivar mention_counts: word counts filtered to words starting with '@'
    """

    def __init__(self, text):
        # Initialize the parent class to populate text/tokens/word_counts
        Document.__init__(self, text)
        self.hashtag_counts = self._count_hashtags()
        self.mention_counts = self._count_mentions()

    def _count_hashtags(self):
        # Filter attribute so only words starting with '#' remain
        return filter_word_counts(self.word_counts, first_char='#')

    def _count_mentions(self):
        # Filter attribute so only words starting with '@' remain
        return filter_word_counts(self.word_counts, first_char='@')

# Using child class ------------------------------------------------------------
# Import custom text_analyzer package
import text_analyzer
import text_analyzer

# Create a SocialMedia instance with datacamp_tweets
dc_tweets = text_analyzer.SocialMedia(text=datacamp_tweets)

# Print the top five most mentioned users
print(dc_tweets.mention_counts.most_common(5))

# Plot the most used hashtags
text_analyzer.plot_counter(dc_tweets.hashtag_counts)

# Multilevel inheritance -------------------------------------------------------
# A child class can itself be subclassed, giving a parent -> child -> grandchild
# chain where the grandchild inherits from every class above it.

# Define a Tweets class that inherits from SocialMedia
class Tweets(SocialMedia):
    """Analyze text data from tweets.

    :param text: tweet text to be analyzed
    :ivar retweets: SocialMedia object built from the lines starting with 'RT'
    """

    def __init__(self, text):
        # Call parent's __init__ with super().
        # BUG FIX: super().__init__ is already bound to the instance, so the
        # original `super().__init__(self, text)` passed self twice and would
        # raise a TypeError.
        super().__init__(text)
        # Define retweets attribute with non-public method
        self.retweets = self._process_retweets()

    def _process_retweets(self):
        # Filter tweet text to only include retweets
        retweet_text = filter_lines(self.text, first_chars='RT')
        # Return retweet_text as a SocialMedia object
        return SocialMedia(retweet_text)

# Using the grandchild ---------------------------------------------------------
# Import needed package
import text_analyzer

# Create instance of Tweets
my_tweets = text_analyzer.Tweets(datacamp_tweets)

# Plot the most used hashtags in the tweets
my_tweets.plot_counts('hashtag_counts')

# Plot the most used hashtags in the retweets
my_tweets.retweets.plot_counts('hashtag_counts')

# Docstrings -------------------------------------------------------------------
def tokenize(text, regex=r'[a-zA-Z]+'):
    """Split text into tokens using a regular expression

    :param text: text to be tokenized
    :param regex: regular expression used to match tokens using re.findall
    :return: a list of resulting tokens

    >>> tokenize('the rain in spain')
    ['the', 'rain', 'in', 'spain']
    """
    # BUG FIX: the original default r'[a-zA-z]+' uses the range A-z, which
    # also matches the ASCII characters [ \ ] ^ _ ` that sit between 'Z' and
    # 'a'. A-Z is the intended range (re.IGNORECASE covers case anyway).
    return re.findall(regex, text, flags=re.IGNORECASE)

# Print the docstring
help(tokenize)

# Google docstring style
"""Google style. The Google style tends to result in wider docstrings with
fewer lines of code.

Section 1:
    Item 1: Item descriptions don't need line breaks.
"""
""" # Numpy docstring style """Numpy style. The Numpy style tends to results in narrower docstrings with more lines of code. Section 1 --------- Item 1 Item descriptions are indented on a new line. """ # Building docstring from multiple strings in parenthesis def get_matches(word_list: List[str], query:str) -> List[str]: ("Find lines containing the query string.\nExamples:\n\t" # Complete the docstring example below ">>> get_matches(['a', 'list', 'of', 'words'], 's')\n\t" # Fill in the expected result of the function call "['list', 'words']") return [line for line in word_list if query in line] # The Zen of Python --------------------------------------------------------------------------------------------------- import this # Testing with doctest ------------------------------------------------------------------------------------------------ def sum_counters(counters): """Aggregate collections.Counter objects by summing counts :param counters: list/tuple of counters to sum :return: aggregated counters with counts summed >>> d1 = text_analyzer.Document('1 2 fizz 4 buzz fizz 7 8') >>> d2 = text_analyzer.Document('fizz buzz 11 fizz 13 14') >>> sum_counters([d1.word_counts, d2.word_counts]) Counter({'buzz': 2, 'fizz': 4}) """ return sum(counters, Counter()) doctest.testmod() # Testing with pytest ------------------------------------------------------------------------------------------------- # working_dir # ├── text_analyzer # │ ├── __init__.py # │ ├── counter_utils.py # │ ├── document.py # ├── setup.py # ├── requirements.py # └── tests # └── test_unit.py from collections import Counter from text_analyzer import SocialMedia # Create an instance of SocialMedia for testing test_post = 'learning #python & #rstats is awesome! thanks @datacamp!' 
sm_post = SocialMedia(test_post)

# Test hashtag counts are created properly
def test_social_media_hashtags():
    expected_hashtag_counts = Counter({'#python': 1, '#rstats': 1})
    assert sm_post.hashtag_counts == expected_hashtag_counts

# Run from the command line:
#   $ pytest

# Parametrizing tests ----------------------------------------------------------
@pytest.mark.parametrize("inputs", ["intro.md", "plot.py", "discussion.md"])
def test_nbuild(inputs):
    assert nbuild([inputs]).cells[0].source == Path(inputs).read_text()

# Check whether an error is raised
@pytest.mark.parametrize("not_exporters", ["htm", "ipython", "markup"])
def test_nbconv(not_exporters):
    with pytest.raises(ValueError):
        nbconv(nb_name="mynotebook.ipynb", exporter=not_exporters)

# Classmethods -----------------------------------------------------------------
# With the classmethod decorator, the first argument of the decorated function
# is not the instance (self) but the class itself (cls). This makes it easy to
# instantiate multiple instances at once.
class TextFile:
    """Hold the text of a file and record every filename ever constructed.

    :param file: path of the file whose text is read
    :ivar text: contents of the file
    """

    # Class-level registry of every filename passed to __init__
    instances = []

    # BUG FIX: the original read `def__init__` (missing space after def),
    # which is a SyntaxError.
    def __init__(self, file):
        self.text = Path(file).read_text()
        # Append to the class-level list so all instances share the registry
        self.__class__.instances.append(file)

    @classmethod
    def instantiate(cls, filenames):
        # Alternate constructor: lazily map each filename to a new instance
        return map(cls, filenames)