'''
Instructions:
- Import your custom text_analyzer package.
- Define dc_tweets as an instance of SocialMedia with the preloaded datacamp_tweets object as the text.
- Print the 5 most commonly mentioned users in the data using the appropriate dc_tweets attribute.
- Use text_analyzer's plot_counter() function to plot the most used hashtags in the data using the appropriate dc_tweets attribute.
'''

# SOLUTION

# Import custom text_analyzer package
import text_analyzer

# Create a SocialMedia instance with datacamp_tweets
dc_tweets = text_analyzer.SocialMedia(text=datacamp_tweets)

# Print the top five most mentioned users
print(dc_tweets.mention_counts.most_common(5))

# Plot the most used hashtags
text_analyzer.plot_counter(dc_tweets.hashtag_counts)
class Document:
    """A class for text analysis
    
    :param text: string of text to be analyzed
    :ivar text: string of text to be analyzed; set by `text` parameter
    """
    # Method to create a new instance of Document
    def __init__(self, text):
        # Store text parameter to the text attribute
        self.text = text
        # Tokenize the document with non-public tokenize method
        self.tokens = self._tokenize()
        # Perform word count with non-public count_words method
        self.word_counts = self._count_words()

    # Assumes tokenize and Counter from other packages are imported
    def _tokenize(self):
        return tokenize(self.text)

    # Non-public method to tally document's word counts with Counter
    def _count_words(self):
        return Counter(self.tokens)

# Can be imported in the package's __init__.py with
from .document import Document

# Using the functionality
# Import custom text_analyzer package
import text_analyzer
# Create an instance of Document with datacamp_tweet
my_document = text_analyzer.Document(text=datacamp_tweet)
# Print the text attribute of the Document instance
print(my_document.text)
# Print the first 5 tokens from my_document
print(my_document.tokens[:5])
# Print the top 5 most used words in my_document
print(my_document.word_counts.most_common(5))


# Class inheritance ----------------------------------------------------------------------------------------------------
# A child inherits all attributes and methods from its parent, and can add extensions of its own

# Define a SocialMedia class that is a child of the Document class
class SocialMedia(Document):
    def __init__(self, text):
        Document.__init__(self, text)
        self.hashtag_counts = self._count_hashtags()
        self.mention_counts = self._count_mentions()
        
    def _count_hashtags(self):
        # Filter attribute so only words starting with '#' remain
        return filter_word_counts(self.word_counts, first_char='#')      
    
    def _count_mentions(self):
        # Filter attribute so only words starting with '@' remain
        return filter_word_counts(self.word_counts, first_char='@')
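
# filter_word_counts is used above but never defined in these notes; it is
# assumed to come from the package's counter_utils module. A minimal sketch of
# what it might look like (name and signature taken from the calls above):
from collections import Counter

def filter_word_counts(word_counts, first_char):
    # Keep only the words that start with the given character
    return Counter({word: count for word, count in word_counts.items()
                    if word.startswith(first_char)})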

# Using child class
# Import custom text_analyzer package
import text_analyzer
# Create a SocialMedia instance with datacamp_tweets
dc_tweets = text_analyzer.SocialMedia(text=datacamp_tweets)
# Print the top five most mentioned users
print(dc_tweets.mention_counts.most_common(5))
# Plot the most used hashtags
text_analyzer.plot_counter(dc_tweets.hashtag_counts)


# Multilevel inheritance -----------------------------------------------------------------------------------------------
# A child class can itself serve as a parent: Tweets inherits from SocialMedia, which inherits from Document

# Define a Tweets class that inherits from SocialMedia
class Tweets(SocialMedia):
    def __init__(self, text):
        # Call the parent's __init__ with super() (self is passed implicitly)
        super().__init__(text)
        # Define retweets attribute with non-public method
        self.retweets = self._process_retweets()

    def _process_retweets(self):
        # Filter tweet text to only include retweets
        retweet_text = filter_lines(self.text, first_chars='RT')
        # Return retweet_text as a SocialMedia object
        return SocialMedia(retweet_text)

# Using the grandchild
# Import needed package
import text_analyzer
# Create instance of Tweets
my_tweets = text_analyzer.Tweets(datacamp_tweets)
# Plot the most used hashtags in the tweets (plot_counts is assumed to be defined on a parent class)
my_tweets.plot_counts('hashtag_counts')
# Plot the most used hashtags in the retweets
my_tweets.retweets.plot_counts('hashtag_counts')
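
# plot_counts is called above but never defined in these notes; a minimal
# sketch of how such a method might be added to the Document/SocialMedia class,
# reusing the plot_counter utility (the parameter names are assumptions):
def plot_counts(self, attribute='word_counts', n_most_common=5):
    # Look up the requested Counter attribute by name and plot its top items
    counter = getattr(self, attribute)
    plot_counter(counter, n_most_common)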


# Docstrings ----------------------------------------------------------------------------------------------------------
import re

def tokenize(text, regex=r'[a-zA-Z]+'):
  """Split text into tokens using a regular expression

  :param text: text to be tokenized
  :param regex: regular expression used to match tokens using re.findall 
  :return: a list of resulting tokens

  >>> tokenize('the rain in spain')
  ['the', 'rain', 'in', 'spain']
  """
  return re.findall(regex, text, flags=re.IGNORECASE)

# Print the docstring
help(tokenize)
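
# The raw docstring is also available on the function's __doc__ attribute
print(tokenize.__doc__)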

# Google docstring style
"""Google style.

The Google style tends to result in
wider docstrings with fewer lines of code.

Section 1:    
	Item 1: Item descriptions don't need line breaks.
"""

# Numpy docstring style
"""Numpy style.

The Numpy style tends to result in
narrower docstrings with more lines of code.

Section 1
---------
Item 1    
	Item descriptions are indented on a new line.
"""

# Building a docstring from multiple strings in parentheses
from typing import List

def get_matches(word_list: List[str], query: str) -> List[str]:
    ("Find lines containing the query string.\nExamples:\n\t"
     ">>> get_matches(['a', 'list', 'of', 'words'], 's')\n\t"
     # Expected result of the example call above
     "['list', 'words']")
    return [line for line in word_list if query in line]
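
# Example call, matching the doctest above
print(get_matches(['a', 'list', 'of', 'words'], 's'))  # ['list', 'words']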
	

# The Zen of Python ---------------------------------------------------------------------------------------------------
import this


# Testing with doctest ------------------------------------------------------------------------------------------------
def sum_counters(counters):
    """Aggregate collections.Counter objects by summing counts

    :param counters: list/tuple of counters to sum
    :return: a single Counter with the counts summed

    >>> d1 = text_analyzer.Document('1 2 fizz 4 buzz fizz 7 8')
    >>> d2 = text_analyzer.Document('fizz buzz 11 fizz 13 14')
    >>> sum_counters([d1.word_counts, d2.word_counts])
    Counter({'fizz': 4, 'buzz': 2})
    """
    return sum(counters, Counter())

# Run all doctests found in the current module
import doctest
doctest.testmod()
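
# Doctests can also be run from the command line without editing the module,
# e.g. for a module file (the filename here is hypothetical):
# $ python -m doctest -v my_module.py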


# Testing with pytest -------------------------------------------------------------------------------------------------
# working_dir
# ├── text_analyzer
# │    ├── __init__.py
# │    ├── counter_utils.py
# │    ├── document.py
# ├── setup.py
# ├── requirements.txt
# └── tests
#      └── test_unit.py
from collections import Counter
from text_analyzer import SocialMedia

# Create an instance of SocialMedia for testing
test_post = 'learning #python & #rstats is awesome! thanks @datacamp!'
sm_post = SocialMedia(test_post)

# Test hashtag counts are created properly
def test_social_media_hashtags():
    expected_hashtag_counts = Counter({'#python': 1, '#rstats': 1})
    assert sm_post.hashtag_counts == expected_hashtag_counts

# Run from the command line
# $ pytest
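# A few other useful invocations (paths follow the tree above):
# $ pytest tests/test_unit.py      # run a single test file
# $ pytest -k hashtags             # run only tests whose names match a keyword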

# Parametrizing tests (nbuild and nbconv below are assumed to be imported from elsewhere)
import pytest
from pathlib import Path

@pytest.mark.parametrize("inputs", ["intro.md", "plot.py", "discussion.md"])
def test_nbuild(inputs):
    assert nbuild([inputs]).cells[0].source == Path(inputs).read_text()

# Check whether error is raised
@pytest.mark.parametrize("not_exporters", ["htm", "ipython", "markup"])
def test_nbconv(not_exporters):
    with pytest.raises(ValueError):
        nbconv(nb_name="mynotebook.ipynb", exporter=not_exporters)
	

# Classmethods --------------------------------------------------------------------------------------------------------
# Due to the classmethod decorator, the first argument of the decorated function is not the class' instance (self),
# but the class itself (cls). This makes it possible to instantiate many instances at once.

from pathlib import Path

class TextFile:

    instances = []

    def __init__(self, file):
        self.text = Path(file).read_text()
        self.__class__.instances.append(file)

    @classmethod
    def instantiate(cls, filenames):
        return map(cls, filenames)
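
# Using the classmethod to build many instances at once (the filenames are hypothetical)
my_files = TextFile.instantiate(['notes.txt', 'todo.txt'])
for tf in my_files:
    print(tf.text[:20])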
# word_counts: a preloaded list of collections.Counter objects holding per-document word counts

# Sum word_counts using sum_counters from text_analyzer
word_count_totals = text_analyzer.sum_counters(word_counts)
print(word_count_totals)

# Plot word_count_totals using plot_counter from text_analyzer
text_analyzer.plot_counter(word_count_totals)
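
# plot_counter is used throughout these notes but never defined; a minimal
# sketch of what it might look like with matplotlib (signature is an assumption):
import matplotlib.pyplot as plt

def plot_counter(counter, n_most_common=5):
    # Unpack the top items into parallel tuples of labels and counts
    tokens, counts = zip(*counter.most_common(n_most_common))
    plt.bar(range(len(counts)), counts, tick_label=tokens)
    plt.show()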

# Create an instance of Document with datacamp_tweet
datacamp_tweet = 'Basic linear regression example. #DataCamp #DataScience #Python #sklearn'
my_document = text_analyzer.Document(text=datacamp_tweet)

# Print the text attribute of the Document instance
print(my_document.text)

# Create a new Document instance from datacamp_tweets
datacamp_tweets = '[DataCamp] Introduction to H2O AutoML --> In this tutorial, you will learn about H2O and have a glimpse of its auto…\n[DataCamp] Stocks, Significance Testing & p-Hacking --> Learn how to manipulate time series data with pandas and co…\nRT @cbismuth: Linear regression example with most significant features detection. #DataCamp #DataScience #Python #sklearn …\nLinear regression example with most significant features detection. #DataCamp #DataScience #Python #sklearn\nBasic linear regression example. #DataCamp #DataScience #Python #sklearn\nRT @David_Makinde_: I just completed Introduction to Python for Data Science \n#Datacamp\n#DataScience \n#Python\n[DataCamp] Enter the #DataFramedChallenge for a chance to be on an upcoming podcast segment. --> DataCamp has a pod…\n[DataCamp] Introduction to Python Metaclasses --> In this tutorial, you\'ll learn about metaclasses in Python. by De…\nI just completed Introduction to Python for Data Science \n#Datacamp\n#DataScience \n#Python\nRT @cbismuth: My pretty first classifier! #DataCamp #Python #sklearn\nMy pretty first classifier! #DataCamp #Python #sklearn\nRT @ascentt: The different #DataScience roles on the job market.'
datacamp_doc = text_analyzer.Document(datacamp_tweets)

# Print the first 5 tokens from datacamp_doc
print(datacamp_doc.tokens[:5])