def __init__(self, text, tokenizer=None, pos_tagger=None, np_extractor=None,
             analyzer=None, parser=None, classifier=None, clean_html=False):
    # Fall back to the package defaults for any component not supplied.
    self.tokenizer = tokenizer if tokenizer is not None else NLTKPunktTokenizer()
    self.pos_tagger = pos_tagger if pos_tagger is not None \
        else PatternTagger(tokenizer=self.tokenizer)
    self.np_extractor = np_extractor if np_extractor is not None \
        else PatternParserNPExtractor(tokenizer=self.tokenizer)
    self.analyzer = analyzer if analyzer is not None \
        else PatternAnalyzer(tokenizer=self.tokenizer)
    self.parser = parser if parser is not None \
        else PatternParser(tokenizer=self.tokenizer)
    self.classifier = classifier if classifier is not None else None
    if not isinstance(text, basestring):
        raise TypeError('The `text` argument passed to `__init__(text)` '
                        'must be a string, not {0}'.format(type(text)))
    if clean_html:
        raise NotImplementedError(
            "clean_html has been deprecated. "
            "To remove HTML markup, use BeautifulSoup's "
            "get_text() function")
    self.raw = self.string = text
    # `stripped` is the lowercased text with *all* punctuation removed.
    self.stripped = lowerstrip(self.raw, all=True)
    _initialize_models(self, self.tokenizer, self.pos_tagger,
                       self.np_extractor, self.analyzer, self.parser,
                       self.classifier)
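# A minimal usage sketch, assuming the initializer above is textblob-de's
# BaseBlob (the NLTKPunktTokenizer/Pattern* names belong to that package);
# leaving every argument as None selects the default German pipeline.
# Note: the NLTK 'punkt' model must be available at runtime.
from textblob_de import TextBlobDE

blob = TextBlobDE('Das ist ein schoener Satz.')
print(blob.stripped)  # 'das ist ein schoener satz' (lowerstrip with all=True)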
def word_counts(self):
    """Dictionary of word frequencies in this text."""
    counts = defaultdict(int)
    stripped_words = [lowerstrip(word) for word in self.words]
    for word in stripped_words:
        counts[word] += 1
    return counts
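# Hedged usage sketch: in textblob this method is exposed as a cached
# property backed by a defaultdict, so it can be indexed directly, and
# lowerstrip() folds case so 'Python' and 'PYTHON' share one bucket.
from textblob import TextBlob

blob = TextBlob('Python is great. PYTHON, python, Python!')
print(blob.word_counts['python'])  # 4
print(blob.word_counts['great'])   # 1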
def sentiment_overview(df):
    st.subheader('Overview of comments')
    st.write('Filter the comments by sentiment (derived from polarity); '
             'the matching comments are listed below.')
    # 'All' has to be offered here, otherwise the branch below is unreachable.
    option = st.selectbox('Select the sentiment',
                          ('All', 'Positive', 'Negative', 'Neutral'))
    if option == 'All':
        if len(df['comments'].tolist()) > 0:
            st.table(df['comments'])
        else:
            st.info('There are no comments on this video.')
    else:
        # sentiment labels are stored lowercased, hence lowerstrip(option)
        filtered = df[df['sentiment'] == lowerstrip(option)]['comments']
        if len(filtered.tolist()) > 0:
            st.table(filtered)
        else:
            st.info(f'There are no {lowerstrip(option)} comments on this video.')
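# A sketch of the DataFrame that sentiment_overview() expects -- the helper
# name and the polarity thresholds are assumptions, not part of the original
# source; it only illustrates why the 'sentiment' column is lowercased.
import pandas as pd
from textblob import TextBlob

def label_comments(comments):
    def label(text):
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0:
            return 'positive'
        if polarity < 0:
            return 'negative'
        return 'neutral'
    return pd.DataFrame({'comments': comments,
                         'sentiment': [label(c) for c in comments]})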
def __init__(self, text, tokenizer=None, pos_tagger=None, np_extractor=None,
             analyzer=None, parser=None, classifier=None, clean_html=False):
    if not isinstance(text, basestring):
        raise TypeError('The `text` argument passed to `__init__(text)` '
                        'must be a string, not {0}'.format(type(text)))
    if clean_html:
        raise NotImplementedError("clean_html has been deprecated. "
                                  "To remove HTML markup, use BeautifulSoup's "
                                  "get_text() function")
    self.raw = self.string = text
    # lowercased text with all punctuation removed
    self.stripped = lowerstrip(self.raw, all=True)
    _initialize_models(self, tokenizer, pos_tagger, np_extractor,
                       analyzer, parser, classifier)
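# Sketch of the two guard clauses above, via the public TextBlob API:
# non-string input raises TypeError, and `stripped` carries the
# lowerstrip(all=True) form of the raw text.
from textblob import TextBlob

blob = TextBlob('This. Has. Punctuation?! ')
print(blob.stripped)  # 'this has punctuation'
try:
    TextBlob(42)
except TypeError as err:
    print(err)  # ...must be a string, not <class 'int'>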
def test_lowerstrip(self):
    assert_equal(lowerstrip(self.text), 'this. has. punctuation')
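# The setUp fixture is not shown above; the value below is an assumption
# consistent with the assertion. With the default all=False, lowerstrip()
# lowercases, trims whitespace, and strips punctuation only from the ends
# of the string, so the interior periods survive.
from textblob.utils import lowerstrip

text = 'This. Has. Punctuation?! '  # assumed fixture value
assert lowerstrip(text) == 'this. has. punctuation'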
import tweepy
import keys
from operator import itemgetter
from tweetlistener import TweetListener
import preprocessor as p
from textblob.utils import lowerstrip

# authenticate with Twitter and create an API client
auth = tweepy.OAuthHandler(keys.consumer_key, keys.consumer_secret)
auth.set_access_token(keys.access_token, keys.access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

search = api.search(q='liverpool', count=100)

# initialize the preprocessor: strip URLs, reserved words, numbers,
# smileys, emoji, mentions, and hashtags
p.set_options(p.OPT.URL, p.OPT.RESERVED, p.OPT.NUMBER, p.OPT.SMILEY,
              p.OPT.EMOJI, p.OPT.MENTION, p.OPT.HASHTAG)

clean = []
for tweet in search:
    try:
        # extended_tweet is a dict, so the full text lives under 'full_text'
        text = tweet.extended_tweet['full_text']
    except AttributeError:
        text = tweet.text
    clean.append(p.clean(text))

stripped_text = [lowerstrip(t) for t in clean]
for text in stripped_text:
    print(text)