Exemple #1
0
class TestPatternParser(unittest.TestCase):
    """Check that ``PatternParser.parse`` agrees with ``pattern_parse``."""

    def setUp(self):
        """Prepare the sample sentence and the parser used by the tests."""
        self.text = "And now for something completely different."
        self.parser = PatternParser()

    def test_parse(self):
        """The parser's output equals ``pattern_parse`` on the same text."""
        actual = self.parser.parse(self.text)
        expected = pattern_parse(self.text)
        assert_equal(actual, expected)
Exemple #2
0
class TestPatternParser(unittest.TestCase):
    """Compare ``PatternParser`` against the ``pattern_parse`` function."""

    def setUp(self):
        """Build a fresh parser and the sentence it is exercised with."""
        self.text = "And now for something completely different."
        self.parser = PatternParser()

    def test_parse(self):
        """Parsing through the class and through the function must agree."""
        parsed_by_class = self.parser.parse(self.text)
        parsed_by_function = pattern_parse(self.text)
        assert_equal(parsed_by_class, parsed_by_function)
Exemple #3
0
class Blobber(object):
    """Factory producing TextBlobs that share one set of models.

    Every blob created by calling a ``Blobber`` instance receives the same
    tokenizer, POS tagger, noun-phrase extractor, analyzer, parser and
    classifier objects.

    Usage:

        >>> from textblob import Blobber
        >>> from textblob.taggers import NLTKTagger
        >>> from textblob.tokenizers import SentenceTokenizer
        >>> tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer())
        >>> blob1 = tb("This is one blob.")
        >>> blob2 = tb("This blob has the same tagger and tokenizer.")
        >>> blob1.pos_tagger is blob2.pos_tagger
        True

    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.

    .. versionadded:: 0.4.0
    """

    # Class-level model instances, created once when the class is defined.
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self, tokenizer=None, pos_tagger=None, np_extractor=None,
                 analyzer=None, parser=None, classifier=None):
        # Model wiring is delegated to the module-level helper.
        _initialize_models(
            self, tokenizer, pos_tagger, np_extractor, analyzer, parser,
            classifier)

    def __call__(self, text):
        """Create a TextBlob for ``text`` using this Blobber's models.

        The blob receives this instance's ``tokenizer``, ``pos_tagger``,
        ``np_extractor``, ``analyzer``, ``parser`` and ``classifier``.

        :returns: A new :class:`TextBlob <TextBlob>`.
        """
        shared_models = dict(
            tokenizer=self.tokenizer,
            pos_tagger=self.pos_tagger,
            np_extractor=self.np_extractor,
            analyzer=self.analyzer,
            parser=self.parser,
            classifier=self.classifier,
        )
        return TextBlob(text, **shared_models)

    def __repr__(self):
        """Describe the Blobber by the class names of its models."""
        if self.classifier:
            classifier_repr = self.classifier.__class__.__name__ + "()"
        else:
            classifier_repr = "None"
        model_names = [
            model.__class__.__name__
            for model in (self.tokenizer, self.pos_tagger, self.np_extractor,
                          self.analyzer, self.parser)
        ]
        template = ("Blobber(tokenizer={0}(), pos_tagger={1}(), "
                    "np_extractor={2}(), analyzer={3}(), parser={4}(), classifier={5})")
        return template.format(*(model_names + [classifier_repr]))

    __str__ = __repr__
Exemple #4
0
class BaseBlob(StringlikeMixin, BlobComparableMixin):
    """An abstract base class that all textblob classes will inherit from.
    Includes words, POS tag, NP, and word count properties. Also includes
    basic dunder and string methods for making objects like Python strings.

    :param text: A string.
    :param tokenizer: (optional) A tokenizer instance. If ``None``,
        defaults to :class:`WordTokenizer() <textblob.tokenizers.WordTokenizer>`.
    :param np_extractor: (optional) An NPExtractor instance. If ``None``,
        defaults to :class:`FastNPExtractor() <textblob.en.np_extractors.FastNPExtractor>`.
    :param pos_tagger: (optional) A Tagger instance. If ``None``,
        defaults to :class:`NLTKTagger <textblob.en.taggers.NLTKTagger>`.
    :param analyzer: (optional) A sentiment analyzer. If ``None``,
        defaults to :class:`PatternAnalyzer <textblob.en.sentiments.PatternAnalyzer>`.
    :param parser: A parser. If ``None``, defaults to
        :class:`PatternParser <textblob.en.parsers.PatternParser>`.
    :param classifier: A classifier.
    :param clean_html: Deprecated. Any truthy value raises
        ``NotImplementedError``.

    .. versionchanged:: 0.6.0
        ``clean_html`` parameter deprecated, as it was in NLTK.
    """
    # Class-level model instances, created once when the class is defined.
    np_extractor = FastNPExtractor()
    pos_tagger = NLTKTagger()
    tokenizer = WordTokenizer()
    translator = Translator()
    analyzer = PatternAnalyzer()
    parser = PatternParser()

    def __init__(self,
                 text,
                 tokenizer=None,
                 pos_tagger=None,
                 np_extractor=None,
                 analyzer=None,
                 parser=None,
                 classifier=None,
                 clean_html=False):
        """Store the text and wire up the models.

        :raises TypeError: if ``text`` is not a string.
        :raises NotImplementedError: if ``clean_html`` is truthy.
        """
        if not isinstance(text, basestring):
            raise TypeError('The `text` argument passed to `__init__(text)` '
                            'must be a string, not {0}'.format(type(text)))
        if clean_html:
            raise NotImplementedError(
                "clean_html has been deprecated. "
                "To remove HTML markup, use BeautifulSoup's "
                "get_text() function")
        self.raw = self.string = text
        self.stripped = lowerstrip(self.raw, all=True)
        _initialize_models(self, tokenizer, pos_tagger, np_extractor, analyzer,
                           parser, classifier)

    @cached_property
    def words(self):
        """Return a list of word tokens. This excludes punctuation characters.
        If you want to include punctuation characters, access the ``tokens``
        property.

        :returns: A :class:`WordList <WordList>` of word tokens.
        """
        return WordList(word_tokenize(self.raw, include_punc=False))

    @cached_property
    def tokens(self):
        """Return a list of tokens, using this blob's tokenizer object
        (defaults to :class:`WordTokenizer <textblob.tokenizers.WordTokenizer>`).
        """
        return WordList(self.tokenizer.tokenize(self.raw))

    def tokenize(self, tokenizer=None):
        """Return a list of tokens, using ``tokenizer``.

        :param tokenizer: (optional) A tokenizer object. If None, defaults to
            this blob's default tokenizer.
        """
        t = tokenizer if tokenizer is not None else self.tokenizer
        return WordList(t.tokenize(self.raw))

    def parse(self, parser=None):
        """Parse the text.

        :param parser: (optional) A parser instance. If ``None``, defaults to
            this blob's default parser.

        .. versionadded:: 0.6.0
        """
        p = parser if parser is not None else self.parser
        return p.parse(self.raw)

    def classify(self):
        """Classify the blob using the blob's ``classifier``.

        :raises NameError: if the blob has no classifier.
        """
        if self.classifier is None:
            raise NameError("This blob has no classifier. Train one first!")
        return self.classifier.classify(self.raw)

    @cached_property
    def sentiment(self):
        """Return a tuple of form (polarity, subjectivity ) where polarity
        is a float within the range [-1.0, 1.0] and subjectivity is a float
        within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is
        very subjective.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity)``
        """
        return self.analyzer.analyze(self.raw)

    @cached_property
    def sentiment_assessments(self):
        """Return a tuple of form (polarity, subjectivity, assessments ) where
        polarity is a float within the range [-1.0, 1.0], subjectivity is a
        float within the range [0.0, 1.0] where 0.0 is very objective and 1.0
        is very subjective, and assessments is a list of polarity and
        subjectivity scores for the assessed tokens.

        :rtype: namedtuple of the form ``Sentiment(polarity, subjectivity,
        assessments)``
        """
        return self.analyzer.analyze(self.raw, keep_assessments=True)

    @cached_property
    def polarity(self):
        """Return the polarity score as a float within the range [-1.0, 1.0]

        :rtype: float
        """
        # NOTE: always uses a fresh PatternAnalyzer, not ``self.analyzer``,
        # so a custom analyzer passed to the constructor does not affect it.
        return PatternAnalyzer().analyze(self.raw)[0]

    @cached_property
    def subjectivity(self):
        """Return the subjectivity score as a float within the range [0.0, 1.0]
        where 0.0 is very objective and 1.0 is very subjective.

        :rtype: float
        """
        # NOTE: always uses a fresh PatternAnalyzer, not ``self.analyzer``.
        return PatternAnalyzer().analyze(self.raw)[1]

    @cached_property
    def noun_phrases(self):
        """Returns a list of noun phrases for this blob.

        Phrases are stripped and lower-cased; extracted phrases of length 1
        or less are dropped.
        """
        return WordList([
            phrase.strip().lower()
            for phrase in self.np_extractor.extract(self.raw)
            if len(phrase) > 1
        ])

    @cached_property
    def pos_tags(self):
        """Returns an list of tuples of the form (word, POS tag).

        Example:
        ::

            [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'JJ'), ('on', 'IN'),
                    ('Thursday', 'NNP'), ('morning', 'NN')]

        :rtype: list of tuples
        """
        if isinstance(self, TextBlob):
            # A TextBlob is tagged sentence by sentence; flatten the results.
            return [
                val for sublist in [s.pos_tags for s in self.sentences]
                for val in sublist
            ]
        else:
            # Drop entries whose tag matches PUNCTUATION_REGEX.
            return [(Word(word, pos_tag=t), unicode(t))
                    for word, t in self.pos_tagger.tag(self)
                    if not PUNCTUATION_REGEX.match(unicode(t))]

    tags = pos_tags

    @cached_property
    def word_counts(self):
        """Dictionary of word frequencies in this text.

        Keys are the words after ``lowerstrip``.
        """
        counts = defaultdict(int)
        stripped_words = [lowerstrip(word) for word in self.words]
        for word in stripped_words:
            counts[word] += 1
        return counts

    @cached_property
    def np_counts(self):
        """Dictionary of noun phrase frequencies in this text.
        """
        counts = defaultdict(int)
        for phrase in self.noun_phrases:
            counts[phrase] += 1
        return counts

    def ngrams(self, n=3):
        """Return a list of n-grams (tuples of n successive words) for this
        blob.

        :param int n: Size of each n-gram. Values ``<= 0`` give an empty list.
        :rtype: List of :class:`WordLists <WordList>`
        """
        if n <= 0:
            return []
        grams = [
            WordList(self.words[i:i + n])
            for i in range(len(self.words) - n + 1)
        ]
        return grams

    def translate(self, from_lang="auto", to="en"):
        """Translate the blob to another language.
        Uses the Google Translate API. Returns a new TextBlob.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("Simple is better than complex")
            >>> b.translate(to="es")
            TextBlob('Lo simple es mejor que complejo')

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0.

        :param str from_lang: Language to translate from. Defaults to
            ``"auto"``, which is passed to the translator as-is (presumably
            requesting language detection -- see ``Translator``).
        :param str to: Language to translate to.
        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        # The new blob is built from the translated text only, so it uses the
        # class-default models rather than this instance's.
        return self.__class__(
            self.translator.translate(self.raw,
                                      from_lang=from_lang,
                                      to_lang=to))

    def detect_language(self):
        """Detect the blob's language using the Google Translate API.

        Requires an internet connection.

        Usage:
        ::

            >>> b = TextBlob("bonjour")
            >>> b.detect_language()
            u'fr'

        Language code reference:
            https://developers.google.com/translate/v2/using_rest#language-params

        .. versionadded:: 0.5.0

        :rtype: str
        """
        return self.translator.detect(self.raw)

    def correct(self):
        """Attempt to correct the spelling of a blob.

        .. versionadded:: 0.6.0

        :rtype: :class:`BaseBlob <BaseBlob>`
        """
        # regex matches: word or punctuation or whitespace
        # Raw string: ``\w`` and ``\s`` are regex escapes, not string escapes.
        tokens = nltk.tokenize.regexp_tokenize(self.raw, r"\w+|[^\w\s]|\s")
        corrected = (Word(w).correct() for w in tokens)
        ret = ''.join(corrected)
        return self.__class__(ret)

    def _cmpkey(self):
        """Key used by ComparableMixin to implement all rich comparison
        operators.
        """
        return self.raw

    def _strkey(self):
        """Key used by StringlikeMixin to implement string methods."""
        return self.raw

    def __hash__(self):
        """Hash on the same key used for comparisons."""
        return hash(self._cmpkey())

    def __add__(self, other):
        '''Concatenates two text objects the same way Python strings are
        concatenated.

        Arguments:
        - `other`: a string or a text object

        Raises ``TypeError`` for any other operand type.
        '''
        if isinstance(other, basestring):
            return self.__class__(self.raw + other)
        elif isinstance(other, BaseBlob):
            return self.__class__(self.raw + other.raw)
        else:
            raise TypeError(
                'Operands must be either strings or {0} objects'.format(
                    self.__class__.__name__))

    def split(self, sep=None, maxsplit=sys.maxsize):
        """Behaves like the built-in str.split() except returns a
        WordList.

        :rtype: :class:`WordList <WordList>`
        """
        return WordList(self._strkey().split(sep, maxsplit))
Exemple #5
0
blob.tokens

# Alternative: pass a tokenizer explicitly to tokenize().
tokenizer = BlanklineTokenizer()
blob = TextBlob("A token\n\nof appreciation")
blob.tokenize(tokenizer)

# Noun phrase chunkers
from textblob.np_extractors import ConllExtractor
extractor = ConllExtractor()
blob = TextBlob("Python is a high-level programming language.", np_extractor=extractor)
blob.noun_phrases

# POS taggers
from textblob.taggers import NLTKTagger
nltk_tagger = NLTKTagger()
blob = TextBlob("Tag! You're It!", pos_tagger=nltk_tagger)
blob.pos_tags

# Parser
from textblob.parsers import PatternParser
blob = TextBlob("Parsing is fun.", parser=PatternParser())
blob.parse()

# Blobber: TextBlobs that share the same models
from textblob import Blobber
from textblob.taggers import NLTKTagger
tb = Blobber(pos_tagger=NLTKTagger())
blob1 = tb("This is a blob.")
blob2 = tb("This is another blob.")
# Both blobs hold the very same tagger object.
blob1.pos_tagger is blob2.pos_tagger
Exemple #6
0
	def __init__(self):
		"""Reset the parsing state, build the shared Blobber, then initialise HTMLParser."""
		# Plain state attributes; they do not depend on one another.
		self.parsed_text = ''
		self.ignore_data = False
		self.tag_stack = []
		# One Blobber for the whole parser, built from pattern-based models.
		pattern_parser = PatternParser()
		pattern_tagger = PatternTagger()
		self.blobber = Blobber(parser=pattern_parser, pos_tagger=pattern_tagger)
		# The base-class initialiser runs last, as in the original ordering.
		HTMLParser.__init__(self)
Exemple #7
0
 def test_parse(self):
     """``TextBlob.parse()`` equals ``PatternParser().parse`` on the blob's string."""
     sentence = "And now for something completely different."
     blob = tb.TextBlob(sentence)
     parsed = blob.parse()
     expected = PatternParser().parse(blob.string)
     assert_equal(parsed, expected)
Exemple #8
0
 def setUp(self):
     """Provide a sample sentence and a ``PatternParser`` to each test."""
     self.text = "And now for something completely different."
     self.parser = PatternParser()
Exemple #9
0
**Regular Expression Parsing**
"""

import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

# Resources needed by word_tokenize and pos_tag (downloaded on first run).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

data = "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP) and Chairman of the Central Military Commission (CMC) since 2012, and President of the People's Republic of China (PRC) since 2013. He has been the paramount leader of China, the most prominent political leader in the country, since 2012. The son of Chinese Communist veteran Xi Zhongxun, he was exiled to rural Yanchuan County as a teenager following his father's purge during the Cultural Revolution and lived in a cave in the village of Liangjiahe, where he joined the CCP and worked as the party secretary."
# Tokenize the text and attach a part-of-speech tag to every token.
new_token = nltk.pos_tag(word_tokenize(data))
new_token

# Chunk grammar for a noun phrase (NP): an optional determiner (<DT>?),
# any number of adjectives (<JJ>*), then exactly one noun (<NN>).
np = r"NP: {<DT>?<JJ>*<NN>}"
# RegexpParser chunks the tagged tokens using the regular-expression grammar.
chunk_parser = nltk.RegexpParser(np)
result = chunk_parser.parse(new_token)
result

"""**Pattern Parsing**"""

import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from textblob.parsers import PatternParser

# Resources needed by word_tokenize and pos_tag (downloaded on first run).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

data = "Xi Jinping is a Chinese politician who has served as General Secretary of the Chinese Communist Party (CCP)."
# NLTK tagging is kept for inspection only; PatternParser does not consume it.
new_token = nltk.pos_tag(word_tokenize(data))
chunk_parser = PatternParser()
# PatternParser.parse() takes the raw text, not a list of (word, tag) tuples.
result = chunk_parser.parse(data)
result
Exemple #10
0
 def setUp(self):
     """Create the example sentence and the parser shared by the tests."""
     self.text = "And now for something completely different."
     self.parser = PatternParser()
Exemple #11
0
def check_sarc(tweet):
    """Return 1 if ``tweet`` shows a polarity contrast between its phrases, else 0.

    The tweet is chunk-parsed with ``PatternParser`` and its phrases are
    collected into two candidate lists:

    * ``SF``: NP and ADJP phrases, plus NP+VP pairs.
    * ``sf``: VP phrases, plus the VP/ADVP/ADJP/NP pairs and triples below.

    Each candidate is scored with ``TextBlob(...).polarity``. The result is 1
    when a positive ``SF`` candidate occurs together with a negative ``sf``
    candidate, or a positive ``sf`` candidate with a negative ``SF`` one.

    The intermediate lists are printed, as the original implementation did.

    :param tweet: The text to analyse.
    :returns: ``1`` or ``0``.
    """
    blob = TextBlob(tweet, parser=PatternParser())
    phrases = _extract_phrases(blob.parse())

    # Single-phrase candidates.
    SF = []
    sf = []
    for tag, text in phrases:
        if tag in ('NP', 'ADJP'):
            SF.append(text)
        elif tag == 'VP':
            sf.append(text)

    # Adjacent phrase pairs.
    sf_pairs = {('ADVP', 'VP'), ('VP', 'ADVP'), ('ADJP', 'VP'), ('VP', 'NP')}
    for (tag_a, text_a), (tag_b, text_b) in zip(phrases, phrases[1:]):
        if (tag_a, tag_b) == ('NP', 'VP'):
            SF.append(text_a + ' ' + text_b)
        elif (tag_a, tag_b) in sf_pairs:
            sf.append(text_a + ' ' + text_b)

    # Adjacent phrase triples; all three phrases are joined.
    sf_triples = {('VP', 'ADVP', 'ADJP'), ('VP', 'ADJP', 'NP'),
                  ('ADVP', 'ADJP', 'NP')}
    for first, second, third in zip(phrases, phrases[1:], phrases[2:]):
        if (first[0], second[0], third[0]) in sf_triples:
            sf.append(first[1] + ' ' + second[1] + ' ' + third[1])

    print(SF)
    print(sf)

    PSF, NSF = _split_by_polarity(SF)
    psf, nsf = _split_by_polarity(sf)

    print(PSF)
    print(NSF)
    print(psf)
    print(nsf)

    if (PSF and nsf) or (psf and NSF):
        return 1
    return 0


def _extract_phrases(parsed):
    """Group a ``word/POS/chunk/...`` parse string into ``(tag, text)`` phrases.

    An ``O`` token or a ``B-X`` token starts a new phrase (tagged ``O`` or
    ``X``); an ``I-X`` token extends the current phrase. Each phrase text is
    its words, each followed by a single space.
    """
    phrases = []
    tag = None
    text = ''
    # split() with no argument also splits on the newlines between sentences
    # and yields no tokens for an empty parse.
    for token in parsed.split():
        fields = token.split('/')
        if len(fields) < 3:
            # Malformed token without a chunk field; nothing to group.
            continue
        word, chunk = fields[0], fields[2]
        if chunk == 'O' or 'B-' in chunk:
            if text:
                phrases.append((tag, text))
            text = word + ' '
            tag = 'O' if chunk == 'O' else chunk.split('-')[1]
        elif 'I-' in chunk:
            text += word + ' '
            tag = chunk.split('-')[1]
    if text:
        phrases.append((tag, text))
    return phrases


def _split_by_polarity(candidates):
    """Split ``candidates`` into (positive, negative) lists by TextBlob polarity.

    Candidates whose polarity is exactly 0 are dropped.
    """
    positive = []
    negative = []
    for text in candidates:
        polarity = TextBlob(text).polarity
        if polarity > 0:
            positive.append(text)
        elif polarity < 0:
            negative.append(text)
    return positive, negative