Example #1
    def tokenize(self, text):
        '''Casual speech tokenizer wrapper function, closely based on nltk's version.
        Returns a list of words.

        :param text: tweet text
        :type text: str
        '''
        text = _replace_html_entities(text)
        if not self.preserve_handles:
            text = re.sub(TWITTER_USER_RE, ' ', text)
        if not self.preserve_hashes:
            text = re.sub(HASH_RE, '', text)
        if not self.preserve_url:
            text = re.sub(URL_RE, ' ', text)
        if not self.preserve_len:
            text = reduce_lengthening(text)
        if self.regularize:
            text = self.R.regularize(text)
        if not self.preserve_emoji:
            text = self.strip_emoji(text)
        words = self.WORD_RE.findall(text)
        if not self.preserve_case:
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
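For reference, a minimal self-contained sketch of the final case-folding step above: every token is lowercased unless it matches an emoticon, so ':D' is not turned into ':d'. The EMOTICON_RE below is a toy stand-in, not nltk's much richer pattern.

import re

# Toy emoticon pattern; nltk's real EMOTICON_RE covers far more cases.
EMOTICON_RE = re.compile(r"[:;=8][-o*']?[)\](\[dDpP/:}{@|\\]")

def fold_case(words):
    # Lowercase tokens, but leave emoticons such as ':D' untouched.
    return [w if EMOTICON_RE.search(w) else w.lower() for w in words]

print(fold_case(['SOOO', 'Cool', ':D']))  # ['sooo', 'cool', ':D']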
Example #2
    def tokenize(self, text):
        '''Casual speech tokenizer wrapper function for Reddit, closely based on nltk's version.
        Returns a list of words.

        :param text: Reddit text
        :type text: str
        '''
        text = _replace_html_entities(text)
        if not self.preserve_handles:
            text = re.sub(REDDIT_USER_RE, ' ', text)
        if not self.preserve_hashes:
            text = re.sub(HASH_RE, '', text)
        if not self.preserve_url:
            text = re.sub(URL_RE, ' ', text)
        if not self.preserve_ellipsis:
            text = re.sub(ELLIPSIS_RE, ' ', text)
        if not self.preserve_numbers:
            text = re.sub(NUMBERS_RE, ' ', text)
        if not self.preserve_aposS:
            text = re.sub(r"""'[sS]\b""", '', text)

        if not self.preserve_len:
            text = reduce_lengthening(text)
        if self.regularize:
            text = self.R.regularize(text)
        if not self.preserve_emoji:
            text = self.strip_emoji(text)

        words = self.WORD_RE.findall(text)
        if not self.preserve_case:
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
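A quick self-contained check of the preserve_aposS branch above, which strips possessive 's/'S before tokenization (stdlib re only):

import re

# Strip possessive 's / 'S, as the preserve_aposS branch does.
print(re.sub(r"'[sS]\b", '', "OP's post is Reddit'S best"))
# -> OP post is Reddit best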
Example #3
def preprocess_tweet(text):
    # Cap character runs at three (e.g. 'cooool' -> 'coool')
    text = casual.reduce_lengthening(text)
    # Apply the project-specific regex clean-up rules
    text = cleanString(setupRegexes('twitterProAna'), text)
    # Drop URL, user and list entities; keep the remaining text spans
    text = ' '.join([span for notentity, span in
                     tweetPreprocessor(text, ("urls", "users", "lists"))
                     if notentity])
    text = text.replace('\t', '')
    # Re-tighten spaced angle brackets, e.g. '< smile >' -> '<smile>'
    text = text.replace('< ', '<').replace(' >', '>')
    # Map common emoticons to placeholder tokens
    text = text.replace('):', '<sadface>').replace('(:', '<smile>')
    # Reattach split negation clitics ("don 't" -> "dont") and drop hashes
    text = text.replace(" 't", "t").replace('#', '')
    return text
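casual.reduce_lengthening here is nltk's helper, which caps runs of a repeated character at three; a small runnable check:

from nltk.tokenize.casual import reduce_lengthening

# Runs of three or more identical characters are shortened to three.
print(reduce_lengthening("waaaaaayyyy coooool"))  # waaayyy coool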
Example #4
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings;

        Normalizes URLs, usernames and word lengthening depending of the
        attributes of the instance.

        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)

        # Remove or replace username handles
        if self.strip_handles:
            text = remove_handles(text)
        elif self.normalize_usernames:
            text = normalize_mentions(text)

        if self.normalize_urls:
            # Shorten problematic sequences of characters
            text = normalize_urls(text)

        # Normalize word lengthening
        if self.reduce_len:
            text = HANG_RE.sub(r'\1\1\1', text)
            text = reduce_lengthening(text)

        # Tokenize:
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        words = WORD_RE.findall(safe_text)

        # Possibly alter the case, but avoid changing emoticons like :D into :d;
        # lowercase words, but keep words that are all upper case
        if not self.preserve_case:
            words = [_lowerize(w, self.keep_allupper) for w in words]

        words = [_stock_code(w) for w in words]

        return words
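_lowerize and _stock_code are project-local helpers not shown here. A plausible, purely hypothetical sketch of _lowerize based on the comments above (the real helper may differ):

import re

# Toy emoticon pattern standing in for the module's EMOTICON_RE.
EMOTICON_RE = re.compile(r"[:;=8][-o*']?[)\](\[dDpP/:}{@|\\]")

def _lowerize(word, keep_allupper):
    # Keep emoticons like ':D' as-is; optionally keep ALL-CAPS words.
    if EMOTICON_RE.search(word):
        return word
    if keep_allupper and word.isupper():
        return word
    return word.lower()

print([_lowerize(w, True) for w in ['NASA', 'Rocks', ':D']])
# -> ['NASA', 'rocks', ':D']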
Example #5
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Tokenize, adding patterns for emoji shortcodes and Discord-style
        # emotes and mentions ahead of nltk's standard REGEXPS:
        custom_re = regex.compile(
            r"""(%s)"""
            % "|".join(
                (
                    r":[^:\s]+:",
                    r"<:[^:\s]+:[0-9]+>",
                    r"<a:[^:\s]+:[0-9]+>",
                    r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
                )
                + REGEXPS
            ),
            regex.VERBOSE | regex.I | regex.UNICODE,
        )
        words = custom_re.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d
        if not self.preserve_case:
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
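The extra patterns prepended to REGEXPS keep Discord-style emotes and emoji shortcodes as single tokens; a self-contained check of the first three patterns (stdlib re instead of the regex package):

import re

# Match custom emotes <:name:id> / <a:name:id> and plain :name: shortcodes.
DISCORD_EMOTE_RE = re.compile(r"<a?:[^:\s]+:[0-9]+>|:[^:\s]+:")
print(DISCORD_EMOTE_RE.findall("nice :thumbsup: <:pog:123456> <a:wave:42>"))
# -> [':thumbsup:', '<:pog:123456>', '<a:wave:42>']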