Example #1
    def tokenize(self, text):
        '''Casual speech tokenizer wrapper function, closely based on nltk's version.
        Returns a list of words.

        :param text: tweet text
        :type text: str
        '''
        text = _replace_html_entities(text)
        if not self.preserve_handles:
            text = re.sub(TWITTER_USER_RE, ' ', text)
        if not self.preserve_hashes:
            text = re.sub(HASH_RE, '', text)
        if not self.preserve_url:
            text = re.sub(URL_RE, ' ', text)
        if not self.preserve_len:
            text = reduce_lengthening(text)
        if self.regularize:
            text = self.R.regularize(text)
        if not self.preserve_emoji:
            text = self.strip_emoji(text)
        words = self.WORD_RE.findall(text)
        if not self.preserve_case:
            # Lowercase ordinary tokens but keep emoticons intact
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
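The case-folding step at the end mirrors NLTK's TweetTokenizer: lowercase every token unless it looks like an emoticon. A minimal, self-contained sketch of just that step, using NLTK's EMOTICON_RE:

from nltk.tokenize.casual import EMOTICON_RE

words = ["LOL", ":D", "GREAT"]
# Lowercase ordinary words but leave emoticons such as :D untouched
print([w if EMOTICON_RE.search(w) else w.lower() for w in words])
# -> ['lol', ':D', 'great']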
Example #2
    def tokenize(self, text):

        text = _replace_html_entities(text)

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        # split contractions
        for regexp in self.CONTRACTIONS:
            text = regexp.sub(r' \1 \2 ', text)

        # make sure every emoticon is preceded by whitespace; iterate the
        # matches right-to-left so earlier offsets stay valid as spaces
        # are inserted
        for emoticon in list(EMOTICON_RE.finditer(text))[::-1]:
            pos = emoticon.span()[0]
            if pos > 0 and text[pos - 1] != ' ':
                text = text[:pos] + ' ' + text[pos:]

        return text.split()
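The reversed finditer pass deserves a note: walking matches right-to-left keeps earlier offsets valid while spaces are inserted. A standalone illustration using NLTK's EMOTICON_RE:

from nltk.tokenize.casual import EMOTICON_RE

text = "great:) thanks:D"
# Process matches right-to-left so inserting a space never shifts
# the span of a match that has not been handled yet
for emoticon in list(EMOTICON_RE.finditer(text))[::-1]:
    pos = emoticon.span()[0]
    if pos > 0 and text[pos - 1] != ' ':
        text = text[:pos] + ' ' + text[pos:]
print(text.split())  # ['great', ':)', 'thanks', ':D']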
Example #3
def process_headlines(row):
    headline = row['headline']
    headline = _replace_html_entities(headline)
    headline = re.sub(r'\s+', ' ', headline).strip()
    headline = ' '.join(nltk.word_tokenize(headline))
    headline = re.sub(r'\d', '%', headline)  # mask all digits
    return headline
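A quick check of what the normalization does, assuming the example's imports are in scope and NLTK's punkt data is installed:

print(process_headlines({'headline': 'Stocks &amp; bonds  gain 12%'}))
# -> 'Stocks & bonds gain %% %'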
Example #4
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters. The plain assignment
        # below deliberately overrides this, disabling the shortening;
        # remove it to re-enable the HANG_RE step.
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        safe_text = text
        # Tokenize:
        words = WORD_RE.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
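For reference, HANG_RE in nltk.tokenize.casual collapses runs of four or more identical non-alphanumeric characters down to three:

from nltk.tokenize.casual import HANG_RE

print(HANG_RE.sub(r'\1\1\1', 'wait........ what!!!!!!'))
# -> 'wait... what!!!'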
Example #5
    def tokenize(self, text):
        '''Casual speech tokenizer wrapper function for Reddit, closely based on nltk's version.
        Returns a list of words.

        :param text: reddit text
        :type text: str
        '''
        text = _replace_html_entities(text)
        if not self.preserve_handles:
            text = re.sub(REDDIT_USER_RE, ' ', text)
        if not self.preserve_hashes:
            text = re.sub(HASH_RE, '', text)
        if not self.preserve_url:
            text = re.sub(URL_RE, ' ', text)
        if not self.preserve_ellipsis:
            text = re.sub(ELLIPSIS_RE, ' ', text)
        if not self.preserve_numbers:
            text = re.sub(NUMBERS_RE, ' ', text)
        if not self.preserve_aposS:
            text = re.sub(r"""'[sS]\b""", '', text)

        if not self.preserve_len:
            text = reduce_lengthening(text)
        if self.regularize:
            text = self.R.regularize(text)
        if not self.preserve_emoji:
            text = self.strip_emoji(text)

        words = self.WORD_RE.findall(text)
        if not self.preserve_case:
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
Example #6
def tokenize_leads(row):
    leads = []
    for lead in row['all_leads']:
        lead = _replace_html_entities(lead)
        lead = re.sub(r'\s+', ' ', lead).strip()
        lead = ' '.join(word_tokenize(lead))
        lead = re.sub(r'\d', '%', lead)  # mask all digits
        leads.append(lead)
    return leads
Example #7
def replace_html_entities(txt):
    """Replace the HTML entities in text with the corresponding Unicode characters.

    Uses UTF-8 encoding.

    Args:
        txt (str): input string

    Returns:
        str: the decoded string
    """
    return casual._replace_html_entities(txt)
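Usage is straightforward; both named and numeric entities are decoded:

print(replace_html_entities('Tom &amp; Jerry &#8211; S01E01'))
# -> 'Tom & Jerry – S01E01'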
Example #8
  def text_filter(self, text):

    text = text.replace('|||', ' ')
    text = _replace_html_entities(text)

    if text and self.merge_repeated_punc:
      text = self.compiled_regexes["REPEAT_PUNCTS"].sub(
        handle_repeated_puncts, text)

    text = self.compiled_regexes["REPEAT_CHARS"].sub(
      handle_repeated_puncts, text)

    return text
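handle_repeated_puncts and the compiled patterns are defined elsewhere in that project; a minimal sketch of the idea, with a hypothetical pattern and handler:

import re

REPEAT_PUNCTS = re.compile(r'([!?.,])\1+')  # hypothetical pattern

def handle_repeated_puncts(match):
    # Collapse a run of identical punctuation to a single character
    return match.group(1)

print(REPEAT_PUNCTS.sub(handle_repeated_puncts, 'No way!!!! Really??'))
# -> 'No way! Really?'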
Example #9
def word_tknz_artsentence(row):
    sentences = row['article_sentences'].split('\n\n')
    proc_sent = []
    for sent in sentences:
        sent = _replace_html_entities(sent)
        sent = re.sub(non_bmp, ' ', sent)
        sent = re.sub(dotted, ' ', sent)
        sent = re.sub(r'^https?://.*[\r\n]*', ' ', sent, flags=re.MULTILINE)
        sent = re.sub(r'\s+', ' ', sent).strip()
        sent = ' '.join(nltk.word_tokenize(sent))
        proc_sent.append(sent)
    dgtz_sent = [re.sub(r'\d', '%', sent) for sent in proc_sent]  # mask digits
    return ' '.join(dgtz_sent)
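non_bmp and dotted are module-level patterns not shown here; plausible definitions, assumed purely for illustration:

import re

non_bmp = re.compile('[\U00010000-\U0010FFFF]')  # astral-plane characters (most emoji)
dotted = re.compile(r'\.{2,}')                   # runs of two or more dots (assumed)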
Example #10
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings

        Normalizes URLs, usernames and word lengthening depending on the
        attributes of the instance.
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)

        # Remove or replace username handles
        if self.strip_handles:
            text = remove_handles(text)
        elif self.normalize_usernames:
            text = normalize_mentions(text)

        # Normalize URLs
        if self.normalize_urls:
            text = normalize_urls(text)

        # Normalize word lengthening
        if self.reduce_len:
            text = HANG_RE.sub(r'\1\1\1', text)
            text = reduce_lengthening(text)

        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r'\1\1\1', text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)

        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        # lower words but keep words that are all upper cases
        if not self.preserve_case:
            words = [_lowerize(w, self.keep_allupper) for w in words]

        words = [_stock_code(w) for w in words]

        return words
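_lowerize and _stock_code are private helpers not shown above. Given the comment about keeping all-uppercase words, _lowerize might look like the following sketch (an assumption, not the project's actual code):

from nltk.tokenize.casual import EMOTICON_RE

def _lowerize(word, keep_allupper):
    # Assumed behavior: keep emoticons like :D as-is and, optionally,
    # preserve all-caps tokens such as tickers or acronyms
    if EMOTICON_RE.search(word) or (keep_allupper and word.isupper()):
        return word
    return word.lower()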
Example #11
    def clean(self, text):
        if not self.type_include:
            tknzr = NLTK.TweetTokenizer()
            return tknzr.tokenize(text)

        # Fix HTML character entities:
        text = NLTK._replace_html_entities(text)
        # Shorten problematic sequences of characters
        safe_text = NLTK.HANG_RE.sub(r'\1\1\1', text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)

        # Strip URLs, e-mail addresses, usernames, hashtags and phone numbers
        clean_text = text
        for x in words:
            if (URLS_RE.match(x) or EMAIL_RE.match(x) or USERNAME_RE.match(x)
                    or HASHTAG_RE.match(x) or PHONUM_RE.match(x)):
                clean_text = clean_text.replace(x, '')
        return clean_text
Example #12
    def tokenize(self, text):
        """
        :param text: str
        :rtype: list(str)
        :return: a tokenized list of strings; concatenating this list returns\
        the original string if `preserve_case=False`
        """
        # Fix HTML character entities:
        text = _replace_html_entities(text)
        # Remove username handles
        if self.strip_handles:
            text = remove_handles(text)
        # Normalize word lengthening
        if self.reduce_len:
            text = reduce_lengthening(text)
        # Shorten problematic sequences of characters
        safe_text = HANG_RE.sub(r"\1\1\1", text)
        # Tokenize, extending NLTK's patterns with Discord-style
        # :emoji:, <:emoji:id>, <a:emoji:id> and <...> mention markup
        custom_re = regex.compile(
            r"""(%s)"""
            % "|".join(
                (
                    r":[^:\s]+:",
                    r"<:[^:\s]+:[0-9]+>",
                    r"<a:[^:\s]+:[0-9]+>",
                    r"<(?:[^\d>]+|:[A-Za-z0-9]+:)\w+>",
                )
                + REGEXPS
            ),
            regex.VERBOSE | regex.I | regex.UNICODE,
        )
        words = custom_re.findall(safe_text)
        # Possibly alter the case, but avoid changing emoticons like :D into :d:
        if not self.preserve_case:
            words = [w if EMOTICON_RE.search(w) else w.lower() for w in words]
        return words
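The extra alternatives target Discord-style emoji and mention markup. A reduced, self-contained check of just those patterns:

import regex

emoji_markup = regex.compile(r'(:[^:\s]+:|<:[^:\s]+:[0-9]+>|<a:[^:\s]+:[0-9]+>)')
print(emoji_markup.findall('gg <:pog:123456> :thumbsup: <a:wave:789>'))
# -> ['<:pog:123456>', ':thumbsup:', '<a:wave:789>']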
Example #13
def process_tweets(row):
    tweet_tkns = row['tweet'].split()
    # Drop URLs, a leading 'RT', and @-mentions in the first two positions
    twt_clean = [
        tkn for idx, tkn in enumerate(tweet_tkns)
        if 'https' not in tkn and tkn != 'RT' and not (idx == 1 and '@' in tkn)
        and not (idx == 0 and '@' in tkn)
    ]
    # Drop a trailing @-mention as well
    if twt_clean and '@' in twt_clean[-1]:
        twt_clean = twt_clean[:-1]
    tweet = ' '.join(twt_clean)
    tweet = _replace_html_entities(tweet)
    reduce_lengthening_re = re.compile(r"(.)\1{2,}")
    tweet = reduce_lengthening_re.sub(r"\1\1\1", tweet)
    remove_handles_re = re.compile(
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){20}(?!@))|"
        r"(?<![A-Za-z0-9_!@#\$%&*])@(([A-Za-z0-9_]){1,19})(?![A-Za-z0-9_]*@)")
    tweet = remove_handles_re.sub(' ', tweet)
    tweet = re.sub(non_bmp, ' ', tweet)
    tweet = re.sub(dotted, ' ', tweet)
    tweet = re.sub(r'^https?://.*[\r\n]*', ' ', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    tweet = ' '.join(nltk.word_tokenize(tweet))
    tweet = re.sub(r'\d', '%', tweet)  # mask all digits
    return tweet
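The lengthening pattern keeps at most three repeats of any character:

import re

reduce_lengthening_re = re.compile(r'(.)\1{2,}')
print(reduce_lengthening_re.sub(r'\1\1\1', 'sooooo coool!!!!!'))
# -> 'sooo coool!!!'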
Example #14
    def tokenize(self, text):
        if not self.type_include:
            tknzr = NLTK.TweetTokenizer()
            return tknzr.tokenize(text)

        # Fix HTML character entities:
        text = NLTK._replace_html_entities(text)
        # Shorten problematic sequences of characters
        safe_text = NLTK.HANG_RE.sub(r'\1\1\1', text)
        # Tokenize:
        words = WORD_RE.findall(safe_text)

        # Tag each token with a coarse type code
        for i, x in enumerate(words):

            if EMOTICON_RE.match(x) or EMOJI_RE.match(x):
                words[i] = (x, 'E')
            elif URLS_RE.match(x) or EMAIL_RE.match(x):
                words[i] = (x, 'U')
            elif USERNAME_RE.match(x):
                words[i] = (x, 'USR')
            elif HASHTAG_RE.match(x):
                words[i] = (x, 'H')
            elif PHONUM_RE.match(x):
                words[i] = (x, 'PN')
            elif x.lower() in STOP:
                words[i] = (x, 'S')
            elif x in PUNCTUATION:
                words[i] = (x, 'PUNC')
            else:
                words[i] = (x, 'N')
        return words

# tz = MyTweetTokenizer()
# t ="RT @team_staystrong: #np COOL FOR THE SUMMER #DemiLovato"
# print(tz.tokenize(t))
Example #15
def unicodeReplacement(tweet):
    return _replace_html_entities(tweet)
Example #16
def preprocess(tweet_text):
    return URL_RE.sub(' ', remove_handles(_replace_html_entities(tweet_text)))
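URL_RE is not one of nltk.tokenize.casual's public names; assuming it is compiled from NLTK's URLS pattern, a usage sketch:

import regex
from nltk.tokenize.casual import URLS, _replace_html_entities, remove_handles

URL_RE = regex.compile(URLS, regex.VERBOSE | regex.I | regex.UNICODE)

print(preprocess('@user check https://t.co/abc &amp; enjoy'))
# -> '  check   & enjoy' (exact spacing depends on the patterns)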