Esempio n. 1
0
def normalizeAndSplitWithLemmatization(tweet):
    """Normalize a tweet and split it into lemmatized word tokens.

    The text is lower-cased, passed through ``ttp.escape`` and
    ``removeSpecialCharactersAndSomeURLs``, split on whitespace, and every
    word is lemmatized with a ``WordNetLemmatizer``.

    Args:
        tweet: The tweet text. A ``str`` (or ``str`` subclass) is used
            as-is; any other non-bytes value is converted with
            ``unicodeToASCII``.

    Returns:
        A list of the lemmatized words that are longer than one character,
        in their original order.

    Raises:
        Exception: If ``tweet`` is a ``bytes`` object (or a subclass).
    """
    # TODO: reject pyspark.sql.types.Row inputs explicitly. An earlier attempt
    # compared type(tweet) against a string with `is`, which can never match.

    # isinstance (rather than `type(x) is T`) so subclasses are handled too.
    if isinstance(tweet, bytes):
        print("This is bytes: ", tweet, " and we need to change it.")
        raise Exception("Tweet has type of bytes. Should have type of string.")

    if isinstance(tweet, str):
        ascii_tweet = tweet
    else:
        # Best effort: if the conversion fails, report it and continue with
        # an empty string, which makes the function return an empty list.
        ascii_tweet = ''
        try:
            ascii_tweet = unicodeToASCII(tweet)
        except AttributeError as err:
            print("Attribute Error: "+str(err))
            print(tweet)
            print("Type is: "+str(type(tweet)))

    # Clean-up pipeline: lower-case, escape, strip special characters/URLs.
    lowered = ascii_tweet.lower()
    escaped = ttp.escape(lowered)
    cleaned = removeSpecialCharactersAndSomeURLs(escaped)

    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(word) for word in cleaned.split())
    # Single-character (and empty) lemmas are dropped from the result.
    return [lemma for lemma in lemmas if len(lemma) > 1]
Esempio n. 2
0
 def format_url(self, url, text):
     """Build the HTML anchor tag for a URL.

     ``url`` is passed through ``ttp.escape`` and used as the ``href``;
     ``text`` is inserted unchanged as the visible link text. The link
     carries ``target="_blank"`` and ``rel="nofollow"``.
     """
     anchor_template = '<a target="_blank" rel="nofollow" href="{0}">{1}</a>'
     escaped_href = ttp.escape(url)
     return anchor_template.format(escaped_href, text)
Esempio n. 3
0
 def format_url(self, url, text):
     """Return formatted HTML for a url.

     The URL, after ``ttp.escape``, is used both as the ``href`` and as the
     visible link text; the link carries ``target="_blank"``.

     Args:
         url: The URL to link to.
         text: Accepted but not used by this implementation; the escaped
             URL is shown as the link text instead.

     Returns:
         The ``<a>`` element as a string.
     """
     # Escape once and reuse: both placeholders receive the same value.
     escaped_url = ttp.escape(url)
     return '<a href="%s" target="_blank">%s</a>' % (escaped_url, escaped_url)