def normalizeAndSplitWithLemmatization(tweet):
    """Normalize a tweet to lowercase ASCII text and return its lemmatized tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text. ``bytes`` input is rejected outright; any other
        non-``str`` input is converted via ``unicodeToASCII`` on a
        best-effort basis.

    Returns
    -------
    list[str]
        Lemmatized words longer than one character, in original order.

    Raises
    ------
    Exception
        If ``tweet`` is a ``bytes`` object instead of a decoded string.
    """
    # isinstance is the correct type check, not `type(x) is ...`.
    if isinstance(tweet, bytes):
        print("This is bytes: ", tweet, " and we need to change it.")
        raise Exception("Tweet has type of bytes. Should have type of string.")

    if isinstance(tweet, str):
        ascii_tweet = tweet
    else:
        # Best-effort conversion: on failure we log the problem and fall
        # back to the empty string so the pipeline keeps running.
        ascii_tweet = ''
        try:
            ascii_tweet = unicodeToASCII(tweet)
        except AttributeError as err:
            print("Attribute Error: " + str(err))
            print(tweet)
            print("Type is: " + str(type(tweet)))

    # Lowercase, escape via the twitter-text parser, then strip special
    # characters and URLs before tokenizing on whitespace.
    lcase_tweet = ascii_tweet.lower()
    parsed_tweet = ttp.escape(lcase_tweet)
    nospec_tweet = removeSpecialCharactersAndSomeURLs(parsed_tweet)

    # Lemmatize each token; keep only lemmas longer than one character.
    wordnet_lemmatizer = WordNetLemmatizer()
    return [
        lemma
        for lemma in (wordnet_lemmatizer.lemmatize(w) for w in nospec_tweet.split())
        if len(lemma) > 1
    ]
def format_url(self, url, text):
    """Render *url* as an HTML anchor whose visible label is *text*.

    The href is run through ``ttp.escape``; the link opens in a new tab
    and carries ``rel="nofollow"`` so crawlers do not follow it.
    """
    escaped_href = ttp.escape(url)
    return '<a target="_blank" rel="nofollow" href="{0}">{1}</a>'.format(escaped_href, text)
def format_url(self, url, text):
    """Render *url* as an HTML anchor that opens in a new tab.

    NOTE(review): the *text* argument is ignored here — the escaped URL
    is used both as the href and as the visible link text. Confirm this
    is intentional (the sibling formatter does use *text* as the label).
    """
    href = ttp.escape(url)
    label = ttp.escape(url)
    return '<a href="%s" target="_blank">%s</a>' % (href, label)