def add_word_to_timeline(str_word, words_per_time, timestamp):
    """
    Records that a word occurred at the given timestamp.

    The word is cleaned with remove_punctuation and lower-cased. It is stored
    only when it is longer than one character and is_stopword() is false for
    it. Nothing is recorded when timestamp is an empty string or when the
    cleaning step returns None.

    str_word: the raw word.
    words_per_time: dictionary mapping each word to the list of timestamps
        at which it was seen; updated in place.
    timestamp: value appended to the list of the word.
    """
    # Compare by value: an identity test ("is not ''") lets through empty
    # strings that are not the very same object as the literal (e.g. u'').
    if timestamp == '':
        return

    str_word = remove_punctuation(str_word)
    if str_word is None:
        return

    str_word = str_word.lower()
    if len(str_word) > 1 and not is_stopword(str_word):
        words_per_time.setdefault(str_word, []).append(timestamp)
def handle_common_words(str_word, dict_int_words):
    """
    Inserts a word in the dictionary of word counts, or increments its count
    if the word was already present.

    The word is lower-cased and cleaned with remove_punctuation. It is counted
    only when the cleaned word is longer than one character and is_stopword()
    is false for it.

    str_word: the raw word.
    dict_int_words: dictionary mapping each word to its count; updated in
        place. A plain dict works as well as a defaultdict/Counter.
    """
    str_word = remove_punctuation(str_word.lower())

    # After the word was cleaned it may have no letters left, i.e. if the word
    # was ";)". The truthiness test covers both '' and None; the previous
    # identity test ("is not ''") compared objects rather than values.
    if not str_word or len(str_word) <= 1:
        return

    if not is_stopword(str_word):
        # .get() inserts unseen words instead of raising KeyError.
        dict_int_words[str_word] = dict_int_words.get(str_word, 0) + 1
def get_words(status):
    """
    Returns the words of a status as a list.

    The value of status['text'] is converted to unicode, newlines are removed
    and the text is lower-cased. It is then passed through
    lib_text.remove_punctuation and lib_text.remove_punctuation_special and
    split on single spaces.

    status: mapping with a 'text' entry.

    Any exception is reported by printing 'get_words' and then re-raised.
    """
    try:
        # unicode() alone is enough here: encoding the result to UTF-8 and
        # decoding it straight back yields the same unicode string.
        text = unicode(status['text']).replace("\n", "").lower()
        words = lib_text.remove_punctuation(text)
        words = lib_text.remove_punctuation_special(words)
        return words.split(" ")
    except Exception:
        print('get_words')
        # Bare raise keeps the original traceback ("raise e" resets it in
        # Python 2).
        raise
def get_hashtags_without_accents(str_text):
    """
    Collects the hashtags of a string, with latin accents removed.

    A whitespace-separated word is a hashtag when it starts with "#", does not
    end with "…" and, after being lower-cased and cleaned with
    remove_punctuation, is longer than one character. Each result is "#"
    followed by remove_latin_accents() of the cleaned word.
    """
    # NOTE(review): a function with this same name is defined again further
    # down in this file; the later definition is the one that takes effect.
    hashtags = []
    for token in str_text.split():
        if not token.startswith("#"):
            continue
        if token.endswith("…"):
            # a trailing ellipsis means the word was truncated
            continue
        cleaned = remove_punctuation(token.lower())
        if cleaned is None or len(cleaned) <= 1:
            continue
        hashtags.append("#" + remove_latin_accents(cleaned))
    return hashtags
def get_hashtags_without_accents(str_text):
    """
    Returns every hashtag of the given string, stripped of latin accents.

    Words are taken from str_text.split(). Only words that start with "#" and
    do not end with "…" (the mark of a truncated word) are considered. A word
    is kept when its lower-cased, remove_punctuation-cleaned form is not None
    and has more than one character; it is returned as "#" plus the
    remove_latin_accents() form of the cleaned word.
    """
    candidates = [
        piece for piece in str_text.split()
        if piece.startswith("#") and not piece.endswith("…")
    ]

    result = []
    for candidate in candidates:
        stripped = remove_punctuation(candidate.lower())
        if stripped is not None and len(stripped) > 1:
            result.append("#" + remove_latin_accents(stripped))
    return result
def handle_hashtags(str_hashtag, str_username, dict_set_hashtags,
                    dict_set_hashtags_without_accents):
    """
    Adds a hashtag to the hashtags dictionaries.

    Each entry contains the set of users that tweeted the key hashtag. The
    hashtag is lower-cased and cleaned with remove_punctuation; nothing is
    recorded when the cleaned hashtag is empty.

    str_hashtag: the raw hashtag.
    str_username: user to add to the hashtag's set.
    dict_set_hashtags: dictionary keyed by the cleaned hashtag; updated in
        place.
    dict_set_hashtags_without_accents: dictionary keyed by
        remove_latin_accents() of the cleaned hashtag; updated in place.
    """
    # NOTE(review): a function with this same name is defined again further
    # down in this file; the later definition is the one that takes effect.
    str_hashtag = remove_punctuation(str_hashtag.lower())

    # The truthiness test covers both '' and None; the previous identity test
    # ("is not ''") compared objects rather than values.
    if not str_hashtag:
        return

    str_hashtag_without_accents = remove_latin_accents(str_hashtag)

    # Update each dictionary on its own. With one shared try/except, a key
    # missing from only one dictionary (e.g. "café" arriving after "cafe")
    # replaced the existing user sets instead of extending them.
    dict_set_hashtags.setdefault(str_hashtag, set()).add(str_username)
    dict_set_hashtags_without_accents.setdefault(
        str_hashtag_without_accents, set()).add(str_username)
def handle_hashtags(str_hashtag, str_username, dict_set_hashtags,
                    dict_set_hashtags_without_accents):
    """
    Adds a hashtag to the hashtags dictionaries.

    Each entry contains the set of users that tweeted the key hashtag. The
    hashtag is lower-cased and cleaned with remove_punctuation; an empty
    result is ignored.

    str_hashtag: the raw hashtag.
    str_username: user to register under the hashtag.
    dict_set_hashtags: dictionary from cleaned hashtag to set of users;
        updated in place.
    dict_set_hashtags_without_accents: dictionary from the
        remove_latin_accents() form of the cleaned hashtag to set of users;
        updated in place.
    """
    str_hashtag = remove_punctuation(str_hashtag.lower())

    # Value-based emptiness check (also skips None); "is not ''" only tested
    # object identity.
    if not str_hashtag:
        return

    str_hashtag_without_accents = remove_latin_accents(str_hashtag)

    # Each dictionary is updated independently. Previously a KeyError raised
    # by either lookup overwrote the entries of both dictionaries, dropping
    # users that were already recorded (e.g. "café" seen after "cafe").
    for mapping, key in (
            (dict_set_hashtags, str_hashtag),
            (dict_set_hashtags_without_accents, str_hashtag_without_accents)):
        mapping.setdefault(key, set()).add(str_username)