Beispiel #1
0
def add_word_to_timeline(str_word, words_per_time, timestamp):
    """Record ``timestamp`` under the cleaned form of ``str_word``.

    The word is passed through ``remove_punctuation`` and lower-cased. It is
    then appended to ``words_per_time[word]`` (a list of timestamps, created
    on first use) unless one of the following holds:

    * ``timestamp`` equals the empty string,
    * ``remove_punctuation`` returned ``None``,
    * ``is_stopword`` reports the cleaned word as a stopword,
    * the cleaned word has fewer than two characters.

    ``words_per_time`` is modified in place; nothing is returned.
    """
    # Compare by value. The previous ``is not ''`` tested object identity,
    # which lets an empty unicode string (or any non-interned empty string)
    # slip through as if it were a real timestamp.
    if timestamp == '':
        return

    str_word = remove_punctuation(str_word)
    if str_word is None:
        return

    str_word = str_word.lower()
    if (not is_stopword(str_word)) and len(str_word) > 1:
        # setdefault creates the list the first time a word is seen.
        words_per_time.setdefault(str_word, []).append(timestamp)
def add_word_to_timeline(str_word, words_per_time, timestamp):
    """Append ``timestamp`` to the timeline of the cleaned ``str_word``.

    ``str_word`` is cleaned with ``remove_punctuation`` and lower-cased
    before being used as a key of ``words_per_time``, whose values are lists
    of timestamps. Nothing is recorded when ``timestamp`` equals ``''``,
    when ``remove_punctuation`` returns ``None``, when ``is_stopword``
    accepts the cleaned word, or when the cleaned word is shorter than two
    characters.

    ``words_per_time`` is updated in place; the function returns ``None``.
    """
    # Value comparison on purpose: ``is not ''`` checks identity and would
    # treat e.g. an empty unicode string as a valid timestamp.
    if timestamp == '':
        return

    str_word = remove_punctuation(str_word)
    if str_word is None:
        return

    str_word = str_word.lower()
    if (not is_stopword(str_word)) and len(str_word) > 1:
        # Create the timestamp list on first occurrence of the word.
        words_per_time.setdefault(str_word, []).append(timestamp)
Beispiel #3
0
def handle_common_words(str_word, dict_int_words):
    """Count one occurrence of ``str_word`` in ``dict_int_words``.

    The word is lower-cased and cleaned with ``remove_punctuation``. If the
    cleaned word is non-empty, is not a stopword according to
    ``is_stopword`` and is longer than one character, its counter in
    ``dict_int_words`` is incremented, starting from 0 when the word has not
    been seen before.

    ``dict_int_words`` is modified in place; nothing is returned.
    """
    str_word = remove_punctuation(str_word.lower())

    # After cleaning, the word may have no letters left (e.g. it was ";)").
    # A truthiness test covers '', u'' and None; the former ``is not ''``
    # identity check only matched the one interned empty str object.
    if not str_word:
        return

    if (not is_stopword(str_word)) and len(str_word) > 1:
        # .get() keeps plain dicts working for first-time words; behaviour
        # with defaultdict(int) / Counter is unchanged.
        dict_int_words[str_word] = dict_int_words.get(str_word, 0) + 1
def handle_common_words(str_word, dict_int_words):
    """Insert ``str_word`` into the word-count dictionary or bump its count.

    ``str_word`` is lower-cased and passed through ``remove_punctuation``.
    The count stored in ``dict_int_words`` under the cleaned word is
    increased by one (starting at 0 for unseen words) unless the cleaned
    word is empty, is rejected by ``is_stopword`` or has fewer than two
    characters.

    ``dict_int_words`` is modified in place; the function returns ``None``.
    """
    str_word = remove_punctuation(str_word.lower())

    # Cleaning can leave nothing behind (e.g. the word was ";)"). Test by
    # truthiness rather than ``is not ''``: identity against a literal does
    # not reliably detect empty strings and lets None through.
    if not str_word:
        return

    if (not is_stopword(str_word)) and len(str_word) > 1:
        # Works for plain dicts as well as defaultdict(int) / Counter.
        dict_int_words[str_word] = dict_int_words.get(str_word, 0) + 1
def get_words(status):
    """Return the list of cleaned, lower-cased words of ``status['text']``.

    The text is converted to unicode (with a UTF-8 encode/decode round
    trip), newline characters are removed (not replaced by a space) and the
    result is lower-cased. It is then passed through
    ``lib_text.remove_punctuation`` and
    ``lib_text.remove_punctuation_special`` and split on single spaces, so
    consecutive spaces produce empty strings in the returned list.

    Any exception is re-raised after printing the function name as a
    debugging marker.
    """
    try:
        text = str(
            unicode(status['text']).encode('utf-8')).decode("utf-8")
        text = text.replace("\n", "").lower()
        words = lib_text.remove_punctuation(text)
        words = lib_text.remove_punctuation_special(words)
        return words.split(" ")
    except Exception:
        # Single-argument parenthesised form prints the same text under
        # both the print statement and the print function.
        print('get_words')
        # Bare ``raise`` keeps the original traceback; ``raise e`` would
        # restart it at this line under Python 2.
        raise
def get_hashtags_without_accents(str_text):
    """Collect the hashtags found in ``str_text``, stripped of accents.

    The text is split on whitespace. A token counts as a hashtag when it
    starts with "#", does not end with an ellipsis character (which marks a
    truncated word) and, once lower-cased and cleaned by
    ``remove_punctuation``, is not ``None`` and is longer than one
    character. Each hit is returned as "#" followed by
    ``remove_latin_accents`` applied to the cleaned token, in order of
    appearance.
    """
    hashtags = []
    for token in str_text.split():
        if not token.startswith("#"):
            continue
        if token.endswith("…"):
            # The token was cut off, so it is not a reliable hashtag.
            continue

        cleaned = remove_punctuation(token.lower())
        if cleaned is None or len(cleaned) <= 1:
            continue

        hashtags.append("#" + remove_latin_accents(cleaned))
    return hashtags
def get_hashtags_without_accents(str_text):
    """Return every hashtag of ``str_text`` with accents removed.

    Whitespace-separated tokens that start with "#" and do not end with an
    ellipsis character (i.e. were not truncated) are lower-cased and cleaned
    with ``remove_punctuation``. Tokens whose cleaned form is ``None`` or
    has fewer than two characters are dropped; the others are emitted as
    "#" plus ``remove_latin_accents`` of the cleaned form, preserving the
    order in which they occur.
    """
    # Lazy generator: tokens are filtered one at a time while looping.
    candidates = (
        piece for piece in str_text.split()
        if piece.startswith("#") and not piece.endswith("…")
    )

    found = []
    for piece in candidates:
        stripped = remove_punctuation(piece.lower())
        if stripped is not None and len(stripped) > 1:
            found.append("#" + remove_latin_accents(stripped))
    return found
def handle_hashtags(str_hashtag, str_username, dict_set_hashtags, dict_set_hashtags_without_accents):
    """Register ``str_username`` as a user of ``str_hashtag``.

    The hashtag is lower-cased and cleaned with ``remove_punctuation``. If
    anything is left, the username is added to two dictionaries of sets:

    * ``dict_set_hashtags``, keyed by the cleaned hashtag;
    * ``dict_set_hashtags_without_accents``, keyed by
      ``remove_latin_accents`` applied to the cleaned hashtag.

    Both dictionaries are modified in place; nothing is returned.
    """
    str_hashtag = remove_punctuation(str_hashtag.lower())

    # Truthiness instead of ``is not ''``: the identity test does not
    # reliably detect empty strings (e.g. u'') and lets None through.
    if not str_hashtag:
        return

    str_hashtag_without_accents = remove_latin_accents(str_hashtag)

    # Update each dictionary independently. With one shared try/except, a
    # key missing from only one dictionary (e.g. "cafe" arriving after
    # "café") replaced the existing user set of the other with a new one.
    dict_set_hashtags.setdefault(str_hashtag, set()).add(str_username)
    dict_set_hashtags_without_accents.setdefault(
        str_hashtag_without_accents, set()).add(str_username)
Beispiel #9
0
def handle_hashtags(str_hashtag, str_username, dict_set_hashtags,
                    dict_set_hashtags_without_accents):
    """Add ``str_username`` to the set of users who tweeted ``str_hashtag``.

    ``str_hashtag`` is lower-cased and passed through
    ``remove_punctuation``; when the result is empty nothing happens.
    Otherwise the username is added to the set stored under the cleaned
    hashtag in ``dict_set_hashtags`` and to the set stored under its
    ``remove_latin_accents`` form in ``dict_set_hashtags_without_accents``.
    Missing sets are created on demand.

    Both dictionaries are modified in place; the function returns ``None``.
    """
    str_hashtag = remove_punctuation(str_hashtag.lower())

    # ``is not ''`` compared identity, not value; a truthiness test rejects
    # every empty string as well as None.
    if not str_hashtag:
        return

    str_hashtag_without_accents = remove_latin_accents(str_hashtag)

    # The dictionaries are updated separately: several hashtags can share
    # one accent-free form, and a KeyError from either lookup used to reset
    # the other dictionary's set, dropping previously recorded users.
    dict_set_hashtags.setdefault(str_hashtag, set()).add(str_username)
    dict_set_hashtags_without_accents.setdefault(
        str_hashtag_without_accents, set()).add(str_username)