def createBigramTrigram(infile, outfile, type):
    """Build a frequency-ranked list of bigrams or trigrams from a tweet file.

    Reads tweets from ``infile``, builds bigrams (``type == 'b'``) or
    trigrams (any other value) from each tweet's text, counts how often
    each n-gram occurs, and writes the (ngram, count) pairs — sorted by
    descending count — via ``write_file``.
    """
    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    # create bigram or trigram lists, one per tweet
    plt = []
    for tweet in items:
        if type == 'b':
            plt.append(bigramas(tweet['text']))
        else:
            plt.append(trigramas(tweet['text']))

    # Flatten all n-gram strings into a single space-separated string.
    # ' '.join avoids the quadratic `+=` concatenation of the original.
    parts = []
    for grams in plt:
        for gram in grams:
            parts.append(gram.replace('\n', ' ').replace('\t', ''))
    palavras = ' '.join(parts)

    # Count occurrences of each n-gram, skipping empty tokens.
    count = {}
    for word in palavras.split(" "):
        # BUG FIX: the original tested `len(word) < 0`, which is never
        # true, so empty tokens (from doubled spaces) were counted too.
        if len(word) == 0:
            continue
        count[word] = count.get(word, 0) + 1

    # sort by descending frequency and write out
    l = sorted(count.items(), key=lambda x: -x[1])
    write_file(infile, outfile, l)
def createDict(infile):
    """Count tweets per user in ``infile``.

    Returns a list of ``(user_name, tweet_count)`` tuples sorted by
    descending count.
    """
    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    # Tally tweets per user. Counter replaces the original's misuse of
    # defaultdict(list) (list factory, but int values were stored and
    # membership was checked manually anyway).
    counts = collections.Counter(tweet['user_name'] for tweet in items)

    # sorted (user, count) pairs, most frequent first
    return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
def sanitize(infile, outfile, stopwords, emoji, rt):
    """Clean every tweet in ``infile`` and write the result to ``outfile``.

    Applies quote standardization, apostrophe-s cleanup, URL/symbol
    removal, and stopword removal to each tweet's text. Emoji are
    stripped unless ``emoji`` is truthy; retweets are handed to
    ``cleaner.remove_rts`` when ``rt`` is truthy.
    """
    # initialize cleaner and load stopwords
    cleaner = TweetCleaner()
    stop_list = cleaner.load_stopwords(stopwords)

    # read file with loader module
    loader = Loader()
    items = loader.read_file(infile)

    for tweet in items:
        # run the text through the cleaning pipeline step by step
        text = cleaner.standardize_quotes(tweet['text'])
        text = cleaner.clean_apostrophe_s(text)
        text = cleaner.remove_urls(text)
        text = cleaner.remove_symbols(text)
        text = cleaner.remove_stopwords(text, stop_list)
        if not emoji:
            text = cleaner.remove_emoji(text)
        tweet['text'] = text
        # NOTE(review): remove_rts receives the full list and may mutate
        # it during iteration — preserved exactly as the original did.
        if rt:
            cleaner.remove_rts(items, tweet)

    write_file(infile, outfile, items)
def _date_range_section(items):
    """Return the date-range lines of the summary, or a warning string.

    Prefers the 'created_at' key, falls back to 'date'; assumes the file
    is ordered newest-first (first item = most recent) — TODO confirm.
    """
    for key in ('created_at', 'date'):
        if key in items[0]:
            section = "Most recent tweet: " + items[0][key] + "\n"
            section += "Oldest tweet: " + items[-1][key] + "\n"
            return section
    return ("Warning: 'created_at' or 'date' key does not exist. "
            "Date range information cannot be fetched.")


def _ranking(tokens, displaycount):
    """Format the top ``displaycount`` tokens as tab-indented lines.

    Ties are broken by descending token text, matching the original
    sort key ``(count, token)`` with reverse=True.
    """
    counts = collections.Counter(tokens)
    lines = []
    for key, value in sorted(counts.items(), reverse=True,
                             key=lambda kv: (kv[1], kv[0])):
        # BUG FIX: the original kept iterating the whole ranking after
        # displaycount entries were emitted; break out instead.
        if len(lines) >= displaycount:
            break
        lines.append('\t%s: %s\n' % (key, value))
    return ''.join(lines)


def report(infile, outfile, displaycount):
    """Write a text summary of a tweet file to ``outfile``.

    The summary includes tweet count, date range, top retweeted tweets
    (when a 'retweets' key exists), and word/user/hashtag frequency
    rankings, each truncated to ``displaycount`` entries.

    Exits the process if the tweets have no 'text' key.
    """
    # initialize cleaner and load stopwords
    cleaner = TweetCleaner()
    stopwords = cleaner.load_stopwords(['stopwords/stopwords_en.txt',
                                        'stopwords/stopwords_pt-br.txt'])

    # read file with loader module
    print('Reading file. This may take a while...')
    loader = Loader()
    items = loader.read_file(infile)
    print('File read successfully!\nProcessing the summary...')

    if 'text' not in items[0]:
        print("Warning: 'text' key is required.\nTerminating...")
        sys.exit(0)

    tweet_count = len(items)
    summary = "File name: " + infile + '\n'
    summary += "Tweet count: " + str(tweet_count) + "\n\n"
    summary += _date_range_section(items)

    username_key = get_username_key(items[0])

    if 'retweets' in items[0]:
        summary += '\nTop retweeted tweets:\n'
        cont = 0
        for tweet in sorted(items, reverse=True, key=lambda i: i['retweets']):
            # BUG FIX: the original broke at a hardcoded 10, silently
            # capping the list even when displaycount exceeded 10.
            if cont >= displaycount:
                break
            # skip retweets themselves; only original tweets are ranked
            if 'RT @' not in tweet['text']:
                summary += format_print_tweet(tweet, username_key)
                cont += 1

    # normalize every tweet's text before token extraction
    for tweet in items:
        text = cleaner.standardize_quotes(tweet['text'])
        text = cleaner.clean_apostrophe_s(text)
        text = cleaner.remove_urls(text)
        text = cleaner.remove_symbols(text)
        text = cleaner.remove_stopwords(text, stopwords)
        text = cleaner.remove_emoji(text)
        tweet['text'] = text.lower()

    # extract hashtags, user mentions, and plain words from all tweets
    hashtag_list = []
    user_list = []
    word_list = []
    for tweet in items:
        hashtag_list += re.findall(r'#\w+', tweet['text'])
        user_list += re.findall(r'@\w+', tweet['text'])
        word_list += re.findall(r'\b\w+', tweet['text'])

    summary += '\n\nWord ranking:\n\n' + _ranking(word_list, displaycount)
    summary += '\nUser ranking:\n\n' + _ranking(user_list, displaycount)
    summary += '\nHashtag ranking:\n\n' + _ranking(hashtag_list, displaycount)

    with open(outfile, 'w', encoding='utf8') as f:
        f.write(summary)
    # BUG FIX: corrected the "Succesfully" typo in the user-facing message
    print('Successfully wrote file to ' + outfile + '!')