def TweetRank(self, unidentified_terms):
    """Rank the tweets found for each unidentified term.

    For every term, each tweet containing it is scored on three metrics
    (retweets, favourites, author followers). Each metric is min-max
    normalised over the set of tweets retrieved for that term.

    Args:
        unidentified_terms: iterable of term strings to rank tweets for.

    Returns:
        Flat list of (tweetid, score) tuples, grouped per input term, after
        sorting/cropping by SortCropRankings. Also stored on
        self.rankings_output and self.sorted_cropped_rankings as a side
        effect.
    """
    ranked_tweets = defaultdict(list)
    # Create an indexer instance and load the tweet indexes.
    indexer = Indexer()
    indexer.LoadIndexes()
    for term in unidentified_terms:
        # Get the tweets for the current term.
        term_tweetids = indexer.GetTweetsForTerm(term)
        # Fetch each metric exactly once per tweet id. The original code
        # queried the indexer twice per tweet: once in the min/max pass and
        # again in the scoring pass.
        metrics = [(tweetid,
                    self.GetRetweetsForTweetid(indexer, tweetid),
                    self.GetFavsForTweetid(indexer, tweetid),
                    self.GetFollowersForTweetid(indexer, tweetid))
                   for tweetid in term_tweetids]
        if not metrics:
            # No tweets for this term: nothing to normalise or score.
            continue
        # Normalisation bounds per metric. Using true min()/max() instead of
        # the old 0 / 100000 sentinels, which produced a wrong minimum for
        # any metric whose smallest value exceeded 100000 (easily the case
        # for author follower counts).
        rts = [m[1] for m in metrics]
        favs = [m[2] for m in metrics]
        afs = [m[3] for m in metrics]
        max_rt, min_rt = max(rts), min(rts)
        max_fav, min_fav = max(favs), min(favs)
        max_af, min_af = max(afs), min(afs)
        # Score every tweet for this term using the normalisation bounds.
        for tweetid, rt, fav, af in metrics:
            tweet_term_score = self.GetScoreForTermTweetid(
                rt, max_rt, min_rt, fav, max_fav, min_fav, af, max_af, min_af)
            ranked_tweets[term].append((tweetid, tweet_term_score))
    # Sort descending on score and keep only the last x results per term.
    self.sorted_cropped_rankings = self.SortCropRankings(ranked_tweets, unidentified_terms)
    # Flatten the per-term rankings into one output list, preserving the
    # input term order.
    self.rankings_output = []
    for ut in unidentified_terms:
        self.rankings_output.extend(self.sorted_cropped_rankings[ut])
    return self.rankings_output
class NoiseFilter:
    """
    NoiseFilter takes a list of Unidentified Terms (UTs), filters out noise
    (garbage) and returns a (noise)filtered list of UTs.
    """

    # Regexes to use as a filter. Raw strings so backslashes reach the regex
    # engine unmodified (the original non-raw "\[", "\-" etc. are invalid
    # string escapes and trigger SyntaxWarnings on modern Python); the
    # matched patterns are unchanged.
    ONLYNUM = r"^[0-9]*$"          # digits only (also matches the empty string)
    SPECIAL = r"[/_$&+,:;{}\"=?\[\]@#|~'<>^*()%!]"   # special characters
    NON_ASCII = r"[^\x00-\x7F]"    # any character outside 7-bit ASCII
    PUNCT = r"[.?\-\",]"           # punctuation characters
    CONSONANT_4 = r"[bBcCdDfFgGhHjJkKlLmMnNpPqQrRsStTvVwWxXyYzZ]{4}"  # 4 consecutive consonants
    VOWEL_4 = r"[aAeEiIoOuU]{4}"   # 4 consecutive vowels

    def __init__(self, args):
        """Store CLI arguments, prepare output filenames and load indexes.

        Args:
            args: parsed argparse-style namespace with .action, .file and
                .idf_factor attributes, or None when used programmatically.
        """
        self.args = args
        # Recognised noisefilter actions. NOTE: this must be a real tuple —
        # the original ('filter') is just the string 'filter', which turned
        # the membership test below into a substring check (accepting e.g.
        # 'fil') and made ', '.join() emit "f, i, l, t, e, r".
        self.ACTIONS = ('filter',)
        # Output filenames for each filter stage.
        self.output_filename = "ut_filtered.txt"
        self.output_filename_regex = "ut_filtered_regex.txt"
        self.output_filename_idf = "ut_filtered_idf.txt"
        self.output_filename_noise = "ut_noise.txt"
        self.output_filename_noise_regex = "ut_noise_regex.txt"
        self.output_filename_noise_idf = "ut_noise_idf.txt"
        # Result lists populated by FilterNoise().
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []
        # Parse the noisefilter action.
        if args is not None:
            if not args.action[0] in self.ACTIONS:
                error("Action not recognized, try: %s" % ', '.join(self.ACTIONS))
            self.ACTION = args.action[0]
        else:
            # NOTE(review): self.ACTION stays unset on this path, so a later
            # PerformAction() call would raise AttributeError — callers are
            # expected to invoke FilterNoise() directly instead.
            print("No args supplied to NoiseFilter.")
        # Create an indexer instance and load the tweet indexes.
        self.indexer = Indexer()
        self.indexer.LoadIndexes()

    def PerformAction(self):
        """Dispatch the action parsed from the command line."""
        if self.ACTION == 'filter':
            self.FilterNoiseFromFile(self.args.file)

    @staticmethod
    def _WriteTerms(filename, terms):
        """Write one term per line to *filename* (overwrites the file)."""
        with open(filename, 'w') as fd:
            for term in terms:
                fd.write(term + '\n')

    def FilterNoiseFromFile(self, fname):
        """Read candidate terms from *fname* and write all filter outputs.

        The input file is tab-separated; only the first column (the term)
        of each line is used. Results of the regex filter, the IDF filter
        and their combination are written to the six configured output
        files.
        """
        # Make sure the file exists before reading it.
        if not os.path.isfile(fname):
            error("Provided file does not exist?")
        unfiltered_terms = set()
        # Read the file and collect the unique terms (first column).
        with open(fname) as fd:
            for line in fd:
                term = line.strip().split('\t')[0]
                unfiltered_terms.add(term)
        self.FilterNoise(unfiltered_terms, self.args.idf_factor)
        # Persist every result list; one helper call per output file
        # replaces six copy-pasted write loops.
        self._WriteTerms(self.output_filename_regex, self.filtered_terms_regex)
        self._WriteTerms(self.output_filename_noise_regex, self.noise_terms_regex)
        self._WriteTerms(self.output_filename_idf, self.filtered_terms_idf)
        self._WriteTerms(self.output_filename_noise_idf, self.noise_terms_idf)
        self._WriteTerms(self.output_filename, self.combined_filtered_terms)
        self._WriteTerms(self.output_filename_noise, self.combined_noise_terms)

    def FilterNoise(self, unfiltered_input, idf_factor):
        """Split *unfiltered_input* into valid UTs and noise.

        Applies a regex-based filter and an IDF-based filter independently,
        then intersects the two accepted sets.

        Args:
            unfiltered_input: iterable of candidate term strings.
            idf_factor: multiplier on the collection IDF baseline; terms
                with an IDF above idf_factor * log(doccount) count as noise.

        Returns:
            The combined (regex AND IDF accepted) list of terms. The
            individual result lists are stored on self as a side effect.
        """
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []
        # The IDF baseline depends only on the collection size, so compute
        # it once instead of per term (the original recomputed it on every
        # loop iteration). idf_base is the IDF of a term that appears in
        # exactly one document of the whole collection; values at or below
        # idf_factor * idf_base can be valid UTs.
        doccount = len(self.indexer.index_tweets)
        if doccount > 0:
            idf_base = math.log(float(doccount))
        else:
            idf_base = 100.0
            print("Tried to take the log of a <= 0 doccount! Was: ", doccount)
        threshold_idf = idf_factor * idf_base
        for term in unfiltered_input:
            self.unfiltered_terms.append(term)
            # Count punctuation once; the original ran re.findall twice.
            punct_count = len(re.findall(NoiseFilter.PUNCT, term))
            # Applied regex filters (a term matching any of them is noise):
            # 1. terms shorter than 3 or longer than 6 characters
            # 2. terms containing non-ascii characters
            # 3. terms containing special characters
            # 4. terms consisting only of numbers
            # 5. terms having more punctuation than other characters
            # 6. four or more consecutive vowels or consonants
            if len(term) < 3 or len(term) >= 7 \
                    or re.search(NoiseFilter.NON_ASCII, term) is not None \
                    or re.search(NoiseFilter.SPECIAL, term) is not None \
                    or re.search(NoiseFilter.ONLYNUM, term) is not None \
                    or punct_count > (len(term) - punct_count) \
                    or re.search(NoiseFilter.VOWEL_4, term) is not None \
                    or re.search(NoiseFilter.CONSONANT_4, term) is not None:
                self.noise_terms_regex.append(term)
            else:
                self.filtered_terms_regex.append(term)
            # IDF filter: low IDF (frequent enough in the collection) is
            # accepted, high IDF is treated as noise.
            idf = self.indexer.GetIDFForTerm(term)
            if idf <= threshold_idf:
                self.filtered_terms_idf.append(term)
            else:
                self.noise_terms_idf.append(term)
        # A term is accepted only when both filters accepted it.
        self.combined_filtered_terms = intersect(self.filtered_terms_regex, self.filtered_terms_idf)
        self.combined_noise_terms = diff(self.unfiltered_terms, self.combined_filtered_terms)
        print('Input Terms: ' + str(len(self.unfiltered_terms)))
        print('Unidentified Terms Regex: ' + str(len(self.filtered_terms_regex)))
        print('Noisy Terms Regex: ' + str(len(self.noise_terms_regex)))
        print('Unidentified Terms IDF: ' + str(len(self.filtered_terms_idf)))
        print('Noisy Terms IDF: ' + str(len(self.noise_terms_idf)))
        print('Combined Unidentified Terms: ' + str(len(self.combined_filtered_terms)))
        print('Combined Noisy Terms: ' + str(len(self.combined_noise_terms)))
        # This is the list we use as a result.
        return self.combined_filtered_terms
output_extension = ".txt" # Input files: input_tweets_filename = "normalized_filtered_tweets.txt" tweet_text_column_index = 5 input_unidentified_terms_filename = "ut_output.txt" stopwords_set = set(stopwords.words('english')) tweets = [] unidentified_terms = set() # The minimum number of times a words should co-occur with a UT before we record it. minimal_nr_occurrences = 60 # The maximum nr of co-occurring words that we record per UT. max_words = 5 tweet_indexer = Indexer() tweet_indexer.LoadIndexes() # First we read in all the tweets. if os.path.isfile(input_tweets_filename): with open(input_tweets_filename) as tweetsfile: lines = tweetsfile.readlines() for line in lines: linedata = line.strip().split('\t') if len(linedata) >= tweet_text_column_index + 1: tweet_text = set(linedata[tweet_text_column_index].strip().split(' ')) tweet_text = tweet_text - stopwords_set tweets.append(tweet_text) else: print(str(linedata)) print("Invalid file format!")