Example #1
    # Requires: from collections import defaultdict
    def TweetRank(self, unidentified_terms):
        ranked_tweets = defaultdict(list)

        # Create an instance of the indexer
        indexer = Indexer()
        # Load the tweet indexes
        indexer.LoadIndexes()

        for term in unidentified_terms:
            # Get the ids of the tweets containing the term
            term_tweetids = indexer.GetTweetsForTerm(term)

            # The retweet, favorite and author-follower counts need to be
            # normalized, which requires the maximum and minimum of each
            # metric. The first loop finds those bounds.
            max_rt = 0
            min_rt = float('inf')
            max_fav = 0
            min_fav = float('inf')
            max_af = 0
            min_af = float('inf')

            for tweetid in term_tweetids:
                tweetid_rt = self.GetRetweetsForTweetid(indexer, tweetid)
                if tweetid_rt > max_rt:
                    max_rt = tweetid_rt
                if tweetid_rt < min_rt:
                    min_rt = tweetid_rt

                tweetid_fav = self.GetFavsForTweetid(indexer, tweetid)
                if tweetid_fav > max_fav:
                    max_fav = tweetid_fav
                if tweetid_fav < min_fav:
                    min_fav = tweetid_fav

                tweetid_af = self.GetFollowersForTweetid(indexer, tweetid)

                if tweetid_af > max_af:
                    max_af = tweetid_af
                if tweetid_af < min_af:
                    min_af = tweetid_af
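
            # (Equivalently, the metrics could be fetched once per tweet here
            # and the bounds derived with Python's builtin min() and max().)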

            # Second loop uses the retrieved max and min of each metric to calculate a normalized score
            # of each tweet for that term
            for tweetid in term_tweetids:
                # For every tweet id get the number of retweets, favorites and author followers
                rt = self.GetRetweetsForTweetid(indexer, tweetid)
                fav = self.GetFavsForTweetid(indexer, tweetid)
                af = self.GetFollowersForTweetid(indexer, tweetid)

                tweet_term_score = self.GetScoreForTermTweetid(rt, max_rt, min_rt, fav, max_fav, min_fav,
                                                               af, max_af, min_af)

                ranked_tweets[term].append((tweetid, tweet_term_score))

        # Sort the rankings descending on score and keep only the top x results
        self.sorted_cropped_rankings = self.SortCropRankings(ranked_tweets, unidentified_terms)

        self.rankings_output = list()
        for ut in unidentified_terms:
            self.rankings_output.extend(self.sorted_cropped_rankings[ut])

        return self.rankings_output
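
GetScoreForTermTweetid is not shown in this example. A minimal sketch of what it presumably computes, assuming each metric is min-max normalized into [0, 1] and the three normalized values are averaged with equal weights (the real weighting may differ):

    def GetScoreForTermTweetid(self, rt, max_rt, min_rt, fav, max_fav, min_fav,
                               af, max_af, min_af):
        def normalize(value, maximum, minimum):
            # Guard against a zero range when all tweets share the same value
            if maximum == minimum:
                return 0.0
            return float(value - minimum) / float(maximum - minimum)

        # Equal weights for retweets, favorites and author followers
        return (normalize(rt, max_rt, min_rt) +
                normalize(fav, max_fav, min_fav) +
                normalize(af, max_af, min_af)) / 3.0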
Example #2
import math
import os
import re

# Indexer, error, intersect and diff are assumed to come from the project's
# own modules.


class NoiseFilter:
    """
    NoiseFilter takes a list of Unidentified Terms (UTs), filters out noise
    (garbage) and returns a noise-filtered list of UTs.
    """
    # Regexes to use as a filter (raw strings, so the backslashes reach the
    # regex engine intact)
    ONLYNUM = r"^[0-9]*$"
    SPECIAL = r"[/_$&+,:;{}\"=?\[\]@#|~'<>^*()%!]"
    NON_ASCII = r"[^\x00-\x7F]"
    PUNCT = r"[.?\-\",]"
    CONSONANT_4 = r"[bBcCdDfFgGhHjJkKlLmMnNpPqQrRsStTvVwWxXyYzZ]{4}"
    VOWEL_4 = r"[aAeEiIoOuU]{4}"
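
    # For illustration (these sample terms are ours, not from the original
    # code): "12345" matches ONLYNUM, "héllo" matches NON_ASCII, "c@t"
    # matches SPECIAL, "a.-" trips the punctuation-ratio check, "queueing"
    # matches VOWEL_4 and "strngly" matches CONSONANT_4.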

    def __init__(self, args):

        self.args = args

        # Define noisefilter actions; the trailing comma makes this a tuple
        # rather than a plain string, so membership tests work as intended
        self.ACTIONS = ('filter',)

        # instance variables
        self.output_filename = "ut_filtered.txt"
        self.output_filename_regex = "ut_filtered_regex.txt"
        self.output_filename_idf = "ut_filtered_idf.txt"

        self.output_filename_noise = "ut_noise.txt"
        self.output_filename_noise_regex = "ut_noise_regex.txt"
        self.output_filename_noise_idf = "ut_noise_idf.txt"

        # global vars to store output of FilterNoise
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        # parse noisefilter action
        self.ACTION = None
        if args is not None:
            if args.action[0] not in self.ACTIONS:
                error("Action not recognized, try: %s" %
                      ', '.join(self.ACTIONS))
            self.ACTION = args.action[0]
        else:
            print("No args supplied to NoiseFilter.")

        # Create an instance of the indexer
        self.indexer = Indexer()
        # Load the tweet indexes
        self.indexer.LoadIndexes()

    def PerformAction(self):
        if self.ACTION == 'filter':
            self.FilterNoiseFromFile(self.args.file)

    def FilterNoiseFromFile(self, fname):
        # make sure file exists
        if not os.path.isfile(fname):
            error("Provided file does not exist?")

        unfiltered_terms = set()

        # read file and iterate over the lines
        with open(fname) as fd:
            lines = fd.readlines()
            for line in lines:
                term = line.strip().split('\t')[0]
                unfiltered_terms.add(term)
            self.FilterNoise(unfiltered_terms, self.args.idf_factor)
            # Write each output file: (filename, list of terms) pairs for the
            # regex, IDF and combined results plus their noise counterparts
            outputs = [
                (self.output_filename_regex, self.filtered_terms_regex),
                (self.output_filename_noise_regex, self.noise_terms_regex),
                (self.output_filename_idf, self.filtered_terms_idf),
                (self.output_filename_noise_idf, self.noise_terms_idf),
                (self.output_filename, self.combined_filtered_terms),
                (self.output_filename_noise, self.combined_noise_terms),
            ]
            for output_fname, terms in outputs:
                with open(output_fname, 'w') as outputfile:
                    for term in terms:
                        outputfile.write(term + '\n')

    def FilterNoise(self, unfiltered_input, idf_factor):
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        for term in unfiltered_input:

            self.unfiltered_terms.append(term)

            # Applied filters:
            # 1. terms shorter than 3 or longer than 6 characters
            # 2. terms containing non-ASCII characters
            # 3. terms containing special characters
            # 4. terms consisting only of numbers
            # 5. terms with more punctuation characters than other characters
            # 6. terms with four or more consecutive vowels or consonants

            if len(term) < 3 or len(term) >= 7 \
                    or re.search(NoiseFilter.NON_ASCII, term) is not None \
                    or re.search(NoiseFilter.SPECIAL, term) is not None \
                    or re.search(NoiseFilter.ONLYNUM, term) is not None \
                    or len(re.findall(NoiseFilter.PUNCT, term)) > (len(term) - len(re.findall(NoiseFilter.PUNCT, term))) \
                    or re.search(NoiseFilter.VOWEL_4, term) is not None \
                    or re.search(NoiseFilter.CONSONANT_4, term) is not None:
                self.noise_terms_regex.append(term)
            else:
                self.filtered_terms_regex.append(term)

            # Get the IDF value for the term. idf_base is the IDF of a term
            # that appears in only one document in the whole collection, i.e.
            # the highest possible IDF. Terms whose IDF is at or below the
            # scaled threshold are kept as valid UTs; the rest are noise.
            idf = self.indexer.GetIDFForTerm(term)
            doccount = len(self.indexer.index_tweets)
            if doccount > 0:
                idf_base = math.log(float(doccount))
            else:
                idf_base = 100.0
                print("Tried to take the log of a <= 0 doccount! Was: ",
                      doccount)
            threshold_idf = idf_factor * idf_base
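
            # A worked example, assuming the standard idf = log(N / df):
            # with N = 10000 documents, idf_base = log(10000) ~ 9.21 (the IDF
            # of a term with df = 1); with idf_factor = 0.9 the threshold is
            # ~8.29, which keeps terms appearing in at least 3 documents.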

            if idf <= threshold_idf:
                self.filtered_terms_idf.append(term)
            else:
                self.noise_terms_idf.append(term)

        self.combined_filtered_terms = intersect(self.filtered_terms_regex,
                                                 self.filtered_terms_idf)
        self.combined_noise_terms = diff(self.unfiltered_terms,
                                         self.combined_filtered_terms)

        print('Input Terms: ' + str(len(self.unfiltered_terms)))
        print('Unidentified Terms Regex: ' +
              str(len(self.filtered_terms_regex)))
        print('Noisy Terms Regex: ' + str(len(self.noise_terms_regex)))
        print('Unidentified Terms IDF: ' + str(len(self.filtered_terms_idf)))
        print('Noisy Terms IDF: ' + str(len(self.noise_terms_idf)))
        print('Combined Unidentified Terms: ' +
              str(len(self.combined_filtered_terms)))
        print('Combined Noisy Terms: ' + str(len(self.combined_noise_terms)))

        # This is the list we use as a result.
        return self.combined_filtered_terms
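
The intersect and diff helpers used at the end of FilterNoise are not shown in this example. A minimal order-preserving sketch of what they presumably do:

def intersect(a, b):
    # Terms that survive both filters, in the order they appear in a
    b_set = set(b)
    return [x for x in a if x in b_set]

def diff(a, b):
    # Terms in a that are absent from b, i.e. everything rejected
    b_set = set(b)
    return [x for x in a if x not in b_set]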
Example #3
import os

from nltk.corpus import stopwords

# Indexer is assumed to come from the project's own module; the NLTK
# stopwords corpus must be available (nltk.download('stopwords')).

output_extension = ".txt"
# Input files:
input_tweets_filename = "normalized_filtered_tweets.txt"
tweet_text_column_index = 5
input_unidentified_terms_filename = "ut_output.txt"

stopwords_set = set(stopwords.words('english'))
tweets = []
unidentified_terms = set()
# The minimum number of times a word must co-occur with a UT before we record it.
minimal_nr_occurrences = 60
# The maximum number of co-occurring words that we record per UT.
max_words = 5

tweet_indexer = Indexer()
tweet_indexer.LoadIndexes()

# First we read in all the tweets.
if os.path.isfile(input_tweets_filename):
    with open(input_tweets_filename) as tweetsfile:
        lines = tweetsfile.readlines()
        for line in lines:
            linedata = line.strip().split('\t')
            if len(linedata) >= tweet_text_column_index + 1:
                tweet_text = set(linedata[tweet_text_column_index].strip().split(' '))
                tweet_text = tweet_text - stopwords_set
                tweets.append(tweet_text)
            else:
                print("Invalid file format!")
                print(str(linedata))