def TweetRank(self, unidentified_terms):
    """Rank the tweets found for each unidentified term.

    For every term, each tweet containing it is scored on three metrics
    (retweets, favourites, author followers). Each metric is min-max
    normalised over the set of tweets retrieved for that term.

    Args:
        unidentified_terms: iterable of term strings to rank tweets for.

    Returns:
        Flat list of (tweetid, score) tuples, grouped per input term, after
        sorting/cropping by SortCropRankings. Also stored on
        self.rankings_output and self.sorted_cropped_rankings as a side
        effect.
    """
    ranked_tweets = defaultdict(list)
    # Create an indexer instance and load the tweet indexes.
    indexer = Indexer()
    indexer.LoadIndexes()
    for term in unidentified_terms:
        # Get the tweets for the current term.
        term_tweetids = indexer.GetTweetsForTerm(term)
        # Fetch each metric exactly once per tweet id. The original code
        # queried the indexer twice per tweet: once in the min/max pass and
        # again in the scoring pass.
        metrics = [(tweetid,
                    self.GetRetweetsForTweetid(indexer, tweetid),
                    self.GetFavsForTweetid(indexer, tweetid),
                    self.GetFollowersForTweetid(indexer, tweetid))
                   for tweetid in term_tweetids]
        if not metrics:
            # No tweets for this term: nothing to normalise or score.
            continue
        # Normalisation bounds per metric. Using true min()/max() instead of
        # the old 0 / 100000 sentinels, which produced a wrong minimum for
        # any metric whose smallest value exceeded 100000 (easily the case
        # for author follower counts).
        rts = [m[1] for m in metrics]
        favs = [m[2] for m in metrics]
        afs = [m[3] for m in metrics]
        max_rt, min_rt = max(rts), min(rts)
        max_fav, min_fav = max(favs), min(favs)
        max_af, min_af = max(afs), min(afs)
        # Score every tweet for this term using the normalisation bounds.
        for tweetid, rt, fav, af in metrics:
            tweet_term_score = self.GetScoreForTermTweetid(
                rt, max_rt, min_rt, fav, max_fav, min_fav, af, max_af, min_af)
            ranked_tweets[term].append((tweetid, tweet_term_score))
    # Sort descending on score and keep only the last x results per term.
    self.sorted_cropped_rankings = self.SortCropRankings(ranked_tweets, unidentified_terms)
    # Flatten the per-term rankings into one output list, preserving the
    # input term order.
    self.rankings_output = []
    for ut in unidentified_terms:
        self.rankings_output.extend(self.sorted_cropped_rankings[ut])
    return self.rankings_output
class NoiseFilter:
    """
    NoiseFilter takes a list of Unidentified Terms (UTs), filters out noise
    (garbage) and returns a (noise)filtered list of UTs.
    """

    # Regexes to use as a filter. Raw strings so backslashes reach the regex
    # engine unmodified (the original non-raw "\[", "\-" etc. are invalid
    # string escapes and trigger SyntaxWarnings on modern Python); the
    # matched patterns are unchanged.
    ONLYNUM = r"^[0-9]*$"          # digits only (also matches the empty string)
    SPECIAL = r"[/_$&+,:;{}\"=?\[\]@#|~'<>^*()%!]"   # special characters
    NON_ASCII = r"[^\x00-\x7F]"    # any character outside 7-bit ASCII
    PUNCT = r"[.?\-\",]"           # punctuation characters
    CONSONANT_4 = r"[bBcCdDfFgGhHjJkKlLmMnNpPqQrRsStTvVwWxXyYzZ]{4}"  # 4 consecutive consonants
    VOWEL_4 = r"[aAeEiIoOuU]{4}"   # 4 consecutive vowels

    def __init__(self, args):
        """Store CLI arguments, prepare output filenames and load indexes.

        Args:
            args: parsed argparse-style namespace with .action, .file and
                .idf_factor attributes, or None when used programmatically.
        """
        self.args = args
        # Recognised noisefilter actions. NOTE: this must be a real tuple —
        # the original ('filter') is just the string 'filter', which turned
        # the membership test below into a substring check (accepting e.g.
        # 'fil') and made ', '.join() emit "f, i, l, t, e, r".
        self.ACTIONS = ('filter',)
        # Output filenames for each filter stage.
        self.output_filename = "ut_filtered.txt"
        self.output_filename_regex = "ut_filtered_regex.txt"
        self.output_filename_idf = "ut_filtered_idf.txt"
        self.output_filename_noise = "ut_noise.txt"
        self.output_filename_noise_regex = "ut_noise_regex.txt"
        self.output_filename_noise_idf = "ut_noise_idf.txt"
        # Result lists populated by FilterNoise().
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []
        # Parse the noisefilter action.
        if args is not None:
            if not args.action[0] in self.ACTIONS:
                error("Action not recognized, try: %s" % ', '.join(self.ACTIONS))
            self.ACTION = args.action[0]
        else:
            # NOTE(review): self.ACTION stays unset on this path, so a later
            # PerformAction() call would raise AttributeError — callers are
            # expected to invoke FilterNoise() directly instead.
            print("No args supplied to NoiseFilter.")
        # Create an indexer instance and load the tweet indexes.
        self.indexer = Indexer()
        self.indexer.LoadIndexes()

    def PerformAction(self):
        """Dispatch the action parsed from the command line."""
        if self.ACTION == 'filter':
            self.FilterNoiseFromFile(self.args.file)

    @staticmethod
    def _WriteTerms(filename, terms):
        """Write one term per line to *filename* (overwrites the file)."""
        with open(filename, 'w') as fd:
            for term in terms:
                fd.write(term + '\n')

    def FilterNoiseFromFile(self, fname):
        """Read candidate terms from *fname* and write all filter outputs.

        The input file is tab-separated; only the first column (the term)
        of each line is used. Results of the regex filter, the IDF filter
        and their combination are written to the six configured output
        files.
        """
        # Make sure the file exists before reading it.
        if not os.path.isfile(fname):
            error("Provided file does not exist?")
        unfiltered_terms = set()
        # Read the file and collect the unique terms (first column).
        with open(fname) as fd:
            for line in fd:
                term = line.strip().split('\t')[0]
                unfiltered_terms.add(term)
        self.FilterNoise(unfiltered_terms, self.args.idf_factor)
        # Persist every result list; one helper call per output file
        # replaces six copy-pasted write loops.
        self._WriteTerms(self.output_filename_regex, self.filtered_terms_regex)
        self._WriteTerms(self.output_filename_noise_regex, self.noise_terms_regex)
        self._WriteTerms(self.output_filename_idf, self.filtered_terms_idf)
        self._WriteTerms(self.output_filename_noise_idf, self.noise_terms_idf)
        self._WriteTerms(self.output_filename, self.combined_filtered_terms)
        self._WriteTerms(self.output_filename_noise, self.combined_noise_terms)

    def FilterNoise(self, unfiltered_input, idf_factor):
        """Split *unfiltered_input* into valid UTs and noise.

        Applies a regex-based filter and an IDF-based filter independently,
        then intersects the two accepted sets.

        Args:
            unfiltered_input: iterable of candidate term strings.
            idf_factor: multiplier on the collection IDF baseline; terms
                with an IDF above idf_factor * log(doccount) count as noise.

        Returns:
            The combined (regex AND IDF accepted) list of terms. The
            individual result lists are stored on self as a side effect.
        """
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []
        # The IDF baseline depends only on the collection size, so compute
        # it once instead of per term (the original recomputed it on every
        # loop iteration). idf_base is the IDF of a term that appears in
        # exactly one document of the whole collection; values at or below
        # idf_factor * idf_base can be valid UTs.
        doccount = len(self.indexer.index_tweets)
        if doccount > 0:
            idf_base = math.log(float(doccount))
        else:
            idf_base = 100.0
            print("Tried to take the log of a <= 0 doccount! Was: ", doccount)
        threshold_idf = idf_factor * idf_base
        for term in unfiltered_input:
            self.unfiltered_terms.append(term)
            # Count punctuation once; the original ran re.findall twice.
            punct_count = len(re.findall(NoiseFilter.PUNCT, term))
            # Applied regex filters (a term matching any of them is noise):
            # 1. terms shorter than 3 or longer than 6 characters
            # 2. terms containing non-ascii characters
            # 3. terms containing special characters
            # 4. terms consisting only of numbers
            # 5. terms having more punctuation than other characters
            # 6. four or more consecutive vowels or consonants
            if len(term) < 3 or len(term) >= 7 \
                    or re.search(NoiseFilter.NON_ASCII, term) is not None \
                    or re.search(NoiseFilter.SPECIAL, term) is not None \
                    or re.search(NoiseFilter.ONLYNUM, term) is not None \
                    or punct_count > (len(term) - punct_count) \
                    or re.search(NoiseFilter.VOWEL_4, term) is not None \
                    or re.search(NoiseFilter.CONSONANT_4, term) is not None:
                self.noise_terms_regex.append(term)
            else:
                self.filtered_terms_regex.append(term)
            # IDF filter: low IDF (frequent enough in the collection) is
            # accepted, high IDF is treated as noise.
            idf = self.indexer.GetIDFForTerm(term)
            if idf <= threshold_idf:
                self.filtered_terms_idf.append(term)
            else:
                self.noise_terms_idf.append(term)
        # A term is accepted only when both filters accepted it.
        self.combined_filtered_terms = intersect(self.filtered_terms_regex, self.filtered_terms_idf)
        self.combined_noise_terms = diff(self.unfiltered_terms, self.combined_filtered_terms)
        print('Input Terms: ' + str(len(self.unfiltered_terms)))
        print('Unidentified Terms Regex: ' + str(len(self.filtered_terms_regex)))
        print('Noisy Terms Regex: ' + str(len(self.noise_terms_regex)))
        print('Unidentified Terms IDF: ' + str(len(self.filtered_terms_idf)))
        print('Noisy Terms IDF: ' + str(len(self.noise_terms_idf)))
        print('Combined Unidentified Terms: ' + str(len(self.combined_filtered_terms)))
        print('Combined Noisy Terms: ' + str(len(self.combined_noise_terms)))
        # This is the list we use as a result.
        return self.combined_filtered_terms
output_extension = ".txt" # Input files: input_tweets_filename = "normalized_filtered_tweets.txt" tweet_text_column_index = 5 input_unidentified_terms_filename = "ut_output.txt" stopwords_set = set(stopwords.words('english')) tweets = [] unidentified_terms = set() # The minimum number of times a words should co-occur with a UT before we record it. minimal_nr_occurrences = 60 # The maximum nr of co-occurring words that we record per UT. max_words = 5 tweet_indexer = Indexer() tweet_indexer.LoadIndexes() # First we read in all the tweets. if os.path.isfile(input_tweets_filename): with open(input_tweets_filename) as tweetsfile: lines = tweetsfile.readlines() for line in lines: linedata = line.strip().split('\t') if len(linedata) >= tweet_text_column_index + 1: tweet_text = set(linedata[tweet_text_column_index].strip().split(' ')) tweet_text = tweet_text - stopwords_set tweets.append(tweet_text) else: print(str(linedata)) print("Invalid file format!")