Python Indexer.GetIDFForTerm Examples

Programming Language: Python

Namespace/Package Name: indexer

Class/Type: Indexer

Method/Function: GetIDFForTerm

Examples at hotexamples.com: 1

Python Indexer.GetIDFForTerm - 1 example found. This is the top-rated real-world Python example of indexer.Indexer.GetIDFForTerm extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

add_new_doc(30)

Indexer(30)

create_index(6)

create_unigram_index(3)

calculate_idf(3)

LoadIndexes(3)

close(3)

dump(3)

coords_to_indices(2)

indices_to_coords(2)

calculationSummerize(2)

add_idf_to_dictionary(2)

add_document(2)

LoadDict(2)

fix_inverted_index(2)

finish(2)

evaluate_input(1)

execute(1)

create_save_indexer_with_relevant_docs(1)

entities_and_small_big(1)

directory(1)

delete_dict_after_saving(1)

create_indexer(1)

create_dirs(1)

create_bulk_index_string(1)

finish_index(1)

CreatInvertedIndex(1)

finish_indexing(1)

get_num_spatial_nodes(1)

tokenize(1)

set_idx_fields(1)

process(1)

keys(1)

isStopword(1)

ignore_extensions(1)

get__lda__(1)

fit(1)

getStemmed(1)

getOr(1)

getAnd(1)

get(1)

generate_local_index(1)

create_block(1)

generate_global_index(1)

compute_tf(1)

createIndex(1)

add_square_Wij(1)

bp_index(1)

batch_get_feat_stacked(1)

after_indexing(1)

Example #1

Show file

File: noisefilter.py Project: vdevos/2ID26

class NoiseFilter:
    """
    NoiseFilter takes a list of Unidentified Terms (UTs), filters out noise
    (garbage) and returns a (noise)filtered list of UTs.

    Two independent filters are applied to every term:

    * a regex/length based filter (see ``_is_regex_noise``), and
    * an IDF based filter that compares the term's IDF, as reported by the
      indexer, against ``idf_factor * log(number of indexed tweets)``.

    A term is kept in the combined result only when it passes both filters.
    """
    # Regexes to use as a filter. Raw strings are used so that the regex
    # escapes (\[, \], \-) are not treated as invalid string escapes.
    ONLYNUM = r"^[0-9]*$"
    SPECIAL = r"""[/_$&+,:;{}"=?\[\]@#|~'<>^*()%!]"""
    NON_ASCII = r"[^\x00-\x7F]"
    PUNCT = r"""[.?\-",]"""
    CONSONANT_4 = "[bBcCdDfFgGhHjJkKlLmMnNpPqQrRsStTvVwWxXyYzZ]{4}"
    VOWEL_4 = "[aAeEiIoOuU]{4}"

    # Terms shorter than MIN_TERM_LENGTH or longer than MAX_TERM_LENGTH
    # characters are considered noise by the regex filter.
    MIN_TERM_LENGTH = 3
    MAX_TERM_LENGTH = 6

    # Supported noisefilter actions. This must be a real tuple: a bare
    # ('filter') is just the string 'filter', which turns the membership
    # test into a substring test and makes ', '.join() split it per letter.
    ACTIONS = ('filter',)

    def __init__(self, args):
        """
        Set up output file names, result lists and the indexer.

        args -- parsed command line arguments, or None. When given, the code
                reads args.action[0] here and args.file / args.idf_factor
                later on (in PerformAction / FilterNoiseFromFile).
        """
        self.args = args

        # Stays None when no args are supplied, so that PerformAction is a
        # no-op instead of raising AttributeError.
        self.ACTION = None

        # instance variables
        self.output_filename = "ut_filtered.txt"
        self.output_filename_regex = "ut_filtered_regex.txt"
        self.output_filename_idf = "ut_filtered_idf.txt"

        self.output_filename_noise = "ut_noise.txt"
        self.output_filename_noise_regex = "ut_noise_regex.txt"
        self.output_filename_noise_idf = "ut_noise_idf.txt"

        # global vars to store output of FilterNoise
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        # parse noisefilter action
        if args is not None:
            if args.action[0] not in self.ACTIONS:
                # error() is defined elsewhere in the project; presumably it
                # reports the problem and aborts -- TODO confirm.
                error("Action not recognized, try: %s" %
                      ', '.join(self.ACTIONS))
            self.ACTION = args.action[0]
        else:
            print("No args supplied to NoiseFilter.")

        # Create an instance of the indexer
        self.indexer = Indexer()
        # Load the indexes; FilterNoise reads indexer.index_tweets and
        # calls indexer.GetIDFForTerm.
        self.indexer.LoadIndexes()

    def PerformAction(self):
        """Run the action that was selected through the arguments (if any)."""
        if self.ACTION == 'filter':
            self.FilterNoiseFromFile(self.args.file)

    def FilterNoiseFromFile(self, fname):
        """
        Filter the terms listed in a file and write the results to disk.

        fname -- path of a text file with one term per line; only the text
                 before the first tab of every line is used as the term.

        The results of FilterNoise are written, one term per line, to the
        six output files configured in __init__.
        """
        # make sure file exists
        if not os.path.isfile(fname):
            error("Provided file does not exist?")

        # read the file; a set is used so that duplicate terms collapse
        with open(fname) as fd:
            unfiltered_terms = {line.strip().split('\t')[0] for line in fd}

        # The input file is closed before the output files are written.
        self.FilterNoise(unfiltered_terms, self.args.idf_factor)

        outputs = (
            (self.output_filename_regex, self.filtered_terms_regex),
            (self.output_filename_noise_regex, self.noise_terms_regex),
            (self.output_filename_idf, self.filtered_terms_idf),
            (self.output_filename_noise_idf, self.noise_terms_idf),
            (self.output_filename, self.combined_filtered_terms),
            (self.output_filename_noise, self.combined_noise_terms),
        )
        for path, terms in outputs:
            self._write_terms(path, terms)

    @staticmethod
    def _write_terms(path, terms):
        """Write every term in `terms` on its own line to the file `path`."""
        with open(path, 'w') as outputfile:
            outputfile.writelines(term + '\n' for term in terms)

    @classmethod
    def _is_regex_noise(cls, term):
        """
        Return True when `term` is noise according to the regex filter.

        Applied filters:
        1. terms shorter than MIN_TERM_LENGTH or longer than MAX_TERM_LENGTH
        2. terms containing non-ascii characters
        3. terms containing special characters
        4. terms consisting only of numbers
        5. terms having more punctuation than other characters
        6. four or more consecutive vowels, or four or more consecutive
           consonants
        """
        length = len(term)
        if length < cls.MIN_TERM_LENGTH or length > cls.MAX_TERM_LENGTH:
            return True

        if re.search(cls.NON_ASCII, term) is not None \
                or re.search(cls.SPECIAL, term) is not None \
                or re.search(cls.ONLYNUM, term) is not None:
            return True

        punct_count = len(re.findall(cls.PUNCT, term))
        if punct_count > length - punct_count:
            return True

        return re.search(cls.VOWEL_4, term) is not None \
            or re.search(cls.CONSONANT_4, term) is not None

    def _idf_threshold(self, idf_factor):
        """
        Return the highest IDF value a term may have to pass the IDF filter.

        idf_base is the idf factor for terms that only appear once in the
        whole collection: log(number of indexed tweets). When nothing is
        indexed the log is undefined, so a fixed base of 100.0 is used.
        """
        doccount = len(self.indexer.index_tweets)
        if doccount > 0:
            idf_base = math.log(float(doccount))
        else:
            idf_base = 100.0
            print("Tried to take the log of a <= 0 doccount! Was: ",
                  doccount)
        return idf_factor * idf_base

    def FilterNoise(self, unfiltered_input, idf_factor):
        """
        Split the given terms into Unidentified Terms and noise.

        unfiltered_input -- iterable of terms (strings)
        idf_factor       -- multiplier applied to the idf base to get the
                            IDF threshold (see _idf_threshold)

        The per-filter and combined results are stored on the instance;
        the combined filtered terms are returned.
        """
        self.unfiltered_terms = []
        self.filtered_terms_regex = []
        self.filtered_terms_idf = []
        self.noise_terms_regex = []
        self.noise_terms_idf = []
        self.combined_filtered_terms = []
        self.combined_noise_terms = []

        # The threshold does not depend on the term: compute it only once.
        threshold_idf = self._idf_threshold(idf_factor)

        for term in unfiltered_input:

            self.unfiltered_terms.append(term)

            if self._is_regex_noise(term):
                self.noise_terms_regex.append(term)
            else:
                self.filtered_terms_regex.append(term)

            # Values up to the threshold can be valid UTs, otherwise not
            idf = self.indexer.GetIDFForTerm(term)
            if idf <= threshold_idf:
                self.filtered_terms_idf.append(term)
            else:
                self.noise_terms_idf.append(term)

        # intersect() and diff() are helpers defined elsewhere in the project.
        self.combined_filtered_terms = intersect(self.filtered_terms_regex,
                                                 self.filtered_terms_idf)
        self.combined_noise_terms = diff(self.unfiltered_terms,
                                         self.combined_filtered_terms)

        print('Input Terms: ' + str(len(self.unfiltered_terms)))
        print('Unidentified Terms Regex: ' +
              str(len(self.filtered_terms_regex)))
        print('Noisy Terms Regex: ' + str(len(self.noise_terms_regex)))
        print('Unidentified Terms IDF: ' + str(len(self.filtered_terms_idf)))
        print('Noisy Terms IDF: ' + str(len(self.noise_terms_idf)))
        print('Combined Unidentified Terms: ' +
              str(len(self.combined_filtered_terms)))
        print('Combined Noisy Terms: ' + str(len(self.combined_noise_terms)))

        # This is the list we use as a result.
        return self.combined_filtered_terms