Example #1

import numpy as np
import pandas as pd
from tqdm import tqdm

# Project-local modules; these names follow the usual layout of this course
# template and are assumptions, so adjust them to the actual repo:
from reader import ReadFile
from parser_module import Parse
from indexer import Indexer
from searcher import Searcher
import utils


class SearchEngine:
    # Pre-trained GloVe vectors (25-dimensional Twitter embeddings).
    GLOVE_PATH_SERVER = '../../../../glove.twitter.27B.25d.txt'
    GLOVE_PATH_LOCAL = './model/model.txt'

    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = self.initialize_glove_dict()
        self._indexer.set_glove_dict(self.model)

    def initialize_glove_dict(self):
        """
        Loads the GloVe vectors into a dict mapping each token to its
        embedding. Every line in the file is a token followed by its vector
        components, e.g. 'the 0.418 0.24968 -0.41242 ...'.
        """
        glove_dict = {}
        with open(self.GLOVE_PATH_LOCAL, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], dtype='float32')
                glove_dict[word] = vector
        return glove_dict

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list),
                                  total=len(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine1',
                                self._config.get_out_path())
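        # utils.save_pickle_tuple is a project helper that is not shown here;
        # its assumed behaviour (which matches the 'idx_engine1.pkl' filename
        # loaded in search() below) is roughly:
        #   with open(os.path.join(out_path, name + '.pkl'), 'wb') as f:
        #       pickle.dump(tuple_to_save, f)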

        print('Finished parsing and indexing {} documents.'.format(
            number_of_documents))

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index(
            'idx_engine1.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        l_res = searcher.search(query_as_list[0])
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
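
A minimal usage sketch for the engine above (the config class, parquet path,
and query here are illustrative assumptions, not part of the original
snippet):

from configuration import ConfigClass  # assumed location of the config class

config = ConfigClass()
engine = SearchEngine(config)
engine.build_index_from_parquet('data/benchmark_data_train.snappy.parquet')
n_relevant, tweet_ids = engine.search('covid vaccine')
print(n_relevant, tweet_ids[:5])  # hit count and the five highest-ranked ids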
Example #2
import pandas as pd
from tqdm import tqdm
from nltk import pos_tag

# Project-local modules (names assumed, as in Example #1). Wordnet is the
# project's own helper around nltk's WordNet corpus:
from reader import ReadFile
from parser_module import Parse
from indexer import Indexer
from searcher import Searcher
import utils
import Wordnet


class SearchEngine:
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list),
                                  total=len(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine2',
                                self._config.get_out_path())

        print('Finished parsing and indexing {} documents.'.format(
            number_of_documents))

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index(
            'idx_engine2.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        # Query expansion: append the closest WordNet synonym of each query
        # term (when one exists) before searching.
        list_copy = list(query_as_list[0])
        tagged_words = pos_tag(list_copy)
        for word, tag in tagged_words:
            wn_tag = Wordnet.get_wordnet_pos(tag)
            synonym = Wordnet.get_closest_term(word, wn_tag)
            if synonym is not None:
                list_copy.append(synonym)
        l_res = searcher.search(list_copy)
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
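
Example #2 depends on a project-local Wordnet helper that is not shown. A
minimal sketch of what the two calls above could look like, assuming nltk's
WordNet corpus is installed (the helper names match the calls, but the
synonym-ranking strategy is an assumption, not the original implementation):

from nltk.corpus import wordnet as wn

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank POS tag to the corresponding WordNet POS constant.
    if treebank_tag.startswith('J'):
        return wn.ADJ
    if treebank_tag.startswith('V'):
        return wn.VERB
    if treebank_tag.startswith('N'):
        return wn.NOUN
    if treebank_tag.startswith('R'):
        return wn.ADV
    return None

def get_closest_term(word, wn_tag):
    # Return the first lemma that differs from the original word, or None
    # when WordNet offers nothing for this word/POS combination.
    if wn_tag is None:
        return None
    for synset in wn.synsets(word, pos=wn_tag):
        for lemma in synset.lemmas():
            name = lemma.name().replace('_', ' ')
            if name.lower() != word.lower():
                return name
    return None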