Example #1
import copy

import pandas as pd
from nltk.corpus import wordnet

import utils
# Parse, Indexer, and Searcher come from this project's own modules;
# their exact import paths are project-specific and omitted here.

class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self.invertedIndex = self._indexer.inverted_idx
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads a parquet file and passes each document to the parser, then to the indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        # r = ReadFile(ConfigClass.corpusPath)
        # documents_list = r.readAllCorpus()  # change if we need to read more than one parquet file

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()

        # Needed to pass the automated ("boris") tests: the inverted_idx
        # sometimes fails to save in the testing system otherwise.
        utils.save_obj({}, "inverted_idx")

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            if parsed_document.doc_length != 0:  # sometimes we get an empty tweet; no need to index it
                # index the document data
                self._indexer.add_new_doc(parsed_document)
        # Insert the collected entities into the indexer and posting files
        self._indexer.addEntities(self._parser.suspectedEntityDict)
        # Sort the posting files and update the idf/Wij weights
        # (number_of_documents - 1 equals the last enumerate index)
        self._indexer.update_idfWij(number_of_documents - 1)
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')
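
    # A minimal usage sketch (the parquet path below is hypothetical):
    #     engine = SearchEngine(config)
    #     engine.build_index_from_parquet("path/to/tweets.parquet")
    # After this call the inverted index is also saved to disk under "inverted_idx".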

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """

        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass
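        # A hedged sketch of what loading could look like here (gensim is an
        # assumption, not a confirmed project dependency; this project ships
        # no precomputed model):
        #     from gensim.models import KeyedVectors
        #     self._model = KeyedVectors.load(model_dir + "/w2v.kv")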

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        self._parser.suspectedEntityDict = {}  # reset so it only holds entities from this query

        query_as_list = self._parser.parse_sentence(query)

        # add entities to the query - parse_sentence does not add entities to
        # query_as_list; suspectedEntityDict holds only entities from the original query
        for entity in self._parser.suspectedEntityDict:
            query_as_list.append(entity)

        # Remove entity fragments from the query
        query_as_list = self.clearEntitiesParts(query_as_list)

        # WordNet expansion: append up to two synonyms per term, marked with a
        # trailing "~" to distinguish them from the original query terms
        extendedQ = copy.deepcopy(query_as_list)
        for term in query_as_list:
            synsets = wordnet.synsets(term)
            try:
                for i in range(2):
                    synonym = synsets[i].lemmas()[0].name()
                    if term.lower() != synonym.lower() and synonym + "~" not in extendedQ:
                        extendedQ.append(synonym + "~")
            except IndexError:
                continue
        query_as_list = extendedQ
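        # e.g. a term like "happy" might contribute a synonym such as
        # "felicitous~"; the exact synonyms depend on the installed WordNet data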

        # searcher.search returns a tuple: (number of results, relevantDocIdList)
        numberOFresults, relevantDocIdList = searcher.search(query_as_list)
        return numberOFresults, relevantDocIdList
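
    # Hypothetical walk-through: for the query "bill gates donates", the parser
    # may detect "Bill Gates" as an entity; it is appended to the query, its
    # fragments are stripped by clearEntitiesParts below, WordNet synonyms are
    # added, and the searcher returns (count, ranked tweet_ids).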

    def clearEntitiesParts(self, query):
        modifiedQuery_l = copy.deepcopy(query)
        termsToRemoveFromQuery = []
        # At this point, if the query contains an entity, it holds both the terms
        # that make up the entity and the entity itself as a single term,
        # e.g. ['BILL', 'Gates', 'blabla', 'bla', 'Bill Gates'].
        # If "Bill Gates" is an already-known entity, this leaves us with:
        # ['blabla', 'bla', 'Bill Gates']
        for term in query:  # remove entity parts from the query when the entity exists in the inverted index
            if " " in term:
                if term in self.invertedIndex:  # known entity, present in the inverted index
                    entity_l = term.split(" ")
                    for word in entity_l:
                        termsToRemoveFromQuery.append(word.upper())  # parts are matched in uppercase below
                else:  # unknown entity: drop it from the query
                    modifiedQuery_l.remove(term)

        for word in termsToRemoveFromQuery:  # clear every appearance of the token from modifiedQuery_l
            modifiedQuery_l[:] = [x for x in modifiedQuery_l if x != word]
        return modifiedQuery_l
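

# A minimal, hypothetical driver showing the intended call sequence. The
# parquet path and the query are placeholders, and Parse/Indexer/Searcher must
# be importable from this project for it to run.
if __name__ == "__main__":
    engine = SearchEngine()
    engine.build_index_from_parquet("path/to/tweets.parquet")  # hypothetical path
    n_results, ranked_tweet_ids = engine.search("bill gates vaccine")
    print(n_results, ranked_tweet_ids[:10])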