Example 1
class SearchEngine:

    num_of_tweets = 0

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    def get_num_of_tweets(self):
        return self.num_of_tweets

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.

    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        self.num_of_tweets = len(documents_list)

        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            parsed_document.num_of_tweets = self.num_of_tweets
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        # TODO: check indexer saving
        utils.save_obj(self._indexer.inverted_idx, "inverted_idx")

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        inverted_idx = self._indexer.load_index(fn)
        return inverted_idx

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        query_as_list = self._parser.parse_sentence(query, 0)
        original_query_list = query.split(" ")
        stop_words = stopwords.words('english')
        original_query_list = [
            w for w in original_query_list if w not in stop_words
        ]
        # group consecutive capitalized words into multi-word terms and keep upper-case words
        counter = 0
        while counter < len(original_query_list):
            len_term = 1
            word = original_query_list[counter]
            if word.isupper():  # e.g. NBA
                if word.find("\n") != -1:
                    word = word[:-1]
                    if word.find(".") != -1:
                        word = word[:-1]
                query_as_list.append(word)
            elif len(word) > 1 and re.search('[a-zA-Z]', word) and word[0].isupper():
                # word starts with an upper-case letter: collect the full multi-word term
                term = word
                index = counter + 1
                while index < len(original_query_list):
                    next_word = original_query_list[index]
                    if len(next_word) > 1 and re.search('[a-zA-Z]', next_word) and next_word[0].isupper():
                        # e.g. "Donald Trump"
                        term += " " + next_word[0] + next_word[1:].lower()
                        index += 1
                        len_term += 1
                    else:
                        break
                if len_term > 1:
                    query_as_list.append(term)
            counter += len_term

        corrected_query = SpellChecker_ranker.correct_query(query_as_list)
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(corrected_query)  # TODO: add K results
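The while-loop above groups consecutive capitalized words into one multi-word term (e.g. "Donald Trump") and keeps all-caps tokens (e.g. "NBA") as-is. A minimal, self-contained sketch of the same idea, independent of the Parse/Searcher classes; the function name and exact behaviour here are illustrative, not part of the original code:

import re

def extract_capitalized_terms(tokens):
    """Illustrative sketch: group runs of capitalized tokens into multi-word terms."""
    terms = []
    i = 0
    while i < len(tokens):
        word = tokens[i]
        if word.isupper():  # e.g. "NBA"
            terms.append(word.rstrip("\n").rstrip("."))
            i += 1
        elif len(word) > 1 and re.search('[a-zA-Z]', word) and word[0].isupper():
            # collect the run of following capitalized words, e.g. "Donald Trump"
            run = [word]
            j = i + 1
            while j < len(tokens) and len(tokens[j]) > 1 and tokens[j][0].isupper():
                run.append(tokens[j][0] + tokens[j][1:].lower())
                j += 1
            if len(run) > 1:
                terms.append(" ".join(run))
            i = j
        else:
            i += 1
    return terms

# Example: extract_capitalized_terms("Donald Trump visited the NBA".split())
# -> ['Donald Trump', 'NBA']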
Example 2
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    # Uses WordNet to expand queries.
    def __init__(self, config=None):
        self._config = config
        if config and config.toStem:
            self._parser = Parse_stem()
        else:
            self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        config = self._config
        indexer = self._indexer
        number_of_documents = 0

        if config.getoneFile():
            df = pd.read_parquet(fn, engine="pyarrow")
            documents_list = df.values.tolist()
            # Iterate over every document in the file
            for idx, document in enumerate(documents_list):
                # parse the document
                parsed_document = self._parser.parse_doc(document)
                number_of_documents += 1
                # index the document data
                self._indexer.add_new_doc(parsed_document)
            self._indexer.calculationSummerize()
        else:
            r = ReadFile(corpus_path=config.get__corpusPath())
            for root, dirs, files in os.walk(config.get__corpusPath(), topdown=True):
                for name in files:
                    ext = name.split('.')[-1]
                    if ext == 'parquet':
                        documents_list = r.read_folder(root, file_name=name)
                        # Iterate over every document in the file
                        for idx, document in enumerate(documents_list):
                            # parse the document
                            parsed_document = self._parser.parse_doc(document)
                            number_of_documents += 1
                            # index the document data
                            indexer.add_new_doc(parsed_document)
                        # indexer.update_posting_files()
                        # indexer.reset_cach()
        self._indexer.save_index('inverted_idx')
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    def get_full_text(self, d_id):
        return self._indexer.documents_data[d_id][4]

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        if self._indexer.inverted_idx is None:
            print("can't run a query without an inverted index loaded")
            return
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Example 3
class Searcher:
    def __init__(self, inverted_index, corpus_size, average_length,
                 output_path):
        """
        :param inverted_index: dictionary of inverted index
        """
        self.parser = Parse()
        self.ranker = Ranker()
        self.inverted_index = inverted_index
        self.corpus_size = corpus_size
        self.average_length = average_length
        self.output_path = output_path

    def calculate_doc_scores(self, term, relevant_docs, posting_pointer,
                             posting_file):
        """
        Retrieves term's posting file and calculates score for each relevant document.
        Adds the relevant documents to relevant_docs dictionary
        :param term: query term for retrieval
        :param relevant_docs: dictionary of relevant documents
        :param posting_pointer: pointer (name) of relevant posting file
        :param posting_file: relevant posting file
        :return: returns a tuple of the current relevant posting pointer and posting file
        """
        # retrieve term's posting file
        if posting_pointer is None or posting_file is None or term[0].lower() != posting_pointer:
            posting_pointer = self.inverted_index[term][POSTING_POINTER_INDEX]
            posting_file = utils.load_obj(self.output_path + str(posting_pointer))

        inverted_document_frequency = log(self.corpus_size / self.inverted_index[term][DF_INDEX])

        documents = posting_file[term]
        for document in documents:

            # calculate score
            document_id = document[DOCUMENT_ID_INDEX]
            doc_weight = document[FREQUENCY_INDEX]
            normalized_length = document[LENGTH_INDEX] / self.average_length

            if document_id not in relevant_docs:
                relevant_docs[document_id] = 0

            # calculate score according to BM25+ weighting formula
            relevant_docs[document_id] += inverted_document_frequency * (
                doc_weight * (K1 + 1) /
                (doc_weight + K1 * (1 - B + B * normalized_length)) + DELTA)

        return posting_pointer, posting_file

    def relevant_docs_from_posting(self, query):
        """
        Search and retrieve relevant documents for the query. Calculate the similarity score for each document.
        :param query: query
        :return: dictionary of relevant documents and their scores
        """

        # parse query according to the same parsing rules of the corpus
        entities = dict()
        term_dict = dict()
        parsed_query = self.parser.parse_sentence(query, entities)
        self.parser.parse_capital_letters(parsed_query, term_dict)

        # perform spell correction
        spell_checker = SpellChecker()
        corrected_terms = []
        misspelled_terms = spell_checker.unknown([*term_dict.keys()])
        for term in misspelled_terms:

            # only correct terms that aren't in the inverted dictionary
            # terms in the dictionary are considered correct for retrieval
            if term not in self.inverted_index:
                candidates = spell_checker.candidates(term)
                if term in candidates:  # remove duplicate originally correct terms
                    candidates.remove(term)
                corrected_terms.extend(candidates)

        # sort the parsed query alphabetically for optimal posting files retrieval
        # always hold at most one posting file in memory
        sorted_query = [*term_dict.keys()] + [*entities.keys()] + corrected_terms
        sorted_query.sort()

        # dictionary for holding all relevant documents (at least one query term appeared in the document)
        # format: {document_id: score}
        relevant_docs = dict()
        posting_file = None  # currently used posting file from disk
        posting_pointer = None  # current posting's pointer
        for term in sorted_query:

            # check if term exists in inverted dictionary in either lower or upper form
            if term in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term, relevant_docs, posting_pointer, posting_file)
            elif term.islower() and term.upper() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.upper(), relevant_docs, posting_pointer, posting_file)
            elif term.isupper() and term.lower() in self.inverted_index:
                posting_pointer, posting_file = self.calculate_doc_scores(
                    term.lower(), relevant_docs, posting_pointer, posting_file)

        return relevant_docs
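For reference, the per-term contribution computed in calculate_doc_scores follows the BM25+ weighting scheme. A standalone sketch of that formula; K1, B and DELTA are assumed to be the usual BM25+ hyperparameters, and the defaults below are common choices, not necessarily the ones used in this project:

from math import log

def bm25_plus_term_score(tf, df, doc_length, avg_doc_length, corpus_size,
                         k1=1.5, b=0.75, delta=1.0):
    """Score contribution of one query term for one document under BM25+."""
    idf = log(corpus_size / df)
    normalized_length = doc_length / avg_doc_length
    return idf * (tf * (k1 + 1) / (tf + k1 * (1 - b + b * normalized_length)) + delta)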
Example 4
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        self._parser.curr_idx = self.parse_and_index_tweet_list(documents_list, 0)
        self._indexer.save_index('idx_bench.pkl')
        print('Finished parsing and indexing.')

    def parse_and_index_tweet_list(self, documents_list, idx):

        for document in documents_list:
            # parse the document
            self._parser.curr_idx = idx
            parsed_document = self._parser.parse_doc(document)
            # add the document to the indexer here
            self._indexer.set_idx(idx)
            self._indexer.add_new_doc(parsed_document)
            idx += 1

        return idx-1

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = _Thesaurus()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Example 5
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        if not config:
            self._config = ConfigClass()
        else:
            self._config = config
        self._parser = Parse()
        self._indexer = Indexer(self._config)
        self._model = None
        self._reader = ReadFile(self._config.get__corpusPath())

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        self._indexer.check_pending_list()
        self._indexer.calculate_and_add_idf()
        self._indexer.calculate_sigma_Wij()
        self._indexer.calculate_avg_doc_len()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def search(self, query, k=None):  # TODO: change
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        terms, entities = self._parser.parse_sentence(query)
        query_as_list = terms + entities
        # word net
        extended_query = word_net(terms)
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search_with_extension(query_as_list, extended_query, k)
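The word_net(terms) call above is this example's query-expansion hook; its implementation is not shown. A minimal sketch of WordNet-based expansion with NLTK, assuming the WordNet corpus has been downloaded (nltk.download('wordnet')); the project's word_net function may work differently:

from nltk.corpus import wordnet as wn

def expand_with_wordnet(terms, max_synonyms_per_term=2):
    """Return extra query terms: a few WordNet synonyms for each original term."""
    expanded = []
    for term in terms:
        synonyms = set()
        for synset in wn.synsets(term):
            for lemma in synset.lemmas():
                name = lemma.name().replace('_', ' ')
                if name.lower() != term.lower():
                    synonyms.add(name)
        expanded.extend(sorted(synonyms)[:max_synonyms_per_term])
    return expanded

# e.g. expand_with_wordnet(['vaccine']) could yield something like ['vaccinum']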
Example 6
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        reader = ReadFile('')
        documents_list = reader.read_fn(fn)
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.after_indexing()
        self._indexer.save_index("inverted_idx")
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self.load_index("inverted_idx.pkl")
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
Example 7
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config

        if self._config:
            if not hasattr(self._config, 'toStem'):
                self._config.toStem = False
            if not hasattr(self._config, 'toLemm'):
                self._config.toLemm = False

        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.corpus_size = 0
        self.load_precomputed_model()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        self._indexer.save_index(
            self._config.get_output_path())  # Save the inverted_index to disk
        self.corpus_size = self._indexer.get_docs_count()
        self.calculate_doc_weight()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = SpellCheck

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def calculate_doc_weight(self):
        """
       The method calculates the TF-IDF for each document
       :return:
       """
        for word in self._indexer.inverted_idx:
            for doc_id in self._indexer.inverted_idx[word]['posting_list']:
                normalized_term_tf = self._indexer.inverted_idx[word][
                    "posting_list"][doc_id][0]
                term_df = self._indexer.inverted_idx[word]['df']
                term_idf = math.log10(self.corpus_size / term_df)
                # calculate doc's total weight
                term_weight = normalized_term_tf * term_idf
                self._indexer.inverted_idx[word]["posting_list"][
                    doc_id].append(term_weight)
                term_weight_squared = math.pow(term_weight, 2)
                self._indexer.docs_index[doc_id][0] += term_weight_squared
                self._indexer.docs_index[doc_id][0] = round(
                    self._indexer.docs_index[doc_id][0], 3)
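calculate_doc_weight accumulates, per document, the sum of squared TF-IDF term weights. That running sum is what a cosine-similarity ranker typically needs for length normalization; a hedged sketch of how it would be used at query time (the names here are illustrative, not taken from this project's Searcher):

import math

def cosine_similarity(dot_product, query_sum_squared_weights, doc_sum_squared_weights):
    """Normalize an accumulated dot product into a cosine similarity score."""
    denominator = math.sqrt(query_sum_squared_weights) * math.sqrt(doc_sum_squared_weights)
    if denominator == 0:
        return 0.0
    return dot_product / denominator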
Example 8
class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    __slots__ = [
        '_config', '_indexer', '_parser', '_model', 'searcher', '_run_config'
    ]

    def __init__(self, config=None, run_config=None):
        if not config:
            config = ConfigClass()
        if not run_config:
            run_config = RunConfigClass()
        self._run_config = run_config
        self._config = config
        self._parser = Parse(run_config)
        self._indexer = Indexer(run_config)
        self._model = None
        self.searcher = Searcher(self._parser,
                                 self._indexer,
                                 run_config,
                                 model=self._model)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        # Iterate over every document in the file
        for document in df.values:
            # parse the document
            parsed_list = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_list)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        if fn.endswith('.pkl'):
            fn = fn[:-len('.pkl')]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        return self.searcher.search(query, None, {3})
Example 9
class SearchEngine:
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in tqdm(enumerate(documents_list)):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            if parsed_document is None:
                continue
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        tuple_to_save = self._indexer.fix_inverted_index()
        utils.save_pickle_tuple(tuple_to_save, 'idx_engine3',
                                self._config.get_out_path())

        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_path):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    def load_index(self, fn):
        return self._indexer.load_index(fn)

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        self._indexer.inverted_idx, self._indexer.document_dict = self.load_index(
            'idx_engine3.pkl')
        searcher = Searcher(self._parser, self._indexer, model=self.model)
        # TODO check about K
        query_as_list = self._parser.parse_sentence(query)
        list_copy = list(query_as_list[0])
        tagged_words = pos_tag(list_copy)
        for word in tagged_words:
            synonym = ThesaurusModel.get_synonym(word)
            if synonym is not None:
                list_copy.extend(synonym)
        l_res = searcher.search(list_copy)
        t_ids = [tup[1] for tup in l_res]
        return len(l_res), t_ids
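ThesaurusModel.get_synonym above receives (token, POS-tag) pairs produced by nltk's pos_tag; its internals are not shown. A sketch of one plausible implementation that maps Penn Treebank tags to WordNet parts of speech before looking up synonyms; it assumes the NLTK wordnet corpus and tagger data are available and is an illustration, not the project's actual ThesaurusModel:

from nltk import pos_tag
from nltk.corpus import wordnet as wn

# Map the first letter of a Penn Treebank tag to a WordNet POS constant.
_TAG_TO_WORDNET = {'N': wn.NOUN, 'V': wn.VERB, 'J': wn.ADJ, 'R': wn.ADV}

def get_synonyms(tagged_word, limit=2):
    """Return up to `limit` WordNet synonyms for a (token, treebank_tag) pair."""
    token, tag = tagged_word
    wn_pos = _TAG_TO_WORDNET.get(tag[0])
    if wn_pos is None:
        return []
    synonyms = []
    for synset in wn.synsets(token, pos=wn_pos):
        for lemma in synset.lemmas():
            name = lemma.name().replace('_', ' ')
            if name.lower() != token.lower() and name not in synonyms:
                synonyms.append(name)
    return synonyms[:limit]

# Example: [get_synonyms(w) for w in pos_tag(['coronavirus', 'spreads'])]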
Example 10
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(self._config, advanced=False)
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        number_of_documents = 0

        r = ReadFile(corpus_path=self._config.get__corpusPath())

        doc = r.read_file(fn)
        for document in doc:
            parsed_document = self._parser.parse_doc(document)
            self._indexer.add_new_doc(parsed_document)
            number_of_documents += 1
        capital_letters = self._parser.caps_dict
        self._indexer.change_inverted_by_caps(capital_letters)
        self._indexer.save_index('idx_bench')
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.

    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(parser=self._parser,
                            indexer=self._indexer,
                            wordnet=False,
                            correction=False)
        n_relevant, ranked_doc_ids = searcher.search(query)
        return n_relevant, [doc_id for (doc_id, rank) in ranked_doc_ids]
Example 11
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(self._config, advanced=False)
        self._indexer = Indexer(config)
        self._model = None
Example 12
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self._indexer = Indexer(config)
Example 13
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self._indexer = Indexer(config)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        dict_of_methods = {
            'wordnet': False,
            'spell_correction': False,
            'thesaurus': False,
            'word2vec': True,
            'parser': False
        }
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')
        self._indexer.slice_uncommon_terms()
        self._indexer.calculate_wij_idf()
        self._indexer.set_dict_methods(dict_of_methods)
        # self._indexer.save_index(fn="idx_bench")
        # self._indexer.save_index(self._config.get_output_path() + 'inverted_idx')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        self._model = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(model_dir, "model_word2vec_last"),
            binary=True,
            encoding='utf-8',
            unicode_errors='ignore')

        self._config.set_download_model(False)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant 
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
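load_precomputed_model above loads a binary word2vec model into gensim's KeyedVectors. At query time the searcher can use it for embedding-based query expansion; a hedged usage sketch (the expansion policy shown here is illustrative, not this project's Searcher logic):

def expand_query_with_word2vec(model, query_terms, topn=2, min_similarity=0.7):
    """Add the closest word2vec neighbours of each in-vocabulary query term."""
    expanded = list(query_terms)
    for term in query_terms:
        if term in model:  # skip out-of-vocabulary terms
            for neighbour, similarity in model.most_similar(term, topn=topn):
                if similarity >= min_similarity and neighbour not in expanded:
                    expanded.append(neighbour)
    return expanded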
Example 14
                continue
            if number_arr[0] >= self.avg_length:
                map_reduce.write_dict(tmp_pos)
                self.set_is_writting.add(key)
                number_arr[0] = 0
            if term.lower() not in self.tmp_pos.keys():
                tmp_pos[term.lower()] = []
            if key in self.set_is_writting:
                map_reduce.wait_untill_finish()
                self.set_is_writting.remove(key)
            tmp_pos[term.lower()].append((document.tweet_id, document_dictionary[term]))
            number_arr[0] += 1
            # except:
            #     print('TERMS : _____ ' + str(term))
            #     print('INVERTED: problem with the following key {}'.format(term[0]))
        max_freq = max(document_dictionary.values())
        self.tmp_pos_doc[document.tweet_id] = document_dictionary
        self.num_in_pos_doc_other[0] += 1
        if self.num_in_pos_doc_other[0] >= self.avg_length:
            self.map_reduce_doc.write_dict(self.tmp_pos_doc)
            self.num_in_pos_doc_other[0] = 0


if __name__ == '__main__':
    p = Parse(True)
    parsed_document = p.parse_doc(['1280914835979501568', 'Wed Jul 08 17:21:09 +0000 2020', '70% @loganxtalor: Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a bind. I…', '{}', '[]',
                                   'Y’all Towson took away my housing cause of COVID and I literally didn’t know where I was gonna go. I was in such a… https://t.co/i8IdrIKp2B', '{"https://t.co/i8IdrIKp2B":"https://twitter.com/i/web/status/1280659984628490246"}', '[[116,139]]', None, None, None, None, None, None])
    i = Indexer()
    i.add_new_doc(parsed_document)

Example 15
from reader import ReadFile
from parser_module import Parse
from stemmer import Stemmer
import os
import string

preprocessed_file = "model/preprocessed.txt"
corpus_path = r"C:\Users\Owner\Desktop\SearchEngine\Data"
reader = ReadFile(corpus_path)
parser = Parse()
stemmer = Stemmer()
#documents_list = reader.read_file("covid19_08-05.snappy.parquet")

documents_list = []
# files_to_process = [
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-08-2020\covid19_07-08.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-09-2020\covid19_07-09.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-10-2020\covid19_07-10.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-11-2020\covid19_07-11.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-12-2020\covid19_07-12.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-13-2020\covid19_07-13.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-15-2020\covid19_07-15.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-16-2020\covid19_07-16.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-18-2020\covid19_07-18.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-20-2020\covid19_07-20.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-04-2020\covid19_08-04.snappy.parquet",
#     r"C:\Users\Owner\Desktop\SearchEngine\Data\date=07-27-2020\covid19_07-27.snappy.parquet",
# ]

files_to_process = [
    r"C:\Users\Owner\Desktop\SearchEngine\Data\date=08-07-2020\covid19_08-07.snappy.parquet",
Example 16
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(False)
        self.reader = ReadFile(corpus_path=config.get__corpusPath())
        self._indexer = Indexer(config)
        self.model = None
Example 17
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model =  None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """

        def is_ascii(s):
            return all(ord(c) < 128 for c in s)

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)

        # This function is responsible for writing the entities dict to a text file


        #def write_entites():
        # file1 = open("entities.txt", "a")
        # start=time.time()
        # our_dict = sorted(self._parser.entities.items(), key=lambda item: item[1], reverse=True)
        # print(our_dict)
        # for word in our_dict:
        #     if is_ascii(word[0]):
        #         parsed=self._parser.parse_sentence(word[0])
        #         for term in parsed:
        #             if(not   term[0].isdigit() and   term[0]!="#" and term[0]!="@"):
        #                 file1.writelines(str(term)+"\n")
        # file1.close()


        to_del=[]

        # saving the necessary data to pickle
        to_Save = (self._indexer.inverted_idx, self._indexer.postingDict, self._indexer.num_of_docs, self._indexer.avg_Size_doc)
        utils.save_obj(to_Save, "index_5")


        def remove_word_1():
            for key in self._indexer.inverted_idx:
                if self._indexer.inverted_idx[key] == 1 and not key.isalpha():
                    to_del.append(key)
                    self._indexer.postingDict.pop(key)
            for key in to_del:
                self._indexer.inverted_idx.pop(key)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        obj = utils.load_obj(fn)
        self._indexer.inverted_idx = obj[0]
        self._indexer.postingDict = obj[1]
        self._indexer.num_of_docs = obj[2]
        self._indexer.avg_Size_doc = obj[3]

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query, k=2000):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer)
        return searcher.search(query, k)



    def main(self, output_path, stemming, query_to_check, num_docs_to_retrieve):
        self.build_index_from_parquet("data/benchmark_data_train.snappy.parquet")
        if isinstance(query_to_check, list):
            queries = query_to_check
        elif isinstance(query_to_check, str):
            if query_to_check.endswith(".txt"):
                try:
                    with open(query_to_check, "r", encoding="utf-8") as query_file:
                        queries = [q for q in query_file.readlines() if q != "\n"]
                except FileNotFoundError as e:
                    print(e)
            else:
                queries = [query_to_check]
        else:
            return

        if stemming:
            output_path = output_path + "/WithStem"
        else:
            output_path = output_path + "/WithoutStem"

        query_num = 1
        for query in queries:
            start = time.time()
            answer_to_run = self.search(query, num_docs_to_retrieve)[1]
            for doc_tuple in answer_to_run:
                print('tweet id: {}'.format(doc_tuple))
            query_num += 1
            print("time it took to retrieve: " + str(time.time() - start))
Example 18
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = SpellCChecker()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        # glove_input_file = 'glove.twitter.27B.25d.txt'
        # word2vec_output_file = 'glove.twitter.27B.25d.txt.word2vec'
        # glove2word2vec(glove_input_file, word2vec_output_file)

        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            if parsed_document is None:
                continue
            self._indexer.add_new_doc(parsed_document)
        if len(self._indexer.inverted_idx) > 100000:
            self._indexer.sort_100K_inverted_index()
        self._indexer.add_idf_to_dictionary()
        print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        if ".pkl" in fn:
            fn = fn[:-4]
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        # self._model = KeyedVectors.load_word2vec_format('glove.twitter.27B.25d.txt.word2vec', binary=False)
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
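The SpellCChecker model assigned in __init__ is a project-specific class whose code is not included here. A minimal sketch of query spell correction built on the pyspellchecker package (which Example 3 also uses), skipping terms that already appear in the index; this is an illustration, not the original class:

from spellchecker import SpellChecker

def correct_query_terms(query_terms, inverted_index):
    """Replace misspelled terms with their most likely correction.

    Terms already present in the inverted index are treated as correct.
    """
    checker = SpellChecker()
    corrected = []
    for term in query_terms:
        if term in inverted_index or term not in checker.unknown([term]):
            corrected.append(term)
        else:
            corrected.append(checker.correction(term) or term)
    return corrected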
Example 19
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self.map_list = []
        self.prec5_list = []
        self.prec10_list = []
        self.prec50_list = []
        self.prec_total_list = []
        self.recall_list = []

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        print("\nNow Starting search engine 2")

        total_time = datetime.now()
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        # print("len of inverted: ", len(self._indexer.inverted_idx))
        # print("len of posting: ", len(self._indexer.postingDict))
        # print("len of dataSet: ", len(self._indexer.benchDataSet))
        # end_time = datetime.now()
        # print('\n ------ Time To Retrieve: {}'.format(end_time - total_time), " ------\n")
        #
        # print('Finished parsing and indexing.')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)

    def run_engine_two(self, fn):

        self.build_index_from_parquet(fn)
        queries_path = "data\\queries_train.tsv"

        all_queries = SearchEngine.query_reader(queries_path)["information_need"]

        for i, q in enumerate(all_queries):
            print(q)
            k, docs = self.search(q)
            # print(docs[:10])
            self.check_engine_quality(i + 1, docs[:300])
            print()

        print("Avg map is :", (sum(self.map_list) / len(self.map_list)))

    @staticmethod
    def query_reader(queries_path):

        data = pd.read_csv(queries_path, sep="\t")
        return data

    def get_parser(self):
        return self._parser

    def check_engine_quality(self, query_num, list_of_docs):
        """
        :param query_num:
        :param list_of_docs:
        :return: no return. prints metrics of the query. precision, recall, map.
        """

        benchmark_path = "data\\benchmark_lbls_train.csv"
        df = pd.read_csv(benchmark_path)

        df_prec = df[df['query'] == query_num]
        df_prec = df_prec[df_prec['tweet'].isin(list_of_docs)]
        dict_for_data = df_prec.set_index('tweet')['y_true'].to_dict()

        rmv_lst = []

        ranking = []
        # Add to list for rank
        for doc in list_of_docs:
            try:
                ranking.append(dict_for_data[int(doc)])
            except (KeyError, ValueError):
                rmv_lst.append(doc)
        for d in rmv_lst:
            list_of_docs.remove(d)

        data_df = pd.DataFrame({
            'query': query_num,
            'tweet': list_of_docs,
            'y_true': ranking
        })

        df_rec = df[df['query'] == query_num]
        recall_total = len(df_rec[df_rec['y_true'] == 1.0])

        # print("total Relevant doc found with tag 1 :" , len (data_df[data_df['y_true'] == 1.0]))
        # print("total NON relevant doc found with tag 0 :" , len (data_df[data_df['y_true'] == 0]))
        # print("found total of", len(df_prec), "tagged docs")
        # Calculate metrics and print
        prec5 = metrics.precision_at_n(data_df, query_num, 5)
        prec10 = metrics.precision_at_n(data_df, query_num, 10)
        prec50 = metrics.precision_at_n(data_df, query_num, 50)
        prec_total = metrics.precision(data_df, True, query_number=query_num)
        map_of_query = metrics.map(data_df)
        recall_val = metrics.recall_single(data_df, recall_total, query_num)
        self.map_list.append(map_of_query)
        self.prec5_list.append(prec5)
        self.prec10_list.append(prec10)
        self.prec50_list.append(prec50)
        self.prec_total_list.append(prec_total)
        self.recall_list.append(recall_val)

        print()
        print("precision at 5 of query", query_num, "is :", prec5)
        print("precision at 10 of query", query_num, "is :", prec10)
        print("precision at 50 of query", query_num, "is :", prec50)
        print("precision of query", query_num, "is :", prec_total)
        print("recall of query", query_num, "is :", recall_val)
        print("map of query", query_num, "is :", map_of_query)
Example no. 20
class SearchEngine:
    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._method = wordnet_method()

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0
        doc_len = len(documents_list)
        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document, doc_len)
        # print('Finished parsing and indexing.')

        # print('Finished merge, start rebuild posting dict')
        # self._indexer.rebuild_postingDict()
        self._indexer.rebuild_inverted_index()
        # print('finished rebuild inverted index')

        to_save = (self._indexer.inverted_idx, self._indexer.tweet_dict,
                   self._indexer.reversed_inverted_index)
        utils.save_obj(to_save, 'idx_bench')
        # The structures above were just saved to disk; release them to free memory.
        self._indexer.inverted_idx = None
        self._indexer.tweet_dict = None
        self._indexer.reversed_inverted_index = None
        to_save = None
        # print('Finished rebuild inverted index and build reversed_inverted_index')

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """

        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and 
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """ 
        Executes a query over an existing index and returns the number of 
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and 
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """
        searcher = Searcher(self._parser,
                            self._indexer,
                            model=self._model,
                            method=self._method)
        return searcher.search(query)
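# --- Illustrative sketch (not part of the original snippet) ---
# Example no. 20 passes a `wordnet_method()` object to the Searcher, but its
# implementation is not shown. A common choice is WordNet-based query
# expansion, where each query term is augmented with a few synonyms taken from
# its WordNet synsets. The sketch below (using NLTK, and assuming the WordNet
# corpus was fetched via nltk.download('wordnet')) is one plausible
# implementation, not the original code.

from nltk.corpus import wordnet


def expand_query_with_wordnet(query_terms, max_synonyms_per_term=2):
    """Return the original query terms plus up to N WordNet synonyms per term."""
    expanded = list(query_terms)
    for term in query_terms:
        synonyms = set()
        for synset in wordnet.synsets(term):
            for lemma in synset.lemmas():
                name = lemma.name().replace("_", " ").lower()
                if name != term.lower():
                    synonyms.add(name)
        expanded.extend(sorted(synonyms)[:max_synonyms_per_term])
    return expanded


# expand_query_with_wordnet(["mask", "vaccine"]) might add terms such as
# "masquerade" or "vaccinum"; the expanded list is then scored like any query.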
Example no. 21
class SearchEngine:

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation, but you must have a parser and an indexer.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def build_index_from_parquet(self, fn):
        """
        Reads parquet file and passes it to the parser, then indexer.
        Input:
            fn - path to parquet file
        Output:
            No output, just modifies the internal _indexer object.
        """
        df = pd.read_parquet(fn, engine="pyarrow")
        documents_list = df.values.tolist()
        # Iterate over every document in the file
        number_of_documents = 0

        for idx, document in enumerate(documents_list):
            # parse the document
            parsed_document = self._parser.parse_doc(document)
            number_of_documents += 1
            # index the document data
            self._indexer.add_new_doc(parsed_document)
        print('Finished parsing and indexing.')

        # self._indexer.save_index("idx_bench.pkl")
        #
        # indexer_dic = utils.load_obj("idx_bench")
        #
        self._indexer.save_index("idx.pkl")  # TODO - we need submit this

        indexer_dic = utils.load_obj("idx")  # TODO - we need submit this

        localMethod = True
        globalMethod = False
        wordNet = False
        spellChecker = False

        if localMethod:
            indexer_dic["local"] = True

        if wordNet:
            indexer_dic["wordnet"] = True

        if spellChecker:
            indexer_dic["spellChecker"] = True



        if globalMethod:
            docs_dic, Sij_dic = compute_Wi(indexer_dic, globalMethod)
            indexer_dic["docs"] = docs_dic
            indexer_dic["global"] = Sij_dic
        else:
            docs_dic = compute_Wi(indexer_dic)
            indexer_dic["docs"] = docs_dic

        # utils.save_obj(indexer_dic, "idx_bench")
        utils.save_obj(indexer_dic, "idx")  # TODO - we need to submit this

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        self._indexer.load_index(fn)

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def load_precomputed_model(self, model_dir=None):
        """
        Loads a pre-computed model (or models) so we can answer queries.
        This is where you would load models like word2vec, LSI, LDA, etc. and
        assign to self._model, which is passed on to the searcher at query time.
        """
        pass

    # DO NOT MODIFY THIS SIGNATURE
    # You can change the internal implementation as you see fit.
    def search(self, query):
        """
        Executes a query over an existing index and returns the number of
        relevant docs and an ordered list of search results.
        Input:
            query - string.
        Output:
            A tuple containing the number of relevant search results, and
            a list of tweet_ids where the first element is the most relevant
            and the last is the least relevant result.
        """

        searcher = Searcher(self._parser, self._indexer, model=self._model)
        return searcher.search(query)
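# --- Illustrative sketch (not part of the original snippet) ---
# Example no. 21 stores the result of compute_Wi(indexer_dic) under the "docs"
# key of the saved index, and an Sij dictionary when the global method is on.
# compute_Wi is not shown; in tf-idf ranking, Wi typically denotes the Euclidean
# norm of document i's tf-idf vector, later used to cosine-normalize scores.
# The posting layout assumed below ({term: {doc_id: tf}} plus per-term document
# frequencies) is a guess for illustration, not the original data structure.

import math


def compute_document_norms(postings, doc_freq, num_docs):
    """Return {doc_id: sqrt(sum of squared tf-idf weights)} for cosine ranking."""
    squared_sums = {}
    for term, docs in postings.items():
        idf = math.log10(num_docs / doc_freq[term])
        for doc_id, tf in docs.items():
            weight = tf * idf
            squared_sums[doc_id] = squared_sums.get(doc_id, 0.0) + weight * weight
    return {doc_id: math.sqrt(s) for doc_id, s in squared_sums.items()}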
Example no. 22
    # Constructor variant that also wires in a WordNet-based query method.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None
        self._method = wordnet_method()
Example no. 23
    # Default constructor: parser built with no extra options.
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse()
        self._indexer = Indexer(config)
        self._model = None

    # Variant: the parser takes the stemming and lemmatization flags from the
    # config (so this version requires a non-None config object).
    def __init__(self, config=None):
        self._config = config
        self._parser = Parse(self._config.toStem, self._config.toLemm)
        self._indexer = Indexer(config)
        self._model = None
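# --- Illustrative usage sketch (not part of the original snippets) ---
# All of the examples above expose the same driver API: build an index from a
# parquet file of tweets, optionally load a precomputed model, then run a
# query. The parquet path and query string below are placeholders, and Parse,
# Indexer and Searcher must come from the accompanying assignment modules;
# variants such as Example no. 23 also expect a config object with
# toStem/toLemm attributes instead of None.

if __name__ == "__main__":
    engine = SearchEngine(config=None)
    engine.build_index_from_parquet("data/sample_tweets.snappy.parquet")
    engine.load_precomputed_model()
    n_relevant, tweet_ids = engine.search("covid vaccine side effects")
    print(n_relevant, tweet_ids[:10])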