Example #1
    def gen_posting_list(self, offset, size, idf):
        """
        Generate a memory based posting list from a file
        :param offset: start of the posting list in the file
        :param size: size of the posting list
        :param idf: idf of the token
        :return: a memory based posting list with it size
        """
        posting_list = PostingList()
        pl_size = 0

        with open(self.__pl_file, "rb") as file:
            SC.last_query().add_mem_access()
            file.seek(offset)
            if self.__use_vbytes:
                bytes_read = file.read(size)
                numbers = VariableByte.decoding(bytes_read)
                for i in range(0, len(numbers), 2):
                    doc_id = numbers[i]
                    score = idf * (1 + log10(numbers[i + 1]))
                    posting_list.add_document(doc_id, score)
                    pl_size += 1
            else:
                read = 0
                while read < size:
                    pl_size += 1
                    doc_id = int.from_bytes(file.read(4), byteorder='big')
                    score = idf * (1 + log10(
                        int.from_bytes(file.read(4), byteorder='big')))
                    posting_list.add_document(doc_id, score)
                    read += 4 + 4
        return posting_list, pl_size
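
A note on gen_posting_list above: the fixed-width branch assumes consecutive
4-byte big-endian (doc_id, term_frequency) pairs on disk, scored as
idf * (1 + log10(tf)). A minimal, self-contained sketch of that decoding
(names here are illustrative, not the project's API):

from math import log10


def decode_fixed_width(raw, idf):
    """Decode 4-byte big-endian (doc_id, tf) pairs into (doc_id, score)."""
    pairs = []
    for i in range(0, len(raw), 8):
        doc_id = int.from_bytes(raw[i:i + 4], byteorder='big')
        tf = int.from_bytes(raw[i + 4:i + 8], byteorder='big')
        pairs.append((doc_id, idf * (1 + log10(tf))))
    return pairs


raw = (42).to_bytes(4, 'big') + (3).to_bytes(4, 'big')  # doc 42, tf 3
print(decode_fixed_width(raw, idf=1.5))  # [(42, 2.2157...)]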
Example #2
 def ordered_access(self):
     '''
     Access each element of the list, ordered by score
     :return: yields tuples (document_id, score)
     '''
     for elem in self.ord_elems:
         if SC.last_query() is not None:
             SC.last_query().add_pl_access()
         yield elem
Example #3
 def alpha_access(self):
     """
     access documents by alphabetical order
     :return: tuple (doc_id, score)
     """
     for (key, val) in self.rand_elems.items():
         if SC.last_query() is not None:
             SC.last_query().add_pl_access()
         yield (key, val)
Example #4
 def document_score(self, document_id):
     '''
     Return the score of a document in logarithmic time
     :param document_id: the id of the document
     :return: the score of the document, or 0 if it is absent
     '''
     if SC.last_query() is not None:
         SC.last_query().add_pl_access()
     if document_id in self.rand_elems:
         return self.rand_elems[document_id]
     return 0
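
The three accessors above (Examples #2 to #4) imply a posting list backed by
two views of the same data: one ordered by score (ord_elems) and one keyed by
doc_id (rand_elems). A hedged, self-contained sketch of that layout; the real
PostingList class is not shown in these examples:

from sortedcontainers import SortedDict


class MiniPostingList:
    """Illustrative stand-in for the project's PostingList."""

    def __init__(self):
        self.ord_elems = []             # (doc_id, score), sorted by score
        self.rand_elems = SortedDict()  # doc_id -> score, sorted by doc_id

    def add_document(self, doc_id, score):
        self.ord_elems.append((doc_id, score))
        self.ord_elems.sort(key=lambda e: e[1], reverse=True)
        self.rand_elems[doc_id] = score


pl = MiniPostingList()
pl.add_document(7, 0.8)
pl.add_document(3, 1.2)
print(list(pl.rand_elems.items()))  # ascending doc_id: [(3, 1.2), (7, 0.8)]
print(pl.ord_elems)                 # descending score: [(3, 1.2), (7, 0.8)]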
Example #5
    def query(self, query="", algorithm="NAIVE", number_of_results=5):
        """
        Query the inverted file for documents
        :param query: the query
        :param algorithm: the name fo the algorithm to use (a key in ALGORITHM)
        :param number_of_results: the number of results expected
        :return: an array of array containing [doc_id, score, path to the file containing the documents]
        """
        SC.new_query(query)

        self.current_status = "Querying - Using {} alogrithm".format(algorithm)
        documents = self.ALGORITHMS[algorithm]().execute(
            query, self.inv_file, number_of_results)
        SC.last_query().stop()
        if documents is not None:
            SC.last_query().log(algorithm, number_of_results, len(documents))
        else:
            SC.last_query().log(algorithm, number_of_results, 0)

        results = []

        if documents is not None:
            for document in documents:
                results.append([
                    document[0], document[1],
                    self.__id_to_filename[document[0]]
                ])

        self.current_status = "Querying - Finished"

        return results
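
A hedged usage sketch for query(), assuming an already-indexed backend
instance named exe as built in Example #9 (the unpacking follows the
documented result shape [doc_id, score, path]):

results = exe.query(query="jaguar", algorithm="NAIVE", number_of_results=5)
for doc_id, score, path in results:
    print("{:8.5f}  {:8}  {}".format(score, doc_id, path))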
Example #6
    def on_indexation_complete(self):
        print("Indexation complete !")

        # Once indexation is finished, the vocabulary is read back from the
        # inverted file. TODO: eventually handle the case where the
        # vocabulary does not exist.
        vocabulary = self.backend.inv_file.get_terms()

        liststore = Gtk.ListStore(str)
        for s in vocabulary:
            liststore.append([s])

        completion = Gtk.EntryCompletion()
        completion.set_model(liststore)
        completion.set_text_column(0)

        entry = self.builder.get_object("search_entry")
        entry.set_completion(completion)

        loading_box = self.builder.get_object("loading_box")
        indexation_statistics_box = self.builder.get_object(
            "indexation_statistics_box")
        query_box = self.builder.get_object("query_box")
        start_indexation_button = self.builder.get_object(
            "start_indexation_button")

        loading_box.set_visible(False)

        indexation_stats = StatsControl.last_indexing()

        indexation_start_time_tofill = self.builder.get_object(
            "indexation_start_time_tofill")
        indexation_start_time_tofill.set_text("{:%H:%M:%S.%f}".format(
            indexation_stats.start_time))

        indexation_end_time_tofill = self.builder.get_object(
            "indexation_end_time_tofill")
        indexation_end_time_tofill.set_text("{:%H:%M:%S.%f}".format(
            indexation_stats.finish_time))

        indexation_total_time_tofill = self.builder.get_object(
            "indexation_total_time_tofill")
        indexation_total_time_tofill.set_text("{}".format(
            indexation_stats.total_time))

        indexation_file_size_tofill = self.builder.get_object(
            "indexation_file_size_tofill")
        indexation_file_size_tofill.set_text(str(indexation_stats.file_size))

        indexation_statistics_box.set_visible(True)

        query_box.set_visible(True)

        start_indexation_button.set_sensitive(True)
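
The completion wiring in the middle of on_indexation_complete is standard
GTK 3. A minimal, self-contained PyGObject sketch of the same pattern,
without the project's Glade builder (the window code is illustrative only):

import gi
gi.require_version("Gtk", "3.0")
from gi.repository import Gtk

store = Gtk.ListStore(str)
for word in ("apple", "apricot", "banana"):
    store.append([word])

completion = Gtk.EntryCompletion()
completion.set_model(store)
completion.set_text_column(0)

entry = Gtk.Entry()
entry.set_completion(completion)

window = Gtk.Window(title="completion demo")
window.add(entry)
window.connect("destroy", Gtk.main_quit)
window.show_all()
Gtk.main()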
Example #7
    def indexing(self,
                 files,
                 ignore_case=True,
                 ignore_stop_words=True,
                 stemming=True,
                 use_weights=True,
                 title_weight=5,
                 date_weight=2,
                 memory_limit=50,
                 use_vbytes=True):
        """
        Launch the indexing of a list of files
        :param files: the paths to the files to index
        :param ignore_case: should case be ignored in the indexing ?
        :param ignore_stop_words: should stop words be ignored ?
        :param stemming: should we stemm the tokens ?
        :param use_weights: shoud we differenciate word with their position in the document ?
        :param title_weight: weight for words in title
        :param date_weight: weight for words in the date
        :param memory_limit: limit on the memory before a flush in a temp file
        :param use_vbytes: usage of variable bytes for the final posting list ?
        :return: when the indexing is finished
        """

        SC.new_indexing()

        documents = []

        self.current_status = "Indexing - Starting"

        self.__id_to_filename = SortedDict()

        self.inv_file = InvertedFile(use_vbytes, memory_limit)
        for file in files:
            self.current_status = "Indexing - {}".format(file)
            file_docs = Reader.read_file(file, ignore_case, ignore_stop_words,
                                         stemming, use_weights, title_weight,
                                         date_weight)
            for doc in file_docs:
                self.__id_to_filename[int(doc.doc_id())] = file
                self.inv_file.add_document(doc)

        self.current_status = "Indexing - Making the inverted file"

        self.inv_file.gen_pl_file()

        self.current_status = "Indexing - Saving to pickle file"

        with open(self.PICKLES[0], "wb") as file:
            pickle.dump(self.inv_file, file)
        with open(self.PICKLES[1], "wb") as file:
            pickle.dump(self.__id_to_filename, file)

        self.current_status = "Indexing - Finished - You can query"

        SC.last_indexing().stop()
        SC.last_indexing().log(files, ignore_case, ignore_stop_words, stemming,
                               use_weights, title_weight, date_weight,
                               memory_limit, use_vbytes)
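
indexing() persists both the inverted file and the id-to-filename map with
pickle. The matching load path is plain pickle.load; a sketch, with
placeholder file names standing in for the real self.PICKLES entries:

import pickle

with open("inv_file.pickle", "rb") as file:        # placeholder path
    inv_file = pickle.load(file)
with open("id_to_filename.pickle", "rb") as file:  # placeholder path
    id_to_filename = pickle.load(file)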
Example #8
    def on_query_complete(self, results):
        print("Query complete !")

        query_stats = StatsControl.last_query()

        loading_box = self.builder.get_object("loading_box")
        loading_box.set_visible(False)

        start_time_tofill = self.builder.get_object("start_time_tofill")
        start_time_tofill.set_text("{:%H:%M:%S.%f}".format(
            query_stats.start_time))

        end_time_tofill = self.builder.get_object("end_time_tofill")
        end_time_tofill.set_text("{:%H:%M:%S.%f}".format(
            query_stats.finish_time))

        total_time_tofill = self.builder.get_object("total_time_tofill")
        total_time_tofill.set_text("{}".format(query_stats.total_time))

        pl_accesses_tofill = self.builder.get_object("pl_accesses_tofill")
        pl_accesses_tofill.set_text(str(query_stats.pl_accesses))

        disk_accesses_tofill = self.builder.get_object("disk_accesses_tofill")
        disk_accesses_tofill.set_text(str(query_stats.memory_accesses))

        results_text = "\t Score     |\tDOCID   |\t   File path \n"
        for result in results:
            results_text += ("\t{:8.5f} |\t{:8} |\t{}".format(
                result[1], result[0], result[2])) + "\n"

        print("results" + results_text)

        results_textview = self.builder.get_object("results_textview")
        results_textview_buffer = results_textview.get_buffer()
        results_textview_buffer.set_text(results_text)

        results_box = self.builder.get_object("results_box")
        results_box.set_visible(True)

        start_query_button = self.builder.get_object("start_query_button")
        start_query_button.set_sensitive(True)
Example #9
    return filelist


file_paths = get_filelist_from_folderpath("latests")

exe = Executable()

algorithm = DEFAULT_ALGORITHM
number_of_results = DEFAULT_NUMBER_OF_RESULTS

memorylimit = 200

exe.indexing(file_paths, memory_limit=memorylimit)

print(SC.last_indexing())

try:
    in_res = int(input("Number of results desired? ").strip())
    number_of_results = in_res
except ValueError:
    print("Non-int value entered, using default {}".format(
        DEFAULT_NUMBER_OF_RESULTS))

print("Algorithm description :")
for (name, desc) in ALGORITHMS_DESC.items():
    print("{}\t- {}".format(name, desc))

in_alg = input("Choose your algorithm: ").strip().upper()
if in_alg in ALGORITHMS_DESC:
    algorithm = in_alg
else:
    algorithm = DEFAULT_ALGORITHM
    print("Unknown algorithm, using default {}".format(DEFAULT_ALGORITHM))
Example #10
    def gen_pl_file(self):
        self._dump(self.tmp_path, self.tmp_voc)

        del self.tmp_voc

        self.tmp_files_path.append(self.tmp_path)

        tmp_files = []
        tmp_used = []
        for path in self.tmp_files_path:
            tmp_used.append(False)
            tmp_files.append(open(path, "r"))

        tmp_lines = []
        for file in tmp_files:
            tmp_lines.append(file.readline())

        self.vocabulary_of_term = SortedDict()
        self.vectors_of_term = SortedDict()  # random-indexing vector per term

        offset = 0

        while True:

            # k-way merge step: find the smallest term among the current
            # lines of all temp files
            min_term = None
            min_lists = []
            for i in range(len(tmp_files)):
                if tmp_used[i] and tmp_lines[i] != '':
                    tmp_lines[i] = tmp_files[i].readline()
                    tmp_used[i] = tmp_lines[i] == ''

                if tmp_lines[i] != '':
                    term = tmp_lines[i].split('\t')[0]
                    if min_term is None or term < min_term:
                        min_term = term
                        min_lists = [i]
                    elif term == min_term:
                        min_lists.append(i)

            if min_term is None:
                break

            pl_size = 0
            pl_string = ""

            for i in min_lists:
                split = tmp_lines[i].split('\t')
                pl_string = "{}{}".format(pl_string,
                                          split[2].replace("\n", ","))
                tmp_used[i] = True

            freq = 0

            term_rdm_index = [0] * RandomIndex.get_n()  # random-indexing accumulator

            with open(self.__postinglist_file_path, "ab") as file:
                is_doc_id = True  # values alternate: doc_id, tf, doc_id, ...
                for val in pl_string.split(","):
                    if val != '':
                        if is_doc_id:
                            # add this document's random-indexing vector
                            # element-wise into the term's vector
                            term_rdm_index = list(
                                map(lambda x, y: x + y, term_rdm_index,
                                    self.doc_id_vectors_list[val]))
                        freq += 1
                        if self.use_vbytes:
                            bytes_val = VariableByte.encoding_number(int(val))
                        else:
                            bytes_val = int(val).to_bytes(4,
                                                          byteorder='big',
                                                          signed=False)

                        pl_size += file.write(bytes_val)
                    is_doc_id = not is_doc_id

            self.vectors_of_term[min_term] = term_rdm_index

            # freq counted doc_ids and tfs alike, so freq / 2 is the term's
            # document frequency
            idf = log10(self.nb_docs / (1 + (freq / 2)))

            self.vocabulary_of_term[min_term] = (offset, pl_size, idf)
            offset += pl_size

        SC.last_indexing().add_pl_size(offset)

        for file in tmp_files:
            file.close()
        for file_path in self.tmp_files_path:
            os.remove(file_path)

        self.__postinglist_gen = FileToPostingLists(
            self.__postinglist_file_path, self.use_vbytes)
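
Examples #1 and #10 rely on a VariableByte class that is not shown. The
usage matches the classic variable-byte codec (7 payload bits per byte, high
bit set on the final byte of each number); this sketch is that textbook
scheme, not necessarily the project's exact implementation:

def vb_encode_number(n):
    """Encode one non-negative integer as variable-byte bytes."""
    parts = []
    while True:
        parts.insert(0, n % 128)
        if n < 128:
            break
        n //= 128
    parts[-1] += 128  # mark the terminating byte
    return bytes(parts)


def vb_decode(data):
    """Decode a variable-byte stream back into a list of integers."""
    numbers, n = [], 0
    for byte in data:
        if byte < 128:
            n = n * 128 + byte
        else:
            numbers.append(n * 128 + (byte - 128))
            n = 0
    return numbers


encoded = b"".join(vb_encode_number(x) for x in (42, 3, 1000, 1))
print(vb_decode(encoded))  # [42, 3, 1000, 1]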