Example #1
0
    def _save_block(self, file_num, tokens):
        """
        Write a block of raw tokens into the line buffer.

        Each raw token is stripped, re-tokenized, and every resulting
        non-empty token is appended as a "<token> <file_num>\\n" line.

        :param file_num:     The file index.
        :type  file_num:     int
        :param tokens:       List of tokens to save.
        :type  tokens:       list
        """
        for raw in tokens:
            for word in tokenize(raw.strip()):
                if not word:
                    continue
                entry = word + ' ' + str(file_num) + "\n"
                self._lines.append(entry)
                # NOTE(review): flushes when a single entry exceeds the block
                # size — presumably _save_line_buffer drains self._lines;
                # confirm whether total buffer size was intended instead.
                if len(entry) > self.block_size:
                    self._save_line_buffer()
Example #2
0
 def construct(self):
     """
     Construct the in-memory inverted index and pickle it to disk.

     Postings are gap-encoded (lossless compression): for each token the
     first entry is an absolute document id and every following entry is
     the difference to the previous document containing the token.
     The pickled index is the tuple ``(postings, dictionary, files)``.
     """
     dictionary = dict()   # token -> token id
     postings = dict()     # token id -> deque of gap-encoded doc ids
     last_docs = dict()    # token id -> last doc id containing the token
     files = dict()        # doc id -> file path
     token_id = 0
     num_files = len(self._files)
     for doc_id in range(num_files):
         # Loop through all documents
         file = self._files[doc_id]
         files[doc_id] = file
         with open(file, 'r') as handle:
             # Open the file and tokenize
             tokens = tokenize(handle.read())
             for token in tokens:
                 # Update the dictionary and postings
                 if token not in dictionary:
                     # First occurrence ever: store the absolute doc id
                     dictionary[token] = token_id
                     postings[token_id] = deque([doc_id])
                     last_docs[token_id] = doc_id
                     token_id += 1
                 else:
                     found_token_id = dictionary[token]
                     gap = doc_id - last_docs[found_token_id]
                     # BUGFIX: a repeated occurrence within the same document
                     # produced a zero gap, which duplicates the doc id when
                     # the gaps are decoded. Record each document only once.
                     if gap > 0:
                         postings[found_token_id].append(gap)
                         last_docs[found_token_id] = doc_id
         if self.show_progress:
             sys.stdout.write("\rIndexing: " + '{0} / {1}'.format(doc_id, num_files))
             sys.stdout.flush()
     # The index construction is done
     if self.show_progress:
         sys.stdout.write("\rIndex construction: done\n")
         sys.stdout.flush()
     # Save the index
     index = (postings, dictionary, files)
     with open(self.get_index_path(), 'wb') as handle:
         pickle.dump(index, handle)
Example #3
0
    def construct(self):
        """
        Construct the index by block-wise tokenization of every input file.

        Each file is read in chunks of ``block_size + 1`` characters.
        Because a chunk boundary can split a token in two, a buffer carries
        the possibly-unfinished trailing token over to the next chunk.
        Token blocks are saved to a temporary directory, merged into the
        final index, and the temporary directory is removed afterwards.
        """
        # Start from a clean temporary block directory
        if os.path.exists(self._block_path):
            shutil.rmtree(self._block_path)
        os.mkdir(self._block_path)
        self._lines = []
        self._block_num = 0
        for file_num, file in enumerate(self._files):
            # BUGFIX: use a context manager so the handle is closed even if
            # tokenize/_save_block raises (the original leaked it on error).
            with open(file) as file_pointer:
                buffered_token = ''
                while True:
                    block = file_pointer.read(self.block_size + 1)
                    if not block:
                        break
                    tokens = tokenize(block)

                    # Only do something if there are found tokens
                    if len(tokens) == 0:
                        continue

                    # Detect whether the chunk ends in the middle of a token
                    ends_with_token = (tokens[-1][-1] == block[-1])

                    # Detect whether the chunk begins in the middle of a token
                    starts_with_token = (tokens[0][0] == block[0])

                    # Check whether the whole chunk is a single token
                    block_is_token = (tokens[0] == block)

                    if block_is_token:
                        # The whole chunk is one token: keep buffering it
                        buffered_token += tokens[0]
                        continue

                    # Otherwise there are at least two tokens in the chunk
                    block_tokens = []
                    if starts_with_token:
                        # Complete the token carried over from the last chunk
                        block_tokens.append(buffered_token + tokens[0])
                        buffered_token = ''
                        tokens = tokens[1:]
                    else:
                        # The carried-over token (possibly empty) is complete
                        block_tokens.append(buffered_token)
                        buffered_token = ''
                    if ends_with_token:
                        # The last token may continue in the next chunk
                        buffered_token = tokens[-1]
                        tokens = tokens[:-1]
                    block_tokens.extend(tokens)

                    # Save the block tokens
                    if len(block_tokens) > 0:
                        self._save_block(file_num, block_tokens)

                if len(buffered_token) > 0:
                    # Flush the final unfinished token of this file
                    self._save_block(file_num, [buffered_token])

            # Display the progress (file_num + 1 files completed)
            if self.show_progress:
                sys.stdout.write("\rIndex construction: " + '{0} / {1}'.format(file_num + 1, len(self._files)))
                sys.stdout.flush()

        # Save the line buffer if there is any left
        self._save_line_buffer()

        # The index construction is done
        if self.show_progress:
            sys.stdout.write("\rIndex construction: done\n")
            sys.stdout.flush()

        # Merge the blocks
        self._merge_blocks()

        # Clean up the temporary block folder
        shutil.rmtree(self._block_path)