Example #1
0
    def _save_block(self, file_num, tokens):
        """
        Write a block of raw tokens into the line buffer.

        Each raw token is stripped, re-tokenized, and every resulting
        non-empty token is appended as a "<token> <file_num>\\n" line.

        :param file_num:     The file index.
        :type  file_num:     int
        :param tokens:       List of tokens to save.
        :type  tokens:       list
        """
        for raw in tokens:
            for word in tokenize(raw.strip()):
                if not word:
                    continue
                entry = word + ' ' + str(file_num) + "\n"
                self._lines.append(entry)
                # NOTE(review): flushes when a single entry exceeds the block
                # size — presumably _save_line_buffer drains self._lines;
                # confirm whether total buffer size was intended instead.
                if len(entry) > self.block_size:
                    self._save_line_buffer()
Example #2
0
 def construct(self):
     """
     Construct the in-memory inverted index and pickle it to disk.

     Postings are gap-encoded (lossless compression): for each token the
     first entry is an absolute document id and every following entry is
     the difference to the previous document containing the token.
     The pickled index is the tuple ``(postings, dictionary, files)``.
     """
     dictionary = dict()   # token -> token id
     postings = dict()     # token id -> deque of gap-encoded doc ids
     last_docs = dict()    # token id -> last doc id containing the token
     files = dict()        # doc id -> file path
     token_id = 0
     num_files = len(self._files)
     for doc_id in range(num_files):
         # Loop through all documents
         file = self._files[doc_id]
         files[doc_id] = file
         with open(file, 'r') as handle:
             # Open the file and tokenize
             tokens = tokenize(handle.read())
             for token in tokens:
                 # Update the dictionary and postings
                 if token not in dictionary:
                     # First occurrence ever: store the absolute doc id
                     dictionary[token] = token_id
                     postings[token_id] = deque([doc_id])
                     last_docs[token_id] = doc_id
                     token_id += 1
                 else:
                     found_token_id = dictionary[token]
                     gap = doc_id - last_docs[found_token_id]
                     # BUGFIX: a repeated occurrence within the same document
                     # produced a zero gap, which duplicates the doc id when
                     # the gaps are decoded. Record each document only once.
                     if gap > 0:
                         postings[found_token_id].append(gap)
                         last_docs[found_token_id] = doc_id
         if self.show_progress:
             sys.stdout.write("\rIndexing: " + '{0} / {1}'.format(doc_id, num_files))
             sys.stdout.flush()
     # The index construction is done
     if self.show_progress:
         sys.stdout.write("\rIndex construction: done\n")
         sys.stdout.flush()
     # Save the index
     index = (postings, dictionary, files)
     with open(self.get_index_path(), 'wb') as handle:
         pickle.dump(index, handle)
Example #3
0
    def construct(self):
        """
        Construct the index by block-wise tokenization of every input file.

        Each file is read in chunks of ``block_size + 1`` characters.
        Because a chunk boundary can split a token in two, a buffer carries
        the possibly-unfinished trailing token over to the next chunk.
        Token blocks are saved to a temporary directory, merged into the
        final index, and the temporary directory is removed afterwards.
        """
        # Start from a clean temporary block directory
        if os.path.exists(self._block_path):
            shutil.rmtree(self._block_path)
        os.mkdir(self._block_path)
        self._lines = []
        self._block_num = 0
        for file_num, file in enumerate(self._files):
            # BUGFIX: use a context manager so the handle is closed even if
            # tokenize/_save_block raises (the original leaked it on error).
            with open(file) as file_pointer:
                buffered_token = ''
                while True:
                    block = file_pointer.read(self.block_size + 1)
                    if not block:
                        break
                    tokens = tokenize(block)

                    # Only do something if there are found tokens
                    if len(tokens) == 0:
                        continue

                    # Detect whether the chunk ends in the middle of a token
                    ends_with_token = (tokens[-1][-1] == block[-1])

                    # Detect whether the chunk begins in the middle of a token
                    starts_with_token = (tokens[0][0] == block[0])

                    # Check whether the whole chunk is a single token
                    block_is_token = (tokens[0] == block)

                    if block_is_token:
                        # The whole chunk is one token: keep buffering it
                        buffered_token += tokens[0]
                        continue

                    # Otherwise there are at least two tokens in the chunk
                    block_tokens = []
                    if starts_with_token:
                        # Complete the token carried over from the last chunk
                        block_tokens.append(buffered_token + tokens[0])
                        buffered_token = ''
                        tokens = tokens[1:]
                    else:
                        # The carried-over token (possibly empty) is complete
                        block_tokens.append(buffered_token)
                        buffered_token = ''
                    if ends_with_token:
                        # The last token may continue in the next chunk
                        buffered_token = tokens[-1]
                        tokens = tokens[:-1]
                    block_tokens.extend(tokens)

                    # Save the block tokens
                    if len(block_tokens) > 0:
                        self._save_block(file_num, block_tokens)

                if len(buffered_token) > 0:
                    # Flush the final unfinished token of this file
                    self._save_block(file_num, [buffered_token])

            # Display the progress (file_num + 1 files completed)
            if self.show_progress:
                sys.stdout.write("\rIndex construction: " + '{0} / {1}'.format(file_num + 1, len(self._files)))
                sys.stdout.flush()

        # Save the line buffer if there is any left
        self._save_line_buffer()

        # The index construction is done
        if self.show_progress:
            sys.stdout.write("\rIndex construction: done\n")
            sys.stdout.flush()

        # Merge the blocks
        self._merge_blocks()

        # Clean up the temporary block folder
        shutil.rmtree(self._block_path)