Python Tokenizer.format_data Examples

Programming Language: Python

Namespace/Package Name: src.tokenizer

Class/Type: Tokenizer

Method/Function: format_data

Examples at hotexamples.com: 2

Python Tokenizer.format_data - 2 examples found. These are the top-rated real-world Python examples of src.tokenizer.Tokenizer.format_data, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

Tokenizer(30)

next(14)

_generator(7)

tokenize(6)

finished(3)

detokenize(3)

format_data(2)

create_tokens(2)

load_state_dict(1)

show_tokens(1)

set_title(1)

set_doc_id(1)

scan_source(1)

save_state_dict(1)

peek(1)

normalize(1)

_status(1)

convert_text_to_number(1)

load_config(1)

iter_terms(1)

counter_tokenize(1)

get_doc_id(1)

convert_number_to_text(1)

fit_on_text(1)

filter_new_lines(1)

filter(1)

export_xml(1)

doTokenization(1)

get_title(1)

Example #1

Show file

class DictionaryBuilder(BaseBuilder):
    """Gather the distinct tokens produced from a set of input files.

    The builder keeps a running total of every token seen (``count``) and
    the set of unique tokens (``dictionary``), then writes the unique
    tokens to a file, one per line, in sorted order.
    """

    def __init__(self):
        """Start with an empty dictionary, a zero count and a new tokenizer."""
        super().__init__()
        self.dictionary = set()
        self.count = 0
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        """Execute the full pipeline: load, build, save, report.

        Args:
            input_dir_path: Passed straight to ``load_files`` (not defined
                in this class; presumably it populates ``self.files``).
            output_path: Destination file for the sorted dictionary.
        """
        self.load_files(input_dir_path)
        self.build()
        self.save(output_path)
        self.print_counts()

    def build(self):
        """Tokenize every file in ``self.files`` and accumulate the results."""
        for path in self.files:
            with open(path, 'r') as handle:
                raw_lines = handle.readlines()
                tokens = self.tokenizer.format_data(raw_lines)
                # Total counts duplicates; the set keeps only unique tokens.
                self.count = self.count + len(tokens)
                self.dictionary.update(tokens)

    def save(self, output_path):
        """Write the unique tokens to ``output_path``, sorted, one per line."""
        with open(output_path, 'w') as out:
            ordered_tokens = sorted(self.dictionary)
            out.write('\n'.join(ordered_tokens))

    def print_counts(self):
        """Print the total token count and the number of unique tokens."""
        unique_total = len(self.dictionary)
        print('Total count: ', self.count)
        print('Dictionary count: ', unique_total)

Example #2

Show file

class IdentityMatrixBuilder(BaseBuilder):
    """Build a token-by-file occurrence matrix and write it out as CSV.

    Each matrix row is keyed by a dictionary token and holds one counter
    per input file, in the same order as ``self.files``.
    """

    def __init__(self, dict_path):
        """Remember the dictionary location and create empty working state.

        Args:
            dict_path: Path of the dictionary file read by
                ``load_dictionary``.
        """
        super().__init__()
        self.dict_path = dict_path
        # Starts as an empty set; ``load_dictionary`` replaces it with a
        # sorted list.
        self.dictionary = set()
        self.matrix = {}
        self.tokenizer = Tokenizer()

    def run(self, input_dir_path, output_path):
        """Execute the full pipeline and write the resulting CSV.

        Args:
            input_dir_path: Passed straight to ``load_files`` (not defined
                in this class; presumably it populates ``self.files``).
            output_path: Destination path of the CSV file.
        """
        # The matrix can only be sized once both the dictionary and the
        # file list are available, so the order of these steps matters.
        self.load_dictionary()
        self.load_files(input_dir_path)
        self.init_matrix()
        self.build()
        self.save(output_path)

    def load_dictionary(self):
        """Read the dictionary file and store its entries as a sorted list."""
        with open(self.dict_path, 'r') as f:
            lines = self.tokenizer.filter_new_lines(f.readlines())
            self.dictionary = sorted(lines)

    def init_matrix(self):
        """Create one zero-filled counter row per dictionary token."""
        file_count = len(self.files)
        self.matrix = {token: [0] * file_count for token in self.dictionary}

    def build(self):
        """Count, per file, how often each dictionary token occurs.

        Raises:
            KeyError: If a file yields a token that has no row in the
                matrix, i.e. one that is not in the loaded dictionary.
        """
        for i, file in enumerate(self.files):
            with open(file, 'r') as f:
                lines = f.readlines()
                words = self.tokenizer.format_data(lines)
                for word in words:
                    self.matrix[word][i] += 1

    def save(self, output_path):
        """Write the matrix as CSV: a header row, then one row per token.

        The header is ``Token`` followed by the base name of every input
        file.

        Args:
            output_path: Destination path of the CSV file.
        """
        # Imported locally so this method does not rely on a module-level
        # ``import os`` being present.
        import os

        # ``os.path.basename`` honours the platform's path separators;
        # splitting on '/' left full paths in the header on Windows.
        field_names = ['Token'] + [
            os.path.basename(path) for path in self.files
        ]
        # newline='' lets the csv module control line endings itself.
        with open(output_path, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(field_names)
            writer.writerows(
                [token] + counts for token, counts in self.matrix.items()
            )