def __init__(self,
                 source,
                 batch_size,
                 charset,
                 max_text_length=128,
                 predict=False):
        self.tokenizer = Tokenizer(charset, max_text_length)
        self.batch_size = batch_size
        self.partitions = ['test'] if predict else ['train', 'valid', 'test']

        self.size = dict()
        self.steps = dict()
        self.index = dict()

        self.dataset = reader.read_from_txt(source)
        self.arange = np.arange(len(self.dataset['train']['gt']))
        np.random.seed(42)

        for pt in self.partitions:
            self.dataset[pt]['dt'] = np.array(
                [pp.text_standardize(x) for x in self.dataset[pt]['dt']])
            self.dataset[pt]['gt'] = np.array(
                [pp.text_standardize(x) for x in self.dataset[pt]['gt']])

            self.size[pt] = len(self.dataset[pt]['gt'])
            self.steps[pt] = int(np.ceil(self.size[pt] / self.batch_size))

        self.one_hot_process = True
        # enable noise generation when the train inputs (dt) are empty,
        # i.e. there is no corrupted text to pair with the ground truth
        self.noise_process = not bool(
            max(self.dataset['train']['dt'], default=['']))

        # increase the `iterations` default of pp.add_noise when noise will be generated for the train data
        if self.noise_process:
            ratio, iterations = pp.add_noise.__defaults__
            pp.add_noise.__defaults__ = (ratio, iterations + 2)

    def preprocess_partitions(self, input_size):
        """Preprocess images and sentences from partitions"""

        for y in self.partitions:
            arange = range(len(self.dataset[y]['gt']))

            for i in reversed(arange):
                text = pp.text_standardize(self.dataset[y]['gt'][i])

                if not self.check_text(text):
                    self.dataset[y]['gt'].pop(i)
                    self.dataset[y]['dt'].pop(i)
                    continue

                self.dataset[y]['gt'][i] = text.encode()

            results = []
            with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
                print(f"Partition: {y}")
                for result in tqdm(pool.imap(
                        partial(pp.preprocess, input_size=input_size),
                        self.dataset[y]['dt']),
                                   total=len(self.dataset[y]['dt'])):
                    results.append(result)
                pool.close()
                pool.join()

            self.dataset[y]['dt'] = results

    def read_lines(self, maxlen):
        """Read sentences from dataset and preprocess"""

        name = os.path.basename(self.source)
        print(f"The {name} dataset will be transformed...")

        dataset = getattr(self, f"_{name}")()

        if not isinstance(dataset, list):
            dataset = dataset['train'] + dataset['valid'] + dataset['test']

        dataset = [y for x in dataset for y in pp.generate_multigrams(x)]
        dataset = [
            y for x in dataset for y in pp.split_by_max_length(x, maxlen)
        ]

        dataset = [pp.text_standardize(x) for x in dataset]
        dataset = [x for x in dataset if self.check_text(x)]

        dataset = list(set(dataset))
        np.random.shuffle(dataset)

        index = int(len(dataset) * 0.1)
        self.dataset['train'] = dataset[index:]
        self.dataset['valid'] = dataset[:index]
        self.dataset['test'] = dataset[:32]  # just a sample
        del dataset

        self.size['total'] = 0

        for pt in self.partitions:
            self.size[pt] = len(self.dataset[pt])
            self.size['total'] += self.size[pt]

    def decode(self, text):
        """Decode vector to text"""

        decoded = "".join([self.chars[int(x)] for x in text if x > -1])
        decoded = self.remove_tokens(decoded)
        decoded = pp.text_standardize(decoded)

        return decoded
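The decode method maps integer indices back to characters, skips the -1 values used as padding, strips the special tokens and re-standardizes the text. A minimal standalone sketch of the same idea, where the character list, the reserved marker and the whitespace handling are illustrative assumptions rather than the project's actual Tokenizer:

import re

# hypothetical character list; index 0 is reserved for a special marker
chars = ["¤"] + list(" abcdefghijklmnopqrstuvwxyz")

def decode_vector(vector, chars=chars):
    # drop the -1 padding and map the remaining indices to characters
    text = "".join(chars[int(x)] for x in vector if x > -1)
    # strip the reserved marker (stand-in for self.remove_tokens)
    text = text.replace("¤", "")
    # collapse repeated whitespace (rough stand-in for pp.text_standardize)
    return re.sub(r"\s+", " ", text).strip()

print(decode_vector([2, 3, 1, 5, 6, -1, -1]))  # -> "ab de"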
Example 5
    def decode(self, text):
        """metin vektörünün kodunu çözme..."""

        decoded = "".join([self.chars[int(x)] for x in text if x > -1])
        decoded = self.remove_tokens(decoded)
        decoded = pp.text_standardize(decoded)

        return decoded

    def preprocess_partitions(self, image_input_size):
        """Preprocess images and sentences from partitions"""

        for i in self.partitions:
            self.dataset[i]["gt"] = [
                pp.text_standardize(x).encode() for x in self.dataset[i]["gt"]
            ]

            pool = Pool()
            self.dataset[i]["dt"] = pool.map(
                partial(pp.preproc, img_size=image_input_size),
                self.dataset[i]["dt"])
            pool.close()
            pool.join()
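pp.preproc is applied to every image path in parallel; pool.map preserves the input order, so the preprocessed images stay aligned with their ground-truth sentences. A rough, self-contained stand-in for such a preprocessing function (the real pp.preproc very likely does more, e.g. illumination and slant correction; the function name and target shape here are assumptions):

import cv2
import numpy as np

def preproc_sketch(img_path, img_size=(1024, 128, 1)):
    # read as grayscale and resize to the model's fixed input size
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_size[0], img_size[1]), interpolation=cv2.INTER_AREA)
    # add the channel axis expected by the network
    return np.expand_dims(img, axis=-1)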
Example 7
    def check_text(data):
        """Checks if the text has more characters instead of punctuation marks"""

        for i in reversed(range(len(data['gt']))):
            text = pp.text_standardize(data['gt'][i])
            strip_punc = text.strip(string.punctuation).strip()
            no_punc = text.translate(str.maketrans(
                "", "", string.punctuation)).strip()

            if len(text) <= 1 or len(strip_punc) <= 1 or len(no_punc) <= 1:
                data['gt'].pop(i)
                data['dt'].pop(i)
                continue

        return data

    def check_text(data, max_text_length=128):
        """Drop entries whose text is too short, too long, or contains only punctuation marks"""

        for i in reversed(range(len(data['gt']))):
            text = pp.text_standardize(data['gt'][i])
            strip_punc = text.strip(string.punctuation).strip()
            no_punc = text.translate(str.maketrans("", "", string.punctuation)).strip()

            length_valid = (len(text) > 1) and (len(text) < max_text_length)
            text_valid = (len(strip_punc) > 1) or (len(no_punc) > 1)

            if (not length_valid) or (not text_valid):
                data['gt'].pop(i)
                data['dt'].pop(i)
                continue

        return data
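A quick illustration of how this filter behaves on a tiny, made-up dataset (assuming pp.text_standardize only trims and collapses whitespace):

data = {
    'gt': ["hello world", "...", "a", "ok then"],
    'dt': ["img_0.png", "img_1.png", "img_2.png", "img_3.png"],
}
data = check_text(data, max_text_length=128)
# "..." (punctuation only) and "a" (length <= 1) are removed together with
# their paired 'dt' entries, leaving "hello world" and "ok then".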
Example 9
    def preprocess_partitions(self, input_size):
        """Preprocess images and sentences from partitions"""

        for y in self.partitions:
            arange = range(len(self.dataset[y]['gt']))

            for i in reversed(arange):
                text = pp.text_standardize(self.dataset[y]['gt'][i])

                if not self.check_text(text):
                    self.dataset[y]['gt'].pop(i)
                    self.dataset[y]['dt'].pop(i)
                    continue

                self.dataset[y]['gt'][i] = text.encode()

            pool = Pool()
            self.dataset[y]['dt'] = pool.map(
                partial(pp.preprocess, input_size=input_size),
                self.dataset[y]['dt'])
            pool.close()
            pool.join()

    def preprocess_partitions(self, input_size):
        """Preprocess images and sentences from partitions"""

        for y in self.partitions:
            arange = range(len(self.dataset[y]['gt']))

            for i in reversed(arange):
                text = pp.text_standardize(self.dataset[y]['gt'][i])

                if not self.check_text(text):
                    self.dataset[y]['gt'].pop(i)
                    self.dataset[y]['dt'].pop(i)
                    continue

                self.dataset[y]['gt'][i] = text.encode()

            pool = Pool()
            self.dataset[y]['dt'] = pool.map(
                partial(pp.preprocess, input_size=input_size),
                self.dataset[y]['dt'])
            pool.close()
            pool.join()
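Both variants iterate the indices in reverse while popping, so removing an entry never shifts the index of an element that has not yet been visited; a tiny illustration of why the direction matters:

items = ["keep", "drop", "keep", "drop"]
for i in reversed(range(len(items))):
    if items[i] == "drop":
        items.pop(i)      # only already-visited indices shift
print(items)              # ['keep', 'keep']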
Example 11
                                              dtgen.dataset['test']['gt'])

                    with open(os.path.join(output_path, "corpus.txt"),
                              "w") as lg:
                        lg.write(corpus)

            elif args.test:
                if args.mode != "kaldi":
                    lm.read_corpus(
                        corpus_path=os.path.join(output_path, "corpus.txt"))

                start_time = datetime.datetime.now()

                predicts = lm.autocorrect(
                    sentences=dtgen.dataset['test']['dt'])
                predicts = [pp.text_standardize(x) for x in predicts]

                total_time = datetime.datetime.now() - start_time

                old_metric, new_metric = ev.ocr_metrics(
                    ground_truth=dtgen.dataset['test']['gt'],
                    data=dtgen.dataset['test']['dt'],
                    predict=predicts,
                    norm_accentuation=args.norm_accentuation,
                    norm_punctuation=args.norm_punctuation)

                p_corpus, e_corpus = report(dtgen=dtgen,
                                            predicts=predicts,
                                            metrics=[old_metric, new_metric],
                                            total_time=total_time,
                                            plus=f"N: {args.N}\n")
Example 12
                                     ctc_decode=False,
                                     verbose=1)

            # get data and ground truth lists
            ctc_TK, space_TK = "<ctc>", "<space>"
            multigrams, multigrams_size = dict(), 0
            ground_truth = []

            # generate multigrams to compose the dataset
            for pt in dtgen.partitions:
                multigrams[pt] = [
                    pp.generate_multigrams(x) for x in dtgen.dataset[pt]['gt']
                ]
                multigrams[pt] = list(
                    set([
                        pp.text_standardize(y) for x in multigrams[pt]
                        for y in x
                    ]))

                multigrams[pt] = [
                    x for x in multigrams[pt] if Dataset.check_text(x)
                ]
                multigrams_size += len(multigrams[pt])

                for x in multigrams[pt]:
                    ground_truth.append(
                        [space_TK if y == " " else y for y in list(f" {x} ")])

                for x in dtgen.dataset[pt]['gt']:
                    ground_truth.append(
                        [space_TK if y == " " else y for y in list(f" {x} ")])
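Each sentence is wrapped in a leading and trailing space and every space is replaced by an explicit <space> symbol, presumably so the character-level language model assets can treat word boundaries as regular tokens; for example:

space_TK = "<space>"
x = "ab c"
tokens = [space_TK if y == " " else y for y in list(f" {x} ")]
# tokens == ['<space>', 'a', 'b', '<space>', 'c', '<space>']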
Example 13
        elif args.kaldi_assets:
            predicts = model.predict(x=dtgen.next_test_batch(),
                                     steps=dtgen.steps['test'],
                                     ctc_decode=False,
                                     verbose=1)

            # get data and ground truth lists
            ctc_TK, space_TK = "<ctc>", "<space>"
            multigrams, multigrams_size = dict(), 0
            ground_truth = []

            # generate multigrams to compose the dataset
            for pt in dtgen.partitions:
                multigrams[pt] = [pp.generate_multigrams(x) for x in dtgen.dataset[pt]['gt']]
                multigrams[pt] = list(set([pp.text_standardize(y) for x in multigrams[pt] for y in x]))

                multigrams[pt] = [x for x in multigrams[pt] if Dataset.check_text(x)]
                multigrams_size += len(multigrams[pt])

                for x in multigrams[pt]:
                    ground_truth.append([space_TK if y == " " else y for y in list(f" {x} ")])

                for x in dtgen.dataset[pt]['gt']:
                    ground_truth.append([space_TK if y == " " else y for y in list(f" {x} ")])

            # define dataset size and default tokens
            train_size = dtgen.size['train'] + dtgen.size['valid'] + multigrams_size

            # get chars list and save with the ctc and space tokens
            chars = list(dtgen.tokenizer.chars) + [ctc_TK]
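Appending ctc_TK to the character list gives the CTC blank class its own symbol, so every output index of the network, blank included, can be mapped to a token when the Kaldi assets are written. A sketch of that one-to-one mapping, where the (timesteps, classes) output shape and the blank-last convention are assumptions:

import numpy as np

chars = list("abc ") + ["<ctc>"]                  # blank assumed to be the last class
frame_probs = np.random.rand(10, len(chars))      # hypothetical per-frame class probabilities
best = frame_probs.argmax(axis=-1)
symbols = [chars[i] for i in best]                # each frame gets a symbol, blank included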
Example 14
    max_text_length = 128
    charset_base = "".join([chr(i) for i in range(32, 127)])

    if args.transform:
        assert os.path.exists(raw_path)
        print(f"The {args.dataset} dataset will be transformed...")

        mod = importlib.import_module(f"transform.{args.dataset}")
        os.makedirs(os.path.dirname(hdf5_src), exist_ok=True)

        dtgen = mod.Dataset(partitions=["train", "valid", "test"])
        dataset = dtgen.get_partitions(source=raw_path)

        for i in dtgen.partitions:
            dataset[i]["gt"] = [
                pp.text_standardize(x).encode() for x in dataset[i]["gt"]
            ]

            pool = Pool()
            dataset[i]["dt"] = pool.map(
                partial(pp.preproc, img_size=input_size), dataset[i]["dt"])
            pool.close()
            pool.join()

            with h5py.File(hdf5_src, "a") as hf:
                hf.create_dataset(f"{i}/dt",
                                  data=dataset[i]["dt"],
                                  compression="gzip",
                                  compression_opts=9)
                hf.create_dataset(f"{i}/gt",
                                  data=dataset[i]["gt"],