Beispiel #1
0
    def make_feature(self, txt):
        """Build character-level model inputs for a raw text string.

        The text is split into syllables with
        ``preprocessing.syllable_tokenize``; every character of every
        syllable then contributes three integer ids: its character id
        (looked up in ``self.ch_dict``), its character-type id (from
        ``char_type.get_char_type_ix``) and the id of the syllable it
        belongs to (looked up in ``self.sy_dict``).

        Args:
            txt: the input text.

        Returns:
            A pair ``(chars, (features, seq_lengths))`` where ``chars`` is
            ``list(txt)``, ``features`` is an int64 tensor of shape
            ``(1, 3, n)`` whose rows are character ids, character-type ids
            and syllable ids (in that order), and ``seq_lengths`` is a
            one-element int64 tensor holding ``n``.
        """
        tokens = preprocessing.syllable_tokenize(txt)

        syllable_vocab = self.sy_dict
        char_vocab = self.ch_dict

        char_ids = []
        char_type_ids = []
        syllable_ids = []

        for token in tokens:
            token_id = preprocessing.syllable2ix(syllable_vocab, token)
            token_chars = list(token)
            token_char_ids = [
                preprocessing.character2ix(char_vocab, c) for c in token_chars
            ]

            char_ids.extend(token_char_ids)
            char_type_ids.extend(char_type.get_char_type_ix(token_chars))
            # Repeat the syllable id once per character so all rows align.
            syllable_ids.extend([token_id] * len(token_char_ids))

        stacked = np.stack((char_ids, char_type_ids, syllable_ids), axis=0)
        # Add a leading batch axis of size one: (3, n) -> (1, 3, n).
        features = stacked.reshape((1, 3, -1)).astype(np.int64)

        n_positions = features.shape[-1]
        seq_lengths = np.array([n_positions], dtype=np.int64)

        model_inputs = (
            torch.from_numpy(features),
            torch.from_numpy(seq_lengths),
        )
        return list(txt), model_inputs
Beispiel #2
0
    def _process_training_line(self, syllables, w_bi_labels, output_scheme):
        """Turn one training line into syllable-level inputs and targets.

        Args:
            syllables: the syllables of the line.
            w_bi_labels: one label per syllable; returned unchanged as the
                target.
            output_scheme: accepted for signature compatibility; not used
                by this implementation.

        Returns:
            ``((x, length), y)`` where ``x`` is a numpy array of syllable
            ids looked up in ``self.sy_dict``, ``length`` is
            ``len(w_bi_labels)`` and ``y`` is ``w_bi_labels`` itself.
        """
        syllable_ids = [
            preprocessing.syllable2ix(self.sy_dict, syl) for syl in syllables
        ]
        inputs = np.array(syllable_ids)

        n_labels = len(w_bi_labels)
        # Every syllable must come with exactly one label.
        assert len(syllable_ids) == n_labels

        return (inputs, n_labels), w_bi_labels
Beispiel #3
0
    def make_feature(self, txt):
        """Build syllable-level model inputs for a raw text string.

        Args:
            txt: the input text.

        Returns:
            A pair ``(syllables, (features, seq_lengths))`` where
            ``syllables`` is the result of
            ``preprocessing.syllable_tokenize(txt)``, ``features`` is a 1-D
            int64 tensor of syllable ids looked up in ``self.sy_dict``, and
            ``seq_lengths`` is a one-element int64 tensor holding the
            number of syllable ids.
        """
        syllables = preprocessing.syllable_tokenize(txt)
        vocab = self.sy_dict

        ids = [preprocessing.syllable2ix(vocab, syl) for syl in syllables]

        # dims: (len,)
        id_array = np.array(ids).astype(np.int64)
        id_array = id_array.reshape(-1)

        lengths = np.array([id_array.shape[-1]], dtype=np.int64)

        model_inputs = (torch.from_numpy(id_array), torch.from_numpy(lengths))
        return syllables, model_inputs
Beispiel #4
0
    def _process_training_line(self, syllables, w_bi_labels, output_scheme):
        """Turn one training line into character-level inputs and targets.

        Each syllable is expanded into its characters. The first character
        of a syllable receives the syllable's label and the remaining
        characters receive ``0``. An empty syllable is kept as a single
        empty-string character carrying the syllable's label.

        Args:
            syllables: the syllables of the line.
            w_bi_labels: one label per syllable.
            output_scheme: object whose ``encode(labels, syllable_indices)``
                produces the final target sequence.

        Returns:
            ``((x, length), y)`` where ``x`` stacks character ids,
            character-type ids and syllable ids along axis 0, ``y`` is the
            result of ``output_scheme.encode`` and ``length`` is ``len(y)``.
        """
        assert len(syllables) == len(w_bi_labels)

        chars = []
        char_labels = []
        syllable_ids = []

        # Walk the syllables together with their labels.
        for syl, syl_label in zip(syllables, w_bi_labels):
            n_chars = len(syl)
            if n_chars > 0:
                syl_chars = list(syl)
                expanded = [syl_label] + [0] * (n_chars - 1)
            else:
                # Keep one placeholder position for an empty syllable.
                n_chars = 1
                syl_chars = [""]
                expanded = [syl_label]

            assert len(syl_chars) == len(expanded) == n_chars, "%d vs %d vs %d" % (
                len(syl_chars), len(expanded), n_chars)

            chars.extend(syl_chars)
            char_labels.extend(expanded)

            syl_id = preprocessing.syllable2ix(self.sy_dict, syl)
            syllable_ids.extend([syl_id] * n_chars)

        targets = np.array(char_labels).astype(int)

        char_ids = np.array(
            [preprocessing.character2ix(self.ch_dict, c) for c in chars]
        ).astype(int)
        char_type_ids = np.array(char_type.get_char_type_ix(chars)).astype(int)

        x = np.stack((char_ids, char_type_ids, syllable_ids), axis=0)

        targets = output_scheme.encode(targets, syllable_ids)

        # The encoded targets must line up with every feature row.
        assert len(targets) == len(char_ids)
        assert len(targets) == len(char_type_ids)
        assert len(targets) == len(syllable_ids)

        return (x, len(targets)), targets
Beispiel #5
0
def _encode_syllable_line(syllables, label, ch2ix, sy2ix):
    """Expand one line of syllables into per-character sequences.

    Args:
        syllables: the syllables of the line (empty strings are skipped).
        label: one flag character per syllable; ``"1"`` marks a syllable
            that starts a word.
        ch2ix: character vocabulary passed to ``preprocessing.character2ix``.
        sy2ix: syllable vocabulary passed to ``preprocessing.syllable2ix``.

    Returns:
        ``(char_labels, chars_idx, syllable_idx)``: three equally long
        lists holding, per character, its ``"0"``/``"1"`` label, its
        character index and the index of the syllable it belongs to.
    """
    char_labels, chars_idx, syllable_idx = [], [], []

    syllable_indices = [preprocessing.syllable2ix(sy2ix, sy) for sy in syllables]

    for ii, (syllable, six, flag) in enumerate(
            zip(syllables, syllable_indices, label)):
        if not syllable:
            continue

        if syllable == " ":
            # A space is split off only when the syllable after it begins a
            # word. A space with no successor is treated as "0" instead of
            # indexing past the end of the labels.
            next_is_begin = ii + 1 < len(label) and label[ii + 1] == "1"
            flag = "1" if next_is_begin else "0"

        chs = [preprocessing.character2ix(ch2ix, c) for c in syllable]
        total_chs = len(chs)

        syllable_idx.extend([six] * total_chs)
        chars_idx.extend(chs)

        if flag == "1":
            # Only the first character of a word-initial syllable is "1".
            char_labels.extend(["1"] + ["0"] * (total_chs - 1))
        else:
            char_labels.extend(["0"] * total_chs)

    return char_labels, chars_idx, syllable_idx


def prepare_syllable_charater_seq_data(files,
                                       ch2ix,
                                       sy2ix,
                                       sampling=10,
                                       output_dir=""):
    """Convert syllable-segmented text files into character-sequence data.

    For every input file ``<path>`` a sibling label file is read from
    ``path.replace(".txt", ".label")``. Lines of the text file hold
    syllables separated by ``~``; the matching label line holds one flag
    character per syllable. Each non-empty line is written to the output
    as ``<char labels>::<char indices>::<syllable indices>``, with the
    indices space-separated.

    Args:
        files: pair ``(training, validation)`` of sequences of file paths.
        ch2ix: character vocabulary passed to ``preprocessing.character2ix``.
        sy2ix: syllable vocabulary passed to ``preprocessing.syllable2ix``.
        sampling: number of leading files kept from each split. A falsy
            value (``0`` or ``None``) keeps every file and is rendered as
            ``0`` in the output directory name.
        output_dir: parent directory; the data is written to
            ``<output_dir>/best-syllable-crf-and-character-seq-feature-sampling-<n>``
            as ``training.txt`` and ``val.txt``.

    Raises:
        SystemExit: when a line's syllable count differs from the number
            of label characters.
    """
    # NOTE(review): files are opened with the platform default encoding;
    # confirm whether the corpus should be read explicitly as UTF-8.
    training, validation = files

    if sampling:
        training = training[:sampling]
        validation = validation[:sampling]

    # "%d" rejects None, so a disabled sampling is formatted as 0.
    output_dir = "%s/best-syllable-crf-and-character-seq-feature-sampling-%d" % (
        output_dir, sampling or 0)

    print("Saving data to %s" % output_dir)
    utils.maybe_create_dir(output_dir)
    for name, dataset in zip(("training", "val"), (training, validation)):
        print("working on : %s" % name)
        with open("%s/%s.txt" % (output_dir, name), "w") as fout_txt:
            for path in dataset:
                label_path = path.replace(".txt", ".label")
                with open(path, "r") as fin, open(label_path, "r") as flab:
                    # Reported at most once per input file.
                    has_space_problem = False

                    for txt, label in zip(fin, flab):
                        txt = txt.strip().replace("~~", "~")

                        if not txt:
                            continue

                        label = label.strip()
                        syllables = txt.split("~")

                        if len(syllables) != len(label):
                            print(txt, path)
                            print(len(syllables), len(label))
                            print(syllables)
                            print(label)
                            raise SystemExit(
                                "syllable/label count mismatch in %s" % path)

                        char_labels, chars_idx, syllable_idx = \
                            _encode_syllable_line(syllables, label, ch2ix, sy2ix)

                        assert len(char_labels) == len(chars_idx)

                        # check space problem
                        # NOTE(review): 3 is presumably the index of the
                        # space character in ch2ix - confirm.
                        if not has_space_problem:
                            for cix, clb in zip(chars_idx, char_labels):
                                if cix == 3 and clb == "0":
                                    has_space_problem = True
                                    print(txt)
                                    break

                        fout_txt.write("%s::%s::%s\n" % (
                            "".join(char_labels),
                            " ".join(np.array(chars_idx).astype(str)),
                            " ".join(np.array(syllable_idx).astype(str)),
                        ))

                    if has_space_problem:
                        print("problem with space in %s" % path)