Example #1
    def __init__(self, mode, params):
        super().__init__(mode, params)

        # Load vocab and tag list
        self.char_to_idx = load_tkn_to_idx(os.path.join(self.get_processed_data_dir(), "vocab", "chars.txt"))
        self.word_to_idx = load_tkn_to_idx(os.path.join(self.get_processed_data_dir(), "vocab", "words.txt"))
        self.idx_to_word = load_idx_to_tkn(os.path.join(self.get_processed_data_dir(), "vocab", "words.txt"))
        self.tag_to_idx = load_tkn_to_idx(os.path.join(
            self.get_processed_data_dir(),
            "vocab",
            "seg_tags.txt" if self.params.dataset.tag_type == "seg" else "pos_tags.txt"))
        self.idx_to_tag = load_idx_to_tkn(os.path.join(
            self.get_processed_data_dir(),
            "vocab",
            "seg_tags.txt" if self.params.dataset.tag_type == "seg" else "pos_tags.txt"))

        def _add_tag(tag):
            self.tag_to_idx[tag] = len(self.tag_to_idx)
            self.idx_to_tag.append(tag)

        if '<pad>' not in self.tag_to_idx:
            _add_tag('<pad>')
        if '<sos>' not in self.tag_to_idx:
            _add_tag('<sos>')
        if '<eos>' not in self.tag_to_idx:
            _add_tag('<eos>')

        self.vocab_char_size = len(self.char_to_idx)
        self.vocab_word_size = len(self.idx_to_word)
        self.num_tags = len(self.tag_to_idx)

        # Load data
        self.data = []
        if self.mode in ["test", "train"]:
            self.data = self.load_data_from_file(os.path.join(self.get_processed_data_dir(), self.mode + ".csv"))
        elif self.mode == "infer":
            self.data = []
Example #2
    def __init__(self, mode, params):
        super().__init__(mode, params)
        cfg = params.dataset

        self.lang_src = cfg.source
        self.lang_tgt = cfg.target
        self.lang = (self.lang_src, self.lang_tgt)
        self.dataset_name = (self.lang_src if self.lang_src != 'eng' else
                             self.lang_tgt) + '-eng'

        # Load vocab
        self.word2index = {}
        self.index2word = {}
        for lang in self.lang:
            self.word2index[lang] = load_tkn_to_idx(
                os.path.join(self.get_processed_data_dir(), self.dataset_name,
                             "vocab", lang + ".txt"))
            self.index2word[lang] = load_idx_to_tkn(
                os.path.join(self.get_processed_data_dir(), self.dataset_name,
                             "vocab", lang + ".txt"))
            logger.info("%s vocab size: %d", lang, len(self.word2index[lang]))

        self.input_size = len(self.word2index[self.lang_src])
        self.output_size = len(self.word2index[self.lang_tgt])

        # Load data
        if self.mode in ["test", "train"]:
            data = []
            with open(os.path.join(self.get_processed_data_dir(),
                                   self.dataset_name, self.mode + ".csv"),
                      "r", encoding='utf-8') as fo:
                # The first line lists the language codes of the two ID columns
                lang_pairs = fo.readline().strip().split('\t')[:2]
                for line in tqdm(fo):
                    pairs = line.strip().split('\t')[:2]
                    pairs = {lang: pairs[i] for i, lang in enumerate(lang_pairs)}
                    data.append(dict(
                        X=[self.word2index[self.lang_src]['<sos>']] +
                          [int(i) for i in pairs[self.lang_src].split(' ')] +
                          [self.word2index[self.lang_src]['<eos>']],
                        Y=[self.word2index[self.lang_tgt]['<sos>']] +
                          [int(i) for i in pairs[self.lang_tgt].split(' ')] +
                          [self.word2index[self.lang_tgt]['<eos>']]))
            self.data = data
        elif self.mode == "infer":
            self.data = []
Example #3
def write_dataset(processed_dir, output_prefix, file_paths, transcripts, vocab_path, normalize_fn, tokenize_fn):
    """
    :param str processed_dir:
    :param str output_prefix: Output file is saved to {output_prefix}_{train/test}.csv
    :param dict[str, list[str]] file_paths:
    :param dict[str, list[str]] transcripts:
    :param str vocab_path:
    :param (str) -> str normalize_fn:
    :param (str) -> list[str] tokenize_fn:
    """
    outputs = {'train': [], 'test': []}
    vocab = load_tkn_to_idx(vocab_path)
    for mode in ["test", "train"]:
        os.makedirs(os.path.join(processed_dir, mode, "npy"), exist_ok=True)
        for file_path, transcript in tqdm(list(zip(file_paths[mode], transcripts[mode])), desc=mode):
            file_name = os.path.basename(file_path)
            file_name, _ = os.path.splitext(file_name)
            if file_name == "":
                continue
            npy_filename = os.path.join(processed_dir, mode, "npy", file_name + ".npy")
            if os.path.exists(npy_filename):
                outputs[mode].append(dict(
                    filename=npy_filename,
                    target=' '.join([str(get_token_id(vocab, tkn)) for tkn in tokenize_fn(normalize_fn(transcript))]),
                    trans_words=transcript
                ))

    for mode in ["test", "train"]:
        # outputs[mode].sort(key=lambda item: len(item['target_word']))
        fn = os.path.join(processed_dir, "%s_%s" % (output_prefix, mode) + '.csv')
        logger.info("Output to %s" % fn)
        with open(fn, 'w', encoding='utf-8') as f:
            f.write('\t'.join(['sound', 'target', 'trans']) + '\n')
            for o in outputs[mode]:
                f.write('\t'.join([
                    o['filename'],
                    o['target'],
                    o['trans_words']
                ]) + '\n')
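
Judging by the docstring, `write_dataset` expects file paths and transcripts keyed by split ("train"/"test") plus normalization and tokenization callbacks. The following is a minimal usage sketch under those assumptions; the paths and the `normalize_text`/`tokenize_chars` helpers are invented for illustration and are not part of the original code.

# Hypothetical call to write_dataset; all paths and helper names below are assumptions.
file_paths = {
    "train": ["raw/train/utt001.wav", "raw/train/utt002.wav"],
    "test": ["raw/test/utt101.wav"],
}
transcripts = {
    "train": ["hello world", "good morning"],
    "test": ["good night"],
}

def normalize_text(text):
    # Assumed normalizer: lower-case and drop colons
    return text.lower().replace(':', '')

def tokenize_chars(text):
    # Assumed tokenizer: one token per character, spaces mapped to '_'
    return [c if c != ' ' else '_' for c in text]

write_dataset(
    processed_dir="processed",
    output_prefix="char",
    file_paths=file_paths,
    transcripts=transcripts,
    vocab_path="processed/vocab_chars.txt",
    normalize_fn=normalize_text,
    tokenize_fn=tokenize_chars,
)

Note that `write_dataset` only emits a CSV row for files whose feature array already exists under <processed_dir>/<mode>/npy/, so the .npy files must be produced beforehand.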
Example #4
    def __init__(self, mode, params):
        super().__init__(mode, params)
        cfg = params.dataset

        vocab = load_tkn_to_idx(
            os.path.join(self.get_processed_data_dir(),
                         "vocab_%ss.txt" % cfg.unit))
        self.id_to_token = load_idx_to_tkn(
            os.path.join(self.get_processed_data_dir(),
                         "vocab_%ss.txt" % cfg.unit))
        self.vocab = vocab

        vocab['<sos>'] = len(vocab)
        vocab['<eos>'] = len(vocab)
        self.output_size = len(vocab)

        is_debug = mode == "debug"
        if mode == "debug":
            mode = "train"

        with open(os.path.join(self.get_processed_data_dir(),
                               "%s_%s" % (cfg.unit, mode) + '.csv'),
                  'r',
                  encoding='utf-8') as f:
            lines = f.read().split('\n')[1:]
            lines = [l.split('\t') for l in lines if l != ""]
            self.data = [{
                'X_path': l[0],
                'Y': [vocab['<sos>']] + [int(w) for w in l[1].split(' ')] + [vocab['<eos>']],
                'Y_len': len(l[1].split(' ')) + 2
            } for l in lines]

            if is_debug:
                self.data = self.data[:20]
Example #5
def preprocess(raw_dir, processed_dir):
    logger.info("Write outputs to file")
    outputs = {'train': [], 'test': []}
    mean = np.load("mean.npy")
    var = np.load("var.npy")
    vocab_word = load_tkn_to_idx(os.path.join(processed_dir,
                                              "vocab_words.txt"))
    vocab_char = load_tkn_to_idx(os.path.join(processed_dir,
                                              "vocab_chars.txt"))

    for mode in ["test", "train"]:
        os.makedirs(os.path.join(processed_dir, mode, "npy"), exist_ok=True)
        with open(os.path.join(raw_dir, mode, "prompts.txt"),
                  encoding="utf-8") as f:
            lines = f.read().split("\n")
            for i, s in tqdm(list(enumerate(lines)), desc=mode):
                filename = s.split(' ')[0]
                if filename == "":
                    continue
                npy_filename = os.path.join(processed_dir, mode, "npy",
                                            filename + ".npy")

                # Read the HTK feature file: a 12-byte header (nSamples,
                # sampPeriod, sampSize, parmKind) followed by big-endian
                # float32 feature vectors
                htk_filename = os.path.join(processed_dir, mode,
                                            "features", filename + ".htk")
                with open(htk_filename, "rb") as fh:
                    header = fh.read(12)
                    nSamples, sampPeriod, sampSize, parmKind = unpack(
                        ">IIHH", header)
                    veclen = int(sampSize / 4)
                    fh.seek(12, 0)
                    dat = np.fromfile(fh, dtype=np.float32)
                    dat = dat.reshape(len(dat) // veclen, veclen)
                    dat = dat.byteswap()

                # Normalize with precomputed mean/variance and cache as .npy
                dat = (dat - mean) / np.sqrt(var)
                np.save(npy_filename, dat)

                trans = s.lower().split(' ', 1)[1].replace(':', '')
                outputs[mode].append(
                    dict(filename=npy_filename,
                         target_word=' '.join([
                             str(get_token_id(vocab_word,
                                              normalize_word(word)))
                             for word in trans.split(' ')
                         ]),
                         target_char=' '.join([
                             str(
                                 get_token_id(
                                     vocab_char,
                                     normalize_char(c).replace(' ', '_')))
                             for c in trans
                         ]),
                         trans_words=' '.join(s.lower().split(' ')[1:])))

    for mode in ["test", "train"]:
        #outputs[mode].sort(key=lambda item: len(item['target_word']))
        for unit in ["word", "char"]:
            logger.info("Output to %s" %
                        os.path.join(processed_dir, "%s_%s" %
                                     (unit, mode) + '.csv'))
            with open(os.path.join(processed_dir,
                                   "%s_%s" % (unit, mode) + '.csv'),
                      'w',
                      encoding='utf-8') as f:
                f.write('\t'.join(['sound', 'target', 'trans']) + '\n')
                for o in outputs[mode]:
                    f.write('\t'.join([
                        o['filename'], o['target_%s' % unit], o['trans_words']
                    ]) + '\n')
Example #6
    def maybe_preprocess(cls, force=False):
        super().maybe_preprocess(force)
        if os.path.exists(cls.get_processed_data_dir()):
            return

        for lang_pairs in DOWNLOAD_URLS:
            try:
                dataset_name = "-".join(lang_pairs)
                # Materialize the zipped pairs so they can be counted,
                # filtered and sliced below
                pairs = list(zip(
                    read_lang(
                        os.path.join(
                            cls.get_working_dir(), dataset_name,
                            "europarl-v7.%s.%s" %
                            (dataset_name, lang_pairs[0]))),
                    read_lang(
                        os.path.join(
                            cls.get_raw_data_dir(), dataset_name,
                            "europarl-v7.%s.%s" %
                            (dataset_name, lang_pairs[1])))))
                logger.info("Read %s sentence pairs", len(pairs))
                pairs = filter_pairs(pairs)
                logger.info("Trimmed to %s sentence pairs", len(pairs))

                os.makedirs(cls.get_processed_data_dir(), exist_ok=True)
                default_words = ['<pad>', '<sos>', '<eos>', '<oov>']

                word_token_to_idx = {}
                for i in [0, 1]:
                    write_vocab(
                        os.path.join(cls.get_processed_data_dir(),
                                     dataset_name), [_p[i] for _p in pairs],
                        lang_pairs[i], 0, default_words)
                    word_token_to_idx[lang_pairs[i]] = load_tkn_to_idx(
                        os.path.join(cls.get_processed_data_dir(),
                                     dataset_name, "vocab",
                                     lang_pairs[i] + ".txt"))

                data = {'train': pairs[10000:], 'test': pairs[:10000]}
                for mode in ['train', 'test']:
                    with open(
                            os.path.join(cls.get_processed_data_dir(),
                                         dataset_name, "%s.csv" % mode),
                            'w') as fo:
                        fo.write('\t'.join(
                            list(lang_pairs) +
                            [l + '-original' for l in lang_pairs]) + '\n')
                        for item in data[mode]:
                            src_ids = ' '.join(
                                str(get_token_id(word_token_to_idx[lang_pairs[0]], w))
                                for w in item[0])
                            tgt_ids = ' '.join(
                                str(get_token_id(word_token_to_idx[lang_pairs[1]], w))
                                for w in item[1])
                            fo.write('\t'.join([
                                src_ids,
                                tgt_ids,
                                ' '.join(item[0]),
                                ' '.join(item[1])
                            ]) + "\n")
            except Exception:
                logger.exception("Failed to process %s", '-'.join(lang_pairs))
Example #7
def maybe_preprocess(path, working_dir):
    if os.path.exists(working_dir):
        return

    logger.info("Preprocess data")
    os.makedirs(working_dir)
    os.mkdir(os.path.join(working_dir, "vocab"))

    original_sentences = {'train': [], 'test': []}
    sentences = {'train': [], 'test': []}
    postags = {'train': [], 'test': []}
    for mode in ['train', 'test']:
        for file in glob.glob(os.path.join(path, "%s*" % mode)):
            for sent in open(file, encoding='utf-8'):
                sent = sent.strip()
                if sent == '':
                    continue
                words = [normalize_word(s.split('//')[0]) for s in sent.split(' ')]
                tags = []
                for word in sent.split(' '):
                    word = word.lower()
                    if 'a' <= word[0] <= 'z':
                        # Keep only short (<= 2-character) POS tags; others are skipped
                        if len(word.split('//')[1]) <= 2:
                            tags.append(word.split('//')[1])
                    else:
                        tags.append("<punc>")
                original_sentences[mode].append(sent)
                sentences[mode].append(words)
                postags[mode].append(tags)

    prepare_vocab_chars(working_dir, sentences['train'])
    write_vocab(working_dir, sentences['train'], min_freq=0)
    write_vocab(working_dir, postags['train'], name="pos_tags", min_freq=0, default_tags=['<w>', '<oov>'])
    prepare_tag_list(working_dir)

    word_token_to_idx = load_tkn_to_idx(os.path.join(working_dir, "vocab", "words.txt"))
    pos_tag_to_idx = load_tkn_to_idx(os.path.join(working_dir, "vocab", "pos_tags.txt"))
    char_token_to_idx = load_tkn_to_idx(os.path.join(working_dir, "vocab", "chars.txt"))

    for mode in ['train', 'test']:
        data = []
        for p_tags, compound_words, sent in \
                zip(postags[mode], sentences[mode], original_sentences[mode]):
            chars = tokenize(sent, 'char')
            words = []
            seg_tags = []
            pos_tags = []
            for w, tag in zip(compound_words, p_tags):
                if '_' in w:
                    # Compound word: split on '_' into component tokens; tag the
                    # first token as a word start (1) and the rest as inside (0)
                    w = w.split('_')
                    words += w
                    seg_tags += [1] + [0] * (len(w) - 1)
                    pos_tags += [tag] + ['<w>'] * (len(w) - 1)
                else:
                    words.append(w)
                    seg_tags.append(1)
                    pos_tags.append(tag)

            data.append((words, seg_tags, pos_tags, chars))

        with open(os.path.join(working_dir, "%s.csv" % mode), "w", encoding='utf-8') as fo:
            fo.write('word_tokens,word_seg_tags,word_pos_tags,char_tokens\n')
            data.sort(key=lambda d: len(d[0]), reverse=True)
            for words, seg_tags, pos_tags, chars in data:
                fo.write(','.join([
                    ' '.join([str(get_token_id(word_token_to_idx, w)) for w in words]),
                    ' '.join([str(t) for t in seg_tags]),
                    ' '.join([str(get_token_id(pos_tag_to_idx, t)) for t in pos_tags]),
                    ' '.join([str(get_token_id(char_token_to_idx, c)) for c in chars]),
                ]) + '\n')
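
Each row of the CSV written above holds four comma-separated columns (word IDs, segmentation tags, POS-tag IDs, character IDs), and each column is a space-joined list of integers. Below is a minimal sketch of reading such a file back, assuming the same layout and a hypothetical working_dir of "data"; the path is invented for illustration.

import os

# Hypothetical reader for the CSV produced by maybe_preprocess above.
with open(os.path.join("data", "train.csv"), encoding="utf-8") as f:
    next(f)  # skip the header line
    for line in f:
        if not line.strip():
            continue
        word_ids, seg_tags, pos_tags, char_ids = line.rstrip('\n').split(',')
        word_ids = [int(i) for i in word_ids.split(' ')]
        seg_tags = [int(t) for t in seg_tags.split(' ')]
        pos_tags = [int(i) for i in pos_tags.split(' ')]
        char_ids = [int(i) for i in char_ids.split(' ')]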