Example #1
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_x = utils.read_lines(args.path['train_x'])
    train_y = utils.read_lines(args.path['train_y'])
    dataset = train_x + train_y
    keywords = None

    if args.problem == 'lda':
        model = LDAModel(args)
    else:
        trainset = [tokenizer.encode_line_into_words(i) for i in dataset]
        train_keywords(trainset, args.path['model'])
        keywords = load_keywords(args.path['model'])
        model = TFIDFModel(args)

    list_toks = []
    for n, line in enumerate(train_x):
        if not n % 10000 and n:
            utils.verbose('Tokenizing {} lines for {}'.format(n, args.problem))
        if keywords is None:
            list_toks.append([str(s) for s in tokenizer.encode_line_into_words(line)])
        else:
            list_toks.append([str(s) for s in tokenizer.encode_line_into_words(line)
                              if s in keywords[: args.num_keywords]])
    model.fit(list_toks)
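
A minimal driving sketch for process(), assuming a plain namespace object; every name and value below (including DummyTokenizer) is a placeholder chosen only to show which attributes the function reads, not the project's real config.

from types import SimpleNamespace

class DummyTokenizer:
    """Stand-in for args.tokenizer: built from a vocab path, splits on spaces."""
    def __init__(self, vocab_path):
        self.vocab_path = vocab_path
    def encode_line_into_words(self, line):
        return line.split()

args = SimpleNamespace(
    path={'model': 'tmp/model', 'vocab': 'tmp/vocab.txt',
          'train_x': 'tmp/train_q.txt', 'train_y': 'tmp/train_a.txt'},
    tokenizer=DummyTokenizer,
    problem='lda',        # anything other than 'lda' trains keywords + TF-IDF
    num_keywords=2000,
)
# process(args)  # would create the model dir, tokenize train_x and fit the model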
Example #2
def __generate_simple_poly_dict():
    data_path = '/data1/liujshi/yunlv_research/total_zhuiyi_corup/' \
                'total_metadata_new'
    total_lines = read_lines(data_path)[0:]
    print(total_lines[0:5])

    total_dict, poly_dict = defaultdict(list), defaultdict(list)
    for line in total_lines:
        phone, chars = line.split("|")[1], line.split("|")[2]
        chars = clean_sentence(chars.replace(" ", ""))
        phone = __change_tone_format(phone)
        try:
            phone_pairs = phone2pairs(chars, phone)
            for c, p in phone_pairs:
                total_dict[c].append(p)
                total_dict[c] = list(set(total_dict[c]))
        except TypeError:
            pass
        except IndexError:
            print("Index Error:", phone, chars)

    for line in read_lines("../../other_files/poly_dict"):
        key = line.split(":")[0]
        value = line.split(":")[1].split(",")
        poly_dict[key] = value

    map_phone = dict()
    for line in read_lines("../../other_files/phone_map_merge.txt"):
        key = line.split(":")[0]
        value = line.split(":")[1]
        map_phone[key] = value

    new_lines = []
    for char in poly_dict.keys():
        if char not in total_dict.keys():
            pass  # drop polyphonic characters that never appear in the corpus
        else:
            values = total_dict[char]
            value_saved = []
            for value in values:
                # convert dictionary pinyin to standard pinyin before comparing
                map_value = map_phone[value.split()[0]] + \
                            map_phone[value.split()[1] + value.split()[2][-1]]
                if map_value in poly_dict[char]:
                    value_saved.append(value)
            if len(value_saved) > 1:
                new_line = "{}:{}".format(char, ",".join(value_saved))
                new_lines.append(new_line)
                print("save:", new_line)
            else:
                pass  # drop polyphonic characters seen with only one pronunciation

    write_lines("../../other_files/simple_poly_dict", new_lines)
    return None
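
For reference, the simple_poly_dict file written above is plain text with one polyphonic character per line in the form char:pinyin1,pinyin2,... (Example #3 reads it back and strips spaces). A minimal standalone sketch of parsing that layout, using only the standard library:

# Parse the char:pinyin1,pinyin2,... lines written by write_lines() above.
poly_dict = {}
with open("../../other_files/simple_poly_dict", encoding="utf-8") as f:
    for line in f:
        char, pinyins = line.strip().split(":", 1)
        poly_dict[char] = pinyins.split(",")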
Example #3
def __init__(self):
  super().__init__()
  self.poly_dict = dict()
  poly_dict_path = "/data1/liufeng/synthesis/frontend/data/simple_poly_dict"
  for line in read_lines(poly_dict_path):
    line = line.replace(" ", "").replace("*", "")
    key = line.split(":")[0]
    value = line.split(":")[1].split(",")
    self.poly_dict[key] = value
  self.model, self.model_dir = None, None
  self.sess = None
Example #4
def main():
    train_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/train.txt"
    train_lines = [
        line.split("|")[6] for line in read_lines(train_path) if "#" in line
    ]
    train_data = [split_psd(line, pairs=False) for line in train_lines]
    print(train_data[0])

    train_x = [x for x, _ in train_data]
    train_y = [y for _, y in train_data]

    dev_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/dev_psd.txt"
    dev_lines = [line.split("|")[6] for line in read_lines(dev_path)]
    dev_data = [split_psd(line, pairs=False) for line in dev_lines]

    dev_x = [x for x, _ in dev_data]
    dev_y = [y for _, y in dev_data]

    with open('/data1/liufeng/synthesis/frontend/models/feature_psd.pkl',
              'wb') as fw:
        pickle.dump((train_x, train_y, dev_x, dev_y), fw)
        print("save {}/{} train/dev items".format(len(train_x), len(dev_x)))
    return
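
A hedged sketch of loading the pickled features back; it simply reverses the pickle.dump above, using the same path.

import pickle

# Reload the (train_x, train_y, dev_x, dev_y) tuple dumped above.
with open('/data1/liufeng/synthesis/frontend/models/feature_psd.pkl', 'rb') as fin:
    train_x, train_y, dev_x, dev_y = pickle.load(fin)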
Example #5
def process(hparam):
    utils.raise_inexistence(hparam.tmp_dir)
    tokenizer = vocab.Tokenizer()
    all_data = []
    paths = [
        join(hparam.tmp_dir, i)
        for i in ['train_q.txt', 'train_a.txt', 'dev_q.txt', 'dev_a.txt']
    ]
    word_path = join(hparam.tmp_dir, '{}.vcb'.format(hparam.word_size))
    char_path = join(hparam.tmp_dir, '{}.vcb'.format(hparam.char_size))
    for path in paths:
        utils.raise_inexistence(path)
        all_data += utils.read_lines(path)
    tokenizer.build_vocab(all_data, [hparam.word_size, hparam.char_size],
                          [word_path, char_path])
Example #6
    def __init__(self, files=None):
        """ Char-base adding word Tokenizer

        :param files: [word_file_path, char_file_path]
        """
        self.word_counter = {}
        self.char_counter = {}
        if files is not None:
            self.words = utils.read_lines(files[0])
            self.chars = utils.read_lines(files[1])
            utils.verbose('loading words from file {} with word size {}'.format(
                files[0], self.word_size))
            utils.verbose('loading chars from file {} with char size {}'.format(
                files[1], self.char_size))
        else:
            self.words = []
            self.chars = []
        self.cutter = SubCutter()
        self.word_dict = dict()
        self.char_dict = dict()
        self._set_dict()
        self.PAD_ID = 0
        self.UNK_ID = 1
        self.EOS_ID = 2
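
Assuming this constructor belongs to the vocab.Tokenizer used in Example #5, a hedged instantiation sketch; the .vcb paths are placeholders for the word and char vocab files that build_vocab writes.

# Hypothetical instantiation; the paths stand in for the <word_size>.vcb and
# <char_size>.vcb files produced by build_vocab() in Example #5.
tokenizer = vocab.Tokenizer(files=['tmp/40000.vcb', 'tmp/4000.vcb'])
empty_tokenizer = vocab.Tokenizer()  # no files: starts with empty word/char lists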
Example #7
def __read_from_result(data_path):
    lines = read_lines(data_path)
    count = 0
    result_pairs = []
    while count < len(lines):
        if lines[count].startswith("id:"):
            count += 2
        elif lines[count].startswith("split-id:"):
            sub_sentence = lines[count].split("|")[1]
            psd_pairs = split_psd(sub_sentence.replace(" ", ""))
            result_pairs.append(psd_pairs)
            count += 2
        elif lines[count].startswith("split-end"):
            count += 1
        elif lines[count].lower().startswith("end"):
            break
        else:
            count += 1
    return result_pairs
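
The parser above walks the truth.txt layout written by Examples #8 and #9; a hedged sketch of the record structure it expects (angle brackets are placeholders, not literal content, and the path is illustrative):

# Illustrative truth.txt record, in the layout written by Examples #8/#9:
#
#   id:0
#   <sentence>
#   split-id:0 | <sub-sentence>
#   <phone sequence for the sub-sentence>
#   split-end
#
pairs = __read_from_result("eval/truth.txt")  # path is a placeholder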
Example #8
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/psd_v1"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    # data_path = os.path.join(model_dir, "metadata_dev.txt")
    data_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/" \
                "dev_psd.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [
        rm_prosody(line.split("|")[6].replace(" ", "")).upper()
        for line in metadata
    ]
    write_lines(text_path, corpus)

    sub_count = 0
    # truth_path = "output.txt"
    truth_path = os.path.join(eval_dir, "truth.txt")
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            phone = (meta.split("|")[5])
            sentence = clean_sentence(
                meta.split("|")[6].replace(" ", "").upper())
            fr.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")

            for split_id, (sent,
                           phone) in enumerate(zip(sub_sentences, sub_phones)):
                fr.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, phone))
                print("split-id:{} | {} | {}".format(split_id, sent, phone))
                sub_count += 1
            fr.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
Example #9
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/v3"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = os.path.join(model_dir, "metadata_dev.txt")
    metadata = read_lines(data_path)
    print(metadata[0])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [
        rm_prosody(line.split("|")[2].replace(" ", "")).upper()
        for line in metadata
    ]
    print(corpus[0])
    write_lines(text_path, corpus)

    sub_count = 0
    truth_path = os.path.join(eval_dir, "truth.txt")
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            meta = rm_prosody(meta)
            sentence, phone = meta.split("|")[2].replace(
                " ", ""), meta.split("|")[1]
            sentence = clean_sentence(sentence).upper()
            print(sentence)

            fr.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            for split_id, (sent,
                           phone) in enumerate(zip(sub_sentences, sub_phones)):
                fr.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, phone))
                print("split-id:{} | {} | {}".format(split_id, sent, phone))
                sub_count += 1
            fr.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write files to {}".format(truth_path))
Example #10
def main():
  data_path = '/data1/liufeng/synthesis/TACOTRON-2-refined/data/data_0306/' \
              'metadata_tot.csv'
  total_lines = read_lines(data_path)[0:]
  print(total_lines[0])

  poly_dict = load_poly_dict()

  new_lines = []
  for line in total_lines:
    if has_poly_char(line.split("|")[2], poly_dict):
      new_lines.append(line)
  print("there are {} lines with poly char".format(len(new_lines)))

  random.shuffle(new_lines)
  dev_lines = new_lines[0:5000]
  train_lines = new_lines[5000:]

  poly_chars, tot_chars = 0, 0
  for line in new_lines:
    for char in line:
      tot_chars += 1
      if char in poly_dict.keys():
        poly_chars += 1
  print("there are {}chars in total {}chars ({})".format(
    poly_chars, tot_chars, poly_chars/tot_chars))

  train_x, train_y, data_lines = __extract_feature(train_lines, poly_dict)
  write_lines("metadata_train.txt", data_lines)
  dev_x, dev_y, data_lines = __extract_feature(dev_lines, poly_dict)
  write_lines("metadata_dev.txt", data_lines)

  with open('/data1/liufeng/synthesis/frontend/models/feature.pkl', 'wb') as fw:
    pickle.dump((train_x, train_y, dev_x, dev_y), fw)
    print("save {}/{} train/dev items ".format(len(train_x), len(dev_x)))
  return
Example #11
def __init__(self, args):
    self.dual_encoder_searcher = searcher_lib.DualEncoderSearcher(args)
    self.lda_searcher = searcher_lib.LDASearcher(args)
    self.tfidf_searcher = searcher_lib.TFIDFSearcher(args)
    self.questions = utils.read_lines(args.path['train_x'])
    self.answers = utils.read_lines(args.path['train_y'])
Example #12
def _retrieve_linked_ids(ids, already_seen_ids: Set[int],
                         groups_definitions: GroupDefinitions) -> List[int]:
    linked_ids = [
        linked_id for _id in ids for linked_id in groups_definitions[_id]
    ]
    not_seen_ids = [i for i in linked_ids if i not in already_seen_ids]

    if not_seen_ids:
        already_seen_ids = already_seen_ids.union(not_seen_ids)
        return not_seen_ids + _retrieve_linked_ids(
            not_seen_ids, already_seen_ids, groups_definitions)
    else:
        return []


def parse_row(row):
    start_program, end_programs = row.split(' <-> ')
    start_program = int(start_program)
    end_programs = [int(p.strip()) for p in end_programs.split(',')]
    return start_program, end_programs


if __name__ == '__main__':
    raw_puzzle_input = read_lines('digital_plumber.txt')
    puzzle_input = dict(parse_row(r) for r in raw_puzzle_input)

    group_size = len(set(retrieve_linked_programs(0, puzzle_input)))
    print(f'Result for Part 1: {group_size}')
    groups_number = count_total_groups(puzzle_input)
    print(f'Result for Part 2: {groups_number}')
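
A quick sanity check of parse_row on the kind of adjacency line the puzzle input contains (the line content is illustrative):

# parse_row turns one "<program> <-> <neighbours>" line into (int, [int, ...]).
start, linked = parse_row('2 <-> 0, 3, 4')
assert start == 2 and linked == [0, 3, 4]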
Example #13
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/v2"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    # data_path = os.path.join(model_dir, "metadata_dev.txt")
    data_path = "/data1/liufeng/synthesis/feature/feature_prosody/" \
                "bzn/dev.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])
    # dev_corpus = [line.split("|")[6] for line in read_lines(data_path)]
    # dev_phones = [line.split("|")[5] for line in read_lines(data_path)]
    # print(dev_corpus[0])
    # line = dev_corpus[0]
    # x, y = split_psd(line)
    # print(x, y)
    #
    # exit()
    # metadata = [line for line in metadata if "bzn" in line]
    # print(metadata[0:3])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [
        rm_prosody(line.split("|")[6].replace(" ", "")) for line in metadata
    ]
    # print(corpus[0:3])
    write_lines(text_path, corpus)
    # exit()

    sub_count = 0
    # truth_path = os.path.join(eval_dir, "bc_dev.txt")
    truth_path = "output.txt"
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):

            phone = (meta.split("|")[5])
            sentence = clean_sentence(meta.split("|")[6].replace(" ", ""))
            _ss = sentence
            print(phone, sentence)
            # x, y = split_psd(sentence)
            # sentence = "".join(x)
            # assert len(y) == len(sentence)

            # if not check_exist_eng(sentence):
            #   continue

            fr.write("\nid:{}\n{}\n".format(sent_id, _ss))
            print("\nid:{}\n{}".format(sent_id, _ss))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            # print(len(y), len(sub_phones), len(sub_sentences))
            for split_id, (sent,
                           phone) in enumerate(zip(sub_sentences, sub_phones)):
                x, y = split_psd(sent)
                sent = "".join(x)
                print(sent, phone)
                pairs = phone2pairs(sent, phone)
                new_pairs = [(_x[0], _x[1], _y) for _x, _y in zip(pairs, y)]
                new_phone = [_y + " #" + _z for _x, _y, _z in new_pairs]
                new_phone = " ".join(new_phone).replace(" #0", "")
                fr.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, new_phone))
                print("split-id:{} | {} | {}".format(split_id, sent,
                                                     new_phone))
                sub_count += 1
            fr.write("split-end\n")
            # exit()

    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('yaml_path', help='config path for frontend')
    parser.add_argument('input_path', help='input path(txt)')
    parser.add_argument('output_path', help='output path(txt)')
    args = parser.parse_args()

    # todo: add sil label
    hparams = __load_hparams(args.yaml_path)

    text_path = args.input_path
    frontend_path = args.output_path
    flag_psd = hparams.flag_psd

    if hparams.norm_text:
        raw_file_lines = read_lines(text_path)
        sentences = []
        print("text normalize:")
        for line in raw_file_lines:
            new_line = Text().normalize(line)
            sentences.append(new_line.replace(" ", ""))
            if not new_line == line:
                print("{}->{}".format(line, new_line))
    else:
        sentences = read_lines(text_path)
    write_lines("norm.txt", sentences)
    # exit()

    trans = TranscriptToPinyin(
        dic_path=hparams.dict_path,
        eng_dic_path=hparams.eng_dict_path,
    )

    if hparams.nnet_psd and hparams.flag_psd:
        psd_predict, bert_psd_result = __compute_psd_result(
            hparams, sentences, hparams.load_memory_psd)
    else:
        psd_predict, bert_psd_result = None, None

    if hparams.nnet_phone:
        phone_predictor, bert_phone_result = __compute_nnet_phone_result(
            hparams, sentences, hparams.load_memory_phone)
    else:
        phone_predictor, bert_phone_result = None, None

    sub_count, count = 0, 0
    with open(frontend_path, "w", encoding="utf-8") as frontend_file:
        for sent_id, sentence in enumerate(sentences[0:]):
            # sentence = num2hanzi(clean_sentence(sentence))
            frontend_file.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            for split_id, sub_sentence in enumerate(sub_sentences):
                sub_count += 1
                phone_pairs = trans.get_phone_pairs(
                    sub_sentence, change_eng_symbol=hparams.eng_symbol)
                new_ = []
                if hparams.nnet_phone:
                    bert_phone = bert_phone_result[count]
                    if len(sub_sentence) == len(bert_phone):
                        phone = phone_predictor.modify_result(
                            bert_phone, phone_pairs)
                        for i, (c, ph, p) in enumerate(phone_pairs):
                            new_.append((c, phone[i], p))
                        phone_pairs = new_
                    else:
                        print("Error for bert result")

                if flag_psd and not hparams.nnet_psd:
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in phone_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in phone_pairs])
                    sub_sentence = sub_sentence.replace("#0",
                                                        "").replace("#5", "")
                elif flag_psd and hparams.nnet_psd:
                    new_pairs = []
                    for new_psd, (char, ph, _) in zip(bert_psd_result[count],
                                                      phone_pairs):
                        new_pairs.append((char, ph, new_psd))
                    new_pairs = psd_predict.change_by_rules(new_pairs)
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in new_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in new_pairs])
                    sub_sentence = sub_sentence.replace("#0",
                                                        "").replace("#5", "")
                else:
                    phone = " ".join([ph for _, ph, _ in phone_pairs])
                    sub_sentence = "".join([c for c, _, _ in phone_pairs])

                count += 1
                frontend_file.write("split-id:{} | {}\n{}\n".format(
                    split_id, sub_sentence, phone))
                print("split-id:{} | {} | {}".format(split_id, sub_sentence,
                                                     phone))
            frontend_file.write("split-end\n")

    # todo: improve the pause/prosody breaks.
    # todo: refactor: deprecate kashagri, use keras-bert instead
    print("\nsub count:{}".format(sub_count))
    print("write output data to {}".format(frontend_path))
Example #15
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)

    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()

        for i in range(args.max_steps):
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)

            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r'        step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()

            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(i - args.save_steps, i,
                                                  features['train_loss'],
                                                  features['train_acc'],
                                                  features['dev_loss'],
                                                  features['dev_acc']))
                print('-+' * 55)
                utils.write_result(args, recorder.lowest_loss)

        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1
    vectors = np.reshape(np.array(vectors),
                         [-1, args.hidden])[:infer_batch.data_size]
    vec_dim = vectors.shape[-1]
    ann = AnnoyIndex(vec_dim)
    for n, ii in enumerate(vectors):
        ann.add_item(n, ii)
    ann.build(args.num_trees)
    ann.save(args.path['ann'])
    utils.verbose('Annoy has been dump in {}'.format(args.path['ann']))
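
One portability note on the Annoy calls at the end: recent releases of the annoy package expect the distance metric to be passed explicitly, so a hedged equivalent of the index construction is:

from annoy import AnnoyIndex

# Same index as above with the metric spelled out; 'angular' matches
# Annoy's historical default when no metric was given.
ann = AnnoyIndex(vec_dim, 'angular')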