Ejemplo n.º 1
0
def extract_embeddings(model, tokenizer, text, upsampling=True):
    '''
    Extract phoneme-aligned embeddings for ``text`` from a pre-trained BERT model.

    The text is normalized and converted to pinyin syllables. The first
    output of the model (one vector per input token) is then upsampled by
    repeating every token vector once per phoneme of the matching syllable,
    so that the result has one vector per phoneme.

    Args:
        model: callable taking a tensor of token ids; its first output is
            used as the hidden states.
        tokenizer: object whose ``encode(text)`` returns the token ids.
        text: raw input text.
        upsampling: kept for interface compatibility.
            NOTE(review): this flag is never consulted; upsampling is
            always applied.

    Returns:
        numpy array of shape (1, number_of_phonemes, 768).

    Raises:
        AssertionError: if the model does not return exactly one vector per
            syllable, or if the upsampled array has an unexpected shape.
    '''
    clean_text = text_normalize(text)
    pinyin_seq = txt2pinyin(clean_text)
    phoneme_count = sum(len(syl) for syl in pinyin_seq)

    inputs = torch.tensor(tokenizer.encode(clean_text)).unsqueeze(0)
    # Pure inference: avoid recording an autograd graph that would only be
    # discarded by the detach() below.
    with torch.no_grad():
        outputs = model(inputs)
    h = outputs[0].cpu().detach().numpy()

    # One hidden vector per syllable is required for the alignment below.
    assert h.shape[1] == len(pinyin_seq)

    # Repeat the i-th token vector len(pinyin_seq[i]) times along the
    # sequence axis.
    repeats = np.array([len(syl) for syl in pinyin_seq], dtype=np.int64)
    features = np.repeat(h, repeats, axis=1)

    assert features.shape[1] == phoneme_count
    assert features.shape[2] == 768
    assert features.shape[0] == 1

    return features
Ejemplo n.º 2
0
def load_lab(words, rhythms, poses, times, phs_type, lab_file=None):
    '''Generate label lines and write them to ``lab_file`` or standard output.

    The words are joined and converted to pinyin syllables, a phone
    structure is built with ``tree`` and every item produced by
    ``LabGator`` is emitted on its own line.

    Args:
        words: sequence of word strings; they are concatenated before the
            pinyin conversion.
        rhythms: passed through to ``tree`` and ``LabGator``.
        poses: passed through to ``tree``.
        times: passed through to ``LabGator``; must contain exactly one
            more entry than ``phs_type``.
        phs_type: passed through to ``tree``.
        lab_file: optional writable file object. When it is falsy the
            lines are printed instead.

    Raises:
        AssertionError: if ``len(times) != len(phs_type) + 1``.
    '''
    assert len(times) == len(phs_type) + 1
    syllables = txt2pinyin(''.join(words))
    phone = tree(words, rhythms, syllables, poses, phs_type)
    for ph_list in LabGator(phone, rhythms, times):
        if lab_file:
            # write() instead of the Python-2-only ``print >>`` statement so
            # the function parses on both Python 2 and Python 3.
            lab_file.write('%s\n' % (ph_list,))
        else:
            print(ph_list)
Ejemplo n.º 3
0
def chinese_cleaners(text):
    '''Turn Chinese text into a flat list of pinyin symbols.

    The text goes through ``text_normalize`` and ``txt2pinyin``; the
    per-syllable results are then flattened into a single list.
    '''
    symbols = []
    for syllable in txt2pinyin(text_normalize(text)):
        symbols.extend(syllable)
    return symbols
Ejemplo n.º 4
0
def load_lab(words, rhythms, poses, times, phs_type, lab_file=None):
    '''Generate label lines and write them to ``lab_file`` or standard output.

    The words are joined and converted to pinyin syllables, a phone
    structure is built with ``tree`` and every item produced by
    ``LabGenerator`` is emitted on its own line.

    Args:
        words: sequence of word strings; they are concatenated before the
            pinyin conversion.
        rhythms: passed through to ``tree`` and ``LabGenerator``.
        poses: passed through to ``tree``.
        times: passed through to ``LabGenerator``; must contain exactly one
            more entry than ``phs_type``.
        phs_type: passed through to ``tree``.
        lab_file: optional writable file object. When it is falsy the
            lines are printed instead.

    Raises:
        AssertionError: if ``len(times) != len(phs_type) + 1``.
    '''
    assert len(times) == len(phs_type) + 1
    syllables = txt2pinyin(''.join(words))
    phone = tree(words, rhythms, syllables, poses, phs_type)
    for ph_list in LabGenerator(phone, rhythms, times):
        if lab_file:
            # write() instead of the Python-2-only ``print >>`` statement so
            # the function parses on both Python 2 and Python 3.
            lab_file.write('%s\n' % (ph_list,))
        else:
            print(ph_list)
Ejemplo n.º 5
0
def generate_dict(texts):
    '''Count the pinyin symbols found in the second column of ``texts``.

    Every row's second column (``texts.iloc[row, 1]``) is normalized,
    converted to pinyin and flattened; the resulting symbols are tallied.

    Returns:
        Counter mapping each symbol to its number of occurrences.
    '''
    symbol_counts = Counter()
    for row in tqdm(range(len(texts))):
        syllables = txt2pinyin(text_normalize(texts.iloc[row, 1]))
        symbol_counts.update(
            symbol for syllable in syllables for symbol in syllable)
    return symbol_counts
Ejemplo n.º 6
0
def txt2label(txt, wavfile=None, sfsfile=None, style='default'):
    '''Return a generator of HTS format labels for txt.

    When txt carries no prosody marks and no alignment file is given,
    labels are still produced, but without prosody and timing information.

    Args:
        txt: raw text like "向香港特别行政区同胞澳门台湾同胞"
             or text with prosody marks like
             "向#1香港#2特别行政区#1同胞#3澳门台湾#1同胞";
             punctuation is also allowed in txt
        wavfile: not supported yet, must be None
        sfsfile: absolute path of an sfs file (alignment file). Times are
            expressed in units of 1e-7 second (12345678 means 1.2345678
            second). Example:
            --------
            239100 s
            313000 a
            323000 d
            400000 b
            480000 s
            ---------
            a stands for consonant
            b stands for vowel
            d stands for silence that is shorter than 100ms
            s stands for silence that is longer than 100ms
            Blank lines are ignored.
        style: label style, currently only the default HTS format is
            supported

    Return:
        A generator of phone labels for the txt, convenient to save as a
        label file

    Raises:
        AssertionError: if wavfile is given, style is not 'default', or a
            non-blank line of the sfs file does not have exactly two fields.
    '''
    assert wavfile is None, 'wavfile currently is not supported'
    assert style == 'default', 'Currently only default style is support in txt2label'

    # If txt carries prosody marks, use them; otherwise fall back to the
    # jieba part-of-speech segmentation.
    if '#' in txt:
        words, poses, rhythms = _adjust(txt)
    else:
        words = []
        poses = []
        for word, pos in posseg.cut(txt):
            words.append(word.encode('utf-8'))
            poses.append(pos[0].encode('utf-8'))
        # No prosody information: '#0' after every word, '#4' at the end.
        rhythms = ['#0'] * (len(words) - 1)
        rhythms.append('#4')

    syllables = txt2pinyin(''.join(words))

    if sfsfile:
        phs_type = []
        # Start time is an int like every time read from the file.
        times = [0]
        with open(sfsfile) as fid:
            for line in fid:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                assert len(fields) == 2, 'check format of sfs file'
                time, ph = fields
                times.append(int(time))
                phs_type.extend(ph)
    else:
        # No alignment: one 'a' per phone, wrapped in leading and trailing
        # silence, with all-zero times (one more entry than phs_type).
        length = sum(len(syllable) for syllable in syllables)
        phs_type = ['s'] + ['a'] * length + ['s']
        times = [0] * (length + 3)
    phone = tree(words, rhythms, syllables, poses, phs_type)
    return LabGenerator(phone, rhythms, times)
def txt2label(txt, sfsfile=None, style='default'):
    '''Return a generator of HTS format labels for txt.

    Args:
        txt: raw text like "向香港特别行政区同胞澳门台湾同胞"
             or text with prosody marks like
             "向#1香港#2特别行政区#1同胞#3澳门台湾#1同胞";
             punctuation is also allowed in txt (it is removed)
        sfsfile: absolute path of an sfs file (alignment file). Times are
            expressed in units of 1e-7 second (12345678 means 1.2345678
            second) and may be written as decimals. Example:
            --------
            239100 s
            313000 a
            323000 d
            400000 b
            480000 s
            ---------
            a stands for consonant
            b stands for vowel
            d stands for silence that is shorter than 100ms
            s stands for silence that is longer than 100ms
            Blank lines are ignored.
        style: label style, currently only the default HTS format is
            supported

    Return:
        A generator of phone labels for the txt, convenient to save as a
        label file

    Raises:
        AssertionError: if style is not 'default' or a non-blank line of
            the sfs file does not have exactly two fields.
    '''
    assert style == 'default', 'Currently only default style is support in txt2label'

    # Delete every character that is neither a word character (number,
    # letter, Chinese character) nor the prosody marker '#'.
    txt = re.sub(r'(?!#)\W', '', txt)

    # If txt carries prosody marks, use them; otherwise fall back to the
    # jieba part-of-speech segmentation.
    if '#' in txt:
        words, poses, rhythms = _adjust(txt)
    else:
        # NOTE(review): punctuation has already been stripped above, so it
        # cannot be turned into '#4' breaks at this point; unmarked text
        # only gets the final '#4'.
        words = []
        poses = []
        for word, pos in posseg.cut(txt):
            words.append(word)
            poses.append(pos[0])
        rhythms = ['#0'] * (len(words) - 1)
        rhythms.append('#4')

    syllables = txt2pinyin(''.join(words))

    if sfsfile:
        phs_type = []
        # Start time is an int like every time read from the file.
        times = [0]
        with open(sfsfile) as fid:
            for line in fid:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                assert len(fields) == 2, 'check format of sfs file'
                time, ph = fields
                times.append(int(float(time)))
                phs_type.extend(ph)
    else:
        # No alignment: leading silence, one 'a' per phone of every word,
        # an 's' after each sentence-internal '#4', trailing silence.
        phs_type = ['s']
        last_index = len(rhythms) - 1
        for i, rhythm in enumerate(rhythms):
            # each syllable is like ('b', 'a3')
            word_phone_num = sum(
                len(syllable) for syllable in txt2pinyin(words[i]))
            phs_type.extend(['a'] * word_phone_num)
            if i != last_index and rhythm == '#4':
                phs_type.append('s')
        phs_type.append('s')
        # All-zero times, one more entry than phs_type.
        times = [0] * (len(phs_type) + 1)

    phone = tree(words, rhythms, syllables, poses, phs_type)
    return LabGenerator(phone, rhythms, times)
Ejemplo n.º 8
0
        0, 264200, 360650, 492100, 596550, 737200, 774550, 989300, 1048049,
        1211600, 1295550, 1417500, 1483700, 1644000, 1685300, 1719600, 1894300,
        1933800, 2065200, 2156650, 2279300, 2370850, 2556100, 2583600, 2703700,
        2785200, 2873050, 2992500, 3035150, 3140490, 3198140, 3284050, 3415750,
        3507100, 3622700, 3766000, 3862800, 3984500, 4126900, 4213200, 4408500,
        4527250, 4703800, 4757350, 4931700, 52253061
    ]
    phs_type = [
        's', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'd',
        'a', 'b', 'a', 'b', 'a', 'b', 'a', 'b', 'd', 'a', 'b', 'a', 'b', 'a',
        'b', 'a', 'b', 'a', 'b', 'a', 'b', 's', 'a', 'b', 'a', 'b', 'a', 'b',
        'a', 'b', 's'
    ]
    words = re.split('#\d', txt)
    # print ' '.join(words)
    syllables = txt2pinyin(''.join(words))
    # print syllables
    rhythms = re.findall('#\d', txt)
    rhythms.append('#4')
    #print ' '.join(rhythms)
    print ' '.join(words)
    print rhythms
    print syllables
    poses = ['n'] * len(words)
    phone = tree(words, rhythms, syllables, poses, phs_type)
    # while phone:
    #	print phone.txt,
    #	phone=phone.rbrother
    #print
    #print syllables
Ejemplo n.º 9
0
def txt2label(txt, sfsfile=None, style='default'):
    '''Return a generator of HTS format labels for txt.

    Args:
        txt: raw text like "向香港特别行政区同胞澳门台湾同胞"
             or text with prosody marks like
             "向#1香港#2特别行政区#1同胞#3澳门台湾#1同胞";
             punctuation is also allowed in txt (it is removed)
        sfsfile: absolute path of an sfs file (alignment file). Times are
            expressed in units of 1e-7 second (12345678 means 1.2345678
            second) and may be written as decimals. Example:
            --------
            239100 s
            313000 a
            323000 d
            400000 b
            480000 s
            ---------
            a stands for consonant
            b stands for vowel
            d stands for silence that is shorter than 100ms
            s stands for silence that is longer than 100ms
            Blank lines are ignored.
        style: label style, currently only the default HTS format is
            supported

    Return:
        A generator of phone labels for the txt, convenient to save as a
        label file

    Raises:
        AssertionError: if style is not 'default' or a non-blank line of
            the sfs file does not have exactly two fields.
    '''
    assert style == 'default', 'Currently only default style is support in txt2label'

    # Delete every character that is neither a word character (number,
    # letter, Chinese character) nor the prosody marker '#'. A plain r'\W'
    # would also delete '#', making the prosody branch below unreachable.
    txt = re.sub(r'(?!#)\W', '', txt)

    # If txt carries prosody marks, use them; otherwise fall back to the
    # jieba part-of-speech segmentation.
    if '#' in txt:
        words, poses, rhythms = _adjust(txt)
    else:
        words = []
        poses = []
        for word, pos in posseg.cut(txt):
            words.append(word)
            poses.append(pos[0])
        # No prosody information: '#0' after every word, '#4' at the end.
        rhythms = ['#0'] * (len(words) - 1)
        rhythms.append('#4')

    syllables = txt2pinyin(''.join(words))

    if sfsfile:
        phs_type = []
        # Start time is an int like every time read from the file.
        times = [0]
        with open(sfsfile) as fid:
            for line in fid:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing empty line).
                    continue
                assert len(fields) == 2, 'check format of sfs file'
                time, ph = fields
                times.append(int(float(time)))
                phs_type.extend(ph)
    else:
        # No alignment: one 'a' per phone, wrapped in leading and trailing
        # silence, with all-zero times (one more entry than phs_type).
        phone_num = sum(len(syllable) for syllable in syllables)
        phs_type = ['s'] + ['a'] * phone_num + ['s']
        times = [0] * (phone_num + 3)

    phone = tree(words, rhythms, syllables, poses, phs_type)
    return LabGenerator(phone, rhythms, times)