Exemple #1
0
def gop_feat_simple(gop_vals, textgrid_file):
    """
    Calculate average gop values of vowels, consonants and syllables

    Phones are read from ``tiers[1]`` of the textgrid. Intervals marked 'SIL'
    or 'SPN' are left out of the statistics. The remaining marks are
    syllabified with ``syllabifier.English`` so that the per-phone gop values
    can be averaged per syllable.

    :param gop_vals: gop values of one utterance extracted from gop files;
        read with an index that advances once per marked interval of the
        phone tier
    :param textgrid_file: textgrid file
    :return: tuple ``(values, names)``; values is
        ``[mean vowel gop, mean consonant gop, mean syllable gop]`` and names
        is ``["gop_avgV", "gop_avgC", "gop_avgSyl"]``. A category for which
        nothing was collected is reported as 1.
    """
    silence_marks = ('SIL', 'SPN')

    textgrid = TextGrid()
    textgrid.read(textgrid_file)
    phone_tier = textgrid.tiers[1]

    # Space-separated phone string handed to the syllabifier.
    phn_seq = "".join(
        " " + interval.mark
        for interval in phone_tier
        if interval.mark is not None and interval.mark not in silence_marks
    )

    language = syllabifier.English
    syllables = syllabifier.syllabify(language, str(phn_seq))

    vowel_gop, consonants_gop, syllable_gop = [], [], []

    syllable_idx = 0  # determine which syllable current phoneme is in
    phn_idx = 0  # position of the current marked interval in gop_vals
    syllable_phn_gop = []  # gop values collected for the open syllable
    syllable_marks = []  # marks already matched inside the open syllable

    for interval in phone_tier:
        mark = interval.mark
        if mark is None:
            continue

        if mark not in silence_marks:
            # vowels and consonants
            if mark in vowels:
                vowel_gop.append(gop_vals[phn_idx])
            elif mark in consonants:
                consonants_gop.append(gop_vals[phn_idx])
            else:
                # NOTE(review): kept from the original - a mark that is
                # neither vowel nor consonant is skipped WITHOUT advancing
                # phn_idx. Confirm this matches the layout of the gop files.
                continue

            # syllables
            if syllable_idx < len(syllables):
                syllable = syllables[syllable_idx]
                current_syllable = syllable[1] + syllable[2] + syllable[3]
                if mark in current_syllable:
                    syllable_phn_gop.append(gop_vals[phn_idx])
                    syllable_marks.append(mark)
                # Close the syllable only on the FINAL occurrence of its last
                # phoneme. Comparing the mark alone closed syllables such as
                # "T EH S T" on the first "T" and shifted every later one.
                if (mark == current_syllable[-1]
                        and syllable_marks.count(mark)
                        >= current_syllable.count(mark)):
                    syllable_idx += 1
                    syllable_gop.append(np.mean(syllable_phn_gop))
                    syllable_phn_gop = []
                    syllable_marks = []

        phn_idx += 1

    # Fall back to a neutral value so that np.mean never sees an empty list.
    for collected in (vowel_gop, consonants_gop, syllable_gop):
        if not collected:
            collected.append(1)

    values = [np.mean(vowel_gop), np.mean(consonants_gop), np.mean(syllable_gop)]
    return values, ["gop_avgV", "gop_avgC", "gop_avgSyl"]
Exemple #2
0
def syllabify(phonemes, language, nonsilence_phonemes):
    """
    Split a phoneme sequence into syllables.

    :param phonemes: phoneme sequence to syllabify
    :param language: key into ``LANGUAGES``
    :param nonsilence_phonemes: container of phonemes that are not silence
    :return: one list per syllable, made by concatenating elements 1, 2 and 3
        of each entry returned by ``syllabifier.syllabify``; ``None`` when the
        language has no ``'syllabifier_conf'`` or when ``phonemes`` is a
        single phoneme that is not in ``nonsilence_phonemes``
    """
    conf = LANGUAGES[language].get('syllabifier_conf', None)
    if conf is None:
        return None

    only_one = len(phonemes) == 1
    if only_one and phonemes[0] not in nonsilence_phonemes:
        return None

    flattened = []
    for entry in syllabifier.syllabify(conf, phonemes):
        flattened.append(entry[1] + entry[2] + entry[3])
    return flattened
def syllabify(phonemes, language, nonsilence_phonemes):
    """
    Syllabify ``phonemes`` with the syllabifier configuration of ``language``.

    Returns ``None`` when ``LANGUAGES[language]`` carries no
    ``'syllabifier_conf'`` entry, or when the input is one lone phoneme that
    is not listed in ``nonsilence_phonemes``. Otherwise returns a list with
    one item per syllable, each being elements 1, 2 and 3 of the raw
    syllabifier entry joined with ``+``.
    """
    settings = LANGUAGES[language]
    syllabifier_conf = settings.get('syllabifier_conf', None)
    if syllabifier_conf is None:
        return None

    if len(phonemes) == 1:
        if phonemes[0] not in nonsilence_phonemes:
            return None

    raw = syllabifier.syllabify(syllabifier_conf, phonemes)
    return [item[1] + item[2] + item[3] for item in raw]
def wrd_to_slbs(phn_in_wrd, wrd_tuple):
    """
    Syllabify the phones of one word and return the result as a string.

    :param phn_in_wrd: iterable of phone tuples; element 0 of each tuple is
        the phone token. Tokens equal to 'h#' are dropped.
    :param wrd_tuple: not used by the current implementation; kept so that
        existing callers keep working
    :return: lower-cased output of ``syllabifier.stringify`` for the
        upper-cased, space-separated phone string
    """
    lang = syllabifier.English

    # Filter first, then join: the previous "if i == 0" test ran after the
    # 'h#' skip, so a leading 'h#' left a stray space at the start.
    tokens = [phn_tuple[0] for phn_tuple in phn_in_wrd if phn_tuple[0] != 'h#']
    phns = ' '.join(tokens).upper()

    # NOTE(review): an earlier draft wrapped this call in fallbacks for
    # ambiguous phones and exception words; any error now propagates.
    slbs = syllabifier.stringify(syllabifier.syllabify(lang, phns))
    return slbs.lower()
Exemple #5
0
def add_syllable_boundaries(yiddish_patterns, word):
    """
    Insert '-' between the syllables of ``word``.

    The word is split on punctuation (the punctuation marks are kept), each
    remaining piece is syllabified on its own, and the pieces are joined back
    together in their original order.

    :param yiddish_patterns: language configuration passed through to
        ``syllabifier.syllabify``
    :param word: word to syllabify
    :return: the word with '-' between syllables; a piece that cannot be
        syllabified is kept unchanged
    """
    punctuation = (
        '"', "'", '.', '-', '\\', '/', '!', '?', '־', '„', '״', '“',
        '”', '′', '″', ';',
    )
    # special Latin placeholder chars -> Yiddish chars
    latin_to_yiddish = (('j', 'י'), ('ņ', 'נ'), ('Ņ', 'ן'), ('ļ', 'ל'))

    subwords_syllabified = []

    # split on punctuation, but retain punctuation marks
    # NOTE(review): same character class as before, now as a raw string. It
    # does NOT contain the backslash even though `punctuation` does - confirm
    # whether backslash was meant to be a separator too.
    subwords = re.split(r'([\.\-\/!\?־„“”′״″"\';])', word)
    for subword in subwords:
        if subword in punctuation:
            subwords_syllabified.append(subword)
            continue

        try:
            # syllabify word
            syllables = syllabifier.syllabify(yiddish_patterns, subword)
            result = ''
            for syllable in syllables:
                for item in syllable:
                    if isinstance(item, list) and len(item) > 0:
                        result += ''.join(item)
                result += '-'

            # change the special Latin chars back to Yiddish chars
            for latin, yiddish in latin_to_yiddish:
                result = result.replace(latin, yiddish)
            result = re.sub(r'\-$', '', result)
        except Exception:
            # If there's a non-phoneme char in the piece, keep it as-is.
            # Append instead of rebinding the list: rebinding threw away
            # every piece collected before the failure.
            result = subword

        subwords_syllabified.append(result)

    return ''.join(subwords_syllabified)
Exemple #6
0
def gop_feat(gop_vals, textgrid_file):
    """
    Calculate gop statistics on vowels, consonants and syllables

    Phones are read from ``tiers[1]`` of the textgrid. Intervals marked 'SIL'
    or 'SPN' are left out of the statistics. The remaining marks are
    syllabified with ``syllabifier.English`` so that the per-phone gop values
    can be averaged per syllable.

    :param gop_vals: gop values of one utterance extracted from gop files;
        read with an index that advances once per marked interval of the
        phone tier
    :param textgrid_file: textgrid file
    :return: tuple ``(values, names)``; for vowels, consonants and syllables
        (in that order) values holds the minimum, the mean, the standard
        deviation and the standard deviation divided by the mean, and names
        holds the matching feature names. A category for which nothing was
        collected is treated as the single value 1, as in
        ``gop_feat_simple``.
    """
    silence_marks = ('SIL', 'SPN')

    textgrid = TextGrid()
    textgrid.read(textgrid_file)
    phone_tier = textgrid.tiers[1]

    # Space-separated phone string handed to the syllabifier.
    phn_seq = "".join(
        " " + interval.mark
        for interval in phone_tier
        if interval.mark is not None and interval.mark not in silence_marks
    )

    language = syllabifier.English
    syllables = syllabifier.syllabify(language, str(phn_seq))

    vowel_gop, consonants_gop, syllable_gop = [], [], []

    syllable_idx = 0  # determine which syllable current phoneme is in
    phn_idx = 0  # position of the current marked interval in gop_vals
    syllable_phn_gop = []  # gop values collected for the open syllable
    syllable_marks = []  # marks already matched inside the open syllable

    for interval in phone_tier:
        mark = interval.mark
        if mark is None:
            continue

        if mark not in silence_marks:
            # vowels and consonants
            if mark in vowels:
                vowel_gop.append(gop_vals[phn_idx])
            elif mark in consonants:
                consonants_gop.append(gop_vals[phn_idx])
            else:
                # NOTE(review): kept from the original - a mark that is
                # neither vowel nor consonant is skipped WITHOUT advancing
                # phn_idx. Confirm this matches the layout of the gop files.
                continue

            # syllables
            if syllable_idx < len(syllables):
                syllable = syllables[syllable_idx]
                current_syllable = syllable[1] + syllable[2] + syllable[3]
                if mark in current_syllable:
                    syllable_phn_gop.append(gop_vals[phn_idx])
                    syllable_marks.append(mark)
                # Close the syllable only on the FINAL occurrence of its last
                # phoneme. Comparing the mark alone closed syllables such as
                # "T EH S T" on the first "T" and shifted every later one.
                if (mark == current_syllable[-1]
                        and syllable_marks.count(mark)
                        >= current_syllable.count(mark)):
                    syllable_idx += 1
                    syllable_gop.append(np.mean(syllable_phn_gop))
                    syllable_phn_gop = []
                    syllable_marks = []

        phn_idx += 1

    # min() raises on an empty list; use the same neutral fallback as
    # gop_feat_simple so that an utterance without e.g. vowels still yields
    # a feature vector.
    for collected in (vowel_gop, consonants_gop, syllable_gop):
        if not collected:
            collected.append(1)

    vowel_mean = np.mean(vowel_gop)
    consonant_mean = np.mean(consonants_gop)
    syllable_mean = np.mean(syllable_gop)
    vowel_std = np.std(vowel_gop)
    consonant_std = np.std(consonants_gop)
    syllable_std = np.std(syllable_gop)

    values = [
        # minimum gops of vowel, consonant and syllable
        min(vowel_gop),
        min(consonants_gop),
        min(syllable_gop),
        # average gop of vowels, consonants and syllables
        vowel_mean,
        consonant_mean,
        syllable_mean,
        # standard deviation of gops of vowels, consonants and syllables
        vowel_std,
        consonant_std,
        syllable_std,
        # standard deviation normalized by average
        vowel_std / vowel_mean,
        consonant_std / consonant_mean,
        syllable_std / syllable_mean,
    ]
    names = [
        "gop_minV",
        "gop_minC",
        "gop_minSyl",
        "gop_avgV",
        "gop_avgC",
        "gop_avgSyl",
        "gop_stdV",
        "gop_stdC",
        "gop_stdSyl",
        "gop_VacroV",
        "gop_VacroC",
        "gop_VacroSyl",
    ]
    return values, names
Exemple #7
0
def rhythmic_feat(tg_file, threshold = 0.8):
    """
    Return rhythmic features from analysis of alignment file (.textgrid)

    Phones are read from ``tiers[1]`` of the textgrid. Intervals marked 'SIL'
    or 'SPN' are collected as silences; the remaining marks are syllabified
    with ``syllabifier.English`` to obtain syllable durations.

    :param tg_file: textgrid file
    :param threshold: duration threshold that a phoneme will be considered as
        bad alignment; a phone whose duration is not below it contributes
        nothing to the vowel, consonant and syllable durations
    :return: tuple ``(values, names)`` with 22 features: mean, standard
        deviation and normalized standard deviation of vowel, consonant and
        syllable durations, their share of the total duration, syllables per
        second, proportion / mean / standard deviation of the silences
        between the first and the last one, and raw and normalized PVI of
        vowels, consonants and syllables
    """
    silence_marks = ('SIL', 'SPN')

    my_tg_file = textgrid.TextGrid()
    my_tg_file.read(tg_file)
    phone_tier = my_tg_file.tiers[1]

    # Space-separated phone string handed to the syllabifier.
    phn_seq = "".join(
        " " + interval.mark
        for interval in phone_tier
        if interval.mark is not None and interval.mark not in silence_marks
    )

    language = syllabifier.English
    syllables = syllabifier.syllabify(language, str(phn_seq))

    vowel_interval, consonants_interval, syllable_interval, sil_interval = [], [], [], []

    syllable_idx = 0  # determine which syllable current phoneme is in
    syllable_phn_dur = []  # durations collected for the open syllable
    syllable_marks = []  # marks already matched inside the open syllable

    for interval in phone_tier:
        mark = interval.mark
        if mark is None:
            continue

        duration = float(interval.maxTime) - float(interval.minTime)
        if mark in silence_marks:
            sil_interval.append(duration)
            continue

        well_aligned = interval.maxTime - interval.minTime < threshold

        # vowels and consonants
        if mark in vowels:
            if well_aligned:
                vowel_interval.append(duration)
        elif mark in consonants:
            if well_aligned:
                consonants_interval.append(duration)
        else:
            continue

        # syllables
        if syllable_idx < len(syllables):
            syllable = syllables[syllable_idx]
            current_syllable = syllable[1] + syllable[2] + syllable[3]
            if mark in current_syllable:
                syllable_marks.append(mark)
                if well_aligned:
                    syllable_phn_dur.append(duration)
            # Close the syllable only on the FINAL occurrence of its last
            # phoneme. Comparing the mark alone closed syllables such as
            # "T EH S T" on the first "T" and shifted every later one.
            if (mark == current_syllable[-1]
                    and syllable_marks.count(mark)
                    >= current_syllable.count(mark)):
                syllable_idx += 1
                syllable_interval.append(sum(syllable_phn_dur))
                syllable_phn_dur = []
                syllable_marks = []

    # remove onset and offset silence
    sil_interval_true = sil_interval[1:-1]

    total_dur = float(my_tg_file.maxTime - my_tg_file.minTime)

    # Duration without the first and the last silence. Guarded so that a file
    # without silences no longer raises IndexError and a file with a single
    # silence does not subtract that one interval twice.
    speech_dur = float(my_tg_file.maxTime) - float(my_tg_file.minTime)
    if sil_interval:
        speech_dur -= sil_interval[0]
    if len(sil_interval) > 1:
        speech_dur -= sil_interval[-1]

    # NOTE(review): np.mean / np.std of an empty list yield nan (with a
    # RuntimeWarning); that behaviour is unchanged.
    vowel_mean = np.mean(vowel_interval)
    consonant_mean = np.mean(consonants_interval)
    syllable_mean = np.mean(syllable_interval)
    vowel_std = np.std(vowel_interval)
    consonant_std = np.std(consonants_interval)
    syllable_std = np.std(syllable_interval)

    values = [
        # average duration of vowels, consonants and syllables
        vowel_mean, consonant_mean, syllable_mean,
        # standard deviation of vowels, consonants and syllables
        vowel_std, consonant_std, syllable_std,
        # standard deviation normalized by average
        vowel_std / vowel_mean,
        consonant_std / consonant_mean,
        syllable_std / syllable_mean,
        # percentage of duration of vowels, consonants and syllables
        np.sum(vowel_interval) / total_dur,
        np.sum(consonants_interval) / total_dur,
        np.sum(syllable_interval) / total_dur,
        # number of syllables per second
        float(len(syllable_interval)) / total_dur,
        # proportion, average and std of silence duration
        np.sum(sil_interval_true) / float(speech_dur),
        np.mean(sil_interval_true), np.std(sil_interval_true),
        # raw PVI of vowels, consonants and syllables
        rPVI(vowel_interval), rPVI(consonants_interval), rPVI(syllable_interval),
        # normalized PVI of vowels, consonants and syllables
        nPVI(vowel_interval), nPVI(consonants_interval), nPVI(syllable_interval),
    ]
    names = [
        "avgV", "avgC", "avgSyl", "stdV", "stdC", "stdSyl",
        "VacroV", "VacroC", "VacroSyl",
        "perV", "perC", "perSyl", "SylPerSec", "perSil", "avgSil", "stdSil",
        "rPVIV", "rPVIC", "rPVISyl", "nPVIV", "nPVIC", "nPVISyl",
    ]
    return values, names
#
# by Josh Tauberer, 2008
#
# We read the CMU dictionary line by line from standard input,
# syllabify the line, and then print the result to standard output.

import sys
import syllabifier

language = syllabifier.English # syllabifier.loadLanguage("english.cfg")

# Every print below takes exactly one argument inside parentheses, which
# produces the same output on Python 2 and Python 3 (the bare print statement
# is a syntax error on Python 3).
print("## This is a syllabified version of the pronunciation dictionary")
print("## below made using the P2TK automated syllabifier. Periods")
print("## indicate syllable boundaries.")
print("##")
print("##")

for line in sys.stdin:
    line = line.rstrip()

    # Pass short lines and "##" comment lines through unchanged.
    if len(line) < 2 or line[0:2] == "##":
        print(line)
        continue

    # First field is the word, the remaining fields are its phonemes.
    fields = line.split(" ")

    word = fields.pop(0)
    phonemes = fields

    print(word + "  " + syllabifier.stringify(syllabifier.syllabify(language, phonemes)))
Exemple #9
0
# by Josh Tauberer, 2008
#
# We read the CMU dictionary line by line from standard input,
# syllabify the line, and then print the result to standard output.

import sys
import syllabifier

language = syllabifier.English  # syllabifier.loadLanguage("english.cfg")

# Single-argument, parenthesised print calls behave identically on Python 2
# and Python 3; the bare print statement does not parse on Python 3.
print("## This is a syllabified version of the pronunciation dictionary")
print("## below made using the P2TK automated syllabifier. Periods")
print("## indicate syllable boundaries.")
print("##")
print("##")

for line in sys.stdin:
    line = line.rstrip()

    # Short lines and "##" comment lines are echoed untouched.
    if len(line) < 2 or line[0:2] == "##":
        print(line)
        continue

    # The word comes first; everything after it is the phoneme list.
    fields = line.split(" ")

    word = fields.pop(0)
    phonemes = fields

    syllabified = syllabifier.stringify(
        syllabifier.syllabify(language, phonemes))
    print(word + "  " + syllabified)
def syl(word):
    """
    Split the spelling of ``word`` into syllables, separated by spaces.

    The first pronunciation of the lower-cased word is looked up in ``cmu``
    and syllabified with ``syllabifier.syllabify``. For every syllable the
    first phoneme of the onset (or of the nucleus when there is no onset) is
    located in the remaining spelling and used as the split point.

    Progress is printed to standard output for debugging.

    :param word: word to split
    :return: -1 when a ``KeyError`` is raised during lookup or
        syllabification; the word unchanged when the pronunciation has at
        most one syllable; otherwise the space-separated syllables
    """
    try:
        pronounce = " ".join(cmu[word.lower()][0])
        syl_pro = syllabifier.syllabify(eng, pronounce)
    except KeyError:
        return -1

    # Each print passes one pre-formatted string, so the output is the same
    # on Python 2 and Python 3 (the two spaces after the colon reproduce the
    # original "label ", value statements).
    print(syl_pro)

    if len(syl_pro) > 1:
        parsed_word = []

        for syllable, (stress, onset, nucleus, coda) in enumerate(syl_pro):
            split_point = nucleus[0] if onset == [] else onset[0]

            print(split_point)

            # If the split point is in the phoneme dictionary, find the index
            # of every spelling corresponding to the phoneme and take the
            # smallest one.
            if split_point in phoneme_dict:
                lowered = word.lower()
                candidates = (lowered.find(spelling)
                              for spelling in phoneme_dict[split_point])
                indices = sorted(index for index in candidates if index != -1)
                # NOTE(review): raises IndexError when none of the spellings
                # occurs in the word - unchanged behaviour.
                split_point_index = indices[0]
            else:
                # NOTE(review): find() returns -1 on a miss, which then
                # selects the last letter below - unchanged behaviour.
                split_point_index = word.lower().find(split_point.lower())

            print("caught:  %s" % (word[split_point_index],))

            # Past the first syllable, every letter preceding the onset of
            # the current syllable is appended to the previous syllable.
            if syllable > 0:
                parsed_word[syllable - 1].append(word[:split_point_index])

            syl_split = [word[split_point_index]]
            parsed_word.append(syl_split)

            # Continue with whatever follows the split letter.
            word = word[split_point_index + 1:]

            # The last syllable takes all remaining letters.
            if syllable + 1 == len(syl_pro):
                syl_split.append(word)

            print("word:  %s" % (word,))
            print("parsed:  %s" % (parsed_word,))

        return " ".join(["".join(a) for a in parsed_word])

    return word
def pro_syl(word):
    """
    Syllabify the first pronunciation stored for ``word`` in ``cmu``.

    :param word: key into ``cmu`` (used as given, without case folding)
    :return: whatever ``syllabifier.syllabify`` returns for the
        space-joined phonemes of that pronunciation
    """
    first_pronunciation = cmu[word][0]
    phoneme_string = " ".join(first_pronunciation)
    return syllabifier.syllabify(eng, phoneme_string)
def add_syllables(title, input_path, syllabification_file_path, output_path):
    """
    Add a syllable tier to a TextGrid and write the result to ``output_path``.

    The TextGrid ``<input_path>/<title>.TextGrid`` is read, the phones of
    every word are syllabified, and one interval per syllable is added to a
    new tier named after the words tier with 'words' replaced by 'sylls'.
    Each syllable interval is labelled with the first digit found in the
    syllable's phones, or '0' when there is none.

    :param title: file name of the TextGrid without the '.TextGrid' suffix
    :param input_path: directory holding the input TextGrid
    :param syllabification_file_path: JSON file with the language syllable
        structure for the syllabifier
    :param output_path: directory that receives the TextGrid with the extra
        tier (same file name, 'short' format)
    :return: None
    """
    # Load language syllable structure for the syllabifier
    with open(syllabification_file_path) as f:
        language_syllables = json.load(f)

    # Load the textgrid
    textgrid_name = title + '.TextGrid'
    tg = tgt.read_textgrid(os.path.join(input_path, textgrid_name))

    # Load name of all tiers
    tier_names = tg.get_tier_names()

    # Select the first tier whose name contains 'words'
    words_tier_name = [name for name in tier_names if 'words' in name][0]
    words_tier = tg.get_tier_by_name(words_tier_name)

    # Select the first tier whose name contains 'phones'
    phones_tier_name = [name for name in tier_names if 'phones' in name][0]
    phones_tier = tg.get_tier_by_name(phones_tier_name)

    # Start an empty tier for syllables
    syllable_tier = tgt.IntervalTier()
    syllable_tier.name = words_tier_name.replace('words', 'sylls')

    # Syllabify one word at a time
    for w in words_tier._get_annotations():

        # For the current word, get all of its phones
        phs = phones_tier.get_annotations_between_timepoints(w.start_time, w.end_time)
        for ph in phs:
            # NOTE(review): this rewrites the phone annotation in place, so
            # 'aa1' also ends up in the written file - confirm it is intended.
            if ph.text == 'spn':
                ph.text = 'aa1'

        # Transform the string of phones into a string of syllables
        # Format: ph1 ph2 . ph3 ph4 ph5 . ph6 etc.
        s = stringify(syllabify(' '.join([ph.text for ph in phs]), language_syllables))

        syllable_intervals = []
        offset = 0  # index in phs of the first phone of the current syllable
        for syll in (chunk.split() for chunk in s.split('.')):
            group = phs[offset:offset + len(syll)]
            offset += len(syll)

            # A word without phones yields one empty syllable; indexing it
            # used to raise IndexError, so such groups are skipped.
            if not group:
                continue

            # Stress label: first digit in the syllable's phones, else '0'
            stress = next((char for char in ''.join(syll) if char.isdigit()), '0')
            syllable_intervals.append(
                tgt.Interval(group[0].start_time, group[-1].end_time, str(stress)))

        if syllable_intervals:
            syllable_tier.add_annotations(syllable_intervals)

    tg.add_tier(syllable_tier)

    tgt.write_to_file(tg, os.path.join(output_path, textgrid_name), format='short')