def guessStress(input_string):
    """Guess the best stress pattern for a space-separated, stressless transcription.

    Every vowel slot is expanded to each stress digit that vowel can carry
    (per the Blick grammar's vowel inventory); if any fully-stressed candidate
    is already in ``word_list`` it is returned immediately, otherwise the
    candidate with the lowest (best) Blick phonotactic score wins.
    """
    b = BlickLoader()
    # Map each stressless vowel symbol to the set of stress digits it may take.
    vows = {v[:-1]: {u[-1] for u in b.vowels if u[:-1] == v[:-1]} for v in b.vowels}
    phones = input_string.split(" ")
    # One set of candidate stress digits per vowel slot, in order of appearance.
    stress_pattern_space = [vows[ph] for ph in phones if ph in vows]
    # Build a %-format template with one "%s" slot appended to each vowel.
    template = ' '.join(ph + "%s" if ph in vows else ph for ph in phones)
    possible_patterns = list(itertools.product(*stress_pattern_space))
    candidates = [template % combo for combo in possible_patterns]
    best_score = 1000
    best_candidate = ''
    for cand in candidates:
        # A known dictionary form wins outright.
        if cand in word_list:
            return cand
        cand_score = b.assessWord(cand)
        if cand_score < best_score:
            best_score = cand_score
            best_candidate = cand
    return best_candidate
def getPhonScores(puns):
    """Score each pun by summing Blick phonotactic scores of its '#'-separated words.

    The text after the final '#' is discarded, as are words containing the
    "D I C T" marker.  Returns a dict mapping each input pun string to its
    summed score.
    """
    b = BlickLoader()
    scoredPuns = {}
    for pun in puns:
        # The trailing split fragment (after the last '#') is not a word.
        words = pun.split('#')[:-1]
        goodWords = [w for w in words if "D I C T" not in w]
        scoredPuns[pun] = sum(b.assessWord(w.strip()) for w in goodWords)
    return scoredPuns
def train_wakeword_model(audio_train_loader, vocab_list, label_model, beam_size=3, num_hypotheses=5, query_by_string=False):
    """Build a wakeword model as a dict of (hypothesis, weight) entries.

    Parameters:
        audio_train_loader: iterable of audio examples to decode (from-scratch path).
        vocab_list: CTC label vocabulary; must contain the blank symbol '_'.
        label_model: callable mapping an example to CTC posteriors.
        beam_size: CTC beam width for decoding.
        num_hypotheses: number of beam hypotheses kept per example.
        query_by_string: if True, load keyword pronunciations produced by MFA
            from the global `config` and weight them with Blick scores instead
            of training from audio.

    Returns:
        dict: index -> (y_hat, w) in the MFA path, or
              example -> [(y_hat, w), ...] in the from-scratch path.
    """
    wakeword_model = {}
    if query_by_string:
        # load ww model produced by MFA from config
        keywords = config["wakeword_model"]
        # load blick
        b = BlickLoader()
        # BUG FIX: enumerate(...) yields (index, (key, value)) 2-tuples; the
        # original `for i, _, y_hat in enumerate(...)` raised ValueError.
        for i, (_, y_hat) in enumerate(keywords.items()):
            w = b.assessWord(y_hat)
            # for each keyword, append the tuple(hypotheses + weights) to the list
            # only one hypothesis if using MFA
            wakeword_model[i] = (y_hat, w)
    else:
        # train ww model from scratch
        for i in audio_train_loader:
            posteriors_i = label_model(i)
            # BUG FIX: this is a free function, not a method — the original
            # referenced self.vocab_list / self.beam_size (NameError); use
            # the parameters instead.
            decoder = CTCBeamDecoder(vocab_list, beam_width=beam_size, blank_id=vocab_list.index('_'))
            beam, beam_scores, _, _ = decoder.decode(posteriors_i)
            for j in range(num_hypotheses):
                y_hat = beam[j]  # hypothesis
                log_prob_post = beam_scores[j]
                w = log_prob_post ** -1  # weight = inverse of the beam score
                # BUG FIX: the original appended to wakeword_model[i] without
                # ever creating the list, raising KeyError on first access.
                wakeword_model.setdefault(i, []).append((y_hat, w))
    return wakeword_model
def getPhonotacticProb(input_string, use_blick=True, no_stress=False):
    """Return phonotactic probability estimates for a space-separated transcription.

    If `use_blick` is True, returns a single Blick grammar score (float).
    Otherwise returns a (positional_prob, biphone_prob) tuple computed from
    positional pattern counts over the PhonoString table.

    Parameters:
        input_string: space-separated phone transcription.
        use_blick: score with the Blick grammar instead of corpus counts.
        no_stress: strip stress digits and match against the stressless field.
    """
    if use_blick:
        if no_stress:
            b = BlickLoader(grammarType='NoStress')
        else:
            b = BlickLoader()
        return b.assessWord(str(input_string))

    if no_stress:
        # Remove stress digits so phones match the NoStress field's format.
        input_string = re.sub('[0-9]', '', input_string)

    def _ratio(patt_segments, total_len):
        # Fraction of corpus entries matching `patt_segments` positionally,
        # relative to all entries at least `total_len` segments long.
        pattern = '^' + ' '.join(patt_segments) + '.*$'
        totPattern = '^' + ' '.join([any_segment] * total_len) + '.*$'
        field = 'NoStress__regex' if no_stress else 'Transcription__regex'
        # .count() lets the database count rows instead of materializing the
        # whole queryset just to call len() on it, as the original did.
        count = PhonoString.objects.filter(**{field: pattern}).count()
        totCount = PhonoString.objects.filter(**{field: totPattern}).count()
        return float(count) / float(totCount)

    phones = input_string.split(" ")
    SPprob = 0.0
    BPprob = 0.0
    for i in range(len(phones)):
        # Positional probability of phone i occurring at position i.
        SPprob += _ratio([any_segment] * i + [phones[i]], i + 1)
        if i != len(phones) - 1:
            # Biphone probability of (phone i, phone i+1) at positions i, i+1.
            BPprob += _ratio([any_segment] * i + [phones[i], phones[i + 1]], i + 2)
    SPprob = SPprob / float(len(phones))
    # BUG FIX: a single-phone input has no biphones; the original divided by
    # zero here. Report 0.0 instead of raising ZeroDivisionError.
    BPprob = BPprob / float(len(phones) - 1) if len(phones) > 1 else 0.0
    return (SPprob, BPprob)
# NOTE(review): this chunk starts mid-function — the enclosing `def` (which
# receives `syl`, `is_first`, `is_last`, `stress_lvl` and the SHORT_VOWELS
# constant) is outside this view, so the fragment below is its tail: a series
# of rejection rules for a candidate syllable, returning False to reject.
if syl.nucleus in SHORT_VOWELS:
    return False
# Word-initial "ZH" onsets are rejected.
if is_first and syl.onset and syl.onset[0] == "ZH":
    return False
# if is_last and stress_lvl == 1 and len(syl.coda) == 0:
#     return False
# Reject e.g. short-vowel syllables whose S-cluster onset ends with the same
# consonant the coda starts with (presumably avoids awkward repeats — TODO confirm).
if syl.onset and syl.coda and syl.onset[0] == "S" and not syl.onset[-1] == "T" and syl.nucleus in SHORT_VOWELS:
    if syl.coda[0] == syl.onset[-1]:
        return False
# Unstressed syllables may only carry a reducible nucleus from this set.
if stress_lvl != 1 and syl.nucleus not in {"AH", "ER", "IH", "IY", "OW", "UW"}:
    return False
return True

# --- script: generate 100 candidate words, rate them with Blick, and print
# --- the plausible ones (probability > 1e-5) sorted by score.
blick_rater = BlickLoader()
words = []
for i in range(100):
    word = getWord()
    # NOTE(review): replace(" ", " ") swaps a space for a space — a no-op as
    # written; presumably the original collapsed double spaces. TODO confirm.
    score, rules = blick_rater.assessWord(word.replace(" ", " "), includeConstraints=True)
    # Convert the Blick penalty (lower = better) into a probability-like value.
    score = exp(-score)
    words.append([word, score, rules])
# Sort ascending by score so the best candidates print last.
for word, score, rules in sorted(words, key=itemgetter(1)):
    if score > 0.00001:
        print(word, score, rules)
        print()
# NOTE(review): Python 2 script fragment (print statements, .next()).  The
# names `dictionary`, `chain` (a Markov-chain builder), `b` (BlickLoader),
# `phonetify`, and `search` (Google search) are defined outside this view.
# Seed the Markov chain with lowercase dictionary words, skipping possessives.
for word in open(dictionary):
    word = word.strip()
    if word != "" and not word.endswith("'s"):
        chain.add(word.lower())

#Make a word, check if it is within set range, search google for it, save it. Up to 5000 words
words = 0
while words < 5000:
    word = "".join(chain.random_output())
    # Only consider words of length 5-9.
    if len(word) > 4 and len(word) < 10:
        # `score` tracks the best (lowest) Blick score over all pronunciations.
        score = 100
        blickified = phonetify(word)
        for blicked in blickified:
            try:
                #sometimes this bails out, instead of tracking it down each time, this was an easy out
                thisscore = b.assessWord(blicked)
            except:
                # NOTE(review): bare except swallows everything, and if the
                # very first assessWord call fails, `thisscore` is unbound and
                # the comparison below raises NameError — latent bug.
                score = 100
            if thisscore < score:
                score = thisscore
        # Keep only moderately plausible words (0 < score < 18).
        if score > 0 and score < 18:
            try:
                first_url = search('"' + word + '"', num=10, stop=1)
                # Any Google hit means the word already exists — report as bad.
                for x in range(1, 10):
                    this_url = first_url.next()
                    print str(score) + ',' + word + " - bad " + this_url
            except StopIteration:
                # No search results: the word is novel — save and count it.
                with open('proop.out', 'a') as outfile:
                    outfile.write(str(score) + ',' + word + '\n')
                print str(score) + ',' + word + " - good"
                words += 1