def test_overrides():
    # Stuff that's not in the cmudict database
    assert syllables.count_syllables('1st') == {1}
    assert syllables.count_syllables('sauropod') == {3}
    # Stuff that's in the database, but which is also wrong
    # In emoji this is only Sa-ke 🍶
    assert syllables.count_syllables('sake') == {2}
def test_predictions(training_file, test_file1, counts):
    words, y_true = load_file(training_file)
    # Features per word: length, corpus frequency, syllable count, number of WordNet synsets.
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []
    for i in range(len(words)):
        feat1.append(len(words[i]))
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))
    mean1 = np.mean(feat1)
    mean2 = np.mean(feat2)
    mean3 = np.mean(feat3)
    mean4 = np.mean(feat4)
    std1 = np.std(feat1)
    std2 = np.std(feat2)
    std3 = np.std(feat3)
    std4 = np.std(feat4)
    Xtrain = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                              (feat3 - mean3) / std3, (feat4 - mean4) / std4))
    clf = RandomForestClassifier(max_depth=7, n_estimators=1000, criterion='entropy')
    clf.fit(Xtrain, y_true)

    words, y_true = load_file(test_file1)
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []
    for i in range(len(words)):
        feat1.append(len(words[i]))
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))
    print(len(feat1))
    print(len(feat2))
    print(len(feat3))
    print(len(feat4))
    # Standardize the test features with the training means/stds.
    Xtest = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                             (feat3 - mean3) / std3, (feat4 - mean4) / std4))
    y_pred = clf.predict(Xtest)
    # y_pred = [int(x) for x in y_pred]
    s = np.column_stack((words, y_true, y_pred))
    import pandas as pd
    df = pd.DataFrame(s)
    df.to_csv('f.csv')
def preprocess_yezheng(words, labels, counts):
    Thres_opt_len = 6
    Thres_opt_freq = 19904037  # <- 19903996 <- 19903896 <- 19903906 <- 19902396 <- 19881406 <- 19802396
    # Features per word: length, syllable count, length-threshold indicator,
    # frequency-threshold indicator, raw count, and per-letter counts for letter_sele.
    # Words missing from `counts` get the frequency indicator set to 1 and a large
    # placeholder count. This was the best-performing feature set.
    X_features = np.array(
        [[1.0 * len(w),
          count_syllables(w),
          [0, 1][len(w) > Thres_opt_len],
          int(counts[w] < Thres_opt_freq),
          counts[w]] + [w.count(alp) for alp in letter_sele]
         if w in counts else
         [1.0 * len(w),
          count_syllables(w),
          [0, 1][len(w) > Thres_opt_len],
          1,
          1120679362] + [w.count(alp) for alp in letter_sele]
         for w in words])
    # Variant without the raw-count column:
    # X_features = np.array([[1.0*len(w), count_syllables(w), [0,1][len(w) > Thres_opt_len],
    #                         int(counts[w] < Thres_opt_freq)] + [w.count(alp) for alp in letter_sele]
    #                        if w in counts else
    #                        [1.0*len(w), count_syllables(w), [0,1][len(w) > Thres_opt_len], 1]
    #                        + [w.count(alp) for alp in letter_sele]
    #                        for w in words])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(X_features)
    X_features = scaler.transform(X_features)
    # X_features = np.array([np.concatenate((row, np.convolve(row, row))) for row in X_features])
    # scaler = sklearn.preprocessing.StandardScaler(); scaler.fit(X_features); X_features = scaler.transform(X_features)
    return X_features, np.array(labels)
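# A minimal sketch of the usual sklearn scaling pattern, since preprocess_yezheng
# refits a fresh StandardScaler on whichever split it is given (so train and test
# rows end up scaled with different statistics). The helper name and numbers below
# are illustrative only: fit once on the training rows, reuse the scaler elsewhere.
def _scaler_sketch():
    import numpy as np
    from sklearn.preprocessing import StandardScaler

    X_train = np.array([[4.0, 1.0], [9.0, 3.0], [6.0, 2.0]])  # toy feature rows
    X_test = np.array([[5.0, 2.0]])
    scaler = StandardScaler().fit(X_train)  # statistics come from the training rows only
    return scaler.transform(X_train), scaler.transform(X_test)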
def test_syllable_count():
    # Cam-ra or Cam-e-ra
    assert syllables.count_syllables("camera") == {2, 3}
    # You'd be pretty crazy to pronounce this as Cam-ra, Cam-er-a but we'll allow it.
    assert syllables.count_syllables("camera, camera") == {4, 5, 6}
    assert syllables.count_syllables("Unicorn?!") == {3}
    assert syllables.count_syllables("Yes, Unicorn.") == {4}
    assert syllables.count_syllables("truffles") == {2}
    assert syllables.count_syllables("No, *you're* crazy!") == {4}
def line_of_length(n, lm, thecontext=()):
    """Generate a line of n syllables, using the given language model."""
    for attempt in range(100):
        out = []
        total = 0
        words = lm.generate(n, context=thecontext)
        words = words[len(thecontext):]
        for word in words:
            out.append(word)
            total += syllables.count_syllables(word)
            if total == n:
                return " ".join(out).lower()
            if total > n:
                break
    print("WEIRD FAILURE")
    return random.choice(fives if (n == 5) else sevens)
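# A minimal usage sketch, assuming `lm` is the trained language model built
# elsewhere in this project: a haiku is three lines of 5, 7 and 5 syllables,
# so the generator above can simply be called three times. The helper name
# below is illustrative, not part of the original code.
def _haiku_sketch(lm):
    return "\n".join(line_of_length(n, lm) for n in (5, 7, 5))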
def _map_description_to_emoji_and_syllable_count(
        emoji_desc_pairs: Iterable[Tuple[Emoji, str]]
) -> Dict[int, List[Tuple[Emoji, str]]]:
    """Takes a list of [Emoji, description] pairs and maps them to a dict of format:

    [syllable count] --> A list of all [emoji, description] pairs where the
    description has that syllable count.
    """
    return_dict: Dict[int, List[Tuple[Emoji, str]]] = {}
    for emoji, desc in emoji_desc_pairs:
        syllable_options = count_syllables(desc)
        for syllable_count in syllable_options:
            list_for_syllable_count = return_dict.get(syllable_count, [])
            list_for_syllable_count.append((emoji, desc))
            return_dict[syllable_count] = list_for_syllable_count
    return return_dict
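# Illustrative call only (the emoji/description pairs here are made up; the
# expected counts follow the syllable tests above, where "sake" -> {2} and
# "unicorn" -> {3}):
#
#     by_count = _map_description_to_emoji_and_syllable_count(
#         [("🍶", "sake"), ("🦄", "unicorn")])
#     # by_count == {2: [("🍶", "sake")], 3: [("🦄", "unicorn")]}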
def line_of_length(nsyllables, model, context=None):
    """Generate a line with nsyllables syllables via recursive search."""
    context = list(context) if context else []  # avoid a mutable default argument
    for i in range(10):
        if not context:
            candidate = random.sample(model._ngrams, 1)[0][0]
        else:
            candidate = model.choose_random_word(context)
        candidatelen = count_syllables(candidate.lower())
        if candidatelen == nsyllables:
            return [candidate]
        elif candidatelen > nsyllables:
            continue
        else:
            searchfurther = line_of_length(nsyllables - candidatelen, model,
                                           context + [candidate])
            if searchfurther:
                proposed = [candidate] + searchfurther
                return proposed
    return None
def get_features(words, counts, normalize_mean=None, normalize_std=None):
    length_frequency, normalize_mean, normalize_std = get_length_and_frequency(
        words, counts, normalize_mean, normalize_std)
    syllable_count = []
    synonym_count = []
    frequency_ratio = []
    for word in words:
        syllable_count.append([syllables.count_syllables(word)])
        # Count distinct WordNet lemma names across all synsets of the word.
        synonym = []
        for syn in wordnet.synsets(word):
            for original in syn.lemmas():
                synonym.append(original.name())
        synonym = set(synonym)
        synonym_count.append([len(synonym)])
    features = np.concatenate(
        (np.array(length_frequency), np.array(syllable_count),
         np.array(synonym_count)),
        axis=1)
    return features, normalize_mean, normalize_std
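# A minimal sketch of the WordNet lookup used above, shown in isolation (requires
# nltk's wordnet corpus to be installed; the default word is just an example).
def _synonym_count_sketch(word="dog"):
    from nltk.corpus import wordnet
    lemma_names = {lemma.name() for syn in wordnet.synsets(word) for lemma in syn.lemmas()}
    return len(lemma_names)  # this count is what get_features stores as the synonym feature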
def syllables_feature(words):
    word_syllables_feature = []
    for word in words:
        word_syllables_feature.append(syllables.count_syllables(word))
    return np.array(word_syllables_feature).T
def classifier(training_file, development_file, test_file, awl_file, dc_file, counts, train_dev):
    curr_classifier = LogisticRegression()
    full_classifier = LogisticRegression()
    file = open(training_file, 'rt', encoding="utf-8")
    # return dictionaries
    sen_len = sentence_length(file, False)
    file.close()
    dc_list = load_words(dc_file)
    awl_list = load_words(awl_file)
    top1000_list = load_words(top1000_file)
    # put number of features here
    num_features = 8
    words, labels = load_file(training_file)
    training_dic = dict(zip(words, labels))
    words, labels = load_file(development_file)
    development_dic = dict(zip(words, labels))
    features_matrix = np.zeros((len(training_dic), num_features))
    lab_vec = np.zeros(len(training_dic))
    i = 0
    for word in training_dic.keys():
        lab_vec[i] = training_dic[word]
        # 0 index feature is word length
        features_matrix[i, 0] = len(word)
        # 1 index feature is word count
        count = counts[word]
        if count == 0:
            fixed_word = re.sub(pattern="-", repl="", string=word)
            count = counts[fixed_word]
        features_matrix[i, 1] = count
        # 2 index feature is word syllables
        features_matrix[i, 2] = syllables.count_syllables(word)
        # 3 index feature is wordnet synsets
        features_matrix[i, 3] = wordnet_sens(word)
        # 4 index feature is sentence length
        features_matrix[i, 4] = sen_len[word]
        # 5 index feature is indicator for presence in DC list
        features_matrix[i, 5] = in_list(word, dc_list)
        # 6 index feature is indicator for presence in AWL list
        features_matrix[i, 6] = in_list(word, awl_list)
        # 7 index feature is indicator for presence in top 1000 most common words list
        features_matrix[i, 7] = in_list(word, top1000_list)
        i += 1
    mean_list = list()
    std_list = list()
    for i in range(len(features_matrix[1, :])):
        mean_list.append(np.mean(features_matrix[:, i]))
        std_list.append(np.std(features_matrix[:, i]))
    features_matrix_stand = standardize(features_matrix, mean_list, std_list)
    dev_matrix = np.zeros((len(development_dic), num_features))
    dev_vec = np.zeros(len(development_dic))
    file = open(development_file, 'rt', encoding="utf-8")
    # return dictionaries
    sen_len = sentence_length(file, False)
    file.close()
    i = 0
    word_vec = list()
    for word in development_dic.keys():
        word_vec.append(word)
        # 0 index feature is word length
        dev_matrix[i, 0] = len(word)
        # 1 index feature is word count
        count = counts[word]
        if count == 0:
            fixed_word = re.sub(pattern="-", repl="", string=word)
            count = counts[fixed_word]
        dev_matrix[i, 1] = count
        dev_vec[i] = development_dic[word]
        # 2 index feature is word syllables
        dev_matrix[i, 2] = syllables.count_syllables(word)
        # 3 index feature is wordnet synsets
        dev_matrix[i, 3] = wordnet_sens(word)
        # 4 index feature is sentence length
        dev_matrix[i, 4] = sen_len[word]
        # 5 index feature is indicator for presence in DC list
        dev_matrix[i, 5] = in_list(word, dc_list)
        # 6 index feature is indicator for presence in AWL list
        dev_matrix[i, 6] = in_list(word, awl_list)
        # 7 index feature is indicator for presence in top 1000 most common words list
        dev_matrix[i, 7] = in_list(word, top1000_list)
        i += 1
    curr_classifier.fit(features_matrix_stand, lab_vec)
    dev_matrix_stand = standardize(dev_matrix, mean_list, std_list)
    train_predict = curr_classifier.predict(features_matrix_stand)
    dev_predict = curr_classifier.predict(dev_matrix_stand)
    print("Development Classifier Performance Statistics")
    test_predictions(dev_predict, dev_vec)
    print("Training Classifier Performance Statistics")
    test_predictions(train_predict, lab_vec)
    # print(mean_list)
    # print(std_list)
    if train_dev:
        full_matrix = np.concatenate((features_matrix, dev_matrix), axis=0)
        full_pred = np.concatenate((lab_vec, dev_vec))
        mean_list = list()
        std_list = list()
        for i in range(len(full_matrix[1, :])):
            mean_list.append(np.mean(full_matrix[:, i]))
            std_list.append(np.std(full_matrix[:, i]))
        full_matrix = standardize(full_matrix, mean_list, std_list)
        full_classifier.fit(full_matrix, full_pred)
        # print(mean_list)
        # print(std_list)
    else:
        # Fall back to the training split alone so full_classifier is fitted either way.
        full_classifier.fit(features_matrix_stand, lab_vec)
    test_words = load_test_file(test_file)
    file = open(test_file, 'rt', encoding="utf-8")
    # return dictionaries
    sen_len = sentence_length(file, True)
    file.close()
    test_matrix = np.zeros((len(test_words), num_features))
    i = 0
    for word in test_words:
        # 0 index feature is word length
        test_matrix[i, 0] = len(word)
        # 1 index feature is word count
        count = counts[word]
        if count == 0:
            fixed_word = re.sub(pattern="-", repl="", string=word)
            count = counts[fixed_word]
        test_matrix[i, 1] = count
        # 2 index feature is word syllables
        test_matrix[i, 2] = syllables.count_syllables(word)
        # 3 index feature is wordnet synsets
        test_matrix[i, 3] = wordnet_sens(word)
        # 4 index feature is sentence length
        test_matrix[i, 4] = sen_len[word]
        # 5 index feature is indicator for presence in DC list
        test_matrix[i, 5] = in_list(word, dc_list)
        # 6 index feature is indicator for presence in AWL list
        test_matrix[i, 6] = in_list(word, awl_list)
        # 7 index feature is indicator for presence in top 1000 most common words list
        test_matrix[i, 7] = in_list(word, top1000_list)
        i += 1
    test_matrix = standardize(test_matrix, mean_list, std_list)
    test_predict = full_classifier.predict(test_matrix)
    return test_predict
if __name__ == "__main__":
    words = sys.stdin.read().split()
    count = 0
    detected_first = False
    detected_second = False
    detected_third = False
    stop_point_first = 0
    stop_point_second = 0
    stop_point_third = 0
    for i, word in enumerate(words):
        # running syllable total for the current line of the 5-7-5 pattern
        n_syllables = count_syllables(word)
        count += n_syllables
        if count == 5 and not detected_first:
            count = 0
            detected_first = True
            stop_point_first = i + 1
        if count == 7 and detected_first and not detected_second:
            count = 0
            detected_second = True
            stop_point_second = i + 1
        if count == 5 and detected_first and detected_second and not detected_third:
            count = 0
            detected_third = True
            stop_point_third = i + 1
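# A minimal sketch of how the stop points computed above could be used: each one is
# the exclusive end index of a detected 5-, 7- or 5-syllable segment of `words`.
# The helper name is illustrative, not part of the original script.
def _split_at_stop_points(words, stop_point_first, stop_point_second, stop_point_third):
    first = words[:stop_point_first]
    second = words[stop_point_first:stop_point_second]
    third = words[stop_point_second:stop_point_third]
    return first, second, third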
def random_forrest(training_file, development_file, test_file, counts):
    # load in features and labels for all words & create feature vectors
    twords, Y_t = load_file(training_file)
    tavg_word, tsentence_len, tword_freq = get_sentence_features(training_file)
    X_train = []
    for i in range(len(twords)):
        word = twords[i]
        X_train.append([
            counts[word],
            len(word),
            count_syllables(word), tavg_word[i], tsentence_len[i], tword_freq[i]
        ])
    X_train = np.array(X_train, dtype='float32')
    dwords, Y_d = load_file(development_file)
    davg_word, dsentence_len, dword_freq = get_sentence_features(development_file)
    X_dev = []
    for i in range(len(dwords)):
        word = dwords[i]
        X_dev.append([
            counts[word],
            len(word),
            count_syllables(word), davg_word[i], dsentence_len[i], dword_freq[i]
        ])
    X_dev = np.array(X_dev, dtype='float32')
    rwords = load_test_file(test_file)
    ravg_word, rsentence_len, rword_freq = get_sentence_features(test_file)
    X_test = []
    for i in range(len(rwords)):
        word = rwords[i]
        X_test.append([
            counts[word],
            len(word),
            count_syllables(word), ravg_word[i], rsentence_len[i], rword_freq[i]
        ])
    X_test = np.array(X_test, dtype='float32')
    # standardize data
    mean = np.mean(X_train, axis=0)
    sd = np.std(X_train, axis=0)
    X_train = (X_train - mean) / sd
    X_dev = (X_dev - mean) / sd
    X_test = (X_test - mean) / sd
    # build Random Forest Model trained on training file
    clf = RandomForestClassifier()
    clf.fit(X_train, Y_t)
    # evaluate model using training and development files & return metrics
    Y_tpred = clf.predict(X_train).tolist()
    tprecision, trecall, tfscore = evaluate(Y_tpred, Y_t)
    Y_dpred = clf.predict(X_dev).tolist()
    dprecision, drecall, dfscore = evaluate(Y_dpred, Y_d)
    training_performance = [tprecision, trecall, tfscore]
    development_performance = [dprecision, drecall, dfscore]
    # make predictions using model on test set and store in txt file for teacher evaluation
    Y_testpred = clf.predict(X_test).tolist()
    with open("test_labels.txt", "w") as f:
        for label in Y_testpred:
            f.write(str(label) + "\n")
    return training_performance, development_performance
def test_throws_when_cant_find_word():
    assert syllables.count_syllables("rex") == {1}
    with pytest.raises(KeyError):
        syllables.count_syllables("gronkasaurus rex")
def own_classifier(training_file, development_file, test_file1, extra_file, counts):
    words, y_true1 = load_file(training_file)
    words1, y_true2 = load_file2(extra_file)
    words.extend(words1)
    y_true1.extend(y_true2)
    # Features per word: length, corpus frequency, syllable count, number of WordNet synsets.
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []
    for i in range(len(words)):
        # print(i)
        # print(words)
        feat1.append(len(words[i]))
        if words[i] not in counts:
            counts[words[i]] = 1
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))
    mean1 = np.mean(feat1)
    mean2 = np.mean(feat2)
    mean3 = np.mean(feat3)
    mean4 = np.mean(feat4)
    std1 = np.std(feat1)
    std2 = np.std(feat2)
    std3 = np.std(feat3)
    std4 = np.std(feat4)
    Xtrain = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                              (feat3 - mean3) / std3, (feat4 - mean4) / std4))
    best_fscore = -1
    dep = -1
    est = -1
    # grid search over max_depth (n_estimators is effectively fixed at 500)
    for a in range(1, 12):
        for b in range(500, 501, 1):
            clf = RandomForestClassifier(max_depth=a,
                                         n_estimators=b,
                                         criterion='entropy',
                                         bootstrap=False)
            # from sklearn.neural_network import MLPClassifier
            # clf = MLPClassifier(alpha=1e-2, hidden_layer_sizes=(5, 2), random_state=1)
            # from sklearn.svm import SVC
            # clf = SVC(C=1, tol=1e-9, gamma=0.10)
            # from sklearn import tree
            # clf = tree.DecisionTreeClassifier(max_depth=4)
            clf.fit(Xtrain, y_true1)
            y_pred = clf.predict(Xtrain)
            tprecision = get_precision(y_pred, y_true1)
            trecall = get_recall(y_pred, y_true1)
            tfscore = get_fscore(y_pred, y_true1)
            words, y_true = load_file(development_file)
            feat1 = []
            feat2 = []
            feat3 = []
            feat4 = []
            for i in range(len(words)):
                feat1.append(len(words[i]))
                feat2.append(counts[words[i]])
                feat3.append(syllables.count_syllables(words[i]))
                feat4.append(len(wn.synsets(words[i])))
            # print(len(feat1))
            # print(len(feat2))
            # print(len(feat3))
            # print(len(feat4))
            Xtest = np.column_stack(
                ((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                 (feat3 - mean3) / std3, (feat4 - mean4) / std4))
            y_pred = clf.predict(Xtest)
            dprecision = get_precision(y_pred, y_true)
            drecall = get_recall(y_pred, y_true)
            dfscore = get_fscore(y_pred, y_true)
            training_performance = [tprecision, trecall, tfscore]
            development_performance = [dprecision, drecall, dfscore]
            if best_fscore < dfscore:
                best_fscore = dfscore
                dep = a
                est = b
    print(best_fscore)
    print(dep)
    print(est)
    # refit with the best hyperparameters and label the test file
    clf = RandomForestClassifier(max_depth=dep,
                                 n_estimators=est,
                                 criterion='entropy',
                                 bootstrap=False)
    # clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    # Xtrain = np.vstack((Xtrain, Xtest))
    # y_true1.append(y_true)
    clf.fit(Xtrain, y_true1)
    words, y_true = load_file1(test_file1)
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []
    for i in range(len(words)):
        feat1.append(len(words[i]))
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))
    # print(len(feat1))
    # print(len(feat2))
    # print(len(feat3))
    # print(len(feat4))
    Xtest = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                             (feat3 - mean3) / std3, (feat4 - mean4) / std4))
    y_pred = clf.predict(Xtest)
    # y_pred = [int(x) for x in y_pred]
    with open('test_labels.txt', 'w') as f:
        y_pred = list(map(lambda a: str(a) + '\n', y_pred))
        f.writelines(y_pred)
    return training_performance, development_performance
def get_syllables(words):
    return [syllables.count_syllables(word) for word in words]