Example #1
def test_overrides():
    # Stuff that's not in the cmudict database
    assert syllables.count_syllables('1st') == {1}
    assert syllables.count_syllables('sauropod') == {3}

    # Stuff that's in the database, but whose entry is wrong for our purposes
    # In emoji descriptions this is only sa-ke 🍶 (the drink), so two syllables
    assert syllables.count_syllables('sake') == {2}
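The tests above imply that count_syllables returns a set of possible counts and that a small hand-maintained override table takes precedence over cmudict. A minimal sketch of that layering, assuming NLTK's cmudict corpus and hypothetical override data (not the project's actual implementation):

from nltk.corpus import cmudict

_PRONUNCIATIONS = cmudict.dict()   # word -> list of phone sequences
_OVERRIDES = {"1st": {1}, "sauropod": {3}, "sake": {2}}   # hypothetical override data

def count_syllables_sketch(word):
    word = word.lower()
    if word in _OVERRIDES:
        return _OVERRIDES[word]
    # In cmudict, vowel phones end in a stress digit (0/1/2), so counting them
    # per pronunciation yields the set of possible syllable counts.
    return {sum(phone[-1].isdigit() for phone in pron)
            for pron in _PRONUNCIATIONS[word]}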
Example #2
def test_predictions(training_file, test_file1, counts):
    words, y_true = load_file(training_file)
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []

    for i in range(len(words)):
        feat1.append(len(words[i]))
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))

    mean1 = np.mean(feat1)
    mean2 = np.mean(feat2)
    mean3 = np.mean(feat3)
    mean4 = np.mean(feat4)

    std1 = np.std(feat1)
    std2 = np.std(feat2)
    std3 = np.std(feat3)
    std4 = np.std(feat4)

    Xtrain = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                              (feat3 - mean3) / std3, (feat4 - mean4) / std4))

    clf = RandomForestClassifier(max_depth=7,
                                 n_estimators=1000,
                                 criterion='entropy')
    clf.fit(Xtrain, y_true)

    words, y_true = load_file(test_file1)
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []
    for i in range(len(words)):
        feat1.append(len(words[i]))
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))

    print(len(feat1))
    print(len(feat2))
    print(len(feat3))
    print(len(feat4))

    Xtest = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                             (feat3 - mean3) / std3, (feat4 - mean4) / std4))
    y_pred = clf.predict(Xtest)
    #y_pred=[int(x) for x in y_pred]
    s = np.column_stack((words, y_true, y_pred))
    import pandas as pd
    df = pd.DataFrame(s)
    df.to_csv('f.csv')
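This and several later examples assume a load_file helper that returns parallel lists of words and 0/1 labels; a hedged sketch of such a loader, where the tab-separated word/label format is purely an assumption:

def load_file_sketch(path):
    # Hypothetical stand-in for load_file; the actual file format is an assumption.
    words, labels = [], []
    with open(path, "rt", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) >= 2:
                words.append(parts[0])
                labels.append(int(parts[1]))
    return words, labels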
Example #3
def preprocess_yezheng(words, labels, counts):
    Thres_opt_len = 6
    Thres_opt_freq = 19904037  # tuned threshold (earlier candidates: 19903996, 19903896, 19903906, 19902396, 19881406, 19802396)
    # 1.0*len(w),
    #     X_features = [[1.0*len(w),count_syllables(w),[0,1][len(w) > Thres_opt_len], int(counts[w] < Thres_opt_freq), counts[w] ]+[w.count(alp) for alp in letter_sele] if w in counts else [1.0*len(w),count_syllables(w),[0,1][len(w) > Thres_opt_len],1,1120679362]+[w.count(alp) for alp in letter_sele] for w in words]
    # best
    X_features = np.array([[
        1.0 * len(w),
        count_syllables(w), [0, 1][len(w) > Thres_opt_len],
        int(counts[w] < Thres_opt_freq), counts[w]
    ] + [w.count(alp) for alp in letter_sele] if w in counts else [
        1.0 * len(w),
        count_syllables(w), [0, 1][len(w) > Thres_opt_len], 1, 1120679362
    ] + [w.count(alp) for alp in letter_sele] for w in words])
    #     X_features = np.array([[1.0*len(w),count_syllables(w),[0,1][len(w) > Thres_opt_len], int(counts[w] < Thres_opt_freq) ]+[w.count(alp) for alp in letter_sele] if w in counts else [1.0*len(w),count_syllables(w),[0,1][len(w) > Thres_opt_len],1]+[w.count(alp) for alp in letter_sele] for w in words])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(X_features)
    X_features = scaler.transform(X_features)
    #     X_features = np.array([np.concatenate((row,np.convolve(row,row))) for row in X_features])
    #     scaler = sklearn.preprocessing.StandardScaler(); scaler.fit(X_features); X_features = scaler.transform(X_features)
    return X_features, np.array(labels)
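A hedged usage sketch for preprocess_yezheng, assuming load_file, training_file, and counts come from the same loaders used in the other examples and that letter_sele is defined at module level:

from sklearn.linear_model import LogisticRegression

words, labels = load_file(training_file)            # assumed loaders/variables
X_train, y_train = preprocess_yezheng(words, labels, counts)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)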
Example #4
def test_syllable_count():
    # Cam-ra or Cam-e-ra
    assert syllables.count_syllables("camera") == {2, 3}
    # You'd be pretty crazy to pronounce this as Cam-ra, Cam-er-a but we'll allow it.
    assert syllables.count_syllables("camera, camera") == {4, 5, 6}
    assert syllables.count_syllables("Unicorn?!") == {3}
    assert syllables.count_syllables("Yes, Unicorn.") == {4}
    assert syllables.count_syllables("truffles") == {2}
    assert syllables.count_syllables("No, *you're* crazy!") == {4}
Example #5
def line_of_length(n, lm, thecontext=()):
    """Generate a line of n syllables, using the given language model."""
    for attempt in range(100):
        out = []
        total = 0
        words = lm.generate(n, context=thecontext)
        words = words[len(thecontext):]
        for word in words:
            out.append(word)
            total += syllables.count_syllables(word)
            if total == n:
                return " ".join(out).lower()
            if total > n:
                break
    print("WEIRD FAILURE")
    return random.choice(fives if (n == 5) else sevens)
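Given the 5- and 7-syllable targets and the fives/sevens fallbacks, this generator is presumably used to assemble 5-7-5 haiku; a hedged usage sketch, assuming a language model lm with the generate API shown above:

def haiku(lm):
    # Three independently generated lines in the classic 5-7-5 syllable pattern.
    return "\n".join(line_of_length(n, lm) for n in (5, 7, 5))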
Example #7
def _map_description_to_emoji_and_syllable_count(
    emoji_desc_pairs: Iterable[Tuple[Emoji, str]]
) -> Dict[int, List[Tuple[Emoji, str]]]:
    """Takes a list of [Emoji, description] pairs and maps them to a dict of format:
        [syllable count] --> A list of all [emoji, description] pairs where the description has that
                             syllable count.
    """

    return_dict: Dict[int, List[Tuple[Emoji, str]]] = {}

    for emoji, desc in emoji_desc_pairs:
        syllable_options = count_syllables(desc)
        for syllable_count in syllable_options:
            list_for_syllable_count = return_dict.get(syllable_count, [])
            list_for_syllable_count.append((emoji, desc))
            return_dict[syllable_count] = list_for_syllable_count
    return return_dict
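A hedged usage sketch of the mapping above, with hypothetical (emoji, description) pairs; picking from the 5-syllable bucket, if present, yields a candidate for a haiku's first line:

import random

pairs = [("🦄", "unicorn face"), ("😘", "face blowing a kiss")]   # hypothetical data
by_count = _map_description_to_emoji_and_syllable_count(pairs)

five_syllable = by_count.get(5, [])
if five_syllable:
    emoji, desc = random.choice(five_syllable)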
Example #8
def line_of_length(nsyllables, model, context=[]):
    """Generate a line with nsyllables syllables via recursive search."""

    for i in range(10):
        if not context:
            candidate  = random.sample(model._ngrams, 1)[0][0]
        else:
            candidate = model.choose_random_word(context)

        candidatelen = count_syllables(candidate.lower())

        if candidatelen == nsyllables:
            return [candidate]
        elif candidatelen > nsyllables:
            continue
        else:
            searchfurther = line_of_length(nsyllables - candidatelen,
                                           model,
                                           context + [candidate])
            if searchfurther:
                proposed = [candidate] + searchfurther
                return proposed
    return None
Example #9
def get_features(words, counts, normalize_mean=None, normalize_std=None):
    length_frequency, normalize_mean, normalize_std = get_length_and_frequency(
        words, counts, normalize_mean, normalize_std)
    syllable_count = []
    synonym_count = []
    frequency_ratio = []

    for word in words:
        syllable_count.append([syllables.count_syllables(word)])

        synonym = []
        for syn in wordnet.synsets(word):
            for original in syn.lemmas():
                synonym.append(original.name())
        synonym = set(synonym)

        synonym_count.append([len(synonym)])

    features = np.concatenate(
        (np.array(length_frequency), np.array(syllable_count),
         np.array(synonym_count)),
        axis=1)

    return features, normalize_mean, normalize_std
Example #10
def syllables_feature(words):
    word_syllables_feature = []
    for word in words:
        word_syllables_feature.append(syllables.count_syllables(word))
    return np.array(word_syllables_feature).T
Example #11
def classifier(training_file, development_file, test_file, awl_file, dc_file, counts, train_dev):
    curr_classifier = LogisticRegression()
    full_classifier =  LogisticRegression()
    file = open(training_file, 'rt', encoding="utf-8")
    # return dictionaries
    sen_len = sentence_length(file, False)

    file.close()

    dc_list = load_words(dc_file)
    awl_list = load_words(awl_file)
    top1000_list = load_words(top1000_file)

    #put number of features here
    num_features = 8

    words, labels = load_file(training_file)
    training_dic = dict(zip(words, labels))

    words, labels = load_file(development_file)
    development_dic = dict(zip(words, labels))

    features_matrix = np.zeros((len(training_dic), num_features))
    lab_vec = np.zeros(len(training_dic))
    i = 0
    for word in training_dic.keys():
        lab_vec[i] = training_dic[word]
        # 0 index feature is word length
        features_matrix[i, 0] = len(word)
        # 1 index feature is word count
        count = counts[word]
        if count == 0:
            fixed_word = re.sub(pattern="-", repl="", string = word)
            count = counts[fixed_word]
        features_matrix[i, 1] = count
        # 2 index feature is word syllables
        features_matrix[i, 2] = syllables.count_syllables(word)
        # 3 index feature is wordnet synsets
        features_matrix[i, 3] = wordnet_sens(word)
        # 4 index feature is sentence length
        features_matrix[i, 4] = sen_len[word]
        # 5 index feature is indicator for presence in DC list
        features_matrix[i, 5] = in_list(word, dc_list)
        # 6 index feature is indicator for presence in AWL list
        features_matrix[i, 6] = in_list(word, awl_list)
        # 7 index feature is indicator for presence in top 1000 most common words list
        features_matrix[i, 7] = in_list(word, top1000_list)
        i += 1
    
    mean_list = list()
    std_list = list()
    for i in range(len(features_matrix[1,:])):
        mean_list.append(np.mean(features_matrix[:, i]))
        std_list.append(np.std(features_matrix[:, i]))

    features_matrix_stand = standardize(features_matrix, mean_list, std_list)


    dev_matrix = np.zeros((len(development_dic), num_features))
    dev_vec = np.zeros(len(development_dic))

    file = open(development_file, 'rt', encoding="utf8")
    # return dictionaries
    sen_len = sentence_length(file, False)

    file.close()
    i = 0
    word_vec = list()
    for word in development_dic.keys():
        word_vec.append(word)
        # 0 index feature is word length
        dev_matrix[i, 0] = len(word)
        # 1 index feature is word count
        count = counts[word]
        if count == 0:
            fixed_word = re.sub(pattern="-", repl="", string = word)
            count = counts[fixed_word]
        dev_matrix[i, 1] = count
        dev_vec[i] = development_dic[word]
        # 2 index feature is word syllables
        dev_matrix[i, 2] = syllables.count_syllables(word)
        # 3 index feature is wordnet synsets
        dev_matrix[i, 3] = wordnet_sens(word)
        # 4 index feature is sentence length
        dev_matrix[i, 4] = sen_len[word]
        # 5 index feature is indicator for presence in DC list
        dev_matrix[i, 5] = in_list(word, dc_list)
        # 6 index feature is indicator for presence in AWL list
        dev_matrix[i, 6] = in_list(word, awl_list)
        # 7 index feature is indicator for presence in top 1000 most common words list
        dev_matrix[i, 7] = in_list(word, top1000_list)
        i += 1
    curr_classifier.fit(features_matrix_stand, lab_vec)
    
    dev_matrix_stand = standardize(dev_matrix, mean_list, std_list)
    
    train_predict = curr_classifier.predict(features_matrix_stand)
    dev_predict = curr_classifier.predict(dev_matrix_stand)
    print("Development Classifier Performance Statistics")
    test_predictions(dev_predict, dev_vec)

    print("Training Classifier Performance Statistics")
    test_predictions(train_predict, lab_vec)
    # print(mean_list)
    # print(std_list)

    if(train_dev): 
        full_matrix = np.concatenate((features_matrix, dev_matrix), axis = 0)
        full_pred = np.concatenate((lab_vec, dev_vec))
        mean_list = list()
        std_list = list()
        for i in range(len(full_matrix[1,:])):
            mean_list.append(np.mean(full_matrix[:, i]))
            std_list.append(np.std(full_matrix[:, i]))
        full_matrix = standardize(full_matrix, mean_list, std_list)
        full_classifier.fit(full_matrix, full_pred)
        # print(mean_list)
        # print(std_list)

    test_words = load_test_file(test_file)
    file = open(test_file, 'rt', encoding="utf8")
    # return dictionaries
    sen_len = sentence_length(file, True)

    file.close()        

    test_matrix = np.zeros((len(test_words), num_features))
    i=0
    for word in test_words:
        # 0 index feature is word length
        test_matrix[i, 0] = len(word)
        # 1 index feature is word count
        count = counts[word]
        if count == 0:
            fixed_word = re.sub(pattern="-", repl="", string = word)
            count = counts[fixed_word]
        test_matrix[i, 1] = count
        # 2 index feature is word syllables
        test_matrix[i, 2] = syllables.count_syllables(word)
        # 3 index feature is wordnet synsets
        test_matrix[i, 3] = wordnet_sens(word)
        # 4 index feature is sentence length
        test_matrix[i, 4] = sen_len[word]
        # 5 index feature is indicator for presence in DC list
        test_matrix[i, 5] = in_list(word, dc_list)
        # 6 index feature is indicator for presence in AWL list
        test_matrix[i, 6] = in_list(word, awl_list)
        # 7 index feature is indicator for presence in top 1000 most common words list
        test_matrix[i, 7] = in_list(word, top1000_list)
        i += 1

    test_matrix = standardize(test_matrix, mean_list, std_list)
    # Fall back to the training-only classifier if the combined train+dev model was never fit.
    test_predict = (full_classifier if train_dev else curr_classifier).predict(test_matrix)
    return test_predict
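Example #11 calls a standardize helper that the listing does not show; a minimal sketch of what it presumably does, inferred from how mean_list and std_list are built (an assumption, not the project's code):

import numpy as np

def standardize(matrix, mean_list, std_list):
    # z-score each column using the supplied per-column mean and standard deviation.
    out = np.array(matrix, dtype=float)
    for j in range(out.shape[1]):
        out[:, j] = (out[:, j] - mean_list[j]) / std_list[j]
    return out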
Example #12
if __name__ == "__main__":
    words = sys.stdin.read().split()

    count = 0

    detected_first = False
    detected_second = False
    detected_third = False

    stop_point_first = 0
    stop_point_second = 0
    stop_point_third = 0

    for i, word in enumerate(words):
        syllables = count_syllables(word)
        count += syllables

        if count == 5 and not detected_first:
            count = 0
            detected_first = True
            stop_point_first = i + 1

        if count == 7 and detected_first and not detected_second:
            count = 0
            detected_second = True
            stop_point_second = i + 1

        if count == 5 and detected_first and detected_second and not detected_third:
            count = 0
            detected_third = True
            stop_point_third = i + 1
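    # Hedged sketch of the output step that presumably follows (the listing
    # appears to be cut off here; this is an assumption, not the original code):
    # slice the word list at the recorded stop points to print the 5-7-5 lines.
    if detected_first and detected_second and detected_third:
        print(" ".join(words[:stop_point_first]))
        print(" ".join(words[stop_point_first:stop_point_second]))
        print(" ".join(words[stop_point_second:stop_point_third]))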
Example #13
def random_forrest(training_file, development_file, test_file, counts):
    # load in features and labels for all words & create feature vectors
    twords, Y_t = load_file(training_file)
    tavg_word, tsentence_len, tword_freq = get_sentence_features(training_file)
    X_train = []
    for i in range(len(twords)):
        word = twords[i]
        X_train.append([
            counts[word],
            len(word),
            count_syllables(word), tavg_word[i], tsentence_len[i],
            tword_freq[i]
        ])
    X_train = np.array(X_train, dtype='float32')
    dwords, Y_d = load_file(development_file)
    davg_word, dsentence_len, dword_freq = get_sentence_features(
        development_file)
    X_dev = []
    for i in range(len(dwords)):
        word = dwords[i]
        X_dev.append([
            counts[word],
            len(word),
            count_syllables(word), davg_word[i], dsentence_len[i],
            dword_freq[i]
        ])
    X_dev = np.array(X_dev, dtype='float32')
    rwords = load_test_file(test_file)
    ravg_word, rsentence_len, rword_freq = get_sentence_features(test_file)
    X_test = []
    for i in range(len(rwords)):
        word = rwords[i]
        X_test.append([
            counts[word],
            len(word),
            count_syllables(word), ravg_word[i], rsentence_len[i],
            rword_freq[i]
        ])
    X_test = np.array(X_test, dtype='float32')
    # standardize data
    mean = np.mean(X_train, axis=0)
    sd = np.std(X_train, axis=0)
    X_train = (X_train - mean) / sd
    X_dev = (X_dev - mean) / sd
    X_test = (X_test - mean) / sd
    # build Random Forest Model trained on training file
    clf = RandomForestClassifier()
    clf.fit(X_train, Y_t)
    # evaluate model using training and development files & return metrics
    Y_tpred = clf.predict(X_train).tolist()
    tprecision, trecall, tfscore = evaluate(Y_tpred, Y_t)
    Y_dpred = clf.predict(X_dev).tolist()
    dprecision, drecall, dfscore = evaluate(Y_dpred, Y_d)
    training_performance = [tprecision, trecall, tfscore]
    development_performance = [dprecision, drecall, dfscore]
    # make predictions using model on test set and store in txt file for teacher evaluation
    Y_testpred = clf.predict(X_test).tolist()
    with open("test_labels.txt", "w") as f:
        for label in Y_testpred:
            f.write(str(label) + "\n")
    return training_performance, development_performance
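Example #13 relies on an evaluate helper that is not shown; a minimal sketch of what it presumably computes, given that its return values are used as precision, recall, and F-score (an assumption, not the project's code):

def evaluate(y_pred, y_true):
    # Precision, recall, and F1 for the positive (label == 1) class.
    tp = sum(1 for p, t in zip(y_pred, y_true) if p == 1 and t == 1)
    fp = sum(1 for p, t in zip(y_pred, y_true) if p == 1 and t == 0)
    fn = sum(1 for p, t in zip(y_pred, y_true) if p == 0 and t == 1)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    fscore = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, fscore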
Example #14
def test_throws_when_cant_find_word():
    assert syllables.count_syllables("rex") == {1}
    with pytest.raises(KeyError):
        syllables.count_syllables("gronkasaurus rex")
Example #15
def own_classifier(training_file, development_file, test_file1, extra_file,
                   counts):
    words, y_true1 = load_file(training_file)
    words1, y_true2 = load_file2(extra_file)
    words.extend(words1)
    y_true1.extend(y_true2)
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []

    for i in range(len(words)):
        #print (i)
        #print (words)
        feat1.append(len(words[i]))
        if words[i] not in counts:
            counts[words[i]] = 1
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))

    mean1 = np.mean(feat1)
    mean2 = np.mean(feat2)
    mean3 = np.mean(feat3)
    mean4 = np.mean(feat4)

    std1 = np.std(feat1)
    std2 = np.std(feat2)
    std3 = np.std(feat3)
    std4 = np.std(feat4)

    Xtrain = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                              (feat3 - mean3) / std3, (feat4 - mean4) / std4))

    best_fscore = -1
    dep = -1
    est = -1

    for a in range(1, 12):
        for b in range(500, 501, 1):
            clf = RandomForestClassifier(max_depth=a,
                                         n_estimators=b,
                                         criterion='entropy',
                                         bootstrap=False)
            #			from sklearn.neural_network import MLPClassifier
            #			clf = MLPClassifier(alpha=1e-2, hidden_layer_sizes=(5, 2), random_state=1)

            # from sklearn.svm import SVC
            # clf = SVC(C=1,tol=1e-9,gamma=0.10)

            # from sklearn import tree
            # clf = tree.DecisionTreeClassifier(max_depth=4)

            clf.fit(Xtrain, y_true1)
            y_pred = clf.predict(Xtrain)

            tprecision = get_precision(y_pred, y_true1)
            trecall = get_recall(y_pred, y_true1)
            tfscore = get_fscore(y_pred, y_true1)

            words, y_true = load_file(development_file)
            feat1 = []
            feat2 = []
            feat3 = []
            feat4 = []
            for i in range(len(words)):
                feat1.append(len(words[i]))
                feat2.append(counts[words[i]])
                feat3.append(syllables.count_syllables(words[i]))
                feat4.append(len(wn.synsets(words[i])))

            #print (len(feat1))
            #print (len(feat2))
            #print (len(feat3))
            #print (len(feat4))

            Xtest = np.column_stack(
                ((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                 (feat3 - mean3) / std3, (feat4 - mean4) / std4))
            y_pred = clf.predict(Xtest)

            dprecision = get_precision(y_pred, y_true)
            drecall = get_recall(y_pred, y_true)
            dfscore = get_fscore(y_pred, y_true)

            training_performance = [tprecision, trecall, tfscore]
            development_performance = [dprecision, drecall, dfscore]
            if best_fscore < dfscore:
                best_fscore = dfscore
                dep = a
                est = b

    print(best_fscore)
    print(dep)
    print(est)

    clf = RandomForestClassifier(max_depth=dep,
                                 n_estimators=est,
                                 criterion='entropy',
                                 bootstrap=False)
    #	clf = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
    #	Xtrain=np.vstack((Xtrain,Xtest))
    #	y_true1.append(y_true)
    clf.fit(Xtrain, y_true1)

    words, y_true = load_file1(test_file1)
    feat1 = []
    feat2 = []
    feat3 = []
    feat4 = []
    for i in range(len(words)):
        feat1.append(len(words[i]))
        feat2.append(counts[words[i]])
        feat3.append(syllables.count_syllables(words[i]))
        feat4.append(len(wn.synsets(words[i])))

    #print (len(feat1))
    #print (len(feat2))
    #print (len(feat3))
    #print (len(feat4))

    Xtest = np.column_stack(((feat1 - mean1) / std1, (feat2 - mean2) / std2,
                             (feat3 - mean3) / std3, (feat4 - mean4) / std4))
    y_pred = clf.predict(Xtest)
    #y_pred=[int(x) for x in y_pred]
    with open('test_labels.txt', 'w') as f:
        y_pred = list(map(lambda a: str(a) + '\n', y_pred))
        f.writelines(y_pred)

    return training_performance, development_performance
Example #16
def get_syllables(words):
    return [syllables.count_syllables(word) for word in words]