Exemple #1
0
def test_labels():
    """Check that the label encoder maps "ham" and "spam" to distinct indices.

    The encoder is built from the labels of the SMSSpam training split, and
    encoding a small batch must agree with the per-label ``index`` lookups.
    """
    # keep only the label half of each (label, text) training pair
    training_pairs = classify.read_smsspam("smsspam/SMSSpamCollection.train")
    training_labels = [pair_label for pair_label, _ in training_pairs]

    # build the encoder from those labels
    to_labels = classify.TextToLabels(training_labels)

    # the two classes must not share an index
    ham_index = to_labels.index("ham")
    spam_index = to_labels.index("spam")
    assert ham_index != spam_index

    # a batch encoding must match the individual lookups, in order
    encoded = to_labels(["ham", "spam", "spam"])
    expected = [ham_index, spam_index, spam_index]
    assert np.all(encoded == expected)
Exemple #2
0
def test_read_smsspam():
    """Check the examples that ``read_smsspam`` yields for ``AGBIGnp.out``.

    Every example must be a two-item (label, text) sequence whose label is
    "Positive" or "Negative", and there must be exactly 1553 of them.
    """
    # counted by hand rather than with enumerate, so the total is still
    # defined (as zero) when the reader yields nothing
    n_examples = 0
    for record in classify.read_smsspam("AGBIGnp.out"):
        # each record is a pair
        assert len(record) == 2
        label, _text = record

        # only the two sentiment labels are allowed
        assert label in {"Positive", "Negative"}
        n_examples += 1

    assert n_examples == 1553
Exemple #3
0
def test_read_smsspam():
    """Check the examples read from the SMSSpam training split.

    Every example must be a two-item (label, text) sequence whose label is
    "ham" or "spam", and the split must contain exactly 3345 examples.
    """
    allowed_labels = {"ham", "spam"}

    # a manual tally (not enumerate) keeps the total defined even when the
    # reader produces no examples at all
    seen = 0
    for item in classify.read_smsspam("smsspam/SMSSpamCollection.train"):
        # shape check: exactly a label and a text
        assert len(item) == 2
        label, _text = item

        # label check: one of the two known classes
        assert label in allowed_labels
        seen += 1

    assert seen == 3345
Exemple #4
0
def test_labels():
    """Check that the label encoder maps "no" and "yes" to distinct indices.

    The encoder is built from the labels in ``AGBIG_annotation.outt``, and
    encoding a small batch must agree with the per-label ``index`` lookups.
    """
    # pull the label out of every (label, text) pair
    annotated = classify.read_smsspam("AGBIG_annotation.outt")
    annotation_labels = [lbl for lbl, _ in annotated]

    # build the encoder from those labels
    to_labels = classify.TextToLabels(annotation_labels)

    # the two classes must not collide
    nc_index = to_labels.index("no")
    c_index = to_labels.index("yes")
    assert nc_index != c_index

    # a batch encoding must match the individual lookups, in order
    encoded = to_labels(["no", "yes", "yes"])
    expected = [nc_index, c_index, c_index]
    assert np.all(encoded == expected)
Exemple #5
0
def test_prediction(capsys, min_f1=0.89, min_accuracy=0.97):
    """Train on the SMSSpam training split and score on the development split.

    Args:
        capsys: pytest capture fixture; when it is not None the scores are
            printed with output capturing disabled.
        min_f1: the F1 for the "spam" label must be strictly above this.
        min_accuracy: the accuracy must be strictly above this.
    """
    # load both splits as parallel (labels, texts) tuples
    train_labels, train_texts = zip(
        *classify.read_smsspam("smsspam/SMSSpamCollection.train"))
    devel_labels, devel_texts = zip(
        *classify.read_smsspam("smsspam/SMSSpamCollection.devel"))

    # both encoders are built from the training split only
    to_features = classify.TextToFeatures(train_texts)
    to_labels = classify.TextToLabels(train_labels)

    # fit the classifier on the encoded training data
    classifier = classify.Classifier()
    train_matrix = to_features(train_texts)
    train_targets = to_labels(train_labels)
    classifier.train(train_matrix, train_targets)

    # predict on the development texts; the predictions must be unchanged
    # by a cast to bool, i.e. contain only 0/1 values
    predicted_indices = classifier.predict(to_features(devel_texts))
    as_bool = predicted_indices.astype(bool)
    assert np.array_equal(predicted_indices, as_bool)

    # score against the encoded gold labels, with "spam" as positive class
    devel_indices = to_labels(devel_labels)
    spam_label = to_labels.index("spam")
    f1 = f1_score(devel_indices, predicted_indices, pos_label=spam_label)
    accuracy = accuracy_score(devel_indices, predicted_indices)

    # report the scores outside of pytest's output capture
    if capsys is not None:
        with capsys.disabled():
            print(
                "\n{:.1%} F1 and {:.1%} accuracy on SMSSpam development data"
                .format(f1, accuracy))

    # both scores have to clear their thresholds
    assert f1 > min_f1
    assert accuracy > min_accuracy
Exemple #6
0
def test_prediction(capsys, min_f1=0.89, min_accuracy=0.97):
    """Cross-validate a bag-of-words AdaBoost pipeline on ``a_lil_morenp.out``.

    The data is split into two folds with ``KFold``; the pipeline is fitted
    on one fold and scored on the other, and the mean score is reported.
    No performance threshold is asserted.

    Args:
        capsys: pytest capture fixture; when it is not None the mean score is
            printed with output capturing disabled.
        min_f1: not used by this test; kept for signature compatibility.
        min_accuracy: not used by this test; kept for signature
            compatibility.
    """
    full_examples = classify.read_smsspam("a_lil_morenp.out")
    full_labels, full_texts = zip(*full_examples)

    # convert once so that each fold can be selected by index array
    texts = np.array(full_texts)
    labels = np.array(full_labels)

    # NOTE(review): max_df=1 is an int, which CountVectorizer treats as an
    # absolute document count (terms found in more than one document are
    # dropped); the float 1.0 would keep every term -- confirm the intent.
    pipeline = Pipeline([('vectorizer',
                          CountVectorizer(binary=False,
                                          ngram_range=(1, 1),
                                          max_df=1)),
                         ('classifier', AdaBoostClassifier())])

    k_fold = KFold(n_splits=2)

    scores = []
    for train_indices, test_indices in k_fold.split(texts):
        test_text = texts[test_indices]

        pipeline.fit(texts[train_indices], labels[train_indices])
        scores.append(pipeline.score(test_text, labels[test_indices]))
        # overwritten on every iteration: after the loop this holds the
        # predictions of the last fold only
        p = pipeline.predict(test_text)

    # how many test items of the last fold were predicted "Positive"
    n_positive = sum(1 for prediction in p if prediction == "Positive")
    print("Positive ", n_positive, " total ", len(p))

    score = sum(scores) / len(scores)

    # report the mean fold score outside of pytest's output capture; the
    # template is formatted on its own, because concatenating the list of
    # predictions onto it raises TypeError
    if capsys is not None:
        with capsys.disabled():
            msg = "\n{:.1%} score on MTURK development data"
            print(msg.format(score))
Exemple #7
0
def test_labels():
    """Check that the encoder maps "Positive" and "Negative" to distinct indices.

    The encoder is built from the labels in ``AGBIGnp.out``. Only the
    Positive/Negative pair is checked here.
    """
    # pull the label out of every (label, text) pair
    records = classify.read_smsspam("AGBIGnp.out")
    record_labels = [lbl for lbl, _ in records]

    # build the encoder from those labels
    to_labels = classify.TextToLabels(record_labels)

    # the two classes must not collide
    pt_index = to_labels.index("Positive")
    nt_index = to_labels.index("Negative")
    assert nt_index != pt_index

    # a batch encoding must match the individual lookups, in order
    encoded = to_labels(["Positive", "Negative"])
    expected = [pt_index, nt_index]
    assert np.all(encoded == expected)
Exemple #8
0
def test_features():
    """Check the feature matrix produced for two made-up sentences.

    The feature extractor is built from the texts in ``AGBIGnp.out``. The
    result must be a two-dimensional matrix with one row per sentence, and
    the selected unigram features must be nonzero exactly in the sentence
    that contains them.
    """
    # get the texts from the training data
    examples = classify.read_smsspam("AGBIGnp.out")
    texts = [text for _, text in examples]

    # create the feature extractor from the training texts
    to_features = classify.TextToFeatures(texts)

    # extract features for some made-up sentences
    features = to_features(["illegals should leave", "Build the wall"])

    # make sure there is one row of features for each sentence
    assert len(features.shape) == 2
    n_rows, _ = features.shape
    assert n_rows == 2

    # the two unigrams must be distinct features
    indices = [to_features.index(f) for f in ["illegals", "wall"]]
    assert len(set(indices)) > 1

    # "illegals" occurs only in the first sentence and "wall" only in the
    # second, so the selected columns hold exactly one nonzero value per
    # row: (row 0, "illegals") and (row 1, "wall"). Requiring every hit to
    # be in row 0 could never hold together with the two-hit requirement.
    row_indices, col_indices = features[:, indices].nonzero()
    assert len(col_indices) == 2
    hits = sorted(zip(row_indices.tolist(), col_indices.tolist()))
    assert hits == [(0, 0), (1, 1)]
Exemple #9
0
def test_features():
    """Check the feature matrix produced for two made-up sentences.

    The feature extractor is built from the texts in
    ``AGBIG_annotation.outt``. The result must be a two-dimensional matrix
    with one row per sentence, and a unigram and a bigram taken from the
    first sentence must be nonzero in that row only.
    """
    # keep only the text half of each (label, text) pair
    annotated = classify.read_smsspam("AGBIG_annotation.outt")
    corpus = [body for _, body in annotated]

    # build the extractor from the training texts
    to_features = classify.TextToFeatures(corpus)

    # featurize two hand-written sentences
    sentences = [
        "There are some things that I need to send to you.",
        "Hello!",
    ]
    features = to_features(sentences)

    # expect a 2-D matrix with one row per sentence
    assert len(features.shape) == 2
    n_rows, _ = features.shape
    assert n_rows == 2

    # look up one unigram and one bigram; they must be different columns
    probes = ["need", "to you"]
    indices = [to_features.index(probe) for probe in probes]
    assert len(set(indices)) > 1

    # both probes come from the first sentence, so every nonzero entry in
    # the selected columns is in row 0, and there are two of them
    row_indices, col_indices = features[:, indices].nonzero()
    assert np.all(row_indices == 0)
    assert len(col_indices) == 2
Exemple #10
0
def test_prediction(capsys, min_f1=0.89, min_accuracy=0.97):
    """Evaluate classifiers on the MTURK annotation data in two ways.

    First, a bag-of-words AdaBoost pipeline is cross-validated with two
    folds on ``a_lil_more.out`` and its mean score is reported. Second, the
    project classifier is trained on ``AGBIG_annotation.outt`` and scored
    (F1 for the "yes" label, and accuracy) on ``AGBIG_annotation.outd``.
    The scores are only reported; no performance threshold is asserted.

    Args:
        capsys: pytest capture fixture; when it is not None the scores are
            printed with output capturing disabled.
        min_f1: not used by this test; kept for signature compatibility.
        min_accuracy: not used by this test; kept for signature
            compatibility.
    """
    # ---- k-fold cross-validation ----
    full_examples = classify.read_smsspam("a_lil_more.out")
    full_labels, full_texts = zip(*full_examples)

    # convert once so that each fold can be selected by index array
    texts = np.array(full_texts)
    labels = np.array(full_labels)

    # NOTE(review): max_df=1 is an int, which CountVectorizer treats as an
    # absolute document count (terms found in more than one document are
    # dropped); the float 1.0 would keep every term -- confirm the intent.
    pipeline = Pipeline([('vectorizer',
                          CountVectorizer(binary=False,
                                          ngram_range=(1, 1),
                                          max_df=1)),
                         ('classifier', AdaBoostClassifier())])

    k_fold = KFold(n_splits=2)

    scores = []
    for train_indices, test_indices in k_fold.split(texts):
        test_text = texts[test_indices]

        pipeline.fit(texts[train_indices], labels[train_indices])
        scores.append(pipeline.score(test_text, labels[test_indices]))
        # overwritten on every iteration: after the loop these hold the
        # predictions and probabilities of the last fold only
        p = pipeline.predict(test_text)
        p2 = pipeline.predict_proba(test_text)

    # summarize the "yes" predictions of the last fold; the probability row
    # of the first such prediction is shown, or None when nothing was
    # predicted "yes" (indexing an empty list would raise IndexError)
    yes_probas = [proba for label, proba in zip(p, p2) if label == "yes"]
    first_yes_proba = yes_probas[0] if yes_probas else None
    print("yes ", len(yes_probas), " total ", len(p),
          " proba ", first_yes_proba)

    score = sum(scores) / len(scores)

    # report the mean fold score outside of pytest's output capture; the
    # template is formatted on its own, because concatenating the
    # prediction array onto it makes the following format call fail
    if capsys is not None:
        with capsys.disabled():
            msg = "\n{:.1%} score on MTURK development data"
            print(msg.format(score))

    # ---- normal train/development validation ----
    # get texts and labels from the training data
    train_examples = classify.read_smsspam("AGBIG_annotation.outt")
    train_labels, train_texts = zip(*train_examples)

    # get texts and labels from the development data
    devel_examples = classify.read_smsspam("AGBIG_annotation.outd")
    devel_labels, devel_texts = zip(*devel_examples)

    # create the feature extractor and label encoder from the training data
    to_features = classify.TextToFeatures(train_texts)
    to_labels = classify.TextToLabels(train_labels)

    # train the classifier on the training data
    classifier = classify.Classifier()
    classifier.train(to_features(train_texts), to_labels(train_labels))

    # predict on the development data; the predictions must be unchanged by
    # a cast to bool, i.e. contain only 0/1 values
    predicted_indices = classifier.predict(to_features(devel_texts))
    assert np.array_equal(predicted_indices, predicted_indices.astype(bool))

    # measure performance of predictions, with "yes" as the positive class
    devel_indices = to_labels(devel_labels)
    spam_label = to_labels.index("yes")
    f1 = f1_score(devel_indices, predicted_indices, pos_label=spam_label)
    accuracy = accuracy_score(devel_indices, predicted_indices)

    # print out performance
    if capsys is not None:
        with capsys.disabled():
            msg = "\n{:.1%} F1 and {:.1%} accuracy on MTURK development data"
            print(msg.format(f1, accuracy))