def kaggle_classify(stemming=True, rem_kws=True, voting_type='hard'):
    """Train the ensemble on the full training set and return the voted
    predictions for the held-out test set (e.g. for a Kaggle-style
    submission), together with the name of the test set used."""
    train_set = 'def_train'
    test_set = 'def_test'

    lines_train = get_lines(serialized=True, set=train_set)
    features_train = get_features(set=train_set,
                                  stemming=stemming,
                                  rem_kws=rem_kws,
                                  lines=lines_train)

    comments_train = get_comments(set=train_set)

    # Alternative textual representation (TF-IDF), kept for reference:
    # tfidf_vector = TfidfVectorizer(tokenizer=tokenizer, lowercase=False, sublinear_tf=True)
    # dt_matrix_train = tfidf_vector.fit_transform(comments_train)

    # Bag-of-words document-term matrix over the tokenized comments.
    count_vector = CountVectorizer(tokenizer=tokenizer, lowercase=False)
    dt_matrix_train = count_vector.fit_transform(comments_train)

    # L1-normalize each term column so counts are comparable across terms.
    dt_matrix_train = normalize(dt_matrix_train, norm='l1', axis=0)

    # Append the hand-crafted features to the textual ones.
    features_train = sparse.hstack((dt_matrix_train, features_train))

    comments_test = get_comments(set=test_set)
    lines_test = get_lines(serialized=True, set=test_set)
    features_test = get_features(set=test_set,
                                 stemming=stemming,
                                 rem_kws=rem_kws,
                                 lines=lines_test)

    # Alternative textual representation (TF-IDF), kept for reference:
    # dt_matrix_test = tfidf_vector.transform(comments_test)

    # Reuse the vocabulary fitted on the training set.
    dt_matrix_test = count_vector.transform(comments_test)

    dt_matrix_test = normalize(dt_matrix_test, norm='l1', axis=0)

    features_test = sparse.hstack((dt_matrix_test, features_test))

    labels = get_labels(set=train_set)
    # Previously evaluated ensemble, kept for reference:
    # classifiers = [Classifier(BaggingClassifier(bootstrap=False, bootstrap_features=False, max_features=500, max_samples=0.5, n_estimators=200, warm_start=True)),
    #                Classifier(LinearSVC_Initializer()),
    #                Classifier(SDGClassifier_Initializer()),
    #                Classifier(GradientBoostingClassifier()),
    #                Classifier(AdaBoostClassifier(n_estimators=1200, learning_rate=0.1, algorithm='SAMME.R'))]
    classifiers = both_c
    results = Probas()
    # Fit every classifier on the full training set and collect its
    # class-probability predictions for the test set.
    for classifier in classifiers:
        classifier.classifier.fit(X=features_train, y=labels)
        result = classifier.classifier.predict_proba(features_test)
        results.add_proba(result, classifier.name)
        print(result)
        print(len(result))

    _, voting_results = compute_voting(voting=results.get_names(),
                                       probas=results,
                                       labels=None,
                                       folder=None,
                                       voting_type=voting_type)
    return voting_results, test_set
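

# Illustrative sketch (not part of the original pipeline): the column-wise
# L1 normalization used above rescales the document-term matrix so that each
# term column sums to 1 across the corpus.
def _demo_l1_column_normalization():
    import numpy as np
    from sklearn.preprocessing import normalize
    counts = np.array([[2.0, 0.0],
                       [2.0, 1.0]])
    # Each *column* now sums to 1: [[0.5, 0.0], [0.5, 1.0]].
    return normalize(counts, norm='l1', axis=0)

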
def plot_length(rough=True):
    """Plot the length distribution of the two comment classes; with
    rough=True length is measured in characters, otherwise in
    whitespace-separated tokens."""
    if rough:
        comments = np.array([len(x) for x in get_comments()])
        out = 'length_distribution.png'
    else:
        comments = np.array([len(x.split()) for x in get_comments()])
        out = 'length_distribution_split.png'

    labels = np.array(get_labels())

    sample_yes = []
    sample_no = []
    for length, label in zip(comments, labels):
        if label == non_information:
            sample_yes.append(length)
        else:
            sample_no.append(length)

    # 'no' (information) in orange, 'yes' (non-information) in blue.
    plt.hist(sample_no, bins='auto', color='orange')
    plt.hist(sample_yes, bins='auto', color='blue')

    plt.xlabel('comment length')
    plt.ylabel('number of comments')
    plt.legend(['no', 'yes'])
    if rough:
        plt.text(
            1000, 80, 'yes avg length= ' +
            str(round(sum(sample_yes) / len(sample_yes), 2)))
        plt.text(
            1000, 90,
            'no avg length=' + str(round(sum(sample_no) / len(sample_no), 2)))
    else:
        plt.text(
            150, 70, 'yes avg length= ' +
            str(round(sum(sample_yes) / len(sample_yes), 2)))
        plt.text(
            150, 80,
            'no avg length=' + str(round(sum(sample_no) / len(sample_no), 2)))
    plt.savefig(img_outpath + out)
    plt.clf()
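

# Minimal sketch (illustrative; not used by plot_length): the 'rough' length
# above is a comment's character count, while the non-rough length is its
# whitespace-token count.
def _demo_comment_lengths():
    comment = "// TODO: fix the parser"
    return len(comment), len(comment.split())  # -> (23, 5)

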
def has_tags_analysis():
    """Count, for each class, how many comments contain at least one known
    tag pattern and write the resulting 2x2 contingency counts to a report
    file."""
    has_tags = 1
    no_tags = 0
    tags = get_tags()
    comments = get_comments()
    labels = np.array(get_labels())

    # 1 if the comment matches at least one tag pattern, else 0.
    tag_comment = []
    for comment in comments:
        if any(re.search(tag, comment) for tag in tags):
            tag_comment.append(has_tags)
        else:
            tag_comment.append(no_tags)

    tags_positive = 0
    tags_negative = 0
    no_tags_positive = 0
    no_tags_negative = 0

    for i in range(len(labels)):
        if labels[i] == non_information and tag_comment[i] == has_tags:
            tags_positive += 1
        elif labels[i] == information and tag_comment[i] == has_tags:
            tags_negative += 1
        elif labels[i] == non_information and tag_comment[i] == no_tags:
            no_tags_positive += 1
        elif labels[i] == information and tag_comment[i] == no_tags:
            no_tags_negative += 1
    with open(reports_outpath + "has_tags" + ".txt", 'w') as f:
        f.write("yes w tags = " + str(tags_positive) + "/" +
                str(tags_positive + no_tags_positive) + "\n")
        f.write("yes wout tags = " + str(no_tags_positive) + "/" +
                str(tags_positive + no_tags_positive) + "\n")
        f.write("no w tags = " + str(tags_negative) + "/" +
                str(tags_negative + no_tags_negative) + "\n")
        f.write("no wout tags = " + str(no_tags_negative) + "/" +
                str(tags_negative + no_tags_negative) + "\n")
    assert tags_positive + tags_negative + no_tags_positive + no_tags_negative == len(
        labels)
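

# Illustrative sketch (the tag patterns below are hypothetical, not the
# project's actual get_tags() output): how has_tags_analysis flags a comment
# as containing a tag.
def _demo_tag_match():
    import re
    tags = [r"@param", r"@return"]
    comment = "@param x the input value"
    return any(re.search(tag, comment) for tag in tags)  # -> True

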
def tags_analysis():
    """For every known tag pattern, count how many comments of each class
    it matches and write the non-zero counts to a report file."""
    labels = np.array(get_labels())
    comments = get_comments()
    tags = get_tags()

    tags_dict = {}
    for tag in tags:
        # Two-slot counter indexed by the integer class labels.
        tags_dict[tag] = [0, 0]
        for i, comment in enumerate(comments):
            if re.search(tag, comment):
                if labels[i] == non_information:
                    tags_dict[tag][non_information] += 1
                else:
                    tags_dict[tag][information] += 1
    with open(reports_outpath + "tags_analysis" + ".txt", 'w') as f:
        for key in tags_dict:
            if tags_dict[key] != [0, 0]:
                f.write(key + ":" + "\n")
                f.write("\tno -> " + str(tags_dict[key][information]) + "\n")
                f.write("\tyes-> " + str(tags_dict[key][non_information]) +
                        "\n")
def classify_split(folder="split_classifier"):
    """Evaluate the TF-IDF, feature-based, and combined classifiers on the
    dedicated train/test split, returning the classifiers selected for
    voting at each stage."""
    # data_split()

    train_set = 'split_train'
    test_set = 'split_test'

    selected_for_voting = []

    # TF-IDF
    comments_train = get_comments(set=train_set)
    tfidf_vector = TfidfVectorizer(tokenizer=tokenizer,
                                   lowercase=False,
                                   sublinear_tf=True)
    dt_matrix_train = tfidf_vector.fit_transform(comments_train)
    # dt_matrix_train = normalize(dt_matrix_train, norm='l1', axis=0)

    comments_test = get_comments(set=test_set)
    dt_matrix_test = tfidf_vector.transform(comments_test)
    # dt_matrix_test = normalize(dt_matrix_test, norm='l1', axis=0)

    stats, voting = tf_idf_classify(set=train_set,
                                    folder=folder + "/tf_idf_classifier/")

    selected_for_voting.append(voting)

    stats, voting = train_n_test(get_tf_idf_classifiers(),
                                 dt_matrix_train,
                                 get_labels(set=train_set),
                                 dt_matrix_test,
                                 get_labels(set=test_set),
                                 folder=folder + "/tf_idf_classifier_tet/")

    selected_for_voting.append(voting)

    # FEATURES
    lines_train = get_lines(serialized=True, set=train_set)

    stats, voting = feat_classify(set=train_set,
                                  folder=folder + "/feat_classifier/",
                                  lines=lines_train)

    selected_for_voting.append(voting)

    features_train = get_features(set=train_set,
                                  scaled=True,
                                  lines=lines_train)
    lines_test = get_lines(serialized=True, set=test_set)
    features_test = get_features(set=test_set, scaled=True, lines=lines_test)

    stats, voting = train_n_test(get_feat_classifiers(),
                                 features_train,
                                 get_labels(set=train_set),
                                 features_test,
                                 get_labels(set=test_set),
                                 folder=folder + "/feat_classifier_tet/")

    selected_for_voting.append(voting)

    # BOTH_CLASSIFIERS
    stats, voting = both_classify(set=train_set,
                                  folder=folder + "/both_classifier/",
                                  lines=lines_train)

    selected_for_voting.append(voting)

    both_train = sparse.hstack((dt_matrix_train, features_train))
    both_test = sparse.hstack((dt_matrix_test, features_test))

    stats, voting = train_n_test(get_feat_classifiers(),
                                 both_train,
                                 get_labels(set=train_set),
                                 both_test,
                                 get_labels(set=test_set),
                                 folder=folder + "/both_classifier_tet/")

    selected_for_voting.append(voting)

    return selected_for_voting
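

# Illustrative sketch (not part of the original pipeline): scipy's
# sparse.hstack accepts a mix of sparse and dense blocks, which is how the
# document-term matrix and the engineered features are combined above.
def _demo_hstack_features():
    import numpy as np
    from scipy import sparse
    dt = sparse.csr_matrix([[1.0, 0.0], [0.0, 2.0]])  # 2 docs x 2 terms
    feats = np.array([[0.3], [0.7]])                  # 2 docs x 1 feature
    return sparse.hstack((dt, feats)).shape           # -> (2, 3)

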
def ablation(set='def_train',
             stemming=True,
             rem_kws=True,
             scaled=True,
             normalized=True,
             lines=None):
    """Leave-one-out feature ablation: run k-fold classification seven
    times, each time withholding one feature group (i = 0..6) to measure
    its contribution."""
    if lines is None:
        lines = get_lines(set=set)

    # NON-TEXTUAL features
    jacc_score = np.array(jaccard(stemming, rem_kws, lines=lines, set=set))
    positions = np.array(get_positions_encoded(lines=lines, set=set))
    rough_length = np.array(get_comment_length(rough=True, set=set))
    length = np.array(get_comment_length(rough=False, set=set))
    types = np.array(get_type_encoded(set=set))
    link_tag = np.array(get_links_tag(set=set))

    # TEXTUAL features
    comments = get_comments(set=set)
    tfidf_vector = TfidfVectorizer(tokenizer=tokenizer,
                                   lowercase=False,
                                   sublinear_tf=True)
    dt_matrix = tfidf_vector.fit_transform(comments)
    if normalized:
        dt_matrix = normalize(dt_matrix, norm='l1', axis=0)

    # Iteration i withholds feature group i (i == 6 drops the textual
    # document-term matrix).
    for i in range(0, 7):
        features = np.array([])
        if i != 0:
            new_feature = rough_length.reshape((rough_length.shape[0], 1))
            features = scale(np.hstack(
                (features, new_feature))) if features.size else new_feature
        if i != 1:
            new_feature = length.reshape((length.shape[0], 1))
            features = scale(np.hstack(
                (features, new_feature))) if features.size else new_feature
        if i != 2:
            features = scale(
                np.hstack(
                    (features, jacc_score.reshape((jacc_score.shape[0], 1)))))
        if i != 3:
            features = scale(
                np.hstack((features, types.reshape((types.shape[0], 1)))))
        if i != 4:
            features = scale(
                np.hstack((features, positions.reshape(
                    (positions.shape[0], 1)))))
        if i != 5:
            features = scale(
                np.hstack((features, link_tag.reshape(
                    (link_tag.shape[0], 1)))))
        if i != 6:
            features = sparse.hstack((dt_matrix, features))

        do_kfold(classifiers=[Classifier(AdaBoostClassifier())],
                 labels=get_labels(set=set),
                 features=features,
                 folder="ablation" + str(i),
                 voting=False)

    return 1
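

# Illustrative sketch (generic; not tied to this project's feature groups):
# the leave-one-feature-out pattern that ablation implements -- iteration i
# keeps every block except block i.
def _demo_leave_one_out(feature_blocks):
    for i in range(len(feature_blocks)):
        kept = [block for j, block in enumerate(feature_blocks) if j != i]
        yield i, kept
    # e.g. list(_demo_leave_one_out(['a', 'b', 'c']))[0] -> (0, ['b', 'c'])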