def kaggle_classify(stemming=True, rem_kws=True, voting_type='hard'):
    """Train the combined (bag-of-words + hand-crafted features) ensemble on
    the full training set and return the voting predictions for the test set."""
    train_set = 'def_train'
    test_set = 'def_test'

    # Hand-crafted (non-textual) features for the training set.
    lines_train = get_lines(serialized=True, set=train_set)
    features_train = get_features(set=train_set, stemming=stemming,
                                  rem_kws=rem_kws, lines=lines_train)

    # Textual features: raw term counts, column-normalized.
    # (A TF-IDF variant was tried as well; kept here for reference.)
    comments_train = get_comments(set=train_set)
    # tfidf_vector = TfidfVectorizer(tokenizer=tokenizer, lowercase=False,
    #                                sublinear_tf=True)
    # dt_matrix_train = tfidf_vector.fit_transform(comments_train)
    count_vector = CountVectorizer(tokenizer=tokenizer, lowercase=False)
    dt_matrix_train = count_vector.fit_transform(comments_train)
    dt_matrix_train = normalize(dt_matrix_train, norm='l1', axis=0)
    features_train = sparse.hstack((dt_matrix_train, features_train))

    # Same pipeline on the test set, reusing the fitted vectorizer.
    comments_test = get_comments(set=test_set)
    lines_test = get_lines(serialized=True, set=test_set)
    features_test = get_features(set=test_set, stemming=stemming,
                                 rem_kws=rem_kws, lines=lines_test)
    # dt_matrix_test = tfidf_vector.transform(comments_test)
    dt_matrix_test = count_vector.transform(comments_test)
    dt_matrix_test = normalize(dt_matrix_test, norm='l1', axis=0)
    features_test = sparse.hstack((dt_matrix_test, features_test))

    labels = get_labels(set=train_set)

    # Earlier hand-picked ensemble, kept for reference:
    # classifiers = [Classifier(BaggingClassifier(bootstrap=False,
    #                    bootstrap_features=False, max_features=500,
    #                    max_samples=0.5, n_estimators=200, warm_start=True)),
    #                Classifier(LinearSVC_Initializer()),
    #                Classifier(SDGClassifier_Initializer()),
    #                Classifier(GradientBoostingClassifier()),
    #                Classifier(AdaBoostClassifier(n_estimators=1200,
    #                    learning_rate=0.1, algorithm='SAMME.R'))]
    classifiers = both_c

    results = Probas()
    for classifier in classifiers:
        classifier.classifier.fit(X=features_train, y=labels)
        result = classifier.classifier.predict_proba(features_test)
        results.add_proba(result, classifier.name)
        print(result)       # debug output
        print(len(result))
    _, voting_results = compute_voting(voting=results.get_names(),
                                       probas=results, labels=None,
                                       folder=None, voting_type=voting_type)
    return voting_results, test_set

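# Usage sketch for kaggle_classify. How the returned predictions map to a
# submission file is an assumption here: the column names and id scheme are
# hypothetical, not taken from this project.
#
#   predictions, test_set = kaggle_classify(voting_type='soft')
#   with open('submission.csv', 'w') as f:
#       f.write('id,label\n')
#       for idx, label in enumerate(predictions):
#           f.write('%d,%s\n' % (idx, label))
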
def plot_length(rough=True):
    """Plot the per-class distribution of comment lengths.

    With rough=True length is measured in characters, otherwise in words."""
    if rough:
        comments = np.array([len(x) for x in get_comments()])
        out = 'length_distribution.png'
    else:
        comments = np.array([len(x.split()) for x in get_comments()])
        out = 'length_distribution_split.png'
    labels = np.array(get_labels())

    # Split lengths by class ('yes' = non-information, 'no' = information).
    sample_yes = []
    sample_no = []
    for x in range(len(comments)):
        if labels[x] == non_information:
            sample_yes.append(comments[x])
        else:
            sample_no.append(comments[x])

    plt.hist(sample_no, bins='auto', color='orange')
    plt.hist(sample_yes, bins='auto', color='blue')
    plt.xlabel('comment length')
    plt.ylabel('number of comments')
    plt.legend(['no', 'yes'])

    # Annotate the class averages; the text coordinates differ because the
    # two measures live on very different scales.
    yes_avg = round(sum(sample_yes) / len(sample_yes), 2)
    no_avg = round(sum(sample_no) / len(sample_no), 2)
    if rough:
        plt.text(1000, 80, 'yes avg length = ' + str(yes_avg))
        plt.text(1000, 90, 'no avg length = ' + str(no_avg))
    else:
        plt.text(150, 70, 'yes avg length = ' + str(yes_avg))
        plt.text(150, 80, 'no avg length = ' + str(no_avg))
    plt.savefig(img_outpath + out)
    plt.clf()

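# Usage sketch: one call per measure; the filename follows the `rough` flag.
#
#   plot_length(rough=True)   # character counts -> length_distribution.png
#   plot_length(rough=False)  # word counts -> length_distribution_split.png
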
def has_tags_analysis():
    """Count how many comments of each class contain at least one tag and
    write the summary to <reports_outpath>has_tags.txt."""
    has_tags = 1
    no_tags = 0
    tags = get_tags()
    comments = get_comments()
    labels = np.array(get_labels())

    # Flag each comment that matches at least one tag pattern.
    tag_comment = []
    for comment in comments:
        match = False
        for tag in tags:
            if re.search(tag, comment):
                match = True
                tag_comment.append(has_tags)
                break
        if not match:
            tag_comment.append(no_tags)

    # Cross-tabulate tag presence against the class label.
    tags_positive = 0
    tags_negative = 0
    no_tags_positive = 0
    no_tags_negative = 0
    for i in range(len(labels)):
        if labels[i] == non_information and tag_comment[i] == has_tags:
            tags_positive += 1
        elif labels[i] == information and tag_comment[i] == has_tags:
            tags_negative += 1
        elif labels[i] == non_information and tag_comment[i] == no_tags:
            no_tags_positive += 1
        elif labels[i] == information and tag_comment[i] == no_tags:
            no_tags_negative += 1

    with open(reports_outpath + "has_tags" + ".txt", 'w') as f:
        f.write("yes w tags = " + str(tags_positive) + "/" +
                str(tags_positive + no_tags_positive) + "\n")
        f.write("yes wout tags = " + str(no_tags_positive) + "/" +
                str(tags_positive + no_tags_positive) + "\n")
        f.write("no w tags = " + str(tags_negative) + "/" +
                str(tags_negative + no_tags_negative) + "\n")
        f.write("no wout tags = " + str(no_tags_negative) + "/" +
                str(tags_negative + no_tags_negative) + "\n")
    # Every comment must fall in exactly one of the four buckets.
    assert (tags_positive + tags_negative +
            no_tags_positive + no_tags_negative) == len(labels)

def tags_analysis():
    """Count, for every tag, how many matching comments fall in each class
    and write the non-empty entries to <reports_outpath>tags_analysis.txt."""
    labels = np.array(get_labels())
    comments = get_comments()
    tags = get_tags()
    tags_dict = {}
    # For each tag, scan all comments and tally matches per class.
    for tag in tags:
        tags_dict[tag] = [0, 0]
        for i, comment in enumerate(comments):
            if re.search(tag, comment):
                if labels[i] == non_information:
                    tags_dict[tag][non_information] += 1
                else:
                    tags_dict[tag][information] += 1

    with open(reports_outpath + "tags_analysis" + ".txt", 'w') as f:
        for key in tags_dict:
            if tags_dict[key] != [0, 0]:  # skip tags that never match
                f.write(key + ":" + "\n")
                f.write("\tno -> " + str(tags_dict[key][information]) + "\n")
                f.write("\tyes-> " + str(tags_dict[key][non_information]) + "\n")

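# Usage sketch for the two tag reports above. Both operate on the default
# set and write plain-text summaries under `reports_outpath`:
#
#   has_tags_analysis()  # -> has_tags.txt: tagged vs. untagged per class
#   tags_analysis()      # -> tags_analysis.txt: per-tag yes/no match counts
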
def classify_split(folder="split_classifier"):
    # data_split()
    train_set = 'split_train'
    test_set = 'split_test'
    selected_for_voting = []

    # TF-IDF
    comments_train = get_comments(set=train_set)
    tfidf_vector = TfidfVectorizer(tokenizer=tokenizer, lowercase=False,
                                   sublinear_tf=True)
    dt_matrix_train = tfidf_vector.fit_transform(comments_train)
    # dt_matrix_train = normalize(dt_matrix_train, norm='l1', axis=0)
    comments_test = get_comments(set=test_set)
    dt_matrix_test = tfidf_vector.transform(comments_test)
    # dt_matrix_test = normalize(dt_matrix_test, norm='l1', axis=0)
    stats, voting = tf_idf_classify(set=train_set,
                                    folder=folder + "/tf_idf_classifier/")
    selected_for_voting.append(voting)
    stats, voting = train_n_test(get_tf_idf_classifiers(), dt_matrix_train,
                                 get_labels(set=train_set), dt_matrix_test,
                                 get_labels(set=test_set),
                                 folder=folder + "/tf_idf_classifier_tet/")
    selected_for_voting.append(voting)

    # FEATURES
    lines_train = get_lines(serialized=True, set=train_set)
    stats, voting = feat_classify(set=train_set,
                                  folder=folder + "/feat_classifier/",
                                  lines=lines_train)
    selected_for_voting.append(voting)
    features_train = get_features(set=train_set, scaled=True, lines=lines_train)
    lines_test = get_lines(serialized=True, set=test_set)
    features_test = get_features(set=test_set, scaled=True, lines=lines_test)
    stats, voting = train_n_test(get_feat_classifiers(), features_train,
                                 get_labels(set=train_set), features_test,
                                 get_labels(set=test_set),
                                 folder=folder + "/feat_classifier_tet/")
    selected_for_voting.append(voting)

    # BOTH_CLASSIFIERS
    stats, voting = both_classify(set=train_set,
                                  folder=folder + "/both_classifier/",
                                  lines=lines_train)
    selected_for_voting.append(voting)
    both_train = sparse.hstack((dt_matrix_train, features_train))
    both_test = sparse.hstack((dt_matrix_test, features_test))
    stats, voting = train_n_test(get_feat_classifiers(), both_train,
                                 get_labels(set=train_set), both_test,
                                 get_labels(set=test_set),
                                 folder=folder + "/both_classifier_tet/")
    selected_for_voting.append(voting)
    return selected_for_voting

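# Usage sketch: the returned list holds one voting selection per run above
# (TF-IDF, features, and combined; each evaluated in-set and on the held-out
# split):
#
#   selected = classify_split(folder="split_classifier")
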
def ablation(set='def_train', stemming=True, rem_kws=True, scaled=True,
             normalized=True, lines=None):
    """Leave-one-out feature ablation: iteration i rebuilds the feature
    matrix without feature i and evaluates it with k-fold AdaBoost.

    `scaled` is accepted for interface parity with the other helpers but is
    unused here."""
    if lines is None:
        lines = get_lines(set=set)

    # NON-TEXTUAL
    jacc_score = np.array(jaccard(stemming, rem_kws, lines=lines, set=set))
    positions = np.array(get_positions_encoded(lines=lines, set=set))
    rough_length = np.array(get_comment_length(rough=True, set=set))
    length = np.array(get_comment_length(rough=False, set=set))
    types = np.array(get_type_encoded(set=set))
    link_tag = np.array(get_links_tag(set=set))

    # TEXTUAL
    comments = get_comments(set=set)
    tfidf_vector = TfidfVectorizer(tokenizer=tokenizer, lowercase=False,
                                   sublinear_tf=True)
    dt_matrix = tfidf_vector.fit_transform(comments)
    if normalized:
        dt_matrix = normalize(dt_matrix, norm='l1', axis=0)

    # Feature i is left out in iteration i:
    # 0 rough length, 1 word-count length, 2 Jaccard, 3 type, 4 position,
    # 5 link tag, 6 TF-IDF matrix.
    for i in range(0, 7):
        features = np.array([])
        if i != 0:
            new_feature = rough_length.reshape((rough_length.shape[0], 1))
            features = scale(np.hstack(
                (features, new_feature))) if features.size else new_feature
        if i != 1:
            new_feature = length.reshape((length.shape[0], 1))
            features = scale(np.hstack(
                (features, new_feature))) if features.size else new_feature
        if i != 2:
            features = scale(np.hstack(
                (features, jacc_score.reshape((jacc_score.shape[0], 1)))))
        if i != 3:
            features = scale(np.hstack(
                (features, types.reshape((types.shape[0], 1)))))
        if i != 4:
            features = scale(np.hstack(
                (features, positions.reshape((positions.shape[0], 1)))))
        if i != 5:
            features = scale(np.hstack(
                (features, link_tag.reshape((link_tag.shape[0], 1)))))
        if i != 6:
            features = sparse.hstack((dt_matrix, features))
        do_kfold(classifiers=[Classifier(AdaBoostClassifier())],
                 labels=get_labels(set=set),
                 features=features,
                 folder="ablation" + str(i),
                 voting=False)
    return 1

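# Usage sketch: runs seven k-fold evaluations; results for the run without
# feature i land under "ablation<i>" (see the index mapping in the loop
# above):
#
#   ablation(set='def_train', stemming=True, rem_kws=True)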