import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Project-local helpers assumed by this snippet: dt (the load_dataframe
# module), feat (feature extraction), tc (classifier evaluation), auc
# (ROC plotting) and conf (configuration exposing `seed` and `images_dir`).


def AUC(classifier, *args):
    """Plot cross-validated ROC curves for each attitude label.

    Accepts either a prepared (DataFrame, feature_names) pair or a list of
    feature names to extract from the tokenized corpus.
    """
    if isinstance(args[0], pd.DataFrame):
        df = args[0]
        feature_names = args[1]
    else:
        df = dt.getTokenizedDataFrame()
        df, feature_names = feat.ExtractFeatures(df, *args)
        df = df.rename_axis(None)
    attitudes = ["proactivo", "reactivo", "agresivo", "provoto"]
    f, axarr = plt.subplots(1, 4, figsize=(18, 4), sharey=True)
    for i, attitude in enumerate(attitudes):
        sorted_train_labels = sorted(set(df.loc[:, attitude]))
        for label in sorted_train_labels:
            t = df[df[attitude] == label]
            if t.shape[0] < 10:
                # Oversample rare labels (fewer than 10 examples);
                # DataFrame.append was removed in pandas 2.x, use pd.concat.
                df = pd.concat([df] + [t] * 10)
        print("========================={}===========================".format(
            attitude.upper()))
        # Select the feature columns matching any requested feature name
        # (e.g. columns ending in "_bow"); any() avoids selecting the same
        # column more than once.
        data = df.loc[:, [
            c for c in df.columns
            if any("_" + feature_name.lower() in c
                   for feature_name in feature_names)
        ]]
        target = df.loc[:, attitude]

        train_data, test_data, train_labels, test_labels = train_test_split(
            data,
            target,
            train_size=0.6,
            stratify=target,
            random_state=conf.seed)
        # Evaluate on the held-out split; only classifier_name is reused
        # below, for the plot titles and the output filename.
        precision, recall, accuracy, f1, classifier_name, learning_time, prediction_time = tc.test_classifier(
            train_data, train_labels, test_data, test_labels, classifier)
        auc.plot_auc_cv(data,
                        target,
                        classifier,
                        "Attitude '{}'".format(attitude),
                        save=conf.images_dir,
                        plot=axarr[i])
    plt.setp([a.get_yticklabels() for a in axarr[1:]], visible=False)
    f.subplots_adjust(hspace=0.3)
    plt.suptitle('{}+{} for all attributes'.format(
        "+".join([feature_name.upper() for feature_name in feature_names]),
        classifier_name),
                 y=1)
    f.text(0.5, 0.01, 'False Positive Rate', ha='center')
    f.text(0.09, 0.5, 'True Positive Rate', va='center', rotation='vertical')
    plt.savefig(os.path.join(
        conf.images_dir, '{}+{} for all attributes.png'.format(
            "+".join([feature_name.upper() for feature_name in feature_names]),
            classifier_name)),
                bbox_inches='tight')
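
A hedged usage sketch (assuming feat.ExtractFeatures accepts "bow" as a
feature name, matching the "_bow" columns produced in Example #3 below; the
classifier choice is purely illustrative):

from sklearn.linear_model import LogisticRegression

# Plot cross-validated ROC curves for the four attitude labels using
# bag-of-words features and an illustrative classifier.
AUC(LogisticRegression(max_iter=1000), "bow")
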
import __init__
import load_dataframe as data
df = data.getTokenizedDataFrame()

print(df.head(5))
Example #3
import itertools
import pandas as pd
import nltk
import numpy as np
from features import save

def ExtractBOW(df):
    # Build the vocabulary: the 1000 most frequent tokens across the corpus.
    words = itertools.chain.from_iterable(df["tokenized_text_noparties"])
    words = nltk.FreqDist(words).most_common(1000)
    words = [k for k, v in words]
    columns = ['id'] + [w + "_bow" for w in words]
    concat_df = df.copy(deep=True)

    # One row per document: a binary indicator for each vocabulary word.
    rows = []
    for idx in concat_df.index.values:
        tokens = concat_df.loc[idx]["tokenized_text_noparties"]
        rows.append([concat_df.loc[idx]["id"]] +
                    [1 if w in tokens else 0 for w in words])
    new_df = pd.DataFrame(rows, columns=columns)
    new_df.set_index("id", inplace=True, drop=False)
    result = pd.concat([concat_df, new_df], axis=1)
    result = result.loc[:, ~result.columns.duplicated()]
    return result

if __name__ == "__main__":
    import load_dataframe

    df = load_dataframe.getTokenizedDataFrame()
    concat_df = ExtractBOW(df)
    save.save_all(concat_df, "bow")
    # Preview documents containing the token "no"; note that
    # 'display.max_colwidth' takes None (not -1) in recent pandas.
    with pd.option_context('display.max_rows', None, 'display.max_columns', 10,
                           'display.max_colwidth', None,
                           'display.float_format', lambda x: '%.3f' % x):
        print(concat_df[concat_df['no_bow'] > 0][[
            'filename', 'text', 'tokenized_text_noparties', 'no_bow'
        ]].head(5))
    print(concat_df.head(5))
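
For intuition, a minimal sketch of the same binary bag-of-words encoding on a
hand-made two-document frame (all names and token lists invented for
illustration):

import itertools

import nltk
import pandas as pd

# Toy corpus: two "documents" as token lists (invented for illustration).
toy = pd.DataFrame({
    "id": [1, 2],
    "tokenized_text_noparties": [["no", "more", "cuts"], ["more", "jobs"]],
})

# Vocabulary = the 3 most frequent tokens, then one 0/1 presence column per
# word, mirroring what ExtractBOW does with the top 1000 tokens.
vocab = [w for w, _ in nltk.FreqDist(
    itertools.chain.from_iterable(toy["tokenized_text_noparties"])
).most_common(3)]
for w in vocab:
    toy[w + "_bow"] = toy["tokenized_text_noparties"].apply(lambda t: int(w in t))

print(toy[[c for c in toy.columns if c.endswith("_bow")]])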