import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

# Project-local modules. Only load_dataframe is confirmed elsewhere in this
# repo; the other module names are assumed from how they are used below.
import load_dataframe as dt
import features as feat        # assumed: provides ExtractFeatures
import config as conf          # assumed: provides seed and images_dir
import test_classifier as tc   # assumed: provides test_classifier
import plot_auc as auc         # assumed: provides plot_auc_cv


def AUC(classifier, *args):
    # Two calling conventions: either a featurized DataFrame plus its feature
    # names, or a list of feature-set names to extract from scratch.
    if type(args[0]).__name__ == "DataFrame":
        df = args[0]
        feature_names = args[1]
    else:
        df = dt.getTokenizedDataFrame()
        df, feature_names = feat.ExtractFeatures(df, *args)
    df = df.rename_axis(None)

    attitudes = ["proactivo", "reactivo", "agresivo", "provoto"]
    f, axarr = plt.subplots(1, 4, figsize=(18, 4), sharey=True)
    for i, attitude in enumerate(attitudes):
        # Oversample labels with fewer than 10 examples so the stratified
        # split and cross-validation below have enough rows per class.
        sorted_train_labels = sorted(set(df.loc[:, attitude]))
        for label in sorted_train_labels:
            t = df[df[attitude] == label]
            if t.shape[0] < 10:
                df = pd.concat([df] + [t] * 10)  # DataFrame.append is deprecated

        print("========================={}===========================".format(
            attitude.upper()))

        # Keep only the columns belonging to the requested feature sets
        # (their names carry a "_<feature>" suffix, e.g. "_bow").
        data = df.loc[:, [
            c for c in df.columns for feature_name in feature_names
            if "_" + feature_name.lower() in c
        ]]
        target = df.loc[:, attitude]

        # Stratified 60/40 train/test split with a fixed seed.
        train_data, test_data, train_labels, test_labels = train_test_split(
            data, target, train_size=0.6, stratify=target,
            random_state=conf.seed)
        (precision, recall, accuracy, f1, classifier_name,
         learning_time, prediction_time) = tc.test_classifier(
             train_data, train_labels, test_data, test_labels, classifier)

        # One ROC/AUC panel per attitude, drawn on the shared figure.
        auc.plot_auc_cv(data, target, classifier,
                        "Attitude '{}'".format(attitude),
                        save=conf.images_dir, plot=axarr[i])

    # Shared y axis: hide tick labels on all but the first panel.
    plt.setp([a.get_yticklabels() for a in axarr[1:]], visible=False)
    f.subplots_adjust(hspace=0.3)
    plt.suptitle('{}+{} for all attributes'.format(
        "+".join([feature_name.upper() for feature_name in feature_names]),
        classifier_name), y=1)
    f.text(0.5, 0.01, 'False Positive Rate', ha='center')
    f.text(0.09, 0.5, 'True Positive Rate', va='center', rotation='vertical')
    plt.savefig(os.path.join(
        conf.images_dir,
        '{}+{} for all attributes.png'.format(
            "+".join([feature_name.upper() for feature_name in feature_names]),
            classifier_name)),
        bbox_inches='tight')
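# --- Usage sketch (added for illustration, not part of the original module).
# It shows both calling conventions AUC() dispatches on; the classifier and
# the "bow" feature-set name are assumptions, not the project's configured
# choices.
if __name__ == "__main__":
    from sklearn.ensemble import RandomForestClassifier

    clf = RandomForestClassifier(n_estimators=100, random_state=0)

    # Variant 1: pass feature-set names and let AUC() load + featurize.
    AUC(clf, "bow")

    # Variant 2: pass a ready-made featurized DataFrame and its feature names.
    # df, feature_names = feat.ExtractFeatures(dt.getTokenizedDataFrame(), "bow")
    # AUC(clf, df, feature_names)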
import __init__
import load_dataframe as data

# Quick check: load the tokenized dataframe and preview the first rows.
df = data.getTokenizedDataFrame()
print(df.head(5))
import itertools

import nltk
import numpy as np
import pandas as pd

from features import save


def ExtractBOW(df):
    # Vocabulary: the 1000 most frequent tokens across all documents.
    words = itertools.chain.from_iterable(df["tokenized_text_noparties"])
    words = nltk.FreqDist(words).most_common(1000)
    words = [k for k, v in words]

    columns = ['id'] + [w + "_bow" for w in words]
    concat_df = df.copy(deep=True)

    # One row per document: its id plus a binary presence flag per word.
    rows = []
    for idx in concat_df.index.values:
        tokens = concat_df.loc[idx]["tokenized_text_noparties"]
        rows.append([concat_df.loc[idx]["id"]] +
                    [1 if w in tokens else 0 for w in words])

    new_df = pd.DataFrame(rows, columns=columns)
    new_df.set_index("id", inplace=True, drop=False)

    # Attach the BOW columns and drop the duplicated "id" column.
    result = pd.concat([concat_df, new_df], axis=1)
    result = result.loc[:, ~result.columns.duplicated()]
    return result


if __name__ == "__main__":
    import load_dataframe

    df = load_dataframe.getTokenizedDataFrame()
    concat_df = ExtractBOW(df)
    save.save_all(concat_df, "bow")
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', 10,
                           'display.max_colwidth', None,  # -1 is deprecated
                           'display.float_format', lambda x: '%.3f' % x):
        print(concat_df[concat_df['no_bow'] > 0]
              [['filename', 'text', 'tokenized_text_noparties', 'no_bow']]
              .head(5))
    print(concat_df.head(5))
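# --- Toy illustration (added; the two documents below are invented, not from
# the project corpus). It demonstrates the encoding ExtractBOW produces: one
# binary "<word>_bow" column per vocabulary word, set to 1 when a document's
# token list contains that word.
def _bow_demo():
    toy = pd.DataFrame({
        "id": [1, 2],
        "tokenized_text_noparties": [["no", "vote"], ["vote", "yes"]],
    }).set_index("id", drop=False)
    encoded = ExtractBOW(toy)
    # Expected: columns vote_bow, no_bow, yes_bow with rows [1,1,0] and [1,0,1].
    print(encoded.filter(like="_bow"))

# _bow_demo()  # uncomment to run the toy example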