def __init__(self):
    """Load the extended posts, add the combined text column and the
    needsmoderation label, and initialise the balancing state."""
    posts = loading.load_extended_posts()
    posts = feature_engineering.add_column_text(posts)
    self.df = feature_engineering.add_column_label_needsmoderation(posts)
    # No label selected and no balancing configured until set explicitly;
    # sampling_strategy defaults to fully balanced (1).
    self.current_label = None
    self.balance_method = None
    self.sampling_strategy = 1
def get_augmented_val_X_y(X, y, label):
    '''get a dataset with augmented texts for the minority positive label

    Two augmentation sources are combined:
    1. back-translated texts from ./output/trans_val_<label>.csv
       (silently skipped when the file does not exist)
    2. one positive round-3 post per article, cached in ./cache/df_r3.csv
       after the first (expensive) load

    Arguments:
    X, y - pandas series containing the validation data that needs to be
    augmented
    label - label that needs to be augmented; for unsupported labels the
    original X, y are returned unchanged

    Return: augmented X, y'''
    label_range = [
        'label_sentimentnegative', 'label_inappropriate',
        'label_discriminating', 'label_offtopic', 'label_needsmoderation',
        'label_negative'
    ]
    if label not in label_range:
        print(
            f'Requested augmentation data for label {label} not available. Returned original X,y'
        )
        return X, y
    file_cached = "./cache/df_r3.csv"
    # Back-translated texts are optional; skip silently when absent.
    try:
        df_aug = pd.read_csv(f'./output/trans_val_{label}.csv')
        X = pd.concat((X, df_aug['text']))
        y = pd.concat((y, df_aug[label]))
    except FileNotFoundError:
        pass
    # Round-3 posts: read from the cache when present, otherwise rebuild it.
    # Narrowed from a bare `except:` to the failures a missing/empty cache
    # actually produces, so real errors are no longer swallowed.
    try:
        df_r3 = pd.read_csv(file_cached)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df_r3 = loading.load_extended_posts(label=label)
        df_r3 = feature_engineering.add_column_ann_round(df_r3)
        df_r3 = feature_engineering.add_column_text(df_r3)
        df_r3 = df_r3.query('ann_round==3').copy()
        df_r3.to_csv(file_cached)
    df_r3 = feature_engineering.add_column_label_needsmoderation(df_r3)
    # One positive example per article, deterministic via fixed random_state.
    # Collect samples in a list and concat once (avoids quadratic re-concat).
    samples = [
        df_r3.query(f'id_article=={i} and {label}==1').sample(1,
                                                              random_state=42)
        for i in df_r3.id_article.unique()
    ]
    df_ann = pd.concat(samples) if samples else pd.DataFrame(
        columns=df_r3.columns)
    return pd.concat((X, df_ann['text'])), pd.concat((y, df_ann[label]))
def create_splits():
    """Create the train/val/test split of the labeled posts and save it to csv.

    Round-2 and round-3 annotations are split separately. The csv files are
    written to ./data and contain the id_post.

    Args: None
    Returns: None
    """
    seed = 42
    posts = loading.load_extended_posts()
    posts = feature_engineering.add_column_ann_round(posts)
    # EDA showed that only posts annotated in round 2 represent the
    # population. Round-3 posts are used solely for "possiblyFeedback" and
    # "personalStories" (i.e. NaN in any other label) and only for training.
    round2 = posts.query("ann_round == 2")
    round3_extra = posts.query(
        "ann_round == 3 and label_offtopic != label_offtopic")
    # With only ~1,000 posts we reserve 100 observations each for the test
    # and validation splits, stratified on the least frequent labels.
    strat_cols = [
        'label_discriminating', 'label_possiblyfeedback',
        'label_personalstories', 'label_sentimentpositive'
    ]
    train_part, test_part = train_test_split(round2,
                                             stratify=round2[strat_cols],
                                             random_state=seed,
                                             test_size=100)
    train_part, val_part = train_test_split(train_part,
                                            stratify=train_part[strat_cols],
                                            random_state=seed,
                                            test_size=100)
    df_train = pd.concat([train_part, round3_extra], axis=0)
    print(f"Number of posts in train-set: {df_train.shape[0]}")
    print(f"Number of posts in val-set: {val_part.shape[0]}")
    print(f"Number of posts in test-set: {test_part.shape[0]}")
    df_train.id_post.to_csv('./data/ann2_train.csv', header=False)
    test_part.id_post.to_csv('./data/ann2_test.csv', header=False)
    val_part.id_post.to_csv('./data/ann2_val.csv', header=False)
    print('Splits created.')
# (continuation) tail of a cleaned-display-name list whose opening `[` lies
# above this chunk — presumably `y_col_grouped_clean`, given the zip below;
# verify against the preceding cell.
'Inappropriate', 'Discriminating', 'ArgumentsUsed', 'PersonalStories',
'SentimentPositive', 'PossiblyFeedback',
]

# %%
# Map each raw grouped label name to its cleaned display name.
y_col_dict = {r:c for r,c in zip(y_col_grouped, y_col_grouped_clean)}

# %% [markdown]
# ## Scores Zero Shot

# %%
# Build the evaluation frame: combine headline and body into one
# whitespace-normalised `text` column, then keep only round-2 posts.
data = loading.load_extended_posts()
data = feature_engineering.add_column_ann_round(data)
data.fillna(value={'headline':'', 'body':''}, inplace=True)
data['text'] = data['headline']+" "+data['body']
data['text']=data.text.str.replace('\n',' ').str.replace('\r', ' ')
data_1000 = data.query('ann_round==2').copy()

# %%
# Join the cached zero-shot predictions onto the ground truth by id_post;
# columns present in both frames get _true/_pred suffixes.
data_pred_1000 = pd.read_csv('../output/zero_shot_result_1000.csv', index_col=0)
data_merge_1000 = pd.merge(data_1000, data_pred_1000, how='left', on = 'id_post', suffixes = ('_true', '_pred'))
# Scores at the default threshold and with the per-label best threshold.
scores_1000_05 = scoring.get_score_df(data_merge_1000)
scores_1000_best = scoring.get_score_df(data_merge_1000, best=True)

# %%
# Per-label F1 of the zero-shot model, restricted to the labels in `y_col`
# (defined above this chunk).
scores_zeroshot = scores_1000_05[['label', 'f1_pred']].query('label in @y_col').copy()
# NOTE(review): the hard-coded 9 assumes exactly nine labels survive the
# query above — confirm it matches len(scores_zeroshot).
scores_zeroshot['model'] = pd.Series(['xlm-roberta-large-xnli']*9)
# --- # %% [markdown] # # Analyze comments vs article categories #2 # Issue link: https://github.com/dominikmn/one-million-posts/issues/2 # %% from utils import loading, feature_engineering import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib import seaborn as sns # %% df_posts = loading.load_extended_posts() # %% df_articles = loading.load_articles() # %% [markdown] # ## Data preparation # %% [markdown] # ### Encode post labels # %% cols_label = [c for c in df_posts.columns if c.startswith('label_')] # %% df_posts[cols_label] = df_posts[cols_label].replace({0.0: 'no', 1.0: 'yes'})
    # (continuation) tail of a training function whose definition lies above
    # this chunk; y_val, y_pred_val_best, etc. are presumably produced by a
    # threshold/resampling search upstream — verify against the full file.
    # Validation metrics for the selected best model.
    f1_val = f1_score(y_val, y_pred_val_best)
    precision_val = precision_score(y_val, y_pred_val_best)
    recall_val = recall_score(y_val, y_pred_val_best)
    # Training metrics for the same model.
    f1_train = f1_score(y_train_best, y_pred_train_best)
    precision_train = precision_score(y_train_best, y_pred_train_best)
    recall_train = recall_score(y_train_best, y_pred_train_best)
    log_metrics(f1_val, recall_val, precision_val, f1_train, recall_train,
                precision_train)
    # Log confusion matrices for train and validation predictions.
    scoring.log_cm(y_train_best, y_pred_train_best, y_val, y_pred_val_best)
    # Record the winning sampling proportion, persist the model, end the run.
    params['sampling_strategy'] = round(best_perc, 2)
    sv_model(params, best_model)
    endrun()


if __name__ == '__main__':
    # Load the train/val splits and build a whitespace-normalised `text`
    # column from headline + body (NaN parts become empty strings).
    df_train = loading.load_extended_posts(split='train')
    df_val = loading.load_extended_posts(split='val')
    df_train.fillna(value={'headline': '', 'body': ''}, inplace=True)
    df_train['text'] = df_train['headline'] + " " + df_train['body']
    df_train['text'] = df_train.text.str.replace('\n',
                                                 ' ').str.replace('\r', ' ')
    df_val.fillna(value={'headline': '', 'body': ''}, inplace=True)
    df_val['text'] = df_val['headline'] + " " + df_val['body']
    df_val['text'] = df_val.text.str.replace('\n', ' ').str.replace('\r', ' ')
    # Label columns to process; the main block continues beyond this chunk.
    y_cols = [
        'label_argumentsused', 'label_discriminating', 'label_inappropriate',
        'label_offtopic', 'label_personalstories', 'label_possiblyfeedback',
        'label_sentimentnegative', 'label_sentimentpositive'
    ]
    # (continuation) docstring + body of a function whose `def` line lies
    # above this chunk.
    '''get a complete dataset with translations for augmentation

    Arguments:
    df - a pandas dataframe
    col - the column name of the column containing german texts
    lang_list - a list of languages for translation
    constructed_url - the url to connect to the api
    headers - the headers for the request to the api

    Return: df2 - a complete dataset with the (back-)translated texts to
    augment the positive labels up to 50%'''
    # Translate the frame once per language and stack all results.
    df_temp = pd.DataFrame(columns=df.columns)
    for lang in lang_list:
        df_trans = translate_azure(df, col, lang, constructed_url, headers)
        df_temp = pd.concat((df_temp, df_trans))
    return df_temp


if __name__ == '__main__':
    # Build the whitespace-normalised `text` column from headline + body.
    df = loading.load_extended_posts(split='train')
    df.fillna(value={'headline': '', 'body': ''}, inplace=True)
    df['text'] = df['headline'] + " " + df['body']
    df['text'] = df.text.str.replace('\n', ' ').str.replace('\r', ' ')
    # Azure Translator connection details (credentials defined elsewhere).
    constructed_url, headers = get_construction(subscription_key, endpoint,
                                                location)
    label_list = [
        'label_argumentsused', 'label_discriminating', 'label_inappropriate',
        'label_offtopic', 'label_personalstories', 'label_possiblyfeedback',
        'label_sentimentnegative',
        # NOTE(review): 'label_sentiment_positive' differs from the
        # 'label_sentimentpositive' spelling used in the sibling scripts —
        # likely a typo; verify against the dataframe's column names.
        'label_sentiment_positive'
    ]
    # Augment each label in turn; the except clause of this try and the rest
    # of the loop body lie beyond this chunk.
    try:
        for label in label_list:
            print(f'started {label}')
            mult = get_mult(df, label)
            lang_list = get_lang(mult)