def _sample_one_positive_per_article(df_r3, label):
    """Pick one positively labeled row per article, using a fixed seed."""
    positives = df_r3[df_r3[label] == 1]
    picks = []
    for article in df_r3.id_article.unique():
        candidates = positives[positives['id_article'] == article]
        # Articles without any positive row are skipped; calling
        # ``sample(1)`` on an empty frame would raise a ValueError.
        if candidates.empty:
            continue
        picks.append(candidates.sample(1, random_state=42))
    if not picks:
        return df_r3.iloc[0:0]
    # A single concat keeps the column dtypes and avoids re-copying the
    # accumulated frame on every iteration.
    return pd.concat(picks)


def get_augmented_val_X_y(X, y, label):
    """Get a dataset with additional texts for the minority positive label.

    For a supported label two sources are appended to ``X`` / ``y``:

    1. The rows of ``./output/trans_val_{label}.csv`` (columns ``text`` and
       ``label``), if that file exists.
    2. One positively labeled post per article from annotation round 3,
       drawn with a fixed random seed. The round-3 frame is cached in
       ``./cache/df_r3.csv``.

    For an unsupported label a notice is printed and the inputs are returned
    unchanged.

    Arguments:
        X, y - pandas series containing the validation data that needs to be augmented
        label - label that needs to be augmented

    Return:
        augmented X, y
    """
    label_range = [
        'label_sentimentnegative',
        'label_inappropriate',
        'label_discriminating',
        'label_offtopic',
        'label_needsmoderation',
        'label_negative',
    ]
    if label not in label_range:
        print(
            f'Requested augmentation data for label {label} not available. Returned original X,y'
        )
        return X, y

    file_cached = "./cache/df_r3.csv"

    # Optional source: a per-label csv of extra texts. Its absence is not an error.
    try:
        df_aug = pd.read_csv(f'./output/trans_val_{label}.csv')
    except FileNotFoundError:
        pass
    else:
        X = pd.concat((X, df_aug['text']))
        y = pd.concat((y, df_aug[label]))

    # Round-3 posts: use the cache when it is readable, otherwise rebuild it.
    # OSError covers a missing/unreadable file, ValueError covers pandas'
    # empty-file, parser and decoding errors.
    try:
        df_r3 = pd.read_csv(file_cached)
    except (OSError, ValueError):
        # NOTE(review): the cache path does not depend on `label` although the
        # loader is called with it - confirm the loaded rows are label-independent.
        df_r3 = loading.load_extended_posts(label=label)
        df_r3 = feature_engineering.add_column_ann_round(df_r3)
        df_r3 = feature_engineering.add_column_text(df_r3)
        df_r3 = df_r3.query('ann_round==3').copy()
        df_r3.to_csv(file_cached)
    df_r3 = feature_engineering.add_column_label_needsmoderation(df_r3)

    df_ann = _sample_one_positive_per_article(df_r3, label)
    if df_ann.empty:
        # Nothing to add from round 3.
        return X, y
    return pd.concat((X, df_ann['text'])), pd.concat((y, df_ann[label]))
def create_splits():
    """Create the train/validation/test split of the labeled posts and save it as csv.

    Posts of annotation round 2 are divided into a train, a validation and a
    test part. Round-3 posts whose ``label_offtopic`` is missing are added to
    the train part only. For each part the ``id_post`` column is written
    (together with the frame index, without a header row) to a csv in ./data.

    Args:
        None

    Returns:
        None
    """
    seed = 42
    # Due to the small dataset (1,000 posts), 100 observations each are kept
    # for the test and the validation part.
    holdout_size = 100
    # Stratification uses the labels that are least frequent in those 1,000 posts.
    strat_cols = [
        'label_discriminating',
        'label_possiblyfeedback',
        'label_personalstories',
        'label_sentimentpositive',
    ]

    posts = feature_engineering.add_column_ann_round(loading.load_extended_posts())

    # The first round of EDA showed that only posts annotated in round 2
    # represent the population. Round-3 posts are only used for the labels
    # "possiblyFeedback" and "personalStories" (i.e. NaN in any of the other
    # labels) and only in the training set.
    round2 = posts.query("ann_round == 2")
    # NaN is the only value that differs from itself, so the comparison
    # selects the round-3 rows without a label_offtopic value.
    round3_extra = posts.query(
        "ann_round == 3 and label_offtopic != label_offtopic")

    remainder, test_part = train_test_split(
        round2,
        test_size=holdout_size,
        random_state=seed,
        stratify=round2[strat_cols],
    )
    train_part, val_part = train_test_split(
        remainder,
        test_size=holdout_size,
        random_state=seed,
        stratify=remainder[strat_cols],
    )
    train_all = pd.concat([train_part, round3_extra], axis=0)

    print(f"Number of posts in train-set: {train_all.shape[0]}")
    print(f"Number of posts in val-set: {val_part.shape[0]}")
    print(f"Number of posts in test-set: {test_part.shape[0]}")

    train_all["id_post"].to_csv('./data/ann2_train.csv', header=False)
    test_part["id_post"].to_csv('./data/ann2_test.csv', header=False)
    val_part["id_post"].to_csv('./data/ann2_val.csv', header=False)
    print('Splits created.')
# # EDA issue 39: distribution of labels (#39)
# Issue link: https://github.com/dominikmn/one-million-posts/issues/39

# %%
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from utils import loading, feature_engineering

# %%
# Load the annotations and add the annotation-round column.
df_ann = feature_engineering.add_column_ann_round(loading.load_annotations())
df_ann.head()

# %%
# Build an extended annotation frame: every annotation appears twice, once
# under the pseudo round "all" and once under its own round (as a string).
df_ann_ext = df_ann.assign(ann_round="all")
df_ann["ann_round"] = df_ann["ann_round"].astype(str)
df_ann_ext = pd.concat([df_ann_ext, df_ann], axis=0)
df_ann_ext.shape

# %%
# Annotation counts per value, annotation round, and category:
# rows are categories, columns are (value, ann_round) pairs.
df_ann_counts = (
    df_ann_ext
    .groupby(["category", "ann_round"])["value"]
    .value_counts()
    .unstack(level=2)  # annotation value -> columns
    .unstack(level=1)  # annotation round -> second column level
)
'Discriminating', 'ArgumentsUsed', 'PersonalStories', 'SentimentPositive', 'PossiblyFeedback', ] # %% y_col_dict = {r:c for r,c in zip(y_col_grouped, y_col_grouped_clean)} # %% [markdown] # ## Scores Zero Shot # %% data = loading.load_extended_posts() data = feature_engineering.add_column_ann_round(data) data.fillna(value={'headline':'', 'body':''}, inplace=True) data['text'] = data['headline']+" "+data['body'] data['text']=data.text.str.replace('\n',' ').str.replace('\r', ' ') data_1000 = data.query('ann_round==2').copy() # %% data_pred_1000 = pd.read_csv('../output/zero_shot_result_1000.csv', index_col=0) data_merge_1000 = pd.merge(data_1000, data_pred_1000, how='left', on = 'id_post', suffixes = ('_true', '_pred')) scores_1000_05 = scoring.get_score_df(data_merge_1000) scores_1000_best = scoring.get_score_df(data_merge_1000, best=True) # %% scores_zeroshot = scores_1000_05[['label', 'f1_pred']].query('label in @y_col').copy() scores_zeroshot['model'] = pd.Series(['xlm-roberta-large-xnli']*9) scores_zeroshot.rename(columns={'f1_pred':'f1_score'}, inplace=True)