Esempio n. 1
0
from set_seed import random_ctl
seed = random_ctl(460304)  #best seed from 20 seed search without mixup

from fastai.text import *
from fastai.callbacks import SaveModelCallback
from fastai.layers import LabelSmoothingCrossEntropy

import sentencepiece as spm  #https://github.com/google/sentencepiece
import fire

from sp_tok import *
from nlp_mixup import *
from sklearn.model_selection import KFold


def split_data_by_idx(all_texts_df: DataFrame, train_idx, valid_idx):
    """Split *all_texts_df* into train/valid frames by positional row index.

    Parameters
    ----------
    all_texts_df : DataFrame
        Full dataset to partition.
    train_idx, valid_idx : sequence of int
        Positional indices (as produced by e.g. sklearn's KFold) selecting
        the training and validation rows respectively.

    Returns
    -------
    (DataFrame, DataFrame)
        The training slice and the validation slice, in that order.
    """
    train_frame = all_texts_df.iloc[train_idx]
    valid_frame = all_texts_df.iloc[valid_idx]
    return train_frame, valid_frame


def fit_regr(model_path: str,
             sp_model: str,
             wd: float = 0.,
             mixup: bool = True,
             qrnn: bool = True,
             n_hid: int = 2304,
             load_enc: str = None,
             split_seed: int = None):
    PATH = Path(model_path)
Esempio n. 2
0
from set_seed import random_ctl
seed = random_ctl()

from fastai.text import *
from fastai.callbacks import SaveModelCallback
from fastai.layers import LabelSmoothingCrossEntropy

import sentencepiece as spm #https://github.com/google/sentencepiece
import fire

from sp_tok import *
from sklearn.model_selection import KFold

def split_data_by_idx(all_texts_df: DataFrame, train_idx, valid_idx):
    """Return the (train, valid) row slices of *all_texts_df*.

    *train_idx* and *valid_idx* are positional indices (e.g. one fold from
    sklearn's KFold); the corresponding rows are selected with ``iloc``.
    """
    return all_texts_df.iloc[train_idx, :], all_texts_df.iloc[valid_idx, :]
    
    
def fit_clas(model_path:str, sp_model:str, wd:float=0.,
             qrnn:bool=True, n_hid:int=2304, load_enc:str=None, split_seed:int=None):
    PATH = Path(model_path)
    # torch.backends.cudnn.enabled=False
    
    defaults.text_spec_tok.append(NL) #add a New Line special char
    sp_vocab = Vocab( get_itos(sp_model) )    
    mycust_tok = CustomTokenizer(SPTokenizer,sp_model,pre_rules=default_rules)

    all_texts_df = pd.read_csv('../data/haha_2019_train.csv')
    all_texts_df.funniness_average.fillna(0,inplace=True)
Esempio n. 3
0
from set_seed import random_ctl
seed = random_ctl(432286)  #best seed from 20 seed search without mixup

from fastai.text import *
from fastai.callbacks import SaveModelCallback
from fastai.layers import LabelSmoothingCrossEntropy

import sentencepiece as spm  #https://github.com/google/sentencepiece
import fire

from sp_tok import *
from nlp_mixup import *
from bin_metrics import Fbeta_binary
from sklearn.model_selection import KFold


def split_rebal_data_by_idx(all_texts_df: DataFrame,
                            train_idx,
                            valid_idx,
                            clas_col: str = 'is_humor'):
    ## rebalance cases
    print('Number of positive samples:',
          (all_texts_df.loc[:, clas_col] == 1).sum())
    print('Number of negative samples:',
          (all_texts_df.loc[:, clas_col] == 0).sum())
    print('Total samples:', len(all_texts_df))

    df_train_all = all_texts_df.iloc[train_idx, :]
    df_valid = all_texts_df.iloc[valid_idx, :]

    print('Valid prevalence(n = %d):' % len(df_valid),