@property
def train_data(self) -> pd.DataFrame:
    """
    Lazily loads and pre-processes the training dataset.

    Returns
    -------
    pd.DataFrame
        The training records with lower-cased names, alternate-name
        counts, and name lengths, filtered to names of 3 up to
        ``max_chars`` characters.
    """
    if self.train_data_ is None:
        logger.info('Loading training dataset')
        self.train_data_ = pd.read_csv(self.train_path,
                                       usecols=self.cols,
                                       nrows=TEST_ROWS)
        self.train_data_.dropna(inplace=True)

        self.train_data_['name'] = self.train_data_['name'].str.lower()
        self.train_data_['alternate_names'] = self.train_data_[
            'alternate_names'].str.lower()

        self.train_data_['n_alternate_names'] = self.train_data_[
            'alternate_names'].str.split(',').apply(len)

        self.train_data_['len_name'] = self.train_data_['name'].str.len()

        # get rid of toponyms that have more than "max_chars" characters
        # or fewer than 3.
        self.train_data_ = self.train_data_[
            (self.train_data_['len_name'] <= self.max_chars)
            & (self.train_data_['len_name'] > 2)].reset_index(drop=True)

    return self.train_data_
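# A standalone sketch of the lazy-loading property pattern used above,
# reduced to its essentials: the expensive read runs on first access only,
# and later accesses reuse the cached DataFrame. The class and path below
# are illustrative, not part of the project.
import pandas as pd

class LazyData:
    def __init__(self, path: str):
        self.path = path
        self.data_ = None

    @property
    def data(self) -> pd.DataFrame:
        if self.data_ is None:
            self.data_ = pd.read_csv(self.path)  # runs only once
        return self.data_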
def __init__(self,
             train_fname: str,
             val_fname: str,
             verbose: int = 0,
             max_chars: int = 32,
             save_tokenizer: bool = True,
             tokenizer_params: Optional[dict] = None,
             train_sampler_params: Optional[dict] = None,
             val_sampler_params: Optional[dict] = None):
    """
    Parameters
    ----------
    train_fname : str
        Filename of the training dataset, relative to ``DirConf.DATA_DIR``.
    val_fname : str
        Filename of the validation dataset, relative to ``DirConf.DATA_DIR``.
    verbose : int
        Verbosity level; values greater than 0 print example transformations.
    max_chars : int
        Maximum number of characters a toponym may have.
    save_tokenizer : bool
        Whether to save the fitted tokenizer to disk.
    tokenizer_params : Optional[dict]
        Overrides for the default tokenizer parameters.
    train_sampler_params : Optional[dict]
        Parameters for the training-data sampler.
    val_sampler_params : Optional[dict]
        Parameters for the validation-data sampler.
    """
    self.train_path = os.path.join(DirConf.DATA_DIR, train_fname)
    self.val_path = os.path.join(DirConf.DATA_DIR, val_fname)

    assert os.path.exists(self.train_path)
    assert os.path.exists(self.val_path)

    self.verbose = verbose
    self.max_chars = max_chars
    self.save_tokenizer = save_tokenizer

    self.cols = ['name', 'alternate_names']

    self.train_data_: Optional[pd.DataFrame] = None
    self.val_data_: Optional[pd.DataFrame] = None

    self.tokenizer_params = {'name': 'ngram_tokenizer',
                             'maxlen': 30,
                             'filters': '',
                             'lower': True,
                             'split': ' ',
                             'char_level': False,
                             'num_words': 20_000,
                             'oov_token': '<OOV>'}

    if tokenizer_params:
        self.tokenizer_params.update(tokenizer_params)

    logger.info('Loading tokenizer')
    # resolve the tokenizer class dynamically, e.g. "ngram_tokenizer"
    # maps to tokenizers.NgramTokenizer
    TokenizerClass = getattr(
        tokenizers, underscore_to_camel(self.tokenizer_params.pop('name')))
    self.tokenizer = TokenizerClass(**self.tokenizer_params)

    self.train_sampler_params = train_sampler_params
    self.val_sampler_params = val_sampler_params

    self.train_sampler = None
    self.val_sampler = None
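# A minimal sketch of the dynamic class resolution used above, assuming
# underscore_to_camel simply camel-cases an underscore-separated name.
# The helper below is hypothetical; the project's own underscore_to_camel
# may differ in its details.
def underscore_to_camel_sketch(name: str) -> str:
    # 'ngram_tokenizer' -> 'NgramTokenizer'
    return ''.join(part.capitalize() for part in name.split('_'))

assert underscore_to_camel_sketch('ngram_tokenizer') == 'NgramTokenizer'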
def tokenize_external_data(self):
    """
    Converts each text column of the external test dataset to padded
    sequences of n-gram ids.

    Returns
    -------
    pd.DataFrame
        The external test data reduced to ``output_cols``.
    """
    logger.info('Creating Sequences for External Test dataset')

    # ========== Procedure for the Test Set ===================
    for col_name in self.external_cols:
        if col_name == 'target':
            # obviously, we don't want to tokenize the target
            continue

        logger.info(f'Creating N-grams for column name: "{col_name}".')
        self.data[f'{col_name}_ngrams'] = self.data[
            col_name].progress_apply(self.tokenizer.get_ngrams)

        logger.info(f'Converting column name: "{col_name}" to sequences')
        self.data[f'{col_name}_seq'] = self.data[
            f'{col_name}_ngrams'].progress_apply(
            lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

        logger.info(f'Padding sequences for column name: "{col_name}".')
        self.data[f'{col_name}_seq'] = self.data[
            f'{col_name}_seq'].progress_apply(self.tokenizer.pad_single)

    self.data_ = self.data_[self.output_cols]

    return self.data_
def save(self):
    """
    Serializes the tokenizer to JSON and stores it under
    ``DirConf.MODELS_DIR``.
    """
    self.name = self.build_outfile()
    tokenizer_json = self.to_json()

    path = os.path.join(DirConf.MODELS_DIR, self.name)
    exp_logger.info(f'Storing Unigram Tokenizer at: {path}')

    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(tokenizer_json, ensure_ascii=True))
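# A minimal sketch of reading the tokenizer JSON back from disk, assuming
# the file layout produced by save() above. The project's own
# tokenizers.load_tokenizer (used in the evaluation pipeline below)
# presumably wraps logic along these lines; the helper is illustrative.
import io
import json
import os

def load_tokenizer_json(name: str, models_dir: str) -> dict:
    path = os.path.join(models_dir, name)
    with io.open(path, encoding='utf-8') as f:
        return json.loads(f.read())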
def __init__(self,
             fname: str,
             tokenizer_fname: str,
             use_external: bool = False,
             verbose: int = 0,
             max_chars: int = 32,
             sampler_params: Optional[dict] = None):
    """
    Parameters
    ----------
    fname : str
        The filename of the dataset that we want to prepare for model
        evaluation.
    tokenizer_fname : str
        The filename of the already fitted tokenizer that we want to use
        for the conversion of the toponyms and the alternate_names to
        sequences.
    use_external : bool
        Whether the dataset is an external, already paired test set.
    verbose : int
        Verbosity level; values greater than 0 log example transformations.
    max_chars : int
        Maximum number of characters a toponym may have.
    sampler_params : Optional[dict]
        Parameters for the test-data sampler.
    """
    self.path = os.path.join(DirConf.DATA_DIR, fname)
    assert os.path.exists(self.path)

    self.verbose = verbose
    self.max_chars = max_chars

    self.cols = ['name', 'alternate_names']
    self.data_: Optional[pd.DataFrame] = None

    logger.info(f'Loading fitted tokenizer: {tokenizer_fname}')
    self.tokenizer = tokenizers.load_tokenizer(name=tokenizer_fname)

    self.test_sampler = None
    self.use_external = use_external
    self.sampler_params = sampler_params

    self.external_cols = ['name', 'alternate_name', 'target']
    self.output_cols = ['name', 'alternate_name',
                        'name_seq', 'alternate_name_seq', 'target']
def filter_latin_related_records(self) -> pd.DataFrame:
    """
    Keeps only the records whose name is written in the Latin alphabet,
    and drops the non-Latin alternate names of those records.

    Returns
    -------
    pd.DataFrame
        The filtered dataset.
    """
    exp_logger.info('Filtering Latin Names')
    self.data_ = self.data[self.data['name_alphabet'] == 'LATIN']
    exp_logger.info(
        f'Number of Records after filtering: {len(self.data_)}')

    # keep only the alternate names that are written in LATIN
    exp_logger.info('Filtering Latin Alternate Names')
    self.data['alt_names_seq'] = self.data.apply(
        lambda row: [
            n for n, ab in zip(row['alt_names_seq'],
                               row['alternate_names_alphabet'])
            if ab == 'LATIN'
        ], axis=1)

    # replace the alternate_names with those that are written in LATIN only
    self.data['alternate_names'] = self.data['alt_names_seq'].apply(
        lambda l: ', '.join(l) if l else None)

    return self.data
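# A standalone illustration of the zip-filter pattern used above: keep only
# the alternate names whose detected alphabet is LATIN. The sample values
# are made up for demonstration.
alt_names = ['athens', 'αθήνα', 'athen']
alphabets = ['LATIN', 'GREEK', 'LATIN']
latin_only = [n for n, ab in zip(alt_names, alphabets) if ab == 'LATIN']
assert latin_only == ['athens', 'athen']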
@property
def data(self) -> pd.DataFrame:
    """
    Lazily loads the dataset that we will use for the model evaluation.

    Returns
    -------
    pd.DataFrame
        A pandas dataframe that holds the pre-processed evaluation
        records.
    """
    if self.data_ is None:
        if self.use_external:
            logger.info('Loading external test dataset')
            self.data_ = pd.read_csv(self.path,
                                     usecols=self.external_cols,
                                     nrows=TEST_ROWS,
                                     sep='\t')
            self.data_['target'] = self.data_['target'].astype(int)
        else:
            logger.info('Loading test dataset')
            self.data_ = pd.read_csv(self.path,
                                     usecols=self.cols,
                                     nrows=TEST_ROWS)

        self.data_.dropna(inplace=True)
        logger.info(f'Dataset size: {len(self.data_)}')

        self.data_['name'] = self.data_['name'].str.lower()
        self.data_['len_name'] = self.data_['name'].str.len()

        # get rid of toponyms that have more than "max_chars" characters.
        self.data_ = self.data_[
            (self.data_['len_name'] <= self.max_chars)
            & (self.data_['len_name'] > 2)].reset_index(drop=True)

        if not self.use_external:
            # since we don't have a ready external dataset for testing,
            # we need to pre-process the raw dataset.
            self.data_['alternate_names'] = self.data_[
                'alternate_names'].str.lower()
            self.data_['n_alternate_names'] = self.data_[
                'alternate_names'].str.split(',').apply(len)

    return self.data_
def run(self):
    """
    Runs the full pre-processing pipeline: alphabet detection, optional
    Latin-only filtering, optional plotting, and the train/val/test split.

    Returns
    -------
    Dict[str, pd.DataFrame]
        The datasets produced by ``split_records``.
    """
    # get the alternate names as a list for each record
    self.data['alt_names_seq'] = self.data['alternate_names'].apply(
        lambda x: x.split(',') if x else [])

    self.data['len_name'] = self.data['name'].apply(len)

    exp_logger.info(f'Keeping records with Name Length of at '
                    f'most {self.max_name_chars}')
    self.data_ = self.data[
        self.data['len_name'] <= self.max_name_chars].reset_index(drop=True)

    exp_logger.info('Detecting Alphabet for all Names')
    # detect the alphabet for the name
    self.data['name_alphabet'] = self.data['name'].progress_apply(
        self.detect_alphabet)

    exp_logger.info('Converting non-frequent alphabets to "UND"')
    alphabet_counts = self.data['name_alphabet'].value_counts()
    non_frequent_alpha = {
        i: 'UND'
        for i in alphabet_counts[alphabet_counts.values < 10].index
    }
    self.data['name_alphabet'] = self.data['name_alphabet'].apply(
        lambda x: non_frequent_alpha.get(x, x))

    exp_logger.info('Detecting Alphabet for all Alternate Names')
    # get the alphabet for each alternate name
    self.data['alternate_names_alphabet'] = self.data[
        'alt_names_seq'].progress_apply(
        lambda l: [self.detect_alphabet(n) for n in l])

    if self.only_latin:
        self.filter_latin_related_records()  # filters self.data

    self.data['n_alt_names'] = self.data['alt_names_seq'].progress_apply(len)
    self.data['n_alt_gte'] = self.data['n_alt_names'] >= self.n_alternates

    if self.show_plots:
        exp_logger.info('Creating Plots')
        self.create_plots()

    datasets = self.split_records()

    if self.save_data:
        exp_logger.info('Saving Datasets')
        for data_type, data in datasets.items():
            exp_logger.info(f'Saving {data_type}')
            ab = 'latin' if self.only_latin else 'global'
            shuffle = 'stratified' if self.stratified_split else 'random'
            outfile = f'n_alternates_{self.n_alternates}+_{ab}_' \
                      f'{shuffle}_split_{data_type}.csv'.strip().lower()
            outfile = os.path.join(self.data_dir, outfile)
            data[self.basic_cols].to_csv(outfile, encoding='utf8',
                                         index=False)

    return datasets
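# detect_alphabet is not shown in this listing. A minimal sketch of one
# possible implementation, assuming the alphabet is taken from the Unicode
# name of the first alphabetic character ('UND' when none is found); the
# project's actual helper may differ.
import unicodedata

def detect_alphabet_sketch(name: str) -> str:
    for ch in name:
        if ch.isalpha():
            # e.g. 'GREEK SMALL LETTER ALPHA' -> 'GREEK'
            return unicodedata.name(ch, 'UND').split(' ')[0]
    return 'UND'

assert detect_alphabet_sketch('αθήνα') == 'GREEK'
assert detect_alphabet_sketch('athens') == 'LATIN'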
def split_records(self) -> Dict[str, pd.DataFrame]:
    """
    Splits the dataset into train, validation and test sets, either
    randomly or stratified on name length / name alphabet.

    Returns
    -------
    Dict[str, pd.DataFrame]
        The X_train_val, X_train, X_val and X_test dataframes.
    """
    data_size = len(self.data)
    test_size = int(self.test_size * data_size)
    val_size = int(self.val_size * data_size)

    if not self.stratified_split:
        exp_logger.info('Random Split into Train-Val and Test')
        X_train_val, X_test = train_test_split(self.data[self.basic_cols],
                                               test_size=test_size,
                                               shuffle=True,
                                               random_state=2020,
                                               stratify=None)

        exp_logger.info('Random Split into Train and Val')
        X_train, X_val = train_test_split(X_train_val,
                                          test_size=val_size,
                                          shuffle=True,
                                          random_state=2020,
                                          stratify=None)
    else:
        if self.only_latin:
            exp_logger.info('Using Name Length as stratification factor')
            stratify_column = self.data['len_name']
        else:
            exp_logger.info('Using Name Alphabet as stratification factor')
            stratify_column = self.data['name_alphabet']

        exp_logger.info('Stratified Split into Train-Val and Test')
        # y_train_val will be used for the stratification in the second
        # split.
        X_train_val, X_test, y_train_val, _ = train_test_split(
            self.data[self.basic_cols],
            stratify_column,
            test_size=test_size,
            shuffle=True,
            random_state=2020,
            stratify=stratify_column)

        exp_logger.info('Stratified Split into Train and Val')
        X_train, X_val = train_test_split(X_train_val,
                                          test_size=val_size,
                                          shuffle=True,
                                          random_state=2020,
                                          stratify=y_train_val)

    exp_logger.info(f'X_train-val size: {X_train_val.shape[0]}')
    exp_logger.info(f'X_train size: {X_train.shape[0]}')
    exp_logger.info(f'X_val size: {X_val.shape[0]}')
    exp_logger.info(f'X_test size: {X_test.shape[0]}')

    return dict(X_train_val=X_train_val,
                X_train=X_train,
                X_val=X_val,
                X_test=X_test)
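# A standalone demonstration of how stratify preserves class proportions in
# sklearn's train_test_split, using toy data (the values are illustrative
# only, not taken from the project's datasets).
import pandas as pd
from sklearn.model_selection import train_test_split

toy = pd.DataFrame({'name': [f'toponym_{i}' for i in range(100)],
                    'name_alphabet': ['LATIN'] * 80 + ['GREEK'] * 20})
train, test = train_test_split(toy, test_size=0.2, shuffle=True,
                               random_state=2020,
                               stratify=toy['name_alphabet'])
# both splits keep the 80/20 LATIN-to-GREEK ratio
assert (test['name_alphabet'] == 'GREEK').sum() == 4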
def tokenize_raw_data(self):
    """
    1) Creates n-grams from the toponyms.
    2) Creates n-grams for each of the variations.
    3) Converts both to padded sequences of n-gram ids.
    """
    # ========== Procedure for the Test Set ===================
    logger.info('Creating N-grams for test toponyms')
    self.data['toponym_ngrams'] = self.data['name'].progress_apply(
        self.tokenizer.get_ngrams)

    self.data['alternate_names'] = self.data[
        'alternate_names'].str.split(',')

    logger.info('Creating N-grams for test alternate-names')
    self.data['variations_ngrams'] = self.data[
        'alternate_names'].progress_apply(self.tokenizer.texts_to_ngrams)

    logger.info('Converting test toponyms to sequences')
    self.data['toponym_seqs'] = self.data['toponym_ngrams'].progress_apply(
        lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

    logger.info('Padding test toponym sequences')
    self.data['toponym_seqs'] = self.data['toponym_seqs'].progress_apply(
        self.tokenizer.pad_single)

    logger.info('Converting test alternate-names to sequences')
    self.data['variations_seqs'] = self.data[
        'variations_ngrams'].progress_apply(
        self.tokenizer.texts_to_sequences)

    logger.info('Padding test alternate-names sequences')
    self.data['variations_seqs'] = self.data[
        'variations_seqs'].progress_apply(self.tokenizer.pad)

    if self.verbose > 0:
        logger.info(
            f'N-gram index length: {len(self.tokenizer.word_index)}')
        logger.info('\nExample Transformation')
        logger.info(self.data.loc[0])
        logger.info(self.data.loc[0]['variations_seqs'])
def tokenize_data(self):
    """
    1) Creates n-grams from the toponyms.
    2) Creates n-grams for each of the variations.
    3) Fits the tokenizer on the training n-grams, then converts both the
       training and the validation data to padded sequences.
    """
    logger.info('Creating N-grams for training toponyms')
    # convert each toponym to its n-gram representation
    self.train_data['toponym_ngrams'] = self.train_data[
        'name'].progress_apply(self.tokenizer.get_ngrams)

    # convert each variation of each toponym to its n-gram representation
    self.train_data['alternate_names'] = self.train_data[
        'alternate_names'].str.split(',')

    logger.info('Creating N-grams for training alternate-names')
    self.train_data['variations_ngrams'] = self.train_data[
        'alternate_names'].progress_apply(self.tokenizer.texts_to_ngrams)

    # collect (flatten out) all the n-grams (toponyms and variations);
    # these are needed in order to fit the tokenizer.
    all_train_names = list()
    for row in self.train_data['variations_ngrams']:
        all_train_names.extend(row)
    all_train_names += list(self.train_data['toponym_ngrams'])

    # fitting all the training texts on the instantiated tokenizer;
    # this will create all the necessary tools that we will need.
    logger.info('Fitting tokenizer to training-data')
    self.tokenizer.fit_on_texts(texts=all_train_names)

    # using the fitted tokenizer, convert the train toponyms to sequences
    logger.info('Converting training toponyms to sequences')
    self.train_data['toponym_seqs'] = self.train_data[
        'toponym_ngrams'].progress_apply(
        lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

    logger.info('Padding training toponym sequences')
    # pad the sequences to the max length
    self.train_data['toponym_seqs'] = self.train_data[
        'toponym_seqs'].progress_apply(self.tokenizer.pad_single)

    # using the fitted tokenizer, convert each of the variations of all
    # the toponyms to sequences
    logger.info('Converting training alternate-names to sequences')
    self.train_data['variations_seqs'] = self.train_data[
        'variations_ngrams'].progress_apply(
        self.tokenizer.texts_to_sequences)

    logger.info('Padding training alternate-names sequences')
    self.train_data['variations_seqs'] = self.train_data[
        'variations_seqs'].progress_apply(self.tokenizer.pad)

    # ========== Same Procedure for the Validation Set ===================
    logger.info('Creating N-grams for validation toponyms')
    self.val_data['toponym_ngrams'] = self.val_data['name'].progress_apply(
        self.tokenizer.get_ngrams)

    self.val_data['alternate_names'] = self.val_data[
        'alternate_names'].str.split(',')

    logger.info('Creating N-grams for validation alternate-names')
    self.val_data['variations_ngrams'] = self.val_data[
        'alternate_names'].progress_apply(self.tokenizer.texts_to_ngrams)

    logger.info('Converting validation toponyms to sequences')
    self.val_data['toponym_seqs'] = self.val_data[
        'toponym_ngrams'].progress_apply(
        lambda x: self.tokenizer.texts_to_sequences(texts=[x])[0])

    logger.info('Padding validation toponym sequences')
    self.val_data['toponym_seqs'] = self.val_data[
        'toponym_seqs'].progress_apply(self.tokenizer.pad_single)

    logger.info('Converting validation alternate-names to sequences')
    self.val_data['variations_seqs'] = self.val_data[
        'variations_ngrams'].progress_apply(
        self.tokenizer.texts_to_sequences)

    logger.info('Padding validation alternate-names sequences')
    self.val_data['variations_seqs'] = self.val_data[
        'variations_seqs'].progress_apply(self.tokenizer.pad)

    if self.verbose > 0:
        logger.info(
            f'N-gram index length: {len(self.tokenizer.word_index)}')
        logger.info('\nExample Transformation')
        logger.info(self.val_data.loc[0])
        logger.info(self.val_data.loc[0]['variations_seqs'])

    if self.save_tokenizer:
        logger.info('Saving Tokenizer')
        self.tokenizer.save()
        logger.info('Tokenizer saved')
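# A minimal end-to-end sketch of the n-gram -> sequence -> pad pipeline
# above, using plain Python in place of the project's tokenizer. Bigrams,
# the growing toy vocabulary, and zero post-padding are assumptions made
# purely for illustration.
def to_bigrams(name: str) -> str:
    return ' '.join(name[i:i + 2] for i in range(len(name) - 1))

vocab = {}

def to_sequence(ngrams: str, maxlen: int = 6) -> list:
    # assign ids on first sight, then pad/truncate to maxlen
    seq = [vocab.setdefault(g, len(vocab) + 1) for g in ngrams.split(' ')]
    return (seq + [0] * maxlen)[:maxlen]

print(to_bigrams('athens'))               # 'at th he en ns'
print(to_sequence(to_bigrams('athens')))  # [1, 2, 3, 4, 5, 0]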