def __init__(self, bert_config_file: str, init_checkpoint: str, dataset_db_name: str, dataset_split: str,
             vocab_file: str, output_dir: str, split_table_name: str, skip_trivial_samples: bool = False,
             seq_len: int = 256, batch_size: int = 32, layer_indexes: List[int] = [-1, -2, -3, -4],
             learning_rate: float = 2e-6, num_train_epochs: float = 1.0, warmup_proportion: float = 0.1,
             do_lower_case: bool = True, save_checkpoints_steps: int = 1000, summary_steps: int = 1,
             margin: float = 2.0, steps_per_eval_iter: int = 10, loss: str = 'cosine_contrastive',
             beta: float = 1.0, num_train_steps: int = None, num_query_sentences_per_entity: int = 2):
    # Model and training hyperparameters
    self._seq_len = seq_len
    self._batch_size = batch_size
    self._layer_indexes = layer_indexes
    self._do_lower_case = do_lower_case
    self._init_checkpoint = init_checkpoint
    self._bert_config_file = bert_config_file
    self._output_dir = output_dir
    self._save_checkpoints_steps = save_checkpoints_steps
    self._summary_steps = summary_steps
    self._num_train_epochs = num_train_epochs
    self._num_train_steps = num_train_steps
    self._warmup_proportion = warmup_proportion
    self._learning_rate = learning_rate
    self._margin = margin
    self._loss_name = loss
    self._beta = beta
    self._steps_per_eval_iter = steps_per_eval_iter

    # WordPiece tokenizer used to convert sentences into BERT input features
    self._tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

    assert dataset_split in ['train', 'test', 'val']
    train_query_data, train_context_data, train_entities, _ = Classifier.load_datasplit(
        dataset_db_name=dataset_db_name,
        dataset_split=dataset_split,
        split_table_name=split_table_name,
        skip_trivial_samples=skip_trivial_samples,
        load_context=False)
    self._training_data = self.generate_data_pairs(
        train_query_data, train_context_data, train_entities,
        num_query_sentences_per_entity=num_query_sentences_per_entity)

    # Only load the validation split if the training split has been specified
    self._validation_data = None
    if dataset_split == 'train':
        val_query_data, val_context_data, val_entities, _ = Classifier.load_datasplit(
            dataset_db_name=dataset_db_name,
            dataset_split='val',
            split_table_name=split_table_name,
            skip_trivial_samples=skip_trivial_samples,
            load_context=False)
        self._validation_data = self.generate_data_pairs(
            val_query_data, val_context_data, val_entities,
            num_query_sentences_per_entity=num_query_sentences_per_entity)
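# Usage sketch (not part of the original file): how this constructor might be
# invoked for a training run. The enclosing class name `BertEmbeddingTrainer`,
# the model/output paths, the split table name and the `train()` call are
# hypothetical placeholders for illustration; only the keyword arguments and
# the dataset path come from the code in this section.
#
# trainer = BertEmbeddingTrainer(
#     bert_config_file='models/bert_base/bert_config.json',
#     init_checkpoint='models/bert_base/bert_model.ckpt',
#     dataset_db_name='../data/databases/dataset_geraete_small.db',
#     dataset_split='train',
#     vocab_file='models/bert_base/vocab.txt',
#     output_dir='output/finetuned_model',
#     split_table_name='splits',
#     batch_size=32,
#     loss='cosine_contrastive',
#     margin=2.0)
# trainer.train()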
from classifiers.classifier import Classifier
from bert import tokenization
import numpy as np

db = '../data/databases/dataset_geraete_small.db'

# Load the query samples, context samples and entities of each data split
t_q, t_c, t_e, _ = Classifier.load_datasplit(db, 'train')
e_q, e_c, e_e, _ = Classifier.load_datasplit(db, 'test')
v_q, v_c, v_e, _ = Classifier.load_datasplit(db, 'val')


def collect_sentences(query, context):
    """Collect the set of unique sentences from the query and context samples."""
    out = set()
    for sample in query:
        out.add(sample['sentence'])
    for sample in context:
        out.add(sample['sentence'])
    return out


def get_avg_token_len(data, tokenizer, token_lens):
    """Tokenize every sentence in `data` and report token-length statistics."""
    for sample in data:
        s = str(sample['sentence'])
        # FullTokenizer.tokenize returns a list of WordPiece tokens
        tokens = tokenizer.tokenize(s)
        token_lens.append(len(tokens))
    print("Avg. number of tokens: %s\n"
          "Std. deviation: %s\n"
          "Min: %s \tMax: %s" % (sum(token_lens) / len(token_lens),
                                 np.std(token_lens),
                                 min(token_lens),
                                 max(token_lens)))
    return token_lens
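# Usage sketch (assumption, not part of the original script): token-length
# statistics over the unique sentences of every split, using the helpers above.
# The vocab file path is a placeholder; `FullTokenizer` and `do_lower_case`
# follow the standard `bert.tokenization` API also used in the constructor.
tokenizer = tokenization.FullTokenizer(vocab_file='models/bert_base/vocab.txt',
                                       do_lower_case=True)

token_lens = []
for split_name, (query, context) in [('train', (t_q, t_c)),
                                     ('test', (e_q, e_c)),
                                     ('val', (v_q, v_c))]:
    print("Split: %s" % split_name)
    # get_avg_token_len expects dict-like samples with a 'sentence' key
    sentences = [{'sentence': s} for s in collect_sentences(query, context)]
    token_lens = get_avg_token_len(sentences, tokenizer, token_lens)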