Code Example #1
    def __init__(self,
                 bert_config_file: str,
                 init_checkpoint: str,
                 dataset_db_name: str,
                 dataset_split: str,
                 vocab_file: str,
                 output_dir: str,
                 split_table_name: str,
                 skip_trivial_samples: bool = False,
                 seq_len: int = 256,
                 batch_size: int = 32,
                 layer_indexes: List[int] = [-1, -2, -3, -4],
                 learning_rate: float = 2e-6,
                 num_train_epochs: float = 1.0,
                 warmup_proportion: float = 0.1,
                 do_lower_case: bool = True,
                 save_checkpoints_steps: int = 1000,
                 summary_steps: int = 1,
                 margin: float = 2.0,
                 steps_per_eval_iter: int = 10,
                 loss: str = 'cosine_contrastive',
                 beta: float = 1.0,
                 num_train_steps: int = None,
                 num_query_sentences_per_entity: int = 2):
        # Model input and embedding-extraction settings
        self._seq_len = seq_len
        self._batch_size = batch_size
        self._layer_indexes = layer_indexes

        # BERT checkpoint and configuration
        self._do_lower_case = do_lower_case
        self._init_checkpoint = init_checkpoint
        self._bert_config_file = bert_config_file

        self._output_dir = output_dir
        self._save_checkpoints_steps = save_checkpoints_steps
        self._summary_steps = summary_steps

        # Training hyper-parameters
        self._num_train_epochs = num_train_epochs
        self._num_train_steps = num_train_steps
        self._warmup_proportion = warmup_proportion
        self._learning_rate = learning_rate
        self._margin = margin
        self._loss_name = loss
        self._beta = beta

        self._steps_per_eval_iter = steps_per_eval_iter

        # WordPiece tokenizer used to turn sentences into BERT input tokens
        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=do_lower_case)

        assert dataset_split in ['train', 'test', 'val']
        # Load the requested split and generate the training data pairs from it
        train_query_data, train_context_data, train_entities, _ = Classifier.load_datasplit(
            dataset_db_name=dataset_db_name,
            dataset_split=dataset_split,
            split_table_name=split_table_name,
            skip_trivial_samples=skip_trivial_samples,
            load_context=False)
        self._training_data = self.generate_data_pairs(
            train_query_data,
            train_context_data,
            train_entities,
            num_query_sentences_per_entity=num_query_sentences_per_entity)

        # Only load the validation split if the training split has been specified
        self._validation_data = None
        if dataset_split == 'train':
            val_query_data, val_context_data, val_entities, _ = Classifier.load_datasplit(
                dataset_db_name=dataset_db_name,
                dataset_split='val',
                split_table_name=split_table_name,
                skip_trivial_samples=skip_trivial_samples,
                load_context=False)
            self._validation_data = self.generate_data_pairs(
                val_query_data,
                val_context_data,
                val_entities,
                num_query_sentences_per_entity=num_query_sentences_per_entity)
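
For orientation, here is a minimal sketch of how this constructor might be called. The class name BertEmbeddingClassifier and all file paths except the dataset database are assumptions, not taken from the project:

classifier = BertEmbeddingClassifier(               # hypothetical class name
    bert_config_file='bert/bert_config.json',       # assumed path
    init_checkpoint='bert/bert_model.ckpt',         # assumed path
    dataset_db_name='../data/databases/dataset_geraete_small.db',
    dataset_split='train',
    vocab_file='bert/vocab.txt',                    # assumed path
    output_dir='output/',
    split_table_name='splits',                      # assumed table name
    skip_trivial_samples=True,
    batch_size=16,
    loss='cosine_contrastive')
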
Code Example #2
from classifiers.classifier import Classifier
from bert import tokenization
import numpy as np

# Database file containing the dataset splits
db = '../data/databases/dataset_geraete_small.db'
# Load the train/test/val splits: (query samples, context samples, entities, _)
t_q, t_c, t_e, _ = Classifier.load_datasplit(db, 'train')
e_q, e_c, e_e, _ = Classifier.load_datasplit(db, 'test')
v_q, v_c, v_e, _ = Classifier.load_datasplit(db, 'val')


def collect_sentences(query, context):
    """Collect the set of unique sentences across query and context samples."""
    out = set()
    for sample in query:
        out.add(sample['sentence'])
    for sample in context:
        out.add(sample['sentence'])
    return out


def get_avg_token_len(data, tokenizer, token_lens):
    """Tokenize every sentence in `data` and print token-count statistics.

    The tokenizer's tokenize() is expected to return a (tokens, mapping)
    pair here, unlike the stock BERT FullTokenizer, which returns only a
    token list.
    """
    for sample in data:
        s = str(sample['sentence'])
        tokens, mapping = tokenizer.tokenize(s)
        token_lens.append(len(tokens))

    print("Avg. number of tokens: %s\n"
          "Std. deviation: %s\n"
          "Min: %s \tMax: %s" %
          (sum(token_lens) / len(token_lens), np.std(token_lens),
           min(token_lens), max(token_lens)))
    return token_lens
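
A minimal sketch of how these helpers might be driven. The vocabulary path is an assumption, and the token statistics only work with this project's tokenization module, whose tokenize() returns a (tokens, mapping) pair:

# Hypothetical driver; 'bert/vocab.txt' is an assumed path.
tokenizer = tokenization.FullTokenizer(vocab_file='bert/vocab.txt',
                                       do_lower_case=True)

# Unique sentences across the training query and context samples
sentences = collect_sentences(t_q, t_c)
print("Unique training sentences: %d" % len(sentences))

# Accumulate token-length statistics over the query sentences of all splits
token_lens = get_avg_token_len(t_q, tokenizer, [])
token_lens = get_avg_token_len(e_q, tokenizer, token_lens)
token_lens = get_avg_token_len(v_q, tokenizer, token_lens)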