Ejemplo n.º 1
0
    def __init__(self, **kwargs):
        """
        Build the configuration and prepare the model for use.

        A deep copy of the class-level ``defaults`` is overridden with ``kwargs``
        and handed to ``get_config``. For a full list of configuration options,
        see `finetune.config`.

        :param **kwargs: key-value pairs of config items to override.
        """
        # A weak reference keeps the exit hook from holding the instance alive.
        self_ref = weakref.ref(self)

        def cleanup():
            instance = self_ref()
            if instance is None:
                # Already garbage collected; nothing left to tear down.
                return
            BaseModel.__del__(instance)

        atexit.register(cleanup)

        settings = deepcopy(self.defaults)
        settings.update(kwargs)
        self.config = get_config(**settings)

        self.resolved_gpus = None
        self.validate_config()
        download_data_if_required(self.config.base_model)
        self.input_pipeline = self._get_input_pipeline()
        self._trained = False
        self._initialize()

        if not self.config.debugging_logs:
            return
        # Debug logging requested: make TensorFlow as verbose as possible.
        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
        tf_logging.set_verbosity(tf_logging.DEBUG)
Ejemplo n.º 2
0
    def __init__(self, **kwargs):
        """
        Build the configuration from ``kwargs`` and prepare the model for use.

        For a full list of configuration options, see `finetune.config`.

        :param **kwargs: key-value pairs of config items to override.
        :raises FinetuneError: if ``default_context`` is set but is not a dictionary.
        """
        # A weak reference keeps the exit hook from holding the instance alive.
        weak_self = weakref.ref(self)

        def cleanup():
            # Run the destructor at interpreter exit if the instance still exists.
            strong_self = weak_self()
            if strong_self is not None:
                BaseModel.__del__(strong_self)

        atexit.register(cleanup)

        self.config = get_config(**kwargs)
        default_context = self.config.default_context
        # isinstance (instead of an exact type comparison) also accepts dict
        # subclasses such as OrderedDict.
        if default_context is not None and not isinstance(default_context, dict):
            raise FinetuneError(
                "Invalid default given: Need a dictionary of auxiliary info fields and default values."
            )
        # Auxiliary info is enabled exactly when a default context was supplied.
        self.config.use_auxiliary_info = default_context is not None

        self.resolved_gpus = None
        self.validate_config()
        download_data_if_required(self.config.base_model)
        self.input_pipeline = self._get_input_pipeline()
        self._trained = False
        self._initialize()
        if self.config.debugging_logs:
            # Debug logging requested: make TensorFlow as verbose as possible.
            os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"
            tf_logging.set_verbosity(tf_logging.DEBUG)
Ejemplo n.º 3
0
 def default_config(self, **kwargs):
     """
     Return the config as a plain dict, using small defaults for this case.

     :param **kwargs: config items that override the defaults below.
     """
     base = dict(batch_size=2, max_length=128, n_epochs=1)
     # Caller-supplied values take precedence over the defaults.
     merged = {**base, **kwargs}
     config = get_config(**merged)
     return dict(config)
Ejemplo n.º 4
0
    def finetune_grid_search(
            cls, Xs, Y, *, test_size, eval_fn=None, probs=False, return_all=False, **kwargs):
        """
        Grid-search over every config item reported by ``config.get_grid_searchable()``.

        The data is shuffled and split once; one model is then trained and scored per
        combination of grid-searchable values.

        :param Xs: Input text. Either [num_samples] or [sequence, num_samples] for single or multi input models respectively.
        :param Y: Targets, a list of targets, [num_samples] that correspond to each sample in Xs.
        :param test_size: Int or float. An int is the number of samples held out for evaluation, a float is the
            fraction of samples held out.
        :param eval_fn: Callable taking (prediction, truth) and returning a float where larger is better. Falls back
            to ``cls.get_eval_fn()`` when not given.
        :param probs: If true, eval_fn receives the output of predict_proba, otherwise the output of predict.
        :param return_all: If True, return every result; if False, return only the best config.
        :param kwargs: Keyword arguments to pass to get_config()
        :return: The best-scoring config object by default. If return_all is true, a list of tuples of the
            form [(config, eval_fn output), ... ]
        """
        # A flat list of strings is a single-input dataset; wrap it so it has the
        # same [sequence, num_samples] layout as the multi-input case.
        if isinstance(Xs[0], str):
            Xs = [Xs]

        base_config = get_config(**kwargs)
        # Validation is handled by the explicit hold-out split below.
        base_config.val_size = 0.0
        if not eval_fn:
            eval_fn = cls.get_eval_fn()

        split = train_test_split(
            list_transpose(Xs), Y, test_size=test_size, shuffle=True)
        train_rows, test_rows, trainY, testY = split
        trainXs = list_transpose(train_rows)
        testXs = list_transpose(test_rows)

        searchable = base_config.get_grid_searchable()
        param_names = searchable.keys()
        scored = []
        for combination in itertools.product(*searchable.values()):
            candidate = deepcopy(base_config)
            candidate.update(dict(zip(param_names, combination)))
            instance = cls(config=candidate)
            instance.finetune(*trainXs, Y=trainY)
            res = instance.predict_proba(*testXs) if probs else instance.predict(*testXs)
            scored.append((candidate, eval_fn(res, testY)))
            # Drop the model before building the next one.
            del instance

        if not return_all:
            best_config, _ = max(scored, key=lambda pair: pair[1])
            return best_config
        return scored
 def default_config(self, **kwargs):
     """
     Return the config object built by ``get_config`` from fixed settings.

     :param **kwargs: additional config items; repeating one of the fixed keys
         below raises TypeError (duplicate keyword argument).
     """
     fixed = {
         "batch_size": 2,
         "max_length": 128,
         "n_epochs": 2,
         "verbose": False,
         "l2_reg": 0.0,
         "clf_p_drop": 0.0,
     }
     return get_config(**fixed, **kwargs)
Ejemplo n.º 6
0
def default_config(**kwargs):
    """
    Return a plain-dict config that uses ``TextCNN`` as the base model.

    :param **kwargs: additional config items; repeating one of the fixed keys
        below raises TypeError (duplicate keyword argument).
    """
    fixed = {
        "base_model": TextCNN,
        "batch_size": 2,
        "max_length": 128,
        "n_epochs": 1,
        "lm_loss_coef": 0,
        "val_size": 0,
    }
    config = get_config(**fixed, **kwargs)
    return dict(config)
Ejemplo n.º 7
0
 def default_config(cls, **kwargs):
     """
     Return the config as a plain dict, seeded with ``cls.base_model``.

     :param **kwargs: config items that override the defaults below.
     """
     base = dict(
         batch_size=2,
         max_length=256,
         n_epochs=3,
         adapter_size=64,
         base_model=cls.base_model,
     )
     # Caller-supplied values take precedence over the defaults.
     config = get_config(**{**base, **kwargs})
     return dict(config)
Ejemplo n.º 8
0
 def default_config(self, **kwargs):
     """
     Return the config as a plain dict, seeded from this instance.

     ``base_model`` and ``default_context`` are read from ``self``.

     :param **kwargs: config items that override the defaults below.
     """
     base = dict(
         batch_size=2,
         max_length=256,
         n_epochs=3,
         base_model=self.base_model,
         val_size=0,
         default_context=self.default_context,
     )
     # Caller-supplied values take precedence over the defaults.
     merged = {**base, **kwargs}
     return dict(get_config(**merged))
Ejemplo n.º 9
0
 def setUp(self):
     """Create a fresh model plus sampled train/validation data for each test."""

     def as_triples(sample):
         # Each example repeats the same text three times.
         return list(zip(sample.Text, sample.Text, sample.Text))

     self.save_file = 'tests/saved-models/test-save-load'
     self.model = Model(
         config=get_config(batch_size=2, max_length=256, verbose=False))
     self.dataset = pd.read_csv(self.dataset_path)

     # Two independent draws of n_sample rows from the same dataset.
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     self.text_data_train = as_triples(train_sample)
     self.text_data_valid = as_triples(valid_sample)
     self.train_targets = train_sample.Target

     tf.reset_default_graph()
Ejemplo n.º 10
0
    def default_config(cls, **kwargs):
        """
        Return the config as a plain dict, with ``kwargs`` overriding class defaults.

        The defaults combine fixed settings with ``cls.model_specific_config``.

        :param **kwargs: config items that take precedence over the defaults.
        """
        defaults = dict(
            base_model=cls.base_model,
            batch_size=2,
            max_length=128,
            val_size=0,
            lm_loss_coef=0.0,
            **cls.model_specific_config
        )

        # Start from the caller's values and fill in only the missing defaults.
        merged = dict(kwargs)
        for key, value in defaults.items():
            merged.setdefault(key, value)

        return dict(get_config(**merged))
Ejemplo n.º 11
0
 def default_config(self, **kwargs):
     """
     Return the config as a plain dict with auxiliary-info settings enabled.

     ``base_model`` is read from ``self``.

     :param **kwargs: config items that override the defaults below.
     """
     base = dict(
         lr=1e-4,
         n_context_embed=768,
         batch_size=3,
         max_length=32,
         n_epochs=1,  # a single epoch: the goal is only that nothing errors out
         base_model=self.base_model,
         val_size=0,
         use_auxiliary_info=True,
         context_dim=1,
         default_context={'bold': False},
     )
     # Caller-supplied values take precedence over the defaults.
     merged = {**base, **kwargs}
     return dict(get_config(**merged))
Ejemplo n.º 12
0
 def test_tokenize_context(self):
     """Check tokenize_context output against the expected per-token rows."""
     token_ids = [[40478, 40481], [1180, 40482], [535, 40483],
                  [808, 40484], [289, 40485], [17164, 40486],
                  [40480, 40487]]
     tokens = [
         40478, 'everything</w>', "'s</w>", 'only</w>', '$</w>',
         '80</w>', 40480
     ]
     encoded_output = ArrayEncodedOutput(
         token_ids=token_ids,
         tokens=tokens,
         labels=[0] * 7,
         char_locs=[-1, 10, 12, 17, 19, 21, -1],
         mask=[0, 1, 1, 1, 1, 1, 0],
     )

     # One row per context span: (token, start, end, left, bold).
     fields = ('token', 'start', 'end', 'left', 'bold')
     span_rows = [
         ("everything's", 0, 12, 10, False),
         ("only", 13, 17, 20, False),
         ("$80", 18, 21, 30, True),
     ]
     context = [dict(zip(fields, row)) for row in span_rows]

     config = get_config(default_context={'left': 0, 'bold': False})
     expanded_context = tokenize_context(context, encoded_output, config)

     # Seven rows, one per token position; the first and last rows carry the
     # configured defaults.
     expected = [
         [False, 0],
         [False, 10],
         [False, 10],
         [False, 20],
         [True, 30],
         [True, 30],
         [False, 0],
     ]
     np.testing.assert_array_equal(expected, expanded_context)
Ejemplo n.º 13
0
    def __init__(self, featurizer, **kwargs):
        """
        Validate the requested base model and set up a classification deployment model.

        For a full list of configuration options, see `finetune.config`.

        :param featurizer: The base model to use. Must be one of GPTModel, GPTModelSmall,
            BERTModelCased or GPT2Model (textcnn is not supported).
        :param **kwargs: key-value pairs of config items to override.
        :raises FinetuneError: if `featurizer` is not one of the supported base models.
        """
        if featurizer not in [
                GPTModel, GPTModelSmall, BERTModelCased, GPT2Model
        ]:
            raise FinetuneError("Selected base model not supported.")
        # Config and input pipeline are created before delegating to the parent.
        # NOTE(review): the parent __init__ is not visible here and may overwrite
        # these attributes -- confirm before reordering.
        self.config = get_config(**kwargs)
        self.validate_config()
        self.input_pipeline = DeploymentPipeline(self.config)
        super().__init__(**kwargs)
        # The base model is recorded on the config only after the parent has run.
        self.config.base_model = featurizer
        self.task = TaskMode.CLASSIFICATION
        self.input_pipeline.task = self.task
        # Bookkeeping flags, all initially False; their consumers are outside this block.
        self.featurizer_loaded = False
        self.adapters = False
        self.loaded_custom_previously = False
Ejemplo n.º 14
0
 def default_config(self, **kwargs):
     """
     Return the config object built by ``get_config`` from fixed settings.

     :param **kwargs: additional config items; repeating one of the fixed keys
         below raises TypeError (duplicate keyword argument).
     """
     fixed = {
         "batch_size": 2,
         "max_length": 128,
         "n_epochs": 1,
         "verbose": False,
     }
     return get_config(**fixed, **kwargs)
Ejemplo n.º 15
0
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Tag

from finetune import SequenceLabeler
from finetune.config import get_config
from finetune.encoding.sequence_encoder import finetune_to_indico_sequence
from finetune.util.metrics import (sequence_labeling_token_precision,
                                   sequence_labeling_token_recall,
                                   sequence_labeling_overlap_precision,
                                   sequence_labeling_overlap_recall)

# Evaluated once at import time: the `is_bidirectional` attribute of the default
# config's base model. Presumably used to skip language-model tests for such
# models -- the usage is not visible in this excerpt.
SKIP_LM_TESTS = get_config().base_model.is_bidirectional


class TestSequenceLabeler(unittest.TestCase):

    n_sample = 100
    dataset_path = os.path.join('Data', 'Sequence', 'reuters.xml')
    processed_path = os.path.join('Data', 'Sequence', 'reuters.json')

    @classmethod
    def _download_reuters(cls):
        """
        Download Stanford Sentiment Treebank to enso `data` directory
        """
        path = Path(cls.dataset_path)
        if not path.exists():
Ejemplo n.º 16
0
 def load(self, path):
     """
     Load a saved model from ``path`` and rebuild its config.

     The stored variables are kept on ``self.variables``. The saved config is
     passed back through ``get_config`` with ``error_on_invalid_keywords=False``.

     :param path: location of the joblib file to load.
     :return: the loaded finetune object with its config rebuilt.
     """
     # NOTE(review): joblib.load unpickles data -- only load trusted files.
     saved_variables, finetune_obj = joblib.load(path)
     self.variables = saved_variables
     saved_settings = dict(finetune_obj.config)
     finetune_obj.config = get_config(
         error_on_invalid_keywords=False, **saved_settings)
     return finetune_obj
Ejemplo n.º 17
0
 def default_config(self, **kwargs):
     """
     Return the config as a plain dict, starting from small defaults.

     :param **kwargs: config items that override the defaults.
     """
     settings = dict(batch_size=2, max_length=128, n_epochs=1)
     # Caller-supplied values replace the defaults key by key.
     for key, value in kwargs.items():
         settings[key] = value
     return dict(get_config(**settings))
Ejemplo n.º 18
0
 def load(self, path):
     """
     Load a saved model from ``path`` and rebuild its config via ``get_config``.

     The stored variables are kept on ``self.variables``.

     :param path: location of the joblib file to load.
     :return: the loaded finetune object with its config rebuilt.
     """
     # NOTE(review): joblib.load unpickles data -- only load trusted files.
     saved_variables, finetune_obj = joblib.load(path)
     self.variables = saved_variables
     saved_settings = dict(finetune_obj.config)
     finetune_obj.config = get_config(**saved_settings)
     return finetune_obj