Example no. 1
    def __init__(self,
                 emb_folder: str,
                 emb_url: str,
                 save_path: str,
                 load_path: str,
                 context_limit: int = 450,
                 question_limit: int = 150,
                 char_limit: int = 16,
                 level: str = 'token',
                 *args,
                 **kwargs):
        self.emb_folder = expand_path(emb_folder)
        self.level = level
        self.emb_url = emb_url
        self.emb_file_name = Path(emb_url).name
        self.save_path = expand_path(save_path)
        self.load_path = expand_path(load_path)
        self.context_limit = context_limit
        self.question_limit = question_limit
        self.char_limit = char_limit
        self.loaded = False

        self.NULL = "<NULL>"
        self.OOV = "<OOV>"

        self.emb_folder.mkdir(parents=True, exist_ok=True)

        if not (self.emb_folder / self.emb_file_name).exists():
            download(self.emb_folder / self.emb_file_name, self.emb_url)

        if self.load_path.exists():
            self.load()
Example no. 2
def get_config_downloads(
        config: Union[str, Path, dict]) -> Set[Tuple[str, Path]]:
    config = parse_config(config)

    downloads = set()
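    # collect (url, destination dir) pairs declared in the config's metadata/download section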
    if 'metadata' in config and 'download' in config['metadata']:
        for resource in config['metadata']['download']:
            if isinstance(resource, str):
                resource = {'url': resource}

            url = resource['url']
            dest = expand_path(resource.get('subdir', ''))

            downloads.add((url, dest))

    config_references = [
        expand_path(config_ref)
        for config_ref in get_all_elems_from_json(config, 'config_path')
    ]

    downloads |= {(url, dest)
                  for config in config_references
                  for url, dest in get_config_downloads(config)}

    return downloads
Example no. 3
    def __init__(
        self,
        vocabs_path,
        save_path,
        load_path,
        max_sequence_length,
        padding="post",
        truncating="pre",
    ):
        self.max_sequence_length = max_sequence_length
        self.padding = padding
        self.truncating = truncating

        save_path = expand_path(save_path).resolve().parent
        load_path = expand_path(load_path).resolve().parent

        self.vocabs_path = expand_path(vocabs_path)
        self.tok_save_path = save_path / "tok2int.dict"
        self.tok_load_path = load_path / "tok2int.dict"
        self.cont_save_path = save_path / "cont2toks.dict"
        self.cont_load_path = load_path / "cont2toks.dict"
        self.resp_save_path = save_path / "resp2toks.dict"
        self.resp_load_path = load_path / "resp2toks.dict"
        self.cemb_save_path = str(save_path / "context_embs.npy")
        self.cemb_load_path = str(load_path / "context_embs.npy")
        self.remb_save_path = str(save_path / "response_embs.npy")
        self.remb_load_path = str(load_path / "response_embs.npy")

        self.int2tok_vocab = {}
        self.tok2int_vocab = {}
        self.response2toks_vocab = {}
        self.response2emb_vocab = {}
        self.context2toks_vocab = {}
        self.context2emb_vocab = {}
Example no. 4
def read_data_by_config(config: dict):
    """Read data by dataset_reader from specified config."""
    dataset_config = config.get('dataset', None)

    if dataset_config:
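        # legacy 'dataset' block: expand it into dataset_reader and dataset_iterator entries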
        config.pop('dataset')
        ds_type = dataset_config['type']
        if ds_type == 'classification':
            reader = {'class_name': 'basic_classification_reader'}
            iterator = {'class_name': 'basic_classification_iterator'}
            config['dataset_reader'] = {**dataset_config, **reader}
            config['dataset_iterator'] = {**dataset_config, **iterator}
        else:
            raise Exception("Unsupported dataset type: {}".format(ds_type))

    try:
        reader_config = dict(config['dataset_reader'])
    except KeyError:
        raise ConfigError("No dataset reader is provided in the JSON config.")

    reader = get_model(reader_config.pop('class_name'))()
    data_path = reader_config.pop('data_path', '')
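    # data_path may be a single path or a list of paths; expand each of them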
    if isinstance(data_path, list):
        data_path = [expand_path(x) for x in data_path]
    else:
        data_path = expand_path(data_path)

    return reader.read(data_path, **reader_config)
Example no. 5
    def __init__(self,
                 save_path: str = './tok.dict',
                 load_path: str = './tok.dict',
                 max_sequence_length: int = None,
                 dynamic_batch: bool = False,
                 padding: str = 'post',
                 truncating: str = 'post',
                 use_matrix: bool = True,
                 num_context_turns: int = 1,
                 num_ranking_samples: int = 1,
                 add_raw_text: bool = False,
                 tokenizer: Component = None,
                 vocab: Optional[Estimator] = None,
                 embedder: Optional[Component] = None,
                 sent_vocab: Optional[Estimator] = None,
                 **kwargs):

        self.max_sequence_length = max_sequence_length
        self.padding = padding
        self.truncating = truncating
        self.dynamic_batch = dynamic_batch
        self.use_matrix = use_matrix
        self.num_ranking_samples = num_ranking_samples
        self.num_context_turns = num_context_turns
        self.add_raw_text = add_raw_text
        self.tokenizer = tokenizer
        self.embedder = embedder
        self.vocab = vocab
        self.sent_vocab = sent_vocab
        self.save_path = expand_path(save_path).resolve()
        self.load_path = expand_path(load_path).resolve()

        super().__init__(load_path=self.load_path,
                         save_path=self.save_path,
                         **kwargs)
Example no. 6
def get_config_downloads(config_path):
    dp_root_back = get_deeppavlov_root()
    config = read_json(config_path)
    set_deeppavlov_root(config)

    downloads = set()
    if 'metadata' in config and 'download' in config['metadata']:
        for resource in config['metadata']['download']:
            if isinstance(resource, str):
                resource = {
                    'url': resource
                }

            url = resource['url']
            dest = expand_path(resource.get('subdir', ''))

            downloads.add((url, dest))

    config_references = [expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path')]

    downloads |= {(url, dest) for config in config_references for url, dest in get_config_downloads(config)}

    set_deeppavlov_root({'deeppavlov_root': dp_root_back})

    return downloads
Example no. 7
    def __init__(self,
                 save_path,
                 load_path=None,
                 mode='infer',
                 *args,
                 **kwargs):

        if save_path:
            self.save_path = expand_path(save_path)
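            # make sure the directory for the save file exists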
            self.save_path.parent.mkdir(parents=True, exist_ok=True)
        else:
            self.save_path = None

        if load_path:
            self.load_path = expand_path(load_path)
            if mode != 'train' and self.save_path and self.load_path != self.save_path:
                log.warning(
                    "Load path '{}' differs from save path '{}' in '{}' mode for {}."
                    .format(self.load_path, self.save_path, mode,
                            self.__class__.__name__))
        elif mode != 'train' and self.save_path:
            self.load_path = self.save_path
            log.warning(
                "No load path is set for {} in '{}' mode. Using save path instead"
                .format(self.__class__.__name__, mode))
        else:
            self.load_path = None
            log.warning("No load path is set for {}!".format(
                self.__class__.__name__))
Example no. 8
    def __init__(self,
                 bert_config_file: str,
                 n_tags: List[str],
                 keep_prob: float,
                 attention_probs_keep_prob: float = None,
                 hidden_keep_prob: float = None,
                 encoder_layer_ids: List[int] = tuple(range(12)),
                 optimizer: str = None,
                 num_warmup_steps: int = None,
                 weight_decay_rate: float = 0.01,
                 return_probas: bool = False,
                 pretrained_bert: str = None,
                 min_learning_rate: float = 1e-06,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        self.return_probas = return_probas
        self.n_tags = n_tags
        self.min_learning_rate = min_learning_rate
        self.keep_prob = keep_prob
        self.optimizer = optimizer
        self.encoder_layer_ids = encoder_layer_ids
        self.num_warmup_steps = num_warmup_steps
        self.weight_decay_rate = weight_decay_rate

        self.bert_config = BertConfig.from_json_file(
            str(expand_path(bert_config_file)))

        if attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob

        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
        self.sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.sess_config)

        self._init_graph()

        self._init_optimizer()

        self.sess.run(tf.global_variables_initializer())

        if pretrained_bert is not None:
            pretrained_bert = str(expand_path(pretrained_bert))

            if tf.train.checkpoint_exists(pretrained_bert) \
                    and not tf.train.checkpoint_exists(str(self.load_path.resolve())):
                logger.info('[initializing model with Bert from {}]'.format(
                    pretrained_bert))
                # Exclude optimizer and classification variables from saved variables
                var_list = self._get_saveable_variables(
                    exclude_scopes=('Optimizer', 'learning_rate', 'momentum',
                                    'ner'))
                saver = tf.train.Saver(var_list)
                saver.restore(self.sess, pretrained_bert)

        if self.load_path is not None:
            self.load()
Example no. 9
    def load(self, fname=None):
        if fname is not None:
            self.load_path = fname

        if self.pretrained_bert:
            self.pretrained_bert = str(expand_path(self.pretrained_bert))
            config = AutoConfig.from_pretrained(self.pretrained_bert,
                                                num_labels=self.n_classes,
                                                output_attentions=False,
                                                output_hidden_states=False)
            self.model = AutoModelForTokenClassification.from_pretrained(
                self.pretrained_bert, config=config)
        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.bert_config = AutoConfig.from_json_file(
                str(expand_path(self.bert_config_file)))

            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = AutoModelForTokenClassification(
                config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)

        self.optimizer = getattr(torch.optim, self.optimizer_name)(
            self.model.parameters(), **self.optimizer_parameters)
        if self.lr_scheduler_name is not None:
            self.lr_scheduler = getattr(torch.optim.lr_scheduler,
                                        self.lr_scheduler_name)(
                                            self.optimizer,
                                            **self.lr_scheduler_parameters)

        if self.load_path:
            log.info(f"Load path {self.load_path} is given.")
            if isinstance(self.load_path,
                          Path) and not self.load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = Path(self.load_path.resolve())
            weights_path = weights_path.with_suffix(".pth.tar")
            if weights_path.exists():
                log.info(f"Load path {weights_path} exists.")
                log.info(
                    f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights, optimizer from saved
                log.info(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                self.model.load_state_dict(checkpoint["model_state_dict"])
                self.optimizer.load_state_dict(
                    checkpoint["optimizer_state_dict"])
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                log.info(
                    f"Init from scratch. Load path {weights_path} does not exist."
                )
Example no. 10
def run_population(population, evolution, gpus):
    """
    Change save and load paths for the obtained population, save config.json with each model config,
    and run the population with the current Python executable (the one evolve.py was run with)
    on the given devices (-1 means CPU, other integers are GPU ids visible to evolve.py).
    Args:
        population: list of dictionaries - configs of current population
        evolution: ParamsEvolution
        gpus: list of given devices (list of integers)

    Returns:
        None
    """
    population_size = len(population)
    for k in range(population_size // len(gpus) + 1):
        procs = []
        for j in range(len(gpus)):
            i = k * len(gpus) + j
            if i < population_size:
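                # resolve this candidate's save directory and dump its config there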
                save_path = expand_path(
                    Path(
                        evolution.get_value_from_config(
                            population[i],
                            evolution.main_model_path + ["save_path"])).parent)

                save_path.mkdir(parents=True, exist_ok=True)
                f_name = save_path.joinpath("config.json")
                save_json(population[i], f_name)

                with save_path.joinpath('out.txt').open('w', encoding='utf8') as outlog,\
                        save_path.joinpath('err.txt').open('w', encoding='utf8') as errlog:
                    env = dict(os.environ)
                    if len(gpus) > 1 or gpus[0] != -1:
                        env['CUDA_VISIBLE_DEVICES'] = str(gpus[j])

                    procs.append(
                        Popen("{} -m deeppavlov train {}".format(
                            sys.executable, str(f_name)),
                              shell=True,
                              stdout=outlog,
                              stderr=errlog,
                              env=env))
        for j, proc in enumerate(procs):
            i = k * len(gpus) + j
            log.info(f'Waiting on {i}th proc')
            if proc.wait() != 0:
                save_path = expand_path(
                    Path(
                        evolution.get_value_from_config(
                            population[i],
                            evolution.main_model_path + ["save_path"])).parent)

                with save_path.joinpath('err.txt').open(
                        encoding='utf8') as errlog:
                    log.warning(
                        f'Population {i} returned an error code {proc.returncode} and an error log:\n'
                        + errlog.read())
    return None
Example no. 11
 def __init__(self, pop_dict_path: str, load_path: str, top_n: int = 3, active: bool = True,
              **kwargs) -> None:
     pop_dict_path = expand_path(pop_dict_path)
     logger.info(f"Reading popularity dictionary from {pop_dict_path}")
     self.pop_dict = read_json(pop_dict_path)
     self.mean_pop = np.mean(list(self.pop_dict.values()))
     load_path = expand_path(load_path)
     logger.info(f"Loading popularity ranker from {load_path}")
     self.clf = joblib.load(load_path)
     self.top_n = top_n
     self.active = active
Example no. 13
    def __init__(self, bert_config_file, n_classes, keep_prob,
                 one_hot_labels=False, multilabel=False, return_probas=False,
                 attention_probs_keep_prob=None, hidden_keep_prob=None,
                 optimizer=None, num_warmup_steps=None, weight_decay_rate=0.01,
                 pretrained_bert=None, min_learning_rate=1e-06, **kwargs) -> None:
        super().__init__(**kwargs)

        self.return_probas = return_probas
        self.n_classes = n_classes
        self.min_learning_rate = min_learning_rate
        self.keep_prob = keep_prob
        self.one_hot_labels = one_hot_labels
        self.multilabel = multilabel
        self.optimizer = optimizer
        self.num_warmup_steps = num_warmup_steps
        self.weight_decay_rate = weight_decay_rate

        if self.multilabel and not self.one_hot_labels:
            raise RuntimeError('Use one-hot encoded labels for multilabel classification!')

        if self.multilabel and not self.return_probas:
            raise RuntimeError('Set return_probas to True for multilabel classification!')

        self.bert_config = BertConfig.from_json_file(str(expand_path(bert_config_file)))

        if attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob

        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
        self.sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.sess_config)

        self._init_graph()

        self._init_optimizer()

        self.sess.run(tf.global_variables_initializer())

        if pretrained_bert is not None:
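            # initialize from the pre-trained BERT checkpoint only if no trained model checkpoint exists yet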
            pretrained_bert = str(expand_path(pretrained_bert))

            if tf.train.checkpoint_exists(pretrained_bert) \
                    and not (self.load_path and tf.train.checkpoint_exists(str(self.load_path.resolve()))):
                logger.info('[initializing model with Bert from {}]'.format(pretrained_bert))
                # Exclude optimizer and classification variables from saved variables
                var_list = self._get_saveable_variables(
                    exclude_scopes=('Optimizer', 'learning_rate', 'momentum', 'output_weights', 'output_bias'))
                saver = tf.train.Saver(var_list)
                saver.restore(self.sess, pretrained_bert)

        if self.load_path is not None:
            self.load()
Example no. 14
    def __init__(self,
                 bert_config_file,
                 keep_prob=0.9,
                 attention_probs_keep_prob=None,
                 hidden_keep_prob=None,
                 optimizer=None,
                 weight_decay_rate=0.01,
                 pretrained_bert=None,
                 min_learning_rate=1e-06,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        self.min_learning_rate = min_learning_rate
        self.keep_prob = keep_prob
        self.optimizer = optimizer
        self.weight_decay_rate = weight_decay_rate

        self.bert_config = BertConfig.from_json_file(
            str(expand_path(bert_config_file)))

        if attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob

        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
        self.sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.sess_config)

        self._init_graph()

        self._init_optimizer()

        if pretrained_bert is not None:
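            # init_from_checkpoint rewrites variable initializers, so it must run before global_variables_initializer below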
            pretrained_bert = str(expand_path(pretrained_bert))

            if tf.train.checkpoint_exists(pretrained_bert) \
                    and not tf.train.checkpoint_exists(str(self.load_path.resolve())):
                logger.info('[initializing model with Bert from {}]'.format(
                    pretrained_bert))
                # Exclude optimizer and classification variables from saved variables
                var_list = self._get_saveable_variables(
                    exclude_scopes=('Optimizer', 'learning_rate', 'momentum',
                                    'output_weights', 'output_bias'))
                assignment_map = self.get_variables_to_restore(
                    var_list, pretrained_bert)
                tf.train.init_from_checkpoint(pretrained_bert, assignment_map)

        self.sess.run(tf.global_variables_initializer())

        if self.load_path is not None:
            self.load()
Example no. 15
    def __init__(self,
                 embedder: Component,
                 tokenizer: Component = None,
                 pad_zero: bool = False,
                 mean: bool = False,
                 tags_vocab_path: str = None,
                 vectorizer: Component = None,
                 counter_vocab_path: str = None,
                 idf_base_count: int = 100,
                 log_base: int = 10,
                 min_idf_weight=0.0,
                 **kwargs) -> None:
        """
        Initialize embedder with given parameters.
        """
        self.embedder = embedder
        self.dim = self.embedder.dim
        self.mean = mean
        self.pad_zero = pad_zero

        if tokenizer is None:
            self.tokenizer = self.space_detokenizer
        else:
            self.tokenizer = tokenizer

        if vectorizer and counter_vocab_path:
            raise ConfigError(
                "TfidfWeightedEmbedder got vectorizer and counter_vocab_path simultaneously."
                " Remove one of them, please")
        elif vectorizer:
            self.vectorizer = vectorizer
            self.vocabulary = np.array(
                self.vectorizer.model.get_feature_names())
        elif counter_vocab_path:
            self.counter_vocab_path = expand_path(counter_vocab_path)
            self.counter_vocab, self.min_count = self.load_counter_vocab(
                self.counter_vocab_path)
            self.idf_base_count = idf_base_count
            self.log_base = log_base
            self.min_idf_weight = min_idf_weight
        else:
            raise ConfigError(
                "TfidfWeightedEmbedder did not get vectorizer or counter_vocab_path."
                " Set one of them, please")

        if tags_vocab_path:
            self.tags_vocab = self.load_tags_vocab(
                expand_path(tags_vocab_path))
        else:
            self.tags_vocab = None
Example no. 16
    def __init__(self,
                 bert_config_file: str,
                 keep_prob: float,
                 attention_probs_keep_prob: Optional[float] = None,
                 hidden_keep_prob: Optional[float] = None,
                 optimizer: Optional[str] = None,
                 weight_decay_rate: Optional[float] = 0.01,
                 pretrained_bert: Optional[str] = None,
                 min_learning_rate: float = 1e-06,
                 **kwargs) -> None:
        super().__init__(**kwargs)

        self.min_learning_rate = min_learning_rate
        self.keep_prob = keep_prob
        self.optimizer = optimizer
        self.weight_decay_rate = weight_decay_rate

        self.bert_config = BertConfig.from_json_file(
            str(expand_path(bert_config_file)))

        if attention_probs_keep_prob is not None:
            self.bert_config.attention_probs_dropout_prob = 1.0 - attention_probs_keep_prob
        if hidden_keep_prob is not None:
            self.bert_config.hidden_dropout_prob = 1.0 - hidden_keep_prob

        self.sess_config = tf.ConfigProto(allow_soft_placement=True)
        self.sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.sess_config)

        self._init_graph()

        self._init_optimizer()

        self.sess.run(tf.global_variables_initializer())

        if pretrained_bert is not None:
            pretrained_bert = str(expand_path(pretrained_bert))

            if tf.train.checkpoint_exists(pretrained_bert) \
                    and not (self.load_path and tf.train.checkpoint_exists(str(self.load_path.resolve()))):
                logger.info('[initializing model with Bert from {}]'.format(
                    pretrained_bert))
                var_list = self._get_saveable_variables(
                    exclude_scopes=('Optimizer', 'learning_rate', 'momentum',
                                    'squad'))
                saver = tf.train.Saver(var_list)
                saver.restore(self.sess, pretrained_bert)

        if self.load_path is not None:
            self.load()
Example no. 17
    def __init__(self,
                 save_path: str,
                 load_path: str,
                 max_sequence_length: int,
                 max_token_length: int,
                 padding: str = 'post',
                 truncating: str = 'post',
                 token_embeddings: bool = True,
                 char_embeddings: bool = False,
                 char_pad: str = 'post',
                 char_trunc: str = 'post',
                 tok_dynamic_batch: bool = False,
                 char_dynamic_batch: bool = False,
                 update_embeddings: bool = False):

        self.max_sequence_length = max_sequence_length
        self.token_embeddings = token_embeddings
        self.char_embeddings = char_embeddings
        self.max_token_length = max_token_length
        self.padding = padding
        self.truncating = truncating
        self.char_pad = char_pad
        self.char_trunc = char_trunc
        self.tok_dynamic_batch = tok_dynamic_batch
        self.char_dynamic_batch = char_dynamic_batch
        self.upd_embs = update_embeddings

        save_path = expand_path(save_path).resolve().parent
        load_path = expand_path(load_path).resolve().parent

        self.char_save_path = save_path / "char2int.dict"
        self.char_load_path = load_path / "char2int.dict"
        self.tok_save_path = save_path / "tok2int.dict"
        self.tok_load_path = load_path / "tok2int.dict"
        self.cont_save_path = save_path / "cont2toks.dict"
        self.cont_load_path = load_path / "cont2toks.dict"
        self.resp_save_path = save_path / "resp2toks.dict"
        self.resp_load_path = load_path / "resp2toks.dict"
        self.cemb_save_path = str(save_path / "context_embs.npy")
        self.cemb_load_path = str(load_path / "context_embs.npy")
        self.remb_save_path = str(save_path / "response_embs.npy")
        self.remb_load_path = str(load_path / "response_embs.npy")

        self.int2tok_vocab = {}
        self.tok2int_vocab = {}
        self.response2toks_vocab = {}
        self.response2emb_vocab = {}
        self.context2toks_vocab = {}
        self.context2emb_vocab = {}
Example no. 19
    def __init__(self,
                 freq_dict_filename: str,
                 candidate_nouns: int = 10,
                 **kwargs):
        """

        Args:
            freq_dict_filename: file with the dictionary of Russian words with the corresponding frequencies
            candidate_nouns: how many candidate nouns to leave after search
            **kwargs:
        """
        self.candidate_nouns = candidate_nouns
        alphabet = "абвгдеёжзийклмнопрстуфхцчшщъыьэюя-"
        self.alphabet_length = len(alphabet)
        self.max_word_length = 24
        self.letter_nums = {letter: num for num, letter in enumerate(alphabet)}
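        # the frequency dictionary is tab-separated: word, POS tag and frequency per line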
        with open(str(expand_path(freq_dict_filename)), 'r') as fl:
            lines = fl.readlines()
        pos_freq_dict = defaultdict(list)
        for line in lines:
            line_split = line.strip('\n').split('\t')
            if re.match(r"[\d]+\.[\d]+", line_split[2]):
                pos_freq_dict[line_split[1]].append(
                    (line_split[0], float(line_split[2])))
        self.nouns_with_freq = pos_freq_dict["s"] + pos_freq_dict["s.PROP"]
        self.adj_set = set([word for word, freq in pos_freq_dict["a"]])
        self.nouns = [noun[0] for noun in self.nouns_with_freq]

        self.matrix = self.make_sparse_matrix(self.nouns).transpose()
Example no. 20
    def __init__(self, spec: str, elmo_output_names: Optional[List] = None,
                 dim: Optional[int] = None, pad_zero: bool = False,
                 concat_last_axis: bool = True, max_token: Optional[int] = None,
                 mini_batch_size: int = 32, **kwargs) -> None:

        self.spec = spec if '://' in spec else str(expand_path(spec))

        self.elmo_output_dims = {'word_emb': 512,
                                 'lstm_outputs1': 1024,
                                 'lstm_outputs2': 1024,
                                 'elmo': 1024,
                                 'default': 1024}
        elmo_output_names = elmo_output_names or ['default']
        self.elmo_output_names = elmo_output_names
        elmo_output_names_set = set(self.elmo_output_names)
        if elmo_output_names_set - set(self.elmo_output_dims.keys()):
            log.error(f'Incorrect elmo_output_names = {elmo_output_names} . You can use either  ["default"] or some of'
                      '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"]')
            sys.exit(1)

        if elmo_output_names_set - {'default'} and elmo_output_names_set - {"word_emb", "lstm_outputs1",
                                                                            "lstm_outputs2", "elmo"}:
            log.error('Incompatible conditions: you can use either  ["default"] or list of '
                      '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"] ')
            sys.exit(1)

        self.pad_zero = pad_zero
        self.concat_last_axis = concat_last_axis
        self.max_token = max_token
        self.mini_batch_size = mini_batch_size
        self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load()
        self.dim = self._get_dims(self.elmo_output_names, dim, concat_last_axis)
Example no. 21
    def __init__(self, data_path: Union[Path, str], *args, **kwargs):
        log.info(f"Initializing `{self.__class__.__name__}`")
        data_path = Path(expand_path(data_path))
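        # all three obscenity word lists must be present in data_path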
        required_files = [
            'obscenity_words.json', 'obscenity_words_exception.json',
            'obscenity_words_extended.json'
        ]
        for file in required_files:
            if not (data_path / file).exists():
                raise FileNotFoundError(errno.ENOENT,
                                        os.strerror(errno.ENOENT),
                                        str(data_path / file))

        self.obscenity_words = set(
            json.load(
                open(data_path / 'obscenity_words.json', encoding="utf-8")))
        self.obscenity_words_extended = set(
            json.load(
                open(data_path / 'obscenity_words_extended.json',
                     encoding="utf-8")))
        self.obscenity_words_exception = set(
            json.load(
                open(data_path / 'obscenity_words_exception.json',
                     encoding="utf-8")))
        self.obscenity_words.update(self.obscenity_words_extended)

        PATTERN_1, PATTERN_2 = self._get_patterns()
        self.regexp = re.compile(PATTERN_1, re.U | re.I)
        self.regexp2 = re.compile(PATTERN_2, re.U | re.I)
        self.morph = pymorphy2.MorphAnalyzer()
        self.word_pattern = re.compile(r'[А-яЁё]+')
Example no. 22
    def __init__(self,
                 data_dir: str = '',
                 data_url: str = DB_URL,
                 batch_size: int = None,
                 shuffle: bool = None,
                 seed: int = None,
                 **kwargs):
        """
        :param data_dir: a directory name where DB is located
        :param data_url: a URL to the SQLite DB
        :param batch_size: a batch size for reading from the database
        """
        download_dir = expand_path(data_dir)
        download_path = download_dir.joinpath(data_url.split("/")[-1])
        download(download_path, data_url, force_download=False)

        # if not download_dir.exists() or is_empty(download_dir):
        #     logger.info('[downloading wiki.db from {} to {}]'.format(data_url, download_path))
        #     download(download_path, data_url)

        self.connect = sqlite3.connect(str(download_path),
                                       check_same_thread=False)
        self.db_name = self.get_db_name()
        self.doc_ids = self.get_doc_ids()
        self.doc2index = self.map_doc2idx()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random = Random(seed)
Example no. 23
    def __init__(self,
                 squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en',
                 **kwargs) -> None:
        config = json.load(open(squad_model_config))
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = AutoTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError('only en and ru languages are supported')
Example no. 24
    def __init__(self,
                 dictionary: StaticDictionary,
                 window=1,
                 lm_file=None,
                 *args,
                 **kwargs):

        super().__init__(*args, **kwargs)
        self.costs = defaultdict(itertools.repeat(float('-inf')).__next__)
        self.dictionary = dictionary
        self.window = window
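        # pick the candidate search strategy depending on the window size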
        if self.window == 0:
            self.find_candidates = self._find_candidates_window_0
        else:
            self.find_candidates = self._find_candidates_window_n
        self.costs[('', '')] = log(1)
        self.costs[('⟬', '⟬')] = log(1)
        self.costs[('⟭', '⟭')] = log(1)

        for c in self.dictionary.alphabet:
            self.costs[(c, c)] = log(1)
        # if self.ser_path.is_file():
        self.load()

        if lm_file:
            self.lm = kenlm.Model(str(expand_path(lm_file)))
            self.beam_size = 4
            self.candidates_count = 4
            self._infer_instance = self._infer_instance_lm
Example no. 25
 def save(self) -> None:
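     # save a CPU copy of the encoder weights, then move the encoder back to its device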
     encoder_weights_path = expand_path(
         self.encoder_save_path).with_suffix(".pth.tar")
     log.info(f"Saving encoder to {encoder_weights_path}.")
     torch.save({"model_state_dict": self.encoder.cpu().state_dict()},
                encoder_weights_path)
     self.encoder.to(self.device)
Example no. 26
 def _load_actions2slots_formfilling_info_from_json(self,
                                                    actions_required_acquired_slots_path: Optional[Union[str, Path]] = None)\
         -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
     """
     Loads the form-filling mapping of actions onto their required and acquired slots from a json file of the following structure:
     {action1: {"required": [required_slot_name_1], "acquired": [acquired_slot_name_1, acquired_slot_name_2]},
      action2: {"required": [required_slot_name_21, required_slot_name_22], "acquired": [acquired_slot_name_21]},
     ..}
     Returns:
          two dictionaries mapping each action to its required and to its acquired slots
     """
     actions_required_acquired_slots_path = expand_path(
         actions_required_acquired_slots_path)
     with open(actions_required_acquired_slots_path,
               encoding="utf-8") as actions2slots_json_f:
         actions2slots = json.load(actions2slots_json_f)
         actions2required_slots = {
             act: act_slots["required"]
             for act, act_slots in actions2slots.items()
         }
         actions2acquired_slots = {
             act: act_slots["acquired"]
             for act, act_slots in actions2slots.items()
         }
     return actions2required_slots, actions2acquired_slots
Example no. 27
    def read(self, data_path: str,
             **kwargs) -> Dict[str, List[Dict[str, Union[int, List[int]]]]]:
        """Read the InsuranceQA data from files and forms the dataset.

        Args:
            data_path: A path to a folder where dataset files are stored.
            **kwargs: Other parameters.

        Returns:
            A dictionary containing training, validation and test parts of the dataset obtainable via
            ``train``, ``valid`` and ``test`` keys.
        """

        data_path = expand_path(data_path)
        self._download_data(data_path)
        dataset = {'train': None, 'valid': None, 'test': None}
        train_fname = Path(
            data_path) / 'insuranceQA-master/V1/question.train.token_idx.label'
        valid_fname = Path(
            data_path
        ) / 'insuranceQA-master/V1/question.dev.label.token_idx.pool'
        test_fname = Path(
            data_path
        ) / 'insuranceQA-master/V1/question.test1.label.token_idx.pool'
        self.idxs2cont_vocab = self._build_context2toks_vocabulary(
            train_fname, valid_fname, test_fname)
        dataset["valid"] = self._preprocess_data_valid_test(valid_fname)
        dataset["train"] = self._preprocess_data_train(train_fname)
        dataset["test"] = self._preprocess_data_valid_test(test_fname)

        return dataset
Example no. 28
    def __init__(self,
                 sparql_queries_filename: str,
                 lang: str = "rus",
                 adj_to_noun: RuAdjToNoun = None,
                 **kwargs):
        """

        Args:
            sparql_queries_filename: file with sparql query templates
            lang: english or russian
            adj_to_noun: component deeppavlov.models.kbqa.tree_to_sparql:RuAdjToNoun
            **kwargs:
        """
        self.lang = lang
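        # language-specific question words and markers used to analyse the parsed question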
        if self.lang == "rus":
            self.q_pronouns = {
                "какой", "какая", "какое", "каком", "каким", "какую", "кто",
                "что", "как", "когда", "где", "чем", "сколько"
            }
            self.how_many = "сколько"
            self.change_root_tokens = {"каким был", "какой была"}
            self.temporal_order_tokens = {"первый", "последний"}
        elif self.lang == "eng":
            self.q_pronouns = {"what", "who", "how", "when", "where", "which"}
            self.how_many = "how many"
            self.change_root_tokens = ""
            self.temporal_order_tokens = {"first", "last"}
        else:
            raise ValueError(f"unsupported language {lang}")
        self.sparql_queries_filename = expand_path(sparql_queries_filename)
        self.template_queries = read_json(self.sparql_queries_filename)
        self.adj_to_noun = adj_to_noun
        self.morph = pymorphy2.MorphAnalyzer()
Example no. 29
    def __init__(self, chainer_config: dict, *, batch_size: int = -1,
                 metrics: Iterable[Union[str, dict]] = ('accuracy',),
                 evaluation_targets: Iterable[str] = ('valid', 'test'),
                 show_examples: bool = False,
                 tensorboard_log_dir: Optional[Union[str, Path]] = None,
                 max_test_batches: int = -1,
                 **kwargs) -> None:
        if kwargs:
            log.info(f'{self.__class__.__name__} got additional init parameters {list(kwargs)} that will be ignored:')
        self.chainer_config = chainer_config
        self._chainer = Chainer(chainer_config['in'], chainer_config['out'], chainer_config.get('in_y'))
        self.batch_size = batch_size
        self.metrics = parse_metrics(metrics, self._chainer.in_y, self._chainer.out_params)
        self.evaluation_targets = tuple(evaluation_targets)
        self.show_examples = show_examples

        self.max_test_batches = None if max_test_batches < 0 else max_test_batches

        self.tensorboard_log_dir: Optional[Path] = tensorboard_log_dir
        if tensorboard_log_dir is not None:
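            # enable tensorboard logging only if tensorflow can actually be imported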
            try:
                # noinspection PyPackageRequirements
                # noinspection PyUnresolvedReferences
                import tensorflow
            except ImportError:
                log.warning('TensorFlow could not be imported, so tensorboard log directory '
                            f'`{self.tensorboard_log_dir}` will be ignored')
                self.tensorboard_log_dir = None
            else:
                self.tensorboard_log_dir = expand_path(tensorboard_log_dir)
                self._tf = tensorflow

        self._built = False
        self._saved = False
        self._loaded = False
Example no. 30
    def load(self) -> None:
        if self.pretrained_bert:
            log.info(f"From pretrained {self.pretrained_bert}.")
            self.pretrained_bert = str(expand_path(self.pretrained_bert))
            self.config = AutoConfig.from_pretrained(self.pretrained_bert,
                                                     output_hidden_states=True)
            self.encoder = AutoModel.from_pretrained(self.pretrained_bert,
                                                     config=self.config)

        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            self.config = AutoConfig.from_json_file(
                str(expand_path(self.bert_config_file)))
            self.encoder = AutoModel.from_config(config=self.config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")
        self.encoder.to(self.device)
Example no. 31
 def __init__(self, tags_file, **kwargs):
     tags_file = str(expand_path(tags_file))
     self.tags_list = []
     with open(tags_file, "r") as fl:
         lines = fl.readlines()
         for line in lines:
             self.tags_list.append(line.strip().split()[0])
Example no. 32
    def __init__(self, spec: str, elmo_output_names: Optional[List] = None, dim: Optional[int] = None, pad_zero: bool = False,
                 concat_last_axis: bool = True, max_token: Optional[int] = None, mini_batch_size: int = 32, **kwargs) -> None:

        self.spec = spec if '://' in spec else str(expand_path(spec))

        self.elmo_output_dims = {'word_emb': 512,
                                 'lstm_outputs1': 1024,
                                 'lstm_outputs2': 1024,
                                 'elmo': 1024,
                                 'default': 1024}
        elmo_output_names = elmo_output_names if elmo_output_names else ['default']
        self.elmo_output_names = elmo_output_names
        elmo_output_names_set = set(self.elmo_output_names)
        if elmo_output_names_set - set(self.elmo_output_dims.keys()):
            log.error(f'Incorrect elmo_output_names = {elmo_output_names} . You can use either  ["default"] or some of'\
                       '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"]')
            sys.exit(1)

        if elmo_output_names_set - set(['default']) and elmo_output_names_set - set(["word_emb", "lstm_outputs1",
                                                                                     "lstm_outputs2", "elmo"]):
            log.error('Incompatible conditions: you can use either  ["default"] or list of '\
                      '["word_emb", "lstm_outputs1", "lstm_outputs2","elmo"] ')
            sys.exit(1)

        self.pad_zero = pad_zero
        self.concat_last_axis = concat_last_axis
        self.max_token = max_token
        self.mini_batch_size = mini_batch_size
        self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load()
        self.dim = self._get_dims(self.elmo_output_names, dim, concat_last_axis)
Example no. 33
    def __init__(self,
                 vocabs_path,
                 save_path,
                 load_path,
                 max_sequence_length,
                 padding="post",
                 truncating="post",
                 max_token_length=None,
                 token_embeddings=True,
                 char_embeddings=False,
                 char_pad="post",
                 char_trunc="post",
                 tok_dynamic_batch=False,
                 char_dynamic_batch=False,
                 update_embeddings=False):

        super().__init__(save_path, load_path, max_sequence_length,
                         max_token_length, padding, truncating,
                         token_embeddings, char_embeddings, char_pad,
                         char_trunc, tok_dynamic_batch, char_dynamic_batch,
                         update_embeddings)

        vocabs_path = expand_path(vocabs_path)
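        # InsuranceQA V1 vocabulary and token-index files are expected under vocabs_path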
        self.int2tok_fname = Path(vocabs_path) / 'vocabulary'
        self.response2ints_fname = Path(
            vocabs_path) / 'answers.label.token_idx'
        self.train_context2ints_fname = Path(
            vocabs_path) / 'question.train.token_idx.label'
        self.val_context2ints_fname = Path(
            vocabs_path) / 'question.dev.label.token_idx.pool'
        self.test_context2ints_fname = Path(
            vocabs_path) / 'question.test1.label.token_idx.pool'
Example no. 34
def run_population(population, evolution, gpus):
    """
    Change save and load paths for the obtained population, save config.json with each model config,
    and run the population with the current Python executable (the one evolve.py was run with)
    on the given devices (-1 means CPU, other integers are GPU ids visible to evolve.py).
    Args:
        population: list of dictionaries - configs of current population
        evolution: ParamsEvolution
        gpus: list of given devices (list of integers)

    Returns:
        None
    """
    population_size = len(population)
    for k in range(population_size // len(gpus) + 1):
        procs = []
        for j in range(len(gpus)):
            i = k * len(gpus) + j
            if i < population_size:
                save_path = expand_path(
                    evolution.get_value_from_config(parse_config(population[i]),
                                                    evolution.path_to_models_save_path))

                save_path.mkdir(parents=True, exist_ok=True)
                f_name = save_path / "config.json"
                save_json(population[i], f_name)

                with save_path.joinpath('out.txt').open('w', encoding='utf8') as outlog,\
                        save_path.joinpath('err.txt').open('w', encoding='utf8') as errlog:
                    env = dict(os.environ)
                    if len(gpus) > 1 or gpus[0] != -1:
                        env['CUDA_VISIBLE_DEVICES'] = str(gpus[j])

                    procs.append(Popen("{} -m deeppavlov train {}".format(sys.executable, str(f_name)),
                                       shell=True, stdout=outlog, stderr=errlog, env=env))
        for j, proc in enumerate(procs):
            i = k * len(gpus) + j
            log.info(f'Waiting on {i}th proc')
            if proc.wait() != 0:
                save_path = expand_path(
                    evolution.get_value_from_config(parse_config(population[i]),
                                                    evolution.path_to_models_save_path))
                with save_path.joinpath('err.txt').open(encoding='utf8') as errlog:
                    log.warning(f'Population {i} returned an error code {proc.returncode} and an error log:\n' +
                                errlog.read())
    return None
Example no. 35
    def __init__(self, spec: str, dim: int = 1024, pad_zero: bool = False, mean: bool = False,
                **kwargs) -> None:

        self.spec = spec if '://' in spec else str(expand_path(spec))
        self.dim = dim
        self.pad_zero = pad_zero
        self.mean = mean
        self.elmo_outputs, self.sess, self.tokens_ph, self.tokens_length_ph = self._load()
Example no. 36
 def _load_options(self, options_json_path):
     if options_json_path:
         options_json_path = expand_path(options_json_path)
         with open(options_json_path, 'r') as fin:
             options = json.load(fin)
     else:
         options = {}
     return options
Example no. 37
def from_params(params: Dict, mode: str = 'infer', serialized: Any = None, **kwargs) -> Component:
    """Builds and returns the Component from corresponding dictionary of parameters."""
    # what is passed in json:
    config_params = {k: _resolve(v) for k, v in params.items()}

    # get component by reference (if any)
    if 'ref' in config_params:
        try:
            component = _refs[config_params['ref']]
            if serialized is not None:
                component.deserialize(serialized)
            return component
        except KeyError:
            e = ConfigError('Component with id "{id}" was referenced but not initialized'
                            .format(id=config_params['ref']))
            log.exception(e)
            raise e

    elif 'config_path' in config_params:
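        # the component is a whole nested pipeline: build it from its own config,
        # temporarily clearing and then restoring the outer _refs registry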
        from deeppavlov.core.commands.infer import build_model
        refs = _refs.copy()
        _refs.clear()
        config = parse_config(expand_path(config_params['config_path']))
        model = build_model(config, serialized=serialized)
        _refs.clear()
        _refs.update(refs)
        return model

    cls_name = config_params.pop('class_name', None)
    if not cls_name:
        e = ConfigError('Component config has no `class_name` nor `ref` fields')
        log.exception(e)
        raise e
    cls = get_model(cls_name)

    # find the submodels params recursively
    config_params = {k: _init_param(v, mode) for k, v in config_params.items()}

    try:
        spec = inspect.getfullargspec(cls)
        if 'mode' in spec.args+spec.kwonlyargs or spec.varkw is not None:
            kwargs['mode'] = mode

        component = cls(**dict(config_params, **kwargs))
        try:
            _refs[config_params['id']] = component
        except KeyError:
            pass
    except Exception:
        log.exception("Exception in {}".format(cls))
        raise

    if serialized is not None:
        component.deserialize(serialized)
    return component
Example no. 38
    def __init__(self, save_path: Union[str, Path], load_path: Optional[Union[str, Path]] = None, mode: str = 'infer',
                 *args, **kwargs) -> None:

        if save_path:
            self.save_path = expand_path(save_path)
            self.save_path.parent.mkdir(parents=True, exist_ok=True)
        else:
            self.save_path = None

        if load_path:
            self.load_path = expand_path(load_path)
            if mode != 'train' and self.save_path and self.load_path != self.save_path:
                log.warning("Load path '{}' differs from save path '{}' in '{}' mode for {}."
                            .format(self.load_path, self.save_path, mode, self.__class__.__name__))
        elif mode != 'train' and self.save_path:
            self.load_path = self.save_path
            log.warning("No load path is set for {} in '{}' mode. Using save path instead"
                        .format(self.__class__.__name__, mode))
        else:
            self.load_path = None
            log.warning("No load path is set for {}!".format(self.__class__.__name__))
Example no. 39
def get_config_downloads(config: Union[str, Path, dict]) -> Set[Tuple[str, Path]]:
    config = parse_config(config)

    downloads = set()
    if 'metadata' in config and 'download' in config['metadata']:
        for resource in config['metadata']['download']:
            if isinstance(resource, str):
                resource = {
                    'url': resource
                }

            url = resource['url']
            dest = expand_path(resource.get('subdir', ''))

            downloads.add((url, dest))

    config_references = [expand_path(config_ref) for config_ref in get_all_elems_from_json(config, 'config_path')]

    downloads |= {(url, dest) for config in config_references for url, dest in get_config_downloads(config)}

    return downloads
Example no. 40
    def __init__(self, data_dir: str = '', data_url: str = DB_URL, batch_size: int = None,
                 shuffle: bool = None, seed: int = None, **kwargs):

        download_dir = expand_path(data_dir)
        download_path = download_dir.joinpath(data_url.split("/")[-1])
        download(download_path, data_url, force_download=False)

        self.connect = sqlite3.connect(str(download_path), check_same_thread=False)
        self.db_name = self.get_db_name()
        self.doc_ids = self.get_doc_ids()
        self.doc2index = self.map_doc2idx()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random = Random(seed)
Example no. 41
    def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str,
                 context_limit: int = 450, question_limit: int = 150, char_limit: int = 16,
                 level: str = 'token', *args, **kwargs):
        self.emb_folder = expand_path(emb_folder)
        self.level = level
        self.emb_url = emb_url
        self.emb_file_name = Path(emb_url).name
        self.save_path = expand_path(save_path)
        self.load_path = expand_path(load_path)
        self.context_limit = context_limit
        self.question_limit = question_limit
        self.char_limit = char_limit
        self.loaded = False

        self.NULL = "<NULL>"
        self.OOV = "<OOV>"

        self.emb_folder.mkdir(parents=True, exist_ok=True)

        self.emb_dim = self.emb_mat = self.token2idx_dict = None

        if self.load_path.exists():
            self.load()
Esempio n. 42
0
    def __init__(self,
                 tokenizer: Component,
                 tracker: Tracker,
                 network_parameters: Dict[str, Any],
                 template_path: str,
                 template_type: str = "DefaultTemplate",
                 word_vocab: Component = None,
                 bow_embedder: Component = None,
                 embedder: Component = None,
                 slot_filler: Component = None,
                 intent_classifier: Component = None,
                 database: Component = None,
                 api_call_action: str = None,  # TODO: make it optional
                 use_action_mask: bool = False,
                 debug: bool = False,
                 load_path: str = None,
                 save_path: str = None,
                 **kwargs):
        super().__init__(load_path=load_path, save_path=save_path, **kwargs)

        self.tokenizer = tokenizer
        self.tracker = tracker
        self.bow_embedder = bow_embedder
        self.embedder = embedder
        self.slot_filler = slot_filler
        self.intent_classifier = intent_classifier
        self.use_action_mask = use_action_mask
        self.debug = debug
        self.word_vocab = word_vocab

        template_path = expand_path(template_path)
        template_type = getattr(templ, template_type)
        log.info("[loading templates from {}]".format(template_path))
        self.templates = templ.Templates(template_type).load(template_path)
        self.n_actions = len(self.templates)
        log.info("{} templates loaded".format(self.n_actions))

        self.database = database
        self.api_call_id = None
        if api_call_action is not None:
            self.api_call_id = self.templates.actions.index(api_call_action)

        self.intents = []
        if callable(self.intent_classifier):
            # intent_classifier returns (y_labels, y_probs)
            self.intents = list(self.intent_classifier(["hi"])[1][0].keys())

        self.network = self._init_network(network_parameters)

        self.reset()
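
The intent-name extraction above relies on the classifier returning (y_labels, y_probs). A small sketch with a stand-in classifier (the real intent_classifier is a trained component; this one is purely illustrative):

def fake_intent_classifier(utterances):
    """Return (labels, per-utterance probability dicts), mimicking the expected interface."""
    labels = ['greet'] * len(utterances)
    probs = [{'greet': 0.9, 'bye': 0.05, 'thanks': 0.05} for _ in utterances]
    return labels, probs

intents = list(fake_intent_classifier(["hi"])[1][0].keys())
# -> ['greet', 'bye', 'thanks']
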
Esempio n. 43
0
def change_savepath_for_model(config):
    params_helper = ParamsSearch()

    dirs_for_saved_models = set()
    for p in params_helper.find_model_path(config, SAVE_PATH_ELEMENT_NAME):
        p.append(SAVE_PATH_ELEMENT_NAME)
        save_path = Path(params_helper.get_value_from_config(config, p))
        new_save_path = save_path.parent / TEMP_DIR_FOR_CV / save_path.name

        dirs_for_saved_models.add(expand_path(new_save_path.parent))

        params_helper.insert_value_or_dict_into_config(config, p, str(new_save_path))

    return config, dirs_for_saved_models
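
The path rewrite itself is simple; here it is in isolation with plain pathlib (the value of TEMP_DIR_FOR_CV is an assumption, it is defined elsewhere in the module):

from pathlib import Path

TEMP_DIR_FOR_CV = 'cv_tmp'   # assumed value, for illustration only

save_path = Path('my_model/model.weights')
new_save_path = save_path.parent / TEMP_DIR_FOR_CV / save_path.name
# -> my_model/cv_tmp/model.weights
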
Esempio n. 44
0
    def __init__(self,
                 preprocess: Component,
                 save_path: str,
                 load_path: str,
                 entropy_fields: list,
                 min_similarity: float = 0.5,
                 min_entropy: float = 0.5,
                 **kwargs) -> None:

        self.preprocess = preprocess
        self.save_path = expand_path(save_path)

        if isinstance(load_path, list):
            self.load_path: List = [expand_path(path) for path in load_path]
        else:
            self.load_path: List = [expand_path(load_path)]

        self.min_similarity = min_similarity
        self.min_entropy = min_entropy
        self.entropy_fields = entropy_fields
        self.ec_data: List = []
        if kwargs.get('mode') != 'train':
            self.load()
Esempio n. 45
0
def run_population(population, evolution, gpus):
    """
    Change save and load paths for the obtained population, save a config.json with each model config,
    and run the population with the current Python executable (the one evolve.py was launched with)
    on the given devices (-1 means CPU; other integers are GPU ids visible to evolve.py).

    Args:
        population: list of dictionaries with configs of the current population
        evolution: ParamsEvolution
        gpus: list of given devices (list of integers)

    Returns:
        None
    """
    population_size = len(population)
    for k in range(population_size // len(gpus) + 1):
        procs = []
        for j in range(len(gpus)):
            i = k * len(gpus) + j
            if i < population_size:
                save_path = expand_path(Path(evolution.get_value_from_config(
                    population[i], evolution.main_model_path + ["save_path"])).parent)

                save_path.mkdir(parents=True, exist_ok=True)
                f_name = save_path.joinpath("config.json")
                save_json(population[i], f_name)

                if len(gpus) == 1 and gpus[0] == -1:
                    cmd = "{} -m deeppavlov train {} 1>{}/out.txt 2>{}/err.txt".format(
                        sys.executable, str(f_name), str(save_path), str(save_path))
                else:
                    cmd = "CUDA_VISIBLE_DEVICES={} {} -m deeppavlov train {} 1>{}/out.txt 2>{}/err.txt".format(
                        gpus[j], sys.executable, str(f_name), str(save_path), str(save_path))
                procs.append(Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE))
        for j, proc in enumerate(procs):
            i = k * len(gpus) + j
            log.info(f'Waiting on {i}th proc')
            proc.wait()
    return None
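
The nested loops assign population members to devices round-robin; the index arithmetic in isolation:

population_size, gpus = 5, [0, 1]
for k in range(population_size // len(gpus) + 1):
    for j in range(len(gpus)):
        i = k * len(gpus) + j
        if i < population_size:
            print(f"candidate {i} -> device {gpus[j]}")
# candidate 0 -> device 0
# candidate 1 -> device 1
# candidate 2 -> device 0
# candidate 3 -> device 1
# candidate 4 -> device 0
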
Esempio n. 46
0
    def read(self, data_path: str, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the dataset for ranking or paraphrase identification with Siamese networks.

        Args:
            data_path: A path to a folder with dataset files.
        """

        dataset = {'train': None, 'valid': None, 'test': None}
        data_path = expand_path(data_path)
        train_fname = data_path / 'train.csv'
        valid_fname = data_path / 'valid.csv'
        test_fname = data_path / 'test.csv'
        dataset["train"] = self._preprocess_data_train(train_fname)
        dataset["valid"] = self._preprocess_data_valid_test(valid_fname)
        dataset["test"] = self._preprocess_data_valid_test(test_fname)
        return dataset
Esempio n. 47
0
    def read(self,
             data_path: str,
             seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the paraphraser.ru dataset from files.

        Args:
            data_path: A path to a folder with dataset files.
            seed: Random seed.
        """

        data_path = expand_path(data_path)
        train_fname = data_path / 'paraphrases.xml'
        test_fname = data_path / 'paraphrases_gold.xml'
        train_data = self.build_data(train_fname)
        test_data = self.build_data(test_fname)
        dataset = {"train": train_data, "valid": [], "test": test_data}
        return dataset
Esempio n. 48
0
    def read(self,
             data_path: str,
             seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the pretraining dataset for the paraphrase identification task from files.

        Args:
            data_path: A path to a folder with dataset files.
            seed: Random seed.
        """

        data_path = expand_path(data_path)
        train_fname = data_path / 'paraphraser_pretrain_train.json'
        test_fname = data_path / 'paraphraser_pretrain_val.json'
        train_data = self.build_data(train_fname)
        test_data = self.build_data(test_fname)
        dataset = {"train": train_data, "valid": test_data, "test": test_data}
        return dataset
Esempio n. 49
0
    def __init__(self, data: Dict[str, List[Tuple]], dataset_path: str, seed: int = None, shuffle: bool = False):
        self.shuffle = shuffle
        self.random = Random(seed)
        # TODO: include slot vals to dstc2.tar.gz
        dataset_path = expand_path(dataset_path) / 'slot_vals.json'
        self._build_slot_vals(dataset_path)
        with open(dataset_path, encoding='utf8') as f:
            self._slot_vals = json.load(f)
        for data_type in ['train', 'test', 'valid']:
            bio_markup_data = self._preprocess(data.get(data_type, []))
            setattr(self, data_type, bio_markup_data)
        self.data = {
            'train': self.train,
            'valid': self.valid,
            'test': self.test,
            'all': self.train + self.test + self.valid
        }
        self.shuffle = shuffle
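
The _preprocess call converts raw utterances into BIO markup using the slot values loaded from slot_vals.json. Its body is not shown here; the function below is only a hypothetical sketch of such a conversion, not the library's implementation, and the slots format is assumed.

def to_bio(tokens, slots):
    """slots: {slot_name: [slot values]}, e.g. {'food': ['thai']} (format assumed)."""
    tags = ['O'] * len(tokens)
    for slot, values in slots.items():
        for value in values:
            value_toks = value.split()
            for start in range(len(tokens) - len(value_toks) + 1):
                if tokens[start:start + len(value_toks)] == value_toks:
                    tags[start] = 'B-' + slot
                    for k in range(1, len(value_toks)):
                        tags[start + k] = 'I-' + slot
    return tags

to_bio('i want cheap thai food'.split(), {'pricerange': ['cheap'], 'food': ['thai']})
# -> ['O', 'O', 'B-pricerange', 'B-food', 'O']
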
Esempio n. 50
0
    def __init__(self, vocabs_path, save_path, load_path,
                 max_sequence_length, padding="post", truncating="post",
                 max_token_length=None, token_embeddings=True, char_embeddings=False,
                 char_pad="post", char_trunc="post",
                 tok_dynamic_batch=False, char_dynamic_batch=False, update_embeddings=False):

        super().__init__(save_path, load_path,
                         max_sequence_length, max_token_length,
                         padding, truncating,
                         token_embeddings, char_embeddings,
                         char_pad, char_trunc,
                         tok_dynamic_batch, char_dynamic_batch, update_embeddings)

        vocabs_path = expand_path(vocabs_path)
        self.int2tok_fname = Path(vocabs_path) / 'vocabulary'
        self.response2ints_fname = Path(vocabs_path) / 'answers.label.token_idx'
        self.train_context2ints_fname = Path(vocabs_path) / 'question.train.token_idx.label'
        self.val_context2ints_fname = Path(vocabs_path) / 'question.dev.label.token_idx.pool'
        self.test_context2ints_fname = Path(vocabs_path) / 'question.test1.label.token_idx.pool'
Esempio n. 51
0
    def read(self, data_path: str,
             seed: int = None, *args, **kwargs) -> Dict[str, List[Tuple[Tuple[str, str], int]]]:
        data_path = expand_path(data_path)
        fname = data_path / 'train.csv'
        contexts = []
        responses = []
        labels = []
        with open(fname, 'r') as f:
            reader = csv.reader(f)
            next(reader)
            for el in reader:
                contexts.append(el[-3].replace('\n', '').lower())
                responses.append(el[-2].replace('\n', '').lower())
                labels.append(int(el[-1]))
        data = list(zip(contexts, responses))
        data = list(zip(data, labels))
        data = {"train": data,
                "valid": [],
                "test": []}
        return data
Esempio n. 52
0
    def read(self, data_path: str,
             *args, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the Ubuntu V2 dataset from csv files.

        Args:
            data_path: A path to a folder with dataset csv files.
        """

        data_path = expand_path(data_path)
        dataset = {'train': None, 'valid': None, 'test': None}
        train_fname = Path(data_path) / 'train.csv'
        valid_fname = Path(data_path) / 'valid.csv'
        test_fname = Path(data_path) / 'test.csv'
        self.sen2int_vocab = {}
        self.classes_vocab_train = {}
        self.classes_vocab_valid = {}
        self.classes_vocab_test = {}
        dataset["train"] = self.preprocess_data_train(train_fname)
        dataset["valid"] = self.preprocess_data_validation(valid_fname)
        dataset["test"] = self.preprocess_data_validation(test_fname)
        return dataset
Esempio n. 53
0
    def __init__(self, data_dir: Union[Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            log.info('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i+1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.info('built')
        else:
            log.info('Loading a dictionary from {}'.format(data_dir))

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
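
The words_trie built above maps every prefix to the set of prefixes that are one character longer, so the dictionary can be walked character by character. The same loop on a tiny word set:

from collections import defaultdict

words = {'cat', 'car', 'dog'}
words_trie = defaultdict(set)
for word in words:
    for i in range(len(word)):
        words_trie[word[:i]].add(word[:i + 1])
    words_trie[word] = set()
words_trie = {k: sorted(v) for k, v in words_trie.items()}
# words_trie['']    == ['c', 'd']
# words_trie['ca']  == ['car', 'cat']
# words_trie['cat'] == []
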
Esempio n. 54
0
    def read(self, data_path: str, **kwargs) -> Dict[str, List[Tuple[List[str], int]]]:
        """Read the InsuranceQA V1 dataset from files.

        Args:
            data_path: A path to a folder with dataset files.
        """

        data_path = expand_path(data_path)
        dataset = {'train': None, 'valid': None, 'test': None}
        train_fname = data_path / 'insuranceQA-master/V1/question.train.token_idx.label'
        valid_fname = data_path / 'insuranceQA-master/V1/question.dev.label.token_idx.pool'
        test_fname = data_path / 'insuranceQA-master/V1/question.test1.label.token_idx.pool'
        int2tok_fname = data_path / 'insuranceQA-master/V1/vocabulary'
        response2ints_fname = data_path / 'insuranceQA-master/V1/answers.label.token_idx'
        self.int2tok_vocab = self._build_int2tok_vocab(int2tok_fname)
        self.idxs2cont_vocab = self._build_context2toks_vocab(train_fname, valid_fname, test_fname)
        self.response2str_vocab = self._build_response2str_vocab(response2ints_fname)
        dataset["valid"] = self._preprocess_data_valid_test(valid_fname)
        dataset["train"] = self._preprocess_data_train(train_fname)
        dataset["test"] = self._preprocess_data_valid_test(test_fname)

        return dataset
Esempio n. 55
0
    def __init__(self, load_path: Union[str, Path], batch_size: Optional[int] = None,
                 shuffle: Optional[bool] = None, seed: Optional[int] = None, **kwargs) -> None:

        load_path = str(expand_path(load_path))
        logger.info("Connecting to database, path: {}".format(load_path))
        try:
            self.connect = sqlite3.connect(load_path, check_same_thread=False)
        except sqlite3.OperationalError as e:
            e.args = e.args + ("Check that DB path exists and is a valid DB file",)
            raise e
        try:
            self.db_name = self.get_db_name()
        except TypeError as e:
            e.args = e.args + (
                'Check that DB path was created correctly and is not empty. '
                'Check that a correct dataset_format is passed to the ODQAReader config',)
            raise e
        self.doc_ids = self.get_doc_ids()
        self.doc2index = self.map_doc2idx()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random = Random(seed)
Esempio n. 56
0
def predict_with_model(config_path: Union[Path, str]) -> List[Optional[List[str]]]:
    """Returns predictions of morphotagging model given in config :config_path:.

    Args:
        config_path: a path to config

    Returns:
        a list of morphological analyses for each sentence. Each analysis is either a list of tags
        or a list of full CONLL-U descriptions.

    """
    config = parse_config(config_path)

    reader_config = config['dataset_reader']
    reader = get_model(reader_config['class_name'])()
    data_path = expand_path(reader_config.get('data_path', ''))
    read_params = {k: v for k, v in reader_config.items() if k not in ['class_name', 'data_path']}
    data: Dict = reader.read(data_path, **read_params)

    iterator_config = config['dataset_iterator']
    iterator: MorphoTaggerDatasetIterator = from_params(iterator_config, data=data)

    model = build_model(config, load_trained=True)
    answers = [None] * len(iterator.test)
    batch_size = config['predict'].get("batch_size", -1)
    for indexes, (x, _) in iterator.gen_batches(
            batch_size=batch_size, data_type="test", shuffle=False, return_indexes=True):
        y = model(x)
        for i, elem in zip(indexes, y):
            answers[i] = elem
    outfile = config['predict'].get("outfile")
    if outfile is not None:
        outfile = Path(outfile)
        if not outfile.exists():
            outfile.parent.mkdir(parents=True, exist_ok=True)
        with open(outfile, "w", encoding="utf8") as fout:
            for elem in answers:
                fout.write(elem + "\n")
    return answers
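
The indexes returned by gen_batches are what lets the loop above write predictions back into their original sentence order even if batching regroups the data; the bookkeeping in isolation:

answers = [None] * 4
batches = [([2, 0], ['tag_c', 'tag_a']),   # (original indexes, model outputs)
           ([3, 1], ['tag_d', 'tag_b'])]
for indexes, y in batches:
    for i, elem in zip(indexes, y):
        answers[i] = elem
# answers == ['tag_a', 'tag_b', 'tag_c', 'tag_d']
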
Esempio n. 57
0
    def read(self, data_path: str, catalog: list, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Load data from specific catalog

        Parameters:
            data_path: where the dataset is located
            catalog: names of the specific subcategories

        Returns:
            dataset: loaded dataset
        """

        logger.info(f"Ecommerce loader is loaded with catalog {catalog}")

        if not isinstance(catalog, list):
            catalog = [catalog]

        ec_data_global: List[Any] = []
        data_path = Path(expand_path(data_path))

        if not is_done(data_path):
            self._download_data(data_path)

        if data_path.is_dir():
            for fname in data_path.rglob("*.txt"):
                if any(cat in fname.name for cat in catalog):
                    logger.info(f"File {fname.name} is loaded")
                    ec_data_global += self._load_amazon_ecommerce_file(fname)

        dataset = {
            'train': [((item['Title'], [], {}), item) for item in ec_data_global],
            'valid': [],
            'test':  []
            }

        logger.info(f"In total {len(ec_data_global)} items are loaded")
        return dataset
Esempio n. 58
0
    def load(self) -> None:
        """Load classifier parameters"""
        logger.info("Loading faq_logreg_model from {}".format(self.load_path))
        self.logreg = load_pickle(expand_path(self.load_path))
Esempio n. 59
0
    def save(self) -> None:
        """Save classifier parameters"""
        logger.info("Saving faq_logreg_model to {}".format(self.save_path))
        path = expand_path(self.save_path)
        make_all_dirs(path)
        save_pickle(self.logreg, path)
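
save_pickle, load_pickle and make_all_dirs are library utilities whose bodies are not shown on this page; the stand-ins below only illustrate the assumed round trip (a plain dict replaces the fitted logreg model):

import pickle
import tempfile
from pathlib import Path

def make_all_dirs(path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)

def save_pickle(obj, path: Path) -> None:
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_pickle(path: Path):
    with open(path, 'rb') as f:
        return pickle.load(f)

model = {'classes': ['faq_1', 'faq_2']}   # stand-in for the fitted model
path = Path(tempfile.mkdtemp()) / 'faq' / 'logreg.pkl'
make_all_dirs(path)
save_pickle(model, path)
assert load_pickle(path) == model
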