def build_vocab_generator(self, generators: List[CorpusGenerator]) -> None:
    """
    Build the token -> index vocabulary from the given corpus generators.

    Tokens are counted across every generator (reading labels when
    ``self.build_vocab_from_labels`` is True, sentences otherwise) and added
    to the vocab in descending-frequency order, skipping tokens seen fewer
    than ``self.min_count`` times. Does nothing when ``self.vocab2idx`` is
    already populated.

    Args:
        generators: corpus generators yielding ``(sentence, label)`` pairs.
    """
    if self.vocab2idx:
        return

    vocab2idx = self._initial_vocab_dic

    # collections.Counter replaces the hand-rolled count-dict loop.
    token2count = collections.Counter()
    for gen in generators:
        for sentence, label in tqdm.tqdm(gen, desc="Preparing text vocab dict"):
            target = label if self.build_vocab_from_labels else sentence
            token2count.update(target)

    # most_common() yields (token, count) pairs sorted by count descending
    # and is stable for ties -- identical ordering to the previous
    # sorted(..., key=itemgetter(1), reverse=True) + OrderedDict dance.
    for token, token_count in token2count.most_common():
        if token not in vocab2idx and token_count >= self.min_count:
            vocab2idx[token] = len(vocab2idx)

    self.vocab2idx = vocab2idx
    self.idx2vocab = {idx: token for token, idx in self.vocab2idx.items()}

    top_k_vocab = list(self.vocab2idx)[:10]
    logger.debug(
        f"--- Build vocab dict finished, Total: {len(self.vocab2idx)} ---"
    )
    logger.debug(f"Top-10: {top_k_vocab}")
def get_seq_length_from_corpus(self,
                               generators: List[CorpusGenerator],
                               *,
                               use_label: bool = False,
                               cover_rate: float = 0.95) -> int:
    """
    Calculate proper sequence length according to the corpus

    Args:
        generators: corpus generators yielding (sentence, label) pairs
        use_label: measure label sequences instead of sentences
        cover_rate: fraction of samples the chosen length should cover

    Returns:
        the sequence length covering ``cover_rate`` of all samples
    """
    lengths = []
    for generator in generators:
        for sentence, label in tqdm.tqdm(generator,
                                         desc="Calculating sequence length"):
            sample = label if use_label else sentence
            lengths.append(len(sample))

    # cover_rate == 1.0 means "take the longest sample".
    if cover_rate == 1.0:
        target_index = -1
    else:
        target_index = int(cover_rate * len(lengths))

    lengths.sort()
    sequence_length = lengths[target_index]
    logger.debug(f'Calculated sequence length = {sequence_length}')
    return sequence_length
def load_data(
        cls,
        subset_name: str = 'train',
        shuffle: bool = True) -> Tuple[List[List[str]], List[List[str]]]:
    """
    Load dataset as sequence labeling format, char level tokenized

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.

    Returns:
        dataset_features and dataset labels
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=K.DATA_PATH,
                           untar=True)

    # Any subset name other than train/test falls back to the dev split.
    subset_files = {'train': 'example.train', 'test': 'example.test'}
    file_name = subset_files.get(subset_name, 'example.dev')
    file_path = os.path.join(corpus_path, file_name)

    x_data, y_data = DataReader.read_conll_format_file(file_path)
    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logger.debug(
        f"loaded {len(x_data)} samples from {file_path}. Sample:\n"
        f"x[0]: {x_data[0]}\n"
        f"y[0]: {y_data[0]}")
    return x_data, y_data
def embed(self,
          sentences: List[List[str]],
          *,
          debug: bool = False) -> np.ndarray:
    """
    batch embed sentences

    Args:
        sentences: Sentence list to embed
        debug: show debug info
    Returns:
        vectorized sentence list

    Raises:
        ValueError: when no text processor has been attached yet.
    """
    if self._text_processor is None:
        raise ValueError(
            'Need to setup the `embedding.setup_text_processor` before calling the embed function.'
        )

    # Vectorize the raw token sequences before feeding the embed model.
    tensor_x = self._text_processor.transform(sentences,
                                              segment=self.segment,
                                              seq_length=self.max_position)
    if debug:
        logger.debug(f'sentence tensor: {tensor_x}')
    return self.embed_model.predict(tensor_x)
def test_with_model(self):
    """Build a BiGRU model on top of the embedding, train one epoch and save it."""
    x, y = SMP2018ECDTCorpus.load_data('test')
    embedding = self.build_embedding()

    model = BiGRU_Model(embedding=embedding)
    model.build_model(x, y)

    # Collect the keras summary lines so they go through the logger
    # instead of stdout.
    summary_lines: list = []
    embedding.embed_model.summary(print_fn=summary_lines.append)
    logger.debug('\n'.join(summary_lines))

    model.fit(x, y, epochs=1)

    save_path = os.path.join(tempfile.gettempdir(), str(time.time()))
    model.save(save_path)
def load_data(cls,
              subset_name: str = 'train',
              shuffle: bool = True,
              cutter: str = 'char') -> Tuple[List[List[str]], List[str]]:
    """
    Load dataset as sequence classification format, char level tokenized

    Args:
        subset_name: {train, test, valid}
        shuffle: should shuffle or not, default True.
        cutter: sentence cutter, {char, jieba, none}

    Returns:
        dataset_features and dataset labels

    Raises:
        ValueError: if ``cutter`` is not one of {char, jieba, none}.
        ModuleNotFoundError: if ``cutter='jieba'`` and jieba is not installed.
    """
    corpus_path = get_file(cls.__corpus_name__,
                           cls.__zip_file__name,
                           cache_dir=K.DATA_PATH,
                           untar=True)

    if cutter not in ['char', 'jieba', 'none']:
        # Fixed typo ("one onf") and listed every accepted value.
        raise ValueError(
            'cutter error, please use one of the {char, jieba, none}')

    df_path = os.path.join(corpus_path, f'{subset_name}.csv')
    df = pd.read_csv(df_path)
    if cutter == 'jieba':
        try:
            import jieba
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "please install jieba, `$ pip install jieba`")
        x_data = [list(jieba.cut(item)) for item in df['query'].to_list()]
    elif cutter == 'char':
        x_data = [list(item) for item in df['query'].to_list()]
    else:
        # BUG FIX: cutter == 'none' passed validation above but previously
        # fell through without assigning x_data, raising NameError below.
        # 'none' now returns the raw (uncut) query strings.
        x_data = df['query'].to_list()
    y_data = df['label'].to_list()

    if shuffle:
        x_data, y_data = utils.unison_shuffled_copies(x_data, y_data)
    logger.debug(f"loaded {len(x_data)} samples from {df_path}. Sample:\n"
                 f"x[0]: {x_data[0]}\n"
                 f"y[0]: {y_data[0]}")
    return x_data, y_data
def predict(self,
            x_data: TextSamplesVar,
            *,
            batch_size: int = 32,
            truncating: bool = False,
            predict_kwargs: Dict = None) -> List[List[str]]:
    """
    Generates output predictions for the input samples.

    Computation is done in batches.

    Args:
        x_data: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).
        batch_size: Integer. If unspecified, it will default to 32.
        truncating: remove values from sequences larger than `model.embedding.sequence_length`
        predict_kwargs: arguments passed to :meth:`tf.keras.Model.predict`

    Returns:
        array(s) of predictions.
    """
    if predict_kwargs is None:
        predict_kwargs = {}
    with kashgari.utils.custom_object_scope():
        if truncating:
            seq_length = self.sequence_length
        else:
            seq_length = None
        # BUG FIX: was a stray `print(self.crf_layer)` debug statement
        # writing to stdout; route it through the logger like the rest.
        logger.debug('crf layer: {}'.format(self.crf_layer))
        tensor = self.text_processor.transform(
            x_data,
            segment=self.embedding.segment,
            seq_length=seq_length,
            max_position=self.embedding.max_position)
        logger.debug('predict seq_length: {}, input: {}'.format(
            seq_length, np.array(tensor).shape))
        pred = self.tf_model.predict(tensor,
                                     batch_size=batch_size,
                                     verbose=1,
                                     **predict_kwargs)
        pred = pred.argmax(-1)
        lengths = [len(sen) for sen in x_data]
        res: List[List[str]] = self.label_processor.inverse_transform(
            pred,  # type: ignore
            lengths=lengths)
        logger.debug('predict output: {}'.format(np.array(pred).shape))
        logger.debug('predict output argmax: {}'.format(pred))
    return res
def load_embed_vocab(self) -> Optional[Dict[str, int]]:
    """
    Load the gensim word2vec model and build the token -> index mapping.

    Indices 0-3 are reserved for the [PAD]/[UNK]/[BOS]/[EOS] special
    tokens; word2vec vectors fill the weight matrix from row 4 onward.
    [UNK] gets a random vector, the remaining special rows stay zero.
    """
    w2v = KeyedVectors.load_word2vec_format(self.w2v_path, **self.w2v_kwargs)

    special_tokens = ['[PAD]', '[UNK]', '[BOS]', '[EOS]']
    token2idx = {token: index for index, token in enumerate(special_tokens)}
    for token in w2v.index2word:
        token2idx[token] = len(token2idx)

    vector_matrix = np.zeros((len(token2idx), w2v.vector_size))
    vector_matrix[1] = np.random.rand(w2v.vector_size)  # [UNK] row
    vector_matrix[4:] = w2v.vectors  # aligned with index2word order above

    self.embedding_size = w2v.vector_size
    self.w2v_matrix = vector_matrix

    w2v_top_words = w2v.index2entity[:50]
    logger.debug('------------------------------------------------')
    logger.debug("Loaded gensim word2vec model's vocab")
    logger.debug('model : {}'.format(self.w2v_path))
    logger.debug('word count : {}'.format(len(self.w2v_matrix)))
    logger.debug('Top 50 words : {}'.format(w2v_top_words))
    logger.debug('------------------------------------------------')
    return token2idx
def fit_generator(self,
                  train_sample_gen: CorpusGenerator,
                  valid_sample_gen: CorpusGenerator = None,
                  batch_size: int = 64,
                  epochs: int = 5,
                  callbacks: List['tf.keras.callbacks.Callback'] = None,
                  fit_kwargs: Dict = None) -> 'tf.keras.callbacks.History':
    """
    Trains the model for a given number of epochs with given data generator.

    Data generator must be the subclass of `CorpusGenerator`

    Args:
        train_sample_gen: train data generator.
        valid_sample_gen: valid data generator.
        batch_size: Number of samples per gradient update, default to 64.
        epochs: Number of epochs to train the model.
            An epoch is an iteration over the entire `x` and `y` data provided.
        callbacks: List of `tf.keras.callbacks.Callback` instances.
            List of callbacks to apply during training.
            See `tf.keras.callbacks`.
        fit_kwargs: additional arguments passed to :meth:`tf.keras.Model.fit`

    Returns:
        A :py:class:`tf.keras.callback.History` object. Its `History.history` attribute is
        a record of training loss values and metrics values at successive epochs,
        as well as validation loss values and validation metrics values (if applicable).
    """
    # Make sure the model graph exists before wrapping the generators.
    self.build_model_generator(
        [g for g in [train_sample_gen, valid_sample_gen] if g])

    train_set = BatchDataSet(train_sample_gen,
                             text_processor=self.text_processor,
                             label_processor=self.label_processor,
                             segment=self.embedding.segment,
                             seq_length=self.sequence_length,
                             max_position=self.embedding.max_position,
                             batch_size=batch_size)

    if fit_kwargs is None:
        fit_kwargs = {}

    if valid_sample_gen:
        valid_set = BatchDataSet(valid_sample_gen,
                                 text_processor=self.text_processor,
                                 label_processor=self.label_processor,
                                 segment=self.embedding.segment,
                                 seq_length=self.sequence_length,
                                 max_position=self.embedding.max_position,
                                 batch_size=batch_size)
        fit_kwargs['validation_data'] = valid_set.take()
        fit_kwargs['validation_steps'] = len(valid_set)

    # Log one batch's tensor shapes for debugging.
    for x, y in train_set.take(1):
        logger.debug('fit input shape: {}'.format(np.array(x).shape))
        # BUG FIX: the second message previously reused the "fit input shape"
        # label although it reports the label tensor's shape.
        logger.debug('fit label shape: {}'.format(np.array(y).shape))

    return self.tf_model.fit(train_set.take(),
                             steps_per_epoch=len(train_set),
                             epochs=epochs,
                             callbacks=callbacks,
                             **fit_kwargs)
def load_embed_vocab(self) -> Optional[Dict[str, int]]:
    """
    Read the transformer vocab file and build the token -> index mapping.

    Every stripped line of the vocab file becomes one token; each token is
    also appended to ``self.vocab_list`` in file order.
    """
    token2idx: Dict[str, int] = {}
    with codecs.open(self.vocab_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            self.vocab_list.append(token)
            token2idx[token] = len(token2idx)

    top_words = list(token2idx)[:50]
    logger.debug('------------------------------------------------')
    logger.debug("Loaded transformer model's vocab")
    logger.debug(f'config_path : {self.config_path}')
    logger.debug(f'vocab_path : {self.vocab_path}')
    logger.debug(f'checkpoint_path : {self.checkpoint_path}')
    logger.debug(f'Top 50 words : {top_words}')
    logger.debug('------------------------------------------------')
    return token2idx