def fit(self, data_pack: DataPack, verbose=1):
    """
    Learn the filter and vocabulary state that the transformation relies on.

    The default processor units are chained and applied to the text, the
    filter unit is fitted on (and applied to) the right side, and a
    vocabulary is then built from the result.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: class:`BasicPreprocessor` instance.
    """
    pipeline = chain_transform(self._default_processor_units())
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    # The filter is fitted on un-flattened right-side text only.
    filter_unit = build_unit_from_data_pack(
        self._filter_unit,
        processed,
        flatten=False,
        mode='right',
        verbose=verbose,
    )
    processed = processed.apply_on_text(
        filter_unit.transform, mode='right', verbose=verbose)
    self._context['filter_unit'] = filter_unit

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary
    # One more than the number of indexed terms
    # (presumably reserves a slot for padding -- TODO confirm).
    self._context['vocab_size'] = len(vocabulary.state['term_index']) + 1

    left_shape = (self._fixed_length_left, )
    right_shape = (self._fixed_length_right, )
    self._context['input_shapes'] = [left_shape, right_shape]
    return self
def build_unit_from_data_pack(
    unit: StatefulUnit,
    data_pack: mz.DataPack,
    mode: str = 'both',
    flatten: bool = True,
    verbose: int = 1
) -> StatefulUnit:
    """
    Fit a :class:`StatefulUnit` on the text held by a :class:`DataPack`.

    :param unit: :class:`StatefulUnit` object to be built.
    :param data_pack: The input :class:`DataPack` object.
    :param mode: One of 'left', 'right', and 'both'; forwarded to
        `data_pack.apply_on_text` to select the source text.
    :param flatten: `True` to gather every text into one flat list (each
        text is `extend`-ed into the corpus), `False` to keep one entry per
        text (each text is `append`-ed, giving a list of lists).
    :param verbose: Verbosity. A truthy value also wraps the corpus in a
        `tqdm` progress bar while the unit is fitted.
    :return: The same `unit` object, after `fit` has been called on it.
    """
    corpus = []
    # `apply_on_text` is used purely for its side effect of feeding each
    # text to the collector; its return value is ignored.
    collect = corpus.extend if flatten else corpus.append
    data_pack.apply_on_text(collect, mode=mode, verbose=verbose)

    if verbose:
        description = f'Building {unit.__class__.__name__} from a datapack.'
        corpus = tqdm(corpus, desc=description)

    unit.fit(corpus)
    return unit
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the filter unit and the vocabulary used by the transformation.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: This preprocessor instance.
    """
    # Run the configured units first (per the original note: tokenization,
    # punctuation removal and stop-word removal).
    pipeline = chain_transform(self._units)
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    # Filter out high- and low-frequency words: the statistics are
    # collected first by fitting the filter unit on the right side.
    filter_unit = build_unit_from_data_pack(
        self._filter_unit,
        processed,
        flatten=False,
        mode='right',
        verbose=verbose,
    )
    # Apply the fitted filter to the right side and keep the fitted unit.
    processed = processed.apply_on_text(
        filter_unit.transform, mode='right', verbose=verbose)
    self._context['filter_unit'] = filter_unit

    # Build the vocabulary from the filtered data.
    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary

    term_count = len(vocabulary.state['term_index'])
    self._context['vocab_size'] = term_count
    self._context['embedding_input_dim'] = term_count
    return self
def transform(self, pack: matchzoo.DataPack):
    """
    Replace raw image paths with fixed-length lists of mapped indices.

    Each list in `pack.left["images_left"]` / `pack.right["images_right"]`
    is cut to at most `max_num_left_images` / `max_num_right_images`
    entries, every path is looked up in the matching path-to-index mapping,
    and the result is right-padded with `0` up to that maximum.

    :param pack: Pack whose image columns are rewritten in place.
    :return: The same `pack` object.
    """
    def make_indexer(side: str):
        # Attributes are read lazily, on every call, via their side name.
        mapping_name = side + "_img_path2index"
        limit_name = "max_num_" + side + "_images"

        def to_indices(images: List[str]):
            limit = getattr(self, limit_name)
            mapping = getattr(self, mapping_name)
            indices = [mapping[path] for path in images[:limit]]
            padding = [0] * (limit - len(indices))  # padding
            return indices + padding

        return to_indices

    left_column = pack.left["images_left"]
    pack.left["images_left"] = left_column.apply(make_indexer("left"))

    right_column = pack.right["images_right"]
    pack.right["images_right"] = right_column.apply(make_indexer("right"))
    return pack
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the word-level and character-level vocabularies.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: class:'DIINPreprocessor' instance.
    """
    # Word level: run the configured units, then index the terms.
    word_pipeline = chain_transform(self._units)
    data_pack = data_pack.apply_on_text(
        word_pipeline, mode='both', verbose=verbose)
    vocab_unit = build_vocab_unit(data_pack, verbose=verbose)
    term_count = len(vocab_unit.state['term_index'])
    self._context['vocab_unit'] = vocab_unit
    self._context['vocab_size'] = term_count
    self._context['embedding_input_dim'] = term_count

    # Character level: apply a 1-gram letter unit to the processed text
    # and build a second vocabulary from its output.
    char_splitter = units.NgramLetter(ngram=1, reduce_dim=True)
    data_pack = data_pack.apply_on_text(
        char_splitter.transform, mode='both', verbose=verbose)
    self._context['char_unit'] = build_vocab_unit(data_pack, verbose=verbose)

    left_len = self._fixed_length_left
    right_len = self._fixed_length_right
    word_len = self._fixed_length_word
    self._context['input_shapes'] = [
        (left_len,),
        (right_len,),
        (left_len, word_len,),
        (right_len, word_len,),
        (left_len,),
        (right_len,),
    ]
    return self
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the filter unit and a vocabulary built from the right side only.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: class:`BasicPreprocessor` instance.
    """
    pipeline = chain_transform(self._units)
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    filter_unit = build_unit_from_data_pack(
        self._filter_unit,
        processed,
        flatten=False,
        mode='right',
        verbose=verbose,
    )
    processed = processed.apply_on_text(
        filter_unit.transform, mode='right', verbose=verbose)
    self._context['filter_unit'] = filter_unit

    # The vocabulary relies on the right side only.
    vocabulary = build_vocab_unit(processed, verbose=verbose, mode="right")
    self._context['vocab_unit'] = vocabulary

    # No extra slot is added here (the "+ 1 for padding" was disabled).
    term_count = len(vocabulary.state['term_index'])
    self._context['vocab_size'] = term_count
    self._context['embedding_input_dim'] = term_count

    left_shape = (self._fixed_length_left, )
    right_shape = (self._fixed_length_right, )
    self._context['input_shapes'] = [left_shape, right_shape]
    return self
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create `letter-ngram` representation.

    Works on a copy of the input: the default units run first, then the
    left/right fixed-length units, then a letter n-gram unit which is
    optionally followed by word hashing.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    result = data_pack.copy()

    base_pipeline = chain_transform(self._default_units())
    result.apply_on_text(base_pipeline, inplace=True, verbose=verbose)

    result.apply_on_text(
        self._left_fixedlength_unit.transform,
        mode='left',
        inplace=True,
        verbose=verbose,
    )
    result.apply_on_text(
        self._right_fixedlength_unit.transform,
        mode='right',
        inplace=True,
        verbose=verbose,
    )

    post_units = [units.NgramLetter(reduce_dim=False)]
    if self._with_word_hashing:
        # Word hashing is driven by the term index of the fitted vocabulary.
        vocab_state = self._context['vocab_unit'].state
        post_units.append(units.WordHashing(vocab_state['term_index']))
    result.apply_on_text(
        chain_transform(post_units), inplace=True, verbose=verbose)
    return result
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Turn both text sides into bag-of-words vectors and transform the images.

    Works on a copy of the input. The configured units and the fitted filter
    (right side only) run first; every token list is then replaced by a list
    of `vocab_size` floats, and finally the `images_right` column is passed
    through `self._images_unit.transform`.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    def convert_to_bow(input_: List[str]):
        """Return a 0.0/1.0 list with 1.0 at the index of each token."""
        term_index = self._context['vocab_unit']._state['term_index']
        bow = [0.0] * self._context['vocab_size']
        for token in input_:
            bow[term_index[token]] = 1.0
        return bow

    result = data_pack.copy()
    result.apply_on_text(
        chain_transform(self._units), inplace=True, verbose=verbose)
    result.apply_on_text(
        self._context['filter_unit'].transform,
        mode='right',
        inplace=True,
        verbose=verbose,
    )
    result.apply_on_text(
        convert_to_bow, mode='both', inplace=True, verbose=verbose)

    images = result.right["images_right"]
    result.right['images_right'] = images.progress_apply(
        self._images_unit.transform)
    return result
def __init__(
    self,
    data_pack: mz.DataPack,
    mode='point',
    num_dup: int = 1,
    num_neg: int = 1,
    resample: bool = True,
    batch_size: int = 128,
    shuffle: bool = True,
    callbacks: typing.List[Callback] = None
):
    """
    Init.

    :param data_pack: Source data. In `pair` mode its `relation` attribute
        is replaced by the pair-wise reorganized relation.
    :param mode: One of `point`, `pair` or `list`.
    :param num_dup: Forwarded to the pair-wise reorganization.
    :param num_neg: Forwarded to the pair-wise reorganization.
    :param resample: Stored as-is on the instance.
    :param batch_size: Stored as-is on the instance.
    :param shuffle: Stored as-is on the instance.
    :param callbacks: Optional list of callbacks; `None` becomes `[]`.
    :raises ValueError: If `mode` is not one of the supported values.
    """
    if mode not in ('point', 'pair', 'list'):
        raise ValueError(f"{mode} is not a valid mode type."
                         f"Must be one of `point`, `pair` or `list`.")

    self._mode = mode
    self._num_dup = num_dup
    self._num_neg = num_neg
    self._batch_size = batch_size
    self._shuffle = shuffle
    self._resample = resample
    self._callbacks = [] if callbacks is None else callbacks

    # Keep a handle on the relation as it was before any reorganization.
    self._orig_relation = data_pack.relation
    if mode == 'pair':
        data_pack.relation = self._reorganize_pair_wise(
            self._orig_relation, num_dup=num_dup, num_neg=num_neg)

    self._data_pack = data_pack
    self._batch_indices = None
    self.reset_index()
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data.

    Both text sides are encoded with `self._tokenizer.encode` and the text
    lengths are appended afterwards. The work is done on a copy, so the
    `data_pack` passed in by the caller is left untouched.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as a new :class:`DataPack` object.
    """
    # Copy first: the in-place operations below must not modify the
    # caller's object.
    data_pack = data_pack.copy()
    data_pack.apply_on_text(self._tokenizer.encode,
                            mode='both', inplace=True, verbose=verbose)
    data_pack.append_text_length(inplace=True, verbose=verbose)
    return data_pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create truncated length representation.

    The default units, the fitted vocabulary unit and a post-truncation to
    30 tokens are chained and applied to the text, then the text lengths are
    appended. The work is done on a copy, so the `data_pack` passed in by
    the caller is left untouched.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as a new :class:`DataPack` object.
    """
    # Copy first: the in-place operations below must not modify the
    # caller's object.
    data_pack = data_pack.copy()

    units_ = self._default_units()
    units_.append(self._context['vocab_unit'])
    units_.append(
        units.TruncatedLength(text_length=30, truncate_mode='post'))
    func = chain_transform(units_)
    data_pack.apply_on_text(func, inplace=True, verbose=verbose)
    data_pack.append_text_length(inplace=True, verbose=verbose)
    return data_pack
def __init__(
    self,
    data_pack: mz.DataPack,
    mode='point',
    num_dup: int = 1,
    num_neg: int = 1,
    batch_size: int = 32,
    resample: bool = False,
    shuffle: bool = True,
    sort: bool = False,
    callbacks: typing.List[BaseCallback] = None
):
    """
    Init.

    :param data_pack: Source data; a copy is stored, and in `pair` mode the
        copy's `relation` is replaced by the pair-wise reorganized relation.
    :param mode: One of `point`, `pair` or `list`.
    :param num_dup: Forwarded to the pair-wise reorganization.
    :param num_neg: Forwarded to the pair-wise reorganization.
    :param batch_size: Stored as-is on the instance.
    :param resample: Stored as given, except that it is forced to `False`
        in `point` mode.
    :param shuffle: Stored as-is; must not be combined with `sort`.
    :param sort: Stored as-is; must not be combined with `shuffle`.
    :param callbacks: Optional list of callbacks; `None` becomes `[]`.
    :raises ValueError: If `mode` is not supported, or if both `shuffle`
        and `sort` are truthy.
    """
    callbacks = [] if callbacks is None else callbacks

    if mode not in ('point', 'pair', 'list'):
        raise ValueError(f"{mode} is not a valid mode type."
                         f"Must be one of `point`, `pair` or `list`.")
    if shuffle and sort:
        raise ValueError(f"parameters `shuffle` and `sort` conflict, "
                         f"should not both be `True`.")

    # Work on a private copy so the caller's pack is not modified below.
    own_pack = data_pack.copy()

    self._mode = mode
    self._num_dup = num_dup
    self._num_neg = num_neg
    self._batch_size = batch_size
    self._resample = False if mode == 'point' else resample
    self._shuffle = shuffle
    self._sort = sort
    self._orig_relation = own_pack.relation
    self._callbacks = callbacks

    if mode == 'pair':
        own_pack.relation = self._reorganize_pair_wise(
            relation=self._orig_relation,
            num_dup=num_dup,
            num_neg=num_neg,
        )

    self._data_pack = own_pack
    self._batch_indices = None
    self.reset_index()
def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
    """
    Apply transformation on data, create `tri-letter` representation.

    Works on a copy of the input. The default processor units are used,
    followed by a word-hashing unit when `self._with_word_hashing` is set.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    result = data_pack.copy()

    pipeline_units = self._default_processor_units()
    if self._with_word_hashing:
        # Word hashing is driven by the term index of the fitted vocabulary.
        vocab_state = self._context['vocab_unit'].state
        pipeline_units.append(
            processor_units.WordHashingUnit(vocab_state['term_index']))

    result.apply_on_text(
        chain_transform(pipeline_units), inplace=True, verbose=verbose)
    return result
def data_pack():
    """
    Build a small :class:`DataPack` with two relations.

    The left and right frames are indexed by `id_left` / `id_right` and
    hold lists of integers as their text.
    """
    relation = pd.DataFrame(
        [['qid0', 'did0', 1], ['qid1', 'did1', 0]],
        columns=['id_left', 'id_right', 'label'],
    )
    left = pd.DataFrame(
        [['qid0', [1, 2]], ['qid1', [2, 3]]],
        columns=['id_left', 'text_left'],
    ).set_index('id_left')
    right = pd.DataFrame(
        [['did0', [2, 3, 4]], ['did1', [3, 4, 5]]],
        columns=['id_right', 'text_right'],
    ).set_index('id_right')
    return DataPack(relation=relation, left=left, right=right)
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the filter unit, the vocabulary and, optionally, an n-gram vocabulary.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: class:`BasicPreprocessor` instance.
    """
    processed = data_pack.apply_on_text(
        ChainTransform(self._units),
        multiprocessing=self.multiprocessing,
        verbose=verbose,
    )

    filter_unit = build_unit_from_data_pack(
        self._filter_unit,
        processed,
        flatten=False,
        mode='right',
        verbose=verbose,
    )
    # The fitted filter is always applied without multiprocessing.
    processed = processed.apply_on_text(
        ChainTransform(filter_unit),
        mode='right',
        multiprocessing=False,
        verbose=verbose,
    )
    self._context['filter_unit'] = filter_unit

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    if self.extra_terms:
        # Extend the fitted vocabulary with the caller-supplied terms.
        vocabulary.fit_incrementally(self.extra_terms)
    self._context['vocab_unit'] = vocabulary

    term_count = len(vocabulary.state['term_index'])
    self._context['vocab_size'] = term_count
    self._context['embedding_input_dim'] = term_count

    if not self._ngram_size:
        return self

    # N-gram vocabulary: built from the output of the n-gram process unit.
    processed = processed.apply_on_text(
        ChainTransform(self._context['ngram_process_unit']),
        mode='both',
        multiprocessing=self.multiprocessing,
        verbose=verbose,
    )
    ngram_vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['ngram_vocab_unit'] = ngram_vocabulary
    self._context['ngram_vocab_size'] = len(
        ngram_vocabulary.state['term_index'])
    return self
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the vocabulary unit on text processed by the default units.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: class:`NaivePreprocessor` instance.
    """
    pipeline = chain_transform(self._default_units())
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)
    self._context['vocab_unit'] = build_vocab_unit(
        processed, verbose=verbose)
    return self
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create fixed length representation.

    The default processor units, the fitted vocabulary unit and a
    fixed-length unit (length 30, `pad_mode='post'`) are chained and applied
    to the text.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: The object returned by `data_pack.apply_on_text`.
    """
    pipeline_units = self._default_processor_units()
    pipeline_units.append(self._context['vocab_unit'])
    fixed_length = processor_units.FixedLengthUnit(
        text_length=30, pad_mode='post')
    pipeline_units.append(fixed_length)

    pipeline = chain_transform(pipeline_units)
    return data_pack.apply_on_text(pipeline, verbose=verbose)
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the vocabulary and, optionally, an n-gram vocabulary.

    :param data_pack: data_pack to be preprocessed.
    :param verbose: Verbosity.
    :return: class:`BasicPreprocessor` instance.
    """
    pipeline = chain_transform(self._units)
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    # NOTE: no filter unit is fitted or stored in this variant (that step
    # was commented out), so `self._context['filter_unit']` is not set here.

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary

    term_count = len(vocabulary.state['term_index'])
    self._context['vocab_size'] = term_count
    self._context['embedding_input_dim'] = term_count

    if not self._ngram_size:
        return self

    # N-gram vocabulary: built from the output of the n-gram process unit.
    processed = processed.apply_on_text(
        self._context['ngram_process_unit'].transform,
        mode='both',
        verbose=verbose,
    )
    ngram_vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['ngram_vocab_unit'] = ngram_vocabulary
    self._context['ngram_vocab_size'] = len(
        ngram_vocabulary.state['term_index'])
    return self
def fit(self, data_pack: DataPack, verbose=1):
    """
    Fit the vocabulary and derive the input shapes from its size.

    :param verbose: Verbosity.
    :param data_pack: data_pack to be preprocessed.
    :return: class:`DSSMPreprocessor` instance.
    """
    pipeline = chain_transform(self._default_processor_units())
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary

    # Both inputs share one dimension: number of indexed terms plus one.
    triletter_dim = len(vocabulary.state['term_index']) + 1
    shape = (triletter_dim, )
    self._context['input_shapes'] = [shape, shape]
    return self
def fit(self, data_pack: DataPack, verbose: int = 1):
    """
    Fit the vocabulary on letter n-grams of the processed text.

    :param verbose: Verbosity.
    :param data_pack: Data_pack to be preprocessed.
    :return: class:`CDSSMPreprocessor` instance.
    """
    # Default units followed by a letter n-gram unit.
    pipeline = chain_transform(self._default_units() + [units.NgramLetter()])
    processed = data_pack.apply_on_text(pipeline, verbose=verbose)

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    term_count = len(vocabulary.state['term_index'])

    self._context['vocab_unit'] = vocabulary
    self._context['vocab_size'] = term_count
    self._context['embedding_input_dim'] = term_count
    return self
def fit(self, data_pack: DataPack, verbose=1):
    """
    Fit the letter n-gram vocabulary and derive the input shapes.

    :param verbose: Verbosity.
    :param data_pack: Data_pack to be preprocessed.
    :return: class:`CDSSMPreprocessor` instance.
    """
    # Default processor units followed by a letter n-gram unit.
    pipeline_units = self._default_processor_units()
    pipeline_units.append(processor_units.NgramLetterUnit())
    processed = data_pack.apply_on_text(
        chain_transform(pipeline_units), verbose=verbose)

    vocabulary = build_vocab_unit(processed, verbose=verbose)
    self._context['vocab_unit'] = vocabulary

    # Second dimension of each input: number of indexed terms plus one.
    vocab_size = len(vocabulary.state['term_index']) + 1
    left_shape = (self._fixed_length_left, vocab_size)
    right_shape = (self._fixed_length_right, vocab_size)
    self._context['input_shapes'] = [left_shape, right_shape]
    return self
def transform(self, data_pack: DataPack, verbose=1) -> DataPack:
    """
    Apply transformation on data, create fixed length representation.

    Works on a copy of the input. Text lengths are appended before the
    fixed-length units run and are afterwards capped at the configured
    fixed lengths.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    def cap_at(limit):
        """Return a function that limits a length to at most `limit`."""
        def cap(length):
            return length if length <= limit else limit
        return cap

    result = data_pack.copy()

    base_pipeline = chain_transform(self._default_processor_units())
    result.apply_on_text(base_pipeline, inplace=True, verbose=verbose)
    result.apply_on_text(
        self._context['filter_unit'].transform,
        mode='right',
        inplace=True,
        verbose=verbose,
    )
    result.apply_on_text(
        self._context['vocab_unit'].transform,
        mode='both',
        inplace=True,
        verbose=verbose,
    )

    # Lengths are recorded before the fixed-length units are applied.
    result.append_text_length(inplace=True, verbose=verbose)
    result.apply_on_text(
        self._left_fixedlength_unit.transform,
        mode='left',
        inplace=True,
        verbose=verbose,
    )
    result.apply_on_text(
        self._right_fixedlength_unit.transform,
        mode='right',
        inplace=True,
        verbose=verbose,
    )

    # Cap the recorded lengths at the fixed lengths.
    left_lengths = result.left['length_left']
    result.left['length_left'] = left_lengths.apply(
        cap_at(self._fixed_length_left))
    right_lengths = result.right['length_right']
    result.right['length_right'] = right_lengths.apply(
        cap_at(self._fixed_length_right))
    return result
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create truncated length representation.

    Works on a copy of the input. After the units, filter and vocabulary
    steps, each side is truncated only when its truncated length is set;
    text lengths are then appended and `drop_empty` is called.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    result = data_pack.copy()

    def run(unit, **options):
        """Apply `unit` in place on `result` through a `ChainTransform`."""
        result.apply_on_text(
            ChainTransform(unit), inplace=True, verbose=verbose, **options)

    # Only the first step honours the multiprocessing setting.
    run(self._units, multiprocessing=self.multiprocessing)
    run(self._context['filter_unit'], mode='right')
    run(self._context['vocab_unit'], mode='both')

    if self._truncated_length_left:
        run(self._left_truncatedlength_unit, mode='left')
    if self._truncated_length_right:
        run(self._right_truncatedlength_unit, mode='right')

    result.append_text_length(inplace=True, verbose=verbose)
    result.drop_empty(inplace=True)
    return result
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create truncated length representation.

    Works on a copy of the input. The configured units, the fitted filter
    (right side only) and the fitted vocabulary are applied in that order;
    each side is then truncated only when its truncated length is set, text
    lengths are appended and `drop_empty` is called.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    data_pack = data_pack.copy()
    # `inplace=True` is required here: without it the result of this first
    # step is discarded and the later steps would run on unprocessed text.
    data_pack.apply_on_text(chain_transform(self._units),
                            inplace=True, verbose=verbose)
    data_pack.apply_on_text(self._context['filter_unit'].transform,
                            mode='right', inplace=True, verbose=verbose)
    data_pack.apply_on_text(self._context['vocab_unit'].transform,
                            mode='both', inplace=True, verbose=verbose)
    if self._truncated_length_left:
        data_pack.apply_on_text(self._left_truncatedlength_unit.transform,
                                mode='left', inplace=True, verbose=verbose)
    if self._truncated_length_right:
        data_pack.apply_on_text(self._right_truncatedlength_unit.transform,
                                mode='right', inplace=True, verbose=verbose)
    data_pack.append_text_length(inplace=True, verbose=verbose)
    data_pack.drop_empty(inplace=True)
    return data_pack
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data, create fixed length representation.

    Works on a copy of the input. Besides the regular text columns, the
    right-side columns named by `KeyWordSettings.TextRightInput` and
    `KeyWordSettings.TextRightOutput` are run through the same units,
    vocabulary and right fixed-length unit.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:`DataPack` object.
    """
    def process_decoder_input_output(text: str):
        """Units -> vocabulary -> right fixed-length unit for one text."""
        tokens = chain_transform(self._units)(text)
        indices = self._context['vocab_unit'].transform(tokens)
        return self._right_fixedlength_unit.transform(indices)

    result = data_pack.copy()
    result.apply_on_text(
        chain_transform(self._units), inplace=True, verbose=verbose)

    # NOTE: the filter-unit step and the char-level columns are not applied
    # in this variant (those calls were commented out).

    result.apply_on_text(
        self._context['vocab_unit'].transform,
        mode='both',
        inplace=True,
        verbose=verbose,
    )
    # Lengths are recorded before the fixed-length units are applied.
    result.append_text_length(inplace=True, verbose=verbose)
    result.apply_on_text(
        self._left_fixedlength_unit.transform,
        mode='left',
        inplace=True,
        verbose=verbose,
    )
    result.apply_on_text(
        self._right_fixedlength_unit.transform,
        mode='right',
        inplace=True,
        verbose=verbose,
    )

    decoder_input = result.right[KeyWordSettings.TextRightInput]
    result.right[KeyWordSettings.TextRightInput] = decoder_input.apply(
        process_decoder_input_output)
    decoder_output = result.right[KeyWordSettings.TextRightOutput]
    result.right[KeyWordSettings.TextRightOutput] = decoder_output.apply(
        process_decoder_input_output)

    # Cap the recorded lengths at the fixed lengths.
    left_limit = self._fixed_length_left
    right_limit = self._fixed_length_right
    left_lengths = result.left['length_left']
    result.left['length_left'] = left_lengths.apply(
        lambda length: min(length, left_limit))
    right_lengths = result.right['length_right']
    result.right['length_right'] = right_lengths.apply(
        lambda length: min(length, right_limit))
    return result
def transform(self, data_pack: DataPack, verbose: int = 1) -> DataPack:
    """
    Apply transformation on data.

    Works on a copy of the input and produces, in this order: character
    columns (`char_left` / `char_right`), vocabulary-transformed text,
    exact-match columns on the relation (`match_left` / `match_right`) and
    finally the fixed-length text.

    :param data_pack: Inputs to be preprocessed.
    :param verbose: Verbosity.
    :return: Transformed data as :class:'DataPack' object.
    """
    result = data_pack.copy()
    result.apply_on_text(
        chain_transform(self._units),
        mode='both',
        inplace=True,
        verbose=verbose,
    )

    # --- Character representation -------------------------------------
    char_splitter = units.NgramLetter(ngram=1, reduce_dim=False)
    result.apply_on_text(
        char_splitter.transform,
        rename=('char_left', 'char_right'),
        mode='both',
        inplace=True,
        verbose=verbose,
    )
    char_index = self._context['char_unit'].state['term_index']
    left_char_unit = units.CharacterIndex(
        char_index, self._fixed_length_left, self._fixed_length_word)
    right_char_unit = units.CharacterIndex(
        char_index, self._fixed_length_right, self._fixed_length_word)
    left_chars = result.left['char_left']
    result.left['char_left'] = left_chars.apply(left_char_unit.transform)
    right_chars = result.right['char_right']
    result.right['char_right'] = right_chars.apply(right_char_unit.transform)

    # --- Word representation ------------------------------------------
    result.apply_on_text(
        self._context['vocab_unit'].transform,
        mode='both',
        inplace=True,
        verbose=verbose,
    )

    # --- Exact match representation -----------------------------------
    # Join both text frames onto the relation so each row carries
    # `text_left` and `text_right` together.
    joined = result.relation.join(result.left, on='id_left', how='left')
    joined = joined.join(result.right, on='id_right', how='left')
    left_match_unit = units.WordExactMatch(
        self._fixed_length_left, match='text_left', to_match='text_right')
    right_match_unit = units.WordExactMatch(
        self._fixed_length_right, match='text_right', to_match='text_left')
    result.relation['match_left'] = joined.apply(
        left_match_unit.transform, axis=1)
    result.relation['match_right'] = joined.apply(
        right_match_unit.transform, axis=1)

    # --- Fixed length (runs after the exact-match columns are built) ---
    result.apply_on_text(
        self._left_fixedlength_unit.transform,
        mode='left',
        inplace=True,
        verbose=verbose,
    )
    result.apply_on_text(
        self._right_fixedlength_unit.transform,
        mode='right',
        inplace=True,
        verbose=verbose,
    )
    return result