Example #1
    def __init__(self,
                 fname,
                 new_mask_fname,
                 pad_symbol,
                 symbol_to_mask=None,
                 padding_mode='both',
                 axis=1,
                 **kwargs):
        """
        :param fname: str or list of str names of the fields that should
                            be padded.
        :param new_mask_fname: str or list of str names that will contain
                                masks.
        :param pad_symbol: a symbol that should be used for padding.
        :param symbol_to_mask: a symbol (token) that should be masked in
                               sequences, e.g. it can be used to mask <UNK>
                               tokens.
        :param padding_mode: left, right, or both. Defines the side to which
                             padding symbols should be appended.
        :param axis: defines an axis of data to which padding should be applied.
                     Currently only axes 1 or 2 are supported.
        """
        try:
            validate_field_names(fname)
        except Exception as e:
            raise e

        super(Padder, self).__init__(**kwargs)
        self.fnames = listify(fname)
        self.mask_fnames = listify(new_mask_fname)
        assert (len(self.mask_fnames) == len(self.fnames))
        self.pad_symbol = pad_symbol
        self.symbol_to_mask = symbol_to_mask
        self.padding_mode = padding_mode
        self.axis = axis
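
A minimal usage sketch of this Padder constructor follows; the field names
'tokens' and 'tokens_mask', the pad/mask symbols, and the assumption that no
further base-class arguments are required are all hypothetical.

# Hypothetical example: pad the 'tokens' field on the right and store a mask.
padder = Padder(fname='tokens',
                new_mask_fname='tokens_mask',
                pad_symbol='<PAD>',
                symbol_to_mask='<UNK>',
                padding_mode='right',
                axis=1)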
Example #2
    def __init__(self,
                 field_names,
                 window_size=5,
                 step_size=1,
                 only_full_windows=False,
                 new_window_field_name_suffix='window',
                 **kwargs):
        """
        :param field_names: str or list of str corresponding to fields over
                            which the window should slide.
        :param window_size: the number of elements in each window.
        :param step_size: the number of elements by which the window is
                          shifted on each step.
        :param only_full_windows: if set to True, guarantees that all windows
                                  will be of the same size.
        :param new_window_field_name_suffix: suffix for all newly created fields.
        """
        try:
            validate_field_names(field_names)
        except Exception as e:
            raise e

        super(WindowSlider, self).__init__(**kwargs)
        self.field_names = listify(field_names)
        self.window_size = window_size
        self.step_size = step_size
        self.only_full_windows = only_full_windows
        self.new_windw_fn_suffix = new_window_field_name_suffix
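
A minimal usage sketch of the WindowSlider constructor; the field name
'tokens' is made up, and it is assumed that no additional base-class
arguments are required.

# Hypothetical example: slide a 5-token window, one token at a time.
slider = WindowSlider(field_names='tokens',
                      window_size=5,
                      step_size=1,
                      only_full_windows=True)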
Example #3
    def __init__(self, fnames, **kwargs):
        """
        :param fnames: str or list of str names of the fields that should
                            be selected from data-chunks; all other fields
                            are discarded.
        """
        try:
            validate_field_names(fnames)
        except Exception as e:
            raise e

        super(FieldSelector, self).__init__(**kwargs)
        self.fnames = listify(fnames)
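
A minimal usage sketch of the FieldSelector constructor; the field names
below are made up.

# Hypothetical example: keep only the 'tokens' and 'labels' fields.
selector = FieldSelector(fnames=['tokens', 'labels'])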
Example #4
    def __init__(self, field_name_to_func, **kwargs):
        """
        :param field_name_to_func: a dict mapping field names to functions
                                  of the form x -> y.
        """
        try:
            validate_field_names(list(field_name_to_func.keys()))
        except Exception as e:
            raise e
        for f in field_name_to_func.values():
            if not callable(f):
                raise ValueError(
                    "Please provide all valid callable functions.")

        super(FunctionApplier, self).__init__(**kwargs)
        self.field_name_to_func = field_name_to_func
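
A minimal usage sketch of the FunctionApplier constructor; the field name
'rating' and the chosen function are made up.

# Hypothetical example: cast values of the 'rating' field to int.
applier = FunctionApplier(field_name_to_func={'rating': int})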
Example #5
    def __init__(self,
                 fnames,
                 tokenization_func=lambda x: x.split(),
                 token_cleaning_func=None,
                 token_matching_func=None,
                 lower_case=True,
                 **kwargs):
        """
        :param fnames: str or list of str corresponding to fields that
                            should be tokenized.
        :param tokenization_func: a function that splits string sequences into
                                  sequences of tokens. The form should be:
                                  x -> y where x is a str and y is a list/array
                                  of tokens.
        :param token_cleaning_func: a function responsible for normalizing
                                    tokens, eliminating unwanted characters,
                                    etc. Format: x -> y, where x is a str
                                    token and y is a clean str token.
        :param token_matching_func: a function that matches raw text tokens
                                    to a special set of tokens, e.g. Twitter
                                    emoticons: ':)' -> '<POSIT_EMOT>'.
                                    Format: x -> y, where x is a str token,
                                    and y is either False if the token does
                                    not match, or a str token otherwise.
        :param lower_case: whether to lower-case strings before tokenization.
        """
        try:
            validate_field_names(fnames)
        except Exception as e:
            raise e
        msg = "Please provide a valid callable %s function."
        if not callable(tokenization_func):
            raise ValueError(msg % "tokenization")
        if token_cleaning_func is not None and not callable(
                token_cleaning_func):
            raise ValueError(msg % "token cleaning")
        if token_matching_func is not None and not callable(
                token_matching_func):
            raise ValueError(msg % "token matching")

        super(TokenProcessor, self).__init__(**kwargs)
        self.field_names = listify(fnames)
        self.tokenization_func = tokenization_func
        self.token_cleaning_func = token_cleaning_func
        self.token_matching_func = token_matching_func
        self.lower_case = lower_case
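
A minimal usage sketch of the TokenProcessor constructor; the field name
'text' and the cleaning/matching functions are made up.

# Hypothetical example: whitespace tokenization with simple cleaning/matching.
processor = TokenProcessor(
    fnames='text',
    tokenization_func=lambda x: x.split(),
    token_cleaning_func=lambda t: t.strip('.,!?'),
    token_matching_func=lambda t: '<POSIT_EMOT>' if t == ':)' else False,
    lower_case=True)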
Example #6
    def create(self,
               data_source,
               data_fnames,
               min_count=1,
               max_size=None,
               add_default_special_symbols=True):
        """
        Create vocabulary by passing data_source to the corresponding data-chunk
        iterable and fetching chunks out of it.

        Assumes that tokens are strings; if they are not, it will try to
        convert them to strings.

        :param data_source: dictionary of attributes that should be passed to
                            the data_chunk_iterable.
        :param data_fnames: str or list of str attributes that map
                            to the symbols which should be used to create
                            the vocabulary.
        :param min_count: minimum frequency of a token to be added to the
                          vocabulary.
        :param max_size: maximum number of symbols to store to the vocabulary.
        :param add_default_special_symbols: whether default special symbols,
                                    such as <PAD> and <UNK>, should be added.
                                    In some cases, e.g. a labels vocabulary,
                                    those symbols are not necessary.
        """
        try:
            validate_field_names(data_fnames)
        except Exception as e:
            raise e

        data_fnames = listify(data_fnames)
        dfn_formatted_str = ', '.join(["'%s'" % dfn for dfn in data_fnames])
        logger.info("Creating a vocabulary from %s data_source, and %s"
                    " chunk field(s). min_count: %d, max_vocab_size: %s." %
                    (data_source, dfn_formatted_str, min_count, str(max_size)))
        temp_token_to_count = {}
        for data_chunk in self._data_chunk_iterable.iter(**data_source):
            for data_attr in data_fnames:
                for tokens in data_chunk[data_attr]:

                    if not isinstance(tokens, (list, np.ndarray)):
                        tokens = [tokens]

                    for token in flatten(tokens):
                        if token == '':
                            continue

                        if not isinstance(token, (int, float, str)):
                            raise TypeError("Token is not of a correct type"
                                            " (should be int, float, str,"
                                            " unicode).")

                        if isinstance(token, (int, float)):
                            token = str(token)

                        if token not in temp_token_to_count:
                            temp_token_to_count[token] = 0
                        temp_token_to_count[token] += 1

        # populate the collectors
        for token, count in sort_hash(temp_token_to_count, by_key=False):
            if max_size and len(self) >= max_size:
                break
            if count >= min_count:
                symbol = self.add_symbol(token, count)
                self._total_count += count
                if match_special_symbol(token):
                    self.special_symbols[token] = symbol
        if add_default_special_symbols:
            self.add_special_symbols(DEFAULT_SPECIAL_TOKENS)

        logger.info("Created the vocabulary.")