Example #1
    def _iter(self, data_path, ext='csv'):
        """
        :param data_path: a string path to the data location (file or
                          folder). If a list is provided, multiple data
                          paths are assumed.
        :return: generator of data-chunks.
        """
        try:
            validate_data_paths(data_path)
        except Exception as e:
            raise e

        data_paths = listify(data_path)
        file_openers = create_openers_of_valid_files(
            data_paths, ext=ext, encoding=self.parser_kwargs['encoding'])
        if not file_openers:
            raise ValueError(
                "No valid files to open, please check the provided"
                " 'data_path' (%s). Note that files without the '%s' extension"
                " are ignored." % (data_paths, '.csv'))

        if self.worker_threads_num > 1:
            chunk_iter = self._create_multi_th_gen(file_openers)
        else:
            chunk_iter = self._create_single_th_gen(file_openers)
        return chunk_iter
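
All of the snippets on this page rely on a listify helper whose implementation is not shown. A minimal sketch, assuming its contract is simply to wrap any non-list value into a single-element list:

    def listify(value):
        """Return value unchanged if it is already a list, otherwise wrap it
        into a single-element list (hypothetical sketch of the helper)."""
        return value if isinstance(value, list) else [value]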
Example #2
    def __init__(self,
                 input_dim,
                 output_dim,
                 hidden_dim=None,
                 non_linearity=None):
        """
        The network takes a continuous vector as input that at least contains
        the average embedding of aspects.

        :param input_dim: self-explanatory.
        :param hidden_dim: self-explanatory.
        :type hidden_dim: int or list of ints.
        :param output_dim: self-explanatory.
        :param non_linearity: an object for non-linear transformations.
        """
        super(Ffnn, self).__init__()
        hidden_dims = listify(hidden_dim) if hidden_dim is not None else None
        self._seq_model = Sequential()

        prev_hd = input_dim
        i = 0
        if hidden_dims is not None:
            for hd in hidden_dims:
                self._seq_model.add_module(str(i), Linear(prev_hd, hd))
                i += 1
                if non_linearity is not None:
                    self._seq_model.add_module(str(i), non_linearity)
                i += 1
                prev_hd = hd
        self._seq_model.add_module(str(i), Linear(prev_hd, output_dim))
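
The constructor above stacks one Linear layer per hidden dimension, optionally followed by the supplied non-linearity, and ends with a projection to output_dim. A hypothetical usage sketch (assuming PyTorch is installed and Ffnn is importable; the forward method is not shown, so the internal Sequential module is called directly):

    import torch
    from torch.nn import ReLU

    # Hypothetical: 300-dim input, hidden layers of 128 and 64 units,
    # ReLU after each hidden layer, 3-dim output.
    model = Ffnn(input_dim=300, output_dim=3, hidden_dim=[128, 64],
                 non_linearity=ReLU())
    out = model._seq_model(torch.randn(8, 300))  # tensor of shape (8, 3)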
Example #3
 def __init__(self,
              fname,
              corr_prob,
              beta_a,
              beta_b,
              excl_symbols=None,
              substitute=None,
              **kwargs):
     """
     :param fname: str or list of name strs of fields to which corruption
                   should be applied. 
     :param corr_prob: probability of corrupting a data-unit.
     :param beta_a: first parameter of the Beta distribution.
     :param beta_b: second parameter of the Beta distribution.
     :param excl_symbols: a set of symbols that are never dropped.
     :param substitute: a symbol with which a dropped symbol should be 
                        replaced.
     """
     super(HierWordDropper, self).__init__(**kwargs)
     assert corr_prob <= 1.
     self.fnames = listify(fname)
     self.corr_prob = corr_prob
     self.beta_a = beta_a
     self.beta_b = beta_b
     self.excluded_symbols = excl_symbols if excl_symbols else set()
     self.substitute = substitute
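
The constructor only stores the parameters; the dropping logic itself is not shown. A minimal sketch of the general idea, as a hypothetical stand-alone helper (not this class's method): corrupt a unit with probability corr_prob, draw a drop rate from Beta(beta_a, beta_b), and drop or substitute each non-excluded symbol with that rate.

    import random

    def drop_words(tokens, corr_prob, beta_a, beta_b,
                   excl_symbols=frozenset(), substitute=None):
        """Hypothetical sketch of hierarchical word dropping."""
        if random.random() >= corr_prob:
            return list(tokens)  # this unit is left uncorrupted
        drop_rate = random.betavariate(beta_a, beta_b)
        out = []
        for tok in tokens:
            if tok not in excl_symbols and random.random() < drop_rate:
                if substitute is not None:
                    out.append(substitute)
                continue  # token dropped (or substituted)
            out.append(tok)
        return out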
Example #4
    def _iter(self, data_path):
        """
        :param data_path: a string path to the data location (file or
                          folder). If a list is provided, multiple data
                          paths are assumed.
        """
        try:
            validate_data_paths(data_path)
        except Exception as e:
            raise e

        data_paths = listify(data_path)
        file_openers = create_openers_of_valid_files(data_paths, ext='.json')
        if not file_openers:
            raise ValueError(
                "No valid files to open, please check the provided"
                " 'data_path' (%s). Note that files without the '%s' extension"
                " are ignored." % (data_paths, 'json'))

        for fo in file_openers:
            f = fo(encoding=self.encoding)
            json_dict = json.load(f, object_pairs_hook=OrderedDict)
            data_chunk = self._to_data_chunk(json_dict)
            yield data_chunk
            f.close()
Example #5
    def __init__(self,
                 field_names,
                 window_size=5,
                 step_size=1,
                 only_full_windows=False,
                 new_window_field_name_suffix='window',
                 **kwargs):
        """
        :param field_names: str or list of str corresponding to fields
                            which should be slid over.
        :param window_size: self-explanatory.
        :param step_size: self-explanatory.
        :param only_full_windows: if set to True, guarantees that all windows
                                  will be of the same size.
        :param new_window_field_name_suffix: suffix for all newly created fields.
        """
        try:
            validate_field_names(field_names)
        except Exception as e:
            raise e

        super(WindowSlider, self).__init__(**kwargs)
        self.field_names = listify(field_names)
        self.window_size = window_size
        self.step_size = step_size
        self.only_full_windows = only_full_windows
        self.new_windw_fn_suffix = new_window_field_name_suffix
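
The sliding itself happens elsewhere; a small sketch of how windows of window_size items with a stride of step_size might be taken from a single sequence (hypothetical helper, shown only to illustrate the parameters above):

    def slide_window(seq, window_size=5, step_size=1, only_full_windows=False):
        """Yield consecutive windows over seq."""
        for start in range(0, len(seq), step_size):
            window = seq[start:start + window_size]
            if only_full_windows and len(window) < window_size:
                break  # discard trailing windows that are too short
            yield window

    # list(slide_window([1, 2, 3, 4, 5], window_size=3, step_size=2,
    #                   only_full_windows=True)) -> [[1, 2, 3], [3, 4, 5]]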
Example #6
 def _is_of_allowed_types(self, val, target_types):
     """Checks if val belongs to specific target types (if allowed)."""
     target_types = listify(target_types)
     return any(
         t in self.allowed_types and isinstance(val, t)
         for t in target_types
     )
Example #7
 def __init__(self, fname, words, **kwargs):
     """
     :param fname: name(s) of fields that contain symbols.
     :param words: a symbol (str/int) or a list of str/int symbols to
                   eliminate.
     """
     super(WordEliminator, self).__init__(**kwargs)
     self.fnames = listify(fname)
     self.words = set(words) if isinstance(words, list) else {words}
Example #8
 def __init__(self, fname, start_el, end_el, **kwargs):
     """
     :param fname: the name (str) or names (list) of the field(s) whose
                   sequences should be wrapped.
     :param start_el: id or token of the start elem.
     :param end_el: id or token of the end elem.
     :param kwargs: e.g. name_prefix.
     """
     super(SeqWrapper, self).__init__(**kwargs)
     self.fname = listify(fname)
     self.start_el = start_el
     self.end_el = end_el
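
The wrapping step is not shown here; presumably each sequence stored under the named field(s) gets start_el prepended and end_el appended, roughly:

    # Hypothetical illustration with start_el='<S>' and end_el='</S>'.
    seq = ['how', 'are', 'you']
    wrapped = ['<S>'] + seq + ['</S>']  # ['<S>', 'how', 'are', 'you', '</S>']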
Example #9
 def __init__(self, f, repr_funcs=None, grouping_fnames=None, indent=2):
     """
     :param f: an opened file to which data-chunks should be written.
     :param repr_funcs: dict of field names mapping to functions that
                        should be used to obtain string representations of
                        field values.
     :param grouping_fnames: list of field names based on which data should
                             be grouped into a tree.
     :param indent: self-explanatory.
     """
     super(JsonWriter, self).__init__(f=f, repr_funcs=repr_funcs)
     self.grouping_fnames = listify(
         grouping_fnames) if grouping_fnames else None
     self.indent = indent
Example #10
    def __init__(self, fnames, **kwargs):
        """
        :param fnames: str or list of str names of the fields to select
                       from data-chunks. Other fields are discarded.
        """
        try:
            validate_field_names(fnames)
        except Exception as e:
            raise e

        super(FieldSelector, self).__init__(**kwargs)
        self.fnames = listify(fnames)
Example #11
    def __call__(self, data_path):
        try:
            validate_data_paths(data_path)
        except Exception as e:
            raise e

        safe_mkdir(self.output_folder)
        for dp in listify(data_path):
            for file_path in get_file_paths(dp):
                file_name = os.path.basename(file_path)
                output_file_path = os.path.join(self.output_folder, file_name)
                if not os.path.exists(output_file_path):
                    self._clean_and_save_file(file_path, output_file_path)
        return {"data_path": self.output_folder}
Example #12
    def __init__(self, fnames, tokenization_func=None,
                 token_cleaning_func=None, token_matching_func=None,
                 lower_case=True, **kwargs):
        """
        :param fnames: str or list of str corresponding to fields that
                       should be tokenized.
        :param tokenization_func: a function that splits string sequences into
                                  sequences of tokens. The form should be:
                                  x -> y where x is a str and y is a list/array
                                  of tokens.
        :param token_cleaning_func: the function responsible for normalization
                                    of tokens, elimination of unwanted
                                    characters, etc. format: x -> y, where x is
                                    a str token, and y is a clean str token.
        :param token_matching_func: a function that matches raw text tokens
                                    to a special set of tokens, e.g. Twitter
                                    emoticons: ':)' -> '<POSIT_EMOT>'.
                                    The format: x -> y, where x is a str token,
                                    and y is either False, if it does not match
                                    or a string token otherwise.
        :param lower_case: whether to lower-case strings before tokenization.
        """
        try:
            validate_field_names(fnames)
        except Exception as e:
            raise e
        msg = "Please provide a valid callable %s function."
        if tokenization_func is None:
            tokenization_func = lambda x: x.split()
        if not callable(tokenization_func):
            raise ValueError(msg % "tokenization")
        if token_cleaning_func is not None and not callable(
                token_cleaning_func):
            raise ValueError(msg % "token cleaning")
        if token_matching_func is not None and not callable(
                token_matching_func):
            raise ValueError(msg % "token matching")

        super(TokenProcessor, self).__init__(**kwargs)
        self.field_names = listify(fnames)
        self.tokenization_func = tokenization_func
        self.token_cleaning_func = token_cleaning_func
        self.token_matching_func = token_matching_func
        self.lower_case = lower_case
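
A hypothetical construction call showing the expected shapes of the three callables (the field name is illustrative only):

    processor = TokenProcessor(
        fnames='review_text',                      # hypothetical field name
        tokenization_func=lambda x: x.split(),     # str -> list of tokens
        token_cleaning_func=lambda t: t.strip('.,!?'),  # token -> clean token
        token_matching_func=lambda t: '<POSIT_EMOT>' if t == ':)' else False,
        lower_case=True)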
Example #13
    def create(self,
               data_source,
               data_fnames,
               min_count=1,
               max_size=None,
               add_default_special_symbols=True):
        """
        Create vocabulary by passing data_source to the corresponding data-chunk
        iterable and fetching chunks out of it.

        Assumes that tokens are strings; if they are not, it will try to
        convert them to strings.

        :param data_source: dictionary of attributes that should be passed to
                            the data_chunk_iterable.
        :param data_fnames: str or list of str field names that map to the
                            symbols which should be used to create the
                            vocabulary.
        :param min_count: minimum frequency of a token to be added to the
                          vocabulary.
        :param max_size: maximum number of symbols to store to the vocabulary.
        :param add_default_special_symbols: whether default special symbols,
                                    such as <PAD> and <UNK>, should be added.
                                    In some cases, e.g. a labels vocabulary,
                                    those symbols are not necessary.
        """
        try:
            validate_field_names(data_fnames)
        except Exception as e:
            raise e

        data_fnames = listify(data_fnames)
        dfn_formatted_str = ', '.join(["'%s'" % dfn for dfn in data_fnames])
        logger.info("Creating a vocabulary from %s data_source, and %s"
                    " chunk field(s). min_count: %d, max_vocab_size: %s." %
                    (data_source, dfn_formatted_str, min_count, str(max_size)))
        temp_token_to_count = {}
        for data_chunk in self._data_chunk_iterable.iter(**data_source):
            for data_attr in data_fnames:
                for tokens in data_chunk[data_attr]:

                    if not isinstance(tokens, (list, np.ndarray)):
                        tokens = [tokens]

                    for token in flatten(tokens):
                        if token == '':
                            continue

                        if not isinstance(token, (int, float, str)):
                            raise TypeError("Token is not of a correct type"
                                            " (should be int, float, str,"
                                            " unicode).")

                        if isinstance(token, (int, float)):
                            token = str(token)

                        if token not in temp_token_to_count:
                            temp_token_to_count[token] = 0
                        temp_token_to_count[token] += 1

        # populate the collectors
        for token, count in sort_hash(temp_token_to_count, by_key=False):
            if max_size and len(self) >= max_size:
                break
            if count >= min_count:
                symbol = self.add_symbol(token, count)
                self._total_count += count
                if match_special_symbol(token):
                    self.special_symbols[token] = symbol
        if add_default_special_symbols:
            self.add_special_symbols(DEFAULT_SPECIAL_TOKENS)

        logger.info("Created the vocabulary.")
        logging.info("Total word count: %d." % self._total_count)
        logging.info("Vocab size: %d." % len(self))
Example #14
 def __init__(self, fnames, **kwargs):
     """
     :param fnames: field(s) to be removed from chunks. Can be a list or str.
     """
     super(FieldDropper, self).__init__(**kwargs)
     self.fnames = listify(fnames)
Example #15
 def __init__(self, fname, new_len_fname, dtype='int64', **kwargs):
     super(SeqLenComputer, self).__init__(**kwargs)
     self.fnames = listify(fname)
     self.new_fnames = listify(new_len_fname)
     assert len(self.fnames) == len(self.new_fnames)
     self.dtype = dtype