def _iter(self, data_path, ext='csv'):
    """
    :param data_path: a string corresponding to a location of data (file or
        folder). If a list is provided, multiple data paths are assumed.
    :param ext: extension of files that should be opened; files with a
        different extension are ignored.
    :return: generator of data-chunks.
    """
    validate_data_paths(data_path)
    data_paths = listify(data_path)
    file_openers = create_openers_of_valid_files(
        data_paths, ext=ext, encoding=self.parser_kwargs['encoding'])
    if not file_openers:
        raise ValueError(
            "No valid files to open, please check the provided"
            " 'data_path' (%s). Note that files without the '%s' extension"
            " are ignored." % (data_paths, ext))
    if self.worker_threads_num > 1:
        chunk_iter = self._create_multi_th_gen(file_openers)
    else:
        chunk_iter = self._create_single_th_gen(file_openers)
    return chunk_iter
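# A hypothetical sketch (not the library's actual `_create_single_th_gen`)
# of the single-threaded path implied above: each opener yields a readable
# file object that is parsed into fixed-size CSV chunks. The use of pandas
# and the `chunk_size` parameter are assumptions for illustration only.
import pandas as pd

def single_th_csv_chunks(file_openers, chunk_size=1000, **parser_kwargs):
    for opener in file_openers:
        with opener() as f:
            # pandas returns an iterator of DataFrames when chunksize is set
            for chunk in pd.read_csv(f, chunksize=chunk_size, **parser_kwargs):
                yield chunk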
def __init__(self, input_dim, output_dim, hidden_dim=None,
             non_linearity=None):
    """
    The network takes a continuous vector as input that at least contains
    the average embedding of aspects.

    :param input_dim: self-explanatory.
    :param hidden_dim: self-explanatory.
    :type hidden_dim: int or list of ints.
    :param output_dim: self-explanatory.
    :param non_linearity: an object for non-linear transformations.
    """
    super(Ffnn, self).__init__()
    hidden_dims = listify(hidden_dim) if hidden_dim is not None else None
    self._seq_model = Sequential()
    prev_hd = input_dim
    i = 0
    if hidden_dims is not None:
        for hd in hidden_dims:
            self._seq_model.add_module(str(i), Linear(prev_hd, hd))
            i += 1
            if non_linearity is not None:
                self._seq_model.add_module(str(i), non_linearity)
                i += 1
            prev_hd = hd
    self._seq_model.add_module(str(i), Linear(prev_hd, output_dim))
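# A minimal standalone sketch of the same layer-stacking idea with PyTorch
# (not the Ffnn class itself): hidden_dims may be empty or contain several
# sizes, and the optional non-linearity follows every hidden Linear layer.
# The dimensions below are made up for illustration.
import torch
from torch.nn import Linear, ReLU, Sequential

def build_ffnn(input_dim, output_dim, hidden_dims=(), non_linearity=None):
    layers, prev_hd = [], input_dim
    for hd in hidden_dims:
        layers.append(Linear(prev_hd, hd))
        if non_linearity is not None:
            layers.append(non_linearity)
        prev_hd = hd
    layers.append(Linear(prev_hd, output_dim))
    return Sequential(*layers)

net = build_ffnn(input_dim=300, output_dim=3, hidden_dims=[128, 64],
                 non_linearity=ReLU())
out = net(torch.randn(8, 300))  # (batch, input_dim) -> (batch, output_dim)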
def __init__(self, fname, corr_prob, beta_a, beta_b, excl_symbols=None,
             substitute=None, **kwargs):
    """
    :param fname: str or list of str names of fields to which corruption
        should be applied.
    :param corr_prob: probability of corrupting a data-unit.
    :param beta_a: first parameter of the Beta distribution.
    :param beta_b: second parameter of the Beta distribution.
    :param excl_symbols: a set of symbols that are never dropped.
    :param substitute: a symbol with which a dropped symbol should be
        replaced.
    """
    super(HierWordDropper, self).__init__(**kwargs)
    assert corr_prob <= 1.
    self.fnames = listify(fname)
    self.corr_prob = corr_prob
    self.beta_a = beta_a
    self.beta_b = beta_b
    self.excluded_symbols = excl_symbols if excl_symbols else set()
    self.substitute = substitute
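# Hypothetical illustration of the corruption step the parameters above
# suggest (the actual transform is not part of this snippet): with
# probability corr_prob a unit is corrupted, a token-level drop rate is then
# sampled from Beta(beta_a, beta_b), and dropped tokens are removed or
# replaced by `substitute`, never touching `excl_symbols`.
import numpy as np

def drop_words(tokens, corr_prob, beta_a, beta_b, excl_symbols=frozenset(),
               substitute=None, rng=np.random):
    if rng.rand() >= corr_prob:
        return list(tokens)
    drop_rate = rng.beta(beta_a, beta_b)
    corrupted = []
    for tok in tokens:
        if tok not in excl_symbols and rng.rand() < drop_rate:
            if substitute is not None:
                corrupted.append(substitute)
            continue
        corrupted.append(tok)
    return corrupted

print(drop_words("the cat sat on the mat".split(), corr_prob=1., beta_a=2.,
                 beta_b=5., substitute="<DROP>"))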
def _iter(self, data_path):
    """
    :param data_path: a string corresponding to a location of data (file or
        folder). If a list is provided, multiple data paths are assumed.
    """
    validate_data_paths(data_path)
    data_paths = listify(data_path)
    file_openers = create_openers_of_valid_files(data_paths, ext='.json')
    if not file_openers:
        raise ValueError(
            "No valid files to open, please check the provided"
            " 'data_path' (%s). Note that files without the '%s' extension"
            " are ignored." % (data_paths, '.json'))
    for fo in file_openers:
        with fo(encoding=self.encoding) as f:
            json_dict = json.load(f, object_pairs_hook=OrderedDict)
        data_chunk = self._to_data_chunk(json_dict)
        yield data_chunk
def __init__(self, field_names, window_size=5, step_size=1,
             only_full_windows=False,
             new_window_field_name_suffix='window', **kwargs):
    """
    :param field_names: str or list of str corresponding to fields over
        which the window should slide.
    :param window_size: self-explanatory.
    :param step_size: self-explanatory.
    :param only_full_windows: if set to True, guarantees that all windows
        will be of the same size.
    :param new_window_field_name_suffix: suffix for all newly created
        fields.
    """
    validate_field_names(field_names)
    super(WindowSlider, self).__init__(**kwargs)
    self.field_names = listify(field_names)
    self.window_size = window_size
    self.step_size = step_size
    self.only_full_windows = only_full_windows
    self.new_windw_fn_suffix = new_window_field_name_suffix
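# Standalone sketch of the assumed windowing semantics applied to each
# configured field: windows of `window_size` items are produced every
# `step_size` positions, and `only_full_windows` drops trailing windows that
# are shorter than `window_size`.
def slide(seq, window_size=5, step_size=1, only_full_windows=False):
    windows = []
    for start in range(0, len(seq), step_size):
        window = seq[start:start + window_size]
        if not window or (only_full_windows and len(window) < window_size):
            break
        windows.append(window)
    return windows

print(slide(list(range(7)), window_size=3, step_size=2))
# [[0, 1, 2], [2, 3, 4], [4, 5, 6], [6]]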
def _is_of_allowed_types(self, val, target_types):
    """Checks if val belongs to specific target types (if allowed)."""
    target_types = listify(target_types)
    return any(
        t in self.allowed_types and isinstance(val, t) for t in target_types
    )
def __init__(self, fname, words, **kwargs):
    """
    :param fname: name(s) of fields that contain symbols.
    :param words: symbol (str/int) or list of symbols to eliminate.
    """
    super(WordEliminator, self).__init__(**kwargs)
    self.fnames = listify(fname)
    self.words = set(words) if isinstance(words, list) else {words}
def __init__(self, fname, start_el, end_el, **kwargs):
    """
    :param fname: name (str) or names (list) of the field(s) whose
        sequences should be wrapped.
    :param start_el: id or token of the start element.
    :param end_el: id or token of the end element.
    :param kwargs: e.g. name_prefix.
    """
    super(SeqWrapper, self).__init__(**kwargs)
    self.fname = listify(fname)
    self.start_el = start_el
    self.end_el = end_el
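# Assumed behaviour of the wrapper on a single sequence: the start and end
# elements are simply prepended and appended.
def wrap_seq(seq, start_el="<S>", end_el="</S>"):
    return [start_el] + list(seq) + [end_el]

print(wrap_seq(["good", "coffee"]))  # ['<S>', 'good', 'coffee', '</S>']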
def __init__(self, f, repr_funcs=None, grouping_fnames=None, indent=2):
    """
    :param f: an opened file to which data-chunks should be written.
    :param repr_funcs: dict mapping field names to functions that should be
        used to obtain str representations of field values.
    :param grouping_fnames: list of field names based on which data should
        be grouped into a tree.
    :param indent: self-explanatory.
    """
    super(JsonWriter, self).__init__(f=f, repr_funcs=repr_funcs)
    self.grouping_fnames = (listify(grouping_fnames)
                            if grouping_fnames else None)
    self.indent = indent
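# Hypothetical sketch of grouping flat records into a tree by a list of
# field names before dumping them to JSON; the writer's exact grouping logic
# is not shown in this snippet, so the nesting rule below is an assumption.
import json
from collections import OrderedDict

def group_records(records, grouping_fnames):
    tree = OrderedDict()
    for rec in records:
        node = tree
        for fn in grouping_fnames[:-1]:
            node = node.setdefault(rec[fn], OrderedDict())
        node.setdefault(rec[grouping_fnames[-1]], []).append(rec)
    return tree

records = [{"prod": "p1", "rating": 5, "text": "good"},
           {"prod": "p1", "rating": 1, "text": "bad"}]
print(json.dumps(group_records(records, ["prod", "rating"]), indent=2))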
def __init__(self, fnames, **kwargs):
    """
    :param fnames: str or list of str names of fields that should be
        selected from data-chunks. Other fields are discarded.
    """
    validate_field_names(fnames)
    super(FieldSelector, self).__init__(**kwargs)
    self.fnames = listify(fnames)
def __call__(self, data_path):
    validate_data_paths(data_path)
    safe_mkdir(self.output_folder)
    for dp in listify(data_path):
        for file_path in get_file_paths(dp):
            file_name = os.path.basename(file_path)
            output_file_path = os.path.join(self.output_folder, file_name)
            if not os.path.exists(output_file_path):
                self._clean_and_save_file(file_path, output_file_path)
    return {"data_path": self.output_folder}
def __init__(self, fnames, tokenization_func=None, token_cleaning_func=None,
             token_matching_func=None, lower_case=True, **kwargs):
    """
    :param fnames: str or list of str corresponding to fields that should
        be tokenized.
    :param tokenization_func: a function that splits string sequences into
        sequences of tokens. The form should be: x -> y, where x is a str
        and y is a list/array of tokens.
    :param token_cleaning_func: the function responsible for normalization
        of tokens, elimination of unwanted characters, etc. Format: x -> y,
        where x is a str token, and y is a clean str token.
    :param token_matching_func: a function that matches raw text tokens to
        a special set of tokens, e.g. Twitter emoticons ':)' ->
        '<POSIT_EMOT>'. The format: x -> y, where x is a str token, and y
        is either False if it does not match, or a string token otherwise.
    :param lower_case: whether to lower-case strings before tokenization.
    """
    validate_field_names(fnames)
    msg = "Please provide a valid callable %s function."
    if tokenization_func is None:
        tokenization_func = lambda x: x.split()
    if not callable(tokenization_func):
        raise ValueError(msg % "tokenization")
    if token_cleaning_func is not None and not callable(token_cleaning_func):
        raise ValueError(msg % "token cleaning")
    if token_matching_func is not None and not callable(token_matching_func):
        raise ValueError(msg % "token matching")
    super(TokenProcessor, self).__init__(**kwargs)
    self.field_names = listify(fnames)
    self.tokenization_func = tokenization_func
    self.token_cleaning_func = token_cleaning_func
    self.token_matching_func = token_matching_func
    self.lower_case = lower_case
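# Example shapes of the three callables described above (hypothetical helper
# functions, not shipped with the class): the tokenizer splits a string, the
# cleaner normalises a single token, and the matcher maps special raw tokens
# such as emoticons to placeholder symbols or returns False.
import re

def whitespace_tokenizer(text):
    return text.split()

def clean_token(token):
    return re.sub(r"[^\w<>]", "", token)

def match_token(token):
    emoticons = {":)": "<POSIT_EMOT>", ":(": "<NEG_EMOT>"}
    return emoticons.get(token, False)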
def create(self, data_source, data_fnames, min_count=1, max_size=None,
           add_default_special_symbols=True):
    """
    Creates the vocabulary by passing data_source to the corresponding
    data-chunk iterable and fetching chunks out of it. Assumes that tokens
    are strings; if they are not, it will try to convert them to strings.

    :param data_source: dictionary of attributes that should be passed to
        the data_chunk_iterable.
    :param data_fnames: str or list of str field names that map to the
        symbols which should be used to create the vocabulary.
    :param min_count: minimum frequency of a token to be added to the
        vocabulary.
    :param max_size: maximum number of symbols to store in the vocabulary.
    :param add_default_special_symbols: whether default symbols, such as
        <PAD> and <UNK>, should be added. In some cases, e.g. a labels
        vocab, those symbols are not necessary.
    """
    validate_field_names(data_fnames)
    data_fnames = listify(data_fnames)
    dfn_formatted_str = ', '.join(["'%s'" % dfn for dfn in data_fnames])
    logger.info("Creating a vocabulary from %s data_source, and %s"
                " chunk field(s). min_count: %d, max_vocab_size: %s."
                % (data_source, dfn_formatted_str, min_count, str(max_size)))
    temp_token_to_count = {}
    for data_chunk in self._data_chunk_iterable.iter(**data_source):
        for data_attr in data_fnames:
            for tokens in data_chunk[data_attr]:
                if not isinstance(tokens, (list, np.ndarray)):
                    tokens = [tokens]
                for token in flatten(tokens):
                    if token == '':
                        continue
                    if not isinstance(token, (int, float, str)):
                        raise TypeError("Token is not of a correct type"
                                        " (should be int, float or str).")
                    if isinstance(token, (int, float)):
                        token = str(token)
                    if token not in temp_token_to_count:
                        temp_token_to_count[token] = 0
                    temp_token_to_count[token] += 1
    # populate the collectors
    for token, count in sort_hash(temp_token_to_count, by_key=False):
        if max_size and len(self) >= max_size:
            break
        if count >= min_count:
            symbol = self.add_symbol(token, count)
            self._total_count += count
            if match_special_symbol(token):
                self.special_symbols[token] = symbol
    if add_default_special_symbols:
        self.add_special_symbols(DEFAULT_SPECIAL_TOKENS)
    logger.info("Created the vocabulary.")
    logger.info("Total word count: %d." % self._total_count)
    logger.info("Vocab size: %d." % len(self))
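# The same frequency-pruning semantics in miniature (standalone sketch, not
# the vocabulary class): count tokens, then keep the most frequent ones that
# clear min_count, stopping once max_size entries are stored.
from collections import Counter

def build_counts(token_stream, min_count=1, max_size=None):
    counts = Counter(token_stream)
    kept = {}
    for token, count in counts.most_common():
        if max_size and len(kept) >= max_size:
            break
        if count >= min_count:
            kept[token] = count
    return kept

print(build_counts("a b a c a b".split(), min_count=2))  # {'a': 3, 'b': 2}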
def __init__(self, fnames, **kwargs):
    """
    :param fnames: field(s) to be removed from chunks. Can be a list or str.
    """
    super(FieldDropper, self).__init__(**kwargs)
    self.fnames = listify(fnames)
def __init__(self, fname, new_len_fname, dtype='int64', **kwargs):
    """
    :param fname: str or list of str names of fields whose sequence lengths
        should be computed.
    :param new_len_fname: str or list of str names of new fields where the
        computed lengths should be stored (one per entry in fname).
    :param dtype: data type of the computed lengths.
    """
    super(SeqLenComputer, self).__init__(**kwargs)
    self.fnames = listify(fname)
    self.new_fnames = listify(new_len_fname)
    assert len(self.fnames) == len(self.new_fnames)
    self.dtype = dtype
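# Assumed per-field transformation in miniature: store the length of every
# sequence under the new field name, cast to the requested dtype.
import numpy as np

seqs = [["a", "b", "c"], ["d"]]
lengths = np.array([len(s) for s in seqs], dtype="int64")
print(lengths)  # [3 1]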