def split_data(self, test_max_size: int = None, source_vocab_size: int = None, target_vocab_size: int = None,
               shuffle=True, take_parallel_data=True, take_corpus_instances=False, legacy_test=None):
    """
    Partition the parallel sentences into a training set and a test set.

    ``reset_freq()`` is called on both ``self.source`` and ``self.target`` before anything else.
    Pairs with an empty string on either side are discarded. A pair is moved to the test set only
    while ``self._can_go_test(x, y)`` accepts it, the test set is still below ``test_max_size`` and
    no ``legacy_test`` was supplied; ``self._update_filters(x, y)`` is called for every pair moved.
    Every other surviving pair goes to the training set.

    :param test_max_size: int = upper bound of test examples; when it is not an int/float the bound
        defaults to ``len(self.source.data) - self.n_train``
    :param source_vocab_size: int = when given, candidate pairs come from ``self._get_vocab_data``
    :param target_vocab_size: int = when given, candidate pairs come from ``self._get_vocab_data``
    :param shuffle: bool = shuffle the candidate pairs in place with ``random.shuffle``
    :param take_parallel_data: bool = when exactly ``False`` return
        ``(*get_cols(train), *get_cols(test))``; ``take_corpus_instances`` is ignored in that case
    :param take_corpus_instances: bool = when exactly ``True`` pass train and test through
        ``self.load_from_parallel_data`` before returning them
    :param legacy_test: List[Tuple[str,str]] = pre-existing parallel test data; the test set is then
        a ``Corpus`` built from it, nothing new is added to it, and any pair whose preprocessed
        source sentence is found in ``test.source.phrases_freq`` is left out of the training set
    :return: ``(train, test)``, or the unpacked columns when ``take_parallel_data`` is ``False``
    """
    self.source.reset_freq()
    self.target.reset_freq()

    has_legacy = legacy_test is not None
    train_pairs = []
    if has_legacy:
        test_pairs = Corpus(*self.distinct_from_parallel(legacy_test),
                            source_name=self.source_language,
                            target_name=self.target_language)
    else:
        test_pairs = []

    # None is not an int/float either, so this single check covers the "not provided" case too.
    if not isinstance(test_max_size, (int, float)):
        test_max_size = len(self.source.data) - self.n_train

    if source_vocab_size is None and target_vocab_size is None:
        candidates = list(zip(self.source.data, self.target.data))
    else:
        candidates = list(self._get_vocab_data(source_vocab_size=source_vocab_size,
                                               target_vocab_size=target_vocab_size))

    if shuffle:
        random.shuffle(candidates)

    for src, tgt in candidates:
        # Skip blank lines.
        if src == '' or tgt == '':
            continue

        # With a legacy test set, keep its sentences out of the training data.
        if has_legacy and self.source.preprocess(src) in test_pairs.source.phrases_freq:
            continue

        # The eligibility check deliberately runs first, exactly as before.
        if self._can_go_test(src, tgt) and len(test_pairs) < test_max_size and not has_legacy:
            test_pairs.append([src, tgt])
            self._update_filters(src, tgt)
        else:
            train_pairs.append([src, tgt])

    if take_parallel_data is False:
        return (*get_cols(train_pairs), *get_cols(test_pairs))

    if take_corpus_instances is True:
        train_pairs = self.load_from_parallel_data(train_pairs, self.source_language, self.target_language)
        test_pairs = self.load_from_parallel_data(test_pairs, self.source_language, self.target_language)

    return train_pairs, test_pairs
def __getitem__(self, item):
    """
    Look up a column by name, a ``(lines, col)`` selection, or defer to the parent class.

    :param item: what to select
        - str: a column name that must appear in ``self.cols``; the entry of
          ``get_cols(self.data)`` at that name's position is returned
        - tuple ``(lines, col)``: ``col`` is a column index and ``lines`` is applied as an
          index on that column, i.e. ``get_cols(self.data)[col][lines]``
        - anything else: forwarded unchanged to the parent class ``__getitem__``
    :raises ValueError: unknown column name, a tuple used as the column selector, or a column
        index greater than ``self.n_cols - 1``
    """
    if isinstance(item, str):
        if item not in self.cols:
            raise ValueError(f'Column name <{item}> not in {self.cols}')
        return get_cols(self.data)[self.cols.index(item)]

    if isinstance(item, tuple):
        lines, col = item
        if isinstance(col, tuple):
            raise ValueError("Isn't Possible.")
        # Raise explicitly instead of asserting: an assert is removed under ``python -O`` and
        # would surface as AssertionError rather than the ValueError used by the other checks.
        if not (col <= self.n_cols - 1):
            raise ValueError(
                f"Invalid Column. Choice available index {list(range(self.n_cols))}")
        return get_cols(self.data)[col][lines]

    return super().__getitem__(item)
def to_dict(self):
    """
    Build a dict that pairs each name in ``self.cols`` with the entry at the same position
    of ``get_cols(self.lines)``.

    Pairing stops at the shorter of the two sequences.
    NOTE(review): presumably ``get_cols`` yields one column per name - confirm against its definition.
    """
    names = self.cols
    columns = get_cols(self.lines)
    return {name: column for name, column in zip(names, columns)}
def to_dict(self):
    """
    Build a dict that pairs each name in ``self.cols`` with the entry at the same position
    of ``get_cols(self.data)``.

    Pairing stops at the shorter of the two sequences.
    NOTE(review): presumably ``get_cols`` yields one column per name - confirm against its definition.
    """
    names = self.cols
    columns = get_cols(self.data)
    return {name: column for name, column in zip(names, columns)}
def distinct_from_parallel(cls, data):
    """
    Hand ``data`` to ``get_cols`` and return its result unchanged.

    :param data: parallel data, forwarded as-is (``cls`` is not used)
    :return: whatever ``get_cols(data)`` produces
        NOTE(review): presumably the separated source/target columns - confirm against ``get_cols``
    """
    columns = get_cols(data)
    return columns