import collections as coll
from pathlib import Path
from typing import List, Optional, Union

import yaml
from tqdm import tqdm

# `log` and `IO` are assumed to come from the surrounding project:
# a module-level logger and a file-IO helper providing get_lines/get_liness/writer.
# The two train() variants below belong to different codec/field classes in the source.

@classmethod
def train(cls, model_type: str, vocab_size: int, model_path: Union[Path, str],
          files: List[str], tok_coverage: float = 0.9999, **kwargs):
    # Note: char_coverage is reused here as subword (token) coverage;
    # vocab_size is accepted but unused -- the cutoff is coverage-driven.
    hub_api = cls.load_hub_model(model_type)
    bpe = hub_api.bpe
    dicto = hub_api.task.dictionary

    # Count BPE token frequencies over all input lines
    freqs = coll.Counter()
    lines = IO.get_liness(*files)
    for line in tqdm(lines, mininterval=2, dynamic_ncols=True, unit='line'):
        freqs.update(bpe.encode(line).split())
    total_toks = sum(freqs.values())
    log.info(f"Found {len(freqs)} BPE types and {total_toks} tokens")

    # Keep the most frequent types until tok_coverage of the token mass is covered
    freqs = sorted(freqs.items(), reverse=True, key=lambda x: x[1])
    vocabulary, oovs = [], []
    cumulative = 0
    for t, f in freqs:
        if cumulative / total_toks <= tok_coverage:
            vocabulary.append((t, f))
            cumulative += f
        else:
            oovs.append((t, f))
    oovs_str = ' '.join(f'{t}:{f}' for t, f in oovs)
    log.info(f'Excluded {len(oovs)} types as OOVs.\n{oovs_str}')
    log.info(f'Included {len(vocabulary)} types as in-vocabulary; '
             f'coverage = {cumulative / total_toks:g}')

    # TODO: mapping should be a List[int] with a one-to-one map
    # Reserved types go first, at their fixed indices; then the selected vocabulary.
    # Each entry maps type -> [new index, old (hub dictionary) index].
    types, indices = [], {}
    for typ, new_idx in cls.reserved():
        assert len(types) == new_idx
        types.append(typ)
        old_idx = dicto.indices.get(typ, -1)
        indices[typ] = [new_idx, old_idx]
    for typ, freq in vocabulary:
        indices[typ] = [len(types), dicto.indices.get(typ, -1)]
        types.append(typ)

    data = {'model_id': model_type, 'mapping': indices}
    with IO.writer(model_path) as wrtr:
        yaml.dump(data, wrtr)
    return cls(model_path)
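
# --- Illustration (hypothetical, not part of any class here) -----------------
# A minimal, self-contained sketch of the coverage cutoff used above: keep the
# most frequent types until `tok_coverage` of the running token mass is covered,
# and treat the long tail as OOVs. `_coverage_split` and its arguments are made
# up for this example; only the logic mirrors train().
def _coverage_split(freqs: dict, tok_coverage: float = 0.9999):
    total = sum(freqs.values())
    vocab, oovs, cum = [], [], 0
    for typ, freq in sorted(freqs.items(), key=lambda x: x[1], reverse=True):
        if cum / total <= tok_coverage:   # still under the coverage budget
            vocab.append((typ, freq))
            cum += freq
        else:                             # tail: excluded as OOV
            oovs.append((typ, freq))
    return vocab, oovs

# e.g. _coverage_split({'the': 50, 'cat': 30, 'sat': 15, 'zyx': 5}, 0.9)
# keeps ('the', 50), ('cat', 30), ('sat', 15) and marks ('zyx', 5) as OOV:
# after the first three, cum/total = 95/100 > 0.9, so the tail is excluded.
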
@classmethod
def train(cls, model_type: str, vocab_size: int, model_path: str, files: List[str],
          no_split_toks: Optional[List[str]] = None, char_coverage: float = 0,
          dedup=True, spark=None):
    """
    :param model_type: word, char, or bpe
    :param vocab_size: vocabulary size
    :param model_path: where to store the vocabulary model
    :param files: text files for creating the vocabulary
    :param no_split_toks: tokens that must not be split (not supported yet)
    :param char_coverage: character coverage (0, 1]; a value <= 0 means default coverage
    :param dedup: drop duplicate lines before counting (spark path only)
    :param spark: optional SparkSession; if given, term frequencies are
        extracted distributedly and cached next to the model
    :return: a trained instance of this class
    """
    assert not no_split_toks, 'not supported in nlcodec yet'
    from nlcodec import learn_vocab, term_freq
    kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
    if not spark:
        inp = IO.get_liness(*files)
    else:
        # extract term frequencies once and cache them in this file
        stats_file = model_path + '.termfreqs'
        if not Path(stats_file).exists():
            log.info("Extracting term frequencies... ")
            paths = [f if isinstance(f, Path) else Path(f) for f in files]
            wfs, chfs, n_lines = term_freq.word_counts(paths=paths, dedup=dedup,
                                                       spark=spark)
            log.info(f"Lines = {n_lines:,}, Word Types: {len(wfs):,}, "
                     f"Char Types: {len(chfs):,}")
            stats = chfs if model_type == 'char' else wfs
            log.info(f"Writing frequencies to {stats_file}")
            with IO.writer(stats_file) as out:
                term_freq.write_stats(stats=stats, out=out, line_count=n_lines)
        kwargs['term_freqs'] = True
        inp = IO.get_lines(stats_file, delim='\n')

    learn_vocab(inp=inp, level=model_type, model=model_path,
                vocab_size=vocab_size, **kwargs)
    return cls(model_path)
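
# --- Usage sketch (hypothetical) ---------------------------------------------
# How the nlcodec-backed train() is typically invoked; the class name
# `NLCodecField` and the file paths are placeholders, not names from this module.
#
#   field = NLCodecField.train(model_type='bpe', vocab_size=8000,
#                              model_path='vocab.model',
#                              files=['train.src', 'train.tgt'])
#
# With a SparkSession, term frequencies are computed distributedly and cached
# to '<model_path>.termfreqs', so re-runs skip the counting step:
#
#   from pyspark.sql import SparkSession
#   spark = SparkSession.builder.appName('vocab').getOrCreate()
#   field = NLCodecField.train('bpe', 8000, 'vocab.model',
#                              ['train.src', 'train.tgt'], spark=spark)
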