Example #1
    # assumed imports: collections as coll, yaml, tqdm.tqdm, pathlib.Path,
    # typing's List and Union, plus the project's log and IO helpers
    @classmethod
    def train(cls,
              model_type: str,
              vocab_size: int,
              model_path: Union[Path, str],
              files: List[str],
              tok_coverage: float = 0.9999,
              **kwargs):
        # Note: the char_coverage idea is reused here as subword coverage (tok_coverage)
        hub_api = cls.load_hub_model(model_type)
        bpe = hub_api.bpe
        dicto = hub_api.task.dictionary

        freqs = coll.Counter()
        # IO.get_liness (note the name) streams lines from all the given files
        lines = IO.get_liness(*files)
        for line in tqdm(lines, mininterval=2, dynamic_ncols=True,
                         unit='line'):
            freqs.update(bpe.encode(line).split())
        total_toks = sum(freqs.values())
        log.info(f"Found {len(freqs)} bpe types and {total_toks} toks")

        # sort by frequency, most frequent first (sorted already returns a list)
        freqs = sorted(freqs.items(), key=lambda x: x[1], reverse=True)
        vocabulary, oovs = [], []
        cumulative = 0
        # greedily admit the most frequent types until tok_coverage is reached
        for t, f in freqs:
            if cumulative / total_toks <= tok_coverage:
                vocabulary.append((t, f))
                cumulative += f
            else:
                oovs.append((t, f))

        oovs_str = ' '.join(f'{t}:{f}' for t, f in oovs)
        log.info(f'Excluded {len(oovs)} types as OOVs:\n{oovs_str}')
        log.info(f'Included {len(vocabulary)} types in the vocabulary; '
                 f'Coverage = {cumulative / total_toks:g}')
        # TODO: mapping should be List[int] with a one-to-one map
        types, indices = [], {}
        for typ, new_idx in cls.reserved():
            # reserved types must occupy the lowest indices, in order
            assert len(types) == new_idx
            types.append(typ)
            old_idx = dicto.indices.get(typ, -1)
            indices[typ] = [new_idx, old_idx]

        for typ, freq in vocabulary:
            # [new index, old index]
            indices[typ] = [len(types), dicto.indices.get(typ, -1)]
            types.append(typ)

        data = {'model_id': model_type, 'mapping': indices}
        with IO.writer(model_path) as wrtr:
            yaml.dump(data, wrtr)
        return cls(model_path)
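
The heart of Example #1 is the coverage cutoff: keep the most frequent BPE types until they account for tok_coverage of all running tokens, and treat the rest as OOVs. Below is a minimal, self-contained sketch of that step; the name select_by_coverage and the toy counts are illustrative, not part of the original project.

from collections import Counter

def select_by_coverage(freqs: Counter, coverage: float = 0.9999):
    """Keep the most frequent types until they cover `coverage` of all tokens.

    Illustrative helper, not part of the original project.
    """
    total = sum(freqs.values())
    vocab, oovs, cum = [], [], 0
    for typ, freq in sorted(freqs.items(), key=lambda x: x[1], reverse=True):
        if cum / total <= coverage:  # admit types until the threshold is crossed
            vocab.append((typ, freq))
            cum += freq
        else:
            oovs.append((typ, freq))
    return vocab, oovs

# toy usage: at 90% coverage 'a' and 'b' stay in; the rare 'z' becomes an OOV
vocab, oovs = select_by_coverage(Counter(a=90, b=9, z=1), coverage=0.9)
assert [t for t, _ in vocab] == ['a', 'b'] and [t for t, _ in oovs] == ['z']

The YAML mapping written at the end of the method then pairs each surviving type with [new index, old index], where the old index comes from the hub model's dictionary and -1 marks types that dictionary does not contain.
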
Example #2
    # assumed imports: pathlib.Path, typing's List and Optional,
    # the project's log and IO helpers, and the nlcodec package
    @classmethod
    def train(cls,
              model_type: str,
              vocab_size: int,
              model_path: str,
              files: List[str],
              no_split_toks: Optional[List[str]] = None,
              char_coverage: float = 0,
              dedup=True,
              spark=None):
        """
        :param model_type: word, char, bpe
        :param vocab_size: vocabulary size
        :param model_path: where to store vocabulary model
        :param files: text for creating vcabulary
        :param no_split_toks:
        :param char_coverage: character coverage (0, 1]. value <= 0 => default coverage
        :return:
        """
        assert not no_split_toks, 'not supported in nlcodec yet'
        from nlcodec import learn_vocab, term_freq
        # use nlcodec's default char_coverage unless a positive value was given
        kwargs = dict(char_coverage=char_coverage) if char_coverage > 0 else {}
        if not spark:
            inp = IO.get_liness(*files)
        else:
            # extract and store frequencies to this file
            stats_file = model_path + '.termfreqs'
            if not Path(stats_file).exists():
                log.info("Extracting term frequencies... ")
                paths = [f if isinstance(f, Path) else Path(f) for f in files]
                wfs, chfs, n_lines = term_freq.word_counts(paths=paths,
                                                           dedup=dedup,
                                                           spark=spark)
                log.info(f"Lines = {n_lines:,}, Word Types: {len(wfs):,}, "
                         f"Char Types: {len(chfs):,}")
                stats = chfs if model_type == 'char' else wfs
                log.info(f"Writing frequencies to {stats_file}")
                with IO.writer(stats_file) as out:
                    term_freq.write_stats(stats=stats,
                                          out=out,
                                          line_count=n_lines)
            # learn_vocab must know that inp holds precomputed term frequencies,
            # even when the stats file was cached by an earlier run
            kwargs['term_freqs'] = True
            inp = IO.get_lines(stats_file, delim='\n')

        learn_vocab(inp=inp,
                    level=model_type,
                    model=model_path,
                    vocab_size=vocab_size,
                    **kwargs)
        return cls(model_path)
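
For context, here is roughly what the nlcodec calls above do when driven directly, outside the class. The file names are illustrative; load_scheme, encode, and encode_str are nlcodec's documented loading/encoding entry points, assuming a recent nlcodec release.

from nlcodec import learn_vocab, load_scheme

# learn an 8000-type BPE vocabulary from a plain-text corpus
# ('train.txt' and 'bpe.8k.model' are illustrative paths)
with open('train.txt', encoding='utf-8') as inp:
    learn_vocab(inp=inp, level='bpe', model='bpe.8k.model', vocab_size=8000)

# load the trained scheme and segment new text
codec = load_scheme('bpe.8k.model')
pieces = codec.encode_str('hello world')  # subword pieces (strings)
ids = codec.encode('hello world')         # the same text as integer ids

When a Spark session is supplied instead, the same vocabulary is learned from a cached term-frequency file rather than raw lines, which is what the term_freqs=True flag signals to learn_vocab.
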