Example #1
def normalize_file(
    language, processes, normalize_quote_commas, normalize_numbers, encoding
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
    )
    moses_normalize = partial(moses.normalize)

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually moses_normalize(fin.read()) gives the same output
                #       and it's a lot better but it's inconsistent with the other
                #       preprocessing interfaces, so we're doing it line by line here.
                for line in tqdm(fin.readlines()):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(moses_normalize(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_normalize, fin.readlines(), processes, progress_bar=True
                ):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(outline, end="", file=fout)
Example #2
def tokenize_file(language, processes, xml_escape, aggressive_dash_splits,
                  protected_patterns, custom_nb_prefixes, encoding, quiet):
    moses = MosesTokenizer(lang=language,
                           custom_nonbreaking_prefixes_file=custom_nb_prefixes)

    if protected_patterns:
        with open(protected_patterns, encoding="utf8") as fin:
            protected_patterns = [
                pattern.strip() for pattern in fin.readlines()
            ]

    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )

    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_tokenize(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                        moses_tokenize,
                        fin.readlines(),
                        processes,
                        progress_bar=(not quiet)):
                    print(outline, end="\n", file=fout)
Example #3
def parallel_or_not(iterator, func, processes, quiet):
    # Apply func to each line serially when processes == 1; otherwise fan the
    # work out across worker processes with parallelize_preprocess.
    if processes == 1:
        for line in iterator:
            yield func(line)
    else:
        for outline in parallelize_preprocess(
            func, iterator, processes, progress_bar=(not quiet)
        ):
            yield outline
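A quick way to exercise this helper, assuming the function above is in scope; str.upper stands in for any per-line preprocessor, and processes=1 keeps it on the serial path.

lines = ["hello world\n", "sacremoses example\n"]
for out in parallel_or_not(iter(lines), str.upper, processes=1, quiet=True):
    print(out, end="")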
Example #4
 # Relies on `processes`, `moses_tokenize`, and `quiet` from the enclosing scope.
 def processor(iterator):
     if processes == 1:
         for line in list(iterator):
             yield moses_tokenize(line)
     else:
         for outline in parallelize_preprocess(
             moses_tokenize, list(iterator), processes, progress_bar=(not quiet)
         ):
             yield outline
Example #5
def detokenize_file(language, processes, xml_unescape, encoding):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=True
                ):
                    print(outline, end="\n", file=fout)
Example #6
def convert_chinese(t2s, processes, encoding):
    convert = simplify if t2s else tradify
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually convert(fin.read()) gives the same output
                #       and it's a lot better but it's inconsistent with the other
                #       preprocessing interfaces, so we're doing it line by line here.
                for line in tqdm(fin.readlines()):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(convert(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(convert, fin.readlines(), processes, progress_bar=True):
                    # Note: not stripping newlines, so don't need end='\n' when printing to stdout.
                    print(outline, end="", file=fout)
Example #7
def detruecase_file(processes, is_headline, encoding):
    moses = MosesDetruecaser()
    moses_detruecase = partial(
        moses.detruecase, return_str=True, is_headline=is_headline
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detruecase(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_detruecase, fin.readlines(), processes, progress_bar=True
                ):
                    print(outline, end="\n", file=fout)
Example #8
def tokenize_file(processes, xml_escape, aggressive_dash_splits):
    moses = MosesTokenizer()
    moses_tokenize = partial(moses.tokenize,
                             return_str=True,
                             aggressive_dash_splits=aggressive_dash_splits,
                             escape=xml_escape)

    with click.get_text_stream('stdin') as fin, click.get_text_stream(
            'stdout') as fout:
        # If it's single process, joblib parallelization is slower,
        # so just process line by line normally.
        if processes == 1:
            for line in tqdm(fin.readlines()):
                print(moses_tokenize(line), end='\n', file=fout)
        else:
            for outline in parallelize_preprocess(moses_tokenize,
                                                  fin.readlines(),
                                                  processes,
                                                  progress_bar=True):
                print(outline, end='\n', file=fout)
Example #9
    def _train(
        self,
        document_iterator,
        save_to=None,
        possibly_use_first_token=False,
        processes=1,
        progress_bar=False,
    ):
        """
        :param document_iterator: The input document, each outer list is a sentence,
                          the inner list is the list of tokens for each sentence.
        :type document_iterator: iter(list(str))

        :param possibly_use_first_token: When True, on the basis that the first
            word of a sentence is always capitalized; if this option is provided then:
            a) if a sentence-initial token is *not* capitalized, then it is counted, and
            b) if a capitalized sentence-initial token is the only token of the segment,
               then it is counted, but with only 10% of the weight of a normal token.
        :type possibly_use_first_token: bool

        :returns: A dictionary of the best, known objects as values from `_casing_to_model()`
        :rtype: {'best': dict, 'known': Counter}
        """
        casing = defaultdict(Counter)
        train_truecaser = partial(
            self.learn_truecase_weights,
            possibly_use_first_token=possibly_use_first_token,
        )
        token_weights = chain(
            *parallelize_preprocess(
                train_truecaser, document_iterator, processes, progress_bar=progress_bar
            )
        )
        # Collect the token_weights from every sentence.
        for lowercase_token, surface_token, weight in token_weights:
            casing[lowercase_token][surface_token] += weight

        # Save to file if specified.
        if save_to:
            self._save_model_from_casing(casing, save_to)
        return self._casing_to_model(casing)
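In practice this private method is reached through MosesTruecaser's public training API rather than called directly. A minimal sketch, assuming the public train() method accepts the same document shape and forwards these keyword arguments (worth verifying against the installed version):

from sacremoses import MosesTruecaser

mtr = MosesTruecaser()
# Each document is a list of tokens, matching document_iterator above.
docs = [
    "The United Nations met in New York .".split(),
    "the meeting was short .".split(),
]
mtr.train(docs, possibly_use_first_token=True, processes=1, progress_bar=False)
print(mtr.truecase("the united nations met again .", return_str=True))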