def normalize_file(
    language, processes, normalize_quote_commas, normalize_numbers, encoding
):
    moses = MosesPunctNormalizer(
        language,
        norm_quote_commas=normalize_quote_commas,
        norm_numbers=normalize_numbers,
    )
    moses_normalize = partial(moses.normalize)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually moses_normalize(fin.read()) gives the same output
                # and it's a lot faster, but it's inconsistent with the other
                # preprocessing interfaces, so we're doing it line by line here.
                for line in tqdm(fin.readlines()):
                    # Note: not stripping newlines, so no end='\n' needed when printing to stdout.
                    print(moses_normalize(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_normalize, fin.readlines(), processes, progress_bar=True
                ):
                    # Note: not stripping newlines, so no end='\n' needed when printing to stdout.
                    print(outline, end="", file=fout)
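# Usage sketch (illustrative, not part of the CLI above): the same
# MosesPunctNormalizer object that normalize_file() wraps can be called
# directly. The sample string and expected output are assumptions, not
# fixtures from sacremoses.
from sacremoses import MosesPunctNormalizer

mpn = MosesPunctNormalizer(lang="en")
# Fancy quotes become ASCII quotes and the stray space before the comma is removed.
print(mpn.normalize("This is «Moses» , ok?"))  # e.g. 'This is "Moses", ok?'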
def tokenize_file(
    language,
    processes,
    xml_escape,
    aggressive_dash_splits,
    protected_patterns,
    custom_nb_prefixes,
    encoding,
    quiet,
):
    moses = MosesTokenizer(
        lang=language, custom_nonbreaking_prefixes_file=custom_nb_prefixes
    )
    if protected_patterns:
        with open(protected_patterns, encoding="utf8") as fin:
            protected_patterns = [pattern.strip() for pattern in fin.readlines()]
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
        protected_patterns=protected_patterns,
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # tqdm's `disable` mirrors progress_bar=(not quiet) below.
                for line in tqdm(fin.readlines(), disable=quiet):
                    print(moses_tokenize(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_tokenize,
                    fin.readlines(),
                    processes,
                    progress_bar=(not quiet),
                ):
                    print(outline, end="\n", file=fout)
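# Usage sketch for protected_patterns (illustrative; the regex and sample text
# are assumptions). Spans matching a protected pattern are masked before
# tokenization, so they survive as single tokens:
from sacremoses import MosesTokenizer

mt = MosesTokenizer(lang="en")
print(
    mt.tokenize(
        "Contact info@example.com today!",
        protected_patterns=[r"\S+@\S+"],  # keep e-mail addresses intact
        return_str=True,
    )
)  # e.g. 'Contact info@example.com today !'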
def parallel_or_not(iterator, func, processes, quiet):
    if processes == 1:
        for line in iterator:
            yield func(line)
    else:
        for outline in parallelize_preprocess(
            func, iterator, processes, progress_bar=(not quiet)
        ):
            yield outline
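# Usage sketch (illustrative; str.lower and the sample lines are assumptions).
# parallel_or_not() hides the single-process vs. joblib branching from callers:
lines = ["FOO\n", "BAR\n"]
for out in parallel_or_not(iter(lines), str.lower, processes=1, quiet=True):
    print(out, end="")  # foo / bar, newlines preserved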
def processor(iterator):
    # Closure: `processes`, `moses_tokenize`, and `quiet` are captured from
    # the enclosing scope.
    if processes == 1:
        for line in iterator:
            yield moses_tokenize(line)
    else:
        # parallelize_preprocess needs a materialized list for joblib.
        for outline in parallelize_preprocess(
            moses_tokenize, list(iterator), processes, progress_bar=(not quiet)
        ):
            yield outline
def detokenize_file(language, processes, xml_unescape, encoding):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(
        moses.detokenize, return_str=True, unescape=xml_unescape
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=True
                ):
                    print(outline, end="\n", file=fout)
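# Usage sketch (illustrative; the sample tokens are an assumption). This is the
# per-line round trip detokenize_file() performs: split on whitespace, then let
# MosesDetokenizer re-attach punctuation:
from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
print(md.detokenize("Hello , world !".split()))  # e.g. 'Hello, world!'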
def convert_chinese(t2s, processes, encoding):
    convert = simplify if t2s else tradify
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                # TODO: Actually convert(fin.read()) gives the same output
                # and it's a lot faster, but it's inconsistent with the other
                # preprocessing interfaces, so we're doing it line by line here.
                for line in tqdm(fin.readlines()):
                    # Note: not stripping newlines, so no end='\n' needed when printing to stdout.
                    print(convert(line), end="", file=fout)
            else:
                for outline in parallelize_preprocess(
                    convert, fin.readlines(), processes, progress_bar=True
                ):
                    # Note: not stripping newlines, so no end='\n' needed when printing to stdout.
                    print(outline, end="", file=fout)
def detruecase_file(processes, is_headline, encoding):
    moses = MosesDetruecaser()
    moses_detruecase = partial(
        moses.detruecase, return_str=True, is_headline=is_headline
    )
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # If it's a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detruecase(line), end="\n", file=fout)
            else:
                for outline in parallelize_preprocess(
                    moses_detruecase, fin.readlines(), processes, progress_bar=True
                ):
                    print(outline, end="\n", file=fout)
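# Usage sketch (illustrative; the sample sentence is an assumption).
# Detruecasing restores sentence-initial capitalization, which is exactly what
# detruecase_file() applies line by line:
from sacremoses import MosesDetruecaser

mdt = MosesDetruecaser()
print(mdt.detruecase("the adventures of Sherlock Holmes", return_str=True))
# e.g. 'The adventures of Sherlock Holmes'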
def tokenize_file(processes, xml_escape, aggressive_dash_splits):
    moses = MosesTokenizer()
    moses_tokenize = partial(
        moses.tokenize,
        return_str=True,
        aggressive_dash_splits=aggressive_dash_splits,
        escape=xml_escape,
    )
    with click.get_text_stream('stdin') as fin, click.get_text_stream(
        'stdout'
    ) as fout:
        # If it's a single process, joblib parallelization is slower,
        # so just process line by line normally.
        if processes == 1:
            for line in tqdm(fin.readlines()):
                print(moses_tokenize(line), end='\n', file=fout)
        else:
            for outline in parallelize_preprocess(
                moses_tokenize, fin.readlines(), processes, progress_bar=True
            ):
                print(outline, end='\n', file=fout)
def _train(
    self,
    document_iterator,
    save_to=None,
    possibly_use_first_token=False,
    processes=1,
    progress_bar=False,
):
    """
    :param document_iterator: The input corpus; each item is one sentence,
        represented as a list of tokens.
    :type document_iterator: iter(list(str))

    :param possibly_use_first_token: When True, exploit the fact that the
        first word of a sentence is always capitalized: a) if a
        sentence-initial token is *not* capitalized, it is counted, and
        b) if a capitalized sentence-initial token is the only token of the
        segment, it is counted, but with only 10% of the weight of a normal
        token.
    :type possibly_use_first_token: bool

    :returns: A dictionary with the 'best' and 'known' casings as values,
        built by `_casing_to_model()`.
    :rtype: {'best': dict, 'known': Counter}
    """
    casing = defaultdict(Counter)
    train_truecaser = partial(
        self.learn_truecase_weights,
        possibly_use_first_token=possibly_use_first_token,
    )
    token_weights = chain(
        *parallelize_preprocess(
            train_truecaser, document_iterator, processes, progress_bar=progress_bar
        )
    )
    # Collect the token weights from every sentence.
    for lowercase_token, surface_token, weight in token_weights:
        casing[lowercase_token][surface_token] += weight
    # Save to file if specified.
    if save_to:
        self._save_model_from_casing(casing, save_to)
    return self._casing_to_model(casing)
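# Usage sketch (illustrative). _train() is private; callers normally go through
# MosesTruecaser.train(), which forwards to it. The toy corpus and the expected
# output are assumptions, not sacremoses fixtures:
from sacremoses import MosesTruecaser

corpus = [
    "you can download Moses from the website .".split(),
    "we trained Moses on this corpus .".split(),
]
mtr = MosesTruecaser()
mtr.train(corpus)  # each document is a list of tokens, as _train() expects
print(mtr.truecase("moses was trained .", return_str=True))
# e.g. 'Moses was trained .'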