Example #1
import os
from pathlib import Path

from sacremoses import MosesTruecaser


class UntrainedModel(Exception):
    """Minimal stand-in for the project's exception so the snippet runs."""


class Truecaser:
    def __init__(self, language, path):
        self.language = language
        # Moses naming convention: model.truecase.<lang>
        self.model = Path(f"{path}/model.truecase.{language}")

        if self.trained:
            self.truecaser = MosesTruecaser(self.model)
        else:
            self.truecaser = MosesTruecaser()

    def __repr__(self):
        return f"Truecaser({self.language})"

    @property
    def trained(self):
        return os.path.isfile(self.model)

    def __call__(self, line):
        if self.trained:
            toks = self.truecaser.truecase(line)
            string = " ".join(toks)
            return string.strip()
        else:
            raise UntrainedModel("Truecaser not trained")

    def train(self, filename):
        if not os.path.isfile(self.model):
            self.truecaser.train_from_file(
                filename, save_to=self.model
            )
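A minimal usage sketch of this class, assuming a tokenized training file at the hypothetical path corpus.tok.en and a models/ directory:

tc = Truecaser("en", "models")
if not tc.trained:
    tc.train("corpus.tok.en")  # trains and writes models/model.truecase.en
print(tc("the european union said on monday"))  # truecased, space-joined tokens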
Example #2
def train_truecase(corpus: iCorpus, save_to: str, threads=1) -> None:
    truecaser = MosesTruecaser()
    # Load and tokenize the corpus up front; train() expects a list of token lists.
    data = [line.split(' ') for line in corpus]
    truecaser.train(data,
                    save_to=save_to,
                    possibly_use_first_token=True,
                    processes=threads,
                    progress_bar=True)
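A quick invocation, with an in-memory list standing in for iCorpus (any iterable of pre-tokenized lines works; the output path is an assumption):

corpus = ["the Quick brown Fox", "the quick Brown fox"]
train_truecase(corpus, save_to="toy.truecasemodel", threads=1)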
Example #3
import sys

from sacremoses import MosesTokenizer, MosesTruecaser


def preprocess(source_lang, tcmodel, escape):
    mtok = MosesTokenizer(lang=source_lang)
    mtr = MosesTruecaser(tcmodel)
    sys.stderr.write("model loaded\n")
    for line in sys.stdin:
        tokenized = mtok.tokenize(line, escape=escape)
        truecased = mtr.truecase(" ".join(tokenized))
        sys.stderr.write("sentence processed\n")
        sys.stdout.buffer.write((" ".join(truecased) + "\n").encode("utf-8"))
        sys.stdout.flush()
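Since the function reads stdin and writes stdout, a thin wrapper like this hypothetical one (flag names are assumptions) turns the snippet into a shell filter:

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", default="en")
    parser.add_argument("--tcmodel", required=True)
    parser.add_argument("--no-escape", dest="escape", action="store_false")
    args = parser.parse_args()
    preprocess(args.lang, args.tcmodel, args.escape)

# Usage: cat input.txt | python preprocess.py --tcmodel model.tc.en > output.tok.tc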
Example #4
def create_bpe_vocabulary(bpe_vocab_fp, bpe_int_fp, data_path, truecaser_fp):
    """Creates vocabulary that is used to map BPEs to ids and vice-versa.

    It iterates over data, performs tokenization with the BPE tokenizer and
    true caser and creates a vocabulary of unique symbols that is saved.

    Args:
        bpe_vocab_fp: path to the new vocabulary that will be created.
        data_path: path to data with text.
        bpe_int_fp: internal file with BPEs.
        truecaser_fp: self-explanatory.
    """
    bpe = BPE(glossaries=SPECIAL_TOKENS)
    bpe.load(bpcodes_fp=bpe_int_fp, merges=-1)

    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())

    #   PIPELINES AND VOCAB   #

    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False,
                                             tok_func=unsup_tok_func)
    subword_vocab = Vocabulary(vocab_pipeline,
                               name_prefix="word",
                               special_tokens=SPECIAL_TOKENS)
    subword_vocab.create(data_source={"data_path": data_path},
                         max_size=None,
                         data_fnames=InpDataF.REV_TEXT)
    subword_vocab.write(bpe_vocab_fp, sep=' ')
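A call might look like the following; every path is hypothetical, and BPE, Vocabulary, and assemble_vocab_pipeline come from the surrounding project:

create_bpe_vocabulary(bpe_vocab_fp="artifacts/bpe.vocab",
                      bpe_int_fp="artifacts/bpe.codes",
                      data_path="data/train/",
                      truecaser_fp="artifacts/tcaser.model")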
Example #5
def train_and_save_true_casing_model(input_fps, text_fname, output_fp):
    """Trains the Moses model on tokenized csv files; saves params."""
    mtr = MosesTruecaser(is_asr=True)
    reader = CsvReader(quoting=QUOTE_NONE,
                       sep='\t',
                       engine='python',
                       encoding='utf-8')
    texts = []
    logger.info("Loading data from: '%s'." % input_fps)
    for dc in reader.iter(data_path=input_fps):
        for du in dc.iter():
            texts.append(du[text_fname].split())
    logger.info("Loaded the data.")
    safe_mkfdir(output_fp)
    logger.info("Training the truecaser.")
    mtr.train(texts, save_to=output_fp, progress_bar=True, processes=1)
    logger.info("Done, saved the model to: '%s'." % output_fp)
Example #6
class Truecaser(BatchProcessor):

    def __init__(self, model):
        self.handler = MosesTruecaser(load_from=model)

    def process(self, text):
        # return_str=True yields the truecased sentence as a single string
        # rather than a list of tokens.
        return self.handler.truecase(text, return_str=True)
Example #7
class Detruecaser(BatchProcessor):

    def __init__(self):
        # Detruecasing is provided by MosesDetruecaser in sacremoses.
        self.handler = MosesDetruecaser()

    def process(self, text, is_headline=False):
        # Forward the flag instead of hard-coding it to False.
        return self.handler.detruecase(text, is_headline=is_headline, return_str=True)
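A round trip through both processors might look like this; the model path is an assumption:

tc = Truecaser("model.truecase.en")
dtc = Detruecaser()
cased = tc.process("the european union meets on monday")  # e.g. "the European Union ..."
print(dtc.process(cased))  # first word re-capitalized: "The European Union ..."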
Example #8
    def __train_truecase(self):
        """
        :infile: path to the train data.
        return a model in modelfile.
        """
        sentences = self.sentences
        if self.infile:
            with open(self.infile, 'r',
                      encoding=which_encoding(self.infile)) as train_file:
                sentences = train_file.readlines()

        assert sentences, "No training sentences found."
        sentences = [sentence.strip().split() for sentence in sentences]
        mtr = MosesTruecaser()
        mtr.train(sentences,
                  save_to=self.modelfile,
                  processes=20,
                  progress_bar=True)
        return mtr
Example #9
def create_word_vocab(vocab_fp, data_path, truecaser_fp):
    """Creates a vocabulary using a vocabulary specific pipeline."""
    tcaser = MosesTruecaser(load_from=truecaser_fp, is_asr=True)
    tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)
    tok_func = lambda x: tcase_func(x).split()
    vocab_pipeline = assemble_vocab_pipeline(text_fname=InpDataF.REV_TEXT,
                                             lowercase=False, tok_func=tok_func)
    word_vocab = Vocabulary(vocab_pipeline, name_prefix="word",
                            special_tokens=SPECIAL_TOKENS)

    word_vocab.create(data_source={'data_path': data_path},
                      data_fnames=InpDataF.REV_TEXT)
    word_vocab.write(vocab_fp, sep=' ')
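As with the BPE variant above, a call is just the three paths (all hypothetical):

create_word_vocab(vocab_fp="artifacts/words.txt",
                  data_path="data/train/",
                  truecaser_fp="artifacts/tcaser.model")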
Example #10
def get_truecase_preprocessor():
    """
    get sacremoses truecase processor
    Note: 此预处理器必须用在mosestokenize处理器之后
    >>> text = "How are you"
    >>> get_truecase_preprocessor()(text) if "truecase_model" in global_config else "how are you"
    'how are you'
    """
    from sacremoses import MosesTruecaser
    truecase_model = global_config["truecase_model"]
    mtr = MosesTruecaser(truecase_model)

    def preprocessor(line):
        return mtr.truecase(line, return_str=True)

    return preprocessor
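Assuming global_config is a plain dict defined in the surrounding module, usage looks like:

global_config["truecase_model"] = "model.truecase.en"  # hypothetical model path
truecase = get_truecase_preprocessor()
print(truecase("How are you"))  # e.g. 'how are you' once sentence-initial casing is learned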
Example #11
    def __init__(self):
        super(RunHP, self).__init__()

        #   GENERAL  #
        self.seed = 42
        self.cuda_device_id = 6
        self.device = 'cpu'  # 'cuda' or 'cpu'
        self.training_logging_step = 50  # how often to print internal metrics
        self.epochs = 10  # if set to 0, skips training and goes straight to evaluation
        self.learning_rate = 0.0005
        self.grads_clip = 0.25

        # GENERAL DATA RELATED #
        self.dataset = 'yelp'
        self.train_max_groups_per_batch = 6
        self.val_max_groups_per_batch = 13
        self.eval_max_groups_per_batch = 20
        self.max_rev_per_group = 8

        #   DATA SOURCES  #
        # `early_term` limits the number of chunks per epoch
        self.train_early_term = None
        self.val_early_term = None
        self.gener_early_term = 2

        #  GENERAL PATHS   #
        self.root_path = 'copycat'
        self.experiments_folder = 'first_run'
        self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}'
        self.checkpoint_full_fn = 'checkpoint.tar'
        epc = ExperimentsPathController()
        self.output_path = epc(self.output_dir)
        self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
        self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

        #   DATA PATHS  #
        self.base_data_path = f'data/{self.dataset}/'
        self.train_fp = comb_paths(self.base_data_path, "split/train/")
        self.val_fp = comb_paths(self.base_data_path, 'split/val/')
        self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
        self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
        self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

        #   ANNEALING   #
        self.c_m = 8.
        self.c_r = 0.8
        self.c_kl_ann_max_val = 1.
        self.c_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000
        self.z_m = 8.
        self.z_c = 0.8
        self.z_kl_ann_max_val = 1.
        self.z_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000

        #   DECODING/GENERATION  #
        self.beam_size = 5
        self.beam_len_norm = True
        self.beam_excl_words = []
        self.block_ngram_repeat = 3  # or None
        self.ngram_mirror_window = 3  # or None
        self.mirror_conjs = ["and", 'or', ',', 'but']  # or None
        self.block_consecutive = True
        self.min_gen_seq_len = 20

        #   POST-PROCESSING AND ANALYTICS #
        mt = MosesTokenizer()
        self.tok_func = partial(mt.tokenize, escape=False)
        self.sent_split_func = nltk.sent_tokenize
        dt = MosesDetokenizer()
        self.detok_func = partial(dt.detokenize, unescape=False)
        true_caser = MosesTruecaser(load_from=self.tcaser_model_path,
                                    is_asr=True)
        self.true_case_func = partial(true_caser.truecase,
                                      return_str=True,
                                      use_known=True)
        self.analytics_func = partial(ngram_seq_analysis,
                                      tokenizer=self.tok_func,
                                      sent_splitter=self.sent_split_func,
                                      n_grams_to_comp=(2, 3, 4))
Example #12
        'train': uns_train_data_source,
        'val': uns_val_data_source
    },
    PLUGIN_TUNING: {
        'train': gold_train_data_source,
        'val': gold_val_data_source
    },
    JOINT_TUNING: {
        'train': gold_train_data_source,
        'val': gold_val_data_source
    }
}

#   TRUECASER   #

tcaser = MosesTruecaser(load_from=run_conf.tcaser_model_path, is_asr=True)
tcase_func = partial(tcaser.truecase, return_str=True, use_known=True)

#   WORD TOKENIZERS / DE-TOKENIZERS   #

mt = MosesTokenizer()
dt = MosesDetokenizer()

#   SUB-WORD TOKENIZER   #

bpe = BPE(glossaries=SPECIAL_TOKENS)
bpe.load(bpcodes_fp=run_conf.bpe_fp)

unsup_tok_func = lambda x: bpe.tokenize(tcase_func(x).split())
gold_tok_func = lambda x: bpe.tokenize(mt.tokenize(tcase_func(x), escape=False))
Example #13
    def __init__(self, exp_path):
        self.exp_path = exp_path
        self.load_experiment()
        self.detokenizer = MosesDetokenizer(lang='en')
        self.truecaser = MosesTruecaser(load_from='sm.cnndm.tc.model')
Example #14
from sacremoses import MosesTruecaser, MosesTokenizer

mtr = MosesTruecaser()
mtok = MosesTokenizer(lang='en')

# Train on the tokenized documents and save the model via `save_to`
tokenized_docs = [mtok.tokenize(line) for line in open('fi')]
mtr.train(tokenized_docs, save_to='cnndm.truecasemodel')
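Once saved, the model can be reloaded and applied without retraining; the sentence is illustrative:

mtr = MosesTruecaser(load_from='cnndm.truecasemodel')
print(mtr.truecase("the guardian reported on tuesday", return_str=True))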
Example #15
    def __load_truecaser(self):
        """
        Load the model file used for truecasing.
        The model is loaded into the `distribution_words` attribute.
        """
        return MosesTruecaser(self.modelfile)
Example #16
# import fasttext

# model = fasttext.train_supervised(input="./textos.txt")

from sacremoses import MosesTruecaser, MosesTokenizer

mtr = MosesTruecaser()
# `train` expects tokenized documents; to train directly from a raw
# text file, use `train_from_file`.
mtr.train_from_file('./textos.txt')
mtr.save_model('big.truecasemodel')

a = mtr.truecase("a seleção brasileira não fez mas o pessoal de tambiá", return_str=True)
print(a)
Example #17
# Module-level cache so each truecase model is loaded at most once.
lazy_objects = {}


def _lazy_load_moses_truecaser(load_from):
    global lazy_objects
    if load_from not in lazy_objects:
        lazy_objects[load_from] = MosesTruecaser(load_from=load_from)
    return lazy_objects[load_from]
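Repeated calls with the same path then reuse a single loaded model (the path is an assumption):

a = _lazy_load_moses_truecaser("model.truecase.en")
b = _lazy_load_moses_truecaser("model.truecase.en")
assert a is b  # the second call hits the cache instead of reloading the file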
Example #18
             "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
             "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that",
             "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
             "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
             "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through",
             "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
             "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
             "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
             "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
stopwords = set(stopwords)
OLD_ENGLISH = {"thy": "your", "thou": "you", "Thy": "Your", "Thou": "You"}

# moses tokenizer
from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer, MosesDetruecaser
mtok = MosesTokenizer(lang='en')
mtr = MosesTruecaser("vocab/truecase-model.en")
md = MosesDetokenizer(lang="en")
mdtr = MosesDetruecaser()

# bpe tokenizer
from subword_nmt.apply_bpe import BPE, read_vocabulary
vocabulary = read_vocabulary(codecs.open("vocab/vocab.bpe35000.chr", encoding='utf-8'), 10)
bpe = BPE(codes=codecs.open("vocab/codes_file_chr_35000", encoding='utf-8'), merges=35000, vocab=vocabulary)

# load nmt models
import onmt.opts
from translator_for_demo import build_translator
from onmt.utils.parse import ArgumentParser


def _parse_opt(opt):
Example #19
from oauth2client.client import GoogleCredentials
from six.moves import input  # pylint: disable=redefined-builtin

from tensor2tensor import problems as problems_lib  # pylint: disable=unused-import
from tensor2tensor.serving import serving_utils
from tensor2tensor.utils import registry
from tensor2tensor.utils import usr_dir
from tensor2tensor.utils.hparam import HParams
import tensorflow as tf

import nltk
import re
from datetime import datetime

from sacremoses import MosesTruecaser, MosesTokenizer, MosesDetokenizer


mtr = MosesTruecaser('truecase_model/en-truecase.mdl')
mtok = MosesTokenizer()
mdtk = MosesDetokenizer()
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Matches a CJK character (or comma/space) on each side of stray whitespace,
# used to collapse spaces inserted between Chinese characters.
pattern = re.compile(r'([\u4e00-\u9fa5, ]{1})\s+([\u4e00-\u9fa5, ]{1})')


flags = tf.flags
FLAGS = flags.FLAGS
#t2t-query-server --t2t_usr_dir=./user_dir --data_dir=./data --problem=translate_enzh_sub32k --servable_name=my_model --server=localhost:8501

flags.DEFINE_string("server", "localhost:8501", "Address to Tensorflow Serving server.")
flags.DEFINE_string("servable_name", "my_model", "Name of served model.")
flags.DEFINE_string("problem", "translate_enzh_sub50k", "Problem name.")
flags.DEFINE_string("data_dir", "./data", "Data directory, for vocab files.")
flags.DEFINE_string("t2t_usr_dir", "./user_dir", "Usr dir for registrations.")