Ejemplo n.º 1
0
class Truecaser:
    """Per-language wrapper around a ``MosesTruecaser`` model stored on disk.

    The model file is expected at ``{path}/model.truecase.{language}``. When
    that file already exists it is handed to ``MosesTruecaser`` at
    construction time; otherwise an empty truecaser is created that can be
    trained later via :meth:`train`.
    """

    def __init__(self, language, path):
        """Store the language and model location and build the truecaser.

        Args:
            language: Language identifier; used as the model file suffix.
            path: Directory that holds (or will hold) the model file.
        """
        self.language = language
        self.model = Path(f"{path}/model.truecase.{language}")

        # Only pass the model file to MosesTruecaser when it exists.
        self.truecaser = (
            MosesTruecaser(self.model) if self.trained else MosesTruecaser()
        )

    def __repr__(self):
        return f"Truecaser({self.language})"

    @property
    def trained(self):
        """Whether the model file currently exists on disk."""
        return os.path.isfile(self.model)

    def __call__(self, line):
        """Truecase ``line`` and return it as a single stripped string.

        Raises:
            UntrainedModel: If the model file does not exist.
        """
        if not self.trained:
            raise UntrainedModel("Truecaser not trained")

        tokens = self.truecaser.truecase(line)
        return " ".join(tokens).strip()

    def train(self, filename):
        """Train from ``filename`` and save the model to ``self.model``.

        Does nothing when the model file already exists.
        """
        if self.trained:
            return
        self.truecaser.train_from_file(filename, save_to=self.model)
Ejemplo n.º 2
0
class Truecaser(BatchProcessor):
	"""``BatchProcessor`` subclass that truecases text via ``MosesTruecaser``."""

	def __init__(self, model):
		"""Create the underlying truecaser.

		Args:
			model: Passed to ``MosesTruecaser`` as its ``load_from`` argument.
		"""
		self.handler = MosesTruecaser(load_from=model)

	def process(self, input):
		"""Truecase ``input`` and return the result as a single string.

		The text is UTF-8 encoded (undecodable/unencodable characters are
		ignored) before being handed to the truecaser, and the truecaser's
		string result is decoded from UTF-8 the same way.
		"""
		# The original line had a stray ")" after the encode() call, which
		# closed truecase() early and made the file a SyntaxError;
		# return_str=True belongs to the truecase() call.
		# NOTE(review): the encode/decode round trip looks like Python 2-era
		# text handling; on Python 3 encode() yields bytes -- confirm the
		# handler accepts and returns bytes before relying on this path.
		encoded = input.encode("utf-8", "ignore")
		truecased = self.handler.truecase(encoded, return_str=True)
		return truecased.decode("utf-8", "ignore")
Ejemplo n.º 3
0
def preprocess(source_lang, tcmodel, escape):
	"""Tokenize and truecase every line of stdin, writing results to stdout.

	Args:
		source_lang: Language code given to ``MosesTokenizer`` as ``lang``.
		tcmodel: Argument given to ``MosesTruecaser`` (the truecasing model).
		escape: Forwarded to ``MosesTokenizer.tokenize`` as ``escape``.

	Progress messages go to stderr. Each processed line is written to
	stdout's underlying byte buffer as UTF-8 and flushed immediately.
	"""
	tokenizer = MosesTokenizer(lang=source_lang)
	truecaser = MosesTruecaser(tcmodel)
	sys.stderr.write("model loaded\n")

	for raw_line in sys.stdin:
		token_text = " ".join(tokenizer.tokenize(raw_line, escape=escape))
		cased_tokens = truecaser.truecase(token_text)
		sys.stderr.write("sentence processed\n")

		# Write bytes directly so the output is UTF-8 regardless of the
		# text encoding configured for sys.stdout.
		payload = (" ".join(cased_tokens) + "\n").encode("utf-8")
		sys.stdout.buffer.write(payload)
		sys.stdout.flush()
Ejemplo n.º 4
0
# import fasttext

# model = fasttext.train_supervised(input="./textos.txt")

from sacremoses import MosesTruecaser, MosesTokenizer

# Train a truecasing model from a plain-text corpus, save it, and try it out.
mtr = MosesTruecaser()
# train() expects an iterable of already-tokenized sentences; handing it a
# path string would iterate over the characters of the path instead of the
# file contents. train_from_file() opens the file and splits each line.
mtr.train_from_file('./textos.txt')
mtr.save_model('big.truecasemodel')

# return_str=True yields one string instead of a list of tokens.
a = mtr.truecase("a seleção brasileira não fez mas o pessoal de tambiá", return_str=True)
print(a)