def evaluate(model, pre_wordscount, post_wordscount, lang, correction, trace, crash, max_pred):
    """Run *model* over every "t-*.<lang>" file found in `test` and write the
    per-test-type stats (plus the model's in-memory size) to one result file.

    NOTE(review): the `crash` argument is accepted but never used in this body
    — kept for signature compatibility with callers.
    """
    size = f"# Size of model in memory: {model.sizemo} Mo\n\n"
    stats = {}
    tracetxt = "(trace on)" if trace else ""
    # Evaluate the model on every test file, collecting one stats entry per type.
    print(f">>> Evaluating {tracetxt}: ", end="")
    for file in os.listdir(test):
        # Only files shaped like "t-...-<type>.<lang>" are test inputs.
        if not (file.startswith("t-") and file.endswith(lang)):
            continue
        test_type = file.split("-")[-1].split(".")[0]
        testfile = f"{test}{file}"
        print(test_type, end=" ")
        (content, encoding) = fread(testfile)
        reader = NMask(content, pre_wordscount, post_wordscount)
        stats[test_type] = _evaluate(model, reader, correction[test_type], trace, max_pred)
        if trace:
            write_ans(testfile, test_type, lang, encoding)
    now = datetime.datetime.now()
    # Write the results in a file (1 file per model / language / context shape).
    modelname = model.__class__.__name__
    outname = f"{results}{modelname}_{lang}_{pre_wordscount}c{post_wordscount}_{max_pred}p.txt"
    with open(outname, "w") as result:
        desc = f"# Results for {modelname} in {lang} with {max_pred} predictions and this context: {'_ '*pre_wordscount}<unk/> {'_ '*post_wordscount}\n"
        date = f"\n# {now.day}/{now.month}/{now.year} {now.hour}:{now.minute}:{now.second}"
        result.write(desc + size + yaml.dump(stats, default_flow_style=False) + date)
    print()
def evaluate(model, pre_wordscount, post_wordscount, lang, correction, trace, max_pred):
    """Run *model* over every "t-*.<lang>" file found in `test` and write the
    per-test-type stats to one result file per model.

    The original body triplicated the open/read/NMask logic in nested
    try/excepts with inconsistent exception types (UnicodeDecodeError on the
    first attempt, UnicodeError on the second); collapsed here into a single
    fallback loop. UnicodeDecodeError is a subclass of UnicodeError, and
    iso-8859-1 maps every byte, so the last attempt cannot raise.
    """
    files = os.listdir(test)
    stats = {}
    if trace:
        tracetxt = "(trace on)"
    else:
        tracetxt = ""
    # Evaluate the model on every test file, collecting one stats entry per type.
    print(f">>> Evaluating {tracetxt}: ", end="")
    for file in files:
        if file.startswith("t-") and file.endswith(lang):
            test_type = file.split("-")[-1].split(".")[0]
            testfile = f"{test}{file}"
            print(test_type, end=" ")
            # Try each encoding in turn; the last (iso-8859-1) always succeeds.
            for encoding in ("utf-8", "utf-16", "iso-8859-1"):
                try:
                    with open(testfile, "r", encoding=encoding) as f:
                        reader = NMask(f.read(), pre_wordscount, post_wordscount)
                    break
                except UnicodeError:  # covers UnicodeDecodeError too
                    continue
            stats[test_type] = _evaluate(model, reader, correction[test_type], trace, max_pred)
            if trace:
                write_ans(testfile, test_type, lang, encoding)
    now = datetime.datetime.now()
    # Write the results in a file (1 file per model / language / context shape).
    modelname = model.__class__.__name__
    with open(
            f"{results}{modelname}_{lang}_{pre_wordscount}c{post_wordscount}_{max_pred}p.txt",
            "w") as result:
        desc = f"# Results for {modelname} in {lang} with {max_pred} predictions and this context: {'_ '*pre_wordscount}<unk/> {'_ '*post_wordscount}\n\n"
        result.write(
            desc + yaml.dump(stats)
            + f"\n{now.day}/{now.month}/{now.year} {now.hour}:{now.minute}:{now.second}"
        )
    print()
def train(self, filename):
    """Read *filename* (utf-8) and feed every N-Gram of its content to the
    model, printing one dot per ~10% of input consumed."""
    with open(filename, "r", encoding="utf-8") as src:
        print(">>> Reading (unknown)")
        data = src.read()
    reader = NMask(data, self.pre_words, self.post_words)
    total = len(data)
    step = int(total / 10)
    next_mark = step
    while reader.e < total:
        if reader.e > next_mark:
            # Cheap progress indicator: one dot per tenth of the data.
            print(".", end="", flush=True)
            next_mark += step
        self.add_ngram(reader)
        reader.next_token()
    # We must add the last N-Gram
    self.add_ngram(reader)
    print()
    print("Done")
def train(self, filename):
    """Train the model on the utf-8 text of *filename*, then compute and
    store its approximate in-memory size in self.sizemo (in Mo)."""
    with open(filename, "r", encoding="utf-8") as handle:
        print(">>> Reading (unknown)")
        data = handle.read()
    reader = NMask(data, self.pre_words, self.post_words)
    length = len(data)
    chunk = int(length / 10)
    progress = chunk
    while reader.e < length:
        if reader.e > progress:
            # One dot per tenth of the input, as a progress bar.
            print(".", end="", flush=True)
            progress += chunk
        self.add_ngram(reader)
        reader.next_token()
    # We must add the last N-Gram
    self.add_ngram(reader)
    print("Done")
    print("Computing the size of the model ... ", end="")
    self.sizemo = self.getsize() / 1000000
    print(f"{self.sizemo} Mo")
def evaluatecrash(model, pre_wordscount, post_wordscount, lang, correction):
    """Run the crash evaluation on the fixed europarl "u05" test file for *lang*."""
    path = f"{test}t-unk-europarl-v7.fi-en-u05.{lang}"
    content, _encoding = fread(path)  # encoding is not needed here
    masked = NMask(content, pre_wordscount, post_wordscount)
    _evaluatecrash(model, masked, correction["u05"])