def pre_process(self):
    # Write the (optionally feature-annotated) parallel data to temp files
    # and run preprocess.py on them.
    save_data = temp_dir()
    train_src, train_tgt = self.train_data
    dev_src, dev_tgt = self.dev_data
    if self.features:
        train_src = list(map(add_features, train_src))
        dev_src = list(map(add_features, dev_src))
    run_param('preprocess.py', {
        "train_src": save_temp(train_src),
        "train_tgt": save_temp(train_tgt),
        "valid_src": save_temp(dev_src),
        "valid_tgt": save_temp(dev_tgt),
        "save_data": save_data + "data",
        "dynamic_dict": None  # This will add a dynamic-dict parameter
    })
    # Archive the pre-processed data directory and return it as raw bytes.
    data_zip = shutil.make_archive(base_name=temp_name(),
                                   format="gztar",
                                   root_dir=save_data)
    with open(data_zip, "rb") as f:
        bin_data = f.read()
    return bin_data
def pre_process(self):
    # Dump plan/delexicalized-text pairs from the readers to temp files
    # and run preprocess.py on them.
    save_data = temp_dir()
    train_src = save_temp([add_features(d.plan) for d in self.train_reader.data])
    train_tgt = save_temp([d.delex for d in self.train_reader.data])
    valid_src = save_temp([add_features(d.plan) for d in self.dev_reader.data])
    valid_tgt = save_temp([d.delex for d in self.dev_reader.data])
    run_param('preprocess.py', {
        "train_src": train_src,
        "train_tgt": train_tgt,
        "valid_src": valid_src,
        "valid_tgt": valid_tgt,
        "save_data": save_data + "data",
        "dynamic_dict": None  # This will add a dynamic-dict parameter
    })
    # Archive the pre-processed data directory and return it as raw bytes.
    data_zip = shutil.make_archive(base_name=temp_name(),
                                   format="gztar",
                                   root_dir=save_data)
    with open(data_zip, "rb") as f:
        bin_data = f.read()
    return bin_data
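# Sketch (not part of the original code) of how the bytes returned by
# pre_process() can be written back to disk and unpacked; the file names
# below are placeholders.
import os
import shutil
import tempfile

def unpack_preprocessed(bin_data):
    # Persist the gzipped tar returned by pre_process() ...
    archive = os.path.join(tempfile.gettempdir(), "preprocessed.tar.gz")
    with open(archive, "wb") as f:
        f.write(bin_data)
    # ... and extract it; the directory then holds the "data*" files
    # produced by preprocess.py.
    out_dir = tempfile.mkdtemp()
    shutil.unpack_archive(archive, out_dir, format="gztar")
    return out_dir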
def BLEU(hyps, refs, single_ref=False, tokenizer=None, hyp_tokenizer=None,
         ref_tokenizer=None, remove_empty=False):
    """
    hyps - list of hypothesis strings
    refs - list of lists of strings; each inner list holds the references
           for the corresponding hypothesis
    """
    if len(hyps) == 0:
        return [0, 0, 0, 0, 0]

    # Add execution permissions
    os.popen("chmod +x " + base + "/multi-bleu.perl").read()

    if single_ref:
        refs = [[r] for r in refs]

    if remove_empty:
        refs = [ref for i, ref in enumerate(refs) if hyps[i] != ""]
        hyps = [hyp for hyp in hyps if hyp != ""]

    # Apply the default tokenizer where a specific one was not given
    if not hyp_tokenizer and tokenizer:
        hyp_tokenizer = tokenizer
    if not ref_tokenizer and tokenizer:
        ref_tokenizer = tokenizer
    if hyp_tokenizer:
        hyps = [" ".join(t) for t in map(hyp_tokenizer, hyps)]
    if ref_tokenizer:
        refs = [[" ".join(t) for t in map(ref_tokenizer, ref)] for ref in refs]

    # Pad every reference list to the same length, then split the references
    # column-wise so each temp file holds one reference per hypothesis
    max_refs = max([len(ref) for ref in refs])
    refs = [ref + [""] * (max_refs - len(ref)) for ref in refs]
    dist_refs = list(zip(*refs))
    ref_path = []
    for ref_group in dist_refs:
        ref_path.append(save_temp(list(map(unicode.lower, map(unicode, ref_group)))))

    hyps = list(map(unicode.lower, map(unicode, hyps)))
    hyp_path = save_temp(hyps)
    if all(map(lambda s: s == "", hyps)):
        return [0, 0, 0, 0, 0]

    cmd = base + "/multi-bleu.perl " + " ".join(ref_path) + " < " + hyp_path
    # print(cmd)
    res = os.popen(cmd).read()
    # print(res)

    search = re.search(
        r" (\d*[\.\d]*?), (\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?) ",
        str(res))
    if search:
        scores = list(map(float, search.groups()))
        return scores
    print(cmd)
    print(search)
    raise Exception(res)
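# Usage sketch for BLEU() above, not part of the original module. Actually
# running it requires multi-bleu.perl under `base` and a `unicode` builtin
# (or a `unicode = str` alias), so the call itself is only shown; the
# example strings and whitespace tokenizer are illustrative.
example_hyps = ["the cat sits on the mat", "a dog barks"]
example_refs = ["the cat sat on the mat", "the dog barked"]
# single_ref=True wraps each reference string into a one-element list;
# the returned list is [BLEU, BLEU-1, BLEU-2, BLEU-3, BLEU-4].
# scores = BLEU(example_hyps, example_refs, single_ref=True,
#               tokenizer=lambda s: s.split())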
def eval(texts, version="2.1"): header = json.load(open(path.join(base, "header.json"))) versions = [v for v in header["versions"] if v["version"] == version] if len(versions) == 0: raise ValueError("Version not found") url = versions[0]["url"] v_file_name = path.join(gettempdir(), header["title"] + header["version"] + ".perl") if not path.isfile(v_file_name): # TODO download file from URL # Add execution permissions os.popen("chmod +x " + v_file_name).read() if len(texts) == 0: return {"BLEU": 0, "BLEU-1": 0, "BLEU-2": 0, "BLEU-3": 0, "BLEU-4": 0} hypothesis = [t["hypothesis"] for t in texts] references = [t["references"] for t in texts] # Pad references max_refs = max([len(ref) for ref in references]) references = [ref + [""] * (max_refs - len(ref)) for ref in references] # Split references to files ref_paths = [ save_temp(list(map(unicode.lower, map(unicode, refs)))) for refs in zip(*references) ] hypothesis = list(map(unicode.lower, map(unicode, hypothesis))) hyp_path = save_temp(hypothesis) cmd = v_file_name + " " + " ".join(ref_paths) + " < " + hyp_path res = os.popen(cmd).read() search = re.search( " (\d*[\.\d]*?), (\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?) ", str(res)) if search: scores = list(map(lambda k: float(k), search.groups())) return { "BLEU": scores[0], "BLEU-1": scores[1], "BLEU-2": scores[2], "BLEU-3": scores[3], "BLEU-4": scores[4] } print(cmd) print(search) raise Exception(res)
def translate(self, plans: List[str], opts=None):
    # Translate an entire reader file using a model
    if not hasattr(self, "features"):  # TODO remove after EMNLP
        self.features = True
    if not hasattr(self, "sentences_cache"):  # TODO remove after EMNLP
        self.sentences_cache = {}
    if not opts:
        opts = {
            "beam_size": BEAM_SIZE,
            "find_best": True
        }

    def featureize(p):
        return add_features(p) if self.features else p

    # Split each plan into sentences; only sentences not already in the
    # cache (source sentence -> best output) are translated
    o_lines = [[featureize(sent.strip()) for sent in plan.split(".")]
               if plan != "" else []
               for plan in plans]
    n_lines = [l for l in list(set(chain.from_iterable(o_lines)))
               if l not in self.sentences_cache]
    if len(n_lines) == 0:
        return []
    print("Translating", len(n_lines), "sentences")

    source_path = save_temp(n_lines)
    target_path = temp_name()
    n_best = opts["beam_size"] if opts["find_best"] else 1
    self.run_traslate(source_path, target_path, {
        "replace_unk": None,
        "beam_size": opts["beam_size"],
        "n_best": n_best,
        "batch_size": 64
    })

    # The output file holds n_best candidates per source sentence
    with open(target_path, "r", encoding="utf-8") as out_lines_f:
        out_lines = chunks(out_lines_f.read().splitlines(), n_best)

    for n, out in zip(n_lines, out_lines):
        self.sentences_cache[n] = find_best_out(n, out)

    return [" ".join([self.sentences_cache[s] for s in lines]) for lines in o_lines]
def translate(self, plans: List[str], opts=None):
    # Translate an entire reader file using a model
    if not opts:
        opts = {"beam_size": BEAM_SIZE, "find_best": True}

    model_path = save_temp_bin(self.model_bin)

    # Split each plan into sentences and de-duplicate them before translation
    o_lines = [[add_features(sent.strip()) for sent in plan.split(".")]
               if plan != "" else []
               for plan in plans]
    n_lines = list(set(chain.from_iterable(o_lines)))
    if len(n_lines) == 0:
        return []

    source_path = save_temp(n_lines)
    target_path = temp_name()
    n_best = opts["beam_size"] if opts["find_best"] else 1
    self.run_traslate(model_path, source_path, target_path, {
        "replace_unk": None,
        "beam_size": opts["beam_size"],
        "n_best": n_best,
        "batch_size": 64
    })

    # The output file holds n_best candidates per source sentence;
    # pick the best candidate for each and re-assemble the plans
    with open(target_path, "r", encoding="utf-8") as out_lines_f:
        out_lines = chunks(out_lines_f.read().splitlines(), n_best)

    map_lines = {n: find_best_out(n, out) for n, out in zip(n_lines, out_lines)}
    return [" ".join([map_lines[s] for s in lines]) for lines in o_lines]
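# Rough usage sketch for translate() above (not from the original code).
# `model` stands for an instance of the surrounding class and the plan
# strings are hypothetical placeholders. Each plan is split on "." into
# sentences, every unique sentence is translated once, and the outputs are
# joined back together per plan.
# plans = [
#     "<linearized plan sentence one> . <linearized plan sentence two>",
#     "",
# ]
# texts = model.translate(plans, opts={"beam_size": 5, "find_best": True})
# texts[1] == ""  # an empty plan yields an empty output string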