Code Example #1
    def pre_process(self):
        save_data = temp_dir()

        train_src, train_tgt = self.train_data
        dev_src, dev_tgt = self.dev_data

        if self.features:
            train_src = list(map(add_features, train_src))
            dev_src = list(map(add_features, dev_src))

        run_param('preprocess.py', {
            "train_src": save_temp(train_src),
            "train_tgt": save_temp(train_tgt),
            "valid_src": save_temp(dev_src),
            "valid_tgt": save_temp(dev_tgt),
            "save_data": save_data + "data",
            "dynamic_dict": None  # This will add a dynamic-dict parameter
        })

        # Package the preprocessed data directory and return it as raw bytes
        data_zip = shutil.make_archive(base_name=temp_name(), format="gztar", root_dir=save_data)

        with open(data_zip, "rb") as f:
            return f.read()
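These snippets lean on small temp-file helpers (save_temp, temp_name, temp_dir) that are not shown here. A minimal sketch of plausible implementations, purely a reconstruction for readability; the project's real helpers may differ:

import os
import tempfile
import uuid


def temp_name():
    # Fresh unique path inside the system temp directory (hypothetical reconstruction)
    return os.path.join(tempfile.gettempdir(), uuid.uuid4().hex)


def temp_dir():
    # Fresh directory, returned with a trailing separator so save_data + "data" forms a valid path
    d = temp_name() + os.sep
    os.makedirs(d, exist_ok=True)
    return d


def save_temp(lines):
    # Write one string per line and return the file path
    file_path = temp_name()
    with open(file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return file_path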
Code Example #2
File: open_nmt.py Project: dendisuhubdy/chimera
    def pre_process(self):
        save_data = temp_dir()

        # Source side: plans with added features; target side: delexicalized text
        train_src = save_temp(
            [add_features(d.plan) for d in self.train_reader.data])
        train_tgt = save_temp([d.delex for d in self.train_reader.data])
        valid_src = save_temp(
            [add_features(d.plan) for d in self.dev_reader.data])
        valid_tgt = save_temp([d.delex for d in self.dev_reader.data])

        run_param(
            'preprocess.py',
            {
                "train_src": train_src,
                "train_tgt": train_tgt,
                "valid_src": valid_src,
                "valid_tgt": valid_tgt,
                "save_data": save_data + "data",
                "dynamic_dict": None  # This will add a dynamic-dict parameter
            })

        data_zip = shutil.make_archive(base_name=temp_name(),
                                       format="gztar",
                                       root_dir=save_data)

        with open(data_zip, "rb") as f:
            return f.read()
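Both pre_process variants ship the preprocessing output back as one gzipped tarball of bytes, so the binary data can be stored or passed around. A self-contained sketch of that pack-and-read pattern (the file name below is just a stand-in for whatever preprocess.py writes):

import os
import shutil
import tempfile

src_dir = tempfile.mkdtemp()
with open(os.path.join(src_dir, "data.vocab.pt"), "w") as f:  # stand-in output file
    f.write("dummy contents")

# make_archive returns the path of the archive it created (here a .tar.gz)
archive = shutil.make_archive(base_name=os.path.join(tempfile.gettempdir(), "example_data"),
                              format="gztar", root_dir=src_dir)

with open(archive, "rb") as f:
    bin_data = f.read()
print(archive, len(bin_data))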
Code Example #3
def BLEU(hyps, refs, single_ref=False, tokenizer=None, hyp_tokenizer=None, ref_tokenizer=None, remove_empty=False):
    """
    hyps - array of strings
    refs - array of arrays containing strings. each array correlates to a single hypothesis
    """

    if len(hyps) == 0:
        return [0, 0, 0, 0, 0]

    # Add execution permissions
    os.popen("chmod +x " + base + "/multi-bleu.perl").read()

    if single_ref:
        refs = [[r] for r in refs]

    if remove_empty:
        refs = [ref for i, ref in enumerate(refs) if hyps[i] != ""]
        hyps = [hyp for hyp in hyps if hyp != ""]

    # Apply default tokenizer
    if not hyp_tokenizer and tokenizer:
        hyp_tokenizer = tokenizer
    if not ref_tokenizer and tokenizer:
        ref_tokenizer = tokenizer

    if hyp_tokenizer:
        hyps = [" ".join(t) for t in map(hyp_tokenizer, hyps)]

    if ref_tokenizer:
        refs = [[" ".join(t) for t in map(ref_tokenizer, ref)] for ref in refs]

    # Pad reference lists so every hypothesis has the same number of references
    max_refs = max(len(ref) for ref in refs)
    refs = [ref + [""] * (max_refs - len(ref)) for ref in refs]

    # Transpose: one list per reference index (multi-bleu.perl takes one file per reference set)
    dist_refs = list(zip(*refs))

    ref_path = [save_temp([str(r).lower() for r in ref_group]) for ref_group in dist_refs]

    hyps = [str(h).lower() for h in hyps]
    hyp_path = save_temp(hyps)

    if all(h == "" for h in hyps):
        return [0, 0, 0, 0, 0]

    cmd = base + "/multi-bleu.perl " + " ".join(ref_path) + " < " + hyp_path
    res = os.popen(cmd).read()

    # multi-bleu.perl prints e.g. "BLEU = 34.12, 65.2/40.1/28.3/20.4 (BP=...)"
    search = re.search(r" (\d*[\.\d]*?), (\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?) ", str(res))
    if search:
        return [float(k) for k in search.groups()]

    print(cmd)
    print(search)
    raise Exception(res)
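The line parsed by the regex is multi-bleu.perl's one-line summary. A standalone check of just the parsing step against a sample line (the score values are invented for illustration):

import re

res = "BLEU = 34.12, 65.2/40.1/28.3/20.4 (BP=1.000, ratio=0.987, hyp_len=1024, ref_len=1037)"

search = re.search(r" (\d*[\.\d]*?), (\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?) ", res)
print([float(k) for k in search.groups()])  # [34.12, 65.2, 40.1, 28.3, 20.4]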
Code Example #4
File: tool.py Project: AmitMY/meta-scholar
def eval(texts, version="2.1"):
    with open(path.join(base, "header.json")) as f:
        header = json.load(f)
    versions = [v for v in header["versions"] if v["version"] == version]
    if len(versions) == 0:
        raise ValueError("Version not found")

    url = versions[0]["url"]

    v_file_name = path.join(gettempdir(),
                            header["title"] + header["version"] + ".perl")
    if not path.isfile(v_file_name):
        # Download the script, then add execution permissions
        # (assumes Python 3's urllib; the original left the download as a TODO)
        from urllib.request import urlretrieve
        urlretrieve(url, v_file_name)
        os.popen("chmod +x " + v_file_name).read()

    if len(texts) == 0:
        return {"BLEU": 0, "BLEU-1": 0, "BLEU-2": 0, "BLEU-3": 0, "BLEU-4": 0}

    hypothesis = [t["hypothesis"] for t in texts]
    references = [t["references"] for t in texts]

    # Pad references so every hypothesis has the same number of them
    max_refs = max(len(ref) for ref in references)
    references = [ref + [""] * (max_refs - len(ref)) for ref in references]

    # Split references into one temp file per reference index
    ref_paths = [
        save_temp([str(r).lower() for r in refs])
        for refs in zip(*references)
    ]

    hypothesis = [str(h).lower() for h in hypothesis]
    hyp_path = save_temp(hypothesis)

    cmd = v_file_name + " " + " ".join(ref_paths) + " < " + hyp_path
    res = os.popen(cmd).read()

    # multi-bleu.perl prints e.g. "BLEU = 34.12, 65.2/40.1/28.3/20.4 (BP=...)"
    search = re.search(
        r" (\d*[\.\d]*?), (\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?)\/(\d*[\.\d]*?) ",
        str(res))
    if search:
        scores = [float(k) for k in search.groups()]
        return {
            "BLEU": scores[0],
            "BLEU-1": scores[1],
            "BLEU-2": scores[2],
            "BLEU-3": scores[3],
            "BLEU-4": scores[4]
        }

    print(cmd)
    print(search)
    raise Exception(res)
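The pad-then-zip(*...) step that both BLEU and eval share converts per-hypothesis reference lists into per-position reference sets, one temp file each, which is the input layout multi-bleu.perl expects. A toy illustration:

references = [["ref a1", "ref a2"], ["ref b1"]]  # second hypothesis has only one reference

max_refs = max(len(ref) for ref in references)
padded = [ref + [""] * (max_refs - len(ref)) for ref in references]

# Transpose: entry i holds the i-th reference of every hypothesis
print(list(zip(*padded)))  # [('ref a1', 'ref b1'), ('ref a2', '')]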
Code Example #5
    def translate(self, plans: List[str], opts=None):  # Translate entire reader file using a model
        if not hasattr(self, "features"):  # TODO remove after EMNLP
            self.features = True
        if not hasattr(self, "sentences_cache"):  # TODO remove after EMNLP
            self.sentences_cache = {}

        if not opts:
            opts = {
                "beam_size": BEAM_SIZE,
                "find_best": True
        }

        def featurize(p):
            return add_features(p) if self.features else p

        # Split each plan into sentences, featurize them, and keep only uncached ones
        o_lines = [[featurize(sent.strip()) for sent in plan.split(".")] if plan != "" else []
                   for plan in plans]
        n_lines = [l for l in set(chain.from_iterable(o_lines)) if l not in self.sentences_cache]

        if len(n_lines) == 0:
            return []

        print("Translating", len(n_lines), "sentences")

        source_path = save_temp(n_lines)
        target_path = temp_name()

        n_best = opts["beam_size"] if opts["find_best"] else 1

        self.run_traslate(source_path, target_path, {
            "replace_unk": None,
            "beam_size": opts["beam_size"],
            "n_best": n_best,
            "batch_size": 64
        })

        with open(target_path, "r", encoding="utf-8") as out_lines_f:
            out_lines = chunks(out_lines_f.read().splitlines(), n_best)

        for n, out in zip(n_lines, out_lines):
            self.sentences_cache[n] = find_best_out(n, out)

        return [" ".join([self.sentences_cache[s] for s in lines]) for lines in o_lines]
Code Example #6
File: open_nmt.py Project: dendisuhubdy/chimera
    def translate(self,
                  plans: List[str],
                  opts=None):  # Translate entire reader file using a model
        if not opts:
            opts = {"beam_size": BEAM_SIZE, "find_best": True}

        model_path = save_temp_bin(self.model_bin)

        o_lines = [
            [add_features(sent.strip()) for sent in plan.split(".")] if plan != "" else []
            for plan in plans
        ]
        n_lines = list(set(chain.from_iterable(o_lines)))

        if len(n_lines) == 0:
            return []

        source_path = save_temp(n_lines)
        target_path = temp_name()

        n_best = opts["beam_size"] if opts["find_best"] else 1

        self.run_traslate(
            model_path, source_path, target_path, {
                "replace_unk": None,
                "beam_size": opts["beam_size"],
                "n_best": n_best,
                "batch_size": 64
            })

        with open(target_path, "r", encoding="utf-8") as out_lines_f:
            out_lines = chunks(out_lines_f.read().splitlines(), n_best)

        map_lines = {
            n: find_best_out(n, out)
            for n, out in zip(n_lines, out_lines)
        }

        return [" ".join([map_lines[s] for s in lines]) for lines in o_lines]