Esempio n. 1
0
def generate_main(data_dir, extra_flags=None):
    """Run batch generation, then interactive generation, on ``data_dir``.

    Args:
        data_dir: Directory passed as the positional data argument to the
            generation parser. ``checkpoint_last.pt`` inside it is used as
            the ``--path`` value.
        extra_flags: Extra command-line flags appended to the fixed argument
            list. ``None`` selects ``['--print-alignment']``; pass an empty
            list to append nothing.
    """
    if extra_flags is None:
        extra_flags = [
            '--print-alignment',
        ]
    generate_parser = options.get_generation_parser()
    generate_args = options.parse_args_and_arch(
        generate_parser,
        [
            data_dir,
            '--path',
            os.path.join(data_dir, 'checkpoint_last.pt'),
            '--beam',
            '3',
            '--batch-size',
            '64',
            '--max-len-b',
            '5',
            '--gen-subset',
            'valid',
            '--no-progress-bar',
        ] + (extra_flags or []),
    )

    # evaluate model in batch mode
    generate.main(generate_args)

    # evaluate model interactively, feeding one sentence through a fake stdin
    generate_args.buffer_size = 0
    generate_args.input = '-'
    generate_args.max_sentences = None
    orig_stdin = sys.stdin
    sys.stdin = StringIO('h e l l o\n')
    try:
        interactive.main(generate_args)
    finally:
        # Restore the real stdin even when interactive generation raises, so
        # the rest of the process is not left reading the fake stream.
        sys.stdin = orig_stdin
Esempio n. 2
0
def mask_all_heads_combination():
    """Score the model once per masked attention head and print the tables.

    Arguments come from the command line through the generation parser. For
    each attention type, layer and head, ``args.model_overrides`` is set to
    the string form of a dict naming that head, ``main`` is run, and the
    value that ``parse_bleu_scoring`` extracts from the scorer's result
    string is stored as a float. One table per attention type is printed,
    with layers as rows and heads as columns.
    """
    cli_parser = options.get_generation_parser()
    args = options.parse_args_and_arch(cli_parser)
    args.quiet = True

    layer_count = 4
    head_count = 4
    attention_kinds = ['enc-enc', 'enc-dec', 'dec-dec']

    # One (layers x heads) score grid per attention type.
    scores = {}
    for kind in attention_kinds:
        scores[kind] = np.zeros((layer_count, head_count))

    for kind in attention_kinds:
        for layer in range(layer_count):
            for head in range(head_count):
                overrides = {
                    "mask_layer": layer,
                    "mask_head": head,
                    "mask_layer_type": kind
                }
                args.model_overrides = str(overrides)
                scorer = main(args)
                parsed_score = parse_bleu_scoring(scorer.result_string())
                scores[kind][layer][head] = float(parsed_score)

    row_labels = [str(layer) for layer in range(layer_count)]
    column_labels = [str(head) for head in range(head_count)]
    for kind in attention_kinds:
        print("table of score with masking {} attention head".format(kind))
        print("rows are transformer layer number and columns are head number")
        table = pd.DataFrame(data=scores[kind],
                             index=row_labels,
                             columns=column_labels)
        print(table)
Esempio n. 3
0
def generate_main(data_dir, extra_flags=None, path=None):
    """Run batch generation, then interactive generation, on ``data_dir``.

    Args:
        data_dir: Directory passed as the positional data argument to the
            generation parser.
        extra_flags: Extra command-line flags appended to the fixed argument
            list. ``None`` selects ``["--print-alignment"]``; pass an empty
            list to append nothing.
        path: Value for ``--path``. Defaults to ``checkpoint_last.pt`` inside
            ``data_dir``.
    """
    if extra_flags is None:
        extra_flags = [
            "--print-alignment",
        ]
    if path is None:
        path = os.path.join(data_dir, "checkpoint_last.pt")
    generate_parser = options.get_generation_parser()
    generate_args = options.parse_args_and_arch(
        generate_parser,
        [
            data_dir,
            "--path",
            path,
            "--beam",
            "3",
            "--batch-size",
            "64",
            "--max-len-b",
            "5",
            "--gen-subset",
            "valid",
            "--no-progress-bar",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )

    # evaluate model in batch mode
    generate.main(generate_args)

    # evaluate model interactively, feeding one sentence through a fake stdin
    generate_args.buffer_size = 0
    generate_args.input = "-"
    generate_args.batch_size = None
    orig_stdin = sys.stdin
    sys.stdin = StringIO("h e l l o\n")
    try:
        interactive.main(generate_args)
    finally:
        # Restore the real stdin even when interactive generation raises, so
        # the rest of the process is not left reading the fake stream.
        sys.stdin = orig_stdin
def mask_all_heads_combination():
    """Score the model once per masked attention head; print and save tables.

    Arguments come from the command line through the generation parser. For
    each attention type, layer and head, ``args.model_overrides`` is set to
    the string form of a dict naming that head, ``main`` is run, and the
    value that ``parse_bleu_scoring`` extracts from the scorer's result
    string is stored as a float and printed. Afterwards one table per
    attention type is printed and written to a CSV file in the current
    working directory.
    """
    parser = options.get_generation_parser()
    args = options.parse_args_and_arch(parser)
    number_of_transformer_layers = 6
    number_of_attention_heads = 8
    # NOTE(review): the label says "16-heads" while 8 heads are swept above,
    # and "basline" looks like a typo. Both are kept because the label is
    # part of the output file names - confirm before changing.
    experiment = "basline-16-heads-6l-no-changes"
    mask_layer_combinations = ['enc-enc', 'enc-dec', 'dec-dec']
    results_dict = {
        i: np.zeros((number_of_transformer_layers, number_of_attention_heads))
        for i in mask_layer_combinations
    }
    # Append mode creates the log file when it is missing. Nothing is written
    # to it at present (results are only printed); the file is still opened
    # to keep that side effect. ``with`` guarantees the handle is closed even
    # if one of the runs raises.
    with open("mask_all_heads_combination.txt", "a"):
        for i in mask_layer_combinations:
            for j in range(number_of_transformer_layers):
                for k in range(number_of_attention_heads):
                    args.model_overrides = str({
                        "mask_layer": j,
                        "mask_head": k,
                        "mask_layer_type": i
                    })
                    scorer = main(args)
                    results_dict[i][j][k] = float(
                        parse_bleu_scoring(scorer.result_string()))
                    print(
                        "Guy test - > type : {}, layer : {}, head : {}, result : {}"
                        .format(i, j, k, results_dict[i][j][k]))

    for name in mask_layer_combinations:
        print("table of score with masking {} attention head".format(name))
        print("rows are transformer layer number and columns are head number")
        df = pd.DataFrame(
            data=results_dict[name],
            index=[str(j) for j in range(number_of_transformer_layers)],
            columns=[str(k) for k in range(number_of_attention_heads)])
        print(df)
        # NOTE(review): the file name starts with a space; kept byte-for-byte
        # in case downstream tooling expects it - confirm whether intended.
        # index=False means the layer labels are not written to the CSV.
        df.to_csv(r' mask_all_heads_combination_{}_{}.csv'.format(
            experiment, name),
                  index=False)
Esempio n. 5
0
def gen_and_reprocess_nbest(args):
    """Produce (or load) an n-best list and prepare the data for rescoring.

    The function works in the steps announced by its own log messages:

    1. Unless ``args.nbest_list`` is given or the output file already exists,
       run ``generate.main`` and capture its stdout in
       ``<pre_gen>/generate_output_bpe.txt``.
    2. Parse that file with ``rerank_utils.BitextOutputFromGen`` and write
       text files for the rescoring models (optionally re-applying BPE via
       the ``apply_bpe.py`` script when ``args.diff_bpe`` is set).
    3. Binarize those text files with ``preprocess.main``.

    Steps 2 and 3 are skipped when every needed score file already exists or
    the rescoring model is the generation model itself.

    Args:
        args: Parsed rerank options. ``args.score_dict_dir`` is set to
            ``args.data`` in place when it is ``None``.

    Returns:
        The ``rerank_utils.BitextOutputFromGen`` object built from the
        prediction file.

    Raises:
        AssertionError: If an unsupported option combination is requested
            (see the assertion messages below).
    """
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert (args.right_to_left1 is False
                ), "prefix length not compatible with right to left models"
        assert (args.right_to_left2 is False
                ), "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    # A backwards scorer gets the language pair swapped.
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = (os.path.join(os.path.dirname(__file__)) + "/rerank_data/" +
                  args.data_dir_name)
    # exist_ok avoids the check-then-create race when several processes
    # (e.g. one per shard) start at the same time.
    os.makedirs(store_data, exist_ok=True)

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    assert not (args.right_to_left1
                and args.backwards1), "backwards right to left not supported"
    assert not (args.right_to_left2
                and args.backwards2), "backwards right to left not supported"
    assert not (args.prefix_len is not None
                and args.target_prefix_frac is not None
                ), "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    os.makedirs(pre_gen, exist_ok=True)

    # A rescoring model that is also the generation model needs no separate
    # scoring pass, unless a source prefix fraction is in use.
    rerank1_is_gen = (args.gen_model == args.score_model1
                      and args.source_prefix_frac is None)
    rerank2_is_gen = (args.gen_model == args.score_model2
                      and args.source_prefix_frac is None)

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprocessed nbest list for reranking
    os.makedirs(left_to_right_preprocessed_dir, exist_ok=True)
    os.makedirs(right_to_left_preprocessed_dir, exist_ok=True)
    os.makedirs(lm_preprocessed_dir, exist_ok=True)
    os.makedirs(backwards_preprocessed_dir, exist_ok=True)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        # An existing prediction file is reused rather than regenerated.
        if not os.path.isfile(predictions_bpe_file):
            print(
                "STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            # NOTE(review): "--batch-size" is passed twice; with argparse's
            # default "store" action the later value (args.num_rescore) wins.
            # Kept as-is - confirm which value is intended.
            param1 = [
                args.data,
                "--path",
                args.gen_model,
                "--shard-id",
                str(args.shard_id),
                "--num-shards",
                str(args.num_shards),
                "--nbest",
                str(args.num_rescore),
                "--batch-size",
                str(args.batch_size),
                "--beam",
                str(args.num_rescore),
                "--batch-size",
                str(args.num_rescore),
                "--gen-subset",
                args.gen_subset,
                "--source-lang",
                args.source_lang,
                "--target-lang",
                args.target_lang,
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            # Capture everything generate.main prints as the prediction file.
            with open(predictions_bpe_file, "w") as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.post_process,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac,
    )

    if args.diff_bpe:
        # Write the BPE-free text, then re-encode source and hypotheses with
        # the rescoring BPE codes through the external apply_bpe.py script.
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang,
        )
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/source_gen_bpe." + args.source_lang,
            "--output",
            pre_gen + "/rescore_data." + args.source_lang,
        ]
        bpe_tgt_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/target_gen_bpe." + args.target_lang,
            "--output",
            pre_gen + "/rescore_data." + args.target_lang,
        ]

        # The return codes of both calls are not checked.
        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_src_param,
            shell=False,
        )

        subprocess.call(
            [
                "python",
                os.path.join(os.path.dirname(__file__),
                             "subword-nmt/subword_nmt/apply_bpe.py"),
            ] + bpe_tgt_param,
            shell=False,
        )

    # Only prepare rescoring data when at least one model still has to be
    # scored: its score file is missing and it is not the generation model.
    if (not os.path.isfile(score1_file)
            and not rerank1_is_gen) or (args.score_model2 is not None
                                        and not os.path.isfile(score2_file)
                                        and not rerank2_is_gen):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(
                args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = (rescore_file +
                                               "target_prefix_frac" +
                                               str(args.target_prefix_frac))
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = (rescore_file +
                                               "source_prefix_frac" +
                                               str(args.source_prefix_frac))

        # NOTE(review): bw_rescore_file and fw_rescore_file are only bound
        # inside the two nested conditions below (some model is not
        # right-to-left AND diff_bpe is off), yet STEP 3 may read them in
        # other configurations, which would raise NameError - confirm.
        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                )
                # Data for the backwards model: optionally limited to a
                # target prefix (by length or by fraction).
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.source_lang,
                        pre_gen + prefix_len_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        prefix_len=args.prefix_len,
                        bpe_symbol=args.post_process,
                    )
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + target_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        target_prefix_frac=args.target_prefix_frac,
                    )
                else:
                    bw_rescore_file = rescore_file

                # Data for the forward model: optionally limited to a source
                # prefix fraction.
                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.source_lang,
                        pre_gen + source_prefix_frac_rescore_file + "." +
                        args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        source_prefix_frac=args.source_prefix_frac,
                    )
                else:
                    fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.post_process,
            )

        print("STEP 3: binarize the translations")
        # ``and`` binds tighter than ``or``: this reads as
        # (not r2l1) or (model2 set and not r2l2) or (not rerank1_is_gen).
        if (not args.right_to_left1
                or args.score_model2 is not None and not args.right_to_left2
                or not rerank1_is_gen):

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang",
                    scorer1_src,
                    "--target-lang",
                    scorer1_tgt,
                    "--trainpref",
                    pre_gen + bw_rescore_file,
                    "--srcdict",
                    bw_dict + "/dict." + scorer1_src + ".txt",
                    "--tgtdict",
                    bw_dict + "/dict." + scorer1_tgt + ".txt",
                    "--destdir",
                    backwards_preprocessed_dir,
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + fw_rescore_file,
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                left_to_right_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                right_to_left_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
Esempio n. 6
0
def score_bw(args):
    """Score the prepared n-best data with model 1 and, when set, model 2.

    Each model is run through ``generate.main`` with ``--score-reference`` on
    the ``train`` subset of its preprocessed directory, and everything it
    prints is redirected into that model's score file. A model is skipped
    when its score file already exists, or when it is the generation model
    itself and no source prefix fraction is in use.

    Args:
        args: Parsed rerank options (models, languages, direction flags and
            the values used to derive the working directories).
    """

    def _language_pair(backwards):
        # A backwards model gets the language pair swapped.
        if backwards:
            return args.target_lang, args.source_lang
        return args.source_lang, args.target_lang

    src1, tgt1 = _language_pair(args.backwards1)

    has_model2 = args.score_model2 is not None
    if has_model2:
        src2, tgt2 = _language_pair(args.backwards2)

    model1_is_generator = (args.gen_model == args.score_model1
                           and args.source_prefix_frac is None)
    model2_is_generator = (args.gen_model == args.score_model2
                           and args.source_prefix_frac is None)

    directories = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    (
        pre_gen,
        l2r_dir,
        r2l_dir,
        backwards_dir,
        _lm_dir,
    ) = directories

    def _score_file_for(model_name, backwards):
        return rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            model_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=backwards,
        )

    def _data_dir_for(right_to_left, backwards):
        # Right-to-left takes precedence over backwards.
        if right_to_left:
            return r2l_dir
        if backwards:
            return backwards_dir
        return l2r_dir

    # Flags shared by both scoring runs.
    shared_flags = [
        "--batch-size", "128", "--score-reference", "--gen-subset", "train"
    ]

    def _run_scoring(model_path, data_dir, src, tgt, out_path):
        model_flags = [
            "--path", model_path,
            "--source-lang", src,
            "--target-lang", tgt,
        ]
        cli = [data_dir] + shared_flags + model_flags
        parsed = options.parse_args_and_arch(
            options.get_generation_parser(), cli)
        # Everything generate.main prints becomes the score file.
        with open(out_path, "w") as sink, redirect_stdout(sink):
            generate.main(parsed)

    score1_file = _score_file_for(args.model1_name, args.backwards1)
    if has_model2:
        score2_file = _score_file_for(args.model2_name, args.backwards2)

    data_dir1 = _data_dir_for(args.right_to_left1, args.backwards1)

    if not (model1_is_generator or os.path.isfile(score1_file)):
        print("STEP 4: score the translations for model 1")
        _run_scoring(args.score_model1, data_dir1, src1, tgt1, score1_file)

    if has_model2 and not (os.path.isfile(score2_file)
                           or model2_is_generator):
        print("STEP 4: score the translations for model 2")
        data_dir2 = _data_dir_for(args.right_to_left2, args.backwards2)
        _run_scoring(args.score_model2, data_dir2, src2, tgt2, score2_file)
Esempio n. 7
0
def _fairseq_generate(complex_filepath,
                      output_pred_filepath,
                      checkpoint_paths,
                      complex_dictionary_path,
                      simple_dictionary_path,
                      beam=5,
                      hypothesis_num=1,
                      lenpen=1.,
                      diverse_beam_groups=None,
                      diverse_beam_strength=0.5,
                      sampling=False,
                      batch_size=128):
    """Generate predictions for ``complex_filepath`` and write them to a file.

    The input file and both dictionaries are copied into a fresh temporary
    directory laid out as a raw-text ``tmp`` subset, ``generate.main`` is run
    on it with its stdout captured, and the hypotheses are parsed back out of
    that captured output. The temporary directory is removed before
    returning, whether or not generation succeeded.

    Args:
        complex_filepath: Source-side input file. It is also copied as the
            dummy target side.
        output_pred_filepath: File the selected predictions are written to,
            one per input sample, in sample-id order.
        checkpoint_paths: Iterable of checkpoint paths, joined with ``:`` for
            ``--path``.
        complex_dictionary_path: Dictionary copied as ``dict.complex.txt``.
        simple_dictionary_path: Dictionary copied as ``dict.simple.txt``.
        beam: Value for ``--beam``.
        hypothesis_num: Value for ``--nbest``; the hypothesis at position
            ``hypothesis_num - 1`` of each sample is the one written out.
        lenpen: Value for ``--lenpen``.
        diverse_beam_groups: Value for ``--diverse-beam-groups``; ``None`` is
            passed as ``-1``.
        diverse_beam_strength: Value for ``--diverse-beam-strength``.
        sampling: When true, adds ``--sampling --sampling-topk 10``.
        batch_size: Value for ``--batch-size``.

    Raises:
        IndexError: If a sample has fewer than ``hypothesis_num`` parsed
            hypotheses.
    """
    tmp_dir = Path(tempfile.mkdtemp())
    try:
        # Lay out the inputs as the "tmp" subset of a complex->simple pair;
        # the target side is a dummy copy of the source.
        new_complex_filepath = tmp_dir / 'tmp.complex-simple.complex'
        dummy_simple_filepath = tmp_dir / 'tmp.complex-simple.simple'
        shutil.copy(complex_filepath, new_complex_filepath)
        shutil.copy(complex_filepath, dummy_simple_filepath)
        shutil.copy(complex_dictionary_path, tmp_dir / 'dict.complex.txt')
        shutil.copy(simple_dictionary_path, tmp_dir / 'dict.simple.txt')
        generate_parser = options.get_generation_parser()
        args = [
            tmp_dir,
            '--path',
            ':'.join([str(path) for path in checkpoint_paths]),
            '--beam',
            beam,
            '--nbest',
            hypothesis_num,
            '--lenpen',
            lenpen,
            '--diverse-beam-groups',
            diverse_beam_groups if diverse_beam_groups is not None else -1,
            '--diverse-beam-strength',
            diverse_beam_strength,
            '--batch-size',
            batch_size,
            '--raw-text',
            '--print-alignment',
            '--gen-subset',
            'tmp',
            # We don't want to reload pretrained embeddings
            '--model-overrides',
            {
                'encoder_embed_path': None,
                'decoder_embed_path': None
            },
        ]
        if sampling:
            args.extend([
                '--sampling',
                '--sampling-topk',
                10,
            ])
        # The parser expects strings; the dict above is passed as its repr.
        args = [str(arg) for arg in args]
        generate_args = options.parse_args_and_arch(generate_parser, args)
        out_filepath = tmp_dir / 'generation.out'
        with log_stdout(out_filepath, mute_stdout=True):
            # evaluate model in batch mode
            generate.main(generate_args)

        # Retrieve translations
        def parse_all_hypotheses(out_filepath):
            hypotheses_dict = defaultdict(list)
            for line in yield_lines(out_filepath):
                # Hypothesis lines look like "H-<id>\t<score>\t<text>".
                match = re.match(r'^H-(\d+)\t-?\d+\.\d+\t(.*)$', line)
                if match:
                    sample_id, hypothesis = match.groups()
                    hypotheses_dict[int(sample_id)].append(hypothesis)
            # Sort in original order
            return [hypotheses_dict[i] for i in range(len(hypotheses_dict))]

        all_hypotheses = parse_all_hypotheses(out_filepath)
        predictions = [
            hypotheses[hypothesis_num - 1] for hypotheses in all_hypotheses
        ]
        write_lines(predictions, output_pred_filepath)
    finally:
        # Remove the whole temporary directory (inputs, dictionaries and the
        # captured output), including when generation fails part-way.
        shutil.rmtree(tmp_dir)