Example #1
def load_lang_ls(root, lang_ls, postfix="-ud-test-sent_segmented.txt"):
    data_lang = []
    y = []
    for i_lang, lang in enumerate(lang_ls):
        path = root + f"/{lang}{postfix}"
        n_before = len(data_lang)
        data_lang = load_data(path, data_lang)
        # label only the sentences just added for this language
        # (data_lang accumulates across languages, so use the length delta)
        y.extend([i_lang for _ in range(len(data_lang) - n_before)])
    return np.array(data_lang), np.array(y)
def reshape_y(z, n_seq):
    """Repeat each element of z n_seq times."""
    new_z = []
    for _z in z:
        new_z.extend([_z for _ in range(n_seq)])
    return np.array(new_z)
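A quick usage sketch (assuming numpy is imported as np, as in the snippet):

import numpy as np

y = np.array([0, 1, 2])
print(reshape_y(y, n_seq=2))  # -> [0 0 1 1 2 2]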
Example #3
def get_mask_input(input_tokens_tensor, use_gpu, pad):
    """Build an attention mask: 0 at pad positions, 1 everywhere else."""
    new_input = np.array(input_tokens_tensor.cpu())
    _input_mask = [[
        0 if new_input[ind_sent][ind_tok] == pad else 1
        for ind_tok in range(len(new_input[ind_sent]))
    ] for ind_sent in range(len(new_input))]
    input_mask = torch.Tensor(_input_mask).long()
    if use_gpu:
        input_mask = input_mask.cuda()

    return input_mask
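A minimal usage sketch (torch assumed imported; pad id 0 chosen purely for illustration):

import torch

tokens = torch.tensor([[101, 7592, 102, 0, 0],
                       [101, 2088, 999, 102, 0]])
print(get_mask_input(tokens, use_gpu=False, pad=0))
# -> tensor([[1, 1, 1, 0, 0],
#            [1, 1, 1, 1, 0]])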
def reshape_x(z):
    """Flatten the first two dimensions (batch, seq) of a tensor into one."""
    return np.array(z.view(z.size(0) * z.size(1), -1))
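For example (shapes illustrative):

z = torch.zeros(2, 3, 4)
print(reshape_x(z).shape)  # -> (6, 4)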
Example #5
def get_bpe_label_word_level_task(labels,
                                  batch,
                                  input_tokens_tensor,
                                  input_alignement_with_raw,
                                  use_gpu,
                                  label_name,
                                  pad,
                                  graph_labels=False):

    if labels is not None:
        output_tokens_tensor = np.array(labels.cpu())
    else:
        output_tokens_tensor = None
    new_input = np.array(input_tokens_tensor.cpu())
    len_max = max([len(sent) for sent in new_input])
    new_input = [[inp
                  for inp in sent] + [pad for _ in range(len_max - len(sent))]
                 for sent in new_input]
    # mask continuation BPE tokens of split words (the first BPE token of each word is kept), as well as pad positions
    _input_mask = [[
        0 if new_input[ind_sent][ind_tok] == pad
        or input_alignement_with_raw[ind_sent][ind_tok - 1]
        == input_alignement_with_raw[ind_sent][ind_tok] else 1
        for ind_tok in range(len(new_input[ind_sent]))
    ] for ind_sent in range(len(new_input))]
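    # e.g. alignment [0, 1, 1, 2, 1000] (word 1 split into two BPEs, last
    # position padded) yields the mask [1, 1, 0, 1, 0]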
    cumulate_shift = None
    if graph_labels:
        # for each sentence and each BPE token: count the number of extra (non-first) BPE tokens that occur before it
        def get_cumulated_non_first_bpe_counter(sent):
            counter = 0
            new_sent = []
            counter_former = 0
            cumulated = 0
            for ind, token in enumerate(sent):
                if ind + 1 < len(sent) and token == sent[ind + 1] and token != 1000:
                    counter += 1
                elif token != 1000:
                    new_sent.append(counter_former + cumulated)
                    cumulated += counter_former
                    counter_former = counter
                    counter = 0
            return new_sent

        # expected behavior (kept as a commented sanity check):
        # assert [0, 0, 0, 1, 1, 1, 3, 3, 3, 5, 5, 5] == get_cumulated_non_first_bpe_counter([0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 8, 8, 9, 10, 11, 1000])
        # assert [0, 0, 0, 1, 1, 1, 3, 3, 3, 5, 5, 5] == get_cumulated_non_first_bpe_counter([0, 1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 8, 8, 8, 9, 10, 11])

        cumulate_shift = [
            get_cumulated_non_first_bpe_counter(
                input_alignement_with_raw[ind_sent])
            for ind_sent in range(len(input_alignement_with_raw))
        ]

    output_tokens_tensor_new = []
    for ind_sent in range(len(_input_mask)):
        output_tokens_tensor_new_ls = []
        shift = 0
        for ind_tok in range(len(_input_mask[ind_sent])):
            mask = _input_mask[ind_sent][ind_tok]

            if labels is not None:
                try:
                    label = output_tokens_tensor[ind_sent, ind_tok - shift]
                    if graph_labels:
                        # CLS is prepended to every sentence, so the labels must be shifted accordingly;
                        # CLS and SEP point to the first token (indexed -1), so they become 1 after the shift
                        if label not in [
                                ROOT_HEADS_INDEX, END_HEADS_INDEX
                        ] and cumulate_shift[ind_sent][label] > 0:
                            label += cumulate_shift[ind_sent][label]
                        label += CLS_ADJUST
                except Exception as e:
                    try:
                        assert input_alignement_with_raw[ind_sent][
                            ind_tok] == 1000, "ERROR: we should have reached the end of the labels as well"
                        label = LABEL_PARAMETER[label_name]["pad_value"]
                    except Exception as f:
                        print(
                            "ERROR (get_bpe_labels): we reached the end of output labels but input is not done!",
                            f)
                        print(
                            "ERROR ind_sent:{} ind_tok:{} shift:{} output_tokens_tensor:{} alignement:{} - {}"
                            .format(ind_sent, ind_tok, shift,
                                    output_tokens_tensor,
                                    input_alignement_with_raw[ind_sent], e))
                        print("ERROR raw input/output ", batch.raw_input,
                              batch.raw_output)
                        raise e

            if mask == 0 and labels is not None:
                # per-task pad value (e.g. 1 for _PAD_POS, 0 for PAD_ID_HEADS);
                # renamed to avoid shadowing the `pad` argument
                pad_label = LABEL_PARAMETER[label_name]["pad_value"]
                output_tokens_tensor_new_ls.append(pad_label)
                shift += 1
            elif labels is not None:
                output_tokens_tensor_new_ls.append(label)
        output_tokens_tensor_new.append(output_tokens_tensor_new_ls)

    def sanity_test_parsing_label(labels, output_tokens_tensor_new,
                                  input_alignement_with_raw, cumulate_shift):
        for sent in range(labels.size(0)):
            ind_max = len(cumulate_shift[sent]) - 1
            for _ in range(5):
                ind = np.random.choice(range(ind_max))
                # the new label must equal the old one at the corresponding position, plus 1 (for CLS) plus the number of non-first BPE tokens before it (in the original label indexing)
                if output_tokens_tensor_new[sent][ind] not in [
                        ROOT_HEADS_INDEX + 1, END_HEADS_INDEX,
                        PAD_ID_LOSS_STANDART
                ]:
                    try:
                        assert output_tokens_tensor_new[sent][ind] == labels[sent, int(input_alignement_with_raw[sent][ind])]+CLS_ADJUST+cumulate_shift[sent][labels[sent, int(input_alignement_with_raw[sent][ind])]], \
                        "ERROR sent {} ind word {} " \
                        "new {} and old {} cumulated {} ".format(sent, ind, output_tokens_tensor_new[sent][ind],
                                                            labels[sent, input_alignement_with_raw[sent][ind]], cumulate_shift[sent][ind])
                    except AssertionError as e:
                        print(e)
                        pdb.set_trace()
                    #print("TEST passed for sent {} word {}".format(sent, ind))

    if graph_labels and labels is not None:
        sanity_test_parsing_label(labels, output_tokens_tensor_new,
                                  input_alignement_with_raw, cumulate_shift)
    if labels is not None:
        output_tokens_tensor = torch.Tensor(output_tokens_tensor_new).long()
        head_mask = torch.Tensor(_input_mask).long()
    input_tokens_tensor = torch.Tensor(new_input).long()
    if use_gpu:
        if labels is not None:
            head_mask = head_mask.cuda()
            output_tokens_tensor = output_tokens_tensor.cuda()
        input_tokens_tensor = input_tokens_tensor.cuda()
    return output_tokens_tensor, head_mask, input_tokens_tensor, cumulate_shift
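For intuition, here is a minimal self-contained sketch of the core alignment idea above (expanding word-level labels to BPE level and padding continuation positions); the function name and pad value are illustrative, not part of the repo's API:

def expand_labels_to_bpe(word_labels, alignement, pad_value=-1):
    # word_labels: one label per word; alignement: word index of each BPE token
    bpe_labels = []
    prev = None
    for word_ind in alignement:
        # the first BPE of each word gets the word's label, continuations get pad
        bpe_labels.append(word_labels[word_ind] if word_ind != prev else pad_value)
        prev = word_ind
    return bpe_labels

# expand_labels_to_bpe([7, 8, 9], [0, 1, 1, 2]) -> [7, 8, -1, 9]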
def main(args, dict_path, model_dir):

    model, tokenizer, run_id = load_all_analysis(args, dict_path, model_dir)

    if args.compare_to_pretrained:
        print("Loading Pretrained model also for comparison with pretrained")
        args_origin = args_attention_analysis()
        args_origin = args_preprocess_attention_analysis(args_origin)
        args_origin.init_args_dir = None
        args_origin, dict_path_0, model_dir_0 = get_dirs(args_origin)
        args_origin.model_id_pref += "again"
        model_origin, tokenizer_0, _ = load_all_analysis(
            args_origin, dict_path_0, model_dir_0)
        model_origin.eval()
        print("seco,")
    # only allow output of the model to be hidden states here
    print("Checkpoint loaded")
    assert not args.output_attentions
    assert args.output_all_encoded_layers and args.output_hidden_states_per_head

    data = ["I am here", "How are you"]
    model.eval()
    n_obs = args.n_sent
    max_len = args.max_seq_len
    lang_ls = args.raw_text_code

    lang = [
        "fr_pud", "de_pud", "ru_pud", "tr_pud", "id_pud", "ar_pud", "pt_pud",
        "es_pud", "fi_pud", "it_pud", "sv_pud", "cs_pud", "pl_pud", "hi_pud",
        "zh_pud", "ko_pud", "ja_pud", "th_pud"
    ]
    #lang = ["fr_pud", "fr_gsd"]
    src_lang_ls = ["en_pud"]  #, "fr_pud", "ru_pud", "ar_pud"]
    print("Loading data...")
    data_target_ls = [
        load_data(DATA_UD + f"/{target}-ud-test.conllu",
                  line_filter="# text = ") for target in lang
    ]
    data_target_dic = OrderedDict(zip(lang, data_target_ls))
    #pdb.set_trace()
    src = src_lang_ls[0]  #"en_pud"

    data_en = data_target_ls[
        0]  #load_data(DATA_UD+f"/{src}-ud-test.conllu", line_filter="# text = ")

    for _data_target in data_target_dic:
        try:
            assert len(data_target_dic[_data_target]) == len(
                data_en
            ), f"Should have as many sentences on both sides en:{len(data_en)} target:{len(data_target_dic[_data_target])}"
        except AssertionError:
            data_en = data_en[:len(data_target_dic[_data_target])]
            print(f"Cutting {src} dataset based on target")
        assert len(data_target_dic[_data_target]) == len(
            data_en
        ), f"Should have as many sentences on both sides en:{len(data_en)} target:{len(data_target_dic[_data_target])}"
    #reg = linear_model.LogisticRegression()
    # just to get the keys
    layer_all = get_hidden_representation(data,
                                          model,
                                          tokenizer,
                                          max_len=max_len)
    # removed hidden_per_layer
    #pdb.set_trace()
    assert len(layer_all) == 1
    #assert len(layer_all) == 2, "ERROR should only have hidden_per_layer and hidden_per_head_layer"

    report_ls = []
    accuracy_dic = OrderedDict()
    sampling = args.sampling
    metric = args.similarity_metric
    if metric == "cka":
        pad_below_max_len, output_dic = False, True
    else:
        pad_below_max_len, output_dic = False, True
    assert metric in ["cos", "cka"]
    if metric == "cos":
        batch_size = 1
    else:
        batch_size = len(data_en) // 4

    task_tuned = "No"

    if args.init_args_dir is None:
        #args.init_args_dir =
        id_model = f"{args.bert_model}-init-{args.random_init}"

        hyperparameters = OrderedDict([
            ("bert_model", args.bert_model),
            ("random_init", args.random_init),
            ("not_load_params_ls", args.not_load_params_ls),
            ("dict_path", dict_path),
            ("model_id", id_model),
        ])
        info_checkpoint = OrderedDict([("epochs", 0),
                                       ("batch_size", batch_size),
                                       ("train_path", 0), ("dev_path", 0),
                                       ("num_labels_per_task", 0)])

        args.init_args_dir = write_args(os.environ.get("MT_NORM_PARSE", "./"),
                                        model_id=id_model,
                                        info_checkpoint=info_checkpoint,
                                        hyperparameters=hyperparameters,
                                        verbose=1)
        print("args_dir checkout ", args.init_args_dir)
        model_full_name_val = task_tuned + "-" + id_model
    else:
        with open(args.init_args_dir, 'r') as f_args:
            argument = json.load(f_args)
        task_tuned = argument["hyperparameters"]["tasks"][0][
            0] if "wiki" not in argument["info_checkpoint"][
                "train_path"] else "ner"
        model_full_name_val = task_tuned + "-" + args.init_args_dir.split(
            "/")[-1]

    if args.analysis_mode == "layer":
        studied_ind = 0
    elif args.analysis_mode == "layer_head":
        studied_ind = 1
    else:
        raise Exception(
            f"args.analysis_mode: {args.analysis_mode} not supported")
    layer_analysed = layer_all[studied_ind]

    #for ind, layer_head in enumerate(list(layer_analysed.keys())):
    report = OrderedDict()
    accuracy_ls = []
    src_lang = src

    cosine_sent_to_src = OrderedDict([(src_lang + "-" + lang, OrderedDict())
                                      for src_lang in src_lang_ls
                                      for lang in data_target_dic.keys()])
    cosine_sent_to_origin = OrderedDict([(lang, OrderedDict())
                                         for lang in data_target_dic.keys()])
    cosine_sent_to_origin_src = OrderedDict([(lang, OrderedDict())
                                             for lang in src_lang_ls])
    cosine_sent_to_former_layer_src = OrderedDict([(lang, OrderedDict())
                                                   for lang in src_lang_ls])
    cosine_sent_to_former_layer = OrderedDict([
        (lang, OrderedDict()) for lang in data_target_dic.keys()
    ])
    cosine_sent_to_first_layer = OrderedDict([
        (lang, OrderedDict()) for lang in data_target_dic.keys()
    ])
    #layer_head = list(layer_analysed.keys())[len(list(layer_analysed.keys())) - ind -1]

    cosinus = nn.CosineSimilarity(dim=1)
    info_model = f" task {args.tasks} args {'/'.join(args.init_args_dir.split('/')[-2:]) if args.init_args_dir is not None else None} bert {args.bert_model} random init {args.random_init} "
    #"cka"
    output_dic = True
    pad_below_max_len = False
    max_len = 200

    n_batch = len(data_en) // batch_size

    for i_data in range(n_batch):

        for src_lang in src_lang_ls:
            print(f"Starting src", {src_lang})
            data_en = load_data(DATA_UD + f"/{src_lang}-ud-test.conllu",
                                line_filter="# text = ")
            en_batch = data_en[i_data * batch_size:(i_data + 1) * batch_size]
            all = get_hidden_representation(
                en_batch,
                model,
                tokenizer,
                pad_below_max_len=pad_below_max_len,
                max_len=max_len,
                output_dic=output_dic)
            analysed_batch_dic_en = all[studied_ind]
            i_lang = 0

            if args.compare_to_pretrained:
                all_origin = get_hidden_representation(
                    en_batch,
                    model_origin,
                    tokenizer_0,
                    pad_below_max_len=pad_below_max_len,
                    max_len=max_len,
                    output_dic=output_dic)
                analysed_batch_dic_src_origin = all_origin[studied_ind]

            for lang, target in data_target_dic.items():
                print(f"Starting target", {lang})
                i_lang += 1
                target_batch = target[i_data * batch_size:(i_data + 1) * batch_size]

                all = get_hidden_representation(
                    target_batch,
                    model,
                    tokenizer,
                    pad_below_max_len=pad_below_max_len,
                    max_len=max_len,
                    output_dic=output_dic)

                if args.compare_to_pretrained:
                    all_origin = get_hidden_representation(
                        target_batch,
                        model_origin,
                        tokenizer_0,
                        pad_below_max_len=pad_below_max_len,
                        max_len=max_len,
                        output_dic=output_dic)
                    analysed_batch_dic_target_origin = all_origin[studied_ind]

                analysed_batch_dic_target = all[studied_ind]

                former_layer, former_mean_target, former_mean_src = None, None, None
                for layer in analysed_batch_dic_target:
                    print(f"Starting layer", {layer})
                    # average over each sentence, excluding the first and last special tokens
                    if output_dic:
                        mean_over_sent_src = []
                        mean_over_sent_target = []
                        mean_over_sent_target_origin = []
                        mean_over_sent_src_origin = []
                        for i_sent in range(len(analysed_batch_dic_en[layer])):
                            # remove the first and last special tokens (CLS/SEP) before averaging
                            mean_over_sent_src.append(
                                np.array(analysed_batch_dic_en[layer][i_sent][
                                    0, 1:-1, :].mean(dim=0).cpu()))
                            mean_over_sent_target.append(
                                np.array(analysed_batch_dic_target[layer]
                                         [i_sent][0,
                                                  1:-1, :].mean(dim=0).cpu()))

                            if args.compare_to_pretrained:
                                mean_over_sent_target_origin.append(
                                    np.array(
                                        analysed_batch_dic_target_origin[layer]
                                        [i_sent][0,
                                                 1:-1, :].mean(dim=0).cpu()))
                                if i_lang == 1:
                                    mean_over_sent_src_origin.append(
                                        np.array(analysed_batch_dic_src_origin[
                                            layer][i_sent][0, 1:-1, :].mean(
                                                dim=0).cpu()))

                        if args.compare_to_pretrained:
                            mean_over_sent_target_origin = np.array(
                                mean_over_sent_target_origin)
                            if i_lang == 1:
                                mean_over_sent_src_origin = np.array(
                                    mean_over_sent_src_origin)
                        mean_over_sent_src = np.array(mean_over_sent_src)
                        mean_over_sent_target = np.array(mean_over_sent_target)

                    else:
                        mean_over_sent_src = analysed_batch_dic_en[
                            layer][:, 1:-1, :].mean(dim=1).cpu()
                        mean_over_sent_target = analysed_batch_dic_target[
                            layer][:, 1:-1, :].mean(dim=1).cpu()

                    if layer not in cosine_sent_to_src[src_lang + "-" + lang]:
                        cosine_sent_to_src[src_lang + "-" + lang][layer] = []
                    if layer not in cosine_sent_to_origin[lang]:
                        cosine_sent_to_origin[lang][layer] = []
                    if layer not in cosine_sent_to_origin_src[src_lang]:
                        cosine_sent_to_origin_src[src_lang][layer] = []

                    if metric == "cka":
                        mean_over_sent_src = np.array(mean_over_sent_src)
                        mean_over_sent_target = np.array(mean_over_sent_target)

                        cosine_sent_to_src[src_lang + "-" +
                                           lang][layer].append(
                                               kernel_CKA(
                                                   mean_over_sent_src,
                                                   mean_over_sent_target))
                        if args.compare_to_pretrained:
                            cosine_sent_to_origin[lang][layer].append(
                                kernel_CKA(mean_over_sent_target,
                                           mean_over_sent_target_origin))
                            if i_lang == 1:
                                cosine_sent_to_origin_src[src_lang][
                                    layer].append(
                                        kernel_CKA(mean_over_sent_src_origin,
                                                   mean_over_sent_src))
                                print(
                                    f"Measured EN TO ORIGIN {metric} {layer} {cosine_sent_to_origin_src[src_lang][layer][-1]} "
                                    + info_model)
                            print(
                                f"Measured LANG {lang} TO ORIGIN {metric} {layer} {cosine_sent_to_origin[lang][layer][-1]} "
                                + info_model)

                        print(
                            f"Measured {metric} {layer} {kernel_CKA(mean_over_sent_src,mean_over_sent_target)} "
                            + info_model)
                    else:
                        cosine_sent_to_src[
                            src_lang + "-" + lang][layer].append(
                                cosinus(mean_over_sent_src,
                                        mean_over_sent_target).item())

                    if former_layer is not None:
                        if layer not in cosine_sent_to_former_layer[lang]:
                            cosine_sent_to_former_layer[lang][layer] = []
                        if layer not in cosine_sent_to_former_layer_src[
                                src_lang]:
                            cosine_sent_to_former_layer_src[src_lang][
                                layer] = []
                        if metric == "cka":
                            cosine_sent_to_former_layer[lang][layer].append(
                                kernel_CKA(former_mean_target,
                                           mean_over_sent_target))
                            if i_lang == 1:
                                cosine_sent_to_former_layer_src[src_lang][
                                    layer].append(
                                        kernel_CKA(former_mean_src,
                                                   mean_over_sent_src))
                        else:
                            cosine_sent_to_former_layer[lang][layer].append(
                                cosinus(former_mean_target,
                                        mean_over_sent_target).item())
                            if i_lang == 1:
                                cosine_sent_to_former_layer_src[src_lang][
                                    layer].append(
                                        cosinus(former_mean_src,
                                                mean_over_sent_src).item())

                    former_layer = layer
                    former_mean_target = mean_over_sent_target
                    former_mean_src = mean_over_sent_src

    # summary
    print_all = True
    lang_i = 0
    src_lang_i = 0
    #for lang, cosine_per_layer in cosine_sent_to_src.items():
    for lang, cosine_per_layer in cosine_sent_to_former_layer.items():
        layer_i = 0
        src_lang_i += 1
        for src_lang in src_lang_ls:
            lang_i += 1
            for layer, cosine_ls in cosine_per_layer.items():
                print(
                    f"Mean {metric} between {src_lang} and {lang} for {layer} is {np.mean(cosine_sent_to_src[src_lang+'-'+lang][layer])} std:{np.std(cosine_sent_to_src[src_lang+'-'+lang][layer])} measured on {len(cosine_sent_to_src[src_lang+'-'+lang][layer])} model  "
                    + info_model)
                if layer_i > 0 and print_all:

                    print(
                        f"Mean {metric} for {lang} between {layer} and the former layer is {np.mean(cosine_sent_to_former_layer[lang][layer])} std:{np.std(cosine_sent_to_former_layer[lang][layer])} measured on {len(cosine_sent_to_former_layer[lang][layer])} batches "
                        + info_model)

                    report = report_template(
                        metric_val=metric,
                        subsample=lang + "_to_former_layer",
                        info_score_val=None,
                        score_val=np.mean(
                            cosine_sent_to_former_layer[lang][layer]),
                        n_sents=n_obs,
                        avg_per_sent=np.std(
                            cosine_sent_to_former_layer[lang][layer]),
                        n_tokens_score=n_obs * max_len,
                        model_full_name_val=model_full_name_val,
                        task="hidden_state_analysis",
                        evaluation_script_val="exact_match",
                        model_args_dir=args.init_args_dir,
                        token_type="word",
                        report_path_val=None,
                        data_val=layer,
                    )
                    report_ls.append(report)

                    if lang_i == 1:
                        print(
                            f"Mean {metric} for {src_lang} between {layer} and the former layer is {np.mean(cosine_sent_to_former_layer_src[src_lang][layer])} std:{np.std(cosine_sent_to_former_layer_src[src_lang][layer])} measured on {len(cosine_sent_to_former_layer_src[src_lang][layer])} batches "
                            + info_model)

                        report = report_template(
                            metric_val=metric,
                            subsample=src_lang + "_to_former_layer",
                            info_score_val=None,
                            score_val=np.mean(
                                cosine_sent_to_former_layer_src[src_lang]
                                [layer]),
                            n_sents=n_obs,
                            avg_per_sent=np.std(
                                cosine_sent_to_former_layer_src[src_lang]
                                [layer]),
                            n_tokens_score=n_obs * max_len,
                            model_full_name_val=model_full_name_val,
                            task="hidden_state_analysis",
                            evaluation_script_val="exact_match",
                            model_args_dir=args.init_args_dir,
                            token_type="word",
                            report_path_val=None,
                            data_val=layer,
                        )
                        report_ls.append(report)

                layer_i += 1

                report = report_template(
                    metric_val=metric,
                    subsample=lang + "_to_" + src_lang,
                    info_score_val=None,
                    score_val=np.mean(cosine_sent_to_src[src_lang + '-' +
                                                         lang][layer]),
                    n_sents=n_obs,
                    #avg_per_sent=np.std(cosine_ls),
                    avg_per_sent=np.std(cosine_sent_to_src[src_lang + '-' +
                                                           lang][layer]),
                    n_tokens_score=n_obs * max_len,
                    model_full_name_val=model_full_name_val,
                    task="hidden_state_analysis",
                    evaluation_script_val="exact_match",
                    model_args_dir=args.init_args_dir,
                    token_type="word",
                    report_path_val=None,
                    data_val=layer,
                )

                report_ls.append(report)

                if args.compare_to_pretrained:

                    print(
                        f"Mean {metric} for {lang} between {layer} and the origin model is {np.mean(cosine_sent_to_origin[lang][layer])} std:{np.std(cosine_sent_to_origin[lang][layer])} measured on {len(cosine_sent_to_origin[lang][layer])} batches "
                        + info_model)
                    report = report_template(
                        metric_val=metric,
                        subsample=lang + "_to_origin",
                        info_score_val=None,
                        score_val=np.mean(cosine_sent_to_origin[lang][layer]),
                        n_sents=n_obs,
                        avg_per_sent=np.std(
                            cosine_sent_to_origin[lang][layer]),
                        n_tokens_score=n_obs * max_len,
                        model_full_name_val=model_full_name_val,
                        task="hidden_state_analysis",
                        evaluation_script_val="exact_match",
                        model_args_dir=args.init_args_dir,
                        token_type="word",
                        report_path_val=None,
                        data_val=layer)
                    report_ls.append(report)

                    if lang_i == 1:

                        print(
                            f"Mean {metric} for {src_lang} between {layer} and the origin model is {np.mean(cosine_sent_to_origin_src[src_lang][layer])} std:{np.std(cosine_sent_to_origin_src[src_lang][layer])} measured on {len(cosine_sent_to_origin_src[src_lang][layer])} batches "
                            + info_model)
                        report = report_template(
                            metric_val=metric,
                            subsample=src_lang + "_to_origin",
                            info_score_val=None,
                            score_val=np.mean(
                                cosine_sent_to_origin_src[src_lang][layer]),
                            n_sents=n_obs,
                            avg_per_sent=np.std(
                                cosine_sent_to_origin_src[src_lang][layer]),
                            n_tokens_score=n_obs * max_len,
                            model_full_name_val=model_full_name_val,
                            task="hidden_state_analysis",
                            evaluation_script_val="exact_match",
                            model_args_dir=args.init_args_dir,
                            token_type="word",
                            report_path_val=None,
                            data_val=layer)
                        report_ls.append(report)

        # break

    if args.report_dir is None:
        report_dir = PROJECT_PATH + f"/../../analysis/attention_analysis/report/{run_id}-report"
        os.mkdir(report_dir)
    else:
        report_dir = args.report_dir
    assert os.path.isdir(report_dir)
    with open(report_dir + "/report.json", "w") as f:
        json.dump(report_ls, f)

    overall_report = args.overall_report_dir + "/" + args.overall_label + "-grid-report.json"
    with open(overall_report, "r") as g:
        report_all = json.load(g)
        report_all.extend(report_ls)
    with open(overall_report, "w") as file:
        json.dump(report_all, file)

    print("{} {} ".format(REPORT_FLAG_DIR_STR, overall_report))
def main(args, dict_path, model_dir):
    model, tokenizer, run_id = load_all_analysis(args, dict_path, model_dir)
    
    if args.compare_to_pretrained:
        print("Loading Pretrained model also for comparison with pretrained")
        args_origin = args_attention_analysis()
        args_origin = args_preprocess_attention_analysis(args_origin)
        args_origin.init_args_dir = None
        args_origin, dict_path_0, model_dir_0 = get_dirs(args_origin)
        args_origin.model_id_pref += "again"
        model_origin, tokenizer_0, _ = load_all_analysis(args_origin, dict_path_0, model_dir_0)
        model_origin.eval()
        print("seco,")
    # only allow output of the model to be hidden states here
    print("Checkpoint loaded")
    assert not args.output_attentions
    assert args.output_all_encoded_layers and args.output_hidden_states_per_head

    data = ["I am here", "How are you"]
    model.eval()
    n_obs = args.n_sent
    max_len = args.max_seq_len
    lang_ls = args.raw_text_code

    lang = ["fr_pud",  # "de_pud", "ru_pud", "tr_pud", "id_pud", "ar_pud", "pt_pud",  "es_pud", "fi_pud",
            # "it_pud", "sv_pud", "cs_pud", "pl_pud", "hi_pud", "zh_pud", "ko_pud", "ja_pud","th_pud"
            ]
    src_lang_ls = ["tr_imst", "en_ewt", #"ja_gsd", "ar_padt",  #"en_pud", "tr_pud", "ru_pud",# "ar_pud", #"de_pud", "ko_pud",
                   "ug_udt"
                    ]  # , "fr_pud", "ru_pud", "ar_pud"]
    src_lang_ls = ["tr_dedup", "az_100k_shuff",
                    "en_100k", "kk_100k_shuff", #"hu_dedup", #"ar_padt",
                   # "en_pud", "tr_pud", "ru_pud",# "ar_pud", #"de_pud", "ko_pud",
                   #"ckb_dedup",# "ja_dedup_200k",
                   #"ar_dedup_200k", "fa_dedup_200k", 
                   "ug_udt",
                   ]
    src_lang_ls = [#"ar_oscar", "tr_dedup", "az_100k_shuff", "fa_dedup_200k",
                   # "it_oscar", "en_oscar", #"hu_dedup", #"ar_padt",
                   "ar_oscar","de_oscar","en_oscar","fa_oscar" ,"fi_oscar" ,"fr_oscar", "he_oscar", "hi_oscar","hu_oscar","it_oscar","ja_oscar", "ko_oscar", "ru_oscar","tr_oscar", 
                   ]
    src_lang_ls.append(args.target_lang)

    def add_demo(src_lang_ls):
        for i in range(len(src_lang_ls)):
            if src_lang_ls[i]!="mt_mudt":
                src_lang_ls[i] += "_demo"
        return src_lang_ls

    #add_demo(src_lang_ls)
    

    # target is last
    target_class_ind = len(src_lang_ls)-1
    target_lang = src_lang_ls[target_class_ind]
    #to_class = [""]
    set_ = "test"
    #set_ = "test-demo"
    #print("Loading data...")

    #data_en = load_data(DATA_UD + f"/{src_lang_ls[0]}-ud-{set_}.conllu", line_filter="# text = ")

    #id_start_start_class, id_end_target_class = get_id_sent_target(target_class_ind, data_target_dic)

    # reg = linear_model.LogisticRegression()
    # just to get the keys
    layer_all = get_hidden_representation(data, model, tokenizer, max_len=max_len)
    # removed hidden_per_layer

    assert len(layer_all) == 1
    # assert len(layer_all) == 2, "ERROR should only have hidden_per_layer and hidden_per_head_layer"

    report_ls = []
    accuracy_dic = OrderedDict()
    sampling = args.sampling
    metric = args.similarity_metric
    if metric == "cka":
        pad_below_max_len, output_dic = False, True
    else:
        pad_below_max_len, output_dic = False, True
    assert metric in ["cos", "cka"]

    batch_size = args.batch_size #len(data_en) // 4

    task_tuned = "No"

    if args.init_args_dir is None:
        # args.init_args_dir =
        id_model = f"{args.bert_model}-init-{args.random_init}"

        hyperparameters = OrderedDict([("bert_model", args.bert_model),
                                       ("random_init", args.random_init),
                                       ("not_load_params_ls", args.not_load_params_ls),
                                       ("dict_path", dict_path),
                                       ("model_id", id_model), ])
        info_checkpoint = OrderedDict([("epochs", 0), ("batch_size", batch_size),
                                       ("train_path", 0), ("dev_path", 0), ("num_labels_per_task", 0)])

        args.init_args_dir = write_args(os.environ.get("MT_NORM_PARSE", "./"), model_id=id_model,
                                        info_checkpoint=info_checkpoint,
                                        hyperparameters=hyperparameters, verbose=1)
        print("args_dir checkout ", args.init_args_dir)
        model_full_name_val = task_tuned + "-" + id_model
    else:
        with open(args.init_args_dir, 'r') as f_args:
            argument = json.load(f_args)
        task_tuned = argument["hyperparameters"]["tasks"][0][0] if "wiki" not in argument["info_checkpoint"][
            "train_path"] else "ner"
        model_full_name_val = task_tuned + "-" + args.init_args_dir.split("/")[-1]

    if args.analysis_mode == "layer":
        studied_ind = 0
    elif args.analysis_mode == "layer_head":
        studied_ind = 1
    else:
        raise (Exception(f"args.analysis_mode : {args.analysis_mode} corrupted"))


    output_dic = True
    pad_below_max_len = False
    max_len = 500

    sent_embeddings_per_lang = OrderedDict()
    sent_text_per_lang = OrderedDict()
    pick_layer = ["layer_6"]
    n_batch = args.n_batch
    #assert n_batch==1, "ERROR not working otherwise ! "
    demo = 0
    assert args.n_sent_extract <= args.batch_size * args.n_batch * (len(src_lang_ls) - 1), "ERROR not enough data provided for the selection"
    
    print(f"Starting processing : {n_batch} batch of size {batch_size}")

    def sanity_len_check(src_lang_ls, n_sent_per_lang):
        for src_lang in src_lang_ls:
            dir_data = OSCAR + f"/{src_lang}-train.txt"
            with open(dir_data) as data_file:
                num_lines = sum(1 for _ in data_file)
            print(f"Sanity checking {src_lang}: should have at least {n_sent_per_lang} sentences, it has {num_lines}")
            assert num_lines >= n_sent_per_lang, f"ERROR {src_lang} {num_lines} < {n_sent_per_lang} n_sent_per_lang"
    
    sanity_len_check(src_lang_ls[:-1], n_sent_per_lang=args.batch_size * args.n_batch)



    for i_data in tqdm(range(n_batch)):
        if demo:
            batch_size = 50
            n_batch = 1
            if i_data > 0:
                break
        for src_lang in tqdm(src_lang_ls):
            print(f"Loading lang {src_lang} batch size {batch_size}")
            
            #data_en = load_data(DATA_UD + f"/{src_lang}-ud-{set_}.conllu", line_filter="# text = ")
            #en_batch =  # data[i_data:i_data + batch_size]
            try:
                dir_data = get_dir_data(set="train", data_code=src_lang)
                filter_row = "# text = "
            except Exception as e:
                dir_data = OSCAR + f"/{src_lang}-train.txt"
                filter_row = ""
                print(f"{src_lang} not supported or missing : data defined as {dir_data} filter empty")
            try:
                en_batch = load_data(dir_data, line_filter=filter_row, id_start=i_data*batch_size, id_end=(i_data+1)*batch_size)
            except Exception as e:
                print(f"ERROR: cannot load data {dir_data}, skipping")
                if i_data == 0:
                    raise e
                continue
            if en_batch is None:
                print(f"lang {src_lang} reading {i_data*batch_size} seems empty so skipping")
                continue

            if src_lang not in sent_text_per_lang:
                sent_text_per_lang[src_lang] = []
            sent_text_per_lang[src_lang].extend(en_batch)

            all = get_hidden_representation(en_batch, model, tokenizer, pad_below_max_len=pad_below_max_len,
                                            max_len=max_len, output_dic=output_dic)

            analysed_batch_dic_en = all[studied_ind]
            i_lang = 0

            if args.compare_to_pretrained:
                all_origin = get_hidden_representation(en_batch, model_origin, tokenizer_0,
                                                       pad_below_max_len=pad_below_max_len,
                                                       max_len=max_len, output_dic=output_dic)
                analysed_batch_dic_src_origin = all_origin[studied_ind]

            for layer in analysed_batch_dic_en:
                if layer not in pick_layer:
                    continue
                print(f"Picking layer {layer} (pick_layer={pick_layer})")
                print(f"Starting layer {layer}")
                # average over each sentence, excluding the first and last special tokens
                if layer not in sent_embeddings_per_lang:
                    sent_embeddings_per_lang[layer] = OrderedDict()
                if src_lang not in sent_embeddings_per_lang[layer]:
                    sent_embeddings_per_lang[layer][src_lang] = []
                if output_dic:
                    mean_over_sent_src = []
                    #mean_over_sent_target = []
                    #mean_over_sent_target_origin = []
                    mean_over_sent_src_origin = []
                    for i_sent in range(len(analysed_batch_dic_en[layer])):
                        # remove the first and last special tokens (CLS/SEP) before averaging
                        mean_over_sent_src.append(
                            np.array(analysed_batch_dic_en[layer][i_sent][0, 1:-1, :].cpu().mean(dim=0)))
                        #mean_over_sent_target.append(
                        #    np.array(analysed_batch_dic_target[layer][i_sent][0, 1:-1, :].mean(dim=0)))
                        if args.compare_to_pretrained:
                        #    mean_over_sent_target_origin.append(
                        #        np.array(analysed_batch_dic_target_origin[layer][i_sent][0, 1:-1, :].mean(dim=0)))
                            if i_lang == 1:
                                mean_over_sent_src_origin.append(
                                    np.array(analysed_batch_dic_src_origin[layer][i_sent][0, 1:-1, :].mean(dim=0)))
                    if args.compare_to_pretrained:
                    #    mean_over_sent_target_origin = np.array(mean_over_sent_target_origin)
                        if i_lang == 1:
                            mean_over_sent_src_origin = np.array(mean_over_sent_src_origin)
                    mean_over_sent_src = np.array(mean_over_sent_src)
                    #mean_over_sent_target = np.array(mean_over_sent_target)
                else:
                    mean_over_sent_src = analysed_batch_dic_en[layer][:, 1:-1, :].mean(dim=1)
                    #mean_over_sent_target = analysed_batch_dic_target[layer][:, 1:-1, :].mean(dim=1)

                sent_embeddings_per_lang[layer][src_lang].append(mean_over_sent_src)
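                # sent_embeddings_per_lang[layer][src_lang] thus accumulates one
                # (batch_size, hidden_dim) array per batch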

    def get_id_sent_target(target_class_ind, data_target_dic):
        # NB: unused in this version; returns the sentence-index span of the
        # target class when all languages are concatenated
        n_sent_total = 0
        id_start_target_class, id_end_target_class = None, None
        assert target_class_ind <= len(data_target_dic)
        for ind_class, lang in enumerate(src_lang_ls):
            if ind_class == target_class_ind:
                n_sent_class = len(data_target_dic[lang])
                id_start_target_class = n_sent_total
                id_end_target_class = n_sent_total + n_sent_class
            n_sent_total += len(data_target_dic[lang])
        return id_start_target_class, id_end_target_class

    clustering = "distance"

    if clustering in ["gmm", "spectral"]:
        concat_train,  concat_test, y_train, y_test, lang2id = concat_all_lang_space_split_train_test(sent_embeddings_per_lang, src_lang_ls, pick_layer)
        #X = np.array(concat).squeeze(1)
        X_train = np.array(concat_train)
        X_test = np.array(concat_test)

        if len(X_train.shape) > 2:

            X_train = X_train.reshape(X_train.shape[0]*X_train.shape[1],-1)
            X_test = X_test.reshape(X_test.shape[0]*X_test.shape[1],-1)
        if clustering == "gmm":
            model = mixture.GaussianMixture(n_components=len(src_lang_ls)-1, covariance_type='full')
            model.fit(X_train)
            model_based_clustering = True
        elif clustering == "spectral":
            model = cluster.SpectralClustering(n_clusters=len(src_lang_ls))
            model.fit(X_train)
            model_based_clustering = True

    elif clustering == "distance":
        # concat batch_size

        for layer in sent_embeddings_per_lang:
            assert len(sent_embeddings_per_lang[layer]) > 1, "ERROR: distance measure requires more than one language!"
            for lang in sent_embeddings_per_lang[layer]:
                arr = np.array(sent_embeddings_per_lang[layer][lang])
                if arr.shape[0] != n_batch:
                    print(f"WARNING: shape: {lang} {arr.shape} reshaping to {arr.shape[0]*arr.shape[1]}")
                sent_embeddings_per_lang[layer][lang] = arr.reshape(arr.shape[0] * arr.shape[1], -1)
                assert sent_embeddings_per_lang[layer][lang].shape[0] == len(sent_text_per_lang[lang]), f"ERROR lang {lang} layer {layer} {sent_embeddings_per_lang[layer][lang].shape}[0]<>{len(sent_text_per_lang[lang])}"

        sent_embeddings_per_lang_train, sent_embeddings_per_lang_test, sent_text_per_lang = \
            split_train_test(sent_embeddings_per_lang, sent_text_per_lang,
                             keep_text_test=True, target_lang=target_lang,
                             target_lang_no_test=True,
                             prop_train=1 / 20)

        centroid_train, ls_lang = get_centroid(sent_embeddings_per_lang_train, target_lang=target_lang, only_target_centoid=False)
        # outputing for each sentence (with layer x lang information)
        print("ls_lang", ls_lang)
        closest_lang, score_to_target_test = get_closest_centroid(sent_embeddings_per_lang_test, centroid_train, ls_lang, ind_lang_target=target_class_ind)

        get_stat_distance(closest_lang, ls_lang, target_lang)

        count_n_extracted_sent = 0
        for layer in score_to_target_test:
            for lang in score_to_target_test[layer]:
                count_n_extracted_sent += len(score_to_target_test[layer][lang])
        print(f"Cosine extracted sent {count_n_extracted_sent}")
        test_sent_extracted, index_test_extracted, info_per_layer_select = get_closest_n_sent(n_sent=args.n_sent_extract, score_to_target=score_to_target_test, sent_text_per_lang=sent_text_per_lang, lang_ls=src_lang_ls,
                                                                                              target_lang=target_lang)
        get_iou_inter(index_test_extracted)


        dir_file = os.path.join(os.environ.get("OSCAR", "/Users/bemuller/Documents/Work/INRIA/dev/data"),"data_selected")
        #dir_file = "/Users/bemuller/Documents/Work/INRIA/dev/data/data_selected"
        write_down_selected(test_sent_extracted, info_per_layer_select, dir_file, id=f"select-{args.overall_label}-{args.bert_model}-{target_lang}-n_sent-{args.n_sent_extract}")


    if clustering in ["gmm", "spectral"]:
        target_class_ind = X_train
        predict_proba_train = model.predict_proba(X_train)
        predict_train = model.predict(X_train)
        predict_proba = model.predict_proba(X_test)
        predict_test = model.predict(X_test)

        def get_most_common_per_class(predict, lang2id):
            "For each class: find the predicted cluster using majority vote."
            from collections import Counter
            id_class_start = 0
            id_class_end = 0
            pred_label_to_real_label = {}
            for lang in lang2id:
                id_class_end += lang2id[lang]["n_sent_train"]

                pred_class = predict[id_class_start:id_class_end]

                assert len(pred_class) > 0
                id_class_start = id_class_end
                pred_class_counter = Counter(pred_class)
                most_common_label = pred_class_counter.most_common(1)[0][0]
                lang2id[lang]["pred_label"] = most_common_label
                if most_common_label in pred_label_to_real_label:
                    print(f"WARNING: {most_common_label} is the most common predicted label in two different classes")
                pred_label_to_real_label[most_common_label] = lang2id[lang]["id"]
            return lang2id, pred_label_to_real_label
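        # e.g. if cluster 2 is the majority prediction over a language's training
        # sentences, later predictions of 2 are mapped back to that language's id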

        lang2id, pred_label_to_real_label = get_most_common_per_class(predict_train, lang2id)
        print(f"V metric train {v_measure_score(predict_train, y_train)}")
        print(f"V metric test {v_measure_score(predict_test, y_test)}")

        def adapt_label(pred_label_to_real_label, pred):
            "Based on the majority-vote mapping: map predicted cluster labels to the real label set."
            pred_new = []
            for label_pred in pred:
                if label_pred not in pred_label_to_real_label:
                    print("Warning: predicted label not associated with any true label")
                pred_new.append(pred_label_to_real_label.get(label_pred, label_pred))
            return pred_new
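        # e.g. adapt_label({0: 3, 1: 1}, [0, 1, 0]) -> [3, 1, 3]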

        def print_report(report, src_lang_ls, lang2id):
            for lang in src_lang_ls:
                id_label = lang2id[lang]["id"]
                print(f"Lang {lang} summary {report[str(id_label)]}")

            print(f"Macro Avg {lang} summary {report['macro avg']}")

        pred_new_train = adapt_label(pred_label_to_real_label, predict_train)
        report = classification_report(y_pred=pred_new_train, y_true=y_train, output_dict=True)
        print_report(report, src_lang_ls, lang2id)

        pred_new_test = adapt_label(pred_label_to_real_label, predict_test)
        report = classification_report(y_pred=pred_new_test, y_true=y_test, output_dict=True)

        print_report(report, src_lang_ls, lang2id)

        #print(predict_proba_train, predict_proba)



    # based on this --> for a given source set of sentences (ex : uyghur sentences)
    # 1 - find the cluster id of Uyghur sentences
    # 2 - get the x top sentences that have high proba for
    # 3 - print it to see if that makes sense
    # do it for 1000 uy , 1000k for 10 other languages
    # then same
    # then compare overlap per layers


    # summary
    print_all = True
    lang_i = 0
    src_lang_i = 0