def prediction_topk_to_string(predictions_topk,
                              input_alignement_with_raw,
                              topk,
                              tokenizer,
                              null_token_index,
                              null_str,
                              tasks,
                              verbose=1):

    sentence_pred = from_bpe_token_to_str(predictions_topk,
                                          topk,
                                          null_str=null_str,
                                          null_token_index=null_token_index,
                                          tokenizer=tokenizer,
                                          pred_mode=True)
    sentence_pred_aligned = []

    for top in range(topk):
        realign_sent = realigne(sentence_pred[top],
                                input_alignement_with_raw,
                                tasks=tasks,
                                null_str=null_str,
                                remove_null_str=True,
                                mask_str="X")
        assert len(
            realign_sent
        ) == 1, "ERROR : only batch len 1 accepted here (we are doing interaction)"
        printing("{} top-pred : bpe {}",
                 var=[top, realign_sent],
                 verbose_level=2,
                 verbose=verbose)
        realign_sent = " ".join(realign_sent[0])
        sentence_pred_aligned.append(realign_sent)
    return sentence_pred_aligned, sentence_pred
def interact(dic_path,
             model_full_name,
             dir_model,
             debug=False,
             model_specific_dictionary=True,
             beam_size=2,
             word_decoding=False,
             extra_arg_specific_label="",
             save_attention=False,
             show_attention=False,
             beam_decode=False,
             max_len=MAX_LEN,
             verbose=2):
    from model.seq2seq import LexNormalizer

    assert model_specific_dictionary
    char_dictionary = None
    voc_size = None

    if not debug:
        # disable pdb breakpoints when not running in debug mode
        pdb.set_trace = lambda: 1
    model = LexNormalizer(generator=Generator,
                          voc_size=voc_size,
                          load=True,
                          model_full_name=model_full_name,
                          model_specific_dictionary=model_specific_dictionary,
                          dict_path=dic_path,
                          dir_model=dir_model,
                          extra_arg_specific_label=extra_arg_specific_label,
                          loading_sanity_test=True,
                          word_decoding=word_decoding,
                          char_decoding=not word_decoding,
                          verbose=verbose)
    model.eval()
    if show_attention or save_attention:
        assert model.decoder.attn_layer is not None, "ERROR : no attention to plot "
    if save_attention:
        dir_attention = os.path.join(dir_model, "attention_plot")
        if os.path.isdir(dir_attention):
            info = "existing"
        else:
            os.mkdir(dir_attention)
            info = "created"
        printing("Saving to {} {}",
                 var=[info, dir_attention],
                 verbose_level=1,
                 verbose=verbose)
    else:
        dir_attention = None
    decode_interacively(max_len=max_len,
                        model=model,
                        char_dictionary=char_dictionary,
                        sent_mode=True,
                        dir_attention=dir_attention,
                        save_attention=save_attention,
                        show_attention=show_attention,
                        beam_decode=beam_decode,
                        beam_size=beam_size,
                        showing_attention=show_attention,
                        verbose=verbose)
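
A minimal way to launch the interactive loop; the checkpoint folder, dictionary path and model name below are hypothetical and must point to an actual trained LexNormalizer model.

# Hypothetical paths : adapt them to a real trained checkpoint folder.
interact(dic_path="./checkpoints/demo_model-folder/dictionaries",
         model_full_name="demo_model",
         dir_model="./checkpoints/demo_model-folder",
         beam_decode=False,
         show_attention=False,
         verbose=1)
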
def get_args(args, dropout_loading_strict=True, verbose=0):
    default_dropout = None if dropout_loading_strict else 0
    if dropout_loading_strict:
        printing("WARNING : errors might come from misloading of dropout ",
                 verbose=verbose,
                 verbose_level=0)
    return args["char_embedding_dim"], args["output_dim"], args["hidden_size_encoder"], args[
        "hidden_size_sent_encoder"], args["encoder_arch"].get("dropout_sent_encoder_cell", default_dropout), args["encoder_arch"].get(
        "dropout_word_encoder_cell", default_dropout), args["encoder_arch"].get("drop_out_sent_encoder_out", default_dropout), args[
               "encoder_arch"].get("drop_out_word_encoder_out", default_dropout), \
           args["encoder_arch"].get("n_layers_word_encoder"), args["encoder_arch"].get("n_layers_sent_cell",1),args["encoder_arch"].get("dir_sent_encoder"), \
           args["encoder_arch"].get("word_recurrent_cell_encoder", None), args["encoder_arch"].get("dir_word_encoder",1), \
           args["hidden_size_decoder"], args["decoder_arch"].get("cell_word", None), args["decoder_arch"].get(
              "drop_out_word_decoder_cell", default_dropout), args["decoder_arch"].get("drop_out_char_embedding_decoder", default_dropout),\
            args.get("auxilliary_arch", {}).get("auxilliary_task_norm_not_norm", False), args["decoder_arch"].get("unrolling_word", False), args["decoder_arch"].get("char_src_attention", False),\
            args.get("auxilliary_arch", {}).get("auxilliary_task_norm_not_norm-dense_dim", None), args.get("shared_context","all"),  args["decoder_arch"].get("teacher_force", 1), \
           args.get("auxilliary_arch", {}).get("auxilliary_task_norm_not_norm-dense_dim_2"), args["decoder_arch"].get("stable_decoding_state", False), args["decoder_arch"].get("init_context_decoder", True),\
           args["decoder_arch"].get("word_decoding", 0), args["decoder_arch"].get("char_decoding", 1), \
           args.get("auxilliary_arch", {}).get("auxilliary_task_pos", False), \
           args.get("auxilliary_arch", {}).get("dense_dim_auxilliary_pos", None), args.get("auxilliary_arch", {}).get("dense_dim_auxilliary_pos_2", None), \
            args["decoder_arch"].get("dense_dim_word_pred",0), args["decoder_arch"].get("dense_dim_word_pred_2",0), args["decoder_arch"].get("dense_dim_word_pred_3",0), \
           args.get("symbolic_root", False), args.get("symbolic_end", False), \
           args["encoder_arch"].get("word_embedding_dim", 0), args["encoder_arch"].get("word_embed", False), \
           args["encoder_arch"].get("word_embedding_projected_dim", None),  args["decoder_arch"].get("activation_char_decoder"), args["decoder_arch"].get("activation_word_decoder"), \
            args["encoder_arch"].get("attention_tagging", False), \
           args["encoder_arch"].get("char_level_embedding_projection_dim",0),\
           args["encoder_arch"].get("mode_word_encoding", "cat"), \
           args.get("multi_task_loss_ponderation", "all"), args.get("sanity_test", {})
def reframe_tsv_to_sentConll(src_dir, target_dir, verbose=1):

    with open(src_dir, "r") as f:
        with open(target_dir, "w") as g:
            line = "0"
            while len(line) > 0:
                line = f.readline()
                line = line.strip()
                sent = line.split("\t")
                print(sent)
                if len(line) > 0:

                    if len(sent) < 3:
                        g.write("#sent_id {}\n".format("XXX"))
                        g.write(
                            "1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(
                                sent[0], sent[1]))
                    else:
                        g.write("#sent_id {}\n".format(sent[0]))
                        g.write(
                            "1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(
                                sent[1], sent[2]))
                    g.write("\n")

    printing("WRITTEN {} src and {} target directory".format(
        src_dir, target_dir),
             verbose=verbose,
             verbose_level=1)
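
A small usage sketch with a hypothetical two-column TSV (source token, normalization); each row becomes a one-token sentence block with the gold normalization stored in a Norm= field.

# Hypothetical file names : each "src\tnorm" row is reframed into a one-token CoNLL-like sentence.
with open("toy_norm.tsv", "w") as tmp:
    tmp.write("u\tyou\n")
    tmp.write("gr8\tgreat\n")
reframe_tsv_to_sentConll(src_dir="toy_norm.tsv", target_dir="toy_norm.sent_conll")
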
 def __init__(self, input_dim, dense_dim, dense_dim_2=0, verbose=1):
     super(BinaryPredictor, self).__init__()
     self.verbose = verbose
     self.dense_2 = None
     self.dense = None
     dim_predictor = input_dim
     if dense_dim is not None:
         if dense_dim > 0:
             self.dense = nn.Linear(input_dim, dense_dim)
             dim_predictor = dense_dim
             printing(
                 "WARNING : BinaryPredictor dense_dim is set to {} in norm_not_norm predictor",
                 var=dense_dim,
                 verbose=self.verbose,
                 verbose_level=1)
             if dense_dim_2 is not None:
                 if dense_dim_2 > 0:
                     self.dense_2 = nn.Linear(dense_dim, dense_dim_2)
                     dim_predictor = dense_dim_2
     else:
         assert dense_dim_2 is None or dense_dim_2 == 0, "ERROR : dense_dim_2 cannot be set if dense_dim is None"
         printing(
             "WARNING : BinaryPredictor as dense_dim is None no dense layer added to norm_not_norm predictor",
             verbose=self.verbose,
             verbose_level=1)
     self.predictor = nn.Linear(dim_predictor, out_features=2)
def writer_weights_and_grad(model,
                            freq_writer,
                            epoch,
                            writer,
                            verbose,
                            report_grad=True,
                            report_weight=True):
    if epoch % freq_writer == 0:
        # TODO : make this a unit test in test/
        if TEST_CLIPPING:
            for name, param in model.named_parameters():
                if param.requires_grad and param.grad is not None:
                    norm = param.grad.norm()
                    print("grad_norm writer_weights_and_grad", norm)
                    assert norm < CLIP
        for name, param in model.named_parameters():
            if report_weight:
                try:
                    writer.add_histogram(name,
                                         param.clone().cpu().data.numpy(),
                                         epoch)
                except Exception as e:
                    print("ERROR unable to report histogram ")
                    print(e)
            if param.requires_grad and param.grad is not None and report_grad:
                writer.add_histogram("grad" + name + "-grad",
                                     param.grad.clone().cpu().data.numpy(),
                                     epoch)
        printing("REPORTING : storing weights and grad in writer ",
                 verbose=verbose,
                 verbose_level=1)
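
A usage sketch with a toy module and a TensorBoard SummaryWriter (log directory arbitrary); histograms are only written on epochs that are multiples of freq_writer, and the sketch assumes the module-level TEST_CLIPPING flag is off.

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

toy_model = nn.Linear(4, 2)
# give the parameters gradients so the grad histograms have something to report
toy_model(torch.randn(3, 4)).sum().backward()
tb_writer = SummaryWriter(log_dir="./runs/toy")
writer_weights_and_grad(model=toy_model, freq_writer=5, epoch=10,
                        writer=tb_writer, verbose=1)
tb_writer.close()
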
def scheduling_policy(phases_ls, epoch, tasks, verbose=1):

    if phases_ls is None:
        ponderation = 1
        weight_binary_loss = 1
        weight_pos_loss = 1
        if len(tasks) == 1:
            mode = tasks[0]
        else:
            mode = "all"
        printing(
            "WARNING : default policy scheduling (no scheduling {} ponderation_normalize_loss,  {}"
            " weight_binary_loss and pos {} : LOSS MODE SET TO {} ",
            var=[ponderation, weight_binary_loss, weight_pos_loss, mode],
            verbose_level=1,
            verbose=verbose)
        return mode, ponderation, weight_binary_loss, weight_pos_loss
    for phase in phases_ls:

        assert phase.get("epoch_start") is not None
        assert phase.get("epoch_stop") is not None
        assert phase.get("epoch_stop") > phase.get("epoch_start")
        assert phase.get("weight_binary_loss") is not None
        assert phase.get("ponderation_normalize_loss") is not None
        assert phase.get("multi_task_mode") in AVAILABLE_TASKS

        if phase["epoch_start"] <= epoch < phase["epoch_stop"]:
            return phase["multi_task_mode"], phase[
                "ponderation_normalize_loss"], phase["weight_binary_loss"]
def reframe_tsv_monolingual_to_sentConll(src_dir,
                                         src_dir_2,
                                         target_dir,
                                         verbose=1):

    with open(src_dir, "r") as f:
        with open(src_dir_2, "r") as f2:
            with open(target_dir, "w") as g:
                line = "0"
                line_2 = "0"
                i = 0
                while len(line) > 0:
                    while len(line_2) > 0:
                        line = f.readline()
                        line_2 = f2.readline()
                        line = line.strip()
                        line_2 = line_2.strip()
                        i += 1
                        print(line, line_2)
                        if len(line) > 0:
                            g.write("#sent_id {}\n".format("i"))
                            g.write("1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".
                                    format(line, line_2))
                            g.write("\n")

    printing("WRITTEN {} src and {} target directory".format(
        src_dir, target_dir),
             verbose=verbose,
             verbose_level=1)
def append_reporting_sheet(git_id,
                           tasks,
                           rioc_job,
                           description,
                           log_dir,
                           target_dir,
                           env,
                           status,
                           verbose=1):

    sheet, sheet_name, tab_name = open_client()
    # Find a workbook by name and open the first sheet
    # Make sure you use the right name here.
    #worksheet_list = sheet.worksheets()
    if not rioc_job.startswith("local"):
        sheet.append_row([
            git_id, rioc_job, tasks, description, log_dir, target_dir, env,
            status, None, None, None, None, "-"
        ])
        list_of_hashes = sheet.get_all_records()
        printing(
            "REPORT : Appending report to page {} in sheet {} of {} rows and {} columns ",
            var=[
                tab_name, sheet_name,
                len(list_of_hashes) + 1,
                len(list_of_hashes[0])
            ],
            verbose=verbose,
            verbose_level=1)
    else:
        list_of_hashes = ["NOTHING"]
    return len(list_of_hashes) + 1, len(list_of_hashes[0])
def pos_specific_dic_builder(pos_specific_data_set, pos_dictionary):
    if pos_specific_data_set is not None:
        assert os.path.exists(
            pos_specific_data_set), "{} does not exist".format(
                pos_specific_data_set)
        with codecs.open(pos_specific_data_set, 'r', 'utf-8',
                         errors='ignore') as file:
            li = 0
            for line in file:
                line = line.strip()
                if len(line) == 0 or line[0] == '#':
                    continue
                tokens = line.split('\t')
                if '-' in tokens[0] or '.' in tokens[0]:
                    continue
                pos = tokens[
                    3]  # if tokens[4]=='_' else tokens[3]+'$$$'+tokens[4]
                #xpos = tokens[4]
                pos_dictionary.add(pos)
                #xpos_dictionary.add(xpos)
        printing(
            "VOCABULARY : POS Vocabulary : pos dictionary built on {} ".format(
                pos_specific_data_set),
            verbose_level=1,
            verbose=1)
        return pos_dictionary
    printing("VOCABULARY : POS Vocabulary : pos dictionary untouched",
             verbose_level=1,
             verbose=1)
    return pos_dictionary
def write_args(dir,
               model_id,
               checkpoint_dir=None,
               hyperparameters=None,
               info_checkpoint=None,
               verbose=1):

    args_dir = os.path.join(dir, "{}-args.json".format(model_id))
    if os.path.isfile(args_dir):
        info = "updated"
        args = json.load(open(args_dir, "r"))
        args["checkpoint_dir"] = checkpoint_dir
        args["info_checkpoint"] = info_checkpoint
        json.dump(args, open(args_dir, "w"))
    else:
        assert hyperparameters is not None, "REPORT : args.json created for the first time : hyperparameters dic required "
        assert info_checkpoint is None, "REPORT : args. created for the first time : no checkpoint yet "
        info = "new"
        json.dump(
            OrderedDict([("checkpoint_dir", checkpoint_dir),
                         ("hyperparameters", hyperparameters),
                         ("info_checkpoint", None)]), open(args_dir, "w"))
    printing("MODEL args.json {} written {} ".format(info, args_dir),
             verbose_level=1,
             verbose=verbose)
    return args_dir
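
Usage sketch: the first call creates {model_id}-args.json from the hyperparameters, a later call updates it with checkpoint information; the directory, model id and values below are hypothetical.

os.makedirs("./checkpoints/demo", exist_ok=True)
# first call : file does not exist yet, hyperparameters are required and info_checkpoint must be None
args_path = write_args(dir="./checkpoints/demo", model_id="demo",
                       hyperparameters={"lr": 0.001, "batch_size": 10},
                       info_checkpoint=None)
# later call : file exists, so only checkpoint_dir and info_checkpoint are updated
write_args(dir="./checkpoints/demo", model_id="demo",
           checkpoint_dir="./checkpoints/demo/demo-5of50epoch.pt",
           info_checkpoint={"epochs_ran": 5})
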
def construct_word_embedding_table(word_dim,
                                   word_dictionary,
                                   word_embed_init_toke2vec,
                                   verbose=1):
    scale = np.sqrt(5.0 / word_dim * 10)
    # +1 required for default value
    table = np.zeros([len(word_dictionary) + 1, word_dim], dtype=np.float32)
    # WARNING: unfilled entries will keep the all-zero row, which is also the default index !!
    if verbose >= 1:
        print(
            "Initializing table with shape {} based on word_dictionary and word_dim  "
            .format(table.shape))
    table[UNK_ID, :] = np.random.uniform(-scale, scale,
                                         [1, word_dim]).astype(np.float32)
    oov = 0
    inv = 0
    var = 0
    mean = 0
    var_oov = 0
    mean_oov = 0
    for word, index in word_dictionary.items():

        if word in word_embed_init_toke2vec:
            embedding = word_embed_init_toke2vec[word]
            inv += 1
            #print("PRETRAINED VECTOR", index, word, embedding)
            mean += np.mean(embedding)
            var += np.var(embedding)
        elif word.lower() in word_embed_init_toke2vec:
            embedding = word_embed_init_toke2vec[word.lower()]
            #print("LOWER PRETRAINED VECTOR", index, word, embedding)
            inv += 1
            mean += np.mean(embedding)
            var += np.var(embedding)
        else:
            if word == "the":
                print("word ", word, " --> not accounted")
            embedding = np.random.uniform(-scale, scale,
                                          [1, word_dim]).astype(np.float32)
            mean_oov += np.mean(embedding)
            var_oov += np.var(embedding)
            #print("RANDOMY GENERATED", index, word, embedding)
            oov += 1
        table[index, :] = embedding
        #print("repeat", table[index, :])
    printing(
        "W2V INFO : Mean of preloaded w2v {} var {} "
        "while the one generated randomly have {} mean and {} var in average",
        var=[mean / inv, var / inv, mean_oov / oov, var_oov / oov],
        verbose_level=1,
        verbose=verbose)
    printing('W2V INFO  : OOV: %d/%d (%f rate (percent)) in %d' %
             (oov, len(word_dictionary) + 1,
              100 * float(oov / (len(word_dictionary) + 1)), inv),
             verbose_level=1,
             verbose=verbose)
    word = "the"
    print("word {} of index {} has vector {} ".format(word, index, embedding))
    return torch.from_numpy(table)
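
A toy usage sketch: word_dictionary is anything whose .items() yields (word, index) pairs and word_embed_init_toke2vec maps words to pretrained vectors; the module-level UNK_ID is assumed to be a small index (typically 0) that fits within the len(dictionary)+1 rows of the table.

import numpy as np

# toy inputs : real code passes the project word dictionary and a loaded w2v/fastText dict
toy_dictionary = {"the": 1, "cat": 2, "zzzunseen": 3}
toy_pretrained = {"the": np.full(4, 0.1, dtype=np.float32),
                  "cat": np.full(4, 0.2, dtype=np.float32)}
embedding_table = construct_word_embedding_table(word_dim=4,
                                                 word_dictionary=toy_dictionary,
                                                 word_embed_init_toke2vec=toy_pretrained,
                                                 verbose=1)
print(embedding_table.shape)  # torch.Size([4, 4]) : len(dictionary)+1 rows
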
def update_status(row, value, col_number=8, sheet=None, verbose=1):
    if sheet is None:
        sheet, sheet_name, tab_name = open_client()
    if value is not None:
        sheet.update_cell(row, col_number, value)
        printing("REPORT : col {} updated in sheet with {} ",
                 var=[col_number, value],
                 verbose=verbose,
                 verbose_level=1)
 def expand_vocab(data_paths):
     counter_match_dev = 0
     expand = 0
     vocab_set = set(vocab_list)
     vocab_norm_set = set(vocab_norm_list)
     for data_path in data_paths:
         with codecs.open(data_path, 'r', 'utf-8', errors='ignore') as file:
             li = 0
             for line in file:
                 line = line.strip()
                 if len(line) == 0 or line[0] == '#':
                     continue
                 tokens = line.split('\t')
                 if '-' in tokens[0] or '.' in tokens[0]:
                     continue
                 for char in tokens[1]:
                     char_dictionary.add(char)
                 word = DIGIT_RE.sub(b"0", str.encode(tokens[1])).decode()
                 if case == "lower":
                     word = word.lower()
                 pos = tokens[
                     3]  # if tokens[4]=='_' else tokens[3]+'$$$'+tokens[4]
                 xpos = tokens[4]
                 typ = tokens[7]
                 # TODO : something
                 if word_normalization:
                     token_norm, _ = get_normalized_token(tokens[9],
                                                          0,
                                                          verbose=0)
                 if word_normalization:
                     # TODO : add word_norm_embed_dict to allow expansion !
                     if False and word_norm not in vocab_norm_set:
                         vocab_norm_set.add(word_norm)
                         vocab_norm_list.append(word_norm)
                 # TODO : ANswer : WHY WOULD WE LIKE TO EXPAND IT ON DEV, TEST ?
                 #if pos_specific_data_set is None:
                 #  pos_dictionary.add(pos)
                 #xpos_dictionary.add(xpos)
                 #type_dictionary.add(typ)
                 # if word is not already in vocab_set (loaded from training, and each time expand_vocab is called)
                 # but appears in the new dataset and in word_embed_dict, then add it to vocab
                 # (otherwise there is no need to load it : it would have no pretrained representation)
                 if word not in vocab_set and (word in word_embed_dict
                                               or word.lower()
                                               in word_embed_dict):
                     vocab_set.add(word)
                     expand += 1
                     vocab_list.append(word)
                 li = li + 1
                 if dry_run and li == 100:
                     break
             printing(
                 "VOCABULARY EXPAND word source vocabulary expanded of {} tokens based on {} ",
                 var=[expand, data_path],
                 verbose=verbose,
                 verbose_level=0)
def use_gpu_(use_gpu, verbose=0):
    if use_gpu is not None and use_gpu:
        assert torch.cuda.is_available(
        ), "ERROR : use_gpu was set to True but cuda not available "
    use_gpu = torch.cuda.is_available() if use_gpu is None else use_gpu
    printing("HARDWARE : use_gpu set to {} ",
             var=[use_gpu],
             verbose=verbose,
             verbose_level=1)
    return use_gpu
def checkpoint(loss_saved,
               loss,
               model,
               model_dir,
               epoch,
               epochs,
               info_checkpoint,
               saved_epoch,
               counter_no_decrease,
               verbose,
               extra_checkpoint_label="",
               extra_arg_specific_label="",
               checkpointing_metric="loss-dev-all",
               checkpoint_dir_former=None,
               keep_all_checkpoint=False):

    if loss < loss_saved:
        saved_epoch = epoch
        printing(
            'Checkpoint info : {} (former:{} current:{}) decreased so saving model '
            'saved epoch is {} (actual epoch {}) (counter_no_decrease set to 0)',
            var=[checkpointing_metric, loss_saved, loss, saved_epoch, epoch],
            verbose=verbose,
            verbose_level=1)
        loss_saved = loss

        _, _, checkpoint_dir = model.save(
            model_dir,
            model,
            info_checkpoint=info_checkpoint,
            extra_arg_specific_label=extra_arg_specific_label,
            suffix_name="{}-{}of{}epoch".format(extra_checkpoint_label, epoch,
                                                epochs),
            verbose=verbose)
        if not keep_all_checkpoint:
            model.rm_checkpoint(checkpoint_dir_former, verbose=verbose)
        checkpoint_dir_former = checkpoint_dir
        counter_no_decrease = 0
    else:
        # could add reloading the former model if the loss suddenly spikes
        #printing('Checkpoint info : Loss decreased so saving model', verbose=verbose, verbose_level=1)
        #model.load_state_dict(torch.load(checkpoint_dir))
        # TODO : load former checkpoint and adjust the reported loss if the error suddenly spikes
        counter_no_decrease += 1
        printing(
            "Checkpoint info: {} (former:{} current:{}) did not decrease so keeping former model of epoch {} "
            "counter_no_decrease is now {} (actual epoch {}) ",
            var=[
                checkpointing_metric, loss_saved, loss, saved_epoch,
                counter_no_decrease, epoch
            ],
            verbose=verbose,
            verbose_level=1)

    return model, loss_saved, counter_no_decrease, saved_epoch, checkpoint_dir_former
def simple_plot(
        final_loss,
        loss_ls,
        epoch_ls_1,
        epoch_ls_2=None,
        loss_2=None,
        epochs=None,
        V=None,
        seq_len=None,
        label="",
        label_2="",
        dir="/Users/bemuller/Documents/Work/INRIA/dev/mt_norm_parse/test_/test_logs",
        lr=None,
        save=False,
        show=True,
        prefix="test",
        verbose=0,
        verbose_level=1):

    if loss_2 is None:
        assert len(label_2) == 0, "Label_2 should be '' as loss_2 is None "
    if len(label_2) > 0:
        assert len(label) > 0, "label should be specified as label_2 is "

    printing("REPORT : Final Loss to be plotted {} ".format(final_loss),
             verbose=verbose,
             verbose_level=1)
    plt.figure()
    plt.title("Training Loss with after {} epo (lr {}) ".format(epochs, lr))
    plt.xlabel("epoch")
    color_train = "red"
    plt.plot(epoch_ls_1, loss_ls, label="plot1", color=color_train)
    patches = [mpatches.Patch(color=color_train, label=label)]
    if loss_2 is not None:
        assert epoch_ls_2 is not None, "epoch_ls_2 should not be None"
        color_dev = "blue"
        plt.plot(epoch_ls_2, loss_2, label="plot2", color=color_dev)
        patches.append(mpatches.Patch(color=color_dev, label=label_2))
    plt.legend(handles=patches)
    dir_fig = os.path.join(
        dir, "{}-{}-V{}-lr{}-seq{}-plot.png".format(prefix, "last", V, lr, seq_len))
    if save:
        plt.savefig(dir_fig)
        printing("REPORT : Learning curve saved at {} ",
                 var=([dir_fig]),
                 verbose=verbose,
                 verbose_level=verbose_level)

    if show:
        # plt.show() is left disabled : only report that interactive display is skipped
        print("Not showing the loss plot")
        #plt.show()
    plt.close()
    return dir_fig
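
A usage sketch with made-up train/dev loss curves; with save=True the figure is written under dir and its path is returned.

# made-up curves : train loss over 5 epochs, dev loss measured at epochs 1, 3 and 5
fig_path = simple_plot(final_loss=0.42,
                       loss_ls=[2.0, 1.2, 0.8, 0.6, 0.42], epoch_ls_1=[1, 2, 3, 4, 5],
                       loss_2=[2.1, 1.0, 0.7], epoch_ls_2=[1, 3, 5],
                       epochs=5, lr=0.001,
                       label="train", label_2="dev",
                       dir=".", prefix="demo",
                       save=True, show=False, verbose=1)
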
 def forward(self, x):
     # return F.log_softmax(self.proj(x), dim=-1)
     # the log_softmax is done within the loss
     activation = nn.ReLU  # eval(self.activation)
     y = activation()(self.dense(x))
     proj = self.proj(y)
     printing("TYPE  proj {} is cuda ",
              var=(proj.is_cuda), verbose=0, verbose_level=4)
     if self.verbose >= 3:
         print("PROJECTION {} size".format(proj.size()))
     if self.verbose >= 5:
         print("PROJECTION data {} ".format(proj))
     return proj
def get_loss(model, data_label, tasks, use_gpu, word_decoding, char_decoding,
             max_char_len, bucketing, batch_size,
             symbolic_end=1, add_end_char=1, add_start_char=1,
             symbolic_root=1,
             verbose=1):

    ponderation_normalize_loss = model.arguments["hyperparameters"]["ponderation_normalize_loss"]
    weight_pos_loss = model.arguments["hyperparameters"]["weight_pos_loss"]
    weight_binary_loss = model.arguments["hyperparameters"]["weight_binary_loss"]
    dataset = [REPO_LABEL2SET[_data_label] for _data_label in data_label]
    printing("SANITY TEST performed on {}".format(dataset), verbose=verbose, verbose_level=1)
    readers_dev = readers_load(datasets=dataset,
                               tasks=tasks, word_dictionary=model.word_dictionary,
                               word_dictionary_norm=model.word_nom_dictionary, char_dictionary=model.char_dictionary,
                               pos_dictionary=model.pos_dictionary, xpos_dictionary=model.xpos_dictionary,
                               type_dictionary=model.type_dictionary, use_gpu=use_gpu,
                               norm_not_norm="norm_not_norm" in tasks, word_decoder=word_decoding,
                               add_start_char=add_start_char, add_end_char=add_end_char, symbolic_end=symbolic_end,
                               symbolic_root=symbolic_root, bucket=bucketing, max_char_len=max_char_len,
                               verbose=verbose)

    batchIter_eval = data_gen_multi_task_sampling_batch(tasks=tasks, readers=readers_dev, batch_size=batch_size,
                                                        word_dictionary=model.word_dictionary,
                                                        char_dictionary=model.char_dictionary,
                                                        word_dictionary_norm=model.word_nom_dictionary,
                                                        pos_dictionary=model.pos_dictionary, dropout_input=0,
                                                        extend_n_batch=1, get_batch_mode=False, verbose=verbose)

    printing("SANITY TEST EVALUATION : computing loss ", verbose=verbose, verbose_level=2)

    loss_obj = LossCompute(model.generator, use_gpu=use_gpu, verbose=verbose,
                           multi_task_loss_ponderation=model.multi_task_loss_ponderation,
                           use="dev",
                           pos_pred="pos" in tasks,
                           tasks=tasks,
                           vocab_char_size=len(list(model.char_dictionary.instance2index.keys())) + 1,
                           char_decoding=char_decoding, word_decoding=word_decoding,
                           auxilliary_task_norm_not_norm="norm_not_norm" in tasks)

    print("PONDERATION", ponderation_normalize_loss)

    loss_dev, loss_details_dev, step_dev = run_epoch(batchIter_eval, model, loss_compute=loss_obj,
                                                     verbose=verbose, timing="", step=0,
                                                     weight_binary_loss=weight_binary_loss,
                                                     ponderation_normalize_loss=ponderation_normalize_loss,
                                                     weight_pos_loss=weight_pos_loss,
                                                     pos_batch="pos" in tasks,
                                                     log_every_x_batch=100)

    return loss_dev, loss_details_dev, step_dev
def data_gen_dummy(V, batch, nbatches, sent_len=9, word_len=5, verbose=0, seed=None):
    "Generate random data for a src-tgt copy task."
    if seed is not None:
        np.random.seed(seed)
    for i in tqdm(range(nbatches), disable=disable_tqdm_level(verbose, verbose_level=2)):
        data = torch.from_numpy(np.random.randint(low=2, high=V, size=(batch, sent_len, word_len)))
        data[:, :, 0] = 2
        # we force padding in the dummy model
        data[:, :, -1] = 1
        data[:, :, -2] = 1
        printing("DATA dummy {} ", var=(data), verbose=verbose, verbose_level=5)
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield MaskBatch(src, tgt, pad=1)
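
A quick sketch iterating the dummy generator; each yielded MaskBatch wraps identical src/tgt tensors of shape (batch, sent_len, word_len) for the copy task.

# V is the size of the random symbol vocabulary ; the seed makes the batches reproducible
for dummy_batch in data_gen_dummy(V=10, batch=2, nbatches=3,
                                  sent_len=4, word_len=5, seed=123):
    # dummy_batch is a MaskBatch built from a (2, 4, 5) tensor of ints in [2, V)
    pass
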
def predict(batch_size, data_path,
            dict_path, model_full_name,
            bucket=False, model_specific_dictionary=True,
            print_raw=False, dir_normalized=None, dir_original=None,
            get_batch_mode=False,
            normalization=True, debug=False, use_gpu=None, verbose=0):

    assert model_specific_dictionary, "ERROR : only model_specific_dictionary = True supported now"
    # NB : for now you have to load the dictionary when evaluating (it cannot be recomputed ; this ability could be added to LexNormalizer)
    use_gpu = use_gpu_(use_gpu)
    hardware_choosen = "GPU" if use_gpu else "CPU"
    printing("{} mode ", var=([hardware_choosen]), verbose_level=0, verbose=verbose)

    if not debug:
        # disable pdb breakpoints when not running in debug mode
        pdb.set_trace = lambda: 1

    model = LexNormalizer(generator=Generator, load=True, model_full_name=model_full_name,
                          voc_size=None, use_gpu=use_gpu, dict_path=dict_path, model_specific_dictionary=True,
                          dir_model=os.path.join(PROJECT_PATH, "checkpoints",
                                                 model_full_name + "-folder"),
                          char_decoding=True, word_decoding=False,
                          verbose=verbose
                          )

    data_read = conllu_data.read_data_to_variable(data_path, model.word_dictionary, model.char_dictionary,
                                                  model.pos_dictionary,
                                                  model.xpos_dictionary, model.type_dictionary,
                                                  use_gpu=use_gpu,
                                                  norm_not_norm=model.auxilliary_task_norm_not_norm,
                                                  symbolic_end=True, symbolic_root=True,
                                                  dry_run=0, lattice=False, verbose=verbose,
                                                  normalization=normalization,
                                                  bucket=bucket,
                                                  add_start_char=1, add_end_char=1)

    batchIter = data_gen_conllu(data_read, model.word_dictionary, model.char_dictionary,
                                batch_size=batch_size,
                                get_batch_mode=False,
                                normalization=normalization,
                                print_raw=print_raw,  verbose=verbose)
    model.eval()
    greedy_decode_batch(char_dictionary=model.char_dictionary, verbose=verbose,
                        gold_output=False,
                        use_gpu=use_gpu,
                        write_output=True,
                        label_data=REPO_DATASET[data_path],
                        batchIter=batchIter, model=model, dir_normalized=dir_normalized, dir_original=dir_original,
                        batch_size=batch_size)
def sanity_check_loss_poneration(ponderation_dic, verbose=1):
    if isinstance(ponderation_dic, dict):
        for task in AVAILABLE_TASKS:
            if task != "all":  # Still some ambiguity in 'all' setting
                assert task in ponderation_dic, "ERROR : task {} is not related to a ponderation while it should ".format(
                    task)
    elif isinstance(ponderation_dic, str):
        assert ponderation_dic in MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE, "ERROR ponderation {} should be in {}".format(
            ponderation_dic, MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE)
        printing("WARNING : COULD NOT SANITY CHECK ponderation_dic {} ",
                 var=[ponderation_dic],
                 verbose=verbose,
                 verbose_level=1)
    else:
        raise (Exception("ponderation_dic is neither string or dict {}".format(
            ponderation_dic)))
def data_gen_multi_task_sampling_batch(tasks, readers, word_dictionary, char_dictionary, pos_dictionary,
                                       word_dictionary_norm,
                                       extend_n_batch,
                                       batch_size,  get_batch_mode, mode_batch_sampling="proportional",
                                       padding=PAD_ID_CHAR,
                                       dropout_input=0, print_raw=False,
                                       verbose=1):
    "multitask learning iterator"
    assert len(tasks) == len(readers)
    assert mode_batch_sampling in MODE_BATCH_SAMPLING_AVAILABLE
    iterator = {}
    end_task_flag = {}
    n_sents_per_task_dataset_cumul = {}
    cumul_n_sent = 0
    for task in tasks:
        iterator[task] = data_gen_conllu(data=readers[task], word_dictionary=word_dictionary, task_info=task,
                                         char_dictionary=char_dictionary, pos_dictionary=pos_dictionary,
                                         word_dictionary_norm=word_dictionary_norm,
                                         batch_size=batch_size, extend_n_batch=extend_n_batch,
                                         get_batch_mode=get_batch_mode, dropout_input=dropout_input,
                                         padding=padding,
                                         print_raw=print_raw, normalization=TASKS_PARAMETER[task]["normalization"],
                                         verbose=verbose)
        end_task_flag[task] = False
        cumul_n_sent += readers[task][-1]
        n_sents_per_task_dataset_cumul[task] = cumul_n_sent
    n_sents_per_task_dataset_cumul["all"] = n_sents_per_task_dataset_cumul[tasks[-1]]
    printing("TRAINING : MultiTask batch sampling iterator {} cumulated n_sent   ", var=[n_sents_per_task_dataset_cumul], verbose_level=1, verbose=verbose)
    batch_iter = 0
    while True:
        n_sent_start = 0
        random_sample_id = np.random.randint(0, 100)
        for ind, task in enumerate(tasks):
            if sampling_proportion(n_sent_start, n_sents_per_task_dataset_cumul["all"]) < random_sample_id < sampling_proportion(n_sents_per_task_dataset_cumul[task], n_sents_per_task_dataset_cumul["all"]) and not end_task_flag[task]:
                try:
                    batch, order = iterator[task].__next__()
                    sanity_check_batch_label(task, batch, verbose=verbose)
                    batch_iter += 1
                    yield batch
                except StopIteration:
                    end_task_flag[task] = True
                    printing("ITERATOR END {} ", var=[task], verbose_level=1, verbose=verbose)
                    break
            else:
                n_sent_start = n_sents_per_task_dataset_cumul[task]
        if sum(end_task_flag.values()) == len(tasks):
            break
def readers_load(datasets, tasks, word_dictionary, word_dictionary_norm , char_dictionary,
                 pos_dictionary, xpos_dictionary, type_dictionary,
                 use_gpu,
                 norm_not_norm=False,
                 word_decoder=False, must_get_norm=True,
                 simultanuous_training=False, bucket=True,max_char_len=None,
                 add_start_char=1, add_end_char=1, symbolic_end=True, symbolic_root=True,
                 verbose=1):

    readers = {}
    #assert not simultanuous_training, "ERROR : so far : "
    assert "all" not in tasks, "ERROR not supported yet (pb for simultanuous training..) "
    if not "all" in tasks and not simultanuous_training:
        assert len(tasks) == len(datasets), "ERROR : as simultanuous_training is {} : " \
                                            "we need 1 dataset per task but have only {} for task {} ".format(simultanuous_training, datasets, tasks)
    elif not simultanuous_training:
        assert len(tasks) == 1, "ERROR : if all should have only all nothing else"
        printing("TRAINING : MultiTask Iterator wit task 'all' ", verbose_level=1, verbose=verbose)
    elif simultanuous_training:
        printing("TRAINING : Training simulatnuously tasks provided in {} (should have all required labels in datasets)",
                 verbose_level=1, verbose=verbose)
        raise(Exception("Not supported yet --> should handle the loop "))

    for task, data in zip(tasks, datasets):
        if task == "normalize":
            tasks = ["normalize", "norm_not_norm"]
        else:
            tasks = [task]
        print("WARNING : data_iterator : None hardcdoed for max_char_len")
        readers[task] = conllu_data.read_data_to_variable(data, word_dictionary, char_dictionary,
                                                          pos_dictionary,
                                                          xpos_dictionary, type_dictionary,
                                                          use_gpu=use_gpu,
                                                          word_decoder=word_decoder,
                                                          symbolic_end=symbolic_end, symbolic_root=symbolic_root,
                                                          dry_run=0, lattice=False,
                                                          normalization=TASKS_PARAMETER[task]["normalization"],
                                                          bucket=bucket,
                                                          add_start_char=add_start_char,
                                                          add_end_char=add_end_char, tasks=tasks,
                                                          max_char_len=None,
                                                          must_get_norm=must_get_norm,
                                                          word_norm_dictionary=word_dictionary_norm, verbose=verbose)

    return readers
def get_optimizer(parameters, lr, optimizer="adam", betas=None, verbose=1):

    assert optimizer in AVAILABLE_OPTIMIZER, "ERROR optimizers supported are {} ".format(AVAILABLE_OPTIMIZER)
    if optimizer == "adam":
        if betas is None:
            # betas = (0.9, 0.9)
            print("DEFAULT betas:", betas)
        opt = torch.optim.Adam(parameters, lr=lr, #betas=betas,
                               eps=1e-9)
    elif optimizer == "SGD":
        assert betas is None, "ERROR "
        opt = torch.optim.SGD(parameters, lr=lr)
    elif optimizer == "bahdanu-adadelta":
        assert betas is None, "ERROR betas not supported for optimizer {}".format(optimizer)
        opt = torch.optim.Adadelta(parameters, eps=10e-6, rho=0.95)
    printing("TRAINING : optimizer {} has been reloaded with lr {} betas {} ", var=[optimizer, lr, betas], verbose=verbose, verbose_level=1)

    return opt
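
Usage sketch with a toy torch module, assuming "adam" and "SGD" are listed in AVAILABLE_OPTIMIZER as the branches above suggest.

import torch.nn as nn

toy_net = nn.Linear(10, 2)
adam_opt = get_optimizer(toy_net.parameters(), lr=0.001, optimizer="adam", verbose=1)
sgd_opt = get_optimizer(toy_net.parameters(), lr=0.01, optimizer="SGD", verbose=1)
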
def interact_bert(bert_token_classification,
                  tokenizer,
                  null_token_index,
                  null_str,
                  tasks,
                  topk=1,
                  verbose=1,
                  print_input=True,
                  use_gpu=False):

    printing("INFO : input_string should be white space tokenized",
             verbose=verbose,
             verbose_level=1)

    input_string = input(
        "What would you like to normalize ? type STOP to stop ")
    if input_string == "STOP":
        print("ENDING interaction")
        # return three values so callers can unpack the same way as for the normal return below
        return None, None, 0

    input_string = ["[CLS] " + input_string + " [SEP]"]

    input_tokens_tensor, input_segments_tensors, inp_bpe_tokenized, \
    input_alignement_with_raw, input_mask = get_indexes(input_string, tokenizer, verbose, use_gpu)
    token_type_ids = torch.zeros_like(input_tokens_tensor)
    if print_input:
        print("SRC : BPE ", inp_bpe_tokenized)
    logits = bert_token_classification(input_tokens_tensor, token_type_ids, input_mask)[0]["logits_task_1"] \
        if tasks[0] == "normalize" else None

    predictions_topk = torch.argsort(logits, dim=-1,
                                     descending=True)[:, :, :topk]

    sentence_pred_aligned, sentence_pred = prediction_topk_to_string(
        predictions_topk,
        input_alignement_with_raw,
        topk,
        tokenizer,
        tasks=tasks,
        null_token_index=null_token_index,
        null_str=null_str)

    return input_string, sentence_pred_aligned, sentence_pred
def get_biggest_bpe_in(char_list, vocab, token_begining=True, verbose=1):
    ind_start = 0
    while ind_start < len(char_list):
        ind_end = len(char_list)
        while ind_start < ind_end:
            substr = "".join(char_list[ind_start:ind_end])
            if not token_begining:
                substr = "##" + substr
            if substr in vocab:
                cur_substr = substr
                printing(
                    "WARNING : LEAVING ABREVIATION MATCH (SHOULD ADD MASK)",
                    var=[char_list[ind_end:], char_list],
                    verbose=verbose,
                    verbose_level="raw_data")
                return cur_substr, ind_end, char_list[ind_end:]
            ind_end -= 1
        # sub_tokens_gold_is_abbreviation.append(cur_substr)
        ind_start = ind_end
    raise Exception(
        "WARNING : no match found for char_list {}".format(char_list))
def freeze_param(model, freeze_layer_prefix_ls=None, not_freeze_layer_prefix_ls=None,verbose=1):
    freezing_layer = 0

    if not_freeze_layer_prefix_ls is None:
        not_freeze_layer_prefix_ls = []
    if freeze_layer_prefix_ls is None:
        freeze_layer_prefix_ls = []
    for name, param in model.named_parameters():
        for prefix in freeze_layer_prefix_ls:
            if name.startswith(prefix):
                param.requires_grad = False
                freezing_layer += 1
                printing("TRAINING : freezing {} parameter ", var=[name], verbose=verbose, verbose_level=2)
        to_freeze = 0
        for prefix in not_freeze_layer_prefix_ls:
            if not name.startswith(prefix):
                to_freeze += 1
            if not to_freeze == len(not_freeze_layer_prefix_ls):
                param.requires_grad = False
                freezing_layer += 1
                printing("TRAINING :- freezing {} parameter ", var=[name], verbose=verbose, verbose_level=1)
    printing("TRAINING : freezing {} layers : {} prefix , not freezing {} ",
             var=[freezing_layer, freeze_layer_prefix_ls, not_freeze_layer_prefix_ls],
             verbose=verbose,
             verbose_level=1)
    assert freezing_layer > 0, "ERROR : did not find any layers starting with {}".format(prefix)

    return model
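
A usage sketch on a small torch module: freeze every parameter whose name starts with "encoder" and leave the decoder trainable.

import torch.nn as nn

toy_model = nn.Sequential()
toy_model.add_module("encoder", nn.Linear(8, 8))
toy_model.add_module("decoder", nn.Linear(8, 2))
toy_model = freeze_param(toy_model, freeze_layer_prefix_ls=["encoder"], verbose=1)
print([name for name, p in toy_model.named_parameters() if not p.requires_grad])
# ['encoder.weight', 'encoder.bias']
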
def show_attention(prediction_word,
                   input_word,
                   attentions,
                   model_full_name=None,
                   dir_save=None,
                   show=False,
                   save=False,
                   verbose=1):
    # Set up figure with colorbar
    fig = plt.figure(figsize=(20, 16))
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    # pdb.set_trace()  # debugging breakpoint left disabled
    ax.set_xticklabels([''] + prediction_word, rotation=45)
    ax.set_yticklabels([''] + input_word)
    plt.xlabel("Predicted  Normalization word")
    plt.ylabel("Noisy source word")
    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    #show_plot_visdom()
    if save:
        model_full_name = "no_model" if model_full_name is None else model_full_name
        file_name = "{}_model-{}_pred_word-attention.png".format(
            model_full_name, "".join(prediction_word))
        dir_save = "/Users/bemuller/Documents/Work/INRIA/dev/mt_norm_parse/test_/test_plot_attention" if dir_save is None else dir_save
        dir_save = os.path.join(dir_save, file_name)
        plt.savefig(dir_save)
        printing("Attention saved in {}",
                 var=[dir_save],
                 verbose=verbose,
                 verbose_level=1)
    if show:
        plt.show()
    plt.close()
def reframe_conll_to_sentConll(src_dir, target_dir, n_hashtag=1, verbose=1):

    with open(src_dir, "r") as f:
        with open(target_dir, "w") as g:
            line = "0"
            sent = []

            while len(line) > 0:
                line = f.readline()
                if line.startswith("#"):
                    g.write(line)
                    # if new_sent == n_hashtag:
                    sent = []
                    output_sent = 0
                elif line != "\n" and len(line) > 0:
                    splitted = line.split('\t')
                    if "-" in splitted[0]:
                        continue
                    sent.append(splitted)
                    src_sent = ""
                    target_sent = ""
                if line == "\n":
                    output_sent = 1
                if output_sent == 1 and len(sent) > 0:
                    space = ""
                    for row in sent:
                        src_sent += space + row[1]
                        target_sent += space + get_normalized_token(
                            norm_field=row[9], n_exception=0, verbose=1)[0]
                        space = " "
                    g.write("1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(
                        src_sent, target_sent))
                    g.write("\n")

    printing("WRITTEN {} src and {} target directory".format(
        src_dir, target_dir),
             verbose=verbose,
             verbose_level=1)