def prediction_topk_to_string(predictions_topk, input_alignement_with_raw, topk, tokenizer,
                              null_token_index, null_str, tasks, verbose=1):
    sentence_pred = from_bpe_token_to_str(predictions_topk, topk, null_str=null_str,
                                          null_token_index=null_token_index,
                                          tokenizer=tokenizer, pred_mode=True)
    sentence_pred_aligned = []
    for top in range(topk):
        realign_sent = realigne(sentence_pred[top], input_alignement_with_raw, tasks=tasks,
                                null_str=null_str, remove_null_str=True, mask_str="X")
        assert len(realign_sent) == 1, "ERROR : only batch len 1 accepted here (we are doing interaction)"
        printing("{} top-pred : bpe {}", var=[top, realign_sent], verbose_level=2, verbose=verbose)
        realign_sent = " ".join(realign_sent[0])
        sentence_pred_aligned.append(realign_sent)
    return sentence_pred_aligned, sentence_pred

def interact(dic_path, model_full_name, dir_model, debug=False, model_specific_dictionary=True,
             beam_size=2, word_decoding=False, extra_arg_specific_label="", save_attention=False,
             show_attention=False, beam_decode=False, max_len=MAX_LEN, verbose=2):
    from model.seq2seq import LexNormalizer
    assert model_specific_dictionary
    char_dictionary = None
    voc_size = None
    if not debug:
        pdb.set_trace = lambda: 1
    model = LexNormalizer(generator=Generator, voc_size=voc_size, load=True,
                          model_full_name=model_full_name,
                          model_specific_dictionary=model_specific_dictionary,
                          dict_path=dic_path, dir_model=dir_model,
                          extra_arg_specific_label=extra_arg_specific_label,
                          loading_sanity_test=True, word_decoding=word_decoding,
                          char_decoding=not word_decoding, verbose=verbose)
    model.eval()
    if show_attention or save_attention:
        assert model.decoder.attn_layer is not None, "ERROR : no attention to plot "
    if save_attention:
        dir_attention = os.path.join(dir_model, "attention_plot")
        if os.path.isdir(dir_attention):
            info = "existing"
        else:
            os.mkdir(dir_attention)
            info = "created"
        printing("Saving to {} {}", var=[info, dir_attention], verbose_level=1, verbose=verbose)
    else:
        dir_attention = None
    decode_interacively(max_len=max_len, model=model, char_dictionary=char_dictionary,
                        sent_mode=True, dir_attention=dir_attention, save_attention=save_attention,
                        show_attention=show_attention, beam_decode=beam_decode, beam_size=beam_size,
                        showing_attention=show_attention, verbose=verbose)

def get_args(args, dropout_loading_strict=True, verbose=0):
    default_dropout = None if dropout_loading_strict else 0
    if dropout_loading_strict:
        printing("WARNING : errors might come from misloading of dropout ", verbose=verbose, verbose_level=0)
    return args["char_embedding_dim"], args["output_dim"], \
        args["hidden_size_encoder"], args["hidden_size_sent_encoder"], \
        args["encoder_arch"].get("dropout_sent_encoder_cell", default_dropout), \
        args["encoder_arch"].get("dropout_word_encoder_cell", default_dropout), \
        args["encoder_arch"].get("drop_out_sent_encoder_out", default_dropout), \
        args["encoder_arch"].get("drop_out_word_encoder_out", default_dropout), \
        args["encoder_arch"].get("n_layers_word_encoder"), \
        args["encoder_arch"].get("n_layers_sent_cell", 1), \
        args["encoder_arch"].get("dir_sent_encoder"), \
        args["encoder_arch"].get("word_recurrent_cell_encoder", None), \
        args["encoder_arch"].get("dir_word_encoder", 1), \
        args["hidden_size_decoder"], \
        args["decoder_arch"].get("cell_word", None), \
        args["decoder_arch"].get("drop_out_word_decoder_cell", default_dropout), \
        args["decoder_arch"].get("drop_out_char_embedding_decoder", default_dropout), \
        args.get("auxilliary_arch", {}).get("auxilliary_task_norm_not_norm", False), \
        args["decoder_arch"].get("unrolling_word", False), \
        args["decoder_arch"].get("char_src_attention", False), \
        args.get("auxilliary_arch", {}).get("auxilliary_task_norm_not_norm-dense_dim", None), \
        args.get("shared_context", "all"), \
        args["decoder_arch"].get("teacher_force", 1), \
        args.get("auxilliary_arch", {}).get("auxilliary_task_norm_not_norm-dense_dim_2"), \
        args["decoder_arch"].get("stable_decoding_state", False), \
        args["decoder_arch"].get("init_context_decoder", True), \
        args["decoder_arch"].get("word_decoding", 0), \
        args["decoder_arch"].get("char_decoding", 1), \
        args.get("auxilliary_arch", {}).get("auxilliary_task_pos", False), \
        args.get("auxilliary_arch", {}).get("dense_dim_auxilliary_pos", None), \
        args.get("auxilliary_arch", {}).get("dense_dim_auxilliary_pos_2", None), \
        args["decoder_arch"].get("dense_dim_word_pred", 0), \
        args["decoder_arch"].get("dense_dim_word_pred_2", 0), \
        args["decoder_arch"].get("dense_dim_word_pred_3", 0), \
        args.get("symbolic_root", False), args.get("symbolic_end", False), \
        args["encoder_arch"].get("word_embedding_dim", 0), \
        args["encoder_arch"].get("word_embed", False), \
        args["encoder_arch"].get("word_embedding_projected_dim", None), \
        args["decoder_arch"].get("activation_char_decoder"), \
        args["decoder_arch"].get("activation_word_decoder"), \
        args["encoder_arch"].get("attention_tagging", False), \
        args["encoder_arch"].get("char_level_embedding_projection_dim", 0), \
        args["encoder_arch"].get("mode_word_encoding", "cat"), \
        args.get("multi_task_loss_ponderation", "all"), \
        args.get("sanity_test", {})

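# Hedged sketch (not from the project) of the nested structure get_args expects :
# the keys below are the ones actually looked up above ; the values are illustrative placeholders.
example_args = {
    "char_embedding_dim": 20, "output_dim": 50,
    "hidden_size_encoder": 100, "hidden_size_sent_encoder": 100, "hidden_size_decoder": 100,
    "encoder_arch": {"n_layers_word_encoder": 1, "dir_sent_encoder": 1, "word_embedding_dim": 0},
    "decoder_arch": {"teacher_force": 1, "char_decoding": 1, "word_decoding": 0},
    "auxilliary_arch": {"auxilliary_task_norm_not_norm": False},
}
# get_args(example_args) would return the flat tuple of hyperparameters in the order listed above,
# with defaults filled in for any key missing from the sub-dictionaries.
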
def reframe_tsv_to_sentConll(src_dir, target_dir, verbose=1):
    with open(src_dir, "r") as f:
        with open(target_dir, "w") as g:
            line = "0"
            while len(line) > 0:
                line = f.readline()
                line = line.strip()
                sent = line.split("\t")
                print(sent)
                if len(line) > 0:
                    if len(sent) < 3:
                        g.write("#sent_id {}\n".format("XXX"))
                        g.write("1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(sent[0], sent[1]))
                    else:
                        g.write("#sent_id {}\n".format(sent[0]))
                        g.write("1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(sent[1], sent[2]))
                    g.write("\n")
    printing("WRITTEN : reframed src {} into target {}".format(src_dir, target_dir),
             verbose=verbose, verbose_level=1)

def __init__(self, input_dim, dense_dim, dense_dim_2=0, verbose=1):
    super(BinaryPredictor, self).__init__()
    self.verbose = verbose
    self.dense_2 = None
    self.dense = None
    dim_predictor = input_dim
    if dense_dim is not None:
        if dense_dim > 0:
            self.dense = nn.Linear(input_dim, dense_dim)
            dim_predictor = dense_dim
            printing("WARNING : BinaryPredictor dense_dim is set to {} in norm_not_norm predictor",
                     var=dense_dim, verbose=self.verbose, verbose_level=1)
            if dense_dim_2 is not None:
                if dense_dim_2 > 0:
                    self.dense_2 = nn.Linear(dense_dim, dense_dim_2)
                    dim_predictor = dense_dim_2
    else:
        assert dense_dim_2 is None or dense_dim_2 == 0, "ERROR : dense_dim_2 cannot be not null if dense_dim is "
        printing("WARNING : BinaryPredictor as dense_dim is None no dense layer added to norm_not_norm predictor",
                 verbose=self.verbose, verbose_level=1)
    self.predictor = nn.Linear(dim_predictor, out_features=2)

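# Hedged usage sketch (dimensions are placeholders ; assumes BinaryPredictor and the project's
# printing helper are in scope) : it only illustrates the layer sizes built by the constructor
# above by chaining the Linear layers directly -- the real forward pass is defined elsewhere.
import torch

predictor = BinaryPredictor(input_dim=100, dense_dim=50, dense_dim_2=25, verbose=0)
states = torch.randn(4, 100)  # e.g. 4 word-level hidden states of size 100
logits = predictor.predictor(predictor.dense_2(predictor.dense(states)))
print(logits.shape)  # torch.Size([4, 2]) : one norm / not-norm score pair per state
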
def writer_weights_and_grad(model, freq_writer, epoch, writer, verbose, report_grad=True, report_weight=True):
    if epoch % freq_writer == 0:
        # TODO : make this a unit test in test/
        if TEST_CLIPPING:
            for name, param in model.named_parameters():
                if param.requires_grad and param.grad is not None:
                    norm = param.grad.norm()
                    print("grad_norm writer_weights_and_grad", norm)
                    assert norm < CLIP
        for name, param in model.named_parameters():
            if report_weight:
                try:
                    writer.add_histogram(name, param.clone().cpu().data.numpy(), epoch)
                except Exception as e:
                    print("ERROR unable to report histogram ")
                    print(e)
            if param.requires_grad and param.grad is not None and report_grad:
                writer.add_histogram("grad" + name + "-grad", param.grad.clone().cpu().data.numpy(), epoch)
        printing("REPORTING : storing weights and grad in writer ", verbose=verbose, verbose_level=1)

def scheduling_policy(phases_ls, epoch, tasks, verbose=1):
    if phases_ls is None:
        ponderation = 1
        weight_binary_loss = 1
        weight_pos_loss = 1
        if len(tasks) == 1:
            mode = tasks[0]
        else:
            mode = "all"
        printing("WARNING : default policy scheduling (no scheduling) {} ponderation_normalize_loss, {}"
                 " weight_binary_loss and pos {} : LOSS MODE SET TO {} ",
                 var=[ponderation, weight_binary_loss, weight_pos_loss, mode],
                 verbose_level=1, verbose=verbose)
        return mode, ponderation, weight_binary_loss, weight_pos_loss
    for phase in phases_ls:
        assert phase.get("epoch_start") is not None
        assert phase.get("epoch_stop") is not None
        assert phase.get("epoch_stop") > phase.get("epoch_start")
        assert phase.get("weight_binary_loss") is not None
        assert phase.get("ponderation_normalize_loss") is not None
        assert phase.get("multi_task_mode") in AVAILABLE_TASKS
        if phase["epoch_start"] <= epoch < phase["epoch_stop"]:
            # NB : unlike the default branch above, this return does not include weight_pos_loss
            return phase["multi_task_mode"], phase["ponderation_normalize_loss"], phase["weight_binary_loss"]

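# Hedged sketch of the phase-dict shape the assertions above require
# (the keys come from the asserts ; the values and the single phase are made-up examples) :
example_phases_ls = [
    {"epoch_start": 0, "epoch_stop": 5,
     "ponderation_normalize_loss": 1.0, "weight_binary_loss": 0.5,
     "multi_task_mode": "normalize"},  # must be a member of AVAILABLE_TASKS
]
# scheduling_policy(example_phases_ls, epoch=3, tasks=["normalize"]) would return the
# (multi_task_mode, ponderation_normalize_loss, weight_binary_loss) of the matching phase.
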
def reframe_tsv_monolingual_to_sentConll(src_dir, src_dir_2, target_dir, verbose=1):
    with open(src_dir, "r") as f:
        with open(src_dir_2, "r") as f2:
            with open(target_dir, "w") as g:
                line = "0"
                line_2 = "0"
                i = 0
                while len(line) > 0:
                    while len(line_2) > 0:
                        line = f.readline()
                        line_2 = f2.readline()
                        line = line.strip()
                        line_2 = line_2.strip()
                        i += 1
                        print(line, line_2)
                        if len(line) > 0:
                            g.write("#sent_id {}\n".format(i))
                            g.write("1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(line, line_2))
                            g.write("\n")
    printing("WRITTEN : reframed src {} into target {}".format(src_dir, target_dir),
             verbose=verbose, verbose_level=1)

def append_reporting_sheet(git_id, tasks, rioc_job, description, log_dir, target_dir, env, status, verbose=1):
    sheet, sheet_name, tab_name = open_client()
    # Find a workbook by name and open the first sheet.
    # Make sure you use the right name here.
    # worksheet_list = sheet.worksheets()
    if not rioc_job.startswith("local"):
        sheet.append_row([git_id, rioc_job, tasks, description, log_dir, target_dir, env, status,
                          None, None, None, None, "-"])
        list_of_hashes = sheet.get_all_records()
        printing("REPORT : Appending report to page {} in sheet {} of {} rows and {} columns ",
                 var=[tab_name, sheet_name, len(list_of_hashes) + 1, len(list_of_hashes[0])],
                 verbose=verbose, verbose_level=1)
    else:
        list_of_hashes = ["NOTHING"]
    return len(list_of_hashes) + 1, len(list_of_hashes[0])

def pos_specific_dic_builder(pos_specific_data_set, pos_dictionary):
    if pos_specific_data_set is not None:
        assert os.path.exists(pos_specific_data_set), "{} does not exist".format(pos_specific_data_set)
        with codecs.open(pos_specific_data_set, 'r', 'utf-8', errors='ignore') as file:
            li = 0
            for line in file:
                line = line.strip()
                if len(line) == 0 or line[0] == '#':
                    continue
                tokens = line.split('\t')
                if '-' in tokens[0] or '.' in tokens[0]:
                    continue
                pos = tokens[3]  # if tokens[4]=='_' else tokens[3]+'$$$'+tokens[4]
                # xpos = tokens[4]
                pos_dictionary.add(pos)
                # xpos_dictionary.add(xpos)
        printing("VOCABULARY : POS Vocabulary : pos dictionary built on {} ".format(pos_specific_data_set),
                 verbose_level=1, verbose=1)
        return pos_dictionary
    printing("VOCABULARY : POS Vocabulary : pos dictionary untouched", verbose_level=1, verbose=1)
    return pos_dictionary

def write_args(dir, model_id, checkpoint_dir=None, hyperparameters=None, info_checkpoint=None, verbose=1):
    args_dir = os.path.join(dir, "{}-args.json".format(model_id))
    if os.path.isfile(args_dir):
        info = "updated"
        args = json.load(open(args_dir, "r"))
        args["checkpoint_dir"] = checkpoint_dir
        args["info_checkpoint"] = info_checkpoint
        json.dump(args, open(args_dir, "w"))
    else:
        assert hyperparameters is not None, "REPORT : args.json created for the first time : hyperparameters dic required "
        assert info_checkpoint is None, "REPORT : args.json created for the first time : no checkpoint yet "
        info = "new"
        json.dump(OrderedDict([("checkpoint_dir", checkpoint_dir),
                               ("hyperparameters", hyperparameters),
                               ("info_checkpoint", None)]),
                  open(args_dir, "w"))
    printing("MODEL args.json {} written {} ".format(info, args_dir), verbose_level=1, verbose=verbose)
    return args_dir

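# Hedged usage sketch (paths and values are placeholders) : the first call creates
# <dir>/<model_id>-args.json from the hyperparameters ; a later call updates its checkpoint fields.
# args_path = write_args("./checkpoints", "model_0", hyperparameters={"lr": 1e-3}, info_checkpoint=None)
# write_args("./checkpoints", "model_0", checkpoint_dir="./checkpoints/model_0.pt",
#            info_checkpoint={"epoch": 3})
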
def construct_word_embedding_table(word_dim, word_dictionary, word_embed_init_toke2vec, verbose=1):
    scale = np.sqrt(5.0 / word_dim * 10)
    # +1 required for the default value
    table = np.zeros([len(word_dictionary) + 1, word_dim], dtype=np.float32)
    # WARNING : unfilled entries will get 0, which is the default index !!
    if verbose >= 1:
        print("Initializing table with shape {} based on word_dictionary and word_dim ".format(table.shape))
    table[UNK_ID, :] = np.random.uniform(-scale, scale, [1, word_dim]).astype(np.float32)
    oov = 0
    inv = 0
    var = 0
    mean = 0
    var_oov = 0
    mean_oov = 0
    for word, index in word_dictionary.items():
        if word in word_embed_init_toke2vec:
            embedding = word_embed_init_toke2vec[word]
            inv += 1
            # print("PRETRAINED VECTOR", index, word, embedding)
            mean += np.mean(embedding)
            var += np.var(embedding)
        elif word.lower() in word_embed_init_toke2vec:
            embedding = word_embed_init_toke2vec[word.lower()]
            # print("LOWER PRETRAINED VECTOR", index, word, embedding)
            inv += 1
            mean += np.mean(embedding)
            var += np.var(embedding)
        else:
            if word == "the":
                print("word ", word, " --> not accounted")
            embedding = np.random.uniform(-scale, scale, [1, word_dim]).astype(np.float32)
            mean_oov += np.mean(embedding)
            var_oov += np.var(embedding)
            # print("RANDOMLY GENERATED", index, word, embedding)
            oov += 1
        table[index, :] = embedding
        # print("repeat", table[index, :])
    printing("W2V INFO : Mean of preloaded w2v {} var {} "
             "while the ones generated randomly have {} mean and {} var on average",
             var=[mean / inv, var / inv, mean_oov / oov, var_oov / oov],
             verbose_level=1, verbose=verbose)
    printing('W2V INFO : OOV: %d/%d (%f rate (percent)) in %d'
             % (oov, len(word_dictionary) + 1, 100 * float(oov / (len(word_dictionary) + 1)), inv),
             verbose_level=1, verbose=verbose)
    word = "the"
    print("word {} of index {} has vector {} ".format(word, index, embedding))
    return torch.from_numpy(table)

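# Hedged usage sketch (tiny dictionary and pretrained vectors, purely illustrative) :
# word_dictionary maps word -> row index ; words found in word_embed_init_toke2vec keep their
# pretrained vector, the rest get rows drawn uniformly in [-scale, scale].
# table = construct_word_embedding_table(word_dim=4,
#                                        word_dictionary={"the": 1, "hellooo": 2},
#                                        word_embed_init_toke2vec={"the": np.ones(4)})
# table.shape -> torch.Size([3, 4])  (len(word_dictionary) + 1 rows)
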
def update_status(row, value, col_number=8, sheet=None, verbose=1):
    if sheet is None:
        sheet, sheet_name, tab_name = open_client()
    if value is not None:
        sheet.update_cell(row, col_number, value)
        printing("REPORT : col {} updated in sheet with {} ", var=[col_number, value],
                 verbose=verbose, verbose_level=1)

def expand_vocab(data_paths):
    counter_match_dev = 0
    expand = 0
    vocab_set = set(vocab_list)
    vocab_norm_set = set(vocab_norm_list)
    for data_path in data_paths:
        with codecs.open(data_path, 'r', 'utf-8', errors='ignore') as file:
            li = 0
            for line in file:
                line = line.strip()
                if len(line) == 0 or line[0] == '#':
                    continue
                tokens = line.split('\t')
                if '-' in tokens[0] or '.' in tokens[0]:
                    continue
                for char in tokens[1]:
                    char_dictionary.add(char)
                word = DIGIT_RE.sub(b"0", str.encode(tokens[1])).decode()
                if case == "lower":
                    word = word.lower()
                pos = tokens[3]  # if tokens[4]=='_' else tokens[3]+'$$$'+tokens[4]
                xpos = tokens[4]
                typ = tokens[7]
                # TODO SOMETHING
                if word_normalization:
                    token_norm, _ = get_normalized_token(tokens[9], 0, verbose=0)
                if word_normalization:
                    # TODO : add word_norm_embed_dict to allow expansion !
                    if False and word_norm not in vocab_norm_set:
                        vocab_norm_set.add(word_norm)
                        vocab_norm_list.append(word_norm)
                # TODO : Answer : WHY WOULD WE LIKE TO EXPAND IT ON DEV, TEST ?
                # if pos_specific_data_set is None:
                #     pos_dictionary.add(pos)
                # xpos_dictionary.add(xpos)
                # type_dictionary.add(typ)
                # If the word is not already in vocab_set (loaded as trained, each time expand_vocab is called)
                # but is found in the new dataset and appears in word_embed_dict, we add it to the vocab ;
                # otherwise there is no need to load it (it would not have any representation).
                if word not in vocab_set and (word in word_embed_dict or word.lower() in word_embed_dict):
                    vocab_set.add(word)
                    expand += 1
                    vocab_list.append(word)
                li = li + 1
                if dry_run and li == 100:
                    break
    printing("VOCABULARY EXPAND : word source vocabulary expanded by {} tokens based on {} ",
             var=[expand, data_path], verbose=verbose, verbose_level=0)

def use_gpu_(use_gpu, verbose=0):
    if use_gpu is not None and use_gpu:
        assert torch.cuda.is_available(), "ERROR : use_gpu was set to True but cuda not available "
    use_gpu = torch.cuda.is_available() if use_gpu is None else use_gpu
    printing("HARDWARE : use_gpu set to {} ", var=[use_gpu], verbose=verbose, verbose_level=1)
    return use_gpu

def checkpoint(loss_saved, loss, model, model_dir, epoch, epochs, info_checkpoint, saved_epoch,
               counter_no_decrease, verbose, extra_checkpoint_label="", extra_arg_specific_label="",
               checkpointing_metric="loss-dev-all", checkpoint_dir_former=None, keep_all_checkpoint=False):
    if loss < loss_saved:
        saved_epoch = epoch
        printing('Checkpoint info : {} (former:{} current:{}) decreased so saving model '
                 'saved epoch is {} (actual epoch {}) (counter_no_decrease set to 0)',
                 var=[checkpointing_metric, loss_saved, loss, saved_epoch, epoch],
                 verbose=verbose, verbose_level=1)
        loss_saved = loss
        _, _, checkpoint_dir = model.save(model_dir, model, info_checkpoint=info_checkpoint,
                                          extra_arg_specific_label=extra_arg_specific_label,
                                          suffix_name="{}-{}of{}epoch".format(extra_checkpoint_label, epoch, epochs),
                                          verbose=verbose)
        if not keep_all_checkpoint:
            model.rm_checkpoint(checkpoint_dir_former, verbose=verbose)
        checkpoint_dir_former = checkpoint_dir
        counter_no_decrease = 0
    else:
        # could add loading of the former model if the loss suddenly spikes
        # printing('Checkpoint info : Loss decreased so saving model', verbose=verbose, verbose_level=1)
        # model.load_state_dict(torch.load(checkpoint_dir))
        # TODO : load former checkpoint and adjust the loss append IF the error suddenly spikes
        counter_no_decrease += 1
        printing("Checkpoint info: {} (former:{} current:{}) did not decrease so keeping former model of epoch {} "
                 "counter_no_decrease is now {} (actual epoch {}) ",
                 var=[checkpointing_metric, loss_saved, loss, saved_epoch, counter_no_decrease, epoch],
                 verbose=verbose, verbose_level=1)
    return model, loss_saved, counter_no_decrease, saved_epoch, checkpoint_dir_former

def simple_plot(final_loss, loss_ls, epoch_ls_1, epoch_ls_2=None, loss_2=None, epochs=None, V=None,
                seq_len=None, label="", label_2="",
                dir="/Users/bemuller/Documents/Work/INRIA/dev/mt_norm_parse/test_/test_logs",
                lr=None, save=False, show=True, prefix="test", verbose=0, verbose_level=1):
    if loss_2 is None:
        assert len(label_2) == 0, "Label_2 should be '' as loss_2 is None "
    if len(label_2) > 0:
        assert len(label) > 0, "label should be specified as label_2 is "
    printing("REPORT : Final Loss to be plotted {} ".format(final_loss), verbose=verbose, verbose_level=1)
    plt.figure()
    plt.title("Training Loss after {} epochs (lr {}) ".format(epochs, lr))
    plt.xlabel("epoch")
    color_train = "red"
    plt.plot(epoch_ls_1, loss_ls, label="plot1", color=color_train)
    patches = [mpatches.Patch(color=color_train, label=label)]
    if loss_2 is not None:
        assert epoch_ls_2 is not None, "epoch_ls_2 should not be None"
        color_dev = "blue"
        plt.plot(epoch_ls_2, loss_2, label="plot2", color=color_dev)
        patches.append(mpatches.Patch(color=color_dev, label=label_2))
    plt.legend(handles=patches)
    dir_fig = os.path.join(dir, "{}-{}-plo-seq.png".format(prefix, "last", V, lr, seq_len))
    if save:
        plt.savefig(dir_fig)
        printing("REPORT : Learning curve saved at {} ", var=([dir_fig]),
                 verbose=verbose, verbose_level=verbose_level)
    if show:
        print("Not Showing loss")
        # plt.show()
    plt.close()
    return dir_fig

def forward(self, x):
    # return F.log_softmax(self.proj(x), dim=-1)
    # the log_softmax is done within the loss
    activation = nn.ReLU  # eval(self.activation)
    y = activation()(self.dense(x))
    proj = self.proj(y)
    printing("TYPE proj {} is cuda ", var=(proj.is_cuda), verbose=0, verbose_level=4)
    if self.verbose >= 3:
        print("PROJECTION {} size".format(proj.size()))
    if self.verbose >= 5:
        print("PROJECTION data {} ".format(proj))
    return proj

def get_loss(model, data_label, tasks, use_gpu, word_decoding, char_decoding, max_char_len, bucketing,
             batch_size, symbolic_end=1, add_end_char=1, add_start_char=1, symbolic_root=1, verbose=1):
    ponderation_normalize_loss = model.arguments["hyperparameters"]["ponderation_normalize_loss"]
    weight_pos_loss = model.arguments["hyperparameters"]["weight_pos_loss"]
    weight_binary_loss = model.arguments["hyperparameters"]["weight_binary_loss"]
    dataset = [REPO_LABEL2SET[_data_label] for _data_label in data_label]
    printing("SANITY TEST performed on {}".format(dataset), verbose=verbose, verbose_level=1)
    readers_dev = readers_load(datasets=dataset, tasks=tasks, word_dictionary=model.word_dictionary,
                               word_dictionary_norm=model.word_nom_dictionary,
                               char_dictionary=model.char_dictionary,
                               pos_dictionary=model.pos_dictionary, xpos_dictionary=model.xpos_dictionary,
                               type_dictionary=model.type_dictionary, use_gpu=use_gpu,
                               norm_not_norm="norm_not_norm" in tasks, word_decoder=word_decoding,
                               add_start_char=add_start_char, add_end_char=add_end_char,
                               symbolic_end=symbolic_end, symbolic_root=symbolic_root,
                               bucket=bucketing, max_char_len=max_char_len, verbose=verbose)
    batchIter_eval = data_gen_multi_task_sampling_batch(tasks=tasks, readers=readers_dev, batch_size=batch_size,
                                                        word_dictionary=model.word_dictionary,
                                                        char_dictionary=model.char_dictionary,
                                                        word_dictionary_norm=model.word_nom_dictionary,
                                                        pos_dictionary=model.pos_dictionary,
                                                        dropout_input=0, extend_n_batch=1,
                                                        get_batch_mode=False, verbose=verbose)
    printing("SANITY TEST EVALUATION : computing loss ", verbose=verbose, verbose_level=2)
    loss_obj = LossCompute(model.generator, use_gpu=use_gpu, verbose=verbose,
                           multi_task_loss_ponderation=model.multi_task_loss_ponderation, use="dev",
                           pos_pred="pos" in tasks, tasks=tasks,
                           vocab_char_size=len(list(model.char_dictionary.instance2index.keys())) + 1,
                           char_decoding=char_decoding, word_decoding=word_decoding,
                           auxilliary_task_norm_not_norm="norm_not_norm" in tasks)
    print("PONDERATION", ponderation_normalize_loss)
    loss_dev, loss_details_dev, step_dev = run_epoch(batchIter_eval, model, loss_compute=loss_obj,
                                                     verbose=verbose, timing="", step=0,
                                                     weight_binary_loss=weight_binary_loss,
                                                     ponderation_normalize_loss=ponderation_normalize_loss,
                                                     weight_pos_loss=weight_pos_loss,
                                                     pos_batch="pos" in tasks, log_every_x_batch=100)
    return loss_dev, loss_details_dev, step_dev

def data_gen_dummy(V, batch, nbatches, sent_len=9, word_len=5, verbose=0, seed=None):
    "Generate random data for a src-tgt copy task."
    if seed is not None:
        np.random.seed(seed)
    for i in tqdm(range(nbatches), disable=disable_tqdm_level(verbose, verbose_level=2)):
        data = torch.from_numpy(np.random.randint(low=2, high=V, size=(batch, sent_len, word_len)))
        data[:, :, 0] = 2  # we force padding in the dummy model
        data[:, :, -1] = 1
        data[:, :, -2] = 1
        printing("DATA dummy {} ", var=(data), verbose=verbose, verbose_level=5)
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield MaskBatch(src, tgt, pad=1)

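# Hedged usage sketch (sizes are placeholders) : the generator yields nbatches MaskBatch objects,
# each built from identical src / tgt tensors of shape (batch, sent_len, word_len) as above.
# for dummy_batch in data_gen_dummy(V=30, batch=2, nbatches=3, seed=123):
#     ...  # feed dummy_batch to the model / loss for a copy-task sanity check
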
def predict(batch_size, data_path, dict_path, model_full_name, bucket=False, model_specific_dictionary=True,
            print_raw=False, dir_normalized=None, dir_original=None, get_batch_mode=False,
            normalization=True, debug=False, use_gpu=None, verbose=0):
    assert model_specific_dictionary, "ERROR : only model_specific_dictionary = True supported now"
    # NB : for now you have to load the dictionary when evaluating (cannot recompute it)
    # (this ability could be added to the LexNormalizer)
    use_gpu = use_gpu_(use_gpu)
    hardware_choosen = "GPU" if use_gpu else "CPU"
    printing("{} mode ", var=([hardware_choosen]), verbose_level=0, verbose=verbose)
    if not debug:
        pdb.set_trace = lambda: 1
    model = LexNormalizer(generator=Generator, load=True, model_full_name=model_full_name, voc_size=None,
                          use_gpu=use_gpu, dict_path=dict_path, model_specific_dictionary=True,
                          dir_model=os.path.join(PROJECT_PATH, "checkpoints", model_full_name + "-folder"),
                          char_decoding=True, word_decoding=False, verbose=verbose)
    data_read = conllu_data.read_data_to_variable(data_path, model.word_dictionary, model.char_dictionary,
                                                  model.pos_dictionary, model.xpos_dictionary,
                                                  model.type_dictionary, use_gpu=use_gpu,
                                                  norm_not_norm=model.auxilliary_task_norm_not_norm,
                                                  symbolic_end=True, symbolic_root=True, dry_run=0,
                                                  lattice=False, verbose=verbose,
                                                  normalization=normalization, bucket=bucket,
                                                  add_start_char=1, add_end_char=1)
    batchIter = data_gen_conllu(data_read, model.word_dictionary, model.char_dictionary,
                                batch_size=batch_size, get_batch_mode=False,
                                normalization=normalization, print_raw=print_raw, verbose=verbose)
    model.eval()
    greedy_decode_batch(char_dictionary=model.char_dictionary, verbose=verbose, gold_output=False,
                        use_gpu=use_gpu, write_output=True, label_data=REPO_DATASET[data_path],
                        batchIter=batchIter, model=model, dir_normalized=dir_normalized,
                        dir_original=dir_original, batch_size=batch_size)

def sanity_check_loss_poneration(ponderation_dic, verbose=1):
    if isinstance(ponderation_dic, dict):
        for task in AVAILABLE_TASKS:
            if task != "all":
                # Still some ambiguity in the 'all' setting
                assert task in ponderation_dic, \
                    "ERROR : task {} is not related to a ponderation while it should ".format(task)
    elif isinstance(ponderation_dic, str):
        assert ponderation_dic in MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE, \
            "ERROR : ponderation {} should be in {}".format(ponderation_dic, MULTI_TASK_LOSS_PONDERATION_PREDEFINED_MODE)
        printing("WARNING : COULD NOT SANITY CHECK ponderation_dic {} ", var=[ponderation_dic],
                 verbose=verbose, verbose_level=1)
    else:
        raise (Exception("ponderation_dic is neither string nor dict {}".format(ponderation_dic)))

def data_gen_multi_task_sampling_batch(tasks, readers, word_dictionary, char_dictionary, pos_dictionary,
                                       word_dictionary_norm, extend_n_batch, batch_size, get_batch_mode,
                                       mode_batch_sampling="proportional", padding=PAD_ID_CHAR,
                                       dropout_input=0, print_raw=False, verbose=1):
    "multitask learning iterator"
    assert len(tasks) == len(readers)
    assert mode_batch_sampling in MODE_BATCH_SAMPLING_AVAILABLE
    iterator = {}
    end_task_flag = {}
    n_sents_per_task_dataset_cumul = {}
    cumul_n_sent = 0
    for task in tasks:
        iterator[task] = data_gen_conllu(data=readers[task], word_dictionary=word_dictionary,
                                         task_info=task, char_dictionary=char_dictionary,
                                         pos_dictionary=pos_dictionary,
                                         word_dictionary_norm=word_dictionary_norm,
                                         batch_size=batch_size, extend_n_batch=extend_n_batch,
                                         get_batch_mode=get_batch_mode, dropout_input=dropout_input,
                                         padding=padding, print_raw=print_raw,
                                         normalization=TASKS_PARAMETER[task]["normalization"],
                                         verbose=verbose)
        end_task_flag[task] = False
        cumul_n_sent += readers[task][-1]
        n_sents_per_task_dataset_cumul[task] = cumul_n_sent
    n_sents_per_task_dataset_cumul["all"] = n_sents_per_task_dataset_cumul[tasks[-1]]
    printing("TRAINING : MultiTask batch sampling iterator {} cumulated n_sent ",
             var=[n_sents_per_task_dataset_cumul], verbose_level=1, verbose=verbose)
    batch_iter = 0
    while True:
        n_sent_start = 0
        random_sample_id = np.random.randint(0, 100)
        for ind, task in enumerate(tasks):
            if (sampling_proportion(n_sent_start, n_sents_per_task_dataset_cumul["all"]) < random_sample_id
                    < sampling_proportion(n_sents_per_task_dataset_cumul[task], n_sents_per_task_dataset_cumul["all"])
                    and not end_task_flag[task]):
                try:
                    batch, order = iterator[task].__next__()
                    sanity_check_batch_label(task, batch, verbose=verbose)
                    batch_iter += 1
                    yield batch
                except StopIteration:
                    end_task_flag[task] = True
                    printing("ITERATOR END {} ", var=[task], verbose_level=1, verbose=verbose)
                break
            else:
                n_sent_start = n_sents_per_task_dataset_cumul[task]
        if sum(end_task_flag.values()) == len(tasks):
            break

def readers_load(datasets, tasks, word_dictionary, word_dictionary_norm, char_dictionary, pos_dictionary,
                 xpos_dictionary, type_dictionary, use_gpu, norm_not_norm=False, word_decoder=False,
                 must_get_norm=True, simultanuous_training=False, bucket=True, max_char_len=None,
                 add_start_char=1, add_end_char=1, symbolic_end=True, symbolic_root=True, verbose=1):
    readers = {}
    # assert not simultanuous_training, "ERROR : so far : "
    assert "all" not in tasks, "ERROR not supported yet (pb for simultaneous training..) "
    if not "all" in tasks and not simultanuous_training:
        assert len(tasks) == len(datasets), \
            "ERROR : as simultanuous_training is {} : we need 1 dataset per task " \
            "but have only {} for task {} ".format(simultanuous_training, datasets, tasks)
    elif not simultanuous_training:
        assert len(tasks) == 1, "ERROR : if all should have only all nothing else"
        printing("TRAINING : MultiTask Iterator with task 'all' ", verbose_level=1, verbose=verbose)
    elif simultanuous_training:
        printing("TRAINING : Training simultaneously tasks provided in {} (should have all required labels in datasets)",
                 verbose_level=1, verbose=verbose)
        raise (Exception("Not supported yet --> should handle the loop "))
    for task, data in zip(tasks, datasets):
        # use a local list so the `tasks` argument is not overwritten inside the loop
        if task == "normalize":
            tasks_reader = ["normalize", "norm_not_norm"]
        else:
            tasks_reader = [task]
        print("WARNING : data_iterator : None hardcoded for max_char_len")
        readers[task] = conllu_data.read_data_to_variable(data, word_dictionary, char_dictionary,
                                                          pos_dictionary, xpos_dictionary, type_dictionary,
                                                          use_gpu=use_gpu, word_decoder=word_decoder,
                                                          symbolic_end=symbolic_end, symbolic_root=symbolic_root,
                                                          dry_run=0, lattice=False,
                                                          normalization=TASKS_PARAMETER[task]["normalization"],
                                                          bucket=bucket, add_start_char=add_start_char,
                                                          add_end_char=add_end_char, tasks=tasks_reader,
                                                          max_char_len=None, must_get_norm=must_get_norm,
                                                          word_norm_dictionary=word_dictionary_norm,
                                                          verbose=verbose)
    return readers

def get_optimizer(parameters, lr, optimizer="adam", betas=None, verbose=1):
    assert optimizer in AVAILABLE_OPTIMIZER, "ERROR optimizers supported are {} ".format(AVAILABLE_OPTIMIZER)
    if optimizer == "adam":
        if betas is None:
            # betas = (0.9, 0.9)
            print("DEFAULT betas:", betas)
        opt = torch.optim.Adam(parameters, lr=lr,
                               # betas=betas,
                               eps=1e-9)
    elif optimizer == "SGD":
        assert betas is None, "ERROR "
        opt = torch.optim.SGD(parameters, lr=lr)
    elif optimizer == "bahdanu-adadelta":
        assert betas is None, "ERROR betas not supported for optimizer {}".format(optimizer)
        opt = torch.optim.Adadelta(parameters, eps=10e-6, rho=0.95)
    printing("TRAINING : optimizer {} has been reloaded with lr {} betas {} ",
             var=[optimizer, lr, betas], verbose=verbose, verbose_level=1)
    return opt

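# Hedged usage sketch (the toy module and lr are placeholders, not project values ; assumes this runs
# where AVAILABLE_OPTIMIZER and printing are defined, as in the module above) :
import torch.nn as nn

toy_model = nn.Linear(10, 2)
adam_opt = get_optimizer(toy_model.parameters(), lr=1e-3, optimizer="adam", verbose=0)
# "SGD" and "bahdanu-adadelta" are the other accepted values ; betas must stay None for them.
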
def interact_bert(bert_token_classification, tokenizer, null_token_index, null_str, tasks, topk=1,
                  verbose=1, print_input=True, use_gpu=False):
    printing("INFO : input_string should be white space tokenized", verbose=verbose, verbose_level=1)
    input_string = input("What would you like to normalize ? type STOP to stop ")
    if input_string == "STOP":
        print("ENDING interaction")
        return None, 0
    input_string = ["[CLS] " + input_string + " [SEP]"]
    input_tokens_tensor, input_segments_tensors, inp_bpe_tokenized, \
        input_alignement_with_raw, input_mask = get_indexes(input_string, tokenizer, verbose, use_gpu)
    token_type_ids = torch.zeros_like(input_tokens_tensor)
    if print_input:
        print("SRC : BPE ", inp_bpe_tokenized)
    logits = bert_token_classification(input_tokens_tensor, token_type_ids, input_mask)[0]["logits_task_1"] \
        if tasks[0] == "normalize" else None
    predictions_topk = torch.argsort(logits, dim=-1, descending=True)[:, :, :topk]
    sentence_pred_aligned, sentence_pred = prediction_topk_to_string(predictions_topk,
                                                                     input_alignement_with_raw, topk,
                                                                     tokenizer, tasks=tasks,
                                                                     null_token_index=null_token_index,
                                                                     null_str=null_str)
    return input_string, sentence_pred_aligned, sentence_pred

def get_biggest_bpe_in(char_list, vocab, token_begining=True, verbose=1):
    ind_start = 0
    while ind_start < len(char_list):
        ind_end = len(char_list)
        while ind_start < ind_end:
            substr = "".join(char_list[ind_start:ind_end])
            if not token_begining:
                substr = "##" + substr
            if substr in vocab:
                cur_substr = substr
                printing("WARNING : LEAVING ABREVIATION MATCH (SHOULD ADD MASK)",
                         var=[char_list[ind_end:], char_list], verbose=verbose, verbose_level="raw_data")
                return cur_substr, ind_end, char_list[ind_end:]
            ind_end -= 1
        # sub_tokens_gold_is_abbreviation.append(cur_substr)
        ind_start = ind_end
    raise (Exception("WARNING : no match found for char_list {}".format(char_list)))

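# Hedged example (toy vocab, not the real BERT vocab) : greedily matches the longest BPE piece
# starting at the beginning of the character list, as implemented above.
# get_biggest_bpe_in(list("playing"), vocab={"play", "##ing", "playing"}, token_begining=True)
# -> ("playing", 7, [])   # the full word is in the vocab, nothing is left over
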
def freeze_param(model, freeze_layer_prefix_ls=None, not_freeze_layer_prefix_ls=None, verbose=1):
    freezing_layer = 0
    if not_freeze_layer_prefix_ls is None:
        not_freeze_layer_prefix_ls = []
    if freeze_layer_prefix_ls is None:
        freeze_layer_prefix_ls = []
    for name, param in model.named_parameters():
        for prefix in freeze_layer_prefix_ls:
            if name.startswith(prefix):
                param.requires_grad = False
                freezing_layer += 1
                printing("TRAINING : freezing {} parameter ", var=[name], verbose=verbose, verbose_level=2)
        to_freeze = 0
        for prefix in not_freeze_layer_prefix_ls:
            if not name.startswith(prefix):
                to_freeze += 1
        if not to_freeze == len(not_freeze_layer_prefix_ls):
            param.requires_grad = False
            freezing_layer += 1
            printing("TRAINING :- freezing {} parameter ", var=[name], verbose=verbose, verbose_level=1)
    printing("TRAINING : freezing {} layers : {} prefix , not freezing {} ",
             var=[freezing_layer, freeze_layer_prefix_ls, not_freeze_layer_prefix_ls],
             verbose=verbose, verbose_level=1)
    assert freezing_layer > 0, "ERROR : did not find any layers starting with {}".format(prefix)
    return model

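# Hedged usage sketch (the prefix name is a placeholder) : freezes every parameter whose name
# starts with one of the given prefixes, e.g. an encoder while a task-specific head stays trainable.
# model = freeze_param(model, freeze_layer_prefix_ls=["encoder."], verbose=1)
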
def show_attention(prediction_word, input_word, attentions, model_full_name=None, dir_save=None,
                   show=False, save=False, verbose=1):
    # Set up figure with colorbar
    fig = plt.figure(figsize=(20, 16))
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)
    # Set up axes
    # pdb.set_trace()  # leftover debugging breakpoint, kept commented out
    ax.set_xticklabels([''] + prediction_word, rotation=45)
    ax.set_yticklabels([''] + input_word)
    plt.xlabel("Predicted Normalization word")
    plt.ylabel("Noisy source word")
    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    # show_plot_visdom()
    if save:
        model_full_name = "no_model" if model_full_name is None else model_full_name
        file_name = "{}_model-{}_pred_word-attention.png".format(model_full_name, "".join(prediction_word))
        dir_save = "/Users/bemuller/Documents/Work/INRIA/dev/mt_norm_parse/test_/test_plot_attention" if dir_save is None else dir_save
        dir_save = os.path.join(dir_save, file_name)
        plt.savefig(dir_save)
        printing("Attention saved in {}", var=[dir_save], verbose=verbose, verbose_level=1)
    if show:
        plt.show()
    plt.close()

def reframe_conll_to_sentConll(src_dir, target_dir, n_hashtag=1, verbose=1):
    with open(src_dir, "r") as f:
        with open(target_dir, "w") as g:
            line = "0"
            sent = []
            while len(line) > 0:
                line = f.readline()
                if line.startswith("#"):
                    g.write(line)
                    # if new_sent == n_hashtag:
                    sent = []
                    output_sent = 0
                elif line != "\n" and len(line) > 0:
                    splitted = line.split('\t')
                    if "-" in splitted[0]:
                        continue
                    sent.append(splitted)
                src_sent = ""
                target_sent = ""
                if line == "\n":
                    output_sent = 1
                if output_sent == 1:
                    space = ""
                    for row in sent:
                        src_sent += space + row[1]
                        target_sent += space + get_normalized_token(norm_field=row[9], n_exception=0, verbose=1)[0]
                        space = " "
                    g.write("1\t{}\t_\t_\t_\t_\t1\t_\t_\tNorm={}|\n".format(src_sent, target_sent))
                    g.write("\n")
    printing("WRITTEN : reframed src {} into target {}".format(src_dir, target_dir),
             verbose=verbose, verbose_level=1)
