import pickle

import tensorflow as tf

import setup_model

# Fallback for input_type values not handled by the (elided) branches above
raise ValueError("Got unexpected input_type of {}".format(input_type))

lang_tokenizer = setup_model.create_tokenizer(dataset)
vocab_size = max(lang_tokenizer.index_word.keys())
## SAVE TOKENIZER
with open('tokenizers/names_tokenizer.pickle', 'wb') as handle:
    pickle.dump(lang_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
## LOAD TOKENIZER
#with open('tokenizers/names_tokenizer.pickle', 'rb') as handle:
#    lang_tokenizer = pickle.load(handle)
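# Hedged illustration of the tokenizer API assumed above
# (tf.keras.preprocessing.text.Tokenizer); the toy corpus is made up.
# Word indices start at 1, so an Embedding layer sized from vocab_size
# usually needs vocab_size + 1 rows (index 0 is reserved for padding).
_demo = tf.keras.preprocessing.text.Tokenizer()
_demo.fit_on_texts(['morning news show', 'evening cooking show'])
print(_demo.texts_to_sequences(['cooking news']))  # [[5, 3]]
print(max(_demo.index_word.keys()))                # 5 distinct tokens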
label_to_idx, idx_to_label = setup_model.create_labels(dataset, prog_type_dict,
                                                       LABEL_CHOICE,
                                                       USE_OTHER_TYPE,
                                                       LABEL_NUM,
                                                       MIN_PROGNUM_LABELS)
setup_model.save_labels(
    label_to_idx, 'names_{}_{}label_to_idx'.format(LABEL_CHOICE, other_tag))
setup_model.save_labels(
    idx_to_label, 'names_{}_{}idx_to_label'.format(LABEL_CHOICE, other_tag))
num_labels = len(label_to_idx)
train_dataset, dev_dataset = setup_model.split_train_dev(dataset)
#train_ds = setup_model.prepare_data(train_dataset, lang_tokenizer, label_to_idx, USE_OTHER_TYPE)
#dev_ds = setup_model.prepare_data(dev_dataset, lang_tokenizer, label_to_idx, USE_OTHER_TYPE)
#train_ds = setup_model.get_twin_data(train_dataset, DATA_SIZE, lang_tokenizer, label_to_idx, USE_OTHER_TYPE)
train_ds = setup_model.get_prog_twin_data(train_dataset, DATA_SIZE,
                                          lang_tokenizer, label_to_idx,
                                          USE_OTHER_TYPE)
# Assuming each train_ds element is ((left_seq, right_seq), label),
# this reads the padded length of one input sequence
input_dim = len(train_ds[0][0][0])


def get_twin_net(input_dim):
    left_input = tf.keras.Input(shape=(input_dim,))
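# The original get_twin_net is cut off above. A minimal sketch of one
# common way to finish a twin (Siamese) network: a shared encoder, an
# L1 distance, and a sigmoid similarity head. The layer sizes and the
# distance choice are assumptions, not the original architecture.
def get_twin_net_sketch(input_dim, vocab_size, embedding_dim=64):
    left_input = tf.keras.Input(shape=(input_dim,))
    right_input = tf.keras.Input(shape=(input_dim,))
    # Both branches run through one encoder, so they share weights
    encoder = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size + 1, embedding_dim),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(64, activation='relu'),
    ])
    left_vec = encoder(left_input)
    right_vec = encoder(right_input)
    # Element-wise L1 distance between the two encodings
    distance = tf.keras.layers.Lambda(
        lambda t: tf.abs(t[0] - t[1]))([left_vec, right_vec])
    # Probability that the two inputs belong to the same label
    output = tf.keras.layers.Dense(1, activation='sigmoid')(distance)
    return tf.keras.Model(inputs=[left_input, right_input], outputs=output)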
## Example #2
lang_tokenizer = setup_model.create_tokenizer(dataset)
vocab_size = max(lang_tokenizer.index_word.keys())
## SAVE TOKENIZER
with open('tokenizers/twin_{}_tokenizer.pickle'.format(input_type),
          'wb') as handle:
    pickle.dump(lang_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
## LOAD TOKENIZER
#with open('tokenizers/twin_{}_tokenizer.pickle'.format(input_type), 'rb') as handle:
#    lang_tokenizer = pickle.load(handle)
label_to_idx, idx_to_label = setup_model.create_labels(dataset, prog_type_dict,
                                                       LABEL_CHOICE,
                                                       USE_OTHER_TYPE,
                                                       LABEL_NUM,
                                                       MIN_PROGNUM_LABELS)
setup_model.save_labels(
    label_to_idx, 'twin_{}_{}_{}label_to_idx'.format(input_type, LABEL_CHOICE,
                                                     other_tag))
setup_model.save_labels(
    idx_to_label, 'twin_{}_{}_{}idx_to_label'.format(input_type, LABEL_CHOICE,
                                                     other_tag))
num_labels = len(label_to_idx)
train_dataset, dev_dataset = setup_model.split_train_dev(dataset)
#train_ds = setup_model.prepare_data(train_dataset, lang_tokenizer, label_to_idx, USE_OTHER_TYPE)
#dev_ds = setup_model.prepare_data(dev_dataset, lang_tokenizer, label_to_idx, USE_OTHER_TYPE)
#train_ds = setup_model.get_twin_data(train_dataset, DATA_SIZE, lang_tokenizer, label_to_idx, USE_OTHER_TYPE)
# The two extra positional arguments (90, False) are options of
# setup_model.get_prog_twin_data; their meaning is not shown in this snippet
train_ds = setup_model.get_prog_twin_data(train_dataset, DATA_SIZE,
                                          lang_tokenizer, label_to_idx,
                                          USE_OTHER_TYPE, 90, False)
# The dev set is sampled at 10% of the training data size
dev_ds = setup_model.get_prog_twin_data(dev_dataset, int(DATA_SIZE * 0.1),
                                        lang_tokenizer, label_to_idx,
                                        USE_OTHER_TYPE, 90, False)
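# A minimal sketch of how train_ds/dev_ds could drive the twin network.
# The pair layout ((left_seq, right_seq), label), the reuse of
# get_twin_net_sketch from above, and the batch/epoch settings are all
# assumptions for illustration, not the original training loop.
import numpy as np

def _unpack(ds):
    # Split [((left, right), label), ...] into three aligned arrays
    pairs, labels = zip(*ds)
    left = np.array([p[0] for p in pairs])
    right = np.array([p[1] for p in pairs])
    return left, right, np.array(labels)

train_left, train_right, train_labels = _unpack(train_ds)
dev_left, dev_right, dev_labels = _unpack(dev_ds)

model = get_twin_net_sketch(train_left.shape[1], vocab_size)
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit([train_left, train_right], train_labels,
          validation_data=([dev_left, dev_right], dev_labels),
          batch_size=64, epochs=10)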