indexes = open("../wang/data/" + mode + "/dev.filtered",
                       'r').read().strip('\n').split('\n')
        ign_indexes_val = [int(x) - 1 for x in indexes]
        indexes = open("../wang/data/" + mode + "/test.filtered",
                       'r').read().strip('\n').split('\n')
        ign_indexes_test = [int(x) - 1 for x in indexes]
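        # The *.filtered files presumably hold 1-based line numbers of examples
        # to skip; subtracting 1 converts them to 0-based positions in the data.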
    else:
        ign_indexes_train, ign_indexes_val, ign_indexes_test = [], [], []

    print("Reading vocabulary...")
    vocab_dict, word_embedding_array = DataProcessor(
    ).prepare_vocab_embeddingdict()
    print("Reading training set...")
    train_data = DataProcessor().prepare_news_data(
        vocab_dict, data_type="training")  # subsampled
    train_batch = batch_load_data(train_data)
    # data in a whole batch with padded matrices
    print("Reading validation set...")
    val_batch = batch_load_data(DataProcessor().prepare_news_data(
        vocab_dict,
        data_type="validation",
        normalizer=train_data.normalizer,
        pca_model=train_data.pca_model))

    print("Reading test set...")
    test_batch = batch_load_data(DataProcessor().prepare_news_data(
        vocab_dict,
        data_type="test",
        normalizer=train_data.normalizer,
        pca_model=train_data.pca_model))
    del train_data
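    # The normalizer and pca_model fitted on the training split are reused above
    # for the validation and test splits so no statistics leak out of training.
    # A minimal sketch of that pattern with scikit-learn; StandardScaler/PCA and
    # the helper names are assumptions, not the repo's own implementation.
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    def fit_feature_transforms(train_features, n_components=128):
        # Fit scaling and PCA on the training features only.
        normalizer = StandardScaler().fit(train_features)
        reduced = normalizer.transform(train_features)
        pca_model = PCA(n_components=n_components).fit(reduced)
        return normalizer, pca_model

    def apply_feature_transforms(features, normalizer, pca_model):
        # Apply the training-fitted transforms to validation/test features.
        return pca_model.transform(normalizer.transform(features))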
Example #2
  FLAGS.mlp_size = 100

  FLAGS.use_dropout = False
  FLAGS.use_dropout_outatt = False
  FLAGS.dropout = 0.8

  prev_drpt = FLAGS.use_dropout


  # set sentence, doc length to maximum
  output = open("tunning_"+FLAGS.data_mode+"/"+FLAGS.data_mode + "_hp_grid_tuning_%s.txt" % args.file_suffix,'w')

  vocab_dict, word_embedding_array = DataProcessor().prepare_vocab_embeddingdict()
  train_data = DataProcessor().prepare_news_data(vocab_dict, data_type="training") # subsampled
  # data in a whole batch with padded matrices
  val_batch = batch_load_data(DataProcessor().prepare_news_data(vocab_dict, data_type="validation"))

  setup_by_id = {}
  results_by_id = {}
  results_by_id_mrr = {}
  setup_id = 0
  best_global_acc = -1
  best_global_mrr = -1
  best_setup_id = -1
  best_setup_id_mrr = -1

  parameter_grid = {
    "batch_size" : [64],
    "learning_rate" : [0.0001],
    "mlp_size":[100],
    "sentembed_size":[348],
Example #3
  # dataset directory on the Ubuntu machine
  FLAGS.raw_data_dir = "/home/usuario/datasets"

  if args.dataset=='eus':
    FLAGS.max_audio_length = 680 # obtained from sequence lengths histogram
    FLAGS.max_freq_length = 201
  elif args.dataset=='quz':
    FLAGS.max_audio_length = 100 # TBD
    FLAGS.max_freq_length = 100

  # set sentence, doc length to maximum
  output = open("tunning_"+FLAGS.data_mode+"/"+FLAGS.data_mode + "_hp_grid_tuning_%s.txt" % args.file_suffix,'w')

  vocab_dict,inverted_vocab = get_vocab()
  train_data = DataProcessor(vocab_dict,inverted_vocab,data_type="train")
  val_batch = batch_load_data(DataProcessor(vocab_dict,inverted_vocab,data_type="val"))
  
  setup_by_id = {}
  results_by_id_wer = {}
  results_by_id_cer = {}
  setup_id = 0
  best_global_wer = 200
  best_global_cer = 200
  best_setup_id_wer = -1
  best_setup_id_cer = -1

  ## FLAGS.___ = ___ # set as constant so it doesn't clutter output
  #FLAGS.use_conv2d = True
  #FLAGS.use_dropout = False 
  #FLAGS.birnn = False
  #FLAGS.lookahead_conv = False
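  # best_global_wer / best_global_cer start at 200, i.e. above any reachable
  # error rate in percent, so the first evaluated setup always becomes the best.
  # A minimal, assumed sketch of word error rate via edit distance; the repo's
  # own scoring code may differ:
  def edit_distance(ref, hyp):
    # Levenshtein distance between two token sequences (rolling 1-D table).
    d = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
      prev, d[0] = d[0], i
      for j, h in enumerate(hyp, 1):
        prev, d[j] = d[j], min(d[j] + 1, d[j - 1] + 1, prev + (r != h))
    return d[-1]

  def wer(ref_words, hyp_words):
    # Word error rate in percent; use character lists instead of words for CER.
    return 100.0 * edit_distance(ref_words, hyp_words) / max(len(ref_words), 1)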
Example #4
    FLAGS.batch_size = 20
    FLAGS.learning_rate = 0.0001
    FLAGS.size = 600
    FLAGS.max_gradient_norm = 10

    FLAGS.sentembed_size = 350

    vocab_dict, word_embedding_array = DataProcessor(
    ).prepare_vocab_embeddingdict()
    train_data = DataProcessor().prepare_news_data(
        vocab_dict, data_type="training")  # subsampled
    # data without padding
    validation_data = DataProcessor().prepare_news_data(
        vocab_dict, data_type="validation")  # complete
    # data in whole batch with padded matrixes
    val_batch = batch_load_data(validation_data)
    # we don't need this anymore
    del validation_data

    ###############################################################################################

    assignments = {
        'batch_size': 64,
        'log_learning_rate': -6.907755278982137,
        'size': 2500,
        'sentembed_size': 1024
    }
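
    # 'log_learning_rate' is the natural log of the learning rate:
    # math.exp(-6.907755278982137) is approximately 1e-3. Presumably
    # evaluate_model (or the code it calls) converts it back, e.g.
    # FLAGS.learning_rate = math.exp(assignments['log_learning_rate']).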

    value = evaluate_model(assignments, train_data, val_batch, metric)

    conn = Connection(client_token="<your_API_token>")
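    # Connection comes from the SigOpt client ("from sigopt import Connection").
    # A minimal, assumed sketch of the usual SigOpt suggest/observe loop that
    # would follow; the experiment name, parameter bounds, and budget below are
    # placeholders, not values from the repo:
    experiment = conn.experiments().create(
        name="hyperparameter tuning",  # placeholder name
        parameters=[
            {'name': 'size', 'type': 'int', 'bounds': {'min': 100, 'max': 2500}},
            {'name': 'log_learning_rate', 'type': 'double',
             'bounds': {'min': -9.2, 'max': -4.6}},
        ],
        observation_budget=20)
    for _ in range(experiment.observation_budget):
        suggestion = conn.experiments(experiment.id).suggestions().create()
        value = evaluate_model(suggestion.assignments, train_data, val_batch, metric)
        conn.experiments(experiment.id).observations().create(
            suggestion=suggestion.id, value=value)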