Example #1
tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(opts.train, lower, zeros)
dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
test_sentences = loader.load_sentences(opts.test, lower, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
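
A minimal, hedged sketch of what the word_mapping step is assumed to do here (the real loader implementation may differ in details): count word frequencies over the training tokens, reserve an unknown-word entry, and build word/id mappings sorted by decreasing frequency.

from collections import Counter

def word_mapping_sketch(sentences, lower):
    # sentences: list of sentences, each a list of [word, ...] token rows
    words = [w[0].lower() if lower else w[0] for s in sentences for w in s]
    dico = Counter(words)
    dico['<UNK>'] = 10000000  # keep the unknown token regardless of its count
    items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_word = {i: w for i, (w, _) in enumerate(items)}
    word_to_id = {w: i for i, w in id_to_word.items()}
    return dict(dico), word_to_id, id_to_word
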
Example #2
tag_scheme = parameters['tag_scheme']

# Load sentences
train_sentences = loader.load_sentences(opts.train, zeros)
dev_sentences = loader.load_sentences(opts.dev, zeros)
test_sentences = loader.load_sentences(opts.test, zeros)

# Use selected tagging scheme (IOB / IOBES)
update_tag_scheme(train_sentences, tag_scheme)
update_tag_scheme(dev_sentences, tag_scheme)
update_tag_scheme(test_sentences, tag_scheme)

# Create a dictionary / mapping of words
# If we use pretrained embeddings, we add them to the dictionary.
if parameters['pre_emb']:
    dico_words_train = word_mapping(train_sentences, lower)[0]
    dico_words, word_to_id, id_to_word = augment_with_pretrained(
        dico_words_train.copy(),
        parameters['pre_emb'],
        list(itertools.chain.from_iterable(
            [[w[0] for w in s] for s in dev_sentences + test_sentences])
        ) if not parameters['all_emb'] else None
    )
else:
    dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
    dico_words_train = dico_words

# Create a dictionary and a mapping for words / POS tags / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)
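
A hedged sketch of the augment_with_pretrained step (assumed behavior; the actual loader may differ): every word that has a pretrained embedding is added to the training dictionary with frequency 0, either unconditionally (all_emb) or only when it occurs in the dev/test sentences, and the word/id mappings are then rebuilt.

def augment_with_pretrained_sketch(dictionary, ext_emb_path, words=None):
    # Vocabulary of a plain-text embedding file: one "word v1 v2 ..." per line (assumed format).
    with open(ext_emb_path) as f:
        pretrained = set(line.rstrip().split()[0] for line in f if line.strip())
    if words is None:
        # all_emb case: keep every pretrained word.
        for w in pretrained:
            dictionary.setdefault(w, 0)
    else:
        # Otherwise keep only pretrained words observed in the given corpus.
        for w in words:
            if w not in dictionary and (w in pretrained or w.lower() in pretrained):
                dictionary[w] = 0
    items = sorted(dictionary.items(), key=lambda x: (-x[1], x[0]))
    id_to_word = {i: w for i, (w, _) in enumerate(items)}
    word_to_id = {w: i for i, w in id_to_word.items()}
    return dictionary, word_to_id, id_to_word
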
Example #3
def runModelInLoop(dropout, char_dim, char_lstm_dim, word_dim, word_lstm_dim):
    # Results file
    resultsPath = "/Users/Ehsan/Documents/Ehsan_General/HMQ/HMQ_Projects/DNR2/COLING-2016-Code/i2b2-2010/results/"
    for u_dropout in dropout:
        for v_char_dim in char_dim:
            for w_char_lstm_dim in char_lstm_dim:
                for x_word_dim in word_dim:
                    for y_word_lstm_dim in word_lstm_dim:
                        for dataset in datasets:
                            print "+++++++++++++++"
                            print u_dropout,v_char_dim,w_char_lstm_dim,x_word_dim,y_word_lstm_dim,dataset
                            parameters['dropout'] = u_dropout

                            parameters['char_dim'] = v_char_dim
                            parameters['char_lstm_dim'] = w_char_lstm_dim
                            parameters['word_dim'] = x_word_dim
                            parameters['word_lstm_dim'] = y_word_lstm_dim

                            # If the dataset is i2b2-2010, use its predefined data paths
                            if dataset == "i2b2-2010":
                                opts.train = i2b2BasePath + "train.txt"
                                opts.dev = i2b2BasePath + "dev.txt"
                                opts.test = i2b2BasePath + "test.txt"
                                resultsFile = resultsPath + "i2b2_2010_Results.txt"



                            # Initialize model
                            model = Model(parameters=parameters, models_path=models_path)
                            print "Model location: %s" % model.model_path

                            # Data parameters
                            lower = parameters['lower']
                            zeros = parameters['zeros']
                            tag_scheme = parameters['tag_scheme']

                            # Load sentences
                            train_sentences = loader.load_sentences(opts.train, lower, zeros)
                            dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
                            test_sentences = loader.load_sentences(opts.test, lower, zeros)

                            # Use selected tagging scheme (IOB / IOBES)
                            update_tag_scheme(train_sentences, tag_scheme)
                            update_tag_scheme(dev_sentences, tag_scheme)
                            update_tag_scheme(test_sentences, tag_scheme)

                            # Create a dictionary / mapping of words
                            # If we use pretrained embeddings, we add them to the dictionary.
                            if parameters['pre_emb']:
                                dico_words_train = word_mapping(train_sentences, lower)[0]
                                dico_words, word_to_id, id_to_word = augment_with_pretrained(
                                    dico_words_train.copy(),
                                    parameters['pre_emb'],
                                    list(itertools.chain.from_iterable(
                                        [[w[0] for w in s] for s in dev_sentences + test_sentences])
                                    ) if not parameters['all_emb'] else None
                                )
                            else:
                                dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
                                dico_words_train = dico_words

                            # Create a dictionary and a mapping for words / POS tags / tags
                            dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
                            dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

                            print "Calling the prepare_dataset :--"
                            # Index data
                            train_data = prepare_dataset(
                                train_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            dev_data = prepare_dataset(
                                dev_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )
                            test_data = prepare_dataset(
                                test_sentences, word_to_id, char_to_id, tag_to_id, lower
                            )

                            print "%i / %i / %i sentences in train / dev / test." % (
                                len(train_data), len(dev_data), len(test_data))

                            # Save the mappings to disk
                            print 'Saving the mappings to disk...'
                            model.save_mappings(id_to_word, id_to_char, id_to_tag)

                            # Build the model
                            f_train, f_eval = model.build(**parameters)

                            # Reload previous model values
                            if opts.reload:
                                print 'Reloading previous model...'
                                model.reload()


                            # Train network
                            #
                            singletons = set([word_to_id[k] for k, v
                                              in dico_words_train.items() if v == 1])
                            n_epochs = 2  # number of epochs over the training set
                            freq_eval = 1000  # evaluate on dev every freq_eval steps
                            best_dev = -np.inf
                            best_test = -np.inf
                            count = 0
                            for epoch in xrange(n_epochs):
                                epoch_costs = []
                                print "Starting epoch %i..." % epoch
                                for i, index in enumerate(np.random.permutation(len(train_data))):
                                    count += 1
                                    input = create_input(train_data[index], parameters, True, singletons)
                                    new_cost = f_train(*input)
                                    epoch_costs.append(new_cost)
                                    #if i % 50 == 0 and i > 0 == 0:
                                    #    print "%i, cost average: %f" % (i, np.mean(epoch_costs[-50:]))
                                    if count % freq_eval == 0:
                                        dev_score = evaluate(parameters, f_eval, dev_sentences,
                                                             dev_data, id_to_tag, dico_tags)
                                        test_score = evaluate(parameters, f_eval, test_sentences,
                                                              test_data, id_to_tag, dico_tags)
                                        print "Score on dev: %.5f" % dev_score
                                        print "Score on test: %.5f" % test_score
                                        if dev_score > best_dev:
                                            best_dev = dev_score
                                            print "New best score on dev."+str(best_dev)
                                            # print "Saving model to disk..."
                                            # model.save()
                                        if test_score > best_test:
                                            best_test = test_score
                                            print "New best score on test."+str(best_test)
                                        # print "Config values used are : "


                                print "Epoch %i done. Average cost: %f" % (epoch, np.mean(epoch_costs))
                            # Write the best dev and test scores to the file
                            del model


                            with open(resultsFile, 'a') as f:
                                f.write("dropout: " + str(parameters['dropout']) +
                                        " | char_dim: " + str(parameters['char_dim']) +
                                        " | char_lstm_dim: " + str(parameters['char_lstm_dim']) +
                                        " | word_dim: " + str(parameters['word_dim']) +
                                        " | word_lstm_dim: " + str(parameters['word_lstm_dim']) +
                                        " | Best Dev Score: " + str(best_dev) +
                                        " | Best Test Score: " + str(best_test) + "\n")


    return
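
A hypothetical invocation of the grid search above, assuming the module-level globals it relies on (datasets, parameters, opts, i2b2BasePath, models_path and the loader helpers) are already defined; the swept values below are illustrative only.

runModelInLoop(dropout=[0.5],
               char_dim=[25],
               char_lstm_dim=[25],
               word_dim=[100],
               word_lstm_dim=[100])
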
Example #4
    lower = parameters["lower"]
    zeros = parameters["zeros"]
    tag_scheme = parameters["tag_scheme"]

    train_sentences = loader.load_sentences(opts.train, lower, zeros)
    dev_sentences = loader.load_sentences(opts.dev, lower, zeros)
    test_sentences = loader.load_sentences(opts.test, lower, zeros)
    test_train_sentences = loader.load_sentences(opts.test_train, lower, zeros)

    loader.update_tag_scheme(train_sentences, tag_scheme)
    loader.update_tag_scheme(dev_sentences, tag_scheme)
    loader.update_tag_scheme(test_sentences, tag_scheme)
    loader.update_tag_scheme(test_train_sentences, tag_scheme)

    dico_words_train = loader.word_mapping(train_sentences, lower)[0]

    dico_words, word_to_id, id_to_word = loader.augment_with_pretrained(
        dico_words_train.copy(),
        parameters["pre_emb"],
        list(itertools.chain.from_iterable([[w[0] for w in s] for s in dev_sentences +
                                            test_sentences])) if not parameters["all_emb"] else None,
    )

    dico_chars, char_to_id, id_to_char = loader.char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)

    train_data = loader.prepare_dataset(train_sentences, word_to_id, char_to_id, tag_to_id, lower)
    dev_data = loader.prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id, lower)
    test_data = loader.prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id, lower)
    test_train_data = loader.prepare_dataset(test_train_sentences, word_to_id, char_to_id, tag_to_id, lower)
Example #5
    def train(self,
              n_epochs=100,
              freq_eval=1000,
              verbose=True,
              eval_test_set=False):
        """
        :param n_epochs: number of epochs over the training set
        :param freq_eval: evaluate on dev every freq_eval steps
        :return: Saves the model with the best F1-Score, evaluated on the dev set
        """
        # Initialize model
        model = Model(parameters=self.parameters, models_path=models_path)
        print("Model location: %s" % model.model_path)

        # Data parameters
        lower = self.parameters['lower']
        zeros = self.parameters['zeros']
        tag_scheme = self.parameters['tag_scheme']

        # Load sentences
        train_sentences = loader.load_sentences(self.parameters['train'],
                                                lower, zeros)
        dev_sentences = loader.load_sentences(self.parameters['dev'], lower,
                                              zeros)
        test_sentences = loader.load_sentences(self.parameters['test'], lower,
                                               zeros)

        # Use selected tagging scheme (IOB / IOBES)
        update_tag_scheme(train_sentences, tag_scheme)
        update_tag_scheme(dev_sentences, tag_scheme)
        update_tag_scheme(test_sentences, tag_scheme)

        # Create a dictionary / mapping of words
        # If we use pretrained embeddings, we add them to the dictionary.
        if self.parameters['pre_emb']:
            dico_words_train = word_mapping(train_sentences, lower)[0]
            dico_words, word_to_id, id_to_word = augment_with_pretrained(
                dico_words_train.copy(), self.parameters['pre_emb'],
                list(
                    itertools.chain.from_iterable([[w[0] for w in s]
                                                   for s in dev_sentences +
                                                   test_sentences]))
                if not self.parameters['all_emb'] else None)
        else:
            dico_words, word_to_id, id_to_word = word_mapping(
                train_sentences, lower)
            dico_words_train = dico_words

        # Create a dictionary and a mapping for words / POS tags / tags
        dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
        dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

        # Index data
        train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                     tag_to_id, lower)
        dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                                   tag_to_id, lower)
        test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                    tag_to_id, lower)

        print("%i / %i / %i sentences in train / dev / test." %
              (len(train_data), len(dev_data), len(test_data)))

        # Save the mappings to disk
        print('Saving the mappings to disk...')
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

        # Build the model
        f_train, f_eval = model.build(**self.parameters)

        # Reload previous model values
        if self.parameters['reload']:
            print('Reloading previous model...')
            model.reload()

        #
        # Train network
        #
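        # Singletons are training words that occur exactly once; create_input is
        # assumed to randomly replace them with the unknown word during training
        # so the model learns to handle out-of-vocabulary tokens.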
        singletons = set(
            [word_to_id[k] for k, v in dico_words_train.items() if v == 1])
        best_dev = -np.inf
        best_test = -np.inf
        count = 0
        for epoch in range(n_epochs):
            epoch_costs = []
            print("Starting epoch %i at..." % epoch, time.ctime())
            for i, index in enumerate(np.random.permutation(len(train_data))):
                count += 1
                input = create_input(train_data[index], self.parameters, True,
                                     singletons)
                new_cost = f_train(*input)
                epoch_costs.append(new_cost)
                if i % 50 == 0 and i > 0 and verbose:
                    print("%i, cost average: %f" %
                          (i, np.mean(epoch_costs[-50:])))
                if count % freq_eval == 0:
                    dev_score = evaluate(self.parameters,
                                         f_eval,
                                         dev_sentences,
                                         dev_data,
                                         id_to_tag,
                                         verbose=verbose)
                    if eval_test_set:
                        test_score = evaluate(self.parameters,
                                              f_eval,
                                              test_sentences,
                                              test_data,
                                              id_to_tag,
                                              verbose=verbose)
                    print("Score on dev: %.5f" % dev_score)
                    if eval_test_set:
                        print("Score on test: %.5f" % test_score)
                    if dev_score > best_dev:
                        best_dev = dev_score
                        print("New best score on dev.")
                        print("Saving model to disk...")
                        model.save()
                    if eval_test_set:
                        if test_score > best_test:
                            best_test = test_score
                            print("New best score on test.")
            print(
                "Epoch %i done. Average cost: %f. Ended at..." %
                (epoch, np.mean(epoch_costs)), time.ctime())
        return best_dev
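
Assuming this method lives on a trainer/tagger wrapper whose self.parameters dictionary already contains the data paths and hyper-parameters, a hypothetical call could look like the following (tagger is an assumed instance name and the values are illustrative):

best_dev_f1 = tagger.train(n_epochs=50, freq_eval=500, verbose=True, eval_test_set=True)
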
Example #6
def main(argv=None):  # pylint: disable=unused-argument

  # if tf.gfile.Exists(FLAGS.eval_dir):
  #   tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  # tf.gfile.MakeDirs(FLAGS.eval_dir)

  # Read parameters from command line
  opts = read_args(evaluation=True)

  # Parse parameters
  parameters = form_parameters_dict(opts)

  # Check parameters validity
  assert os.path.isfile(opts.train)
  assert os.path.isfile(opts.dev)
  assert os.path.isfile(opts.test)
  assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
  assert 0. <= parameters['dropout'] < 1.0
  assert parameters['t_s'] in ['iob', 'iobes']
  assert not parameters['all_emb'] or parameters['pre_emb']
  assert not parameters['pre_emb'] or parameters['word_dim'] > 0
  assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

  # Check evaluation script / folders
  if not os.path.isfile(eval_script):
      raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
  if not os.path.exists(eval_temp):
      os.makedirs(eval_temp)
  if not os.path.exists(models_path):
      os.makedirs(models_path)
  event_logs_path = os.path.join(eval_temp, "eval_logs")
  # if not os.path.exists(event_logs_path):
  #     os.makedirs(event_logs_path)

  # Initialize model
  model = MainTaggerModel(parameters=parameters, models_path=models_path,
                          overwrite_mappings=opts.overwrite_mappings)
  print "MainTaggerModel location: %s" % model.model_path

  # Data parameters
  lower = parameters['lower']
  zeros = parameters['zeros']
  tag_scheme = parameters['t_s']

  max_sentence_lengths = {}
  max_word_lengths = {}

  # Load sentences
  train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
      loader.load_sentences(opts.train, lower, zeros)
  dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = loader.load_sentences(
      opts.dev, lower, zeros)
  test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = loader.load_sentences(
      opts.test, lower, zeros)

  global_max_sentence_length, global_max_char_length = \
      calculate_global_maxes(max_sentence_lengths, max_word_lengths)

  # Use selected tagging scheme (IOB / IOBES)
  update_tag_scheme(train_sentences, tag_scheme)
  update_tag_scheme(dev_sentences, tag_scheme)
  update_tag_scheme(test_sentences, tag_scheme)

  # Create a dictionary / mapping of words
  # If we use pretrained embeddings, we add them to the dictionary.
  if parameters['pre_emb']:
      dico_words_train = word_mapping(train_sentences, lower)[0]
      dico_words, word_to_id, id_to_word = augment_with_pretrained(
          dico_words_train.copy(),
          parameters['pre_emb'],
          list(itertools.chain.from_iterable(
              [[w[0] for w in s] for s in dev_sentences + test_sentences])
          ) if not parameters['all_emb'] else None
      )
  else:
      dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
      dico_words_train = dico_words

  # Create a dictionary and a mapping for words / POS tags / tags
  dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
  dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

  if opts.overwrite_mappings:
      print 'Saving the mappings to disk...'
      model.save_mappings(id_to_word, id_to_char, id_to_tag)

  model.reload_mappings()

  # Index data
  train_buckets, train_stats, train_unique_words = prepare_dataset(
      train_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )
  dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
      dev_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )
  test_buckets, test_stats, test_unique_words = prepare_dataset(
      test_sentences, word_to_id, char_to_id, tag_to_id,
      global_max_sentence_length, global_max_char_length,
      lower
  )

  print "%i / %i / %i sentences in train / dev / test." % (
      len(train_stats), len(dev_stats), len(test_stats))

  print "%i / %i / %i words in train / dev / test." % (
      sum([x[0] for x in train_stats]), sum([x[0] for x in dev_stats]),
      sum([x[0] for x in test_stats]))

  print "%i / %i / %i longest sentences in train / dev / test." % (
      max([x[0] for x in train_stats]), max([x[0] for x in dev_stats]),
      max([x[0] for x in test_stats]))

  print "%i / %i / %i shortest sentences in train / dev / test." % (
      min([x[0] for x in train_stats]), min([x[0] for x in dev_stats]),
      min([x[0] for x in test_stats]))

  for i, label in [[2, 'char']]:
      print "%i / %i / %i total %s in train / dev / test." % (
          sum([sum(x[i]) for x in train_stats]), sum([sum(x[i]) for x in dev_stats]),
          sum([sum(x[i]) for x in test_stats]),
          label)

      print "%i / %i / %i max. %s lengths in train / dev / test." % (
          max([max(x[i]) for x in train_stats]), max([max(x[i]) for x in dev_stats]),
          max([max(x[i]) for x in test_stats]),
          label)

      print "%i / %i / %i min. %s lengths in train / dev / test." % (
          min([min(x[i]) for x in train_stats]), min([min(x[i]) for x in dev_stats]),
          min([min(x[i]) for x in test_stats]),
          label)

  print "Max. sentence lengths: %s" % max_sentence_lengths
  print "Max. char lengths: %s" % max_word_lengths

  for label, bin_stats, n_unique_words in [['train', train_stats, train_unique_words],
                                           ['dev', dev_stats, dev_unique_words],
                                           ['test', test_stats, test_unique_words]]:
      int32_items = len(bin_stats) * (
          max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
      float32_items = n_unique_words * parameters['word_dim']
      total_size = int32_items + float32_items
      logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
      logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
          n_unique_words, label, float32_items))
      logging.info("Total size of the %s dataset is %d" % (label, total_size))

  batch_size = 5

  # Build the model
  cost, train_step, tag_scores, tag_ids, word_ids, \
  crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
      max_sentence_length_scalar=global_max_sentence_length,
      max_word_length_scalar=global_max_char_length,
      batch_size_scalar=batch_size,
      **parameters)

  FLAGS = tf.app.flags.FLAGS

  tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                             """Directory where to write event logs.""")
  tf.app.flags.DEFINE_string('eval_data', 'test',
                             """Either 'test' or 'train_eval'.""")
  tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                             """Directory where to read model checkpoints.""")
  tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                              """How often to run the eval.""")
  tf.app.flags.DEFINE_integer('num_examples', 10000,
                              """Number of examples to run.""")
  tf.app.flags.DEFINE_boolean('run_once', False,
                              """Whether to run eval only once.""")

  evaluate(model,
           dev_buckets, test_buckets,
           FLAGS, opts,
           id_to_tag,
           batch_size,
           placeholders,
           enqueue_op, tag_scores, tag_ids, word_ids, crf_transition_params, sentence_lengths,
           FLAGS.eval_dir,
           tag_scheme)
Example #7
print opts.train
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)

# Data parameters
lower = parameters['lower']
zeros = parameters['zeros']
pos = parameters['use_pos']

# Load sentences
train_sentences = loader.load_sentences(opts.train)
dev_sentences = loader.load_sentences(opts.dev)
test_sentences = loader.load_sentences(opts.test)

dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower,
                                                  zeros)
dico_words_train = dico_words

# Create a dictionary and a mapping for words / tags
dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

# Index data
train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                             tag_to_id, lower, zeros)
dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id, tag_to_id,
                           lower, zeros)
test_data = prepare_dataset(test_sentences, word_to_id, char_to_id, tag_to_id,
                            lower, zeros)

## Write to CRFPP Feature File
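
A minimal sketch of the CRF++ export hinted at above (assumed format; the feature columns actually used in this project may differ): CRF++ training files contain one token per line with whitespace-separated feature columns, the gold tag in the last column, and a blank line between sentences.

def write_crfpp_file(sentences, path):
    # sentences: list of sentences, each a list of [word, ..., tag] token rows
    with open(path, 'w') as out:
        for sentence in sentences:
            for token in sentence:
                word, tag = token[0], token[-1]
                out.write("%s\t%s\n" % (word, tag))
            out.write("\n")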