def __init__(self, config):
        self.config = config
        self.embedding_info = [(emb["size"], emb["lowercase"])
                               for emb in config["embeddings"]]
        self.embedding_size = sum(size for size, _ in self.embedding_info)
        self.char_embedding_size = config["char_embedding_size"]
        self.char_dict = util.load_char_dict(config["char_vocab_path"])
        self.embedding_dicts = [
            util.load_embedding_dict(emb["path"], emb["size"], emb["format"])
            for emb in config["embeddings"]
        ]
        self.max_mention_width = config["max_mention_width"]
        self.max_context_width = config["max_context_width"]
        self.genres = {g: i for i, g in enumerate(config["genres"])}
        self.eval_data = None  # Load eval data lazily.

        input_props = []
        input_props.append(
            (tf.float32, [None, None,
                          self.embedding_size]))  # Text embeddings.
        input_props.append((tf.int32, [None, None,
                                       None]))  # Character indices.
        input_props.append((tf.int32, [None]))  # Text lengths.
        input_props.append((tf.int32, [None]))  # Speaker IDs.
        input_props.append((tf.int32, []))  # Genre.
        input_props.append((tf.bool, []))  # Is training.
        input_props.append((tf.int32, [None]))  # Gold starts.
        input_props.append((tf.int32, [None]))  # Gold ends.
        input_props.append((tf.int32, [None]))  # Cluster ids.

        self.queue_input_tensors = [
            tf.placeholder(dtype, shape) for dtype, shape in input_props
        ]
        dtypes, shapes = zip(*input_props)
        queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
        self.enqueue_op = queue.enqueue(self.queue_input_tensors)
        self.input_tensors = queue.dequeue()

        self.predictions, self.loss = self.get_predictions_and_loss(
            *self.input_tensors)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.reset_global_step = tf.assign(self.global_step, 0)
        learning_rate = tf.train.exponential_decay(
            self.config["learning_rate"],
            self.global_step,
            self.config["decay_frequency"],
            self.config["decay_rate"],
            staircase=True)
        trainable_params = tf.trainable_variables()
        gradients = tf.gradients(self.loss, trainable_params)
        gradients, _ = tf.clip_by_global_norm(gradients,
                                              self.config["max_gradient_norm"])
        optimizers = {
            "adam": tf.train.AdamOptimizer,
            "sgd": tf.train.GradientDescentOptimizer
        }
        optimizer = optimizers[self.config["optimizer"]](learning_rate)
        self.train_op = optimizer.apply_gradients(zip(gradients,
                                                      trainable_params),
                                                  global_step=self.global_step)
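# A minimal sketch (not part of the example above) of how this PaddingFIFOQueue pipeline
# is typically fed: a background loop runs enqueue_op with a feed_dict pairing each
# placeholder in queue_input_tensors with one tensorized example. "session" is an open
# tf.Session, and "tensorize_example" is an assumed helper returning the nine arrays in
# input_props order (compare the return tuple in Example #3 below).
def enqueue_examples(session, model, examples):
    for example in examples:
        tensorized = model.tensorize_example(example, is_training=True)
        session.run(model.enqueue_op,
                    feed_dict=dict(zip(model.queue_input_tensors, tensorized)))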
Example #2
  def __init__(self, config):
    self.config = config
    self.embedding_info = [(emb["size"], emb["lowercase"]) for emb in config["embeddings"]]
    self.embedding_size = sum(size for size, _ in self.embedding_info)
    self.char_embedding_size = config["char_embedding_size"]
    self.char_dict = util.load_char_dict(config["char_vocab_path"])
    self.embedding_dicts = [util.load_embedding_dict(emb["path"], emb["size"], emb["format"]) for emb in config["embeddings"]]
    self.max_mention_width = config["max_mention_width"]
    self.genres = { g:i for i,g in enumerate(config["genres"]) }
    self.eval_data = None # Load eval data lazily.

    input_props = []
    input_props.append((tf.float32, [None, None, self.embedding_size])) # Text embeddings.
    input_props.append((tf.int32, [None, None, None])) # Character indices.
    input_props.append((tf.int32, [None])) # Text lengths.
    input_props.append((tf.int32, [None])) # Speaker IDs.
    input_props.append((tf.int32, [])) # Genre.
    input_props.append((tf.bool, [])) # Is training.
    input_props.append((tf.int32, [None])) # Gold starts.
    input_props.append((tf.int32, [None])) # Gold ends.
    input_props.append((tf.int32, [None])) # Cluster ids.

    self.queue_input_tensors = [tf.placeholder(dtype, shape) for dtype, shape in input_props]
    dtypes, shapes = zip(*input_props)
    queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
    self.enqueue_op = queue.enqueue(self.queue_input_tensors)
    self.input_tensors = queue.dequeue()

    self.predictions, self.loss = self.get_predictions_and_loss(*self.input_tensors)
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.reset_global_step = tf.assign(self.global_step, 0)
    learning_rate = tf.train.exponential_decay(self.config["learning_rate"], self.global_step,
                                               self.config["decay_frequency"], self.config["decay_rate"], staircase=True)
    trainable_params = tf.trainable_variables()
    gradients = tf.gradients(self.loss, trainable_params)
    gradients, _ = tf.clip_by_global_norm(gradients, self.config["max_gradient_norm"])
    optimizers = {
      "adam" : tf.train.AdamOptimizer,
      "sgd" : tf.train.GradientDescentOptimizer
    }
    optimizer = optimizers[self.config["optimizer"]](learning_rate)
    self.train_op = optimizer.apply_gradients(zip(gradients, trainable_params), global_step=self.global_step)
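# For reference, a pure-Python sketch of what the staircase tf.train.exponential_decay
# call above computes: the base learning rate is multiplied by decay_rate once every
# decay_frequency steps. Config key names follow the example above.
def decayed_learning_rate(config, global_step):
    return (config["learning_rate"] *
            config["decay_rate"] ** (global_step // config["decay_frequency"]))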
Example #3
        char_index = char_index[sentence_offset:sentence_offset + max_training_sentences, :, :]
        text_len = text_len[sentence_offset:sentence_offset + max_training_sentences]

        speaker_ids = speaker_ids[word_offset: word_offset + num_words]
        gold_spans = np.logical_and(gold_ends >= word_offset, gold_starts < word_offset + num_words)
        gold_starts = gold_starts[gold_spans] - word_offset
        gold_ends = gold_ends[gold_spans] - word_offset
        cluster_ids = cluster_ids[gold_spans]

        return word_emb, char_index, text_len, speaker_ids, genre, is_training, gold_starts, gold_ends, cluster_ids
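# A tiny standalone illustration (hypothetical numbers) of the span-windowing logic
# above: keep gold spans that overlap [word_offset, word_offset + num_words) and
# re-index them relative to the window start.
import numpy as np

word_offset, num_words = 10, 20
gold_starts = np.array([2, 12, 28])
gold_ends = np.array([4, 14, 31])
keep = np.logical_and(gold_ends >= word_offset, gold_starts < word_offset + num_words)
print(gold_starts[keep] - word_offset)  # [ 2 18]
print(gold_ends[keep] - word_offset)    # [ 4 21]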


if __name__ == "__main__":
    objective = "train"
    config = util.get_config("experiments.conf")['best']
    word_embeddings = [util.load_embedding_dict(emb["path"], emb["size"], emb["format"]) for emb in config["embeddings"]]
    train_dataset = TrainCorefDataset(config, word_embeddings, objective)
    res = ""

    mention_dict = {}

    for i in range(train_dataset.length):
        example = train_dataset.train_examples[i]
        flat_list = [item for sublist in example["sentences"] for item in sublist]
        clean_flat_list = flat_list[:]
        for j, cluster in enumerate(example["clusters"]):
            words = "Cluster {}: [".format(j)
            for mention in cluster:
                word = ""
                flag = False
                for k in range(mention[0], mention[1] + 1):
Example #4
  def __init__(self, config):
    self.config = config

    self.pos_tag_dict = util.load_pos_tags(config["pos_tag_path"])
    self.ner_tag_dict = util.load_pos_tags(config["ner_tag_path"])
    self.categories_dict = util.load_pos_tags(config["categories_path"])

    self.embedding_info = [(emb["size"], emb["lowercase"]) for emb in config["embeddings"]]
    self.embedding_size = sum(size for size, _ in self.embedding_info) # 350
    self.char_embedding_size = config["char_embedding_size"]

    self.glove_embedding_size = 300

    self.char_dict = util.load_char_dict(config["char_vocab_path"])
    self.l = float(config["l"])


    print "l value:", self.l
    print "l adapted:", self.config["l_adapted"]
    
    # glove and turian
    self.embedding_dicts = [util.load_embedding_dict(emb["path"], emb["size"], emb["format"]) for emb in config["embeddings"]]
    
    # glove only
    glove_emb = config["embeddings"][0]
    self.glove_embedding_dict = util.load_embedding_dict(glove_emb["path"], glove_emb["size"], glove_emb["format"])

    self.max_mention_width = config["max_mention_width"]
    self.genres = { g:i for i,g in enumerate(config["genres"]) }
    self.eval_data = None # Load eval data lazily.

    input_props = []
    input_props.append((tf.float32, [None, None, self.embedding_size])) # Text embeddings. --> sentences x words x embedding size
    input_props.append((tf.int32, [None, None, None])) # Character indices.
    input_props.append((tf.int32, [None])) # Text lengths.
    input_props.append((tf.int32, [None])) # Speaker IDs.
    input_props.append((tf.int32, [])) # Genre.
    input_props.append((tf.bool, [])) # Is training.
    input_props.append((tf.int32, [None])) # Gold starts.
    input_props.append((tf.int32, [None])) # Gold ends.
    input_props.append((tf.int32, [None])) # Cluster ids.

    input_props.append((tf.float32, [None, None, len(self.pos_tag_dict)])) # POS tags (one-hot) --> sentences x words x num_tags
    input_props.append((tf.float32, [None, None, len(self.ner_tag_dict)])) # NER indicator variable

    input_props.append((tf.float32, [None, None, len(self.categories_dict)])) # categories

    input_props.append((tf.int32, [None])) # NER IDs. # matching speakers
    input_props.append((tf.float32, [None, None, self.glove_embedding_size])) # categories with glove embeddings

    # DOMAIN ADAPTATION THING
    input_props.append((tf.float32, [len(self.genres)])) # domain labels
    input_props.append((tf.float32, [])) # l

    self.queue_input_tensors = [tf.placeholder(dtype, shape) for dtype, shape in input_props]
    dtypes, shapes = zip(*input_props)
    queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
    self.enqueue_op = queue.enqueue(self.queue_input_tensors)
    self.input_tensors = queue.dequeue()

    self.predictions, self.loss, self.domain_loss, self.domain_predictions, self.values = self.get_predictions_and_loss(*self.input_tensors)
    # self.predictions, self.loss = self.get_predictions_and_loss(*self.input_tensors)
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.reset_global_step = tf.assign(self.global_step, 0)
    learning_rate = tf.train.exponential_decay(self.config["learning_rate"], self.global_step,
                                               self.config["decay_frequency"], self.config["decay_rate"], staircase=True)
    
    self.total_loss = self.loss + self.domain_loss
    trainable_params = tf.trainable_variables()
    # gradients = tf.gradients(self.loss, trainable_params)
    
    gradients = tf.gradients(self.total_loss, trainable_params)
    gradients, _ = tf.clip_by_global_norm(gradients, self.config["max_gradient_norm"])
    optimizers = {
      "adam" : tf.train.AdamOptimizer,
      "sgd" : tf.train.GradientDescentOptimizer
    }
    optimizer = optimizers[self.config["optimizer"]](learning_rate)
    self.train_op = optimizer.apply_gradients(zip(gradients, trainable_params), global_step=self.global_step)
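# For reference, a NumPy sketch of what tf.clip_by_global_norm does to the gradient
# list above: every gradient is rescaled by one common factor so that their joint
# (global) L2 norm does not exceed max_gradient_norm.
import numpy as np

def clip_by_global_norm(grads, max_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = max_norm / max(global_norm, max_norm)
    return [g * scale for g in grads], global_norm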
    def __init__(self, config):
        self.config = config
        self.embedding_info = [
            (emb["size"], emb["lowercase"]) for emb in config["embeddings"]
        ]  #[(300,false)(50,false)]
        self.embedding_size = sum(
            size for size, _ in self.embedding_info)  #350 = 300+50
        self.char_embedding_size = config["char_embedding_size"]  #8
        self.char_dict = util.load_char_dict(
            config["char_vocab_path"])  #all characters + <unk> size 115
        self.embedding_dicts = [
            util.load_embedding_dict(emb["path"], emb["size"], emb["format"])
            for emb in config["embeddings"]
        ]  #dictionary [(43994?,300)(268822,50)]
        self.max_mention_width = config["max_mention_width"]  #10
        self.genres = {g: i
                       for i, g in enumerate(config["genres"])
                       }  #types of corpus documents
        #(news = nw, conversational telephone speech=tc, weblogs=wb, usenet newsgroups, broadcast=bc, talk shows)
        #[bc, bn, mz, nw, pt, tc, wb]
        self.eval_data = None  # Load eval data lazily.

        input_props = []
        input_props.append((torch.FloatTensor, [None, None, self.embedding_size]))  # Text embeddings. [?,?,350]
        input_props.append((torch.IntTensor, [None, None, None]))  # Character indices.
        input_props.append((torch.IntTensor, [None]))  # Text lengths.
        input_props.append((torch.IntTensor, [None]))  # Speaker IDs.
        input_props.append((torch.IntTensor, []))  # Genre.
        input_props.append((torch.ByteTensor, []))  # Is training.
        input_props.append((torch.IntTensor, [None]))  # Gold starts.
        input_props.append((torch.IntTensor, [None]))  # Gold ends.
        input_props.append((torch.IntTensor, [None]))  # Cluster ids.
        self.queue_input_tensors = [
            # None marks a dynamic dimension; size-1 placeholders let the dummy
            # tensors actually be constructed.
            torch.zeros([d if d is not None else 1 for d in shape]).type(dtype)
            for dtype, shape in input_props
        ]
        # dtypes, shapes = zip(*input_props)
        # queue = tf.PaddingFIFOQueue(capacity=10, dtypes=dtypes, shapes=shapes)
        # self.enqueue_op = queue.enqueue(self.queue_input_tensors)
        # self.input_tensors = queue.dequeue()
        self.input_tensors = self.queue_input_tensors  # 9 items from input_props that are split when calling get_predictions_and_loss
        # this is the training step more or less
        self.predictions, self.loss = self.get_predictions_and_loss(
            *self.input_tensors)

        self.global_step = torch.zeros(1)  # stands in for tf.Variable(0, name="global_step", trainable=False)
        # self.reset_global_step = tf.assign(self.global_step, 0)

        # Here you update something based on your prediction and loss.
        trainable_params = [p for p in self.parameters()
                            if p.requires_grad]  # equivalent to tf.trainable_variables() / model.parameters()
        self.loss.backward()  # autograd backward pass: populates .grad on trainable_params
        # Clip the accumulated gradients in place by their global norm.
        nn.utils.clip_grad_norm_(trainable_params,
                                 self.config["max_gradient_norm"])
        optimizers = {
            "adam":
            optim.Adam(trainable_params,
                       lr=self.config["learning_rate"],
                       weight_decay=self.config["decay_rate"]),
            "sgd":
            optim.SGD(trainable_params,
                      lr=self.config["learning_rate"],
                      weight_decay=self.config["decay_rate"])
        }
        optimizer = optimizers[self.config["optimizer"]]

        # ExponentialLR's gamma is the per-step decay factor, i.e. the decay_rate of the TF examples above.
        learning_rate = optim.lr_scheduler.ExponentialLR(
            optimizer, gamma=self.config["decay_rate"])
        learning_rate.step()
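# The TF-to-PyTorch port above wires the backward pass and optimizer inside __init__;
# in idiomatic PyTorch the per-batch update usually lives in its own step function.
# A minimal sketch under that assumption (batch is hypothetical and must match the
# argument order of get_predictions_and_loss):
from torch import nn

def train_step(model, optimizer, batch, max_gradient_norm):
    optimizer.zero_grad()  # clear gradients from the previous step
    predictions, loss = model.get_predictions_and_loss(*batch)
    loss.backward()  # autograd backward pass
    nn.utils.clip_grad_norm_(  # clip accumulated gradients in place by global norm
        [p for p in model.parameters() if p.requires_grad], max_gradient_norm)
    optimizer.step()  # apply the parameter update
    return loss.item()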
if __name__ == "__main__":
    config = util.get_config("experiments.conf")['best']
    device = -1
    model = cm.CorefModel(config)
    if device >= 0:
        model = model.cuda(device)
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    parameters = [param for name, param in parameters]
    optimizer = optim.Adam(parameters, lr=config["learning_rate"])
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                        factor=0.5,
                                                        mode="max",
                                                        patience=2)
    word_embeddings = [
        util.load_embedding_dict(emb["path"], emb["size"], emb["format"])
        for emb in config["embeddings"]
    ]
    train_dataset = cmdata.TrainCorefDataset(config, word_embeddings, "train")
    val_dataset = cmdata.TrainCorefDataset(config, word_embeddings, "test")
    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
                      patience=10,
                      validation_metric="+coref_f1",
                      num_epochs=15,
                      cuda_device=device,
                      grad_norm=config["max_gradient_norm"],
                      grad_clipping=config["max_gradient_norm"],
                      learning_rate_scheduler=None)