Ejemplo n.º 1
0
    def encode_rel_names(self, vocab):
        """Compose one representation per relation name with a CNN.

        Relation names are read from ``self.vocab.rel.rev_names`` (the first
        entry is skipped) and converted to word ids and char ids with the
        ``vocab`` argument.

        Args:
          vocab: object exposing ``word.sent2ids`` and ``char.sent2ids``.

        Returns:
          The result of ``cnn`` applied to the concatenation (last axis) of
          the word-level and char-level embeddings of the relation names.
        """
        raw_names = self.vocab.rel.rev_names[1:]
        # Sequences are padded at least up to the widest CNN filter.
        # NOTE(review): presumably so every filter has enough positions to
        # convolve over -- confirm against `cnn` / `padding`.
        widest_filter = max(self.rel_cnn.filter_widths)

        word_ids = [vocab.word.sent2ids(name) for name in raw_names]
        char_ids = [vocab.char.sent2ids(name) for name in raw_names]
        word_ids = padding(word_ids, minlen=[widest_filter], maxlen=[0])
        char_ids = padding(char_ids,
                           minlen=[0, widest_filter],
                           maxlen=[0, 0])

        word_encoder = self.encoder.word_encoder
        word_embs = word_encoder.word_encode(word_ids)
        char_embs = word_encoder.char_encode(char_ids)

        num_widths = len(self.rel_cnn.filter_widths)
        with tf.variable_scope('RelWordsComposition'):
            return cnn(
                tf.concat([word_embs, char_embs], axis=-1),
                filter_widths=self.rel_cnn.filter_widths,
                filter_size=int(self.ffnn_size / num_widths),
            )
Ejemplo n.º 2
0
    def formatize_and_print(self_class, flat_batches, predictions, vocab=None):
        '''Decode model predictions into triples/mentions and print each example.

        Args:
        - flat_batches: An iterable of examples. Each example provides
          `query`, `triples`, `mentions.raw`, `mentions.flat_position` and
          `text.flat`.
        - predictions: A list of a tuple (relations, mention_spans), one per
          example. `relations` yields (subj_rel_id, obj_rel_id) pairs and
          `mention_spans` yields (mention_start, mention_end) pairs; the end
          index is inclusive.
        - vocab: Vocabulary whose `rel` member maps relation ids to tokens
          and names. It is dereferenced for every kept prediction, so the
          default of None only works when there is nothing to decode.

        Returns:
        - (triples, mentions): two recDotDicts, each with the keys 'gold' and
          'prediction' holding one element per example.
        '''
        triples = recDotDict({'gold': [], 'prediction': []})
        mentions = recDotDict({'gold': [], 'prediction': []})

        for i, (b, p) in enumerate(zip(flat_batches, predictions)):
            query = b.query
            num_words = len(b.text.flat)

            predicted_triples = recDotDefaultDict()
            predicted_triples.subjective = []
            predicted_triples.objective = []

            gold_mentions = [
                recDotDict({
                    'raw': raw,
                    'flat_position': position
                }) for raw, position in zip(b.mentions.raw,
                                            b.mentions.flat_position)
            ]

            predicted_mentions = []
            for (subj_rel_id, obj_rel_id), (mention_start,
                                            mention_end) in zip(*p):
                # Skip padded spans and spans that run past the text.
                # mention_end is inclusive (see the slice below), so the
                # largest valid value is num_words - 1.
                if (mention_start, mention_end) == (PAD_ID, PAD_ID):
                    continue
                if mention_end >= num_words:
                    continue

                mention = recDotDict()
                mention.raw = ' '.join(
                    b.text.flat[mention_start:mention_end + 1])
                mention.flat_position = (mention_start, mention_end)
                predicted_mentions.append(mention)

                # UNK_ID marks "no relation" for the given direction.
                if subj_rel_id != vocab.rel.UNK_ID:
                    rel = dotDict({
                        'raw': vocab.rel.id2token(subj_rel_id),
                        'name': vocab.rel.id2name(subj_rel_id),
                    })
                    predicted_triples.subjective.append([query, rel, mention])
                if obj_rel_id != vocab.rel.UNK_ID:
                    rel = dotDict({
                        'raw': vocab.rel.id2token(obj_rel_id),
                        'name': vocab.rel.id2name(obj_rel_id),
                    })
                    predicted_triples.objective.append([mention, rel, query])

            triples.gold.append(b.triples)
            triples.prediction.append(predicted_triples)
            mentions.gold.append(gold_mentions)
            mentions.prediction.append(predicted_mentions)

            _id = BOLD + '<%04d>' % (i) + RESET
            print(_id)
            self_class.print_example(
                b, vocab, prediction=[predicted_triples, predicted_mentions])
            print('')
        return triples, mentions
Ejemplo n.º 3
0
    def __init__(self, sess, config):
        """Read hyper-parameters from `config` and create the input placeholders.

        Args:
          sess: forwarded to the parent constructor.
          config: configuration object. This constructor reads
            `hidden_activation`, `output_activation` (names of functions in
            `tf.nn`), `hidden_size`, `num_ff_layers`, `vocab_size`,
            `max_num_card`, `td_gamma`, `num_next_candidates_samples` and
            `dropout_rate`.
        """
        super().__init__(sess, config)
        self.hidden_activation = getattr(tf.nn, config.hidden_activation)
        self.output_activation = getattr(tf.nn, config.output_activation)
        self.hidden_size = config.hidden_size
        self.num_ff_layers = config.num_ff_layers
        self.vocab_size = config.vocab_size
        self.max_num_card = config.max_num_card
        self.td_gamma = config.td_gamma

        # Width of one state vector; shared by `state` and `next_state`.
        state_dim = config.vocab_size.card * (config.max_num_card + 1)

        # Define placeholders.
        with tf.name_scope('Placeholders'):
            self.ph = dotDict()
            self.ph.is_training = tf.placeholder(tf.bool,
                                                 name='is_training',
                                                 shape=[])
            self.ph.state = tf.placeholder(
                tf.float32,
                name='ph.state',
                shape=[None, NUM_TURNS, state_dim])
            self.ph.candidates = tf.placeholder(
                tf.int32,
                name='ph.candidates',
                shape=[None, NUM_TURNS, NUM_CANDIDATES])

            self.ph.next_state = tf.placeholder(
                tf.float32,
                name='ph.next_state',
                shape=[None, NUM_TURNS, state_dim])

            self.ph.next_candidates = tf.placeholder(
                tf.int32,
                name='ph.next_candidates',
                shape=[
                    None, NUM_TURNS, config.num_next_candidates_samples,
                    NUM_CANDIDATES
                ])

            self.ph.action = tf.placeholder(tf.int32,
                                            name='ph.action',
                                            shape=[None, NUM_TURNS])
            self.ph.reward = tf.placeholder(tf.float32,
                                            name='ph.reward',
                                            shape=[None, NUM_TURNS])
            self.ph.is_end_state = tf.placeholder(tf.bool,
                                                  name='ph.is_end_state',
                                                  shape=[None, NUM_TURNS])

        # keep_prob is 1.0 at inference time and (1 - dropout_rate) while
        # training.
        with tf.name_scope('keep_prob'):
            self.keep_prob = 1.0 - tf.to_float(
                self.ph.is_training) * config.dropout_rate
Ejemplo n.º 4
0
  def get_updates_by_task(self):
    """Build the update for every task under that task's variable scope.

    Returns:
      A dotDict mapping each task name in `self.tasks` to the value returned
      by the parent class's `get_updates(loss, global_step)` for that task.
    """
    updates = dotDict()
    for task_name, task_model in self.tasks.items():
      # Each scope is opened with default settings; no reuse flag is passed.
      with tf.variable_scope(task_name):
        updates[task_name] = super().get_updates(task_model.loss,
                                                 task_model.global_step)
    return updates
Ejemplo n.º 5
0
    def setup_placeholders(self, config):
        """Create every input placeholder of the model.

        Args:
          config: configuration object; `vocab_size.card`, `max_num_card`
            and `num_next_candidates_samples` are read to size the shapes.

        Returns:
          A dotDict with the placeholders `is_training`, `state`,
          `candidates`, `next_state`, `next_candidates`, `action`, `reward`,
          `is_end_state`, `is_sente` and `current_num_cards`.
        """
        # Width of one state vector; shared by `state` and `next_state`.
        state_dim = config.vocab_size.card * (config.max_num_card + 1)

        with tf.name_scope('Placeholders'):
            ph = dotDict()
            ph.is_training = tf.placeholder(tf.bool,
                                            name='is_training',
                                            shape=[])
            ph.state = tf.placeholder(
                tf.int32,
                name='ph.state',
                shape=[None, NUM_TURNS, state_dim])

            # Each placeholder carries its own graph name so that tensors
            # can be told apart when inspecting the graph.
            ph.candidates = tf.placeholder(
                tf.int32,
                name='ph.candidates',
                shape=[None, NUM_TURNS, NUM_CANDIDATES])

            ph.next_state = tf.placeholder(
                tf.int32,
                name='ph.next_state',
                shape=[None, NUM_TURNS, state_dim])

            ph.next_candidates = tf.placeholder(
                tf.int32,
                name='ph.next_candidates',
                shape=[
                    None, NUM_TURNS, config.num_next_candidates_samples,
                    NUM_CANDIDATES
                ])

            ph.action = tf.placeholder(tf.int32,
                                       name='ph.action',
                                       shape=[None, NUM_TURNS])
            ph.reward = tf.placeholder(tf.float32,
                                       name='ph.reward',
                                       shape=[None, NUM_TURNS])
            ph.is_end_state = tf.placeholder(tf.bool,
                                             name='ph.is_end_state',
                                             shape=[None, NUM_TURNS])
            ph.is_sente = tf.placeholder(tf.int32,
                                         name='ph.is_sente',
                                         shape=[None, NUM_TURNS, 2])
            ph.current_num_cards = tf.placeholder(tf.int32,
                                                  name='ph.current_num_cards',
                                                  shape=[None, NUM_TURNS])
        return ph
Ejemplo n.º 6
0
    def __init__(self, sess, config, vocab, activation=tf.nn.relu):
        """Build all tasks, then the losses and updates of the trainable ones.

        Args:
          sess: forwarded to the parent constructor and to `setup_tasks`.
          config: full configuration; `config.trainer` goes to the parent.
          vocab: forwarded to the parent constructor.
          activation: accepted for interface compatibility; not used here.
        """
        super(MTLTrainerBase, self).__init__(sess, config.trainer, vocab)

        self.tasks = self.setup_tasks(sess, config)

        # A task counts as trainable only when it exposes a non-None loss.
        trainable = {}
        for task_name, task in self.tasks.items():
            if getattr(task, 'loss', None) is not None:
                trainable[task_name] = task
        self.trainable_tasks = dotDict(trainable)

        self.losses = [
            task.loss for task in self.trainable_tasks.values()
        ]
        self.updates = self.get_updates(self.trainable_tasks)
Ejemplo n.º 7
0
    def setup_tasks(self, sess, config):
        """Instantiate one model per configured task, each on its own device.

        Args:
          sess: passed to every task model constructor.
          config: configuration whose `tasks` maps task names to task configs.

        Returns:
          A dotDict mapping each task name to its model instance.
        """
        task_items = list(config.tasks.items())
        # Devices are assigned by task position, before any model is built.
        devices = [assign_device(idx) for idx in range(len(task_items))]

        tasks = dotDict()
        for device, (task_name, task_config) in zip(devices, task_items):
            sys.stdout.write('Building %s model to %s...\n' %
                             (task_name, device))
            with tf.variable_scope(task_name, reuse=tf.AUTO_REUSE):
                with tf.device(device):
                    model_class = available_models[task_config.model_type]
                    tasks[task_name] = model_class(sess, task_config, self,
                                                   self.vocab)
        return tasks
Ejemplo n.º 8
0
    def __init__(self, sess, config, trainer, vocab):
        """Create placeholders and embeddings, then encode the board.

        Champion and role embeddings are looked up from `self.ph.picks` and
        `self.ph.roles`, concatenated on the last axis, and passed to
        `self.encode` inside `trainer.shared_scope`.

        Args:
          sess, config, trainer, vocab: forwarded to the parent constructor.
            `config.emb_size` and `config.num_ffnn_layers` are also read here.
        """
        super(LoLBase, self).__init__(sess, config, trainer, vocab)

        self.ph = self.setup_placeholder(config)

        champion_shape = [vocab.champion.size, config.emb_size.champion]
        role_shape = [vocab.role.size, config.emb_size.role]

        self.embeddings = dotDict()
        self.embeddings.champions = initialize_embeddings(
            'champions', champion_shape)
        self.embeddings.roles = initialize_embeddings('role', role_shape)

        picked_champion_embs = tf.nn.embedding_lookup(
            self.embeddings.champions, self.ph.picks)
        picked_role_embs = tf.nn.embedding_lookup(self.embeddings.roles,
                                                  self.ph.roles)
        board_inputs = tf.concat([picked_champion_embs, picked_role_embs],
                                 axis=-1)

        with tf.variable_scope(trainer.shared_scope):
            self.board_repls = self.encode(board_inputs,
                                           config.num_ffnn_layers)
Ejemplo n.º 9
0
    def setup_tasks(self, sess, config):
        """Build one replica of every task model per GPU device string.

        At least one device string ('/gpu:0') is always used, even when
        `get_available_gpus()` returns nothing.

        Args:
          sess: passed to every task model constructor.
          config: configuration whose `tasks` maps task names to task configs.

        Returns:
          A dotDict mapping each task name to a MultiModelWrapper around its
          per-device replicas.
        """
        num_gpus = max(1, len(get_available_gpus()))
        gpu_devices = ['/gpu:%d' % gpu_idx for gpu_idx in range(num_gpus)]
        sys.stdout.write('Available GPUs: %s\n' % str(gpu_devices))

        tasks = dotDict()
        for task_name, task_config in config.tasks.items():
            replicas = []
            for device in gpu_devices:
                sys.stdout.write('Building %s model to %s...\n' %
                                 (task_name, device))
                # AUTO_REUSE is requested so that replicas opened under the
                # same scope name can share variables.
                with tf.variable_scope(task_name, reuse=tf.AUTO_REUSE):
                    with tf.device(device):
                        model_class = available_models[task_config.model_type]
                        replica = model_class(sess, task_config, self,
                                              self.vocab)
                    replicas.append(replica)
            tasks[task_name] = MultiModelWrapper(replicas)

        return tasks
Ejemplo n.º 10
0
 def setup_tasks(self, sess, config):
     """Build the single configured task, replicated on every available GPU.

     Args:
       sess: passed to the task model constructor.
       config: configuration whose `tasks` mapping must hold exactly one
         entry.

     Returns:
       A dotDict with one key (the task name) mapped to a MultiModelWrapper
       around the built model(s).

     Raises:
       ValueError: if `config.tasks` does not contain exactly one task.
     """
     # Explicit check instead of `assert`, which is stripped under
     # `python -O` and would silently disable this validation.
     if len(config.tasks) != 1:
         raise ValueError("%s can execute only one type of task." %
                          (self.__class__.__name__))
     task_name = list(config.tasks.keys())[0]
     task_config = list(config.tasks.values())[0]
     model_type = available_models[task_config.model_type]
     num_gpus = len(tf_utils.get_available_gpus())
     if not num_gpus:
         # No GPU: build one model without pinning it to a device.
         with tf.variable_scope(task_name, reuse=tf.AUTO_REUSE):
             models = [model_type(sess, task_config, self, self.vocab)]
     else:
         models = []
         for i in range(num_gpus):
             device = '/gpu:%d' % (i)
             with tf.variable_scope(task_name, reuse=tf.AUTO_REUSE):
                 with tf.device(device):
                     models.append(
                         model_type(sess, task_config, self, self.vocab))
     tasks = dotDict({task_name: MultiModelWrapper(models)})
     return tasks
Ejemplo n.º 11
0
  def __init__(self, sess, config, vocab, activation=tf.nn.relu):
    """Build the shared encoders and every task model listed in the config.

    Args:
      sess: forwarded to the parent constructor and to `define_task`.
      config: configuration; `config.encoder` and `config.tasks` are read.
      vocab: vocabulary object; `vocab.encoder` is given to the word encoder.
      activation: not referenced in this constructor.

    Raises:
      ValueError: if a TaskAdversarial task is not the last task in
        `config.tasks`.
    """
    super(MTLManager, self).__init__(sess, config)
    self.is_training = tf.placeholder(tf.bool, name='is_training', shape=[]) 
    self.vocab = vocab

    # with tf >= 1.2, the scope where a RNNCell is called first is cached and the variables are automatically reused.
    with tf.variable_scope("WordEncoder") as scope:
      self.word_encoder = WordEncoder(config.encoder, self.is_training, 
                                      vocab.encoder,
                                      shared_scope=scope)
    with tf.variable_scope("GlobalEncoder") as scope:
      self.shared_sent_encoder = SentenceEncoder(config.encoder, self.is_training,
                                                 self.word_encoder,
                                                 shared_scope=scope)
    ## Define each task
    self.tasks = dotDict()
    for i, (task_name, task_config) in enumerate(config.tasks.items()):
      # The GPU list is queried again on every iteration; tasks are spread
      # over the GPUs round-robin by their position in the config.
      num_gpus = len(tf_utils.get_available_gpus())
      if num_gpus:
        device = '/gpu:%d' % (i % num_gpus)
        sys.stderr.write('Building %s model to %s...\n' % (task_name, device))
      else:
        device = None
        sys.stderr.write('Building %s model to cpu ...\n' % (task_name))

      with tf.variable_scope(task_name) as encoder_scope:
        #with tf.variable_scope('SentenceEncoder') as encoder_scope:
        encoder = self.get_sent_encoder(
          config.encoder, task_config.use_local_rnn, encoder_scope)
        # NOTE(review): this ordering check runs only after the encoder for
        # the offending task has already been requested.
        if i != len(config.tasks) - 1 and available_models[task_config.model_type] == TaskAdversarial:
          raise ValueError('Adversarial task must be on the last of tasks in the config.')
        task = self.define_task(sess, task_config, encoder, device)
      self.tasks[task_name] = task
      # Prints the whole task dict after each task is added.
      print(self.tasks)
    self.losses = [t.loss for t in self.tasks.values()]
    self.updates = self.get_updates()
Ejemplo n.º 12
0
    def __init__(self, args, sess):
        """Load the config, then build the vocabularies and the datasets.

        Args:
          args: forwarded to the parent constructor and to `load_config`.
          sess: forwarded to the parent constructor.
        """
        super().__init__(args, sess)
        self.config = config = self.load_config(args)
        self.model = None

        # Vocabularies: a fixed role vocabulary plus one per config entry,
        # whose class is looked up by name in `core.vocabulary`.
        self.vocab = vocab = dotDict()
        vocab.role = FeatureVocabulary(
            ['TOP', 'MIDDLE', 'ADC', 'SUPPORT', 'JUNGLE'])
        for vocab_name, vocab_config in self.config.vocab.items():
            vocab_class = getattr(core.vocabulary, vocab_config.vocab_type)
            vocab[vocab_name] = vocab_class(vocab_config)

        # Datasets: one per task that declares a 'dataset' section; the
        # class is looked up by name in `core.dataset`.
        self.dataset = datasets = recDotDict()
        for task_name, task_config in config.tasks.items():
            print(task_config)
            if 'dataset' not in task_config:  # task without data
                continue
            dataset_class = getattr(core.dataset,
                                    task_config.dataset.dataset_type)
            datasets[task_name] = dataset_class(task_config.dataset, vocab)
Ejemplo n.º 13
0
    def padding_and_format(self, data, use_sequence_length=True):
        '''Pad (id, encoder_input, decoder_input) triples into a time-major batch.

        Encoder inputs are right-padded with PAD_ID up to
        `self.max_sequence_length`; they are additionally reversed when
        `use_sequence_length` is False. Decoder inputs are wrapped as
        GO_ID + tokens + EOS_ID and then padded to the same length.

        Returns a dotDict with:
          encoder_inputs / decoder_inputs: one int32 array of size batch_size
            per time step.
          target_weights: one float32 array per decoder step; 0.0 where the
            next decoder token is PAD_ID and on the last step, 1.0 elsewhere.
          sequence_length: unpadded encoder lengths, or None when
            `use_sequence_length` is False.
          batch_size: number of examples.
        '''
        num_steps = self.max_sequence_length
        reverse_encoder = not use_sequence_length
        num_examples = len(data)
        example_range = range(num_examples)

        encoder_rows, decoder_rows, encoder_lengths = [], [], []
        for _, source, target in data:
            encoder_lengths.append(len(source))
            # A negative pad count yields an empty list, i.e. no truncation.
            padded_source = source + [PAD_ID] * (num_steps - len(source))
            if reverse_encoder:
                padded_source = padded_source[::-1]
            encoder_rows.append(padded_source)

            decoder_rows.append([GO_ID] + target + [EOS_ID] +
                                [PAD_ID] * (num_steps - len(target) - 2))

        def step_column(rows, step):
            # Gather the token at position `step` of every example.
            return np.array([rows[idx][step] for idx in example_range],
                            dtype=np.int32)

        batch_encoder_inputs = [
            step_column(encoder_rows, step) for step in range(num_steps)
        ]

        last_step = num_steps - 1
        batch_decoder_inputs, batch_weights = [], []
        for step in range(num_steps):
            batch_decoder_inputs.append(step_column(decoder_rows, step))
            # The target of a step is the decoder input one step later; it
            # gets weight 0 when it is padding or when there is no next step.
            batch_weights.append(
                np.array([
                    0.0 if (step == last_step
                            or decoder_rows[idx][step + 1] == PAD_ID) else 1.0
                    for idx in example_range
                ],
                         dtype=np.float32))

        return common.dotDict({
            'encoder_inputs': batch_encoder_inputs,
            'decoder_inputs': batch_decoder_inputs,
            'target_weights': batch_weights,
            'sequence_length':
            encoder_lengths if use_sequence_length else None,
            'batch_size': num_examples,
        })
Ejemplo n.º 14
0
    def evaluate(self, batches, gold_path, mode, official_stdout=False):
        """Run the model over `batches` and report mention and coreference scores.

        Args:
          batches: iterable of examples; each provides `doc_key`, `clusters`,
            `text.raw` and `speakers`.
          gold_path: path handed to `conll.evaluate_conll`.
          mode: string inserted into every summary key ("coref/<mode>/...").
          official_stdout: forwarded to `conll.evaluate_conll`.

        Returns:
          (summary, average_f1, results) where `summary` is built by
          `tf_utils.make_summary`, `average_f1` is the mean CoNLL F1 over the
          returned metrics, and `results` is an OrderedDict keyed by doc_key.
        """
        def _k_to_tag(k):
            named_tags = {
                -3: "oracle",  # use only gold spans.
                -2: "actual",  # use mention_spans as a result of pruning candidate_spans.
                -1: "exact",  # use the same number of candidate_spans as the gold_spans.
                0: "threshold",  # use only candidate_spans with a score greater than 0.
            }
            if k in named_tags:
                return named_tags[k]
            return "{}%".format(k)

        mention_evaluators = {
            k: coref_util.RetrievalEvaluator()
            for k in [-3, -2, -1, 0]
        }

        coref_predictions = {}
        coref_evaluator = metrics.CorefEvaluator()
        results = OrderedDict()

        for example in batches:
            input_feed = self.get_input_feed(example, False)
            gold_starts = input_feed[self.ph.gold_starts]
            gold_ends = input_feed[self.ph.gold_ends]

            outputs = self.sess.run(self.outputs, input_feed)
            (candidate_starts, candidate_ends, candidate_mention_scores,
             mention_starts, mention_ends, antecedents,
             antecedent_scores) = outputs[:7]

            self.evaluate_mentions(candidate_starts, candidate_ends,
                                   mention_starts, mention_ends,
                                   candidate_mention_scores, gold_starts,
                                   gold_ends, example, mention_evaluators)
            predicted_antecedents = self.get_predicted_antecedents(
                antecedents, antecedent_scores)
            coref_predictions[example.doc_key] = self.evaluate_coref(
                mention_starts, mention_ends, predicted_antecedents,
                example.clusters, coref_evaluator)

            results[example.doc_key] = dotDict({
                'raw_text': example.text.raw,
                'speakers': example.speakers,
                'extracted_mentions': list(zip(mention_starts, mention_ends)),
                'predicted_antecedents': predicted_antecedents
            })

            # Outputs beyond the first seven carry generated descriptions:
            # outputs[7] pairs with the predicted mentions, outputs[8] with
            # the gold mentions.
            if len(outputs) > 7:
                mention_descs = {}

                pred_mention_desc = [
                    self.vocab.decoder.word.ids2tokens(s)
                    for s in outputs[7][:, 0, :]
                ]
                gold_mention_desc = [
                    self.vocab.decoder.word.ids2tokens(s)
                    for s in outputs[8][:, 0, :]
                ]
                for s, e, desc in zip(mention_starts, mention_ends,
                                      pred_mention_desc):
                    mention_descs[(s, e)] = desc
                # Gold spans must be paired with the gold-span descriptions;
                # a span present in both sets keeps the gold-side one.
                for s, e, desc in zip(gold_starts, gold_ends,
                                      gold_mention_desc):
                    mention_descs[(s, e)] = desc

                results[example.doc_key].mention_descs = mention_descs
            else:
                results[example.doc_key].mention_descs = []

        summary_dict = {}

        for k, evaluator in sorted(mention_evaluators.items(),
                                   key=operator.itemgetter(0)):
            tags = [
                "mention/{} @ {}".format(t, _k_to_tag(k))
                for t in ("R", "P", "F")
            ]
            results_to_print = []
            for t, v in zip(tags, evaluator.metrics()):
                results_to_print.append("{:<10}: {:.2f}".format(t, v))
                summary_dict["coref/%s/" % mode + t] = v
            print(", ".join(results_to_print))

        conll_results = conll.evaluate_conll(gold_path, coref_predictions,
                                             official_stdout)
        val_types = ('p', 'r', 'f')
        for metric in conll_results:
            for val_type in val_types:
                summary_dict["coref/%s/%s/%s" %
                             (mode, metric,
                              val_type)] = conll_results[metric][val_type]
            print("%s (%s) : %s" % (metric, ", ".join(val_types), " ".join(
                ["%.2f" % x for x in conll_results[metric].values()])))

        average_f1 = sum(
            conll_res["f"]
            for conll_res in conll_results.values()) / len(conll_results)
        summary_dict["coref/%s/Average F1 (conll)" % mode] = average_f1
        print("Average F1 (conll): {:.2f}%".format(average_f1))

        p, r, f = coref_evaluator.get_prf()
        summary_dict["coref/%s/Average F1 (py)" % mode] = f
        print("Average F1 (py): {:.2f}%".format(f * 100))
        summary_dict["coref/%s/Average precision (py)" % mode] = p
        print("Average precision (py): {:.2f}%".format(p * 100))
        summary_dict["coref/%s/Average recall (py)" % mode] = r
        print("Average recall (py): {:.2f}%".format(r * 100))

        aligned_results = coref_evaluator.get_aligned_results()
        for doc_key, aligned in zip(results, aligned_results):
            results[doc_key]['aligned_results'] = aligned

        return tf_utils.make_summary(summary_dict), average_f1, results
Ejemplo n.º 15
0
from core.models.wikiP2D.desc.desc import DescriptionGeneration
from core.models.wikiP2D.category.category import CategoryClassification
from core.models.wikiP2D.graph.graph import GraphLinkPrediction
from core.models.wikiP2D.relex.relex_base import RelationExtraction
from core.models.wikiP2D.coref.coref import CoreferenceResolution
from core.models.wikiP2D.adversarial import TaskAdversarial

# Registry of task model classes, keyed by class name so that a config can
# select a model through its `model_type` string.
available_models = dotDict({
  model_class.__name__: model_class
  for model_class in (
    CategoryClassification,
    DescriptionGeneration,
    GraphLinkPrediction,
    RelationExtraction,
    CoreferenceResolution,
    TaskAdversarial,
  )
})


def get_multi_encoder(config, shared_sent_encoder, word_encoder,
                      is_training, scope):
  """Pair the shared sentence encoder with a newly built private one.

  The private SentenceEncoder is created inside `scope`; the result wraps
  [shared_sent_encoder, private encoder] in a MultiEncoderWrapper.
  """
  with tf.variable_scope(scope):
    task_private_encoder = SentenceEncoder(
      config, is_training, word_encoder, shared_scope=scope)
  return MultiEncoderWrapper([shared_sent_encoder, task_private_encoder])


##############################
##      MTL Manager
##############################
Ejemplo n.º 16
0
    def article2entries(self, article):
        '''Convert one article into a list holding at most one entry.

        Args:
        - article: provides `qid`, `text` (list of tokenized sentences),
          `flat_text`, `link` (qid -> (sentence index, (begin, end))),
          `num_words`, and `triples.subjective.ids` / `triples.objective.ids`.

        Returns:
        - [] when the article has fewer than `self.min_triples` triples in
          total, otherwise [entry].
        '''
        def qid2entity(qid, article):
            assert qid in article.link
            s_id, (begin, end) = article.link[qid]

            # The offset is the number of words in previous sentences.
            offset = sum(len(sent) for sent in article.text[:s_id])
            entity = recDotDefaultDict()
            # Replace entity's name with the actual representation in the article.
            entity.raw = ' '.join(article.text[s_id][begin:end + 1])
            entity.position = article.link[qid]
            entity.flat_position = (begin + offset, end + offset)
            return entity

        entry = recDotDefaultDict()
        entry.qid = article.qid

        entry.text.raw = article.text
        entry.text.flat = article.flat_text
        entry.text.word = [self.vocab.word.sent2ids(s) for s in article.text]
        entry.text.char = [self.vocab.char.sent2ids(s) for s in article.text]

        entry.query = qid2entity(article.qid, article)  # (begin, end)

        # Articles which contain triples less than self.min_triples are discarded since they can be incorrect.
        num_triples = (len(article.triples.subjective.ids) +
                       len(article.triples.objective.ids))
        if num_triples < self.min_triples:
            return []
        entry.mentions.raw = []
        entry.mentions.flat_position = []

        for t_type in ['subjective', 'objective']:
            entry.triples[t_type] = []
            # target[t_type][begin][width - 1] holds the relation id of the
            # span starting at `begin`; UNK_ID means "no relation".
            entry.target[t_type] = [[
                self.vocab.rel.UNK_ID for j in range(self.max_mention_width)
            ] for i in range(article.num_words)]

            for triple in article.triples[t_type].ids:  # [subj, rel, obj]
                is_subjective = triple[0] == article.qid
                # Normalize to (query, relation, mention) order.
                _query_qid, rel_pid, mention_qid = (
                    triple if is_subjective else reversed(triple))
                # TODO: What if the same mention has different relations
                # with the query?
                mention = qid2entity(mention_qid, article)
                entry.mentions.raw.append(mention.raw)
                entry.mentions.flat_position.append(mention.flat_position)

                rel = dotDict({
                    'raw': rel_pid,
                    'name': self.vocab.rel.token2name(rel_pid)
                })

                # Mentions wider than max_mention_width get no target.
                begin, end = mention.flat_position
                if end - begin < self.max_mention_width:
                    entry.target[t_type][begin][
                        end - begin] = self.vocab.rel.token2id(rel_pid)

                if is_subjective:
                    entry_triple = [entry.query, rel, mention]
                else:
                    entry_triple = [mention, rel, entry.query]
                entry.triples[t_type].append(entry_triple)

        # TODO: For now this experiments focus only on subjective relations.
        entry.triples.objective = []
        entry.loss_weights_by_label = [1.0 for _ in range(self.vocab.rel.size)]

        entry.num_mentions = len(entry.mentions.flat_position)
        return [entry]
Ejemplo n.º 17
0
# coding: utf-8
from core.utils.common import dotDict, recDotDefaultDict, recDotDict

# Numeric queue identifiers, keyed by a readable queue name.
# NOTE(review): 420 is presumably the id an external game API uses for the
# ranked solo queue -- confirm against that API's constants.
QUEUEID = dotDict({
    'solo_ranked': 420,
})

# Numeric team identifiers, keyed by team colour.
# NOTE(review): 100 / 200 presumably mirror the team ids found in the match
# data this project consumes -- confirm.
TEAMID = dotDict({
    'blue': 100,
    'red': 200,
})
Ejemplo n.º 18
0
    def __init__(self, sess, config, encoder, activation=tf.nn.relu):
        """Build the graph that scores (subject, relation, object) links.

        Args:
          sess: stored on the instance and forwarded to the parent
            constructor.
          config: configuration; `dropout_rate`, `ffnn_size`,
            `cnn.filter_widths` and `cnn.filter_size` are read.
          encoder: sentence encoder providing `is_training`, `wbase`,
            `cbase`, `encode`, `get_batched_mention_emb` and `word_encoder`.
          activation: stored as `self.activation`.
        """
        super(GraphLinkPrediction, self).__init__(sess, config)
        self.sess = sess
        self.encoder = encoder
        self.activation = activation

        self.is_training = encoder.is_training
        # 1.0 at inference time, (1 - dropout_rate) while training.
        self.keep_prob = 1.0 - tf.to_float(
            self.is_training) * config.dropout_rate
        self.ffnn_size = config.ffnn_size
        self.cnn_filter_widths = config.cnn.filter_widths
        self.cnn_filter_size = config.cnn.filter_size

        # Placeholders
        with tf.name_scope('Placeholder'):
            self.ph = recDotDefaultDict()
            # Word/char inputs exist only when the encoder uses that input
            # type (encoder.wbase / encoder.cbase); otherwise they are None.
            self.ph.text.word = tf.placeholder(
                tf.int32, name='text.word',
                shape=[None, None]) if self.encoder.wbase else None
            self.ph.text.char = tf.placeholder(
                tf.int32, name='text.char',
                shape=[None, None, None]) if self.encoder.cbase else None

            # Two integers per example; they are unstacked below into
            # mention start and mention end.
            self.ph.subj = tf.placeholder(tf.int32,
                                          name='subj.position',
                                          shape=[None, 2])
            self.ph.obj = tf.placeholder(tf.int32,
                                         name='obj.position',
                                         shape=[None, 2])

            self.ph.rel = dotDict()
            self.ph.rel.word = tf.placeholder(
                tf.int32, name='rel.word',
                shape=[None, None]) if self.encoder.wbase else None
            self.ph.rel.char = tf.placeholder(
                tf.int32, name='rel.char',
                shape=[None, None, None]) if self.encoder.cbase else None
            self.ph.target = tf.placeholder(tf.int32,
                                            name='target',
                                            shape=[None])
            # Sentence length = number of non-zero word ids per row.
            # NOTE(review): self.ph.text.word is None when encoder.wbase is
            # false, so this line presumably requires word inputs -- confirm.
            self.sentence_length = tf.count_nonzero(self.ph.text.word, axis=1)

        with tf.name_scope('Encoder'):
            text_emb, encoder_outputs, encoder_state = self.encoder.encode(
                [self.ph.text.word, self.ph.text.char], self.sentence_length)
            self.encoder_outputs = encoder_outputs

        with tf.variable_scope('Subject') as scope:
            mention_starts, mention_ends = tf.unstack(self.ph.subj, axis=1)
            subj_outputs, _ = self.encoder.get_batched_mention_emb(
                text_emb, encoder_outputs, mention_starts, mention_ends)

        with tf.variable_scope('Object') as scope:
            mention_starts, mention_ends = tf.unstack(self.ph.obj, axis=1)
            obj_outputs, _ = self.encoder.get_batched_mention_emb(
                text_emb, encoder_outputs, mention_starts, mention_ends)

        with tf.variable_scope('Relation') as scope:
            # Stop gradient to prevent biased learning to the words used as relation labels.
            rel_words_emb = tf.stop_gradient(
                self.encoder.word_encoder.encode(
                    [self.ph.rel.word, self.ph.rel.char]))
            # Compose the relation's word embeddings with a CNN.
            with tf.name_scope("compose_words"):
                rel_outputs = cnn(rel_words_emb, self.cnn_filter_widths,
                                  self.cnn_filter_size)

        with tf.variable_scope('Inference'):
            score_outputs = self.inference(subj_outputs, rel_outputs,
                                           obj_outputs)  # [batch_size, 1]
            # Predictions are the scores flattened and rounded.
            self.outputs = tf.round(
                tf.reshape(score_outputs,
                           [shape(score_outputs, 0)]))  # [batch_size]
        with tf.name_scope("Loss"):
            self.losses = self.cross_entropy(score_outputs, self.ph.target)
            self.loss = tf.reduce_mean(self.losses)