Example 1
    def predict_sentence(self, sent):
        """
        Return a predicted label for each word in an arbitrary-length sentence.
        sent - a list of string tokens
        """
        ret = []
        sent_str = " ".join(sent)

        # Extract predicates by looking at verbal POS

        preds = [(word.i, str(word)) for word in spacy_ws(sent_str)
                 if word.tag_.startswith("V")]

        # Calculate number of samples (round up to the nearest multiple of sent_maxlen)
        num_of_samples = np.ceil(
            float(len(sent)) / self.sent_maxlen) * self.sent_maxlen

        # Run RNN for each predicate on this sentence
        for ind, pred in preds:
            cur_sample = self.create_sample(sent, ind)
            X = self.encode_inputs([cur_sample])
            ret.append((
                (ind, pred),
                [
                    [(self.consolidate_label(label), float(prob))
                     for (label, prob) in label_list]
                    for label_list in
                    self.transform_output_probs(
                        self.model.predict(X),  # "flatten" and truncate
                        get_prob=True)[0][:len(sent)]
                ]))
        return ret
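
To make the return structure concrete, here is a minimal usage sketch. It is hypothetical: model stands for a trained instance of the class that defines predict_sentence, and neither the class construction nor the sample sentence comes from this excerpt.

# Hypothetical usage -- model is assumed to be a trained instance of the
# enclosing class; constructing/loading it is not shown in this excerpt.
sent = ["John", "gave", "Mary", "a", "book"]
for (pred_ind, pred_word), per_word_labels in model.predict_sentence(sent):
    # per_word_labels holds, for each token in sent, a list of
    # (consolidated_label, probability) pairs.
    print("Predicate: {} (token #{})".format(pred_word, pred_ind))
    for token, label_probs in zip(sent, per_word_labels):
        print("  {}: {}".format(token, label_probs))
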
Example 2
def convert_single_sent(annotated_sent, verbal):
    """
    Return our format for a single annotated sentence.
    verbal controls whether only verbal extractions should be kept.
    From Mesquita's readme:
    Annotated Sentence:  The sentence annotated with the entity pair, the trigger and allowed tokens.
                        Entities are enclosed in triple square brackets, triggers are enclosed in
                        triple curly brackets and the allowed tokens are enclosed in arrows
                        ("--->" and "<---").
    """
    proc_sent = []
    word_ind = 0
    for word in annotated_sent.split():
        if (word not in SPECIAL_CHARS):
            if "{{{" in word:
                # Index just past the opening "{{{" trigger marker
                bp_ind = word.index('{{{') + 3
                # Plant the index in the correct place
                word = "{}{}_{}".format(word[0:bp_ind], word_ind,
                                        word[bp_ind:])
                word_ind += 1

            elif not (word.startswith("[[[")):
                word = "{}_{}".format(word_ind, word)
                word_ind += 1

        proc_sent.append(word)

    proc_sent = " ".join(proc_sent)
    pred = get_predicate_head(proc_sent)
    raw_sent = get_raw_sent(proc_sent)
    doc = spacy_ws(strip_word_index(raw_sent))

    # Filter non-verbs and empty predicates
    if (not pred) or \
       (verbal and \
        (not doc[int(pred.split("_")[0])].tag_.startswith("V"))):
        return None
    return list(map(strip_word_index, [raw_sent, pred] + get_entities(proc_sent)))
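
For illustration, a hypothetical call. The annotated string below is a guess based on the readme description quoted in the docstring, not a real Mesquita annotation, and it assumes SPECIAL_CHARS, spacy_ws and the get_* / strip_word_index helpers are available from the surrounding module.

# Hypothetical input, loosely following the readme format quoted above;
# real Mesquita annotations may place the markers differently.
ann = "[[[Bill Gates]]] {{{founded}}} [[[Microsoft]]] in 1975 ."
converted = convert_single_sent(ann, verbal=True)
if converted is not None:
    raw_sent, pred = converted[0], converted[1]  # remaining items are entities
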
Example 3
    def encode_inputs(self, sents):
        """
        Given a dataframe which has already been split into sentences,
        encode inputs for RNN classification.
        Returns a dictionary of sequences of samples of length sent_maxlen.
        """
        word_inputs = []
        pred_inputs = []
        pos_inputs = []

        # Preproc to get all preds per run_id
        # Sanity check - make sure that all sents agree on run_id
        assert all(len(set(sent.run_id.values)) == 1 for sent in sents)
        run_id_to_pred = {int(sent.run_id.values[0]): self.get_head_pred_word(sent)
                          for sent in sents}

        # Construct a mapping from running word index to pos
        word_id_to_pos = {}
        for sent in sents:
            indices = sent.index.values
            words = sent.word.values

            for index, word in zip(indices, spacy_ws(" ".join(words))):
                word_id_to_pos[index] = word.tag_

        fixed_size_sents = self.get_fixed_size(sents)

        for sent in fixed_size_sents:

            assert len(set(sent.run_id.values)) == 1

            word_indices = sent.index.values
            sent_words = sent.word.values

            sent_str = " ".join(sent_words)

            pos_tags_encodings = [(SPACY_POS_TAGS.index(word_id_to_pos[word_ind])
                                   if word_id_to_pos[word_ind] in SPACY_POS_TAGS
                                   else 0)
                                  for word_ind in word_indices]

            word_encodings = [self.emb.get_word_index(w) for w in sent_words]

            # Same pred word encodings for all words in the sentence
            pred_word = run_id_to_pred[int(sent.run_id.values[0])]
            pred_word_encodings = [
                self.emb.get_word_index(pred_word) for _ in sent_words
            ]

            word_inputs.append([Sample(w) for w in word_encodings])
            pred_inputs.append([Sample(w) for w in pred_word_encodings])
            pos_inputs.append([Sample(pos) for pos in pos_tags_encodings])

        # Pad / truncate to desired maximum length
        ret = defaultdict(list)

        for name, sequence in zip(
            ["word_inputs", "predicate_inputs", "postags_inputs"],
            [word_inputs, pred_inputs, pos_inputs]):
            for samples in pad_sequences(sequence,
                                         pad_func=lambda: Pad_sample(),
                                         maxlen=self.sent_maxlen):
                ret[name].append([sample.encode() for sample in samples])

        return {k: np.array(v) for k, v in ret.items()}
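
A rough usage sketch follows. The full DataFrame schema is not visible in this excerpt: the columns below (word, run_id) are the ones encode_inputs reads directly, the DataFrame index is assumed to act as a running word id, and get_head_pred_word may require further columns. As before, model stands for a trained instance of the enclosing class.

import pandas as pd

# Hypothetical single-sentence DataFrame; the run_id value and words are
# made up, and get_head_pred_word may expect columns not shown here.
sent_df = pd.DataFrame({"word": ["John", "gave", "Mary", "a", "book"],
                        "run_id": [7, 7, 7, 7, 7]})
feeds = model.encode_inputs([sent_df])
# feeds maps "word_inputs", "predicate_inputs" and "postags_inputs" to
# numpy arrays of shape (num_samples, sent_maxlen), ready to feed into
# the underlying model.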