Example #1
    def predict(self, model, test_data, predict_prob=False):

        XL, XR = test_data

        # get expected length of model input
        model_input_len = model.input_shape[0][2]

        if model_input_len > XL.shape[2]:
            # pad input matrix to fit expected length
            filler = np.ones((1, 1, model_input_len))
            XL, _ = Network._pad_to_match_dimensions(XL,
                                                     filler,
                                                     2,
                                                     pad_left=True)
        else:
            XL = Network._strip_to_length(XL, model_input_len, 2)

        print "predicting..."
        labels = model.predict_classes([XL, XR])
        if predict_prob:
            probs = model.predict_proba([XL, XR])
        else:
            probs = None

        return labels, probs
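
Network._pad_to_match_dimensions and Network._strip_to_length are internal helpers of this project; the standalone sketch below only illustrates the assumed left-padding behaviour with plain NumPy (the helper name and padding value are made up), growing an input tensor along its last axis until it matches the model's expected length.

import numpy as np

def pad_left_to_length(X, target_len, pad_value=0.0):
    """Left-pad X of shape (samples, rows, length) along its last axis."""
    deficit = target_len - X.shape[2]
    if deficit <= 0:
        return X
    filler = np.full((X.shape[0], X.shape[1], deficit), pad_value)
    return np.concatenate((filler, X), axis=2)

XL = np.ones((4, 1, 10))            # hypothetical batch of 4 sequences
XL = pad_left_to_length(XL, 16)     # shape is now (4, 1, 16)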
Example #2
def process_note(note, labels, del_list, label_index, probs):
    # get entity pairs, offsets, tokens, and event/timex entities
    entities = note.get_tlink_id_pairs()
    offsets = note.get_token_char_offsets()

    # flatten list of tokens
    tokenized_text = note.get_tokenized_text()
    tokens = []
    for line in tokenized_text:
        tokens += tokenized_text[line]

    event_timex_labels = []
    # flatten list of labels
    for label_list in note.get_labels():
        event_timex_labels += label_list

    # remove duplicates and sort del_list in descending order
    del_list = list(set(del_list))
    del_list.sort(reverse=True)
    # delete from the back so that earlier indices remain valid
    for index in del_list:
        del entities[index]

    note_labels = labels[label_index:label_index + len(entities)]
    note_label_nums = Network()._convert_str_labels_to_int(note_labels)

    processed_entities = {}
    used_indexes = []
    # for the same entity pairs (regardless of order), only use the best scores
    for i, note_label_num in enumerate(note_label_nums):
        if (entities[i][1], entities[i][0]) in processed_entities:
            if probs[i][note_label_num] > processed_entities[(entities[i][1],
                                                              entities[i][0])]:
                used_indexes.append(i)  # reverse order
            else:
                used_indexes.append(i - 1)
        else:
            processed_entities[(entities[i][0],
                                entities[i][1])] = probs[i][note_label_num]

    note_labels = [note_labels[x] for x in used_indexes]
    entities = [entities[x] for x in used_indexes]
    return event_timex_labels, note_labels, entities, offsets, tokens
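
The dedup loop above assumes the reversed ordering of a pair sits right next to its original. As a standalone sketch of the stated intent (keep only the best-scoring direction of each unordered pair), here is a simpler dict-based version with made-up data, not the project's code:

def dedup_symmetric_pairs(pairs, scores):
    """For each unordered pair, keep the index of its highest-scoring direction."""
    best = {}  # frozenset(pair) -> (score, index)
    for i, pair in enumerate(pairs):
        key = frozenset(pair)
        if key not in best or scores[i] > best[key][0]:
            best[key] = (scores[i], i)
    return sorted(index for _, index in best.values())

pairs = [('e1', 'e2'), ('e2', 'e1'), ('e3', 't0')]
scores = [0.4, 0.9, 0.7]
print(dedup_symmetric_pairs(pairs, scores))  # [1, 2]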
Example #3
    def get_test_input(self, note):
        """Given a note, return data for every token"""

        max_id = len(note.id_to_tok)  # word ids start at 1
        word_vectors = None
        attribute_vectors = None

        for sent_num in note.pre_processed_text:
            for tok in note.pre_processed_text[sent_num]:
                wordID = tok['id']
                word_index = int(wordID[1:])  # wordID example: 'w31'
                left_edge = max(1, word_index - 4)
                right_edge = min(max_id, word_index + 4)

                context_tokens = [
                    note.id_to_tok['w' + str(x)]
                    for x in range(left_edge, right_edge + 1)
                ]
                context_words = [x['token'] for x in context_tokens]
                vecs = self._extract_word_representations(context_words)
                if word_vectors is None:
                    word_vectors = vecs
                else:
                    word_vectors = Network._pad_and_concatenate(word_vectors,
                                                                vecs,
                                                                axis=0)

                attributes = np.array([
                    tok.get('is_main_verb', False),
                    tok.get('is_predicate', False), tok['pos'] == 'V',
                    tok['pos'] == 'N'
                ])
                attributes = attributes[np.newaxis, :]
                if attribute_vectors is None:
                    attribute_vectors = attributes
                else:
                    attribute_vectors = np.concatenate(
                        (attribute_vectors, attributes), axis=0)

        return word_vectors, attribute_vectors
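
note.id_to_tok and the 'w<N>' id scheme come from the project's note objects; the sketch below mocks them with a plain dict just to show how the +/-4 token context window is clipped at the first and last word ids.

# mock of note.id_to_tok: ids 'w1'..'w10', each mapping to a token dict
id_to_tok = dict(('w' + str(i), {'token': 'tok' + str(i)}) for i in range(1, 11))
max_id = len(id_to_tok)

def context_window(word_index, window=4):
    left_edge = max(1, word_index - window)
    right_edge = min(max_id, word_index + window)
    return [id_to_tok['w' + str(x)]['token']
            for x in range(left_edge, right_edge + 1)]

print(context_window(2))   # clipped on the left: ['tok1', ..., 'tok6']
print(context_window(6))   # full window: ['tok2', ..., 'tok10']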
Example #4
def trainNetwork(gold_files, val_files, newsreader_dir, pair_type, ordered=False, no_val=False, nolink_ratio=1.0, callbacks=[], train_dir='./'):
    '''
    Train a neural network for classification of temporal relations.
    '''

    print "Called trainNetwork"

    global N_CLASSES

    if not os.path.isfile(train_dir+'training_data.pkl'):
        notes = get_notes(gold_files, newsreader_dir)
    if not no_val:
        val_notes = get_notes(val_files, newsreader_dir)

    network = Network()
    print "loading word vectors..."
    network.word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)

    if os.path.isfile(train_dir+'training_data.pkl'):
        print "loading pkl file... this may take over 10 minutes"
        training_data = cPickle.load(open(train_dir+'training_data.pkl'))
        print "training data size:", training_data[0].shape, training_data[1].shape, len(training_data[2])
    else:
        # nolink_ratio = (# no-tlink cases) / (# tlink cases)
        training_data = network._get_training_input(notes, pair_type=pair_type, nolink_ratio=nolink_ratio, shuffle=True, ordered=ordered)
        print "training data size:", training_data[0].shape, training_data[1].shape, len(training_data[2])

        if not no_val and val_notes is not None:
            val_data = network._get_test_input(val_notes, pair_type=pair_type, ordered=ordered)
            print "validation data size:", val_data[0].shape, val_data[1].shape, len(val_data[2])
        else:
            val_data = None

        del network.word_vectors
        NNet, history = network.train_model(
            None, epochs=200, training_input=training_data, val_input=val_data,
            no_val=no_val, weight_classes=False, batch_size=100,
            encoder_dropout=0, decoder_dropout=0.5, input_dropout=0.6,
            reg_W=0, reg_B=0, reg_act=0, LSTM_size=256, dense_size=100,
            maxpooling=True, data_dim=300, max_len='auto',
            nb_classes=N_CLASSES, callbacks=callbacks, ordered=ordered)

        return NNet, history
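
A hypothetical invocation (the paths below are made up, and the function also expects the TEA_PATH environment variable to point at a directory containing GoogleNews-vectors-negative300.bin):

import glob

gold_files = sorted(glob.glob('data/train/*.tml'))
val_files = sorted(glob.glob('data/val/*.tml'))
NNet, history = trainNetwork(gold_files, val_files,
                             newsreader_dir='data/newsreader/',
                             pair_type='intra',
                             nolink_ratio=1.0,
                             train_dir='./model_out/')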
Example #5
def main():

    global timenote_imported

    parser = argparse.ArgumentParser()

    parser.add_argument("predict_dir",
                        nargs=1,
                        help="Directory containing test input")

    parser.add_argument(
        "intra_model_path",
        help="Where trained model for intra-sentence pairs is located")

    parser.add_argument(
        "cross_model_path",
        help="Where trained model for cross-sentence pairs is located")

    parser.add_argument(
        "dct_model_path",
        help="Where trained model for events and document creation time is located")

    parser.add_argument("annotation_destination",
                        help="Where annotated files are written")

    parser.add_argument(
        "newsreader_annotations",
        help="Where newsreader pipeline parsed file objects go")

    parser.add_argument(
        "--evaluate",
        action='store_true',
        default=False,
        help="Use gold data from the given files to produce evaluation metrics"
    )

    args = parser.parse_args()

    annotation_destination = args.annotation_destination
    newsreader_dir = args.newsreader_annotations

    if not os.path.isdir(annotation_destination):
        sys.exit("\n\noutput destination does not exist")
    if not os.path.isdir(newsreader_dir):
        sys.exit("invalid path for time note dir")

    predict_dir = args.predict_dir[0]

    if not os.path.isdir(predict_dir):
        sys.exit("\n\nno input directory exists at the given path")

    # pickled_timeml_notes = [os.path.basename(l) for l in glob.glob(newsreader_dir + "/*")]

    if '/*' != args.predict_dir[0][-2:]:
        predict_dir = predict_dir + '/*'

    # get files in directory
    files = glob.glob(predict_dir)

    gold_files = []
    tml_files = []

    for f in files:
        if f.endswith(".TE3input"):  #input file without tlinks
            tml_files.append(f)
        elif f.endswith(".tml"):
            gold_files.append(f)

    gold_files.sort()
    tml_files.sort()
    print "gold_files", gold_files

    # one-to-one pairing of annotated file and un-annotated
    # assert len(gold_files) == len(tml_files)

    network = Network()

    intra_model = model_from_json(
        open(os.path.join(args.intra_model_path, 'intra',
                          '.arch.json')).read())
    intra_model.load_weights(
        os.path.join(args.intra_model_path, 'intra', '.weights.h5'))
    cross_model = model_from_json(
        open(os.path.join(args.cross_model_path, 'cross',
                          '.arch.json')).read())
    cross_model.load_weights(
        os.path.join(args.cross_model_path, 'cross', '.weights.h5'))
    dct_model = model_from_json(
        open(os.path.join(args.dct_model_path, 'dct', '.arch.json')).read())
    dct_model.load_weights(
        os.path.join(args.dct_model_path, 'dct', '.weights.h5'))

    for i, tml in enumerate(gold_files):

        print '\n\nprocessing file {}/{} {}'.format(i + 1, len(gold_files),
                                                    tml)
        if os.path.isfile(
                os.path.join(newsreader_dir,
                             basename(tml) + ".parsed.pickle")):
            tmp_note = cPickle.load(
                open(
                    os.path.join(newsreader_dir,
                                 basename(tml) + ".parsed.pickle"), "rb"))
        else:
            tmp_note = TimeNote(tml, tml)
            cPickle.dump(
                tmp_note,
                open(newsreader_dir + "/" + basename(tml) + ".parsed.pickle",
                     "wb"))

        # notes.append(tmp_note)
        notes = [tmp_note]  # required to be a list

        intra_labels, intra_probs, intra_pair_index = network.single_predict(
            notes, intra_model, 'intra', predict_prob=True)
        intra_labels, intra_pair_index, intra_scores = network.smart_predict(
            intra_labels, intra_probs, intra_pair_index, type='str')

        cross_labels, cross_probs, cross_pair_index = network.single_predict(
            notes, cross_model, 'cross', predict_prob=True)
        cross_labels, cross_pair_index, cross_scores = network.smart_predict(
            cross_labels, cross_probs, cross_pair_index, type='str')

        timex_labels, timex_pair_index = predict_timex_rel(notes)

        dct_labels, dct_probs, dct_pair_index = network.single_predict(
            notes, dct_model, 'dct', predict_prob=True)
        dct_labels = network._convert_int_labels_to_str(dct_labels)
        dct_scores = [max(probs) for probs in dct_probs]
        assert len(dct_labels) == len(dct_scores)

        for i, note in enumerate(notes):
            note_id_pairs = []
            note_labels = []
            note_scores = []

            for key in list(intra_pair_index.keys()):  # {(note_id, (ei, ej)): index}
                # the dictionary is dynamically changing, so we need to check
                if key not in intra_pair_index:
                    continue
                if key[0] == i:
                    note_id_pairs.append(key[1])
                    note_labels.append(intra_labels[intra_pair_index[key]])
                    note_scores.append(intra_scores[intra_pair_index[key]])
                    intra_pair_index.pop(key)
                    opposite_key = (key[0], (key[1][1], key[1][0]))
                    intra_pair_index.pop(opposite_key)

            for key in list(cross_pair_index.keys()):  # {(note_id, (ei, ej)): index}
                # the dictionary is dynamically changing, so we need to check
                if key not in cross_pair_index:
                    continue
                if key[0] == i:
                    note_id_pairs.append(key[1])
                    note_labels.append(cross_labels[cross_pair_index[key]])
                    note_scores.append(cross_scores[cross_pair_index[key]])
                    cross_pair_index.pop(key)
                    opposite_key = (key[0], (key[1][1], key[1][0]))
                    cross_pair_index.pop(opposite_key)

            for key in timex_pair_index.keys():  # {(note_id, (t, t)) : index}
                if key[0] == i:
                    note_id_pairs.append(key[1])
                    note_labels.append(timex_labels[timex_pair_index[key]])
                    note_scores.append(1.0)  # trust timex tlinks
                    timex_pair_index.pop(key)

            for key in dct_pair_index.keys():  # {(note_id, (ei, t0)) : index}
                if key[0] == i:
                    note_id_pairs.append(key[1])
                    note_labels.append(dct_labels[dct_pair_index[key]])
                    note_scores.append(max(dct_probs[dct_pair_index[key]]))
                    #note_scores.append(0.0)
                    dct_pair_index.pop(key)

            # note_labels, note_scores = resolve_coref(note, note_id_pairs, note_labels, note_scores)
            note_labels = modify_tlinks(note_id_pairs, note_labels,
                                        note_scores)
            save_predictions(note, note_id_pairs, note_labels,
                             annotation_destination)
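
The pair-index dictionaries above map (note_id, (id_a, id_b)) to a row index, and for the intra- and cross-sentence models both orderings of a pair are present; once one ordering is consumed, its opposite is popped as well. A minimal sketch of that bookkeeping with made-up keys:

pair_index = {
    (0, ('e1', 'e2')): 0,
    (0, ('e2', 'e1')): 1,
    (0, ('e3', 'e4')): 2,
    (0, ('e4', 'e3')): 3,
}

consumed = []
for key in list(pair_index.keys()):    # snapshot, since we pop while looping
    if key not in pair_index:          # already removed as an opposite ordering
        continue
    note_id, (a, b) = key
    pair_index.pop(key)
    pair_index.pop((note_id, (b, a)))  # drop the opposite ordering too
    consumed.append((a, b))

assert consumed == [('e1', 'e2'), ('e3', 'e4')]
assert pair_index == {}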
Example #6
    def train_model(self,
                    training_data,
                    validation_data=None,
                    model_destination='./',
                    epochs=500,
                    weight_classes=False,
                    batch_size=256,
                    encoder_dropout=0,
                    decoder_dropout=0.5,
                    input_dropout=0.5,
                    reg_W=0,
                    reg_B=0,
                    reg_act=0,
                    LSTM_size=128,
                    dense_size=30,
                    maxpooling=True,
                    data_dim=300,
                    max_len='auto'):

        XL, XR, Y = training_data
        print "training data shape: ", XL.shape

        # reformat labels so that they can be used by the NN
        #Y = to_categorical(Y, 2)

        # use weighting to assist with the imbalanced data set problem
        if weight_classes:
            N = len(Y)
            n_pos = sum(Y)
            neg_weight = 1.0 * n_pos / N  # inversely proportional to frequency
            class_weight = {1: 1 - neg_weight, 0: neg_weight}
        else:
            class_weight = None  # avoid a NameError in model.fit below

        # infer maximum sequence length
        if max_len == 'auto':
            max_len = XL.shape[2]
        # pad input to reach max_len
        else:
            filler = np.ones((1, 1, max_len))
            XL, _ = Network._pad_to_match_dimensions(XL,
                                                     filler,
                                                     2,
                                                     pad_left=True)
            XR, _ = Network._pad_to_match_dimensions(XR,
                                                     filler,
                                                     2,
                                                     pad_left=True)

        model = self.get_untrained_model(encoder_dropout=encoder_dropout,
                                         decoder_dropout=decoder_dropout,
                                         input_dropout=input_dropout,
                                         reg_W=reg_W,
                                         reg_B=reg_B,
                                         reg_act=reg_act,
                                         LSTM_size=LSTM_size,
                                         dense_size=dense_size,
                                         maxpooling=maxpooling,
                                         data_dim=data_dim,
                                         max_len=max_len)

        # split off validation data with a 20/80 split (this way we get the same
        # validation data every time we use this data sample, and can test on it
        # afterwards to get a confusion matrix)
        if validation_data is None:
            V_XL = XL[:(XL.shape[0] / 5), :, :]
            V_XR = XR[:(XR.shape[0] / 5), :, :]
            V_Y = Y[:(Y.shape[0] / 5), :]
            #V_labels = labels[:(Y.shape[0] / 5)]

            XL = XL[(XL.shape[0] / 5):, :, :]
            XR = XR[(XR.shape[0] / 5):, :, :]
            Y = Y[(Y.shape[0] / 5):, :]
        else:
            V_XL, V_XR, V_Y = validation_data

        # train the network
        print 'Training network...'
        earlystopping = EarlyStopping(monitor='val_loss',
                                      patience=20,
                                      verbose=0,
                                      mode='auto')
        checkpoint = ModelCheckpoint(model_destination + 'model.h5',
                                     monitor='val_acc',
                                     save_best_only=True)

        training_history = model.fit([XL, XR],
                                     Y,
                                     nb_epoch=epochs,
                                     validation_split=0,
                                     class_weight=class_weight,
                                     batch_size=batch_size,
                                     validation_data=([V_XL, V_XR], V_Y),
                                     callbacks=[checkpoint, earlystopping])

        test = model.predict_classes([V_XL, V_XR])

        Network.class_confusion(test, V_Y, 2)

        return model, training_history.history
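
A quick worked example of the inverse-frequency class weighting above, assuming binary labels where the positive class is the rare one:

Y = [1] * 10 + [0] * 90             # 10% positive, 90% negative
N = len(Y)
n_pos = sum(Y)
neg_weight = 1.0 * n_pos / N        # 0.1
class_weight = {1: 1 - neg_weight,  # rare positives weighted 0.9
                0: neg_weight}      # frequent negatives weighted 0.1
print(class_weight)                 # {1: 0.9, 0: 0.1}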
Example #7
    def get_input(self, notes, shuffle=True, neg_ratio=3):

        word_vectors = None
        attribute_vectors = None
        labels = []
        for note in notes:
            print "processing file ", note.annotated_note_path
            if hasattr(note, 'event_ids'):
                event_ids = note.event_ids
            else:
                id_chunk_map, event_ids, timex_ids, sentence_chunks = (
                    note.get_id_chunk_map())

            # every event tag corresponds to a list of words, pick the first word
            event_wordIDs = [note.id_to_wordIDs[x][0] for x in event_ids]
            max_id = len(note.id_to_tok)  # word ids start at 1

            all_wordIDs = set(['w' + str(x) for x in range(1, max_id + 1)])
            nonevent_wordIDs = all_wordIDs - set(event_wordIDs)
            n_neg_samples = min(len(nonevent_wordIDs),
                                neg_ratio * len(event_wordIDs))
            nonevent_wordIDs = list(nonevent_wordIDs)[0:n_neg_samples]

            training_wordIDs = event_wordIDs + nonevent_wordIDs

            for wordID in training_wordIDs:
                word_index = int(wordID[1:])  # wordID example: 'w31'
                left_edge = max(1, word_index - 4)
                right_edge = min(max_id, word_index + 4)

                context_tokens = [
                    note.id_to_tok['w' + str(x)]
                    for x in range(left_edge, right_edge + 1)
                ]
                context_words = [x['token'] for x in context_tokens]
                vecs = self._extract_word_representations(context_words)
                if word_vectors is None:
                    word_vectors = vecs
                else:
                    word_vectors = Network._pad_and_concatenate(word_vectors,
                                                                vecs,
                                                                axis=0)

                tok = note.id_to_tok[wordID]
                attributes = np.array([
                    tok.get('is_main_verb', False),
                    tok.get('is_predicate', False), tok['pos'] == 'V',
                    tok['pos'] == 'N'
                ])
                attributes = attributes[np.newaxis, :]
                if attribute_vectors is None:
                    attribute_vectors = attributes
                else:
                    attribute_vectors = np.concatenate(
                        (attribute_vectors, attributes), axis=0)

                if wordID in event_wordIDs:
                    labels.append(1)
                else:
                    labels.append(0)

        if shuffle:
            rng_state = np.random.get_state()
            np.random.shuffle(word_vectors)
            np.random.set_state(rng_state)
            np.random.shuffle(attribute_vectors)
            np.random.set_state(rng_state)
            np.random.shuffle(labels)

        return word_vectors, attribute_vectors, labels
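
The shuffle at the end keeps word vectors, attribute vectors, and labels aligned by restoring the same RNG state before each call; a minimal NumPy sketch of that in-unison shuffle:

import numpy as np

a = np.arange(5)
b = np.arange(5) * 10
state = np.random.get_state()
np.random.shuffle(a)                    # some permutation of a
np.random.set_state(state)
np.random.shuffle(b)                    # the same permutation applied to b
assert list(b) == [x * 10 for x in a]   # rows are still aligned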