def _get_test_input(notes, evaluation=False):
    '''
    get input tensors for prediction. If evaluation is true, gold labels are also extracted
    '''

    # data tensors for left and right SDP
    XL = None
    XR = None

    # list of lists, where each list contains the indices to delete from a given note
    # indices of the outer list corespond to indices of the notes list
    del_lists = []

    tlinklabels = []

    print 'Loading word embeddings...'
    word_vectors = load_word2vec_binary(os.environ["TEA_PATH"]+'/GoogleNews-vectors-negative300.bin', verbose=0)
    # word_vectors = load_word2vec_binary(os.environ["TEA_PATH"]+'/wiki.dim-300.win-8.neg-15.skip.bin', verbose=0)

    print 'Extracting dependency paths...'
    for i, note in enumerate(notes):
        # get the representation for the event/timex pairs in the note
        # will be 3D tensor with axis zero holding the each pair, axis 1 holding the word embeddings
        # (with length equal to word embedding length), and axis 2 hold each word.
        left_vecs, right_vecs, del_list = _extract_path_representations(note, word_vectors)

        # add the note's data to the combine data matrix
        if XL == None:
            XL = left_vecs
        else:
            XL = _pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

        if XR == None:
            XR = right_vecs
        else:
            XR = _pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

        # remove duplicate indices
        del_list = list(set(del_list))
        # remove indices in descending order so that they continue to refer to the item we want to remove
        del_list.sort()
        del_list.reverse()
        del_lists.append(del_list)

        if evaluation:
            # get tlink lables
            note_tlinklabels = note.get_tlink_labels()
            for index in del_list:
                del note_tlinklabels[index]
            tlinklabels += note_tlinklabels

    # pad XL and XR so that they have the same number of dimensions on the second axis
    # any other dimension mis-matches are caused by actually errors and should not be padded away
    XL, XR = _pad_to_match_dimensions(XL, XR, 2, pad_left=True)

    labels = []
    if evaluation:
        labels = _convert_str_labels_to_int(tlinklabels)

    return XL, XR, del_lists, labels
# Beispiel #2
# 0
    def _get_test_input(self, notes, pair_type, ordered=False):
        # data tensor for left and right SDP subpaths
        XL = None
        XR = None

        if self.word_vectors is None:
            print 'Loading word embeddings...'
            self.word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)
            # word_vectors = load_word2vec_binary(os.environ["TEA_PATH"]+'/wiki.dim-300.win-8.neg-15.skip.bin', verbose=0)

        print 'Extracting dependency paths...'
        labels = None
        pair_index = {} # record note id and all the used entity pairs
        index_offset = 0
        for i, note in enumerate(notes):

            # get the representation for the event/timex pairs in the note
            # will be 3D tensor with axis zero holding the each pair, axis 1 holding the word embeddings
            # (with length equal to word embedding length), and axis 2 hold each word.
            # del_list is a list of indices for which no SDP could be obtained
            left_vecs, right_vecs, id_pairs = self._extract_path_representations(note, self.word_vectors, pair_type, ordered=ordered)

            # only do the following for labeled data with tlinks
            # tlinks from test data are used to do evaluation
            if note.id_to_labels:
                note_labels = []
                index_to_reverse = []
                for index, pair in enumerate(id_pairs): # id pairs that have tlinks
                    #pair_index[(i, pair)] = index + index_offset

                    label_from_file = note.id_to_labels.get(pair, 'None')
                    opposite_from_file = note.id_to_labels.get((pair[1], pair[0]), 'None')
                    if label_from_file == 'None' and opposite_from_file != 'None':
                        # print note.annotated_note_path
                        # print "id pair", pair, label_from_file
                        # print "opposite", opposite_from_file
                        index_to_reverse.append(index)
                        note_labels.append(opposite_from_file) # save the opposite lable first, reverse later
                    else:
                        note_labels.append(label_from_file)

                note_labels = self._convert_str_labels_to_int(note_labels)
                labels_to_reverse = [note_labels[x] for x in index_to_reverse]
                reversed = self.reverse_labels(labels_to_reverse)
                print note.annotated_note_path
                print "{} labels augmented".format(len(reversed))

                note_labels = np.array(note_labels, dtype='int16')
                index_to_reverse = np.array(index_to_reverse)
                if index_to_reverse.any():
                    note_labels[index_to_reverse] = reversed

                if labels is None:
                    labels = note_labels
                else:
                    labels =np.concatenate((labels, note_labels))

            for index, pair in enumerate(id_pairs):
                pair_index[(i, pair)] = index + index_offset

            index_offset += len(id_pairs)

            # add the note's data to the combine data matrix
            if XL is None:
                XL = left_vecs
            else:
                XL = Network._pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

            if XR is None:
                XR = right_vecs
            else:
                XR = Network._pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

        # pad XL and XR so that they have the same number of dimensions on the second axis
        # any other dimension mis-matches are caused by actually errors and should not be padded away
        XL, XR = Network._pad_to_match_dimensions(XL, XR, 2, pad_left=True)

        return XL, XR, labels, pair_index
# Beispiel #3
# 0
    def _get_training_input(self, notes, pair_type, nolink_ratio=None, presence=False, shuffle=True, ordered=False):

        # data tensor for left and right SDP subpaths
        XL = None
        XR = None

        if self.word_vectors is None:
            print 'Loading word embeddings...'
            word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)

        print 'Extracting dependency paths...'
        labels = []
        for i, note in enumerate(notes):

            # get the representation for the event/timex pairs in the note
            # will be 3D tensor with axis zero holding the each pair, axis 1 holding the word embeddings
            # (with length equal to word embedding length), and axis 2 hold each word.
            # del_list is a list of indices for which no SDP could be obtained
            left_vecs, right_vecs, id_pairs = self._extract_path_representations(note, self.word_vectors, pair_type, ordered=ordered)

            # perform a random check, to make sure the data is correctly augmented
            if not id_pairs:
                print "No pair found:", note.annotated_note_path
                continue

            pos_case_indexes = []
            neg_case_indexes = []
            note_labels = []
            for index, pair in enumerate(id_pairs):
                if pair in note.id_to_labels:
                    pos_case_indexes.append(index)
                else:
                    neg_case_indexes.append(index)
                note_labels.append(note.id_to_labels.get(pair, 'None'))
            note_labels = np.array(note_labels)

            if nolink_ratio is not None:
                np.random.shuffle(neg_case_indexes)
                n_samples = min(len(neg_case_indexes), int(nolink_ratio * len(pos_case_indexes)) )
                neg_case_indexes = neg_case_indexes[0:n_samples]
                if not neg_case_indexes:
                    training_indexes = np.array(pos_case_indexes, dtype=np.int32)
                else:
                    training_indexes = np.concatenate([pos_case_indexes, neg_case_indexes])
                left_vecs = left_vecs[training_indexes, :, :]
                right_vecs = right_vecs[training_indexes, :, :]
                note_labels = note_labels[training_indexes]

            if labels == []:
                labels = note_labels
            else:
                labels = np.concatenate((labels, note_labels))

            # add the note's data to the combine data matrix
            if XL is None:
                XL = left_vecs
            else:
                XL = Network._pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

            if XR is None:
                XR = right_vecs
            else:
                XR = Network._pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

        # pad XL and XR so that they have the same number of dimensions on the second axis
        # any other dimension mis-matches are caused by actually errors and should not be padded away
        XL, XR = Network._pad_to_match_dimensions(XL, XR, 2, pad_left=True)

        # # extract longest input sequence in the training data, and ensure both matrices
        # input_len = XL.shape[2]

        if presence:
            for i, label in enumerate(labels):
                if label != 0:
                    labels[i] = 1

        if shuffle:
            rng_state = np.random.get_state()
            np.random.shuffle(XL)
            np.random.set_state(rng_state)
            np.random.shuffle(XR)
            np.random.set_state(rng_state)
            np.random.shuffle(labels)
        del notes
        labels = self._convert_str_labels_to_int(labels)

        return XL, XR, labels
def _get_training_input(notes, no_none=False, presence=False, shuffle=True):

    # TODO: handle tlinks linking to the document creation time. at the moment, we simply skip them

    # labels for each SDP pair
    tlinklabels = []

    # data tensor for left and right SDP subpaths
    XL = None
    XR = None

    print 'Loading word embeddings...'
    word_vectors = load_word2vec_binary(os.environ["TEA_PATH"]+'/GoogleNews-vectors-negative300.bin', verbose=0)
    # word_vectors = load_word2vec_binary(os.environ["TEA_PATH"]+'/wiki.dim-300.win-8.neg-15.skip.bin', verbose=0)

    print 'Extracting dependency paths...'
    for i, note in enumerate(notes):
        # get tlink lables
        note_tlinklabels = note.get_tlink_labels()

        # get the representation for the event/timex pairs in the note
        # will be 3D tensor with axis zero holding the each pair, axis 1 holding the word embeddings
        # (with length equal to word embedding length), and axis 2 hold each word.
        # del_list is a list of indices for which no SDP could be obtained
        left_vecs, right_vecs, del_list = _extract_path_representations(note, word_vectors, no_none)

        # add the note's data to the combine data matrix
        if XL == None:
            XL = left_vecs
        else:
            XL = _pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

        if XR == None:
            XR = right_vecs
        else:
            XR = _pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

        # remove duplicate indices
        del_list = list(set(del_list))
        # remove indices in descending order so that they continue to refer to the item we want to remove
        del_list.sort()
        del_list.reverse()
        for index in del_list:
            del note_tlinklabels[index]

        # add remaining labels to complete list of labels
        tlinklabels += note_tlinklabels

    # pad XL and XR so that they have the same number of dimensions on the second axis
    # any other dimension mis-matches are caused by actually errors and should not be padded away
    XL, XR = _pad_to_match_dimensions(XL, XR, 2, pad_left=True)

    # # extract longest input sequence in the training data, and ensure both matrices
    # input_len = XL.shape[2]

    labels = _convert_str_labels_to_int(tlinklabels)

    if presence:
        for i, label in enumerate(labels):
            if label != 0:
                labels[i] = 1

    if shuffle:
        rng_state = np.random.get_state()
        np.random.shuffle(XL)
        np.random.set_state(rng_state)
        np.random.shuffle(XR)
        np.random.set_state(rng_state)
        np.random.shuffle(labels)

    return XL, XR, labels