def _get_test_input(notes, evaluation=False):
    '''Build input tensors for prediction.

    If `evaluation` is True, gold tlink labels are also extracted.

    Returns (XL, XR, del_lists, labels):
        XL, XR    -- 3D tensors for the left/right SDP subpaths
                     (axis 0 = pair, axis 1 = embedding dims, axis 2 = words)
        del_lists -- list of lists; each inner list holds the indices deleted
                     from the corresponding note (pairs with no SDP)
        labels    -- int labels when evaluation=True, else []
    '''
    # combined data tensors for the left and right SDP subpaths
    XL = None
    XR = None

    # per-note deletion indices; outer index corresponds to `notes` index
    del_lists = []
    tlinklabels = []

    print('Loading word embeddings...')
    word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)

    print('Extracting dependency paths...')
    for note in notes:
        # 3D tensor per note: axis 0 holds each pair, axis 1 the word
        # embeddings, axis 2 each word along the path
        left_vecs, right_vecs, del_list = _extract_path_representations(note, word_vectors)

        # append the note's data to the combined matrices.
        # BUG FIX: use `is None`, not `== None` — with numpy arrays
        # `== None` compares elementwise and breaks the truth test.
        if XL is None:
            XL = left_vecs
        else:
            XL = _pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

        if XR is None:
            XR = right_vecs
        else:
            XR = _pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

        # dedupe, then delete in descending order so earlier deletions do not
        # shift the positions of later ones
        del_list = sorted(set(del_list), reverse=True)
        del_lists.append(del_list)

        if evaluation:
            # gold tlink labels with unusable pairs removed
            note_tlinklabels = note.get_tlink_labels()
            for index in del_list:
                del note_tlinklabels[index]
            tlinklabels += note_tlinklabels

    # pad XL and XR to the same size on the second axis; any other dimension
    # mismatch is a real error and should not be padded away
    XL, XR = _pad_to_match_dimensions(XL, XR, 2, pad_left=True)

    labels = []
    if evaluation:
        labels = _convert_str_labels_to_int(tlinklabels)

    return XL, XR, del_lists, labels
def _get_test_input(self, notes, pair_type, ordered=False):
    '''Build test-time input tensors for the left/right SDP subpaths.

    Returns (XL, XR, labels, pair_index):
        XL, XR     -- 3D tensors (axis 0 = pair, axis 1 = embedding dims,
                      axis 2 = words)
        labels     -- int16 numpy array of labels for annotated notes, or
                      None when no note had tlink annotations
        pair_index -- maps (note index, id pair) -> global row index in XL/XR
    '''
    # combined data tensors for the left and right SDP subpaths
    XL = None
    XR = None

    if self.word_vectors is None:
        print('Loading word embeddings...')
        self.word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)

    print('Extracting dependency paths...')
    labels = None
    pair_index = {}  # records note id and all the used entity pairs
    index_offset = 0  # running count of rows contributed by earlier notes

    for i, note in enumerate(notes):
        # 3D tensor per note: axis 0 holds each pair, axis 1 the word
        # embeddings, axis 2 each word along the path
        left_vecs, right_vecs, id_pairs = self._extract_path_representations(note, self.word_vectors, pair_type, ordered=ordered)

        # only labeled data carries tlinks; test-data tlinks drive evaluation
        if note.id_to_labels:
            note_labels = []
            index_to_reverse = []
            for index, pair in enumerate(id_pairs):
                label_from_file = note.id_to_labels.get(pair, 'None')
                opposite_from_file = note.id_to_labels.get((pair[1], pair[0]), 'None')
                if label_from_file == 'None' and opposite_from_file != 'None':
                    # only the flipped pair is annotated: keep the opposite
                    # label for now and reverse it after int conversion
                    index_to_reverse.append(index)
                    note_labels.append(opposite_from_file)
                else:
                    note_labels.append(label_from_file)

            note_labels = self._convert_str_labels_to_int(note_labels)
            labels_to_reverse = [note_labels[x] for x in index_to_reverse]
            # renamed from `reversed` — do not shadow the builtin
            reversed_labels = self.reverse_labels(labels_to_reverse)
            print(note.annotated_note_path)
            print("{} labels augmented".format(len(reversed_labels)))

            note_labels = np.array(note_labels, dtype='int16')
            index_to_reverse = np.array(index_to_reverse)
            # BUG FIX: test `.size`, not `.any()` — `.any()` is False for
            # np.array([0]), silently skipping the reversal when the only
            # pair to reverse sits at index 0
            if index_to_reverse.size:
                note_labels[index_to_reverse] = reversed_labels

            if labels is None:
                labels = note_labels
            else:
                labels = np.concatenate((labels, note_labels))

        # record the global row index of every used pair, labeled or not
        for index, pair in enumerate(id_pairs):
            pair_index[(i, pair)] = index + index_offset
        index_offset += len(id_pairs)

        # append the note's data to the combined matrices
        if XL is None:
            XL = left_vecs
        else:
            XL = Network._pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

        if XR is None:
            XR = right_vecs
        else:
            XR = Network._pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

    # pad XL and XR to the same size on the second axis; any other dimension
    # mismatch is a real error and should not be padded away
    XL, XR = Network._pad_to_match_dimensions(XL, XR, 2, pad_left=True)

    return XL, XR, labels, pair_index
def _get_training_input(self, notes, pair_type, nolink_ratio=None, presence=False, shuffle=True, ordered=False):
    '''Build training input tensors for the left/right SDP subpaths.

    nolink_ratio -- if given, downsample no-link (negative) pairs to at most
                    nolink_ratio * (number of positive pairs) per note
    presence     -- if True, collapse labels to binary: link present (1)
                    vs no link (0)
    shuffle      -- if True, shuffle XL, XR and labels in unison

    Returns (XL, XR, labels).
    '''
    # combined data tensors for the left and right SDP subpaths
    XL = None
    XR = None

    if self.word_vectors is None:
        print('Loading word embeddings...')
        # BUG FIX: the load result was assigned to a local `word_vectors`
        # and discarded, leaving self.word_vectors None for the extraction
        # calls below
        self.word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)

    print('Extracting dependency paths...')
    label_chunks = []  # per-note label arrays, concatenated at the end

    for note in notes:
        # 3D tensor per note: axis 0 holds each pair, axis 1 the word
        # embeddings, axis 2 each word along the path
        left_vecs, right_vecs, id_pairs = self._extract_path_representations(note, self.word_vectors, pair_type, ordered=ordered)

        # sanity check that the data is correctly augmented
        if not id_pairs:
            print("No pair found: " + note.annotated_note_path)
            continue

        pos_case_indexes = []
        neg_case_indexes = []
        note_labels = []
        for index, pair in enumerate(id_pairs):
            if pair in note.id_to_labels:
                pos_case_indexes.append(index)
            else:
                neg_case_indexes.append(index)
            note_labels.append(note.id_to_labels.get(pair, 'None'))
        note_labels = np.array(note_labels)

        if nolink_ratio is not None:
            # randomly downsample the no-link pairs to the requested ratio
            np.random.shuffle(neg_case_indexes)
            n_samples = min(len(neg_case_indexes), int(nolink_ratio * len(pos_case_indexes)))
            neg_case_indexes = neg_case_indexes[0:n_samples]
            if not neg_case_indexes:
                training_indexes = np.array(pos_case_indexes, dtype=np.int32)
            else:
                training_indexes = np.concatenate([pos_case_indexes, neg_case_indexes])
            left_vecs = left_vecs[training_indexes, :, :]
            right_vecs = right_vecs[training_indexes, :, :]
            note_labels = note_labels[training_indexes]

        label_chunks.append(note_labels)

        # append the note's data to the combined matrices
        if XL is None:
            XL = left_vecs
        else:
            XL = Network._pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

        if XR is None:
            XR = right_vecs
        else:
            XR = Network._pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

    # pad XL and XR to the same size on the second axis; any other dimension
    # mismatch is a real error and should not be padded away
    XL, XR = Network._pad_to_match_dimensions(XL, XR, 2, pad_left=True)

    labels = np.concatenate(label_chunks) if label_chunks else []

    # BUG FIX: convert string labels to ints BEFORE the presence check.
    # The original thresholded strings against 0 ('None' != 0 is always
    # True), turning every label into 1 when presence=True.
    labels = self._convert_str_labels_to_int(labels)

    if presence:
        # collapse to binary: any link (1) vs no link (0)
        for i, label in enumerate(labels):
            if label != 0:
                labels[i] = 1

    if shuffle:
        # shuffle XL, XR and labels in unison by replaying the RNG state
        rng_state = np.random.get_state()
        np.random.shuffle(XL)
        np.random.set_state(rng_state)
        np.random.shuffle(XR)
        np.random.set_state(rng_state)
        np.random.shuffle(labels)

    return XL, XR, labels
def _get_training_input(notes, no_none=False, presence=False, shuffle=True):
    '''Build training input tensors from notes.

    presence -- if True, collapse labels to binary: link present (1)
                vs no link (0)
    shuffle  -- if True, shuffle XL, XR and labels in unison

    Returns (XL, XR, labels), where XL/XR are 3D tensors
    (axis 0 = pair, axis 1 = embedding dims, axis 2 = words).
    '''
    # TODO: handle tlinks linking to the document creation time;
    # at the moment we simply skip them

    # labels for each SDP pair, accumulated across notes
    tlinklabels = []

    # combined data tensors for the left and right SDP subpaths
    XL = None
    XR = None

    print('Loading word embeddings...')
    word_vectors = load_word2vec_binary(os.environ["TEA_PATH"] + '/GoogleNews-vectors-negative300.bin', verbose=0)

    print('Extracting dependency paths...')
    for note in notes:
        # gold tlink labels for this note
        note_tlinklabels = note.get_tlink_labels()

        # 3D tensor per note: axis 0 holds each pair, axis 1 the word
        # embeddings, axis 2 each word along the path.
        # del_list holds indices of pairs for which no SDP could be obtained
        left_vecs, right_vecs, del_list = _extract_path_representations(note, word_vectors, no_none)

        # append the note's data to the combined matrices.
        # BUG FIX: use `is None`, not `== None` — with numpy arrays
        # `== None` compares elementwise and breaks the truth test.
        if XL is None:
            XL = left_vecs
        else:
            XL = _pad_and_concatenate(XL, left_vecs, axis=0, pad_left=[2])

        if XR is None:
            XR = right_vecs
        else:
            XR = _pad_and_concatenate(XR, right_vecs, axis=0, pad_left=[2])

        # dedupe, then delete in descending order so earlier deletions do not
        # shift the positions of later ones
        for index in sorted(set(del_list), reverse=True):
            del note_tlinklabels[index]

        # add the remaining labels to the complete list of labels
        tlinklabels += note_tlinklabels

    # pad XL and XR to the same size on the second axis; any other dimension
    # mismatch is a real error and should not be padded away
    XL, XR = _pad_to_match_dimensions(XL, XR, 2, pad_left=True)

    labels = _convert_str_labels_to_int(tlinklabels)

    if presence:
        # collapse to binary: any link (1) vs no link (0)
        for i, label in enumerate(labels):
            if label != 0:
                labels[i] = 1

    if shuffle:
        # shuffle XL, XR and labels in unison by replaying the RNG state
        rng_state = np.random.get_state()
        np.random.shuffle(XL)
        np.random.set_state(rng_state)
        np.random.shuffle(XR)
        np.random.set_state(rng_state)
        np.random.shuffle(labels)

    return XL, XR, labels