def run(self):
    t = time.perf_counter()
    self._write_tiles()
    self._write_dzi()
    elapsed_time = time.perf_counter() - t
    cfg.ver_print("Tiling completed on {0} in: ".format(self._img_name), elapsed_time)
def prepare_embeddings(self, load_bin=True, support_start_stop=True):
    print("Preparing Embeddings ...")

    # get all the sentences; each sentence is a sequence of words (list of words)
    tokens2d = list(itertools.chain.from_iterable([p.tokens2d for p in self.protocols]))
    sents = [[token.word for token in tokens1d] for tokens1d in tokens2d]

    # train a skip-gram model to generate word vectors.
    # Vectors will be of the dimension given by the 'size' parameter.
    print(" Loading Word2Vec ...")
    if load_bin:
        print(" Loading a Massive File ...")
        if not os.path.isfile(cfg.PUBMED_AND_PMC_W2V_BIN):
            url = "http://evexdb.org/pmresources/vec-space-models/PubMed-and-PMC-w2v.bin"
            dirpath = os.path.dirname(cfg.PUBMED_AND_PMC_W2V_BIN)
            print("Downloading Word2Vec resource ...")
            download(url, save_filepath=cfg.PUBMED_AND_PMC_W2V_BIN)

        skip_gram_model = KeyedVectors.load_word2vec_format(cfg.PUBMED_AND_PMC_W2V_BIN, binary=True)
    else:
        skip_gram_model = Word2Vec(sentences=sents, size=cfg.EMBEDDING_DIM, sg=1,
                                   window=10, min_count=1, workers=4)

    cfg.ver_print("word2vec emb size", skip_gram_model.vector_size)

    sent_iter_flat = list(itertools.chain.from_iterable(sents))
    list_of_chars = list(itertools.chain.from_iterable([list(word) for word in sent_iter_flat]))

    self.word_index = self.gen_word_index(sents, support_start_stop)
    self.char_index = gen_list2id_dict(list_of_chars, insert_words=['<w>', '</w>', '<s>', '</s>'])
    print(self.char_index)
    cfg.CHAR_VOCAB = len(self.char_index.items())

    with open('test_tokenizer.txt', 'w', encoding='utf-8') as out:
        out.writelines([item + ' ' + str(self.word_index[item]) + '\n'
                        if item in self.word_index
                        else item + ' ' + str(self.word_index[cfg.UNK]) + '\n'
                        for item in sent_iter_flat])

    embedding_matrix = np.random.uniform(low=-0.01, high=0.01,
                                         size=(len(self.word_index) + 1, cfg.EMBEDDING_DIM))

    print(" Populating Embedding Matrix ...")

    with open(cfg.OOP_FILEPATH, 'w') as f:
        f.write("Out of pre-trained Vocabulary words\n")

    for word, i in self.word_index.items():
        try:
            embedding_vector = skip_gram_model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            # not found in the pre-trained word embedding list.
            with open(cfg.OOP_FILEPATH, 'a') as f:
                f.write('{0}\n'.format(word))
            cfg.ver_print('out of pre-trained vocab word', word)

    return embedding_matrix
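# --- Usage sketch (illustrative, not part of the original pipeline). The matrix returned by
# prepare_embeddings() is row-indexed by self.word_index, so it can seed a torch embedding
# layer. The 'dataset' object and the freeze=False choice below are assumptions.
import torch
import torch.nn as nn

embedding_matrix = dataset.prepare_embeddings(load_bin=True)      # hypothetical corpus object
emb_lookup = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix),
                                          freeze=False)           # fine-tune pre-trained vectors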
def __prep_char_idx_seq(self, sent):
    cfg.ver_print("char_index", self.char_index)

    char_idx_seq = \
        [self.__to_idx_seq([cfg.SENT_START], start=cfg.WORD_START, end=cfg.WORD_END, index=self.char_index)] + \
        [self.__to_idx_seq(list(word), start=cfg.WORD_START, end=cfg.WORD_END, index=self.char_index)
         for word in sent] + \
        [self.__to_idx_seq([cfg.SENT_END], start=cfg.WORD_START, end=cfg.WORD_END, index=self.char_index)]

    cfg.ver_print("char idx seq", char_idx_seq)
    return char_idx_seq
def forward(self, inp_features):
    cfg.ver_print("Inp features", inp_features)
    # inp_features is of size (seq_len x EMB_DIM)

    linear1_out = self.linear1(inp_features)
    tanh_out = self.tanh(linear1_out)
    linear2_out = self.linear2(tanh_out)
    soft_out = self.log_softmax(linear2_out)

    cfg.ver_print("FINAL OUT", soft_out)
    return soft_out
def forward(self, chars):
    out_stack = []
    for word in chars:
        out = self.emb(Variable(cuda.LongTensor(word)))
        out = unsqueeze(out, dim=0)
        out, hidden_state = self.rnn(out, self.hidden_state)

        # TODO verify that this is indeed the last outputs of both forward rnn and backward rnn,
        #      and that we are concatenating correctly
        out = cat([hidden_state[0][0], hidden_state[0][1]], dim=1)
        cfg.ver_print("Hidden state concat", out)

        out = self.linear(out)
        out = self.tanh(out)
        out_stack.append(out)

    final_out = stack(out_stack, dim=1)
    return final_out
def forward(self, minibatch):
    out_stack = []
    minibatch = list(minibatch)
    minibatch_lengths = [len(sent) for sent in minibatch]
    batch_of_words = list(chain.from_iterable(minibatch))
    self.init_state(len(batch_of_words))

    # a hack to get the indices of the sorted words, so we can unsort them back after they are processed
    # print(batch_of_words)
    sent, ridx = self.len_sort(batch_of_words)
    padded, seq_lengths = self.pad(sent, 0)
    # print(padded)

    out = self.emb(Variable(cuda.LongTensor(padded)))
    # out is of size (all_words x max_len x char_emb_size)
    # print("out size: {0}".format(out.size()))

    out = rnn.pack_padded_sequence(out, seq_lengths, batch_first=True)
    out, hidden_state = self.rnn(out, self.hidden_state)
    # hidden_state[0] is of size (num_dir x batch_size x lstm_hidden_dim)
    # print("hidden state size: {0}".format(hidden_state[0].size()))

    # TODO verify
    # Unsorting is IMPORTANT because we initially sorted the char sequences before passing them to the rnn.
    hidden_state = torch.index_select(hidden_state[0], dim=1, index=Variable(cuda.LongTensor(ridx)))

    # TODO verify that this is indeed the last outputs of both forward rnn and backward rnn
    out = cat([hidden_state[0], hidden_state[1]], dim=1)
    # print("cat out size: {0}".format(out.size()))
    cfg.ver_print("Hidden state concat", out)

    out = self.linear(out)
    out = self.tanh(out)
    # print("before split and pad function {0}".format(out.size()))

    # split the flat batch of word embeddings back into per-sentence sequences based on the original lengths
    final_out = self.split_and_pad(out, minibatch_lengths)
    # final_out is of size (batch_size x max_seq_len x emb_size)
    # print(final_out.size())
    return final_out
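# --- Minimal sketch of the sort/pad/pack/unsort pattern the batched forward above relies on.
# The repo's len_sort, pad, and split_and_pad helpers are not shown here, so this reproduces
# the idea with plain PyTorch utilities; all names below are illustrative assumptions, and it
# assumes 'emb' is an nn.Embedding and 'lstm' a single-layer bidirectional nn.LSTM.
import torch
import torch.nn.utils.rnn as rnn_utils

def char_word_vectors(sequences, emb, lstm):
    # sequences: list of 1-d LongTensors of char indices, one per word
    lengths = torch.tensor([len(s) for s in sequences])
    sort_len, sort_idx = lengths.sort(descending=True)    # pack_padded_sequence wants sorted lengths
    unsort_idx = sort_idx.argsort()                        # indices that restore the original order
    padded = rnn_utils.pad_sequence([sequences[i] for i in sort_idx], batch_first=True)
    packed = rnn_utils.pack_padded_sequence(emb(padded), sort_len, batch_first=True)
    _, (h_n, _) = lstm(packed)                             # h_n: (num_dirs x batch x hidden) for one layer
    word_vecs = torch.cat([h_n[0], h_n[1]], dim=1)         # concat final forward/backward states
    return word_vecs[unsort_idx]                           # unsort back to the input word order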
def __gen_sent_idx_seq(self, sent):
    cfg.ver_print("word_index", self.word_index)

    sent_idx_seq = self.__to_idx_seq(sent, start=cfg.SENT_START, end=cfg.SENT_END,
                                     index=self.word_index, oov=self.is_oov)

    cfg.ver_print("sent", sent)
    cfg.ver_print("sent idx seq", sent_idx_seq)
    return sent_idx_seq
def to_categorical(self, labels, bio=False):
    # converts a list of labels to a binary representation: a label is 1 if it is an action-verb, else 0

    def split(_label):
        if _label == cfg.NEG_LABEL:
            _bio_encoding = cfg.NEG_LABEL
            _tag_name = "NoTag"
        else:
            _bio_encoding, _tag_name = _label.split("-", 1)
        return _bio_encoding, _tag_name

    ret = []
    cfg.ver_print("labels", labels)
    for label in labels:
        bio_encoding, tag_name = split(label)
        if tag_name == cfg.POSITIVE_LABEL:
            ret.append(self.tag_idx[bio_encoding])
        else:
            ret.append(self.tag_idx[cfg.NEG_LABEL])
    return ret
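# --- Example (illustrative only): assuming cfg.NEG_LABEL == "O", cfg.POSITIVE_LABEL == "Action",
# and self.tag_idx == {"O": 0, "B": 1, "I": 2}, the conversion keeps B/I indices only for
# action-verb tags and maps every other label to the negative index:
#   to_categorical(["B-Action", "I-Action", "O", "B-Reagent"])  ->  [1, 2, 0, 0]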
def forward(self, x):
    v_l = []
    cfg.ver_print("input to embedding layer", x)
    # x is of size (1 x seq_len)
    seq_len = x.size(1)
    n = x[0].cpu().data.numpy()
    for i in n.tolist():
        v = self.emb_mat[i]
        # v is of size (EMB_DIM)
        # cfg.ver_print("v", v)
        v_l.append(v)

    v = torch.stack(v_l, dim=0)
    # v is of size (seq_len x EMB_DIM)
    v = v.view(1, seq_len, -1)
    # v is now of size (1 x seq_len x EMB_DIM)
    # cfg.ver_print("Embedding out", v)
    return v
def forward(self, sent_idx_seq, char_idx_seq):
    cfg.ver_print("Sent Index sequence", sent_idx_seq)

    padded_seq, seq_lengths = self.pad(sent_idx_seq)
    padded_seq = Variable(torch.LongTensor(padded_seq)).cuda()
    emb = self.emb_lookup(padded_seq)

    if self.char_level == "Input":
        char_emb = self.char_net(char_idx_seq)
        inp = cat([emb, char_emb], dim=2)
    elif self.char_level == "Highway":
        char_emb = self.char_net(char_idx_seq)
        inp = self.highway(emb, char_emb)
    elif self.char_level == "Attention":
        char_emb = self.char_net(char_idx_seq)
        inp = self.att_net(emb, char_emb)
    else:
        inp = emb

    if self.pos_feat == "Yes":
        # NOTE: 'pos' (POS-tag index sequences) is assumed to be supplied alongside the inputs
        # when pos_feat == "Yes"; it is not among this method's parameters.
        padded_pos, seq_len_pos = self.pad(pos)
        padded_pos = Variable(torch.LongTensor(padded_pos)).cuda()
        pos_emb = self.pos_emb(padded_pos)
        inp = cat([inp, pos_emb], dim=2)

    # inp is now of size (1 x seq_len x EMB_DIM)
    cfg.ver_print("Embedding for the Sequence", inp)

    lstm_out, hidden_state = self.mb_lstm_forward(inp, seq_lengths)
    # lstm_out is of size (batch_size x seq_len x 2*EMB_DIM)

    unrolled_lstm_out = self.unpad(lstm_out, seq_lengths)
    # unrolled_lstm_out is of size (label_size x 2*EMB_DIM), where label_size is the number of words in the batch.

    lstm_forward, lstm_backward = lstm_out[:, :, :cfg.LSTM_HIDDEN_SIZE], lstm_out[:, :, -cfg.LSTM_HIDDEN_SIZE:]
    # lstm_forward is of size (batch x max_seq x emb_dim)

    # making sure that we picked out the correct lstm_forward and lstm_backward.
    for i, seq_len in enumerate(seq_lengths):
        assert to_scalar(torch.sum(lstm_forward[i, seq_len - 1, :] - hidden_state[0][0, i, :])) == 0
        assert to_scalar(torch.sum(lstm_backward[i, 0, :] - hidden_state[0][1, i, :])) == 0

    lm_f_out = self.lm_forward(self.unpad(lstm_forward, seq_lengths, skip_start=0, skip_end=1))
    lm_b_out = self.lm_backward(self.unpad(lstm_backward, seq_lengths, skip_start=1, skip_end=0))
    # size of lm_f_out is (batch_size*seq_len x emb_size)

    cfg.ver_print("Language Model Forward pass out", lm_f_out)
    cfg.ver_print("Language Model Backward pass out", lm_b_out)

    lstm_out = self.lstm_linear(unrolled_lstm_out.squeeze())
    lstm_out = torch.sigmoid(lstm_out)
    lstm_out = lstm_out.unsqueeze(dim=0)

    label_out = lstm_out
    linear_out = self.linear(label_out.view(-1, cfg.LSTM_OUT_SIZE))

    if self.isCrossEnt:
        out = linear_out
    else:
        out = self.log_softmax(linear_out)

    cfg.ver_print("LINEAR OUT", linear_out)
    cfg.ver_print("FINAL OUT", out)

    if self.char_level == "Attention":
        unrolled_emb = self.unpad(emb, seq_lengths)
        unrolled_char_emb = self.unpad(char_emb, seq_lengths)
        return lm_f_out, lm_b_out, out, seq_lengths, unrolled_emb, unrolled_char_emb
    else:
        return lm_f_out, lm_b_out, out, seq_lengths