Example #1
    def run(self):
        t = time.perf_counter()
        self._write_tiles()
        self._write_dzi()
        elapsed_time = time.perf_counter() - t
        cfg.ver_print("Tiling completed on {0} in: ".format(self._img_name),
                      "{0:.2f}s".format(elapsed_time))
Example #2
    def prepare_embeddings(self, load_bin=True, support_start_stop=True):
        print("Preparing Embeddings ...")
        # get all the sentences; each sentence is a sequence of words (a list of words)
        tokens2d = list(itertools.chain.from_iterable([p.tokens2d for p in self.protocols]))

        sents = [[token.word for token in tokens1d] for tokens1d in tokens2d]
        # train a skip-gram model to generate word vectors; vectors will have the dimension given by the 'size' parameter
        print("         Loading Word2Vec ...")
        if load_bin:
            print("                     Loading a Massive File ...")
            if not os.path.isfile(cfg.PUBMED_AND_PMC_W2V_BIN):
                url = "http://evexdb.org/pmresources/vec-space-models/PubMed-and-PMC-w2v.bin"
                dirpath = os.path.dirname(cfg.PUBMED_AND_PMC_W2V_BIN)
                os.makedirs(dirpath, exist_ok=True)  # make sure the save directory exists before downloading
                print("Downloading Word2Vec resource ...")
                download(url, save_filepath=cfg.PUBMED_AND_PMC_W2V_BIN)

            skip_gram_model = KeyedVectors.load_word2vec_format(cfg.PUBMED_AND_PMC_W2V_BIN, binary=True)
        else:
            skip_gram_model = Word2Vec(sentences=sents, size=cfg.EMBEDDING_DIM, sg=1, window=10, min_count=1,
                                       workers=4)

        cfg.ver_print("word2vec emb size", skip_gram_model.vector_size)

        sent_iter_flat = list(itertools.chain.from_iterable(sents))

        list_of_chars = list(itertools.chain.from_iterable([list(word) for word in sent_iter_flat]))

        self.word_index = self.gen_word_index(sents, support_start_stop)

        self.char_index = gen_list2id_dict(list_of_chars, insert_words=['<w>', '</w>', '<s>', '</s>'])

        print(self.char_index)

        cfg.CHAR_VOCAB = len(self.char_index)

        with open('test_tokenizer.txt', 'w', encoding='utf-8') as out:
            out.writelines([item + ' ' + str(self.word_index[item]) + '\n'
                            if item in self.word_index
                            else item + ' ' + str(self.word_index[cfg.UNK]) + '\n'
                            for item in sent_iter_flat])

        embedding_matrix = np.random.uniform(low=-0.01, high=0.01, size=(len(self.word_index) + 1, cfg.EMBEDDING_DIM))
        print("         Populating Embedding Matrix ...")
        with open(cfg.OOP_FILEPATH, 'w') as f:
            f.write("Out of pre-trained Vocabulary words\n")

        for word, i in self.word_index.items():
            try:
                embedding_vector = skip_gram_model[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                # not found in pre-trained word embedding list.
                with open(cfg.OOP_FILEPATH, 'a') as f:
                    f.write('{0}\n'.format(word))
                cfg.ver_print('out of pre-trained vocab word', word)

        return embedding_matrix
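
A minimal sketch (not part of the source) of how the matrix returned by prepare_embeddings could seed a PyTorch embedding layer; the stand-in matrix and the decision to freeze the weights are assumptions.

import numpy as np
import torch
import torch.nn as nn

embedding_matrix = np.random.uniform(-0.01, 0.01, size=(100, 50))  # stand-in for the returned matrix
vocab_size, emb_dim = embedding_matrix.shape
emb_layer = nn.Embedding(vocab_size, emb_dim)
emb_layer.weight.data.copy_(torch.from_numpy(embedding_matrix))  # copy_ casts float64 -> float32
emb_layer.weight.requires_grad = False  # optionally freeze the pre-trained vectors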
Example #3
    def __prep_char_idx_seq(self, sent):
        cfg.ver_print("char_index", self.char_index)
        char_idx_seq = [self.__to_idx_seq([cfg.SENT_START], start=cfg.WORD_START, end=cfg.WORD_END,
                                          index=self.char_index)] + \
                       [self.__to_idx_seq(list(word), start=cfg.WORD_START, end=cfg.WORD_END,
                                          index=self.char_index)
                        for word in sent] + \
                       [self.__to_idx_seq([cfg.SENT_END], start=cfg.WORD_START, end=cfg.WORD_END,
                                          index=self.char_index)]

        cfg.ver_print("char idx seq", char_idx_seq)
        return char_idx_seq
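
The nested structure produced above is easier to see with a toy index; a hedged sketch, where the stand-in helper, the marker strings ('<w>', '</w>', '<s>', '</s>'), and the id values are assumptions about what __to_idx_seq and cfg provide.

def to_idx_seq_sketch(items, start, end, index):
    # hypothetical stand-in for __to_idx_seq: map each item through the index
    # and wrap the result with the start/end marker ids
    return [index[start]] + [index[item] for item in items] + [index[end]]

char_index = {'<w>': 0, '</w>': 1, '<s>': 2, '</s>': 3, 'c': 4, 'a': 5, 't': 6}  # toy char index
print(to_idx_seq_sketch(list('cat'), start='<w>', end='</w>', index=char_index))  # [0, 4, 5, 6, 1]
# for sent = ['cat'], __prep_char_idx_seq would then yield roughly:
# [[0, 2, 1], [0, 4, 5, 6, 1], [0, 3, 1]]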
Example #4
    def forward(self, inp_features):
        cfg.ver_print("Inp features", inp_features)
        # inp_features is of size (seq_len x EMB_DIM)

        linear1_out = self.linear1(inp_features)
        tanh_out = self.tanh(linear1_out)
        linear2_out = self.linear2(tanh_out)
        soft_out = self.log_softmax(linear2_out)

        cfg.ver_print("FINAL OUT", soft_out)

        return soft_out
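
The forward pass above implies a simple two-layer feed-forward tagger; a minimal sketch of the module it could belong to, where the class name and dimensions are assumptions and only the layer attribute names come from the example.

import torch.nn as nn

class FeedForwardTagger(nn.Module):
    def __init__(self, emb_dim, hidden_dim, num_tags):
        super().__init__()
        self.linear1 = nn.Linear(emb_dim, hidden_dim)   # (seq_len x EMB_DIM) -> (seq_len x hidden_dim)
        self.tanh = nn.Tanh()
        self.linear2 = nn.Linear(hidden_dim, num_tags)  # project to per-tag scores
        self.log_softmax = nn.LogSoftmax(dim=1)         # log-probabilities over tags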
Example #5
    def forward(self, chars):
        out_stack = []

        for word in chars:

            out = self.emb(Variable(cuda.LongTensor(word)))
            out = unsqueeze(out, dim=0)

            out, hidden_state = self.rnn(out, self.hidden_state)

            # TODO verify that this is indeed the last outputs of both forward rnn and backward rnn
            # and that we are concatenating correctly
            out = cat([hidden_state[0][0], hidden_state[0][1]], dim=1)
            cfg.ver_print("Hidden state concat", out)
            out = self.linear(out)
            out = self.tanh(out)
            out_stack.append(out)

        final_out = stack(out_stack, dim=1)
        return final_out
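
A sketch of the state this forward pass relies on: a character embedding, a single-layer bidirectional LSTM whose final hidden states get concatenated, and a projection layer. The attribute names match the example; the class name, sizes, and the zero initial state are assumptions.

import torch.nn as nn

class CharRNNSketch(nn.Module):
    def __init__(self, char_vocab_size, char_emb_dim, hidden_dim, out_dim):
        super().__init__()
        self.emb = nn.Embedding(char_vocab_size, char_emb_dim)
        self.rnn = nn.LSTM(char_emb_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(2 * hidden_dim, out_dim)  # concatenated fwd+bwd final hidden states
        self.tanh = nn.Tanh()
        self.hidden_state = None  # passing None lets PyTorch use zero initial states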
Example #6
    def forward(self, minibatch):
        out_stack = []
        minibatch = list(minibatch)
        minibatch_lengths = [len(sent) for sent in minibatch]
        batch_of_words = list(chain.from_iterable(minibatch))

        self.init_state(len(batch_of_words))
        # a hack to get the indices of the sorted words, so I can unsort them back after they are processed
        # print(batch_of_words)
        sent, ridx = self.len_sort(batch_of_words)
        padded, seq_lengths = self.pad(sent, 0)
        # print(padded)
        out = self.emb(Variable(cuda.LongTensor(padded)))
        # out is of size (all_words x max_len x char_emb_size)
        # print("out size: {0}".format(out.size()))
        out = rnn.pack_padded_sequence(out, seq_lengths, batch_first=True)
        out, hidden_state = self.rnn(out, self.hidden_state)
        # hidden_state[0] is of size: (num_dir x batch_size x lstm_hidden_dim)
        # print("hidden state size: {0}".format(hidden_state[0].size()))

        # TODO verify
        # unsorting is IMPORTANT, because we initially sorted the char sequences before passing them to the rnn
        hidden_state = torch.index_select(hidden_state[0],
                                          dim=1,
                                          index=Variable(
                                              cuda.LongTensor(ridx)))

        # TODO verify that this is indeed the last outputs of both forward rnn and backward rnn

        out = cat([hidden_state[0], hidden_state[1]], dim=1)
        # print("cat out size: {0}".format(out.size()))
        cfg.ver_print("Hidden state concat", out)
        out = self.linear(out)
        out = self.tanh(out)
        # print("before split and pad function {0}".format(out.size()))
        # this will split 1d tensor of word embeddings, into 2d array of word embeddings based on lengths
        final_out = self.split_and_pad(out, minibatch_lengths)
        # final_out is of size (batch_size x max_seq_len x emb_size)
        # print(final_out.size())
        return final_out
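
The sort/unsort trick that the index_select call depends on can be shown with a standalone sketch; the helper name comes from the example, but its body here is an assumption.

def len_sort(seqs):
    # sort sequences longest-first, as pack_padded_sequence requires, and return
    # the inverse permutation so downstream results can be restored to input order
    order = sorted(range(len(seqs)), key=lambda i: len(seqs[i]), reverse=True)
    ridx = [0] * len(order)
    for new_pos, old_pos in enumerate(order):
        ridx[old_pos] = new_pos
    return [seqs[i] for i in order], ridx

sorted_seqs, ridx = len_sort([[1], [2, 3, 4], [5, 6]])
# sorted_seqs == [[2, 3, 4], [5, 6], [1]]; picking sorted_seqs[ridx[i]] recovers item i
assert [sorted_seqs[ridx[i]] for i in range(3)] == [[1], [2, 3, 4], [5, 6]]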
Example #7
    def __gen_sent_idx_seq(self, sent):
        cfg.ver_print("word_index", self.word_index)
        sent_idx_seq = self.__to_idx_seq(sent, start=cfg.SENT_START, end=cfg.SENT_END,
                                         index=self.word_index, oov=self.is_oov)
        cfg.ver_print("sent", sent)
        cfg.ver_print("sent idx seq", sent_idx_seq)

        return sent_idx_seq
Example #8
    def to_categorical(self, labels, bio=False):

        # converts a list of labels to a binary representation: a label is 1 if it's an action verb, else it's 0
        def split(_label):
            if _label == cfg.NEG_LABEL:
                _bio_encoding = cfg.NEG_LABEL
                _tag_name = "NoTag"
            else:
                _bio_encoding, _tag_name = _label.split("-", 1)

            return _bio_encoding, _tag_name

        ret = []
        cfg.ver_print("labels", labels)
        for label in labels:
            bio_encoding, tag_name = split(label)

            if tag_name == cfg.POSITIVE_LABEL:
                ret.append(self.tag_idx[bio_encoding])
            else:
                ret.append(self.tag_idx[cfg.NEG_LABEL])

        return ret
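
A toy run of the same mapping logic, with assumed config values (cfg.NEG_LABEL == 'O', cfg.POSITIVE_LABEL == 'Action', tag_idx == {'O': 0, 'B': 1, 'I': 2}); only 'Action' tags keep their BIO index, everything else collapses to the negative label.

NEG_LABEL, POSITIVE_LABEL = 'O', 'Action'
tag_idx = {'O': 0, 'B': 1, 'I': 2}

def to_categorical_sketch(labels):
    out = []
    for label in labels:
        bio, tag = (label, 'NoTag') if label == NEG_LABEL else tuple(label.split('-', 1))
        out.append(tag_idx[bio] if tag == POSITIVE_LABEL else tag_idx[NEG_LABEL])
    return out

print(to_categorical_sketch(['B-Action', 'I-Action', 'O', 'B-Reagent']))  # [1, 2, 0, 0]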
Example #9
    def forward(self, x):
        v_l = []
        cfg.ver_print("input to embedding layer", x)
        # x is of size (1 x seq_len)
        seq_len = x.size(1)

        n = x[0].cpu().data.numpy()

        for i in n.tolist():
            v = self.emb_mat[i]
            # v is of size (EMB_DIM)
            # cfg.ver_print("v", v)

            v_l.append(v)
        v = torch.stack(v_l, dim=0)
        # v is of size (seq_len x EMB_DIM)

        v = v.view(1, seq_len, -1)
        # v is now of size(1 x seq_len x EMB_DIM)

        # cfg.ver_print("Embedding out", v)

        return v
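
An equivalent, loop-free lookup using tensor indexing; a sketch only, assuming emb_mat is a FloatTensor of shape (vocab_size x EMB_DIM) like self.emb_mat above.

import torch

def embedding_forward_vectorized(emb_mat, x):
    # x: (1 x seq_len) LongTensor of word ids
    v = emb_mat[x[0]]      # advanced indexing -> (seq_len x EMB_DIM)
    return v.unsqueeze(0)  # (1 x seq_len x EMB_DIM)

emb_mat = torch.randn(10, 4)          # toy embedding matrix
x = torch.LongTensor([[1, 3, 3, 7]])  # toy id sequence
print(embedding_forward_vectorized(emb_mat, x).size())  # torch.Size([1, 4, 4])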
Example #10
    def forward(self, sent_idx_seq, char_idx_seq, pos=None):
        # pos (a batch of POS-tag id sequences) is assumed here; it is only used below when self.pos_feat == "Yes"
        cfg.ver_print("Sent Index sequence", sent_idx_seq)
        padded_seq, seq_lengths = self.pad(sent_idx_seq)
        padded_seq = Variable(torch.LongTensor(padded_seq)).cuda()

        emb = self.emb_lookup(padded_seq)

        if self.char_level == "Input":
            char_emb = self.char_net(char_idx_seq)
            inp = cat([emb, char_emb], dim=2)
        elif self.char_level == "Highway":
            char_emb = self.char_net(char_idx_seq)
            inp = self.highway(emb, char_emb)
        elif self.char_level == "Attention":
            char_emb = self.char_net(char_idx_seq)
            inp = self.att_net(emb, char_emb)
        else:
            inp = emb

        if self.pos_feat == "Yes":
            padded_pos, seq_len_pos = self.pad(pos)
            padded_pos = Variable(torch.LongTensor(padded_pos)).cuda()
            pos_emb = self.pos_emb(padded_pos)
            inp = cat([inp, pos_emb], dim=2)

        # emb is now of size(1 x seq_len x EMB_DIM)
        cfg.ver_print("Embedding for the Sequence", inp)
        lstm_out, hidden_state = self.mb_lstm_forward(inp, seq_lengths)
        # lstm_out is of size (batch_size x seq_len x 2*EMB_DIM)
        unrolled_lstm_out = self.unpad(lstm_out, seq_lengths)
        # unrolled_lstm_out is of size(label_size x 2*EMB_DIM); where label_size is the number of words in the batch.
        lstm_forward = lstm_out[:, :, :cfg.LSTM_HIDDEN_SIZE]
        lstm_backward = lstm_out[:, :, -cfg.LSTM_HIDDEN_SIZE:]
        # lstm_forward of size (batch x max_seq x emb_dim)
        # making sure that you got the correct lstm_forward and lstm_backward.
        for i, seq_len in enumerate(seq_lengths):
            assert to_scalar(
                torch.sum(lstm_forward[i, seq_len - 1, :] -
                          hidden_state[0][0, i, :])) == 0
            assert to_scalar(
                torch.sum(lstm_backward[i, 0, :] -
                          hidden_state[0][1, i, :])) == 0

        lm_f_out = self.lm_forward(
            self.unpad(lstm_forward, seq_lengths, skip_start=0, skip_end=1))
        lm_b_out = self.lm_backward(
            self.unpad(lstm_backward, seq_lengths, skip_start=1, skip_end=0))
        # size of lm_f_out = (batch_size*seq_len x emb_size)
        cfg.ver_print("Language Model Forward pass out", lm_f_out)
        cfg.ver_print("Language Model Backward pass out", lm_b_out)

        lstm_out = self.lstm_linear(unrolled_lstm_out.squeeze())

        lstm_out = torch.sigmoid(lstm_out)

        lstm_out = lstm_out.unsqueeze(dim=0)

        label_out = lstm_out

        linear_out = self.linear(label_out.view(-1, cfg.LSTM_OUT_SIZE))
        if self.isCrossEnt:
            out = linear_out
        else:
            out = self.log_softmax(linear_out)

        cfg.ver_print("LINEAR OUT", linear_out)
        cfg.ver_print("FINAL OUT", out)

        if self.char_level == "Attention":
            unrolled_emb = self.unpad(emb, seq_lengths)
            unrolled_char_emb = self.unpad(char_emb, seq_lengths)
            return lm_f_out, lm_b_out, out, seq_lengths, unrolled_emb, unrolled_char_emb
        else:
            return lm_f_out, lm_b_out, out, seq_lengths