Example #1
def get_elmo(options_file, weight_file, gpu, dropout):
    global elmo
    # Create the ELMo module.  This example computes two output
    # representations, each with its own learned layer-mixing weights.
    # We recommend adding dropout (50% is a good default) either here or
    # wherever ELMo is used (e.g. in the next bi-LSTM layer).
    elmo = Elmo(options_file, weight_file, num_output_representations=2,
                do_layer_norm=False, dropout=dropout)

    if gpu:
        elmo.cuda()
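
A minimal usage sketch for get_elmo (assuming it runs in the same module, with options_file and weight_file pointing at valid ELMo files; the sentences are illustrative):

from allennlp.modules.elmo import batch_to_ids

get_elmo(options_file, weight_file, gpu=False, dropout=0.5)
# batch_to_ids turns tokenized sentences into character-id tensors
character_ids = batch_to_ids([['The', 'cat', 'sat'], ['Hello', 'world']])
out = elmo(character_ids)
rep_a, rep_b = out['elmo_representations']  # one tensor per requested representation
mask = out['mask']                          # (batch, max_len); 1 marks real tokens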
Example #2
class ElmoEmbedding:
    def __init__(self, dim):
        if dim == 2048:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
        elif dim == 512:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        else:
            raise ValueError(f'unsupported ELMo dim: {dim}')
        self.dim = dim
        self.elmo = Elmo(options_file, weight_file, 2, dropout=0)
        if func.gpu_available():
            self.elmo = self.elmo.cuda()
        self.elmo.eval()
        self.load()


    def save(self):
        pass


    def load(self):
        self.cache = DiskDict(f'./generate/elmo.{self.dim}.cache')


    def convert(self, sentences):
        not_hit = set()
        for sent in sentences:
            key = self.make_key(sent)
            if key not in self.cache:
                not_hit.add(key)
        not_hit = list(not_hit)
        if not_hit:
            embeddings, masks = self.convert_impl([self.make_sentence(key) for key in not_hit])
            for key, embedding, mask in zip(not_hit, torch.unbind(embeddings), torch.unbind(masks)):
                embedding = embedding[:mask.sum()]
                self.cache[key] = embedding.tolist()
        embeddings = [func.tensor(self.cache[self.make_key(sent)]) for sent in sentences]
        mlen = max(e.shape[0] for e in embeddings)
        embeddings = [func.pad_zeros(e, mlen, 0) for e in embeddings]
        embeddings = torch.stack(embeddings)
        assert not embeddings.requires_grad
        return embeddings


    def make_key(self, sent):
        return '$$'.join(sent)


    def make_sentence(self, key):
        return key.split('$$')


    def convert_impl(self, sentences):
        # No gradients needed: results are cached as plain lists
        character_ids = func.tensor(batch_to_ids(sentences))
        with torch.no_grad():
            m = self.elmo(character_ids)
        # Concatenate the two output representations along the last dim
        embeddings = torch.cat(m['elmo_representations'], -1)
        return embeddings, m['mask']
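
A short usage sketch for the class above; DiskDict and the func helpers are project-specific, so this assumes they are importable alongside torch and allennlp:

emb = ElmoEmbedding(dim=2048)  # two 1024-dim representations, concatenated
sents = [['The', 'cat', 'sat'], ['Hello', 'world']]
batch = emb.convert(sents)   # (2, max_len, 2048), zero-padded to the longest sentence
batch = emb.convert(sents)   # the second call is served entirely from the disk cache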
Example #3
def load_elmo(opt):
    options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
    weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

    # Request three output representations, one learned layer mix each
    elmo = Elmo(options_file, weight_file, 3, dropout=0,
                requires_grad=False)
    if opt.gpuid != -1:
        elmo = elmo.cuda()
    return elmo
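
With num_output_representations=3 the returned dict carries three separately mixed representations; a hedged unpacking sketch (opt is assumed to expose gpuid as above):

from allennlp.modules.elmo import batch_to_ids

elmo = load_elmo(opt)
out = elmo(batch_to_ids([['A', 'short', 'sentence']]))
layer0, layer1, layer2 = out['elmo_representations']  # each (1, 3, 1024)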
Example #4
def main(args):

    # Set random seeds for reproducibility
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    with open(args.train_path, "rb") as file:
        train_summaries = pickle.load(file, encoding='utf-8')

    # with open(args.valid_path, "rb") as file:
    #     valid_summaries = pickle.load(file, encoding='utf-8')

    for summary in train_summaries:
        convert_document(summary)

    # for summary in valid_summaries:
    #     convert_document(summary)

    elmo_instance = Elmo(options_url, weights_url, 1)
    if use_cuda:
        elmo_instance.cuda()

    begin = timer()
    total_answers = 0
    for summary in train_summaries:
        # Duplicate each token sequence and the whole batch to inflate the
        # benchmark workload
        answers = [[elmo_tokenize(word) for word in answer] * 2
                   for answer in summary.answers]
        answers = pad_elmo(answers)
        answers = answers * 2
        batch = variable(torch.LongTensor(answers))
        a = elmo_instance(batch)
        total_answers += len(answers)

    end = timer()
    print("Total time elapsed: {}".format(end - begin))
    print("Time per thousand answers: {}".format(
        (end - begin) * 1000 / total_answers))
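
The helpers elmo_tokenize and pad_elmo are not shown; a purely hypothetical stand-in built on AllenNLP's character mapper might look like this:

from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper

def elmo_tokenize(word):
    # hypothetical: map one word to its fixed-length ELMo character-id row
    return ELMoCharacterMapper.convert_word_to_char_ids(word)

def pad_elmo(batch):
    # hypothetical: right-pad every sentence with all-zero character rows
    max_len = max(len(sent) for sent in batch)
    pad = [0] * ELMoCharacterMapper.max_word_length
    return [sent + [pad] * (max_len - len(sent)) for sent in batch]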
Example #5
class ELMoEmbeddingInputModule(OnlineInputModule[MCAnnotation]):
    def setup(self):
        print("Setting up Elmo Embedding")
        self.vocab = self.shared_resources
        self.config = self.shared_resources.config
        self.embeddings = self.shared_resources.embeddings
        if self.embeddings is not None:
            self.__default_vec = np.zeros([self.embeddings.shape[-1]])

        self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
        if torch.cuda.is_available():
            self.elmo.cuda()

    def setup_from_data(self, data: Iterable[Tuple[QASetting, List[Answer]]]):
        vocab = self.shared_resources.vocab
        if not vocab.frozen:
            preprocessing.fill_vocab(
                (q for q, _ in data),
                vocab,
                lowercase=self.shared_resources.config.get('lowercase', True))
            vocab.freeze()
        if (not hasattr(self.shared_resources, 'answer_vocab')
                or not self.shared_resources.answer_vocab.frozen):
            self.shared_resources.answer_vocab = util.create_answer_vocab(
                qa_settings=(q for q, _ in data),
                answers=(a for _, ass in data for a in ass))
            self.shared_resources.answer_vocab.freeze()
        self.shared_resources.char_vocab = preprocessing.char_vocab_from_vocab(
            self.shared_resources.vocab)

        # Preprocess dependency info
        # if self.shared_resources.config.get("use_dep_sa", False):
        #     print("Process dependency information...", file=sys.stderr)
        #     nlp = stanfordnlp.Pipeline()
        #     type2id = nlp.processors['depparse'].trainer.vocab['deprel']

        #     for i in tqdm(range(len(data))):
        #         setting, _ = data[i]
        #         question = setting.question
        #         support = setting.support[0]

        #         doc = nlp(question + support)

        #         setting.q_tokenized = [w.text for w in doc.sentences[0].words]
        #         setting.s_tokenized = [w.text for w in doc.sentences[1].words]

        #         setting.q_dep_i = [None] * (len(setting.q_tokenized) - 1)
        #         setting.q_dep_j = [None] * (len(setting.q_tokenized) - 1)
        #         setting.q_dep_type = [None] * (len(setting.q_tokenized) - 1)
        #         for idx, d in enumerate(doc.sentences[0].dependencies):
        #             if d[1] == 'root':
        #                 continue
        #             setting.q_dep_i[idx] = int(d[0].index) - 1
        #             setting.q_dep_j[idx] = int(d[2].index) - 1
        #             setting.q_dep_type[idx] = type2id.unit2id(d[1])

        #         setting.s_dep_i = [None] * (len(setting.s_tokenized) - 1)
        #         setting.s_dep_j = [None] * (len(setting.s_tokenized) - 1)
        #         setting.s_dep_type = [None] * (len(setting.s_tokenized) - 1)
        #         for idx, d in enumerate(doc.sentences[1].dependencies):
        #             if d[1] == 'root':
        #                 continue
        #             setting.s_dep_i[idx] = int(d[0].index) - 1
        #             setting.s_dep_j[idx] = int(d[2].index) - 1
        #             setting.s_dep_type[idx] = type2id.unit2id(d[1])

        #     if torch.cuda.is_available():
        #         torch.cuda.empty_cache()

    @property
    def training_ports(self) -> List[TensorPort]:
        return [Ports.Target.target_index]

    @property
    def output_ports(self) -> List[TensorPort]:
        if self.shared_resources.embeddings is not None:
            if self.shared_resources.config.get("use_dep_sa", False):
                return [
                    Ports.Input.emb_support, Ports.Input.support_length,
                    Ports.Input.support_dep_i, Ports.Input.support_dep_j,
                    Ports.Input.support_dep_type, Ports.Input.emb_question,
                    Ports.Input.question_length, Ports.Input.question_dep_i,
                    Ports.Input.question_dep_j, Ports.Input.question_dep_type,
                    Ports.is_eval
                ]
            else:
                return [
                    Ports.Input.emb_support, Ports.Input.emb_question,
                    Ports.Input.support, Ports.Input.question,
                    Ports.Input.support_length, Ports.Input.question_length,
                    Ports.Input.sample_id, Ports.Input.word_chars,
                    Ports.Input.word_char_length,
                    Ports.Input.question_batch_words,
                    Ports.Input.support_batch_words, Ports.is_eval
                ]
        else:
            return [
                Ports.Input.support, Ports.Input.question,
                Ports.Input.support_length, Ports.Input.question_length,
                Ports.Input.sample_id, Ports.Input.word_chars,
                Ports.Input.word_char_length, Ports.Input.question_batch_words,
                Ports.Input.support_batch_words, Ports.is_eval
            ]

    def preprocess(self,
                   questions: List[QASetting],
                   answers: Optional[List[List[Answer]]] = None,
                   is_eval: bool = False) -> List[MCAnnotation]:
        if answers is None:
            answers = [None] * len(questions)
        preprocessed = []
        if len(questions) > 1000:
            bar = progressbar.ProgressBar(max_value=len(questions),
                                          widgets=[
                                              ' [',
                                              progressbar.Timer(), '] ',
                                              progressbar.Bar(), ' (',
                                              progressbar.ETA(), ') '
                                          ])
            for i, (q, a) in bar(enumerate(zip(questions, answers))):
                preprocessed.append(self.preprocess_instance(i, q, a))
        else:
            for i, (q, a) in enumerate(zip(questions, answers)):
                preprocessed.append(self.preprocess_instance(i, q, a))

        return preprocessed

    def preprocess_instance(
            self,
            idd: int,
            question: QASetting,
            answers: Optional[List[Answer]] = None) -> MCAnnotation:
        has_answers = answers is not None

        if self.shared_resources.config.get("use_dep_sa", False):
            anno = MCAnnotation(
                question_tokens=question.q_tokenized,
                question_ids=None,
                question_length=len(question.q_tokenized),
                support_tokens=question.s_tokenized,
                support_ids=None,
                support_length=len(question.s_tokenized),
                answer=self.shared_resources.answer_vocab(answers[0].text)
                if has_answers else 0,
                id=idd,
                question_dep_i=question.q_dep_i,
                question_dep_j=question.q_dep_j,
                question_dep_type=question.q_dep_type,
                support_dep_i=question.s_dep_i,
                support_dep_j=question.s_dep_j,
                support_dep_type=question.s_dep_type,
            )
            return anno
        else:
            q_tokenized, q_ids, q_length, _, _ = preprocessing.nlp_preprocess(
                question.question,
                self.shared_resources.vocab,
                lowercase=self.shared_resources.config.get('lowercase', True))
            s_tokenized, s_ids, s_length, _, _ = preprocessing.nlp_preprocess(
                question.support[0],
                self.shared_resources.vocab,
                lowercase=self.shared_resources.config.get('lowercase', True))

            return MCAnnotation(
                question_tokens=q_tokenized,
                question_ids=q_ids,
                question_length=q_length,
                support_tokens=s_tokenized,
                support_ids=s_ids,
                support_length=s_length,
                answer=self.shared_resources.answer_vocab(answers[0].text)
                if has_answers else 0,
                id=idd,
                question_dep_i=None,
                question_dep_j=None,
                question_dep_type=None,
                support_dep_i=None,
                support_dep_j=None,
                support_dep_type=None,
            )

    def create_batch(self, annotations: List[MCAnnotation], is_eval: bool,
                     with_answers: bool) -> Mapping[TensorPort, np.ndarray]:
        word_chars, word_lengths, tokens, vocab, rev_vocab = \
            preprocessing.unique_words_with_chars(
                [a.question_tokens for a in annotations] + [a.support_tokens for a in annotations],
                self.shared_resources.char_vocab)
        question_words = tokens[:len(annotations)]
        support_words = tokens[len(annotations):]

        q_lengths = [a.question_length for a in annotations]
        s_lengths = [a.support_length for a in annotations]
        if self.shared_resources.config.get('use_dep_sa', False):
            xy_dict = {
                Ports.Input.support_length:
                s_lengths,
                Ports.Input.support_dep_i:
                [a.support_dep_i for a in annotations],
                Ports.Input.support_dep_j:
                [a.support_dep_j for a in annotations],
                Ports.Input.support_dep_type:
                [a.support_dep_type for a in annotations],
                Ports.Input.question_length:
                q_lengths,
                Ports.Input.question_dep_i:
                [a.question_dep_i for a in annotations],
                Ports.Input.question_dep_j:
                [a.question_dep_j for a in annotations],
                Ports.Input.question_dep_type:
                [a.question_dep_type for a in annotations],
                Ports.is_eval:
                is_eval
            }
        else:
            xy_dict = {
                Ports.Input.question_length: q_lengths,
                Ports.Input.support_length: s_lengths,
                Ports.Input.sample_id: [a.id for a in annotations],
                Ports.Input.word_chars: word_chars,
                Ports.Input.word_char_length: word_lengths,
                Ports.Input.question_batch_words: question_words,
                Ports.Input.support_batch_words: support_words,
                Ports.is_eval: is_eval,
                Ports.Input.support: [a.support_ids for a in annotations],
                Ports.Input.question: [a.question_ids for a in annotations]
            }

        if with_answers:
            xy_dict[Ports.Target.target_index] = [
                a.answer for a in annotations
            ]
        xy_dict = numpify(xy_dict)

        # Elmo embeddings
        tokens_support = [a.support_tokens for a in annotations]
        tokens_question = [a.question_tokens for a in annotations]

        # debug
        tokens_support_len = 0
        tokens_question_len = 0
        tokens_support_maxlen = 0
        tokens_question_maxlen = 0
        for a in annotations:
            tokens_support_len += len(a.support_tokens)
            tokens_question_len += len(a.question_tokens)
            tokens_support_maxlen = max(tokens_support_maxlen,
                                        len(a.support_tokens))
            tokens_question_maxlen = max(tokens_question_maxlen,
                                         len(a.question_tokens))
        # print('Q len:', tokens_question_len, 'maxlen:', tokens_question_maxlen,
        #         '  S len:', tokens_support_len, 'maxlen:', tokens_support_maxlen,
        #         file=sys.stderr)

        chars_support = batch_to_ids(tokens_support)
        chars_question = batch_to_ids(tokens_question)

        if torch.cuda.is_available():
            chars_support = chars_support.cuda()
            chars_question = chars_question.cuda()

        with torch.no_grad():
            emb_support = self.elmo(
                chars_support)['elmo_representations'][0].detach()
            emb_question = self.elmo(
                chars_question)['elmo_representations'][0].detach()

        xy_dict[Ports.Input.emb_support] = emb_support
        xy_dict[Ports.Input.emb_question] = emb_question

        return xy_dict
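
The no_grad/detach pattern in create_batch can be reproduced standalone; a minimal sketch, where elmo is any Elmo instance such as the one built in setup:

import torch
from allennlp.modules.elmo import batch_to_ids

def embed_batch(elmo, token_batches):
    # Precompute frozen ELMo embeddings: no autograd graph, no gradient flow
    char_ids = batch_to_ids(token_batches)
    if torch.cuda.is_available():
        char_ids = char_ids.cuda()
    with torch.no_grad():
        return elmo(char_ids)['elmo_representations'][0].detach()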
Example #6
class BiLSTM(nn.Module):
    def __init__(self,
                 emb_dim,
                 h_dim,
                 n_labels,
                 v_size,
                 gpu=True,
                 v_vec=None,
                 batch_first=True,
                 emb_type=None,
                 elmo_model_dir=None):
        super(BiLSTM, self).__init__()
        self.gpu = gpu
        self.h_dim = h_dim
        if self.h_dim is None:
            self.h_dim = emb_dim + 36
        if emb_type == 'ELMo':
            options_file = f'{elmo_model_dir}/options.json'
            weight_file = f'{elmo_model_dir}/weights.hdf5'
            self.word_embed = Elmo(options_file,
                                   weight_file,
                                   num_output_representations=1,
                                   dropout=0)
            if gpu:
                self.word_embed = self.word_embed.cuda()
        elif emb_type == 'ELMoForManyLangs':
            from elmoformanylangs import Embedder
            e = Embedder(elmo_model_dir)
            self.word_embed = e.sents2elmo
        elif emb_type == 'None':
            self.word_embed = None
        else:
            self.word_embed = nn.Embedding(v_size, emb_dim, padding_idx=0)
        if v_vec is not None:
            # Pretrained vectors only apply to the plain nn.Embedding case
            v_vec = torch.tensor(v_vec)
            self.word_embed.weight.data.copy_(v_vec)

        feature_embed_layers = []
        feature_embed_size = {
            "feature:0": 25,
            "feature:1": 26,
            "feature:2": 12,
            "feature:3": 6,
            "feature:4": 94,
            "feature:5": 32
        }
        for key in feature_embed_size:
            size = feature_embed_size[key]
            feature_embed = nn.Embedding(size, 5, padding_idx=0)
            feature_embed.weight.data[0] = torch.zeros(5)
            feature_embed_layers.append(feature_embed)
        self.feature_embed_layers = nn.ModuleList(feature_embed_layers)
        self.drop_target = nn.Dropout(p=0.2)
        self.lstm = nn.LSTM(input_size=emb_dim + 36,
                            hidden_size=self.h_dim,
                            batch_first=batch_first,
                            bidirectional=True)
        self.l1 = nn.Linear(self.h_dim * 2, n_labels)

    def init_hidden(self, b_size):
        # 2 = num_directions of the bidirectional LSTM
        h0 = torch.zeros(2, b_size, self.h_dim)
        c0 = torch.zeros(2, b_size, self.h_dim)
        if self.gpu:
            h0 = h0.cuda()
            c0 = c0.cuda()
        return (h0, c0)

    def forward(self, x):
        self.hidden = self.init_hidden(x[2].size(0))
        if self.word_embed:
            word_emb = self.word_embed(x[0])
            if self.word_embed.__class__.__name__ == 'Embedding':
                pass
            elif self.word_embed.__class__.__name__ == 'Elmo':
                exophoras = [['これ'], ['あなた'], ['私']]
                exophora_ids = batch_to_ids(exophoras)
                if self.gpu:
                    exophora_ids = exophora_ids.cuda()
                exophora_emb = self.word_embed(exophora_ids)
                word_emb = word_emb['elmo_representations'][0]
                exophora_emb = exophora_emb['elmo_representations'][0]
                exophora_emb = exophora_emb.reshape(3, -1)
                exophora_emb = exophora_emb.repeat([word_emb.shape[0], 1, 1])
                none_emb = torch.zeros(word_emb.shape[0], 1, word_emb.shape[2])
                if self.gpu:
                    none_emb = none_emb.cuda()
                word_emb = torch.cat((none_emb, exophora_emb, word_emb), 1)
            elif self.word_embed.__func__.__name__ == 'sents2elmo':
                word_emb = [torch.tensor(emb) for emb in word_emb]
                word_emb = nn.utils.rnn.pad_sequence(word_emb,
                                                     batch_first=True,
                                                     padding_value=0)
                exophoras = [['これ'], ['あなた'], ['私']]
                exophora_emb = self.word_embed(exophoras)
                exophora_emb = torch.tensor(exophora_emb).reshape(3, -1)
                exophora_emb = exophora_emb.repeat([word_emb.shape[0], 1, 1])
                none_emb = torch.zeros(word_emb.shape[0], 1, word_emb.shape[2])
                if self.gpu:
                    word_emb = word_emb.cuda()
                    exophora_emb = exophora_emb.cuda()
                    none_emb = none_emb.cuda()
                word_emb = torch.cat((none_emb, exophora_emb, word_emb), dim=1)
        feature_emb_list = []
        for i, _x in enumerate(x[1]):
            feature_emb = self.feature_embed_layers[i](_x)
            feature_emb_list.append(feature_emb)
        x_feature = x[2].float()
        if self.word_embed:
            x = torch.cat((word_emb, *feature_emb_list, x_feature), dim=2)
        else:
            x = torch.cat((*feature_emb_list, x_feature), dim=2)
        x = self.drop_target(x)
        out, hidden = self.lstm(x, self.hidden)
        # out = out[:, :, :self.h_dim] + out[:, :, self.h_dim:]

        out = self.l1(out)
        return out
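
The exophora handling in forward prepends four rows (a zero "none" row plus the three exophora embeddings) to every sentence; a shape-only sketch with illustrative sizes:

import torch

batch, seq_len, dim = 2, 5, 1024  # illustrative sizes
word_emb = torch.randn(batch, seq_len, dim)
exophora_emb = torch.randn(3, dim).repeat(batch, 1, 1)  # (batch, 3, dim)
none_emb = torch.zeros(batch, 1, dim)
out = torch.cat((none_emb, exophora_emb, word_emb), dim=1)
assert out.shape == (batch, seq_len + 4, dim)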
Example #7
class WordRep(nn.Module):
    def __init__(self, data):
        super(WordRep, self).__init__()
        print("build word representation...")
        self.gpu = data.HP_gpu
        self.use_char = data.use_char
        self.batch_size = data.HP_batch_size
        self.char_hidden_dim = 0
        self.char_all_feature = False
        self.sentence_classification = data.sentence_classification
        self.use_features = data.use_features
        if self.use_char:
            self.char_hidden_dim = data.HP_char_hidden_dim
            self.char_embedding_dim = data.char_emb_dim
            if data.char_feature_extractor == "CNN":
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "LSTM":
                self.char_feature = CharBiLSTM(data.char_alphabet.size(),
                                               data.pretrain_char_embedding,
                                               self.char_embedding_dim,
                                               self.char_hidden_dim,
                                               data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "GRU":
                self.char_feature = CharBiGRU(data.char_alphabet.size(),
                                              data.pretrain_char_embedding,
                                              self.char_embedding_dim,
                                              self.char_hidden_dim,
                                              data.HP_dropout, self.gpu)
            elif data.char_feature_extractor == "ALL":
                self.char_all_feature = True
                self.char_feature = CharCNN(data.char_alphabet.size(),
                                            data.pretrain_char_embedding,
                                            self.char_embedding_dim,
                                            self.char_hidden_dim,
                                            data.HP_dropout, self.gpu)
                self.char_feature_extra = CharBiLSTM(
                    data.char_alphabet.size(), data.pretrain_char_embedding,
                    self.char_embedding_dim, self.char_hidden_dim,
                    data.HP_dropout, self.gpu)
            else:
                print(
                    "Error: unknown char feature extractor; "
                    "data.char_feature_extractor must be one of CNN/LSTM/GRU/ALL."
                )
                exit(1)

        self.embedding_dim = data.word_emb_dim
        self.drop = nn.Dropout(data.HP_dropout)
        self.use_elmo = data.use_elmo
        self.fine_tune_emb = data.fine_tune_emb

        if not self.use_elmo:

            self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                               self.embedding_dim)
            self.word_embedding.weight.requires_grad = self.fine_tune_emb
            if data.pretrain_word_embedding is not None:
                self.word_embedding.weight.data.copy_(
                    torch.from_numpy(data.pretrain_word_embedding))
            else:
                self.word_embedding.weight.data.copy_(
                    torch.from_numpy(
                        self.random_embedding(data.word_alphabet.size(),
                                              self.embedding_dim)))

        else:
            options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
            weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
            # Compute one output representation per token: a learned linear
            # combination of the 3 ELMo layers (the char CNN and the two
            # biLSTM outputs).

            if self.fine_tune_emb:
                #self.elmo = Elmo(options_file, weight_file, 1, dropout=0, scalar_mix_parameters=[1.,1.,1.])#, requires_grad=self.fine_tune_emb)
                self.elmo = Elmo(options_file,
                                 weight_file,
                                 1,
                                 dropout=0,
                                 requires_grad=True)
            else:
                self.elmo = Elmo(options_file,
                                 weight_file,
                                 1,
                                 dropout=0,
                                 scalar_mix_parameters=[0., 0., 0.])

            if self.gpu:
                self.elmo = self.elmo.cuda()

        if data.use_features:
            self.feature_num = data.feature_num
            self.feature_embedding_dims = data.feature_emb_dims
            self.feature_embeddings = nn.ModuleList()
            for idx in range(self.feature_num):
                self.feature_embeddings.append(
                    nn.Embedding(data.feature_alphabets[idx].size(),
                                 self.feature_embedding_dims[idx]))
            for idx in range(self.feature_num):
                if data.pretrain_feature_embeddings[idx] is not None:
                    self.feature_embeddings[idx].weight.data.copy_(
                        torch.from_numpy(
                            data.pretrain_feature_embeddings[idx]))
                else:
                    self.feature_embeddings[idx].weight.data.copy_(
                        torch.from_numpy(
                            self.random_embedding(
                                data.feature_alphabets[idx].size(),
                                self.feature_embedding_dims[idx])))

        if self.gpu:
            self.drop = self.drop.cuda()

            if not self.use_elmo:
                self.word_embedding = self.word_embedding.cuda()
            if data.use_features:
                for idx in range(self.feature_num):
                    self.feature_embeddings[idx] = self.feature_embeddings[
                        idx].cuda()

    def random_embedding(self, vocab_size, embedding_dim):
        pretrain_emb = np.empty([vocab_size, embedding_dim])
        scale = np.sqrt(3.0 / embedding_dim)
        for index in range(vocab_size):
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedding_dim])
        return pretrain_emb

    def forward(self, word_inputs, feature_inputs, word_seq_lengths,
                char_inputs, char_seq_lengths, char_seq_recover,
                word_text_input):
        """
            input:
                word_inputs: (batch_size, sent_len)
                features: list [(batch_size, sent_len), (batch_len, sent_len),...]
                word_seq_lengths: list of batch_size, (batch_size,1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
                char_seq_recover: variable which records the char order information, used to recover char order
            output:
                Variable(batch_size, sent_len, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)

        if self.use_elmo:

            character_ids = batch_to_ids(word_text_input)
            if self.gpu:
                character_ids = character_ids.cuda()

            elmo_output = self.elmo(character_ids)["elmo_representations"][0]

            if not self.fine_tune_emb:
                elmo_output = elmo_output.detach()

            word_list = [elmo_output]

        else:

            word_embs = self.word_embedding(word_inputs)
            word_list = [word_embs]

        if not self.sentence_classification and self.use_features:
            for idx in range(self.feature_num):
                word_list.append(self.feature_embeddings[idx](
                    feature_inputs[idx]))
        if self.use_char:
            ## calculate char lstm last hidden
            # print("charinput:", char_inputs)
            # exit(0)
            char_features = self.char_feature.get_last_hiddens(
                char_inputs,
                char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
            ## char features are concatenated with the word reps via word_list below
            word_list.append(char_features)
            if self.char_all_feature:
                char_features_extra = self.char_feature_extra.get_last_hiddens(
                    char_inputs,
                    char_seq_lengths.cpu().numpy())
                char_features_extra = char_features_extra[char_seq_recover]
                char_features_extra = char_features_extra.view(
                    batch_size, sent_len, -1)
                ## concat word and char together
                word_list.append(char_features_extra)

        word_embs = torch.cat(word_list, 2)

        word_represent = self.drop(word_embs)
        return word_represent
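
In the frozen branch above, scalar_mix_parameters=[0., 0., 0.] pins the layer-mixing logits; because the mix applies a softmax over them, equal logits weight the three ELMo layers uniformly. A quick check:

import torch

w = torch.softmax(torch.tensor([0., 0., 0.]), dim=0)
print(w)  # tensor([0.3333, 0.3333, 0.3333]) - a fixed uniform average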
Example #8
class PairClassifier(nn.Module):
    def __init__(self,
                 input_size,
                 num_epochs=10,
                 dropout_p=0.1,
                 loss_func='crossentropy'):
        super(PairClassifier, self).__init__()
        print('elmo files:', options_file, weight_file)
        self.elmo = Elmo(options_file, weight_file, 1, dropout=0)
        if use_cuda:
            self.elmo = self.elmo.cuda()

        self.input_size = input_size
        self.epochs = num_epochs
        self.loss_func = loss_func

        self.dropout = nn.Dropout(dropout_p)
        self.fc1 = nn.Linear(
            self.input_size * 2, 2
        )  # Use this layer when training with only the CNN model, i.e. no ensemble
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        # print('x:', str(x))
        batch_size = len(x)
        character_ids = batch_to_ids(x)
        if use_cuda:
            character_ids = character_ids.cuda()
        embeddings = self.elmo(character_ids)['elmo_representations']
        #print('elmo embeddings:', embeddings[0].size())
        X = embeddings[0].view(batch_size, -1, 1024)  # (N, W, D)

        # TODO: embed entity and time phrase

        X = self.dropout(X)  # (N, len(Ks)*Co)

        logit = self.fc1(X)  # (N, C)
        return logit

    ''' Create and train the model.
        Hybrid features supported - pass structured feats as X2.
        Does NOT support joint training yet.
        Returns: the trained model.
    '''

    def fit(self, ann_maps, timex_maps, id_maps):
        # Train and return the model
        st = time.time()

        # TODO: create the pairs

        # Params
        batch_size = 16
        learning_rate = 0.001

        if use_cuda:
            self = self.cuda()

        # NOTE: X and Y are expected from the pair creation above (still a TODO)
        Yarray = Y.astype('int')
        X_len = len(X)
        print('X len:', X_len)
        print('Y numpy shape:', str(Yarray.shape))
        steps = 0
        st = time.time()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

        if self.loss_func == 'crossentropy':
            loss = nn.CrossEntropyLoss()
        else:
            raise ValueError('unrecognized loss function name: ' + self.loss_func)

        for epoch in range(self.epochs):
            print('epoch', str(epoch))
            i = 0
            numpy.random.seed(seed=1)
            perm = torch.from_numpy(numpy.random.permutation(X_len))
            permutation = perm.long()
            perm_list = perm.tolist()
            Xiter = [X[i] for i in perm_list]
            #Xiter = X[permutation]
            Yiter = Yarray[permutation]

            while i + batch_size < X_len:
                batchX = Xiter[i:i + batch_size]
                batchY = Yiter[i:i + batch_size]
                #Xtensor = torch.from_numpy(batchX).float()
                Ytensor = torch.from_numpy(batchY).long()
                if use_cuda:
                    #Xtensor = Xtensor.cuda()
                    Ytensor = Ytensor.to(tdevice)

                optimizer.zero_grad()
                logit = self(batchX)

                loss_val = loss(logit, Ytensor)
                #print('loss: ', loss_val.data.item())
                loss_val.backward()
                optimizer.step()
                steps += 1
                i = i + batch_size

            # Print epoch time
            ct = time.time() - st
            unit = "s"
            if ct > 60:
                ct = ct / 60
                unit = "m"
            print("time so far: ", str(ct), unit)
            print('loss: ', loss_val.data.item())

    def predict(self, test_anns, test_times, testids=None):
        y_pred = []

        # TODO: create pairs

        for x in range(len(testX)):
            input_row = testX[x]

            icd = None
            if icd is None:
                icd_var = self([input_row])
                # Softmax and log softmax values
                icd_vec = self.logsoftmax(icd_var).squeeze()
                #print('pred vector:', icd_vec.size(), icd_vec)
                #print('argmax:', torch.argmax(icd_vec))
                #icd_vec_softmax = softmax(icd_var)
                cat = torch.argmax(icd_vec).item()
                if x == 0:
                    print('cat:', cat)
                #icd_code = cat

            y_pred.append(cat)
        #print "Probabilities: " + str(probs)

        return y_pred  # raw argmax predictions; no probability threshold applied
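
The forward pass above still leaves the entity/time pairing as a TODO; one hypothetical way to build the 2 * input_size vector that fc1 expects is to mean-pool each phrase embedding and concatenate the pair:

import torch

def pool_pair(emb_a, emb_b):
    # hypothetical pairing: mean-pool two (W, D) phrases into one (2*D,) vector
    return torch.cat((emb_a.mean(dim=0), emb_b.mean(dim=0)))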