def test_average_embedding_works(self):
    tempdir = tempfile.mkdtemp()
    sentences_path = os.path.join(tempdir, "sentences.txt")
    output_path = os.path.join(tempdir, "output.txt")

    sentence = "Michael went to the store to buy some eggs ."
    with open(sentences_path, 'w') as f:
        f.write(sentence)

    sys.argv = ["run.py",  # executable
                "elmo",  # command
                sentences_path,
                output_path,
                "--average",
                "--options-file",
                self.options_file,
                "--weight-file",
                self.weight_file]

    main()

    assert os.path.exists(output_path)

    embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
    expected_embedding = embedder.embed_sentence(sentence.split())
    expected_embedding = (expected_embedding[0] + expected_embedding[1] + expected_embedding[2]) / 3

    with h5py.File(output_path, 'r') as h5py_file:
        assert list(h5py_file.keys()) == [sentence]
        # The vectors in the test configuration are smaller (length 32)
        embedding = h5py_file.get(sentence)
        assert embedding.shape == (len(sentence.split()), 32)
        numpy.testing.assert_allclose(embedding, expected_embedding, rtol=1e-4)
def get_sentence_embeddings(self, sentences, name="test"): # Layer 0 are token representations which are not sensitive to context # Layer 1 are representations from the first bilstm # Layer 2 are the representations from the second bilstm # Load any preexisting embeddings for name. Careful, make sure that name is unique! embedding_file = self.embedding_dir + name + "sentence_embeddings.pickle" sentence_embeddings = self.load_embeddings(embedding_file) if not (len(sentence_embeddings) == len(sentences)): if self.embedder is None: self.embedder = ElmoEmbedder() sentence_embeddings = self.embedder.embed_batch(sentences) if not len(sentence_embeddings) == len(sentences): logging.info("Something went wrong with the embedding. Number of embeddings: " + str( len(sentence_embeddings)) + " Number of sentences: " + str(len(sentences))) self.save_embeddings(embedding_file, sentence_embeddings) single_layer_embeddings = [embedding[self.layer_id] for embedding in sentence_embeddings[:]] if self.only_forward: forward_embeddings = [] for sentence_embedding in single_layer_embeddings: forward_embeddings.append([token_embedding[0:512] for token_embedding in sentence_embedding]) return forward_embeddings else: return single_layer_embeddings
def test_embeddings_are_as_expected(self):
    loaded_sentences, loaded_embeddings = self._load_sentences_embeddings()
    assert len(loaded_sentences) == len(loaded_embeddings)
    batch_size = len(loaded_sentences)

    # The sentences and embeddings are organized in the idiosyncratic way
    # TensorFlow handles batching. We are going to reorganize them linearly
    # so they can be grouped into batches by AllenNLP.
    sentences = []
    expected_embeddings = []
    for batch_number in range(len(loaded_sentences[0])):
        for index in range(batch_size):
            sentences.append(loaded_sentences[index][batch_number].split())
            expected_embeddings.append(loaded_embeddings[index][batch_number])
    assert len(expected_embeddings) == len(sentences)

    embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
    embeddings = list(embedder.embed_sentences(sentences, batch_size))
    assert len(embeddings) == len(sentences)

    for tensor, expected in zip(embeddings, expected_embeddings):
        numpy.testing.assert_array_almost_equal(tensor[2], expected)
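The nested loop above interleaves the batch-major fixture layout back into sentence order. A minimal standalone illustration of that reindexing, using hypothetical 2x3 toy data rather than the real test fixtures:

loaded = [["a1", "a2", "a3"],
          ["b1", "b2", "b3"]]  # loaded[index][batch_number]
linear = [loaded[index][batch_number]
          for batch_number in range(len(loaded[0]))
          for index in range(len(loaded))]
assert linear == ["a1", "b1", "a2", "b2", "a3", "b3"]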
class ElmoEncoder(TextEncoder):

    def __init__(self, embedding_dir):
        super(ElmoEncoder, self).__init__(embedding_dir)
        self.layer_id = 1
        self.only_forward = False
        self.embedder = None

    # Word embeddings are just sentence embeddings with sentences consisting
    # of a single word. This is most likely too naive.
    def get_word_embeddings(self, words, name="test"):
        return self.get_sentence_embeddings(words, name=name)

    # Takes a list of sentences and returns a list of embeddings
    def get_sentence_embeddings(self, sentences, name="test"):
        # Layer 0 are token representations which are not sensitive to context
        # Layer 1 are representations from the first bilstm
        # Layer 2 are the representations from the second bilstm

        # Load any preexisting embeddings for name. Careful, make sure that name is unique!
        embedding_file = self.embedding_dir + name + "sentence_embeddings.pickle"
        sentence_embeddings = self.load_embeddings(embedding_file)
        if not len(sentence_embeddings) == len(sentences):
            if self.embedder is None:
                self.embedder = ElmoEmbedder()
            sentence_embeddings = self.embedder.embed_batch(sentences)
            if not len(sentence_embeddings) == len(sentences):
                logging.info("Something went wrong with the embedding. Number of embeddings: "
                             + str(len(sentence_embeddings))
                             + " Number of sentences: " + str(len(sentences)))
            self.save_embeddings(embedding_file, sentence_embeddings)

        single_layer_embeddings = [embedding[self.layer_id] for embedding in sentence_embeddings]
        if self.only_forward:
            forward_embeddings = []
            for sentence_embedding in single_layer_embeddings:
                forward_embeddings.append([token_embedding[0:512] for token_embedding in sentence_embedding])
            return forward_embeddings
        else:
            return single_layer_embeddings
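A minimal usage sketch for the encoder above. The embedding_dir value, the name, and the demo sentence are hypothetical, and load_embeddings/save_embeddings are assumed to come from the TextEncoder base class:

encoder = ElmoEncoder(embedding_dir="embeddings/")
encoder.layer_id = 2         # switch from the first to the second biLSTM layer
encoder.only_forward = True  # keep only the forward half (first 512 dims) of each token vector
sentence_vectors = encoder.get_sentence_embeddings(
    [["Michael", "went", "to", "the", "store", "."]], name="demo")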
def test_embed_batch_contains_empty_sentence(self):
    embedder = ElmoEmbedder(options_file=self.options_file, weight_file=self.weight_file)
    embeddings = list(embedder.embed_sentences(["This is a test".split(), []]))
    assert len(embeddings) == 2
parser = argparse.ArgumentParser(description='sow training')
parser.add_argument('--input_file', default='../sample_test_sow_reap.txt', help='input file')
parser.add_argument('--elmo_data_dir', help='path to elmo weights and options file')
parser.add_argument('--output_folder', default='sow_intermediate', help='output folder')
args = parser.parse_args()

options_file = os.path.join(args.elmo_data_dir, 'options.json')
weight_file = os.path.join(args.elmo_data_dir, 'weights.hdf5')
elmo = ElmoEmbedder(options_file, weight_file)

batch_size = 200

input_file = open(args.input_file)
input_file.readline()  # read two extraneous lines of the file
input_file.readline()

sentences = []
while True:
    line = input_file.readline()
    if line == "":  # reached end of file
        break
    else:
        sentence = get_phrase_list.get_next_sentence(input_file)
        sentences.append(sentence.sent.split(' '))

embeddings = []
for i in range(int(len(sentences) / batch_size) + 1):
    # The original snippet is truncated here; a plausible batch-embedding body,
    # inferred from the surrounding setup (an assumption, not the author's code):
    embeddings += elmo.embed_batch(sentences[i * batch_size:(i + 1) * batch_size])
class ElmoEncoder(object):

    def __init__(self):
        self.elmo = ElmoEmbedder()

    # return: numpy array
    def encode_batch(self, sents):
        vec_seq = self.elmo.embed_sentences(sents)
        vecs = []
        for vec in vec_seq:
            vecs.append(self.collapse_vec(vec))
        # vecs = torch.stack(vecs)
        vecs = np.stack(vecs)
        return vecs

    def collapse_vec(self, vec_seq, time_combine_method="max", layer_combine_method="add"):
        # first collapse the time axis, then the layer axis
        if time_combine_method == "max":
            vec = vec_seq.max(axis=1)
        elif time_combine_method == "mean":
            vec = vec_seq.mean(axis=1)
        elif time_combine_method == "concat":
            vec = np.concatenate(vec_seq, axis=1)
        elif time_combine_method == "last":
            vec = vec_seq[:, -1]
        else:
            raise NotImplementedError

        if layer_combine_method == "add":
            vec = vec.sum(axis=0)
        elif layer_combine_method == "mean":
            vec = vec.mean(axis=0)
        elif layer_combine_method == "concat":
            vec = np.concatenate(vec, axis=0)
        elif layer_combine_method == "last":
            vec = vec[-1]
        else:
            raise NotImplementedError

        return vec

    def encode(self, sents, time_combine_method="max", layer_combine_method="add"):
        """Load ELMo and encode sents."""
        vecs = {}
        for sent in sents:
            vec_seq = self.elmo.embed_sentence(sent)
            vecs[' '.join(sent)] = self.collapse_vec(
                vec_seq, time_combine_method, layer_combine_method)
        return vecs
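A minimal usage sketch (the sentences are hypothetical; they must be pre-tokenized lists of strings, as ElmoEmbedder expects). With the defaults, each (3, time, 1024) ELMo array is max-pooled over time and summed over layers, giving one 1024-dim vector per sentence:

encoder = ElmoEncoder()
sents = [["I", "ate", "an", "apple"], ["It", "was", "good"]]
batch_vecs = encoder.encode_batch(sents)  # numpy array of shape (2, 1024)
sent_to_vec = encoder.encode(sents)       # dict keyed by the joined sentence text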
""" requirements pip install numpy pip install sklearn pip install allennlp pip install allennlp-models """ #import scipy import logging from sklearn.metrics.pairwise import cosine_similarity import numpy as np from allennlp.commands.elmo import ElmoEmbedder logging.info("Loading of ELMo...") elmo = ElmoEmbedder() # by default # or you can use another model #options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json' #weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5' #elmo = ElmoEmbedder(options_file=options_file, weight_file=weight_file) #tokens = ["I", "ate", "an", "apple", "for", "breakfast"] #vectors = elmo.embed_sentence(tokens) #scipy.spatial.distance.cosine(vecs1[2][0], vecs2[2][0]) def similarity_matrix(corpus): """ corpus = [["First", "name"], ["Second", "name"], ["Given", "name"], ["Last", "name"]] or corpus = [["First name"], ["Second name"], ["Given name"], ["Last name"]] """ layer = 2 # the output layer
class SeqVecEmbedder(EmbedderWithFallback):
    """SeqVec Embedder

    Heinzinger, Michael, et al. "Modeling aspects of the language of life
    through transfer-learning protein sequences." BMC Bioinformatics 20.1
    (2019): 723. https://doi.org/10.1186/s12859-019-3220-8
    """

    name = "seqvec"
    embedding_dimension = 1024
    number_of_layers = 3

    _weights_file: str
    _options_file: str
    _model: ElmoEmbedder
    # The fallback model running on the CPU, which will be initialized if needed
    _model_fallback: Optional[ElmoEmbedder] = None
    _necessary_files = ["weights_file", "options_file"]

    def __init__(self, warmup_rounds: int = 4, **kwargs):
        """Initialize ELMo embedder. Can define non-positional arguments for
        paths of files and other settings.

        :param warmup_rounds: A sample sequence will be embedded this often to
            work around ELMo's non-determinism
            (https://github.com/allenai/allennlp/blob/v0.9.0/tutorials/how_to/elmo.md#notes-on-statefulness-and-non-determinism)
        :param weights_file: path of weights file
        :param options_file: path of options file
        :param model_directory: Alternative to weights_file/options_file
        :param max_amino_acids: max number of amino acids to include in
            embed_many batches. Default: 15k AA
        """
        super().__init__(**kwargs)

        # Get file locations from kwargs
        if "model_directory" in self._options:
            self._weights_file = str(
                Path(self._options["model_directory"]).joinpath("weights_file"))
            self._options_file = str(
                Path(self._options["model_directory"]).joinpath("options_file"))
        else:
            self._weights_file = self._options["weights_file"]
            self._options_file = self._options["options_file"]

        if self._device.type == "cuda":
            logger.info("CUDA available, using the GPU")
            cuda_device = self._device.index or 0
        else:
            logger.info("CUDA NOT available, using the CPU. This is slow")
            cuda_device = -1

        self._model = ElmoEmbedder(
            weight_file=self._weights_file,
            options_file=self._options_file,
            cuda_device=cuda_device,
        )

        self.warmup_rounds = warmup_rounds
        if self.warmup_rounds > 0:
            logger.info("Running ELMo warmup")
            for _ in range(self.warmup_rounds):
                self.embed(_warmup_seq)

    def embed(self, sequence: str) -> ndarray:
        return self._model.embed_sentence(list(sequence))

    def _get_fallback_model(self) -> ElmoEmbedder:
        if not self._model_fallback:
            logger.warning(
                "Loading model for CPU into RAM. Embedding on the CPU is very slow and you should avoid it."
            )
            self._model_fallback = ElmoEmbedder(
                weight_file=self._weights_file,
                options_file=self._options_file,
                cuda_device=-1,
            )
            if self.warmup_rounds > 0:
                logger.info("Running CPU ELMo warmup")
                for _ in range(self.warmup_rounds):
                    self._model_fallback.embed_sentence(list(_warmup_seq))
        return self._model_fallback

    def _embed_batch_impl(
            self, batch: List[str], model: ElmoEmbedder) -> Generator[ndarray, None, None]:
        # ELMo expects a List[str], as it was meant for tokens/words with more
        # than one character.
        yield from model.embed_batch([list(seq) for seq in batch])

    @staticmethod
    def reduce_per_protein(embedding):
        return embedding.sum(0).mean(0)
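A minimal usage sketch for the embedder above. The file paths and the sample sequence are hypothetical, and it assumes the EmbedderWithFallback base class accepts weights_file/options_file keyword arguments, as the docstring suggests:

embedder = SeqVecEmbedder(weights_file="weights.hdf5", options_file="options.json")
per_residue = embedder.embed("SEQWENCE")                      # ndarray of shape (3, 8, 1024)
per_protein = SeqVecEmbedder.reduce_per_protein(per_residue)  # ndarray of shape (1024,)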
parser.add_argument('--subsetfile', default='labels/train_split_Depression_AVEC2017.csv', type=str)
parser.add_argument('--transcriptdir', type=str, default='labels_processed')
parser.add_argument('-ip', type=str, default=None)
parser.add_argument('-o', '--output', type=str, default='train_elmo.ark', help='feature output')
parser.add_argument('-w', type=int, default=4, help="Worker count")
parser.add_argument('--filterlen', default=0, type=int)
parser.add_argument('--filterby', type=str, default='Participant')
args = parser.parse_args()

elmo = ElmoEmbedder()

# Extracting features for the Participant IDs
subset_df = pd.read_csv(args.subsetfile)
speakers = subset_df['Participant_ID'].values
with open(args.output, 'wb') as fd:
    for speaker in tqdm(speakers):
        # Process the transcript first to get start/end times
        transcript_file = glob(
            os.path.join(args.transcriptdir, str(speaker)) + '*TRANSCRIPT.csv')[0]
        transcript_df = pd.read_csv(transcript_file, sep='\t')
        transcript_df.value = transcript_df.value.str.strip()
        transcript_df.dropna(inplace=True)
        transcript_df = transcript_df[
class Model(BaseModel):
    def __init__(self, vocab, config):
        word2id = vocab.word2idx
        super(Model, self).__init__()
        vocab_num = len(word2id)
        self.word2id = word2id
        self.config = config
        self.char_dict = preprocess.get_char_dict('data/char_vocab.english.txt')
        self.genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
        self.device = torch.device("cuda:" + config.cuda)

        self.emb = nn.Embedding(vocab_num, 350)
        emb1 = EmbedLoader().load_with_vocab(config.glove, vocab, normalize=False)
        emb2 = EmbedLoader().load_with_vocab(config.turian, vocab, normalize=False)
        pre_emb = np.concatenate((emb1, emb2), axis=1)
        pre_emb /= (np.linalg.norm(pre_emb, axis=1, keepdims=True) + 1e-12)

        if pre_emb is not None:
            self.emb.weight = nn.Parameter(torch.from_numpy(pre_emb).float())
            for param in self.emb.parameters():
                param.requires_grad = False
        self.emb_dropout = nn.Dropout(inplace=True)

        if config.use_elmo:
            self.elmo = ElmoEmbedder(options_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_options.json',
                                     weight_file='data/elmo/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5',
                                     cuda_device=int(config.cuda))
            print("ELMo loaded.")
            self.elmo_args = torch.randn((3), requires_grad=True).to(self.device)

        self.char_emb = nn.Embedding(len(self.char_dict), config.char_emb_size)
        self.conv1 = nn.Conv1d(config.char_emb_size, 50, 3)
        self.conv2 = nn.Conv1d(config.char_emb_size, 50, 4)
        self.conv3 = nn.Conv1d(config.char_emb_size, 50, 5)

        self.feature_emb = nn.Embedding(config.span_width, config.feature_size)
        self.feature_emb_dropout = nn.Dropout(p=0.2, inplace=True)

        self.mention_distance_emb = nn.Embedding(10, config.feature_size)
        self.distance_drop = nn.Dropout(p=0.2, inplace=True)

        self.genre_emb = nn.Embedding(7, config.feature_size)
        self.speaker_emb = nn.Embedding(2, config.feature_size)

        self.bilstm = VarLSTM(input_size=350 + 150 * config.use_CNN + config.use_elmo * 1024,
                              hidden_size=200,
                              bidirectional=True,
                              batch_first=True,
                              hidden_dropout=0.2)
        # self.bilstm = nn.LSTM(input_size=500, hidden_size=200, bidirectional=True, batch_first=True)
        self.h0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
        self.c0 = nn.init.orthogonal_(torch.empty(2, 1, 200)).to(self.device)
        self.bilstm_drop = nn.Dropout(p=0.2, inplace=True)

        self.atten = ffnn(input_size=400, hidden_size=config.atten_hidden_size, output_size=1)
        self.mention_score = ffnn(input_size=1320, hidden_size=config.mention_hidden_size, output_size=1)
        self.sa = ffnn(input_size=3980 + 40 * config.use_metadata, hidden_size=config.sa_hidden_size,
                       output_size=1)
        self.mention_start_np = None
        self.mention_end_np = None

    def _reorder_lstm(self, word_emb, seq_lens):
        sort_ind = sorted(range(len(seq_lens)), key=lambda i: seq_lens[i], reverse=True)
        seq_lens_re = [seq_lens[i] for i in sort_ind]
        emb_seq = self.reorder_sequence(word_emb, sort_ind, batch_first=True)
        packed_seq = nn.utils.rnn.pack_padded_sequence(emb_seq, seq_lens_re, batch_first=True)

        h0 = self.h0.repeat(1, len(seq_lens), 1)
        c0 = self.c0.repeat(1, len(seq_lens), 1)
        packed_out, final_states = self.bilstm(packed_seq, (h0, c0))

        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        back_map = {ind: i for i, ind in enumerate(sort_ind)}
        reorder_ind = [back_map[i] for i in range(len(seq_lens_re))]
        lstm_out = self.reorder_sequence(lstm_out, reorder_ind, batch_first=True)
        return lstm_out

    def reorder_sequence(self, sequence_emb, order, batch_first=True):
        """
        sequence_emb: [T, B, D] if not batch_first
        order: list of batch indices specifying the new order
        """
        batch_dim = 0 if batch_first else 1
        assert len(order) == sequence_emb.size()[batch_dim]
        order = torch.LongTensor(order)
        order = order.to(sequence_emb).long()
        sorted_ = sequence_emb.index_select(index=order, dim=batch_dim)
        del order
        return sorted_

    def flat_lstm(self, lstm_out, seq_lens):
        batch = lstm_out.shape[0]
        seq = lstm_out.shape[1]
        dim = lstm_out.shape[2]
        l = [j + i * seq for i, seq_len in enumerate(seq_lens) for j in range(seq_len)]
        flatted = torch.index_select(lstm_out.view(batch * seq, dim), 0,
                                     torch.LongTensor(l).to(self.device))
        return flatted

    def potential_mention_index(self, word_index, max_sent_len):
        # get mention indices; [3, 2] means the first sentence has length 3 and the second 2
        # [0,0,0,1,1] --> [[0, 0], [0, 1], [1, 1], [1, 2], [2, 2], [3, 3], [3, 4], [4, 4]] (max = 2)
        potential_mention = []
        for i in range(len(word_index)):
            for j in range(i, i + max_sent_len):
                if j < len(word_index) and word_index[i] == word_index[j]:
                    potential_mention.append([i, j])
        return potential_mention

    def get_mention_start_end(self, seq_lens):
        # convert sequence lengths to word-to-sentence indices
        # [3,2] --> [0,0,0,1,1]
        word_index = [0] * sum(seq_lens)
        sent_index = 0
        index = 0
        for length in seq_lens:
            for l in range(length):
                word_index[index] = sent_index
                index += 1
            sent_index += 1

        # [0,0,0,1,1] --> [[0,0],[0,1],[0,2]....]
        mention_id = self.potential_mention_index(word_index, self.config.span_width)
        mention_start = np.array(mention_id, dtype=int)[:, 0]
        mention_end = np.array(mention_id, dtype=int)[:, 1]
        return mention_start, mention_end

    def get_mention_emb(self, flatten_lstm, mention_start, mention_end):
        mention_start_tensor = torch.from_numpy(mention_start).to(self.device)
        mention_end_tensor = torch.from_numpy(mention_end).to(self.device)
        emb_start = flatten_lstm.index_select(dim=0, index=mention_start_tensor)  # [mention_num, embed]
        emb_end = flatten_lstm.index_select(dim=0, index=mention_end_tensor)  # [mention_num, embed]
        return emb_start, emb_end

    def get_mask(self, mention_start, mention_end):
        # big mask for attention
        mention_num = mention_start.shape[0]
        mask = np.zeros((mention_num, self.config.span_width))  # [mention_num, span_width]
        for i in range(mention_num):
            start = mention_start[i]
            end = mention_end[i]
            # end - start + 1 is actually the span width
            for j in range(end - start + 1):
                mask[i][j] = 1
        mask = torch.from_numpy(mask)  # [mention_num, max_mention]
        # 0 --> -inf, 1 --> 0
        log_mask = torch.log(mask)
        return log_mask

    def get_mention_index(self, mention_start, max_mention):
        # TODO: may need changes later
        assert len(mention_start.shape) == 1
        mention_start_tensor = torch.from_numpy(mention_start)
        num_mention = mention_start_tensor.shape[0]
        mention_index = mention_start_tensor.expand(max_mention, num_mention).transpose(0, 1)  # [num_mention, max_mention]
        assert mention_index.shape[0] == num_mention
        assert mention_index.shape[1] == max_mention
        range_add = torch.arange(0, max_mention).expand(num_mention, max_mention).long()  # [num_mention, max_mention]
        mention_index = mention_index + range_add
        mention_index = torch.min(mention_index,
                                  torch.LongTensor([mention_start[-1]]).expand(num_mention, max_mention))
        return mention_index.to(self.device)

    def sort_mention(self, mention_start, mention_end, candidate_mention_emb, candidate_mention_score, seq_lens):
        # sort by score, highest-scoring mentions first
        mention_score, mention_ids = torch.sort(candidate_mention_score, descending=True)
        preserve_mention_num = int(self.config.mention_ratio * sum(seq_lens))
        mention_ids = mention_ids[0:preserve_mention_num]
        mention_score = mention_score[0:preserve_mention_num]

        mention_start_tensor = torch.from_numpy(mention_start).to(self.device).index_select(
            dim=0, index=mention_ids)  # [lamda * word_num]
        mention_end_tensor = torch.from_numpy(mention_end).to(self.device).index_select(
            dim=0, index=mention_ids)  # [lamda * word_num]
        mention_emb = candidate_mention_emb.index_select(index=mention_ids, dim=0)  # [lamda * word_num, emb]
        assert mention_score.shape[0] == preserve_mention_num
        assert mention_start_tensor.shape[0] == preserve_mention_num
        assert mention_end_tensor.shape[0] == preserve_mention_num
        assert mention_emb.shape[0] == preserve_mention_num

        # TODO: overlapping mentions are not handled
        # re-sort by start position so earlier mentions come first
        # TODO: only start is considered here, not end
        mention_start_tensor, temp_index = torch.sort(mention_start_tensor)
        mention_end_tensor = mention_end_tensor.index_select(dim=0, index=temp_index)
        mention_emb = mention_emb.index_select(dim=0, index=temp_index)
        mention_score = mention_score.index_select(dim=0, index=temp_index)

        return mention_start_tensor, mention_end_tensor, mention_score, mention_emb

    def get_antecedents(self, mention_starts, max_antecedents):
        num_mention = mention_starts.shape[0]
        max_antecedents = min(max_antecedents, num_mention)
        # mapping between a mention and the indices of its candidate antecedents
        antecedents = np.zeros((num_mention, max_antecedents), dtype=int)  # [num_mention, max_an]
        # record how many antecedents each mention actually has
        antecedents_len = [0] * num_mention
        for i in range(num_mention):
            ante_count = 0
            for j in range(max(0, i - max_antecedents), i):
                antecedents[i, ante_count] = j
                ante_count += 1
            # pad the remaining slots
            for j in range(ante_count, max_antecedents):
                antecedents[i, j] = 0
            antecedents_len[i] = ante_count
        assert antecedents.shape[1] == max_antecedents
        return antecedents, antecedents_len

    def get_antecedents_score(self, span_represent, mention_score, antecedents, antecedents_len,
                              mention_speakers_ids, genre):
        num_mention = mention_score.shape[0]
        max_antecedent = antecedents.shape[1]

        pair_emb = self.get_pair_emb(span_represent, antecedents, mention_speakers_ids, genre)  # [span_num, max_ant, emb]
        antecedent_scores = self.sa(pair_emb)
        mask01 = self.sequence_mask(antecedents_len, max_antecedent)
        maskinf = torch.log(mask01).to(self.device)
        assert maskinf.shape[1] <= max_antecedent
        assert antecedent_scores.shape[0] == num_mention
        antecedent_scores = antecedent_scores + maskinf
        antecedents = torch.from_numpy(antecedents).to(self.device)
        mention_scoreij = mention_score.unsqueeze(1) + torch.gather(
            mention_score.unsqueeze(0).expand(num_mention, num_mention), dim=1, index=antecedents)
        antecedent_scores += mention_scoreij

        antecedent_scores = torch.cat([torch.zeros([mention_score.shape[0], 1]).to(self.device),
                                       antecedent_scores], 1)  # [num_mentions, max_ant + 1]
        return antecedent_scores

    ##############################
    def distance_bin(self, mention_distance):
        bins = torch.zeros(mention_distance.size()).byte().to(self.device)
        rg = [[1, 1], [2, 2], [3, 3], [4, 4], [5, 7], [8, 15], [16, 31], [32, 63], [64, 300]]
        for t, k in enumerate(rg):
            i, j = k[0], k[1]
            b = torch.LongTensor([i]).unsqueeze(-1).expand(mention_distance.size()).to(self.device)
            m1 = torch.ge(mention_distance, b)
            e = torch.LongTensor([j]).unsqueeze(-1).expand(mention_distance.size()).to(self.device)
            m2 = torch.le(mention_distance, e)
            bins = bins + (t + 1) * (m1 & m2)
        return bins.long()

    def get_distance_emb(self, antecedents_tensor):
        num_mention = antecedents_tensor.shape[0]
        max_ant = antecedents_tensor.shape[1]
        assert max_ant <= self.config.max_antecedents
        source = torch.arange(0, num_mention).expand(max_ant, num_mention).transpose(0, 1).to(self.device)  # [num_mention, max_ant]
        mention_distance = source - antecedents_tensor
        mention_distance_bin = self.distance_bin(mention_distance)
        distance_emb = self.mention_distance_emb(mention_distance_bin)
        distance_emb = self.distance_drop(distance_emb)
        return distance_emb

    def get_pair_emb(self, span_emb, antecedents, mention_speakers_ids, genre):
        emb_dim = span_emb.shape[1]
        num_span = span_emb.shape[0]
        max_ant = antecedents.shape[1]
        assert span_emb.shape[0] == antecedents.shape[0]
        antecedents = torch.from_numpy(antecedents).to(self.device)

        # [num_span, max_ant, emb]
        antecedent_emb = torch.gather(span_emb.unsqueeze(0).expand(num_span, num_span, emb_dim), dim=1,
                                      index=antecedents.unsqueeze(2).expand(num_span, max_ant, emb_dim))
        # [num_span, max_ant, emb]
        target_emb_tiled = span_emb.expand((max_ant, num_span, emb_dim))
        target_emb_tiled = target_emb_tiled.transpose(0, 1)

        similarity_emb = antecedent_emb * target_emb_tiled

        pair_emb_list = [target_emb_tiled, antecedent_emb, similarity_emb]

        # get speakers and genre
        if self.config.use_metadata:
            antecedent_speaker_ids = mention_speakers_ids.unsqueeze(0).expand(num_span, num_span).gather(
                dim=1, index=antecedents)
            same_speaker = torch.eq(mention_speakers_ids.unsqueeze(1).expand(num_span, max_ant),
                                    antecedent_speaker_ids)  # [num_mention, max_ant]
            speaker_embedding = self.speaker_emb(same_speaker.long().to(self.device))  # [mention_num, max_ant, emb]
            genre_embedding = self.genre_emb(
                torch.LongTensor([genre]).expand(num_span, max_ant).to(self.device))  # [mention_num, max_ant, emb]
            pair_emb_list.append(speaker_embedding)
            pair_emb_list.append(genre_embedding)

        # get distance emb
        if self.config.use_distance:
            distance_emb = self.get_distance_emb(antecedents)
            pair_emb_list.append(distance_emb)

        pair_emb = torch.cat(pair_emb_list, 2)
        return pair_emb

    def sequence_mask(self, len_list, max_len):
        x = np.zeros((len(len_list), max_len))
        for i in range(len(len_list)):
            l = len_list[i]
            for j in range(l):
                x[i][j] = 1
        return torch.from_numpy(x).float()

    def logsumexp(self, value, dim=None, keepdim=False):
        """Numerically stable implementation of the operation

        value.exp().sum(dim, keepdim).log()
        """
        # TODO: torch.max(value, dim=None) threw an error at time of writing
        if dim is not None:
            m, _ = torch.max(value, dim=dim, keepdim=True)
            value0 = value - m
            if keepdim is False:
                m = m.squeeze(dim)
            return m + torch.log(torch.sum(torch.exp(value0), dim=dim, keepdim=keepdim))
        else:
            m = torch.max(value)
            sum_exp = torch.sum(torch.exp(value - m))
            return m + torch.log(sum_exp)

    def softmax_loss(self, antecedent_scores, antecedent_labels):
        antecedent_labels = torch.from_numpy(antecedent_labels * 1).to(self.device)
        gold_scores = antecedent_scores + torch.log(antecedent_labels.float())  # [num_mentions, max_ant + 1]
        marginalized_gold_scores = self.logsumexp(gold_scores, 1)  # [num_mentions]
        log_norm = self.logsumexp(antecedent_scores, 1)  # [num_mentions]
        return torch.sum(log_norm - marginalized_gold_scores)  # [num_mentions] reduce_logsumexp

    def get_predicted_antecedents(self, antecedents, antecedent_scores):
        predicted_antecedents = []
        for i, index in enumerate(np.argmax(antecedent_scores.detach(), axis=1) - 1):
            if index < 0:
                predicted_antecedents.append(-1)
            else:
                predicted_antecedents.append(antecedents[i, index])
        return predicted_antecedents

    def get_predicted_clusters(self, mention_starts, mention_ends, predicted_antecedents):
        mention_to_predicted = {}
        predicted_clusters = []
        for i, predicted_index in enumerate(predicted_antecedents):
            if predicted_index < 0:
                continue
            assert i > predicted_index
            predicted_antecedent = (int(mention_starts[predicted_index]), int(mention_ends[predicted_index]))
            if predicted_antecedent in mention_to_predicted:
                predicted_cluster = mention_to_predicted[predicted_antecedent]
            else:
                predicted_cluster = len(predicted_clusters)
                predicted_clusters.append([predicted_antecedent])
                mention_to_predicted[predicted_antecedent] = predicted_cluster

            mention = (int(mention_starts[i]), int(mention_ends[i]))
            predicted_clusters[predicted_cluster].append(mention)
            mention_to_predicted[mention] = predicted_cluster

        predicted_clusters = [tuple(pc) for pc in predicted_clusters]
        mention_to_predicted = {m: predicted_clusters[i] for m, i in mention_to_predicted.items()}

        return predicted_clusters, mention_to_predicted

    def evaluate_coref(self, mention_starts, mention_ends, predicted_antecedents, gold_clusters, evaluator):
        gold_clusters = [tuple(tuple(m) for m in gc) for gc in gold_clusters]
        mention_to_gold = {}
        for gc in gold_clusters:
            for mention in gc:
                mention_to_gold[mention] = gc
        predicted_clusters, mention_to_predicted = self.get_predicted_clusters(mention_starts, mention_ends,
                                                                               predicted_antecedents)
        evaluator.update(predicted_clusters, gold_clusters, mention_to_predicted, mention_to_gold)
        return predicted_clusters

    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
        """
        All inputs are actually tensors.
        :param sentences: sentences, converted to numpy by fastNLP
        :param doc_np: converted to a Tensor by fastNLP
        :param speaker_ids_np: converted to a Tensor by fastNLP
        :param genre: converted to a Tensor by fastNLP
        :param char_index: converted to a Tensor by fastNLP
        :param seq_len: converted to a Tensor by fastNLP
        :return:
        """
        # change for fastNLP
        sentences = sentences[0].tolist()
        doc_tensor = doc_np[0]
        speakers_tensor = speaker_ids_np[0]
        genre = genre[0].item()
        char_index = char_index[0]
        seq_len = seq_len[0].cpu().numpy()

        # types
        # doc_tensor = torch.from_numpy(doc_np).to(self.device)
        # speakers_tensor = torch.from_numpy(speaker_ids_np).to(self.device)
        mention_emb_list = []

        word_emb = self.emb(doc_tensor)
        word_emb_list = [word_emb]
        if self.config.use_CNN:
            # [batch, length, char_length, char_dim]
            char = self.char_emb(char_index)
            char_size = char.size()
            # first transform to [batch * length, char_length, char_dim]
            # then transpose to [batch * length, char_dim, char_length]
            char = char.view(char_size[0] * char_size[1], char_size[2], char_size[3]).transpose(1, 2)

            # put into cnn [batch * length, char_filters, char_length]
            # then put into maxpooling [batch * length, char_filters]
            char_over_cnn, _ = self.conv1(char).max(dim=2)
            # reshape to [batch, length, char_filters]
            char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
            word_emb_list.append(char_over_cnn)

            char_over_cnn, _ = self.conv2(char).max(dim=2)
            char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
            word_emb_list.append(char_over_cnn)

            char_over_cnn, _ = self.conv3(char).max(dim=2)
            char_over_cnn = torch.tanh(char_over_cnn).view(char_size[0], char_size[1], -1)
            word_emb_list.append(char_over_cnn)

        # word_emb = torch.cat(word_emb_list, dim=2)

        # use elmo or not
        if self.config.use_elmo:
            # if the document was actually truncated
            if doc_tensor.shape[0] == 50 and len(sentences) > 50:
                sentences = sentences[0:50]
            elmo_embedding, elmo_mask = self.elmo.batch_to_embeddings(sentences)
            elmo_embedding = elmo_embedding.to(self.device)  # [sentence_num, 3, max_sent_len, 1024] --> [sentence_num, max_sent, 1024]
            elmo_embedding = elmo_embedding[:, 0, :, :] * self.elmo_args[0] + \
                             elmo_embedding[:, 1, :, :] * self.elmo_args[1] + \
                             elmo_embedding[:, 2, :, :] * self.elmo_args[2]
            word_emb_list.append(elmo_embedding)
        # print(word_emb_list[0].shape)
        # print(word_emb_list[1].shape)
        # print(word_emb_list[2].shape)
        # print(word_emb_list[3].shape)
        # print(word_emb_list[4].shape)

        word_emb = torch.cat(word_emb_list, dim=2)
        word_emb = self.emb_dropout(word_emb)
        # word_emb_elmo = self.emb_dropout(word_emb_elmo)
        lstm_out = self._reorder_lstm(word_emb, seq_len)
        flatten_lstm = self.flat_lstm(lstm_out, seq_len)  # [word_num, emb]
        flatten_lstm = self.bilstm_drop(flatten_lstm)
        # TODO: not implemented as in the paper
        flatten_word_emb = self.flat_lstm(word_emb, seq_len)  # [word_num, emb]

        mention_start, mention_end = self.get_mention_start_end(seq_len)  # [mention_num]
        self.mention_start_np = mention_start  # [mention_num] np
        self.mention_end_np = mention_end
        mention_num = mention_start.shape[0]
        emb_start, emb_end = self.get_mention_emb(flatten_lstm, mention_start, mention_end)  # [mention_num, emb]

        # list
        mention_emb_list.append(emb_start)
        mention_emb_list.append(emb_end)

        if self.config.use_width:
            mention_width_index = mention_end - mention_start
            mention_width_tensor = torch.from_numpy(mention_width_index).to(self.device)  # [mention_num]
            mention_width_emb = self.feature_emb(mention_width_tensor)
            mention_width_emb = self.feature_emb_dropout(mention_width_emb)
            mention_emb_list.append(mention_width_emb)

        if self.config.model_heads:
            mention_index = self.get_mention_index(mention_start, self.config.span_width)  # [mention_num, max_mention]
            log_mask_tensor = self.get_mask(mention_start, mention_end).float().to(self.device)  # [mention_num, max_mention]
            alpha = self.atten(flatten_lstm).to(self.device)  # [word_num]

            # compute attention over the tokens in each span
            mention_head_score = torch.gather(alpha.expand(mention_num, -1), 1,
                                              mention_index).float().to(self.device)  # [mention_num, max_mention]
            mention_attention = F.softmax(mention_head_score + log_mask_tensor, dim=1)  # [mention_num, max_mention]

            # TODO: flatten lstm
            word_num = flatten_lstm.shape[0]
            lstm_emb = flatten_lstm.shape[1]
            emb_num = flatten_word_emb.shape[1]

            # [num_mentions, max_mention_width, emb]
            mention_text_emb = torch.gather(
                flatten_word_emb.unsqueeze(1).expand(word_num, self.config.span_width, emb_num),
                0, mention_index.unsqueeze(2).expand(mention_num, self.config.span_width, emb_num))
            # [mention_num, emb]
            mention_head_emb = torch.sum(
                mention_attention.unsqueeze(2).expand(mention_num, self.config.span_width, emb_num)
                * mention_text_emb, dim=1)
            mention_emb_list.append(mention_head_emb)

        candidate_mention_emb = torch.cat(mention_emb_list, 1)  # [candidate_mention_num, emb]
        candidate_mention_score = self.mention_score(candidate_mention_emb)  # [candidate_mention_num]

        antecedent_scores, antecedents, mention_start_tensor, mention_end_tensor = (None, None, None, None)
        mention_start_tensor, mention_end_tensor, mention_score, mention_emb = \
            self.sort_mention(mention_start, mention_end, candidate_mention_emb, candidate_mention_score, seq_len)
        mention_speakers_ids = speakers_tensor.index_select(dim=0, index=mention_start_tensor)  # num_mention

        antecedents, antecedents_len = self.get_antecedents(mention_start_tensor, self.config.max_antecedents)
        antecedent_scores = self.get_antecedents_score(mention_emb, mention_score, antecedents, antecedents_len,
                                                       mention_speakers_ids, genre)

        ans = {"candidate_mention_score": candidate_mention_score,
               "antecedent_scores": antecedent_scores,
               "antecedents": antecedents,
               "mention_start_tensor": mention_start_tensor,
               "mention_end_tensor": mention_end_tensor}
        return ans

    def predict(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
        ans = self(sentences, doc_np, speaker_ids_np, genre, char_index, seq_len)
        predicted_antecedents = self.get_predicted_antecedents(ans["antecedents"], ans["antecedent_scores"])
        predicted_clusters, mention_to_predicted = self.get_predicted_clusters(ans["mention_start_tensor"],
                                                                               ans["mention_end_tensor"],
                                                                               predicted_antecedents)
        return {'predicted': predicted_clusters, "mention_to_predicted": mention_to_predicted}
args, extra_args = argparser.parse_known_args()
config = Configurable(args.config_file, extra_args)
torch.set_num_threads(args.thread)

vocab = creatVocab(config.train_file, config.min_occur_count)
pickle.dump(vocab, open(config.save_vocab_path, 'wb'))

config.use_cuda = False
gpu_id = -1
if gpu and args.gpu >= 0:
    torch.cuda.set_device(args.gpu)
    config.use_cuda = True
    print("GPU ID: ", args.gpu)
    gpu_id = args.gpu

elmo = ElmoEmbedder(config.elmo_option_file, config.elmo_weight_file, gpu_id)
elmo_layers = elmo.elmo_bilm.num_layers
elmo_dims = elmo.elmo_bilm.get_output_dim()
model = BiLSTMModel(vocab, config, (elmo_layers, elmo_dims))

if config.use_cuda:
    # torch.backends.cudnn.enabled = True
    model = model.cuda()

classifier = SentenceClassifier(model, elmo, vocab)

data = read_corpus(config.train_file)
dev_data = read_corpus(config.dev_file)
test_data = read_corpus(config.test_file)
class ElmoExtractor(DirectSentenceExtractor):

    _options_file = 'elmo_2x2048_256_2048cnn_1xhighway_options.json'
    _weight_file = 'elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5'
    _method = 'elmo'

    def __init__(self, layer=None):
        super(ElmoExtractor, self).__init__()
        self._options_file = os.path.join(self._embedding_model_path, self._method, self._options_file)
        self._weight_file = os.path.join(self._embedding_model_path, self._method, self._weight_file)
        # first check whether the weight/option files exist
        if not os.path.exists(self._options_file):
            raise ValueError('Elmo model file(s) are missing. Please download '
                             'the model files via the following command: '
                             'python download.py(\'elmo\')')
        self._elmoObj = ElmoEmbedder(options_file=self._options_file,
                                     weight_file=self._weight_file)
        if layer is None:
            self._layer = 'default'
        else:
            self._layer = layer

    def getVector(self, stim, cbow=False):
        if not isinstance(stim, list):
            stim = [stim]
        if cbow:
            stim_cbows = self.generateCbows(stim)
            stim = stim_cbows
        embeddings = []
        for s in stim:
            tokens = s.split()
            embedding_layers = self._elmoObj.embed_sentence(tokens)
            if self._layer == 'default':
                # we take the average of the three ELMo layers
                embedding_avg = np.average(embedding_layers, axis=0)
                sentence_op = np.average(embedding_avg, axis=0)
            elif self._layer == 'top':
                embedding_top = embedding_layers[-1]
                sentence_op = np.average(embedding_top, axis=0)
            elif self._layer == 'bottom':
                # note: index 1 is the first biLSTM layer; index 0 would be the
                # context-insensitive token layer
                embedding_bottom = embedding_layers[1]
                sentence_op = np.average(embedding_bottom, axis=0)
            embeddings.append(sentence_op)
        num_dims = embeddings[0].shape[0]
        features = ['%s%d' % (self.prefix, i) for i in range(num_dims)]
        return ExtractorResult(embeddings, stim, self, features=features)
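A minimal usage sketch (the stimulus string is hypothetical; it assumes the DirectSentenceExtractor base class provides _embedding_model_path and prefix):

extractor = ElmoExtractor(layer='top')  # or 'default' / 'bottom'
result = extractor.getVector(["The cat sat on the mat"])  # one pooled vector per stimulus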
class DataGenerator():
    def __init__(self, configs):
        self.configs = configs
        self.elmo = ElmoEmbedder(options_file=self.configs['elmo_option_file'],
                                 weight_file=self.configs['elmo_weight_file'],
                                 cuda_device=0)

        self.train_c_r, self.train_label = self.load_train_data()
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              ' : Finish Loading Training Data...')
        self.dev_c_r, self.dev_label = self.load_dev_data()
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
              ' : Finish Loading Dev Data...')

        self.train_data_size = len(self.train_label)
        print('Train set size: ', self.train_data_size)
        self.dev_data_size = len(self.dev_label)
        print('Dev set size: ', self.dev_data_size)

    def train_data_generator(self, batch_num):
        train_size = self.train_data_size
        start = batch_num * self.configs['batch_size'] % train_size
        end = (batch_num * self.configs['batch_size'] + self.configs['batch_size']) % train_size

        # shuffle data at the beginning of every epoch
        if batch_num == 0:
            self.train_c_r, self.train_label, _ = self.unison_shuffled_copies(
                self.train_c_r, self.train_label)
            print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
                  ' : Finish Shuffling Data...')

        if start < end:
            batches_label = self.train_label[start:end]
            batches_c_r = self.train_c_r[start:end]
        else:
            batches_label = self.train_label[train_size - self.configs['batch_size']:train_size]
            batches_c_r = self.train_c_r[train_size - self.configs['batch_size']:train_size]

        turns, turn_num, turn_len, response, response_len, label = self.batch2placeholder(
            batches_c_r, batches_label)

        return turns, turn_num, turn_len, response, response_len, label

    def dev_data_generator(self, batch_num):
        """
        This function returns training/validation/test data for the classifier.
        batch_num * batch_size is the start point of the batch.
        :param batch_size: int. the size of each batch
        :return: [[[float32, ], ], ]. [[[word embedding] element, ] batch, ]
        """
        dev_size = self.dev_data_size
        start = batch_num * self.configs['batch_size'] % dev_size
        end = (batch_num * self.configs['batch_size'] + self.configs['batch_size']) % dev_size

        if start < end:
            batches_label = self.dev_label[start:end]
            batches_c_r = self.dev_c_r[start:end]
        else:
            batches_label = self.dev_label[start:]
            batches_c_r = self.dev_c_r[start:]

        turns, turn_num, turn_len, response, response_len, label = self.batch2placeholder(
            batches_c_r, batches_label)

        return turns, turn_num, turn_len, response, response_len, label

    def batch2placeholder(self, batches_c_r, batches_label):
        tmp = list(zip(*batches_c_r))
        example_id_c_r, turns, turn_num, turn_len, candidate, candidate_len = \
            tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5]

        tmp = list(zip(*batches_label))
        example_id_y, label = tmp[0], tmp[1]

        assert example_id_c_r == example_id_y

        # shuffle response order within one example
        candidate, candidate_len, label = self.shuffle_response(candidate, candidate_len, label)

        # generate elmo embedding
        turns, turn_len, candidate = self.elmo_emb(turns, turn_len, candidate)

        return turns, turn_num, turn_len, candidate, candidate_len, label

    def unison_shuffled_copies(self, a, b):
        assert len(a) == len(b)
        p = np.random.permutation(len(a))
        return a[p], b[p], p

    def shuffle_response(self, response, response_len, label):
        """
        responses contain the ground-truth id
        :param response: (batch_size, options_num, max_turn_len)
        :param response_len: (batch_size, options_num)
        :param label: (batch_size)
        :return:
        """
        tmp_response = np.zeros_like(response)
        tmp_response_len = np.zeros_like(response_len)
        tmp_label = np.zeros_like(label)
        for i in range(len(response)):
            tmp_response[i], tmp_response_len[i], shuffle_id = self.unison_shuffled_copies(
                np.array(response[i]), np.array(response_len[i]))
            tmp_label[i] = np.argwhere(shuffle_id == label[i])
        return tmp_response, tmp_response_len, tmp_label

    def get_context_response(self, data):
        """
        :param data:
        :param eos_idx:
        :param max_turn_num:
        :param max_turn_len:
        :return: array of tuple, tuple: (sent_list, example_turn_num, example_turn_len)
        """
        saver = []
        for c in range(data.shape[0]):
            turn_num = data['turn_num'][c]
            turn_len = data['turn_len'][c]
            c_s = data['context'][c]
            if len(c_s) > self.configs['max_turn_num']:
                c_s = c_s[-self.configs['max_turn_num']:]
                turn_num = self.configs['max_turn_num']
                turn_len = turn_len[-self.configs['max_turn_num']:]
            r_s = data['candidate'][c]
            res = np.array([data['id'][c], c_s, turn_num, turn_len, r_s,
                            data['candidate_len'][c]], dtype=object)
            saver.append(res)
        return np.array(saver)

    def get_label(self, data):
        saver = []
        for e in range(data.shape[0]):
            res = np.array([data['id'][e], 0], dtype=object)
            saver.append(res)
        return np.array(saver)

    def load_train_data(self):
        if os.path.exists(self.configs['process_train_data']) and \
                os.path.getsize(self.configs['process_train_data']) > 0:
            with open(self.configs['process_train_data'], 'rb') as f:
                train_c_r, train_label = pickle.load(f)
        else:
            with open(self.configs['train_data'], 'rb') as f:
                train_data = pickle.load(f)
            train_c_r = self.get_context_response(train_data)
            train_label = self.get_label(train_data)
            with open(self.configs['process_train_data'], 'wb') as f:
                pickle.dump((train_c_r, train_label), f)
        return train_c_r, train_label

    def load_dev_data(self):
        if os.path.exists(self.configs['process_dev_data']) and \
                os.path.getsize(self.configs['process_dev_data']) > 0:
            with open(self.configs['process_dev_data'], 'rb') as f:
                dev_c_r, dev_label = pickle.load(f)
        else:
            with open(self.configs['dev_data'], 'rb') as f:
                dev_data = pickle.load(f)
            dev_c_r = self.get_context_response(dev_data)
            dev_label = self.get_label(dev_data)
            with open(self.configs['process_dev_data'], 'wb') as f:
                pickle.dump((dev_c_r, dev_label), f)
        return dev_c_r, dev_label

    def elmo_emb(self, turns, turn_len, candidate):
        _turns = []
        _candidate = []
        _turns_len = []
        for idx in range(self.configs['batch_size']):
            turns_emb = self.elmo.embed_batch(turns[idx])
            candidate_emb = self.elmo.embed_batch(candidate[idx])

            pad_len = np.zeros(shape=[self.configs['max_turn_num']])
            pad_len[:len(turn_len[idx])] = turn_len[idx]
            _turns_len.append(pad_len)

            # Padding turns embedding
            turns_emb_pad = []
            for i, emb in enumerate(turns_emb):
                pad_emb = np.zeros(shape=[self.configs['elmo_layer'],
                                          self.configs['max_turn_len'],
                                          self.configs['emb_size']],
                                   dtype=np.float32)
                pad_emb[:emb.shape[0], :emb.shape[1], :emb.shape[2]] = emb
                turns_emb_pad.append(pad_emb)
            turns_emb_pad = np.array(turns_emb_pad)

            turns_pad = np.zeros(shape=[self.configs['max_turn_num'],
                                        self.configs['elmo_layer'],
                                        self.configs['max_turn_len'],
                                        self.configs['emb_size']],
                                 dtype=np.float32)
            turns_pad[:turns_emb_pad.shape[0], :, :, :] = turns_emb_pad

            # Padding candidate embedding
            candidate_emb_pad = []
            for emb in candidate_emb:
                pad_emb = np.zeros(shape=[self.configs['elmo_layer'],
                                          self.configs['max_turn_len'],
                                          self.configs['emb_size']],
                                   dtype=np.float32)
                pad_emb[:emb.shape[0], :emb.shape[1], :emb.shape[2]] = emb
                candidate_emb_pad.append(pad_emb)
            candidate_emb_pad = np.array(candidate_emb_pad)

            _turns.append(turns_pad)
            _candidate.append(candidate_emb_pad)

        _turns = np.array(_turns)
        _candidate = np.array(_candidate)
        _turns_len = np.array(_turns_len)

        return _turns, _turns_len, _candidate
def main():
    parser = argparse.ArgumentParser(description='Tuning with bi-directional RNN-CNN-CRF')
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], help='architecture of rnn', required=True)
    parser.add_argument('--num_epochs', type=int, default=100, help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16, help='Number of sentences in each batch')
    parser.add_argument('--hidden_size', type=int, default=128, help='Number of hidden units in RNN')
    parser.add_argument('--tag_space', type=int, default=0, help='Dimension of tag space')
    parser.add_argument('--num_filters', type=int, default=30, help='Number of filters in CNN')
    parser.add_argument('--char_dim', type=int, default=30, help='Dimension of Character embeddings')
    parser.add_argument('--learning_rate', type=float, default=0.015, help='Learning rate')
    parser.add_argument('--alpha', type=float, default=0.1, help='alpha of rmsprop')
    parser.add_argument('--momentum', type=float, default=0, help='momentum')
    parser.add_argument('--lr_decay', type=float, default=0, help='Decay rate of learning rate')
    parser.add_argument('--gamma', type=float, default=0.0, help='weight for regularization')
    parser.add_argument('--dropout', choices=['std', 'variational'], help='type of dropout', required=True)
    parser.add_argument('--p', type=float, default=0.5, help='dropout rate')
    parser.add_argument('--bigram', action='store_true', help='bi-gram parameter for CRF')
    parser.add_argument('--schedule', type=int, help='schedule for learning rate decay')
    parser.add_argument('--unk_replace', type=float, default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--embedding', choices=['word2vec', 'glove', 'senna', 'sskip', 'polyglot', 'elmo'],
                        help='Embedding for words', required=True)
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--elmo_option', help='path for ELMo option file')
    parser.add_argument('--elmo_weight', help='path for ELMo weight file')
    parser.add_argument('--elmo_cuda', help='assign GPU for ELMo embedding task')
    parser.add_argument('--attention', choices=['none', 'mlp', 'fine'], help='attention mode', required=True)
    parser.add_argument('--data_reduce', help='data size reduction; value is the keep rate', default=1.0)
    parser.add_argument('--train')  # "data/POS-penn/wsj/split1/wsj1.train.original"
    parser.add_argument('--dev')  # "data/POS-penn/wsj/split1/wsj1.dev.original"
    parser.add_argument('--test')  # "data/POS-penn/wsj/split1/wsj1.test.original"

    args = parser.parse_args()

    logger = get_logger("NERCRF")

    mode = args.mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    num_filters = args.num_filters
    learning_rate = args.learning_rate
    alpha = args.alpha
    momentum = args.momentum
    lr_decay = args.lr_decay
    gamma = args.gamma
    schedule = args.schedule
    p = args.p
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    elmo_option = args.elmo_option
    elmo_weight = args.elmo_weight
    elmo_cuda = int(args.elmo_cuda)
    attention_mode = args.attention
    data_reduce = float(args.data_reduce)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, pos_alphabet, \
    chunk_alphabet, ner_alphabet = bionlp_data.create_alphabets(
        os.path.join(Path(train_path).parent.abspath(), "alphabets"),
        train_path,
        data_paths=[dev_path, test_path],
        embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    if embedding == 'elmo':
        logger.info("Loading ELMo Embedder")
        ee = ElmoEmbedder(options_file=elmo_option, weight_file=elmo_weight, cuda_device=elmo_cuda)
    else:
        ee = None

    logger.info("Reading Data")
    use_gpu = torch.cuda.is_available()

    data_train = bionlp_data.read_data_to_variable(train_path, word_alphabet, char_alphabet,
                                                   pos_alphabet, chunk_alphabet, ner_alphabet,
                                                   use_gpu=use_gpu, elmo_ee=ee, data_reduce=data_reduce)
    num_data = sum(data_train[1])
    num_labels = ner_alphabet.size()

    data_dev = bionlp_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet,
                                                 pos_alphabet, chunk_alphabet, ner_alphabet,
                                                 use_gpu=use_gpu, volatile=True, elmo_ee=ee)
    data_test = bionlp_data.read_data_to_variable(test_path, word_alphabet, char_alphabet,
                                                  pos_alphabet, chunk_alphabet, ner_alphabet,
                                                  use_gpu=use_gpu, volatile=True, elmo_ee=ee)

    writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[bionlp_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if embedd_dict is not None and word in embedd_dict:
                embedding = embedd_dict[word]
            elif embedd_dict is not None and word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = 1
    tag_space = args.tag_space
    if args.dropout == 'std':
        if attention_mode == 'none':
            network = BiRecurrentConvCRF(embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
                                         num_filters, window, mode, hidden_size, num_layers, num_labels,
                                         tag_space=tag_space, embedd_word=word_table, p_in=p, p_rnn=p,
                                         bigram=bigram, elmo=(embedding == 'elmo'))
        else:
            network = BiRecurrentConvAttentionCRF(embedd_dim, word_alphabet.size(), char_dim,
                                                  char_alphabet.size(), num_filters, window, mode,
                                                  hidden_size, num_layers, num_labels,
                                                  tag_space=tag_space, embedd_word=word_table,
                                                  p_in=p, p_rnn=p, bigram=bigram,
                                                  elmo=(embedding == 'elmo'),
                                                  attention_mode=attention_mode)
    else:
        raise NotImplementedError

    if use_gpu:
        network.cuda()

    lr = learning_rate
    # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    optim = RMSprop(network.parameters(), lr=lr, alpha=alpha, momentum=momentum, weight_decay=gamma)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s" % (
        mode, num_layers, hidden_size, num_filters, tag_space, 'bigram' if bigram else 'unigram'))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, dropout: %.2f, unk replace: %.2f)" % (
        gamma, num_data, batch_size, p, unk_replace))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (
            epoch, mode, args.dropout, lr, lr_decay, schedule))
        train_err = 0.
        train_total = 0.

        start_time = time.time()
        num_back = 0
        network.train()
        for batch in range(1, num_batches + 1):
            word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable(
                data_train, batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss = network.loss(word, char, labels, mask=masks, elmo_word=elmo_embedding)
            loss.backward()
            clip_grad_norm(network.parameters(), 5.0)
            optim.step()

            num_inst = word.size(0)
            train_err += loss.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (num_batches - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, num_batches, train_err / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' % (
            num_batches, train_err / train_total, time.time() - start_time))

        # evaluate performance on dev data
        network.eval()
        tmp_filename = 'tmp/%s_dev%d' % (str(uid), epoch)
        writer.start(tmp_filename)

        for batch in bionlp_data.iterate_batch_variable(data_dev, batch_size):
            word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
            preds, _ = network.decode(word, char, target=labels, mask=masks,
                                      leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                                      elmo_word=elmo_embedding)
            writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(),
                         preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
        writer.close()
        acc, precision, recall, f1 = evaluate(tmp_filename)
        print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (acc, precision, recall, f1))

        if dev_f1 < f1:
            dev_f1 = f1
            dev_acc = acc
            dev_precision = precision
            dev_recall = recall
            best_epoch = epoch

            # evaluate on test data when better performance detected
            tmp_filename = 'tmp/%s_test%d' % (str(uid), epoch)
            writer.start(tmp_filename)

            for batch in bionlp_data.iterate_batch_variable(data_test, batch_size):
                word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                preds, _ = network.decode(word, char, target=labels, mask=masks,
                                          leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                                          elmo_word=elmo_embedding)
                writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(),
                             preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
            writer.close()
            test_acc, test_precision, test_recall, test_f1 = evaluate(tmp_filename)

        print("best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            test_acc, test_precision, test_recall, test_f1, best_epoch))

        if epoch % schedule == 0:
            # lr = learning_rate / (1.0 + epoch * lr_decay)
            # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
            lr = lr * lr_decay
            optim.param_groups[0]['lr'] = lr
def main():
    parser = argparse.ArgumentParser(description='Tuning with Multitask bi-directional RNN-CNN-CRF')
    parser.add_argument('--config', help='Config file (Python file format)', default="config_multitask.py")
    parser.add_argument('--grid', help='Grid Search Options', default="{}")
    args = parser.parse_args()

    logger = get_logger("Multi-Task")
    use_gpu = torch.cuda.is_available()

    # Config Tensorboard Writer
    log_writer = SummaryWriter()

    # Load from config file
    spec = importlib.util.spec_from_file_location("config", args.config)
    config_module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(config_module)
    config = config_module.entries

    # Load options from grid search
    options = eval(args.grid)
    for k, v in options.items():
        if isinstance(v, six.string_types):
            cmd = "%s = \"%s\"" % (k, v)
        else:
            cmd = "%s = %s" % (k, v)
            log_writer.add_scalar(k, v, 1)
        exec(cmd)

    # Load embedding dict
    embedding = config.embedding.embedding_type
    embedding_path = config.embedding.embedding_dict
    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    # Collect data paths
    data_dir = config.data.data_dir
    data_names = config.data.data_names
    train_paths = [os.path.join(data_dir, data_name, "train.tsv") for data_name in data_names]
    dev_paths = [os.path.join(data_dir, data_name, "devel.tsv") for data_name in data_names]
    test_paths = [os.path.join(data_dir, data_name, "test.tsv") for data_name in data_names]

    # Create alphabets
    logger.info("Creating Alphabets")
    if not os.path.exists('tmp'):
        os.mkdir('tmp')
    word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet, ner_alphabet_task, label_reflect = \
        bionlp_data.create_alphabets(os.path.join(Path(data_dir).abspath(), "alphabets", "_".join(data_names)),
                                     train_paths, data_paths=dev_paths + test_paths,
                                     use_cache=True, embedd_dict=embedd_dict, max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    logger.info("NER Alphabet Size per Task: %s",
                str([task_alphabet.size() for task_alphabet in ner_alphabet_task]))

    # task_reflects = torch.LongTensor(reverse_reflect(label_reflect, ner_alphabet.size()))
    # if use_gpu:
    #     task_reflects = task_reflects.cuda()

    if embedding == 'elmo':
        logger.info("Loading ELMo Embedder")
        ee = ElmoEmbedder(options_file=config.embedding.elmo_option,
                          weight_file=config.embedding.elmo_weight,
                          cuda_device=config.embedding.elmo_cuda)
    else:
        ee = None

    logger.info("Reading Data")

    # Prepare dataset
    data_trains = [
        bionlp_data.read_data_to_variable(train_path, word_alphabet, char_alphabet, pos_alphabet,
                                          chunk_alphabet, ner_alphabet_task[task_id],
                                          use_gpu=use_gpu, elmo_ee=ee)
        for task_id, train_path in enumerate(train_paths)
    ]
    num_data = [sum(data_train[1]) for data_train in data_trains]
    num_labels = ner_alphabet.size()
    num_labels_task = [task_item.size() for task_item in ner_alphabet_task]

    data_devs = [
        bionlp_data.read_data_to_variable(dev_path, word_alphabet, char_alphabet, pos_alphabet,
                                          chunk_alphabet, ner_alphabet_task[task_id],
                                          use_gpu=use_gpu, volatile=True, elmo_ee=ee)
        for task_id, dev_path in enumerate(dev_paths)
    ]
    data_tests = [
        bionlp_data.read_data_to_variable(test_path, word_alphabet, char_alphabet, pos_alphabet,
                                          chunk_alphabet, ner_alphabet_task[task_id],
                                          use_gpu=use_gpu, volatile=True, elmo_ee=ee)
        for task_id, test_path in enumerate(test_paths)
    ]

    writer = BioNLPWriter(word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[bionlp_data.UNK_ID, :] = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if embedd_dict is not None and word in embedd_dict:
                embedding = embedd_dict[word]
            elif embedd_dict is not None and word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                embedding = np.random.uniform(-scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    # Construct network
    window = 3
    num_layers = 1
    mode = config.rnn.mode
    hidden_size = config.rnn.hidden_size
    char_dim = config.rnn.char_dim
    num_filters = config.rnn.num_filters
    tag_space = config.rnn.tag_space
    bigram = config.rnn.bigram
    attention_mode = config.rnn.attention
    if config.rnn.dropout == 'std':
        network = FullySharedBiRecurrentCRF(len(data_trains), embedd_dim, word_alphabet.size(),
                                            char_dim, char_alphabet.size(), num_filters, window,
                                            mode, hidden_size, num_layers, num_labels,
                                            num_labels_task=num_labels_task, tag_space=tag_space,
                                            embedd_word=word_table, p_in=config.rnn.p, p_rnn=config.rnn.p,
                                            bigram=bigram, elmo=(embedding == 'elmo'),
                                            attention_mode=attention_mode,
                                            adv_loss_coef=config.multitask.adv_loss_coef,
                                            diff_loss_coef=config.multitask.diff_loss_coef,
                                            char_level_rnn=config.rnn.char_level_rnn)
    else:
        raise NotImplementedError

    if use_gpu:
        network.cuda()

    # Prepare training
    unk_replace = config.embedding.unk_replace
    num_epochs = config.training.num_epochs
    batch_size = config.training.batch_size
    lr = config.training.learning_rate
    momentum = config.training.momentum
    alpha = config.training.alpha
    lr_decay = config.training.lr_decay
    schedule = config.training.schedule
    gamma = config.training.gamma
    # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True)
    optim = RMSprop(network.parameters(), lr=lr, alpha=alpha, momentum=momentum, weight_decay=gamma)
    logger.info("Network: %s, num_layer=%d, hidden=%d, filter=%d, tag_space=%d, crf=%s" % (
        mode, num_layers, hidden_size, num_filters, tag_space, 'bigram' if bigram else 'unigram'))
    logger.info("training: l2: %f, (#training data: %s, batch: %d, dropout: %.2f, unk replace: %.2f)" % (
        gamma, num_data, batch_size, config.rnn.p, unk_replace))

    num_batches = [x // batch_size + 1 for x in num_data]
    dev_f1 = [0.0 for x in num_data]
    dev_acc = [0.0 for x in num_data]
    dev_precision = [0.0 for x in num_data]
    dev_recall = [0.0 for x in num_data]
    test_f1 = [0.0 for x in num_data]
    test_acc = [0.0 for x in num_data]
    test_precision = [0.0 for x in num_data]
    test_recall = [0.0 for x in num_data]
    best_epoch = [0 for x in num_data]

    # Training procedure
    for epoch in range(1, num_epochs + 1):
        print('Epoch %d (%s(%s), learning rate=%.4f, decay rate=%.4f (schedule=%d)): ' % (
            epoch, mode, config.rnn.dropout, lr, lr_decay, schedule))
        train_err = 0.
        train_total = 0.

        # Gradient descent on training data
        start_time = time.time()
        num_back = 0
        network.train()
        batch_count = 0
        for batch in range(1, 2 * num_batches[0] + 1):
            r = random.random()
            task_id = 0 if r <= 0.5 else random.randint(1, len(num_data) - 1)
            batch_count += 1
            word, char, _, _, labels, masks, lengths, elmo_embedding = bionlp_data.get_batch_variable(
                data_trains[task_id], batch_size, unk_replace=unk_replace)

            optim.zero_grad()
            loss, task_loss, adv_loss, diff_loss = network.loss(
                task_id, word, char, labels, mask=masks, elmo_word=elmo_embedding)
            # log_writer.add_scalars(
            #     'train_loss_task' + str(task_id),
            #     {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss},
            #     (epoch - 1) * (num_batches[task_id] + 1) + batch
            # )
            # log_writer.add_scalars(
            #     'train_loss_overview',
            #     {'all_loss': loss, 'task_loss': task_loss, 'adv_loss': adv_loss, 'diff_loss': diff_loss},
            #     (epoch - 1) * (sum(num_batches) + 1) + batch_count
            # )
            loss.backward()
            clip_grad_norm(network.parameters(), 5.0)
            optim.step()

            num_inst = word.size(0)
            train_err += loss.data[0] * num_inst
            train_total += num_inst

            time_ave = (time.time() - start_time) / batch
            time_left = (2 * num_batches[0] - batch) * time_ave

            # update log
            if batch % 100 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss: %.4f, time left (estimated): %.2fs' % (
                    batch, 2 * num_batches[0], train_err / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, time: %.2fs' % (
            2 * num_batches[0], train_err / train_total, time.time() - start_time))

        # Evaluate performance on dev data
        network.eval()
        for task_id in range(len(num_batches)):
            tmp_filename = 'tmp/%s_dev%d%d' % (str(uid), epoch, task_id)
            writer.start(tmp_filename)

            for batch in bionlp_data.iterate_batch_variable(data_devs[task_id], batch_size):
                word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                preds, _ = network.decode(task_id, word, char, target=labels, mask=masks,
                                          leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                                          elmo_word=elmo_embedding)
                writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(),
                             preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
            writer.close()
            acc, precision, recall, f1 = evaluate(tmp_filename)
            log_writer.add_scalars('dev_task' + str(task_id),
                                   {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1},
                                   epoch)
            print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (
                acc, precision, recall, f1))

            if dev_f1[task_id] < f1:
                dev_f1[task_id] = f1
                dev_acc[task_id] = acc
                dev_precision[task_id] = precision
                dev_recall[task_id] = recall
                best_epoch[task_id] = epoch

                # Evaluate on test data when better performance detected
                tmp_filename = 'tmp/%s_test%d%d' % (str(uid), epoch, task_id)
                writer.start(tmp_filename)

                for batch in bionlp_data.iterate_batch_variable(data_tests[task_id], batch_size):
                    word, char, pos, chunk, labels, masks, lengths, elmo_embedding = batch
                    preds, _ = network.decode(task_id, word, char, target=labels, mask=masks,
                                              leading_symbolic=bionlp_data.NUM_SYMBOLIC_TAGS,
                                              elmo_word=elmo_embedding)
                    writer.write(word.data.cpu().numpy(), pos.data.cpu().numpy(), chunk.data.cpu().numpy(),
                                 preds.cpu().numpy(), labels.data.cpu().numpy(), lengths.cpu().numpy())
                writer.close()
                # the original snippet is truncated here; the assignment is
                # completed by analogy with the single-task version above
                test_acc[task_id], test_precision[task_id], test_recall[task_id], test_f1[task_id] = \
                    evaluate(tmp_filename)
test_f1[task_id] = evaluate(tmp_filename) log_writer.add_scalars( 'test_task' + str(task_id), { 'accuracy': test_acc[task_id], 'precision': test_precision[task_id], 'recall': test_recall[task_id], 'f1': test_f1[task_id] }, epoch) print( "================================================================================" ) print("dataset: %s" % data_names[task_id]) print( "best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (dev_acc[task_id], dev_precision[task_id], dev_recall[task_id], dev_f1[task_id], best_epoch[task_id])) print( "best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (test_acc[task_id], test_precision[task_id], test_recall[task_id], test_f1[task_id], best_epoch[task_id])) print( "================================================================================\n" ) if epoch % schedule == 0: # lr = learning_rate / (1.0 + epoch * lr_decay) # optim = SGD(network.parameters(), lr=lr, momentum=momentum, weight_decay=gamma, nesterov=True) lr = lr * lr_decay optim.param_groups[0]['lr'] = lr # writer.export_scalars_to_json("./all_scalars.json") writer.close()
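# The multi-task loop above draws the main task (id 0) with probability 0.5 and
# otherwise picks one auxiliary task uniformly at random. A minimal sketch of
# that sampling rule (num_tasks and the trial count are illustrative):
import random

def sample_task_id(num_tasks):
    r = random.random()
    return 0 if r <= 0.5 else random.randint(1, num_tasks - 1)

counts = [0, 0, 0]
for _ in range(10000):
    counts[sample_task_id(3)] += 1
print(counts)  # roughly [5000, 2500, 2500]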

class ELMOCRFSegModel(LSTMCRFSegModel): def __init__(self, args, word_vocab): super().__init__(args, word_vocab) # import ElmoEmbedder here so that the cuda_visible_divices can work from allennlp.commands.elmo import ElmoEmbedder self.elmo = ElmoEmbedder(cuda_device=0 if args.gpu is not None else -1) def _setup_placeholders(self): self.placeholders = { 'input_words': tf.placeholder(tf.int32, shape=[None, None]), 'input_length': tf.placeholder(tf.int32, shape=[None]), 'elmo_vectors': tf.placeholder(tf.float32, shape=[None, 3, None, 1024]), 'seg_labels': tf.placeholder(tf.float32, shape=[None, None]), 'dropout_keep_prob': tf.placeholder(tf.float32) } def _embed(self): with tf.device('/cpu:0'): word_emb_init = tf.constant_initializer(self.word_vocab.embeddings) if self.word_vocab.embeddings is not None \ else tf.random_normal_initializer() self.word_embeddings = tf.get_variable( 'word_embeddings', shape=(self.word_vocab.size(), self.word_vocab.embed_dim), initializer=word_emb_init, trainable=False) self.embedded_words = tf.nn.embedding_lookup( self.word_embeddings, self.placeholders['input_words']) self.elmo_weights = tf.nn.softmax( tf.get_variable('elmo_weights', [3], dtype=tf.float32, trainable=True)) self.scale_para = tf.get_variable('scale_para', [1], dtype=tf.float32, trainable=True) self.elmo_vectors = self.scale_para * ( self.elmo_weights[0] * self.placeholders['elmo_vectors'][:, 0, :, :] + self.elmo_weights[1] * self.placeholders['elmo_vectors'][:, 1, :, :] + self.elmo_weights[2] * self.placeholders['elmo_vectors'][:, 2, :, :]) self.embedded_inputs = tf.concat( [self.embedded_words, self.elmo_vectors], -1) self.embedded_inputs = tf.nn.dropout( self.embedded_inputs, self.placeholders['dropout_keep_prob']) def _compute_loss(self): self.loss = tf.reduce_mean(-self.log_likelyhood, 0) if self.weight_decay > 0: with tf.variable_scope('l2_loss'): l2_loss = tf.add_n([ tf.nn.l2_loss(v) for v in tf.trainable_variables() if 'bias' not in v.name ]) self.loss += self.weight_decay * l2_loss def _train_epoch(self, train_batches, print_every_n_batch): total_loss, total_batch_num = 0, 0 for bitx, batch in enumerate(train_batches): feed_dict = { self.placeholders['input_words']: batch['word_ids'], self.placeholders['input_length']: batch['length'], self.placeholders['seg_labels']: batch['seg_labels'] } elmo_vectors, mask = self.elmo.batch_to_embeddings( [sample['words'] for sample in batch['raw_data']]) feed_dict[self.placeholders['elmo_vectors']] = np.asarray( elmo_vectors.cpu().data) feed_dict[self.placeholders[ 'dropout_keep_prob']] = self.dropout_keep_prob _, loss, grad_norm = self.sess.run( [self.train_op, self.loss, self.grad_norm], feed_dict) if bitx != 0 and print_every_n_batch > 0 and bitx % print_every_n_batch == 0: self.logger.info('bitx: {}, loss: {}, grad: {}'.format( bitx, loss, grad_norm)) total_loss += loss total_batch_num += 1 return total_loss / total_batch_num def segment(self, batch): feed_dict = { self.placeholders['input_words']: batch['word_ids'], self.placeholders['input_length']: batch['length'] } elmo_vectors, mask = self.elmo.batch_to_embeddings( [sample['words'] for sample in batch['raw_data']]) feed_dict[self.placeholders['elmo_vectors']] = np.asarray( elmo_vectors.data.cpu()) feed_dict[self.placeholders['dropout_keep_prob']] = 1.0 scores, trans_params = self.sess.run([self.scores, self.trans_params], feed_dict) batch_pred_segs = [] # log_likes = [] for sample_idx in range(len(batch['raw_data'])): length = batch['length'][sample_idx] viterbi_seq, viterbi_score = 
tc.crf.viterbi_decode( scores[sample_idx][:length], trans_params) # with tf.Graph().as_default(), tf.Session() as session: # length_tensor = tf.expand_dims(c2t(length), axis=0) # viterbi_seq_tensor = tf.expand_dims(c2t(viterbi_seq), axis=0) # scores_tensor = c2t(scores) # trans_params_tensor = c2t(trans_params) # log_likelihood, tparams = tc.crf.crf_log_likelihood(scores_tensor, viterbi_seq_tensor, length_tensor, trans_params_tensor) # log_like_numpy = session.run(log_likelihood) # log_likes.append(log_like_numpy) # tf.get_default_graph().finalize() pred_segs = [] for word_idx, label in enumerate(viterbi_seq): if label == 1: pred_segs.append(word_idx) batch_pred_segs.append(pred_segs) return batch_pred_segs # , log_likes
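# The segment extraction at the end of `segment` above reduces each Viterbi
# label sequence to boundary positions: every index labelled 1 is a predicted
# segment boundary. A tiny worked example with an illustrative label sequence:
viterbi_seq = [0, 0, 1, 0, 1, 0, 0]
pred_segs = [word_idx for word_idx, label in enumerate(viterbi_seq) if label == 1]
print(pred_segs)  # [2, 4]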
"""
@author: OHyic
"""
from allennlp.commands.elmo import ElmoEmbedder
import numpy as np

# define max token length
max_tokens = 60

# input sentences
sentences = [
    "how are you doing",
    "what is your name",
    "can you subscribe to my channel"
]

# create a pretrained ELMo model (requires an internet connection)
elmo = ElmoEmbedder(cuda_device=0)

embeddings = []
# loop through the input sentences; embed_sentences expects lists of tokens,
# so split each sentence on whitespace
for index, elmo_embedding in enumerate(
        elmo.embed_sentences([s.split() for s in sentences])):
    print("elmo:", index)
    # Average the 3 layers returned from ELMo
    avg_elmo_embedding = np.average(elmo_embedding, axis=0)
    # Pad with zero vectors up to max_tokens, or truncate down to it
    padding_length = max_tokens - avg_elmo_embedding.shape[0]
    if padding_length > 0:
        avg_elmo_embedding = np.append(avg_elmo_embedding,
                                       np.zeros((padding_length, avg_elmo_embedding.shape[1])),
                                       axis=0)
    else:
        avg_elmo_embedding = avg_elmo_embedding[:max_tokens]
    embeddings.append(avg_elmo_embedding)
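# Because every averaged embedding above is padded or truncated to exactly
# max_tokens rows, the per-sentence results can be stacked into one batch
# array. A standalone sketch with illustrative shapes (1024 matches the
# original ELMo model; other option files give other widths):
import numpy as np

max_tokens, dim = 60, 1024
embs = [np.zeros((n, dim)) for n in (4, 7)]  # stand-ins for averaged embeddings
padded = [np.vstack([e, np.zeros((max_tokens - len(e), dim))]) for e in embs]
print(np.stack(padded).shape)  # (2, 60, 1024)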
import numpy as np from allennlp.commands.elmo import ElmoEmbedder import time ### ELMo embedding on training data weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" start = time.time() print("Downloading elmo model...") elmo = ElmoEmbedder(options_file, weight_file) print("Downloaded in %fs" % (time.time()-start)) start = time.time() sentences = ["First sentence .".split(), "Another one".split()] X = elmo.embed_sentence(sentences[1]) print(X.shape) print('Type: ', type(elmo)) print("Embedding done in %fs." % (time.time()-start))
class EntityEmbedder: def __init__(self): """ Initialize the value for constants which are useful to drive the behaviour """ # list of model names self.ELMO_NAME = 'Elmo' # list of extraction modes for Elmo self.LAYER_2 = 'layer_2' self.LAYER_1 = 'layer_1' self.LAYER_0 = 'layer_0' self.MEAN = 'mean' # list of word phrase aggregation names self.VECTOR_MEAN = 'vector_mean' def initialize_embedder_model(self, model_name, corpus): """ setup the variables which will determine how words will be translated in vectors :param model_name: the string which identifies the model used to embed values: ELMO_NAME: the allennlp's ElmoEmbedder is used, this take one sentence at time a sentence is a list of single words (['this', 'is', 'a', 'sentence']) corpus: a corpus in a format in accord with the model specifications: ELMO_NAME: a list of lists, each sublist is a sentence in the format ['this', 'is', 'a', 'sentence'] """ if model_name == self.ELMO_NAME: self.model = ElmoEmbedder(cuda_device=0) self.model_name = model_name self.corpus = corpus def setup(self, model_name, extraction_mode, occurrences_of_entities_path, aggregation_method, corpus, verbose=False): """ setup the values to drive the behaviour and setup the resources :param model_name: the name of the embedding model (ELMO_NAME) extraction_mode: the modality to extract vectors for word: if model_name == ELMO_NAME then extraction_mode can take these values: [LAYER_0, LAYER_1, LAYER_2]: the vector returned comes from layer 0 / 1 / 2 of ELMO MEAN: the mean of layers 0, 1 and 2 is returned occurrences_of_entities_path: the path to the file which contains the occurrences of the entities (the output of CorpusManager.check_composite_words()) aggregation_method: the method used to aggregate token vectors in word phrases (for 'new york' there will be two vectors, we want only one) values: VECTOR_MEAN: the mean of all token vectors is returned corpus: a corpus in a format in accord with the model specifications (see inizialize_embedder_model for more specific description) :return: a list of indexes which are the row in which word appear """ print('setupping the embedder') self.initialize_embedder_model(model_name=model_name, corpus=corpus) self.extraction_mode = extraction_mode self.OCCURRENCE_OF_ENTITIES_PATH = occurrences_of_entities_path self.verbose = verbose self.aggregation_method = aggregation_method def set_extraction_mode(self, mode): """ setup the name of the extraction mode, which will be used to drive the other functions in the class :param mode: the string which identifies the mode used to extract vectors of the sentences """ self.extraction_mode = mode def extract_embedding(self, model_output): """ returns the embeddings starting from the output of the model :param model_output: the desired output of self.model """ if self.extraction_mode == self.LAYER_2 and self.model_name == self.ELMO_NAME: return model_output[2] if self.extraction_mode == self.LAYER_1 and self.model_name == self.ELMO_NAME: return model_output[1] if self.extraction_mode == self.LAYER_0 and self.model_name == self.ELMO_NAME: return model_output[0] if self.extraction_mode == self.MEAN and self.model_name == self.ELMO_NAME: return (model_output[0] + model_output[1] + model_output[2]) / 3 def embed_sentence(self, sentence): """ returns the embedding of the input sentence based on the instantiated model :param sentence: if model_name == ELMO_NAME a sentence in this format: ['this', 'is', 'a', 'sentence'] """ if self.model_name == self.ELMO_NAME: return 
self.extract_embedding(self.model.embed_sentence(sentence)) def create_embedding_data_structure(self): """ creates the data structure useful to retrieve embeddings needs the output of the function 'check_composite_words' of the CorpusManager Class """ print('creating data structures') all_occurrences = load_data_with_pickle( self.OCCURRENCE_OF_ENTITIES_PATH) all_occurrences = [(k, v) if type(v[0]) == tuple else (k, v[0]) for k, v in all_occurrences.items() if len(k) > 2] all_occurrences = {x[0]: x[1] for x in all_occurrences} sentences_to_embed = [ v[0] for values in all_occurrences.values() for v in values ] if self.verbose: print('total found entity mentions: {}'.format( len(sentences_to_embed))) print( 'fraction of sentences with entity mentions: {:.2f} ({} on {})' .format( len(set(sentences_to_embed)) / len(self.corpus), len(set(sentences_to_embed)), len(self.corpus))) print('{:.2f} average entity mentions per sentence'.format( len(sentences_to_embed) / len(set(sentences_to_embed)))) embedding_data_structure = {index: [] for index in sentences_to_embed} for entity_mention, occurrences in all_occurrences.items(): for couple in occurrences: embedding_data_structure[couple[0]].append( (couple[1], entity_mention)) embedding_data_structure = { k: v for k, v in embedding_data_structure.items() if v } self.ordered_embedding_data_structure = OrderedDict( sorted(embedding_data_structure.items())) def extract_vectors_of_occurrences_in_corpus(self): """ returns the embedding of all input sentences based on the instantiated model :param sentences: if model_name == ELMO_NAME a list of sentence in this format: ['this', 'is', 'a', 'sentence'] """ print('generate vectors') self.vectors_dict = defaultdict(list) for row_index, occurrences in tqdm( self.ordered_embedding_data_structure.items()): vectors = self.embed_sentence(self.corpus[row_index]) for occ in occurrences: for word_index in occ[0]: if len(occ[1].split(' ')) == 1: self.vectors_dict[occ[1]].append(vectors[word_index]) else: vecs = [ vectors[w_i] for w_i in range( word_index, word_index + len(occ[1].split(' '))) ] self.vectors_dict[occ[1]].append( self.word_phrase_aggregation_method(vecs=vecs)) def word_phrase_aggregation_method(self, vecs): """ aggregates a list of vectors in accord to the aggregation method (extracts a single vector for the word phrase 'New York' starting from the vectors of 'New' and 'York') :param vecs: the list of vector to be aggregated """ if self.aggregation_method == self.VECTOR_MEAN: return np.mean(vecs, axis=0) def create_dataset(self, entity_dict, X_PATH, Y_PATH, entities_PATH): """ creates a dataset composed of: a list of vectors (X), a list of labels (Y), the entities names which order corresponds to values in X and Y (entities) :param entity_dict: a dict of entities which is in the format: {concept: [list of entities]}, used to set the Y values and the entities values X_PATH: the filepath in which save the list of vectors Y_PATH: the filepath in which save the list of labels entities_PATH: the filepath in which save the list of entities names """ print('creating dataset') reverse_dict = defaultdict(list) for k, words in entity_dict.items(): for w in words: reverse_dict[w].append(k) X = [] Y = [] entities = [] for label, label_vectors in self.vectors_dict.items(): if label in reverse_dict: for v in label_vectors: X.append(v) Y.append(reverse_dict[label][0]) entities.append(label) save_data_with_pickle(X_PATH, X) save_data_with_pickle(Y_PATH, Y) save_data_with_pickle(entities_PATH, entities)
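# word_phrase_aggregation_method above collapses the token vectors of a
# multi-word mention into one vector with an element-wise mean (VECTOR_MEAN).
# A tiny worked example with illustrative stand-ins for ELMo token vectors:
import numpy as np

vec_new = np.array([1.0, 0.0, 2.0])   # vector for 'new'
vec_york = np.array([3.0, 2.0, 0.0])  # vector for 'york'
mention_vector = np.mean([vec_new, vec_york], axis=0)
print(mention_vector)  # [2. 1. 1.]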
def _embed_batch_impl(
        self, batch: List[str],
        model: ElmoEmbedder) -> Generator[ndarray, None, None]:
    # ELMo expects a List[str], as it was designed for tokens/words with more
    # than one character, so each sequence string is split into a list of
    # single-character tokens before embedding.
    yield from model.embed_batch([list(seq) for seq in batch])
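# Usage sketch for the helper above: each sequence string becomes a list of
# single-character tokens. The default ElmoEmbedder and the example sequences
# are assumptions made purely for illustration.
from allennlp.commands.elmo import ElmoEmbedder

model = ElmoEmbedder()
batch = ["SEQWENCE", "PRTEIN"]
for emb in model.embed_batch([list(seq) for seq in batch]):
    print(emb.shape)  # (3, len(seq), 1024) with the default ELMo model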
def __init__(self, options_path: str, weights_path: str, device: int = 0): self.model = ElmoEmbedder(options_path, weights_path, cuda_device=device)
class BilmElmo(Bilm): def __init__(self, cuda_device, weights_path, options_path, vocab_path, batch_size=40, cutoff_elmo_vocab=50000): super().__init__() logging.info( 'creating elmo in device %d. weight path %s, vocab_path %s ' ' batch_size: %d' % ( cuda_device, weights_path, vocab_path, batch_size)) self.elmo = ElmoEmbedder(cuda_device=cuda_device, weight_file= weights_path, options_file=options_path ) self.batch_size = batch_size logging.info('warming up elmo') self._warm_up_elmo() logging.info('reading elmo weights') with h5py.File(weights_path, 'r', libver='latest', swmr=True) as fin: self.elmo_softmax_w = fin['softmax/W'][:cutoff_elmo_vocab, :].transpose() # self.elmo_softmax_b=fin['softmax/b'][:cutoff_elmo_vocab] self.elmo_word_vocab = [] self.elmo_word_vocab_lemmatized = [] # we prevent the prediction of these by removing their weights and their vocabulary altogether stop_words = {'<UNK>', '<S>', '</S>', '--', '..', '...', '....'} logging.info('reading elmo vocabulary') lines_to_remove = set() with open(vocab_path, encoding="utf-8") as fin: for idx, line in enumerate(fin): if idx == cutoff_elmo_vocab: break word = line.strip() if len(word) == 1 or word in stop_words: lines_to_remove.add(idx) self.elmo_word_vocab.append(word) with open(vocab_path + '.lemmatized', encoding="utf-8") as fin: for idx, line in enumerate(fin): if idx == cutoff_elmo_vocab: break word = line.strip() if len(word) == 1 or word in stop_words: lines_to_remove.add(idx) self.elmo_word_vocab_lemmatized.append(word) # remove stopwords self.elmo_word_vocab = [x for i, x in enumerate(self.elmo_word_vocab) if i not in lines_to_remove] self.elmo_word_vocab_lemmatized = [x for i, x in enumerate(self.elmo_word_vocab_lemmatized) if i not in lines_to_remove] self.elmo_softmax_w = np.delete(self.elmo_softmax_w, list(lines_to_remove), 1) # self.elmo_softmax_b = np.delete(self.elmo_softmax_b, list(lines_to_remove)) # logging.info('caching cnn embeddings') # self.elmo.elmo_bilm.create_cached_cnn_embeddings(self.elmo_word_vocab) # self.elmo.elmo_bilm._has_cached_vocab = True @staticmethod def create_lemmatized_vocabulary_if_needed(vocab_path): """ this creates a new voabulary file in the same directory as ELMo vocab where words has been lemmatized :param vocab_path: path to ELMo vocabulary :return: """ if not os.path.isfile(vocab_path + '.lemmatized'): # if there is not lemmatized vocabulary create it with open(vocab_path, encoding="utf-8") as fin: unlem = [x.strip() for x in fin.readlines()] logging.info('lemmatizing ELMo vocabulary') print('lemmatizing ELMo vocabulary') import spacy nlp = spacy.load("es", disable=['ner', 'parser']) #RL new_vocab = [] for spacyed in tqdm( nlp.pipe(unlem, batch_size=1000, n_threads=multiprocessing.cpu_count()), total=len(unlem)): new_vocab.append(spacyed[0].lemma_ if spacyed[0].lemma_ != '-PRON-' else spacyed[0].lower_) with open(vocab_path + '.lemmatized', 'w', encoding="utf-8") as fout: for word in new_vocab: fout.write('%s\n' % word) logging.info('lemmatization done and cached to file') print('lemmatization done and cached to file') def _warm_up_elmo(self): # running a few sentences in elmo will set it to a better state than initial zeros warm_up_sent = "En efecto , rematado ya su juicio , vino a dar en el más " \ "extraño pensamiento que jamás dio loco en el mundo ; y fue que " \ "le pareció convenible y necesario , así para el aumento de su honra " \ "como para el servicio de su república , hacerse caballero andante , e irse " \ "por todo el mundo con sus armas y caballo a buscar las " \ 
"aventuras y a ejercitarse en todo aquello que él había leído que " \ "los caballeros andantes se ejercitaban , deshaciendo todo género de agravio , y poniéndose " \ "en ocasiones y peligros donde , acabándolos , cobrase eterno nombre y fama .".split() for _ in range(3): _ = list(self.elmo.embed_sentences([warm_up_sent] * self.batch_size, self.batch_size)) def _get_top_words_dist(self, state, cutoff): log_probs = np.matmul(state, self.elmo_softmax_w)# (not) + self.elmo_softmax_b - we prevent unconditionally probable substitutes predictions by ignoring the bias vector top_k_log_probs = np.argpartition(-log_probs, cutoff)[: cutoff] top_k_log_probs_vals = log_probs[top_k_log_probs] e_x = np.exp(top_k_log_probs_vals - np.max(top_k_log_probs_vals)) probs = e_x / e_x.sum(axis=0) return top_k_log_probs, probs def _embed_sentences(self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]], disable_symmetric_patterns) -> \ Tuple[List, List]: inst_id_sent_tuples = list(inst_id_to_sentence.items()) target = inst_id_sent_tuples[0][0].rsplit('.', 1)[0] to_embed = [] if disable_symmetric_patterns: # w/o sym. patterns - predict for blanked out word. # if the target word is the first or last in sentence get empty prediction by embedding '.' for _, (tokens, target_idx) in inst_id_sent_tuples: forward = tokens[:target_idx] backward = tokens[target_idx + 1:] if not forward: forward = ['.'] if not backward: backward = ['.'] to_embed.append(forward) to_embed.append(backward) else: # w/ sym. patterns - include target word + "and" afterwards in both directions for _, (tokens, target_idx) in inst_id_sent_tuples: # forward sentence to_embed.append(tokens[:target_idx + 1] + ['y']) #RL # backward sentence to_embed.append(['y'] + tokens[target_idx:]) #RL logging.info('embedding %d sentences for target %s' % (len(to_embed), target)) embedded = list(self.elmo.embed_sentences(to_embed, self.batch_size)) return inst_id_sent_tuples, embedded def predict_sent_substitute_representatives(self, inst_id_to_sentence: Dict[str, Tuple[List[str], int]], n_represent: int, n_samples_side: int, disable_symmetric_patterns: bool, disable_lemmatiziation: bool, prediction_cutoff: int) \ -> Dict[str, List[Dict[str, int]]]: """ a representative is a dictionary made out of samples from both sides of the BiLM, predicting substitutes for a contextualized token. 
an example might look like: {'forward_jump':2,'backward_leap':1, 'backward_climb':1} (n_samples_side=2) we return a list of n_representatives of those :param inst_id_to_sentence: dictionary instance_id -> (sentence tokens list, target word index in tokens) :param n_represent: number of representatives :param n_samples_side: number of samples to draw from each side :param disable_symmetric_patterns: if true words are predicted from context only :param disable_lemmatiziation: if true predictions are not lemmatized :param prediction_cutoff: only top prediction_cutoff LM prediction are considered :return: map from instance id to list of representatives """ inst_id_sent_tuples, embedded = self._embed_sentences(inst_id_to_sentence, disable_symmetric_patterns) lemma = inst_id_sent_tuples[0][0].split('.')[0] vocabulary_used = self.elmo_word_vocab if disable_lemmatiziation else self.elmo_word_vocab_lemmatized results = {} for i in range(len(inst_id_sent_tuples)): inst_id, (tokens, target_idx) = inst_id_sent_tuples[i] target_word_lower = tokens[target_idx].lower() sentence = ' '.join([t if i != target_idx else '***%s***' % t for i, t in enumerate(tokens)]) logging.info('instance %s sentence: %s' % (inst_id, sentence)) # these will be multiplied by ELMo's output matrix, [layer-number,token-index, state dims] # (first 512 state dims in elmo are the forward LM, 512:1024 are the backward LM) forward_out_em = embedded[i * 2][2, -1, :512] backward_out_em = embedded[i * 2 + 1][2, 0, 512:] forward_idxs, forward_dist = self._get_top_words_dist(forward_out_em, prediction_cutoff) backward_idxs, backward_dist = self._get_top_words_dist(backward_out_em, prediction_cutoff) forward_samples = [] # after removing samples equal to disamb. target, # we might end up with not enough samples, so repeat until we have enough samples while len(forward_samples) < n_represent * n_samples_side: new_samples = list( np.random.choice(forward_idxs, n_represent * n_samples_side * 2, p=forward_dist)) new_samples = [vocabulary_used[x] for x in new_samples if vocabulary_used[x].lower() != lemma and vocabulary_used[x].lower() != target_word_lower] forward_samples += new_samples backward_samples = [] while len(backward_samples) < n_represent * n_samples_side: new_samples = list( np.random.choice(backward_idxs, n_represent * n_samples_side * 2, p=backward_dist)) new_samples = [vocabulary_used[x] for x in new_samples if vocabulary_used[x].lower() != lemma and vocabulary_used[x].lower() != target_word_lower] backward_samples += new_samples logging.info('some forward samples: %s' % [x for x in forward_samples[:5]]) logging.info('some backward samples: %s' % [x for x in backward_samples[:5]]) representatives = [] for _ in range(n_represent): representative = dict() for _ in range(n_samples_side): for sample_src in forward_samples, backward_samples: sample_word = sample_src.pop() representative[sample_word] = representative.get(sample_word, 0) + 1 representatives.append(representative) logging.info('first 3 representatives out of %d:\n%s' % (n_represent, representatives[:3])) results[inst_id] = representatives return results
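# _get_top_words_dist above keeps the `cutoff` largest logits via argpartition,
# renormalizes them with a softmax, and representatives are then drawn with
# np.random.choice. A compact standalone sketch; the random logits stand in
# for state @ elmo_softmax_w:
import numpy as np

rng = np.random.RandomState(0)
log_probs = rng.randn(50000)
cutoff = 100
top_idx = np.argpartition(-log_probs, cutoff)[:cutoff]  # indices of top-k logits
vals = log_probs[top_idx]
e_x = np.exp(vals - np.max(vals))  # numerically stable softmax over the kept logits
probs = e_x / e_x.sum()
samples = np.random.choice(top_idx, 10, p=probs)
print(samples)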
test_tcr = pd.read_csv('TCR.csv') test_ac1, test_ac2, test_bc1, test_bc2 = abc('test_tracdr.txt', 'test_trbcdr.txt') test_ac3 = [] test_bc3 = [] for ind, out in test_tcr.iterrows(): test_ac3.append(out[1]) test_bc3.append(out[4]) model_dir = Path('uniref50_v2') weights = model_dir / 'weights.hdf5' options = model_dir / 'options.json' seqvec = ElmoEmbedder(options, weights, cuda_device=-1) def s2v(seq): embed1 = seqvec.embed_sentence(list(seq)) protein_embd1 = torch.tensor(embed1).sum(dim=0).mean(dim=0) return list(protein_embd1.detach().numpy()) def embed(l): value = [] uni = list(set(l)) for i, seq in enumerate(uni): sys.stdout.write('%d\r' % i) sys.stdout.flush() value.append(s2v(seq))
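# Usage sketch for s2v above: each sequence collapses to a single fixed-size
# vector (sum over the 3 ELMo layers, then mean over residues), so unequal
# sequence lengths all map to the same dimensionality. The example sequence is
# illustrative; 1024 is the per-layer width of the SeqVec model loaded above.
example = s2v("MKTAYIAKQR")
print(len(example))  # 1024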
def __init__(self): self.elmo = ElmoEmbedder()
def loadModel(args): # Load model according to the choice in "embed". print("Begin loading model...") embed = args.embedding start = time.time() if embed == "glove": from GloVe import loadAndCreateModel if args.path: path_to_glove = args.path # "./../../../../Perso/Pretrained-Embedding/GloVe/" print('GloVe path: ' + path_to_glove + '.\nWarning: in GloVe case, it must be the FOLDER path.') else: print('You need to give GloVe FOLDER path') sys.exit() if args.dimension: dim = args.dimension if dim not in [50, 100, 200, 300]: print( "Available GloVe dimension: 50, 100, 200 or 300. You chose %d !" % (dim)) sys.exit() else: dim = 50 print('Chosen dimension for GloVe: ', dim) start = time.time() model = loadAndCreateModel(dim, path_to_glove) vocab_size = len(model.keys()) d = len(model['hello']) elif embed == "numberBatch": from numberbatch import loadAndCreateNumberBatchModel start = time.time() dim = 300 model = loadAndCreateNumberBatchModel() vocab_size = len(model.keys()) d = len(model['hello']) elif embed == "miniNumberbatch": from miniNumberbatch import loadMiniNumberbatch if args.path: mNb_path = args.path #"./../17.06/mini.h5" print( 'Conceptnet model path: ' + mNb_path + '. Warning: in miniNumberbatch case, it must be the FILE.h5 path.' ) else: print('You need to give ConceptNet miniNumberBatch FILE.h5 path') sys.exit() start = time.time() model = loadMiniNumberbatch(mNb_path) vocab_size = len(model.keys()) d = len(model['hello']) elif embed == "elmo": from allennlp.commands.elmo import ElmoEmbedder ### ELMo embedding on training data if args.which_elmo: which_elmo = args.which_elmo # "small" print("Chosen ELMo option: ", which_elmo) else: which_elmo = "small" print('Default ELMo chosen: small.') if which_elmo == "small": weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" elif which_elmo == "medium": weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" elif which_elmo == "original": weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5" options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json" else: print('This option is not available...') sys.exit() start = time.time() print("Downloading elmo model...") model = ElmoEmbedder(options_file, weight_file) dim = model.embed_sentence(['Hello']).shape[2] d = dim vocab_size = 0 print("Downloaded in %fs" % (time.time() - start)) elif embed == 'infersent': from models import InferSent nltk.download('punkt') if args.path: inferSent_path = args.path print( 'InferSent model path: ' + inferSent_path + '. Warning: in InferSent case, it must be InferSent FOLDER path.' ) else: print('You need to give InferSent FOLDER path') sys.exit() if args.version == 1 or args.version == 2: model_version = args.version else: print( 'You need to choose InferSent version between 1 (Word2Vec input) or 2 (FastText input).' 
) sys.exit() if args.embedding_path: W2V_PATH = args.embedding_path print( 'InferSent pretrained embedding path: ' + W2V_PATH + '. Warning: in this case, it must be "model.txt" or "model.vec" path.' ) else: print( 'You need to give the InferSent "model.txt" or "model.vec" path' ) sys.exit() MODEL_PATH = os.path.join(inferSent_path, "./encoder/infersent%s.pkl" % model_version) params_model = { 'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version } model = InferSent(params_model) model.load_state_dict(torch.load(MODEL_PATH)) # If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings. # W2V_PATH = './../../../../Perso/Pretrained-Embedding/GloVe/glove.840B.300d.txt' if model_version == 1 else '../dataset/fastText/crawl-300d-2M.vec' model.set_w2v_path(W2V_PATH) # Load embeddings of K most frequent words vocab_size = 100000 model.build_vocab_k_words(K=vocab_size) d = model.encode(['hello guys']).shape[1] print('Model ' + embed.upper() + ' loaded in %fs.' % (time.time() - start)) print("Vocabulary size: %d" % vocab_size) print("Vector dimension: %d" % d) return model, d
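# Usage sketch for loadModel: a hypothetical argparse-style namespace selecting
# the small pretrained ELMo, which needs no local path (all values illustrative).
import argparse

args = argparse.Namespace(embedding='elmo', which_elmo='small', path=None,
                          dimension=None, version=None, embedding_path=None)
model, d = loadModel(args)
print('vector dimension:', d)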
def test_embed_empty_sentence(self):
    embedder = ElmoEmbedder(options_file=self.options_file,
                            weight_file=self.weight_file)
    embeddings = embedder.embed_sentence([])
    assert embeddings.shape == (3, 0, 1024)
import random

# set deterministic results
'''
SEED = 1234
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
'''

import pickle
from allennlp.commands.elmo import ElmoEmbedder

elmo = ElmoEmbedder()

from graph_lstm import *
from decoder import *
from graph2seq_model import *

# get the decoder vocab
with open('./data/vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)
print("Size of vocab: {}".format(vocab.idx))

# get the graph lstm synset vocab
with open('./data/synset_vocab.pkl', 'rb') as f:
    synset_vocab = pickle.load(f)
print("Size of synset vocab: {}".format(synset_vocab.idx))
import sys
sys.path.append('src')
import csv
import numpy as np
import new_data_io, SIF_embedding_lib
from allennlp.commands.elmo import ElmoEmbedder
from nltk import word_tokenize

"""
This area should be placed in the main call
"""
elmo = ElmoEmbedder()

class params(object):
    def __init__(self):
        self.LW = 1e-5
        self.LC = 1e-5
        self.eta = 0.05

    def __str__(self):
        t = "LW", self.LW, ", LC", self.LC, ", eta", self.eta
        t = map(str, t)
        return ' '.join(t)

cleanfile = "test.txt"
wordfile = 'glove.6B.100d.txt'  # word vector file, can be downloaded from the GloVe website
weightfile = 'enwiki_vocab_min200.txt'
(words, We) = new_data_io.getWordmap(wordfile)
weightpara = 1e-3
word2weight = new_data_io.getWordWeight(weightfile, weightpara)
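# A sketch of the SIF weighting these files feed: each word w gets weight
# a / (a + p(w)), where p(w) is its corpus frequency and a = weightpara above,
# so frequent words are down-weighted before word vectors are averaged
# (the frequencies below are illustrative):
a = 1e-3
freqs = {'the': 0.05, 'embedding': 1e-5}
weights = {w: a / (a + p) for w, p in freqs.items()}
print(weights)  # 'the' -> ~0.02, 'embedding' -> ~0.99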
class MLPRegression(Module): def __init__(self, embed_params, attention_type, all_attributes, output_size, layers, hand_feat_dim, device="cpu", embedding_dim=1024, turn_on_hand_feats=False, turn_on_embeddings=False): ''' Super class for training ''' super(MLPRegression, self).__init__() # Set model constants and embeddings self.device = device self.layers = layers self.embedding_dim = embedding_dim self.output_size = output_size self.attention_type = attention_type self.all_attributes = all_attributes self.is_hand_feats_on = turn_on_hand_feats self.is_embeds_on = turn_on_embeddings # Initialise embeddings if self.is_embeds_on: self._init_embeddings(embed_params) else: self.reduced_embedding_dim = 0 if self.is_hand_feats_on: self.hand_feat_dim = hand_feat_dim else: self.hand_feat_dim = 0 # Initialise regression layers and parameters self._init_regression() # Initialise attention parameters self._init_attention() def _init_embeddings(self, embedding_params): ''' Initialise embeddings ''' if type(embedding_params[0]) is str: self.vocab = None options_file = embedding_params[0] weight_file = embedding_params[1] self.embeddings = ElmoEmbedder(options_file, weight_file, cuda_device=0) # self.embeddings = Elmo(options_file, weight_file, 3, dropout=0) self.reduced_embedding_dim = 256 # ELMO tuning parameters self.embed_linmap_argpred_lower = Linear( self.embedding_dim, self.reduced_embedding_dim) self.embed_linmap_argpred_mid = Linear(self.embedding_dim, self.reduced_embedding_dim, bias=False) self.embed_linmap_argpred_top = Linear(self.embedding_dim, self.reduced_embedding_dim, bias=False) else: # GloVe embeddings glove_embeds = embedding_params[0] self.vocab = embedding_params[1] self.num_embeddings = len(self.vocab) self.embeddings = torch.nn.Embedding(self.num_embeddings, self.embedding_dim, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False) self.reduced_embedding_dim = 300 self.embeddings.weight.data.copy_( torch.from_numpy(glove_embeds.values)) self.embeddings.weight.requires_grad = False self.vocab_hash = {w: i for i, w in enumerate(self.vocab)} # self.embed_linmap = Linear(self.embedding_dim, self.reduced_embedding_dim) def _init_regression(self): ''' Define the linear maps ''' # Output regression parameters self.linmaps = ModuleDict( {prot: ModuleList([]) for prot in self.all_attributes.keys()}) for prot in self.all_attributes.keys(): last_size = self.reduced_embedding_dim # Handle varying size of dimension depending on representation if self.attention_type[prot]['repr'] == "root": if self.attention_type[prot]['context'] != "none": last_size *= 2 else: if self.attention_type[prot]['context'] == "none": last_size *= 2 else: last_size *= 3 # self.layer_norm[prot] = torch.nn.LayerNorm(last_size) last_size += self.hand_feat_dim for out_size in self.layers: linmap = Linear(last_size, out_size) self.linmaps[prot].append(linmap) last_size = out_size final_linmap = Linear(last_size, self.output_size) self.linmaps[prot].append(final_linmap) # Dropout layer self.dropout = Dropout() def _regression_nonlinearity(self, x): return F.relu(x) def _init_attention(self): ''' Initialises the attention map vector/matrix Takes attention_type-Span, Sentence, Span-param, Sentence-param as a parameter to decide the size of the attention matrix ''' self.att_map_repr = ModuleDict({}) self.att_map_W = ModuleDict({}) self.att_map_V = ModuleDict({}) self.att_map_context = ModuleDict({}) for prot in self.attention_type.keys(): # Token representation if self.attention_type[prot]['repr'] == "span": 
repr_dim = 2 * self.reduced_embedding_dim self.att_map_repr[prot] = Linear(self.reduced_embedding_dim, 1, bias=False) self.att_map_W[prot] = Linear(self.reduced_embedding_dim, self.reduced_embedding_dim) self.att_map_V[prot] = Linear(self.reduced_embedding_dim, 1, bias=False) elif self.attention_type[prot]['repr'] == "param": repr_dim = 2 * self.reduced_embedding_dim self.att_map_repr[prot] = Linear(self.reduced_embedding_dim, self.reduced_embedding_dim, bias=False) self.att_map_W[prot] = Linear(2 * self.reduced_embedding_dim, self.reduced_embedding_dim) self.att_map_V[prot] = Linear(self.reduced_embedding_dim, 1, bias=False) else: repr_dim = self.reduced_embedding_dim # Context representation # There is no attention for argument davidsonian if self.attention_type[prot]['context'] == 'param': self.att_map_context[prot] = Linear(repr_dim, self.reduced_embedding_dim, bias=False) elif self.attention_type[prot][ 'context'] == 'david' and prot == 'arg': self.att_map_context[prot] = Linear(repr_dim, self.reduced_embedding_dim, bias=False) def _choose_tokens(self, batch, lengths): ''' Extracts tokens from a batch at specified position(lengths) batch - batch_size x max_sent_length x embed_dim lengths - batch_size x max_span_length x embed_dim ''' idx = (lengths).unsqueeze(2).expand(-1, -1, batch.shape[2]) return batch.gather(1, idx).squeeze() def _get_inputs(self, words): ''' Return ELMO embeddings as root, span or param span ''' if not self.vocab: raw_embeds, masks = self.embeddings.batch_to_embeddings(words) # raw_ = self.embeddings(batch_to_ids(words).to(self.device)) # raw_embeds, masks = torch.cat([x.unsqueeze(1) for x in raw_['elmo_representations']], dim=1), raw_['mask'] masks = masks.unsqueeze(2).repeat( 1, 1, self.reduced_embedding_dim).byte() embedded_inputs = (self.embed_linmap_argpred_lower( raw_embeds[:, 0, :, :].squeeze()) + self.embed_linmap_argpred_mid( raw_embeds[:, 1, :, :].squeeze()) + self.embed_linmap_argpred_top( raw_embeds[:, 2, :, :].squeeze())) masked_embedded_inputs = embedded_inputs * masks.float() return masked_embedded_inputs, masks else: # Glove embeddings indices = [[self.vocab_hash[word] for word in sent] for sent in words] indices = torch.tensor(indices, dtype=torch.long, device=self.device) embeddings = self.embeddings(indices) masks = (embeddings != 0)[:, :, :self.reduced_embedding_dim].byte() # reduced_embeddings = self.embed_linmap(embeddings) * masks.float() return embeddings, masks def _get_representation(self, prot, embeddings, roots, spans, context=False): ''' returns the representation required from arguments passed by running attention based on arguments passed ''' # Get token(pred/arg) representation rep_type = self.attention_type[prot]['repr'] roots_rep_raw = self._choose_tokens(embeddings, roots) if len(roots_rep_raw.shape) == 1: roots_rep_raw = roots_rep_raw.unsqueeze(0) if rep_type == "root": token_rep = roots_rep_raw else: masks_spans = (spans == -1) spans[spans == -1] = 0 spans_rep_raw = self._choose_tokens(embeddings, spans) if len(spans_rep_raw.shape) == 1: spans_rep_raw = spans_rep_raw.unsqueeze(0).unsqueeze(1) elif len(spans_rep_raw.shape) == 2: if spans.shape[0] == 1: spans_rep_raw = spans_rep_raw.unsqueeze(0) elif spans.shape[1] == 1: spans_rep_raw = spans_rep_raw.unsqueeze(1) if rep_type == "span": att_raw = self.att_map_repr[prot](spans_rep_raw).squeeze() # additive attention # att_raw_w = torch.relu(self.att_map_W[prot](for_att)) # att_raw = self.att_map_V[prot](att_raw_w).squeeze() elif rep_type == "param": # att_param = 
torch.relu(self.att_map_repr[prot](roots_rep_raw)).unsqueeze(2) # att_raw = torch.matmul(spans_rep_raw, att_param).squeeze() # additive attention for_att = torch.cat( (spans_rep_raw, roots_rep_raw.unsqueeze(1).repeat( 1, spans_rep_raw.shape[1], 1)), dim=2) att_raw_w = torch.relu(self.att_map_W[prot](for_att)) att_raw = self.att_map_V[prot](att_raw_w).squeeze() att_raw = att_raw.masked_fill(masks_spans, -1e9) att = F.softmax(att_raw, dim=1) att = self.dropout(att) pure_token_rep = torch.matmul( att.unsqueeze(2).permute(0, 2, 1), spans_rep_raw).squeeze() if not context: token_rep = torch.cat((roots_rep_raw, pure_token_rep), dim=1) else: token_rep = pure_token_rep return token_rep def _run_attention(self, prot, embeddings, roots, spans, context_roots, context_spans, masks): ''' Various attention mechanisms implemented ''' # Get the required representation for pred/arg token_rep = self._get_representation(prot=prot, embeddings=embeddings, roots=roots, spans=spans) # Get the required representation for context of pred/arg context_type = self.attention_type[prot]['context'] if context_type == "none": context_rep = None elif context_type == "param": # Sentence level attention att_param = torch.relu( self.att_map_context[prot](token_rep)).unsqueeze(1) att_raw = torch.matmul(embeddings, att_param.permute(0, 2, 1)) att_raw = att_raw.masked_fill(masks[:, :, 0:1] == 0, -1e9) att = F.softmax(att_raw, dim=1) att = self.dropout(att) context_rep = torch.matmul(att.permute(0, 2, 1), embeddings).squeeze() elif context_type == "david": if prot == "arg": prot_context = 'pred' context_roots = torch.tensor(context_roots, dtype=torch.long, device=self.device).unsqueeze(1) max_span = max([len(a) for a in context_spans]) context_spans = torch.tensor([ a + [-1 for i in range(max_span - len(a))] for a in context_spans ], dtype=torch.long, device=self.device) context_rep = self._get_representation(context=True, prot=prot_context, embeddings=embeddings, roots=context_roots, spans=context_spans) else: prot_context = 'arg' context_rep = None for i, ctx_root in enumerate(context_roots): ctx_root = torch.tensor(ctx_root, dtype=torch.long, device=self.device).unsqueeze(1) max_span = max([len(a) for a in context_spans[i]]) ctx_span = torch.tensor([ a + [-1 for i in range(max_span - len(a))] for a in context_spans[i] ], dtype=torch.long, device=self.device) sentence = embeddings[i, :, :].unsqueeze(0).repeat( len(ctx_span), 1, 1) ctx_reps = self._get_representation(context=True, prot=prot_context, embeddings=sentence, roots=ctx_root, spans=ctx_span) if len(ctx_reps.shape) == 1: ctx_reps = ctx_reps.unsqueeze(0) # Attention over arguments att_nd_param = torch.relu(self.att_map_context[prot]( token_rep[i, :].unsqueeze(0))) att_raw = torch.matmul(att_nd_param, ctx_reps.permute(1, 0)) att = F.softmax(att_raw, dim=1) ctx_rep_final = torch.matmul(att, ctx_reps) if i: context_rep = torch.cat((context_rep, ctx_rep_final), dim=0).squeeze() else: context_rep = ctx_rep_final if context_rep is not None: inputs_for_regression = torch.cat((token_rep, context_rep), dim=1) else: inputs_for_regression = token_rep return inputs_for_regression def _run_regression(self, prot, x): ''' Run regression to get 3 attribute vector ''' for i, lin_map in enumerate(self.linmaps[prot]): if i: x = self._regression_nonlinearity(x) x = self.dropout(x) x = lin_map(x) return torch.sigmoid(x) def forward(self, prot, words, roots, spans, context_roots, context_spans, hand_feats): """ Forward propagation of activations """ if self.is_embeds_on: 
inputs_for_attention, masks = self._get_inputs(words) inputs_for_regression = self._run_attention( prot=prot, embeddings=inputs_for_attention, roots=roots, spans=spans, context_roots=context_roots, context_spans=context_spans, masks=masks) if self.is_hand_feats_on: inputs_for_regression = torch.cat( (inputs_for_regression, hand_feats), dim=1) elif self.is_hand_feats_on: inputs_for_regression = hand_feats else: sys.exit('You need some word representation!!') outputs = self._run_regression(prot=prot, x=inputs_for_regression) return outputs
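# _get_inputs above mixes ELMo's three layers by passing each through its own
# linear map and summing, projecting the 1024-d states down to a reduced size.
# A minimal standalone sketch of that combination (batch/token sizes are
# illustrative; only the first map carries a bias, as in the model):
import torch

embedding_dim, reduced_dim = 1024, 256
maps = [torch.nn.Linear(embedding_dim, reduced_dim, bias=(i == 0)) for i in range(3)]
raw_embeds = torch.randn(2, 3, 7, embedding_dim)  # (batch, layers, tokens, dim)
combined = sum(m(raw_embeds[:, i]) for i, m in enumerate(maps))
print(combined.shape)  # torch.Size([2, 7, 256])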
from allennlp.commands.elmo import ElmoEmbedder
import pickle
from utils import Config, safe_pickle_dump
import gensim

elmo = ElmoEmbedder(
    options_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json',
    weight_file='https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
)

db = pickle.load(open(Config.db_path, 'rb'))
summary_tokens = []
for pid, j in db.items():
    # idvv = '%sv%d' % (j['_rawid'], j['_version'])
    summary = j['summary'].replace('\n', ' ')
    summary = gensim.utils.simple_preprocess(summary)
    summary_tokens.append(summary)
print(len(summary_tokens))

elmo_embed = elmo.embed_batch(summary_tokens)
safe_pickle_dump(elmo_embed, 'elmo_embed.p')
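# embed_batch above runs the whole corpus through ELMo at once and keeps one
# (3, len, 1024) array per abstract, which can exhaust memory on a large dump.
# A common workaround, sketched here with an arbitrary chunk size, is to embed
# in chunks (embed_in_chunks is a hypothetical helper, not part of the script):
def embed_in_chunks(elmo, token_lists, chunk_size=64):
    out = []
    for i in range(0, len(token_lists), chunk_size):
        out.extend(elmo.embed_batch(token_lists[i:i + chunk_size]))
    return out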
def embedder(self): if self._memo_embedder is None: from allennlp.commands.elmo import ElmoEmbedder self._memo_embedder = ElmoEmbedder(options_file=options_file, weight_file=weight_file) return self._memo_embedder
def __init__(self, cuda_device, weights_path, vocab_path, batch_size=40, cutoff_elmo_vocab=50000): super().__init__() logging.info( 'creating elmo in device %d. weight path %s, vocab_path %s ' ' batch_size: %d' % (cuda_device, weights_path, vocab_path, batch_size)) self.elmo = ElmoEmbedder(cuda_device=cuda_device) self.batch_size = batch_size logging.info('warming up elmo') self._warm_up_elmo() logging.info('reading elmo weights') with h5py.File(weights_path, 'r', libver='latest', swmr=True) as fin: self.elmo_softmax_w = fin[ 'softmax/W'][:cutoff_elmo_vocab, :].transpose() self.elmo_word_vocab = [] self.elmo_word_vocab_lemmatized = [] # we prevent the prediction of these by removing their weights and their vocabulary altogether stop_words = {'<UNK>', '<S>', '</S>', '--', '..', '...', '....'} logging.info('reading elmo vocabulary') lines_to_remove = set() with open(vocab_path, encoding="utf-8") as fin: for idx, line in enumerate(fin): if idx == cutoff_elmo_vocab: break word = line.strip() if len(word) == 1 or word in stop_words: lines_to_remove.add(idx) self.elmo_word_vocab.append(word) with open(vocab_path + '.lemmatized', encoding="utf-8") as fin: for idx, line in enumerate(fin): if idx == cutoff_elmo_vocab: break word = line.strip() if len(word) == 1 or word in stop_words: lines_to_remove.add(idx) self.elmo_word_vocab_lemmatized.append(word) # remove stopwords self.elmo_word_vocab = [ x for i, x in enumerate(self.elmo_word_vocab) if i not in lines_to_remove ] self.elmo_word_vocab_lemmatized = [ x for i, x in enumerate(self.elmo_word_vocab_lemmatized) if i not in lines_to_remove ] self.elmo_softmax_w = np.delete(self.elmo_softmax_w, list(lines_to_remove), 1)
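# The pruning above relies on one invariant: the same indices must be removed
# from the vocabulary list and from the columns of the softmax weight matrix so
# that word i stays aligned with column i. A toy-sized sketch of that invariant:
import numpy as np

vocab = ['<UNK>', 'cat', '.', 'dog']
W = np.arange(8).reshape(2, 4)  # (state_dim, vocab_size)
drop = {0, 2}
vocab = [w for i, w in enumerate(vocab) if i not in drop]
W = np.delete(W, list(drop), axis=1)
assert W.shape[1] == len(vocab)  # alignment preserved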