def __init__(self, text: str, context: List[str], tokenizers: Dict[str, BaseTokenizer] = None):
    if tokenizers is None:
        tokenizers = {"tokens": WordTokenizer()}
    self.text = text
    self.context = context
    self.tokenizers = tokenizers
    self.tokens: Dict[str, List[Any]] = defaultdict(list)
    self.namespaces = list(tokenizers.keys())
    for namespace in tokenizers.keys():
        self.namespaces.append(f"contextual_{namespace}")

    # add tokens for the word tokens
    for namespace, tokenizer in self.tokenizers.items():
        tokens = tokenizer.tokenize(text)
        for token in tokens:
            self.add_token(token=token, namespace=namespace)

    # add tokens for the contextual lines
    for namespace, tokenizer in self.tokenizers.items():
        for contextual_line in self.context:
            tokens = tokenizer.tokenize(contextual_line)
            tokens = [Token(tok) for tok in tokens]
            self.tokens[f"contextual_{namespace}"].append(tokens)

    self.line = Line(text=text, tokenizers=self.tokenizers)
    self.context_lines = []
    for context_text in self.context:
        context_line = Line(text=context_text, tokenizers=self.tokenizers)
        self.context_lines.append(context_line)
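# Minimal usage sketch (assumption: this __init__ belongs to a line-with-context class,
# called LineWithContext below purely for illustration; the name is hypothetical).
# With the default tokenizer the object exposes a plain and a contextual namespace:
#
#     line_with_context = LineWithContext(
#         text="main sentence",
#         context=["previous sentence", "next sentence"],
#     )
#     line_with_context.namespaces           # -> ["tokens", "contextual_tokens"]
#     len(line_with_context.context_lines)   # -> 2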
def get_docs_labels_refs(self) -> (List[List[Line]], List[SeqLabel], List[Line]):
    docs: List[List[Line]] = []
    labels: List[SeqLabel] = []
    refs: List[Line] = []

    with open(self.filename, "r", encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not bool(line):
                continue
            line_sents, line_labels, line_ref = line.strip().split("###")
            sents: List[str] = [sent.strip() for sent in line_sents.split("\t")]
            sents_labels: List[str] = [
                sent_label.strip() for sent_label in line_labels.split(",")
            ]
            sents_refs: str = line_ref
            doc = [Line(text=sent, tokenizers=self.tokenizers) for sent in sents]
            label = SeqLabel(labels={"seq_label": sents_labels})
            ref = Line(text=sents_refs, tokenizers=self.tokenizers)
            docs.append(doc)
            labels.append(label)
            refs.append(ref)

    return docs, labels, refs
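# Illustrative sketch (assumption, not from the source): each file line read by
# get_docs_labels_refs is expected to hold tab-separated sentences, comma-separated
# per-sentence labels, and a reference summary, joined by "###":
example = "First sentence .\tSecond sentence .###0,1###Reference summary ."
line_sents, line_labels, line_ref = example.split("###")
sents = [sent.strip() for sent in line_sents.split("\t")]            # ["First sentence .", "Second sentence ."]
sents_labels = [label.strip() for label in line_labels.split(",")]   # ["0", "1"]
# line_ref -> "Reference summary ."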
def setup_scorer(abs_sum_dataset_manager):
    dataset_manager = abs_sum_dataset_manager
    scorer = SummarizationMetrics(dataset_manager)
    lines = [
        Line("word11_train word21_train"),
        Line("word12_train word22_train word32_train"),
    ]
    true_summary = [
        Line("word11_label word21_label"),
        Line("word11_label word22_label"),
    ]
    true_summary_tokens = ["word11_label", "word22_label", "word33_label"]
    pred_summary_tokens = [
        "word11_label",
        "word22_label",
        "word23_label",
        "word33_label",
    ]
    predicted_tags = {"predicted_tags_tokens": [[0, 2], [1, 4, 5]]}
    return (
        scorer,
        (lines, true_summary, predicted_tags),
        (true_summary_tokens, pred_summary_tokens),
    )
def setup_lstm2seqdecoder(request):
    HIDDEN_DIM = 1024
    NUM_LAYERS = request.param[0]
    BIDIRECTIONAL = request.param[1]
    TEACHER_FORCING_RATIO = request.param[3]
    MAX_LENGTH = 5
    lines = []
    words = []
    # texts = ["First", "second", "Third"]
    texts = ["First sentence", "second sentence", "Third long sentence here"]
    for text in texts:
        line = Line(text=text)
        word = Line(text=text.split()[0])
        lines.append(line)
        words.append(word)

    # build a word-level vocabulary from the test sentences
    flat_texts = [[word for sentence in texts for word in sentence.split()]]
    vocab = Vocab(flat_texts)
    vocab.build_vocab()

    num_direction = 2 if BIDIRECTIONAL else 1
    h0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.1
    c0 = torch.ones(NUM_LAYERS, len(texts), num_direction * HIDDEN_DIM) * 0.2
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder_outputs = (
        torch.ones(len(texts), 5, num_direction * HIDDEN_DIM) * 0.5
        if request.param[2]
        else None
    )
    decoder = Lstm2SeqDecoder(
        embedder=embedder,
        vocab=vocab,
        max_length=MAX_LENGTH,
        attn_module=request.param[2],
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
    )

    return (
        decoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "MAX_LENGTH": MAX_LENGTH,
            "TEACHER_FORCING_RATIO": TEACHER_FORCING_RATIO,
            "LINES": lines,
            "WORDS": words,
            "VOCAB_SIZE": vocab.get_vocab_len(),
            "BIDIRECTIONAL": BIDIRECTIONAL,
        },
        encoder_outputs,
        (h0, c0),
    )
def setup_lstm2vecencoder(request):
    hidden_dimension = 1024
    combine_strategy = request.param[1]
    bidirectional = request.param[0]
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = LSTM2VecEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=hidden_dimension,
        bidirectional=bidirectional,
        combine_strategy=combine_strategy,
        rnn_bias=False,
    )

    texts = ["First sentence", "second sentence"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return (
        encoder,
        {
            "hidden_dim": 2 * hidden_dimension
            if bidirectional and combine_strategy == "concat"
            else hidden_dimension,
            "bidirectional": False,
            "combine_strategy": combine_strategy,
            "lines": lines,
        },
    )
def setup_lines():
    texts = ["first line", "second line"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)
    return lines
def get_lines_labels(self) -> (List[Line], List[SeqLabel]):
    lines: List[Line] = []
    labels: List[SeqLabel] = []

    with open(self.filename, "r", encoding="utf-8") as fp:
        for line in fp:
            line = line.strip()
            if not bool(line):
                continue
            lines_and_labels = line.strip().split(" ")
            words: List[str] = []
            word_labels: List[str] = []
            for word_line_labels in lines_and_labels:
                word, word_label = word_line_labels.split("###")
                word = word.strip()
                word_label = word_label.strip()
                words.append(word)
                word_labels.append(word_label)
            line = Line(text=" ".join(words), tokenizers=self.tokenizers)
            label = SeqLabel(labels={"seq_label": word_labels})
            lines.append(line)
            labels.append(label)

    return lines, labels
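# Illustrative sketch (assumption, not from the source): the file parsed above is expected
# to contain one sentence per line, with space-separated tokens written as "word###label":
example = "Nikhil###B-PER lives###O in###O Bangalore###B-LOC"
words, word_labels = zip(*[tok.split("###") for tok in example.split(" ")])
# words       -> ("Nikhil", "lives", "in", "Bangalore")
# word_labels -> ("B-PER", "O", "O", "B-LOC")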
def setup_elmo_embedder():
    elmo_embedder = ElmoEmbedder()
    texts = ["I like to test elmo", "Elmo context embedder"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)
    return elmo_embedder, lines
def setup_lines():
    texts = ["First sentence", "Second Sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)
    return lines
def lines():
    texts = ["First line", "Second Line which is longer"]
    lines = []
    for text in texts:
        line = Line(
            text=text, tokenizers={"tokens": WordTokenizer(tokenizer="vanilla")}
        )
        lines.append(line)
    return lines
def test_line_word_tokenizers(self):
    text = "This is a single line"
    line = Line(text=text, tokenizers={"tokens": WordTokenizer()})
    tokens = line.tokens
    assert [token.text for token in tokens["tokens"]] == [
        "This",
        "is",
        "a",
        "single",
        "line",
    ]
def setup_bow_encoder(request):
    aggregation_type = request.param
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = BOW_Encoder(embedder=embedder, aggregation_type=aggregation_type)
    texts = ["First sentence", "second sentence"]
    lines = []
    for text in texts:
        line = Line(text=text)
        lines.append(line)
    return encoder, lines
def _form_line_label(self, text: str, labels: List[str]):
    line = Line(text=text, tokenizers=self.tokenizers)
    labels_ = zip(*labels)
    labels_ = zip(self.column_names, labels_)
    labels_ = dict(labels_)
    if self.train_only:
        column_index = 0
        column_name = self.column_names[column_index]
        labels_ = {column_name: labels_[column_name]}
    label = SeqLabel(labels=labels_)
    return line, label
def setup_bow_elmo_encoder(request):
    layer_aggregation = request.param
    strings = [
        "I like to eat carrot",
        "I like to go out on long drives in a car",
    ]
    lines = []
    for string in strings:
        line = Line(text=string)
        lines.append(line)
    bow_elmo_embedder = BowElmoEmbedder(layer_aggregation=layer_aggregation)
    return bow_elmo_embedder, lines
def get_lines_labels(
    self, start_token: str = "<SOS>", end_token: str = "<EOS>"
) -> (List[Line], List[Line]):
    lines: List[Line] = []
    labels: List[Line] = []

    with open(self.filename) as fp:
        for line in fp:
            line, label = line.split("###")
            line = line.strip()
            label = label.strip()
            line_instance = Line(text=line, tokenizers=self.tokenizers)
            label_instance = Line(text=label, tokenizers=self.tokenizers)
            for namespace, tokenizer in self.tokenizers.items():
                line_instance.tokens[namespace].insert(0, Token(start_token))
                line_instance.tokens[namespace].append(Token(end_token))
                label_instance.tokens[namespace].insert(0, Token(start_token))
                label_instance.tokens[namespace].append(Token(end_token))
            lines.append(line_instance)
            labels.append(label_instance)

    return lines, labels
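# Illustrative sketch (assumption, not from the source): the seq2seq file parsed above is
# expected to contain one "source###target" pair per line; <SOS>/<EOS> tokens are then
# prepended and appended to every tokenizer namespace of both sides:
example = "what is the weather today###tell me today's weather"
source, target = [part.strip() for part in example.split("###")]
# After construction, line_instance.tokens["tokens"] starts with Token("<SOS>")
# and ends with Token("<EOS>"), and likewise for label_instance.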
def make_line(self, line: str):
    """Makes a Line object from a string, with the same characteristics as the
    lines used by the datasets.

    Parameters
    ----------
    line : str
        The text to wrap in a Line.

    Returns
    -------
    Line
    """
    line_ = Line(text=line, tokenizers=self.train_dataset.tokenizers)
    return line_
def get_lines_labels(self) -> (List[Line], List[Label]):
    lines: List[Line] = []
    labels: List[Label] = []

    with open(self.filename) as fp:
        for line in fp:
            line, label = line.split("###")
            line = line.strip()
            label = label.strip()
            line_instance = Line(text=line, tokenizers=self.tokenizers)
            label_instance = Label(text=label)
            lines.append(line_instance)
            labels.append(label_instance)

    return lines, labels
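# Illustrative sketch (assumption, not from the source): the classification file parsed
# above is expected to contain one "text###label" pair per line:
example = "the acting was superb###positive"
text, label = [part.strip() for part in example.split("###")]
# text -> "the acting was superb", label -> "positive"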
def test_line_char_tokenizer(self):
    text = "Word"
    line = Line(
        text=text,
        tokenizers={"tokens": WordTokenizer(), "chars": CharacterTokenizer()},
    )
    tokens = line.tokens
    word_tokens = tokens["tokens"]
    char_tokens = tokens["chars"]
    word_tokens = [tok.text for tok in word_tokens]
    char_tokens = [tok.text for tok in char_tokens]
    assert word_tokens == ["Word"]
    assert char_tokens == ["W", "o", "r", "d"]
def setup_char_embedder(request, clf_dataset_manager):
    char_embedding_dim, hidden_dim = request.param
    dataset_manager = clf_dataset_manager
    embedder = CharEmbedder(
        char_embedding_dimension=char_embedding_dim,
        hidden_dimension=hidden_dim,
        datasets_manager=dataset_manager,
    )
    texts = ["This is sentence", "This is another sentence"]
    lines = []
    for text in texts:
        line = Line(
            text=text,
            tokenizers={"tokens": WordTokenizer(), "char_tokens": CharacterTokenizer()},
        )
        lines.append(line)
    return embedder, lines
def setup_lstm2seqencoder(request):
    HIDDEN_DIM = 1024
    BIDIRECTIONAL = request.param[0]
    COMBINE_STRATEGY = request.param[1]
    NUM_LAYERS = request.param[2]
    ADD_PROJECTION_LAYER = request.param[3]
    embedder = WordEmbedder(embedding_type="glove_6B_50")
    encoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=0.0,
        hidden_dim=HIDDEN_DIM,
        bidirectional=BIDIRECTIONAL,
        combine_strategy=COMBINE_STRATEGY,
        rnn_bias=False,
        num_layers=NUM_LAYERS,
        add_projection_layer=ADD_PROJECTION_LAYER,
    )
    lines = []
    texts = ["First sentence", "second sentence"]
    for text in texts:
        line = Line(text=text)
        lines.append(line)

    return (
        encoder,
        {
            "HIDDEN_DIM": HIDDEN_DIM,
            "COMBINE_STRATEGY": COMBINE_STRATEGY,
            "BIDIRECTIONAL": BIDIRECTIONAL,
            "EXPECTED_HIDDEN_DIM": 2 * HIDDEN_DIM
            if COMBINE_STRATEGY == "concat" and BIDIRECTIONAL and not ADD_PROJECTION_LAYER
            else HIDDEN_DIM,
            "NUM_LAYERS": NUM_LAYERS,
            "LINES": lines,
            "TIME_STEPS": 2,
        },
    )
def _form_line_label(self, text: str, labels: List[str]):
    line = Line(text=text, tokenizers=self.tokenizers)
    labels_ = zip(*labels)
    labels_ = zip(self.column_names, labels_)
    labels_ = dict(labels_)

    if self.train_only:
        if self.train_only == "pos":
            column_index = 0
        elif self.train_only == "dep":
            column_index = 1
        elif self.train_only == "ner":
            column_index = 2
        else:
            raise ValueError("train_only parameter can be one of [pos, dep, ner]")
        column_name = self.column_names[column_index]
        labels_ = {column_name: labels_[column_name]}

    label = SeqLabel(labels=labels_)
    return line, label
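# Illustrative sketch (assumption, not from the source): labels arrive as one list of
# column values per token, and zip(*labels) regroups them per column before they are
# keyed by self.column_names (assumed here to be ["pos", "dep", "ner"]):
labels = [["NNP", "nsubj", "B-PER"], ["VBZ", "root", "O"]]
column_names = ["pos", "dep", "ner"]
per_column = dict(zip(column_names, zip(*labels)))
# per_column -> {"pos": ("NNP", "VBZ"), "dep": ("nsubj", "root"), "ner": ("B-PER", "O")}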
def setup_bert_embedder(request):
    dropout_value = 0.0
    bert_type, aggregation_type = request.param
    bert_embedder = BertEmbedder(
        dropout_value=dropout_value,
        aggregation_type=aggregation_type,
        bert_type=bert_type,
    )
    strings = [
        "Lets start by talking politics",
        "there are radical ways to test your code",
    ]
    lines = []
    for string in strings:
        line = Line(text=string)
        lines.append(line)
    return bert_embedder, lines
def test_line_namespaces(self):
    text = "Single line"
    line = Line(text=text, tokenizers={"tokens": WordTokenizer()})
    assert line.namespaces == ["tokens"]
def _generate_lines_with_start_token(self):
    line = Line("")
    line.add_token(self.start_token, "tokens")
    return line
def forward(
    self,
    lines: List[Line],
    c0: torch.FloatTensor,
    h0: torch.FloatTensor,
    encoder_outputs: torch.FloatTensor = None,
    teacher_forcing_ratio: float = 0,
) -> torch.Tensor:
    """
    Parameters
    ----------
    lines : list of Line objects
        Batched tokenized source sentence of shape [batch size].
    h0, c0 : 3d torch.FloatTensor
        Hidden and cell state of the LSTM layer. Each state's shape is
        [n layers * n directions, batch size, hidden dim].
    encoder_outputs : 3d torch.FloatTensor, optional
        Encoder outputs passed to the attention-aware forward step, if any.
    teacher_forcing_ratio : float
        Probability of feeding the gold target tokens, rather than the decoder's
        own predictions, as input for the next step.

    Returns
    -------
    outputs : 3d torch.FloatTensor
        For each time step and each line in the batch, the scores over the target
        vocabulary. Shape [batch size, max length, vocab size].
    """
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        max_length = max(len(line.tokens["tokens"]) for line in lines)
    else:
        max_length = self.max_length
    batch_size = len(lines)

    # tensor to store the decoder's output
    outputs = torch.zeros(max_length, batch_size, self.vocab_size).to(self.device)

    # the last hidden & cell state of the encoder is used as the decoder's initial hidden state
    if use_teacher_forcing:
        prediction, _, _ = self.forward_step(
            lines=lines, h0=h0, c0=c0, encoder_outputs=encoder_outputs
        )
        outputs[1:] = prediction.permute(1, 0, 2)[:-1]
    else:
        lines = [self._generate_lines_with_start_token()] * batch_size
        for i in range(1, max_length):
            prediction, hn, cn = self.forward_step(
                lines=lines, h0=h0, c0=c0, encoder_outputs=encoder_outputs
            )
            prediction = prediction.squeeze(1)
            outputs[i] = prediction
            # greedily decode the next token and feed it back as the next input line
            line_token_indexes = prediction.argmax(1)
            line_tokens = [
                self.vocab.idx2token[line_token_index]
                for line_token_index in line_token_indexes.cpu().numpy()
            ]
            lines = []
            for token in line_tokens:
                line = Line("")
                line.add_token(token, "tokens")
                lines.append(line)
            h0, c0 = hn, cn

    outputs = outputs.permute(1, 0, 2)
    return outputs
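# Minimal usage sketch (assumption, not from the source): with teacher_forcing_ratio=0.0
# the decoder starts from <SOS>-only lines and greedily feeds back its own argmax
# predictions for self.max_length steps, so for a batch of B lines it returns a tensor
# of shape [B, max_length, vocab_size]:
#
#     outputs = decoder(
#         lines=lines, c0=c0, h0=h0,
#         encoder_outputs=encoder_outputs, teacher_forcing_ratio=0.0,
#     )
#     outputs.shape  # -> (len(lines), decoder.max_length, decoder.vocab_size)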