def endElement(self, name):
    """Handle a closing XML tag and flush the text accumulated inside it.

    On ``</p>`` any pending text is tokenized and labelled ``'O'``, then the
    current sentence (``self.tokens`` / ``self.labels``) is appended to
    ``self.sents`` / ``self.allLabels`` and both buffers are reset.

    On ``</rs>`` the accumulated text is tokenized and labelled with
    ``self.currentLabel``: ``B-<label>`` for the first token and
    ``I-<label>`` for the following ones.  When no label is known
    (``None`` or ``'O'``) the tokens are labelled plain ``'O'``.

    In every case ``self.accumulated`` is cleared before returning.

    Args:
        name: name of the element being closed.
    """
    if name == "p":
        # end of sentence: text left over after the last entity is unlabelled
        if self.accumulated != '':
            for token in tokenizeAndFilterSimple(self.accumulated):
                self.tokens.append(token)
                self.labels.append('O')

        self.sents.append(self.tokens)
        self.allLabels.append(self.labels)
        self.tokens = []
        self.labels = []

    if name == "rs":
        # end of entity
        entity_label = 'O' if self.currentLabel is None else self.currentLabel
        for position, token in enumerate(
                tokenizeAndFilterSimple(self.accumulated)):
            self.tokens.append(token)
            if entity_label == 'O':
                # an entity without a usable type is "outside": prefixing it
                # would produce the invalid tags 'B-O' / 'I-O'
                self.labels.append('O')
            elif position == 0:
                self.labels.append('B-' + entity_label)
            else:
                self.labels.append('I-' + entity_label)
        self.currentLabel = None

    self.accumulated = ''
def test_tokenizer_filter_simple(self):
    """Check the tokenizer output for a short sentence with punctuation."""
    text = 'this is a test, but a stupid test!!'
    expected = [
        'this', 'is', 'a', 'test', ',', 'but', 'a', 'stupid', 'test',
        '!', '!',
    ]

    tokens = tokenizeAndFilterSimple(text)

    assert len(tokens) == 11
    assert tokens == expected
def startElement(self, name, attrs):
    """Handle an opening XML tag of a TEI document annotated with ``rs``.

    Text accumulated before this tag is tokenized and appended to the
    current sentence with the label ``'O'``.  Then, depending on ``name``:

    * ``TEI`` / ``tei``: all document-level buffers are reset;
    * ``p``: the sentence buffers are reset and the label becomes ``'O'``;
    * ``rs``: the ``type`` attribute, if present, becomes the current label
      wrapped in angle brackets.  Types other than ``insult`` and
      ``threat`` are reported on stdout but still used.

    ``self.accumulated`` is cleared before returning.

    Args:
        name: name of the element being opened.
        attrs: attributes of the element; must provide ``getLength()``,
            ``getValue()`` and support the ``in`` operator.
    """
    if self.accumulated != '':
        for token in tokenizeAndFilterSimple(self.accumulated):
            self.tokens.append(token)
            self.labels.append('O')

    if name in ('TEI', 'tei'):
        # beginning of a document
        self.tokens = []
        self.labels = []
        self.sents = []
        self.allLabels = []

    if name == "p":
        # beginning of sentence
        self.tokens = []
        self.labels = []
        self.currentLabel = 'O'

    if name == "rs":
        # beginning of entity
        if attrs.getLength() != 0:
            if "type" in attrs:
                entity_type = attrs.getValue("type")
                if entity_type not in ('insult', 'threat'):
                    print("Invalid entity type:", entity_type)
                self.currentLabel = '<' + entity_type + '>'
            else:
                # other attributes but no type: getValue("type") would raise
                # KeyError, so report it and keep the current label
                print("rs element without type attribute!")

    self.accumulated = ''
def test_tokenizer_filter_simple_with_breaklines(self):
    """Check the tokenizer output for text with line breaks and symbols."""
    text = '\nthis is yet \u2666 another, dummy... test,\na [stupid] test?!'
    expected = [
        'this', 'is', 'yet', '\u2666', 'another', ',', 'dummy',
        '.', '.', '.', 'test', ',', 'a', '[', 'stupid', ']', 'test',
        '?', '!',
    ]

    tokens = tokenizeAndFilterSimple(text)

    assert len(tokens) == 19
    assert tokens == expected
def get_tokens_from_text_features(
        token_features: List[str],
        text_feature_indices: List[int]) -> List[str]:
    """Tokenize the text made of the selected feature values.

    The values of ``token_features`` at ``text_feature_indices`` are joined
    with single spaces; an index that is not below ``len(token_features)``
    contributes an empty string.  Every ``NBSP`` in the joined text is
    replaced by a plain space before tokenization.

    Args:
        token_features: feature values of one token.
        text_feature_indices: positions of the values to use as text.

    Returns:
        The tokens produced by ``tokenizeAndFilterSimple``.
    """
    feature_count = len(token_features)
    selected_values = [
        token_features[feature_index] if feature_index < feature_count else ''
        for feature_index in text_feature_indices
    ]
    joined_text = ' '.join(selected_values)
    return tokenizeAndFilterSimple(joined_text.replace(NBSP, ' '))
def to_vector_single(text, embeddings, maxlen=300):
    """Convert a string into a fixed-size matrix of word embedding vectors.

    The text is cleaned with ``clean_text``, tokenized, and only the last
    ``maxlen`` tokens are kept.  Row ``i`` of the result holds the vector of
    the i-th kept token; rows without a token stay at zero.

    Args:
        text: the string to vectorize.
        embeddings: object providing ``embed_size`` and
            ``get_word_vector(word)``.
        maxlen: number of rows of the returned matrix.

    Returns:
        An array of shape ``(maxlen, embeddings.embed_size)``.
    """
    # NOTE(review): with maxlen == 0 the slice below keeps every token
    kept_words = tokenizeAndFilterSimple(clean_text(text))[-maxlen:]

    # TBD: use better initializers (uniform, etc.)
    matrix = np.zeros((maxlen, embeddings.embed_size))

    # TBD: padding should be left and which vector do we use for padding?
    # and what about masking padding later for RNN?
    for row, word in enumerate(kept_words):
        matrix[row, :] = embeddings.get_word_vector(word).astype('float32')

    return matrix
def __data_generation(self, index):
    """Generate the input and target arrays of batch number ``index``.

    With ELMo or BERT embeddings the texts of the batch are tokenized and
    passed as a whole to the matching ``to_vector_simple_with_*`` helper
    (if both flags are set, the BERT result is the one returned).
    Otherwise each text is vectorized individually by ``to_vector_single``;
    no tokenization is done here in that case because the tokens would not
    be used.

    Args:
        index: position of the batch within ``self.x``.

    Returns:
        A ``(batch_x, batch_y)`` tuple; ``batch_y`` is ``None`` when
        ``self.y`` is ``None``.
    """
    start = index * self.batch_size
    max_iter = min(self.batch_size, len(self.x) - start)

    # restrict data to index window
    sub_x = self.x[start:start + max_iter]

    use_elmo = self.embeddings.use_ELMo
    use_bert = self.embeddings.use_BERT

    if use_elmo or use_bert:
        x_tokenized = [tokenizeAndFilterSimple(text) for text in sub_x]
        if use_elmo:
            batch_x = to_vector_simple_with_elmo(
                x_tokenized, self.embeddings, self.maxlen)
        if use_bert:
            batch_x = to_vector_simple_with_bert(
                x_tokenized, self.embeddings, self.maxlen)
    else:
        batch_x = np.zeros(
            (max_iter, self.maxlen, self.embeddings.embed_size),
            dtype='float32')
        for row, text in enumerate(sub_x):
            batch_x[row] = to_vector_single(
                text, self.embeddings, self.maxlen)

    batch_y = None
    if self.y is not None:
        # classes are numerical, so nothing to vectorize for y
        batch_y = np.zeros(
            (max_iter, len(self.list_classes)), dtype='float32')
        for row in range(max_iter):
            batch_y[row] = self.y[start + row]

    return batch_x, batch_y
def startElement(self, name, attrs):
    """Handle an opening XML tag of an ENAMEX-annotated corpus.

    Text accumulated before this tag is tokenized and appended to the
    current sentence with the label ``'O'``.  Then, depending on ``name``:

    * ``corpus`` / ``DOC``: all document-level buffers are reset;
    * ``sentence``: the sentence buffers are reset and the label becomes
      ``'O'``;
    * ``ENAMEX`` / ``EX_ENAMEX``: the entity type is read from the ``type``
      attribute, or from ``TYPE`` which takes precedence when both exist.
      For the ``lemonde`` corpus the type and the optional ``sub_type`` go
      through ``self.translate_fr_labels``.  The result, wrapped in angle
      brackets, becomes the current label.  When the element has
      attributes but no type, a warning is printed and the current label
      is left unchanged.

    ``self.accumulated`` is cleared before returning.

    Args:
        name: name of the element being opened.
        attrs: attributes of the element; must provide ``getLength()``,
            ``getValue()`` and support the ``in`` operator.
    """
    if self.accumulated != '':
        for token in tokenizeAndFilterSimple(self.accumulated):
            self.tokens.append(token)
            self.labels.append('O')

    if name in ('corpus', 'DOC'):
        # beginning of a document
        self.tokens = []
        self.labels = []
        self.sents = []
        self.allLabels = []

    if name == "sentence":
        # beginning of sentence
        self.tokens = []
        self.labels = []
        self.currentLabel = 'O'

    if name in ("ENAMEX", "EX_ENAMEX"):
        # beginning of entity
        if attrs.getLength() != 0:
            main_type = None
            if "type" in attrs:
                main_type = attrs.getValue("type")
            if "TYPE" in attrs:
                main_type = attrs.getValue("TYPE")

            if main_type is None:
                # no label can be built from a missing type: concatenating
                # None with the brackets would raise TypeError
                print('ENAMEX element without type attribute!')
            else:
                if "sub_type" in attrs:
                    sub_type = attrs.getValue("sub_type")
                else:
                    sub_type = ''

                if self.corpus_type == 'lemonde':
                    self.currentLabel = (
                        '<'
                        + self.translate_fr_labels(main_type, sub_type)
                        + '>'
                    )
                else:
                    self.currentLabel = '<' + main_type + '>'

    self.accumulated = ''
def create_inputs(self, x_s, dummy_label='O'):
    """Build one ``InputExample`` per input to be labelled (for prediction).

    Args:
        x_s: the inputs; an item that is a ``list`` is used as its own
            token sequence, any other item is tokenized first.
        dummy_label: label given to every token, so that the examples carry
            labels and the BERT base code is not broken.

    Returns:
        The list of examples; each ``guid`` is the position of the input
        in ``x_s``.
    """
    examples = []
    for guid, sample in enumerate(x_s):
        if isinstance(sample, list):
            # already segmented
            words = sample
        else:
            words = tokenizeAndFilterSimple(sample)

        tokens, labels = [], []
        for word in words:
            tokens.append(tokenization.convert_to_unicode(word))
            labels.append(tokenization.convert_to_unicode(dummy_label))

        examples.append(
            InputExample(guid=guid, tokens=tokens, labels=labels))
    return examples
def get_window_batch_data(  # pylint: disable=too-many-statements
        self,
        window_indices_and_offsets: List[Tuple[int, int]]):
    """Build the model inputs, and the labels if any, for one batch window.

    Args:
        window_indices_and_offsets: forwarded to ``take_with_offset`` to
            select the rows of ``self.x``, ``self.features`` and ``self.y``
            (presumably ``(row index, token offset)`` pairs; confirm
            against ``take_with_offset``).

    Returns:
        An ``(inputs, batch_y)`` tuple.  ``inputs`` is a list holding, in
        this order: ``batch_x``, ``batch_c``, ``batch_a`` (only when the
        preprocessor returns casing), the padded batch features (only when
        the preprocessor returns features) and ``batch_l``.  ``batch_y`` is
        ``None`` when ``self.y`` is ``None``, otherwise the labels returned
        by ``self.preprocessor.transform``.
    """
    # restrict data to index window
    # Note: can't apply max_sequence_length here because we may tokenize
    sub_x = take_with_offset(self.x, window_indices_and_offsets)

    # tokenize texts in self.x if not already done
    if self.tokenize:
        x_tokenized = [
            tokenizeAndFilterSimple(text)
            for text in sub_x
        ]
    else:
        x_tokenized = sub_x

    # NOTE(review): max() raises ValueError when the window is empty
    max_length_x = max((len(tokens) for tokens in x_tokenized))

    if self.max_sequence_length and max_length_x > self.max_sequence_length:
        max_length_x = self.max_sequence_length
        # truncation of sequence at max_sequence_length
        x_tokenized = truncate_batch_values(x_tokenized, self.max_sequence_length)

    # prevent sequence of length 1 alone in a batch (this causes an error in tf)
    extend = False
    if max_length_x == 1:
        max_length_x += 1
        extend = True

    batch_y = None

    # features are only fetched when something below consumes them
    sub_f = None
    if (
        self.preprocessor.return_features
        or self.additional_token_feature_indices
        or self.text_feature_indices
    ):
        assert self.features is not None
        sub_f = take_with_offset(
            self.features,
            window_indices_and_offsets,
            max_sequence_length=max_length_x
        )

    batch_text_list = list(iter_batch_text_list(
        x_tokenized,
        batch_features=sub_f,
        additional_token_feature_indices=self.additional_token_feature_indices,
        text_feature_indices=self.text_feature_indices
    ))
    LOGGER.debug('batch_text_list: %s', batch_text_list)

    padded_batch_text_list = self.to_padded_batch_text_list(
        batch_text_list
    )
    LOGGER.debug('padded_batch_text_list: %s', padded_batch_text_list)

    # the embedding vectors are built from the unpadded text list,
    # the preprocessor below receives the padded one
    batch_x = self.to_concatenated_batch_vector_from_batch_text_list(
        batch_text_list,
        max_length_x,
        text_is_token=(
            not self.additional_token_feature_indices
            and not self.text_feature_indices
        )
    )

    # batch_a only exists when casing is requested; it is read further down
    # under the same condition
    if self.preprocessor.return_casing:
        batch_a = to_batch_casing(x_tokenized, max_length_x)

    batch_y = None
    # store tag embeddings
    if self.y is not None:
        batch_y = take_with_offset(self.y, window_indices_and_offsets)
        max_length_y = max((len(y_row) for y_row in batch_y))
        if self.max_sequence_length and max_length_y > self.max_sequence_length:
            max_length_y = self.max_sequence_length
            # truncation of sequence at max_sequence_length
            batch_y = truncate_batch_values(batch_y, self.max_sequence_length)

        batches, batch_y = self.preprocessor.transform(
            padded_batch_text_list,
            batch_y,
            extend=extend
        )
    else:
        batches = self.preprocessor.transform(
            padded_batch_text_list,
            extend=extend
        )

    batch_c = np.asarray(batches[0])
    batch_l = batches[1]

    inputs = []
    inputs.append(batch_x)
    inputs.append(batch_c)
    if self.preprocessor.return_casing:
        inputs.append(batch_a)
    if self.preprocessor.return_features:
        LOGGER.debug('extend: %s', extend)
        try:
            batch_features = self.preprocessor.transform_features(sub_f, extend=extend)
            batch_features = left_pad_batch_values(batch_features, max_length_x)
        except TypeError:
            # fallback that calls transform_features without ``extend``
            # NOTE(review): this also catches a TypeError raised inside
            # transform_features or left_pad_batch_values themselves
            batch_features = left_pad_batch_values(
                self.preprocessor.transform_features(sub_f),
                max_length_x
            )
        LOGGER.debug('batch_features.shape: %s', batch_features.shape)
        inputs.append(batch_features)
    inputs.append(batch_l)

    # guarded so the shapes are only computed when debug logging is enabled
    if LOGGER.isEnabledFor(logging.DEBUG):
        LOGGER.debug('inputs shapes: %s', [
            np.asarray(x).shape for x in inputs
        ])

    return inputs, batch_y
def __data_generation(self, index):
    """Generate the arrays of batch number ``index`` for sequence labelling.

    The texts of the batch are tokenized when ``self.tokenize`` is set,
    otherwise they are used as token sequences.  Word embeddings come from
    the ELMo helper, else the BERT helper, else from ``to_vector_single``
    applied to each sequence.  Character and length inputs (and the
    transformed labels, if any) come from ``self.preprocessor.transform``.

    Args:
        index: position of the batch within ``self.x``.

    Returns:
        ``(batch_x, batch_c, batch_a, batch_l, batch_y)`` when the
        preprocessor returns casing, otherwise
        ``(batch_x, batch_c, batch_l, batch_y)``.  ``batch_y`` is ``None``
        when ``self.y`` is ``None``.
    """
    start = index * self.batch_size
    max_iter = min(self.batch_size, len(self.x) - start)
    end = start + max_iter

    # restrict data to index window
    sub_x = self.x[start:end]

    # tokenize texts in self.x if not already done
    if self.tokenize:
        x_tokenized = [tokenizeAndFilterSimple(text) for text in sub_x]
    else:
        x_tokenized = sub_x
    max_length_x = max((len(tokens) for tokens in x_tokenized), default=0)

    # prevent sequence of length 1 alone in a batch (this causes an error in tf)
    extend = False
    if max_length_x == 1:
        max_length_x += 1
        extend = True

    # word embeddings: the zero buffer is only needed on the static path,
    # the contextual helpers return the whole batch themselves
    if self.embeddings.use_ELMo:
        batch_x = to_vector_simple_with_elmo(
            x_tokenized, self.embeddings, max_length_x)
    elif self.embeddings.use_BERT:
        batch_x = to_vector_simple_with_bert(
            x_tokenized, self.embeddings, max_length_x)
    else:
        batch_x = np.zeros(
            (max_iter, max_length_x, self.embeddings.embed_size),
            dtype='float32')
        for row, tokens in enumerate(x_tokenized):
            batch_x[row] = to_vector_single(
                tokens, self.embeddings, max_length_x)

    return_casing = self.preprocessor.return_casing
    if return_casing:
        batch_a = np.zeros((max_iter, max_length_x), dtype='float32')
        for row, tokens in enumerate(x_tokenized):
            batch_a[row] = to_casing_single(tokens, max_length_x)

    if self.y is not None:
        # note: tags are always already "tokenized"
        batch_y = self.y[start:end]
        batches, batch_y = self.preprocessor.transform(
            x_tokenized, batch_y, extend=extend)
    else:
        batch_y = None
        batches = self.preprocessor.transform(x_tokenized, extend=extend)

    batch_c = np.asarray(batches[0])
    batch_l = batches[1]

    if return_casing:
        return batch_x, batch_c, batch_a, batch_l, batch_y
    return batch_x, batch_c, batch_l, batch_y
def tokenize(self, text: str) -> List[str]:
    """Tokenize ``text`` after replacing every ``NBSP`` by a plain space."""
    normalized_text = text.replace(NBSP, ' ')
    return tokenizeAndFilterSimple(normalized_text)
def __data_generation(self, index):
    """Generate the arrays of batch number ``index``, features included.

    Args:
        index: position of the batch; the batch size is bounded by the
            length of ``self.original_x`` while the rows are read from
            ``self.x``.

    Returns:
        ``(batch_x, batch_c, batch_f, batch_a, batch_l, batch_y)``.
        ``batch_a`` stays at zero unless the preprocessor returns casing,
        ``batch_f`` stays at zero unless the preprocessor returns
        features, and ``batch_y`` is ``None`` when ``self.y`` is ``None``.
    """
    first = index * self.batch_size
    max_iter = min(self.batch_size, len(self.original_x) - first)
    last = first + max_iter

    # restrict data to index window
    sub_x = self.x[first:last]

    # tokenize texts in self.x if not already done
    # From: https://github.com/elifesciences/sciencebeam-trainer-delft/blob/c31f97433243a2b0a66671c0dd3e652dcd306362/sciencebeam_trainer_delft/sequence_labelling/data_generator.py#L102-L118
    if self.tokenize:
        x_tokenized = [tokenizeAndFilterSimple(text) for text in sub_x]
    else:
        x_tokenized = sub_x

    longest_sequence = max(len(tokens) for tokens in x_tokenized)
    max_length_x = longest_sequence
    max_length_f = longest_sequence

    if self.max_sequence_length and max_length_x > self.max_sequence_length:
        max_length_x = self.max_sequence_length
        # truncation of sequence at max_sequence_length
        x_tokenized = np.asarray(
            truncate_batch_values(x_tokenized, self.max_sequence_length))

    # prevent sequence of length 1 alone in a batch (this causes an error in tf)
    extend = False
    if max_length_x == 1:
        max_length_x += 1
        extend = True

    # generate data
    batch_a = np.zeros((max_iter, max_length_x), dtype='float32')

    embeddings = self.embeddings
    if embeddings.use_ELMo:
        batch_x = to_vector_simple_with_elmo(
            x_tokenized, embeddings, max_length_x, extend=extend)
    elif embeddings.use_BERT:
        batch_x = to_vector_simple_with_bert(
            x_tokenized, embeddings, max_length_x, extend=extend)
    else:
        batch_x = np.zeros(
            (max_iter, max_length_x, embeddings.embed_size),
            dtype='float32')
        # store sample embeddings
        for row in range(max_iter):
            batch_x[row] = to_vector_single(
                x_tokenized[row], embeddings, max_length_x)

    if self.preprocessor.return_casing:
        for row in range(max_iter):
            batch_a[row] = to_casing_single(x_tokenized[row], max_length_x)

    # store tag embeddings
    batch_y = None
    if self.y is not None:
        # note: tags are always already "tokenized"
        batch_y = self.y[first:last]
        max_length_y = max(len(y_row) for y_row in batch_y)

        # From: https://github.com/elifesciences/sciencebeam-trainer-delft/blob/c31f97433243a2b0a66671c0dd3e652dcd306362/sciencebeam_trainer_delft/sequence_labelling/data_generator.py#L152
        if self.max_sequence_length and max_length_y > self.max_sequence_length:
            # truncation of sequence at max_sequence_length
            batch_y = np.asarray(
                truncate_batch_values(batch_y, self.max_sequence_length))

    # zero placeholder with the first two dimensions of batch_x, replaced
    # below when the preprocessor returns features
    batch_f = np.zeros(batch_x.shape[0:2], dtype='int32')
    if self.preprocessor.return_features:
        sub_f = self.features[first:last]
        if self.max_sequence_length and max_length_f > self.max_sequence_length:
            max_length_f = self.max_sequence_length
            # truncation of sequence at max_sequence_length
            sub_f = truncate_batch_values(sub_f, self.max_sequence_length)
        batch_f = self.preprocessor.transform_features(sub_f, extend=extend)

    if self.y is None:
        batches = self.preprocessor.transform(x_tokenized, extend=extend)
    else:
        batches, batch_y = self.preprocessor.transform(
            x_tokenized, batch_y, extend=extend)

    batch_c = np.asarray(batches[0])
    batch_l = batches[1]

    return batch_x, batch_c, batch_f, batch_a, batch_l, batch_y