Example #1
    def endElement(self, name):
        # print("endElement '" + name + "'")
        if name == "p":
            # end of sentence
            if self.accumulated != '':
                localTokens = tokenizeAndFilterSimple(self.accumulated)
                for token in localTokens:
                    self.tokens.append(token)
                    self.labels.append('O')

            self.sents.append(self.tokens)
            self.allLabels.append(self.labels)
            self.tokens = []
            self.labels = []
        if name == "rs":
            # end of entity
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            begin = True
            if self.currentLabel is None:
                self.currentLabel = 'O'
            for token in localTokens:
                self.tokens.append(token)
                if begin:
                    self.labels.append('B-' + self.currentLabel)
                    begin = False
                else:
                    self.labels.append('I-' + self.currentLabel)
            self.currentLabel = None
        self.accumulated = ''
Example #2
    def test_tokenizer_filter_simple(self):
        input = 'this is a test, but a stupid test!!'

        output = tokenizeAndFilterSimple(input)

        assert len(output) == 11
        assert output == ['this', 'is', 'a', 'test', ',', 'but', 'a', 'stupid', 'test', '!', '!']
Example #3
    def startElement(self, name, attrs):
        if self.accumulated != '':
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            for token in localTokens:
                self.tokens.append(token)
                self.labels.append('O')
        if name == 'TEI' or name == 'tei':
            # beginning of a document
            self.tokens = []
            self.labels = []
            self.sents = []
            self.allLabels = []
        if name == "p":
            # beginning of sentence
            self.tokens = []
            self.labels = []
            self.currentLabel = 'O'
        if name == "rs":
            # beginning of entity
            if attrs.getLength() != 0:
                if attrs.getValue("type") != 'insult' and attrs.getValue("type") != 'threat':
                    print("Invalid entity type:", attrs.getValue("type"))
                self.currentLabel = '<' + attrs.getValue("type") + '>'
        self.accumulated = ''
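This startElement and the endElement in Example #1 together build BIO-style labels: the first token inside an <rs> span gets a 'B-' prefix and the remaining tokens get 'I-'. Below is a minimal sketch of that labelling step in isolation, assuming tokenizeAndFilterSimple is importable from delft.utilities.Tokenizer as in the DeLFT project layout; the entity text and label are made up.

from delft.utilities.Tokenizer import tokenizeAndFilterSimple

# hypothetical text accumulated inside an <rs type="insult"> element
accumulated = 'stupid idiot'
currentLabel = '<insult>'

tokens, labels = [], []
begin = True
for token in tokenizeAndFilterSimple(accumulated):
    tokens.append(token)
    labels.append(('B-' if begin else 'I-') + currentLabel)
    begin = False

# tokens == ['stupid', 'idiot']
# labels == ['B-<insult>', 'I-<insult>']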
Example #4
    def test_tokenizer_filter_simple_with_breaklines(self):
        input = '\nthis is yet \u2666 another, dummy... test,\na [stupid] test?!'

        output = tokenizeAndFilterSimple(input)

        assert len(output) == 19
        assert output == ['this', 'is', 'yet', '\u2666', 'another', ',', 'dummy', '.', '.', '.', 'test', ',', 'a',
                          '[', 'stupid', ']', 'test', '?', '!']
Example #5
def get_tokens_from_text_features(
        token_features: List[str],
        text_feature_indices: List[int]) -> List[str]:
    return tokenizeAndFilterSimple(' '.join([
        (
            token_features[text_feature_index]
            if text_feature_index < len(token_features)
            else ''
        )
        for text_feature_index in text_feature_indices
    ]).replace(NBSP, ' '))
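A minimal usage sketch for get_tokens_from_text_features; the feature values are made up, and NBSP is assumed to be the non-breaking space character ('\u00a0') used by the surrounding module.

token_features = ['Smith', 'ALLCAPS', 'John\u00a0Smith Jr.']

# text feature index 2 selects the multi-token string value;
# the out-of-range index 5 contributes an empty string
tokens = get_tokens_from_text_features(token_features, [2, 5])
# tokens == ['John', 'Smith', 'Jr', '.']
# (punctuation is split off, per the tokenizer tests shown above)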
Example #6
def to_vector_single(text, embeddings, maxlen=300):
    """
    Given a string, tokenize it, then convert it to a sequence of word embedding
    vectors with the provided embeddings, introducing <PAD> and <UNK> padding
    token vectors when appropriate.
    """
    tokens = tokenizeAndFilterSimple(clean_text(text))
    window = tokens[-maxlen:]

    # TBD: use better initializers (uniform, etc.)
    x = np.zeros((maxlen, embeddings.embed_size), )

    # TBD: padding should be left and which vector do we use for padding?
    # and what about masking padding later for RNN?
    for i, word in enumerate(window):
        x[i, :] = embeddings.get_word_vector(word).astype('float32')

    return x
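A short sketch of how to_vector_single can be exercised; the embeddings object below is only a stand-in for DeLFT's real Embeddings class (which looks up pretrained vectors), so the vector values are illustrative.

import numpy as np

class DummyEmbeddings:
    embed_size = 4

    def get_word_vector(self, word):
        # a real implementation would return the pretrained vector (or an <UNK> vector)
        return np.ones(self.embed_size, dtype='float32')

x = to_vector_single('This is a test.', DummyEmbeddings(), maxlen=10)
# x.shape == (10, 4): the first 5 rows hold the token vectors
# ('This', 'is', 'a', 'test', '.'), the remaining rows stay zero (the padding)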
Example #7
    def __data_generation(self, index):
        'Generates data containing batch_size samples'
        max_iter = min(self.batch_size, len(self.x) - self.batch_size * index)

        # restrict data to index window
        sub_x = self.x[(index * self.batch_size):(index * self.batch_size) +
                       max_iter]

        batch_x = np.zeros((max_iter, self.maxlen, self.embeddings.embed_size),
                           dtype='float32')
        batch_y = None
        if self.y is not None:
            batch_y = np.zeros((max_iter, len(self.list_classes)),
                               dtype='float32')

        x_tokenized = []
        for i in range(0, max_iter):
            tokens = tokenizeAndFilterSimple(sub_x[i])
            x_tokenized.append(tokens)

        if self.embeddings.use_ELMo:
            #batch_x = to_vector_elmo(x_tokenized, self.embeddings, max_length_x)
            batch_x = to_vector_simple_with_elmo(x_tokenized, self.embeddings,
                                                 self.maxlen)

        if self.embeddings.use_BERT:
            batch_x = to_vector_simple_with_bert(x_tokenized, self.embeddings,
                                                 self.maxlen)

        # Generate data
        for i in range(0, max_iter):
            # Store sample
            if not self.embeddings.use_ELMo and not self.embeddings.use_BERT:
                batch_x[i] = to_vector_single(
                    self.x[(index * self.batch_size) + i], self.embeddings,
                    self.maxlen)

            # Store class
            # classes are numerical, so nothing to vectorize for y
            if self.y is not None:
                batch_y[i] = self.y[(index * self.batch_size) + i]

        return batch_x, batch_y
Example #8
    def startElement(self, name, attrs):
        if self.accumulated != '':
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            for token in localTokens:
                self.tokens.append(token)
                self.labels.append('O')
        if name == 'corpus' or name == 'DOC':
            # beginning of a document
            self.tokens = []
            self.labels = []
            self.sents = []
            self.allLabels = []
        if name == "sentence":
            # beginning of sentence
            self.tokens = []
            self.labels = []
            self.currentLabel = 'O'
        if name == "ENAMEX" or name == "EX_ENAMEX":
            # beginning of entity
            if attrs.getLength() != 0:
                #if attrs.getValue("type") != 'insult' and attrs.getValue("type") != 'threat':
                #    print("Invalid entity type:", attrs.getValue("type"))
                attribute_names = attrs.getNames()
                mainType = None
                if "type" in attrs:
                    mainType = attrs.getValue("type")
                if "TYPE" in attrs:
                    mainType = attrs.getValue("TYPE")
                if mainType is None:
                    print('ENAMEX element without type attribute!')

                if "sub_type" in attrs:
                    subType = attrs.getValue("sub_type")
                else:
                    subType = ''
                if self.corpus_type == 'lemonde':
                    self.currentLabel = '<' + self.translate_fr_labels(
                        mainType, subType) + '>'
                else:
                    self.currentLabel = '<' + mainType + '>'
        self.accumulated = ''
Example #9
    def create_inputs(self, x_s, dummy_label='O'):
        """
        Gets a collection of `InputExample` for input to be labelled (for prediction)
        """
        examples = []
        # dummy label to avoid breaking the BERT base code
        for (i, x) in enumerate(x_s):
            guid = i
            tokens = []
            labels = []
            # if x is not already segmented:
            if isinstance(x, list):
                simple_tokens = x
            else:
                simple_tokens = tokenizeAndFilterSimple(x)
            for j in range(len(simple_tokens)):
                tokens.append(tokenization.convert_to_unicode(simple_tokens[j]))
                labels.append(tokenization.convert_to_unicode(dummy_label))
            examples.append(InputExample(guid=guid, tokens=tokens, labels=labels))
        return examples
Example #10
    def get_window_batch_data(  # pylint: disable=too-many-statements
            self,
            window_indices_and_offsets: List[Tuple[int, int]]):
        'Generates data containing batch_size samples'

        # restrict data to index window
        # Note: can't apply max_sequence_length here because we may tokenize
        sub_x = take_with_offset(self.x, window_indices_and_offsets)

        # tokenize texts in self.x if not already done
        if self.tokenize:
            x_tokenized = [
                tokenizeAndFilterSimple(text)
                for text in sub_x
            ]
        else:
            x_tokenized = sub_x

        max_length_x = max((len(tokens) for tokens in x_tokenized))

        if self.max_sequence_length and max_length_x > self.max_sequence_length:
            max_length_x = self.max_sequence_length
            # truncation of sequence at max_sequence_length
            x_tokenized = truncate_batch_values(x_tokenized, self.max_sequence_length)

        # prevent sequence of length 1 alone in a batch (this causes an error in tf)
        extend = False
        if max_length_x == 1:
            max_length_x += 1
            extend = True

        batch_y = None

        sub_f = None
        if (
                self.preprocessor.return_features
                or self.additional_token_feature_indices
                or self.text_feature_indices
        ):
            assert self.features is not None
            sub_f = take_with_offset(
                self.features,
                window_indices_and_offsets,
                max_sequence_length=max_length_x
            )

        batch_text_list = list(iter_batch_text_list(
            x_tokenized,
            batch_features=sub_f,
            additional_token_feature_indices=self.additional_token_feature_indices,
            text_feature_indices=self.text_feature_indices
        ))
        LOGGER.debug('batch_text_list: %s', batch_text_list)

        padded_batch_text_list = self.to_padded_batch_text_list(
            batch_text_list
        )
        LOGGER.debug('padded_batch_text_list: %s', padded_batch_text_list)

        batch_x = self.to_concatenated_batch_vector_from_batch_text_list(
            batch_text_list,
            max_length_x,
            text_is_token=(
                not self.additional_token_feature_indices
                and not self.text_feature_indices
            )
        )

        if self.preprocessor.return_casing:
            batch_a = to_batch_casing(x_tokenized, max_length_x)

        batch_y = None
        # store tag embeddings
        if self.y is not None:
            batch_y = take_with_offset(self.y, window_indices_and_offsets)
            max_length_y = max((len(y_row) for y_row in batch_y))
            if self.max_sequence_length and max_length_y > self.max_sequence_length:
                max_length_y = self.max_sequence_length
                # truncation of sequence at max_sequence_length
                batch_y = truncate_batch_values(batch_y, self.max_sequence_length)

            batches, batch_y = self.preprocessor.transform(
                padded_batch_text_list, batch_y, extend=extend
            )
        else:
            batches = self.preprocessor.transform(
                padded_batch_text_list, extend=extend
            )

        batch_c = np.asarray(batches[0])

        batch_l = batches[1]

        inputs = []
        inputs.append(batch_x)
        inputs.append(batch_c)
        if self.preprocessor.return_casing:
            inputs.append(batch_a)
        if self.preprocessor.return_features:
            LOGGER.debug('extend: %s', extend)
            try:
                batch_features = self.preprocessor.transform_features(sub_f, extend=extend)
                batch_features = left_pad_batch_values(batch_features, max_length_x)
            except TypeError:
                batch_features = left_pad_batch_values(
                    self.preprocessor.transform_features(sub_f),
                    max_length_x
                )
            LOGGER.debug('batch_features.shape: %s', batch_features.shape)
            inputs.append(batch_features)
        inputs.append(batch_l)

        if LOGGER.isEnabledFor(logging.DEBUG):
            LOGGER.debug('inputs shapes: %s', [
                np.asarray(x).shape for x in inputs
            ])

        return inputs, batch_y
Example #11
    def __data_generation(self, index):
        'Generates data containing batch_size samples'
        max_iter = min(self.batch_size, len(self.x) - self.batch_size * index)

        # restrict data to index window
        sub_x = self.x[(index * self.batch_size):(index * self.batch_size) +
                       max_iter]

        # tokenize texts in self.x if not already done
        max_length_x = 0
        if self.tokenize:
            x_tokenized = []
            for i in range(0, max_iter):
                tokens = tokenizeAndFilterSimple(sub_x[i])
                if len(tokens) > max_length_x:
                    max_length_x = len(tokens)
                x_tokenized.append(tokens)
        else:
            for tokens in sub_x:
                if len(tokens) > max_length_x:
                    max_length_x = len(tokens)
            x_tokenized = sub_x

        # prevent sequence of length 1 alone in a batch (this causes an error in tf)
        extend = False
        if max_length_x == 1:
            max_length_x += 1
            extend = True

        batch_x = np.zeros(
            (max_iter, max_length_x, self.embeddings.embed_size),
            dtype='float32')
        if self.preprocessor.return_casing:
            batch_a = np.zeros((max_iter, max_length_x), dtype='float32')

        batch_y = None
        max_length_y = max_length_x
        if self.y is not None:
            # note: tags are always already "tokenized",
            batch_y = np.zeros((max_iter, max_length_y), dtype='float32')

        if self.embeddings.use_ELMo:
            #batch_x = to_vector_elmo(x_tokenized, self.embeddings, max_length_x)
            batch_x = to_vector_simple_with_elmo(x_tokenized, self.embeddings,
                                                 max_length_x)
        elif self.embeddings.use_BERT:
            #batch_x = to_vector_bert(x_tokenized, self.embeddings, max_length_x)
            batch_x = to_vector_simple_with_bert(x_tokenized, self.embeddings,
                                                 max_length_x)

        # generate data
        for i in range(0, max_iter):
            # store sample embeddings
            if not self.embeddings.use_ELMo and not self.embeddings.use_BERT:
                batch_x[i] = to_vector_single(x_tokenized[i], self.embeddings,
                                              max_length_x)

            if self.preprocessor.return_casing:
                batch_a[i] = to_casing_single(x_tokenized[i], max_length_x)

            # store tag embeddings
            if self.y is not None:
                batch_y = self.y[(index *
                                  self.batch_size):(index * self.batch_size) +
                                 max_iter]

        if self.y is not None:
            batches, batch_y = self.preprocessor.transform(x_tokenized,
                                                           batch_y,
                                                           extend=extend)
        else:
            batches = self.preprocessor.transform(x_tokenized, extend=extend)

        batch_c = np.asarray(batches[0])

        batch_l = batches[1]

        if self.preprocessor.return_casing:
            return batch_x, batch_c, batch_a, batch_l, batch_y
        else:
            return batch_x, batch_c, batch_l, batch_y
Example #12
    def tokenize(self, text: str) -> List[str]:
        return tokenizeAndFilterSimple(text.replace(NBSP, ' '))
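A small sketch of the normalisation done by this wrapper, assuming NBSP is the non-breaking space '\u00a0' as elsewhere in the project: the non-breaking space is turned into a regular space before tokenization, so it acts as a token separator.

NBSP = '\u00a0'

text = 'page\u00a012 of\u00a020'
tokens = tokenizeAndFilterSimple(text.replace(NBSP, ' '))
# tokens == ['page', '12', 'of', '20']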
Example #13
    def __data_generation(self, index):
        'Generates data containing batch_size samples'
        max_iter = min(self.batch_size, len(self.original_x)-self.batch_size * index)

        # restrict data to index window
        sub_x = self.x[(index * self.batch_size):(index * self.batch_size) + max_iter]

        # tokenize texts in self.x if not already done
        # From: https://github.com/elifesciences/sciencebeam-trainer-delft/blob/c31f97433243a2b0a66671c0dd3e652dcd306362/sciencebeam_trainer_delft/sequence_labelling/data_generator.py#L102-L118
        if self.tokenize:
            x_tokenized = [
                tokenizeAndFilterSimple(text)
                for text in sub_x
            ]
        else:
            x_tokenized = sub_x

        max_length_f = max_length_x = max((len(tokens) for tokens in x_tokenized))

        if self.max_sequence_length and max_length_x > self.max_sequence_length:
            max_length_x = self.max_sequence_length
            # truncation of sequence at max_sequence_length
            x_tokenized = np.asarray(truncate_batch_values(x_tokenized, self.max_sequence_length))

        # prevent sequence of length 1 alone in a batch (this causes an error in tf)
        extend = False
        if max_length_x == 1:
            max_length_x += 1
            extend = True

        # generate data
        batch_a = np.zeros((max_iter, max_length_x), dtype='float32')

        if self.embeddings.use_ELMo:
            batch_x = to_vector_simple_with_elmo(x_tokenized, self.embeddings, max_length_x, extend=extend)
        elif self.embeddings.use_BERT:
            batch_x = to_vector_simple_with_bert(x_tokenized, self.embeddings, max_length_x, extend=extend)
        else:
            batch_x = np.zeros((max_iter, max_length_x, self.embeddings.embed_size), dtype='float32')
            # store sample embeddings
            for i in range(0, max_iter):
                batch_x[i] = to_vector_single(x_tokenized[i], self.embeddings, max_length_x)

        if self.preprocessor.return_casing:
            for i in range(0, max_iter):
                batch_a[i] = to_casing_single(x_tokenized[i], max_length_x)

        batch_y = None
        # store tag embeddings
        if self.y is not None:
            # note: tags are always already "tokenized",
            batch_y = self.y[(index*self.batch_size):(index*self.batch_size)+max_iter]
            max_length_y = max((len(y_row) for y_row in batch_y))

            # From: https://github.com/elifesciences/sciencebeam-trainer-delft/blob/c31f97433243a2b0a66671c0dd3e652dcd306362/sciencebeam_trainer_delft/sequence_labelling/data_generator.py#L152
            if self.max_sequence_length and max_length_y > self.max_sequence_length:
                # truncation of sequence at max_sequence_length
                batch_y = np.asarray(truncate_batch_values(batch_y, self.max_sequence_length))

        batch_f = np.zeros((batch_x.shape[0:2]), dtype='int32')

        if self.preprocessor.return_features:
            sub_f = self.features[(index * self.batch_size):(index * self.batch_size) + max_iter]
            if self.max_sequence_length and max_length_f > self.max_sequence_length:
                max_length_f = self.max_sequence_length
                # truncation of sequence at max_sequence_length
                sub_f = truncate_batch_values(sub_f, self.max_sequence_length)

            batch_f = self.preprocessor.transform_features(sub_f, extend=extend)

        if self.y is not None:
            batches, batch_y = self.preprocessor.transform(x_tokenized, batch_y, extend=extend)
        else:
            batches = self.preprocessor.transform(x_tokenized, extend=extend)

        batch_c = np.asarray(batches[0])
        batch_l = batches[1]

        return batch_x, batch_c, batch_f, batch_a, batch_l, batch_y