Example 1
    def endElement(self, name):
        # print("endElement '" + name + "'")
        if name == "p":
            # end of sentence: flush any remaining text as outside-entity tokens
            if self.accumulated != '':
                localTokens = tokenizeAndFilterSimple(self.accumulated)
                for token in localTokens:
                    self.tokens.append(token)
                    self.labels.append('O')

            self.sents.append(self.tokens)
            self.allLabels.append(self.labels)
            self.tokens = []
            self.labels = []
        if name == "rs":
            # end of entity: label its tokens with the B-/I- scheme
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            begin = True
            if self.currentLabel is None:
                # no type was recorded at the opening tag; fall back to 'O'
                self.currentLabel = 'O'
            for token in localTokens:
                self.tokens.append(token)
                if begin:
                    # first token of the entity gets the B- prefix, the rest I-
                    self.labels.append('B-' + self.currentLabel)
                    begin = False
                else:
                    self.labels.append('I-' + self.currentLabel)
            self.currentLabel = None
        self.accumulated = ''
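For reference, a minimal sketch (hypothetical values, not from the source) of the labels this "rs" branch emits for a three-token entity whose opening tag recorded the type insult (see the matching startElement in Example 2):

# Hypothetical illustration of the B-/I- prefixing above.
tokens = ["you", "stupid", "fool"]   # tokens inside an <rs type="insult"> span
currentLabel = '<insult>'            # as set by the matching startElement
labels = ['B-' + currentLabel] + ['I-' + currentLabel for _ in tokens[1:]]
print(labels)  # ['B-<insult>', 'I-<insult>', 'I-<insult>']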
Example 2
    def startElement(self, name, attrs):
        if self.accumulated != '':
            # flush text seen so far as outside-entity tokens
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            for token in localTokens:
                self.tokens.append(token)
                self.labels.append('O')
        if name == 'TEI' or name == 'tei':
            # beginning of a document
            self.tokens = []
            self.labels = []
            self.sents = []
            self.allLabels = []
        if name == "p":
            # beginning of sentence
            self.tokens = []
            self.labels = []
            self.currentLabel = 'O'
        if name == "rs":
            # beginning of entity
            if attrs.getLength() != 0:
                if attrs.getValue("type") not in ('insult', 'threat'):
                    # warn on unexpected types, but keep the label anyway
                    print("Invalid entity type:", attrs.getValue("type"))
                self.currentLabel = '<' + attrs.getValue("type") + '>'
        self.accumulated = ''
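These callbacks belong to a SAX content handler; a minimal sketch of how such a handler is typically driven with the standard library, assuming the class above subclasses xml.sax.ContentHandler (the TEIContentHandler name and the corpus path are placeholders, not from the source):

import xml.sax

handler = TEIContentHandler()        # placeholder name for the class shown above
parser = xml.sax.make_parser()
parser.setContentHandler(handler)
parser.parse("corpus.xml")           # placeholder path to a TEI file
print(len(handler.sents), "sentences parsed")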
Example 3
import numpy as np

def to_vector_single(text, embeddings, maxlen=300):
    """
    Given a string, tokenize it, then convert it to a sequence of word embedding
    vectors using the provided embeddings, introducing <PAD> and <UNK> vectors
    when appropriate.
    """
    tokens = tokenizeAndFilterSimple(clean_text(text))
    # keep only the last maxlen tokens
    window = tokens[-maxlen:]

    # TBD: use better initializers (uniform, etc.)
    x = np.zeros((maxlen, embeddings.embed_size))

    # TBD: padding should be left and which vector do we use for padding?
    # and what about masking padding later for RNN?
    for i, word in enumerate(window):
        x[i, :] = embeddings.get_word_vector(word).astype('float32')

    return x
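A minimal usage sketch, assuming to_vector_single and its helpers are in scope: the StubEmbeddings class below is a hypothetical stand-in that only illustrates the interface the function relies on (an embed_size attribute and a get_word_vector method returning one vector per word), not the project's real embeddings loader.

import numpy as np

class StubEmbeddings:
    # hypothetical stand-in: random vectors instead of pretrained embeddings
    embed_size = 4

    def get_word_vector(self, word):
        return np.random.rand(self.embed_size)

x = to_vector_single("This is a test.", StubEmbeddings(), maxlen=10)
print(x.shape)  # (10, 4): maxlen rows of embed_size, zero-padded at the end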
Example 4
    def startElement(self, name, attrs):
        if self.accumulated != '':
            # flush text seen so far as outside-entity tokens
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            for token in localTokens:
                self.tokens.append(token)
                self.labels.append('O')
        if name == 'corpus' or name == 'DOC':
            # beginning of a document
            self.tokens = []
            self.labels = []
            self.sents = []
            self.allLabels = []
        if name == "sentence":
            # beginning of sentence
            self.tokens = []
            self.labels = []
            self.currentLabel = 'O'
        if name == "ENAMEX":
            # beginning of entity
            if attrs.getLength() != 0:
                # the type attribute may appear in either case
                mainType = None
                if "type" in attrs:
                    mainType = attrs.getValue("type")
                if "TYPE" in attrs:
                    mainType = attrs.getValue("TYPE")
                if mainType is None:
                    print('ENAMEX element without type attribute!')
                else:
                    if "sub_type" in attrs:
                        subType = attrs.getValue("sub_type")
                    else:
                        subType = ''
                    if self.corpus_type == 'lemonde':
                        self.currentLabel = '<' + self.translate_fr_labels(
                            mainType, subType) + '>'
                    else:
                        self.currentLabel = '<' + mainType + '>'
        self.accumulated = ''
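A sketch of the kind of input this handler consumes, driven with xml.sax.parseString; the ENAMEXContentHandler name is a placeholder for the class these methods belong to, and the XML snippet is made up:

import xml.sax

data = b'<corpus><sentence>Hello <ENAMEX type="PERSON">John Smith</ENAMEX>.</sentence></corpus>'
handler = ENAMEXContentHandler()   # placeholder name
xml.sax.parseString(data, handler)
# by analogy with Example 1, the matching endElement would produce labels like
# ['O', 'B-<PERSON>', 'I-<PERSON>', 'O'] for ['Hello', 'John', 'Smith', '.']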
Example 5
    def __data_generation(self, index):
        """Generates data containing batch_size samples."""
        # the last batch may be smaller than batch_size
        max_iter = min(self.batch_size, len(self.x) - self.batch_size * index)

        # restrict data to the index window
        sub_x = self.x[(index * self.batch_size):(index * self.batch_size) +
                       max_iter]

        # tokenize texts in self.x if not already done
        max_length_x = 0
        if self.tokenize:
            x_tokenized = []
            for i in range(0, max_iter):
                tokens = tokenizeAndFilterSimple(sub_x[i])
                if len(tokens) > max_length_x:
                    max_length_x = len(tokens)
                x_tokenized.append(tokens)
        else:
            for tokens in sub_x:
                if len(tokens) > max_length_x:
                    max_length_x = len(tokens)
            x_tokenized = sub_x

        batch_x = np.zeros(
            (max_iter, max_length_x, self.embeddings.embed_size),
            dtype='float32')
        if self.preprocessor.return_casing:
            batch_a = np.zeros((max_iter, max_length_x), dtype='float32')

        batch_y = None
        if self.y is not None:
            # note: tags are always already "tokenized"; take the matching
            # window of label sequences
            batch_y = self.y[(index * self.batch_size):(index * self.batch_size) +
                             max_iter]

        if self.embeddings.use_ELMo:
            #batch_x = to_vector_elmo(x_tokenized, self.embeddings, max_length_x)
            batch_x = to_vector_simple_with_elmo(x_tokenized, self.embeddings,
                                                 max_length_x)

        # generate data
        for i in range(0, max_iter):
            # store sample embeddings
            if not self.embeddings.use_ELMo:
                batch_x[i] = to_vector_single(x_tokenized[i], self.embeddings,
                                              max_length_x)

            if self.preprocessor.return_casing:
                batch_a[i] = to_casing_single(x_tokenized[i], max_length_x)

        if self.y is not None:
            batches, batch_y = self.preprocessor.transform(
                x_tokenized, batch_y)
        else:
            batches = self.preprocessor.transform(x_tokenized)

        # character indices and sequence lengths produced by the preprocessor
        batch_c = np.asarray(batches[0])
        batch_l = batches[1]

        if self.preprocessor.return_casing:
            return batch_x, batch_c, batch_a, batch_l, batch_y
        else:
            return batch_x, batch_c, batch_l, batch_y
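The window arithmetic at the top of the method is what handles a short final batch; a self-contained illustration of that slicing, on hypothetical data:

batch_size = 4
x = list(range(10))
num_batches = (len(x) + batch_size - 1) // batch_size
for index in range(num_batches):
    max_iter = min(batch_size, len(x) - batch_size * index)
    print(index, x[index * batch_size:index * batch_size + max_iter])
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]
# 2 [8, 9]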