    def transform(self, X, y=None, entity_labels=None):
        """ Transforms the list of `Document` objects that are provided as
            input to the BIO format and returns features, tokens, and labels per word.
            Each word's features are stored as a list of values.
        """
        log.info("Generating features for {} documents...".format(len(X)))
        tokens_per_doc, labels_per_doc = \
            transform_annotated_documents_to_bio_format(X, entity_labels=entity_labels)
        tokens_flat = [token for tokens in tokens_per_doc for token in tokens]
        labels_flat = [label for labels in labels_per_doc for label in labels]
        pos_tags_flat = [pos_tag for tokens in tokens_per_doc for pos_tag in tokens_to_pos_tags(tokens)]

        features_flat = [self._word_to_context(tokens_flat, pos_tags_flat, idx)
                         for idx in range(len(tokens_flat))]

        if not self.encoders:
            # first time run
            for idx in range(len(features_flat[0])):
                if isinstance(features_flat[0][idx], str):
                    self.encoders[idx] = LabelEncoder()
                    column_vector = [features_flat[i][idx] for i in range(len(features_flat))]
                    column_vector.append(UNKNOWN_WORD)
                    self.encoders[idx].fit(column_vector)

        for idx in self.encoders:
            column_vector = [features_flat[i][idx] for i in range(len(features_flat))]
            self._process_unknown_values(column_vector, self.encoders[idx].classes_.tolist(), UNKNOWN_WORD)
            column_vector = self.encoders[idx].transform(column_vector).tolist()
            for i in range(len(features_flat)):
                features_flat[i][idx] = column_vector[i]

        return features_flat, tokens_flat, labels_flat
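The exact objects produced by `transform_annotated_documents_to_bio_format` are not shown in this snippet; the following minimal, self-contained sketch (tokens and entity labels are made up) illustrates the per-document BIO layout that the comprehensions above flatten.

tokens_per_doc = [["Aspirin", "reduces", "fever", "."]]   # made-up tokens
labels_per_doc = [["B-DRUG", "O", "O", "O"]]              # made-up BIO labels

# the same flattening used in transform()
tokens_flat = [token for tokens in tokens_per_doc for token in tokens]
labels_flat = [label for labels in labels_per_doc for label in labels]
assert tokens_flat == ["Aspirin", "reduces", "fever", "."]
assert labels_flat == ["B-DRUG", "O", "O", "O"]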
Example 2
def test_tokens_to_pos_tags():

    sentence = "The cat (feline) eats a mouse."
    split_tokens = sentence_to_tokens(sentence)
    pos_tags = tokens_to_pos_tags(split_tokens)

    assert_equal(['DT', 'NN', '(', 'NN', ')', 'VBZ', 'DT', 'NN', '.'],
                 pos_tags)
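# Hypothetical stand-in for tokens_to_pos_tags, assuming it wraps NLTK's default
# Penn Treebank tagger (which also emits the '(' and ')' tags asserted above);
# the library's real implementation may differ.
# Requires: nltk.download('averaged_perceptron_tagger')
import nltk

def tokens_to_pos_tags_sketch(tokens):
    # nltk.pos_tag returns (token, tag) pairs; keep only the tags
    return [tag for _, tag in nltk.pos_tag(tokens)]

print(tokens_to_pos_tags_sketch(["The", "cat", "eats", "a", "mouse", "."]))
# expected with the default tagger: ['DT', 'NN', 'VBZ', 'DT', 'NN', '.']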
    def transform(self, X, y=None):
        """ Transforms the list of documents and returns tokens with their features.
            Each document should represent a sentence.
        """
        log.info("Generating features for {} documents...".format(len(X)))
        features = []
        for doc in X:
            doc_features = []
            for pos_tag in tokens_to_pos_tags(document_to_tokens(doc)):
                if pos_tag in self.model.wv:
                    doc_features.append((pos_tag, self.model.wv[pos_tag]))
            features.append(doc_features)
        return features
    def fit(self,
            X,
            y=None,
            size=100,
            min_count=5,
            workers=1,
            window=5,
            sample=1e-3,
            skipgram=False):
        """ Trains a Word2vec model on given documents.
            Each document should represent a sentence.
        Args:
            X: list(Document | AnnotatedDocument | list(str))
            y: optional labels
            size: Word vector dimensionality
            min_count: Minimum word count
            workers: Number of threads to run in parallel
            window: Context window size
            sample: Downsample setting for frequent words
            skipgram: Use skip-gram if True and CBOW otherwise
        """
        log.info("Checking parameters...")
        self.config.set_parameters({
            "size": size,
            "min_count": min_count,
            "workers": workers,
            "window": window,
            "sample": sample
        })
        # Get sentences as lists of tokens
        log.info("Tokenizing {} documents...".format(len(X)))
        sentences = []
        for idx, doc in enumerate(X):
            sentences.append(tokens_to_pos_tags(document_to_tokens(doc)))
            log_progress(log, idx, len(X))
        # Initialize and train the model (this will take some time)
        log.info("Training Word2vec on {} sentences...".format(len(X)))
        self.model = Word2Vec(sentences,
                              workers=self.config.get_parameter("workers"),
                              size=self.config.get_parameter("size"),
                              min_count=self.config.get_parameter("min_count"),
                              window=self.config.get_parameter("window"),
                              sample=self.config.get_parameter("sample"),
                              sg=1 if skipgram else 0)

        # If you don't plan to train the model any further, calling
        # init_sims() will make the model much more memory-efficient.
        self.model.init_sims(replace=True)
        return self
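A hedged usage sketch for the `fit`/`transform` pair above: the estimator's class name is not shown in the snippet, so `PosTagWord2Vec` below is a placeholder, and the tiny corpus forces `min_count=1`.

docs = [["The", "cat", "eats", "a", "mouse", "."],      # list(str) documents
        ["A", "dog", "chases", "the", "cat", "."]]

model = PosTagWord2Vec()                                 # hypothetical class name
model.fit(docs, size=50, min_count=1, window=3, skipgram=True)

# every POS tag seen in the corpus now has a 50-dimensional vector
print(model.model.wv["NN"].shape)                        # (50,)
print(model.model.wv.most_similar("NN", topn=2))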
Example 5
    def _preprocessor(self, data):
        """ Helper function for interconversions.
        """

        tokens, labels = transform_annotated_documents_to_bio_format(data)
        pos_tags = []
        for t_i in tokens:
            pos_tags.append(tokens_to_pos_tags(t_i))
        sentences = []
        for i in range(len(tokens)):
            sentence = [
                (text, pos, label)
                for text, pos, label in zip(tokens[i], pos_tags[i], labels[i])
            ]
            sentences.append(sentence)

        features = [self._sent_to_features(s) for s in sentences]
        labels = [self._sent_to_labels(s) for s in sentences]

        return features, tokens, labels
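# _sent_to_features is not shown in this snippet. Below is a hypothetical,
# self-contained sketch of the kind of per-token feature dictionary it likely
# builds (the usual sklearn-crfsuite convention); the feature names are assumptions.
def word_to_features_sketch(sent, i):
    word, pos, _label = sent[i]
    features = {
        "bias": 1.0,
        "word.lower()": word.lower(),
        "word.istitle()": word.istitle(),
        "postag": pos,
    }
    if i > 0:
        prev_word, prev_pos, _ = sent[i - 1]
        features["-1:word.lower()"] = prev_word.lower()
        features["-1:postag"] = prev_pos
    else:
        features["BOS"] = True  # beginning of sentence
    return features

sent = [("Aspirin", "NN", "B-DRUG"), ("works", "VBZ", "O")]
print(word_to_features_sketch(sent, 0))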
    def transform(self, X, y=None, entity_labels=None):
        """ Transforms the list of `Document` objects that are provided as
            input to the BIO format and returns features, tokens, and labels per word.
            Each word's features are stored as a dictionary of name-value pairs.
        """
        tokens_per_doc, labels_per_doc = \
            transform_annotated_documents_to_bio_format(X, entity_labels=entity_labels)
        pos_tags_per_doc = []
        for tokens in tokens_per_doc:
            pos_tags_per_doc.append(tokens_to_pos_tags(tokens))
        sentences = []
        for i in range(len(tokens_per_doc)):
            sentence = [(text, pos, label) for text, pos, label in zip(
                tokens_per_doc[i], pos_tags_per_doc[i], labels_per_doc[i])]
            sentences.append(sentence)

        features_per_doc = [self._sent_to_features(s) for s in sentences]
        labels_per_doc = [self._sent_to_labels(s) for s in sentences]

        return features_per_doc, tokens_per_doc, labels_per_doc
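The dictionary-of-features output above follows the convention used by sklearn-crfsuite; the downstream model is not shown in the snippet, so the following is only a minimal, self-contained sketch of how such features and BIO labels could feed a CRF (the toy feature dicts and labels are made up).

import sklearn_crfsuite

X_feats = [[{"bias": 1.0, "word.lower()": "aspirin", "postag": "NN"},
            {"bias": 1.0, "word.lower()": "works", "postag": "VBZ"}]]
y_bio = [["B-DRUG", "O"]]

crf = sklearn_crfsuite.CRF(algorithm="lbfgs", max_iterations=20)
crf.fit(X_feats, y_bio)
print(crf.predict(X_feats))   # predictions for the toy document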
def main():
    parser = argparse.ArgumentParser(description='bioBERT_data_preprocessing.py')
    parser.add_argument('--path_train', dest='path_train', default=None, type=str)
    parser.add_argument('--path_test', dest='path_test', default=None, type=str)
    parser.add_argument('--label', dest='label', default=None, type=str)
    parser.add_argument('--output_dir_train', dest='output_dir_train', default=None, type=str)
    parser.add_argument('--output_dir_train_eval', dest='output_dir_train_eval', default=None, type=str)
    parser.add_argument('--output_dir_eval', dest='output_dir_eval', default=None, type=str)
    parser.add_argument('--output_dir_test', dest='output_dir_test', default=None, type=str)
    parser.add_argument('--path_google_bert', dest='path_google_bert', default=None, type=str)
    parser.add_argument('--output_dir_mapping_tokens', dest='output_dir_mapping_tokens', default=None, type=str)
    parser.add_argument('--output_dir_mapping_labels', dest='output_dir_mapping_labels', default=None, type=str)
    args = parser.parse_args()


    data_path_train = args.path_train
    data_path_test = args.path_test
    bi_train = BratInput(data_path_train).transform()
    bi_test = BratInput(data_path_test).transform()
    
    # Clean the data (train)
    data_train = retain_annotations(bi_train, args.label)
    clean_data_train = clean_annotated_documents(data_train)
    non_overlap_data_train = resolve_overlaps(clean_data_train)
    
    # Clean the data (test)
    data_test = retain_annotations(bi_test, args.label)
    clean_data_test = clean_annotated_documents(data_test)
    non_overlap_data_test = resolve_overlaps(clean_data_test)
    
    # Transform to BIO format
    bio_train = transform_annotated_documents_to_bio_format(non_overlap_data_train, entity_labels=args.label)
    bio_test = transform_annotated_documents_to_bio_format(non_overlap_data_test, entity_labels=args.label)
    
    
    ######################################
    ### Train, Eval and train-eval set ###
    enum1 = []
    enum2 = []
    enum3 = []
    for idx, inner in enumerate(bio_train[0], start=1):
        enum2.append(idx)
        for jdx, elt in enumerate(inner, start=1):
            enum1.append(jdx)
            enum3.append(elt)
                  
        
    # Number of tokens per sentence
    nest_token_enum = []
    for x in bio_train[0]:
        nest_token_enum.append(len(x))
        
    # Sentence ID
    numbers_ar = np.arange(1, len(non_overlap_data_train)+1)
    nest_token_enum_ar = np.asarray(nest_token_enum)
    sent_id = np.repeat(numbers_ar, nest_token_enum_ar)
    
            
    # BIO tags
    list_bio2 = []
    for x in bio_train[1]:
        list_bio2.append(x)
    flatten = lambda l: [item for sublist in l for item in sublist]
    flatten_BIO = flatten(list_bio2)
    
    
    # POS tags
    pos = tokens_to_pos_tags(enum3)
    
    
    # Sentence list with string
    sentence = []
    for i in sent_id:
        sentence.append(f'Sentence: {i}')
        
    
    # Create dataframes: train (60%), train_eval (80%), eval = train_eval - train (20%)
    d = {'Sentence_ID':sentence,'Token_ID':enum1, 'BIO':flatten_BIO, 'POS':pos, 'Token': enum3}
    df = pd.DataFrame(d)
    df['ID'] = df.index
    
    l = df[df["Token_ID"]==1].index.tolist()
    # insert an all-NaN separator row at each point where Token_ID restarts at 1
    for c, i in enumerate(l):
        dfs = np.split(df, [i+1+c])
        df = pd.concat([dfs[0], pd.DataFrame([[np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN]], columns=df.columns), dfs[1]], ignore_index=True)
    ################
    ### Test set ###
    from itertools import repeat

    # Re-enumerate tokens for the test split (mirrors the train section above)
    enum1 = []
    enum2 = []
    enum3 = []
    for idx, inner in enumerate(bio_test[0], start=1):
        enum2.append(idx)
        for jdx, elt in enumerate(inner, start=1):
            enum1.append(jdx)
            enum3.append(elt)

    # Number of tokens per sentence
    nest_token_enum = []
    for x in bio_test[0]:
        nest_token_enum.append(len(x))

    # Sentence ID
    numbers_ar = np.arange(1, len(non_overlap_data_test)+1)
    nest_token_enum_ar = np.asarray(nest_token_enum)
    sent_id = np.repeat(numbers_ar, nest_token_enum_ar)

    # BIO tags
    list_bio2 = []
    for x in bio_test[1]:
        list_bio2.append(x)
    flatten = lambda l: [item for sublist in l for item in sublist]
    flatten_BIO = flatten(list_bio2)

    # Create POS tags
    pos = tokens_to_pos_tags(enum3)

    # Sentence list with string
    sentence = []
    for i in sent_id:
        sentence.append(f'Sentence: {i}')

    # Create a dataframe (test)
    d = {'Sentence_ID':sentence,'Token_ID':enum1, 'BIO':flatten_BIO, 'POS':pos, 'Token': enum3}
    df = pd.DataFrame(d)
    df['ID'] = df.index

    # insert an all-NaN separator row at each point where Token_ID restarts at 1
    l = df[df["Token_ID"]==1].index.tolist()
    for c, i in enumerate(l):
        dfs = np.split(df, [i+1+c])
        df = pd.concat([dfs[0], pd.DataFrame([[np.NaN, np.NaN, np.NaN, np.NaN, np.NaN, np.NaN]], columns=df.columns), dfs[1]], ignore_index=True)
Example 9
    def fit(
            self,
            X,
            y=None,
            X_valid=None,
            char_emb_size=32,
            word_emb_size=128,
            char_lstm_units=32,
            word_lstm_units=128,
            pos_emb_size=16,
            dropout=0.5,
            batch_size=8,
            num_epochs=10,
            use_crf=False,
            use_char_emb=False,
            shuffle=False,
            use_pos_emb=False,
            hparams_1={},  # specific to config 
            hparams_2={}):  # other params
        """ Trains the NER model. The input is a list of
            `AnnotatedDocument` instances.

            Be careful with the batch size: it must satisfy
            len(X) % batch_size == 0, otherwise training occasionally
            crashes with an error. A training example here is a token
            paired with its assigned tag (the BIO scheme).
        """

        log.info("Checking parameters...")
        self.config.set_parameters({
            "num_epochs": num_epochs,
            "dropout": dropout,
            "batch_size": batch_size,
            "char_emb_size": char_emb_size,
            "word_emb_size": word_emb_size,
            "char_lstm_units": char_lstm_units,
            "word_lstm_units": word_lstm_units,
            "pos_emb_size": pos_emb_size,
            "use_crf": use_crf,
            "use_char_emb": use_char_emb,
            "shuffle": shuffle,
            "use_pos_emb": use_pos_emb
        })

        if hparams_1:
            self.config.set_parameters(hparams_1)
        self.config.validate()
        if hparams_2:
            self.hparams.update(hparams_2)

        log.info("Transforming {} items to BIO format...".format(len(X)))
        X_train, Y_train = self._transform_to_bio(X)

        pos_tags = []
        if use_pos_emb:
            log.info("Getting POS tags for {} items...".format(len(X)))
            for idx, x in enumerate(X_train):
                pos_tags.append(tokens_to_pos_tags(x))
                log_progress(log, idx, len(X))

        self.p = IndexTransformer(
            use_char=self.config.get_parameter("use_char_emb"))
        self.p.fit(X_train, Y_train)
        self.word_embeddings = filter_embeddings(
            self.word_embeddings, self.p._word_vocab.vocab,
            self.config.get_parameter("word_emb_size"))

        # compile the model architecture
        self._compile_model()

        train_seq = NERSequence(X_train,
                                Y_train,
                                self.config.get_parameter("batch_size"),
                                preprocess=self.p.transform)

        valid_seq = None
        f1 = None
        if X_valid:
            X_valid, Y_valid = self._transform_to_bio(X_valid)
            valid_seq = NERSequence(X_valid, Y_valid, batch_size,
                                    self.p.transform)
            f1 = F1score(valid_seq, preprocessor=self.p)

        # collect callbacks: any user-provided ones plus the F1 callback, if present
        callbacks = list(self.hparams.get('callbacks', []))
        if f1 is not None:
            callbacks.append(f1)

        # train model
        log.info("Training BiLSTM...")
        self.model.fit_generator(
            generator=train_seq,
            epochs=self.config.get_parameter("num_epochs"),
            validation_data=valid_seq,
            verbose=1,
            shuffle=self.config.get_parameter("shuffle"),
            callbacks=callbacks)

        return self
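The docstring's divisibility constraint can be checked up front; below is a minimal, self-contained sketch (the trimming helper is not part of the library).

def trim_to_batch_multiple(items, batch_size):
    # keep only as many items as fit into whole batches
    usable = (len(items) // batch_size) * batch_size
    return items[:usable]

docs = list(range(83))                  # stand-in for 83 AnnotatedDocument objects
batch_size = 8
docs = trim_to_batch_multiple(docs, batch_size)
assert len(docs) % batch_size == 0      # 80 documents remain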
Example 10
    def transform(self, X, y=None):
        """ Annotates the list of `Document` objects that are provided as
            input and returns a list of `AnnotatedDocument` objects.
        """
        log.info(
            "Annotating named entities in {} documents with BiLSTM...".format(
                len(X)))
        annotated_documents = []
        for idx, document in enumerate(X):

            # get tokens
            tokens, _ = transform_annotated_document_to_bio_format(document)

            # encode tokens and pad the sequence
            coded_tokens = [
                self.encoder.encode_word(token) for token in tokens
            ]
            x = pad_sequences(maxlen=self.encoder.max_len,
                              sequences=[coded_tokens],
                              padding="post",
                              value=self.encoder.encode_word(PAD_WORD))
            inputs = [x]

            # add encoded and padded char sequences if needed
            if self.config.get_parameter("use_char_emb"):
                c = [[self.encoder.encode_char(char) for char in token]
                     for token in tokens]
                c = pad_sequences(
                    maxlen=self.encoder.max_len_char,
                    sequences=c,
                    padding="post",
                    value=self.encoder.encode_char(PAD_CHAR)).tolist()
                # add padding chars for padding words
                for i in range(len(tokens), self.encoder.max_len):
                    c.append([self.encoder.encode_char(PAD_CHAR)] *
                             self.encoder.max_len_char)
                c = np.array([c], ndmin=3)
                inputs.append(c)

            # add encoded and padded POS tag sequences if needed
            if self.config.get_parameter("use_pos_emb"):
                pos_tags = tokens_to_pos_tags(tokens)
                coded_pos_tags = [
                    self.encoder.encode_pos(pos) for pos in pos_tags
                ]
                p = pad_sequences(maxlen=self.encoder.max_len,
                                  sequences=[coded_pos_tags],
                                  padding="post",
                                  value=self.encoder.encode_pos(PAD_POS))
                inputs.append(p)

            # get predicted tags
            output = self.model.predict(x=inputs)
            coded_tags = np.argmax(output, axis=-1)[0]
            tags = [self.encoder.decode_tag(idx) for idx in coded_tags]
            tags = tags[:len(tokens)]

            # annotate a document
            annotated_documents.append(
                transform_bio_tags_to_annotated_document(
                    tokens, tags, document))
            # info
            log_progress(log, idx, len(X))

        return annotated_documents
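The decoding step at the end of the loop (argmax over per-token class scores, then truncating the padded tail) can be reproduced in isolation; the tag inventory and probabilities below are made up.

import numpy as np

tag_vocab = ["O", "B-DRUG", "I-DRUG", "<pad>"]        # made-up tag inventory
output = np.array([[[0.10, 0.80, 0.05, 0.05],          # token 1 -> B-DRUG
                    [0.70, 0.10, 0.10, 0.10],          # token 2 -> O
                    [0.05, 0.05, 0.05, 0.85]]])        # padding position
coded_tags = np.argmax(output, axis=-1)[0]
tags = [tag_vocab[i] for i in coded_tags]

tokens = ["Aspirin", "works"]
tags = tags[:len(tokens)]              # drop predictions made for padding positions
print(tags)                             # ['B-DRUG', 'O']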
Example 11
    def fit(self,
            X,
            y=None,
            char_emb_size=32,
            word_emb_size=128,
            char_lstm_units=32,
            word_lstm_units=128,
            pos_emb_size=16,
            dropout=0.5,
            batch_size=8,
            num_epochs=10,
            use_crf=False,
            use_char_emb=False,
            shuffle=False,
            use_pos_emb=False):
        """ Trains the NER model. The input is a list of
            `AnnotatedDocument` instances.

            Be careful with the batch size: it must satisfy
            len(X) % batch_size == 0, otherwise training occasionally
            crashes with an error. A training example here is a token
            paired with its assigned tag (the BIO scheme).
        """

        log.info("Checking parameters...")
        self.config.set_parameters({
            "num_epochs": num_epochs,
            "dropout": dropout,
            "batch_size": batch_size,
            "char_emb_size": char_emb_size,
            "word_emb_size": (self.word_embeddings.vector_size
                              if self.word_embeddings else word_emb_size),
            "char_lstm_units": char_lstm_units,
            "word_lstm_units": word_lstm_units,
            "pos_emb_size": pos_emb_size,
            "use_crf": use_crf,
            "use_char_emb": use_char_emb,
            "shuffle": shuffle,
            "use_pos_emb": use_pos_emb
        })
        self.config.validate()

        log.info("Transforming {} items to BIO format...".format(len(X)))
        X_train, Y_train = self._transform_to_bio(X)

        pos_tags = []
        if use_pos_emb:
            log.info("Getting POS tags for {} items...".format(len(X)))
            for idx, x in enumerate(X_train):
                pos_tags.append(tokens_to_pos_tags(x))
                log_progress(log, idx, len(X))

        # fit encoder
        self.encoder.fit(X=X_train,
                         Y=Y_train,
                         use_chars=self.config.get_parameter("use_char_emb"),
                         pos_tags=pos_tags)

        # compile the model architecture
        self._compile_model()

        # encode and pad word sequences
        X = [[self.encoder.encode_word(word) for word in x] for x in X_train]
        X = pad_sequences(maxlen=self.encoder.max_len,
                          sequences=X,
                          padding="post",
                          value=self.encoder.encode_word(PAD_WORD))

        # add X to inputs
        inputs = [X]

        # encode and pad tag sequences
        Y = [[self.encoder.encode_tag(tag) for tag in y] for y in Y_train]
        Y = pad_sequences(maxlen=self.encoder.max_len,
                          sequences=Y,
                          padding="post",
                          value=self.encoder.encode_tag(PAD_TAG))
        Y = np.array(
            [to_categorical(y, num_classes=self.encoder.tag_count) for y in Y])

        # encode and pad character sequences if needed
        if self.config.get_parameter("use_char_emb"):
            C = []
            for x in X_train:
                c = [[self.encoder.encode_char(char) for char in word]
                     for word in x]
                c = pad_sequences(
                    maxlen=self.encoder.max_len_char,
                    sequences=c,
                    padding="post",
                    value=self.encoder.encode_char(PAD_CHAR)).tolist()
                # add padding chars for padding words
                for i in range(len(x), self.encoder.max_len):
                    c.append([self.encoder.encode_char(PAD_CHAR)] *
                             self.encoder.max_len_char)

                C.append(c)

            C = np.array(C, ndmin=3)
            inputs.append(C)

        # encode and pad POS tag sequences if needed
        if self.config.get_parameter("use_pos_emb"):
            P = [[self.encoder.encode_pos(pos) for pos in pos_seq]
                 for pos_seq in pos_tags]
            P = pad_sequences(maxlen=self.encoder.max_len,
                              sequences=P,
                              padding="post",
                              value=self.encoder.encode_pos(PAD_POS))
            inputs.append(P)

        # train model
        log.info("Training BiLSTM...")
        self.model.fit(x=inputs,
                       y=Y,
                       epochs=self.config.get_parameter("num_epochs"),
                       batch_size=self.config.get_parameter("batch_size"),
                       verbose=1,
                       shuffle=self.config.get_parameter("shuffle"))
        return self
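A self-contained illustration of the padding and one-hot steps above, using the same Keras utilities the snippet relies on (import paths assume the standalone keras package; under tf.keras they differ slightly). The tag indices are made up.

import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

PAD_TAG_IDX = 0
max_len, tag_count = 5, 4

Y = [[1, 2], [3, 1, 2]]                                # two encoded tag sequences
Y = pad_sequences(maxlen=max_len, sequences=Y,
                  padding="post", value=PAD_TAG_IDX)   # shape (2, 5)
Y = np.array([to_categorical(y, num_classes=tag_count) for y in Y])
print(Y.shape)                                          # (2, 5, 4)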