Example #1
    def predict(self, input_dir, output_dir, rw_type, input_format, chunk_len=100,
                test_scores=False,
                output_confidence=False,
                special_model_path=None):
        """
        Tags each file in the input directory (txt or tsv files) and writes the results
        to output_dir. Also adds a folder "result_stats" with runtime information to the
        output_dir.

        tsv files must have at least the columns 'tok' and 'sentstart'.
        :param input_dir: string value: path to input directory
        :param output_dir: string value: path to output directory
        :param rw_type: string value: direct, indirect, freeIndirect or reported
        :param input_format: string value: txt or tsv
        :param chunk_len: maximum number of tokens per chunk passed to the tagger
        :param test_scores: if True, compute f1/precision/recall against the gold column rw_type
        :param output_confidence: if True, add a column with the model's confidence values
        :param special_model_path: optional subdirectory of "models" containing final-model.pt;
                                   overrides the default model for rw_type
        :return: None; results are written to output_dir
        """
        # time the prediction
        start_time = datetime.datetime.now().replace(microsecond=0)
        # create a subdir for testing and overview information in the outputdir
        result_subdir = "result_stats"
        if not os.path.exists(os.path.join(output_dir, result_subdir)):
            os.makedirs(os.path.join(output_dir, result_subdir))

        # load the model
        # determine the current script path
        curr_path = os.path.dirname(os.path.abspath(__file__))
        if special_model_path is None:
            model_path = os.path.join(curr_path, "models", rw_type, "final-model.pt")
        else:
            model_path = os.path.join(curr_path, "models", special_model_path, "final-model.pt")
        if not os.path.exists(model_path):
            logging.warning("Predicting {} aborted. Model not found at path '{}'. Please download a model and put it into "
                          "the appropriate directory. The model file must be named final-model.pt.".format(rw_type, model_path))
        else:
            self.logger.info("loading model {}".format(model_path))
            model = SequenceTagger.load(model_path)
            self.logger.info("model loaded")

            # if test mode, collect score data (initialize in any case)
            score_dict = {"file": [], "f1":[], "precision": [], "recall": []}
            all_predictions_df = pd.DataFrame()

            input_files = [x for x in os.listdir(input_dir)]
            for file in input_files:
                resfile_name = re.sub(r"\..+$", ".tsv", file)
                self.logger.info("predicting {}".format(file))
                # read the file and convert to dataframe
                if input_format == "txt":
                    data = self.convert_txtfile_to_dateframe(os.path.join(input_dir, file))
                else:
                    data = pd.read_csv(os.path.join(input_dir, file), sep="\t", quoting=3, encoding="utf-8", na_values=[])

                # check for tok column:
                if "tok" not in data.columns:
                    self.logger.warning("Column 'tok' is missing in file {}. File will be skipped.".format(file))
                else:
                    if "sentstart" not in data.columns:
                        self.logger.warning("Column 'sentstart' is missing in file {}. Will be added with default values (all 'no').".format(file))
                        data["sentstart"] = ["no"]*len(data)

                    self.logger.debug("TEST: data head:\n {}".format(data.head(10)))
                    # create sentlist (based on max chunk length)
                    sent_list = self.create_sentlist_from_file_batchmax(data,
                                                                        maxlen=chunk_len,
                                                                        compare_column="NaN")
                    # predict
                    res_dict = {"tok": [], rw_type + "_pred": [], rw_type + "_conf": []}
                    for sent in sent_list:
                        model.predict(sent)
                        pred_list = [x["type"] for x in sent.to_dict("cat")["entities"]]
                        res_dict["tok"].extend([x["text"] for x in sent.to_dict("cat")["entities"]])
                        res_dict[rw_type + "_conf"].extend([x["confidence"] for x in sent.to_dict("cat")["entities"]])
                        res_dict[rw_type + "_pred"].extend(pred_list)
                    pred_df = pd.DataFrame(res_dict)
                    # create output
                    # if there is a mismatch in file length after prediction, still save the results
                    if (len(data) != len(pred_df)):
                        self.logger.warning("File length changed when predicting for file {} (before: {}, after: {})\n"
                                        "Result file will be saved with prefix 'warn_'; additional columns are lost."
                                      .format(file, len(data), len(pred_df)))
                        pred_df.to_csv(os.path.join(output_dir, "warn_" + resfile_name), index=False, sep="\t")
                    # if everything is okay, add the new column(s) to the original data and save
                    else:
                        if output_confidence:
                            data[rw_type + "_conf"] = pred_df[rw_type + "_conf"]
                        data[rw_type + "_pred"] = pred_df[rw_type + "_pred"]
                        data.to_csv(os.path.join(output_dir, resfile_name), index=False, sep="\t", encoding="utf-8")
                        # calculate the testscores:
                        if test_scores:
                            self.logger.info("Calculate scores for {}".format(file))
                            if rw_type in data.columns and rw_type + "_pred" in data.columns:
                                data, f1, prec, rec = self.calculate_scores(data, rw_type)
                                score_dict["file"].append(file)
                                score_dict["f1"].append(f1)
                                score_dict["precision"].append(prec)
                                score_dict["recall"].append(rec)
                                all_predictions_df = all_predictions_df.append(data)
                            else:
                                self.logger.warning("Skipping test scores for file {}: Missing column {} and/or {}".format(file, rw_type, rw_type + "_pred"))

            end_time = datetime.datetime.now().replace(microsecond=0)

            # write an overview file when the process is finished
            res_text = "RW Tagger (predict): Model {}\n" \
                       "Predict time:\nstart: {}nend:{}\ntotal: {}" \
                .format(model_path, start_time, end_time, end_time - start_time)
            # if in test mode, calculate the final scores (for all the data) and save the test score df
            if test_scores:
                self.logger.info("Calculate total scores")
                if len(all_predictions_df) > 0:
                    self.logger.debug("all_predictions_len: {}".format(len(all_predictions_df)))
                    all_predictions_df, f1, prec, rec = self.calculate_scores(all_predictions_df, rw_type)
                    score_dict["file"].append("total")
                    score_dict["f1"].append(f1)
                    score_dict["precision"].append(prec)
                    score_dict["recall"].append(rec)
                    score_df = pd.DataFrame(score_dict)
                    score_df.to_csv(os.path.join(output_dir, result_subdir, rw_type + "_test_scores.tsv"), index=False, sep="\t", encoding="utf-8")
                    res_text += "\nTotal test scores (for detailed scores see {}_test_scores.tsv):\n" \
                                "f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec)
                    self.logger.info("Total scores for {}: f1: {}, precision: {}, recall: {}".format(rw_type, f1, prec, rec))
            with open(os.path.join(output_dir, result_subdir, rw_type + "_overview.txt"), "w", encoding="utf-8") as f:
                f.write(res_text)
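
# A minimal usage sketch for the predict() method above (not part of the original source).
# The enclosing class is not shown in this snippet, so the class name RWTagger is an assumption:
#
#     tagger = RWTagger()
#     tagger.predict(input_dir="data/input",
#                    output_dir="data/output",
#                    rw_type="direct",
#                    input_format="txt",
#                    output_confidence=True)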
Example #2
    # comment in these lines to use contextual string embeddings
    # CharLMEmbeddings('news-forward'),
    # CharLMEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# initialize trainer
from flair.trainers.sequence_tagger_trainer import SequenceTaggerTrainer

trainer: SequenceTaggerTrainer = SequenceTaggerTrainer(tagger,
                                                       corpus,
                                                       test_mode=True)

trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=20)
Example #3
    # comment in this line to use character embeddings
    # CharacterEmbeddings(),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

# 6. initialize trainer
from flair.trainers import ModelTrainer

# trainer: ModelTrainer = ModelTrainer(tagger, corpus)

checkpoint = 'resources/taggers/ner_with_random_dp_1/checkpoint.pt'
trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

# 7. start training
trainer.train('resources/taggers/ner_with_random_dp_1',
              learning_rate=0.1,
              mini_batch_size=32)
Example #4

import argparse
import flair
import torch

# NOTE: the original argparse setup is not shown in this snippet; the flag names below
# are assumed from how args.input, args.model and args.gpu are used further down.
parser = argparse.ArgumentParser()
parser.add_argument('--input', help='Folder containing the column-format data files')
parser.add_argument('--model', help='Path to the trained model file')
parser.add_argument('--gpu', default='cpu',
                    help='Use gpu/cpu, put "cuda" if gpu and "cpu" if cpu')

args = parser.parse_args()
input_folder=args.input
model_file=args.model
gpu_type=args.gpu


flair.device = torch.device(gpu_type)
from flair.data import Sentence, Corpus
from flair.datasets import ColumnCorpus
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from tqdm import tqdm

#Change this line if you have POS tags in your data, eg.- {0: 'text', 1:'pos', 2:'ner'}
columns = {0: 'text',1: 'ner'}

corpus: ColumnCorpus = ColumnCorpus(input_folder, column_format=columns)

tagger = SequenceTagger.load(model_file)
print("Dev set results")
result, _ = tagger.evaluate(corpus.dev)
print(result.detailed_results)
print("Test set results")
result, _ = tagger.evaluate(corpus.test)
print(result.detailed_results)
Example #5
from tensorflow.python.keras.backend import set_session
from tensorflow.python.keras.models import load_model

#sess = tf.Session()
#graph = tf.get_default_graph()

# IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras! 
# Otherwise, their weights will be unavailable in the threads after the session there has been set
#set_session(sess)

print(datetime.datetime.now(), 'loading Bi-LSTM.h5')
cat_model = load_model('Bi-LSTM.h5')
print(datetime.datetime.now(), 'Category model loaded')


ner_model = SequenceTagger.load('checkpoint.pt')
print(datetime.datetime.now(), 'NER model loaded')

with open('./token.pkl','rb') as infile:
	token = pickle.load(infile)
		
with open('./cols.pkl','rb') as infile:
	cols = pickle.load(infile)


app = Flask(__name__)


def predict_category(sentence):	
	seq_x = sequence.pad_sequences(token.texts_to_sequences([sentence]), maxlen=60)
	
Example #6
from flair.data import Sentence
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger.load("ner")

sentence: Sentence = Sentence("George Washington went to Washington .")
tagger.predict(sentence)

print("Analysing THE sentence %s" % sentence)

print("\nThe following NER tags are found: \n")
print(sentence.to_tagged_string())
Example #7
from flair.data import Sentence
from flair.models import SequenceTagger

# load the NER tagger
tagger = SequenceTagger.load('ner')


def parseXML():
    import glob
    d = "/home/xtof/arxiv/"
    fs = glob.glob(d + "*.xml")
    for f in fs:
        print("FICH ", f)
        with open(f, "r") as ff:
            indesc = False
            abs = ""
            for l in ff:
                if '<dc:description>' in l:
                    i = l.find('<dc:description>')
                    l = l[i + 16:]
                    indesc = True
                if indesc: abs += l.strip() + " "
                if '</dc:description>' in l:
                    i = abs.find('</dc:description>')
                    abs = abs[:i].strip()
                    indesc = False
                    # print(abs)
                    sentence = Sentence(abs)
                    tagger.predict(sentence)
                    for entity in sentence.get_spans('ner'):
                        print("NER", entity)
Example #8
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from flair.models import SequenceTagger

from components.data_ETL import load_text, create_upload_tab_html_output

# Env variables
PSEUDO_REST_API_URL = os.environ.get('PSEUDO_REST_API_URL', '')
PSEUDO_MODEL_PATH = os.environ.get('PSEUDO_MODEL_PATH', '')
TAGGER = None
if not PSEUDO_REST_API_URL and not PSEUDO_MODEL_PATH:
    print("Neither the pseudonymization service nor a trained model are available. We cannot continue :(")
    exit(1)
elif (not PSEUDO_REST_API_URL and PSEUDO_MODEL_PATH) or (PSEUDO_MODEL_PATH and PSEUDO_REST_API_URL):
    TAGGER = SequenceTagger.load(PSEUDO_MODEL_PATH)

with open("./assets/text_files/upload_example.txt", "r") as example:
    TEXTE_EXEMPLE = example.read()

tab_upload_content = dbc.Tab(
    label='Pseudonymisez un document',
    tab_id="tab-upload",
    children=html.Div(className='control-tab', children=[
        html.Div([html.P("Veuillez choisir un fichier à analyser (type .odt, .doc, .docx, .txt. Max 100 Ko)"),
                  html.P([html.B("Attention: "),
                          "cette application n'est qu'une démo,  aucune donnée n'est conservée. Veillez à ne pas transmettre d’informations sensibles."])],
                 className='app-controls-block'),
        html.Div(
            id='seq-view-fast-upload',
            children=dcc.Upload(id='upload-data',
Example #9
from flair.data import Sentence
from flair.models import SequenceTagger

texts = ['Hello, World', 'Lorem ipsum dolor sit amet']

tagger = SequenceTagger.load('./temp/best-model.pt')

for text in texts:
    # predict NER tags
    sentence = Sentence(text)
    tagger.predict(sentence)

    print(f'****** {text}')
    spans = sentence.get_spans('ner')
    if not spans:
        print(f'No entities found')

    for entity in spans:
        print({
            "start": entity.start_pos,
            "end": entity.end_pos,
            "label": entity.tag
        })

    print('****\n')
Example #10
from flair.data import Sentence
from flair.models import SequenceTagger

model = SequenceTagger.load('output/best-model.pt')

fr = open('../data/example_recipe.txt', 'r')
for line in fr:
    sentence = Sentence(line)
    model.predict(sentence)
    print(sentence.to_tagged_string())
Example #11
    def __init__(
            self,
            task_name: str,
            tag_dictionary: Dictionary,
            tag_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TextClassifier
        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc
        :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive labels against a sentence during training. Defaults to 2 negative
        labels for each positive label. The model would sample all the negative labels
        if None is passed. That slows down the training considerably.
        """
        super(TARSTagger, self).__init__()

        from flair.embeddings import TransformerWordEmbeddings

        if not isinstance(embeddings, TransformerWordEmbeddings):
            embeddings = TransformerWordEmbeddings(model=embeddings,
                                                   fine_tune=True,
                                                   layers='-1',
                                                   layer_mean=False,
                                                   )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('O')
        tars_dictionary.add_item('S-')
        tars_dictionary.add_item('B-')
        tars_dictionary.add_item('E-')
        tars_dictionary.add_item('I-')

        # initialize a bare-bones sequence tagger
        self.tars_model = SequenceTagger(123,
                                         embeddings,
                                         tag_dictionary=tars_dictionary,
                                         tag_type=self.static_label_type,
                                         use_crf=False,
                                         use_rnn=False,
                                         reproject_embeddings=False,
                                         **tagger_args,
                                         )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, tag_dictionary, tag_type)
Example #12
class TARSTagger(FewshotClassifier):
    """
    TARS model for sequence tagging. In the backend, the model uses a BERT based 5-class
    sequence labeler which given a <label, text> pair predicts the probability for each word
    to belong to one of the BIOES classes. The input data is a usual Sentence object which is inflated
    by the model internally before pushing it through the transformer stack of BERT.
    """

    static_label_type = "tars_label"

    def __init__(
            self,
            task_name: str,
            tag_dictionary: Dictionary,
            tag_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TextClassifier
        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc
        :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive labels against a sentence during training. Defaults to 2 negative
        labels for each positive label. The model would sample all the negative labels
        if None is passed. That slows down the training considerably.
        """
        super(TARSTagger, self).__init__()

        from flair.embeddings import TransformerWordEmbeddings

        if not isinstance(embeddings, TransformerWordEmbeddings):
            embeddings = TransformerWordEmbeddings(model=embeddings,
                                                   fine_tune=True,
                                                   layers='-1',
                                                   layer_mean=False,
                                                   )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('O')
        tars_dictionary.add_item('S-')
        tars_dictionary.add_item('B-')
        tars_dictionary.add_item('E-')
        tars_dictionary.add_item('I-')

        # initialize a bare-bones sequence tagger
        self.tars_model = SequenceTagger(123,
                                         embeddings,
                                         tag_dictionary=tars_dictionary,
                                         tag_type=self.static_label_type,
                                         use_crf=False,
                                         use_rnn=False,
                                         reproject_embeddings=False,
                                         **tagger_args,
                                         )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, tag_dictionary, tag_type)

    def _get_tars_formatted_sentence(self, label, sentence):

        original_text = sentence.to_tokenized_string()

        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

        # make a tars sentence where all labels are O by default
        tars_sentence = Sentence(label_text_pair, use_tokenizer=False)
        for token in tars_sentence:
            token.add_tag(self.static_label_type, "O")

        # overwrite O labels with tags
        for token in sentence:
            tag = token.get_tag(self.get_current_label_type()).value

            if tag == "O":
                tars_tag = "O"
            elif tag == label:
                tars_tag = "S-"
            elif tag[1] == "-" and tag[2:] == label:
                tars_tag = tag.split('-')[0] + '-'
            else:
                tars_tag = "O"

            tars_sentence.get_token(token.idx + label_length).add_tag(self.static_label_type, tars_tag)

        return tars_sentence

    def _get_state_dict(self):
        model_state = {
            "state_dict": self.state_dict(),

            "current_task": self._current_task,
            "tag_type": self.get_current_label_type(),
            "tag_dictionary": self.get_current_label_dictionary(),
            "tars_model": self.tars_model,
            "num_negative_labels_to_sample": self.num_negative_labels_to_sample,
            "prefix": self.prefix,

            "task_specific_attributes": self._task_specific_attributes,
        }
        return model_state

    @staticmethod
    def _init_model_with_state_dict(state):
        print("init TARS")

        # init new TARS classifier
        model = TARSTagger(
            task_name=state["current_task"],
            tag_dictionary=state["tag_dictionary"],
            tag_type=state["tag_type"],
            embeddings=state["tars_model"].embeddings,
            num_negative_labels_to_sample=state["num_negative_labels_to_sample"],
            prefix=state["prefix"],
        )
        # set all task information
        model.task_specific_attributes = state["task_specific_attributes"]
        # linear layers of internal classifier
        model.load_state_dict(state["state_dict"])
        return model

    @property
    def tars_embeddings(self):
        return self.tars_model.embeddings

    def predict(
            self,
            sentences: Union[List[Sentence], Sentence],
            mini_batch_size=32,
            verbose: bool = False,
            label_name: Optional[str] = None,
            return_loss=False,
            embedding_storage_mode="none",
    ):
        # return
        """
        Predict sequence tags for Named Entity Recognition task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch, usually bigger is more rapid but consume more memory,
        up to a point when it has no more effect.
        :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.get_current_label_type()

        # with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # set context if not set already
        previous_sentence = None
        for sentence in sentences:
            if sentence.is_context_set(): continue
            sentence._previous_sentence = previous_sentence
            sentence._next_sentence = None
            if previous_sentence: previous_sentence._next_sentence = sentence
            previous_sentence = sentence

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(
            range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True
        )

        reordered_sentences: List[Union[Sentence, str]] = [
            sentences[index] for index in rev_order_len_index
        ]

        dataloader = DataLoader(
            dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size
        )

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        overall_count = 0
        batch_no = 0
        with torch.no_grad():
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(f"Inferencing on batch {batch_no}")

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    for token in sentence:
                        token.remove_labels(label_name)

                    all_labels = [label.decode("utf-8") for label in self.get_current_label_dictionary().idx2item]

                    all_detected = {}
                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                        label_length = 0 if not self.prefix else len(label.split(" ")) + len(self.separator.split(" "))

                        loss_and_count = self.tars_model.predict(tars_sentence,
                                                                 label_name=label_name,
                                                                 all_tag_prob=True,
                                                                 return_loss=True)
                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        for span in tars_sentence.get_spans(label_name):
                            span.set_label('tars_temp_label', label)
                            all_detected[span] = span.score

                        for span in tars_sentence.get_spans(label_name):
                            for token in span:
                                corresponding_token = sentence.get_token(token.idx - label_length)
                                if corresponding_token is None: continue
                                if corresponding_token.get_tag(label_name).value != '' and \
                                        corresponding_token.get_tag(label_name).score > token.get_tag(label_name).score:
                                    continue
                                corresponding_token.add_tag(
                                    label_name,
                                    token.get_tag(label_name).value + label,
                                    token.get_tag(label_name).score,
                                )

                    # import operator
                    # sorted_x = sorted(all_detected.items(), key=operator.itemgetter(1))
                    # sorted_x.reverse()
                    # print(sorted_x)
                    # for tuple in sorted_x:
                    #     span = tuple[0]
                    #
                    #     tag_this = True
                    #
                    # for token in span:
                    #     corresponding_token = sentence.get_token(token.idx)
                    #     if corresponding_token is None:
                    #         tag_this = False
                    #         continue
                    #     if corresponding_token.get_tag(label_name).value != '' and \
                    #             corresponding_token.get_tag(label_name).score > token.get_tag(label_name).score:
                    #         tag_this = False
                    #         continue
                    #
                    # if tag_this:
                    #     for token in span:
                    #         corresponding_token = sentence.get_token(token.idx)
                    #         corresponding_token.add_tag(
                    #             label_name,
                    #             token.get_tag(label_name).value + span.get_labels('tars_temp_label')[0].value,
                    #             token.get_tag(label_name).score,
                    #         )

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
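
# Hedged usage sketch for the TARSTagger defined above (not part of the original source).
# It relies only on the constructor and predict() shown in this snippet; `corpus` is assumed
# to be an existing flair Corpus with NER annotations:
#
#     from flair.data import Sentence
#
#     tag_dictionary = corpus.make_tag_dictionary(tag_type="ner")
#     tars = TARSTagger(task_name="demo_ner",
#                       tag_dictionary=tag_dictionary,
#                       tag_type="ner",
#                       embeddings="bert-base-uncased")
#
#     sentence = Sentence("George Washington went to Washington .")
#     tars.predict(sentence)
#     print(sentence.to_tagged_string())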
Example #13
        list_end.append(end)
    for i in range(len(list_mention)):
        dic["ner"]["extracted"].append(list_mention[i])
        dic["ner"]["start_offset"].append(list_start[i])
        dic["ner"]["end_offset"].append(list_end[i])
    return dic

def write_json(filename, dic):
    with open(filename, 'w', encoding='utf-8') as json_file:
        json.dump(dic, json_file, ensure_ascii=False)
    return

if __name__ == '__main__':
    test_path = 'input.txt'
    result_path = 'predict.json'
    model = SequenceTagger.load('output/final-model.pt')
    input_sentences = []
    sentences = []
    tags = []

    file = make_wakati(test_path)
    for i in range(len(file)):
        line = file[i]
        input_sentence = Sentence(line)
        model.predict(input_sentence)
        input_sentences.append(input_sentence)
    for line in input_sentences:
        sentence, tag = convert_flair(line.to_tagged_string())
        sentences.append(sentence)
        tags.append(tag)
    predict_dictionary = convert_json(sentences, tags)
Example #14
from flair.models import SequenceTagger
from flair.data import Sentence
import flair, torch
import sys

flair.device = torch.device('cpu')

classifier_model = sys.argv[1]
given_sentence = sys.argv[2]

classifier = SequenceTagger.load_from_file('./' + classifier_model +
                                           '/best-model.pt')

sentence = Sentence(given_sentence)

classifier.predict(sentence)
print(sentence)
#print(sentence.labels)
print(sentence.to_tagged_string())
Example #15

# Likely imports for this snippet (the original excerpt omits them):
import requests
import pandas as pd
from bs4 import BeautifulSoup
from segtok.segmenter import split_single
from flair.data import Sentence
from flair.models import SequenceTagger

def entities_extractor(url):

    res = requests.get(url)
    html_page = res.content

    soup = BeautifulSoup(html_page, 'html.parser')
    text = soup.find_all(text=True)
    set([t.parent.name for t in text])

    output = ''
    blacklist = [
        '[document]',
        'a',
        'article',
        'aside',
        'body',
        'button',
        'clippath',
        'defs',
        'div',
        'figcaption',
        'figure',
        'footer',
        'form',
        'g',
        'h1',
        'h2',
        'head',
        'header',
        'html',
        'label',
        'li',
        'link',
        'meta',
        'nav',
        'noscript',
        'picture',
        'script',
        'section',
        'span',
        'strong',
        'style',
        'svg',
        'time',
        'title',
        'ul',
        # there may be more elements you don't want, such as "style", etc.
    ]

    for t in text:
        if t.parent.name not in blacklist:
            output += '{} '.format(t)

    sentences = [
        Sentence(sent, use_tokenizer=True) for sent in split_single(output)
    ]

    tagger = SequenceTagger.load('ner')
    tagger.predict(sentences)

    li = []
    for i in sentences:
        for entity in i.get_spans('ner'):
            li.append(entity.to_dict())

    df = pd.DataFrame(li)
    df = pd.crosstab(df.text, df.type)
    return df
Example #16
base_url = Path("")
wiki_subfolder = "wiki_2019"

# 1. Input sentences when using Flair.
input_documents = example_preprocessing()

# For Mention detection two options.
# 2. Mention detection, we used the NER tagger, user can also use his/her own mention detection module.
mention_detection = MentionDetection(base_url, wiki_subfolder)

# If you want to use your own MD system, the required input is: {doc_name: [text, spans] ... }.
mentions_dataset, n_mentions = mention_detection.format_spans(input_documents)
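
# Hypothetical illustration of the {doc_name: [text, spans]} shape mentioned above; the
# document name and the exact span convention (character start/length vs. start/end) are
# assumptions, not taken from the library's documentation:
# input_documents = {
#     "doc_0001": ["Obama visited Paris last year.", [(0, 5), (14, 19)]],
# }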

# Alternatively use Flair NER tagger.
tagger_ner = SequenceTagger.load("ner-fast")
mentions_dataset, n_mentions = mention_detection.find_mentions(
    input_documents, tagger_ner)

# 3. Load model.
config = {
    "mode": "eval",
    "model_path": base_url / wiki_subfolder / "generated" / "model",
}
model = EntityDisambiguation(base_url, wiki_subfolder, config)

# 4. Entity disambiguation.
predictions, timing = model.predict(mentions_dataset)

# 5. Optionally use our function to get results in a usable format.
result = process_results(mentions_dataset,
Example #17
 def load_ner_model(self) -> SequenceTagger:
     model = SequenceTagger.load("flair/ner-english-ontonotes-fast")
     return model
Example #18
        labels.append(token.get_tag("ner").value)
    return tokens, labels


def iobes2bio(iobes_labels):
    bio_labels = []
    for label in iobes_labels:
        if label[0] == 'S':
            bio_labels.append('B' + label[1:])
        elif label[0] == 'E':
            bio_labels.append('I' + label[1:])
        else:
            bio_labels.append(label)
    return bio_labels
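
# For example, the conversion above maps:
#   iobes2bio(['S-PER', 'O', 'B-LOC', 'E-LOC'])  ->  ['B-PER', 'O', 'B-LOC', 'I-LOC']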


tagger = SequenceTagger.load(os.path.join(model_folder, 'final-model.pt'))

test_sentences = [x for x in corpus.test]
tagger.predict(test_sentences)
sentences = []
for sentence in test_sentences:
    tokens, labels = get_tokens_and_labels(sentence)
    labels = iobes2bio(labels)
    sentences.append((tokens, labels))
with open(os.path.join(data_folder, 'predict.bio'), 'w') as f:
    for tokens, labels in sentences:
        for token, label in zip(tokens, labels):
            f.write(f'{token}\t{label}\n')
        f.write('\n')
Example #19
# -*- coding: utf-8 -*-
from flair.datasets import CONLL_03
from flair.embeddings import PooledFlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

corpus = CONLL_03(base_path="data/conll-2003")

embedding_types = [
    WordEmbeddings("glove"),
    PooledFlairEmbeddings("news-forward", pooling="min"),
    PooledFlairEmbeddings("news-backward", pooling="min"),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=corpus.make_tag_dictionary(tag_type="ner"),
    tag_type="ner",
)


trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train("models/checkpoints", train_with_dev=True, max_epochs=150)
Example #20
 def __init__(self, word_list, model='ner-ontonotes'):
     super(Vocabulary, self).__init__(word_list)
     self.compiled = None
     self.ner_tagger = SequenceTagger.load(model) if model else None
Example #21
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self._flair_pos_tagger = SequenceTagger.load("pos-fast")
        self._flair_to_lemminflect_pos_map = {"NN": "NOUN", "VB": "VERB", "JJ": "ADJ"}
Example #22
 def load_model(self, path=None):
     if path is None:
         path = model_dir + "heb.sent"
     if not os.path.exists(path):
         raise FileNotFoundError("Cannot find sentence splitter model heb.sent at " +path)
     self.model = SequenceTagger.load(path)
Example #23
            ttl = []
            ttl.append(row[3])
            ttl.append(row[4])
            tl.append(ttl)
    file.close()
    return tl


if __name__ == "__main__":
    ex = "The company also showcased its latest Dynasty series of vehicles, which were recently unveiled at the company’s spring product launch in Beijing"
    ex = "There are a lot of cars in Los Angeles"
    ex = 'BYD quickly debuted it\'s E-SEED GT concept car and Song Pro SUV alongside it\'s all-new e-series models at the Shanghai International Automobile Industry Exhibition'
    ex = "BYD debuted its E-SEED GT concept car and Song Pro SUV alongside its all-new e-series models at the Shanghai International Automobile Industry Exhibition. The company also showcased its latest Dynasty series of vehicles, which were recently unveiled at the company’s spring product launch in Beijing. A total of 23 new car models were exhibited at the event, held at Shanghai’s National Convention and Exhibition Center, fully demonstrating the BYD New Architecture (BNA) design, the 3rd generation of Dual Mode technology, plus the e-platform framework."
    ex = "The Akash eagerly wanted Mehar Sharma's blue coloured jacket, green umbrella of John Sowa, and Ritwik Mishra's big black red jeans"
    ex = "Akash wants umbrella of Mehar"
    tagger = SequenceTagger.load('chunk')
    print(ex)

    # sentence = Sentence('BYD quickly debuted it\'s E-SEED GT concept car and Song Pro SUV alongside it\'s all-new e-series models at the Shanghai International Automobile Industry Exhibition .')
    for x in getPhrases(ex, tagger):
        print(x)
    # print(type(strchunked))

    input('Enter')

    nlp = en_core_web_sm.load()
    doc = nlp(
        'The company also showcased its latest Dynasty series of vehicles, which were recently unveiled at the company’s spring product launch in Beijing'
    )
    pos_tags = [(i, i.tag_) for i in doc]
    print(pos_tags)
Example #24
                i] and pseudo_pred_labels[i] != 'O':
            FP += 1
        else:
            pass

    return TP, FP, FN


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train flair")
    parser.add_argument("--folder", type=str, help="folder to chkp")
    parser.add_argument("--method", choices=['chunk', 'word'])
    args = parser.parse_args()
    args = vars(args)

    tagger = SequenceTagger.load('./flair_models/' + args['folder'] +
                                 '/final-model.pt')
    test_data_f = "score/eng.testb.src"
    test_labels_f = "score/eng.testb.trg"
    TP, FP, FN = 0, 0, 0
    with open(test_data_f, 'r') as f_data,\
         open(test_labels_f, 'r') as f_labels:
        for sent, labels in zip(f_data, f_labels):
            sent = sent[:-1].split(' ')
            labels = labels[:-1].split(' ')
            true_spans = iob_to_chunk(labels)
            sentence = Sentence(' '.join(sent))
            tagger.predict(sentence)
            pred_spans = sentence.get_spans('ner')
            pred_spans = [(r.tag, r.tokens[0].idx - 1, r.tokens[-1].idx - 1)
                          for r in pred_spans]
            if args['method'] == 'chunk':
Example #25
def run_experiment(config):
    print('Active learning strategy:', config.al.strat_name)

    print('Loading task...', config.data.task)
    preprocess = (config.model.model_type == 'crf')
    print(config.data.data_folder)
    X_train, X_test, y_train, y_test, tag_dictionary = load_task(
        config.data.data_folder, config.data.task, config.data.tag_column,
        preprocess)
    print('Done.')

    strat = strategies_to_try(config.al.strat_name)
    model_name = config.model.model_type

    for repeat in range(config.n_repeats):
        print(
            f'######################==Repeat {repeat} ==#####################')

        strat = strategies_to_try(config.al.strat_name)

        model_name = config.model.model_type

        if config.al.percent:
            percent = 0.02
            print('FULL:', len(y_train))
            y_seed = y_train2y_seed_percent(y_train, percent, rpt=repeat)
            selector = [False for _ in range(len(y_seed))]
            for ind, answ in enumerate(y_seed):
                if answ is None:
                    selector[ind] = False
                elif all(e is None for e in y_seed):
                    selector[ind] = False
                else:
                    selector[ind] = True

            y_nonempty = np.array(y_seed)[selector]
            print('2PERCENT:', len(y_nonempty))
            max_samples_number = int(len(y_seed) * percent)
        else:
            y_seed = y_train2y_seed(y_train, rpt=repeat)
            max_samples_number = config.al.max_samples_number

        if 'flair' in config.model.model_type:
            print(config.model.model_type)

            bayes_type = config.model.bayes_type if config.model.bayes else 'no_bayes'
            models_path = os.path.join(
                config.exp_path,
                f'{model_name}_{config.model.emb_name}_{bayes_type}/{config.al.strat_name}'
            )
            os.makedirs(models_path, exist_ok=True)

            if os.path.exists(
                    os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            print('Embeddings', config.model.emb_name)
            emb = get_embeddings(config.model.emb_name)

            tagger = SequenceTagger(hidden_size=config.model.hidden_size,
                                    embeddings=emb(),
                                    tag_dictionary=tag_dictionary,
                                    tag_type=config.data.task,
                                    use_crf=True)
            print(config.model.bayes)
            if config.model.bayes:
                print('BAYES CHOSEN')
                convert_to_mc_dropout(
                    tagger,
                    (nn.Dropout, flair.nn.WordDropout, flair.nn.LockedDropout),
                    option='flair')
                active_tagger = LibActFlairBayes(
                    tagger,
                    base_path=models_path,
                    reset_model_before_train=True,
                    mini_batch_size=config.model.bs,
                    eval_mini_batch_size=config.model.ebs,
                    checkpoint=False,
                    learning_rate=config.model.lr,
                    index_subset=False,
                    save_all_models=False,
                    max_epochs=config.model.n_epochs,
                    min_learning_rate=config.model.min_lr)

                print(active_tagger)

            else:
                active_tagger = LibActFlair(
                    tagger,
                    base_path=models_path,
                    reset_model_before_train=True,
                    mini_batch_size=config.model.bs,
                    eval_mini_batch_size=config.model.ebs,
                    checkpoint=False,
                    learning_rate=config.model.lr,
                    index_subset=False,
                    save_all_models=False,
                    max_epochs=config.model.n_epochs,
                    min_learning_rate=config.model.min_lr)
            fit_model = False

        elif config.model.model_type == 'crf':
            models_path = os.path.join(config.exp_path, model_name)
            os.makedirs(models_path, exist_ok=True)

            if os.path.exists(
                    os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            active_tagger = LibActCrf(algorithm="lbfgs",
                                      c1=0.1,
                                      c2=0.1,
                                      max_iterations=100,
                                      all_possible_transitions=True)
            fit_model = True

        elif config.model.model_type == 'transformers':

            if config.model.bayes:
                libactnn = LibActNNBayes
                bayes_type = config.model.bayes_type
            else:
                libactnn = LibActNN
                bayes_type = 'no_bayes'

            models_path = os.path.join(
                config.exp_path,
                f'{model_name}_{bayes_type}/{config.al.strat_name}')
            print(models_path)

            if os.path.exists(
                    os.path.join(models_path, f'statistics{repeat}.json')):
                print(f'statistics{repeat}.json already exists. Next')
                continue

            index2tag = ['[PAD]'] + tag_dictionary.get_items()
            tag2index = {e: i for i, e in enumerate(index2tag)}
            active_tagger = create_libact_adaptor(tag2index,
                                                  index2tag,
                                                  LibActNN,
                                                  config=config)
            fit_model = False

        active_learn_alg_ctor = make_libact_strategy_ctor(
            lambda tr_ds: strat(tr_ds, active_tagger),
            max_samples_number=config.al.max_samples_number)

        active_learner = ActiveLearner(
            active_learn_alg_ctor=active_learn_alg_ctor,
            y_dtype='str',
            X_full_dataset=X_train,
            y_full_dataset=y_seed,
            X_test_dataset=X_test,
            y_test_dataset=y_test,
            model_evaluate=active_tagger,
            eval_metrics=[f1_score],
            rnd_start_steps=0)

        statistics = emulate_active_learning(
            y_train,
            active_learner,
            max_iterations=config.al.n_iterations,
            fit_model=fit_model)
        dump_file(statistics, models_path, f'statistics{repeat}.json')
Example #26
def delete_pattern_en(term_list):
    total = 0
    deletes = []

    lemmas_list = []
    cont = 0
    cont_inf = 0
    cont_post = 0
    for i in term_list:
        if (len(i) > 1):
            #print( i, i.split(' ') )

            pos_tagger = SequenceTagger.load("flair/pos-english")
            i = Sentence(i)
            # if the lynx one goes down, try this one: https://corenlp.run/
            print('this is i')
            print(i)
            #tag=pos_tagger.tag(i.split(' '))
            pos_tagger.predict(i)  # the sentence must be tagged before its POS spans can be read
            tag = i.get_spans('pos')
            print(tag)
            total = total + 1
            joini = i
            list_pos = []
            #spl=joini.split(' ')
            if (joini != ''):
                join_tag = ''
                for t in tag:
                    if (t[1] == 'AUX'):
                        doc = nlp(t[0])
                        lemlist = [tok.lemma_ for tok in doc]
                        lem = ''.join(lemlist)
                        lemmas_list.append(lem)
                        if (lem == i):
                            lem = t[0]
                        list_pos.append('aux--' + str(lem))
                        if (len(spl) == 1):
                            ind = term_list.index(str(i))
                            term_list[ind] = str(lem)
                    if (t[1] == 'NOUN'):
                        list_pos.append('noun-' + str(t[0]))
                    if (t[1] == 'VERB'):
                        cont_inf = cont_inf + 1
                        doc = nlp(t[0])
                        for tok in doc:
                            l = tok.lemma_
                            if (l != t[0]):
                                cont_post = cont_post + 1
                        lemlist = [tok.lemma_ for tok in doc]
                        lem = ''.join(lemlist)
                        lemmas_list.append(lem)
                        if (lem == i):
                            lem = t[0]
                        list_pos.append('verb-' + str(lem))
                        if (len(spl) == 1):
                            ind = term_list.index(str(i))
                            term_list[ind] = str(lem)
                    if (t[1] == 'ADV'):
                        list_pos.append('adv--' + str(t[0]))
                    if (t[1] == 'ADJ'):
                        list_pos.append('adj--' + str(t[0]))
                    if (t[1] == 'SCONJ'):
                        list_pos.append('sconj' + str(t[0]))

                spl_i = joini.split(' ')

                if (len(list_pos) == 1):
                    pos1 = list_pos[0]
                    if (pos1[0:4] == 'adv-'):
                        term = pos1[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1

                elif (len(list_pos) == 2 and len(spl_i) == 2):
                    pos1 = list_pos[0]
                    pos2 = list_pos[1]
                    term = ''
                    if (pos1[0:4] == 'aux-' and pos2[0:4] == 'verb'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'verb' and pos2[0:4] == 'aux-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'verb' and pos2[0:4] == 'verb'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'noun' and pos2[0:4] == 'verb'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'noun' and pos2[0:4] == 'aux-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'adv-' and pos2[0:4] == 'adj-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'adj-' and pos2[0:4] == 'adv-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'adv-' and pos2[0:4] == 'aux-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'aux-' and pos2[0:4] == 'adv-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'adv-' and pos2[0:4] == 'verb'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'verb' and pos2[0:4] == 'aux-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'noun' and pos2[0:4] == 'adv-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'adv-' and pos2[0:4] == 'noun'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'verb' and pos2[0:4] == 'adv-'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'verb' and pos2[0:4] == 'noun'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'aux-' and pos2[0:4] == 'noun'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1
                    if (pos1[0:4] == 'adj-' and pos2[0:4] == 'noun'):
                        term = pos1[5:] + ' ' + pos2[5:]
                        deletes.append(joini)
                        ind = term_list.index(joini)
                        #term_list.pop(ind)
                        cont = cont + 1

                elif (len(list_pos) == 3 and len(spl_i) == 3):
                    #print(list_pos, spl_i,'-', len(list_pos), len(spl_i))
                    pos1 = list_pos[0]
                    pos2 = list_pos[1]
                    pos3 = list_pos[2]
                    term = ''
                    # Three-token POS-prefix combinations that flag a candidate
                    # term for removal (the original repetitive if-blocks are
                    # collapsed into a single membership test; the duplicated
                    # noun/verb/noun check and the unused term_list.index lookups
                    # are dropped because removal happens in the cleanup loop at
                    # the end of the function).
                    three_token_patterns = {
                        ('noun', 'verb', 'verb'), ('noun', 'aux-', 'verb'),
                        ('noun', 'aux-', 'aux-'), ('noun', 'verb', 'aux-'),
                        ('noun', 'verb', 'noun'), ('noun', 'aux-', 'noun'),
                        ('verb', 'noun', 'noun'), ('noun', 'noun', 'verb'),
                        ('aux-', 'noun', 'noun'), ('noun', 'noun', 'aux-'),
                        ('aux-', 'verb', 'noun'), ('noun', 'verb', 'adj-'),
                        ('verb', 'noun', 'adj-'), ('noun', 'aux-', 'adj-'),
                        ('noun', 'adv-', 'adj-'), ('adj-', 'adv-', 'adj-'),
                        ('noun', 'adv-', 'scon'), ('adj-', 'scon', 'adv-'),
                        ('aux-', 'noun', 'adj-'), ('verb', 'verb', 'verb'),
                        ('adj-', 'noun', 'adj-'),
                    }
                    if (pos1[0:4], pos2[0:4], pos3[0:4]) in three_token_patterns:
                        term = pos1[5:] + ' ' + pos2[5:] + ' ' + pos3[5:]
                        deletes.append(joini)
                        cont = cont + 1

    # remove every term flagged by the POS patterns above from the final list
    for i in deletes:
        if i in term_list:
            term_list.remove(i)

    #elapsed_time=time()-start_time
    #txt='PATRONES, DELETE'+' ('+str(cont)+') NEW LIST SIZE: ('+str(len(term_list))+') TIME: ('+str(elapsed_time)+')'
    joind = ', '.join(deletes)
    #print('PATRONES DELETE', cont, len(term_list), elapsed_time)
    #conts_log.information(txt, 'TERMS REMOVED: '+joind)
    return term_list

def train_sequence_labeling_model(data_folder, proposed_tags_vocabulary_size,
                                  skf_split_no):
    """
    Trains the sequence labeling model with 2 RNN layers instead of 1.
    The model is trained to predict the part-of-speech tag and takes into account information about:
    - text (plain text made of tokens that together form a sentence),
    - occurrence of a separator before the token,
    - proposed tags for the given token.
    It is trained with Stacked Embeddings, which combine different embeddings together. Words are embedded
    using a concatenation of two vector embeddings:
    - Flair Embeddings - contextual string embeddings that capture latent syntactic-semantic
      information that goes beyond standard word embeddings. Key differences are: (1) they are trained without any
      explicit notion of words and thus fundamentally model words as sequences of characters, and (2) they are
      contextualized by their surrounding text, meaning that the same word will have different embeddings depending on
      its contextual use.
      A forward model (which reads the input plain text from left to right) and a backward model (which reads it from
      right to left) are both used for part-of-speech (POS) tag training.
    - One Hot Embeddings - embeddings that encode each word in a vocabulary as a one-hot vector, followed by an
      embedding layer. These embeddings thus do not encode any prior knowledge, as most other embeddings do. They also
      differ in that they need to see a Corpus during instantiation, so they can build up a vocabulary consisting of
      the most common words seen in the corpus, plus an UNK token for all rare words.
      Two One Hot Embeddings are used in training:
      - the first embeds information about the occurrence of a separator before the token,
      - the second embeds information about the proposed tags, concatenated with a ';'.
    Model training is based on the stratified 10-fold cross-validation split indicated by the skf_split_no argument
    (a minimal sketch of how such split files might be produced is shown after this function).
    The model and training logs are saved in the resources_ex_4/taggers/example-pos/it-<skf_split_no> directory (where
    <skf_split_no> is the number of the stratified 10-fold cross-validation split used to train the model).
    In this method, the internal states of the forward and backward Flair models are taken at the end of each token
    and, supplemented by information about the occurrence of a separator before the token and the proposed tags for
    the given token, are used to train the model for one of the stratified 10-fold cross-validation splits.
    Additionally, the method writes further training log files to the resources_ex_4 directory of this project
    under the name training_ex_4_<skf_split_no>.log.

    :param data_folder: folder where the column-format corpus split files are stored; those columns are used to
    initialize the ColumnCorpus object
    :param proposed_tags_vocabulary_size: number of proposed tags
    :param skf_split_no: number indicating which of the stratified 10-fold cross-validation splits (in the range 1 to
    10) is used to train the model
    """
    # define columns
    columns = {0: 'text', 1: 'pos', 2: 'is_separator', 3: 'proposed_tags'}
    # init a corpus using column format, data folder and the names of the train and test files
    # 1. get the corpus
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='train_' + str(skf_split_no),
                                  test_file='test_' + str(skf_split_no),
                                  dev_file=None)
    log.info(corpus)
    # 2. what tag do we want to predict
    tag_type = 'pos'
    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    log.info(tag_dictionary)
    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        FlairEmbeddings('pl-forward', chars_per_chunk=64),
        FlairEmbeddings('pl-backward', chars_per_chunk=64),
        OneHotEmbeddings(corpus=corpus,
                         field='is_separator',
                         embedding_length=3,
                         min_freq=3),
        OneHotEmbeddings(corpus=corpus,
                         field='proposed_tags',
                         embedding_length=math.ceil(
                             (proposed_tags_vocabulary_size + 1)**0.25),
                         min_freq=3)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)
    # 5. initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=False,
                                            rnn_layers=2)
    # 6. initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    # 7. start training
    trainer.train(
        use_scratch_dir_if_available('resources_ex_4/taggers/example-pos/it-' +
                                     str(skf_split_no)),
        learning_rate=0.1,
        mini_batch_size=32,
        embeddings_storage_mode='gpu',
        max_epochs=sys.maxsize,
        monitor_test=True)
    # 8. plot weight traces (optional)
    plotter = Plotter()
    plotter.plot_weights(
        use_scratch_dir_if_available('resources_ex_4/taggers/example-pos/it-' +
                                     str(skf_split_no) + '/weights.txt'))
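
# Note: the train_<skf_split_no>/test_<skf_split_no> column files expected in
# data_folder are not produced by the example above. The sketch below shows one
# way such stratified splits could be generated with scikit-learn's
# StratifiedKFold; write_skf_splits, sentences and sentence_labels are
# hypothetical names and are not part of the original code.
import os

from sklearn.model_selection import StratifiedKFold


def write_skf_splits(sentences, sentence_labels, data_folder, n_splits=10):
    # sentences: list of sentences, each a list of
    #            (text, pos, is_separator, proposed_tags) token rows
    # sentence_labels: one stratification label per sentence
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    for split_no, (train_idx, test_idx) in enumerate(
            skf.split(sentences, sentence_labels), start=1):
        for prefix, indices in (('train_', train_idx), ('test_', test_idx)):
            with open(os.path.join(data_folder, prefix + str(split_no)),
                      'w', encoding='utf-8') as out:
                for i in indices:
                    for text, pos, is_separator, proposed_tags in sentences[i]:
                        # one token per line, whitespace-separated columns
                        out.write('{} {} {} {}\n'.format(
                            text, pos, is_separator, proposed_tags))
                    # a blank line terminates a sentence in column format
                    out.write('\n')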
Ejemplo n.º 28
0
from flair.data import Sentence
from flair.models import SequenceTagger

# convert text file into String
with open(
        r'/home/pia/Uni/5.Semester/Textmining/Satz-Reduktion/satz-reduktion/Datensatz/Saetze_clean_flair.txt',
        "r") as myfile:
    data = myfile.read().replace('\n', ' ')

# make a sentence
sentence = Sentence(data, use_tokenizer=True)

# load the NER tagger
tagger = SequenceTagger.load('de-ner')

# run NER over sentence
tagger.predict(sentence)

# save tagged sentence into a String
tagged = sentence.to_tagged_string()

# save String into text file (use a context manager so the file is closed properly)
with open(
        '/home/pia/Uni/5.Semester/Textmining/Satz-Reduktion/satz-reduktion/Datensatz/saetze_clean_getagged_flair.txt',
        'w') as file:
    file.writelines(tagged)

print(sentence)
print('The following NER tags are found:')

# iterate over entities and print spans
for entity in sentence.get_spans('ner'):
    print(entity)
Ejemplo n.º 29
0
corpus: Corpus = ColumnCorpus(data_folder,
                              columns,
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='valid.txt')
print(corpus)
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)

if ARGS.restore:
    ensemble_tagger = EnsembleTagger.load(model_path + "final-model.pt")
else:
    elmo_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=ELMoEmbeddings('small'),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    bert_tagger = SequenceTagger(hidden_size=256,
                                 embeddings=BertEmbeddings(),
                                 tag_dictionary=tag_dictionary,
                                 tag_type=tag_type,
                                 use_crf=True)
    xlnet_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=XLNetEmbeddings(),
                                  tag_dictionary=tag_dictionary,
                                  tag_type=tag_type,
                                  use_crf=True)
    flair_tagger = SequenceTagger(hidden_size=256,
                                  embeddings=StackedEmbeddings([
                                      FlairEmbeddings('news-forward'),
Ejemplo n.º 30
0
from flair.data import Sentence
from flair.models import SequenceTagger

# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = SequenceTagger.load('ner')

# run NER over sentence
tagger.predict(sentence)
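
# The original snippet ends after tagger.predict(); the lines below are an
# added illustration (not part of the original example) of how the result
# could be inspected with Flair's standard API.
print(sentence.to_tagged_string())
for entity in sentence.get_spans('ner'):
    print(entity)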
Ejemplo n.º 31
0
        for ppn in ppnDirs:
            for dirpath, dirnames, files in os.walk(sbbGetBasePath + ppn):
                for name in files:
                    if dirpath.endswith("_FULLTEXT"):
                        # if we found a fulltext directory, only add XML files, i.e., the ALTO candidate files
                        if name.endswith(".xml") or name.endswith(".XML"):
                            fulltextFilePaths.append(
                                os.path.join(dirpath, name))
                            dirsPerPPN[ppn].append(os.path.join(dirpath, name))

        totalFiles = len(fulltextFilePaths)
        printLog("Found %i ALTO candidate files for further processing." %
                 totalFiles)

        if useFlairNLP:
            nerModel = SequenceTagger.load(flairModel)
            print("Flair model loaded.")

        processCounter = 0
        for ppn in dirsPerPPN:
            textPerPPN = ""
            nerTextPerPPN = ""
            nerDicts = []
            print("Processing PPN: " + ppn)
            for file in dirsPerPPN[ppn]:
                processCounter += 1
                print(
                    "\tProcessing file %i of %i (total files over all PPNs)" %
                    (processCounter, totalFiles))

                r = parseALTO(file)