from pathlib import Path

import nltk
from flair.data import Dictionary
from flair.datasets import ColumnDataset


def load_task(data_folder, task, tag_column, preprocess):
    X = {'train': [], 'test': []}
    y = {'train': [], 'test': []}
    tag_dictionary = Dictionary()
    tag_dictionary.add_item('<START>')
    tag_dictionary.add_item('<STOP>')

    for part in ('train', 'test'):
        file_path = Path(f'{data_folder}/{task}/{part}.txt')
        print('Loading:', file_path)

        corpus = ColumnDataset(
            path_to_column_file=file_path,
            column_name_map={
                0: 'text',
                tag_column: 'ner'
            },
            tag_to_bioes=None,
            encoding='utf8',
            comment_symbol=None,
            in_memory=True,
            document_separator_token=None,
        )

        for sent in corpus:
            tokens = [w.text for w in sent]
            if preprocess:
                # Tag the whole sentence at once so the POS tags use sentence context;
                # nltk.pos_tag already returns a list of (token, pos_tag) pairs.
                X[part].append(nltk.pos_tag(tokens))
            else:
                X[part].append(tokens)

            labels = [w.get_tag('ner').value for w in sent]
            y[part].append(labels)

            for tag in labels:
                tag_dictionary.add_item(tag)

    print('Train size:', len(X['train']))
    print('Test size:', len(X['test']))

    return X['train'], X['test'], y['train'], y['test'], tag_dictionary
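
A minimal usage sketch for load_task; the data folder, task name and NER column index below are assumptions for illustration, not part of the original snippet:

# Hypothetical call: expects data/conll03/train.txt and data/conll03/test.txt,
# with the token in column 0 and the NER tag in column 1.
X_train, X_test, y_train, y_test, tag_dict = load_task(
    data_folder='data', task='conll03', tag_column=1, preprocess=False)
print(len(X_train), 'train sentences,', len(X_test), 'test sentences')
print('Tags:', tag_dict.get_items())
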
    def test_sentences(self):
        column_name = {0: "text", 1: "ner"}
        corpus = ColumnDataset(Path("tests/test_n.txt"), column_name)
        # Keep only the sentences that carry at least one NER span
        list_s = []
        for sentence in corpus:
            if len(sentence.get_spans("ner")) > 0:
                list_s.append(sentence)

        with open('tests/test.json', 'r') as json_file:
            json_object = json.load(json_file)
        document = json_object["documents"][0]
        typesystem = load_typesystem(json_object["typeSystem"])
        cas = load_cas_from_xmi(document['xmi'], typesystem=typesystem)
        list_sentences = flair_train_ner_dataset(cas)
        for sentence, t_sentence in zip(list_s, list_sentences):
            self.assertEqual(sentence.to_tagged_string("ner"),
                             t_sentence.to_tagged_string("ner"))
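
For reference, the test above reads a two-column CoNLL file (token and NER tag); a small sketch that writes such a file, with made-up contents, could look like this:

# Hypothetical helper producing a tiny two-column CoNLL file for the test above;
# the tokens and tags are illustrative only.
def write_sample_conll(path="tests/test_n.txt"):
    rows = [("George", "B-PER"), ("Washington", "I-PER"), ("went", "O"),
            ("to", "O"), ("Washington", "B-LOC"), (".", "O")]
    with open(path, "w", encoding="utf8") as f:
        for token, tag in rows:
            f.write(f"{token} {tag}\n")
        f.write("\n")  # a blank line terminates the sentence
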
Example n. 3
from hashlib import md5
from pathlib import Path

from flair.datasets import ColumnDataset


def create_flair_corpus(conll_tagged: str):
    # Parse a CoNLL-formatted string into a flair ColumnDataset and copy the
    # character offsets from columns 2/3 onto each token.
    text_id = md5(conll_tagged.encode("utf-8")).hexdigest()
    temp_conll_file = Path(f"/tmp/{text_id}")
    try:
        with open(temp_conll_file, "w") as temp_file:
            temp_file.write(conll_tagged)

        flair_corpus = ColumnDataset(path_to_column_file=temp_conll_file,
                                     column_name_map={0: 'text', 1: 'ner',
                                                      2: 'start_pos', 3: 'end_pos'})
        for sentence in flair_corpus.sentences:
            for (token, start_pos_span, end_pos_span) in zip(sentence.tokens, sentence.get_spans("start_pos"),
                                                             sentence.get_spans("end_pos")):
                token.start_pos = int(start_pos_span.tag)
                token.end_pos = int(end_pos_span.tag)

        return flair_corpus
    finally:
        # Remove the temporary file even if writing or parsing failed
        if temp_conll_file.exists():
            temp_conll_file.unlink()
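
A small usage sketch for create_flair_corpus; the four-column CoNLL string (token, NER tag, start offset, end offset) is made up for illustration:

# Hypothetical input in the four-column format the function expects
sample = ("Paris B-LOC 0 5\n"
          "is O 6 8\n"
          "nice O 9 13\n"
          "\n")
corpus = create_flair_corpus(sample)
for sentence in corpus.sentences:
    print(sentence.to_tagged_string("ner"),
          [(t.text, t.start_pos, t.end_pos) for t in sentence.tokens])
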
Example n. 4
def prepare_error_pane():
    """
    This function generates the list of Dash components to show the errors produced while tagging a
    document by several trained models. The inout is a set of CoNLL annotated files (three columns: token,
    true tag, predicted tag). The output is a list of list of components describing the content of the CoNLL documents
    with P (paragraphs) and Marks (highlights). It also returns a list of dicts that contain the stats of the errors/corrects
    done by the corresponding model.

    Returns:

    """
    dict_df, data_stats = prepare_error_decisions(Path(TEXT_FILES))
    list_stats_datasets = [(2, 219, 62), (3, 270, 91), (12, 1112, 396),
                           (20, 1382, 296), (37, 2256, 804), (42, 2622, 679),
                           (58, 3982, 1112), (109, 5516, 1298)]

    files_paths = []
    list_stats_errors = []
    for file_o in ORDERED_FILENAMES:
        df: DataFrame = dict_df[file_o]
        df.to_csv(f"/tmp/{file_o[:-4]}.csv", sep="\t", header=False, index=False)
        files_paths.append(f"/tmp/{file_o[:-4]}.csv")
        list_stats_errors.append(data_stats[file_o])

    flair_datasets = []
    for path in files_paths:
        temp_set = ColumnDataset(path_to_column_file=Path(path),
                                 column_name_map={
                                     0: 'text',
                                     1: 'ner'
                                 })
        add_span_positions_to_dataset(temp_set)
        flair_datasets.append(temp_set)

    html_components = []
    for flair_dataset in flair_datasets:
        html_components.append(
            generate_errors_tab_html_components(flair_dataset))
    return html_components, list_stats_errors, list_stats_datasets
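
add_span_positions_to_dataset is a project-specific helper that is not shown in this example; a plausible sketch of what it might do, assuming tokens joined by single spaces, is:

# Hypothetical sketch, not the original implementation: assign character offsets
# to every token, assuming one space between consecutive tokens.
def add_span_positions_to_dataset(dataset):
    for sentence in dataset.sentences:
        offset = 0
        for token in sentence.tokens:
            token.start_pos = offset
            token.end_pos = offset + len(token.text)
            offset = token.end_pos + 1
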
Example n. 5
from pathlib import Path
from typing import Dict, Union

from flair.data import Corpus
from flair.datasets import ColumnDataset

# Note: shuffleDoc is a project-specific helper (not shown here) that must be in scope.


def ColumnCorpusTrain(data_folder: Union[str, Path],
                      column_format: Dict[int, str],
                      train_file=None,
                      tag_to_bioes=None,
                      in_memory: bool = True,
                      eval_part: int = 0,
                      min_occur=0.10):
    """
    Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000.
    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param train_file: the name of the train file
    :param test_file: the name of the test file
    :param dev_file: the name of the dev file, if None, dev data is sampled from train
    :param tag_to_bioes: whether to convert to BIOES tagging scheme
    :return: a Corpus with annotated train, dev and test data
    """

    if eval_part < 0 or eval_part > 10:
        raise ValueError("eval_part must be in the range 0-10")

    if isinstance(data_folder, str):
        data_folder = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file

    # get train data
    train = ColumnDataset(
        train_file,
        column_format,
        tag_to_bioes,
        in_memory=in_memory,
    )

    # Keep reshuffling the documents until every one of the 10 folds has dev and test
    # splits with enough NER-annotated sentences (at least min_occur * split size)

    good = True

    while good:
        print("looking for split..")
        train = shuffleDoc(train)

        tab_good = []
        tab_train = []
        tab_test = []
        tab_dev = []
        for eval_part in range(0, 10):

            train_length = len(train)
            dev_size: int = round(train_length / 10)
            test_size: int = round(train_length / 10)
            start_dev = dev_size * eval_part

            print(dev_size, test_size, start_dev)

            dev = train[start_dev:start_dev + dev_size]
            if eval_part < 9:
                test = train[start_dev + dev_size:start_dev + dev_size +
                             test_size]
                train_ = train[:start_dev] + train[start_dev + dev_size +
                                                   test_size:]
            else:
                dev = train[start_dev:]
                test = train[:test_size]
                train_ = train[test_size:start_dev]

            tab_dev.append(dev)
            tab_test.append(test)
            tab_train.append(train_)

            # Count test sentences that contain at least one annotated (non-O) token
            test_count = 0
            for t in test:
                done = False
                for tok in t.tokens:
                    if tok.get_tag("ner").value != "O":
                        done = True
                if done:
                    test_count += 1
            dev_count = 0
            for t in dev:
                done = False
                for tok in t.tokens:
                    if tok.get_tag("ner") != "O":
                        done = True
                if done:
                    dev_count += 1

            print(dev_count, test_count, min_occur * len(dev),
                  min_occur * len(test))
            if dev_count >= min_occur * len(
                    dev) and test_count >= min_occur * len(test):
                tab_good += [0]
            else:
                tab_good += [1]

        # Reshuffle again if any fold failed the min_occur check (1 = failed, 0 = passed)
        good = any(t == 1 for t in tab_good)

    # Build one Corpus per fold from the accepted split
    corpus = []
    for i in range(len(tab_train)):
        corpus.append(Corpus(tab_train[i], tab_dev[i], tab_test[i]))

    return corpus
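
A usage sketch for ColumnCorpusTrain; the folder, file name and column map below are assumptions, and the project-specific shuffleDoc helper must be in scope:

# Hypothetical call: builds one Corpus per cross-validation fold
corpora = ColumnCorpusTrain('resources/tasks/my_ner',
                            {0: 'text', 1: 'ner'},
                            train_file='train.txt',
                            min_occur=0.10)
for fold, corpus in enumerate(corpora):
    print(f'Fold {fold}:', len(corpus.train), len(corpus.dev), len(corpus.test))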