def load_task(data_folder, task, tag_column, preprocess):
    """Load the train/test CoNLL files for *task* and build feature/label lists.

    Reads ``{data_folder}/{task}/{train,test}.txt`` with flair's ColumnDataset
    (column 0 = text, *tag_column* = ner) and returns
    ``(X_train, X_test, y_train, y_test, tag_dictionary)`` where each X entry
    is a token list — or, when *preprocess* is true, a list of (token, POS)
    pairs — and *tag_dictionary* collects every NER tag seen plus the
    <START>/<STOP> markers.
    """
    features = {'train': [], 'test': []}
    labels = {'train': [], 'test': []}

    tag_dictionary = Dictionary()
    tag_dictionary.add_item('<START>')
    tag_dictionary.add_item('<STOP>')

    for part in ('train', 'test'):
        file_path = Path(f'{data_folder}/{task}/{part}.txt')
        print('Loading: ', file_path)
        corpus = ColumnDataset(
            path_to_column_file=file_path,
            column_name_map={0: 'text', tag_column: 'ner'},
            tag_to_bioes=None,
            encoding='utf8',
            comment_symbol=None,
            in_memory=True,
            document_separator_token=None,
        )
        for sentence in corpus:
            tokens = [token.text for token in sentence]
            if preprocess:
                # NOTE(review): each token is POS-tagged in isolation, so the
                # tagger sees no sentence context — confirm this is intended
                # (nltk.pos_tag(tokens) would tag the whole sentence at once).
                pos_tags = [nltk.pos_tag([token])[0][1] for token in tokens]
                features[part].append(list(zip(tokens, pos_tags)))
            else:
                features[part].append(tokens)
            sentence_tags = [token.get_tag('ner').value for token in sentence]
            labels[part].append(sentence_tags)
            for tag in sentence_tags:
                tag_dictionary.add_item(tag)

    print('Train size:', len(features['train']))
    print('Test size:', len(features['test']))
    return (features['train'], features['test'],
            labels['train'], labels['test'], tag_dictionary)
def test_sentences(self):
    """Sentences parsed from the CoNLL file must match those built from the CAS."""
    column_name = {0: "text", 1: "ner"}
    corpus = ColumnDataset(Path("tests/test_n.txt"), column_name)
    # Keep only the sentences that actually carry at least one NER span.
    tagged_sentences = [sentence for sentence in corpus
                        if sentence.get_spans("ner")]

    with open('tests/test.json', 'r') as json_file:
        json_object = json.load(json_file)
    document = json_object["documents"][0]
    typesystem = load_typesystem(json_object["typeSystem"])
    cas = load_cas_from_xmi(document['xmi'], typesystem=typesystem)
    expected_sentences = flair_train_ner_dataset(cas)

    for actual, expected in zip(tagged_sentences, expected_sentences):
        self.assertEqual(actual.to_tagged_string("ner"),
                         expected.to_tagged_string("ner"))
def create_flair_corpus(conll_tagged: str):
    """Build a flair ColumnDataset from a CoNLL-formatted string.

    ColumnDataset only reads from disk, so the string is written to a
    temporary file first, parsed with columns text/ner/start_pos/end_pos,
    and the parsed character offsets are copied onto each token. The
    temporary file is always removed, even on failure.

    :param conll_tagged: CoNLL content, four columns per token:
                         text, NER tag, start offset, end offset
    :return: the populated ColumnDataset
    """
    # Local import: only this function needs it, file-level deps unchanged.
    import tempfile

    # BUG FIX: the previous code wrote to a hand-built, content-hashed path
    # under a hardcoded /tmp — non-portable and race-prone when two processes
    # handle identical content. NamedTemporaryFile gives a unique path.
    tmp = tempfile.NamedTemporaryFile("w", suffix=".conll",
                                      encoding="utf-8", delete=False)
    temp_conll_file = Path(tmp.name)
    try:
        with tmp:
            tmp.write(conll_tagged)
        flair_corpus = ColumnDataset(
            path_to_column_file=temp_conll_file,
            column_name_map={0: 'text', 1: 'ner',
                             2: 'start_pos', 3: 'end_pos'})
        for sentence in flair_corpus.sentences:
            # NOTE(review): assumes the start_pos/end_pos span lists align
            # one-to-one with sentence.tokens — confirm for multi-token spans.
            for (token, start_pos_span, end_pos_span) in zip(
                    sentence.tokens,
                    sentence.get_spans("start_pos"),
                    sentence.get_spans("end_pos")):
                token.start_pos = int(start_pos_span.tag)
                token.end_pos = int(end_pos_span.tag)
        return flair_corpus
    finally:
        temp_conll_file.unlink()
def prepare_error_pane():
    """Build the Dash components showing the errors of each trained model.

    The input is a set of CoNLL annotated files (three columns: token,
    true tag, predicted tag). The output is a list of lists of components
    describing the content of the CoNLL documents with P (paragraphs) and
    Marks (highlights), together with a list of dicts holding the
    error/correct stats of each model and a fixed list of per-dataset
    size statistics.

    Returns:
    """
    dict_df, data_stats = prepare_error_decisions(Path(TEXT_FILES))
    list_stats_datasets = [(2, 219, 62), (3, 270, 91), (12, 1112, 396),
                           (20, 1382, 296), (37, 2256, 804), (42, 2622, 679),
                           (58, 3982, 1112), (109, 5516, 1298)]

    # Dump each per-model DataFrame to a tab-separated file flair can read.
    files_paths = []
    list_stats_errors = []
    for file_name in ORDERED_FILENAMES:
        frame: DataFrame = dict_df[file_name]
        csv_path = f"/tmp/{file_name[:-4]}.csv"
        frame.to_csv(csv_path, sep="\t", header=None, index=None)
        files_paths.append(csv_path)
        list_stats_errors.append(data_stats[file_name])

    # Re-load the dumps as flair datasets and attach span positions.
    flair_datasets = []
    for csv_path in files_paths:
        dataset = ColumnDataset(path_to_column_file=Path(csv_path),
                                column_name_map={0: 'text', 1: 'ner'})
        add_span_positions_to_dataset(dataset)
        flair_datasets.append(dataset)

    html_components = [generate_errors_tab_html_components(dataset)
                       for dataset in flair_datasets]
    return html_components, list_stats_errors, list_stats_datasets
def ColumnCorpusTrain(data_folder: Union[str, Path],
                      column_format: Dict[int, str],
                      train_file=None,
                      tag_to_bioes=None,
                      in_memory: bool = True,
                      eval_part: int = 0,
                      min_occur=0.10):
    """
    Instantiates 10 cross-validation Corpora from CoNLL column-formatted task
    data such as CoNLL03 or CoNLL2000.

    The train file is loaded, shuffled document-wise, and split into 10 folds;
    for every fold index a (train, dev, test) triple is built. The shuffle is
    retried until, for every fold, both the dev and the test part contain at
    least a ``min_occur`` fraction of sentences carrying a non-"O" NER tag.

    :param data_folder: base folder with the task data
    :param column_format: a map specifying the column format
    :param train_file: the name of the train file
    :param tag_to_bioes: whether to convert to BIOES tagging scheme
    :param in_memory: keep the dataset in memory
    :param eval_part: must lie in 0-10 (validated only; all 10 folds are built)
    :param min_occur: minimum fraction of NER-tagged sentences per dev/test part
    :return: a list of 10 Corpus objects, one per fold
    :raises ValueError: if eval_part is outside the range 0-10
    """
    if eval_part < 0 or eval_part > 10:
        # BUG FIX: previously print + exit(0), which killed the whole process
        # while reporting *success*; raise a proper error instead.
        raise ValueError("eval part must be in range 0-10")

    if isinstance(data_folder, str):
        data_folder = Path(data_folder)

    if train_file is not None:
        train_file = data_folder / train_file

    # get train data
    train = ColumnDataset(
        train_file,
        column_format,
        tag_to_bioes,
        in_memory=in_memory,
    )

    def _count_tagged(sentences):
        """Count sentences with at least one token whose NER tag is not 'O'."""
        count = 0
        for sent in sentences:
            # BUG FIX: get_tag() returns a Label object, so the original
            # comparison `get_tag("ner") != "O"` was always true and every
            # split passed the check; compare .value (same access pattern
            # as load_task).
            if any(tok.get_tag("ner").value != "O" for tok in sent.tokens):
                count += 1
        return count

    # Re-shuffle until every fold's dev/test has enough tagged sentences.
    # NOTE(review): this loops forever if the data can never satisfy
    # min_occur — consider a retry cap.
    good = True
    while good:
        print("looking for split..")
        train = shuffleDoc(train)
        tab_good = []
        tab_train = []
        tab_test = []
        tab_dev = []
        # BUG FIX: loop variable used to shadow the `eval_part` parameter.
        for fold in range(0, 10):
            train_length = len(train)
            dev_size: int = round(train_length / 10)
            test_size: int = round(train_length / 10)
            start_dev = dev_size * fold
            print(dev_size, test_size, start_dev)
            dev = train[start_dev:start_dev + dev_size]
            if fold < 9:
                test = train[start_dev + dev_size:
                             start_dev + dev_size + test_size]
                train_ = (train[:start_dev]
                          + train[start_dev + dev_size + test_size:])
            else:
                # Last fold: dev takes the tail, test wraps to the head.
                dev = train[start_dev:]
                test = train[:test_size]
                train_ = train[test_size:start_dev]
            tab_dev.append(dev)
            tab_test.append(test)
            tab_train.append(train_)

            test_count = _count_tagged(test)
            dev_count = _count_tagged(dev)
            print(dev_count, test_count,
                  min_occur * len(dev), min_occur * len(test))
            if (dev_count >= min_occur * len(dev)
                    and test_count >= min_occur * len(test)):
                tab_good += [0]
            else:
                tab_good += [1]
        # Retry the whole shuffle if any fold failed the occurrence check.
        good = any(flag == 1 for flag in tab_good)

    corpus = []
    for i in range(len(tab_train)):
        corpus.append(Corpus(tab_train[i], tab_dev[i], tab_test[i]))
    return corpus