Code example #1
def extra_attr_collector(input_files, extra_attr):
    # Collect every value that the given extra attribute takes across
    # all sentences in the input CoNLL-X files.
    all_tag_set = set()
    for sentence in conllx_iterator_reader(input_files):
        offset_sentence, _ = conllx_to_offset(sentence)
        tag_set = {offset_sentence.extra_attr[extra_attr]}

        all_tag_set.update(tag_set)

    return all_tag_set
Code example #2
def label_collector(input_files, tag_index=0):
    # Collect the set of sentence-level labels found in the input files.
    # Note: tag_index is unused in this excerpt.
    all_tag_set = set()
    for sentence in conllx_iterator_reader(input_files):
        offset_sentence, _ = conllx_to_offset(sentence)
        tag_set = {offset_sentence.label}

        all_tag_set.update(tag_set)

    return all_tag_set
Code example #3
def entity_collector(input_files, tag_index=0):
    # Collect the set of entity types appearing in any span of any sentence.
    # Note: tag_index is unused in this excerpt.
    all_tag_set = set()
    for sentence in conllx_iterator_reader(input_files):
        offset_sentence, _ = conllx_to_offset(sentence)
        tag_set = {i.entity for i in offset_sentence.span_set}

        all_tag_set.update(tag_set)

    return all_tag_set
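
A minimal usage sketch for the three collector helpers above, assuming the conllx_iterator_reader import used in the later examples, that conllx_to_offset is importable from tokenizer_tools, and a hypothetical ./data/all_data.conllx corpus whose sentences carry a 'domain' extra attribute (the attribute name is an assumption, not part of the original):

# Hypothetical driver for the collectors above; the file path and the
# 'domain' attribute name are illustrative assumptions.
input_files = ['./data/all_data.conllx']

domains = extra_attr_collector(input_files, 'domain')  # values of one extra attribute
labels = label_collector(input_files)                  # sentence-level labels
entities = entity_collector(input_files)               # entity types found in spans

print(sorted(domains), sorted(labels), sorted(entities))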
Code example #4
File: split_data.py  Project: shfshf/data_tools
#!/usr/bin/env python

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.split_data import split_data
from tokenizer_tools.conllz.writer import write_conllx

data = list(conllx_iterator_reader(['./data/all_data.conllx']))
train, dev, test = split_data(data)

with open('./data/train.conllx', 'wt') as fd:
    write_conllx(train, fd)

with open('./data/dev.conllx', 'wt') as fd:
    write_conllx(dev, fd)

with open('./data/test.conllx', 'wt') as fd:
    write_conllx(test, fd)

Code example #5
#!/usr/bin/env python

import os
import pathlib

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.conllz.writer import write_conllx

current_dir = os.path.dirname(os.path.abspath(__file__))  # unused in the rest of this script

# Merge every file under ./data/domain into a single corpus.
input_file_list = [
    str(i) for i in pathlib.Path('./data/domain').iterdir() if i.is_file()
]

data = list(conllx_iterator_reader(input_file_list))

with open('./data/all.conllx', 'wt') as fd:
    write_conllx(data, fd)
Code example #6
File: split_data.py  Project: shfshf/data_expend
import pathlib
import json
from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.split_data import split_data
from tokenizer_tools.conllz.writer import write_conllx

# Read the mapping; if it is non-empty, use the expanded corpus,
# otherwise fall back to the original full corpus.
with open("./data/mapping.json", 'r', encoding='UTF-8') as f:
    map_list = json.load(f)

# path = pathlib.Path('./data/mapping.json')
# if path.exists():
if map_list:
    input_files = ['./data/expend/data_expend.conllx']
else:
    input_files = ['./data/all_data.conllx']

data = list(conllx_iterator_reader(input_files))
train, dev, test = split_data(data)

with open('./data/final/train.conllx', 'wt') as fd:
    write_conllx(train, fd)

with open('./data/final/dev.conllx', 'wt') as fd:
    write_conllx(dev, fd)

with open('./data/final/test.conllx', 'wt') as fd:
    write_conllx(test, fd)


Code example #7
File: data_split.py  Project: shfshf/data_tools
#!/usr/bin/env python

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.split_data import split_data
from tokenizer_tools.conllz.writer import write_conllx

data = list(conllx_iterator_reader(['./data/导航.conllx']))
train, dev, test = split_data(data)

with open('./data/train.conllx', 'wt') as fd:
    write_conllx(train, fd)

with open('./data/dev.conllx', 'wt') as fd:
    write_conllx(dev, fd)

with open('./data/test.conllx', 'wt') as fd:
    write_conllx(test, fd)

Code example #8
#!/usr/bin/env python

import datetime
import json

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader

data = list(conllx_iterator_reader(["data/all_data.conllx"]))
train_data = list(conllx_iterator_reader(["data/final/train.conllx"]))
dev_data = list(conllx_iterator_reader(["data/final/dev.conllx"]))
test_data = list(conllx_iterator_reader(["data/final/test.conllx"]))

# Record the size of each split and the creation time.
metadata = {
    "data": {
        "whole_data_size": len(data),
        "train_data_size": len(train_data),
        "dev_data_size": len(dev_data),
        "test_data_size": len(test_data),
    },
    "create_time": datetime.datetime.now().isoformat(),
}

with open("data/final/metadata.json", "wt") as fd:
    json.dump(metadata, fd)
Code example #9
from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader

def test_conllx_iterator_reader():
    # Smoke test: iterate over the fixture file and print each sentence.
    for i in conllx_iterator_reader(['corpus1.txt']):
        print(i)
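
A minimal sketch of a stricter variant of the test above, assuming the same corpus1.txt fixture exists and yields at least one sentence (the fixture contents and expected count are assumptions, not part of the original):

def test_conllx_iterator_reader_yields_sentences():
    # Hypothetical stricter check: materialize the iterator and assert that
    # the fixture produced at least one sentence instead of only printing.
    sentences = list(conllx_iterator_reader(['corpus1.txt']))
    assert len(sentences) > 0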