Example #1
0
def main(file_prefix):
    """Clean one raw corpus file and emit it as CoNLL-X.

    Reads ``./data/raw/<file_prefix>`` line by line, runs each non-empty
    line through ``process_one_line``, and writes the surviving sentences
    to ``./data/domain/<base_name>.conllx``.  Lines that fail validation
    are skipped; ``process_one_line`` receives the error-log handle
    (``./data/error/<base_name>.error``) so it can record the failure.

    Args:
        file_prefix: file name (with extension) of the raw input under
            ``./data/raw/``.
    """
    base_name, _ = os.path.splitext(file_prefix)

    log_file = './data/error/{}.error'.format(base_name)

    with open('./data/raw/{}'.format(file_prefix)) as fd, open(log_file,
                                                               'wt') as logger:
        output_lines = []
        for raw_line in fd:
            line = raw_line.strip()
            # Skip blank lines between sentences.
            if not line:
                continue

            try:
                # process_one_line returns (seq, sentence); only the
                # sentence is used downstream, so the seq is discarded.
                _, sentence = process_one_line(line, logger)
            except CheckFailedError:
                # Validation failure: already reported through `logger`
                # inside process_one_line — presumably; verify there.
                continue

            output_lines.append(sentence)

        with open('./data/domain/{}.conllx'.format(base_name),
                  'wt') as output_fd:
            write_conllx(output_lines, output_fd)
def test_write_conllx():
    """Smoke-test write_conllx with two small two-row sentences.

    Builds two SentenceX objects and writes them to ``corpus4.txt``.
    """
    sentence_1 = SentenceX()
    sentence_1.id = 'SID-1'
    sentence_1.write_as_row(['char-1', 'tag-1'])
    sentence_1.write_as_row(['char-2', 'tag-2'])

    sentence_2 = SentenceX()
    sentence_2.id = 'SID-2'
    sentence_2.write_as_row(['char-1', 'tag-1'])
    sentence_2.write_as_row(['char-2', 'tag-2'])

    sentences = [sentence_1, sentence_2]
    # Bug fix: the original passed a bare open() handle that was never
    # closed; a context manager guarantees the file is flushed and closed.
    with open('corpus4.txt', 'w') as fd:
        write_conllx(sentences, fd)
Example #3
0
    def write_to_file(self, output_file):
        """Convert every offset held by this object into a sentence and
        write the whole collection to *output_file* in CoNLL-X format.

        Args:
            output_file: path of the file to create/overwrite.
        """
        converted = []
        for item in self:
            converted.append(offset_to_sentence(item))

        with open(output_file, "wt") as sink:
            write_conllx(converted, sink)
Example #4
0
#!/usr/bin/env python

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.split_data import split_data
from tokenizer_tools.conllz.writer import write_conllx

# Load the full corpus, split it, and write each split to its own file.
data = list(conllx_iterator_reader(['./data/all_data.conllx']))
train, dev, test = split_data(data)

# One write per split; the (name, subset) pairs keep the output order
# identical to writing train, dev, test by hand.
for split_name, subset in (('train', train), ('dev', dev), ('test', test)):
    with open('./data/{}.conllx'.format(split_name), 'wt') as fd:
        write_conllx(subset, fd)

Example #5
0
#!/usr/bin/env python

import os
import pathlib

from tokenizer_tools.conllz.iterator_reader import conllx_iterator_reader
from tokenizer_tools.conllz.writer import write_conllx

# NOTE(review): `current_dir` is never used in this visible span — kept in
# case code past this point relies on it; confirm and remove if dead.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Collect every file under ./data/domain.  Bug fix: Path.iterdir() yields
# entries in arbitrary, filesystem-dependent order, which made the merged
# corpus nondeterministic — sorting the paths fixes the output order
# without changing which files are read.
input_file_list = sorted(
    str(i) for i in pathlib.Path('./data/domain').iterdir() if i.is_file()
)

# Concatenate all domain files into one corpus.
data = list(conllx_iterator_reader(input_file_list))

with open('./data/all.conllx', 'wt') as fd:
    write_conllx(data, fd)