def parse_data(option):
    """Convert the raw corpus into a CoNLL-U file named after *option*.

    The output filename embeds the option values joined by '-', so every
    distinct option combination gets its own file under ./data/conllu.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    raw_file = os.path.join(base_dir, 'data', 'raw_data.txt')

    conllu_dir = os.path.join(base_dir, 'data', 'conllu')
    utils.create_if_dir_not_exists(conllu_dir)

    suffix = '-'.join(str(value) for value in option.values())
    target_file = os.path.join(conllu_dir, 'data_{}.conllu'.format(suffix))

    DataParserAndTranslator(raw_file, target_file).process(**option)
# ---- Example #2 (separator artifact from the scraped source; original marker: "Exemple #2" / "0") ----
#!/usr/bin/env python

import os

import utils


def convert_encoding(input_file,
                     output_file,
                     input_encoding='gbk',
                     output_encoding='utf8'):
    """Transcode *input_file* into *output_file*.

    The whole file is read as bytes, decoded with *input_encoding* and
    re-encoded with *output_encoding*; binary mode keeps newlines intact.
    """
    with open(input_file, 'rb') as src, open(output_file, 'wb') as dst:
        text = src.read().decode(input_encoding)
        dst.write(text.encode(output_encoding))


if __name__ == "__main__":
    # Resolve paths relative to this script, not the CWD.
    here = os.path.dirname(os.path.abspath(__file__))

    data_dir = os.path.join(here, 'data')
    utils.create_if_dir_not_exists(data_dir)

    # Re-encode the GBK source corpus into UTF-8 raw_data.txt.
    convert_encoding(
        os.path.join(data_dir, '1998-01-2003版-带音.txt'),
        os.path.join(data_dir, 'raw_data.txt'))
import os

from nltk.corpus.reader.conll import ConllCorpusReader
import utils

# Directory containing this script; keeps paths CWD-independent.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Default corpus root used by get_corpus_reader() below; created on import.
# NOTE(review): utils.create_if_dir_not_exists is a project helper —
# presumably a mkdir-if-missing; confirm it tolerates an existing directory.
root = os.path.join(current_dir, './data/conllu')
utils.create_if_dir_not_exists(root)


def get_corpus_reader(fileids,
                      columntypes=(ConllCorpusReader.WORDS,
                                   ConllCorpusReader.POS, ConllCorpusReader.NE,
                                   ConllCorpusReader.IGNORE),
                      root=root):
    """Build an NLTK ConllCorpusReader over *fileids*.

    *columntypes* describes the column layout of the .conllu files and
    *root* defaults to this module's ./data/conllu directory (captured at
    definition time).
    """
    return ConllCorpusReader(root, fileids, columntypes)
import os

from nltk.corpus.reader.conll import ConllCorpusReader

from corpus import get_corpus_reader
import utils

# Directory containing this script; keeps paths CWD-independent.
current_dir = os.path.dirname(os.path.abspath(__file__))

# Input: CoNLL-U train/dev/test splits produced by the splitter step.
conllu_dir = os.path.join(current_dir, 'data/split_data')
utils.create_if_dir_not_exists(conllu_dir)

# Output: token-level files in CRF++ format.
token_dir = os.path.join(current_dir, 'data/split_crfpp')
utils.create_if_dir_not_exists(token_dir)


def main(delimit="  "):
    for root, dirs, files in os.walk(conllu_dir):
        for file_ in files:

            # ignore hidden file
            if os.path.basename(file_).startswith('.'):
                continue

            corpus_reader = get_corpus_reader(
                columntypes=(ConllCorpusReader.IGNORE, ConllCorpusReader.WORDS,
                             ConllCorpusReader.POS, ConllCorpusReader.NE,
                             ConllCorpusReader.IGNORE),
                root=root,
                fileids=[file_])
        train_set = sent_list[:train_set_len]
        dev_set = sent_list[train_set_len:train_set_len + dev_set_len]
        test_set = sent_list[-test_set_len:]

        self.write_data(train_set, 'train')
        self.write_data(dev_set, 'dev')
        self.write_data(test_set, 'test')

    def write_data(self, data, data_set_name):
        """Write *data* to the '<data_set_name>.conllu' output file.

        *data* is a list of sentences; each sentence is a list of token
        tuples. Every token becomes one tab-separated line prefixed with a
        1-based token index, and sentences are separated by a blank line.
        """
        output_file = self.get_output_file(data_set_name)

        # NOTE(review): no explicit encoding — output depends on the locale
        # default; consider encoding='utf-8' (confirm with downstream readers).
        with open(output_file, 'w') as fd:
            for sent in data:
                # 'token_id' instead of 'id' to avoid shadowing the builtin.
                for token_id, token_and_more in enumerate(sent, start=1):
                    fd.write("\t".join([str(token_id)] + list(token_and_more)) +
                             "\n")
                fd.write('\n')

    def get_output_file(self, data_set_name):
        """Return the path of the '<data_set_name>.conllu' file in output_dir."""
        filename = '{}.conllu'.format(data_set_name)
        return os.path.join(self.output_dir, filename)


if __name__ == "__main__":
    # Make sure the directory receiving the train/dev/test splits exists.
    target_dir = os.path.join(current_dir, 'data', 'split_data')
    utils.create_if_dir_not_exists(target_dir)

    DataSplitter(target_dir).split_data()
# ---- Example #6 (separator artifact from the scraped source; original marker: "Exemple #6" / "0") ----
import os
import random
random.seed(0)
import utils
from nltk.util import LazyMap
from nltk.corpus.reader.conll import ConllCorpusReader
from tokenizer_tools.tagset.NER.BILUO import BILUOEncoderDecoder

# Directory containing this script; keeps paths CWD-independent.
current_dir = os.path.dirname(os.path.abspath(__file__))

# ****************************************************************************** #
#              Source/target file storage locations and configuration            #
# ****************************************************************************** #
# conll source file
conll_root = os.path.join(current_dir, './data/conllu')
utils.create_if_dir_not_exists(conll_root)
# Filename encodes the parser option values ('-'-joined booleans).
fileids = ['data_False-True-True-True-True-True-False.conllu']
# conll file column info
columntypes = (ConllCorpusReader.IGNORE, ConllCorpusReader.WORDS,
               ConllCorpusReader.POS, ConllCorpusReader.NE,
               ConllCorpusReader.IGNORE)

# step1: split data storage
split_data_dir = os.path.join(current_dir, 'data', 'split_data')
utils.create_if_dir_not_exists(split_data_dir)

# step2: BMES char file storage
token_dir = os.path.join(current_dir, 'data/split_char_crfpp')
utils.create_if_dir_not_exists(token_dir)