def parse_data(option):
    # Resolve paths relative to this script: raw input file and the conllu output dir.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    input_file = os.path.join(current_dir, 'data', 'raw_data.txt')
    conllu_dir = os.path.join(current_dir, 'data', 'conllu')
    utils.create_if_dir_not_exists(conllu_dir)

    # Encode the chosen option flags into the output file name,
    # e.g. data_False-True-True-True-True-True-False.conllu.
    output_file = os.path.join(
        conllu_dir,
        'data_{}.conllu'.format('-'.join([str(i) for i in option.values()]))
    )

    data_cleaner = DataParserAndTranslator(input_file, output_file)
    data_cleaner.process(**option)
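# A hedged usage sketch (not from the original source): parse_data takes a dict of
# boolean processing flags whose values are joined into the output file name,
# matching names like data_False-True-True-True-True-True-False.conllu used later
# in the pipeline. The flag names below are hypothetical placeholders, not the
# real keyword parameters of DataParserAndTranslator.process().
from collections import OrderedDict

example_option = OrderedDict([
    ('keep_pos', True),
    ('translate_tags', True),
    ('strip_punctuation', False),
])
parse_data(example_option)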
#!/usr/bin/env python
import os

import utils


def convert_encoding(input_file, output_file, input_encoding='gbk', output_encoding='utf8'):
    with open(input_file, 'rb') as input_fd, open(output_file, 'wb') as output_fd:
        file_content = input_fd.read()
        unicode_file_content = file_content.decode(input_encoding)
        output_fd.write(unicode_file_content.encode(output_encoding))


if __name__ == "__main__":
    current_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = os.path.join(current_dir, 'data')
    utils.create_if_dir_not_exists(data_dir)

    input_file = os.path.join(data_dir, '1998-01-2003版-带音.txt')
    output_file = os.path.join(data_dir, 'raw_data.txt')
    convert_encoding(input_file, output_file)
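# The utils.create_if_dir_not_exists helper is called throughout the pipeline but
# is not shown in this section; a minimal sketch of what such a helper likely
# looks like (an assumption, not the repository's actual implementation):
import os


def create_if_dir_not_exists(dir_path):
    """Create dir_path (including parent directories) if it does not already exist."""
    if not os.path.isdir(dir_path):
        os.makedirs(dir_path)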
import os

from nltk.corpus.reader.conll import ConllCorpusReader

import utils

current_dir = os.path.dirname(os.path.abspath(__file__))
root = os.path.join(current_dir, 'data', 'conllu')
utils.create_if_dir_not_exists(root)


def get_corpus_reader(fileids,
                      columntypes=(ConllCorpusReader.WORDS,
                                   ConllCorpusReader.POS,
                                   ConllCorpusReader.NE,
                                   ConllCorpusReader.IGNORE),
                      root=root):
    # Build an NLTK ConllCorpusReader over the given files with the given column layout.
    corpus_reader = ConllCorpusReader(root, fileids, columntypes)
    return corpus_reader
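# A hedged usage sketch: read word/POS pairs back out of a converted CoNLL file
# with the reader built above. The file name is a placeholder; tagged_sents()
# only needs the WORDS and POS columns declared in columntypes.
if __name__ == "__main__":
    reader = get_corpus_reader(fileids=['data_example.conllu'])
    for sent in reader.tagged_sents()[:3]:
        print(sent)  # each sentence is a list of (word, pos) tuples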
import os

from nltk.corpus.reader.conll import ConllCorpusReader

from corpus import get_corpus_reader
import utils

current_dir = os.path.dirname(os.path.abspath(__file__))
conllu_dir = os.path.join(current_dir, 'data/split_data')
utils.create_if_dir_not_exists(conllu_dir)

token_dir = os.path.join(current_dir, 'data/split_crfpp')
utils.create_if_dir_not_exists(token_dir)


def main(delimit=" "):
    for root, dirs, files in os.walk(conllu_dir):
        for file_ in files:
            # ignore hidden files
            if os.path.basename(file_).startswith('.'):
                continue

            corpus_reader = get_corpus_reader(
                columntypes=(ConllCorpusReader.IGNORE,
                             ConllCorpusReader.WORDS,
                             ConllCorpusReader.POS,
                             ConllCorpusReader.NE,
                             ConllCorpusReader.IGNORE),
                root=root,
                fileids=[file_])
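# The body of main() is truncated above. A hedged sketch of the likely next step:
# writing each sentence into token_dir in CRF++ format, one "token<delimit>tag"
# line per token with a blank line between sentences. write_crfpp_file and its
# choice of columns are assumptions for illustration, not the original code.
def write_crfpp_file(tagged_sents, output_file, delimit=" "):
    with open(output_file, 'w') as fd:
        for sent in tagged_sents:
            for token, tag in sent:
                fd.write(token + delimit + tag + "\n")
            # blank line terminates each sentence, as CRF++ expects
            fd.write("\n")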
        train_set = sent_list[:train_set_len]
        dev_set = sent_list[train_set_len:train_set_len + dev_set_len]
        test_set = sent_list[-test_set_len:]

        self.write_data(train_set, 'train')
        self.write_data(dev_set, 'dev')
        self.write_data(test_set, 'test')

    def write_data(self, data, data_set_name):
        output_file = self.get_output_file(data_set_name)
        with open(output_file, 'w') as fd:
            for sent in data:
                # One token per line: running ID plus the token's remaining columns, tab separated.
                for id, token_and_more in enumerate(sent, start=1):
                    fd.write("\t".join([str(id)] + list(token_and_more)) + "\n")
                # Blank line terminates each sentence, as in CoNLL-U.
                fd.write('\n')

    def get_output_file(self, data_set_name):
        return os.path.join(self.output_dir, data_set_name + '.conllu')


if __name__ == "__main__":
    split_data_dir = os.path.join(current_dir, 'data', 'split_data')
    utils.create_if_dir_not_exists(split_data_dir)

    data_splitter = DataSplitter(split_data_dir)
    data_splitter.split_data()
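# The slicing in split_data() above relies on train_set_len, dev_set_len and
# test_set_len being computed earlier in the method, which is not shown in this
# section. A hedged sketch of such a computation; the 80/10/10 ratios here are an
# assumption for illustration, not the repository's actual split.
def compute_split_lengths(n_sentences, train_ratio=0.8, dev_ratio=0.1):
    train_set_len = int(n_sentences * train_ratio)
    dev_set_len = int(n_sentences * dev_ratio)
    test_set_len = n_sentences - train_set_len - dev_set_len
    return train_set_len, dev_set_len, test_set_len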
import os
import random

random.seed(0)

import utils
from nltk.util import LazyMap
from nltk.corpus.reader.conll import ConllCorpusReader
from tokenizer_tools.tagset.NER.BILUO import BILUOEncoderDecoder

current_dir = os.path.dirname(os.path.abspath(__file__))

# ****************************************************************************** #
# Source and target file locations and configuration                             #
# ****************************************************************************** #

# conll source file
conll_root = os.path.join(current_dir, 'data', 'conllu')
utils.create_if_dir_not_exists(conll_root)

fileids = ['data_False-True-True-True-True-True-False.conllu']

# conll file column info
columntypes = (ConllCorpusReader.IGNORE,
               ConllCorpusReader.WORDS,
               ConllCorpusReader.POS,
               ConllCorpusReader.NE,
               ConllCorpusReader.IGNORE)

# step 1: split data storage
split_data_dir = os.path.join(current_dir, 'data', 'split_data')
utils.create_if_dir_not_exists(split_data_dir)

# step 2: BMES char file storage
token_dir = os.path.join(current_dir, 'data/split_char_crfpp')
utils.create_if_dir_not_exists(token_dir)
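# Step 2 converts word-level NE annotations into character-level tags for the
# char CRF++ files. The exact BILUOEncoderDecoder calls from tokenizer_tools are
# not shown in this section, so the following is a hand-rolled sketch of the
# BILUO scheme itself (an illustration of the tagset, not the library's API):
def char_biluo_tags(word, entity_type=None):
    """Return one BILUO tag per character of `word`."""
    if entity_type is None:
        return ['O'] * len(word)
    if len(word) == 1:
        return ['U-' + entity_type]
    return (['B-' + entity_type]
            + ['I-' + entity_type] * (len(word) - 2)
            + ['L-' + entity_type])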