def main(fin_names, fout_name, split_num):
    """
    make training format for CRFsuite

    Sentences whose tags are not good are logged and skipped. Sentences that
    contain a word which cannot be aligned are skipped without logging.
    All output files are closed before returning, even when an error occurs.

    :param  fin_names:  list of input files
    :param  fout_name:  output file path
    :param  split_num:  split number of output file; when greater than 1 the
                        outputs are written to ``<fout_name>.<idx>`` for idx in
                        ``range(split_num)``, otherwise to ``fout_name`` itself
    """
    if split_num > 1:
        out_paths = ['%s.%d' % (fout_name, idx) for idx in range(split_num)]
    else:
        out_paths = [fout_name]

    fouts = []
    try:
        # open one at a time so that a failure part-way through still lets the
        # ``finally`` clause close the files that were already opened
        for out_path in out_paths:
            fouts.append(open(out_path, 'w'))
        sizes = [0] * len(fouts)

        for sent in sejong_corpus.load(IS_SPOKEN, fin_names):
            if not sent.is_good_tags():
                logging.error('Invalid tag in sentence: %s', sent)
                continue
            sent_pairs = []
            for word in sent.words:
                try:
                    word_pairs = sejong_align.align(word)
                except sejong_align.AlignError:
                    # a single unalignable word discards the whole sentence
                    sent_pairs = []
                    break
                else:
                    sent_pairs.append(word_pairs)
            if sent_pairs:
                # sentence size is the total UTF-8 byte length of its surfaces
                new_size = sum(len(surface.encode('UTF-8'))
                               for pairs in sent_pairs for surface, _ in pairs)
                # NOTE(review): select_file presumably picks the output file
                # based on ``sizes`` and updates it -- confirm at its definition
                print_aligned(select_file(fouts, sizes, new_size), sent_pairs)
    finally:
        for fout in fouts:
            fout.close()
def main(fin_names, fout):
    """
    write the Sejong tagged corpus as a word(EoJeol) per line formatted corpus

    :param  fin_names:  list of input files
    :param  fout:  output file
    """
    sentences = sejong_corpus.load(IS_SPOKEN, fin_names)
    for sentence in sentences:
        # the sentence itself, followed by an empty line after each sentence
        print >> fout, sentence
        print >> fout
def main(fin_names, fout):
    """
    align the syllables of each word(EoJeol) to its morphemes and print them

    A sentence is printed only when every one of its words could be aligned.

    :param  fin_names:  list of input files
    :param  fout:  output file
    """
    for sentence in sejong_corpus.load(IS_SPOKEN, fin_names):
        aligned = []
        for word in sentence.words:
            try:
                pairs = sejong_align.align(word)
            except sejong_align.AlignError:
                # one unalignable word invalidates the whole sentence
                aligned = []
                break
            aligned.extend(pairs)
        if not aligned:
            continue
        print_aligned(fout, aligned)