def crf_format(annotated_corpus_path):
    """Convert an annotated corpus into two layers of CRF test/gold lines.

    The corpus is read as UTF-8. Lines starting with ``#`` and blank lines
    are skipped. A ``!end`` line closes a sentence. Every other line must be
    a token/sentence id, a tab, and seven ``|``-separated fields:
    surface, POS, stem, tag, category, inflection and a trailing blank.

    The (POS, category, inflection) triple is mapped through
    ``pos_map_to_perdep.map_pos``; element 0 of the result is used as the
    coarse tag and element 1 as the fine tag.

    Args:
        annotated_corpus_path: path of the annotated corpus file.

    Returns:
        A pair ``(layer1, layer2)`` of lists of ``(test_line, gold_line)``
        tuples. Layer 1 predicts the coarse tag from the surface form;
        layer 2 predicts the fine tag from surface form plus coarse tag.
        A sentence boundary is stored as ``(u'\\n', u'\\n')`` in both lists.

    Raises:
        TypeError: if ``map_pos`` returns ``None`` for a token. The POS
            triple and the surface form are printed first.
        ValueError: if a token line does not split into the expected fields.
    """
    wordset_dic = pos_map_to_perdep.load_wordset()
    sentence_break = (u'\n', u'\n')
    crf_format_stack = []
    crf_format_stack_layer2 = []

    with codecs.open(annotated_corpus_path, 'r', 'utf-8') as lines:
        for line in lines:
            if line.startswith(u'#') or line == u'\n':
                continue
            if line == u'!end\n':
                crf_format_stack.append(sentence_break)
                crf_format_stack_layer2.append(sentence_break)
                continue

            # Unpacking doubles as a format check: exactly one tab and
            # exactly seven '|'-separated fields are required.
            token_sent_id, features = line.strip(u'\n').split(u'\t')
            (surface, POS, stem, tag,
             category, inflection, blank) = features.split(u'|')

            information_tuple = (POS, category, inflection)
            perdep_information_tuple = pos_map_to_perdep.map_pos(
                wordset_dic, surface, information_tuple)

            # Kept as a diagnostic for the case where the mapping yields
            # None. It has to run before the result is subscripted,
            # otherwise the TypeError fires before anything is printed.
            if perdep_information_tuple is None:
                print(information_tuple)
                print(surface)
                raise TypeError(
                    'pos_map_to_perdep.map_pos() returned None for '
                    'surface %r with features %r'
                    % (surface, information_tuple))

            coarse_pos = perdep_information_tuple[0]
            fine_pos = perdep_information_tuple[1]

            # A noun mapped to a 3-tuple is expected to carry a number
            # feature in the third slot.
            if (len(perdep_information_tuple) == 3
                    and coarse_pos == u'N'
                    and perdep_information_tuple[2] is None):
                print(u'Warning! the number feature for N is None. It must be something mistaken')

            crf_token_format_layer1_test = u'{}\n'.format(surface)
            crf_token_format_layer1_gold = u'{} {}\n'.format(
                surface, coarse_pos)
            crf_token_format_layer2_test = u'{} {}\n'.format(
                surface, coarse_pos)
            crf_token_format_layer2_gold = u'{} {} {}\n'.format(
                surface, coarse_pos, fine_pos)

            crf_format_stack.append(
                (crf_token_format_layer1_test, crf_token_format_layer1_gold))
            crf_format_stack_layer2.append(
                (crf_token_format_layer2_test, crf_token_format_layer2_gold))

    return crf_format_stack, crf_format_stack_layer2
def conll_format(annotated_corpus_path):
    """Convert an annotated corpus into CoNLL-style token lines.

    The corpus is read as UTF-8. Lines starting with ``#`` and blank lines
    are skipped. A ``!end`` line closes a sentence. Every other line must be
    a token/sentence id, a tab, and seven ``|``-separated fields:
    surface, POS, stem, tag, category, inflection and a trailing blank.

    The (POS, category, inflection) triple is mapped through
    ``pos_map_to_perdep.map_pos``; element 0 of the result is used as the
    coarse tag, element 1 as the fine tag and, for 3-element results tagged
    ``N`` or ``V``, element 2 supplies the morphological features.

    Args:
        annotated_corpus_path: path of the annotated corpus file.

    Returns:
        A list of unicode strings. Each token becomes one line of ten
        tab-separated columns (id, surface, stem, coarse tag, fine tag,
        features, ``0``, ``0``, ``_``, ``_``) ending in a newline. Each
        sentence end becomes a bare ``u'\\n'`` entry, after which the
        token id restarts at 1.

    Raises:
        TypeError: if ``map_pos`` returns ``None`` for a token. The POS
            triple and the surface form are printed first.
        ValueError: if a token line does not split into the expected fields.
    """
    wordset_dic = pos_map_to_perdep.load_wordset()
    conll_format_stack = []
    token_id = 1

    with codecs.open(annotated_corpus_path, 'r', 'utf-8') as lines:
        for line in lines:
            if line.startswith(u'#') or line == u'\n':
                continue
            if line == u'!end\n':
                conll_format_stack.append(u'\n')
                token_id = 1
                continue

            # Unpacking doubles as a format check: exactly one tab and
            # exactly seven '|'-separated fields are required.
            token_sent_id, features = line.strip(u'\n').split(u'\t')
            (surface, POS, stem, tag,
             category, inflection, blank) = features.split(u'|')

            # Drop the "_<digit>" and "_MTE" markers from the stem.
            stem = re.sub(u'_+[1-9]', u'', stem)
            stem = re.sub(u'_*MTE', u'', stem)

            information_tuple = (POS, category, inflection)
            perdep_information_tuple = pos_map_to_perdep.map_pos(
                wordset_dic, surface, information_tuple)

            # Kept as a diagnostic for the case where the mapping yields
            # None. It has to run before the result is subscripted,
            # otherwise the TypeError fires before anything is printed.
            if perdep_information_tuple is None:
                print(information_tuple)
                print(surface)
                raise TypeError(
                    'pos_map_to_perdep.map_pos() returned None for '
                    'surface %r with features %r'
                    % (surface, information_tuple))

            coarse_pos = perdep_information_tuple[0]
            fine_pos = perdep_information_tuple[1]

            # attachment=ISO is only valid in this version of the format.
            # Assigning the default first guarantees that no token reuses
            # the feature column of the previous one (or hits an unbound
            # name) when its tag is neither N nor V.
            feature_column = u'attachment=ISO'
            if len(perdep_information_tuple) == 3:
                morph = perdep_information_tuple[2]
                if coarse_pos == u'N':
                    feature_column = u'attachment=ISO|number={}'.format(morph)
                    if morph is None:
                        print(u'Warning! the number feature for N is None. It must be something mistaken')
                elif coarse_pos == u'V':
                    number_info = u'number={}'.format(morph[0])
                    person_info = u'person={}'.format(morph[1])
                    tma_info = u'tma={}'.format(morph[2])
                    feature_column = (person_info + u'|attachment=ISO|'
                                      + number_info + u'|' + tma_info)
                    # Remove the features whose value is None.
                    feature_column = re.sub(
                        u'^person=None\\|', u'', feature_column)
                    feature_column = re.sub(
                        u'\\|number=None', u'', feature_column)
                    feature_column = re.sub(
                        u'\\|tma=None', u'', feature_column)

            conll_token_format = (
                u'{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\n'.format(
                    str(token_id), surface, stem, coarse_pos, fine_pos,
                    feature_column, u'0', u'0', u'_', u'_'))
            conll_format_stack.append(conll_token_format)
            token_id += 1

    return conll_format_stack