def crf_format(annotated_corpus_path):
    """Convert an annotated corpus file into two layers of CRF token lines.

    Each token line of the corpus is expected to look like
    ``<token_sent_id>\\t<surface>|<POS>|<stem>|<tag>|<category>|<inflection>|<blank>``.
    Lines starting with ``#`` and empty lines are skipped; a line consisting
    of ``!end`` marks the end of a sentence.

    Args:
        annotated_corpus_path: path of the UTF-8 encoded annotated corpus.

    Returns:
        A pair ``(layer1, layer2)`` of lists of ``(test_line, gold_line)``
        tuples.  Layer 1 is ``surface`` -> ``surface coarse_pos``; layer 2 is
        ``surface coarse_pos`` -> ``surface coarse_pos fine_pos``.  A sentence
        boundary is stored as ``(u'\\n', u'\\n')`` in both lists.

    Raises:
        TypeError: if ``pos_map_to_perdep.map_pos`` returns None for a token
            (the offending feature tuple and surface are printed first).
        ValueError: if a token line does not split into the expected fields.
    """
    wordset_dic = pos_map_to_perdep.load_wordset()
    crf_format_stack = []
    crf_format_stack_layer2 = []

    with codecs.open(annotated_corpus_path, 'r', 'utf-8') as corpus:
        for line in corpus:
            # Strip the line ending first so that a final line without a
            # trailing newline (or a CRLF file) is classified correctly.
            record = line.rstrip(u'\r\n')

            # Comment lines and blank lines carry no token.
            if not record or record[0] == u'#':
                continue

            if record == u'!end':
                crf_format_stack.append((u'\n', u'\n'))
                crf_format_stack_layer2.append((u'\n', u'\n'))
                continue

            # The full unpacking is kept (even for unused fields) because it
            # validates the number of fields on the line.
            token_sent_id, features = record.split(u'\t')
            surface, POS, stem, tag, category, inflection, blank = \
                features.split(u'|')

            information_tuple = (POS, category, inflection)
            perdep_information_tuple = pos_map_to_perdep.map_pos(
                wordset_dic, surface, information_tuple)

            # Kept for checking the case where the mapping result is None.
            # This must run before the tuple is indexed, otherwise the
            # diagnostic is never printed.
            if perdep_information_tuple is None:
                print(information_tuple)
                print(surface)
                raise TypeError('pos_map_to_perdep.map_pos returned None')

            coarse_pos = perdep_information_tuple[0]
            fine_pos = perdep_information_tuple[1]

            if (len(perdep_information_tuple) == 3
                    and coarse_pos == u'N'
                    and perdep_information_tuple[2] is None):
                print(u'Warning! the number feature for N is None. It must be something mistaken')

            layer1_test = u'{}\n'.format(surface)
            layer1_gold = u'{} {}\n'.format(surface, coarse_pos)
            layer2_test = u'{} {}\n'.format(surface, coarse_pos)
            layer2_gold = u'{} {} {}\n'.format(surface, coarse_pos, fine_pos)

            crf_format_stack.append((layer1_test, layer1_gold))
            crf_format_stack_layer2.append((layer2_test, layer2_gold))

    return crf_format_stack, crf_format_stack_layer2
def conll_format(annotated_corpus_path):
    """Convert an annotated corpus file into CoNLL-style token lines.

    Each token line of the corpus is expected to look like
    ``<token_sent_id>\\t<surface>|<POS>|<stem>|<tag>|<category>|<inflection>|<blank>``.
    Lines starting with ``#`` and empty lines are skipped; a line consisting
    of ``!end`` marks the end of a sentence and restarts token numbering.

    Args:
        annotated_corpus_path: path of the UTF-8 encoded annotated corpus.

    Returns:
        A list of unicode strings.  Each token becomes one line of ten
        tab-separated columns: token id (restarting at 1 per sentence),
        surface, stem, coarse POS, fine POS, feature column, and the
        constants ``0``, ``0``, ``_``, ``_``.  A sentence boundary is stored
        as ``u'\\n'``.

    Raises:
        TypeError: if ``pos_map_to_perdep.map_pos`` returns None for a token
            (the offending feature tuple and surface are printed first).
        ValueError: if a token line does not split into the expected fields.
    """
    wordset_dic = pos_map_to_perdep.load_wordset()
    conll_format_stack = []
    token_id = 1

    with codecs.open(annotated_corpus_path, 'r', 'utf-8') as corpus:
        for line in corpus:
            # Strip the line ending first so that a final line without a
            # trailing newline (or a CRLF file) is classified correctly.
            record = line.rstrip(u'\r\n')

            # Comment lines and blank lines carry no token.
            if not record or record[0] == u'#':
                continue

            if record == u'!end':
                conll_format_stack.append(u'\n')
                token_id = 1
                continue

            # The full unpacking is kept (even for unused fields) because it
            # validates the number of fields on the line.
            token_sent_id, features = record.split(u'\t')
            surface, POS, stem, tag, category, inflection, blank = \
                features.split(u'|')

            # Drop the "_<digit>" and "MTE" markers from the stem.  Plain u''
            # literals hold the same patterns as the former ur'' ones and
            # also parse on Python 3.
            stem = re.sub(u'_+[1-9]', u'', stem)
            stem = re.sub(u'_*MTE', u'', stem)

            information_tuple = (POS, category, inflection)
            perdep_information_tuple = pos_map_to_perdep.map_pos(
                wordset_dic, surface, information_tuple)

            # Kept for checking the case where the mapping result is None.
            # This must run before the tuple is indexed, otherwise the
            # diagnostic is never printed.
            if perdep_information_tuple is None:
                print(information_tuple)
                print(surface)
                raise TypeError('pos_map_to_perdep.map_pos returned None')

            coarse_pos = perdep_information_tuple[0]
            fine_pos = perdep_information_tuple[1]

            # attachment=ISO is valid only in this version.
            # Reset for every token: previously a 3-tuple whose coarse tag
            # was neither N nor V silently reused the previous token's
            # feature column (or raised NameError on the first token).
            feature_column = u'attachment=ISO'

            if len(perdep_information_tuple) == 3:
                extra = perdep_information_tuple[2]

                if coarse_pos == u'N':
                    feature_column = u'attachment=ISO|number={}'.format(extra)
                    if extra is None:
                        print(u'Warning! the number feature for N is None. It must be something mistaken')

                elif coarse_pos == u'V':
                    number_info = u'number={}'.format(extra[0])
                    person_info = u'person={}'.format(extra[1])
                    tma_info = u'tma={}'.format(extra[2])

                    feature_column = (person_info + u'|attachment=ISO|'
                                      + number_info + u'|' + tma_info)
                    # Remove the features whose value is None.
                    feature_column = re.sub(u'^person=None\\|', u'', feature_column)
                    feature_column = re.sub(u'\\|number=None', u'', feature_column)
                    feature_column = re.sub(u'\\|tma=None', u'', feature_column)

            conll_token_format = \
                u'{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\t{8}\t{9}\n'.format(
                    token_id, surface, stem, coarse_pos, fine_pos,
                    feature_column, u'0', u'0', u'_', u'_')

            conll_format_stack.append(conll_token_format)
            token_id += 1

    return conll_format_stack