コード例 #1
0
def map_tags(tagged_src, untagged_tgt, alignment_list):
    """Project POS tags from tagged source sentences onto untagged target
    sentences via word alignments.

    Args:
        tagged_src: list of Sentence objects whose tokens carry POS tags.
        untagged_tgt: iterable of target sentences, each an iterable of words.
        alignment_list: per-sentence lists of (src_idx, tgt_idx) index pairs.

    Returns:
        list of Sentence objects for the target side, with each aligned
        token's pos_tag copied from the corresponding source token.
    """
    # First pass: wrap every target word in a Token with a placeholder tag.
    tagged_tgt = []
    for sentence in untagged_tgt:
        sent_token_list = Sentence()
        for word in sentence:
            # NOTE(review): untagged_tag_str is defined elsewhere in this
            # module — presumably the "no tag yet" placeholder; confirm.
            sent_token_list.add_token(Token(word, untagged_tag_str, 0, 0))
        tagged_tgt.append(sent_token_list)

    # Second pass: copy the POS tag of each aligned source token onto its
    # aligned target token.
    for sent_num, pairings in enumerate(alignment_list):
        for src_tag_idx, tgt_tag_idx in pairings:
            tagged_tgt[sent_num].get_token_at(tgt_tag_idx).pos_tag = \
                tagged_src[sent_num].get_token_at(src_tag_idx).pos_tag

    return tagged_tgt
コード例 #2
0
def get_test_corpus(file_name):
    """Read a CoNLL-style tab-separated test corpus into Sentence objects.

    Each non-blank line is one token row: column 1 is the surface form,
    column 3 the POS tag, column 6 the head index. Blank lines separate
    sentences.

    Args:
        file_name: path to a UTF-8 encoded corpus file.

    Returns:
        list of Sentence objects, one per sentence in the file.
    """
    test_correct_tags = []
    sentence_tags = Sentence()
    on_sentence = False
    # 'with' guarantees the file handle is closed even if parsing raises.
    with codecs.open(file_name, 'r', encoding="utf-8") as corpus_file:
        for line in corpus_file:
            vals = line.split('\t')
            if len(vals) > 1:
                on_sentence = True
                # NOTE(review): head (vals[6]) is kept as a string here but
                # load_tagged_sentences stores int(vals[6]) — confirm which
                # type downstream code expects.
                tok = Token(vals[1], vals[3], vals[6], 0)
                sentence_tags.add_token(tok)
            elif on_sentence:
                on_sentence = False
                test_correct_tags.append(sentence_tags)
                sentence_tags = Sentence()
    # Flush the last sentence when the file does not end with a blank line;
    # the original dropped it silently.
    if on_sentence:
        test_correct_tags.append(sentence_tags)

    print(str(len(test_correct_tags)) + " sentences in test corpus")
    return test_correct_tags
コード例 #3
0
def load_tagged_sentences(file_name):
    """Load POS-tagged sentences from a CoNLL-style tab-separated file.

    Each non-blank line is one token row: column 1 is the surface form,
    column 3 the POS tag, column 6 the head index (parsed to int). Blank
    lines separate sentences.

    Args:
        file_name: path to a UTF-8 encoded corpus file.

    Returns:
        list of Sentence objects, one per sentence in the file.
    """
    sentences_w_tags = []
    sentence_obj = Sentence()
    on_sentence = False
    # 'with' guarantees the file handle is closed even if parsing raises.
    with codecs.open(file_name, 'r', encoding="utf-8") as tagged_file:
        for line in tagged_file:
            vals = line.split('\t')
            if len(vals) > 1:
                on_sentence = True
                tok = Token()
                tok.orig = vals[1]
                tok.pos_tag = vals[3]
                tok.head = int(vals[6])
                sentence_obj.add_token(tok)
            elif on_sentence:
                on_sentence = False
                sentences_w_tags.append(sentence_obj)
                sentence_obj = Sentence()
    # Flush the last sentence when the file does not end with a blank line;
    # the original dropped it silently.
    if on_sentence:
        sentences_w_tags.append(sentence_obj)

    return sentences_w_tags  # [Sentence_obj, Sentence_obj]