def get_tagged_sentences(self, file_name):
    """Parse a tab-separated CoNLL-style file into a list of Sentence objects.

    Each sentence is wrapped with a leading '<start>' token and a trailing
    'ROOT' token. Per token line the columns read are: 1 = surface form,
    3 = POS tag, 6 = head index, 7 = dependency label. Sentences are
    separated by lines with no tab-separated fields.

    Args:
        file_name: path to the CoNLL file (read as UTF-8).

    Returns:
        list of Sentence objects, one per sentence block in the file.
    """
    sentences_w_tags = []
    sentence_obj = Sentence()
    sentence_obj.add_token(Token(orig_token='<start>'))
    on_sentence = False
    # 'with' guarantees the handle is closed (the original leaked it).
    with codecs.open(file_name, 'r', encoding="utf-8") as fh:
        for line in fh:
            vals = line.split('\t')
            if len(vals) > 1:
                on_sentence = True
                tok = Token()
                tok.orig = vals[1]
                tok.pos_tag = vals[3]
                tok.head = int(vals[6])
                tok.head_label = vals[7]
                sentence_obj.add_token(tok)
            elif on_sentence:
                # Blank/short line marks the end of the current sentence.
                on_sentence = False
                sentence_obj.add_token(Token(orig_token='ROOT'))
                sentences_w_tags.append(sentence_obj)
                sentence_obj = Sentence()
                sentence_obj.add_token(Token(orig_token='<start>'))
    # Bug fix: the original dropped the last sentence when the file did
    # not end with a blank line; flush it here.
    if on_sentence:
        sentence_obj.add_token(Token(orig_token='ROOT'))
        sentences_w_tags.append(sentence_obj)
    return sentences_w_tags  # [Sentence_obj, Sentence_obj, ...]
def read_conll(loc):
    """Yield Sentence objects parsed from the CoNLL file at *loc*.

    Sentences are separated by blank lines. Each token line must have
    exactly ten whitespace-separated fields; of these the surface form,
    POS tag, head index, and dependency label are kept. A head of '0'
    (the root) is remapped to len(lines), i.e. the position of the
    padding token appended by pad_tokens.

    Args:
        loc: path to the CoNLL file.

    Yields:
        Sentence objects with words, POS tags, heads, and head labels set.
    """
    # Read everything up front inside 'with' so the file handle is
    # closed promptly (the original leaked it).
    with open(loc) as fh:
        text = fh.read()
    for sent_str in text.strip().split('\n\n'):
        lines = [line.split() for line in sent_str.split('\n')]
        words = DefaultList('')
        tags = DefaultList('')
        heads = [None]
        labels = [None]
        for index, word, lem, pos, something, s1, head, label, s2, s3 in lines:
            words.append(intern(word))
            tags.append(intern(pos))
            # '0' marks the root; point it at the padded ROOT position.
            heads.append(int(head) if head != '0' else len(lines))
            labels.append(label)
        pad_tokens(words)
        pad_tokens(tags)
        sent_obj = Sentence()
        sent_obj.add_words(words)
        sent_obj.set_pos_tags(tags)
        sent_obj.set_heads(heads)
        sent_obj.set_head_labels(labels)
        yield sent_obj