コード例 #1
0
def create_doc_from_edu_file(edu_file, annotate_func):
    """ Build a Doc from a text file segmented into paragraphs, sentences
    and EDUs.

    The file content is split on '<P>' into paragraphs and each paragraph
    on '<S>' into sentences; blank pieces are dropped. Inside a sentence,
    every non-blank line is one EDU. The EDUs of a sentence are joined
    (each followed by a single space) and handed to annotate_func.

    :param edu_file: path of the file to read.
    :param annotate_func: callable taking the sentence text and returning a
        mapping whose 'sentences' list has, as first element, a mapping with
        'tokens' (each providing 'index', 'word', 'lemma', 'pos' and
        'characterOffsetEnd') and 'basicDependencies' (each providing
        'dependent', 'governor' and 'dep'). Presumably a CoreNLP-style
        annotator -- confirm against the caller.
    :return: a Doc initialised from all tokens of the file.
    """
    with open(edu_file, 'r') as fin:
        raw_text = fin.read()

    all_tokens = []
    # Number of EDUs in the sentences already processed, so that EDU
    # indices keep increasing across the whole document.
    edus_seen = 0
    para_chunks = (chunk.strip() for chunk in raw_text.split('<P>'))
    for para_no, para in enumerate(filter(None, para_chunks)):
        sent_chunks = (chunk.strip() for chunk in para.split('<S>'))
        for sent_no, sent in enumerate(filter(None, sent_chunks)):
            edus = []
            for line in sent.split('\n'):
                stripped = line.strip()
                if stripped:
                    edus.append(stripped + ' ')

            # Cumulative character length after each EDU, i.e. the offset
            # in the joined sentence text at which that EDU ends.
            edu_ends = []
            running_length = 0
            for edu in edus:
                running_length += len(edu)
                edu_ends.append(running_length)

            parsed = annotate_func(''.join(edus))['sentences'][0]

            tokens = []
            for tok_info in parsed['tokens']:
                token = Token()
                token.tidx = tok_info['index']
                token.word = tok_info['word']
                token.lemma = tok_info['lemma']
                token.pos = tok_info['pos']
                # Paragraph index is 1-based; sentence index is 0-based and
                # restarts in every paragraph.
                token.pidx = para_no + 1
                token.sidx = sent_no
                tok_end = tok_info['characterOffsetEnd']
                # The token belongs to the first EDU whose end offset lies
                # strictly beyond the token's end offset. If none does,
                # eduidx is left untouched.
                for edu_no, edu_end in enumerate(edu_ends, start=1):
                    if edu_end > tok_end:
                        token.eduidx = edus_seen + edu_no
                        break
                tokens.append(token)

            # 'dependent' is used as a 1-based position into this
            # sentence's tokens.
            for arc in parsed['basicDependencies']:
                child = tokens[arc['dependent'] - 1]
                child.hidx = arc['governor']
                child.dep_label = arc['dep']

            all_tokens.extend(tokens)
            edus_seen += len(edus)

    doc = Doc()
    doc.init_from_tokens(all_tokens)
    return doc
コード例 #2
0
ファイル: document.py プロジェクト: rodristk/rst-large-scale
 def _parse_fmerge_line(line):
     """ Turn one tab-separated line of a *.merge file into a Token.

     Columns read: 0 -> sentence index, 1 -> token index, 2 -> word,
     9 -> EDU index, last column -> paragraph index.

     :param line: one line of the *.merge file.
     :return: the populated Token. Its eduidx is not assigned here when
         column 9 is not an integer.
     :raises ValueError: if the sentence, token or paragraph index column
         is not an integer.
     :raises IndexError: if the line has too few columns.
     """
     fields = line.split("\t")
     token = Token()
     token.pidx = int(fields[-1])
     token.sidx = int(fields[0])
     token.tidx = int(fields[1])
     # The word is stored exactly as written (no case folding).
     token.word = fields[2]
     raw_edu_index = fields[9]
     try:
         token.eduidx = int(raw_edu_index)
     except ValueError:
         # Best effort: report the gap and still return the token.
         message = "EDU index for {} is missing in fmerge file"
         print(message.format(token.word))
     return token
コード例 #3
0
ファイル: document.py プロジェクト: janetlauyeung/rst-coref
 def _parse_fmerge_line(line):
     """ Turn one tab-separated line of a *.merge file into a Token.

     Columns read: 0 -> sentence index, 1 -> token index, 2 -> word,
     3 -> lemma, 4 -> POS tag, 5 -> dependency label, 6 -> head index,
     7 -> NER tag, 8 -> partial parse, 9 -> EDU index,
     last column -> paragraph index.

     :param line: one line of the *.merge file.
     :return: the populated Token. Its hidx / eduidx are not assigned here
         when the corresponding column is not an integer.
     :raises ValueError: if the sentence, token or paragraph index column
         is not an integer.
     :raises IndexError: if the line has too few columns.
     """
     fields = line.split("\t")
     token = Token()
     token.pidx = int(fields[-1])
     token.sidx = int(fields[0])
     token.tidx = int(fields[1])
     # The word is stored exactly as written (no case folding).
     token.word = fields[2]
     token.lemma = fields[3]
     token.pos = fields[4]
     token.dep_label = fields[5]
     raw_head_index = fields[6]
     try:
         token.hidx = int(raw_head_index)
     except ValueError:
         # A non-integer head index is tolerated silently.
         pass
     token.ner = fields[7]
     token.partial_parse = fields[8]
     raw_edu_index = fields[9]
     try:
         token.eduidx = int(raw_edu_index)
     except ValueError:
         # Best effort: report the gap and still return the token rather
         # than aborting the whole run.
         message = "EDU index for {} is missing in fmerge file"
         print(message.format(token.word))
     return token