コード例 #1
0
def read_data(lines,
              tree_flag,
              score_cat_rex=None,
              ignore_terminal_rex=None,
              word_split_rex=None,
              debug_level=0):
    """Turn input lines into unsegmented strings plus their position data.

    When ``tree_flag`` is true each line is parsed with ``tb.string_trees``,
    wrapped under a 'ROOT' label and converted with ``tree_string``.
    Otherwise each line is split with ``word_split_rex`` and empty pieces,
    as well as pieces matched by ``ignore_terminal_rex``, are dropped.

    Returns a pair ``(segments, stringpos)``: ``segments`` holds the words of
    each line concatenated without separators, ``stringpos`` holds the result
    of ``words_stringpos`` for each line's words.
    """
    words = []
    if not tree_flag:
        # Flat format: one list of surviving tokens per input line.
        for line in lines:
            kept = []
            for token in word_split_rex.split(line):
                if token == '' or ignore_terminal_rex.match(token):
                    continue
                kept.append(token)
            words.append(kept)
    else:
        # Tree format: parse, add the root label, then extract the words.
        for line in lines:
            trees = tb.string_trees(line)
            trees.insert(0, 'ROOT')
            line_words = tree_string(trees, score_cat_rex,
                                     ignore_terminal_rex)
            words.append(line_words)
            if debug_level >= 1000:
                sys.stderr.write("# line = %s,\n# words = %s\n" %
                                 (line, line_words))

    segments = []
    for ws in words:
        segments.append(''.join(ws))

    if debug_level >= 10000:
        # Dump the first entry of each stage for inspection.
        for template, stage in (("lines[0] = %s\n", lines),
                                ("words[0] = %s\n", words),
                                ("segments[0] = %s\n", segments)):
            sys.stderr.write(template % stage[0])

    stringpos = []
    for ws in words:
        stringpos.append(words_stringpos(ws))
    return (segments, stringpos)
コード例 #2
0
def read_data(inf, tree_flag, score_cat_rex, ignore_terminal_rex, debug_level=0, max_lines=0):
    """Load lines from ``inf`` (tree or flat format) and derive per-line data.

    Every line is stripped first.  If ``max_lines`` is positive only the
    first ``max_lines`` lines are kept, and a warning is written to stderr
    when fewer lines than that are available.

    Returns ``(sentences, (stringpos, topics, topicstringpos))`` where
    ``sentences`` are the concatenated word strings (for tuple words only
    element 0 is used) and the other three lists come from
    ``words_stringpos``, ``words_topics`` and ``words_topicstringpos``.
    """
    stripped = []
    for raw in inf:
        stripped.append(raw.strip())

    if max_lines > 0:
        available = len(stripped)
        if available < max_lines:
            sys.stderr.write("Warning: max_lines=%d, len(strippedlines) = %d\n" % (max_lines, available))
        stripped = stripped[:max_lines]

    lines = []
    if not tree_flag:
        # Flat format: whitespace-separated items converted by string_word.
        for text in stripped:
            lines.append([string_word(piece) for piece in text.split()])
    else:
        # Tree format: parse, add the root label, then extract the words.
        for text in stripped:
            trees = tb.string_trees(text)
            trees.insert(0, 'ROOT')
            extracted = tree_words(trees, score_cat_rex, ignore_terminal_rex)
            lines.append(extracted)
            if debug_level >= 10000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (text, extracted))

    def surface(item):
        # Tuple words contribute only their first element to the sentence.
        if isinstance(item, tuple):
            return item[0]
        return item

    sentences = [''.join(surface(item) for item in ws) for ws in lines]
    topics = [words_topics(ws) for ws in lines]
    stringpos = [words_stringpos(ws) for ws in lines]
    topicstringpos = [words_topicstringpos(ws) for ws in lines]
    return (sentences, (stringpos, topics, topicstringpos))
コード例 #3
0
def read_write(inf, outf=sys.stdout, nskip=0):
    """Copy tree-format lines from ``inf`` to ``outf`` as word strings.

    Each non-blank line is parsed with ``tb.string_trees``, wrapped under a
    'ROOT' label, converted with ``tree_string`` and written (stripped) on
    its own line.  A blank input line is echoed as an empty line and the
    output is flushed.

    ``nskip`` suppresses output while it is positive; it is decremented once
    for every blank input line.

    Parameters:
        inf: iterable of input lines.
        outf: writable file-like object (defaults to ``sys.stdout``).
        nskip: number of blank-line-terminated groups to skip.
    """
    for line in inf:
        line = line.strip()
        if line:
            if nskip <= 0:
                trees = tb.string_trees(line)
                trees.insert(0, 'ROOT')
                outf.write(tree_string(trees).strip())
                outf.write('\n')
        else:
            if nskip <= 0:
                outf.write('\n')
                outf.flush()
            nskip -= 1
        # The earlier version re-parsed every line here and threw the result
        # away; that redundant parse (also run on blank and skipped lines)
        # has been removed.
コード例 #4
0
ファイル: trees-words.py プロジェクト: alecristia/CDSwordSeg
def read_write(inf, outf=sys.stdout, nskip=0):
    """Write the word string of every tree-format line in ``inf`` to ``outf``.

    Non-blank lines are parsed with ``tb.string_trees``, given a 'ROOT'
    label, rendered with ``tree_string`` and written stripped, one per line.
    Blank lines are passed through as empty lines followed by a flush.

    Output is suppressed while ``nskip`` is positive; ``nskip`` drops by one
    at each blank input line.

    Parameters:
        inf: iterable of input lines.
        outf: writable file-like object (defaults to ``sys.stdout``).
        nskip: number of blank-line-terminated groups to skip.
    """
    for line in inf:
        line = line.strip()
        if not line:
            # Blank line: group boundary.
            if nskip <= 0:
                outf.write('\n')
                outf.flush()
            nskip -= 1
            continue
        if nskip <= 0:
            trees = tb.string_trees(line)
            trees.insert(0, 'ROOT')
            outf.write(tree_string(trees).strip())
            outf.write('\n')
        # A second, unconditional parse of the line used to follow here; its
        # result was never used, so it has been dropped.
コード例 #5
0
ファイル: trees-words.py プロジェクト: boerschi/artlangseg
def read_write(inf, outf=sys.stdout, nskip=0):
    """Write the word string of every tree-format line in ``inf`` to ``outf``.

    Lines containing the substring "lexentry" are ignored entirely.  Other
    non-blank lines are parsed with ``tb.string_trees``, given a 'ROOT'
    label, rendered with ``tree_string`` and written stripped, one per line.
    Blank lines are passed through as empty lines followed by a flush.

    Output is suppressed while ``nskip`` is positive; ``nskip`` drops by one
    at each blank input line.

    Parameters:
        inf: iterable of input lines.
        outf: writable file-like object (defaults to ``sys.stdout``).
        nskip: number of blank-line-terminated groups to skip.
    """
    for line in inf:
        line = line.strip()
        # HACK: exclude the spurious "lexentry" entries.
        if "lexentry" in line:
            continue

        if line:
            if nskip <= 0:
                trees = tb.string_trees(line)
                trees.insert(0, 'ROOT')
                outf.write(tree_string(trees).strip())
                outf.write('\n')
        else:
            if nskip <= 0:
                outf.write('\n')
                outf.flush()
            nskip -= 1
        # The unconditional re-parse that used to end the loop body produced
        # a value nobody read; it has been removed.
コード例 #6
0
    def setUp(self):
        """Build the tree fixtures and the two evaluation tables used by the tests.

        Parses two bracketed tree strings with ``tb.string_trees`` and stores
        the results as ``self.gs`` and ``self.ps``.  Then runs ``EvalParse``
        twice on ``(self.ps, self.gs)`` -- once with default settings and once
        with ``evaluate_word_coverage=True`` -- keeping both the evaluator
        objects and the values returned by their ``table()`` methods.
        """
        # NOTE(review): "g"/"p" presumably stand for gold and predicted
        # trees -- confirm against EvalParse's argument order.
        gstr = """(S (EDITED (NP (EX there)) (, ,)) 
                 (NP (EX there)) 
                 (VP (BES 's) (NP (DT no) (NN way))) (. .))
              (S (CC and) (, ,) (INTJ (UH uh)) 
                 (PRN (, ,) 
                      (S (NP (PRP you)) (VP (VBP know))) (, ,)) 
                 (NP (DT all))) 
              (S (EDITED (EDITED (EDITED (S (NP (EX There)) (VP (BES 's))) (, ,)) 
                                            (NP (EX there)) (, ,)) (NP (DT th-)) (, ,)) 
                 (NP (DT this) (NN topic)) 
                 (VP (VBZ is) (ADJP (ADVP (RB kind) (RB of)) (TYPO (JJ mute))) (. .) 
                 (INTJ (UH Uh))))
           """
        # Second set of three trees; the first two differ from gstr in their
        # bracketing/labels (e.g. no EDITED node, "S1" label), the third is
        # identical.
        pstr = """(S (NP (EX there)) 
                 (, ,) 
                 (NP (EX there)) 
                 (VP (BES 's) (NP (DT no) (NN way))) (. .))
              (S1 (CC and) (, ,) (INTJ (UH uh)) (, ,)
                 (PRN (S (NP (PRP you)) (VP (VBP know)))) 
                 (, ,)
                 (NP (DT all))) 
              (S (EDITED (EDITED (EDITED (S (NP (EX There)) (VP (BES 's))) (, ,)) 
                                            (NP (EX there)) (, ,)) (NP (DT th-)) (, ,)) 
                 (NP (DT this) (NN topic)) 
                 (VP (VBZ is) (ADJP (ADVP (RB kind) (RB of)) (TYPO (JJ mute))) (. .) 
                 (INTJ (UH Uh))))
           """
        # Parsed forms of the two fixture strings.
        self.gs = tb.string_trees(gstr)
        self.ps = tb.string_trees(pstr)

        # Evaluation with EvalParse's default options.
        self.e_no_words = EvalParse()
        self.e_no_words(self.ps, self.gs)
        self.e_no_words_tbl = self.e_no_words.table()
        # print(self.e_no_words_tbl)

        # Same evaluation with word-coverage evaluation switched on.
        self.e_words = EvalParse(evaluate_word_coverage=True)
        self.e_words(self.ps, self.gs)
        self.e_words_tbl = self.e_words.table()
コード例 #7
0
def getAnalysis(sTree, wCat, sCat, wordSep, segSep, sylSep, noLabel):
    """
        Return the yield of a tree string as a single wordSep-joined string.

        The tree string is parsed with tb.string_trees, wrapped under a
        'ROOT' label and handed to visitTree together with an empty list
        that collects the words.  As described by the original
        documentation:
          - segments carry a "_C" or "_V" (consonant or vowel) index when
            noLabel is false
          - syllables are separated by sylSep
          - segments are separated by segSep
          - words are separated by wordSep
    """
    parsed = tb.string_trees(sTree)
    parsed.insert(0, 'ROOT')
    collected = []
    visitTree(parsed, collected, wCat, sCat, segSep, sylSep, noLabel)
    joined = wordSep.join(collected)
    return joined
コード例 #8
0
def readwords(fname, wordrex, wordtype_word_fname_count):
    """Accumulate word counts from the tree lines of file ``fname``.

    Only lines whose first character is '(' are considered.  Each such line
    must parse (via ``tb.string_trees``) to exactly one tree that is a list;
    the tree's label (element 0) is matched against ``wordrex``.  On a match,
    group 1 is taken as the word type, group 2 as an integer count, and the
    word is the concatenation of the tree's terminals.  The count is added
    with ``lx.incr3(wordtype_word_fname_count, wordtype, word, fname, count)``.

    Parameters:
        fname: path of the file to read.
        wordrex: compiled regex with at least two groups (type, count).
        wordtype_word_fname_count: counter structure updated in place.

    Raises:
        AssertionError: if a tree line does not hold exactly one list tree.
    """
    import sys

    # Python 2 needs "U" for universal newlines; Python 3 text mode has them
    # by default and rejects the "U" flag from 3.11 onwards.
    mode = "rU" if sys.version_info[0] < 3 else "r"
    # open() + with replaces the Python-2-only file() builtin and guarantees
    # the handle is closed, even if parsing raises.
    with open(fname, mode) as inf:
        for line in inf:
            if not line.startswith('('):
                continue
            trees = tb.string_trees(line.strip())
            assert(len(trees) == 1)
            tree = trees[0]
            assert(isinstance(tree, list))
            label = tree[0]
            mo = wordrex.match(label)
            if mo:
                wordtype = mo.group(1)
                wordcount = int(mo.group(2))
                word = ''.join(tb.terminals(tree))
                lx.incr3(wordtype_word_fname_count, wordtype, word, fname, wordcount)
コード例 #9
0
def read_data(lines, tree_flag, score_cat_rex=None, ignore_terminal_rex=None, word_split_rex=None, debug_level=0):
    """Turn input lines into unsegmented strings plus their position data.

    Lines containing the substring "lexentry" are skipped.  In tree mode the
    remaining lines are parsed with ``tb.string_trees``, wrapped under a
    'ROOT' label and converted with ``tree_string``; in flat mode they are
    split with ``word_split_rex`` and empty pieces, as well as pieces matched
    by ``ignore_terminal_rex``, are dropped.

    Returns ``(segments, stringpos)``: the concatenated words of each kept
    line, and the ``words_stringpos`` result for each kept line.
    """
    words = []
    if not tree_flag:
        # Flat format.
        for line in lines:
            if "lexentry" in line:
                continue
            kept = []
            for token in word_split_rex.split(line):
                if token == '' or ignore_terminal_rex.match(token):
                    continue
                kept.append(token)
            words.append(kept)
    else:
        # Tree format.
        for line in lines:
            if "lexentry" in line:
                continue
            trees = tb.string_trees(line)
            trees.insert(0, 'ROOT')
            line_words = tree_string(trees, score_cat_rex, ignore_terminal_rex)
            words.append(line_words)
            if debug_level >= 1000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (line, line_words))

    segments = []
    for ws in words:
        segments.append(''.join(ws))

    if debug_level >= 10000:
        # Dump the first entry of each stage for inspection.
        for template, stage in (("lines[0] = %s\n", lines),
                                ("words[0] = %s\n", words),
                                ("segments[0] = %s\n", segments)):
            sys.stderr.write(template % stage[0])

    stringpos = []
    for ws in words:
        stringpos.append(words_stringpos(ws))
    return (segments, stringpos)
コード例 #10
0
def read_data(inf, tree_flag, types_flag, score_cat_rex, ignore_terminal_rex, word_split_rex, debug_level=0, max_lines=0):
    """Load lines from ``inf`` (tree or flat format) as words and positions.

    Every line is stripped first.  If ``max_lines`` is positive only the
    first ``max_lines`` lines are kept, and a warning is written to stderr
    when fewer lines than that are available.  In tree mode each line is
    parsed and converted with ``tree_words``; in flat mode it is split with
    ``word_split_rex`` and empty pieces are dropped.

    When ``types_flag`` is true, only the first line producing each distinct
    concatenated string is kept.

    Returns ``(sentences, stringpos)``: the concatenated words of each kept
    line, and the ``words_stringpos`` result for each kept line.
    """
    stripped = []
    for raw in inf:
        stripped.append(raw.strip())

    if max_lines > 0:
        available = len(stripped)
        if available < max_lines:
            sys.stderr.write("Warning: max_lines=%d, len(strippedlines) = %d\n" % (max_lines, available))
        stripped = stripped[:max_lines]

    lines0 = []
    if not tree_flag:
        # Flat format: split and discard empty pieces.
        for text in stripped:
            lines0.append([piece for piece in word_split_rex.split(text) if piece != ""])
    else:
        # Tree format: parse, add the root label, then extract the words.
        for text in stripped:
            trees = tb.string_trees(text)
            trees.insert(0, 'ROOT')
            extracted = tree_words(trees, score_cat_rex, ignore_terminal_rex)
            lines0.append(extracted)
            if debug_level >= 10000:
                sys.stderr.write("# line = %s,\n# words = %s\n" % (text, extracted))

    if not types_flag:
        lines = lines0
    else:
        # Keep one representative per distinct concatenated string.
        seen = set()
        lines = []
        for ws in lines0:
            key = ''.join(ws)
            if key in seen:
                continue
            seen.add(key)
            lines.append(ws)

    sentences = []
    for ws in lines:
        sentences.append(''.join(ws))
    stringpos = []
    for ws in lines:
        stringpos.append(words_stringpos(ws))
    return (sentences, stringpos)