-
Notifications
You must be signed in to change notification settings - Fork 0
/
negra.py
79 lines (56 loc) · 1.88 KB
/
negra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# Copyright (C) 2007-2011 Franco M. Luque
# URL: <http://www.cs.famaf.unc.edu.ar/~francolq/>
# For license information, see LICENSE.txt
import itertools
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
import treebank
def is_ellipsis(s):
    """Return True if the string *s* begins with ``'*'``.

    The tree and treebank classes in this module forward their
    ``is_ellipsis`` methods to this function.  An earlier, narrower variant
    of the check matched only the ``'*T'`` prefix; here every string that
    starts with ``'*'`` is accepted.

    :param s: the string to test.
    :return: True when the first character of *s* is ``'*'``; False
        otherwise, including for the empty string.
    """
    # Slice instead of indexing s[0] so that an empty string yields False
    # rather than raising IndexError.
    return s[:1] == '*'
def is_punctuation(s):
    """Return True if the string *s* begins with ``'$'``.

    The tree and treebank classes in this module forward their
    ``is_punctuation`` methods to this function.
    NOTE(review): presumably *s* is a NEGRA tag and the punctuation tags are
    the ones starting with ``'$'`` -- confirm against the corpus tagset.

    :param s: the string to test.
    :return: True when the first character of *s* is ``'$'``; False
        otherwise, including for the empty string.
    """
    # Slice instead of indexing s[0] so that an empty string yields False
    # rather than raising IndexError.
    return s[:1] == '$'
class NegraTree(treebank.Tree):
    """Subclass of ``treebank.Tree`` used for trees of the NEGRA corpus.

    It defines the ``is_ellipsis`` and ``is_punctuation`` predicates by
    forwarding to the module-level functions of the same names.
    """

    def is_ellipsis(self, s):
        """Return the module-level ``is_ellipsis(s)`` result for *s*."""
        # Inside a method body the bare name resolves to the module-level
        # function, not to this method.
        verdict = is_ellipsis(s)
        return verdict

    def is_punctuation(self, s):
        """Return the module-level ``is_punctuation(s)`` result for *s*."""
        # Same name resolution as above: this calls the module function.
        verdict = is_punctuation(s)
        return verdict
class Negra(treebank.SavedTreebank):
    """Treebank over the NEGRA corpus in bracketed format.

    Trees are read from the file ``negra-corpus2.penn`` inside *basedir*
    through NLTK's ``BracketParseCorpusReader``, using ``'%'`` as the
    comment character.
    """

    # Directory used when the constructor is called without a basedir.
    default_basedir = 'negra-corpus'
    # NOTE(review): class-level list, shared by every instance until it is
    # rebound; presumably managed by treebank.SavedTreebank -- confirm.
    trees = []
    # NOTE(review): presumably the file name treebank.SavedTreebank uses to
    # persist the trees -- confirm.
    filename = 'negra.treebank'

    def __init__(self, basedir=None):
        """Store the base directory and create the corpus reader.

        :param basedir: directory that contains ``negra-corpus2.penn``;
            ``default_basedir`` is used when this is None.
        """
        if basedir is None:
            basedir = self.default_basedir
        self.basedir = basedir
        self.reader = BracketParseCorpusReader(
            basedir, 'negra-corpus2.penn', comment_char='%')

    def parsed(self, files=None):
        """Yield each parsed sentence of the corpus as a ``NegraTree``.

        Every tree is constructed with ``labels`` set to its 0-based
        position in the sequence returned by the reader.

        :param files: accepted but not used by this implementation.
        """
        # Reads straight from self.reader instead of delegating to
        # treebank.SavedTreebank.parsed(self, files).
        # enumerate() yields the same (index, tree) pairs as
        # itertools.izip(itertools.count(), ...) and is not Python-2-only.
        for i, t in enumerate(self.reader.parsed_sents()):
            yield NegraTree(t, labels=i)

    def get_tree(self, offset=0):
        """Return the tree at 0-based position *offset*.

        :raises IndexError: if there is no tree at *offset*.
        """
        return self.get_trees2(offset, offset + 1)[0]

    def get_trees2(self, start=0, end=None):
        """Return the list of trees at positions i with start <= i < end.

        With ``end=None`` every tree from *start* onwards is returned.
        Each call iterates a fresh ``parsed()`` generator from the start.
        """
        return list(itertools.islice(self.parsed(), start, end))

    def is_ellipsis(self, s):
        """Return the module-level ``is_ellipsis(s)`` result for *s*."""
        return is_ellipsis(s)

    def is_punctuation(self, s):
        """Return the module-level ``is_punctuation(s)`` result for *s*."""
        return is_punctuation(s)
def test():
    """Create a ``Negra`` treebank with its default base directory, call
    ``get_trees()`` on it, and return the treebank object."""
    corpus = Negra()
    # The returned trees are not used here; get_trees() is not defined in
    # this file, so the call is kept for whatever loading it performs.
    corpus.get_trees()
    return corpus
"""
NEGRA PREPROCESSING (how negra-corpus2.penn is generated from negra-corpus.penn):
>>> f = open('negra-corpus/negra-corpus.penn')
>>> g = open('negra-corpus/negra-corpus2.penn', 'w')
>>> for l in f:
... if l[0] == '(':
... l = '(ROOT'+l[1:]
... g.write(l)
...
>>> f.close()
>>> g.close()
"""