def create_tree(self, zone=None):
    """Create an empty tree, register it in ``self.trees`` and return its root.

    The new root is a ``Node`` with ``ord`` 0 and an empty list of
    descendants; ``zone`` is passed on to the root's ``set_zone``.
    """
    new_root = Node()
    new_root.ord = 0
    new_root._aux['descendants'] = []
    new_root.set_zone(zone)

    # Register the tree with its owner before handing it back to the caller.
    self.trees.append(new_root)
    return new_root
def load(self, args):
    """Read CoNLL-U text and append one ``Bundle`` per sentence to ``self.bundles``.

    Args:
        args: mapping with either ``'filehandle'`` (an already opened byte
            stream, which is left open for the caller) or ``'filename'``
            (a path that is opened here and closed again before returning).
            The input is decoded as UTF-8.

    Raises:
        KeyError: if ``args`` has neither ``'filehandle'`` nor ``'filename'``.
        IndexError: if a token line has fewer columns than
            ``Document.attrnames`` or a head index points outside the sentence.

    Lines starting with ``#`` are collected as the comment of the next
    sentence, multiword-token lines (``1-2 ...``) are skipped, and a blank
    line terminates the current sentence.  Blank lines with no pending
    sentence are ignored, and a final sentence that is not followed by a
    blank line is still finished properly.
    """
    try:
        raw = args['filehandle']
        opened_here = False
    except KeyError:
        # codecs stream readers decode bytes, so open the file in binary mode.
        raw = open(args['filename'], 'rb')
        opened_here = True

    def attach_sentence(sentence_nodes):
        # Hook every token under its head and publish the tokens on the root.
        # The head is used as a position in the list (root at 0), which
        # presumably relies on consecutive token ids -- TODO confirm.
        tokens = sentence_nodes[1:]
        sentence_nodes[0]._aux['descendants'] = tokens
        for token in tokens:
            token.set_parent(sentence_nodes[token.head])

    try:
        fh = codecs.getreader('utf8')(raw)
        nodes = []
        comment_lines = []
        for line in fh:
            if line.startswith('#'):
                comment_lines.append(line)
            elif re.match(r'\d+-', line):
                # HACK: multiword tokens temporarily avoided
                continue
            elif line.strip():
                if not nodes:
                    bundle = Bundle()
                    self.bundles.append(bundle)
                    # TODO: replace with bundle.create_tree once it is debugged
                    root = Root()
                    # TODO: store the comment somewhere more appropriate
                    root._aux['comment'] = ''.join(comment_lines)
                    nodes = [root]
                    bundle.trees.append(root)

                columns = line.strip().split('\t')
                node = Node()
                nodes.append(node)

                # Indexing (rather than zip) keeps the IndexError on short rows.
                for index, attrname in enumerate(Document.attrnames):
                    setattr(node, attrname, columns[index])

                try:
                    # TODO: find out where the underscores in this column come from
                    node.head = int(node.head)
                except ValueError:
                    node.head = 0

                try:
                    # TODO: handle multiword tokens properly
                    node.ord = int(node.ord)
                except ValueError:
                    node.ord = 0
            elif nodes:
                # A blank line ends the current sentence.
                attach_sentence(nodes)
                nodes = []
                comment_lines = []
            # else: stray blank line with no sentence in progress -- ignore it.

        if nodes:
            # The input ended without the terminating blank line.
            attach_sentence(nodes)
    finally:
        if opened_here:
            raw.close()