def __init__(self, lexcats=('N', 'P', 'V', 'A', 'ADV', 'C', 'D', 'PRO'),
             cmpdfile=None, lefffdir=None, subcatfile=None):
    """Set up the compound handler.

    lexcats    -- lexical category labels used to spot compounds; copied
                  into a fresh list so that instances never share (or
                  mutate) the default value
    cmpdfile   -- optional path handed to self.load_cmpdfile()
    lefffdir   -- optional location handed to Lefff(); the lexicon is used
                  to infer the subcat attribute of compound components
    subcatfile -- optional path handed to self.load_subcatfile()
    """
    # lexical categories (to spot compounds)
    self.lexcats = list(lexcats)

    # lefff : used to infer the subcat attribute for components of compounds
    if lefffdir is not None:
        self.lefff = Lefff(lefffdir)
    else:
        self.lefff = None

    # allowed list of compounds
    self.allowed_cmpds = {}
    if cmpdfile is not None:
        self.load_cmpdfile(cmpdfile)

    # known subcats
    # dict : key = form, val = dict : key2 = cat, val = subcat
    self.form2cat2subcat = {}
    if subcatfile is not None:
        self.load_subcatfile(subcatfile)

    # new compounds to systematically recognize
    # (limited to the cases where the form spans a whole component)
    # search for a VN, with yield='il y a', and replace everything
    # under the VN by (V il_y_a)
    self.new_compounds = [('il y a', 'VN', 'V')]
# Excerpt of a command-line script: reads lines from stdin and asks the Lefff
# lexicon for the de-inflected form of every token.
# NOTE(review): `opts`, `args` and `usage` are defined before this excerpt,
# and the loop body continues after it (ntoken is not used within view).

# Normalise the parsed option values to plain Python types.
inputformat = str(opts.inputformat)
token_rank = int(opts.token_rank)
append = bool(opts.append)
outtable = str(opts.outtable)
clusternum = bool(opts.clusternum)
ignoreP2 = bool(opts.ignoreP2)
lowerfirstword = bool(opts.lowerfirstword)
freezetokennumber = bool(opts.freezetokennumber)
serializedlex = bool(opts.serializedlex)
# The first positional argument is mandatory: the lexicon file or directory.
if (len(args) > 0):
    lefffloc = args[0]
else:
    exit(usage+'\n Missing LEFFF_FILE_OR_DIR argument!\n')
lefff = Lefff(lefffloc, clusternum, ignoreP2, lowerfirstword, serialized_input=serializedlex)
inputstream = sys.stdin
line = inputstream.readline()
isnotfirst = 0
while line:
    # drop the last character of the line (presumably the trailing newline)
    line = line[0:-1]
    if inputformat == 'tok':
        tokens = line.split(' ')
        # every line starts a new sentence
        isnotfirst = 0
        for token in tokens:
            ntoken = lefff.get_desinflected_form(token, isnotfirst)
            # the next token won't be considered the first of the sentence, unless only punctuations were encountered
            if isnotfirst == 0 and not (lefff.is_ponct_form(token)):
                isnotfirst = 1
# Excerpt of a command-line script: declares the last options, parses the
# command line, then de-inflects the tokens of every input line.
# NOTE(review): `parser` is created before this excerpt, and the loop body
# continues after it (ntoken is not used within view).
parser.add_option("--clusternum",action="store_true", dest="clusternum",default=False,help="If set, changes any numerical expression ([0-9\.,]+) into NUMEXPR. Default = False")
# NOTE(review): the next two options combine action="store_true" with
# default=True, so their value is True whether or not the flag is given.
parser.add_option("--ignoreP2",action="store_true", dest="ignoreP2",default=True,help="If set, forms corresponding to second persons verbs are ignored unless specific second person pronoun found. Default = True")
parser.add_option("--lowerfirstword",action="store_true", dest="lowerfirstword",default=True,help="If set, try to lower first word if unknown. Default = True")
(opts,args) = parser.parse_args()
# Normalise the parsed option values to plain Python types.
lefffloc = str(opts.lefffloc)
inputformat = str(opts.inputformat)
outtable = str(opts.outtable)
clusternum = bool(opts.clusternum)
ignoreP2 = bool(opts.ignoreP2)
lowerfirstword = bool(opts.lowerfirstword)
# Read from stdin unless a file is given as first positional argument.
inputstream = sys.stdin
if (len(args) > 0):
    inputstream = open(args[0])
lefff = Lefff(lefffloc, clusternum, ignoreP2, lowerfirstword)
#lefff.read_dir_or_file(lefffloc)
line = inputstream.readline()
while line:
    # drop the last character of the line (presumably the trailing newline)
    line = line[0:-1]
    if inputformat == 'raw':
        tokens = line.split(' ')
        for i, token in enumerate(tokens):
            # if token is not first word, or token is all in low cap
            if i > 0 or token.lower()==token:
                # try to get an already disinflected form
                if token in lefff.form2dflform:
                    ntoken = lefff.form2dflform[token]
                else:
                    # the token index doubles as the "is not first" flag
                    ntoken = lefff.desinflect(token,isnotfirst=i)
# Excerpt of a command-line script that reads a tagged stream token by token.
# NOTE(review): `coldefstr`, `tagged_file`, `lefffdir`, `serializedlex`,
# `fieldsep` and `tokensep` are defined before this excerpt, and the body of
# the reading loop continues after it.
inputformat = 'onetokenperline'
if inputformat == 'onetokenperline':
    # column definition: dot-separated column indexes
    # (form, lemma, ftb4 category and, optionally, features)
    coldef = coldefstr.split('.')
    colform = int(coldef[0])
    collemma = int(coldef[1])
    colftb4cat = int(coldef[2])
    if len(coldef) > 3:
        colfeats = int(coldef[3])
    else:
        # was `colfeats == None`: a comparison with no effect, which left
        # colfeats unbound whenever only three columns were given
        colfeats = None

tagged_stream = sys.stdin if tagged_file is None else open(tagged_file)

# load the lefff lexicon
lefff = Lefff(lefffdir, serialized_input=serializedlex)
#lefff = load_serialized_lefff(lefffdir)

line = tagged_stream.readline()
isfirst = True
fieldseps = fieldsep + fieldsep + fieldsep
while line != '':
    # drop the last character of the line (presumably the trailing newline)
    line = line[0:-1]
    if inputformat == 'onesentenceperline':
        ltoks = line.split(tokensep)
        # morfette: the first token is empty... (alternative: for tok in ltoks[1:]:)
        for tok in ltoks:
            if len(tok) == 0:
                continue
            # split the token to retrieve form, lemma and category
class SetCompounds:
    """Systematically make compounds, according to a given list of allowed
    compounds (STILL TODO).

    Undo compounds that (i) have a known regular pattern, and (ii) aren't in
    the allowed compounds list.
    """

    def __init__(self, lexcats=('N', 'P', 'V', 'A', 'ADV', 'C', 'D', 'PRO'),
                 cmpdfile=None, lefffdir=None, subcatfile=None):
        """Set up the compound handler.

        lexcats    -- lexical category labels used to spot compounds; copied
                      into a fresh list so that instances never share (or
                      mutate) the default value
        cmpdfile   -- optional path handed to load_cmpdfile()
        lefffdir   -- optional location handed to Lefff(); the lexicon is
                      used to infer the subcat attribute of components
        subcatfile -- optional path handed to load_subcatfile()
        """
        # lexical categories (to spot compounds)
        self.lexcats = list(lexcats)

        # lefff : used to infer the subcat attribute for components of compounds
        if lefffdir is not None:
            self.lefff = Lefff(lefffdir)
        else:
            self.lefff = None

        # allowed list of compounds
        self.allowed_cmpds = {}
        if cmpdfile is not None:
            self.load_cmpdfile(cmpdfile)

        # known subcats
        # dict : key = form, val = dict : key2 = cat, val = subcat
        self.form2cat2subcat = {}
        if subcatfile is not None:
            self.load_subcatfile(subcatfile)

        # new compounds to systematically recognize
        # (limited to the cases where the form spans a whole component)
        # search for a VN, with yield='il y a', and replace everything
        # under the VN by (V il_y_a)
        self.new_compounds = [('il y a', 'VN', 'V')]

    def load_cmpdfile(self, cmpdfile):
        """Register every (right-stripped) line of cmpdfile in allowed_cmpds.

        If the file cannot be opened, a message is written to stderr and
        nothing is loaded.
        """
        try:
            instream = open(cmpdfile)
        except IOError:
            sys.stderr.write("Impossible to open "+cmpdfile)
            return
        # the context manager closes the file (it used to be left open)
        with instream:
            for l in instream:
                # get rid of new line
                self.allowed_cmpds[l.rstrip()] = 1

    def load_subcatfile(self, subcatfile):
        """Fill form2cat2subcat from a file of `cat<TAB>subcat<TAB>form` lines.

        Empty lines and lines that do not have this shape are skipped.  If
        the file cannot be opened, a message is written to stderr and nothing
        is loaded.
        """
        try:
            instream = open(subcatfile)
        except IOError:
            sys.stderr.write("Impossible to open "+subcatfile)
            return
        # the context manager closes the file (it used to be left open)
        with instream:
            for l in instream:
                # get rid of new line
                l = l.rstrip()
                if not l:
                    continue
                # a regex is used because a plain split did not work here:
                # the form (third field) is everything after the second tab
                m = re.match('([^\t]+)\t([^\t]+)\t(.*)$', l)
                if m is None:
                    continue
                cat = m.group(1)
                subcat = m.group(2)
                form = m.group(3)
                if form in self.form2cat2subcat:
                    self.form2cat2subcat[form][cat] = subcat
                else:
                    self.form2cat2subcat[form] = {cat: subcat}

    def is_compound(self, node):
        """True if node bears a lexical category and has several children."""
        return bool(node.label in self.lexcats
                    and not node.is_leaf()
                    and len(node.get_children()) > 1)

    def has_compound_child(self, tree):
        """True if at least one child of tree is a compound."""
        if tree.has_children():
            for x in tree.get_children():
                if self.is_compound(x):
                    return True
        return False

    def children_labels(self, tree):
        "Returns a space-separated string of the children labels"
        if not tree.has_children():
            return ""
        return " ".join([x.label for x in tree.get_children()])

    def labels(self, nodes):
        "Returns a space-separated string of the labels of the nodelist"
        return " ".join([x.label for x in nodes])

    def undo_compound(self, node):
        """If compound is not in allowed_cmpds, and has a known pattern,
        undo the compound.

        Returns the list of nodes that should replace the compound node;
        this is [node] itself whenever the compound is left untouched.
        """
        if not self.is_compound(node):
            return [node]
        cat = node.label
        # nothing to do if cat has no regular pattern
        if cat not in RegularCompoundPatterns:
            return [node]
        # nothing to do if this compound is allowed
        cmpd_str = self.children_labels(node)
        if cmpd_str in self.allowed_cmpds:
            return [node]
        # only nouns, prepositions and verbs can be rebuilt as a phrase;
        # any other category used to end in an unbound-variable error
        if not (cat == 'N' or cat == 'V' or self.isP(node)):
            return [node]

        for pattern in RegularCompoundPatterns[cat]:
            if not re.match(pattern+'$', cmpd_str):
                continue
            # guess the subcat information for these new nodes
            # (they're missing on compound components)
            for component in node.get_children():
                self.guess_subcat(component)
            print("trace:PATTERN: %s" % (cmpd_str,))
            print("trace:BEFORE:  %s" % (node.printf(),))
            tail = None
            if cat == 'N':
                (n, tail) = self.make_NP(node.get_children(), beforehead=True)
            elif self.isP(node):
                n = self.make_PP(node.get_children())
            else:
                n = self.make_VP(node.get_children())
            print("trace:AFTER:  %s" % (n.printf(),))
            print("trace:UNDONE: %s" % (node.get_compound_form(),))
            replacement = list(n.get_children())
            # make_NP hands back a coordinated PP separately: keep it,
            # otherwise its nodes would vanish from the tree
            if tail is not None:
                replacement.append(tail)
            return replacement
        return [node]

    def make_AP(self, nodes, cmpd_str, beforehead=True):
        """Build an AP over nodes, for the label sequence cmpd_str
        ('A' or 'A C A').

        A bare adjective located before the head is returned as is.
        """
        if beforehead and cmpd_str == 'A':
            # for preverbal bare adjectives
            return nodes[0]
        ap = LabelledTree('AP')
        ap.add_child(nodes[0])
        if cmpd_str == 'A C A':
            coord = LabelledTree('COORD')
            coord.add_child(nodes[1])
            a2 = self.make_AP([nodes[2]], 'A', beforehead=False)
            coord.add_child(a2)
            ap.add_child(coord)
        return ap

    def isP(self, node):
        """True if node is labelled as a preposition ('P' or 'P+D')."""
        return node.label in ['P', 'P+D']

    def make_NP(self, nodes, beforehead=True):
        """Build an NP over nodes.

        Returns [np, tail]: tail is a COORD of PPs that make_COORD built from
        a conjunction met in nodes (it is left to the caller to attach), or
        None.
        """
        np = LabelledTree('NP')
        tail = None
        while nodes:
            first = nodes[0]
            if first.label in ['D', 'N', 'ET']:
                np.add_child(first)
                if first.label == 'N':
                    beforehead = False
                nodes = nodes[1:]
            elif first.label == 'A':
                if len(nodes) > 2 and nodes[1].label == 'C' and nodes[2].label == 'A':
                    np.add_child(self.make_AP(nodes[0:3], 'A C A', beforehead))
                    nodes = nodes[3:]
                else:
                    np.add_child(self.make_AP([first], 'A', beforehead))
                    nodes = nodes[1:]
            # if a prep is encountered
            # => treat all remaining nodes as a whole PP
            # (closest attachment preferred)
            # (unhandled case : N1 (P N2) others : where others attaches to N1)
            elif self.isP(first):
                np.add_child(self.make_PP(nodes))
                nodes = []
            elif first.label == 'C':
                (coord, kind) = self.make_COORD(nodes)
                nodes = []
                if kind == 'PP':
                    tail = coord
                else:
                    np.add_child(coord)
            else:
                # any other label: attach the node unchanged and move on
                # (the loop used to spin forever on such a node)
                np.add_child(first)
                nodes = nodes[1:]
        return [np, tail]

    def make_PP(self, nodes):
        """Build a PP: the preposition nodes[0], then an NP over the rest."""
        pp = LabelledTree('PP')
        # prep is supposed to be the first node
        pp.add_child(nodes[0])
        [np, tail] = self.make_NP(nodes[1:])
        pp.add_child(np)
        if tail is not None:
            pp.add_child(tail)
        return pp

    def make_VP(self, nodes):
        """Build a VP: the verb nodes[0], then a PP if a preposition follows.

        Otherwise the nodes are kept flat under the VP.
        """
        vp = LabelledTree('VP')
        # V is supposed to be the first node
        vp.add_child(nodes[0])
        if len(nodes) > 1 and self.isP(nodes[1]):
            vp.add_child(self.make_PP(nodes[1:]))
        else:
            # otherwise backtrack: keep all the nodes as direct children
            vp.children = nodes
        return vp

    # coord node, for either NP conjunct or PP conjunct
    # (AP handled separately)
    def make_COORD(self, nodes):
        """Build a COORD node: the conjunction nodes[0], then its conjunct.

        Returns [coord, kind] where kind is 'PP' or 'NP'.
        """
        coord = LabelledTree('COORD')
        # conjunction is supposed to be the first node
        coord.add_child(nodes[0])
        rest = nodes[1:]
        # if C P ... => coordination of PPs
        if rest and self.isP(rest[0]):
            coord.add_child(self.make_PP(rest))
            kind = 'PP'
        # otherwise = coordination of NPs (APs handled differently)
        else:
            kind = 'NP'
            if rest:
                # make_NP returns [np, tail]: attach the nodes themselves
                # (the whole list used to be added as a single child)
                (np, tail) = self.make_NP(rest)
                coord.add_child(np)
                if tail is not None:
                    coord.add_child(tail)
        return [coord, kind]

    def undo_compounds(self, tree):
        "Walk the whole tree, undoing some compounds"
        if not tree.has_children():
            return
        # iterate on children
        # (allowing the remaining children list to change during this loop)
        j = 0
        while j < len(tree.get_children()):
            child = tree.children[j]
            j = j+1
            if not child.has_children():
                continue
            if not self.has_compound_child(child):
                continue
            new = []
            # iterate on grand children, building the new children list
            for gchild in child.get_children():
                if not self.is_compound(gchild):
                    new.append(gchild)
                    continue
                # returns either a list containing the compound (not undone)
                # or a list of several nodes : the compound node (gchild)
                # should be replaced by this list
                n = self.undo_compound(gchild)
                # if child=VN and the compound is a V, the sisters of the V
                # must be moved out of the VN:
                # (VN (V (V mettre) (P en) (N place)))
                #   ==> (VN (V mettre)) (PP (P en) (NP (N place)))
                if len(n) > 1 and child.label == 'VN' and gchild.label == 'V':
                    new = new + [n[0]]
                    # j already points right after child
                    tree.children = tree.children[0:j] + n[1:] + tree.children[j:]
                else:
                    new = new + n
            child.children = new
        # recurse on the (possibly updated) children
        for child in tree.get_children():
            self.undo_compounds(child)

    def guess_subcat(self, node):
        """Guess and add subcat information for that node (used for nodes
        that are components of a compound that is undone here).

        Nothing is set for ET, P and P+D nodes, nor for a verb form for which
        the lefff provides mood features (these are only traced).
        """
        cat = node.label
        # no subcats for these
        if cat in ['ET', 'P', 'P+D']:
            return
        # verbs : get at least mood and tense
        # (skipped when no lexicon is loaded, which used to crash)
        if cat == 'V' and self.lefff is not None:
            morphs = self.lefff.get_morphs(node.get_form())
            if morphs is not None:
                for morph in morphs:
                    feats = self.lefff.get_features(morph)
                    if 'mood' in feats:
                        # !! liable to change here !!
                        print("trace:VERBAL FEATS: "+' '.join([str(x[0])+':'+str(x[1]) for x in feats.items()]))
                        return
        subcat = ''
        form = node.get_form()
        # if known in the treebank (i.e. in the subcatfile)
        if form in self.form2cat2subcat and cat in self.form2cat2subcat[form]:
            subcat = self.form2cat2subcat[form][cat]
        elif re.match('[0-9,]+$', form):
            subcat = 'int'
        elif cat == 'N':
            # if lower case string => common noun
            if form.islower():
                subcat = 'C'
            elif self.lefff is not None:
                # if known in lefff, as such => proper noun
                if self.lefff.form_is_known(form):
                    subcat = 'P'
                # else, if the lowered form is known => common noun
                elif self.lefff.form_is_known(form.lower()):
                    subcat = 'C'
                else:
                    subcat = 'P?'
        elif cat == 'A':
            subcat = 'qual?'
        node.set_feature(subcat)
        if subcat == '':
            print("trace:UNKNOWN subcat for : %s   %s" % (form, cat))
        else:
            print("trace:SUBCAT: %s   %s   %s" % (form, cat, subcat))

    def do_compounds(self, tree):
        """ systematically recognizes new_compounds """
        if not tree.has_children():
            return
        if len(tree.get_children()) > 1:
            for (form, nt_cat, pos) in self.new_compounds:
                if tree.label != nt_cat:
                    continue
                y = tree.tree_yield_str()
                if y.lower() == form:
                    # replace children with a compound, of specified cat
                    n = LabelledTree(pos)
                    n.add_child(LabelledTree(y.replace(' ', '_')))
                    tree.children = [n]
                    return
        for child in tree.get_children():
            self.do_compounds(child)