def add_lexical_relations(self, table, restriction=None,
                          table_type="voisins", stop_words=None):
    """Precompute lexical relations for each token in the corpus,
    according to the provided table.

    table should be indexed on a combo of lemma/pos/relation type and
    yield a real value.

    Parameters:
        table: lexical resource queried by the lookup helpers.
        restriction: POS categories to consider; defaults to ["N", "V"].
        table_type: "voisins" or "synos" -- selects the lookup helper.
        stop_words: set of lemmas excluded from the lookup; defaults to empty.

    Side effects: (re)builds self._voisins, self._vsn_domain and
    self._ranked_vsn; exits the process on an unknown table_type.

    TODO:
        x- each word should be there with itself at similarity = 1.0
        - if no restriction, all related tokens are extracted; should be
          adjusted wrt type and location:
            - tokens in same text
            - tokens in corpus/subcorpus
            - type: part of speech, grammatical relations if available
    """
    # Fresh defaults per call -- avoids the shared-mutable-default pitfall
    # of restriction=["N", "V"] / stop_words=set([]) in the signature.
    if restriction is None:
        restriction = ["N", "V"]
    if stop_words is None:
        stop_words = set()
    self._voisins = defaultdict(list)
    self._vsn_domain = restriction
    for doc in self._docs.values():
        # Lemmas of tokens in the allowed POS categories, skipping lemmas
        # already looked up while processing a previous document.
        # (membership test replaces deprecated dict.has_key; it does not
        # insert defaults into the defaultdict)
        vocab = set(x.lemma() for x in doc._prep._tokens.values()
                    if x.simple_mp_cat() in restriction
                    and x.lemma() not in self._voisins)
        vocab = vocab - stop_words
        print >> sys.stderr, "looking up additional %d words" % len(vocab)
        if table_type == "voisins":
            self._voisins.update(get_voisins_dict(table, vocab))
        elif table_type == "synos":
            self._voisins.update(get_syno_norm_dict(vocab, table))
        else:
            print >> sys.stderr, "unimplemented lexical resource type, use 'voisins' or 'synos'", table_type
            # Error condition: exit non-zero (was sys.exit(0), which
            # reports success to the calling shell).
            sys.exit(1)
        # Each word is its own neighbour with maximal similarity.
        for word in vocab:
            self._voisins[word].append((word, 1.0))
    # Also store rank of neighbours in each other's list:
    # neighbour -> 1/(rank+1), neighbours ordered by decreasing score
    # (sorting on (-score, word) breaks ties alphabetically).
    self._ranked_vsn = {}
    for entry in self._voisins:
        ordered = sorted((-s, w) for (w, s) in self._voisins[entry])
        self._ranked_vsn[entry] = dict(
            (pair[1], 1.0 / (i + 1)) for (i, pair) in enumerate(ordered))
        self._voisins[entry] = dict(self._voisins[entry])
def add_lexical_relations(self, table, restriction=None,
                          table_type="voisins", stop_words=None):
    """Precompute lexical relations for each token in the corpus,
    according to the provided table.

    table should be indexed on a combo of lemma/pos/relation type and
    yield a real value.

    Parameters:
        table: lexical resource queried by the lookup helpers.
        restriction: POS categories to consider; defaults to ["N", "V"].
        table_type: "voisins" or "synos" -- selects the lookup helper.
        stop_words: set of lemmas excluded from the lookup; defaults to empty.

    Side effects: (re)builds self._voisins, self._vsn_domain and
    self._ranked_vsn; exits the process on an unknown table_type.

    TODO:
        x- each word should be there with itself at similarity = 1.0
        - if no restriction, all related tokens are extracted; should be
          adjusted wrt type and location:
            - tokens in same text
            - tokens in corpus/subcorpus
            - type: part of speech, grammatical relations if available
    """
    # Mutable-default fix: build fresh objects per call instead of the
    # shared restriction=["N","V"] / stop_words=set([]) signature defaults.
    if restriction is None:
        restriction = ["N", "V"]
    if stop_words is None:
        stop_words = set()
    self._voisins = defaultdict(list)
    self._vsn_domain = restriction
    for doc in self._docs.values():
        # Lemmas in the allowed POS categories not yet looked up
        # (`not in` replaces deprecated has_key; no default insertion).
        vocab = set([x.lemma() for x in doc._prep._tokens.values()
                     if x.simple_mp_cat() in restriction
                     and x.lemma() not in self._voisins])
        vocab = vocab - stop_words
        print >> sys.stderr, "looking up additional %d words" % len(vocab)
        if table_type == "voisins":
            self._voisins.update(get_voisins_dict(table, vocab))
        elif table_type == "synos":
            self._voisins.update(get_syno_norm_dict(vocab, table))
        else:
            print >> sys.stderr, "unimplemented lexical resource type, use 'voisins' or 'synos'", table_type
            # This is an error path: exit with non-zero status
            # (was sys.exit(0), which signals success to the shell).
            sys.exit(1)
        # Each word is its own neighbour with maximal similarity.
        for word in vocab:
            self._voisins[word].append((word, 1.0))
    # Also store rank of neighbours in each other's list:
    # neighbour -> 1/(rank+1), ordered by decreasing similarity score.
    self._ranked_vsn = {}
    for entry in self._voisins:
        self._ranked_vsn[entry] = [
            (x[1], 1.0 / (i + 1)) for (i, x) in enumerate(
                sorted([(-s, w) for (w, s) in self._voisins[entry]]))]
        self._ranked_vsn[entry] = dict(self._ranked_vsn[entry])
        self._voisins[entry] = dict(self._voisins[entry])
# NOTE(review): script main section -- builds a FeatureMap either by merging
# feature files from the current directory (options.merge) or from a single
# basename given on the command line, indexes it on the two EDU ids, then
# optionally attaches preprocessing and a lexical-neighbour table before
# running each feature-extraction function in allfuncs over the document.
# NOTE(review): this fragment is truncated mid-call (feats.process(...)
# continues outside this view) and the `if True:` wrapper plus the
# unreachable usage/exit branch look like leftovers from a removed
# condition -- confirm against the full file before restructuring.
if options.merge: feats = FeatureMap("", empty=True) feats.init_from_dir(".", suffix=options.merge) basename = "no base file to consider" else: if True: basename = args[0] feats = FeatureMap(basename + ".features", weird=options.weird) feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2]) if allfuncs != {}: doc = annodisAnnot(basename + ".xml") prep = Preprocess(basename + ".txt.prep.xml") doc.add_preprocess(prep) if options.voisins: doc._voisins = get_voisins_dict(table, doc._vocab) for entry in doc._voisins: doc._voisins[entry] = dict(doc._voisins[entry]) else: print >> sys.stderr, "Usage: script file-basename ?", args sys.exit(0) if options.merge: feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2, "m#FILE"]) else: feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2]) for onename, onefunc in allfuncs.items(): feats.process(doc, onefunc, propagate=options.simple,
# NOTE(review): near-duplicate of the script main section above (same logic,
# spaced keyword arguments): builds/merges a FeatureMap, indexes on the two
# EDU ids (plus m#FILE when merging), optionally loads preprocessing and a
# voisins table, then applies every function in allfuncs via feats.process.
# NOTE(review): fragment is truncated at `if options.distance:` -- the body
# of that branch is outside this view; duplicated code should eventually be
# unified with the variant above once the full file is visible.
if options.merge: feats = FeatureMap("", empty = True) feats.init_from_dir(".", suffix = options.merge) basename = "no base file to consider" else: if True: basename = args[0] feats = FeatureMap(basename + ".features", weird = options.weird) feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2]) if allfuncs != {}: doc = annodisAnnot(basename + ".xml") prep = Preprocess(basename + ".txt.prep.xml") doc.add_preprocess(prep) if options.voisins: doc._voisins = get_voisins_dict(table, doc._vocab) for entry in doc._voisins: doc._voisins[entry] = dict(doc._voisins[entry]) else: print >> sys.stderr, "Usage: script file-basename ?", args sys.exit(0) if options.merge: feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2, "m#FILE"]) else: feats.index(["m#%s" % INDEX1, "m#%s" % INDEX2]) for onename, onefunc in allfuncs.items(): feats.process(doc, onefunc, propagate = options.simple, strand_orphans = options.strand_orphans) print >> sys.stderr, onename, " done" if options.distance: