def cluster_by_grs(target, source, env):
    """SCons builder: cluster verbs by their grammatical-relation profiles.

    Reads instances from source[0], builds a verb-by-GR count matrix,
    row-normalizes it, chooses the number of clusters with the gap statistic
    (R 'cluster' package: clusGap/maxSE), runs k-means (R 'stats'), and writes
    one line per cluster (space-separated verb lemmas) to target[0].

    Returns None (SCons builder convention).
    """
    import graphmod as gm
    args = source[-1].read()  # builder options; not used here but read for parity with siblings
    verb_map = {}  # verb lemma -> dense row index
    gr_map = {}    # grammatical relation -> dense column index
    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)
    # First pass: assign dense ids to every verb and GR that occurs.
    for ii in range(len(instances)):
        verb = instances.get_name("verb_lemma", instances.at(ii)["verb_lemma"][0])
        grs = [instances.get_name("gr", x) for x in instances.at(ii)["gr"]]
        verb_map.setdefault(verb, len(verb_map))
        for gr in grs:
            gr_map.setdefault(gr, len(gr_map))
    # Second pass: accumulate verb-by-GR co-occurrence counts.
    data = numpy.zeros(shape=(len(verb_map), len(gr_map)))
    for ii in range(len(instances)):
        verb = instances.get_name("verb_lemma", instances.at(ii)["verb_lemma"][0])
        verb_id = verb_map[verb]
        grs = [instances.get_name("gr", x) for x in instances.at(ii)["gr"]]
        for gr_id in [gr_map[x] for x in grs]:
            data[verb_id, gr_id] += 1
    # Normalize each row into a distribution over GRs.
    data = numpy.transpose(data.T / data.sum(1))
    # Gap statistic table: column 2 is the gap value, column 3 its standard error.
    tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
    gaps = tres[:, 2]
    err = tres[:, 3]
    best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
    res = stats.kmeans(numpy2ri(data), centers=best)
    verbs = dict([(v, k) for k, v in verb_map.iteritems()])  # row index -> verb lemma
    assignments = list(res.rx2("cluster"))  # fetch the R vector once, not per cluster
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(assignments):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(assignments) if a == c]) + "\n")
    finally:
        # BUG FIX: the handle was never closed, risking truncated/unflushed output.
        ofd.close()
    return None
def cluster_by_valex(target, source, env):
    """SCons builder: cluster verbs by VALEX subcategorization-frame counts.

    Collects the verbs present in source[0], then scans the VALEX lexicon
    directory (env["VALEX_LEXICON"], variant args["lexicon"]) for per-verb
    files, extracting (SCF class, FREQCNT) pairs.  Builds a row-normalized
    verb-by-SCF matrix, picks the cluster count via the gap statistic
    (R 'cluster' package), runs k-means (R 'stats'), and writes one line per
    cluster (space-separated verb lemmas) to target[0].

    Returns None (SCons builder convention).
    """
    import graphmod as gm
    args = source[-1].read()
    target_verbs = set()
    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)
    for vid in range(instances.get_size("verb_lemma")):
        target_verbs.add(instances.get_name("verb_lemma", vid))
    data = {}   # verb -> {scf id -> count}
    scfs = {}   # scf id -> total count across verbs
    verbs = {}  # verb -> total count across scfs
    lexicon_dir = "%s/lex-%s" % (env["VALEX_LEXICON"], args["lexicon"])
    for fname in sorted(glob(os.path.join(lexicon_dir, "*"))):
        verb = os.path.basename(fname).split(".")[0]
        if verb not in target_verbs:
            continue
        data[verb] = {}
        # BUG FIX: meta_open(fname).read() leaked one open handle per file.
        ifd = meta_open(fname)
        try:
            text = ifd.read()
        finally:
            ifd.close()
        for m in re.finditer(r":CLASSES \((.*?)\).*\n.*FREQCNT (\d+)", text):
            scf = int(m.group(1).split()[0])
            count = int(m.group(2))
            scfs[scf] = scfs.get(scf, 0) + count
            verbs[verb] = verbs.get(verb, 0) + count
            data[verb][scf] = count
    ddata = numpy.zeros(shape=(len(verbs), len(scfs)))
    verbs = sorted(verbs)  # rebound: sorted list of verb names, fixing row order
    scfs = sorted(scfs)    # rebound: sorted list of scf ids, fixing column order
    for row, verb in enumerate(verbs):
        for col, scf in enumerate(scfs):
            ddata[row, col] = data[verb].get(scf, 0)
    # Normalize each row into a distribution over SCFs.
    data = numpy.transpose(ddata.T / ddata.sum(1))
    # Gap statistic table: column 2 is the gap value, column 3 its standard error.
    tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
    gaps = tres[:, 2]
    err = tres[:, 3]
    best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
    res = stats.kmeans(numpy2ri(data), centers=best)
    assignments = list(res.rx2("cluster"))  # fetch the R vector once, not per cluster
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(assignments):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(assignments) if a == c]) + "\n")
    finally:
        # BUG FIX: the handle was never closed, risking truncated/unflushed output.
        ofd.close()
    return None
def run_scf(target, source, env): import graphmod as gm args = source[-1].read() parameters = args["parameters"] nodes = {} factors = {} variables = {} instances = gm.Instances() gm.load_instances(source[0].rstr(), instances) if "features" in args: instances.transform("argument", args["features"]) graph = gm.Graph() if args.get("model", "") == "multi": variables["alpha"] = gm.ContinuousVectorVariable(parameters["scf"], parameters["alpha"]) variables["beta"] = gm.ContinuousVectorVariable(instances.get_size("argument"), parameters["beta"]) vxf = [] for v in range(instances.get_size("verb_lemma")): variables["VERB%dxFRAME" % v] = gm.ContinuousVectorVariable([1.0 / parameters["scf"] for x in range(parameters["scf"])]) vxf.append(variables["VERB%dxFRAME" % v]) factors["VERB%dxFRAME" % v] = gm.DirichletMultinomial(variables["alpha"], variables["VERB%dxFRAME" % v]) variables["VERBxFRAME"] = gm.TiledContinuousVectorVariable(gm.ContinuousVectorVariableVector(vxf)) fxa = [] for f in range(parameters["scf"]): variables["FRAME%dxARG" % f] = gm.ContinuousVectorVariable([1.0 / instances.get_size("argument") for x in range(instances.get_size("argument"))]) vxf.append(variables["FRAME%dxARG" % f]) factors["FRAME%dxARG" % f] = gm.DirichletMultinomial(variables["beta"], variables["FRAME%dxARG" % f]) variables["FRAMExARG"] = gm.TiledContinuousVectorVariable(gm.ContinuousVectorVariableVector(fxa)) for instance in range(len(instances)): verb_id = instances.at(instance)("verb_lemma")[0] vname = "instance%d_verb" % (instance) variables[vname] = gm.DiscreteScalarVariable(verb_id) variables[vname].set_support(instances.get_size("verb_lemma")) variables[vname].set_name("verb") iname = "instance%d_scf" % (instance) variables[iname] = gm.DiscreteScalarVariable() variables[iname].set_support(parameters["scf"]) variables[iname].set_name("scf") fnameA = "instance%d_fA" % (instance) factors[fnameA] = gm.MultinomialCategorical(variables["VERB0xFRAME"], variables[iname]) #factors[fnameA] = 
gm.MultinomialCategorical(variables["VERB0xFRAME"], variables[vname], variables[iname]) for i, val in enumerate(instances.at(instance)("argument")): oname = "instance%d_observation%d" % (instance, i) variables[oname] = gm.DiscreteScalarVariable(val) variables[oname].set_support(instances.get_size("argument")) variables[oname].set_name("argument") fnameB = "instance%d_fB_%d" % (instance, i) factors[fnameB] = gm.MultinomialCategorical(variables["FRAME0xARG"], variables[oname]) #factors[fnameB] = gm.DirichletMultinomialMixture(variables["beta"], variables[iname], variables[oname]) if args.get("model", "") == "pymulti": variables["alpha"] = gm.ContinuousScalarVariable(parameters["alpha"]) variables["gamma"] = gm.ContinuousScalarVariable(parameters["gamma"]) variables["beta"] = gm.ContinuousVectorVariable(instances.get_size("argument"), parameters["beta"]) for instance in range(len(instances)): verb_id = instances.at(instance)("verb_lemma")[0] vname = "instance%d_verb" % (instance) variables[vname] = gm.DiscreteScalarVariable(verb_id) variables[vname].set_support(instances.get_size("verb_lemma")) variables[vname].set_name("verb") iname = "instance%d_scf" % (instance) variables[iname] = gm.DiscreteScalarVariable() variables[iname].set_support(parameters["scf"]) variables[iname].set_name("scf") fnameA = "instance%d_fA" % (instance) factors[fnameA] = gm.DirichletMultinomialMixture(variables["alpha"], variables[vname], variables[iname]) for i, val in enumerate(instances.at(instance)("argument")): oname = "instance%d_observation%d" % (instance, i) variables[oname] = gm.DiscreteScalarVariable(val) variables[oname].set_support(instances.get_size("argument")) variables[oname].set_name("argument") fnameB = "instance%d_fB_%d" % (instance, i) factors[fnameB] = gm.DirichletMultinomialMixture(variables["beta"], variables[iname], variables[oname]) elif args.get("model", "") == "beta": variables["alpha"] = gm.ContinuousVectorVariable(parameters["scf"], parameters["alpha"]) betas = [] 
for i in range(instances.get_size("argument")): variables["beta%d" % i] = gm.ContinuousVectorVariable(2, parameters["beta"]) betas.append(variables["beta%d" % i]) variables["beta"] = gm.TiledContinuousVectorVariable(gm.ContinuousVectorVariableVector(betas)) for instance in range(len(instances)): verb_id = instances.at(instance)("verb_lemma")[0] vname = "instance%d_verb" % (instance) variables[vname] = gm.DiscreteScalarVariable(verb_id) variables[vname].set_support(instances.get_size("verb_lemma")) variables[vname].set_name("verb") iname = "instance%d_scf" % (instance) variables[iname] = gm.DiscreteScalarVariable() variables[iname].set_support(parameters["scf"]) variables[iname].set_name("scf") fnameA = "instance%d_fA" % (instance) factors[fnameA] = gm.DirichletMultinomialMixture(variables["alpha"], variables[vname], variables[iname]) oname = "instance%d_observation" % (instance) variables[oname] = gm.DiscreteMapVariable(instances.at(instance)("argument")) variables[oname].set_support(instances.get_size("argument")) variables[oname].set_name("argument") fnameB = "instance%d_fB" % (instance) factors[fnameB] = gm.BetaBinomialMixture(variables["beta"], variables[iname], variables[oname]) for variable in variables.values(): graph.add_variable(variable) for factor in factors.values(): graph.add_factor(factor) logging.error("Compiling graph...") graph.compile() logging.error("Creating counter...") counts = graph.get_counts_object() print len(graph.get_variables()), len(graph.get_factors()) samples = numpy.empty(shape=(instances.get_size("verb_lemma"), parameters["scf"], args["samples"])) logging.error("Starting sampling...") return None #for v in variables.values(): # if not gm.BaseVariable_get_fixed(v): # print v # v.sample(counts) #return None for i in range(args["burnins"]): graph.sample(counts) #print nodes["alpha"].get_value() logging.error("Burnin #%s", i + 1) #print numpy.asarray(variables["alpha"].get_value()) print numpy.asarray(counts("scf")) #print 
numpy.asarray(counts("scf", "argument")) #ll = sampler.log_likelihood(graph) #logging.error("LL = %f", ll) #print sampler.get_counts("scf") #print nodes["alpha"] #print nodes["beta"] variables["alpha"].set_fixed(False) #variables["beta"].set_fixed(False) for i in range(args["samples"]): graph.sample(counts) logging.error("Sample #%s", i + 1) #logging.error("alpha=%s, beta=%s", variables["alpha"].get_value()[0], variables["beta"].get_value()[0]) #ll = sampler.log_likelihood(graph) #logging.error("LL = %f", ll) #print sampler.get_counts("scf") #print nodes["beta"] samples[:, :, i] = numpy.asarray(counts("verb", "scf")) row_names = [instances.get_name("verb_lemma", x) for x in range(samples.shape[0])] pickle.dump((row_names, samples), meta_open(target[0].rstr(), "w")) return None