Esempio n. 1
0
def cluster_by_grs(target, source, env):
    """Cluster verbs by their grammatical-relation (GR) profiles.

    Builds a verb-by-GR count matrix from the instances in source[0],
    row-normalizes it, chooses the number of clusters with the gap
    statistic (R ``clusGap``/``maxSE`` via rpy2), runs k-means, and
    writes one line of space-separated verbs per cluster to target[0].

    Args:
        target: SCons target nodes; target[0] receives the cluster file.
        source: SCons source nodes; source[0] is the instances file,
            source[-1] a Value node holding the argument dict.
        env: SCons environment (unused here).

    Returns:
        None (SCons builder convention for success).
    """
    import graphmod as gm
    args = source[-1].read()
    verb_map = {}
    gr_map = {}
    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)
    # First pass: assign a dense integer id to every verb and GR name.
    for ii in range(len(instances)):
        verb = instances.get_name("verb_lemma", instances.at(ii)["verb_lemma"][0])
        verb_map.setdefault(verb, len(verb_map))
        for x in instances.at(ii)["gr"]:
            gr_map.setdefault(instances.get_name("gr", x), len(gr_map))
    # Second pass: fill the verb-by-GR co-occurrence count matrix.
    data = numpy.zeros(shape=(len(verb_map), len(gr_map)))
    for ii in range(len(instances)):
        verb = instances.get_name("verb_lemma", instances.at(ii)["verb_lemma"][0])
        verb_id = verb_map[verb]
        for x in instances.at(ii)["gr"]:
            data[verb_id, gr_map[instances.get_name("gr", x)]] += 1
    # Row-normalize so each verb's GR profile sums to one.
    data = numpy.transpose(data.T / data.sum(1))
    # Pick k via the gap statistic, then run k-means with that k.
    tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
    gaps = tres[:, 2]
    err = tres[:, 3]
    best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
    res = stats.kmeans(numpy2ri(data), centers=best)
    # Invert verb_map so cluster row indices map back to verb strings.
    verbs = dict([(v, k) for k, v in verb_map.iteritems()])
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(res.rx2("cluster")):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(res.rx2("cluster")) if a == c]) + "\n")
    finally:
        # Previously the handle was never closed (leak on long builds).
        ofd.close()
    return None
Esempio n. 2
0
def cluster_by_valex(target, source, env):
    """Cluster verbs by VALEX subcategorization-frame (SCF) counts.

    Reads per-verb lexicon files from the VALEX lexicon directory,
    extracts (SCF class, FREQCNT) pairs for verbs that also appear in
    the instances file, builds a verb-by-SCF count matrix, row-normalizes
    it, picks k with the gap statistic (R ``clusGap``/``maxSE`` via rpy2),
    runs k-means, and writes one line of space-separated verbs per
    cluster to target[0].

    Args:
        target: SCons target nodes; target[0] receives the cluster file.
        source: SCons source nodes; source[0] is the instances file,
            source[-1] a Value node holding the argument dict
            (must contain "lexicon").
        env: SCons environment; reads env["VALEX_LEXICON"].

    Returns:
        None (SCons builder convention for success).
    """
    import graphmod as gm
    args = source[-1].read()
    # Restrict clustering to verbs present in the instances file.
    target_verbs = set()
    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)
    for vid in range(instances.get_size("verb_lemma")):
        target_verbs.add(instances.get_name("verb_lemma", vid))
    data = {}
    scfs = {}
    verbs = {}
    for fname in sorted(glob(os.path.join("%s/lex-%s" % (env["VALEX_LEXICON"], args["lexicon"]), "*"))):
        verb = os.path.basename(fname).split(".")[0]
        if verb not in target_verbs:
            continue
        data[verb] = {}
        # Each lexicon entry carries a CLASSES list (first id is the SCF)
        # followed by a FREQCNT count on the next line.
        for m in re.finditer(r":CLASSES \((.*?)\).*\n.*FREQCNT (\d+)", meta_open(fname).read()):
            scf = int(m.group(1).split()[0])
            count = int(m.group(2))
            scfs[scf] = scfs.get(scf, 0) + count
            verbs[verb] = verbs.get(verb, 0) + count
            data[verb][scf] = count
    # Densify: rows are verbs, columns are SCF classes (both sorted).
    ddata = numpy.zeros(shape=(len(verbs), len(scfs)))
    verbs = sorted(verbs)
    scfs = sorted(scfs)
    for row, verb in enumerate(verbs):
        for col, scf in enumerate(scfs):
            ddata[row, col] = data[verb].get(scf, 0)

    # Row-normalize so each verb's SCF profile sums to one.
    data = numpy.transpose(ddata.T / ddata.sum(1))
    # Pick k via the gap statistic, then run k-means with that k.
    tres = numpy.asarray(rcluster.clusGap(numpy2ri(data), FUN=stats.kmeans, K_max=30, B=500).rx2("Tab"))
    gaps = tres[:, 2]
    err = tres[:, 3]
    best = rcluster.maxSE(numpy2ri(gaps), numpy2ri(err), method="globalmax")
    res = stats.kmeans(numpy2ri(data), centers=best)
    ofd = meta_open(target[0].rstr(), "w")
    try:
        for c in set(res.rx2("cluster")):
            ofd.write(" ".join([verbs[i] for i, a in enumerate(res.rx2("cluster")) if a == c]) + "\n")
    finally:
        # Previously the handle was never closed (leak on long builds).
        ofd.close()
    return None
Esempio n. 3
0
def run_scf(target, source, env):
    import graphmod as gm
    args = source[-1].read()
    parameters = args["parameters"]
    nodes = {}
    factors = {}
    variables = {}


    instances = gm.Instances()
    gm.load_instances(source[0].rstr(), instances)
    if "features" in args:
        instances.transform("argument", args["features"])

    graph = gm.Graph()

    if args.get("model", "") == "multi":
        variables["alpha"] = gm.ContinuousVectorVariable(parameters["scf"], parameters["alpha"])
        variables["beta"] = gm.ContinuousVectorVariable(instances.get_size("argument"), parameters["beta"])

        vxf = []
        for v in range(instances.get_size("verb_lemma")):
            variables["VERB%dxFRAME" % v] = gm.ContinuousVectorVariable([1.0 / parameters["scf"] for x in range(parameters["scf"])])
            vxf.append(variables["VERB%dxFRAME" % v])
            factors["VERB%dxFRAME" % v] = gm.DirichletMultinomial(variables["alpha"], variables["VERB%dxFRAME" % v])
        variables["VERBxFRAME"] = gm.TiledContinuousVectorVariable(gm.ContinuousVectorVariableVector(vxf))

        fxa = []
        for f in range(parameters["scf"]):
            variables["FRAME%dxARG" % f] = gm.ContinuousVectorVariable([1.0 / instances.get_size("argument") for x in range(instances.get_size("argument"))])
            vxf.append(variables["FRAME%dxARG" % f])
            factors["FRAME%dxARG" % f] = gm.DirichletMultinomial(variables["beta"], variables["FRAME%dxARG" % f])
        variables["FRAMExARG"] = gm.TiledContinuousVectorVariable(gm.ContinuousVectorVariableVector(fxa))


        for instance in range(len(instances)):
            verb_id = instances.at(instance)("verb_lemma")[0]
            vname = "instance%d_verb" % (instance)
            variables[vname] = gm.DiscreteScalarVariable(verb_id)
            variables[vname].set_support(instances.get_size("verb_lemma"))
            variables[vname].set_name("verb")

            iname = "instance%d_scf" % (instance)
            variables[iname] = gm.DiscreteScalarVariable()
            variables[iname].set_support(parameters["scf"])
            variables[iname].set_name("scf")

            fnameA = "instance%d_fA" % (instance)
            factors[fnameA] = gm.MultinomialCategorical(variables["VERB0xFRAME"], variables[iname])
            #factors[fnameA] = gm.MultinomialCategorical(variables["VERB0xFRAME"], variables[vname], variables[iname])

            for i, val in enumerate(instances.at(instance)("argument")):
                oname = "instance%d_observation%d" % (instance, i)
                variables[oname] = gm.DiscreteScalarVariable(val)
                variables[oname].set_support(instances.get_size("argument"))
                variables[oname].set_name("argument")                
                fnameB = "instance%d_fB_%d" % (instance, i)
                factors[fnameB] = gm.MultinomialCategorical(variables["FRAME0xARG"], variables[oname])
                #factors[fnameB] = gm.DirichletMultinomialMixture(variables["beta"], variables[iname], variables[oname])

    if args.get("model", "") == "pymulti":
        variables["alpha"] = gm.ContinuousScalarVariable(parameters["alpha"])
        variables["gamma"] = gm.ContinuousScalarVariable(parameters["gamma"])
        variables["beta"] = gm.ContinuousVectorVariable(instances.get_size("argument"), parameters["beta"])
        for instance in range(len(instances)):
            verb_id = instances.at(instance)("verb_lemma")[0]
            vname = "instance%d_verb" % (instance)
            variables[vname] = gm.DiscreteScalarVariable(verb_id)
            variables[vname].set_support(instances.get_size("verb_lemma"))
            variables[vname].set_name("verb")

            iname = "instance%d_scf" % (instance)
            variables[iname] = gm.DiscreteScalarVariable()
            variables[iname].set_support(parameters["scf"])
            variables[iname].set_name("scf")

            fnameA = "instance%d_fA" % (instance)
            factors[fnameA] = gm.DirichletMultinomialMixture(variables["alpha"], variables[vname], variables[iname])

            for i, val in enumerate(instances.at(instance)("argument")):
                oname = "instance%d_observation%d" % (instance, i)
                variables[oname] = gm.DiscreteScalarVariable(val)
                variables[oname].set_support(instances.get_size("argument"))
                variables[oname].set_name("argument")                
                fnameB = "instance%d_fB_%d" % (instance, i)
                factors[fnameB] = gm.DirichletMultinomialMixture(variables["beta"], variables[iname], variables[oname])


    elif args.get("model", "") == "beta":
        variables["alpha"] = gm.ContinuousVectorVariable(parameters["scf"], parameters["alpha"])
        betas = []
        for i in range(instances.get_size("argument")):
            variables["beta%d" % i] = gm.ContinuousVectorVariable(2, parameters["beta"])
            betas.append(variables["beta%d" % i])

        variables["beta"] = gm.TiledContinuousVectorVariable(gm.ContinuousVectorVariableVector(betas))
        for instance in range(len(instances)):
            verb_id = instances.at(instance)("verb_lemma")[0]
            vname = "instance%d_verb" % (instance)
            variables[vname] = gm.DiscreteScalarVariable(verb_id)
            variables[vname].set_support(instances.get_size("verb_lemma"))
            variables[vname].set_name("verb")

            iname = "instance%d_scf" % (instance)
            variables[iname] = gm.DiscreteScalarVariable()
            variables[iname].set_support(parameters["scf"])
            variables[iname].set_name("scf")

            fnameA = "instance%d_fA" % (instance)
            factors[fnameA] = gm.DirichletMultinomialMixture(variables["alpha"], variables[vname], variables[iname])

            oname = "instance%d_observation" % (instance)            
            variables[oname] = gm.DiscreteMapVariable(instances.at(instance)("argument"))
            variables[oname].set_support(instances.get_size("argument"))
            variables[oname].set_name("argument")                
            fnameB = "instance%d_fB" % (instance)
            factors[fnameB] = gm.BetaBinomialMixture(variables["beta"], variables[iname], variables[oname])


    for variable in variables.values():
        graph.add_variable(variable)

    for factor in factors.values():
        graph.add_factor(factor)

    logging.error("Compiling graph...")
    graph.compile()
    logging.error("Creating counter...")
    counts = graph.get_counts_object()
    print len(graph.get_variables()), len(graph.get_factors())
    samples = numpy.empty(shape=(instances.get_size("verb_lemma"), parameters["scf"], args["samples"]))
    logging.error("Starting sampling...")
    return None
    #for v in variables.values():
    #    if not gm.BaseVariable_get_fixed(v):
    #        print v
    #        v.sample(counts)

    #return None


    for i in range(args["burnins"]):
        graph.sample(counts)
        #print nodes["alpha"].get_value()
        logging.error("Burnin #%s", i + 1)
        #print numpy.asarray(variables["alpha"].get_value())
        print numpy.asarray(counts("scf"))
        #print numpy.asarray(counts("scf", "argument"))
        #ll = sampler.log_likelihood(graph)
        #logging.error("LL = %f", ll)
        #print sampler.get_counts("scf")
        #print nodes["alpha"]
        #print nodes["beta"]

    variables["alpha"].set_fixed(False)

    #variables["beta"].set_fixed(False)

    for i in range(args["samples"]):
        graph.sample(counts)
        logging.error("Sample #%s", i + 1)
        #logging.error("alpha=%s, beta=%s", variables["alpha"].get_value()[0], variables["beta"].get_value()[0])
        #ll = sampler.log_likelihood(graph)
        #logging.error("LL = %f", ll)
        #print sampler.get_counts("scf")

        #print nodes["beta"]
        samples[:, :, i] = numpy.asarray(counts("verb", "scf"))

    row_names = [instances.get_name("verb_lemma", x) for x in range(samples.shape[0])]
    pickle.dump((row_names, samples), meta_open(target[0].rstr(), "w"))
    return None