Example #1
def get():
    # `promises` and `requests` are presumably closed over from the
    # enclosing Helmholtz-enumeration helper.
    results = [p.get() for p in promises]
    frontiers = []
    with timing("(Helmholtz enumeration) Decoded json into frontiers"):
        for request, result in zip(requests, results):
            response = json.loads(result.decode("utf-8"))
            for b, entry in enumerate(response):
                frontiers.append(
                    Frontier([FrontierEntry(program=Program.parse(p),
                                            logPrior=entry["ll"],
                                            logLikelihood=0.)
                              for p in entry["programs"]],
                             task=Task(str(b), request, [])))
    eprint("Total number of Helmholtz frontiers:", len(frontiers))
    return frontiers
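
For reference, a minimal self-contained sketch (with hypothetical program strings) of the JSON shape that get() decodes: the response is a list of entries, each carrying program strings and an "ll" score that becomes the logPrior of every FrontierEntry.

import json

# Hypothetical data sketching the response shape get() expects.
raw = b'[{"programs": ["(lambda (+ $0 1))"], "ll": -2.3}]'
for b, entry in enumerate(json.loads(raw.decode("utf-8"))):
    print("task", b, "->", len(entry["programs"]), "programs, ll =", entry["ll"])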
Example #2
def memorizeInduce(g, frontiers, **kwargs):
    # Memorization baseline: turn each task's best solution into an
    # invented primitive, if it is not one already.
    existingInventions = {p.uncurry()
                          for p in g.primitives}
    programs = {f.bestPosterior.program for f in frontiers if not f.empty}
    newInventions = programs - existingInventions
    newGrammar = Grammar.uniform([p for p in g.primitives] +
                                 [Invented(ni) for ni in newInventions])

    # rewrite the frontiers in terms of the new primitives
    def substitute(p):
        if p in newInventions:
            return Invented(p).uncurry()
        return p

    newFrontiers = [Frontier([FrontierEntry(program=np,
                                            logPrior=newGrammar.logLikelihood(f.task.request, np),
                                            logLikelihood=e.logLikelihood)
                              for e in f
                              for np in [substitute(e.program)]],
                             task=f.task)
                    for f in frontiers]
    return newGrammar, newFrontiers
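
The key step is the set difference between best-posterior programs and the grammar's existing primitives; a toy sketch with plain strings standing in for Program objects:

# Toy sketch: strings stand in for Program objects.
existingInventions = {"(+ 1)", "(* 2)"}
bestPosteriors = {"(* 2)", "(fold cons nil)"}
newInventions = bestPosteriors - existingInventions
print(newInventions)  # {'(fold cons nil)'}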
Example #3
def ocamlInduce(g,
                frontiers,
                _=None,
                topK=1,
                pseudoCounts=1.0,
                aic=1.0,
                structurePenalty=0.001,
                a=0,
                CPUs=1,
                bs=1000000,
                topI=300):
    # This is a dirty hack!
    # Memory consumption increases with the number of CPUs,
    # and early on we have a lot of stuff to compress,
    # so if this is the first iteration (no inventions yet),
    # only use a fraction of the available CPUs.
    topK = 5    # NB: hard-coded overrides of the keyword arguments above
    topI = 600
    if all(not p.isInvented for p in g.primitives):
        if a > 3:
            CPUs = max(1, int(CPUs / 6))
        else:
            CPUs = max(1, int(CPUs / 3))
    else:
        CPUs = max(1, int(CPUs / 2))
    CPUs = 2  # ...and then the computed value is overridden anyway

    # XXX FIXME XXX
    # For unknown reasons, doing the compression all in one go works
    # correctly, while driving the outer loop from Python causes problems.
    iterations = 99  # maximum number of components to add at once

    while True:
        g0 = g

        originalFrontiers = frontiers
        t2f = {f.task: f for f in frontiers}
        frontiers = [f for f in frontiers if not f.empty]
        message = {
            "arity": a,
            "topK": topK,
            "pseudoCounts": float(pseudoCounts),
            "aic": aic,
            "bs": bs,
            "topI": topI,
            "structurePenalty": float(structurePenalty),
            "CPUs": CPUs,
            "DSL": g.json(),
            "iterations": iterations,
            "frontiers": [f.json() for f in frontiers]
        }

        message = json.dumps(message)
        if True:  # always dump the compression message, for debugging
            timestamp = datetime.datetime.now().isoformat()
            os.makedirs("compressionMessages", exist_ok=True)
            fn = "compressionMessages/%s" % timestamp
            with open(fn, "w") as f:
                f.write(message)
            eprint("Compression message saved to:", fn)

        try:
            # Get relative path
            compressor_file = os.path.join(get_root_dir(), 'compression')
            process = subprocess.Popen(compressor_file,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE)
            response, error = process.communicate(
                bytes(message, encoding="utf-8"))
            response = json.loads(response.decode("utf-8"))
        except OSError:
            raise

        # Rebuild the grammar from the compressor's JSON response
        g = response["DSL"]
        g = Grammar(g["logVariable"],
                    [(l, p.infer(), p)
                     for production in g["productions"]
                     for l in [production["logProbability"]]
                     for p in [Program.parse(production["expression"])]],
                    continuationType=g0.continuationType)

        frontiers = {
            original.task: Frontier(
                [FrontierEntry(p,
                               logLikelihood=e["logLikelihood"],
                               logPrior=g.logLikelihood(original.task.request, p))
                 for e in new["programs"]
                 for p in [Program.parse(e["program"])]],
                task=original.task)
            for original, new in zip(frontiers, response["frontiers"])
        }
        frontiers = [frontiers.get(f.task, t2f[f.task])
                     for f in originalFrontiers]
        # With iterations set above 1, the compressor adds all components in
        # one go, so this condition is false and the loop exits after one pass.
        if iterations == 1 and len(g) > len(g0):
            eprint("Grammar changed - running another round of consolidation.")
            continue
        else:
            eprint("Finished consolidation.")
            return g, frontiers
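
Stripped of the DSL machinery, the round trip is: serialize a message, pipe it to an external compressor binary, and parse the JSON reply. A minimal sketch, using `cat` as a hypothetical stand-in for the OCaml compression binary:

import json
import subprocess

message = json.dumps({"arity": 0, "topK": 5, "frontiers": []})
# `cat` simply echoes the message back; the real code launches the
# `compression` binary found under get_root_dir().
process = subprocess.Popen(["cat"],
                           stdin=subprocess.PIPE,
                           stdout=subprocess.PIPE)
response, _ = process.communicate(bytes(message, encoding="utf-8"))
print(json.loads(response.decode("utf-8"))["topK"])  # 5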
Example #4
def rustInduce(g0,
               frontiers,
               _=None,
               topK=1,
               pseudoCounts=1.0,
               aic=1.0,
               structurePenalty=0.001,
               a=0,
               CPUs=1,
               iteration=-1,
               topk_use_only_likelihood=False,
               vs=False):
    def finite_logp(l):
        return l if l != float("-inf") else -1000

    message = {
        "strategy": {
            "version-spaces": {
                "top_i": 50
            }
        } if vs else {
            "fragment-grammars": {}
        },
        "params": {
            "structure_penalty": structurePenalty,
            "pseudocounts": int(pseudoCounts + 0.5),
            "topk": topK,
            "topk_use_only_likelihood": topk_use_only_likelihood,
            "aic": aic if aic != float("inf") else None,
            "arity": a,
        },
        "primitives": [{
            "name": p.name,
            "tp": str(t),
            "logp": finite_logp(l)
        } for l, t, p in g0.productions if p.isPrimitive],
        "inventions": [
            {
                "expression": str(p.body),
                "logp": finite_logp(l)
            }  # -inf=-100
            for l, t, p in g0.productions if p.isInvented
        ],
        "variable_logprob":
        finite_logp(g0.logVariable),
        "frontiers": [{
            "task_tp":
            str(f.task.request),
            "solutions": [{
                "expression": str(e.program),
                "logprior": finite_logp(e.logPrior),
                "loglikelihood": e.logLikelihood,
            } for e in f],
        } for f in frontiers],
    }

    eprint("running rust compressor")

    messageJson = json.dumps(message)

    with open("jsonDebug", "w") as f:
        f.write(messageJson)

    # Check which version of Python we are using:
    # on >= 3.6, Popen can do text I/O directly.
    if sys.version_info >= (3, 6):
        p = subprocess.Popen(['./rust_compressor/rust_compressor'],
                             encoding='utf-8',
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
    elif sys.version_info[:2] == (3, 5):
        p = subprocess.Popen(['./rust_compressor/rust_compressor'],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
        # on 3.5 we must hand Popen bytes ourselves
        messageJson = bytearray(messageJson, encoding='utf-8')
    else:
        eprint("must be python 3.5 or 3.6")
        assert False

    p.stdin.write(messageJson)
    p.stdin.flush()
    p.stdin.close()

    if p.poll() is not None:  # the compressor exited before we read its output
        raise ValueError("rust compressor failed")

    if sys.version_info >= (3, 6):
        resp = json.load(p.stdout)
    else:
        import codecs
        resp = json.load(codecs.getreader('utf-8')(p.stdout))

    productions = [(x["logp"], p) for p, x in
                   zip((p for (_, _, p) in g0.productions if p.isPrimitive), resp["primitives"])] + \
                  [(i["logp"], Invented(Program.parse(i["expression"])))
                   for i in resp["inventions"]]
    productions = [(l if l is not None else float("-inf"), p)
                   for l, p in productions]
    g = Grammar.fromProductions(productions,
                                resp["variable_logprob"],
                                continuationType=g0.continuationType)
    newFrontiers = [
        Frontier([
            FrontierEntry(Program.parse(s["expression"]),
                          logPrior=s["logprior"],
                          logLikelihood=s["loglikelihood"])
            for s in r["solutions"]
        ], f.task) for f, r in zip(frontiers, resp["frontiers"])
    ]
    return g, newFrontiers
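
A self-contained sketch of the log-probability clamping rustInduce applies on the way out and undoes on the way back: JSON cannot represent -inf, so it is clamped to -1000 outbound, and a null logp is restored to -inf inbound.

import json

def finite_logp(l):
    # JSON has no -inf, so clamp it to a large negative finite number
    return l if l != float("-inf") else -1000

outbound = json.dumps({"logp": finite_logp(float("-inf"))})
inbound = json.loads('{"logp": null}')
restored = inbound["logp"] if inbound["logp"] is not None else float("-inf")
print(outbound, restored)  # {"logp": -1000} -inf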
Example #5
def test_task(m, task, timeout):
    start = time.time()
    failed_cands = set()

    print(task.examples)

    frontier = []

    sampleFrequency = {}

    while time.time() - start < timeout:
        query = makeExamples(task)
        candidates = m.sample([query] * BATCHSIZE)  # I think this works
        for cand in candidates:
            try:
                p = Program.parse(" ".join(cand))
            except (ParseFailure, IndexError, AssertionError):
                continue
            if p not in failed_cands:
                if "STRING" in str(p):
                    assert arguments.domain == 'text'
                    if len(task.stringConstants) == 0:
                        ll = float('-inf')
                    else:
                        ci = Text.ConstantInstantiateVisitor(
                            [[cc for cc in sc] for sc in task.stringConstants],
                            sample=False)
                        # domain is 'text' here, so the 0.1s timeout always applies
                        ll = min(task.logLikelihood(pp, timeout=0.1)
                                 for pp in p.visit(ci))
                if arguments.domain == 'regex':
                    # regex is handled specially
                    # we just collect all of the candidates and then marginalize over them
                    # but we have to make sure that each candidate is well typed and well formed
                    ll = float('-inf')
                    if not p.canHaveType(task.request): p = None
                    else:
                        from examineFrontier import ConstantVisitor
                        p = p.visit(ConstantVisitor(task.str_const))
                        try:
                            regex = p.evaluate([])(pre.String(""))
                            if arguments.sampleLikelihood:
                                sampleFrequency[regex] = \
                                    1 + sampleFrequency.get(regex, 0)
                                p = None
                            else:
                                dataLikelihood = sum(
                                    regex.match("".join(y))
                                    for _, y in task.examples)
                                logPrior = g.logLikelihood(task.request, p)
                                frontier.append(
                                    FrontierEntry(p,
                                                  logPrior=logPrior,
                                                  logLikelihood=dataLikelihood))
                            #print("sampled program",p,
                            #      "which translates into regex",regex,
                            #      "and which assigns the following likelihood to the test data",
                            #      dataLikelihood,
                            #      "and which has prior probability",logPrior)
                        except Exception:
                            p = None

                elif arguments.domain != 'logo':
                    ll = task.logLikelihood(
                        p,
                        timeout=0.1 if arguments.domain != 'rational' else None)
                else:
                    try:
                        yh = drawLogo(p, timeout=1., resolution=28)
                        if isinstance(yh, list) and list(map(
                                int, yh)) == task.examples[0][1]:
                            ll = 0.
                        else:
                            ll = float('-inf')
                        #print("no warning, we are cool.jpeg")
                    except JSONDecodeError:
                        eprint(
                            "WARNING: Could not decode json. If this occurs occasionally it might be because the neural network is producing invalid code. Otherwise, if this occurs frequently, then this is a bug."
                        )
                        ll = float('-inf')

                #print(ll)
                if ll > float('-inf'):
                    #print(p)
                    #print(task.name)
                    return True

                elif p is not None:
                    failed_cands.add(p)

    if arguments.domain != 'regex':
        return False

    from examineFrontier import testingRegexLikelihood
    if arguments.sampleLikelihood:
        return lse([
            math.log(frequency) + testingRegexLikelihood(task, regex)
            for regex, frequency in sampleFrequency.items()
        ])
    # marginalize the test-data likelihood over the collected regex frontier
    frontier = Frontier(frontier, task)
    from graphs import addStupidRegex
    frontier = addStupidRegex(frontier, g)
    print("for this task I think that the following is the map estimate:\n",
          frontier.topK(1))
    if arguments.taskLikelihood:
        return lse([e.logPrior + e.logLikelihood for e in frontier])
    return lse([
        e.logPosterior + testingRegexLikelihood(task, e.program)
        for e in frontier
    ])
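
The regex returns above marginalize per-candidate log scores with lse; a minimal sketch of a numerically stable log-sum-exp, assuming the lse used above behaves like this helper:

import math

def lse(logs):
    # numerically stable log-sum-exp
    m = max(logs)
    return m + math.log(sum(math.exp(l - m) for l in logs))

# marginalizing three candidates' scores:
print(lse([-3.2, -1.7, -4.9]))  # ~ -1.47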