def get():
    # Closure: `promises` (pending worker results) and `requests` (the task
    # types that were enumerated) come from the enclosing scope.
    results = [p.get() for p in promises]
    frontiers = []
    with timing("(Helmholtz enumeration) Decoded json into frontiers"):
        for request, result in zip(requests, results):
            response = json.loads(result.decode("utf-8"))
            for b, entry in enumerate(response):
                frontiers.append(
                    Frontier([FrontierEntry(program=Program.parse(p),
                                            logPrior=entry["ll"],
                                            logLikelihood=0.)
                              for p in entry["programs"]],
                             task=Task(str(b), request, [])))
    eprint("Total number of Helmholtz frontiers:", len(frontiers))
    return frontiers
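# For reference, a sketch of the JSON each enumeration worker is expected to
# return, matching the fields parsed above ("programs" and "ll"); the program
# strings and values here are made up for illustration:
#
#   [{"programs": ["(lambda (+ $0 1))"], "ll": -3.2},
#    {"programs": ["(lambda $0)"],       "ll": -1.1}]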
def memorizeInduce(g, frontiers, **kwargs):
    existingInventions = {p.uncurry()
                          for p in g.primitives}
    programs = {f.bestPosterior.program for f in frontiers if not f.empty}
    newInventions = programs - existingInventions
    newGrammar = Grammar.uniform([p for p in g.primitives] +
                                 [Invented(ni) for ni in newInventions])

    # rewrite in terms of new primitives
    def substitute(p):
        if p in newInventions:
            return Invented(p).uncurry()
        return p
    newFrontiers = [Frontier([FrontierEntry(program=np,
                                            logPrior=newGrammar.logLikelihood(f.task.request, np),
                                            logLikelihood=e.logLikelihood)
                              for e in f
                              for np in [substitute(e.program)]],
                             task=f.task)
                    for f in frontiers]
    return newGrammar, newFrontiers
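# Usage sketch for memorizeInduce (illustrative only; assumes `g` is a Grammar
# and `frontiers` is a list of Frontier objects, at least one non-empty):
#
#   newGrammar, newFrontiers = memorizeInduce(g, frontiers)
#   eprint("memorization baseline added",
#          len(newGrammar.primitives) - len(g.primitives), "inventions")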
def ocamlInduce(g, frontiers, _=None,
                topK=1, pseudoCounts=1.0, aic=1.0,
                structurePenalty=0.001, a=0, CPUs=1,
                bs=1000000, topI=300):
    # This is a dirty hack!
    # Memory consumption increases with the number of CPUs,
    # and early on we have a lot of stuff to compress,
    # so on the first iteration (no inventions in the grammar yet)
    # only use a fraction of the available CPUs.
    topK = 5
    topI = 600
    if all(not p.isInvented for p in g.primitives):
        if a > 3:
            CPUs = max(1, int(CPUs / 6))
        else:
            CPUs = max(1, int(CPUs / 3))
    else:
        CPUs = max(1, int(CPUs / 2))
    CPUs = 2  # XXX FIXME XXX: unconditionally overrides the CPU logic above

    # FIXME: for unknown reasons, doing compression all in one go works
    # correctly, but doing it incrementally from Python in the outer loop
    # causes problems.
    iterations = 99  # maximum number of components to add at once

    while True:
        g0 = g

        originalFrontiers = frontiers
        t2f = {f.task: f for f in frontiers}
        frontiers = [f for f in frontiers if not f.empty]
        message = {"arity": a,
                   "topK": topK,
                   "pseudoCounts": float(pseudoCounts),
                   "aic": aic,
                   "bs": bs,
                   "topI": topI,
                   "structurePenalty": float(structurePenalty),
                   "CPUs": CPUs,
                   "DSL": g.json(),
                   "iterations": iterations,
                   "frontiers": [f.json() for f in frontiers]}

        message = json.dumps(message)
        if True:  # debugging: save every compression message to disk
            timestamp = datetime.datetime.now().isoformat()
            os.system("mkdir -p compressionMessages")
            fn = "compressionMessages/%s" % timestamp
            with open(fn, "w") as f:
                f.write(message)
            eprint("Compression message saved to:", fn)

        try:
            # The compression binary lives at the repository root
            compressor_file = os.path.join(get_root_dir(), 'compression')
            process = subprocess.Popen(compressor_file,
                                       stdin=subprocess.PIPE,
                                       stdout=subprocess.PIPE)
            response, error = process.communicate(bytes(message, encoding="utf-8"))
            response = json.loads(response.decode("utf-8"))
        except OSError:
            raise  # most likely the compression binary has not been built

        g = response["DSL"]
        g = Grammar(g["logVariable"],
                    [(l, p.infer(), p)
                     for production in g["productions"]
                     for l in [production["logProbability"]]
                     for p in [Program.parse(production["expression"])]],
                    continuationType=g0.continuationType)

        frontiers = {original.task:
                     Frontier([FrontierEntry(p,
                                             logLikelihood=e["logLikelihood"],
                                             logPrior=g.logLikelihood(original.task.request, p))
                               for e in new["programs"]
                               for p in [Program.parse(e["program"])]],
                              task=original.task)
                     for original, new in zip(frontiers, response["frontiers"])}
        frontiers = [frontiers.get(f.task, t2f[f.task])
                     for f in originalFrontiers]

        if iterations == 1 and len(g) > len(g0):
            eprint("Grammar changed - running another round of consolidation.")
            continue
        else:
            eprint("Finished consolidation.")
            return g, frontiers
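# Usage sketch for ocamlInduce (illustrative only; requires the OCaml
# `compression` binary to have been built at the repository root, and the
# keyword values below are examples, not recommended settings; note that
# ocamlInduce currently overrides topK and topI internally):
#
#   g, frontiers = ocamlInduce(g, frontiers,
#                              topK=2, pseudoCounts=30.0, aic=1.0,
#                              structurePenalty=1.5, a=3, CPUs=4)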
def rustInduce(g0, frontiers, _=None,
               topK=1, pseudoCounts=1.0, aic=1.0,
               structurePenalty=0.001, a=0, CPUs=1, iteration=-1,
               topk_use_only_likelihood=False,
               vs=False):
    def finite_logp(l):
        # The rust compressor cannot serialize -inf; clamp it to -1000
        return l if l != float("-inf") else -1000

    message = {
        "strategy": {"version-spaces": {"top_i": 50}}
                    if vs else
                    {"fragment-grammars": {}},
        "params": {
            "structure_penalty": structurePenalty,
            "pseudocounts": int(pseudoCounts + 0.5),  # rounded: the compressor expects an integer
            "topk": topK,
            "topk_use_only_likelihood": topk_use_only_likelihood,
            "aic": aic if aic != float("inf") else None,
            "arity": a,
        },
        "primitives": [{"name": p.name,
                        "tp": str(t),
                        "logp": finite_logp(l)}
                       for l, t, p in g0.productions if p.isPrimitive],
        "inventions": [{"expression": str(p.body),
                        "logp": finite_logp(l)}
                       for l, t, p in g0.productions if p.isInvented],
        "variable_logprob": finite_logp(g0.logVariable),
        "frontiers": [{
            "task_tp": str(f.task.request),
            "solutions": [{
                "expression": str(e.program),
                "logprior": finite_logp(e.logPrior),
                "loglikelihood": e.logLikelihood,
            } for e in f],
        } for f in frontiers],
    }

    eprint("running rust compressor")

    messageJson = json.dumps(message)

    with open("jsonDebug", "w") as f:
        f.write(messageJson)

    # Python >= 3.6 lets Popen handle the encoding; on 3.5 we must pass bytes
    if sys.version_info >= (3, 6):
        p = subprocess.Popen(['./rust_compressor/rust_compressor'],
                             encoding='utf-8',
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    elif sys.version_info[:2] == (3, 5):
        p = subprocess.Popen(['./rust_compressor/rust_compressor'],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        # convert messageJson string to bytes
        messageJson = bytearray(messageJson, encoding='utf-8')
    else:
        eprint("must be python >= 3.5")
        assert False

    p.stdin.write(messageJson)
    p.stdin.flush()
    p.stdin.close()

    if p.returncode is not None:
        # a returncode this early means the compressor already exited
        raise ValueError("rust compressor failed")

    if sys.version_info >= (3, 6):
        resp = json.load(p.stdout)
    else:
        import codecs
        resp = json.load(codecs.getreader('utf-8')(p.stdout))

    productions = [(x["logp"], p)
                   for p, x in zip((p for (_, _, p) in g0.productions if p.isPrimitive),
                                   resp["primitives"])] + \
                  [(i["logp"], Invented(Program.parse(i["expression"])))
                   for i in resp["inventions"]]
    productions = [(l if l is not None else float("-inf"), p)
                   for l, p in productions]
    g = Grammar.fromProductions(productions, resp["variable_logprob"],
                                continuationType=g0.continuationType)
    newFrontiers = [Frontier([FrontierEntry(Program.parse(s["expression"]),
                                            logPrior=s["logprior"],
                                            logLikelihood=s["loglikelihood"])
                              for s in r["solutions"]],
                             f.task)
                    for f, r in zip(frontiers, resp["frontiers"])]
    return g, newFrontiers
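# Usage sketch for rustInduce (illustrative only; requires the binary at
# ./rust_compressor/rust_compressor relative to the working directory;
# vs=True selects the version-space strategy instead of fragment grammars):
#
#   g, frontiers = rustInduce(g0, frontiers, topK=2, pseudoCounts=30.0,
#                             aic=1.0, structurePenalty=1.5, a=3, vs=False)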
def test_task(m, task, timeout):
    start = time.time()
    failed_cands = set()
    print(task.examples)
    frontier = []
    sampleFrequency = {}
    while time.time() - start < timeout:
        query = makeExamples(task)
        candidates = m.sample([query] * BATCHSIZE)
        for cand in candidates:
            try:
                p = Program.parse(" ".join(cand))
            except (ParseFailure, IndexError, AssertionError):
                continue
            if p not in failed_cands:
                if "STRING" in str(p):
                    # Text domain: score the best over all ways of
                    # instantiating the string constants
                    assert arguments.domain == 'text'
                    if len(task.stringConstants) == 0:
                        ll = float('-inf')
                    else:
                        ci = Text.ConstantInstantiateVisitor(
                            [[cc for cc in sc] for sc in task.stringConstants],
                            sample=False)
                        ll = min(task.logLikelihood(pp,
                                                    timeout=0.1 if arguments.domain != 'rational' else None)
                                 for pp in p.visit(ci))
                elif arguments.domain == 'regex':
                    # Regex is handled specially: collect all of the candidates
                    # and marginalize over them afterwards, after checking that
                    # each candidate is well typed and well formed.
                    ll = float('-inf')
                    if not p.canHaveType(task.request):
                        p = None
                    else:
                        from examineFrontier import ConstantVisitor
                        p = p.visit(ConstantVisitor(task.str_const))
                        try:
                            regex = p.evaluate([])(pre.String(""))
                            if arguments.sampleLikelihood:
                                sampleFrequency[regex] = 1 + sampleFrequency.get(regex, 0)
                                p = None
                            else:
                                dataLikelihood = sum(regex.match("".join(y))
                                                     for _, y in task.examples)
                                logPrior = g.logLikelihood(task.request, p)
                                frontier.append(FrontierEntry(p,
                                                              logPrior=logPrior,
                                                              logLikelihood=dataLikelihood))
                        except Exception:
                            p = None
                elif arguments.domain != 'logo':
                    ll = task.logLikelihood(p,
                                            timeout=0.1 if arguments.domain != 'rational' else None)
                else:
                    try:
                        yh = drawLogo(p, timeout=1., resolution=28)
                        if isinstance(yh, list) and list(map(int, yh)) == task.examples[0][1]:
                            ll = 0.
                        else:
                            ll = float('-inf')
                    except JSONDecodeError:
                        eprint("WARNING: Could not decode json. Occasionally this is "
                               "expected, when the neural network produces invalid code; "
                               "if it happens frequently, this is a bug.")
                        ll = float('-inf')

                if ll > float('-inf'):
                    return True
                elif p is not None:
                    failed_cands.add(p)

    if arguments.domain != 'regex':
        return False

    from examineFrontier import testingRegexLikelihood
    if arguments.sampleLikelihood:
        return lse([math.log(frequency) + testingRegexLikelihood(task, regex)
                    for regex, frequency in sampleFrequency.items()])

    # Otherwise marginalize over the collected regex frontier
    frontier = Frontier(frontier, task)
    from graphs import addStupidRegex
    frontier = addStupidRegex(frontier, g)
    print("for this task I think that the following is the MAP estimate:\n",
          frontier.topK(1))
    if arguments.taskLikelihood:
        return lse([e.logPrior + e.logLikelihood for e in frontier])
    return lse([e.logPosterior + testingRegexLikelihood(task, e.program)
                for e in frontier])
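# Usage sketch for test_task (illustrative only; test_task also depends on
# module-level globals, namely `arguments`, `g`, `BATCHSIZE`, and
# `makeExamples`, and it returns True/False for most domains but a
# log-likelihood for the regex domain):
#
#   hits = sum(test_task(m, t, timeout=600.) is True for t in testingTasks)
#   eprint("solved", hits, "/", len(testingTasks), "testing tasks")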