def test_eval(self):
    entries = [Environment(
        {"ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParser())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(MockParser())(
        "y = x + 1"
    )
    transform = AddPreviousActions(aencoder)
    prev_action_tensor = transform(
        reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
        action_sequence=action_sequence,
        train=False
    )
    assert np.array_equal(
        [
            [2, -1, -1], [3, -1, -1], [4, -1, -1], [-1, 1, -1],
            [1, -1, -1], [5, -1, -1], [-1, 2, -1], [1, -1, -1],
            [4, -1, -1], [-1, 3, -1], [1, -1, -1], [6, -1, -1],
            [-1, 4, -1], [1, -1, -1]
        ],
        prev_action_tensor.numpy()
    )
def test_simple_case(self):
    entries = [Environment(
        {"ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParser())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(MockParser())(
        ground_truth="y = x + 1"
    )
    transform = EncodeActionSequence(aencoder)
    ground_truth = transform(
        action_sequence=action_sequence,
        reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
    )
    assert np.array_equal(
        [
            [3, -1, -1], [4, -1, -1], [-1, 1, -1], [1, -1, -1],
            [5, -1, -1], [-1, 2, -1], [1, -1, -1], [4, -1, -1],
            [-1, 3, -1], [1, -1, -1], [6, -1, -1], [-1, 4, -1],
            [1, -1, -1]
        ],
        ground_truth.numpy()
    )
def test_eval(self):
    entries = [Environment(
        {"text_query": "ab test", "ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParserWithoutVariadicArgs())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(
        MockParserWithoutVariadicArgs()
    )("y = x + 1")
    transform = AddActionSequenceAsTree(aencoder)
    matrix, depth = transform(
        reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
        action_sequence=action_sequence,
        train=False
    )
    assert np.array_equal(
        [0, 1, 2, 3, 2, 3, 3, 4, 3, 4],
        depth.numpy()
    )
    assert np.array_equal(
        [[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
         [0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 1, 1, 0, 1, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
        matrix.numpy()
    )
def test_eval(self):
    entries = [Environment(
        {"text_query": "ab test", "ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParserWithoutVariadicArgs())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(
        MockParserWithoutVariadicArgs()
    )("y = x + 1")
    transform = AddQueryForTreeGenDecoder(aencoder, 3)
    query = transform(
        reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
        action_sequence=action_sequence,
        train=False
    )
    assert np.array_equal(
        [
            [-1, -1, -1], [2, -1, -1], [3, 2, -1], [4, 3, 2],
            [3, 2, -1], [5, 3, 2], [5, 3, 2], [4, 5, 3],
            [5, 3, 2], [6, 5, 3]
        ],
        query.numpy()
    )
def test_n_dependent(self):
    entries = [Environment(
        {"text_query": "ab test", "ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParserWithoutVariadicArgs())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(
        MockParserWithoutVariadicArgs()
    )("y = x + 1")
    transform = AddPreviousActionRules(aencoder, 2, n_dependent=3)
    prev_rule_action = transform(
        reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
        action_sequence=action_sequence,
        train=False,
    )
    assert np.array_equal(
        [
            # str -> "y"
            [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]],
            # Number -> number
            [[8, -1, -1], [9, -1, -1], [-1, -1, -1]],
            [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]],
        ],
        prev_rule_action.numpy()
    )
def download(cache_path: str = os.path.join(DEFAULT_CACHE_DIR, "django.pt"),
             base_path: str = BASE_PATH,
             get: Callable[[str], str] = default_get,
             num_train: int = 16000,
             num_test: int = 1000) -> Dict[str, ListDataset]:
    @file_cache(cache_path)
    def _download():
        # Honor the base_path parameter (the flattened original used the
        # BASE_PATH constant here, leaving the parameter dead).
        return {
            "annotation": format_annotations(
                get(base_path + "all.anno").split("\n")),
            "code": get(base_path + "all.code").split("\n")
        }

    data = _download()
    annotation = data["annotation"]
    code = data["code"]

    def to_sample(elem: Tuple[str, str]) -> Environment:
        anno, code = elem
        return Environment(
            {
                "text_query": anno,
                "ground_truth": code
            },
            set(["ground_truth"])
        )

    samples = list(map(to_sample, zip(annotation, code)))
    data = {
        "train": samples[:num_train],
        "test": samples[num_train:num_train + num_test],
        "valid": samples[num_train + num_test:]
    }
    return {key: ListDataset(value) for key, value in data.items()}
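# Usage sketch for the Django loader above (illustrative, not part of the
# repository). Assumes network access for the default `get` and that
# Environment supports dict-style item access, as elsewhere in this codebase.
splits = download()
print(len(splits["train"]), len(splits["test"]), len(splits["valid"]))
sample = splits["train"][0]  # one annotation/code pair
print(sample["text_query"], "->", sample["ground_truth"])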
def test_eval(self):
    entries = [Environment(
        {"text_query": "foo bar", "ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParser())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(MockParser())(
        "y = x + 1"
    )
    transform = AddActions(aencoder)
    action_tensor = transform(
        reference=[Token(None, "foo", "foo"), Token(None, "bar", "bar")],
        action_sequence=action_sequence,
        train=False
    )
    assert np.array_equal(
        [
            [2, 2, 0], [4, 3, 1], [6, 4, 2], [6, 4, 2],
            [5, 3, 1], [6, 5, 5], [6, 5, 5], [5, 5, 5],
            [6, 4, 8], [6, 4, 8], [5, 5, 5], [9, 6, 11],
            [9, 6, 11], [-1, -1, -1]
        ],
        action_tensor.numpy()
    )
def dataset():
    return ListDataset([
        Environment({
            "query": "query",
            "ground_truth": "name0"
        }, set(["ground_truth"]))
    ])
def test_multiprocess(self):
    accuracy = use_environment(
        Accuracy(),
        in_keys=["actual", ["ground_truth", "expected"]],
        value_key="actual"
    )
    dataset = ListDataset([
        Environment(
            {"query": "query0", "ground_truth": "c0"},
            set(["ground_truth"])
        ),
        Environment(
            {"query": "query1", "ground_truth": "c0"},
            set(["ground_truth"])
        ),
        Environment(
            {"query": "query2", "ground_truth": "c0"},
            set(["ground_truth"])
        ),
    ])
    with tempfile.TemporaryDirectory() as init_dir:
        with context.Pool(2) as pool:
            procs = []
            for i in range(2):
                p = pool.apply_async(
                    self._run,
                    args=(init_dir, dataset, {"accuracy": accuracy}, i),
                )
                procs.append(p)
            out = [p.get() for p in procs]
    r0, r1 = out
    # Both workers must produce identical evaluation results.
    assert r0 == r1
    results = r0
    assert results.metrics == {1: {"accuracy": 1.0 / 3},
                               3: {"accuracy": 2.0 / 3}}
    assert 3 == len(results.results)
    # Zero out wall-clock times and fix the ordering so the Result
    # comparisons below are deterministic.
    for result in results.results:
        result.time = 0.0
    results.results.sort(key=lambda x: x.sample["query"])
    assert Result({"query": "query0", "ground_truth": "c0"},
                  ["c0", "c1", "c2"],
                  {1: {"accuracy": 1.0}, 3: {"accuracy": 1.0}},
                  True, 0.0) == results.results[0]
    assert Result({"query": "query1", "ground_truth": "c0"},
                  ["c2", "c3", "c0"],
                  {1: {"accuracy": 0.0}, 3: {"accuracy": 1.0}},
                  True, 0.0) == results.results[1]
    assert Result({"query": "query2", "ground_truth": "c0"},
                  ["c2", "c3", "c5"],
                  {1: {"accuracy": 0.0}, 3: {"accuracy": 0.0}},
                  True, 0.0) == results.results[2]
def download(cache_path: str = os.path.join(
        DEFAULT_CACHE_DIR, "nl2bash.pt")) -> Dict[str, ListDataset]:
    @file_cache(cache_path)
    def _download():
        logger.info("Download nl2bash dataset")
        with tempfile.TemporaryDirectory() as tmpdir:
            with open(os.path.join(tmpdir, "download.bash"), "w") as file:
                file.write("""#!/usr/bin/env bash
tmpdir=$1
python -m venv $tmpdir/env
source $tmpdir/env/bin/activate
git clone --depth 1 https://github.com/TellinaTool/nl2bash $tmpdir/nl2bash
pip install tensorflow
pip install -r $tmpdir/nl2bash/requirements.txt
make -C $tmpdir/nl2bash
set +u
export PYTHONPATH=$tmpdir/nl2bash:$PYTHONPATH
set -u
make -C $tmpdir/nl2bash/scripts data
""")
            subprocess.run(
                ["bash", os.path.join(tmpdir, "download.bash"), tmpdir])

            def load(name: str):
                with open(os.path.join(tmpdir, "nl2bash", "data", "bash",
                                       f"{name}.nl.filtered"),
                          encoding="utf-8") as file:
                    inputs = list(file.readlines())
                with open(os.path.join(tmpdir, "nl2bash", "data", "bash",
                                       f"{name}.cm.filtered"),
                          encoding="utf-8") as file:
                    ground_truths = list(file.readlines())
                return [
                    Environment(
                        {
                            "text_query": input,
                            "ground_truth": ground_truth
                        },
                        set(["ground_truth"]))
                    for input, ground_truth in zip(inputs, ground_truths)
                ]

            # nl2bash ships train/dev/test; this loader uses "dev" as the
            # test split and "test" as the validation split.
            dataset = {}
            dataset["train"] = load("train")
            dataset["test"] = load("dev")
            dataset["valid"] = load("test")
            return dataset

    dataset = _download()
    return {key: ListDataset(value) for key, value in dataset.items()}
def __init__(self, dataset: torch.utils.data.Dataset,
             synthesizer: Synthesizer[Environment, Code],
             metrics: Mapping[str, Callable[[Environment, Code], float]],
             top_n: List[int] = [1, 3],
             n_samples: Optional[int] = None):
    super().__init__()
    self.dataset = dataset
    if n_samples is not None:
        # Evaluate on a fixed-size prefix of the dataset only.
        self.dataset = ListDataset(
            [self.dataset[i] for i in range(n_samples)])
    self.synthesizer = synthesizer
    self.metrics = metrics
    self.top_n = top_n
def test_impossible_case(self):
    entries = [Environment(
        {"ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParser())
    # Shrink the token vocabulary so some ground-truth tokens cannot
    # be encoded.
    d.tokens = [("", "y"), ("", "1")]
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(MockParser())(
        ground_truth="y = x + 1"
    )
    transform = EncodeActionSequence(aencoder)
    with pytest.raises(RuntimeError):
        transform(
            reference=[Token(None, "foo", "foo"),
                       Token(None, "bar", "bar")],
            action_sequence=action_sequence,
        )
def test_simple_case(self):
    accuracy = use_environment(
        Accuracy(),
        in_keys=["actual", ["ground_truth", "expected"]],
        value_key="actual"
    )
    dataset = ListDataset([
        Environment(
            {"query": "query0", "ground_truth": "c0"},
            set(["ground_truth"])
        ),
        Environment(
            {"query": "query1", "ground_truth": "c0"},
            set(["ground_truth"])
        ),
        Environment(
            {"query": "query2", "ground_truth": "c0"},
            set(["ground_truth"])
        ),
    ])
    results = EvaluateSynthesizer(dataset, synthesize,
                                  metrics={"accuracy": accuracy})()
    assert results.metrics == \
        {1: {"accuracy": 1.0 / 3.0}, 3: {"accuracy": 2.0 / 3.0}}
    assert 3 == len(results.results)
    results.results[0].time = 0.0
    results.results[1].time = 0.0
    results.results[2].time = 0.0
    assert Result({"query": "query0", "ground_truth": "c0"},
                  ["c0", "c1", "c2"],
                  {1: {"accuracy": 1.0}, 3: {"accuracy": 1.0}},
                  True, 0.0) == results.results[0]
    assert Result({"query": "query1", "ground_truth": "c0"},
                  ["c2", "c3", "c0"],
                  {1: {"accuracy": 0.0}, 3: {"accuracy": 1.0}},
                  True, 0.0) == results.results[1]
    assert Result({"query": "query2", "ground_truth": "c0"},
                  ["c2", "c3", "c5"],
                  {1: {"accuracy": 0.0}, 3: {"accuracy": 0.0}},
                  True, 0.0) == results.results[2]
def test_eval(self):
    entries = [Environment(
        {"text_query": "ab test", "ground_truth": "y = x + 1"},
        set(["ground_truth"])
    )]
    dataset = ListDataset(entries)
    d = get_samples(dataset, MockParserWithoutVariadicArgs())
    aencoder = ActionSequenceEncoder(d, 0)
    action_sequence = GroundTruthToActionSequence(
        MockParserWithoutVariadicArgs()
    )("y = x + 1")
    transform = AddPreviousActionRules(aencoder, 2)
    prev_rule_action = transform(
        reference=[Token(None, "ab", "ab"), Token(None, "test", "test")],
        action_sequence=action_sequence,
        train=False
    )
    assert np.array_equal(
        [
            # None -> Root
            [[1, -1, -1], [2, -1, -1], [-1, -1, -1]],
            # Assign -> Name, expr
            [[3, -1, -1], [4, -1, -1], [5, -1, -1]],
            # Name -> str
            [[4, -1, -1], [6, -1, -1], [-1, -1, -1]],
            # str -> "x"
            [[-1, -1, -1], [-1, 1, -1], [-1, -1, -1]],
            # Op -> str, expr, expr
            [[7, -1, -1], [6, -1, -1], [5, -1, -1]],
            # str -> "+"
            [[-1, -1, -1], [-1, 2, -1], [-1, -1, -1]],
            # Name -> str
            [[4, -1, -1], [6, -1, -1], [-1, -1, -1]],
            # str -> "y"
            [[-1, -1, -1], [-1, 3, -1], [-1, -1, -1]],
            # Number -> number
            [[8, -1, -1], [9, -1, -1], [-1, -1, -1]],
            [[-1, -1, -1], [-1, 4, -1], [-1, -1, -1]],
        ],
        prev_rule_action.numpy()
    )
def download(cache_path: str = os.path.join(DEFAULT_CACHE_DIR, "deepfix.pt"),
             path: str = BASE_PATH,
             get: Callable[[str, str], None] = default_get) -> ListDataset:
    @file_cache(cache_path)
    def _download():
        logger.info("Download DeepFix dataset")
        logger.debug(f"Dataset path: {path}")
        samples = []
        with tempfile.TemporaryDirectory() as tmpdir:
            # Fetch the zip archive, then unpack the gzipped SQLite
            # database it contains.
            dst = os.path.join(tmpdir, "dataset.zip")
            get(path, dst)
            gzipfile = os.path.join(tmpdir, "dataset.gz")
            with zipfile.ZipFile(dst) as z:
                with z.open(os.path.join("prutor-deepfix-09-12-2017",
                                         "prutor-deepfix-09-12-2017.db.gz"),
                            "r") as file, \
                        open(gzipfile, "wb") as dst_file:
                    copyfileobj(file, dst_file)
            sqlitefile = os.path.join(tmpdir, "dataset.db")
            with gzip.open(gzipfile, "rb") as src_file, \
                    open(sqlitefile, "wb") as dst_file:
                copyfileobj(src_file, dst_file)
            conn = sqlite3.connect(sqlitefile)
            c = conn.cursor()
            for code, error, errorcount in \
                    c.execute("SELECT code, error, errorcount FROM Code"):
                samples.append(
                    Environment(
                        {
                            "code": code,
                            "error": error,
                            "n_error": errorcount,
                        },
                        set(["error", "n_error"])))
        return samples

    samples = _download()
    return ListDataset(samples)
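# Usage sketch for the DeepFix loader above (illustrative, not part of the
# repository). Assumes the default `get` can fetch the archive at BASE_PATH
# and that Environment supports dict-style item access.
dataset = download()
print(len(dataset))
env = dataset[0]
print(env["n_error"])  # number of compiler errors in this sample
print(env["error"])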
def dataset():
    return ListDataset([
        Environment(
            {
                "value": torch.tensor(0),
                "ground_truth": torch.tensor(0)
            },
            set(["ground_truth"]),
        ),
        Environment(
            {
                "value": torch.tensor(1),
                "ground_truth": torch.tensor(1)
            },
            set(["ground_truth"]),
        ),
        Environment(
            {
                "value": torch.tensor(2),
                "ground_truth": torch.tensor(2)
            },
            set(["ground_truth"]),
        ),
    ])
def test_map_style_dataset(self):
    dataset = transform(ListDataset([0]), lambda x: x + 1)
    assert 1 == len(dataset)
    assert 1 == dataset[0]
    assert [1] == dataset[:1]
def pretrain(self, output_dir):
    dataset = Dataset(4, 1, 2, 1, 45, seed=0)
    train_dataset = ListDataset([
        Environment(
            {"ground_truth": Circle(1)},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Rectangle(1, 2)},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Rectangle(1, 1)},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Rotation(45, Rectangle(1, 1))},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Translation(1, 1, Rectangle(1, 1))},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Difference(Circle(1), Circle(1))},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Union(Rectangle(1, 2), Circle(1))},
            set(["ground_truth"]),
        ),
        Environment(
            {"ground_truth": Difference(Rectangle(1, 1), Circle(1))},
            set(["ground_truth"]),
        ),
    ])

    with tempfile.TemporaryDirectory() as tmpdir:
        interpreter = self.interpreter()
        # Attach interpreter-generated test cases to each ground truth.
        train_dataset = data_transform(
            train_dataset,
            Apply(
                module=AddTestCases(interpreter),
                in_keys=["ground_truth"],
                out_key="test_cases",
                is_out_supervision=False,
            ))
        encoder = self.prepare_encoder(dataset, Parser())
        collate = Collate(
            test_case_tensor=CollateOptions(False, 0, 0),
            variables_tensor=CollateOptions(True, 0, 0),
            previous_actions=CollateOptions(True, 0, -1),
            hidden_state=CollateOptions(False, 0, 0),
            state=CollateOptions(False, 0, 0),
            ground_truth_actions=CollateOptions(True, 0, -1)
        )
        collate_fn = Sequence(OrderedDict([
            ("to_episode", Map(self.to_episode(encoder, interpreter))),
            ("flatten", Flatten()),
            ("transform", Map(self.transform(encoder, interpreter,
                                             Parser()))),
            ("collate", collate.collate)
        ]))
        model = self.prepare_model(encoder)
        optimizer = self.prepare_optimizer(model)
        train_supervised(
            tmpdir, output_dir,
            train_dataset, model, optimizer,
            torch.nn.Sequential(OrderedDict([
                ("loss", Apply(
                    module=Loss(reduction="sum"),
                    in_keys=[
                        "rule_probs",
                        "token_probs",
                        "reference_probs",
                        "ground_truth_actions",
                    ],
                    out_key="action_sequence_loss",
                )),
                # Normalize the summed loss by the batch size (1 here).
                ("normalize", Apply(
                    [("action_sequence_loss", "lhs")],
                    "loss",
                    mlprogram.nn.Function(Div()),
                    constants={"rhs": 1})),
                ("pick", mlprogram.nn.Function(Pick("loss")))
            ])),
            None, "score", collate_fn, 1, Epoch(100),
            evaluation_interval=Epoch(10),
            snapshot_interval=Epoch(100)
        )
        return encoder, train_dataset