Example 1
def index(
    *,
    uasts_dir: str,
    instance_file: str,
    configs_dir: str,
    encoder_edge_types: List[str],
    max_length: int,
    log_level: str,
) -> None:
    """Index UASTs with respect to some fields."""
    Config.from_arguments(locals(), ["uasts_dir", "instance_file"],
                          "configs_dir").save(
                              Path(configs_dir) / "index.json")
    logger = setup_logging(__name__, log_level)

    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    instance_file_path = Path(instance_file).expanduser().resolve()

    instance = Instance(fields=[
        TypedDGLGraphField(name="typed_dgl_graph",
                           type="graph",
                           edge_types=encoder_edge_types),
        MetadataField(name="metadata", type="metadata"),
        BinaryLabelsField(name="label", type="label"),
        IndexesField(name="indexes", type="indexes"),
        InternalTypeField(name="internal_type", type="input"),
        RolesField(name="roles", type="input"),
        LengthField(name="max_length", type="input", max_length=max_length),
    ])

    logger.info(f"Indexing %s", uasts_dir_path)
    for file_path in uasts_dir_path.rglob("*.asdf"):
        with asdf_open(str(file_path)) as af:
            instance.index({
                Nodes: Nodes.from_tree(af.tree["nodes"]),
                CodRepLabel: CodRepLabel.from_tree(af.tree["codrep_label"]),
                str: af.tree["filepath"],
            })
    instance.save(instance_file_path)
    logger.info(f"Indexed  %s", uasts_dir_path)
Example 2
def tensorize(
    *,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    configs_dir: str,
    n_workers: int,
    pickle_protocol: int,
    log_level: str,
) -> None:
    """Tensorize the UASTs."""
    Config.from_arguments(locals(),
                          ["uasts_dir", "instance_file", "tensors_dir"],
                          "configs_dir").save(
                              Path(configs_dir) / "tensorize.json")
    logger = setup_logging(__name__, log_level)

    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    worker = partial(
        _tensorize_worker,
        instance=instance,
        logger=logger,
        uasts_dir_path=uasts_dir_path,
        output_dir_path=tensors_dir_path,
        pickle_protocol=pickle_protocol,
    )

    logger.info(f"Tensorizing %s", uasts_dir_path)
    with Pool(n_workers) as pool:
        pool.map(
            worker,
            (p.relative_to(uasts_dir_path)
             for p in uasts_dir_path.rglob("*.asdf")),
        )
    logger.info(f"Tensorized  %s", uasts_dir_path)
Example 3
def parse(*, raw_dir: str, uasts_dir: str, configs_dir: str,
          log_level: str) -> None:
    """Parse a CodRep 2019 dataset into UASTs."""
    Config.from_arguments(locals(), ["raw_dir", "uasts_dir"],
                          "configs_dir").save(
                              Path(configs_dir) / "parse.json")
    logger = setup_logging(__name__, log_level)
    raw_dir_path = Path(raw_dir).expanduser().resolve()
    uasts_dir_path = Path(uasts_dir).expanduser().resolve()
    uasts_dir_path.mkdir(parents=True, exist_ok=True)

    parser = JavaParser(split_formatting=True)
    logger.info("Parsing %s", raw_dir_path)
    labels_file = raw_dir_path / "out.txt"
    extract_labels = labels_file.is_file()
    if extract_labels:
        error_offsets = {}
        # Use a context manager so the labels file handle is closed.
        with labels_file.open("r", encoding="utf8") as fh:
            for i, line in enumerate(fh):
                error_offsets["%d.txt" % i] = int(line) - 1
    for file_path in raw_dir_path.rglob("*.txt"):
        # Guard on extract_labels: samefile raises if out.txt does not exist.
        if extract_labels and file_path.samefile(labels_file):
            continue
        file_path_relative = file_path.relative_to(raw_dir_path)
        start = time()
        logger.debug("Parsing %s", file_path_relative)
        try:
            nodes = parser.parse(raw_dir_path, file_path_relative)
        except ParsingException:
            continue
        logger.debug(
            "Parsed  %s into %d nodes in %.2fms",
            file_path_relative,
            len(nodes.nodes),
            (time() - start) * 1000,
        )
        error_node_index = None
        if extract_labels:
            error_offset = error_offsets[file_path.name]
            for formatting_i, i in enumerate(nodes.formatting_indexes):
                node = nodes.nodes[i]
                if node.start == error_offset:
                    error_node_index = formatting_i
                    break
            else:
                logger.warning(
                    "Could not retrieve a formatting node for the error at offset %d "
                    "of file %s.",
                    error_offset,
                    file_path.with_suffix("").name,
                )
                continue
        codrep_label = CodRepLabel(
            error_index=error_node_index,
            n_formatting_nodes=len(nodes.formatting_indexes),
        )
        output_subdirectory = uasts_dir_path / file_path_relative.parent
        output_subdirectory.mkdir(parents=True, exist_ok=True)
        with (output_subdirectory /
              file_path.with_suffix(".asdf").name).open("wb") as fh:
            af = AsdfFile(
                dict(
                    nodes=nodes.to_tree(file_path.read_text(encoding="utf-8")),
                    codrep_label=codrep_label.to_tree(),
                    filepath=str(file_path_relative),
                ))
            af.write_to(fh, all_array_compression="bzp2")
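
The ASDF files written here are exactly what index() in Example 1 reads back. A hedged round-trip check, using the same asdf_open alias as above (the file name is a placeholder):

# Hypothetical sanity check on one written file.
with asdf_open(str(uasts_dir_path / "0.asdf")) as af:
    nodes = Nodes.from_tree(af.tree["nodes"])
    codrep_label = CodRepLabel.from_tree(af.tree["codrep_label"])
    print(af.tree["filepath"], len(nodes.nodes), codrep_label.error_index)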
Example 4
File: run.py Project: m09/mloncode
def run(
    *,
    raw_dir: str,
    uasts_dir: str,
    instance_file: str,
    tensors_dir: str,
    checkpoint_file: str,
    configs_dir: str,
    training_configs_dir: str,
    prefix: str,
    metadata_dir: Optional[str],
    log_level: str,
) -> None:
    """Run the model and output CodRep predictions."""
    arguments = locals()
    configs_dir_path = Path(configs_dir).expanduser().resolve()
    configs_dir_path.mkdir(parents=True, exist_ok=True)
    training_configs_dir_path = Path(training_configs_dir).expanduser().resolve()
    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    Config.from_arguments(
        arguments, ["instance_file", "checkpoint_file"], "configs_dir"
    ).save(configs_dir_path / "train.json")
    logger = setup_logging(__name__, log_level)

    training_configs = {}
    for step in ["parse", "tensorize", "train"]:
        with (training_configs_dir_path / step).with_suffix(".json").open(
            "r", encoding="utf8"
        ) as fh:
            training_configs[step] = json_load(fh)

    parse(
        raw_dir=raw_dir,
        uasts_dir=uasts_dir,
        configs_dir=configs_dir,
        log_level=log_level,
    )

    tensorize(
        uasts_dir=uasts_dir,
        instance_file=instance_file,
        tensors_dir=tensors_dir,
        configs_dir=configs_dir,
        n_workers=training_configs["tensorize"]["options"]["n_workers"],
        pickle_protocol=training_configs["tensorize"]["options"]["pickle_protocol"],
        log_level=log_level,
    )

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info(f"Dataset of size {len(dataset)}")

    with bz2_open(instance_file, "rb") as fh_instance:
        instance = pickle_load(fh_instance)

    model = build_model(
        instance=instance,
        model_decoder_type=training_configs["train"]["options"]["model_decoder_type"],
        model_encoder_iterations=training_configs["train"]["options"][
            "model_encoder_iterations"
        ],
        model_encoder_output_dim=training_configs["train"]["options"][
            "model_encoder_output_dim"
        ],
        model_encoder_message_dim=training_configs["train"]["options"][
            "model_encoder_message_dim"
        ],
        model_learning_rate=training_configs["train"]["options"]["model_learning_rate"],
        model_batch_size=training_configs["train"]["options"]["model_batch_size"],
        train_dataset=dataset,
        eval_dataset=None,
        test_dataset=None,
    )

    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info(f"Configured model {model}")

    model.load_state_dict(
        torch_load(checkpoint_file, map_location="cpu")["model_state_dict"]
    )
    model.eval()
    logger.info(f"Loaded model parameters from %s", checkpoint_file)

    metadata = None if metadata_dir is None else model.build_metadata()
    metadata_output = (
        None if metadata_dir is None else Path(metadata_dir) / "metadata.json"
    )

    dataloader = model.train_dataloader()

    graph_field = instance.get_field_by_type("graph")
    label_field = instance.get_field_by_type("label")
    indexes_field = instance.get_field_by_type("indexes")
    metadata_field = instance.get_field_by_type("metadata")
    graph_input_fields = instance.get_fields_by_type("input")
    graph_input_dimensions = [48, 48, 32]
    feature_names = [field.name for field in graph_input_fields]
    with no_grad():
        for batch in dataloader:
            graph, etypes = batch[graph_field.name]
            features = [batch[field_name] for field_name in feature_names]
            indexes, offsets = batch[indexes_field.name].indexes
            forward = model.forward(graph, etypes, features, indexes)
            model.decode(
                batched_graph=graph,
                indexes=indexes,
                offsets=offsets,
                forward=forward,
                paths=batch[metadata_field.name],
                prefix=prefix,
                metadata=metadata,
            )

    if metadata_output is not None:
        with metadata_output.open("w", encoding="utf8") as fh:
            json_dump(metadata, fh)
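
run() assumes each per-step JSON saved during training keeps its CLI options under an "options" key. A sketch of the dict shape json_load must return for the accesses above; the keys come from run() itself, every value is a placeholder assumption:

# Expected shape of training_configs (values are assumptions).
training_configs = {
    "tensorize": {"options": {"n_workers": 4, "pickle_protocol": 4}},
    "train": {
        "options": {
            "model_decoder_type": "ffn",
            "model_encoder_iterations": 8,
            "model_encoder_output_dim": 128,
            "model_encoder_message_dim": 128,
            "model_learning_rate": 1e-3,
            "model_batch_size": 10,
        }
    },
}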
Example 5
def train(
    *,
    instance_file: str,
    tensors_dir: str,
    train_dir: str,
    configs_dir: str,
    model_encoder_iterations: int,
    model_encoder_output_dim: int,
    model_encoder_message_dim: int,
    model_decoder_type: str,
    model_learning_rate: float,
    model_batch_size: int,
    trainer_epochs: int,
    trainer_eval_every: int,
    trainer_limit_epochs_at: Optional[int],
    trainer_train_eval_split: float,
    trainer_selection_metric: str,
    trainer_kept_checkpoints: int,
    trainer_cuda: Optional[int],
    log_level: str,
) -> None:
    """Run the training."""
    Config.from_arguments(locals(),
                          ["instance_file", "tensors_dir", "train_dir"],
                          "configs_dir").save(
                              Path(configs_dir) / "train.json")
    logger = setup_logging(__name__, log_level)

    tensors_dir_path = Path(tensors_dir).expanduser().resolve()
    train_dir_path = Path(train_dir).expanduser().resolve()
    train_dir_path.mkdir(parents=True, exist_ok=True)

    with bz2_open(instance_file, "rb") as fh:
        instance = pickle_load(fh)

    dataset = CodRepDataset(input_dir=tensors_dir_path)
    logger.info("Dataset of size %d", len(dataset))

    train_length = round(0.9 * len(dataset))
    eval_length = round(0.05 * len(dataset))
    test_length = len(dataset) - train_length - eval_length

    train_dataset, eval_dataset, test_dataset = random_split(
        dataset, [train_length, eval_length, test_length])

    if trainer_cuda is not None:
        if not cuda_is_available():
            raise RuntimeError("CUDA is not available on this system.")
        device = torch_device("cuda:%d" % trainer_cuda)
    else:
        device = torch_device("cpu")
    model = build_model(
        instance=instance,
        model_encoder_iterations=model_encoder_iterations,
        model_encoder_output_dim=model_encoder_output_dim,
        model_encoder_message_dim=model_encoder_message_dim,
        model_decoder_type=model_decoder_type,
        model_learning_rate=model_learning_rate,
        model_batch_size=model_batch_size,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        test_dataset=test_dataset,
    )
    # The model needs a forward to be completely initialized.
    model.training_step(instance.collate([dataset[0]]), 0)
    logger.info("Configured model %s", model)

    checkpoint_callback = ModelCheckpoint(
        filepath=train_dir,
        save_best_only=True,
        verbose=True,
        monitor="eval_mrr",
        mode="max",
        prefix="",
    )

    trainer = Trainer(default_save_path=train_dir,
                      checkpoint_callback=checkpoint_callback)
    trainer.fit(model)
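
The hardcoded split above yields roughly 90% train, 5% eval, and the remainder test; for 1,000 tensorized files that is 900/50/50. A minimal invocation sketch of train(), with placeholder paths and hyperparameters (assumptions, not recommended values):

# Hypothetical call; every value below is a placeholder.
train(
    instance_file="data/instance.pickle.bz2",
    tensors_dir="data/tensors",
    train_dir="runs/train",
    configs_dir="configs",
    model_encoder_iterations=8,
    model_encoder_output_dim=128,
    model_encoder_message_dim=128,
    model_decoder_type="ffn",
    model_learning_rate=1e-3,
    model_batch_size=10,
    trainer_epochs=10,
    trainer_eval_every=1,
    trainer_limit_epochs_at=None,
    trainer_train_eval_split=0.9,
    trainer_selection_metric="eval_mrr",
    trainer_kept_checkpoints=3,
    trainer_cuda=None,
    log_level="DEBUG",
)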