def process_lines(lines):
    """Mask variables in every path context and re-serialize each line."""
    results = []
    for label, contexts in path_iterator(lines):
        masked_contexts = mask_variables_in_contexts(contexts)
        masked_contexts = [
            ",".join(masked_context) for masked_context in masked_contexts
        ]
        contexts_str = " ".join(masked_contexts)

        results.append(f"{label} {contexts_str}\n")

    return results
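# Every helper in this module consumes (label, contexts) pairs produced by
# path_iterator, which is defined elsewhere. A minimal sketch of the assumed
# behaviour, given raw lines in the usual "label start,path,end ..." format
# (an illustrative assumption, not the original implementation):
def path_iterator_sketch(lines):
    for line in lines:
        label, *raw_contexts = line.strip().split(" ")
        # each context is a comma-separated (start, path, end) triple
        contexts = [tuple(ctx.split(",")) for ctx in raw_contexts if ctx]
        yield label, contexts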
def extract_targets_and_subtokens(lines):
    subtokens = []
    targets = []

    for target, contexts in path_iterator(lines):
        targets.append(" ".join(target.split("|")))

        for start, _, end in contexts:
            subtokens.append(" ".join(start.split("|")))
            subtokens.append(" ".join(end.split("|")))

    return {"subtokens": subtokens, "targets": targets}
def compute_lengths(lines, subtoken_tokenizer, target_tokenizer):
    target_lengths = []
    subtoken_lengths = []
    for label, contexts in path_iterator(lines):
        target_lengths.append(len(target_tokenizer.encode(label.split("|"))))

        for start, path, end in contexts:
            subtoken_lengths.append(
                len(subtoken_tokenizer.encode(start.split("|"))))
            subtoken_lengths.append(
                len(subtoken_tokenizer.encode(end.split("|"))))

    return target_lengths, subtoken_lengths
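# Hedged usage sketch for compute_lengths: the encoded-length distributions are
# typically used to pick maximum sequence lengths. numpy and the two tokenizer
# objects are assumptions; any tokenizer exposing encode() would do.
import numpy as np

def example_length_stats(lines, subtoken_tokenizer, target_tokenizer):
    target_lengths, subtoken_lengths = compute_lengths(
        lines, subtoken_tokenizer, target_tokenizer)
    print("target length p95:", np.percentile(target_lengths, 95))
    print("subtoken length p95:", np.percentile(subtoken_lengths, 95))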
def process_chunk(lines):
    """Count target, context-token and path-node occurrences in a chunk of lines."""
    target_count = {}
    subtoken_count = {}
    node_count = {}

    for label, contexts in path_iterator(lines):
        add_subtokens_to_dict(label, target_count, split_subtokens=True)
        for start, path, end in contexts:
            add_subtokens_to_dict(start, subtoken_count, split_subtokens=True)
            add_subtokens_to_dict(end, subtoken_count, split_subtokens=True)
            add_subtokens_to_dict(path, node_count)

    return target_count, subtoken_count, node_count
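# add_subtokens_to_dict is referenced above but not defined in this snippet.
# A minimal sketch under the assumption that it increments per-token counts,
# optionally splitting "|"-separated subtokens first (the original may differ):
def add_subtokens_to_dict_sketch(token, counts, split_subtokens=False):
    parts = token.split("|") if split_subtokens else [token]
    for part in parts:
        counts[part] = counts.get(part, 0) + 1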
from typing import List  # needed for the List[str] annotation
def process_chunk_serial(lines: List[str]):
    """Build masked path-context entries, one per variable, for a chunk of lines."""
    masked_entries = []
    for label, contexts in path_iterator(lines):
        variables = get_variables_from_contexts(contexts)
        for variable in variables:
            new_entries = create_path_for_variable(variable, variables,
                                                   contexts)
            new_entries = new_entries.strip()

            assert "\n" not in new_entries, "Newline found in new_entries"

            if len(new_entries) > 0:
                masked_entries.append(new_entries)

    return masked_entries
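# The name process_chunk_serial suggests a parallel driver elsewhere. A hedged
# sketch of fanning chunks out with multiprocessing (the chunking strategy,
# chunk size and pool size are assumptions; lines must be an indexable list):
from multiprocessing import Pool

def example_process_parallel(lines, chunk_size=10000, workers=4):
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]
    with Pool(workers) as pool:
        results = pool.map(process_chunk_serial, chunks)
    # flatten the per-chunk lists of masked entries
    return [entry for chunk_result in results for entry in chunk_result]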
import os  # needed for the directory and path handling below
def mask_variables_for_method_name(src_folder, out_folder):
    raw_src_files = os.listdir(src_folder)
    raw_src_files = map(lambda p: os.path.join(src_folder, p), raw_src_files)

    for raw_src_file in raw_src_files:
        with open(raw_src_file, "r") as f:
            lines = map(lambda s: s.strip(), f.readlines())
            masked_paths = []
            for label, contexts in path_iterator(lines):
                masked_contexts = mask_variables_in_contexts(contexts)
                masked_contexts = [
                    ",".join(masked_context)
                    for masked_context in masked_contexts
                ]
                masked_contexts = " ".join(masked_contexts)
                masked_paths.append(f"{label} {masked_contexts}")

            out_file = os.path.join(out_folder, os.path.basename(raw_src_file))
            with open(out_file, "w") as out:
                for path in masked_paths:
                    out.write(f"{path}\n")
def verify_variable_dataset(lines):

    for variable, contexts in path_iterator(lines):
        # make sure that if variable appears in any context,
        # it is not a VDID or a Nm

        for context in contexts:
            start, path, end = map(lambda x: x.strip(), context)
            if start == variable:
                start_node = get_start_node(path)
                assert not (
                    is_name_expr(start_node) or is_variable(start_node)
                ), f"Verification failed: variable appears as the start token while the start node is a variable or a name expr:\n{context}"
                # print(f"variable name appeared with {start_node}")
            if end == variable:
                end_node = get_end_node(path)
                assert not (
                    is_name_expr(end_node) or is_variable(end_node)
                ), f"Verification failed: variable appears as the end token while the end node is a variable or a name expr:\n{context}"
def process_variables(lines):
    """Count variable targets, unmasked context tokens and path nodes."""
    target_count, subtoken_count, node_count = {}, {}, {}

    for label, contexts in path_iterator(lines):

        variables = get_variables_from_contexts(contexts)
        for variable in variables:
            add_subtokens_to_dict(variable, target_count)

        for start, path, end in contexts:
            start_node, end_node = [
                f(path) for f in [get_start_node, get_end_node]
            ]
            # count only context tokens that will not be masked out
            if not should_mask(start, start_node, variables):
                add_subtokens_to_dict(start, subtoken_count)

            if not should_mask(end, end_node, variables):
                add_subtokens_to_dict(end, subtoken_count)

            add_subtokens_to_dict(path, node_count)

    return target_count, subtoken_count, node_count
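# Hedged sketch of merging the per-chunk dictionaries returned by
# process_variables (or process_chunk) into global vocabularies;
# collections.Counter is used here for brevity and is an assumption.
from collections import Counter

def example_merge_counts(chunks_of_lines):
    target_count, subtoken_count, node_count = Counter(), Counter(), Counter()
    for chunk in chunks_of_lines:
        t, s, n = process_variables(chunk)
        target_count.update(t)
        subtoken_count.update(s)
        node_count.update(n)
    return target_count, subtoken_count, node_count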