def test_canon_distance_correctness(self, src_vertex, subtests):
    """Compare GumTree diff numbers against the ``canon_distance`` table.

    The canon tree of the first fragment registered for ``src_vertex`` is
    diffed against the canon tree of the first fragment of every vertex in
    ``VERTEX``; each destination vertex is reported as its own subtest.
    """
    src_fragment = all_fragments[INDICES_BY_VERTEX[src_vertex][0]]
    # A single requested tree type is expected to yield exactly one tree.
    (src_canon_tree,) = get_trees(src_fragment, {TREE_TYPE.CANON})

    for dst_vertex in VERTEX:
        with subtests.test():
            dst_fragment = all_fragments[INDICES_BY_VERTEX[dst_vertex][0]]
            (dst_canon_tree,) = get_trees(dst_fragment, {TREE_TYPE.CANON})
            real_dist = GumTreeDiff.create_tmp_files_and_get_diffs_number(
                src_canon_tree, dst_canon_tree)
            expected_dist = canon_distance[src_vertex][dst_vertex]
            assert real_dist == expected_dist
def test_anon_distance_correctness(self, subtests):
    """Compare GumTree diff numbers of anon trees against ``anon_distance``.

    Every ordered pair of the six module-level fragments is checked in its
    own subtest; ``anon_distance[i][j]`` holds the expected value for the
    pair (source ``i``, destination ``j``).
    """
    fragments = [
        fragment_0,
        fragment_1,
        fragment_2,
        fragment_3,
        fragment_4,
        fragment_5,
    ]

    for i, src_fragment in enumerate(fragments):
        # The source tree is built once per row and reused for every column.
        (src_anon_tree,) = get_trees(src_fragment, {TREE_TYPE.ANON})
        for j, dst_fragment in enumerate(fragments):
            with subtests.test():
                (dst_anon_tree,) = get_trees(dst_fragment, {TREE_TYPE.ANON})
                real_dist = GumTreeDiff.create_tmp_files_and_get_diffs_number(
                    src_anon_tree, dst_anon_tree)
                expected_dist = anon_distance[i][j]
                assert real_dist == expected_dist, f'Dists are not equal: {i}, {j}'
def drop_same_anon_trees(df: pd.DataFrame) -> pd.DataFrame:
    """Drop every row whose anon tree equals the anon tree of an earlier row.

    The first occurrence of each distinct anon tree is kept. The dataframe is
    modified in place (its index is reset to ``0..len(df) - 1`` both before
    and after dropping) and is also returned.

    :param df: dataframe with a ``CODE_TRACKER_COLUMN.FRAGMENT`` column
        holding the source fragments.
    :return: the same dataframe object without the duplicated rows.
    """
    log.info(f'Start dropping same anon trees, df size is {len(df)}')
    # Make labels coincide with row positions so positions can be dropped directly.
    df.index = np.arange(0, len(df))
    df_anon_trees = df[CODE_TRACKER_COLUMN.FRAGMENT.value].apply(
        lambda f: get_trees(f, {TREE_TYPE.ANON})[0]).to_list()

    # Positions are taken straight from the enumeration instead of being
    # reconstructed from a shrinking list: shifting by the total number of
    # earlier drops is wrong when an earlier drop was located *after* the
    # current element (e.g. trees [A, B, B, A] would record position 3 twice
    # and leave the duplicated B in place).
    unique_anon_trees = []
    indices_to_drop = []
    for position, anon_tree in enumerate(df_anon_trees):
        log.info(f'Handling {position}/{len(df_anon_trees)} anon tree')
        is_duplicate = any(
            are_asts_equal(kept_tree, anon_tree)
            for kept_tree in unique_anon_trees)
        if is_duplicate:
            log.info(f'Dropping {position} anon tree')
            indices_to_drop.append(position)
        else:
            unique_anon_trees.append(anon_tree)

    df.drop(df.index[indices_to_drop], inplace=True)
    df.index = np.arange(0, len(df))
    log.info(f'Stop dropping same anon trees, df size is {len(df)}')
    return df
def from_source(cls,
                source: str,
                rate: Optional[float],
                task: Optional[TASK] = None,
                language: consts.LANGUAGE = consts.LANGUAGE.PYTHON) -> Code:
    """Build a ``Code`` object from a source string.

    :param source: source code to build the anon and canon trees from.
    :param rate: rate of the code; when ``None`` it is taken from the first
        result of ``check_tasks`` run for ``task``.
    :param task: task used to compute the rate; must be given when ``rate``
        is ``None``.
    :param language: language passed to ``check_tasks`` and to ``Code``.
    :return: ``Code`` created from the trees, the rate and the language.
    """
    # Trees are built first, before any rate validation.
    anon_tree, canon_tree = get_trees(source,
                                      {TREE_TYPE.ANON, TREE_TYPE.CANON})

    rate_is_missing = rate is None
    if rate_is_missing and task is None:
        log_and_raise_error('Cannot find rate without task: both are None', log)
    if rate_is_missing:
        in_and_out_dict = create_in_and_out_dict([task])
        rates = check_tasks([task], source, in_and_out_dict, language)
        rate = rates[0]

    return Code(anon_tree, canon_tree, rate, language)
def __init__(self,
             source_code: Optional[str] = None,
             anon_tree: Optional[ast.AST] = None,
             canon_tree: Optional[ast.AST] = None):
    """Store the original, anon and canon trees.

    When ``source_code`` is given, all three trees come from ``get_trees``
    and the ``anon_tree`` / ``canon_tree`` arguments are ignored. Otherwise
    the original tree is ``None`` and the passed trees are stored as is.
    """
    if source_code is None:
        trees = (None, anon_tree, canon_tree)
    else:
        trees = get_trees(source_code, TREE_TYPE.get_all_types_set())
    self._orig_tree, self._anon_tree, self._canon_tree = trees
def test_same_canon_trees_in_same_vertices(self, vertex: VERTEX, subtests):
    """Check that all fragments of one vertex have equal canon trees.

    Every ordered pair of canon trees (including a tree paired with itself)
    is compared in its own subtest.
    """
    same_canon_fragments = [all_fragments[i] for i in INDICES_BY_VERTEX[vertex]]
    canon_trees = []
    for fragment in same_canon_fragments:
        canon_trees.append(get_trees(fragment, {TREE_TYPE.CANON})[0])

    # Nested loops walk the full cartesian square in row-major order.
    for canon_tree_1 in canon_trees:
        for canon_tree_2 in canon_trees:
            with subtests.test():
                assert are_asts_equal(canon_tree_1, canon_tree_2)
def get_expected_out(
        solutions: pd.DataFrame, start_index: int,
        end_index: int) -> Tuple[int, List[AtiItem], ast.AST, ast.AST]:
    """Build the expected output for the rows ``[start_index, end_index)``.

    :param solutions: dataframe the fragment and the ATI data are read from.
    :param start_index: first row of the range; its fragment provides the trees.
    :param end_index: row right after the range (exclusive bound).
    :return: ``end_index``, the ATI items of every row in the range, and the
        anon and canon trees of the fragment at ``start_index``.
    """
    first_fragment = __get_column_value(solutions, start_index,
                                        CODE_TRACKER_COLUMN.FRAGMENT)
    anon_tree, canon_tree = get_trees(first_fragment,
                                      {TREE_TYPE.ANON, TREE_TYPE.CANON})
    ati_elements = [
        __get_ati_data(solutions, row)
        for row in range(start_index, end_index)
    ]
    return end_index, ati_elements, anon_tree, canon_tree
def test_different_canon_trees_in_different_vertices(self, subtests):
    """Check that canon trees taken from different vertices are pairwise different.

    One fragment (the first one) is taken per vertex and every unordered pair
    of the resulting canon trees is compared in its own subtest.
    """
    import itertools

    # Take the first fragment from each vertex to get all fragments with different canon trees
    different_canon_fragments = [
        all_fragments[INDICES_BY_VERTEX[vertex][0]] for vertex in VERTEX
    ]
    canon_trees = [
        get_trees(f, {TREE_TYPE.CANON})[0] for f in different_canon_fragments
    ]

    # Comparing each tree only with its cyclic neighbour would miss equal
    # trees in non-adjacent vertices, and would compare a tree with itself
    # if there were a single vertex, so all unordered pairs are checked.
    for canon_tree_1, canon_tree_2 in itertools.combinations(canon_trees, 2):
        with subtests.test():
            assert not are_asts_equal(canon_tree_1, canon_tree_2)
def __find_same_fragments(
        solutions: pd.DataFrame,
        start_index: int) -> Tuple[int, List[AtiItem], ast.AST, ast.AST]:
    """Collect the run of consecutive rows whose anon tree equals the one at ``start_index``.

    :param solutions: dataframe with the fragments.
    :param start_index: row the run starts from.
    :return: index of the first row that does not belong to the run (or the
        number of rows if the run reaches the end), the ATI elements gathered
        by ``__handle_current_ati`` for the run, and the anon and canon trees
        of the fragment at ``start_index``.
    """
    ati_elements = []
    __handle_current_ati(ati_elements, solutions, start_index)

    current_fragment = __get_column_value(solutions, start_index,
                                          consts.CODE_TRACKER_COLUMN.FRAGMENT)
    current_anon_tree, current_canon_tree = get_trees(
        current_fragment, {TREE_TYPE.ANON, TREE_TYPE.CANON})

    next_index = start_index + 1
    while next_index < solutions.shape[0]:
        # Stop at the first row whose anon tree differs from the starting one.
        if not __are_same_fragments(current_anon_tree, solutions, next_index):
            break
        __handle_current_ati(ati_elements, solutions, next_index)
        next_index += 1

    return next_index, ati_elements, current_anon_tree, current_canon_tree
def __are_same_fragments(current_anon_tree: ast.AST, solutions: pd.DataFrame,
                         next_index: int) -> bool:
    """Tell whether the fragment at ``next_index`` has the same anon tree as ``current_anon_tree``.

    :param current_anon_tree: anon tree to compare with.
    :param solutions: dataframe the fragment is read from.
    :param next_index: row of the fragment to check.
    :return: result of ``are_asts_equal`` for the two anon trees.
    """
    next_fragment = __get_column_value(solutions, next_index,
                                       consts.CODE_TRACKER_COLUMN.FRAGMENT)
    (next_anon_tree,) = get_trees(next_fragment, {TREE_TYPE.ANON})
    trees_are_equal = are_asts_equal(current_anon_tree, next_anon_tree)
    return trees_are_equal
# Script: builds the anon trees of two one-line sources and runs
# GumTreeDiff.create_tmp_files_and_get_diffs_number on them (the result is not used).
import sys

# '.' has to be on sys.path before the ``src`` imports below are executed
# (presumably so the script can be launched from the repository root).
sys.path.append('.')

from src.main.canonicalization.consts import TREE_TYPE
from src.main.canonicalization.canonicalization import get_trees
from src.main.canonicalization.diffs.gumtree import GumTreeDiff

src_source = 'a = 5'
dst_source = 'a = 6'

# A single requested tree type is unpacked as a one-element result.
[src_anon] = get_trees(src_source, {TREE_TYPE.ANON})
[dst_anon] = get_trees(dst_source, {TREE_TYPE.ANON})

GumTreeDiff.create_tmp_files_and_get_diffs_number(src_anon, dst_anon)
def __get_code_by_source(source: str, is_goal: bool = False) -> Code:
    """Create a ``Code`` from ``source``.

    :param source: source code to build the anon and canon trees from.
    :param is_goal: when truthy the rate is ``TEST_RESULT.FULL_SOLUTION.value``,
        otherwise it is ``0``.
    :return: ``Code`` with the built trees and the chosen rate.
    """
    anon_tree, canon_tree = get_trees(source,
                                      {TREE_TYPE.ANON, TREE_TYPE.CANON})
    if is_goal:
        rate = TEST_RESULT.FULL_SOLUTION.value
    else:
        rate = 0
    return Code(anon_tree=anon_tree, canon_tree=canon_tree, rate=rate)
def create_code_from_source(source: str,
                            rate: float = TEST_RESULT.CORRECT_CODE.value
                            ) -> Code:
    """Create a ``Code`` with the anon and canon trees of ``source`` and the given rate.

    :param source: source code to build the trees from.
    :param rate: rate stored in the result; defaults to
        ``TEST_RESULT.CORRECT_CODE.value``.
    :return: the created ``Code``.
    """
    trees = get_trees(source, {TREE_TYPE.ANON, TREE_TYPE.CANON})
    anon_tree, canon_tree = trees
    code = Code(anon_tree, canon_tree, rate)
    return code
def get_canonicalized_code_from_file(file: str) -> str:
    """Return the code generated from the canon tree of the content of ``file``.

    Trailing newline characters are removed from the generated code.
    """
    content = get_content_from_file(file)
    (canon_tree,) = get_trees(content, {TREE_TYPE.CANON})
    canonicalized_code = get_code_from_tree(canon_tree)
    return canonicalized_code.rstrip('\n')
def get_anonymized_code_from_file(file: str) -> str:
    """Return the code generated from the anon tree of the content of ``file``.

    Trailing newline characters are removed from the generated code.
    """
    content = get_content_from_file(file)
    # NOTE(review): the third positional argument (False) is passed through
    # unchanged; its meaning is defined by get_trees - confirm there.
    (anon_tree,) = get_trees(content, {TREE_TYPE.ANON}, False)
    anonymized_code = get_code_from_tree(anon_tree)
    return anonymized_code.rstrip('\n')