def get_tt_name_from_ati_data(ct_file: str, language: consts.LANGUAGE,
                              files_from_at: List[str]) -> Tuple[str, bool]:
    """
    Try to find the current name of the code tracker file among those tracked by the activity tracker plugin.
    """
    log.info('Start getting project file name')
    extension = get_extension_by_language(language)
    hashed_file_name = get_name_from_path(ct_file)
    file_name = get_original_file_name_with_extension(hashed_file_name,
                                                      extension)
    does_contain_name = True
    if files_from_at is not None:
        log.info(
            f'Start searching the file_name {file_name} in activity tracker data'
        )
        if file_name not in files_from_at:
            log.info(
                f'Activity tracker data does not contain the original file {file_name}'
            )
            does_contain_name = False
        log.info(
            f'Finish searching the file_name {file_name} in activity tracker data'
        )

    log.info('Finish getting project file name')
    return file_name, does_contain_name
Example 2
def __handle_tt_files(tt_files: List[str], output_task_path: str) -> bool:
    """
    The function returns True if new task-tracker file was created and False otherwise
    We should choose the last state of the task-tracker files for the task or all last states and create a new file
    where we union them. The student can submit the solution several times, while the history of the task-tracker file
    is not erased. In this way, we only need to select the final file with the entire history. On the other hand,
    if the file was full, then it will be sent additionally and new files will contain a new history.
    In this case, it is necessary to find the last states of all files with a unique history, combine according to
    timestamps and write to a new final file.

    For more details see https://github.com/JetBrains-Research/codetracker-data/wiki/Data-preprocessing:-primary-data-processing
    """
    dataframes = []
    file_name = None
    for tt_file in tt_files:
        current_df = pd.read_csv(tt_file, encoding=consts.ISO_ENCODING)
        if not is_test_mode(current_df):
            dataframes.append(current_df)
            if file_name is None:
                file_name = get_name_from_path(tt_file)
    if len(dataframes) == 0:
        return False
    new_tt_path = os.path.join(output_task_path, file_name)
    create_file("", new_tt_path)
    __merge_dataframes(
        dataframes,
        sorted_column=TASK_TRACKER_COLUMN.TIMESTAMP.value).to_csv(new_tt_path)
    return True
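__merge_dataframes is a private helper that is not shown in this listing. A minimal, self-contained sketch of what it might do, assuming each dataframe carries the timestamp column named by sorted_column (the real implementation may differ):

import pandas as pd
from typing import List


def merge_dataframes_sketch(dataframes: List[pd.DataFrame],
                            sorted_column: str) -> pd.DataFrame:
    # Hypothetical sketch: union all histories, drop rows duplicated across
    # submissions, and order the combined history by the timestamp column.
    merged = pd.concat(dataframes, ignore_index=True)
    merged = merged.drop_duplicates()
    return merged.sort_values(by=sorted_column).reset_index(drop=True)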
def run_test(self, input: str, expected_output: str,
             source_file: str) -> bool:
    args = [
        'java', '-cp', SOURCE_FOLDER,
        self.package + get_name_from_path(source_file, False)
    ]
    return check_output_safely(input, expected_output, args)
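check_output_safely is defined elsewhere in the project. A minimal sketch of such a helper, assuming it runs the program with the given input and compares trimmed stdout with the expected output (the name, timeout, and failure handling here are assumptions):

import subprocess
from typing import List


def check_output_safely_sketch(input_str: str, expected_output: str,
                               args: List[str]) -> bool:
    # Hypothetical sketch: run the program, feed it the test input, and
    # compare the trimmed stdout with the expected output; any failure
    # (non-zero exit, timeout, missing binary) counts as False.
    try:
        result = subprocess.run(args, input=input_str, capture_output=True,
                                text=True, timeout=10)
    except (subprocess.TimeoutExpired, OSError):
        return False
    return result.returncode == 0 and result.stdout.strip() == expected_output.strip()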
def __get_task_by_ct_file(file: str) -> Optional[TASK]:
    task_key = get_name_from_path(get_parent_folder(file),
                                  with_extension=False)
    try:
        return TASK(task_key)
    except ValueError:
        log.info(f'Unexpected task for the file {file}')
        return None
def get_in_and_out_files(test_type: DIFF_HANDLER_TEST_TYPES,
                         task: TASK) -> List[Tuple[str, str, str]]:
    src_and_dst_files = get_src_and_dst_files(test_type, task)
    in_and_out_files = []
    for src_file, dst_file in src_and_dst_files:
        src_file_number = get_name_from_path(src_file, with_extension=False)
        dst_file_number = get_name_from_path(dst_file, with_extension=False)
        out_file = os.path.join(get_parent_folder(src_file),
                                f'out_{src_file_number}_{dst_file_number}.py')
        if get_name_from_path(
                out_file) in FAILED_APPLYING_DIFFS_TO_STUDENTS_CODE_TEST.get(
                    task, []):
            continue
        # If there is no such out_file, it means that out-code is the same as dst-code from dst_file
        if not os.path.isfile(out_file):
            out_file = dst_file
        in_and_out_files.append((src_file, dst_file, out_file))
    return in_and_out_files
def run_test(paths_data_dict: PathsDataDict) -> None:
    for path in paths_data_dict[PATHS_TEST_DATA.PATHS]:
        # Check that ValueError is raised if the passed path has no extension but we want to return a filename with an extension:
        with pytest.raises(ValueError):
            get_name_from_path(path[EXTENSION.WITHOUT], True)

        # If the passed path has no extension and we want to return a filename without an extension:
        file_name = get_name_from_path(path[EXTENSION.WITHOUT], False)
        assert file_name == paths_data_dict[PATHS_TEST_DATA.FILE_NAME][
            EXTENSION.WITHOUT]

        # If the passed path has an extension and we want to return a filename without the extension:
        file_name = get_name_from_path(path[EXTENSION.WITH], False)
        assert file_name == paths_data_dict[PATHS_TEST_DATA.FILE_NAME][
            EXTENSION.WITHOUT]

        # If the passed path has an extension and we want to return a filename with the extension:
        file_name = get_name_from_path(path[EXTENSION.WITH], True)
        assert file_name == paths_data_dict[PATHS_TEST_DATA.FILE_NAME][
            EXTENSION.WITH]
def __are_same_files(code_tracker_file_name: str,
                     activity_tracker_file_path: str) -> bool:
    if pd.isnull(activity_tracker_file_path):
        return False
    try:
        activity_tracker_file_name = get_name_from_path(
            activity_tracker_file_path)
    except ValueError:
        # If the activity_tracker_file_name has an invalid extension, it does not equal code_tracker_file_name
        return False
    return code_tracker_file_name == activity_tracker_file_name
Example 8
def preprocess_data(path: str) -> str:
    """
    We use task-tracker plugin (see https://github.com/JetBrains-Research/task-tracker-plugin)
    and activity tracker plugin (see https://plugins.jetbrains.com/plugin/8126-activity-tracker)
    to gather the source data. The data gathering consists of us collecting code snapshots and actions during
    the solving of various programming tasks by students. The data also contains information about the age,
    programming experience and so on of the student (student profile), and the current task that the student is solving.

    - At this stage, the test files that were created during the testing phase are deleted. They have ON value in the
    test mode column in the task-tracker file.
    - Also, the student could send several files with the history of solving the task, each of which can include
    the previous ones. At this stage, unnecessary files are deleted. Ultimately, there is only one file with a unique
    history of solving the current problem.
    - In addition, for each task-tracker file, a unique file of the activity tracker is sent. In this step,
    all files of the activity tracker are combined into one.

    For more details see
    https://github.com/JetBrains-Research/task-tracker-post-processing/wiki/Data-processing:-primary-data-processing
    """
    output_directory = get_output_directory(path,
                                            consts.PREPROCESSING_DIRECTORY)
    user_folders = get_all_file_system_items(path, user_subdirs_condition,
                                             consts.FILE_SYSTEM_ITEM.SUBDIR)
    for user_folder in user_folders:
        output_user_path = os.path.join(output_directory,
                                        get_name_from_path(user_folder, False))
        log.info(f'Start handling the path {user_folder}')
        task_folders = get_all_file_system_items(
            user_folder, all_items_condition, consts.FILE_SYSTEM_ITEM.SUBDIR)
        for task_folder in task_folders:
            output_task_path = os.path.join(
                output_user_path, get_name_from_path(task_folder, False))
            log.info(f'Start handling the folder {task_folder}')
            files = get_all_file_system_items(
                task_folder, extension_file_condition(EXTENSION.CSV))
            tt_files, ati_files = __partition_into_tt_and_ati_files(files)
            if __handle_tt_files(tt_files, output_task_path) and ati_files:
                new_ati_path = os.path.join(output_task_path,
                                            get_name_from_path(ati_files[0]))
                __merge_ati_files(ati_files).to_csv(new_ati_path)
    return output_directory
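is_test_mode (used by __handle_tt_files above) is not shown in this listing. A hypothetical sketch, assuming the task-tracker CSV has a test-mode column whose value is 'ON' for test files (the column name here is an assumption):

import pandas as pd

TEST_MODE_COLUMN = 'testMode'  # assumed column name


def is_test_mode_sketch(df: pd.DataFrame) -> bool:
    # Hypothetical sketch: a task-tracker file is a test file if its
    # test-mode column contains the ON value.
    return bool((df[TEST_MODE_COLUMN] == 'ON').any())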
Example 9
def check_anonymization(old_files_root: str, new_files_root: str) -> List[str]:
    """
    Find incorrect anonymized files. The file is incorrect if:
     - does not exist in the new folder
     - has more or less rows than in the old folder
    """
    files_with_errors = []
    language_dirs = get_all_file_system_items(
        new_files_root,
        item_condition=language_item_condition,
        item_type=FILE_SYSTEM_ITEM.SUBDIR)
    for language_dir in language_dirs:
        task_dirs = get_all_file_system_items(
            language_dir,
            item_condition=task_item_condition,
            item_type=FILE_SYSTEM_ITEM.SUBDIR)
        language = get_name_from_path(language_dir, with_extension=False)
        for task_dir in task_dirs:
            task = get_name_from_path(task_dir, with_extension=False)
            old_path = f'{remove_slash(old_files_root)}/{language}/{task}'
            old_files = get_all_file_system_items(
                old_path,
                item_condition=extension_file_condition(EXTENSION.CSV))
            for old_file in old_files:
                name = get_name_from_path(old_file)
                new_file_path = f'{task_dir}/{name}'
                if not does_exist(new_file_path):
                    files_with_errors.append(new_file_path)
                else:
                    try:
                        new_df = pd.read_csv(new_file_path,
                                             encoding=ISO_ENCODING)
                        old_df = pd.read_csv(old_file, encoding=ISO_ENCODING)
                        if new_df.shape[0] != old_df.shape[0]:
                            files_with_errors.append(new_file_path)
                    except pd.errors.EmptyDataError:
                        files_with_errors.append(new_file_path)
    return files_with_errors
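For illustration, a hypothetical call (both paths are placeholders):

# Hypothetical usage: compare an original dump with its anonymized copy.
files_with_errors = check_anonymization('data/original', 'data/anonymized')
for path in files_with_errors:
    print(f'Anonymization mismatch: {path}')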
def get_tasks_statistics(path: str) -> TaskStatistics:
    statistics = {}
    language_values = [language.value for language in consts.LANGUAGE]
    language_folders = get_all_file_system_items(
        path, contains_substrings_condition(language_values), SUBDIR)
    for l_f in language_folders:
        language = consts.LANGUAGE(get_name_from_path(l_f, False))
        if statistics.get(language):
            log_and_raise_error(
                f'Duplicate language folder for {language.value}', log)
        statistics[language] = {}
        task_values = consts.TASK.tasks_values()
        task_folders = get_all_file_system_items(
            l_f, contains_substrings_condition(task_values), SUBDIR)
        for t_f in task_folders:
            files = get_all_file_system_items(t_f)
            task = consts.TASK(get_name_from_path(t_f, False))
            if statistics[language].get(task):
                log_and_raise_error(
                    f'Duplicate task for {task.value} in folder {l_f}', log)
            statistics[language][task] = len(files)

    return statistics
def anonymize_cpp_code(root: str,
                       local_gorshochek_path: str,
                       output_folder_name: str = 'anonymizerResult') -> None:
    """
    We use the gorshochek library: https://github.com/JetBrains-Research/gorshochek
    You need to clone the repo and build a docker image (see the gorshochek README).

    Note: you need to change the config.yaml file before building the docker image:

    n transformations: 1
    transformations:
      - remove comments:
          p: 1.0
      - rename entities:
          p: 1
          rename functions: true
          rename variables: true
          strategy:
              name: hash
              hash prefix: d

    You can change the 'seed', 'max tokens', and 'max token len' params if you want.
    """
    cpp_path = f'{remove_slash(root)}/{LANGUAGE.CPP.value}'
    output_path = f'{get_parent_folder(root)}/{output_folder_name}/{LANGUAGE.CPP.value}'

    task_dirs = get_all_file_system_items(cpp_path,
                                          item_condition=task_item_condition,
                                          item_type=FILE_SYSTEM_ITEM.SUBDIR)
    gorshochek_anonymizer = GorshochekAnonymizer(local_gorshochek_path)
    for task_dir in task_dirs:
        task = get_name_from_path(task_dir, with_extension=False)
        print(f'Start handling the task {task}')
        files = get_all_file_system_items(
            task_dir, item_condition=extension_file_condition(EXTENSION.CSV))
        for file in files:
            print(f'Start handling the file {file}')
            df = pd.read_csv(file, encoding=ISO_ENCODING)
            # Delete incorrect fragments
            df = df[df.apply(
                lambda row: not is_incorrect_fragment(row[TESTS_RESULTS]),
                axis=1)]
            df[TASK_TRACKER_COLUMN.FRAGMENT.value] = \
                df[TASK_TRACKER_COLUMN.FRAGMENT.value].apply(gorshochek_anonymizer.anonymize_code_fragment)
            current_output_path = f'{output_path}/{task}/{get_name_from_path(file)}'
            create_file('', current_output_path)
            df.to_csv(current_output_path)

    gorshochek_anonymizer.remove_directories()
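A hypothetical invocation (both paths are placeholders):

# Expects a cloned gorshochek checkout with its docker image already built.
anonymize_cpp_code(root='data/dataset',
                   local_gorshochek_path='/home/user/gorshochek')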
def get_files_from_ati(activity_tracker_data: pd.DataFrame) -> List[str]:
    paths = __remove_nan(activity_tracker_data[
        consts.ACTIVITY_TRACKER_COLUMN.CURRENT_FILE.value].unique())
    paths_dict = {}
    for current_path in paths:
        path = get_parent_folder(current_path)
        file = get_name_from_path(current_path)
        if paths_dict.get(file) is None:
            paths_dict[file] = path
        else:
            if paths_dict[file] != path:
                log_and_raise_error(
                    'Activity tracker data contains several files with the same names',
                    log)
    return list(paths_dict.keys())
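__remove_nan is a small private helper that is not shown here. A sketch of its assumed behaviour, dropping the NaN entries pandas yields for empty 'current file' cells:

import pandas as pd
from typing import Iterable, List


def remove_nan_sketch(values: Iterable) -> List:
    # Hypothetical sketch: drop NaN entries so only real file paths remain.
    return [value for value in values if not pd.isnull(value)]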
Example 13
def crop_data_and_save(original_data_path: str,
                       column: Column,
                       start_value: Any,
                       end_value: Any = None,
                       file_name_prefix: str = 'crop_',
                       folder_name_prefix: str = 'cropped_data',
                       create_sub_folder: bool = True) -> str:
    original_data = pd.read_csv(original_data_path, encoding=ISO_ENCODING)
    cropped_data = crop_data_by_timestamp(original_data, column, start_value,
                                          end_value)
    cropped_data_name = file_name_prefix + get_name_from_path(
        original_data_path)
    cropped_data_folder = get_parent_folder(original_data_path)
    if create_sub_folder:
        cropped_data_folder = os.path.join(cropped_data_folder,
                                           folder_name_prefix)
    cropped_data_result_path = os.path.join(cropped_data_folder,
                                            cropped_data_name)
    create_folder_and_write_df_to_file(cropped_data_folder,
                                       cropped_data_result_path, cropped_data)
    return cropped_data_result_path
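crop_data_by_timestamp is defined elsewhere. A simplified sketch of what it might do, assuming the column argument can be resolved to a plain column name and the range is inclusive (both are assumptions):

import pandas as pd
from typing import Any


def crop_data_by_timestamp_sketch(df: pd.DataFrame, column: str,
                                  start_value: Any,
                                  end_value: Any = None) -> pd.DataFrame:
    # Hypothetical sketch: keep rows whose value in `column` lies in
    # [start_value, end_value]; an open end keeps everything from
    # start_value onwards.
    mask = df[column] >= start_value
    if end_value is not None:
        mask &= df[column] <= end_value
    return df[mask]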
def calculate_current_task_rate(df: pd.DataFrame) -> pd.DataFrame:
    file_name = df[FILE_NAME].unique()[0]
    current_task = TASK(get_name_from_path(file_name, False))
    return df[TESTS_RESULTS].apply(
        lambda x: unpack_tests_results(x, TASK.tasks())[TASK.tasks().index(current_task)])
Example 15
def __get_current_task_key(df: pd.DataFrame) -> str:
    key = get_name_from_path(df.iloc[0][FILE_NAME], with_extension=False)
    # Old name from old data
    if key == 'zero':
        key = 'is_zero'
    return consts.TASK(key).value
def get_short_name(path: str) -> str:
    folder = get_parent_folder_name(path)
    file_name = get_name_from_path(path)
    return os.path.join(folder,
                        crop_string(file_name, plot_consts.SHORT_NAME_LENGTH))
def __are_same_files(code_tracker_file_name: str,
                     activity_tracker_file_path: str) -> bool:
    if pd.isnull(activity_tracker_file_path):
        return False
    activity_tracker_file_name = get_name_from_path(activity_tracker_file_path)
    return code_tracker_file_name == activity_tracker_file_name
Example 18
def __get_dst_path(src_file: str, output_directory: str) -> str:
    file_name = get_name_from_path(src_file)
    task_path = get_parent_folder(src_file)
    task = get_name_from_path(task_path, with_extension=False)
    language = get_parent_folder_name(task_path)
    return os.path.join(output_directory, language, task, file_name)
Example 19
def split_tasks_into_separate_files(
        path: str, output_directory_suffix: str = 'separated_tasks') -> str:
    files = get_all_file_system_items(path, ct_file_condition)
    output_directory = get_output_directory(path, output_directory_suffix)
    for file in files:
        log.info(f'Start splitting file {file}')
        ct_df = pd.read_csv(file, encoding=consts.ISO_ENCODING)
        language = get_ct_language(ct_df)
        split_df = find_splits(ct_df)
        for task in consts.TASK:
            task_dfs = find_task_dfs(split_df, task)
            for i, task_df in enumerate(task_dfs):
                if not task_df.empty:
                    # Change name to get something like pies/ati_207_test_5894859_i.csv
                    filename = task.value + '/' + get_parent_folder_name(file) + '_' + get_name_from_path(file, False) \
                               + f'_{i}' + get_extension_from_file(file).value
                    write_based_on_language(output_directory, filename,
                                            task_df, language)
    return output_directory