Esempio n. 1
0
def merge_to_selected_repositories(config: Config, filename: str) -> None:
    """Merge the repositories listed in ``filename`` into the selected set.

    Rows are de-duplicated on ``repository_name``. The rows loaded from
    ``filename`` are placed first in the concatenation, so when a name is
    present in both tables the row from ``filename`` is the one kept
    (``drop_duplicates`` keeps the first occurrence by default).

    The selected-repositories file is backed up before being overwritten.
    The prepared-repositories file is then backed up as well; an
    ``IOError`` raised by that second backup is ignored.
    """
    current = config.load_csv(File.SELECTED_REPOSITORIES)
    incoming = config.load_csv(filename)

    # Order matters: incoming rows come first so they win on duplicates.
    merged = pd.concat([incoming, current]).drop_duplicates(
        subset='repository_name'
    )

    config.backup(File.SELECTED_REPOSITORIES)
    config.save_csv(merged, File.SELECTED_REPOSITORIES)

    # Best-effort: a failing backup of the prepared file is not an error here.
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)
Esempio n. 2
0
def select_more_repositories(config: Config, languages: List[str]) -> None:
    """Top up the selected repositories for each of the given languages.

    Repositories from the altered dataset that are not already in the
    selected-repositories file are shuffled, and for each language just
    enough of them are picked to reach
    ``config.nb_repositories_per_language`` (counting the ones already
    selected). The picks are appended to the already selected repositories
    and the result is saved to the selected-repositories file.

    Before saving, the selected-repositories file is backed up. The
    prepared-repositories file is backed up as well; an ``IOError`` raised
    by that second backup is ignored.

    Args:
        config: Project configuration giving access to the CSV files, the
            list of supported languages and the per-language quota.
        languages: Languages for which more repositories should be picked.

    Raises:
        RuntimeError: If a requested language is not in
            ``config.languages``, or if no new repository could be picked
            for any of the requested languages.
    """
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    known = config.load_csv(File.SELECTED_REPOSITORIES)

    # Keep only candidates that are not selected yet, in random order
    # (the shuffle is unseeded, so the picks differ from run to run).
    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)

    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in languages:
        if lang not in config.languages:
            LOGGER.error(f'Unknown language {lang}')
            raise RuntimeError(f'Unknown language {lang}')

        pending = shuffled[shuffled['repository_language'] == lang]
        nb_known = len(known[known['repository_language'] == lang])
        nb_pending = len(pending)
        # Never negative: the quota may already be met or exceeded.
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected

        LOGGER.info(f'{lang}: repositories per language: {max_repositories}, '
                    f'pending: {nb_pending}, known: {nb_known}, '
                    f'selected: {nb_selected}, total: {total}')

        if total < max_repositories:
            LOGGER.warning(f'{lang}, not enough repositories, '
                           f'required: {max_repositories}')

        if nb_selected == 0:
            continue

        # Positional slice: take the first nb_selected shuffled candidates.
        selected_list.append(pending.iloc[:nb_selected])

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    config.backup(File.SELECTED_REPOSITORIES)
    # Best-effort: a failing backup of the prepared file is not an error here.
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent that works on both old and new pandas versions.
    united = pd.concat([known, *selected_list])
    config.save_csv(united, File.SELECTED_REPOSITORIES)
Esempio n. 3
0
def select_only_downloaded_repo(config: Config) -> None:
    """Restrict the selected and prepared repositories to downloaded ones.

    The names of the entries found directly under
    ``config.repositories_dir`` are matched against the
    ``repository_dirname`` column of the prepared repositories. The
    selected repositories are then restricted to those whose
    ``repository_name`` is still present in the filtered prepared table.

    Both CSV files are backed up before the filtered tables are written
    back.
    """
    dirnames = [entry.name for entry in config.repositories_dir.glob('*')]
    all_selected = config.load_csv(File.SELECTED_REPOSITORIES)
    all_prepared = config.load_csv(File.PREPARED_REPOSITORIES)

    LOGGER.info(f'{len(all_selected)} repositories previously selected')

    on_disk = pd.DataFrame(dirnames, columns=['repository_dirname'])

    # Keep prepared rows whose directory name exists on disk.
    is_downloaded = all_prepared['repository_dirname'].isin(
        on_disk['repository_dirname']
    )
    kept_prepared = all_prepared.loc[is_downloaded]

    # Keep selected rows that still have a matching prepared row.
    is_prepared = all_selected['repository_name'].isin(
        kept_prepared['repository_name']
    )
    kept_selected = all_selected.loc[is_prepared]

    LOGGER.info(f'{len(kept_selected)} downloaded repositories selected')

    config.backup(File.SELECTED_REPOSITORIES)
    config.backup(File.PREPARED_REPOSITORIES)
    config.save_csv(kept_selected, File.SELECTED_REPOSITORIES)
    config.save_csv(kept_prepared, File.PREPARED_REPOSITORIES)