def extract() -> None:
    """Extract the selected files into the train, valid and test directories.

    The files split by usage are merged with the state saved by a previous
    run (when that file can be loaded), and every file without a status is
    marked as pending. Batches returned by ``_choose_files_to_extract`` are
    then passed to ``_extract_files`` until an empty batch is returned. The
    status of each processed file is reported back into the merged table,
    which is saved once the loop is over.
    """
    LOGGER.info('Extract selected files')
    LOGGER.info('This operation might take a lot of time...')

    train_path = Config.extracted_files_dir.joinpath('train')
    valid_path = Config.extracted_files_dir.joinpath('valid')
    test_path = Config.extracted_files_dir.joinpath('test')

    # parents=True: do not fail when the base extraction directory itself
    # has not been created yet.
    for directory in (train_path, valid_path, test_path):
        directory.mkdir(parents=True, exist_ok=True)

    source = load_csv(File.FILES_SPLIT_BY_USAGE)
    columns = [
        'extract_to', 'filename', 'language', 'rank', 'repository_filename',
        'repository_language', 'usage', 'status'
    ]
    try:
        files = load_csv(File.EXTRACTED_FILES)
    except IOError:
        # Nothing could be loaded from a previous run: start from scratch.
        files = pd.DataFrame([], columns=columns)

    # The outer merge keeps every source file; files that are not in the
    # previous state end up with a null status and are marked as pending.
    df = pd.merge(source, files, how='outer', on=list(source.columns))
    df.loc[df['status'].isnull(), 'status'] = Status.PENDING.value

    while True:
        selected = _choose_files_to_extract(df)
        LOGGER.info('%s files to extract', len(selected))

        if len(selected) == 0:
            break

        result = _extract_files(selected)

        # Report the outcome of this batch, matching rows on 'extract_to'.
        result_extracted = result[result['status'] == Status.EXTRACTED.value]
        mask = df['extract_to'].isin(result_extracted['extract_to'])
        df.loc[mask, 'status'] = Status.EXTRACTED.value

        result_discarded = result[result['status'] == Status.DISCARDED.value]
        mask = df['extract_to'].isin(result_discarded['extract_to'])
        df.loc[mask, 'status'] = Status.DISCARDED.value

        extracted = df[df['status'] == Status.EXTRACTED.value]
        discarded = df[df['status'] == Status.DISCARDED.value]

        LOGGER.info('Processed %s files: %s extracted, %s discarded',
                    len(result), len(result_extracted), len(result_discarded))

        LOGGER.info('%s total files extracted', len(extracted))
        LOGGER.info('%s total files discarded', len(discarded))

    save_csv(df, File.EXTRACTED_FILES)

    LOGGER.info('The training files are located in %s', train_path)
    LOGGER.info('The validation files are located in %s', valid_path)
    LOGGER.info('The test files are located in %s', test_path)
Ejemplo n.º 2
0
def select_more_repositories(languages: List[str]) -> None:
    """Add repositories to the selection for the given languages.

    Repositories of the altered dataset that are not already selected are
    shuffled, then for each requested language enough of them are picked to
    reach ``Config.nb_repositories_per_language`` (or as many as are
    available). The previous selection file is backed up, as is the prepared
    repositories file when possible, before the extended selection is
    written.

    Args:
        languages: Languages to complete; each one must be listed in
            ``Config.languages``.

    Raises:
        RuntimeError: If a language is unknown, or if no new repository
            could be selected for any of the languages.
    """
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take few minutes...')

    output_path = absolute(File.SELECTED_REPOSITORIES)

    input_data = load_csv(File.ALTERED_DATASET)
    known = load_csv(File.SELECTED_REPOSITORIES)

    # Only consider repositories that are not part of the selection yet.
    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)

    max_repositories = Config.nb_repositories_per_language

    selected_list = []
    for language in languages:
        if language not in Config.languages:
            LOGGER.error('Unknown language %s', language)
            raise RuntimeError(f'Unknown language {language}')

        pending = shuffled[shuffled['repository_language'] == language]
        nb_known = len(known[known['repository_language'] == language])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected

        LOGGER.info(
            '%s: repositories per language: %s, pending: %s, known: %s, '
            'selected: %s, total: %s', language, max_repositories, nb_pending,
            nb_known, nb_selected, total)

        if total < max_repositories:
            LOGGER.warning('%s, not enough repositories, required: %s',
                           language, max_repositories)

        if nb_selected == 0:
            continue

        selected = pending[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        backup(File.PREPARED_REPOSITORIES)

    # DataFrame.append was removed in pandas 2.0; concat gives the same rows
    # (known repositories first, then the new ones) on every pandas version.
    united = pd.concat([known, *selected_list])
    united.to_csv(output_path, index=False)
Ejemplo n.º 3
0
def download() -> None:
    """Download the prepared repositories and record the usable ones.

    Each row of the prepared repositories file goes through
    ``_download_repository`` via ``pool_imap``. The returned rows get a
    ``repository_size`` column initialised to 0 before ``_check_size`` is
    applied to every row. Rows whose size is still 0, or whose
    ``repository_filename`` is empty, are dropped. The language and filename
    of the remaining rows are written to the downloaded repositories file.
    """
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')

    prepared = load_csv(File.PREPARED_REPOSITORIES)

    # iterrows() yields (index, row) pairs; only the row is needed.
    pending = (row for _, row in prepared.iterrows())
    downloaded = []
    for count, row in enumerate(pool_imap(_download_repository, pending), 1):
        downloaded.append(row)
        if not count % Config.step:
            LOGGER.info('--> Processed %s repositories...', count)

    # Each returned row becomes a one-line frame before they are stacked.
    data = pd.concat([pd.DataFrame(row).T for row in downloaded])

    data.loc[:, 'repository_size'] = 0
    data = data.apply(_check_size, axis=1)

    has_size = data['repository_size'] != 0
    has_filename = data['repository_filename'] != ''
    data = data[has_size & has_filename]

    kept = data[['repository_language', 'repository_filename']]
    kept.to_csv(absolute(File.DOWNLOADED_REPOSITORIES), index=False)
def split() -> None:
    """Split the repositories of each language into test, valid and train.

    The split is made on repositories rather than on files: the distinct
    (language, repository) pairs of the available files are shuffled, then
    for each language of ``Config.languages`` the first repositories go to
    ``test``, the next ones to ``valid`` and the rest to ``train``. The
    share of test and valid repositories follows the configured number of
    files per usage, with ``MIN_SPLIT_RATIO`` as a lower bound and at least
    one repository each. The usage is finally merged back onto the files and
    saved.

    Raises:
        RuntimeError: If a language has fewer than 3 repositories, or if one
            of the test/valid/train groups of a language is empty.
    """
    LOGGER.info('Split repositories by usage: train, valid & test')
    LOGGER.info('This operation should take few seconds...')

    files = load_csv(File.AVAILABLE_FILES)
    files = files.drop('dedup_key', axis=1)
    columns = ['repository_language', 'repository_filename']

    repo = files[columns].drop_duplicates()
    repo = repo.sample(frac=1).reset_index(drop=True)

    LOGGER.info('Total downloaded repositories: %s', len(repo))

    total_files = (Config.nb_train_files_per_language +
                   Config.nb_valid_files_per_language +
                   Config.nb_test_files_per_language)
    valid_ratio = Config.nb_valid_files_per_language / total_files
    valid_ratio = max(valid_ratio, MIN_SPLIT_RATIO)

    test_ratio = Config.nb_test_files_per_language / total_files
    test_ratio = max(test_ratio, MIN_SPLIT_RATIO)

    repositories = {}
    for language in Config.languages:
        by_language = repo[repo['repository_language'] == language]
        total = len(by_language)
        if total < 3:
            raise RuntimeError(
                f'Need at least 3 repositories for language {language}')

        nb_test = max(int(total * test_ratio), 1)
        nb_valid = max(int(total * valid_ratio), 1)
        nb_test_valid = nb_test + nb_valid

        # assign() builds a new frame with the 'usage' column set, instead
        # of writing through `.values` of a slice, which relies on the slice
        # being a writable view and fails under pandas Copy-on-Write.
        test = by_language[:nb_test].assign(usage='test')
        repositories[f'{language}/test'] = test

        valid = by_language[nb_test:nb_test_valid].assign(usage='valid')
        repositories[f'{language}/valid'] = valid

        train = by_language[nb_test_valid:].assign(usage='train')
        repositories[f'{language}/train'] = train

        LOGGER.info('%s nb repositories, train: %s, valid: %s, test: %s',
                    language, total - nb_test_valid, nb_valid, nb_test)

    for name, repository in repositories.items():
        if not len(repository):
            LOGGER.error('No repositories available for %s', name)
            raise RuntimeError(f'No repositories for category: {name}')

    repo = pd.concat(list(repositories.values()))
    files = pd.merge(files, repo, on=columns)
    save_csv(files, File.FILES_SPLIT_BY_USAGE)
Ejemplo n.º 5
0
def show_repositories_distribution() -> None:
    """Print how many selected repositories there are for each language."""
    LOGGER.info('Loading repositories info')
    LOGGER.info('This operation should take few seconds...')

    repositories = load_csv(File.SELECTED_REPOSITORIES)
    per_language = repositories['repository_language'].value_counts()

    # No row limit, so that the printed counts are not truncated.
    pd.set_option('display.max_rows', None)
    print(per_language)
Ejemplo n.º 6
0
def select_only_downloaded_repo() -> None:
    """Restrict the selection to repositories found in the download folder.

    The prepared repositories are filtered on the entry names found in
    ``Config.repositories_dir``, then the selected repositories are filtered
    on the names of the prepared repositories that remain. Both files are
    backed up before being overwritten.
    """
    selected = load_csv(File.SELECTED_REPOSITORIES)
    prepared = load_csv(File.PREPARED_REPOSITORIES)

    LOGGER.info('%s repositories previously selected', len(selected))

    # Names of all the entries of the repositories directory.
    downloaded_names = [
        entry.name for entry in Config.repositories_dir.glob('*')
    ]

    is_downloaded = prepared['repository_filename'].isin(downloaded_names)
    prepared = prepared[is_downloaded]

    is_prepared = selected['repository_name'].isin(prepared['repository_name'])
    selected = selected[is_prepared]

    LOGGER.info('%s downloaded repositories selected', len(selected))

    backup(File.SELECTED_REPOSITORIES)
    backup(File.PREPARED_REPOSITORIES)
    save_csv(selected, File.SELECTED_REPOSITORIES)
    save_csv(prepared, File.PREPARED_REPOSITORIES)
def list_all() -> None:
    """Refresh the list of available source files.

    Files already listed are kept only when their repository is still in
    the downloaded repositories file. Repositories that are not listed yet
    are passed to ``_list_files_by_language``. The combined list is
    deduplicated on ``dedup_key``, sorted by ``rank`` and saved.
    """
    LOGGER.info('List source files from repositories')
    LOGGER.info('This operation might take few minutes...')

    downloaded = load_csv(File.DOWNLOADED_REPOSITORIES)
    try:
        known_files = load_csv(File.AVAILABLE_FILES)
    except IOError:
        # No usable list from a previous run: start with an empty one.
        known_files = pd.DataFrame([], columns=[
            'extract_to',
            'filename',
            'language',
            'rank',
            'repository_filename',
            'dedup_key',
        ])

    # Repositories that have no listed file yet.
    already_listed = downloaded['repository_filename'].isin(
        known_files['repository_filename'])
    new_repo = downloaded[~already_listed]
    LOGGER.info('%s newly downloaded repositories', len(new_repo))

    # Forget the files of repositories that are no longer downloaded.
    nb_repo_before = len(known_files['repository_filename'].unique())
    still_downloaded = known_files['repository_filename'].isin(
        downloaded['repository_filename'])
    known_files = known_files[still_downloaded]
    nb_repo_after = len(known_files['repository_filename'].unique())
    LOGGER.info('%s deleted repositories', nb_repo_before - nb_repo_after)

    new_files = _list_files_by_language(new_repo)
    df = pd.concat([known_files, new_files], axis=0, sort=False)
    df = df.drop_duplicates(subset='dedup_key').sort_values(by='rank')

    LOGGER.info('Files available by language:')
    for language in Config.languages:
        nb_files = int((df['language'] == language).sum())
        LOGGER.info('--> %s: %s', language, nb_files)

    save_csv(df, File.AVAILABLE_FILES)
Ejemplo n.º 8
0
def prepare() -> None:
    """Write the prepared repositories file from the selected repositories.

    The ``repository_filename`` and ``repository_url`` columns are set to
    empty strings, then ``_add_download_info`` is applied to every row and
    the result is written to the prepared repositories file.
    """
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take few seconds...')

    repositories = load_csv(File.SELECTED_REPOSITORIES)
    for column in ('repository_filename', 'repository_url'):
        repositories.loc[:, column] = ''

    prepared = repositories.apply(_add_download_info, axis=1)
    prepared.to_csv(absolute(File.PREPARED_REPOSITORIES), index=False)
Ejemplo n.º 9
0
def alter() -> None:
    """Adjust the language of some repositories of the shrunk dataset.

    Repositories without a language are tagged as Markdown, and the
    repositories listed in the file at ``SQL_DATASET_PATH`` are tagged as
    SQL. The result is written to the altered dataset file.
    """
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take few minutes...')

    destination = absolute(File.ALTERED_DATASET)
    dataset = load_csv(File.SHRUNK_DATASET)

    # Repositories with no language are considered Markdown repositories,
    # because most Github repositories have a Readme.md file.
    without_language = dataset['repository_language'].isna()
    dataset.loc[without_language, 'repository_language'] = 'Markdown'

    # Too few repositories are tagged as SQL repositories. To mitigate this,
    # a list of known repositories is flagged as SQL repositories.
    known_sql = pd.read_csv(SQL_DATASET_PATH)
    is_known_sql = dataset['repository_name'].isin(
        known_sql['repository_name'])
    dataset.loc[is_known_sql, 'repository_language'] = 'SQL'

    dataset.to_csv(destination, index=False)
Ejemplo n.º 10
0
def select() -> None:
    """Pick repositories for each configured language.

    The altered dataset is shuffled, then for every language of
    ``Config.languages`` up to ``Config.nb_repositories_per_language``
    repositories are kept. The kept repositories are written to the
    selected repositories file.

    Raises:
        RuntimeError: If no repository was kept for any language.
    """
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take few minutes...')

    dataset = load_csv(File.ALTERED_DATASET)
    shuffled = dataset.sample(frac=1).reset_index(drop=True)

    quota = Config.nb_repositories_per_language

    kept_by_language = []
    for language in Config.languages:
        candidates = shuffled[shuffled['repository_language'] == language]
        nb_found = len(candidates)
        nb_kept = min(nb_found, quota)

        LOGGER.info(
            '%s repositories, found: %s, kept: %s',
            language, nb_found, nb_kept)

        if nb_kept < quota:
            LOGGER.warning(
                '%s, not enough repositories, required: %s',
                language, quota)

        # Nothing to keep for this language: move on to the next one.
        if nb_kept:
            kept_by_language.append(candidates.iloc[:nb_kept])

    if not kept_by_language:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    destination = absolute(File.SELECTED_REPOSITORIES)
    pd.concat(kept_by_language).to_csv(destination, index=False)