Example No. 1
def extract() -> None:
    LOGGER.info('Extracting repositories list file')
    LOGGER.info('This operation might take a few minutes...')

    compressed_filename = absolute(File.COMPRESSED_DATASET)
    with tarfile.open(compressed_filename) as tar:
        tar.extract(DATASET_FILENAME, path=absolute('.'))

    extracted_file = absolute(DATASET_FILENAME)
    extracted_file.rename(absolute(File.DATASET))
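
The excerpts on this page lean on a handful of shared helpers that are not shown: a File constants class, an absolute() path resolver, a load_csv() loader and a module-level LOGGER (module imports such as tarfile, csv and pandas are likewise omitted). Their real definitions may differ; a minimal sketch under those assumptions:

# Hypothetical sketch of the shared helpers; the data directory and the
# individual filenames are assumptions, not the project's real values.
import logging
from pathlib import Path

import pandas as pd

LOGGER = logging.getLogger(__name__)

DATA_DIR = Path(__file__).parent / 'data'  # assumed data directory


class File:
    """Assumed filenames for the pipeline's intermediate artifacts."""
    COMPRESSED_DATASET = 'repositories.tar.xz'
    DATASET = 'repositories.csv'
    SHRUNK_DATASET = 'repositories_shrunk.csv'
    ALTERED_DATASET = 'repositories_altered.csv'
    SELECTED_REPOSITORIES = 'repositories_selected.csv'
    PREPARED_REPOSITORIES = 'repositories_prepared.csv'
    DOWNLOADED_REPOSITORIES = 'repositories_downloaded.csv'
    EXTRACTED_FILES = 'extracted_files.csv'


def absolute(*parts: str) -> Path:
    """Resolve a path inside the data directory."""
    return DATA_DIR.joinpath(*parts).resolve()


def load_csv(filename: str) -> pd.DataFrame:
    """Load one of the pipeline CSV files into a DataFrame."""
    return pd.read_csv(absolute(filename))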
Example No. 2
def download() -> None:
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')

    input_data = load_csv(File.PREPARED_REPOSITORIES)

    rows = (row for _, row in input_data.iterrows())
    result_rows = []
    for step, row in enumerate(pool_imap(_download_repository, rows), 1):
        result_rows.append(row)
        if step % Config.step == 0:
            LOGGER.info('--> Processed %s repositories...', step)

    dataframes = [pd.DataFrame(row).T for row in result_rows]
    data = pd.concat(dataframes)

    data.loc[:, 'repository_size'] = 0
    data = data.apply(_check_size, axis=1)
    data = data[data['repository_size'] != 0]
    data = data[data['repository_filename'] != '']

    fieldnames = ['repository_language', 'repository_filename']
    output_data = data[fieldnames]
    output_path = absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
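
download() relies on three helpers that are not part of this excerpt: pool_imap, _download_repository and _check_size. The sketches below are guesses inferred only from how download() uses them; the pool size, error handling and filename handling are all assumptions.

# Hypothetical sketches; absolute() and download_file() refer to the
# helper sketches shown elsewhere on this page.
from multiprocessing import Pool
from typing import Callable, Iterable, Iterator

import pandas as pd


def pool_imap(function: Callable, iterable: Iterable) -> Iterator:
    """Map function over iterable in a process pool, yielding results as
    soon as each worker finishes (completion order, not input order)."""
    with Pool() as pool:
        yield from pool.imap_unordered(function, iterable)


def _download_repository(row: pd.Series) -> pd.Series:
    """Fetch one repository archive; on failure leave the filename empty."""
    try:
        destination = absolute(row['repository_filename'])
        download_file(row['repository_url'], destination)
    except OSError:
        row['repository_filename'] = ''
    return row


def _check_size(row: pd.Series) -> pd.Series:
    """Record the size of the downloaded archive, leaving 0 if missing."""
    filename = row['repository_filename']
    if filename:
        path = absolute(filename)
        if path.exists():
            row['repository_size'] = path.stat().st_size
    return row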
Example No. 3
def select_more_repositories(languages: List[str]) -> None:
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take a few minutes...')

    output_path = absolute(File.SELECTED_REPOSITORIES)

    input_data = load_csv(File.ALTERED_DATASET)
    known = load_csv(File.SELECTED_REPOSITORIES)

    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)

    max_repositories = Config.nb_repositories_per_language

    selected_list = []
    for language in languages:
        if language not in Config.languages:
            LOGGER.error('Unknown language %s', language)
            raise RuntimeError(f'Unknown language {language}')

        pending = shuffled[shuffled['repository_language'] == language]
        nb_known = len(known[known['repository_language'] == language])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected

        LOGGER.info(
            '%s: repositories per language: %s, pending: %s, known: %s, '
            'selected: %s, total: %s', language, max_repositories, nb_pending,
            nb_known, nb_selected, total)

        if total < max_repositories:
            LOGGER.warning('%s, not enough repositories, required: %s',
                           language, max_repositories)

        if nb_selected == 0:
            continue

        selected = pending[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        backup(File.PREPARED_REPOSITORIES)

    new_repositories = pd.concat(selected_list)
    # DataFrame.append was removed in pandas 2.0; concat the frames instead.
    united = pd.concat([known, new_repositories])
    united.to_csv(output_path, index=False)
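
backup() is also not shown. The suppress(IOError) guard above suggests it raises when the file to back up does not exist; a sketch under that assumption (the '.bak' naming is hypothetical):

# Hypothetical sketch of backup(); relies on the absolute() sketch above.
import shutil


def backup(filename: str) -> None:
    """Copy an existing pipeline file to a '.bak' sibling before it is
    overwritten; raise FileNotFoundError (an IOError) when it is absent."""
    source = absolute(filename)
    if not source.exists():
        raise FileNotFoundError(source)
    shutil.copy(source, source.parent / (source.name + '.bak'))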
Example No. 4
def prepare() -> None:
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take a few seconds...')

    input_data = load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_filename'] = ''
    input_data.loc[:, 'repository_url'] = ''

    output_data = input_data.apply(_add_download_info, axis=1)
    output_path = absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
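
_add_download_info() is not included in this listing. It presumably turns each repository name into a download URL and a local archive filename; the GitHub archive URL scheme and the filename pattern below are assumptions:

# Hypothetical sketch of _add_download_info().
import pandas as pd


def _add_download_info(row: pd.Series) -> pd.Series:
    name = row['repository_name']  # e.g. 'owner/project'
    row['repository_url'] = f'https://github.com/{name}/archive/HEAD.tar.gz'
    row['repository_filename'] = name.replace('/', '_') + '.tar.gz'
    return row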
Example No. 5
def check_files() -> None:
    path = absolute(File.EXTRACTED_FILES)
    assert path.exists()

    languages = Config.languages
    files = {lang: 0 for lang in languages}

    with path.open() as csv_file:
        for item in DictReader(csv_file):
            if item['status'] != Status.EXTRACTED.value:
                continue

            language = item['language']
            path_elements = ('files', item['usage'], item['extract_to'])
            extracted_path = absolute(*path_elements)
            ext = extracted_path.suffix.lstrip('.')

            assert extracted_path.exists()
            assert ext in languages[language]

            files[language] += 1

    assert all(count == 30 for count in files.values())
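
check_files() additionally assumes a Status enum, a DictReader import from the csv module, and that Config.languages maps each language name to its accepted file extensions. A minimal sketch of the enum (the member values other than EXTRACTED are guesses):

# Hypothetical sketch of the Status enum used by check_files().
from enum import Enum


class Status(Enum):
    PENDING = 'pending'
    EXTRACTED = 'extracted'
    DISCARDED = 'discarded'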
Example No. 6
def shrink() -> None:
    LOGGER.info('Shrink repositories list file')
    LOGGER.info('This operation might take a few minutes...')

    input_path = absolute(File.DATASET)
    output_path = absolute(File.SHRUNK_DATASET)

    # The input dataset is too large to be loaded into memory all at once
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with input_path.open() as input_file, output_path.open('w') as output_file:
        reader = csv.DictReader(input_file)
        fieldnames = ['repository_name', 'repository_language']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        for item in reader:
            if _ignore(item):
                continue

            smaller_item = {
                'repository_name': item['Name with Owner'],
                'repository_language': item['Language'],
            }
            writer.writerow(smaller_item)
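
_ignore() and CSV_FIELD_LIMIT are defined outside this excerpt. Judging by the columns shrink() reads, _ignore() drops rows that are useless downstream; the exact rules below are assumptions:

# Hypothetical sketch of the filtering helper; the 'Fork' column and the
# skip-forks rule are assumptions about the raw dataset.
CSV_FIELD_LIMIT = 10 * 1024 * 1024  # assumed 10 MB per-field limit


def _ignore(item: dict) -> bool:
    """Return True for rows that should not reach the shrunk dataset."""
    if not item.get('Name with Owner'):
        return True  # a row without a repository name is unusable
    if item.get('Fork') == 'true':
        return True  # assumed: forks are skipped to avoid duplicates
    return False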
Example No. 7
def alter() -> None:
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take a few minutes...')

    output_path = absolute(File.ALTERED_DATASET)

    df = load_csv(File.SHRUNK_DATASET)

    # Tag repositories with no language as Markdown repositories,
    # since most GitHub repositories carry at least a README.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'

    # Too few repositories are tagged as SQL repositories. To mitigate
    # this, a curated list of known repositories is flagged as SQL.
    sql_df = pd.read_csv(SQL_DATASET_PATH)
    mask = df['repository_name'].isin(sql_df['repository_name'])
    df.loc[mask, 'repository_language'] = 'SQL'

    df.to_csv(output_path, index=False)
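
SQL_DATASET_PATH is likewise defined elsewhere; it presumably points at a small hand-maintained CSV whose only required column is repository_name, matching the naming of the main dataset. A hypothetical definition:

# Hypothetical; the real path and file contents are assumptions.
SQL_DATASET_PATH = absolute('sql_repositories.csv')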
Example No. 8
def select() -> None:
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take a few minutes...')

    input_data = load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)

    max_repositories = Config.nb_repositories_per_language

    selected_list = []
    for language in Config.languages:
        filtered = shuffled[shuffled['repository_language'] == language]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)

        LOGGER.info(
            '%s repositories, found: %s, kept: %s',
            language, nb_found, nb_selected)

        if nb_selected < max_repositories:
            LOGGER.warning(
                '%s, not enough repositories, required: %s',
                language, max_repositories)

        if nb_selected == 0:
            continue

        selected = filtered[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    output_path = absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)
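
Several excerpts read a shared Config object. Its real definition is not shown anywhere on this page; the fields they use suggest a shape like the following, with placeholder values:

# Hypothetical sketch of Config, reconstructed from the fields the
# excerpts read; every value here is a placeholder.
class Config:
    # Language name -> accepted file extensions (see check_files()).
    languages = {
        'Markdown': ['md'],
        'Python': ['py'],
        'SQL': ['sql'],
        # ... the real project likely supports many more languages
    }
    nb_repositories_per_language = 100  # placeholder
    step = 100  # progress-log interval used by download()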
Example No. 9
def download() -> None:
    LOGGER.info('Retrieving repositories dataset (8GB)')
    LOGGER.info('This operation might take a lot of time...')

    destination = absolute(File.COMPRESSED_DATASET)
    download_file(DATASET_URL, destination)
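
download_file() is assumed to stream the archive to disk rather than load all 8GB into memory; a minimal standard-library sketch (the real helper may use another HTTP client, retries or progress reporting):

# Hypothetical sketch of download_file().
import shutil
import urllib.request
from pathlib import Path


def download_file(url: str, destination: Path) -> None:
    """Stream the response body straight to the destination file."""
    with urllib.request.urlopen(url) as response:
        with destination.open('wb') as output_file:
            shutil.copyfileobj(response, output_file)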