Example no. 1
def alter(config: Config) -> None:
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take several minutes...')

    output_path = config.absolute(File.ALTERED_DATASET)

    df = config.load_csv(File.SHRUNK_DATASET)

    # Treat repositories with no detected language as Markdown repositories,
    # because most GitHub repositories at least contain a README.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'

    # Handle language aliases
    for alias, languages in config.alias_mapping.items():
        lang = languages[0]
        mask = df['repository_language'] == alias
        df.loc[mask, 'repository_language'] = lang

    # There are too few repositories for some languages.
    # To mitigate this problem, a list of known repositories
    # is added to the dataset.
    other_df = pd.read_csv(OTHER_REPO_DATASET_PATH)
    df = pd.concat([other_df, df]).drop_duplicates('repository_name')
    df.to_csv(output_path, index=False)
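
A hypothetical illustration of the alias-normalization loop above, assuming config.alias_mapping maps each alias to a list whose first element is the canonical language name (the mapping values below are made up for the example):

import pandas as pd

alias_mapping = {'Golang': ['Go'], 'Bash': ['Shell']}  # hypothetical aliases
df = pd.DataFrame({
    'repository_name': ['a/x', 'b/y', 'c/z'],
    'repository_language': ['Golang', 'Python', 'Bash'],
})

for alias, languages in alias_mapping.items():
    mask = df['repository_language'] == alias
    df.loc[mask, 'repository_language'] = languages[0]

print(df['repository_language'].tolist())  # ['Go', 'Python', 'Shell']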
Example no. 2
def download(config: Config) -> None:
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')

    input_data = config.load_csv(File.PREPARED_REPOSITORIES)

    input_data.loc[:, 'repository_is_empty'] = True
    rows = (dict(row) for _, row in input_data.iterrows())
    result_rows = []
    total = len(input_data)
    for step, row in enumerate(pool_map(_clone_repository, rows, config), 1):
        result_rows.append(row)
        if step % LOG_STEP == 0:
            LOGGER.info(f'--> Processed {step} / {total} repositories...')
    LOGGER.info(f'--> Processed {total} / {total} repositories!')

    data = pd.DataFrame(result_rows)

    LOGGER.info('Removing empty repositories')
    data = data[~data['repository_is_empty']]
    LOGGER.info(f'Kept {len(data)} non-empty repositories')

    fieldnames = ['repository_language', 'repository_dirname']
    output_data = data[fieldnames]
    output_path = config.absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
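
The _clone_repository worker is not part of this excerpt. A minimal sketch of what it might do, assuming pool_map hands each row dictionary to the worker together with the config, that each row carries the repository_url and repository_dirname fields set during the prepare step, and that config.repositories_dir is the clone destination:

import subprocess

def _clone_repository(row, config):
    # Shallow clone to keep the download small; a failed clone simply leaves
    # the repository directory missing or empty.
    destination = config.repositories_dir / row['repository_dirname']
    if not destination.exists():
        subprocess.run(
            ['git', 'clone', '--depth', '1', row['repository_url'], str(destination)],
            capture_output=True,
        )
    # Flag repositories whose clone produced no files at all.
    row['repository_is_empty'] = not any(destination.glob('*'))
    return row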
Example no. 3
def select(config: Config) -> None:
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)

    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in config.languages:
        filtered = shuffled[shuffled['repository_language'] == lang]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)

        LOGGER.info(
            f'{lang} repositories, found: {nb_found}, kept: {nb_selected}')

        if nb_selected < max_repositories:
            LOGGER.warning(f'{lang}, not enough repositories, '
                           f'required: {max_repositories}')

        if nb_selected == 0:
            continue

        selected = filtered[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    output_path = config.absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)
Example no. 4
def extract(config: Config) -> None:
    LOGGER.info('Extracting repositories list file')
    LOGGER.info('This operation might take several minutes...')

    compressed_filename = config.absolute(File.COMPRESSED_DATASET)
    with tarfile.open(compressed_filename) as tar:
        tar.extract(DATASET_FILENAME, path=config.absolute('.'))

    extracted_file = config.absolute(DATASET_FILENAME)
    extracted_file.rename(config.absolute(File.DATASET))
Example no. 5
def prepare(config: Config) -> None:
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take a few seconds...')

    input_data = config.load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_dirname'] = ''
    input_data.loc[:, 'repository_url'] = ''

    output_data = input_data.apply(_add_download_info, axis=1)
    output_path = config.absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
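
_add_download_info is applied row by row but not shown here. A plausible sketch, assuming repository_name is a GitHub "owner/name" slug (as kept by the shrink step) and that the directory name only needs to be filesystem-safe and unique:

def _add_download_info(row):
    owner, _, name = row['repository_name'].partition('/')
    # Hypothetical naming scheme: flatten "owner/name" into one directory name.
    row['repository_dirname'] = f'{owner}__{name}'
    row['repository_url'] = f'https://github.com/{row["repository_name"]}.git'
    return row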
Example no. 6
def deduplicate(config: Config) -> None:
    df = config.load_csv(File.AVAILABLE_FILES)
    df.drop_duplicates(subset='dedup_key', inplace=True)
    df.sort_values(by='rank', inplace=True)

    LOGGER.info('Files available by language:')
    for lang in config.languages:
        nb_files = len(df[df['language'] == lang])
        LOGGER.info(f'--> {lang}: {nb_files}')

    config.save_csv(df, File.DEDUPLICATED_FILES)
Example no. 7
def setup_function(_):
    tempdir = mkdtemp(suffix='_gesslangtools_unittest')
    print(f'Temporary config directory: {tempdir}')
    Config.setup(
        cache_dir=tempdir,
        nb_repositories=10,
        nb_train=10,
        nb_valid=10,
        nb_test=10,
    )

    assert Config.cache_dir == tempdir
Example no. 8
def list_all(config: Config) -> None:
    LOGGER.info('List source files from repositories')
    LOGGER.info('This operation might take several minutes...')

    # Start or resume files listing
    repo = config.load_csv(File.DOWNLOADED_REPOSITORIES)
    try:
        files = config.load_csv(File.AVAILABLE_FILES)
    except IOError:
        files = pd.DataFrame([], columns=AVAILABLE_FILES_COLUMNS)

    # Find repositories that have not been processed yet
    mask = ~repo['repository_dirname'].isin(files['repository_dirname'])
    new_repo = repo[mask]
    LOGGER.info(f'{len(new_repo)} newly downloaded repositories')

    # Show the number of deleted repositories
    nb_repo_before = len(files['repository_dirname'].unique())
    mask = files['repository_dirname'].isin(repo['repository_dirname'])
    files = files[mask]
    nb_repo_after = len(files['repository_dirname'].unique())
    nb_removed = nb_repo_before - nb_repo_after
    LOGGER.info(f'{nb_removed} deleted repositories')

    # List unprocessed repositories files
    total = len(new_repo)
    rows = (dict(repo) for _, repo in new_repo.iterrows())

    output_path = config.absolute(File.AVAILABLE_FILES)
    write_headers = not output_path.exists()
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with output_path.open('a') as output:
        writer = csv.DictWriter(output, fieldnames=AVAILABLE_FILES_COLUMNS)
        if write_headers:
            writer.writeheader()

        for index, result in enumerate(pool_map(_list_files, rows, config), 1):
            for item in result:
                writer.writerow(item)

            if index % LOG_STEP == 0:
                LOGGER.info(f'--> Processed {index} / {total} repositories...')
        LOGGER.info(f'--> Processed {total} / {total} repositories!')

    LOGGER.info(f'Created file: {output_path}')
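
The _list_files worker and the exact AVAILABLE_FILES_COLUMNS are outside this excerpt. A rough sketch of the idea, assuming the worker walks a cloned repository and emits one row per file; the column names below are purely illustrative:

from hashlib import sha1

def _list_files(repo, config):
    directory = config.repositories_dir / repo['repository_dirname']
    source_files = (
        path for path in directory.rglob('*')
        if path.is_file() and '.git' not in path.parts
    )
    rows = []
    for rank, path in enumerate(source_files):
        rows.append({
            'repository_dirname': repo['repository_dirname'],
            'language': repo['repository_language'],
            'filename': str(path.relative_to(directory)),
            'rank': rank,
            # A content hash is one plausible way to fill the dedup_key
            # column consumed later by the deduplication step.
            'dedup_key': sha1(path.read_bytes()).hexdigest(),
        })
    return rows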
Example no. 9
def show_repositories_distribution(config: Config) -> None:
    LOGGER.info('Loading repositories info')
    LOGGER.info('This operation should take a few seconds...')

    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    count = selected.repository_language.value_counts()

    pd.set_option('display.max_rows', None)
    print(count)
Example no. 10
def merge_to_selected_repositories(config: Config, filename: str) -> None:
    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    listed = config.load_csv(filename)

    selected = pd.concat([listed, selected])
    selected = selected.drop_duplicates('repository_name')

    config.backup(File.SELECTED_REPOSITORIES)
    config.save_csv(selected, File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)
Example no. 11
def select_only_downloaded_repo(config: Config) -> None:
    downloaded_repo = (path.name for path in config.repositories_dir.glob('*'))
    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    prepared = config.load_csv(File.PREPARED_REPOSITORIES)

    LOGGER.info(f'{len(selected)} repositories previously selected')

    repo = pd.DataFrame(downloaded_repo, columns=['repository_dirname'])
    mask = prepared['repository_dirname'].isin(repo['repository_dirname'])
    prepared = prepared[mask]
    mask = selected['repository_name'].isin(prepared['repository_name'])
    selected = selected[mask]

    LOGGER.info(f'{len(selected)} downloaded repositories selected')

    config.backup(File.SELECTED_REPOSITORIES)
    config.backup(File.PREPARED_REPOSITORIES)
    config.save_csv(selected, File.SELECTED_REPOSITORIES)
    config.save_csv(prepared, File.PREPARED_REPOSITORIES)
Example no. 12
def select_more_repositories(config: Config, languages: List[str]) -> None:
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    known = config.load_csv(File.SELECTED_REPOSITORIES)

    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)

    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in languages:
        if lang not in config.languages:
            LOGGER.error(f'Unknown language {lang}')
            raise RuntimeError(f'Unknown language {lang}')

        pending = shuffled[shuffled['repository_language'] == lang]
        nb_known = len(known[known['repository_language'] == lang])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected

        LOGGER.info(f'{lang}: repositories per language: {max_repositories}, '
                    f'pending: {nb_pending}, known: {nb_known}, '
                    f'selected: {nb_selected}, total: {total}')

        if total < max_repositories:
            LOGGER.warning(f'{lang}, not enough repositories, '
                           f'required: {max_repositories}')

        if nb_selected == 0:
            continue

        selected = pending[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    config.backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)

    new_repositories = pd.concat(selected_list)
    united = pd.concat([known, new_repositories])
    config.save_csv(united, File.SELECTED_REPOSITORIES)
Example no. 13
def finalize(config: Config) -> None:
    items = config.extensions.items()
    lang_ext = OrderedDict(sorted(items, key=_lang_name))
    language_filename = config.absolute('languages.json')
    with language_filename.open('w') as output:
        json.dump(lang_ext, output, indent=2)

    LOGGER.info('Dataset successfully generated')
    LOGGER.info('To train Guesslang with this dataset:')
    LOGGER.info(f'* copy {language_filename} into guesslang/data/ directory')
    LOGGER.info(
        f'* run $ guesslang --train {config.cache_path} /path/to/new_model'
    )
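
_lang_name is only used as a sort key here. A one-line sketch, assuming config.extensions maps language names to their file extensions:

def _lang_name(item):
    # item is a (language, extensions) pair; sort case-insensitively by name.
    return item[0].lower()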
Example no. 14
def config():
    tempdir = mkdtemp(prefix='guesslangtools_unittest_')
    print(f'Temporary config directory: {tempdir}')
    config = Config(
        cache_dir=tempdir,
        nb_repositories=REPO_PER_LANG,
        nb_train=FILES_PER_LANG_PER_DATASET,
        nb_valid=FILES_PER_LANG_PER_DATASET,
        nb_test=FILES_PER_LANG_PER_DATASET,
    )
    assert config.cache_path == Path(tempdir).absolute()
    try:
        yield config
    finally:
        rmtree(config.cache_path)
Example no. 15
def shrink(config: Config) -> None:
    LOGGER.info('Shrink repositories list file')
    LOGGER.info('This operation might take several minutes...')

    input_path = config.absolute(File.DATASET)
    output_path = config.absolute(File.SHRUNK_DATASET)

    # The input dataset is too huge to be fully loaded into memory
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with input_path.open() as input_file, output_path.open('w') as output_file:
        reader = csv.DictReader(input_file)
        fieldnames = ['repository_name', 'repository_language']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        for item in reader:
            if _ignore(item):
                continue

            smaller_item = {
                'repository_name': item['Name with Owner'],
                'repository_language': item['Language'],
            }
            writer.writerow(smaller_item)
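
_ignore filters rows out before they are written; its exact rules are not shown. A minimal hedged placeholder that only drops rows lacking a usable "owner/name" slug (the real project may apply stricter criteria):

def _ignore(item):
    # Placeholder rule: skip rows without an "owner/name" repository slug.
    name = item.get('Name with Owner', '') or ''
    return '/' not in name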
Example no. 16
def main() -> None:
    parser = ArgumentParser(description='Guesslang data preparation tool')
    parser.add_argument('-d',
                        '--debug',
                        action='store_true',
                        help='display debug messages')

    parser.add_argument(
        'CACHE_DIR',
        help='directory where the generated content will be stored')
    parser.add_argument('--nb-train-files',
                        type=int,
                        default=27000,
                        help='number of training files per language')
    parser.add_argument('--nb-valid-files',
                        type=int,
                        default=4000,
                        help='number of validation files per language')
    parser.add_argument('--nb-test-files',
                        type=int,
                        default=4000,
                        help='number of testing files per language')
    parser.add_argument('--nb-repo',
                        type=int,
                        default=4000,
                        help='number of repositories per language')

    parser.add_argument(
        '--hack-repo-dist',
        action='store_true',
        default=False,
        help='show the number of selected repositories per language')
    parser.add_argument(
        '--hack-add-repo',
        nargs='+',
        metavar='LANGUAGE',
        help='select more repositories for the listed languages')
    parser.add_argument(
        '--hack-only-downloaded-repo',
        action='store_true',
        default=False,
        help='only use the repositories that have already been downloaded')

    args = parser.parse_args()
    items = vars(args).items()
    hack_args = any(val for name, val in items if name.startswith('hack_'))

    log_level = 'DEBUG' if args.debug else 'INFO'
    LOGGING_CONFIG['root']['level'] = log_level
    logging.config.dictConfig(LOGGING_CONFIG)

    Config.setup(
        cache_dir=args.CACHE_DIR,
        nb_repositories=args.nb_repo,
        nb_train=args.nb_train_files,
        nb_valid=args.nb_valid_files,
        nb_test=args.nb_test_files,
    )

    with suppress(KeyboardInterrupt):
        if hack_args:
            run_hacks(args)
        else:
            run_workflow()
Example no. 17
def main() -> None:
    parser = ArgumentParser(description='Guesslang data preparation tool')
    parser.add_argument(
        '-d',
        '--debug',
        action='store_true',
        help='display debug messages',
    )

    # Setup to generate Guesslang training, validation and test datasets
    parser.add_argument(
        'CACHE_DIR',
        help='directory where the generated content will be stored',
    )
    parser.add_argument(
        '--nb-repo',
        type=int,
        default=8000,
        help='number of repositories per language',
    )
    parser.add_argument(
        '--nb-train-files',
        type=int,
        default=27000,
        help='number of training files per language',
    )
    parser.add_argument(
        '--nb-valid-files',
        type=int,
        default=4000,
        help='number of validation files per language',
    )
    parser.add_argument(
        '--nb-test-files',
        type=int,
        default=4000,
        help='number of testing files per language',
    )

    # Utils to analyse Guesslang model performances
    parser.add_argument(
        '--util-prediction-confidence',
        action='store_true',
        default=False,
        help='plot the prediction probabilities distribution for each language',
    )
    parser.add_argument(
        '--util-confusion-matrix',
        metavar='GUESSLANG_TEST_REPORT_FILENAME',
        help='show languages that Guesslang confuses with others',
    )
    parser.add_argument(
        '--util-less-training-files',
        type=int,
        metavar='NB_FILES_PER_LANGUAGE',
        help='extract a subset of the training files dataset',
    )

    # Hacks to use when you don't have enough files for some language
    parser.add_argument(
        '--hack-repo-dist',
        action='store_true',
        default=False,
        help='show the number of selected repositories per language',
    )
    parser.add_argument(
        '--hack-download-repo-list',
        nargs=3,
        type=str,
        # To get a GitHub token, check https://developer.github.com/v3/oauth/
        metavar=('GITHUB_TOKEN', 'LANGUAGE', 'REPO_LIST_FILENAME'),
        help='download a list of repository names from GitHub for a language',
    )
    parser.add_argument(
        '--hack-merge-repo-list',
        metavar='REPO_LIST_FILENAME',
        help='merge downloaded repository names to the selected repositories',
    )
    parser.add_argument(
        '--hack-add-repo',
        nargs='+',
        metavar='LANGUAGE',
        help='select more repositories for the listed languages',
    )
    parser.add_argument(
        '--hack-only-use-downloaded-repo',
        action='store_true',
        default=False,
        help='only use the repositories that have already been downloaded',
    )

    args = parser.parse_args()
    items = vars(args).items()
    util_args = any(val for name, val in items if name.startswith('util_'))
    hack_args = any(val for name, val in items if name.startswith('hack_'))

    log_level = 'DEBUG' if args.debug else 'INFO'
    LOGGING_CONFIG['root']['level'] = log_level
    logging.config.dictConfig(LOGGING_CONFIG)

    config = Config(
        cache_dir=args.CACHE_DIR,
        nb_repositories=args.nb_repo,
        nb_train=args.nb_train_files,
        nb_valid=args.nb_valid_files,
        nb_test=args.nb_test_files,
    )

    with suppress(KeyboardInterrupt):
        if util_args:
            run_utils(config, args)
        elif hack_args:
            run_hacks(config, args)
        else:
            run_workflow(config)
Example no. 18
def split(config: Config) -> None:
    LOGGER.info('Split repositories by usage: train, valid & test')
    LOGGER.info('This operation should take a few seconds...')

    files = config.load_csv(File.DEDUPLICATED_FILES)
    files = files.drop('dedup_key', axis=1)
    repo_columns = ['repository_language', 'repository_dirname']

    repo = files[repo_columns].drop_duplicates()
    repo = repo.sample(frac=1).reset_index(drop=True)
    repo.loc[:, 'usage'] = ''

    LOGGER.info(f'Total downloaded repositories: {len(repo)}')

    total_files = (
        config.nb_train_files_per_language
        + config.nb_valid_files_per_language
        + config.nb_test_files_per_language
    )
    valid_ratio = config.nb_valid_files_per_language / total_files
    valid_ratio = max(valid_ratio, MIN_SPLIT_RATIO)

    test_ratio = config.nb_test_files_per_language / total_files
    test_ratio = max(test_ratio, MIN_SPLIT_RATIO)

    repositories = {}
    for lang in config.languages:
        by_language = repo[repo['repository_language'] == lang]
        total = len(by_language)
        if total < MIN_REPOSITORIES:
            raise RuntimeError(
                f'Need more than {MIN_REPOSITORIES}, '
                f'only {total} repositories usable for language {lang}'
            )

        nb_test = max(int(total*test_ratio), 1)
        nb_valid = max(int(total*valid_ratio), 1)
        nb_test_valid = nb_test + nb_valid

        test = by_language[:nb_test]
        test['usage'].values[:] = 'test'
        repositories[f'{lang}/test'] = test

        valid = by_language[nb_test:nb_test_valid]
        valid['usage'].values[:] = 'valid'
        repositories[f'{lang}/valid'] = valid

        train = by_language[nb_test_valid:]
        train['usage'].values[:] = 'train'
        repositories[f'{lang}/train'] = train

        LOGGER.info(
            f'{lang} nb repositories, train: {total-nb_test_valid}, '
            f'valid: {nb_valid}, test: {nb_test}'
        )

    for name, repository in repositories.items():
        if not len(repository):
            LOGGER.error(f'No repositories available for {name}')
            raise RuntimeError(f'No repositories for category: {name}')

    repo = pd.concat(repositories.values())
    files = pd.merge(files, repo, on=repo_columns)
    config.save_csv(files, File.FILES_SPLIT_BY_USAGE)
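
A quick worked example of the ratio arithmetic above, using the default targets of 27000 train, 4000 valid and 4000 test files per language (as in the main() examples) and a hypothetical language with 500 usable repositories; the ratios are assumed to already exceed MIN_SPLIT_RATIO:

total_files = 27000 + 4000 + 4000             # 35000 target files per language
valid_ratio = 4000 / total_files              # ~0.114
test_ratio = 4000 / total_files               # ~0.114

total = 500                                   # repositories found for one language
nb_test = max(int(total * test_ratio), 1)     # 57
nb_valid = max(int(total * valid_ratio), 1)   # 57
nb_train = total - nb_test - nb_valid         # 386 repositories left for training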
Example no. 19
def download(config: Config) -> None:
    LOGGER.info('Retrieving repositories dataset (8GB)')
    LOGGER.info('This operation might take a lot of time...')

    destination = config.absolute(File.COMPRESSED_DATASET)
    download_file(DATASET_URL, destination)
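
download_file comes from the project's helpers and is not shown. One plausible streaming implementation using the requests library (the chunk size is arbitrary):

import requests

def download_file(url, destination):
    # Stream the archive to disk so the 8GB download never sits in memory.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(destination, 'wb') as output:
            for chunk in response.iter_content(chunk_size=1 << 20):
                output.write(chunk)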
Example no. 20
def extract(config: Config) -> None:
    LOGGER.info('Extract selected files')
    LOGGER.info('This operation might take a lot of time...')

    train_path = config.extracted_files_dir.joinpath('train')
    valid_path = config.extracted_files_dir.joinpath('valid')
    test_path = config.extracted_files_dir.joinpath('test')

    train_path.mkdir(exist_ok=True)
    valid_path.mkdir(exist_ok=True)
    test_path.mkdir(exist_ok=True)

    # Load list of files to extract
    source = config.load_csv(File.FILES_SPLIT_BY_USAGE)

    # Load list of processed files
    try:
        files = config.load_csv(File.EXTRACTED_FILES)
    except IOError:
        files = pd.DataFrame([], columns=EXTRACTED_FILES_COLUMNS)

    df = pd.merge(source, files, how='outer', on=list(source.columns))
    df.loc[df['status'].isnull(), 'status'] = Status.PENDING.value

    # Flag existing files
    is_pending = df['status'] == Status.PENDING.value
    file_exists = df.apply(partial(_destination_exists, config), axis=1)
    df.loc[(is_pending & file_exists), 'status'] = Status.DISCARDED.value

    while True:
        selected = _choose_files_to_extract(config, df)
        LOGGER.info(f'{len(selected)} files to extract')

        if not len(selected):
            break

        result = _extract_files(config, selected)

        result_extracted = result[result['status'] == Status.EXTRACTED.value]
        mask = df['extract_to'].isin(result_extracted['extract_to'])
        df.loc[mask, 'status'] = Status.EXTRACTED.value

        result_discarded = result[result['status'] == Status.DISCARDED.value]
        mask = df['extract_to'].isin(result_discarded['extract_to'])
        df.loc[mask, 'status'] = Status.DISCARDED.value

        extracted = df[df['status'] == Status.EXTRACTED.value]
        discarded = df[df['status'] == Status.DISCARDED.value]

        LOGGER.info(
            f'Processed {len(result)} files: {len(result_extracted)} '
            f'extracted, {len(result_discarded)} discarded'
        )

        LOGGER.info(f'{len(extracted)} total files extracted')
        LOGGER.info(f'{len(discarded)} total files discarded')

    config.save_csv(df, File.EXTRACTED_FILES)

    LOGGER.info(f'The training files are located in {train_path}')
    LOGGER.info(f'The validation files are located in {valid_path}')
    LOGGER.info(f'The test files are located in {test_path}')
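
_destination_exists is applied row-wise (via functools.partial, imported outside this excerpt) to flag files that are already on disk. A sketch under the assumption that each row has an 'extract_to' column holding a path relative to config.extracted_files_dir:

def _destination_exists(config, row):
    # Hypothetical layout: 'extract_to' is relative to the extraction
    # directory, e.g. 'train/Python/some_file.py'.
    return config.extracted_files_dir.joinpath(row['extract_to']).exists()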