def deduplicate(config: Config) -> None:
    df = config.load_csv(File.AVAILABLE_FILES)
    df.drop_duplicates(subset='dedup_key', inplace=True)
    df.sort_values(by='rank', inplace=True)

    LOGGER.info('Files available by language:')
    for lang in config.languages:
        nb_files = len(df[df['language'] == lang])
        LOGGER.info(f'--> {lang}: {nb_files}')

    config.save_csv(df, File.DEDUPLICATED_FILES)
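# `dedup_key` is assumed to be a content fingerprint computed upstream when
# the available-files list is built. A minimal sketch of how such a key could
# be derived (hypothetical helper, not part of this module):
def _dedup_key_sketch(content: bytes) -> str:
    import hashlib
    # Identical file contents yield identical keys, so the `drop_duplicates`
    # call above keeps a single copy of each distinct file
    return hashlib.md5(content).hexdigest()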
def merge_to_selected_repositories(config: Config, filename: str) -> None:
    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    listed = config.load_csv(filename)

    selected = pd.concat([listed, selected])
    selected = selected.drop_duplicates('repository_name')

    config.backup(File.SELECTED_REPOSITORIES)
    config.save_csv(selected, File.SELECTED_REPOSITORIES)

    # The prepared repositories file may not exist yet
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)
def select_more_repositories(config: Config, languages: List[str]) -> None:
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    known = config.load_csv(File.SELECTED_REPOSITORIES)

    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)
    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in languages:
        if lang not in config.languages:
            LOGGER.error(f'Unknown language {lang}')
            raise RuntimeError(f'Unknown language {lang}')

        pending = shuffled[shuffled['repository_language'] == lang]
        nb_known = len(known[known['repository_language'] == lang])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected

        LOGGER.info(
            f'{lang}: repositories per language: {max_repositories}, '
            f'pending: {nb_pending}, known: {nb_known}, '
            f'selected: {nb_selected}, total: {total}'
        )
        if total < max_repositories:
            LOGGER.warning(
                f'{lang}, not enough repositories, '
                f'required: {max_repositories}'
            )

        if nb_selected == 0:
            continue

        selected = pending[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    config.backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)

    new_repositories = pd.concat(selected_list)
    # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead
    united = pd.concat([known, new_repositories])
    config.save_csv(united, File.SELECTED_REPOSITORIES)
def select_only_downloaded_repo(config: Config) -> None:
    downloaded_repo = (path.name for path in config.repositories_dir.glob('*'))
    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    prepared = config.load_csv(File.PREPARED_REPOSITORIES)
    LOGGER.info(f'{len(selected)} repositories previously selected')

    repo = pd.DataFrame(downloaded_repo, columns=['repository_dirname'])
    mask = prepared['repository_dirname'].isin(repo['repository_dirname'])
    prepared = prepared[mask]

    mask = selected['repository_name'].isin(prepared['repository_name'])
    selected = selected[mask]
    LOGGER.info(f'{len(selected)} downloaded repositories selected')

    config.backup(File.SELECTED_REPOSITORIES)
    config.backup(File.PREPARED_REPOSITORIES)
    config.save_csv(selected, File.SELECTED_REPOSITORIES)
    config.save_csv(prepared, File.PREPARED_REPOSITORIES)
def extract(config: Config) -> None:
    LOGGER.info('Extract selected files')
    LOGGER.info('This operation might take a lot of time...')

    train_path = config.extracted_files_dir.joinpath('train')
    valid_path = config.extracted_files_dir.joinpath('valid')
    test_path = config.extracted_files_dir.joinpath('test')
    train_path.mkdir(exist_ok=True)
    valid_path.mkdir(exist_ok=True)
    test_path.mkdir(exist_ok=True)

    # Load list of files to extract
    source = config.load_csv(File.FILES_SPLIT_BY_USAGE)

    # Load list of processed files
    try:
        files = config.load_csv(File.EXTRACTED_FILES)
    except IOError:
        files = pd.DataFrame([], columns=EXTRACTED_FILES_COLUMNS)

    df = pd.merge(source, files, how='outer', on=list(source.columns))
    df.loc[df['status'].isnull(), 'status'] = Status.PENDING.value

    # Flag existing files
    is_pending = df['status'] == Status.PENDING.value
    file_exists = df.apply(partial(_destination_exists, config), axis=1)
    df.loc[(is_pending & file_exists), 'status'] = Status.DISCARDED.value

    while True:
        selected = _choose_files_to_extract(config, df)
        LOGGER.info(f'{len(selected)} files to extract')
        if not len(selected):
            break

        result = _extract_files(config, selected)

        result_extracted = result[result['status'] == Status.EXTRACTED.value]
        mask = df['extract_to'].isin(result_extracted['extract_to'])
        df.loc[mask, 'status'] = Status.EXTRACTED.value

        result_discarded = result[result['status'] == Status.DISCARDED.value]
        mask = df['extract_to'].isin(result_discarded['extract_to'])
        df.loc[mask, 'status'] = Status.DISCARDED.value

        extracted = df[df['status'] == Status.EXTRACTED.value]
        discarded = df[df['status'] == Status.DISCARDED.value]
        LOGGER.info(
            f'Processed {len(result)} files: {len(result_extracted)} '
            f'extracted, {len(result_discarded)} discarded'
        )
        LOGGER.info(f'{len(extracted)} total files extracted')
        LOGGER.info(f'{len(discarded)} total files discarded')

        config.save_csv(df, File.EXTRACTED_FILES)

    LOGGER.info(f'The training files are located in {train_path}')
    LOGGER.info(f'The validation files are located in {valid_path}')
    LOGGER.info(f'The test files are located in {test_path}')
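# Hedged sketch of the `_destination_exists` helper used above: it is assumed
# to check whether a row's extraction target already exists on disk. The real
# helper is defined elsewhere in this module; this stand-in (note the distinct
# name, to avoid shadowing it) only illustrates the assumed shape. The
# `extract_to` column name matches the one used in `extract()`.
def _destination_exists_sketch(config: Config, row: 'pd.Series') -> bool:
    # Assumption: `extract_to` holds a path relative to the extraction root
    return config.extracted_files_dir.joinpath(row['extract_to']).exists()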
def split(config: Config) -> None:
    LOGGER.info('Split repositories by usage: train, valid & test')
    LOGGER.info('This operation should take a few seconds...')

    files = config.load_csv(File.DEDUPLICATED_FILES)
    files = files.drop('dedup_key', axis=1)

    repo_columns = ['repository_language', 'repository_dirname']
    repo = files[repo_columns].drop_duplicates()
    repo = repo.sample(frac=1).reset_index(drop=True)
    repo.loc[:, 'usage'] = ''
    LOGGER.info(f'Total downloaded repositories: {len(repo)}')

    total_files = (
        config.nb_train_files_per_language
        + config.nb_valid_files_per_language
        + config.nb_test_files_per_language
    )
    valid_ratio = config.nb_valid_files_per_language / total_files
    valid_ratio = max(valid_ratio, MIN_SPLIT_RATIO)
    test_ratio = config.nb_test_files_per_language / total_files
    test_ratio = max(test_ratio, MIN_SPLIT_RATIO)

    repositories = {}
    for lang in config.languages:
        by_language = repo[repo['repository_language'] == lang]
        total = len(by_language)
        if total < MIN_REPOSITORIES:
            raise RuntimeError(
                f'Need at least {MIN_REPOSITORIES} repositories, '
                f'only {total} usable for language {lang}'
            )

        nb_test = max(int(total * test_ratio), 1)
        nb_valid = max(int(total * valid_ratio), 1)
        nb_test_valid = nb_test + nb_valid

        # Work on copies and use plain column assignment instead of writing
        # through `.values[:]`, which mutates a slice in place and trips
        # pandas' SettingWithCopy checks
        test = by_language[:nb_test].copy()
        test['usage'] = 'test'
        repositories[f'{lang}/test'] = test

        valid = by_language[nb_test:nb_test_valid].copy()
        valid['usage'] = 'valid'
        repositories[f'{lang}/valid'] = valid

        train = by_language[nb_test_valid:].copy()
        train['usage'] = 'train'
        repositories[f'{lang}/train'] = train

        LOGGER.info(
            f'{lang} nb repositories, train: {total - nb_test_valid}, '
            f'valid: {nb_valid}, test: {nb_test}'
        )

    for name, repository in repositories.items():
        if not len(repository):
            LOGGER.error(f'No repositories available for {name}')
            raise RuntimeError(f'No repositories for category: {name}')

    repo = pd.concat(repositories.values())
    files = pd.merge(files, repo, on=repo_columns)
    config.save_csv(files, File.FILES_SPLIT_BY_USAGE)
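# A minimal end-to-end sketch, assuming the steps above are meant to run in
# this order on an already-populated Config: `split()` reads the output of
# `deduplicate()`, and `extract()` reads the output of `split()`. The
# `_run_dataset_pipeline` name is hypothetical; the real CLI entry point
# lives elsewhere.
def _run_dataset_pipeline(config: Config) -> None:
    deduplicate(config)   # keep one copy of each file, best rank first
    split(config)         # assign train/valid/test usage per repository
    extract(config)       # copy the selected files into the dataset folders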