def extract() -> None:
    LOGGER.info('Extract selected files')
    LOGGER.info('This operation might take a lot of time...')
    train_path = Config.extracted_files_dir.joinpath('train')
    valid_path = Config.extracted_files_dir.joinpath('valid')
    test_path = Config.extracted_files_dir.joinpath('test')
    train_path.mkdir(exist_ok=True)
    valid_path.mkdir(exist_ok=True)
    test_path.mkdir(exist_ok=True)
    source = load_csv(File.FILES_SPLIT_BY_USAGE)
    columns = [
        'extract_to', 'filename', 'language', 'rank',
        'repository_filename', 'repository_language', 'usage', 'status',
    ]
    try:
        files = load_csv(File.EXTRACTED_FILES)
    except IOError:
        files = pd.DataFrame([], columns=columns)
    df = pd.merge(source, files, how='outer', on=list(source.columns))
    df.loc[df['status'].isnull(), 'status'] = Status.PENDING.value
    while True:
        selected = _choose_files_to_extract(df)
        LOGGER.info('%s files to extract', len(selected))
        if not len(selected):
            break
        result = _extract_files(selected)
        result_extracted = result[result['status'] == Status.EXTRACTED.value]
        mask = df['extract_to'].isin(result_extracted['extract_to'])
        df.loc[mask, 'status'] = Status.EXTRACTED.value
        result_discarded = result[result['status'] == Status.DISCARDED.value]
        mask = df['extract_to'].isin(result_discarded['extract_to'])
        df.loc[mask, 'status'] = Status.DISCARDED.value
        extracted = df[df['status'] == Status.EXTRACTED.value]
        discarded = df[df['status'] == Status.DISCARDED.value]
        LOGGER.info('Processed %s files: %s extracted, %s discarded',
                    len(result), len(result_extracted), len(result_discarded))
        LOGGER.info('%s total files extracted', len(extracted))
        LOGGER.info('%s total files discarded', len(discarded))
        # Checkpoint progress after each batch so the command can resume
        # from where it stopped.
        save_csv(df, File.EXTRACTED_FILES)
    LOGGER.info('The training files are located in %s', train_path)
    LOGGER.info('The validation files are located in %s', valid_path)
    LOGGER.info('The test files are located in %s', test_path)
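# A minimal sketch of what `_choose_files_to_extract` could look like; the
# real helper may apply per-language and per-usage quotas instead. The
# `BATCH_SIZE` constant is hypothetical, not part of the original code: the
# function returns the next slice of rows still marked as pending, and an
# empty result ends the loop in `extract()`.
BATCH_SIZE = 1000  # hypothetical batch size, for illustration only


def _choose_files_to_extract(df: pd.DataFrame) -> pd.DataFrame:
    pending = df[df['status'] == Status.PENDING.value]
    return pending.head(BATCH_SIZE)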
def select_more_repositories(languages: List[str]) -> None:
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take a few minutes...')
    output_path = absolute(File.SELECTED_REPOSITORIES)
    input_data = load_csv(File.ALTERED_DATASET)
    known = load_csv(File.SELECTED_REPOSITORIES)
    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)
    max_repositories = Config.nb_repositories_per_language
    selected_list = []
    for language in languages:
        if language not in Config.languages:
            LOGGER.error('Unknown language %s', language)
            raise RuntimeError(f'Unknown language {language}')
        pending = shuffled[shuffled['repository_language'] == language]
        nb_known = len(known[known['repository_language'] == language])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected
        LOGGER.info(
            '%s: repositories per language: %s, pending: %s, known: %s, '
            'selected: %s, total: %s',
            language, max_repositories, nb_pending, nb_known, nb_selected,
            total)
        if total < max_repositories:
            LOGGER.warning('%s, not enough repositories, required: %s',
                           language, max_repositories)
        if nb_selected == 0:
            continue
        selected = pending[:nb_selected]
        selected_list.append(selected)
    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')
    backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        backup(File.PREPARED_REPOSITORIES)
    new_repositories = pd.concat(selected_list)
    # `DataFrame.append` was removed in pandas 2.0; use `pd.concat` instead.
    united = pd.concat([known, new_repositories])
    united.to_csv(output_path, index=False)
def download() -> None:
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')
    input_data = load_csv(File.PREPARED_REPOSITORIES)
    rows = (row for _, row in input_data.iterrows())
    result_rows = []
    for step, row in enumerate(pool_imap(_download_repository, rows), 1):
        result_rows.append(row)
        if step % Config.step == 0:
            LOGGER.info('--> Processed %s repositories...', step)
    dataframes = [pd.DataFrame(row).T for row in result_rows]
    data = pd.concat(dataframes)
    data.loc[:, 'repository_size'] = 0
    data = data.apply(_check_size, axis=1)
    data = data[data['repository_size'] != 0]
    data = data[data['repository_filename'] != '']
    fieldnames = ['repository_language', 'repository_filename']
    output_data = data[fieldnames]
    output_path = absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
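# A hedged sketch of `_check_size`, assuming the downloaded archives are
# stored under `Config.repositories_dir` (the directory scanned by
# `select_only_downloaded_repo` below). It fills `repository_size` from the
# on-disk size so `download()` can drop rows whose archive is missing or
# empty; the real helper may perform additional checks.
def _check_size(row: pd.Series) -> pd.Series:
    archive = Config.repositories_dir.joinpath(row['repository_filename'])
    if archive.is_file():
        row['repository_size'] = archive.stat().st_size
    return row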
def split() -> None:
    LOGGER.info('Split repositories by usage: train, valid & test')
    LOGGER.info('This operation should take a few seconds...')
    files = load_csv(File.AVAILABLE_FILES)
    files = files.drop('dedup_key', axis=1)
    columns = ['repository_language', 'repository_filename']
    repo = files[columns].drop_duplicates()
    repo = repo.sample(frac=1).reset_index(drop=True)
    repo.loc[:, 'usage'] = ''
    LOGGER.info('Total downloaded repositories: %s', len(repo))
    total_files = (Config.nb_train_files_per_language
                   + Config.nb_valid_files_per_language
                   + Config.nb_test_files_per_language)
    valid_ratio = Config.nb_valid_files_per_language / total_files
    valid_ratio = max(valid_ratio, MIN_SPLIT_RATIO)
    test_ratio = Config.nb_test_files_per_language / total_files
    test_ratio = max(test_ratio, MIN_SPLIT_RATIO)
    repositories = {}
    for language in Config.languages:
        by_language = repo[repo['repository_language'] == language]
        total = len(by_language)
        if total < 3:
            raise RuntimeError(
                f'Need at least 3 repositories for language {language}')
        nb_test = max(int(total * test_ratio), 1)
        nb_valid = max(int(total * valid_ratio), 1)
        nb_test_valid = nb_test + nb_valid
        # Work on copies to avoid writing through a slice of `repo`.
        test = by_language[:nb_test].copy()
        test['usage'] = 'test'
        repositories[f'{language}/test'] = test
        valid = by_language[nb_test:nb_test_valid].copy()
        valid['usage'] = 'valid'
        repositories[f'{language}/valid'] = valid
        train = by_language[nb_test_valid:].copy()
        train['usage'] = 'train'
        repositories[f'{language}/train'] = train
        LOGGER.info('%s nb repositories, train: %s, valid: %s, test: %s',
                    language, total - nb_test_valid, nb_valid, nb_test)
    for name, repository in repositories.items():
        if not len(repository):
            LOGGER.error('No repositories available for %s', name)
            raise RuntimeError(f'No repositories for category: {name}')
    repo = pd.concat(repositories.values())
    files = pd.merge(files, repo, on=columns)
    save_csv(files, File.FILES_SPLIT_BY_USAGE)
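# Worked example of the split arithmetic above, with hypothetical values
# (the real ones come from Config): nb_train_files_per_language=800,
# nb_valid_files_per_language=100 and nb_test_files_per_language=100 give
# total_files=1000 and, assuming MIN_SPLIT_RATIO <= 0.1,
# valid_ratio = test_ratio = 0.1. A language with 42 downloaded
# repositories then gets nb_test=4, nb_valid=4 and 34 training
# repositories.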
def show_repositories_distribution() -> None:
    LOGGER.info('Loading repositories info')
    LOGGER.info('This operation should take a few seconds...')
    selected = load_csv(File.SELECTED_REPOSITORIES)
    count = selected.repository_language.value_counts()
    pd.set_option('display.max_rows', None)
    print(count)
def select_only_downloaded_repo() -> None:
    downloaded_repo = (path.name for path in Config.repositories_dir.glob('*'))
    selected = load_csv(File.SELECTED_REPOSITORIES)
    prepared = load_csv(File.PREPARED_REPOSITORIES)
    LOGGER.info('%s repositories previously selected', len(selected))
    repo = pd.DataFrame(downloaded_repo, columns=['repository_filename'])
    mask = prepared['repository_filename'].isin(repo['repository_filename'])
    prepared = prepared[mask]
    mask = selected['repository_name'].isin(prepared['repository_name'])
    selected = selected[mask]
    LOGGER.info('%s downloaded repositories selected', len(selected))
    backup(File.SELECTED_REPOSITORIES)
    backup(File.PREPARED_REPOSITORIES)
    save_csv(selected, File.SELECTED_REPOSITORIES)
    save_csv(prepared, File.PREPARED_REPOSITORIES)
def list_all() -> None:
    LOGGER.info('List source files from repositories')
    LOGGER.info('This operation might take a few minutes...')
    columns = [
        'extract_to', 'filename', 'language', 'rank',
        'repository_filename', 'dedup_key',
    ]
    repo = load_csv(File.DOWNLOADED_REPOSITORIES)
    try:
        files = load_csv(File.AVAILABLE_FILES)
    except IOError:
        files = pd.DataFrame([], columns=columns)
    mask = ~repo['repository_filename'].isin(files['repository_filename'])
    new_repo = repo[mask]
    LOGGER.info('%s newly downloaded repositories', len(new_repo))
    nb_repo_before = len(files.repository_filename.unique())
    mask = files['repository_filename'].isin(repo['repository_filename'])
    files = files[mask]
    nb_repo_after = len(files.repository_filename.unique())
    nb_removed = nb_repo_before - nb_repo_after
    LOGGER.info('%s deleted repositories', nb_removed)
    new_files = _list_files_by_language(new_repo)
    df = pd.concat([files, new_files], axis=0, sort=False)
    df.drop_duplicates(subset='dedup_key', inplace=True)
    df.sort_values(by='rank', inplace=True)
    LOGGER.info('Files available by language:')
    for language in Config.languages:
        nb_files = len(df[df['language'] == language])
        LOGGER.info('--> %s: %s', language, nb_files)
    save_csv(df, File.AVAILABLE_FILES)
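# A hedged sketch of `_list_files_by_language`. The real helper is more
# elaborate; this version assumes the archives are tarballs in
# `Config.repositories_dir`, guesses the language from the file extension
# via a hypothetical `EXTENSION_TO_LANGUAGE` mapping, treats `rank` as the
# file's position in the archive (the real meaning of `rank` may differ),
# and uses a content hash as `dedup_key` so identical files vendored into
# several repositories are kept only once. Imports would normally sit at
# the top of the module.
import hashlib
import tarfile

# Hypothetical, deliberately truncated mapping for illustration only.
EXTENSION_TO_LANGUAGE = {'py': 'Python', 'sql': 'SQL'}


def _list_files_by_language(new_repo: pd.DataFrame) -> pd.DataFrame:
    rows = []
    for repository in new_repo['repository_filename']:
        archive_path = Config.repositories_dir.joinpath(repository)
        with tarfile.open(archive_path) as archive:
            for rank, member in enumerate(archive.getmembers()):
                if not member.isfile():
                    continue
                extension = member.name.rsplit('.', 1)[-1]
                language = EXTENSION_TO_LANGUAGE.get(extension)
                if language is None:
                    continue
                content = archive.extractfile(member).read()
                rows.append({
                    'extract_to': f'{repository}_{rank}.{extension}',
                    'filename': member.name,
                    'language': language,
                    'rank': rank,
                    'repository_filename': repository,
                    'dedup_key': hashlib.sha1(content).hexdigest(),
                })
    return pd.DataFrame(rows, columns=[
        'extract_to', 'filename', 'language', 'rank',
        'repository_filename', 'dedup_key',
    ])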
def prepare() -> None:
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take a few seconds...')
    input_data = load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_filename'] = ''
    input_data.loc[:, 'repository_url'] = ''
    output_data = input_data.apply(_add_download_info, axis=1)
    output_path = absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
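# A minimal sketch of `_add_download_info`, assuming GitHub-style
# "owner/project" repository names and a tarball of the default branch;
# the real helper may build the filename and URL differently.
def _add_download_info(row: pd.Series) -> pd.Series:
    name = row['repository_name']
    row['repository_filename'] = name.replace('/', '_') + '.tar.gz'
    row['repository_url'] = f'https://github.com/{name}/archive/HEAD.tar.gz'
    return row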
def alter() -> None:
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take a few minutes...')
    output_path = absolute(File.ALTERED_DATASET)
    df = load_csv(File.SHRUNK_DATASET)
    # Tag repositories with no language as Markdown repositories,
    # because most GitHub repositories have a README.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'
    # There are too few repositories tagged as SQL repositories.
    # To mitigate this problem, a list of known repositories is flagged as
    # SQL repositories.
    sql_df = pd.read_csv(SQL_DATASET_PATH)
    mask = df['repository_name'].isin(sql_df['repository_name'])
    df.loc[mask, 'repository_language'] = 'SQL'
    df.to_csv(output_path, index=False)
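# The SQL dataset read above is expected to be a CSV with at least a
# `repository_name` column matching the names in the shrunk dataset,
# for example (hypothetical rows):
#
#   repository_name
#   someuser/sql-cookbook
#   anotheruser/db-samples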
def select() -> None:
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take a few minutes...')
    input_data = load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)
    max_repositories = Config.nb_repositories_per_language
    selected_list = []
    for language in Config.languages:
        filtered = shuffled[shuffled['repository_language'] == language]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)
        LOGGER.info('%s repositories, found: %s, kept: %s',
                    language, nb_found, nb_selected)
        if nb_selected < max_repositories:
            LOGGER.warning('%s, not enough repositories, required: %s',
                           language, max_repositories)
        if nb_selected == 0:
            continue
        selected = filtered[:nb_selected]
        selected_list.append(selected)
    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')
    output_path = absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)