def alter(config: Config) -> None:
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take several minutes...')

    output_path = config.absolute(File.ALTERED_DATASET)
    df = config.load_csv(File.SHRUNK_DATASET)

    # Set repositories with no language as Markdown repositories,
    # because most GitHub repositories have a Readme.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'

    # Handle language aliases
    for alias, languages in config.alias_mapping.items():
        lang = languages[0]
        mask = df['repository_language'] == alias
        df.loc[mask, 'repository_language'] = lang

    # There are too few repositories for some languages.
    # To mitigate this problem, a list of known repositories
    # is added to the dataset.
    other_df = pd.read_csv(OTHER_REPO_DATASET_PATH)
    df = pd.concat([other_df, df]).drop_duplicates('repository_name')

    df.to_csv(output_path, index=False)

def download(config: Config) -> None:
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')

    input_data = config.load_csv(File.PREPARED_REPOSITORIES)
    input_data.loc[:, 'repository_is_empty'] = True
    rows = (dict(row) for _, row in input_data.iterrows())

    # Clone the repositories through the worker pool,
    # logging progress along the way
    result_rows = []
    total = len(input_data)
    for step, row in enumerate(pool_map(_clone_repository, rows, config), 1):
        result_rows.append(row)
        if step % LOG_STEP == 0:
            LOGGER.info(f'--> Processed {step} / {total} repositories...')
    LOGGER.info(f'--> Processed {total} / {total} repositories!')

    data = pd.DataFrame(result_rows)

    LOGGER.info('Removing empty repositories')
    data = data[~data['repository_is_empty']]
    LOGGER.info(f'Kept {len(data)} non-empty repositories')

    fieldnames = ['repository_language', 'repository_dirname']
    output_data = data[fieldnames]
    output_path = config.absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)

def select(config: Config) -> None:
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)
    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in config.languages:
        filtered = shuffled[shuffled['repository_language'] == lang]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)
        LOGGER.info(
            f'{lang} repositories, found: {nb_found}, kept: {nb_selected}')
        if nb_selected < max_repositories:
            LOGGER.warning(f'{lang}, not enough repositories, '
                           f'required: {max_repositories}')
        if nb_selected == 0:
            continue

        selected = filtered[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    output_path = config.absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)

def extract(config: Config) -> None:
    LOGGER.info('Extracting repositories list file')
    LOGGER.info('This operation might take several minutes...')

    compressed_filename = config.absolute(File.COMPRESSED_DATASET)
    with tarfile.open(compressed_filename) as tar:
        tar.extract(DATASET_FILENAME, path=config.absolute('.'))

    extracted_file = config.absolute(DATASET_FILENAME)
    extracted_file.rename(config.absolute(File.DATASET))

def prepare(config: Config) -> None:
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take a few seconds...')

    input_data = config.load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_dirname'] = ''
    input_data.loc[:, 'repository_url'] = ''
    output_data = input_data.apply(_add_download_info, axis=1)

    output_path = config.absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)

def deduplicate(config: Config) -> None:
    df = config.load_csv(File.AVAILABLE_FILES)
    df.drop_duplicates(subset='dedup_key', inplace=True)
    df.sort_values(by='rank', inplace=True)

    LOGGER.info('Files available by language:')
    for lang in config.languages:
        nb_files = len(df[df['language'] == lang])
        LOGGER.info(f'--> {lang}: {nb_files}')

    config.save_csv(df, File.DEDUPLICATED_FILES)

def setup_function(_):
    tempdir = mkdtemp(suffix='_gesslangtools_unittest')
    print(f'Temporary config directory: {tempdir}')

    Config.setup(
        cache_dir=tempdir,
        nb_repositories=10,
        nb_train=10,
        nb_valid=10,
        nb_test=10,
    )
    assert Config.cache_dir == tempdir

def list_all(config: Config) -> None:
    LOGGER.info('List source files from repositories')
    LOGGER.info('This operation might take several minutes...')

    # Start or resume files listing
    repo = config.load_csv(File.DOWNLOADED_REPOSITORIES)
    try:
        files = config.load_csv(File.AVAILABLE_FILES)
    except IOError:
        files = pd.DataFrame([], columns=AVAILABLE_FILES_COLUMNS)

    # Find repositories that have not been processed yet
    mask = ~repo['repository_dirname'].isin(files['repository_dirname'])
    new_repo = repo[mask]
    LOGGER.info(f'{len(new_repo)} newly downloaded repositories')

    # Show the number of deleted repositories
    nb_repo_before = len(files['repository_dirname'].unique())
    mask = files['repository_dirname'].isin(repo['repository_dirname'])
    files = files[mask]
    nb_repo_after = len(files['repository_dirname'].unique())
    nb_removed = nb_repo_before - nb_repo_after
    LOGGER.info(f'{nb_removed} deleted repositories')

    # List the files of the unprocessed repositories
    total = len(new_repo)
    rows = (dict(repo) for _, repo in new_repo.iterrows())
    output_path = config.absolute(File.AVAILABLE_FILES)
    write_headers = not output_path.exists()

    csv.field_size_limit(CSV_FIELD_LIMIT)
    with output_path.open('a') as output:
        writer = csv.DictWriter(output, fieldnames=AVAILABLE_FILES_COLUMNS)
        if write_headers:
            writer.writeheader()

        for index, result in enumerate(
                pool_map(_list_files, rows, config), 1):
            for item in result:
                writer.writerow(item)
            if index % LOG_STEP == 0:
                LOGGER.info(f'--> Processed {index} / {total} repositories...')
        LOGGER.info(f'--> Processed {total} / {total} repositories!')

    LOGGER.info(f'Created file: {output_path}')

def show_repositories_distribution(config: Config) -> None:
    LOGGER.info('Loading repositories info')
    LOGGER.info('This operation should take a few seconds...')

    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    count = selected.repository_language.value_counts()

    pd.set_option('display.max_rows', None)
    print(count)

def merge_to_selected_repositories(config: Config, filename: str) -> None:
    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    listed = config.load_csv(filename)

    selected = pd.concat([listed, selected])
    selected = selected.drop_duplicates('repository_name')

    config.backup(File.SELECTED_REPOSITORIES)
    config.save_csv(selected, File.SELECTED_REPOSITORIES)

    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)

def select_only_downloaded_repo(config: Config) -> None:
    downloaded_repo = (
        path.name for path in config.repositories_dir.glob('*'))
    selected = config.load_csv(File.SELECTED_REPOSITORIES)
    prepared = config.load_csv(File.PREPARED_REPOSITORIES)
    LOGGER.info(f'{len(selected)} repositories previously selected')

    repo = pd.DataFrame(downloaded_repo, columns=['repository_dirname'])
    mask = prepared['repository_dirname'].isin(repo['repository_dirname'])
    prepared = prepared[mask]

    mask = selected['repository_name'].isin(prepared['repository_name'])
    selected = selected[mask]
    LOGGER.info(f'{len(selected)} downloaded repositories selected')

    config.backup(File.SELECTED_REPOSITORIES)
    config.backup(File.PREPARED_REPOSITORIES)
    config.save_csv(selected, File.SELECTED_REPOSITORIES)
    config.save_csv(prepared, File.PREPARED_REPOSITORIES)

def select_more_repositories(config: Config, languages: List[str]) -> None:
    LOGGER.info('Choose more repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    known = config.load_csv(File.SELECTED_REPOSITORIES)

    # Only consider repositories that have not already been selected
    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)
    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in languages:
        if lang not in config.languages:
            LOGGER.error(f'Unknown language {lang}')
            raise RuntimeError(f'Unknown language {lang}')

        pending = shuffled[shuffled['repository_language'] == lang]
        nb_known = len(known[known['repository_language'] == lang])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected
        LOGGER.info(f'{lang}: repositories per language: {max_repositories}, '
                    f'pending: {nb_pending}, known: {nb_known}, '
                    f'selected: {nb_selected}, total: {total}')
        if total < max_repositories:
            LOGGER.warning(f'{lang}, not enough repositories, '
                           f'required: {max_repositories}')
        if nb_selected == 0:
            continue

        selected = pending[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    config.backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        config.backup(File.PREPARED_REPOSITORIES)

    new_repositories = pd.concat(selected_list)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    united = pd.concat([known, new_repositories])
    config.save_csv(united, File.SELECTED_REPOSITORIES)

def finalize(config: Config) -> None:
    items = config.extensions.items()
    lang_ext = OrderedDict(sorted(items, key=_lang_name))

    language_filename = config.absolute('languages.json')
    with language_filename.open('w') as output:
        json.dump(lang_ext, output, indent=2)

    LOGGER.info('Dataset successfully generated')
    LOGGER.info('To train Guesslang with this dataset:')
    LOGGER.info(f'* copy {language_filename} into guesslang/data/ directory')
    LOGGER.info(
        f'* run $ guesslang --train {config.cache_path} /path/to/new_model'
    )

def config():
    tempdir = mkdtemp(prefix='guesslangtools_unittest_')
    print(f'Temporary config directory: {tempdir}')

    config = Config(
        cache_dir=tempdir,
        nb_repositories=REPO_PER_LANG,
        nb_train=FILES_PER_LANG_PER_DATASET,
        nb_valid=FILES_PER_LANG_PER_DATASET,
        nb_test=FILES_PER_LANG_PER_DATASET,
    )
    assert config.cache_path == Path(tempdir).absolute()

    try:
        yield config
    finally:
        rmtree(config.cache_path)

def shrink(config: Config) -> None:
    LOGGER.info('Shrink repositories list file')
    LOGGER.info('This operation might take several minutes...')

    input_path = config.absolute(File.DATASET)
    output_path = config.absolute(File.SHRUNK_DATASET)

    # The input dataset is too large to be fully loaded into memory
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with input_path.open() as input_file, output_path.open('w') as output_file:
        reader = csv.DictReader(input_file)

        fieldnames = ['repository_name', 'repository_language']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        for item in reader:
            if _ignore(item):
                continue

            smaller_item = {
                'repository_name': item['Name with Owner'],
                'repository_language': item['Language'],
            }
            writer.writerow(smaller_item)

def main() -> None:
    parser = ArgumentParser(description='Guesslang data preparation tool')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='display debug messages')
    parser.add_argument(
        'CACHE_DIR',
        help='directory where the generated content will be stored')
    parser.add_argument('--nb-train-files', type=int, default=27000,
                        help='number of training files per language')
    parser.add_argument('--nb-valid-files', type=int, default=4000,
                        help='number of validation files per language')
    parser.add_argument('--nb-test-files', type=int, default=4000,
                        help='number of testing files per language')
    parser.add_argument('--nb-repo', type=int, default=4000,
                        help='number of repositories per language')
    parser.add_argument(
        '--hack-repo-dist', action='store_true', default=False,
        help='show the number of selected repositories per language')
    parser.add_argument(
        '--hack-add-repo', nargs='+', metavar='LANGUAGE',
        help='select more repositories for the listed languages')
    parser.add_argument(
        '--hack-only-downloaded-repo', action='store_true', default=False,
        help='only use the repositories that have already been downloaded')
    args = parser.parse_args()

    items = vars(args).items()
    hack_args = any(val for name, val in items if name.startswith('hack_'))

    log_level = 'DEBUG' if args.debug else 'INFO'
    LOGGING_CONFIG['root']['level'] = log_level
    logging.config.dictConfig(LOGGING_CONFIG)

    Config.setup(
        cache_dir=args.CACHE_DIR,
        nb_repositories=args.nb_repo,
        nb_train=args.nb_train_files,
        nb_valid=args.nb_valid_files,
        nb_test=args.nb_test_files,
    )

    with suppress(KeyboardInterrupt):
        if hack_args:
            run_hacks(args)
        else:
            run_workflow()

def main() -> None:
    parser = ArgumentParser(description='Guesslang data preparation tool')
    parser.add_argument(
        '-d', '--debug', action='store_true',
        help='display debug messages',
    )

    # Setup to generate Guesslang training, validation and test datasets
    parser.add_argument(
        'CACHE_DIR',
        help='directory where the generated content will be stored',
    )
    parser.add_argument(
        '--nb-repo', type=int, default=8000,
        help='number of repositories per language',
    )
    parser.add_argument(
        '--nb-train-files', type=int, default=27000,
        help='number of training files per language',
    )
    parser.add_argument(
        '--nb-valid-files', type=int, default=4000,
        help='number of validation files per language',
    )
    parser.add_argument(
        '--nb-test-files', type=int, default=4000,
        help='number of testing files per language',
    )

    # Utils to analyse Guesslang model performance
    parser.add_argument(
        '--util-prediction-confidence', action='store_true', default=False,
        help='plot the prediction probabilities distribution '
             'for each language',
    )
    parser.add_argument(
        '--util-confusion-matrix', metavar='GUESSLANG_TEST_REPORT_FILENAME',
        help='show languages that Guesslang confuses with others',
    )
    parser.add_argument(
        '--util-less-training-files', type=int,
        metavar='NB_FILES_PER_LANGUAGE',
        help='extract a subset of the training files dataset',
    )

    # Hacks to use when you don't have enough files for some languages
    parser.add_argument(
        '--hack-repo-dist', action='store_true', default=False,
        help='show the number of selected repositories per language',
    )
    parser.add_argument(
        '--hack-download-repo-list', nargs=3, type=str,
        # To get a Github token, check https://developer.github.com/v3/oauth/
        metavar=('GITHUB_TOKEN', 'LANGUAGE', 'REPO_LIST_FILENAME'),
        help='download a list of repository names from Github for a language',
    )
    parser.add_argument(
        '--hack-merge-repo-list', metavar='REPO_LIST_FILENAME',
        help='merge downloaded repository names into the selected '
             'repositories',
    )
    parser.add_argument(
        '--hack-add-repo', nargs='+', metavar='LANGUAGE',
        help='select more repositories for the listed languages',
    )
    parser.add_argument(
        '--hack-only-use-downloaded-repo', action='store_true', default=False,
        help='only use the repositories that have already been downloaded',
    )

    args = parser.parse_args()
    items = vars(args).items()
    util_args = any(val for name, val in items if name.startswith('util_'))
    hack_args = any(val for name, val in items if name.startswith('hack_'))

    log_level = 'DEBUG' if args.debug else 'INFO'
    LOGGING_CONFIG['root']['level'] = log_level
    logging.config.dictConfig(LOGGING_CONFIG)

    config = Config(
        cache_dir=args.CACHE_DIR,
        nb_repositories=args.nb_repo,
        nb_train=args.nb_train_files,
        nb_valid=args.nb_valid_files,
        nb_test=args.nb_test_files,
    )

    with suppress(KeyboardInterrupt):
        if util_args:
            run_utils(config, args)
        elif hack_args:
            run_hacks(config, args)
        else:
            run_workflow(config)

def split(config: Config) -> None:
    LOGGER.info('Split repositories by usage: train, valid & test')
    LOGGER.info('This operation should take a few seconds...')

    files = config.load_csv(File.DEDUPLICATED_FILES)
    files = files.drop('dedup_key', axis=1)

    repo_columns = ['repository_language', 'repository_dirname']
    repo = files[repo_columns].drop_duplicates()
    repo = repo.sample(frac=1).reset_index(drop=True)
    repo.loc[:, 'usage'] = ''
    LOGGER.info(f'Total downloaded repositories: {len(repo)}')

    # Compute the validation and test split ratios from the requested
    # number of files per dataset
    total_files = (
        config.nb_train_files_per_language
        + config.nb_valid_files_per_language
        + config.nb_test_files_per_language
    )
    valid_ratio = config.nb_valid_files_per_language / total_files
    valid_ratio = max(valid_ratio, MIN_SPLIT_RATIO)
    test_ratio = config.nb_test_files_per_language / total_files
    test_ratio = max(test_ratio, MIN_SPLIT_RATIO)

    # Assign the repositories of each language to the test, valid
    # and train sets
    repositories = {}
    for lang in config.languages:
        by_language = repo[repo['repository_language'] == lang]
        total = len(by_language)
        if total < MIN_REPOSITORIES:
            raise RuntimeError(
                f'Need at least {MIN_REPOSITORIES} repositories, '
                f'only {total} usable for language {lang}'
            )

        nb_test = max(int(total*test_ratio), 1)
        nb_valid = max(int(total*valid_ratio), 1)
        nb_test_valid = nb_test + nb_valid

        test = by_language[:nb_test]
        test['usage'].values[:] = 'test'
        repositories[f'{lang}/test'] = test

        valid = by_language[nb_test:nb_test_valid]
        valid['usage'].values[:] = 'valid'
        repositories[f'{lang}/valid'] = valid

        train = by_language[nb_test_valid:]
        train['usage'].values[:] = 'train'
        repositories[f'{lang}/train'] = train

        LOGGER.info(
            f'{lang} nb repositories, train: {total-nb_test_valid}, '
            f'valid: {nb_valid}, test: {nb_test}'
        )

    for name, repository in repositories.items():
        if not len(repository):
            LOGGER.error(f'No repositories available for {name}')
            raise RuntimeError(f'No repositories for category: {name}')

    repo = pd.concat(repositories.values())
    files = pd.merge(files, repo, on=repo_columns)
    config.save_csv(files, File.FILES_SPLIT_BY_USAGE)

def download(config: Config) -> None:
    LOGGER.info('Retrieving repositories dataset (8GB)')
    LOGGER.info('This operation might take a lot of time...')

    destination = config.absolute(File.COMPRESSED_DATASET)
    download_file(DATASET_URL, destination)

def extract(config: Config) -> None:
    LOGGER.info('Extract selected files')
    LOGGER.info('This operation might take a lot of time...')

    train_path = config.extracted_files_dir.joinpath('train')
    valid_path = config.extracted_files_dir.joinpath('valid')
    test_path = config.extracted_files_dir.joinpath('test')
    train_path.mkdir(exist_ok=True)
    valid_path.mkdir(exist_ok=True)
    test_path.mkdir(exist_ok=True)

    # Load list of files to extract
    source = config.load_csv(File.FILES_SPLIT_BY_USAGE)

    # Load list of processed files
    try:
        files = config.load_csv(File.EXTRACTED_FILES)
    except IOError:
        files = pd.DataFrame([], columns=EXTRACTED_FILES_COLUMNS)

    df = pd.merge(source, files, how='outer', on=list(source.columns))
    df.loc[df['status'].isnull(), 'status'] = Status.PENDING.value

    # Flag existing files
    is_pending = df['status'] == Status.PENDING.value
    file_exists = df.apply(partial(_destination_exists, config), axis=1)
    df.loc[(is_pending & file_exists), 'status'] = Status.DISCARDED.value

    while True:
        selected = _choose_files_to_extract(config, df)
        LOGGER.info(f'{len(selected)} files to extract')
        if not len(selected):
            break

        result = _extract_files(config, selected)

        result_extracted = result[result['status'] == Status.EXTRACTED.value]
        mask = df['extract_to'].isin(result_extracted['extract_to'])
        df.loc[mask, 'status'] = Status.EXTRACTED.value

        result_discarded = result[result['status'] == Status.DISCARDED.value]
        mask = df['extract_to'].isin(result_discarded['extract_to'])
        df.loc[mask, 'status'] = Status.DISCARDED.value

        extracted = df[df['status'] == Status.EXTRACTED.value]
        discarded = df[df['status'] == Status.DISCARDED.value]
        LOGGER.info(
            f'Processed {len(result)} files: {len(result_extracted)} '
            f'extracted, {len(result_discarded)} discarded'
        )
        LOGGER.info(f'{len(extracted)} total files extracted')
        LOGGER.info(f'{len(discarded)} total files discarded')

        config.save_csv(df, File.EXTRACTED_FILES)

    LOGGER.info(f'The training files are located in {train_path}')
    LOGGER.info(f'The validation files are located in {valid_path}')
    LOGGER.info(f'The test files are located in {test_path}')