def extract(config: Config) -> None:
    LOGGER.info('Extracting repositories list file')
    LOGGER.info('This operation might take several minutes...')

    compressed_filename = config.absolute(File.COMPRESSED_DATASET)
    with tarfile.open(compressed_filename) as tar:
        tar.extract(DATASET_FILENAME, path=config.absolute('.'))

    extracted_file = config.absolute(DATASET_FILENAME)
    extracted_file.rename(config.absolute(File.DATASET))


def alter(config: Config) -> None:
    LOGGER.info('Alter repositories list file')
    LOGGER.info('This operation might take several minutes...')

    output_path = config.absolute(File.ALTERED_DATASET)
    df = config.load_csv(File.SHRUNK_DATASET)

    # Set repositories with no language as Markdown repositories,
    # because most GitHub repositories have a README.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'

    # Handle language aliases
    for alias, languages in config.alias_mapping.items():
        lang = languages[0]
        mask = df['repository_language'] == alias
        df.loc[mask, 'repository_language'] = lang

    # There are too few repositories for some languages.
    # To mitigate this problem, a list of known repositories
    # is added to the dataset.
    other_df = pd.read_csv(OTHER_REPO_DATASET_PATH)
    df = pd.concat([other_df, df]).drop_duplicates('repository_name')

    df.to_csv(output_path, index=False)


def download(config: Config) -> None:
    LOGGER.info('Download chosen repositories')
    LOGGER.info('This operation might take a lot of time...')

    input_data = config.load_csv(File.PREPARED_REPOSITORIES)
    input_data.loc[:, 'repository_is_empty'] = True
    rows = (dict(row) for _, row in input_data.iterrows())

    result_rows = []
    total = len(input_data)
    for step, row in enumerate(pool_map(_clone_repository, rows, config), 1):
        result_rows.append(row)
        if step % LOG_STEP == 0:
            LOGGER.info(f'--> Processed {step} / {total} repositories...')
    LOGGER.info(f'--> Processed {total} / {total} repositories!')

    data = pd.DataFrame(result_rows)

    LOGGER.info('Removing empty repositories')
    data = data[~data['repository_is_empty']]
    LOGGER.info(f'Kept {len(data)} non empty repositories')

    fieldnames = ['repository_language', 'repository_dirname']
    output_data = data[fieldnames]
    output_path = config.absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)


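# `pool_map` and `_clone_repository` are used above but defined elsewhere in
# the project. The two sketches below are illustrative assumptions, not the
# actual implementation: `pool_map` is assumed to fan the rows out to a pool
# of worker processes, and `_clone_repository` is assumed to shallow-clone one
# repository and flag whether it turned out empty.

import multiprocessing  # assumed imports, normally at the top of the module
import subprocess
from functools import partial
from typing import Any, Callable, Dict, Iterable, Iterator


def pool_map(
    function: Callable[..., Any],
    rows: Iterable[Dict[str, Any]],
    config: Config,
) -> Iterator[Any]:
    """Apply `function` to each row in a pool of worker processes (sketch)."""
    worker = partial(function, config=config)
    with multiprocessing.Pool() as pool:
        yield from pool.imap_unordered(worker, rows)


def _clone_repository(row: Dict[str, Any], config: Config) -> Dict[str, Any]:
    """Shallow-clone one repository and mark whether it is empty (sketch)."""
    destination = config.absolute(row['repository_dirname'])
    if not destination.exists():
        subprocess.run(
            ['git', 'clone', '--depth', '1',
             row['repository_url'], str(destination)],
            capture_output=True,
        )
    # A failed clone, or a clone with no working-tree files, counts as empty
    row['repository_is_empty'] = not destination.exists() or not any(
        path.is_file()
        for path in destination.rglob('*')
        if '.git' not in path.parts
    )
    return row

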
def select(config: Config) -> None:
    LOGGER.info('Choose repositories per language')
    LOGGER.info('This operation might take several minutes...')

    input_data = config.load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)
    max_repositories = config.nb_repositories_per_language

    selected_list = []
    for lang in config.languages:
        filtered = shuffled[shuffled['repository_language'] == lang]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)
        LOGGER.info(
            f'{lang} repositories, found: {nb_found}, kept: {nb_selected}')
        if nb_selected < max_repositories:
            LOGGER.warning(
                f'{lang}, not enough repositories, '
                f'required: {max_repositories}')
        if nb_selected == 0:
            continue

        selected = filtered[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    output_path = config.absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)


def prepare(config: Config) -> None:
    LOGGER.info('Prepare repositories download')
    LOGGER.info('This operation should take a few seconds...')

    input_data = config.load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_dirname'] = ''
    input_data.loc[:, 'repository_url'] = ''
    output_data = input_data.apply(_add_download_info, axis=1)

    output_path = config.absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)


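# `_add_download_info` is defined elsewhere in the project. The sketch below
# is an assumption about what it does: derive the clone URL and the local
# directory name of each repository from its 'repository_name' column
# ('owner/project'). The naming scheme is hypothetical.

def _add_download_info(row: pd.Series) -> pd.Series:
    """Fill the download URL and target directory of one repository (sketch)."""
    full_name = row['repository_name']  # e.g. 'owner/project'
    row['repository_url'] = f'https://github.com/{full_name}.git'
    row['repository_dirname'] = full_name.replace('/', '__')  # assumed naming
    return row

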
def finalize(config: Config) -> None:
    items = config.extensions.items()
    lang_ext = OrderedDict(sorted(items, key=_lang_name))

    language_filename = config.absolute('languages.json')
    with language_filename.open('w') as output:
        json.dump(lang_ext, output, indent=2)

    LOGGER.info('Dataset successfully generated')
    LOGGER.info('To train Guesslang with this dataset:')
    LOGGER.info(f'* copy {language_filename} into guesslang/data/ directory')
    LOGGER.info(
        f'* run $ guesslang --train {config.cache_path} /path/to/new_model'
    )


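# `_lang_name`, the sort key used above, is defined elsewhere in the project.
# A minimal assumed implementation: order the (language, extensions) pairs by
# language name, case-insensitively, so languages.json stays stable and
# readable.

from typing import Any, Tuple  # assumed import, normally at the top of the module


def _lang_name(item: Tuple[str, Any]) -> str:
    """Return the sort key of a (language, extensions) pair (sketch)."""
    return item[0].lower()

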
def shrink(config: Config) -> None:
    LOGGER.info('Shrink repositories list file')
    LOGGER.info('This operation might take several minutes...')

    input_path = config.absolute(File.DATASET)
    output_path = config.absolute(File.SHRUNK_DATASET)

    # The input dataset is too large to be fully loaded into memory
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with input_path.open() as input_file, output_path.open('w') as output_file:
        reader = csv.DictReader(input_file)
        fieldnames = ['repository_name', 'repository_language']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        for item in reader:
            if _ignore(item):
                continue

            smaller_item = {
                'repository_name': item['Name with Owner'],
                'repository_language': item['Language'],
            }
            writer.writerow(smaller_item)


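# `_ignore` is defined elsewhere in the project. The sketch below is an
# assumption about the filtering rule: drop raw rows without a usable
# repository name, since that is the only field the shrunk dataset strictly
# requires (rows with a missing language are handled later by `alter`).

from typing import Dict  # assumed import, normally at the top of the module


def _ignore(item: Dict[str, str]) -> bool:
    """Tell whether a raw dataset row should be skipped (sketch)."""
    return not item.get('Name with Owner')

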
def list_all(config: Config) -> None:
    LOGGER.info('List source files from repositories')
    LOGGER.info('This operation might take several minutes...')

    # Start or resume files listing
    repo = config.load_csv(File.DOWNLOADED_REPOSITORIES)
    try:
        files = config.load_csv(File.AVAILABLE_FILES)
    except IOError:
        files = pd.DataFrame([], columns=AVAILABLE_FILES_COLUMNS)

    # Find repositories that have not been processed yet
    mask = ~repo['repository_dirname'].isin(files['repository_dirname'])
    new_repo = repo[mask]
    LOGGER.info(f'{len(new_repo)} newly downloaded repositories')

    # Show the number of deleted repositories
    nb_repo_before = len(files['repository_dirname'].unique())
    mask = files['repository_dirname'].isin(repo['repository_dirname'])
    files = files[mask]
    nb_repo_after = len(files['repository_dirname'].unique())
    nb_removed = nb_repo_before - nb_repo_after
    LOGGER.info(f'{nb_removed} deleted repositories')

    # List the files of the repositories that have not been processed yet
    total = len(new_repo)
    rows = (dict(row) for _, row in new_repo.iterrows())
    output_path = config.absolute(File.AVAILABLE_FILES)
    write_headers = not output_path.exists()

    csv.field_size_limit(CSV_FIELD_LIMIT)
    with output_path.open('a') as output:
        writer = csv.DictWriter(output, fieldnames=AVAILABLE_FILES_COLUMNS)
        if write_headers:
            writer.writeheader()

        for index, result in enumerate(pool_map(_list_files, rows, config)):
            for item in result:
                writer.writerow(item)
            if index % LOG_STEP == 0:
                LOGGER.info(f'--> Processed {index} / {total} repositories...')
        LOGGER.info(f'--> Processed {total} / {total} repositories!')

    LOGGER.info(f'Created file: {output_path}')


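# `_list_files` is defined elsewhere in the project, and the exact content of
# AVAILABLE_FILES_COLUMNS is not shown in this section. The sketch below is an
# assumption: walk one downloaded repository and return a row per regular
# file, using hypothetical column names that match what `list_all` writes out.

from typing import Any, Dict, List  # assumed import, normally at the top of the module


def _list_files(repo: Dict[str, Any], config: Config) -> List[Dict[str, Any]]:
    """List the source files of one downloaded repository (sketch)."""
    repository_path = config.absolute(repo['repository_dirname'])
    return [
        {
            'repository_language': repo['repository_language'],
            'repository_dirname': repo['repository_dirname'],
            'filename': str(path.relative_to(repository_path)),
        }
        for path in repository_path.rglob('*')
        if path.is_file() and '.git' not in path.parts
    ]

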
def download(config: Config) -> None:
    LOGGER.info('Retrieving repositories dataset (8GB)')
    LOGGER.info('This operation might take a lot of time...')

    destination = config.absolute(File.COMPRESSED_DATASET)
    download_file(DATASET_URL, destination)


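# `download_file` is defined elsewhere in the project. A minimal sketch under
# the assumption that it streams the archive to disk in chunks, so the 8GB
# file never has to fit in memory.

import urllib.request  # assumed imports, normally at the top of the module
from pathlib import Path


def download_file(url: str, destination: Path) -> None:
    """Stream a remote file to `destination` (sketch)."""
    with urllib.request.urlopen(url) as response:
        with destination.open('wb') as output:
            while True:
                chunk = response.read(1024 * 1024)  # 1 MiB per read
                if not chunk:
                    break
                output.write(chunk)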