def extract() -> None:
    LOGGER.info('Extracting repositories list file')
    LOGGER.info('This operation might take a few minutes...')
    compressed_filename = absolute(File.COMPRESSED_DATASET)
    with tarfile.open(compressed_filename) as tar:
        tar.extract(DATASET_FILENAME, path=absolute('.'))
    extracted_file = absolute(DATASET_FILENAME)
    extracted_file.rename(absolute(File.DATASET))
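# NOTE: `absolute` is defined elsewhere in the project. The sketch below is a
# hypothetical stand-in, assuming the helper resolves one or more path
# elements against the tool's working directory. `Config.cache_path` is an
# assumption, not a confirmed attribute of the real configuration object,
# and `File` members are assumed to be plain string constants.
from pathlib import Path

def absolute(*path_elements) -> Path:
    """Resolve path elements against the dataset working directory."""
    return Path(Config.cache_path).joinpath(*map(str, path_elements)).resolve()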
def download() -> None:
    LOGGER.info('Downloading chosen repositories')
    LOGGER.info('This operation might take a lot of time...')
    input_data = load_csv(File.PREPARED_REPOSITORIES)
    rows = (row_info[1] for row_info in input_data.iterrows())

    result_rows = []
    for step, row in enumerate(pool_imap(_download_repository, rows), 1):
        result_rows.append(row)
        if step % Config.step == 0:
            LOGGER.info('--> Processed %s repositories...', step)

    # Rebuild a dataframe from the downloaded rows, then drop repositories
    # that could not be retrieved (empty file or missing filename).
    dataframes = [pd.DataFrame(row).T for row in result_rows]
    data = pd.concat(dataframes)
    data.loc[:, 'repository_size'] = 0
    data = data.apply(_check_size, axis=1)
    data = data[data['repository_size'] != 0]
    data = data[data['repository_filename'] != '']

    fieldnames = ['repository_language', 'repository_filename']
    output_data = data[fieldnames]
    output_path = absolute(File.DOWNLOADED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
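# NOTE: `pool_imap`, `_download_repository` and `_check_size` are project
# helpers that do not appear in this listing. The sketches below are
# assumptions inferred from how `download()` uses them; the worker count
# attribute `Config.nb_workers` and the failure handling are guesses.
import multiprocessing
import urllib.request

def pool_imap(function, iterable):
    """Apply `function` over `iterable` in a process pool, lazily."""
    with multiprocessing.Pool(Config.nb_workers) as pool:
        yield from pool.imap(function, iterable)

def _download_repository(row: pd.Series) -> pd.Series:
    """Download one repository archive; blank the filename on failure."""
    destination = absolute(row['repository_filename'])
    try:
        urllib.request.urlretrieve(row['repository_url'], destination)
    except OSError:
        row['repository_filename'] = ''  # row will be filtered out later
    return row

def _check_size(row: pd.Series) -> pd.Series:
    """Fill `repository_size` with the downloaded archive size, in bytes."""
    if row['repository_filename']:
        path = absolute(row['repository_filename'])
        if path.exists():
            row['repository_size'] = path.stat().st_size
    return row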
def select_more_repositories(languages: List[str]) -> None:
    LOGGER.info('Choosing more repositories per language')
    LOGGER.info('This operation might take a few minutes...')
    output_path = absolute(File.SELECTED_REPOSITORIES)
    input_data = load_csv(File.ALTERED_DATASET)
    known = load_csv(File.SELECTED_REPOSITORIES)

    # Only consider repositories that have not already been selected.
    mask = ~input_data['repository_name'].isin(known['repository_name'])
    repositories = input_data[mask]
    shuffled = repositories.sample(frac=1).reset_index(drop=True)
    max_repositories = Config.nb_repositories_per_language

    selected_list = []
    for language in languages:
        if language not in Config.languages:
            LOGGER.error('Unknown language %s', language)
            raise RuntimeError(f'Unknown language {language}')

        pending = shuffled[shuffled['repository_language'] == language]
        nb_known = len(known[known['repository_language'] == language])
        nb_pending = len(pending)
        nb_required = max(max_repositories - nb_known, 0)
        nb_selected = min(nb_pending, nb_required)
        total = nb_known + nb_selected
        LOGGER.info(
            '%s: repositories per language: %s, pending: %s, known: %s, '
            'selected: %s, total: %s',
            language, max_repositories, nb_pending, nb_known, nb_selected,
            total)

        if total < max_repositories:
            LOGGER.warning(
                '%s, not enough repositories, required: %s',
                language, max_repositories)

        if nb_selected == 0:
            continue

        selected = pending[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    backup(File.SELECTED_REPOSITORIES)
    with suppress(IOError):
        backup(File.PREPARED_REPOSITORIES)

    # `DataFrame.append` was deprecated and removed in pandas 2.0,
    # `pd.concat` does the same job.
    new_repositories = pd.concat(selected_list)
    united = pd.concat([known, new_repositories])
    united.to_csv(output_path, index=False)
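# NOTE: `backup` is not shown in this listing. A plausible sketch, assuming
# the helper moves the current version of a file aside and raises `IOError`
# when the file is missing (which is why the second call above runs under
# `suppress(IOError)`). The timestamped `.bak` naming is an assumption.
from datetime import datetime

def backup(filename: str) -> None:
    """Move an existing file to a timestamped `.bak` copy."""
    path = absolute(filename)
    if not path.exists():
        raise IOError(f'Cannot back up missing file: {path}')
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    path.rename(path.with_name(f'{path.name}.{timestamp}.bak'))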
def prepare() -> None:
    LOGGER.info('Preparing repositories download')
    LOGGER.info('This operation should take a few seconds...')
    input_data = load_csv(File.SELECTED_REPOSITORIES)
    input_data.loc[:, 'repository_filename'] = ''
    input_data.loc[:, 'repository_url'] = ''
    output_data = input_data.apply(_add_download_info, axis=1)
    output_path = absolute(File.PREPARED_REPOSITORIES)
    output_data.to_csv(output_path, index=False)
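# NOTE: `_add_download_info` is defined elsewhere. The sketch below assumes
# it derives a local archive filename and a download URL from the repository
# name; the GitHub codeload URL scheme used here is an illustrative guess,
# not the project's confirmed download source.
def _add_download_info(row: pd.Series) -> pd.Series:
    """Fill in the download URL and local filename of one repository."""
    name = row['repository_name']  # "owner/project"
    row['repository_filename'] = name.replace('/', '.') + '.tar.gz'
    row['repository_url'] = f'https://codeload.github.com/{name}/tar.gz/HEAD'
    return row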
def check_files() -> None:
    path = absolute(File.EXTRACTED_FILES)
    assert path.exists()

    # `Config.languages` maps a language name to its file extensions.
    languages = Config.languages
    files = {lang: 0 for lang in languages}
    with path.open() as csv_file:
        for item in DictReader(csv_file):
            if item['status'] != Status.EXTRACTED.value:
                continue

            language = item['language']
            path_elements = ('files', item['usage'], item['extract_to'])
            extracted_path = absolute(*path_elements)
            ext = extracted_path.suffix.lstrip('.')
            assert extracted_path.exists()
            assert ext in languages[language]
            files[language] += 1

    # Every language must end up with exactly 30 extracted files.
    assert all(count == 30 for count in files.values())
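# NOTE: `Status` is an enum defined elsewhere; `check_files` only relies on
# its `EXTRACTED` member. A minimal sketch, with the string value assumed:
from enum import Enum, unique

@unique
class Status(Enum):
    """Processing status recorded for each extracted source file."""
    EXTRACTED = 'extracted'  # other members of the real enum are omitted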
def shrink() -> None:
    LOGGER.info('Shrinking repositories list file')
    LOGGER.info('This operation might take a few minutes...')
    input_path = absolute(File.DATASET)
    output_path = absolute(File.SHRUNK_DATASET)

    # The input dataset is too large to be fully loaded into memory,
    # so it is processed one row at a time.
    csv.field_size_limit(CSV_FIELD_LIMIT)
    with input_path.open() as input_file, output_path.open('w') as output_file:
        reader = csv.DictReader(input_file)
        fieldnames = ['repository_name', 'repository_language']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()
        for item in reader:
            if _ignore(item):
                continue

            smaller_item = {
                'repository_name': item['Name with Owner'],
                'repository_language': item['Language'],
            }
            writer.writerow(smaller_item)
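# NOTE: `_ignore` is not part of this listing. A sketch of the assumed
# filtering rule, reusing the raw column name seen in `shrink()`; the exact
# criteria (e.g. deleted or unnamed repositories) are guesses.
def _ignore(item: dict) -> bool:
    """Tell whether a raw dataset row should be skipped."""
    return not item.get('Name with Owner')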
def alter() -> None:
    LOGGER.info('Altering repositories list file')
    LOGGER.info('This operation might take a few minutes...')
    output_path = absolute(File.ALTERED_DATASET)
    df = load_csv(File.SHRUNK_DATASET)

    # Tag repositories with no language as Markdown repositories,
    # because most Github repositories have a Readme.md file.
    mask = df['repository_language'].isnull()
    df.loc[mask, 'repository_language'] = 'Markdown'

    # Too few repositories are tagged as SQL repositories. To mitigate this
    # problem, a list of known SQL repositories is flagged as such.
    sql_df = pd.read_csv(SQL_DATASET_PATH)
    mask = df['repository_name'].isin(sql_df['repository_name'])
    df.loc[mask, 'repository_language'] = 'SQL'

    df.to_csv(output_path, index=False)
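# NOTE: `load_csv` is used throughout this listing but defined elsewhere. A
# minimal sketch, assuming it simply wraps `pd.read_csv` around the working
# directory resolution done by `absolute`:
def load_csv(filename: str) -> pd.DataFrame:
    """Load one of the pipeline's CSV files into a dataframe."""
    return pd.read_csv(absolute(filename))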
def select() -> None:
    LOGGER.info('Choosing repositories per language')
    LOGGER.info('This operation might take a few minutes...')
    input_data = load_csv(File.ALTERED_DATASET)
    shuffled = input_data.sample(frac=1).reset_index(drop=True)
    max_repositories = Config.nb_repositories_per_language

    selected_list = []
    for language in Config.languages:
        filtered = shuffled[shuffled['repository_language'] == language]
        nb_found = len(filtered)
        nb_selected = min(nb_found, max_repositories)
        LOGGER.info(
            '%s repositories, found: %s, kept: %s',
            language, nb_found, nb_selected)

        if nb_selected < max_repositories:
            LOGGER.warning(
                '%s, not enough repositories, required: %s',
                language, max_repositories)

        if nb_selected == 0:
            continue

        selected = filtered[:nb_selected]
        selected_list.append(selected)

    if not selected_list:
        LOGGER.error('No repository found')
        raise RuntimeError('No repository found')

    output_path = absolute(File.SELECTED_REPOSITORIES)
    united = pd.concat(selected_list)
    united.to_csv(output_path, index=False)
def download() -> None:
    LOGGER.info('Retrieving repositories dataset (8GB)')
    LOGGER.info('This operation might take a lot of time...')
    destination = absolute(File.COMPRESSED_DATASET)
    download_file(DATASET_URL, destination)
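# NOTE: `download_file` is defined elsewhere. A streaming sketch using only
# the standard library, assuming the helper copies the response body to disk
# chunk by chunk (the dataset is about 8GB, so it must not be read at once):
import shutil
import urllib.request
from pathlib import Path

def download_file(url: str, destination: Path) -> None:
    """Stream a remote file to `destination`."""
    with urllib.request.urlopen(url) as response:
        with destination.open('wb') as output_file:
            shutil.copyfileobj(response, output_file)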