import os
from collections import Counter, defaultdict
from typing import Dict, Optional

# NOTE: project-specific helpers and constants used below (logger, dump_list,
# read_list, count_occurences_across_lists, fractions_manager, PrepConfig,
# calc_stats, get_stats, merge_vocab, the *_FILE_NAME / *_DIR constants, etc.)
# come from the project's own modules, which are not shown in this excerpt.


def calc_cross_occurences(merges_list, path_to_save: Optional[str] = None) -> Counter:
    # Count, for every merge, in how many of the given merge lists it occurs.
    occ_accross_lists = count_occurences_across_lists(merges_list)
    if path_to_save:
        # Persist as "<merge> <count>" lines, most frequent first.
        dump_list(
            [f'{merge} {count}' for merge, count in
             sorted(occ_accross_lists.items(), key=lambda s: s[1], reverse=True)],
            path_to_save)
    return occ_accross_lists
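# `count_occurences_across_lists` is imported from elsewhere in the project;
# a plausible minimal implementation, assuming it counts in how many of the
# given merge lists each merge occurs (at most once per list), would be:
#
#     def count_occurences_across_lists(merges_list):
#         counts = Counter()
#         for merges in merges_list:
#             counts.update(set(merges))  # each list contributes at most 1
#         return counts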
def calc_cross_occurences_summary(occ_accross_lists: Counter,
                                  path_to_save: Optional[str] = None) -> Dict[int, float]:
    # Bucket merges by the number of lists they occur in: a merge occurring in
    # v lists contributes v merge instances to bucket v.
    summary = defaultdict(int)
    for k, v in occ_accross_lists.items():
        summary[v] += v
    total_merges_across_fractions = sum(summary.values())
    # Normalise each bucket by the total number of merge instances.
    summary_for_1_list = {}
    for k, v in summary.items():
        summary_for_1_list[k] = float(v) / total_merges_across_fractions
    if path_to_save:
        # Persist as "<bucket> <fraction>" lines, largest bucket first.
        dump_list(
            [f'{k} {v}' for k, v in
             sorted(summary_for_1_list.items(), key=lambda s: s[0], reverse=True)],
            path_to_save)
    return summary_for_1_list
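# A worked toy example of the aggregation above (values are illustrative):
# with occ_accross_lists = Counter({'a b': 3, 'c d': 3, 'e f': 1}), i.e. two
# merges each occurring in 3 lists and one merge occurring in a single list,
# the buckets are summary = {3: 3 + 3, 1: 1} = {3: 6, 1: 1}, the grand total
# is 7 merge instances, and the function returns
# {3: 6/7 ≈ 0.857, 1: 1/7 ≈ 0.143}.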
def run(dataset, repr, threshold):
    PrepConfig.assert_classification_config(repr)

    path_to_classification = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset, CLASSIFICATION_DIR)
    dest_dir = os.path.join(path_to_classification, CLASSIFICATION_TYPE, repr)
    logger.info(f"Getting stats for {dest_dir}")
    logger.info(f"Ignoring projects where the percentage of files that contain logging "
                f"is less than {threshold} %")

    projects_to_ignore, logged_stats = calc_stats(dest_dir, threshold)
    for i, p in enumerate(projects_to_ignore):
        logger.info(f"{i}: {p}")
    logger.info("")
    logger.info(logged_stats)

    output_file_path = os.path.join(path_to_classification, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    dump_list(projects_to_ignore, output_file_path)
    logger.info(f"Ignored projects with threshold {threshold} % were written to {output_file_path}")
    logger.info(f"Total ignored projects: {len(projects_to_ignore)}")
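# A hypothetical invocation of the `run` above (argument values are
# illustrative; valid dataset names and representation ids depend on the
# project's conventions and on PrepConfig):
#
#     run(dataset='java-projects', repr='101011', threshold=50)
#
# This would log per-project stats and write the list of projects in which
# fewer than 50 % of files contain logging to a file named
# f'{IGNORED_PROJECTS_FILE_NAME}.50' under the classification directory.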
def run(dataset: str, repr: str, n_merges: int, reset: bool, percent: float, start_from: float) -> None:
    bpe_dir_prefix = fractions_manager.get_percent_prefix(percent, start_from)
    bpe_dir_prefix = '' if bpe_dir_prefix == '100_' else bpe_dir_prefix

    base_dir = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset, METADATA_DIR, repr)
    if reset:
        starting_from_scratch = True
        archive_existing_common_bpe_folder(base_dir)
    else:
        logger.info("Using existing merges...")
        most_recent_bpe_dir = get_most_recent_bpe_dir(base_dir, bpe_dir_prefix)
        if not most_recent_bpe_dir:
            logger.warning("Existing merges not found")
            starting_from_scratch = True
        else:
            all_vocab = read_dict_from_2_columns(
                os.path.join(most_recent_bpe_dir, REASSEMBLED_VOCAB_FILE_NAME))
            vocab, non_splitable_vocab = separate_non_splittable_vocab(all_vocab, from_reassambled=True)
            merges = read_list(os.path.join(most_recent_bpe_dir, MERGES_FILE_NAME))
            starting_from_scratch = False

    if starting_from_scratch:
        logger.info("Starting the encoding from scratch...")
        all_vocab = read_dict_from_2_columns(os.path.join(base_dir, f'{bpe_dir_prefix}{VOCAB_FILE_NAME}'))
        vocab, non_splitable_vocab = separate_non_splittable_vocab(all_vocab, from_reassambled=False)
        merges = []

    pairs = get_stats(vocab)
    n_done_merges = len(merges)
    for i in range(n_merges):
        try:
            best, occurences = pairs.pop_pair()
            logger.info(f'Processing pair number {n_done_merges + i + 1}: {best}')
            merges.append((best[0], best[1], str(occurences)))
        except KeyError:
            # No pairs left to merge.
            break
        vocab = merge_vocab(best, vocab, pairs)

    # Re-attach the tokens that must never be split.
    for k, v in non_splitable_vocab.items():
        vocab[k] = v

    # Aggregate subword frequencies over the (partially merged) vocabulary.
    resulting_vocab = defaultdict(int)
    for entry, frequency in vocab.items():
        for subword in entry.split(" "):
            resulting_vocab[subword] += frequency
    resulting_vocab_sorted = sorted(resulting_vocab.items(), key=lambda x: x[1], reverse=True)

    # Cache the final segmentation of every word: word -> list of subwords.
    merges_cache = {}
    for entry, frequency in vocab.items():
        subword_list = entry.split(' ')
        key = ''.join(subword_list)
        merges_cache[key] = subword_list

    new_bpe_dir = os.path.join(base_dir, f'{bpe_dir_prefix}{BPE_DIR}', str(len(merges)))
    if os.path.exists(new_bpe_dir):
        raise AssertionError(f'Dir {new_bpe_dir} already exists. Something went wrong. '
                             f'Check the contents of the {os.path.join(base_dir, BPE_DIR)} folder.')
    os.makedirs(new_bpe_dir)

    dump_list(merges, os.path.join(new_bpe_dir, MERGES_FILE_NAME))
    dump_dict_into_2_columns(vocab, os.path.join(new_bpe_dir, REASSEMBLED_VOCAB_FILE_NAME))
    dump_dict_into_2_columns(merges_cache, os.path.join(new_bpe_dir, MERGES_CACHE_FILE_NAME), val_type=list)
    dump_dict_into_2_columns(resulting_vocab_sorted, os.path.join(new_bpe_dir, RESULTING_VOCAB_FILE_NAME))
    logger.info(f'BPE output files are saved to the {new_bpe_dir} folder')
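# ---------------------------------------------------------------------------
# The project's `get_stats`, `pop_pair`, and `merge_vocab` are not shown in
# this excerpt. The sketch below is a minimal, self-contained version of the
# classic BPE merge loop they presumably follow (Sennrich et al., 2016);
# the function names and the toy vocabulary are illustrative only.

def _bpe_pair_stats(vocab: Dict[str, int]) -> Counter:
    """Count adjacent symbol pairs, weighted by word frequency."""
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for pair in zip(symbols, symbols[1:]):
            pairs[pair] += freq
    return pairs


def _bpe_apply_merge(pair, vocab: Dict[str, int]) -> Dict[str, int]:
    """Replace every adjacent occurrence of `pair` with its concatenation."""
    first, second = pair
    merged = first + second
    new_vocab = {}
    for word, freq in vocab.items():
        symbols = word.split()
        out, i = [], 0
        while i < len(symbols):
            if i + 1 < len(symbols) and symbols[i] == first and symbols[i + 1] == second:
                out.append(merged)
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        new_vocab[' '.join(out)] = freq
    return new_vocab


if __name__ == '__main__':
    # Toy run: 'w e' (frequency 8) is merged first, then 'l o' (frequency 7).
    toy_vocab = {'l o w': 5, 'l o w e r': 2, 'n e w e s t': 6}
    for _ in range(2):
        stats = _bpe_pair_stats(toy_vocab)
        if not stats:
            break
        best_pair = stats.most_common(1)[0][0]
        toy_vocab = _bpe_apply_merge(best_pair, toy_vocab)
    print(toy_vocab)  # {'lo w': 5, 'lo we r': 2, 'n e we s t': 6}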