def jsonl_to_df(input_folder: RichPath, sample_percent: float, files_remaining: dict,
                azure_info_path) -> "tuple[pd.DataFrame, dict]":
    """Sample jsonl.gz files from *input_folder* and load them into a DataFrame.

    Draws a random sample of files (``sample_percent`` of all ``*.jsonl.gz``
    files in the directory, capped at ``len(files_remaining)``), re-drawing any
    file that has already been paired with all the other sampled files, then
    records the new pairings back into ``files_remaining`` (and persists it to
    ``files_remaining.txt`` as JSON). Only records with a non-empty
    ``docstring_tokens`` field are kept.

    Args:
        input_folder: Directory containing ``*.jsonl.gz`` files.
        sample_percent: Fraction of all files to sample (ceil-rounded).
        files_remaining: Maps file path -> set of file paths it has already
            been sampled together with. Mutated in place; fully-paired files
            are removed.
        azure_info_path: Passed through to ``RichPath.create``.

    Returns:
        A tuple ``(df, project_map)`` where ``df`` is the concatenation of all
        kept records and ``project_map`` maps each sampled file path to the
        list of ``nwo`` (project) names seen in it.

    Side effects:
        Overwrites ``files_remaining.txt`` in the working directory.
    """
    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    all_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    sample_size = math.ceil(sample_percent * len(all_files))
    # Cannot sample more files than are still eligible.
    if sample_size > len(files_remaining):
        sample_size = len(files_remaining)
    # random.sample requires a sequence; dict views raise TypeError on 3.11+.
    files = random.sample(list(files_remaining.keys()), sample_size)
    replaced = [0] * sample_size
    # Keep re-drawing until at most one file needed replacing in a full pass:
    # a file is replaced when it is no longer eligible or is already paired
    # with every other file in the current sample.
    while True:
        for i in range(len(files)):
            current = files[i]
            other_files = {x for x in files if x != current}
            if (current not in files_remaining
                    or len(files_remaining[current].intersection(other_files)) == len(files) - 1):
                replaced[i] = 1
                draw = random.sample(list(files_remaining.keys()), 1)
                while draw[0] in files:
                    draw = random.sample(list(files_remaining.keys()), 1)
                files[i] = draw[0]
            else:
                replaced[i] = 0
        if sum(replaced) < 2:
            break
    # Record the new pairings; drop files that are now paired with everything.
    for f in files:
        files_remaining[f] = files_remaining[f].union({x for x in files if x != f})
        if len(files_remaining[f]) == len(all_files) - 1:
            del files_remaining[f]
    # Persist progress (sets are not JSON-serializable, so convert to lists).
    with open('files_remaining.txt', 'w') as out_file:
        files_remaining_converted = {path: list(paired)
                                     for path, paired in files_remaining.items()}
        print(json.dumps(files_remaining_converted), file=out_file)
    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    project_map = {x: [] for x in files}
    print(project_map)
    for f in tqdm(files, total=len(files)):
        rich_f = RichPath.create(f, azure_info_path)
        lines = list(rich_f.read_as_jsonl(
            error_handling=lambda m, e: print(f'Error while loading {m} : {e}')))
        lines_with_docstrings = []
        for line in lines:
            if len(line['docstring_tokens']) > 0:
                lines_with_docstrings.append(line)
                # NOTE(review): project tracking kept under the docstring check
                # (only projects contributing kept rows) — confirm intent; the
                # original formatting made the nesting ambiguous.
                if line['nwo'] not in project_map[str(rich_f)]:
                    project_map[str(rich_f)].append(line['nwo'])
        dfs.append(pd.DataFrame(lines_with_docstrings))
    return pd.concat(dfs), project_map
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
    """Load every ``*.jsonl.gz`` file in *input_folder* and return the records
    concatenated into a single :class:`pandas.DataFrame`.

    Args:
        input_folder: Directory to scan for ``*.jsonl.gz`` files.

    Returns:
        One DataFrame containing the rows of all matched files.

    Raises:
        AssertionError: If the path is not a directory or no files match.
    """
    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    matched = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    assert matched, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')

    def _load(path):
        # Malformed records are reported and skipped rather than raising.
        rows = path.read_as_jsonl(
            error_handling=lambda m, e: print(f'Error while loading {m} : {e}'))
        return pd.DataFrame(list(rows))

    frames = [_load(p) for p in tqdm(matched, total=len(matched))]
    return pd.concat(frames)