Example 1
0
def jsonl_to_df(input_folder: RichPath, sample_percent: float, files_remaining: dict, azure_info_path) -> tuple:
    """Sample jsonl.gz files from *input_folder* and concatenate their docstring-bearing
    rows into a single DataFrame, tracking which files have been sampled together.

    Parameters
    ----------
    input_folder : RichPath
        Directory containing ``*.jsonl.gz`` files.
    sample_percent : float
        Fraction of the directory's files to sample (rounded up with ``math.ceil``).
    files_remaining : dict
        Maps file path -> set of file paths it has already been sampled with.
        Mutated in place and persisted to ``files_remaining.txt`` as JSON.
    azure_info_path
        Passed through to ``RichPath.create`` (presumably Azure credential info —
        TODO confirm against caller).

    Returns
    -------
    tuple
        ``(pd.concat(...), project_map)``: the concatenated rows whose
        ``docstring_tokens`` are non-empty, and a map of sampled file path ->
        list of project names (``nwo``) encountered in that file.
    """
    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    all_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))

    sample_size = math.ceil(sample_percent * len(all_files))
    # Cannot sample more files than are still available.
    sample_size = min(sample_size, len(files_remaining))
    # BUG FIX: random.sample() requires a sequence; passing dict_keys raises
    # TypeError on Python >= 3.11 (deprecated since 3.9). Convert to a list first.
    files = random.sample(list(files_remaining), sample_size)
    replaced = [0] * sample_size

    # Re-draw until at most one file had to be replaced. A file is replaced when
    # it is no longer in files_remaining or has already been paired with every
    # other file in the current sample.
    while True:
        for i, f in enumerate(files):
            other_files = {x for x in files if x != f}
            if f not in files_remaining or len(files_remaining[f] & other_files) == len(files) - 1:
                replaced[i] = 1
                candidate = random.sample(list(files_remaining), 1)[0]
                while candidate in files:
                    candidate = random.sample(list(files_remaining), 1)[0]
                files[i] = candidate
            else:
                replaced[i] = 0
        if sum(replaced) < 2:
            break

    # Record the new pairings; drop files that have now been paired with all others.
    for f in files:
        files_remaining[f] = files_remaining[f].union({x for x in files if x != f})
        if len(files_remaining[f]) == len(all_files) - 1:
            del files_remaining[f]

    # BUG FIX: the original rewrote this file on every loop iteration AND shadowed
    # the loop variable `f` with the file handle. Write the final state once, with
    # a distinct name for the handle. Sets are converted to lists for JSON.
    with open('files_remaining.txt', 'w+') as state_file:
        files_remaining_converted = {path: list(paired) for path, paired in files_remaining.items()}
        print(json.dumps(files_remaining_converted), file=state_file)

    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    project_map = {x: [] for x in files}
    print(project_map)
    for f in tqdm(files, total=len(files)):
        rich_f = RichPath.create(f, azure_info_path)
        # Malformed records are logged and skipped by the error handler.
        lines = list(rich_f.read_as_jsonl(error_handling=lambda m, e: print(f'Error while loading {m} : {e}')))
        lines_with_docstrings = []

        for line in lines:
            # Keep only rows that actually have a docstring.
            if len(line['docstring_tokens']) > 0:
                lines_with_docstrings.append(line)

                if line['nwo'] not in project_map[str(rich_f)]:
                    project_map[str(rich_f)].append(line['nwo'])

        dfs.append(pd.DataFrame(lines_with_docstrings))
    return pd.concat(dfs), project_map
Example 2
0
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
    """Read every ``*.jsonl.gz`` file in *input_folder* and return their rows
    concatenated into one pandas.DataFrame."""

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    jsonl_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    assert jsonl_files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')

    def _load(path):
        # Read one jsonl.gz file; malformed records are logged and skipped.
        records = path.read_as_jsonl(error_handling=lambda m, e: print(f'Error while loading {m} : {e}'))
        return pd.DataFrame(list(records))

    frames = [_load(path) for path in tqdm(jsonl_files, total=len(jsonl_files))]
    return pd.concat(frames)