def run_stats(graph_path: RichPath, output_path: RichPath):
    """Collect corpus statistics over all `.jsonl.gz` graph files and append a report to `output_path`."""
    number_graphs, number_annotations, number_variables = 0, 0, 0
    annotation_table = Counter()
    data_generator = chain(
        *(g.read_as_jsonl() for g in graph_path.iterate_filtered_files_in_dir('*.jsonl.gz')))
    for data in data_generator:
        number_graphs += 1 if len(data['supernodes']) > 0 else 0
        number_variables += len(data['supernodes'])
        # Count only informative annotations, i.e. skip missing and trivial ones.
        number_annotations += sum(
            1 for supernode in data['supernodes'].values()
            if supernode['annotation'] not in {None, 'None', 'Nothing', 'Any'})
        annotation_table.update(
            supernode['annotation'] for supernode in data['supernodes'].values()
            if supernode['annotation'] not in {None, 'None', 'Nothing', 'Any'})
    with open(output_path.to_local_path().path, "a") as f:
        f.write("Statistics for file: " + graph_path.to_local_path().path + "\n")
        f.write("Number of graphs: %d\n" % number_graphs)
        f.write("Number of variables: %d\n" % number_variables)
        f.write("Number of annotations: %d\n" % number_annotations)
        f.write("Number of different annotations: %d\n" % len(annotation_table))
        f.write("\nFrequency distribution of annotation types:\n\n")
        for annotation, value in annotation_table.most_common():
            f.write("%s\t%d\n" % (annotation, value))
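# A minimal usage sketch for `run_stats`, assuming the dpu_utils package
# (which provides RichPath) is installed; the paths below are hypothetical.
from dpu_utils.utils import RichPath

graphs_dir = RichPath.create('data/graphs')       # directory of *.jsonl.gz graph files
report_file = RichPath.create('stats_report.txt') # statistics are appended here
run_stats(graphs_dir, report_file)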
def jsonl_to_df(input_folder: RichPath, sample_percent: float,
                files_remaining: Dict[str, Set[str]],
                azure_info_path: str) -> Tuple[pd.DataFrame, Dict[str, List[str]]]:
    """Concatenate a sample of the jsonl.gz files from a directory into a single pandas.DataFrame.

    Only rows with a non-empty `docstring_tokens` field are kept. Returns the
    DataFrame together with a map from each sampled file to the projects
    (`nwo`) it contains. `files_remaining` maps each not-yet-exhausted file to
    the set of files it has already been sampled alongside; it is updated in
    place and persisted to `files_remaining.txt`.
    """
    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    all_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    sample_size = math.ceil(sample_percent * len(all_files))
    if sample_size > len(files_remaining):
        sample_size = len(files_remaining)
    files = random.sample(list(files_remaining.keys()), sample_size)
    replaced = [0] * sample_size
    # Re-draw sampled files until at most one of them had to be replaced,
    # i.e. almost every file in the sample can still form new pairings.
    while True:
        for i in range(len(files)):
            f = files[i]
            other_files = {x for x in files if x != f}
            if f not in files_remaining or \
                    len(set.intersection(files_remaining[f], other_files)) == len(files) - 1:
                replaced[i] = 1
                candidate = random.sample(list(files_remaining.keys()), 1)
                while candidate[0] in files:
                    candidate = random.sample(list(files_remaining.keys()), 1)
                files[i] = candidate[0]
            else:
                replaced[i] = 0
        if sum(replaced) < 2:
            break
    # Record the new pairings; drop files that have now been paired with every other file.
    for f in files:
        files_remaining[f] = files_remaining[f].union({x for x in files if x != f})
        if len(files_remaining[f]) == len(all_files) - 1:
            del files_remaining[f]
    with open('files_remaining.txt', 'w+') as out:
        files_remaining_converted = {path: list(members) for path, members in files_remaining.items()}
        print(json.dumps(files_remaining_converted), file=out)
    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    project_map = {x: [] for x in files}
    print(project_map)
    for f in tqdm(files, total=len(files)):
        rich_f = RichPath.create(f, azure_info_path)
        lines = list(rich_f.read_as_jsonl(
            error_handling=lambda m, e: print(f'Error while loading {m}: {e}')))
        lines_with_docstrings = []
        for line in lines:
            if len(line['docstring_tokens']) > 0:
                lines_with_docstrings.append(line)
            if line['nwo'] not in project_map[str(rich_f)]:
                project_map[str(rich_f)].append(line['nwo'])
        dfs.append(pd.DataFrame(lines_with_docstrings))
    return pd.concat(dfs), project_map
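# A minimal usage sketch for the sampling variant of `jsonl_to_df` above,
# assuming dpu_utils is installed. `files_remaining` starts with an empty
# pairing set for every file; the container URL and azure-info file below are
# hypothetical.
from dpu_utils.utils import RichPath

input_folder = RichPath.create('azure://container/python/', 'azure_info.json')
all_paths = [str(p) for p in input_folder.iterate_filtered_files_in_dir('*.jsonl.gz')]
files_remaining = {path: set() for path in all_paths}

df, project_map = jsonl_to_df(input_folder, sample_percent=0.1,
                              files_remaining=files_remaining,
                              azure_info_path='azure_info.json')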
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
    """Concatenate all jsonl.gz files from a directory into a single pandas.DataFrame."""
    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    for f in tqdm(files, total=len(files)):
        dfs.append(pd.DataFrame(list(f.read_as_jsonl(
            error_handling=lambda m, e: print(f'Error while loading {m}: {e}')))))
    return pd.concat(dfs)
def __load_data(self, data_dir: RichPath, data_fold: DataFold) -> Iterator[GraphSample]:
    all_data_files = data_dir.iterate_filtered_files_in_dir("*.gz")
    max_num_files = self.params.get('max_num_data_files', None)
    if max_num_files is not None:
        all_data_files = sorted(all_data_files)[:max_num_files]
    else:
        all_data_files = list(all_data_files)
    print(" Loading VarMisuse data from %s [%i data files]." % (data_dir, len(all_data_files)))
    unsplittable_keywords = get_language_keywords('csharp')
    return _load_data(all_data_files,
                      unsplittable_keywords,
                      self.params['graph_node_label_max_num_chars'],
                      self.params['max_variable_candidates'],
                      self.params['add_self_loop_edges'])
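# A hypothetical `params` dictionary consumed by `__load_data` above, listing
# only the keys the method actually reads; the values are illustrative, not
# the defaults of any particular model.
params = {
    'max_num_data_files': None,           # load every *.gz file when None
    'graph_node_label_max_num_chars': 19, # truncate node labels to this length
    'max_variable_candidates': 5,         # candidate variables per slot
    'add_self_loop_edges': True,          # add node -> node edges to each graph
}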
def load_all_msgpack_l_gz(
    path: RichPath,
    shuffle: bool = False,
    take_only_first_n_files: Optional[int] = None,
    limit_num_yielded_elements: Optional[int] = None,
    rng: Optional[random.Random] = None,
) -> Iterator:
    """
    Iterate through all the elements of all the `.msgpack.l.gz` files in a given directory.

    :param path: the directory containing the `.msgpack.l.gz` files.
    :param shuffle: if True, visit the files in a random order.
    :param take_only_first_n_files: if not None, consider only the first n files (in sorted order).
    :param limit_num_yielded_elements: if not None, stop after yielding this many elements.
    :param rng: the random number generator used for shuffling; falls back to the module-level one.
    :return: an iterator over the deserialized elements.
    """
    all_files = sorted(path.iterate_filtered_files_in_dir("*.msgpack.l.gz"))
    if take_only_first_n_files is not None:
        all_files = all_files[:take_only_first_n_files]
    if shuffle and rng is None:
        random.shuffle(all_files)
    elif shuffle:
        rng.shuffle(all_files)

    sample_idx = 0
    for msgpack_file in all_files:
        try:
            for element in load_msgpack_l_gz(msgpack_file.to_local_path().path):
                if element is not None:
                    sample_idx += 1
                    yield element
                    # Stop once exactly the requested number of elements has been yielded.
                    if limit_num_yielded_elements is not None and sample_idx >= limit_num_yielded_elements:
                        return
        except Exception as e:
            print(f"Error loading {msgpack_file}: {e}.")
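# `load_msgpack_l_gz` is referenced above but not shown; this is a minimal
# sketch of such a helper, assuming each file is a gzip-compressed stream of
# concatenated msgpack objects that `msgpack.Unpacker` can consume incrementally.
import gzip
from typing import Any, Iterator

import msgpack

def load_msgpack_l_gz(filename: str) -> Iterator[Any]:
    with gzip.open(filename, "rb") as f:
        # The Unpacker reads the stream lazily, so large files are never
        # fully materialized in memory.
        for element in msgpack.Unpacker(f, raw=False):
            yield element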