Example #1
def run_stats(graph_path: RichPath, output_path: RichPath):
    number_graphs, number_annotations, number_variables = 0, 0, 0
    annotation_table = Counter()
    data_generator = chain(
        *(g.read_as_jsonl()
          for g in graph_path.iterate_filtered_files_in_dir('*.jsonl.gz')))
    for data in data_generator:
        number_graphs += 1 if len(data['supernodes']) > 0 else 0
        number_variables += len(data['supernodes'])
        number_annotations += sum(
            1 for supernode in data['supernodes'].values()
            if supernode['annotation'] not in {None, 'None', 'Nothing', 'Any'})
        annotation_table.update((supernode['annotation']
                                 for supernode in data['supernodes'].values()
                                 if supernode['annotation'] not in
                                 {None, 'None', 'Nothing', 'Any'}))
    with open(output_path.to_local_path().path, "a") as f:
        f.write("Statistics for file: " + graph_path.to_local_path().path +
                "\n")
        f.write("Number of graphs: %d\n" % (number_graphs))
        f.write("Number of variables: %d\n" % (number_variables))
        f.write("Number of annotations: %d\n" % (number_annotations))
        f.write("Number of different annotations: %d\n" %
                (len(list(annotation_table))))
        f.write("\nFrequency distribution of annotations type:\n\n")
        for annotation, value in annotation_table.most_common():
            f.write("%s\t%d\n" % (annotation, value))
Example #2
def jsonl_to_df(input_folder: RichPath, sample_percent: float, files_remaining: dict, azure_info_path) -> Tuple[pd.DataFrame, dict]:
    "Concatenates a random sample of the jsonl files from the path and returns them as a single pandas.DataFrame, together with a map from file to the projects it contains."

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    all_files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    
    sample_size = math.ceil(sample_percent * len(all_files))

    if sample_size > len(files_remaining):
        sample_size = len(files_remaining)
    # random.sample requires a sequence (not a dict view) on Python 3.11+
    files = random.sample(list(files_remaining), sample_size)
    replaced = [0 for x in range(sample_size)]

    # Re-draw any file that has already been grouped with all the other sampled files,
    # and repeat until at most one file had to be replaced in a pass.
    while True:
        for i in range(len(files)):
            f = files[i]
            other_files = {x for x in files if x != f}
            if f not in files_remaining or len(set.intersection(files_remaining[f], other_files)) == len(files) - 1:
                replaced[i] = 1
                f = random.sample(list(files_remaining), 1)
                while f[0] in files:
                    f = random.sample(list(files_remaining), 1)
                files[i] = f[0]
            else:
                replaced[i] = 0
        if sum(replaced) < 2:
            break

    for f in files:
        files_remaining[f] = files_remaining[f].union({x for x in files if x != f})
        if len(files_remaining[f]) == len(all_files) - 1:
            del files_remaining[f]
        # Persist the updated bookkeeping after each file is processed.
        with open('files_remaining.txt', 'w+') as out_file:
            files_remaining_converted = {}

            for path in files_remaining:
                files_remaining_converted[path] = list(files_remaining[path])

            print(json.dumps(files_remaining_converted), file=out_file)

    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    project_map = {x:[] for x in files}
    print(project_map)
    for f in tqdm(files, total=len(files)):
        rich_f = RichPath.create(f, azure_info_path)
        lines = list(rich_f.read_as_jsonl(error_handling=lambda m,e: print(f'Error while loading {m} : {e}')))
        lines_with_docstrings = []

        for line in lines:
            if len(line['docstring_tokens']) > 0:
                lines_with_docstrings.append(line)
                
                if line['nwo'] not in project_map[str(rich_f)]:
                    project_map[str(rich_f)].append(line['nwo'])
        
        dfs.append(pd.DataFrame(lines_with_docstrings))
    return pd.concat(dfs), project_map
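A hedged driver sketch for the sampling variant above. The directory and the sample fraction are placeholders, and files_remaining is assumed to map each candidate file path to the set of files it has already been grouped with (empty sets on a first run).

from dpu_utils.utils import RichPath

input_dir = RichPath.create("data/jsonl")  # placeholder directory of *.jsonl.gz files
files_remaining = {f.path: set()
                   for f in input_dir.iterate_filtered_files_in_dir('*.jsonl.gz')}
df, project_map = jsonl_to_df(input_dir, sample_percent=0.1,
                              files_remaining=files_remaining, azure_info_path=None)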
Example #3
def jsonl_to_df(input_folder: RichPath) -> pd.DataFrame:
    "Concatenates all jsonl files from path and returns them as a single pandas.DataFrame ."

    assert input_folder.is_dir(), 'Argument supplied must be a directory'
    dfs = []
    files = list(input_folder.iterate_filtered_files_in_dir('*.jsonl.gz'))
    assert files, 'There were no jsonl.gz files in the specified directory.'
    print(f'reading files from {input_folder.path}')
    for f in tqdm(files, total=len(files)):
        dfs.append(pd.DataFrame(list(f.read_as_jsonl(error_handling=lambda m,e: print(f'Error while loading {m} : {e}')))))
    return pd.concat(dfs)
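A one-line usage sketch of this simpler variant; the directory path is a placeholder.

from dpu_utils.utils import RichPath

df = jsonl_to_df(RichPath.create("data/jsonl"))
print(df.shape)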
Example #4
    def __load_data(self, data_dir: RichPath, data_fold: DataFold) -> Iterator[GraphSample]:
        all_data_files = data_dir.iterate_filtered_files_in_dir("*.gz")

        max_num_files = self.params.get('max_num_data_files', None)
        if max_num_files is not None:
            all_data_files = sorted(all_data_files)[:max_num_files]
        else:
            all_data_files = list(all_data_files)
        print(" Loading VarMisuse data from %s [%i data files]." % (data_dir, len(all_data_files)))

        unsplittable_keywords = get_language_keywords('csharp')
        return _load_data(all_data_files,
                          unsplittable_keywords,
                          self.params['graph_node_label_max_num_chars'],
                          self.params['max_variable_candidates'],
                          self.params['add_self_loop_edges'])
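The method reads several hyperparameters from self.params; a hypothetical dict with the keys the snippet expects could look like the following (the values are illustrative guesses, not defaults taken from the source).

params = {
    "max_num_data_files": None,            # None: load every *.gz file in the directory
    "graph_node_label_max_num_chars": 19,  # truncate node labels to this many characters (guess)
    "max_variable_candidates": 5,          # candidate variables per sample (guess)
    "add_self_loop_edges": False,          # whether to add self-loop edges to each graph (guess)
}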
Example #5
def load_all_msgpack_l_gz(
    path: RichPath,
    shuffle: bool = False,
    take_only_first_n_files: Optional[int] = None,
    limit_num_yielded_elements: Optional[int] = None,
    rng: Optional[random.Random] = None,
) -> Iterator:
    """
    Iterate through all the elements of all the `.msgpack.l.gz` in a given directory.

    :param path:
    :param shuffle:
    :param take_only_first_n_files:
    :param limit_num_yielded_elements:
    :param rng:
    :return:
    """
    all_files = sorted(path.iterate_filtered_files_in_dir("*.msgpack.l.gz"))
    if take_only_first_n_files is not None:
        all_files = all_files[:take_only_first_n_files]
    if shuffle and rng is None:
        random.shuffle(all_files)
    elif shuffle:
        rng.shuffle(all_files)

    sample_idx = 0
    for msgpack_file in all_files:
        try:
            for element in load_msgpack_l_gz(
                    msgpack_file.to_local_path().path):
                if element is not None:
                    sample_idx += 1
                    yield element
                if limit_num_yielded_elements is not None and sample_idx > limit_num_yielded_elements:
                    return
        except Exception as e:
            print(f"Error loading {msgpack_file}: {e}.")