def create_entity_packs(processed_folder: ProcessedFolder, pack_size: int) -> List[Tuple[int, List]]:
    # add a flag to force recalc
    if os.path.exists(processed_folder.entity_packs(pack_size)):
        # Do not load if the flag is set
        print("Loading packs for each entity")
        return pickle.load(open(processed_folder.entity_packs(pack_size), 'rb'))

    print("Creating packs for each entity")
    _, _, author_to_changes, _ = compute_occurrences(processed_folder)
    packs = []
    for author, changes in author_to_changes.items():
        np.random.shuffle(changes)
        while len(changes) % pack_size != 0:
            changes.append(np.random.choice(changes))
        for s in range(0, len(changes), pack_size):
            if s + pack_size <= len(changes):
                packs.append((author, changes[s:s + pack_size]))
    pickle.dump(packs, open(processed_folder.entity_packs(pack_size), 'wb'))
    print("Packs saved on disk")
    return packs

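# The "force recalc" flag asked for in the TODO above could look like the sketch
# below. This is an assumption, not part of the project's API: a hypothetical
# wrapper that removes the cached pack file before delegating to
# create_entity_packs, so the packs are rebuilt from scratch.
def create_entity_packs_forced(processed_folder: ProcessedFolder, pack_size: int,
                               force_recalc: bool = False) -> List[Tuple[int, List]]:
    cache_path = processed_folder.entity_packs(pack_size)
    if force_recalc and os.path.exists(cache_path):
        # Invalidate the on-disk cache so create_entity_packs recomputes the packs
        os.remove(cache_path)
    return create_entity_packs(processed_folder, pack_size)
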
def time_split(processed_folder: ProcessedFolder, n_time_buckets: int, uniform_distribution: bool) -> Dict:
    if os.path.exists(processed_folder.time_buckets_split(n_time_buckets)):
        print("Loading split into time-separated buckets")
        return pickle.load(open(processed_folder.time_buckets_split(n_time_buckets), 'rb'))

    print("Splitting into time-separated buckets")
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorTime"],
                                  squeeze=True)
    change_metadata.sort_values(inplace=True)
    author_occurrences, change_occurrences, author_to_changes, total_count = compute_occurrences(processed_folder)
    change_entities = resolve_entities(processed_folder)
    if uniform_distribution:
        change_to_time_bucket, bucket_to_timestamps = uni_distribution(
            author_occurrences, change_metadata, change_occurrences, n_time_buckets, change_entities)
    else:
        change_to_time_bucket, bucket_to_timestamps = continuous_distribution(
            change_metadata, change_occurrences, n_time_buckets, total_count)
    bucket_to_timestamps.to_csv(processed_folder.time_buckets_range(n_time_buckets), index=False)
    pickle.dump(change_to_time_bucket, open(processed_folder.time_buckets_split(n_time_buckets), 'wb'))
    print("Buckets saved on disk")
    return change_to_time_bucket

def time_split(processed_folder: ProcessedFolder, n_time_buckets: int) -> Dict:
    if os.path.exists(processed_folder.time_buckets_split(n_time_buckets)):
        print("Loading split into time-separated buckets")
        return pickle.load(open(processed_folder.time_buckets_split(n_time_buckets), 'rb'))

    print("Splitting into time-separated buckets")
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorTime"],
                                  squeeze=True)
    change_metadata.sort_values(inplace=True)
    _, change_occurrences, author_to_changes, total_count = compute_occurrences(processed_folder)

    bucket_size = total_count // n_time_buckets + 1
    change_to_time_bucket = {}
    cur_changes = 0
    cur_bucket = 0
    bucket_indices = [i for i in range(1, n_time_buckets + 1)]
    bucket_start_times = [None for _ in range(n_time_buckets)]
    bucket_finish_times = [None for _ in range(n_time_buckets)]
    for change_id in change_metadata.index:
        cur_changes += change_occurrences[change_id]
        change_to_time_bucket[change_id] = cur_bucket
        if bucket_start_times[cur_bucket] is None:
            bucket_start_times[cur_bucket] = change_metadata.loc[change_id]
        bucket_finish_times[cur_bucket] = change_metadata.loc[change_id]
        while cur_changes >= bucket_size:
            cur_bucket += 1
            cur_changes -= bucket_size

    bucket_to_timestamps = pd.DataFrame(data={
        'start_time': bucket_start_times,
        'finish_time': bucket_finish_times
    }, index=bucket_indices)
    bucket_to_timestamps['start_date'] = bucket_to_timestamps['start_time'].map(
        lambda tstamp: datetime.fromtimestamp(tstamp))
    bucket_to_timestamps['finish_date'] = bucket_to_timestamps['finish_time'].map(
        lambda tstamp: datetime.fromtimestamp(tstamp))
    bucket_to_timestamps.to_csv(processed_folder.time_buckets_range(n_time_buckets))
    pickle.dump(change_to_time_bucket, open(processed_folder.time_buckets_split(n_time_buckets), 'wb'))
    print("Buckets saved on disk")
    return change_to_time_bucket

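# Illustration of the bucket arithmetic above on synthetic numbers (an assumption:
# toy values with one path occurrence per change, not project data): with
# total_count = 10 and n_time_buckets = 3, bucket_size = 10 // 3 + 1 = 4, so the
# changes, visited in authorTime order, are assigned to buckets
# 0, 0, 0, 0, 1, 1, 1, 1, 2, 2.
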
def run_preprocessing(n_time_buckets: int,
                      min_context_train: float,
                      max_context_train: float,
                      min_count: int,
                      max_count: int,
                      interactive: bool,
                      random_seed: int = 239,
                      projects_file: str = None,
                      project_folder: str = None):
    fix_seed(random_seed)
    if project_folder is not None:
        process_folder(ProcessedFolder(project_folder), n_time_buckets, min_context_train,
                       max_context_train, min_count, max_count, interactive)
    elif projects_file is not None:
        projects = [l.strip() for l in open(projects_file, "r").readlines()]
        for p in projects:
            process_folder(ProcessedFolder("../gitminer/out/" + p + "/"), n_time_buckets,
                           min_context_train, max_context_train, min_count, max_count, interactive)
    else:
        raise ValueError("Either project_folder or projects_file should be set")

def get_trained_model(processed_folder: ProcessedFolder, pack_size: int, embedding_size: int,
                      min_samples: int, n_run: int, total_runs: int,
                      mask_tokens: bool) -> Tuple[Model, List]:
    print("Gathering model configuration")
    author_occurrences, _, _, _ = compute_occurrences(processed_folder)
    filtered_authors = []
    for author, count in author_occurrences.most_common():
        if count >= min_samples:
            filtered_authors.append(author)
    print("{} authors have at least {} samples".format(len(filtered_authors), min_samples))

    n_tokens = processed_folder.n_tokens()
    n_paths = processed_folder.n_paths()
    print("Found {} tokens and {} paths".format(n_tokens, n_paths))

    load_path = os.path.join(
        processed_folder.trained_model_folder(pack_size, min_samples, mask_tokens), "model")
    config = Config.get_representation_config(dataset_folder=processed_folder.folder,
                                              load_path=load_path,
                                              changes_path=processed_folder.file_changes,
                                              n_tokens=n_tokens,
                                              n_paths=n_paths,
                                              n_entities=max(filtered_authors),
                                              embedding_size=embedding_size,
                                              pack_size=pack_size,
                                              n_run=n_run,
                                              total_runs=total_runs)
    code2vec_model = Model(config)
    if config.LOAD_PATH == '':
        print("Did not find a pretrained model")
        packs = create_entity_packs(processed_folder, pack_size)
        packs = [pack for pack in packs if pack[0] in filtered_authors]
        code2vec_model.train(packs, mask_tokens)
        print("Completed training")
    return code2vec_model, filtered_authors

def get_representations(processed_folder: ProcessedFolder, pack_size: int, embedding_size: int,
                        min_samples: int, n_time_buckets: int, n_run: int, total_runs: int,
                        mask_tokens: bool):
    if os.path.exists(processed_folder.vectorization_file(pack_size, min_samples)):
        print("Loading previously computed representations")
        return pd.read_csv(processed_folder.vectorization_file(pack_size, min_samples), index_col=0)

    code2vec_model, filtered_authors = get_trained_model(processed_folder, pack_size, embedding_size,
                                                         min_samples, n_run, total_runs, mask_tokens)
    change_authors = resolve_entities(processed_folder)
    change_to_time_bucket = time_split(processed_folder, n_time_buckets)
    print("Computing representations")
    code2vec_model.programmer_representation(
        processed_folder.vectorization_file(pack_size, min_samples),
        change_authors, change_to_time_bucket, filtered_authors, mask_tokens)
    print("Representations saved on disk")

    def dump(self, processed_folder: ProcessedFolder):
        pickle.dump(self.entity_dict, open(processed_folder.entity_dict, 'wb'))
        pickle.dump(self.reverse_dict, open(processed_folder.reversed_entity_dict, 'wb'))


def merge_aliases_naive(processed_folder: ProcessedFolder) -> dict:
    if os.path.exists(processed_folder.entity_dict):
        print("Loading merged entities")
        return pickle.load(open(processed_folder.entity_dict, 'rb'))

    print("Naively merging entities...")
    naive_merger = NaiveEntityMerger()
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorName", "authorEmail"])
    for index, row in change_metadata.iterrows():
        naive_merger.add_entity(row["authorName"], row["authorEmail"])
    naive_merger.dump(processed_folder)
    print("Merged entities saved on disk")
    return naive_merger.entity_dict


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    merge_aliases_naive(ProcessedFolder(args.data_folder))

parser = ArgumentParser()
parser.add_argument("--pack_size", type=int, required=True)
parser.add_argument("--embedding_size", type=int, required=True)
parser.add_argument("--min_samples", type=int, default=0)
parser.add_argument("--n_time_buckets", type=int, required=True)
parser.add_argument("--n_runs", type=int, required=True)
parser.add_argument("--init_run_number", type=int, default=1)
parser.add_argument("--mask_tokens", action="store_true")
args = parser.parse_args()

projects = [l.strip() for l in open("../../pythonminer/projects.txt", "r").readlines()]
for p in projects:
    project_folder = ProcessedFolder("../../pythonminer/out/" + p + "/", args.init_run_number)
    merge_aliases_bipartite(project_folder)
    for n_run in range(args.init_run_number, args.init_run_number + args.n_runs):
        project_folder.set_run_number(n_run)
        tf.reset_default_graph()
        fix_seed(n_run)
        get_representations(project_folder, args.pack_size, args.embedding_size, args.min_samples,
                            args.n_time_buckets, n_run, args.n_runs, args.mask_tokens)

            processed_folder.vectorization_file(pack_size, min_samples)):
        print("Loading previously computed representations")
        return pd.read_csv(processed_folder.vectorization_file(pack_size, min_samples), index_col=0)

    code2vec_model, filtered_authors = get_trained_model(processed_folder, pack_size, embedding_size,
                                                         min_samples, n_run, total_runs, mask_tokens)
    change_authors = resolve_entities(processed_folder)
    change_to_time_bucket = time_split(processed_folder, n_time_buckets)
    print("Computing representations")
    code2vec_model.programmer_representation(
        processed_folder.vectorization_file(pack_size, min_samples),
        change_authors, change_to_time_bucket, filtered_authors, mask_tokens)
    print("Representations saved on disk")


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--pack_size", type=int, required=True)
    parser.add_argument("--embedding_size", type=int, required=True)
    parser.add_argument("--min_samples", type=int, default=0)
    parser.add_argument("--n_time_buckets", type=int, required=True)
    # --mask_tokens is a boolean switch; argparse's type=bool would treat any
    # non-empty string (including "False") as True
    parser.add_argument("--mask_tokens", action="store_true")
    args = parser.parse_args()
    get_representations(ProcessedFolder(args.data_folder), args.pack_size, args.embedding_size,
                        args.min_samples, args.n_time_buckets, 0, 0, args.mask_tokens)

        for ent, maps in self.reverse_dict.items():
            fout.write("{},{},{}\n".format(ent, "|".join(maps["names"]), "|".join(maps["emails"])))


def merge_aliases_bipartite(processed_folder: ProcessedFolder) -> dict:
    if os.path.exists(processed_folder.entity_dict):
        print("Loading merged entities")
        return pickle.load(open(processed_folder.entity_dict, 'rb'))

    print("Merging entities by bipartite strategy...")
    bipartite_merger = BipartiteEntityMerger()
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorName", "authorEmail"])
    for index, row in change_metadata.iterrows():
        bipartite_merger.add_entity(row["authorName"], row["authorEmail"])
    bipartite_merger.run_matching()
    bipartite_merger.dump(processed_folder)
    print("Merged entities saved on disk")
    return bipartite_merger.entity_dict


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    merge_aliases_bipartite(ProcessedFolder(args.data_folder))

        if row['pathsCountBefore'] > 0 or row['pathsCountAfter'] == 0:
            continue
        author = resolved_entities.loc[row['changeId']]
        author_occurrences[author] += 1
        change_occurrences[row['changeId']] += 1
        if author not in author_to_changes:
            author_to_changes[author] = []
        author_to_changes[author].append(total_count)
        total_count += 1

    for i, (author, count) in enumerate(author_occurrences.most_common()):
        print(f"#{i + 1} entity: {author} -> {count}")

    pickle.dump(author_occurrences, open(processed_folder.author_occurrences, 'wb'))
    pickle.dump(change_occurrences, open(processed_folder.change_occurrences, 'wb'))
    pickle.dump(author_to_changes, open(processed_folder.author_to_changes, 'wb'))
    print("Occurrences saved on disk")
    return author_occurrences, change_occurrences, author_to_changes, total_count


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    compute_occurrences(ProcessedFolder(args.data_folder))

                lambda path: change_entities[int(pathlib.Path(path).name.split('_')[0])],
                files))),
        author_occurrences)

    print("Computing mutual info")
    print(feature_values.shape)
    with Parallel(n_jobs=-1) as pool:
        part_size = 1000
        m = dataset.feature_values.shape[1]
        mutual_info_parts = pool(
            delayed(mutual_info_classif)(dataset.feature_values[:, i:i + part_size],
                                         dataset.authors,
                                         random_state=0)
            for i in tqdm(range(0, m, part_size)))
        mutual_info = np.concatenate(mutual_info_parts)
        mutual_info /= np.max(mutual_info)

    pickle.dump(dataset, open(processed_folder.caliskan_dataset, 'wb'))
    pickle.dump(mutual_info, open(processed_folder.caliskan_mutual_info, 'wb'))
    print("Extracted data dumped on disk")
    return dataset, mutual_info


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    compute_caliskan_features(ProcessedFolder(args.data_folder))

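# Note on the chunked mutual-information computation above (synthetic numbers,
# purely illustrative): with m = 2500 feature columns and part_size = 1000,
# mutual_info_classif runs in parallel on the column slices [0, 1000),
# [1000, 2000) and [2000, 2500); the per-slice scores are concatenated back in
# the original column order and then normalised by their maximum.
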
def main(args):
    # if os.path.isfile(output_filename(args.config_file)):
    #     print("Already processed")
    #     exit(0)
    config = Config.fromyaml(args.config_file)
    if config.mode() == 'snapshot':
        project_folder = ProcessedSnapshotFolder(config.source_folder())
        change_entities = None
        author_occurrences = None
    else:
        project_folder = ProcessedFolder(config.source_folder())
        change_entities = resolve_entities(project_folder)
        author_occurrences, _, _, _ = compute_occurrences(project_folder)

    if config.mode() == 'time':
        change_to_time_bucket = time_split(project_folder, config.time_folds(), uniform_distribution=True)
    else:
        change_to_time_bucket = None

    if config.mode() == 'context':
        context_splits = context_split(project_folder, *config.min_max_count(), *config.min_max_train())
    else:
        context_splits = None

    if config.classifier_type() == 'nn':
        classifier = NNClassifier(config, project_folder, change_entities, change_to_time_bucket,
                                  config.min_max_count(), author_occurrences, context_splits)
    elif config.classifier_type() == 'rf':
        classifier = RFClassifier(config, project_folder, change_entities, change_to_time_bucket,
                                  config.min_max_count(), author_occurrences, context_splits)
    elif config.classifier_type() == 'caliskan':
        classifier = CaliskanClassifier(config, project_folder, change_entities, change_to_time_bucket,
                                        config.min_max_count(), context_splits)
    else:
        raise ValueError("Classifier type in config should be one of 'nn', 'rf' or 'caliskan'")

    if config.mode() == 'time':
        fold_indices = [(i, j) for i in range(config.time_folds())
                        for j in range(i + 1, config.time_folds())]
    elif config.mode() == 'context':
        fold_indices = [i for i in range(len(context_splits))]
    else:
        fold_indices = classifier.cross_validation_folds()

    mean, std, scores = classifier.run(fold_indices)
    print(f'{mean:.3f}+-{std:.3f}')
    for i, score in enumerate(scores):
        if isinstance(score, ClassificationResult):
            scores[i] = ClassificationResult(float(score.accuracy), float(score.macro_precision),
                                             float(score.macro_recall), score.fold_ind)
    yaml.dump({'mean': mean, 'std': std, 'scores': scores},
              output_file(args.config_file),
              default_flow_style=False)

    if not os.path.exists(processed_folder.entity_dict):
        raise ValueError("You should provide a dictionary of entities for resolving: {}".format(
            processed_folder.entity_dict))

    print("Resolving entities for individual changes")
    entity_resolver = EntityResolver(processed_folder.entity_dict)
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "authorName", "authorEmail"])
    change_entities = change_metadata.apply(
        lambda row: entity_resolver.get_entity(row["authorName"], row["authorEmail"]),
        axis=1)
    change_entities.to_csv(processed_folder.resolved_entities, header=True)
    print("Resolved entities saved on disk")
    print("{} unknown aliases in EntityResolver".format(entity_resolver.unknown_count))
    dump_unknowns(entity_resolver.unknowns, processed_folder.unknown_entities)
    return change_entities


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    resolve_entities(ProcessedFolder(args.data_folder))

        # Do not load if the flag is set
        print("Loading packs for each entity")
        return pickle.load(open(processed_folder.entity_packs(pack_size), 'rb'))

    print("Creating packs for each entity")
    _, _, author_to_changes, _ = compute_occurrences(processed_folder)
    packs = []
    for author, changes in author_to_changes.items():
        np.random.shuffle(changes)
        while len(changes) % pack_size != 0:
            changes.append(np.random.choice(changes))
        for s in range(0, len(changes), pack_size):
            if s + pack_size <= len(changes):
                packs.append((author, changes[s:s + pack_size]))
    pickle.dump(packs, open(processed_folder.entity_packs(pack_size), 'wb'))
    print("Packs saved on disk")
    return packs


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--pack_size", type=int, required=True)
    args = parser.parse_args()
    print(create_entity_packs(ProcessedFolder(args.data_folder), args.pack_size))

        while cur_changes >= bucket_size:
            cur_bucket += 1
            cur_changes -= bucket_size

    bucket_to_timestamps = pd.DataFrame(data={
        'start_time': bucket_start_times,
        'finish_time': bucket_finish_times
    }, index=bucket_indices)
    bucket_to_timestamps['start_date'] = bucket_to_timestamps['start_time'].map(
        lambda tstamp: datetime.fromtimestamp(tstamp))
    bucket_to_timestamps['finish_date'] = bucket_to_timestamps['finish_time'].map(
        lambda tstamp: datetime.fromtimestamp(tstamp))
    bucket_to_timestamps.to_csv(processed_folder.time_buckets_range(n_time_buckets))
    pickle.dump(change_to_time_bucket, open(processed_folder.time_buckets_split(n_time_buckets), 'wb'))
    print("Buckets saved on disk")
    return change_to_time_bucket


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--n_time_buckets", type=int, required=True)
    args = parser.parse_args()
    print(time_split(ProcessedFolder(args.data_folder), args.n_time_buckets))

def context_split(processed_folder: ProcessedFolder,
                  min_count: int = 100,
                  max_count: int = 10 ** 9,
                  min_train: float = 0.7,
                  max_train: float = 0.8) -> List[ContextSplit]:
    author_occurrences, change_occurrences, author_to_changes, total_count = compute_occurrences(
        processed_folder)
    change_entities = resolve_entities(processed_folder)
    if os.path.exists(processed_folder.context_split(min_train, max_train)):
        print("Loading context-split data")
        resulting_split = pickle.load(open(processed_folder.context_split(min_train, max_train), 'rb'))
        _filter_authors(resulting_split, min_count, max_count, author_occurrences,
                        change_occurrences, change_entities)
        return resulting_split

    print("Splitting changes by context")
    change_metadata = pd.read_csv(processed_folder.change_metadata_file,
                                  index_col="id",
                                  usecols=["id", "newPath"],
                                  squeeze=True)
    project_root = _build_tree(change_metadata, change_entities, change_occurrences,
                               lambda change_id: change_occurrences[change_id] > 0)
    depth = _max_depth(project_root)
    nodes_at_depth = [[] for _ in range(depth + 1)]
    _get_all_nodes_at_depth(project_root, nodes_at_depth)
    print(f"Depth: {depth}")
    min_depth, max_depth = _detect_min_max_depth(project_root, nodes_at_depth, max_train)
    print(f"Trying to find splits for depth from {min_depth} to {max_depth}")

    authors = {author for author, count in author_occurrences.items()}
    resulting_split = [ContextSplit(d, {}) for d in range(min_depth, max_depth + 1)]
    success_size = 0
    author_success = 0
    with Parallel(n_jobs=-1) as pool:
        split_result = pool(
            delayed(_find_split)(author, change_entities, min_depth, max_depth,
                                 min_train, max_train, nodes_at_depth, iters=10)
            for author in tqdm(authors))
    for author_split, success, size in split_result:
        if success:
            success_size += size
            author_success += 1
            _merge_splits(resulting_split, author_split)
    print(f"Kept {success_size / project_root.count * 100:.2f}% of changes "
          f"by {author_success}/{len(authors)} authors")

    pickle.dump(resulting_split, open(processed_folder.context_split(min_train, max_train), 'wb'))
    print("Buckets saved on disk")
    _filter_authors(resulting_split, min_count, max_count, author_occurrences,
                    change_occurrences, change_entities)
    return resulting_split

                                 min_depth, max_depth, min_train, max_train,
                                 nodes_at_depth, iters=10)
            for author in tqdm(authors))
    for author_split, success, size in split_result:
        if success:
            success_size += size
            author_success += 1
            _merge_splits(resulting_split, author_split)
    print(f"Kept {success_size / project_root.count * 100:.2f}% of changes "
          f"by {author_success}/{len(authors)} authors")

    pickle.dump(resulting_split, open(processed_folder.context_split(min_train, max_train), 'wb'))
    print("Buckets saved on disk")
    _filter_authors(resulting_split, min_count, max_count, author_occurrences,
                    change_occurrences, change_entities)
    return resulting_split


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    args = parser.parse_args()
    print(context_split(ProcessedFolder(args.data_folder)))

                                              n_paths=n_paths,
                                              n_entities=max(filtered_authors),
                                              embedding_size=embedding_size,
                                              pack_size=pack_size,
                                              n_run=n_run,
                                              total_runs=total_runs)
    code2vec_model = Model(config)
    if config.LOAD_PATH == '':
        print("Did not find a pretrained model")
        packs = create_entity_packs(processed_folder, pack_size)
        packs = [pack for pack in packs if pack[0] in filtered_authors]
        code2vec_model.train(packs, mask_tokens)
        print("Completed training")
    return code2vec_model, filtered_authors


if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument("--data_folder", type=str, required=True)
    parser.add_argument("--pack_size", type=int, required=True)
    parser.add_argument("--embedding_size", type=int, required=True)
    parser.add_argument("--min_samples", type=int, default=0)
    # --mask_tokens is a boolean switch; argparse's type=bool would treat any
    # non-empty string (including "False") as True
    parser.add_argument("--mask_tokens", action="store_true")
    args = parser.parse_args()
    get_trained_model(ProcessedFolder(args.data_folder), args.pack_size, args.embedding_size,
                      args.min_samples, 0, 0, args.mask_tokens)