def label_data_base(data_base_file: str, heuristics: bool) -> None:
    """Split a data base into labeled/heuristic/unknown/unlabeled parts and save each as CSV."""
    label_extractor = LabelExtractor()
    labeled_data_base, heuristic, unknown, unlabeled, unlabeled_hashes = \
        label_extractor.get_labeled_data_base(
            data_base_file=get_data_base_path(data_base_file),
            heuristics=heuristics)
    data_base_id = get_file_id(config.data_base_path)
    data_base_name = data_base_file.split('.')[0]

    def save_part(part: pd.DataFrame, tag: str) -> None:
        # Deduplicate one part of the split, dump it to CSV and print a short summary.
        drop_duplicates(part)
        part.to_csv(path_or_buf=join(
            config.data_base_path,
            f'{data_base_name}__{tag}_{data_base_id:_>3}_heur{heuristics}.csv'),
            index=False)
        print(tag)
        data_base_info(part, printable=True)

    if len(labeled_data_base) > 0:
        save_part(labeled_data_base, 'labeled')
    if not heuristics and len(heuristic) > 0:
        save_part(heuristic, 'heuristic')
    if len(unknown) > 0:
        save_part(unknown, 'unknown')
    if len(unlabeled) > 0:
        save_part(unlabeled, 'unlabeled')

    if len(unlabeled_hashes) > 0:
        param_id = get_file_id(config.parameters_path)
        save_obj(unlabeled_hashes,
                 filename=f'unlabeled_hashes_{data_base_name}_{param_id:_>3}')
def create_data_base(paths: tuple = (config.reports_path, config.info_lack_reports_path),
                     debug: bool = False,
                     process_count: int = 12) -> None:
    """Parse reports in parallel and save the extracted features as a CSV data base."""
    if debug:
        report_paths = get_debug_report_paths()
    else:
        report_paths = get_report_paths(paths)

    # Shared containers so worker processes can pull report paths and push features.
    manager = Manager()
    report_paths = SharedReportPaths(report_paths, manager)
    reports_features = SharedReportsFeatures(manager)

    ps = [
        Process(target=parsing_reports, args=(report_paths, reports_features))
        for _ in range(process_count)
    ]
    for p in ps:
        p.start()
    for p in ps:
        p.join()

    reports_features = reports_features.get_reports_features()
    data_base = pd.DataFrame(reports_features)
    data_base_info(data_base)

    data_base_id = get_file_id(config.data_base_path)
    data_base.to_csv(path_or_buf=join(config.data_base_path,
                                      f'database_{data_base_id:_>3}.csv'),
                     index=False)
def probabilities_distribution_visualization(correct: dict, incorrect: dict,
                                              incorrect_target: dict,
                                              training_space: dict) -> None:
    """Plot the share of correct and incorrect predictions per probability threshold bin."""

    def get_ratio(dist: dict) -> np.ndarray:
        # Normalize the bin counts to fractions of the total.
        n = sum(dist.values())
        ratio = np.array(list(dist.values()))
        return ratio / n

    thresholds = [f'{lbl + 0.1:.1f}-{lbl:.1f}' for lbl in correct.keys()]
    x_ticks = np.arange(0, len(thresholds))
    y_ticks = np.arange(0, 1, 0.1)
    y_labels = [f'{int(y_tick * 100)}%' for y_tick in y_ticks]

    correct_ratio = get_ratio(correct)
    incorrect_ratio = get_ratio(incorrect)
    incorrect_target_ratio = get_ratio(incorrect_target)

    width = 0.2
    fig, ax = plt.subplots(figsize=(19.2, 10.8))
    ax.bar(x_ticks, correct_ratio, width,
           color='lightseagreen', label='True Positive')
    ax.bar(x_ticks + width, incorrect_ratio, width,
           color='lightcoral', label='False Positive + False Negative')
    ax.bar(x_ticks + width * 2, incorrect_target_ratio, width,
           color='mediumslateblue', label='Target among wrong cases')
    ax.set_title('Classifier prediction ratio')
    ax.set_ylabel('Percent')
    ax.set_xlabel('Thresholds')
    # Center the tick labels on the middle bar of each group of three.
    ax.set_xticks(x_ticks + width)
    ax.set_xticklabels(thresholds)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_labels)
    ax.set_ybound(lower=0)
    ax.legend()
    fig.tight_layout()

    file_id = get_file_id(training_space['performance'])
    plt.savefig(join(training_space['performance'],
                     f'{training_space["mark"]}__{file_id}.png'),
                dpi='figure', format='png', bbox_inches='tight')
    plt.clf()
    plt.close('all')
def generate_data_base(model: nn.Module,
                       device: torch.device = None,
                       each_class: int = 5000,
                       batch_size: int = 512) -> None:
    """Sample `each_class` artificial points per class from a trained CVAE decoder and save them as a CSV."""

    def batch_decoding(_batch_size: int) -> None:
        # Decode a batch of random latent vectors conditioned on the current class label.
        batch_z = torch.randn((_batch_size, model.d_in),
                              dtype=torch.float32, device=device)
        batch_labels = torch.stack([labels[label]] * _batch_size).type(torch.float32).to(device)
        _, _, batch_data = model(batch_z, batch_labels)
        data.append(torch.cat([mark, batch_data.detach().cpu()], dim=1))

    torch.manual_seed(2531)
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    last_batch_size = each_class % batch_size
    n_classes = 8
    labels = torch.eye(n_classes, n_classes)

    model.eval()
    training_space = create_training_space(model_name='CVAE', evaluate=True)
    model.load_parameters(
        get_check_point_parameters_path(training_space['check_point']))

    data = list()
    for label in range(n_classes):
        # The first two columns mark the class of the generated rows.
        mark = torch.zeros((batch_size, 2)).fill_(label)
        for _ in range(each_class // batch_size):
            batch_decoding(batch_size)
        if last_batch_size != 0:
            mark = torch.zeros((last_batch_size, 2)).fill_(label)
            batch_decoding(last_batch_size)

    data = torch.cat(data, dim=0).numpy()
    data_base = pd.DataFrame(data)
    data_base_id = get_file_id(config.data_base_path)
    data_base.to_csv(path_or_buf=join(
        config.data_base_path,
        f'artificial_database_{data_base_id:_>3}.csv'),
        index=False)
def silhouette_analysis(data: pd.DataFrame, prediction: np.ndarray,
                        n_clusters: int, model_name: str) -> None:
    """Draw a silhouette plot for every cluster and mark the average silhouette score."""
    cluster_labels = np.unique(prediction)
    if n_clusters == 1:
        print('The number of clusters needs to be greater than 1')
        return

    silhouette_values = silhouette_samples(data, prediction, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    y_ticks = []
    rainbow = cm.get_cmap('rainbow', n_clusters)
    colors = rainbow(np.linspace(0, 1, n_clusters))

    plt.figure(figsize=(19.2, 10.8))
    for i, c in enumerate(cluster_labels):
        # Stack the sorted silhouette values of each cluster as horizontal bars.
        c_silhouette_values = silhouette_values[prediction == c]
        c_silhouette_values.sort()
        y_ax_upper += len(c_silhouette_values)
        plt.barh(range(y_ax_lower, y_ax_upper),
                 c_silhouette_values,
                 height=1.0,
                 edgecolor='none',
                 color=colors[i])
        y_ticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower += len(c_silhouette_values)

    silhouette_avg = np.mean(silhouette_values)
    plt.axvline(silhouette_avg, color="red", linestyle="--")
    plt.yticks(y_ticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.tight_layout()

    im_id = get_file_id(config.simple_analysis_path)
    plt.savefig(join(config.simple_analysis_path,
                     f'silhouette_{n_clusters}_clusters_{model_name}_{im_id:_>3}.png'),
                dpi='figure', bbox_inches='tight')
    plt.clf()
    plt.close('all')
def cleaning(reports_path: tuple = (config.reports_path, ),
             debug: bool = False,
             process_count: int = 12,
             seen_hashes_file: str = None):
    manager = Manager()
    if debug:
        report_paths = manager.list(get_debug_report_paths())
    else:
        report_paths = manager.list(get_report_paths(paths=reports_path))
    data = manager.dict()
    lock_report_paths = Lock()
    lock_data = Lock()

    ps = [
        Process(target=report_cleaning,
                args=(report_paths, data, lock_report_paths, lock_data))
        for _ in range(process_count)
    ]
    for p in ps:
        p.start()
    for p in ps:
        p.join()

    if seen_hashes_file:
        hashes = get_seen_hashes(seen_hashes_file)
    else:
        hashes = dict()

    for i_report in data.keys():
        i_hash = data[i_report]
        if i_hash in hashes:
            move_to_duplicates(i_report, hashes[i_hash])
        else:
            hashes.setdefault(i_hash, i_report)

    hash_id = get_file_id(config.parameters_path)
    save_obj(hashes, join(config.parameters_path, f'hashes_{hash_id:_>3}'))
def scale_data_base(data_base_file: str, scale, scale_type: str) -> None:
    data_base_id = get_file_id(config.data_base_path)
    data_base_name = data_base_file.split('.')[0]
    data_base = get_data_base(data_base_file)

    data_base_labels = data_base[data_base.columns[:2]]
    data_base = data_base.drop(data_base.columns[:2], axis=1)
    data_base_std = pd.DataFrame(scale.fit_transform(data_base),
                                 columns=data_base.columns)
    save_obj(scale,
             filename=f'{scale_type}_scale_{data_base_name}_{data_base_id:_>3}')

    data_base = pd.concat([data_base_labels, data_base_std], axis=1)
    data_base.to_csv(path_or_buf=join(
        config.data_base_path,
        f'{data_base_name}_{scale_type}_{data_base_id:_>3}.csv'),
        index=False)
def scatter_prediction(data: pd.DataFrame, prediction: np.ndarray,
                       n_clusters: int, model_name: str) -> None:
    dots = data.to_numpy()
    # Place one colorbar tick in the middle of each of the n_clusters color bands.
    color_len = prediction.max(initial=1) / n_clusters
    ticks = list(np.arange(color_len / 2, n_clusters * color_len, color_len))
    color_map = cm.get_cmap('rainbow', n_clusters)

    fig, ax = plt.subplots(figsize=(19.2, 10.8))
    scatter = ax.scatter(dots[:, 0], dots[:, 1],
                         c=prediction, cmap=color_map,
                         marker='o', alpha=1.0)
    color_bar = fig.colorbar(scatter, ticks=ticks, ax=ax, drawedges=True)
    color_bar.ax.set_yticklabels([t + 1 for t in range(n_clusters)])
    ax.set_title('Latent space')
    fig.tight_layout()

    im_id = get_file_id(config.simple_analysis_path)
    plt.savefig(join(config.simple_analysis_path,
                     f'scatter_{n_clusters}_clusters_{model_name}_{im_id:_>3}.png'),
                dpi='figure', format='png', bbox_inches='tight')
    plt.clf()
    plt.close('all')
def elbow_method(data: pd.DataFrame, max_n_cluster: int = 20) -> None:
    """Plot k-means distortion (inertia) for 1..max_n_cluster clusters to pick k by the elbow."""
    distortions = []
    cluster_range = range(1, max_n_cluster + 1)
    for i in cluster_range:
        km = KMeans(n_clusters=i,
                    init='k-means++',
                    n_init=15,
                    max_iter=300,
                    random_state=2531)
        km.fit(data)
        distortions.append(km.inertia_)

    plt.figure(figsize=(19.2, 10.8))
    plt.plot(cluster_range, distortions, marker='o', color='royalblue')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.tight_layout()

    im_id = get_file_id(config.simple_analysis_path)
    plt.savefig(join(config.simple_analysis_path, f'elbow_{im_id:_>3}.png'),
                dpi='figure', bbox_inches='tight')
    plt.clf()
    plt.close('all')
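

# Usage sketch (assumptions): one possible end-to-end run of the helpers above.
# The CSV file names passed to scale_data_base/label_data_base and the choice of
# MinMaxScaler are illustrative only; real names follow the id patterns produced
# by get_file_id, and the canonical pipeline order may differ.
if __name__ == '__main__':
    from sklearn.preprocessing import MinMaxScaler

    cleaning(process_count=8)            # drop duplicate reports by hash
    create_data_base(process_count=8)    # parse reports into a raw feature CSV
    scale_data_base('database___1.csv',  # hypothetical output of create_data_base
                    scale=MinMaxScaler(),
                    scale_type='minmax')
    label_data_base('database___1_minmax___2.csv',  # hypothetical scaled CSV
                    heuristics=False)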