def setup_feature_matrix(unlabeled_file_references, soft_labeled_path, labeled_file_references,
                         feature_list, soft_labeled_sample_size, unlabeled_sample_size,
                         labeled_sample_size, class_sampling, num_labels,
                         alpha_labeled=0.95, alpha_unlabeled=0.95,
                         alpha_soft_labeled=hcs_soft_label_alphas,
                         class_labels=hcs_soft_labels, ignore_labels=None,
                         normalize_data=True):
    '''
    Reads files with unlabeled and labeled information, and creates the feature
    matrix with the features specified in the feature list.

    Parameters
    ----------
    unlabeled_file_references -- paths to txt files with raw unlabeled data
    soft_labeled_path -- path to txt soft-labeled files. This path should
        contain a directory per label
    labeled_file_references -- paths to arff files with labeled data
    feature_list -- list with the indices of the selected features (1 based)
    soft_labeled_sample_size -- sample size over soft-labeled data (-1: all)
    unlabeled_sample_size -- sample size over unlabeled data (-1: all)
    labeled_sample_size -- sample size over labeled data (-1: all)
    class_sampling -- if sampling is required, sample the same number of
        points per class
    num_labels -- number of labels (currently unused; kept for interface
        compatibility)
    alpha_labeled/alpha_unlabeled/alpha_soft_labeled -- clamping weights for
        each point category, collected into the returned alpha vector
    class_labels -- mapping from class directory name to numeric soft label
    ignore_labels -- labels to drop when reading arff files (default: [6])
    normalize_data -- whether to weight-normalize the feature columns

    Returns
    -------
    Tuple (M, initial_labels, alpha_vector, n_labeled, n_soft_labeled).
    M has features as columns and individuals (cells) as rows; the label
    column present in the input data is stripped from M after being used to
    build initial_labels.
    '''
    # Avoid a shared mutable default argument.
    if ignore_labels is None:
        ignore_labels = [6]

    alpha_vector = []
    summary = {'nl': 0, 'nsu': 0, 'nsi': 0, 'nu': 0}

    # --- Labeled data: read, stack, and (optionally) subsample ---
    labeled_data = [read_arff_file(input_file, feature_list, ignore_labels=ignore_labels)
                    for input_file in labeled_file_references]
    labeled_points = np.concatenate(labeled_data)
    labeled_points = get_sample(labeled_points, labeled_sample_size)
    ___("%i labeled points:\n\t%s" % (len(labeled_points), labeled_points[:, -1]))
    summary['nl'] = len(labeled_points)
    alpha_vector += [alpha_labeled] * len(labeled_points)

    # --- Soft-labeled data: one directory per class label ---
    soft_labeled_points = None
    if soft_labeled_path:
        soft_labeled_data = {label_key: [] for label_key in class_labels.keys()}
        soft_labeled_file_references = get_files(soft_labeled_path)
        # Uninfected control class is accumulated separately so it can be
        # sampled/counted on its own.
        uninf_label_key = 'bPEGI-29'
        soft_labeled_data_uninf = []
        for input_file in soft_labeled_file_references:
            label_key = parentdir(input_file)
            if label_key == uninf_label_key:
                soft_labeled_data_uninf += [read_hcs_file(input_file, feature_list,
                                                          soft_label=class_labels[label_key])]
            else:
                soft_labeled_data[label_key] += [read_hcs_file(input_file, feature_list,
                                                               soft_label=class_labels[label_key])]
        if class_sampling:
            # Sample soft-labeled data uniformly over classes.
            # '//' pins integer division on both Python 2 and 3 (the original
            # '/' only floors on Python 2).
            class_sample_size = {class_label: soft_labeled_sample_size // len(class_labels)
                                 if class_sampling else -1
                                 for class_label in class_labels.keys()}
            summary['nsu'] = class_sample_size[uninf_label_key]
            soft_labeled_data = ([get_sample(np.concatenate(soft_labeled_data_uninf),
                                             class_sample_size[uninf_label_key])] +
                                 [get_sample(np.concatenate(soft_labeled_set),
                                             class_sample_size[soft_labeled_set_name])
                                  for soft_labeled_set_name, soft_labeled_set in soft_labeled_data.items()
                                  if len(soft_labeled_set) > 0])
            # Count infected points across the per-class sampled arrays.
            # (The original took len() of the list itself, i.e. the number of
            # classes, not points -- setup_validation_matrix counts points.)
            summary['nsi'] = sum(len(chunk) for chunk in soft_labeled_data) - summary['nsu']
        else:
            # NOTE(review): this counts per-file arrays, not points -- confirm intent.
            summary['nsu'] = len(soft_labeled_data_uninf)
            soft_labeled_data = np.concatenate([soft_labeled_data_uninf,
                                                np.concatenate(list(soft_labeled_data.values()))])
            summary['nsi'] = len(soft_labeled_data) - summary['nsu']
        soft_labeled_points = np.concatenate(soft_labeled_data)
        # Per-point alpha from the soft label stored in the last column.
        soft_labeled_alphas = [alpha_soft_labeled[label] for label in soft_labeled_points[:, -1]]
        ___("%i soft labeled points:\n\t%s" % (len(soft_labeled_points), soft_labeled_points[:, -1]))
        alpha_vector += soft_labeled_alphas

    # --- Unlabeled data: read, stack, and (optionally) subsample ---
    unlabeled_data = [read_hcs_file(input_file, feature_list)
                      for input_file in unlabeled_file_references]
    unlabeled_points = np.concatenate(unlabeled_data)
    unlabeled_points = get_sample(unlabeled_points, unlabeled_sample_size)
    ___("%i unlabeled points:\n\t%s" % (len(unlabeled_points), unlabeled_points[:, -1]))
    summary['nu'] = len(unlabeled_points)
    alpha_vector += [alpha_unlabeled] * len(unlabeled_points)

    # Stack all categories; ordering (labeled, soft-labeled, unlabeled)
    # must match the alpha_vector built above.
    if soft_labeled_points is None:
        soft_labeled_points = []
        M = np.concatenate([labeled_points, unlabeled_points])
    else:
        M = np.concatenate([labeled_points, soft_labeled_points, unlabeled_points])

    initial_labels = get_label_matrix(M[:, -1], class_labels, summary,
                                      alpha_soft_labeled[1], alpha_soft_labeled[2])
    # Drop the label column: M keeps features only.
    M = M[:, :-1]
    if normalize_data:
        # Feature weights derived from precomputed per-feature scores.
        # NOTE(review): here weights = max/scores, while setup_validation_matrix
        # uses scores/max -- confirm which orientation is intended.
        scores = np.array([1.3275, 1.1739, 1.0605, 0.9868])
        weights = np.max(scores) / scores
        M = normalize(M, weights)
    return (M, initial_labels, alpha_vector, len(labeled_points), len(soft_labeled_points))
def setup_validation_matrix(labeled_file_references, soft_labeled_path, feature_list,
                            labeled_sample_size, unlabeled_sample_size, class_sampling,
                            alpha_labeled, alpha_unlabeled, alpha_soft_labeled,
                            class_labels=hcs_soft_labels, ignore_labels=None,
                            normalize_data=True):
    '''
    Builds a validation feature matrix by splitting labeled data into a
    labeled part and a virtually-unlabeled part whose true labels are kept
    aside for evaluation.

    Parameters
    ----------
    labeled_file_references -- paths to arff files with labeled data
    soft_labeled_path -- path to txt soft-labeled files, one directory per label
    feature_list -- list with the indices of the selected features (1 based)
    labeled_sample_size -- number of points kept as labeled
    unlabeled_sample_size -- number of points relabeled as unlabeled (-1)
    class_sampling -- sample the same number of soft-labeled points per class
    alpha_labeled/alpha_unlabeled/alpha_soft_labeled -- clamping weights for
        each point category, collected into the returned alpha vector
    class_labels -- mapping from class directory name to numeric soft label
    ignore_labels -- labels to drop when reading arff files (default: [6])
    normalize_data -- whether to weight-normalize the feature columns

    Returns
    -------
    Tuple (M, initial_labels, alpha_vector, n_labeled, n_soft_labeled,
    expected_labels), where expected_labels holds the true labels of the
    virtually-unlabeled points for later scoring.
    '''
    # Avoid a shared mutable default argument.
    if ignore_labels is None:
        ignore_labels = [6]

    alpha_vector = []
    summary = {'nl': 0, 'nsu': 0, 'nsi': 0, 'nu': 0}

    # --- Read labeled data and shuffle before splitting ---
    validation_data = [read_arff_file(input_file, feature_list, ignore_labels=ignore_labels)
                       for input_file in labeled_file_references]
    validation_points = np.concatenate(validation_data)
    np.random.shuffle(validation_points)

    # Split labeled data in labeled points...
    labeled_points = validation_points[:labeled_sample_size]
    summary['nl'] = len(labeled_points)
    # ...and (virtually) unlabeled points whose true labels are saved aside.
    unlabeled_points = validation_points[labeled_sample_size:labeled_sample_size + unlabeled_sample_size]
    summary['nu'] = len(unlabeled_points)
    expected_labels = unlabeled_points[:, -1].copy()
    unlabeled_points[:, -1] = -1  # -1 marks "no label"
    soft_labeled_sample_size = len(unlabeled_points)
    alpha_vector += [alpha_labeled] * len(labeled_points)

    # --- Read soft-labeled data: one directory per class label ---
    soft_labeled_points = None
    if soft_labeled_path:
        soft_labeled_data = {label_key: [] for label_key in class_labels.keys()}
        soft_labeled_file_references = get_files(soft_labeled_path)
        # Uninfected control class is accumulated separately so it can be
        # sampled/counted on its own.
        uninf_label_key = 'bPEGI-29'
        soft_labeled_data_uninf = []
        for input_file in soft_labeled_file_references:
            label_key = parentdir(input_file)
            if label_key == uninf_label_key:
                soft_labeled_data_uninf += [read_hcs_file(input_file, feature_list,
                                                          soft_label=class_labels[label_key])]
            else:
                soft_labeled_data[label_key] += [read_hcs_file(input_file, feature_list,
                                                               soft_label=class_labels[label_key])]
        if class_sampling:
            # Sample soft-labeled data uniformly over classes: split the total
            # size into len(class_labels) near-equal integer chunks.
            class_sample_boundaries = np.rint(np.linspace(0, soft_labeled_sample_size,
                                                          len(class_labels) + 1))
            class_sample_sizes = class_sample_boundaries[1:] - class_sample_boundaries[:-1]
            sizes_iter = iter(class_sample_sizes)
            # next(sizes_iter) works on both Python 2 and 3 (the original
            # used the Py2-only .next() method).
            class_sample_size = {class_label: int(next(sizes_iter))
                                 for class_label in class_labels.keys()}
            summary['nsu'] = class_sample_size[uninf_label_key]
            soft_labeled_data = ([get_sample(np.concatenate(soft_labeled_data_uninf),
                                             class_sample_size[uninf_label_key])] +
                                 [get_sample(np.concatenate(soft_labeled_set),
                                             class_sample_size[soft_labeled_set_name])
                                  for soft_labeled_set_name, soft_labeled_set in soft_labeled_data.items()
                                  if len(soft_labeled_set) > 0])
            # Infected count: points across the per-class sampled arrays.
            summary['nsi'] = sum(len(chunk) for chunk in soft_labeled_data) - summary['nsu']
        else:
            # NOTE(review): this counts per-file arrays, not points -- confirm intent.
            summary['nsu'] = len(soft_labeled_data_uninf)
            soft_labeled_data = np.concatenate([soft_labeled_data_uninf,
                                                np.concatenate(list(soft_labeled_data.values()))])
            # Here soft_labeled_data is already a stacked 2-D array, so len()
            # counts rows (points). The original summed len() of each row,
            # which adds up feature counts instead of points.
            summary['nsi'] = len(soft_labeled_data) - summary['nsu']
        soft_labeled_points = np.concatenate(soft_labeled_data)
        # Per-point alpha from the soft label stored in the last column.
        soft_labeled_alphas = [alpha_soft_labeled[label] for label in soft_labeled_points[:, -1]]
        alpha_vector += soft_labeled_alphas

    alpha_vector += [alpha_unlabeled] * len(unlabeled_points)

    # Stack all categories; ordering (labeled, soft-labeled, unlabeled)
    # must match the alpha_vector built above -- must check consistent ordering.
    if soft_labeled_points is None:
        soft_labeled_points = []
        M = np.concatenate([labeled_points, unlabeled_points])
    else:
        M = np.concatenate([labeled_points, soft_labeled_points, unlabeled_points])

    initial_labels = get_label_matrix(M[:, -1], class_labels, summary,
                                      alpha_soft_labeled[1], alpha_soft_labeled[2])
    # Drop the label column: M keeps features only.
    M = M[:, :-1]
    if normalize_data:
        # Feature weights for features 4,3,2,1,92,53,54 (precomputed scores).
        # NOTE(review): here weights = scores/max, while setup_feature_matrix
        # uses max/scores -- confirm which orientation is intended.
        scores = np.array([1.3384, 1.0987, 1.0309, 0.9315, 0.9133, 0.9064, 0.8906])
        weights = scores / np.max(scores)
        M = normalize(M, weights)
    return (M, initial_labels, alpha_vector, len(labeled_points),
            len(soft_labeled_points), expected_labels)