Example 1
import numpy as np
def setup_feature_matrix(unlabeled_file_references, soft_labeled_path, labeled_file_references, feature_list,
                         soft_labeled_sample_size, unlabeled_sample_size, labeled_sample_size, class_sampling,
                         num_labels, alpha_labeled=0.95, alpha_unlabeled=0.95, alpha_soft_labeled=hcs_soft_label_alphas,
                         class_labels=hcs_soft_labels, ignore_labels=[6], normalize_data=True):
    '''
    Reads files with unlabeled and labeled information, and creates the feature matrix with the features
    specified in the feature list

    Parameters
    ----------
    unlabeled_file_references -- paths to txt files with raw unlabeled data
    labeled_file_references   -- paths to arff files with labeled data
    soft_labeled_path         -- path to the txt soft-labeled files; it should contain one directory per label
    feature_list              -- list with the indices of the selected features (1-based)
    soft_labeled_sample_size, unlabeled_sample_size, labeled_sample_size
                              -- number of points to sample from each data source (-1 means use all data)
    class_sampling            -- if sampling is required, sample the same number of points per class

    Returns
    -------
    Matrix with features as columns, and individuals (cells) as rows.
    The matrix contains an additional column for the labeling, if present, or -1 otherwise.
    '''

    alpha_vector = []
    summary = {'nl': 0, 'nsu': 0, 'nsi': 0, 'nu': 0}

    labeled_data = [read_arff_file(input_file, feature_list, ignore_labels=ignore_labels)
                    for input_file in labeled_file_references]
    labeled_points = np.concatenate(labeled_data)
    labeled_points = get_sample(labeled_points, labeled_sample_size)  # subsample labeled points (-1 keeps all)
    ___("%i labeled points:\n\t%s" % (len(labeled_points), labeled_points[:, -1]))
    summary['nl'] = len(labeled_points)

    alpha_vector += [alpha_labeled] * len(labeled_points)

    soft_labeled_points = None
    if soft_labeled_path:
        soft_labeled_data = {label_key: [] for label_key in class_labels.keys()}
        soft_labeled_file_references = get_files(soft_labeled_path)

        uninf_label_key = 'bPEGI-29'
        soft_labeled_data_uninf = []

        for input_file in soft_labeled_file_references:
            label_key = parentdir(input_file)
            if label_key == uninf_label_key:
                soft_labeled_data_uninf += [read_hcs_file(input_file, feature_list, soft_label=class_labels[label_key])]
            else:
                soft_labeled_data[label_key] += [read_hcs_file(input_file, feature_list, soft_label=class_labels[label_key])]
        # Sample soft-labeled data uniformly over classes
        if class_sampling:
            class_sample_size = {class_label: soft_labeled_sample_size // len(class_labels)
                                 for class_label in class_labels.keys()}
            summary['nsu'] = class_sample_size[uninf_label_key]
            soft_labeled_data = [get_sample(np.concatenate(soft_labeled_data_uninf), class_sample_size[uninf_label_key])] + \
                                [get_sample(np.concatenate(soft_labeled_set), class_sample_size[soft_labeled_set_name])
                                 for soft_labeled_set_name, soft_labeled_set in soft_labeled_data.items()
                                 if len(soft_labeled_set) > 0]
        else:
            summary['nsu'] = sum(len(chunk) for chunk in soft_labeled_data_uninf)
            # keep a flat list of per-file arrays so the final concatenation stacks rows
            soft_labeled_data = soft_labeled_data_uninf + \
                                [chunk for chunks in soft_labeled_data.values() for chunk in chunks]
        summary['nsi'] = sum(len(chunk) for chunk in soft_labeled_data) - summary['nsu']
        soft_labeled_points = np.concatenate(soft_labeled_data)
        soft_labeled_alphas = [alpha_soft_labeled[label] for label in soft_labeled_points[:, -1]]
        ___("%i soft labeled points:\n\t%s" % (len(soft_labeled_points), soft_labeled_points[:, -1]))
        alpha_vector += soft_labeled_alphas

    unlabeled_data = [read_hcs_file(input_file, feature_list) for input_file in unlabeled_file_references]
    unlabeled_points = np.concatenate(unlabeled_data)

    unlabeled_points = get_sample(unlabeled_points, unlabeled_sample_size)  # subsample unlabeled points (-1 keeps all)
    ___("%i unlabeled points:\n\t%s" % (len(unlabeled_points), unlabeled_points[:, -1]))
    summary['nu'] = len(unlabeled_points)

    alpha_vector += [alpha_unlabeled] * len(unlabeled_points)

    if soft_labeled_points is None:
        soft_labeled_points = []
        M = np.concatenate([labeled_points, unlabeled_points])
    else:
        M = np.concatenate([labeled_points, soft_labeled_points, unlabeled_points])

    initial_labels = get_label_matrix(M[:, -1], class_labels, summary,
                                      alpha_soft_labeled[1], alpha_soft_labeled[2])
    M = M[:, :-1]

    if normalize_data:
        # M = np.column_stack([normalize(M), initial_labels])
        # per-feature weights derived from precomputed feature scores
        scores = np.array([1.3275, 1.1739, 1.0605, 0.9868])
        weights = np.max(scores) / scores
        M = normalize(M, weights)
    return (M, initial_labels, alpha_vector, len(labeled_points), len(soft_labeled_points))
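
For reference, below is a minimal, self-contained sketch of the class-balanced subsampling pattern used in the class_sampling branch above. Since get_sample is not shown on this page, the version here is a hypothetical stand-in, and the class names, data, and sizes are synthetic.

import numpy as np

def get_sample(points, sample_size):
    # Hypothetical stand-in for the helper used above: draw sample_size rows
    # without replacement; -1 (or an oversized request) keeps all rows.
    if sample_size < 0 or sample_size >= len(points):
        return points
    idx = np.random.choice(len(points), size=sample_size, replace=False)
    return points[idx]

rng = np.random.default_rng(0)
# Hypothetical per-class data: lists of per-file arrays, mirroring soft_labeled_data above.
per_class_chunks = {
    'classA': [rng.normal(size=(40, 5)), rng.normal(size=(60, 5))],
    'classB': [rng.normal(size=(80, 5))],
}
soft_labeled_sample_size = 60
class_sample_size = soft_labeled_sample_size // len(per_class_chunks)  # equal budget per class
sampled = np.concatenate([get_sample(np.concatenate(chunks), class_sample_size)
                          for chunks in per_class_chunks.values()])
print(sampled.shape)  # (60, 5)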
Example 2
import numpy as np
def setup_validation_matrix(labeled_file_references, soft_labeled_path, feature_list,
                            labeled_sample_size, unlabeled_sample_size, class_sampling, alpha_labeled,
                            alpha_unlabeled, alpha_soft_labeled,
                            class_labels=hcs_soft_labels, ignore_labels=[6],
                            normalize_data=True):
    alpha_vector = []
    summary = {'nl': 0, 'nsu': 0, 'nsi': 0, 'nu': 0}

    # read labeled data
    validation_data = [read_arff_file(input_file, feature_list, ignore_labels=ignore_labels)
                       for input_file in labeled_file_references]
    validation_points = np.concatenate(validation_data)
    np.random.shuffle(validation_points)
    # split labeled data in labeled points...
    labeled_points = validation_points[:labeled_sample_size]
    summary['nl'] = len(labeled_points)
    # ...and (virtually) unlabeled points
    unlabeled_points = validation_points[labeled_sample_size:labeled_sample_size + unlabeled_sample_size]
    summary['nu'] = len(unlabeled_points)
    expected_labels = unlabeled_points[:, -1].copy()
    unlabeled_points[:, -1] = -1
    soft_labeled_sample_size = len(unlabeled_points)

    alpha_vector += [alpha_labeled] * len(labeled_points)

    # read soft-labeled data
    soft_labeled_points = None
    if soft_labeled_path:
        soft_labeled_data = {label_key: [] for label_key in class_labels.keys()}
        soft_labeled_file_references = get_files(soft_labeled_path)

        uninf_label_key = 'bPEGI-29'
        soft_labeled_data_uninf = []

        for input_file in soft_labeled_file_references:
            label_key = parentdir(input_file)
            if label_key == uninf_label_key:
                soft_labeled_data_uninf += [read_hcs_file(input_file, feature_list, soft_label=class_labels[label_key])]
            else:
                soft_labeled_data[label_key] += [read_hcs_file(input_file, feature_list, soft_label=class_labels[label_key])]
        # Sample soft-labeled data uniformly over classes
        if class_sampling:
            # split the soft-labeled budget into near-equal per-class sample sizes
            class_sample_boundaries = np.rint(np.linspace(0, soft_labeled_sample_size, len(class_labels) + 1))
            class_sample_sizes = (class_sample_boundaries[1:] - class_sample_boundaries[:-1]).astype(int)
            class_sample_size = dict(zip(class_labels.keys(), class_sample_sizes))
            summary['nsu'] = class_sample_size[uninf_label_key]
            soft_labeled_data = [get_sample(np.concatenate(soft_labeled_data_uninf), class_sample_size[uninf_label_key])] + \
                                [get_sample(np.concatenate(soft_labeled_set), class_sample_size[soft_labeled_set_name])
                                 for soft_labeled_set_name, soft_labeled_set in soft_labeled_data.items()
                                 if len(soft_labeled_set) > 0]
        else:
            summary['nsu'] = sum(len(chunk) for chunk in soft_labeled_data_uninf)
            # keep a flat list of per-file arrays so the final concatenation stacks rows
            soft_labeled_data = soft_labeled_data_uninf + \
                                [chunk for chunks in soft_labeled_data.values() for chunk in chunks]
        summary['nsi'] = sum([len(_data) for _data in soft_labeled_data]) - summary['nsu']
        soft_labeled_points = np.concatenate(soft_labeled_data)
        soft_labeled_alphas = [alpha_soft_labeled[label] for label in soft_labeled_points[:, -1]]
        alpha_vector += soft_labeled_alphas
    alpha_vector += [alpha_unlabeled] * len(unlabeled_points)

    if soft_labeled_points is None:
        soft_labeled_points = []
        M = np.concatenate([labeled_points, unlabeled_points])
    else:
        M = np.concatenate([labeled_points, soft_labeled_points, unlabeled_points])

        # TODO: check that the ordering of the concatenated blocks stays consistent
    initial_labels = get_label_matrix(M[:, -1], class_labels, summary,
                                      alpha_soft_labeled[1], alpha_soft_labeled[2])
    M = M[:, :-1]

    if normalize_data:
        # M = np.column_stack([normalize(M), initial_labels])
        # 4,3,2,1,92,53,54
        # per-feature weights derived from precomputed feature scores
        scores = np.array([1.3384, 1.0987, 1.0309, 0.9315, 0.9133, 0.9064, 0.8906])
        weights = scores / np.max(scores)
        M = normalize(M, weights)

    return (M, initial_labels, alpha_vector, len(labeled_points), len(soft_labeled_points), expected_labels)
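
Below is a self-contained sketch of the shuffle-and-mask split performed at the top of this function: one block of rows is held out as labeled data, and the next block keeps a copy of its ground-truth labels in expected_labels and is then overwritten with -1 so it behaves as unlabeled data. All values here are synthetic.

import numpy as np

# Hypothetical labeled matrix: four feature columns plus a trailing label column.
rng = np.random.default_rng(0)
features = rng.normal(size=(50, 4))
labels = rng.integers(1, 4, size=50).astype(float)
validation_points = np.column_stack([features, labels])

np.random.shuffle(validation_points)  # in-place row shuffle, as in the function above
labeled_sample_size, unlabeled_sample_size = 10, 30
labeled_points = validation_points[:labeled_sample_size]
unlabeled_points = validation_points[labeled_sample_size:labeled_sample_size + unlabeled_sample_size]
expected_labels = unlabeled_points[:, -1].copy()  # ground truth, kept aside for scoring
unlabeled_points[:, -1] = -1                      # mask labels: these rows now look unlabeled
print(len(labeled_points), len(unlabeled_points), expected_labels[:5])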