# Example #1 (scrape artifact "Beispiel #1" / vote count converted to a comment so the file parses)
def fit_PCA(maindir,
            d,
            origcodes_f="msd_codes_k2045",
            outpca="PCA-codes.pk",
            N=50000,
            norm=False,
            pca_components=None):
    """Fit PCA transformations from N randomly sampled codes.

    Samples N non-empty codes (without replacement) from the original
    code set, optionally chroma-normalizes them, fits one PCA per entry
    in ``pca_components``, and pickles the list of fitted models.

    Parameters
    ----------
    maindir : str
        Main data directory (kept for interface compatibility; unused here).
    d : str
        Path passed to ``load_transform``.
    origcodes_f : str
        Identifier of the original codes handed to ``CO.load_codes``.
    outpca : str
        Output pickle file for the list of fitted PCA models.
    N : int
        Number of codes to sample for fitting.
    norm : bool
        If True, chroma-normalize each code before adding it.
    pca_components : list of int, optional
        Component counts, one PCA fitted per entry.
        Defaults to [50, 100, 200, 500, 1000].
    """
    import cover_id_test as CO

    # Avoid a shared mutable default argument.
    if pca_components is None:
        pca_components = [50, 100, 200, 500, 1000]

    logger = configure_logger()

    # NOTE(review): result unused below — kept for any load-time side effects.
    td = load_transform(d)
    codes = np.ones((N, 2045)) * np.nan
    k = 0

    # Load codes
    origcodes, track_ids, clique_ids = CO.load_codes(origcodes_f, -1, 30)

    while k < N:
        # np.random.randint's upper bound is exclusive, so randint(0, len)
        # matches the old inclusive random_integers(0, len - 1) range.
        track_idx = np.random.randint(0, len(track_ids))
        while track_ids[track_idx] == -2:  # -2 marks an already-used track
            track_idx = np.random.randint(0, len(track_ids))
        code = origcodes[track_idx]
        if code is not None:
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            codes[k] = code
            # Mark as used so the track is not sampled again.
            track_ids[track_idx] = -2
            k += 1
            # Log only after actual progress, so skipping runs of None
            # codes does not repeat the same message.
            if k % 100 == 0:
                logger.info("----Computing features %.1f%%" %
                            (k / float(N) * 100))

    # Remove rows that were never filled (still NaN).
    nan_idx = np.unique(np.where(np.isnan(codes))[0])
    codes = np.delete(codes, nan_idx, axis=0)

    # Fit one PCA per requested number of components.
    res = []
    for c in pca_components:
        pca = PCA(n_components=c)
        pca.fit(codes)
        res.append(pca)

    # Save result
    save_pickle(res, outpca)
def fit_PCA(maindir, d, origcodes_f="msd_codes_k2045", outpca="PCA-codes.pk",
            N=50000, norm=False, pca_components=None):
    """Fit PCA transformations from N randomly sampled codes.

    Samples N non-empty codes (without replacement), optionally
    chroma-normalizes them, fits one PCA per entry in ``pca_components``,
    and pickles the list of fitted models to ``outpca``.

    Parameters
    ----------
    maindir : str
        Main data directory (kept for interface compatibility; unused here).
    d : str
        Path passed to ``load_transform``.
    origcodes_f : str
        Identifier of the original codes handed to ``CO.load_codes``.
    outpca : str
        Output pickle file for the list of fitted PCA models.
    N : int
        Number of codes to sample.
    norm : bool
        If True, chroma-normalize each code before adding it.
    pca_components : list of int, optional
        Component counts, one PCA per entry.
        Defaults to [50, 100, 200, 500, 1000].
    """
    import cover_id_test as CO

    # Avoid a shared mutable default argument.
    if pca_components is None:
        pca_components = [50, 100, 200, 500, 1000]

    logger = configure_logger()

    # NOTE(review): result unused below — kept for any load-time side effects.
    td = load_transform(d)
    codes = np.ones((N, 2045)) * np.nan
    k = 0

    # Load codes
    origcodes, track_ids, clique_ids = CO.load_codes(origcodes_f, -1, 30)

    while k < N:
        # randint's upper bound is exclusive, matching the old inclusive
        # random_integers(0, len - 1) range.
        track_idx = np.random.randint(0, len(track_ids))
        while track_ids[track_idx] == -2:  # -2 marks an already-used track
            track_idx = np.random.randint(0, len(track_ids))
        code = origcodes[track_idx]
        if code is not None:
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            codes[k] = code
            # Mark as used so the track is not sampled again.
            track_ids[track_idx] = -2
            k += 1
            # Log only after actual progress to avoid repeating the same
            # message while None codes are being skipped.
            if k % 100 == 0:
                logger.info("----Computing features %.1f%%" %
                            (k / float(N) * 100))

    # Remove rows that were never filled (still NaN).
    nan_idx = np.unique(np.where(np.isnan(codes))[0])
    codes = np.delete(codes, nan_idx, axis=0)

    # Fit one PCA per requested number of components.
    res = []
    for c in pca_components:
        pca = PCA(n_components=c)
        pca.fit(codes)
        res.append(pca)

    # Save result
    save_pickle(res, outpca)
def fit_LDA_filter(maindir, d, codes_f, N=9000, n=9, pca=None, pca_n=0,
                   norm=False, outlda="LDAs.pk", lda_components=None):
    """Fit LDAs on a filtered dataset where each clique has >= n tracks.

    First gathers codes from SHS train cliques containing at least ``n``
    tracks, then pads with random MSD tracks under fresh synthetic labels
    until ``N`` codes are collected. The codes and labels are pickled and
    handed to ``fit_LDA_from_codes_file``.

    Parameters
    ----------
    maindir : str
        Main data directory (kept for interface compatibility; unused here).
    d : str
        Path passed to ``load_transform``.
    codes_f : str
        Identifier of the MSD codes handed to ``CO.load_codes``.
    N : int
        Total number of codes to collect.
    n : int
        Minimum clique size for a train clique to be included.
    pca : str, optional
        Pickle file with a list of fitted PCA models; if given, codes are
        projected through ``P[pca_n]`` before use.
    pca_n : int
        Index of the PCA model inside the ``pca`` pickle.
    norm : bool
        If True, chroma-normalize each code.
    outlda : str
        Output pickle file for the fitted LDA models.
    lda_components : list of int, optional
        Component counts for the LDAs. Defaults to [50, 100, 200].
    """
    import cover_id_test as CO

    # Avoid a shared mutable default argument.
    if lda_components is None:
        lda_components = [50, 100, 200]

    clique_test = load_pickle("SHS/clique_ids_test.pk")
    clique_train = load_pickle("SHS/clique_ids_train.pk")
    track_test = load_pickle("SHS/track_ids_test.pk")
    track_train = load_pickle("SHS/track_ids_train.pk")

    # Resulting codes and their clique labels.
    codes = []
    labels = []

    if pca is not None:
        P = load_pickle(pca)
        P = P[pca_n]

    C = CO.load_codes(codes_f, -1, 30)
    C = C[0]

    # Load the codes from the training set
    codestrain = load_pickle("results/codes-shs-train-k2045.pk")

    clique_idx = 0
    # Synthetic label ids for MSD padding tracks, above any real clique id.
    label_id = 1000001

    # NOTE(review): result unused below — kept for any load-time side effects.
    td = load_transform(d)

    while len(codes) < N:
        # First phase: pick tracks from the train set belonging to a
        # clique that has at least n tracks.
        if clique_idx < len(clique_train):
            while clique_idx < len(clique_train) and \
                    len(np.where(clique_train == clique_train[clique_idx])[0]) < n:
                clique_idx += 1

            if clique_idx < len(clique_train) and clique_train[clique_idx] != -2:
                for clique_id in \
                        np.where(clique_train == clique_train[clique_idx])[0]:
                    code = codestrain[clique_id]
                    # Mark as used even when the code is missing.
                    clique_train[clique_id] = -2
                    # Skip missing codes *before* normalizing — calling
                    # chromnorm on None would crash.
                    if code is None:
                        continue
                    if norm:
                        code = dan_tools.chromnorm(
                            code.reshape(code.shape[0], 1)).squeeze()
                    if pca is not None:
                        code = P.transform(code)
                    codes.append(code)
                    labels.append(clique_idx)

            clique_idx += 1

        # Second phase: pick random MSD tracks and assign fresh labels.
        else:
            # randint's upper bound is exclusive, matching the old inclusive
            # random_integers(0, len(C) - 1) range.
            clique_id = np.random.randint(0, len(C))
            while np.any(np.equal(C[clique_id], None)) or \
                    clique_test[clique_id] == -2:
                clique_id = np.random.randint(0, len(C))
            code = C[clique_id]
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            if pca is not None:
                code = P.transform(code)
            codes.append(code)
            labels.append(label_id)
            label_id += 1
            clique_test[clique_id] = -2

        # Parenthesized form works under both Python 2 and 3.
        print("Computed %d out of %d codes" % (len(codes), N))

    codes_pk = "codes_filter_LDA_PCA.pk"
    cliques_pk = "cliques_filter_LDA_PCA.pk"
    save_pickle(codes, codes_pk)
    save_pickle(labels, cliques_pk)

    time.sleep(3)

    # Fit LDA and save model.
    fit_LDA_from_codes_file(codes_pk, cliques_pk, lda_components, outlda)
# Example #4 (scrape artifact "Beispiel #4" / vote count converted to a comment so the file parses)
def fit_LDA_filter(maindir,
                   d,
                   codes_f,
                   N=9000,
                   n=9,
                   pca=None,
                   pca_n=0,
                   norm=False,
                   outlda="LDAs.pk",
                   lda_components=None):
    """Fit LDAs on a filtered dataset where each clique has >= n tracks.

    First gathers codes from SHS train cliques containing at least ``n``
    tracks, then pads with random MSD tracks under fresh synthetic labels
    until ``N`` codes are collected. The codes and labels are pickled and
    handed to ``fit_LDA_from_codes_file``.

    Parameters
    ----------
    maindir : str
        Main data directory (kept for interface compatibility; unused here).
    d : str
        Path passed to ``load_transform``.
    codes_f : str
        Identifier of the MSD codes handed to ``CO.load_codes``.
    N : int
        Total number of codes to collect.
    n : int
        Minimum clique size for a train clique to be included.
    pca : str, optional
        Pickle file with a list of fitted PCA models; if given, codes are
        projected through ``P[pca_n]`` before use.
    pca_n : int
        Index of the PCA model inside the ``pca`` pickle.
    norm : bool
        If True, chroma-normalize each code.
    outlda : str
        Output pickle file for the fitted LDA models.
    lda_components : list of int, optional
        Component counts for the LDAs. Defaults to [50, 100, 200].
    """
    import cover_id_test as CO

    # Avoid a shared mutable default argument.
    if lda_components is None:
        lda_components = [50, 100, 200]

    clique_test = load_pickle("SHS/clique_ids_test.pk")
    clique_train = load_pickle("SHS/clique_ids_train.pk")
    track_test = load_pickle("SHS/track_ids_test.pk")
    track_train = load_pickle("SHS/track_ids_train.pk")

    # Resulting codes and their clique labels.
    codes = []
    labels = []

    if pca is not None:
        P = load_pickle(pca)
        P = P[pca_n]

    C = CO.load_codes(codes_f, -1, 30)
    C = C[0]

    # Load the codes from the training set
    codestrain = load_pickle("results/codes-shs-train-k2045.pk")

    clique_idx = 0
    # Synthetic label ids for MSD padding tracks, above any real clique id.
    label_id = 1000001

    # NOTE(review): result unused below — kept for any load-time side effects.
    td = load_transform(d)

    while len(codes) < N:
        # First phase: pick tracks from the train set belonging to a
        # clique that has at least n tracks.
        if clique_idx < len(clique_train):
            while clique_idx < len(clique_train) and \
                    len(np.where(clique_train == clique_train[clique_idx])[0]) < n:
                clique_idx += 1

            if clique_idx < len(clique_train) and \
                    clique_train[clique_idx] != -2:
                for clique_id in \
                        np.where(clique_train == clique_train[clique_idx])[0]:
                    code = codestrain[clique_id]
                    # Mark as used even when the code is missing.
                    clique_train[clique_id] = -2
                    # Skip missing codes *before* normalizing — calling
                    # chromnorm on None would crash.
                    if code is None:
                        continue
                    if norm:
                        code = dan_tools.chromnorm(
                            code.reshape(code.shape[0], 1)).squeeze()
                    if pca is not None:
                        code = P.transform(code)
                    codes.append(code)
                    labels.append(clique_idx)

            clique_idx += 1

        # Second phase: pick random MSD tracks and assign fresh labels.
        else:
            # randint's upper bound is exclusive, matching the old inclusive
            # random_integers(0, len(C) - 1) range.
            clique_id = np.random.randint(0, len(C))
            while np.any(np.equal(C[clique_id], None)) or \
                    clique_test[clique_id] == -2:
                clique_id = np.random.randint(0, len(C))
            code = C[clique_id]
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            if pca is not None:
                code = P.transform(code)
            codes.append(code)
            labels.append(label_id)
            label_id += 1
            clique_test[clique_id] = -2

        # Parenthesized form works under both Python 2 and 3.
        print("Computed %d out of %d codes" % (len(codes), N))

    codes_pk = "codes_filter_LDA_PCA.pk"
    cliques_pk = "cliques_filter_LDA_PCA.pk"
    save_pickle(codes, codes_pk)
    save_pickle(labels, cliques_pk)

    time.sleep(3)

    # Fit LDA and save model.
    fit_LDA_from_codes_file(codes_pk, cliques_pk, lda_components, outlda)