def fit_PCA(maindir, d, origcodes_f="msd_codes_k2045", outpca="PCA-codes.pk",
            N=50000, norm=False, pca_components=None):
    """Fits PCA transformations from N randomly sampled codes.

    Samples N distinct tracks from the original codes file, optionally
    chroma-normalizes each code, and fits one PCA model per entry of
    pca_components. The list of fitted models is pickled to outpca.

    NOTE(review): this file contains a second, duplicate definition of
    fit_PCA — at import time the later one wins; consider removing one.

    Parameters
    ----------
    maindir : str
        Dataset root directory (kept for interface compatibility; unused here).
    d : str
        Path passed to load_transform.
    origcodes_f : str
        File containing the original codes to sample from.
    outpca : str
        Output pickle for the list of fitted PCA models.
    N : int
        Number of codes to sample.
    norm : bool
        If True, chroma-normalize each code before fitting.
    pca_components : list of int or None
        Components per PCA model; defaults to [50, 100, 200, 500, 1000].
        (None default avoids the shared-mutable-default pitfall.)
    """
    import cover_id_test as CO
    if pca_components is None:
        pca_components = [50, 100, 200, 500, 1000]
    logger = configure_logger()
    td = load_transform(d)  # NOTE(review): result unused; kept in case loading has side effects
    codes = np.ones((N, 2045)) * np.nan
    # Load codes
    origcodes, track_ids, clique_ids = CO.load_codes(origcodes_f, -1, 30)
    k = 0
    while k < N:
        # Sample a random, not-yet-used track (used tracks are marked -2).
        # randint(0, n) draws from [0, n), matching the old
        # random_integers(0, n-1), which is removed in modern numpy.
        track_idx = np.random.randint(0, len(track_ids))
        while track_ids[track_idx] == -2:
            track_idx = np.random.randint(0, len(track_ids))
        code = origcodes[track_idx]
        # Mark as used unconditionally, so tracks with a None code are not
        # re-sampled forever (the original only marked usable codes, which
        # could loop indefinitely once only None codes remain).
        track_ids[track_idx] = -2
        if code is not None:
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            codes[k] = code
            k += 1
            if k % 100 == 0:
                logger.info("----Computing features %.1f%%" %
                            (k / float(N) * 100))
    # Drop any rows that were never filled (defensive; should be none)
    nan_idx = np.unique(np.where(np.isnan(codes))[0])
    codes = np.delete(codes, nan_idx, axis=0)
    # Fit one PCA per requested number of components
    res = []
    for c in pca_components:
        pca = PCA(n_components=c)
        pca.fit(codes)
        res.append(pca)
    # Save result
    save_pickle(res, outpca)
def fit_PCA(maindir, d, origcodes_f="msd_codes_k2045", outpca="PCA-codes.pk",
            N=50000, norm=False, pca_components=None):
    """Fits PCA transformations from N randomly sampled codes.

    Samples N distinct tracks from the original codes file, optionally
    chroma-normalizes each code, and fits one PCA model per entry of
    pca_components. The list of fitted models is pickled to outpca.

    NOTE(review): this is a duplicate definition of fit_PCA within the same
    file — at import time the later one wins; consider removing one.

    Parameters
    ----------
    maindir : str
        Dataset root directory (kept for interface compatibility; unused here).
    d : str
        Path passed to load_transform.
    origcodes_f : str
        File containing the original codes to sample from.
    outpca : str
        Output pickle for the list of fitted PCA models.
    N : int
        Number of codes to sample.
    norm : bool
        If True, chroma-normalize each code before fitting.
    pca_components : list of int or None
        Components per PCA model; defaults to [50, 100, 200, 500, 1000].
        (None default avoids the shared-mutable-default pitfall.)
    """
    import cover_id_test as CO
    if pca_components is None:
        pca_components = [50, 100, 200, 500, 1000]
    logger = configure_logger()
    td = load_transform(d)  # NOTE(review): result unused; kept in case loading has side effects
    codes = np.ones((N, 2045)) * np.nan
    # Load codes
    origcodes, track_ids, clique_ids = CO.load_codes(origcodes_f, -1, 30)
    k = 0
    while k < N:
        # Sample a random, not-yet-used track (used tracks are marked -2).
        # randint(0, n) draws from [0, n), matching the old
        # random_integers(0, n-1), which is removed in modern numpy.
        track_idx = np.random.randint(0, len(track_ids))
        while track_ids[track_idx] == -2:
            track_idx = np.random.randint(0, len(track_ids))
        code = origcodes[track_idx]
        # Mark as used unconditionally, so tracks with a None code are not
        # re-sampled forever (the original only marked usable codes, which
        # could loop indefinitely once only None codes remain).
        track_ids[track_idx] = -2
        if code is not None:
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            codes[k] = code
            k += 1
            if k % 100 == 0:
                logger.info("----Computing features %.1f%%" %
                            (k / float(N) * 100))
    # Drop any rows that were never filled (defensive; should be none)
    nan_idx = np.unique(np.where(np.isnan(codes))[0])
    codes = np.delete(codes, nan_idx, axis=0)
    # Fit one PCA per requested number of components
    res = []
    for c in pca_components:
        pca = PCA(n_components=c)
        pca.fit(codes)
        res.append(pca)
    # Save result
    save_pickle(res, outpca)
def fit_LDA_filter(maindir, d, codes_f, N=9000, n=9, pca=None, pca_n=0,
                   norm=False, outlda="LDAs.pk", lda_components=None):
    """Fits LDA models on a filtered version of the dataset.

    Collects N codes: first from SHS training cliques that contain at least
    n tracks (the clique index is the class label), then — once the train
    set is exhausted — from random MSD tracks, each assigned a fresh
    singleton label starting at 1000001. The collected codes and labels are
    pickled and fed to fit_LDA_from_codes_file.

    NOTE(review): this file contains a second, duplicate definition of
    fit_LDA_filter — at import time the later one wins; consider removing one.

    Parameters
    ----------
    maindir : str
        Dataset root directory (kept for interface compatibility; unused here).
    d : str
        Path passed to load_transform.
    codes_f : str
        File with the MSD codes used as filler tracks.
    N : int
        Total number of codes to collect.
    n : int
        Minimum clique size for a training clique to be used.
    pca : str or None
        Optional pickle with a list of fitted PCA models to project codes.
    pca_n : int
        Index of the PCA model to use within that list.
    norm : bool
        If True, chroma-normalize each code.
    outlda : str
        Output pickle for the fitted LDA models.
    lda_components : list of int or None
        Components per LDA model; defaults to [50, 100, 200].
        (None default avoids the shared-mutable-default pitfall.)
    """
    import cover_id_test as CO
    if lda_components is None:
        lda_components = [50, 100, 200]
    clique_test = load_pickle("SHS/clique_ids_test.pk")
    clique_train = load_pickle("SHS/clique_ids_train.pk")
    # (removed loads of SHS/track_ids_{test,train}.pk — they were never used)
    # Result accumulators
    codes = []
    labels = []
    P = None
    if pca is not None:
        P = load_pickle(pca)[pca_n]
    C = CO.load_codes(codes_f, -1, 30)[0]
    # Load the codes from the training set
    codestrain = load_pickle("results/codes-shs-train-k2045.pk")
    td = load_transform(d)  # NOTE(review): result unused; kept in case loading has side effects
    clique_idx = 0
    label_id = 1000001  # fresh labels for MSD tracks, disjoint from clique indices
    while len(codes) < N:
        if clique_idx < len(clique_train):
            # Advance to the next training clique with at least n tracks
            while clique_idx < len(clique_train) and \
                    len(np.where(clique_train == clique_train[clique_idx])[0]) < n:
                clique_idx += 1
            if clique_idx < len(clique_train) and clique_train[clique_idx] != -2:
                for clique_id in \
                        np.where(clique_train == clique_train[clique_idx])[0]:
                    code = codestrain[clique_id]
                    clique_train[clique_id] = -2  # mark as used
                    # Check for missing codes BEFORE any processing: the
                    # original normalized first, which raised AttributeError
                    # on None codes when norm=True.
                    if code is None:
                        continue
                    if norm:
                        code = dan_tools.chromnorm(
                            code.reshape(code.shape[0], 1)).squeeze()
                    if P is not None:
                        code = P.transform(code)
                    codes.append(code)
                    labels.append(clique_idx)
            clique_idx += 1
        # Pick random tracks from the MSD and assign new labels
        else:
            # randint(0, n) draws from [0, n), matching the old
            # random_integers(0, n-1), which is removed in modern numpy.
            clique_id = np.random.randint(0, len(C))
            while np.any(np.equal(C[clique_id], None)) or \
                    clique_test[clique_id] == -2:
                clique_id = np.random.randint(0, len(C))
            code = C[clique_id]
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            if P is not None:
                code = P.transform(code)
            codes.append(code)
            labels.append(label_id)
            label_id += 1
            clique_test[clique_id] = -2
        print("Computed %d out of %d codes" % (len(codes), N))
    codes_pk = "codes_filter_LDA_PCA.pk"
    cliques_pk = "cliques_filter_LDA_PCA.pk"
    save_pickle(codes, codes_pk)
    save_pickle(labels, cliques_pk)
    time.sleep(3)
    # fit LDA and save model
    fit_LDA_from_codes_file(codes_pk, cliques_pk, lda_components, outlda)
def fit_LDA_filter(maindir, d, codes_f, N=9000, n=9, pca=None, pca_n=0,
                   norm=False, outlda="LDAs.pk", lda_components=None):
    """Fits LDA models on a filtered version of the dataset.

    Collects N codes: first from SHS training cliques that contain at least
    n tracks (the clique index is the class label), then — once the train
    set is exhausted — from random MSD tracks, each assigned a fresh
    singleton label starting at 1000001. The collected codes and labels are
    pickled and fed to fit_LDA_from_codes_file.

    NOTE(review): this is a duplicate definition of fit_LDA_filter within
    the same file — at import time the later one wins; consider removing one.

    Parameters
    ----------
    maindir : str
        Dataset root directory (kept for interface compatibility; unused here).
    d : str
        Path passed to load_transform.
    codes_f : str
        File with the MSD codes used as filler tracks.
    N : int
        Total number of codes to collect.
    n : int
        Minimum clique size for a training clique to be used.
    pca : str or None
        Optional pickle with a list of fitted PCA models to project codes.
    pca_n : int
        Index of the PCA model to use within that list.
    norm : bool
        If True, chroma-normalize each code.
    outlda : str
        Output pickle for the fitted LDA models.
    lda_components : list of int or None
        Components per LDA model; defaults to [50, 100, 200].
        (None default avoids the shared-mutable-default pitfall.)
    """
    import cover_id_test as CO
    if lda_components is None:
        lda_components = [50, 100, 200]
    clique_test = load_pickle("SHS/clique_ids_test.pk")
    clique_train = load_pickle("SHS/clique_ids_train.pk")
    # (removed loads of SHS/track_ids_{test,train}.pk — they were never used)
    # Result accumulators
    codes = []
    labels = []
    P = None
    if pca is not None:
        P = load_pickle(pca)[pca_n]
    C = CO.load_codes(codes_f, -1, 30)[0]
    # Load the codes from the training set
    codestrain = load_pickle("results/codes-shs-train-k2045.pk")
    td = load_transform(d)  # NOTE(review): result unused; kept in case loading has side effects
    clique_idx = 0
    label_id = 1000001  # fresh labels for MSD tracks, disjoint from clique indices
    while len(codes) < N:
        if clique_idx < len(clique_train):
            # Advance to the next training clique with at least n tracks
            while clique_idx < len(clique_train) and \
                    len(np.where(clique_train == clique_train[clique_idx])[0]) < n:
                clique_idx += 1
            if clique_idx < len(clique_train) and clique_train[clique_idx] != -2:
                for clique_id in \
                        np.where(clique_train == clique_train[clique_idx])[0]:
                    code = codestrain[clique_id]
                    clique_train[clique_id] = -2  # mark as used
                    # Check for missing codes BEFORE any processing: the
                    # original normalized first, which raised AttributeError
                    # on None codes when norm=True.
                    if code is None:
                        continue
                    if norm:
                        code = dan_tools.chromnorm(
                            code.reshape(code.shape[0], 1)).squeeze()
                    if P is not None:
                        code = P.transform(code)
                    codes.append(code)
                    labels.append(clique_idx)
            clique_idx += 1
        # Pick random tracks from the MSD and assign new labels
        else:
            # randint(0, n) draws from [0, n), matching the old
            # random_integers(0, n-1), which is removed in modern numpy.
            clique_id = np.random.randint(0, len(C))
            while np.any(np.equal(C[clique_id], None)) or \
                    clique_test[clique_id] == -2:
                clique_id = np.random.randint(0, len(C))
            code = C[clique_id]
            if norm:
                code = dan_tools.chromnorm(
                    code.reshape(code.shape[0], 1)).squeeze()
            if P is not None:
                code = P.transform(code)
            codes.append(code)
            labels.append(label_id)
            label_id += 1
            clique_test[clique_id] = -2
        print("Computed %d out of %d codes" % (len(codes), N))
    codes_pk = "codes_filter_LDA_PCA.pk"
    cliques_pk = "cliques_filter_LDA_PCA.pk"
    save_pickle(codes, codes_pk)
    save_pickle(labels, cliques_pk)
    time.sleep(3)
    # fit LDA and save model
    fit_LDA_from_codes_file(codes_pk, cliques_pk, lda_components, outlda)