Example No. 1
def main(n_images, n_tissues, n_patches, patch_size, model_file_id):
    logger.info('Initializing cluster_classify script')
    dataset = Dataset(n_tissues=n_tissues, n_images=n_images)
    data = dataset.sample_data(patch_size, n_patches)
    patches, GTEx_IDs = data
    image_objs = [Image(x) for x in GTEx_IDs]

    dataset_name = ''.join([s for s in str(dataset) if s.isalnum()])
    features_ID = dataset_name + f'_{n_patches}_{patch_size}_{n_images}' \
                               + model_file_id

    features = generate_features(features_ID, patches, model_file_id)

    a_features, a_image_objs = aggregate_features(dataset_name, features,
                                                  image_objs, 'GTEx_IDs',
                                                  np.mean)

    # NOTE: this lookup referenced `aggregated_features`, which is never defined
    # in this scope and would raise a NameError; it appears to be leftover code
    # superseded by the aggregate_features() call above, so it is commented out.
    # a_features, a_image_objs = aggregated_features['GTEx_factor_IDs']['np.mean']

    lung_features, lung_image_objs = subselect_tissue(dataset_name, 'Lung',
                                                      features, image_objs)

    train_classifiers(dataset_name,
                      features_ID,
                      lung_features,
                      lung_image_objs,
                      'GTEx_IDs',
                      retrain=True)
Example No. 2
def run_naive_bayes_bow_vocabulary(nbr, str_list):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_bayers(nbr)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'],
                                   pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend([
        "NB BOW voc Avg f1: " + avg_f1.__str__(),
        "NB BOW voc Avg acc: " + avg_accuracy.__str__()
    ])
    print("Avg f1: " + avg_f1.__str__())
    print("Avg acc: " + avg_accuracy.__str__())
Example No. 3
def run_lp_bow_vocabulary(nbr, str_list, gamma):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = CountVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelPropagation(kernel='rbf',
                               gamma=gamma).fit(vectors.todense(),
                                                dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'],
                                   pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend([
        "RBF BOW voc Avg f1: " + avg_f1.__str__(),
        "RBF BOW voc Avg acc: " + avg_accuracy.__str__()
    ])
    print("Avg f1: " + avg_f1.__str__())
    print("Avg acc: " + avg_accuracy.__str__())
Example No. 4
def main(n_tissues, n_images, n_patches, patch_size, model_type, param_string):
    np.random.seed(42)
    os.makedirs('data/images', exist_ok=True)
    dataset = Dataset(n_tissues=n_tissues, n_images=n_images)

    logger.debug('Initializing download script')

    params = extract_params(param_string)
    params['patch_size'] = patch_size

    N = dataset.n_tissues * dataset.n_images * params['batch_size']

    data = dataset.sample_data(patch_size, int(n_patches))
    patches_data, imageIDs_data = data

    if model_type == 'concrete_vae':
        from dependencies.vae_concrete.vae_concrete import VAE
        m = VAE(latent_cont_dim=256)
        m.fit(patches_data, num_epochs=20)

    else:
        Model = eval(model_type)
        m = Model(inner_dim=params['inner_dim'])
        N = patches_data.shape[0]
        assert N == imageIDs_data.shape[0]
        p = np.random.permutation(N)
        patches_data, imageIDs_data = patches_data[p], imageIDs_data[p]

        m.train_on_data(patches_data, params)

        m.save()
Example No. 5
def main():
    logger.info('Initializing debug script')
    dataset = Dataset(n_tissues=6, n_images=10)
    data = dataset.sample_data(128, 50)
    patches_data, imageIDs_data = data
    for i in tqdm(range(len(imageIDs_data))):
        GTEx_ID = imageIDs_data[i]
        idx = i % 50
        scipy.misc.imsave(
            f'data/cellprofiler/patches/{i:04d}_{GTEx_ID}_{idx}.png',
            255 - patches_data[i])
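
scipy.misc.imsave has been removed from recent SciPy releases (deprecated since 1.0), so the snippet above needs an older SciPy to run. A minimal alternative sketch using imageio, assuming patches_data holds uint8 image arrays; it keeps the original's pixel inversion (255 - patch):

import imageio

# hypothetical replacement for scipy.misc.imsave on modern installations
for i, GTEx_ID in enumerate(imageIDs_data):
    idx = i % 50
    imageio.imwrite(
        f'data/cellprofiler/patches/{i:04d}_{GTEx_ID}_{idx}.png',
        255 - patches_data[i])
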
Example No. 6
def run_lp_bow_runtime_vocabulary(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_true(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = CountVectorizer(vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy/10
    avg_f1 = avg_f1/10
    str_list.extend(["KNN BOW runtime voc Avg f1: " + avg_f1.__str__(), "KNN BOW runtime vod Avg acc: "
                     + avg_accuracy.__str__()])
    print("Avg f1: " + avg_f1.__str__())
    print("Avg acc: " + avg_accuracy.__str__())
Example No. 7
def run_naive_bayes_tfidf_runtime_vocabulary(nbr, str_list):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.load_preprocessed(categories)
        dataset.split_train_bayers(nbr)
        print_v2_test_docs_vocabulary_labeled(categories)
        dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)
        vectorizer = TfidfVectorizer(
            vocabulary=Vocabulary.get_vocabulary(categories))
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = MultinomialNB().fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'],
                                   pred,
                                   average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy / 10
    avg_f1 = avg_f1 / 10
    str_list.extend([
        "NB TF-IDF runtime voc Avg f1: " + avg_f1.__str__(),
        "NB TF-IDF runtime voc Avg acc: " + avg_accuracy.__str__()
    ])
    print("Avg f1: " + avg_f1.__str__())
    print("Avg acc: " + avg_accuracy.__str__())
Example No. 8
def main(n_tissues, n_images, n_patches, patch_size, model_file):
    logger.info('Initializing inspect script')
    dataset = Dataset(n_tissues=n_tissues, n_images=n_images)
    data = dataset.sample_data(patch_size, 15)
    patches_data, imageIDs_data = data
    K = 5
    N = patches_data.shape[0]
    idx = np.random.choice(range(N), K)
    patches = patches_data[idx]
    if model_file:
        # fig, ax = plt.subplots(
        #     2, K, figsize=(8, 3)
        # )
        fig = plt.figure()
        figsize = 128
        figure = np.zeros((figsize * 2, figsize * K, 3))
        model = load_model(MODEL_PATH + f'{model_file}.pkl')
        decoded_patches = model.predict(patches)
        fig.suptitle(model_file, fontsize=10)

        for i in range(K):
            figure[0 * figsize:(0 + 1) * figsize,
                   i * figsize:(i + 1) * figsize, :] = deprocess(patches[i])
            figure[1 * figsize:(1 + 1) * figsize, i * figsize:(i + 1) *
                   figsize, :] = deprocess(decoded_patches[i])
            # ax[0][i].imshow(deprocess(patches[i]))
            # ax[0][i].axis('off')
            # ax[1][i].imshow(deprocess(decoded_patches[i]))
            # ax[1][i].axis('off')
        plt.imshow(figure)
        fig.savefig(f'figures/{model_file}.png', bbox_inches='tight')
    else:
        model_files = sorted(os.listdir(MODEL_PATH))
        n = len(model_files)

        fig, ax = plt.subplots(2 * n, K, figsize=(8, 4 * n))
        for (k, model_file) in enumerate(model_files):
            model_name = model_file.replace('.pkl', '')
            model = load_model(MODEL_PATH + f'{model_name}.pkl')
            logger.debug(f'Generating decodings for {model_file}')
            decoded_patches = model.predict(patches)
            for i in range(K):
                ax[2 * k][i].imshow(deprocess(patches[i]))
                ax[2 * k][i].axis('off')
                if i == int(K / 2):
                    ax[2 * k][i].set_title(model_file)
                ax[2 * k + 1][i].imshow(deprocess(decoded_patches[i]))
                ax[2 * k + 1][i].axis('off')
        plt.savefig('figures/all_models.png')
Example No. 9
def run_lp_tfidf(nbr, str_list, neighbors):
    i = 0
    avg_f1 = 0
    avg_accuracy = 0
    while i < 10:
        dataset = Dataset(categories)
        dataset.split_train_true(nbr)
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform(dataset.train['data'])

        clf = LabelSpreading(kernel='knn', n_neighbors=neighbors).fit(vectors.todense(), dataset.train['target'])
        test_vec = vectorizer.transform(dataset.test['data'])
        pred = clf.predict(test_vec.todense())
        avg_f1 += metrics.f1_score(dataset.test['target'], pred, average='macro')
        avg_accuracy += clf.score(test_vec.todense(), dataset.test['target'])
        i += 1
    avg_accuracy = avg_accuracy/10
    avg_f1 = avg_f1/10
    str_list.extend(["KNN TF-IDF Avg f1: " + avg_f1.__str__(), "KNN TF-IDF Avg acc: " + avg_accuracy.__str__()])
    print("Avg f1: " + avg_f1.__str__())
    print("Avg acc: " + avg_accuracy.__str__())
Example No. 10
def process(categories):
    i = 0
    while i < len(categories):
        trainingdata = fetch_20newsgroups(subset='train',
                                          remove=('headers', 'footers',
                                                  'quotes'),
                                          categories=[categories[i]])
        testdata = fetch_20newsgroups(subset='test',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=[categories[i]])

        lemmatize_newsgroup(trainingdata, testdata, categories[i])
        remove_stopwords(trainingdata)
        remove_stopwords(testdata)
        print_docs(trainingdata, testdata, categories[i])
        i += 1
    dataset = Dataset(categories)
    dataset.load_preprocessed_V1(categories)
    remove_frequent_and_infrequent_words(dataset.train)
    print_docs_reduced_feature_count(dataset, categories)
    print_v2_docs(categories)
    print_v2_test_docs_vocabulary(categories)
Example No. 11
def print_v2_test_docs_vocabulary_labeled(categories):
    i = 0
    removed_test = 0
    print("Printing docs...")
    while i < len(categories):
        with open(
                '../assets/20newsgroups/test2vocabulary_labeled/newsgroups_test_'
                + categories[i] + '.txt', 'w') as f:
            # read the raw test file with a context manager so the handle is closed
            with open('../assets/20newsgroups/test/newsgroups_test_' +
                      categories[i] + '.txt') as src:
                lines = [line.rstrip('\n') for line in src]
            j = 0
            dataset = Dataset(categories)
            vectorizer = CountVectorizer(
                vocabulary=voc.get_vocabulary_only_labeled(categories))
            vectors = vectorizer.fit_transform(dataset.train['data'])
            vocabulary = vectorizer.vocabulary_
            while j < len(lines):
                lines[j] = re.sub(r'[^\w]', " ", lines[j])
                lines[j] = re.sub(r'\b[a-zA-Z]\b', " ", lines[j])
                lines[j] = re.sub(r'[ \t]+', " ",
                                  lines[j])  # remove extra space or tab
                lines[j] = lines[j].strip() + "\n"
                remove_doc = 1
                words = lines[j].split()
                for word in words:
                    if word in vocabulary.keys():
                        remove_doc = 0
                        break
                size = len(lines[j])
                # lines[j] = lines[j][1:size]
                if len(lines[j]) > 4 and not remove_doc:
                    f.write(lines[j])
                else:
                    removed_test += 1
                j += 1
        i += 1
    print("Printing finished")
    print("Removed testing doc:", removed_test)
Example No. 12
def main(n_images, n_tissues):
    os.makedirs('data/images', exist_ok=True)
    logger.info('Initializing download script')
    dataset = Dataset(n_images=n_images, n_tissues=n_tissues)
    dataset.download()
Example No. 13
def main(n_images, n_tissues):
    os.makedirs('data/patches', exist_ok=True)
    logger.info('Initializing patches script')
    dataset = Dataset(n_images=n_images, n_tissues=n_tissues)
    dataset.get_patchcoordfiles()
Example No. 14
              'sci.electronics',
              'sci.med',
              'sci.space',
              'soc.religion.christian',
              'talk.politics.guns',
              'talk.politics.mideast',
              'talk.politics.misc',
              'talk.religion.misc']
"""

categories = [
    'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey'
]

# initialize dataset
dataset = Dataset(categories)
dataset.load_preprocessed(categories)
dataset.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset.load_preprocessed_test_vocabulary_labeled_in_use(categories)

dataset_knn = Dataset(categories)
dataset_knn.load_preprocessed_vocabulary_in_use(categories)
dataset_knn.split_train_true(10)
print_v2_test_docs_vocabulary_labeled(categories)
dataset_knn.load_preprocessed_test_vocabulary_labeled_in_use(categories)

# feature extraction
vectorizer_rbf = TfidfVectorizer(
    vocabulary=voc.get_vocabulary_only_labeled(categories))
vectorizer_knn = TfidfVectorizer(
Example No. 15
              'sci.med',
              'sci.space',
              'soc.religion.christian',
              'talk.politics.guns',
              'talk.politics.mideast',
              'talk.politics.misc',
              'talk.religion.misc']
"""

categories = ['rec.autos',
              'rec.motorcycles',
              'rec.sport.baseball',
              'rec.sport.hockey']

# initialize dataset
dataset_rbf = Dataset(categories)
dataset_rbf.split_train_true(100)
dataset_knn = Dataset(categories)
dataset_knn.split_train_true(100)

# feature extraction
vectorizer_rbf = TfidfVectorizer()
vectorizer_knn = TfidfVectorizer()
vectors_rbf = vectorizer_rbf.fit_transform(dataset_rbf.train['data'])
vectors_knn = vectorizer_knn.fit_transform(dataset_knn.train['data'])

# classification
# use max_iter=10 when 20 categories
clf_rbf = LabelPropagation(kernel='rbf', gamma=5).fit(vectors_rbf.todense(), dataset_rbf.train['target'])
clf_knn = LabelSpreading(kernel='knn', n_neighbors=10).fit(vectors_knn.todense(), dataset_knn.train['target'])
test_vec_rbf = vectorizer_rbf.transform(dataset_rbf.test['data'])
Example No. 16
                                    'credit_card_balance.csv.zip': 'mean',
                                    'installments_payments.csv.zip': 'min',
                                    'POS_CASH_balance.csv.zip': 'mean',
                                    'bureau.csv.zip': 'max'
                                })

df_test = proj_utils.load_data(train=False,
                               supp_dict={
                                   'previous_application.csv.zip': 'max',
                                   'credit_card_balance.csv.zip': 'mean',
                                   'installments_payments.csv.zip': 'min',
                                   'POS_CASH_balance.csv.zip': 'mean',
                                   'bureau.csv.zip': 'max'
                               })

data = Dataset(df_train, df_test, 'TARGET')

# Clean and transform data
data.preprocess()

# Determine initial feature importances
data.ae_train_model(model=LGBMClassifier())

# Auto-discover ratios weighted by feature importance
data.autoengineer_ratios()

#############################################################
models = {}
for m in M:
    # Define a model
    models[m] = {}