Esempio n. 1
0
def create_and_save_partitions(dataset,
                               study_name,
                               meta_label,
                               test_groups,
                               pretest_groups,
                               valid_groups,
                               save_text_files=True):
    """Partition ``dataset`` by row-metadata group and save every partition.

    Rows whose ``dataset.rowmeta[meta_label]`` value is a null placeholder
    are discarded first.  Rows matching ``test_groups``, then
    ``pretest_groups``, then ``valid_groups`` are popped out of ``dataset``
    in that order; whatever is left is the training partition.

    ``dataset`` is modified in place and is itself saved as the training
    partition.  Nothing is returned.

    Output is written under
    ``../partitioned_data/<study_name>/<orientation>`` where orientation is
    ``'skinny'`` if the dataset has more rows than columns (measured before
    any row is discarded) and ``'fat'`` otherwise.  Existing output folders
    are reused, so the function can be rerun for the same study.

    Args:
        dataset: object exposing ``shape``, ``rowmeta``, ``discard`` and
            ``pop`` (presumably a datasetIO datamatrix -- confirm).
        study_name: name used as the first folder level of the output path.
        meta_label: key into ``dataset.rowmeta`` holding the group labels.
        test_groups: labels whose rows go to the test partition.
        pretest_groups: labels whose rows go to the pretest partition.
        valid_groups: labels whose rows go to the validation partition.
        save_text_files: when true, also write each partition with
            ``datasetIO.save_splitdata`` into its own sub-folder.
    """
    # determine dataset orientation
    orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat'

    # discard null categories
    null_labels = ['-666', '', 'NA', 'N/A', 'na', 'n/a', 'NaN', 'NAN', 'nan']
    tobediscarded = np.in1d(dataset.rowmeta[meta_label], null_labels)
    dataset.discard(tobediscarded, 0)
    print('discarding {0!s} samples...'.format(tobediscarded.sum()),
          flush=True)
    print(dataset, flush=True)

    # partition the data; the order matters because each pop is evaluated
    # against the rows still present in ``dataset`` at that point
    partitions = []
    for name, groups in (('test', test_groups),
                         ('pretest', pretest_groups),
                         ('valid', valid_groups)):
        tobepopped = np.in1d(dataset.rowmeta[meta_label], groups)
        partition = dataset.pop(tobepopped, 0)
        print('    {0}'.format(name.upper()), flush=True)
        print(partition, flush=True)
        partitions.append((name, partition))
    partitions.append(('train', dataset))
    print('    TRAIN', flush=True)
    print(dataset, flush=True)

    # save data partitions
    savefolder = '../partitioned_data/{0}/{1}'.format(study_name, orientation)
    print('    SAVING PARTITIONS TO {0}'.format(savefolder), flush=True)
    # exist_ok avoids a FileExistsError when the function is rerun
    os.makedirs(savefolder, exist_ok=True)
    for name, partition in partitions:
        datasetIO.save_datamatrix(
            '{0}/{1}.pickle'.format(savefolder, name), partition)
    if save_text_files:
        for name, partition in partitions:
            textfolder = '{0}/{1}'.format(savefolder, name)
            os.makedirs(textfolder, exist_ok=True)
            datasetIO.save_splitdata(textfolder, partition)
Esempio n. 2
0
# drop the rows flagged by the preceding (not shown) step
gene_atb.discard(tobediscarded, 0)
print(gene_atb, flush=True)

# keep only genes whose locus type is in the whitelist below; every other
# locus type (pseudogenes included) is discarded
print('discarding pseudogenes data...', flush=True)
print(np.unique(gene_atb.rowmeta['locus_type']).tolist())
kept_locus_types = [
    'RNA, long non-coding',
    'RNA, micro',
    'T cell receptor gene',
    'gene with protein product',
    'immunoglobulin gene',
    'protocadherin',
]
tobediscarded = np.in1d(
    gene_atb.rowmeta['locus_type'], kept_locus_types, invert=True)
gene_atb.discard(tobediscarded, 0)
print(gene_atb, flush=True)

# attach the phenotype name of every column label; ids without a known
# name get the placeholder string 'nan'
print('adding mouse phenotype metadata data...', flush=True)
with open('../../original_data/impc/mpid_name_dict.pickle', 'rb') as fr:
    mpid_name = pickle.load(fr)
phenotype_names = [mpid_name.get(mpid, 'nan')
                   for mpid in gene_atb.columnlabels]
gene_atb.columnmeta['mp_name'] = np.array(phenotype_names, dtype='object')
missing_count = (gene_atb.columnmeta['mp_name'] == 'nan').sum()
print('missing phenotype names for {0!s} phenotype ids'.format(missing_count),
      flush=True)

# write the prepared matrix in both formats, then mirror everything into
# the input-data folder
print('saving prepared data...', flush=True)
gene_atb.matrixname += '_prepared'
pickle_path = '../../original_data/impc/gene_phenotype_impc_trimmed_thresholded_prepared.pickle'
textgz_path = '../../original_data/impc/gene_phenotype_impc_trimmed_thresholded_prepared.txt.gz'
for prepared_path in (pickle_path, textgz_path):
    datasetIO.save_datamatrix(prepared_path, gene_atb)
savefolder = '../../input_data/impc_binary'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, gene_atb)
shutil.copyfile(pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(textgz_path, '{0}/datamatrix.txt.gz'.format(savefolder))

print('done.', flush=True)
Esempio n. 3
0
        # overwrite every entry of this metadata field whose value is listed
        # in low_freq_uvals with the placeholder string 'NA'
        # (presumably a NumPy array, so the assignment is in place -- confirm)
        snp_genome.columnmeta[metalabel][np.in1d(
            snp_genome.columnmeta[metalabel], low_freq_uvals)] = 'NA'

# write the prepared matrix in both formats, then mirror everything into
# the input-data folder
print('saving prepared data...', flush=True)
snp_genome.matrixname += '_prepared'
pickle_path = '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle'
textgz_path = '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz'
for prepared_path in (pickle_path, textgz_path):
    datasetIO.save_datamatrix(prepared_path, snp_genome)
savefolder = '../../input_data/1000genomes_genomes'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, snp_genome)
shutil.copyfile(pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(textgz_path, '{0}/datamatrix.txt.gz'.format(savefolder))

print('done.', flush=True)

# visualization
# project the matrix onto its first two principal components; the model is
# fitted and applied on the same data
pca_model = PCA(n_components=2).fit(snp_genome.matrix)
pca_matrix = pca_model.transform(snp_genome.matrix)
fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
# each value is divided by the matching figsize dimension: offsets of 0.15
# and extents of 4.0 in figsize units, given as fractions of the figure
# (assumes plt is matplotlib.pyplot, i.e. [left, bottom, width, height])
ax.set_position([0.15 / 6.5, 0.15 / 4.3, 4.0 / 6.5, 4.0 / 4.3])
ax.plot(pca_matrix[:, 0],
dataset = datasetIO.load_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle')
print(dataset, flush=True)

# drop the rows whose 'irrecist' metadata equals 'stable disease'
print('discarding samples...', flush=True)
is_stable_disease = dataset.rowmeta['irrecist'] == 'stable disease'
dataset.discard(is_stable_disease, 0)
print(dataset, flush=True)

# write the filtered matrix in both formats, then mirror everything into
# the input-data folder
print('saving data...', flush=True)
pickle_path = '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle'
textgz_path = '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz'
for filtered_path in (pickle_path, textgz_path):
    datasetIO.save_datamatrix(filtered_path, dataset)
savefolder = '../../input_data/pratfelip_transposed_plus_clinical_no_stabledisease'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
shutil.copyfile(pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(textgz_path, '{0}/datamatrix.txt.gz'.format(savefolder))

# read the clinical + deconvolution matrix
print('loading dataset...', flush=True)
clinical_deconv_path = '../../original_data/pratfelip_symlnk/patient_ft_pratfelip_only_clinical_and_deconv.pickle'
dataset = datasetIO.load_datamatrix(clinical_deconv_path)
print(dataset, flush=True)

# drop the rows whose 'irrecist' metadata equals 'stable disease'
print('discarding samples...', flush=True)
is_stable_disease = dataset.rowmeta['irrecist'] == 'stable disease'
dataset.discard(is_stable_disease, 0)
print(dataset, flush=True)

# announce the save step
print('saving data...', flush=True)
Esempio n. 5
0
    # Report the size of the gene -> cell-type mapping, attach each column's
    # cell type to atb_gene, then save the filtered matrix in every format.
    # NOTE(review): the enclosing loop/function header is outside this view;
    # rgep_name, gene_cell and atb_gene are defined above this fragment.
    print('rgep_genes: {0!s}'.format(len(gene_cell)), flush=True)
    # unlike the other prints in this fragment, this one is not flushed
    print(atb_gene)

    # add cell type metadata
    print('adding cell type metadata...', flush=True)
    # direct indexing: every symbol must be present in gene_cell, otherwise
    # the lookup raises (KeyError if gene_cell is a dict -- TODO confirm)
    atb_gene.columnmeta['rgep_cell_type'] = np.array(
        [gene_cell[gene_sym] for gene_sym in atb_gene.columnmeta['symbol']],
        dtype='object')

    # save the data
    # the matrix is written twice (.pickle and .txt.gz), both file names
    # carrying rgep_name
    print('saving filtered data...', flush=True)
    datasetIO.save_datamatrix(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'
        .format(rgep_name), atb_gene)
    datasetIO.save_datamatrix(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'
        .format(rgep_name), atb_gene)
    savefolder = '../../input_data/hugolo_transposed_filtered_by_{0}_rgep'.format(
        rgep_name)
    # create the per-rgep input folder only if it is not there yet
    if not os.path.exists(savefolder):
        os.makedirs(savefolder)
    datasetIO.save_splitdata(savefolder, atb_gene)
    # copy both saved files into savefolder under fixed datamatrix.* names
    shutil.copyfile(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'
        .format(rgep_name), '{0}/datamatrix.pickle'.format(savefolder))
    shutil.copyfile(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'
        .format(rgep_name), '{0}/datamatrix.txt.gz'.format(savefolder))

print('done.', flush=True)
# put the entries along axis 1 of snp_haplome into a random order
shuffled_order = np.random.permutation(snp_haplome.shape[1])
snp_haplome.reorder(shuffled_order, 1)
print(snp_haplome, flush=True)

# write the prepared matrix in both formats, then mirror everything into
# the input-data folder
print('saving prepared data...', flush=True)
snp_haplome.matrixname += '_prepared'
pickle_path = '../../original_data/1000genomes/snp_haplome_1000genomes-phased-MHC_prepared.pickle'
textgz_path = '../../original_data/1000genomes/snp_haplome_1000genomes-phased-MHC_prepared.txt.gz'
for prepared_path in (pickle_path, textgz_path):
    datasetIO.save_datamatrix(prepared_path, snp_haplome)
savefolder = '../../input_data/1000genomes_haplomes'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, snp_haplome)
shutil.copyfile(pickle_path, '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(textgz_path, '{0}/datamatrix.txt.gz'.format(savefolder))

print('done.', flush=True)

# visualization
# project the matrix onto its first two principal components; the model is
# fitted and applied on the same data
pca_model = PCA(n_components=2).fit(snp_haplome.matrix)
pca_matrix = pca_model.transform(snp_haplome.matrix)
fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
# each value is divided by the matching figsize dimension: offsets of 0.15
# and extents of 4.0 in figsize units, given as fractions of the figure
# (assumes plt is matplotlib.pyplot, i.e. [left, bottom, width, height])
ax.set_position([0.15 / 6.5, 0.15 / 4.3, 4.0 / 6.5, 4.0 / 4.3])
ax.plot(pca_matrix[:, 0],