def test_cpm():
    """Check that cpm() turns the first count into count * 1e6 / column sum."""
    original = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))

    raw_value = original.counts[0][0]
    column_totals = np.sum(original.counts, axis=0)
    # The loaded counts are expected to carry an integer dtype.
    assert isinstance(raw_value, np.integer)

    normalised = original.cpm()
    expected_value = raw_value * 1e6 / column_totals[0]
    assert normalised.counts[0][0] == expected_value
def test_tpm():
    """Compare tpm() against a hand-calculated example.

    The example follows
    https://www.youtube.com/watch?time_continue=611&v=TTUrtCY2k-w
    """
    raw_counts = np.array([[10, 12, 30], [20, 25, 60], [5, 8, 15], [0, 0, 1]])
    lengths = np.array([2000, 4000, 1000, 10000])
    expected_tpm = np.array(
        [
            [333_333.333_333_33, 296_296.296_296_3, 332_594.235_033_26],
            [333_333.333_333_33, 308_641.975_308_64, 332_594.235_033_26],
            [333_333.333_333_33, 395_061.728_395_06, 332_594.235_033_26],
            [0.0, 0.0, 2217.294_900_22],
        ]
    )

    source = DGEList(
        counts=raw_counts,
        samples=np.array(['a', 'b', 'c']),
        genes=np.array(['a', 'b', 'c', 'd']),
        groups_in_dict={'group1': ['a', 'c'], 'group2': ['b', 'd']},
    )
    assert isinstance(source.counts[0][0], np.integer)

    converted = source.tpm(lengths)
    assert np.allclose(converted.counts, expected_tpm, atol=1e-1)

    # An important property of TPM: summing over all genes gives the same
    # total for every sample, so each column total must match the first one.
    per_sample_totals = converted.counts.sum(axis=0)
    assert np.allclose(per_sample_totals, per_sample_totals[0])
def test_minimal_init():
    """Construct a DGEList directly from in-memory data and check its repr."""
    sample_names = ["A", "B", "C", "D", "E"]
    gene_names = ["ENSG001", "ENSG002", "ENSG003", "ENSG004", "ENSG005"]
    grouping = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]}

    minimal = DGEList(
        to_remove_zeroes=False,
        counts=np.ones(shape=(5, 5)),
        samples=sample_names,
        genes=gene_names,
        groups_in_dict=grouping,
    )

    assert minimal.__repr__() == "DGEList(num_samples=5, num_genes=5)"
def test_init():
    """Build a DGEList from file paths and from open handles; both must give the same repr."""
    expected_repr = "DGEList(num_samples=10, num_genes=21,711)"

    dge_list = DGEList.create_DGEList_data_file(
        data_file=get_dataset_path(TEST_DATASET), group_file=get_dataset_path(TEST_GROUPS)
    )
    assert dge_list.__repr__() == expected_repr

    # Open the handles in a ``with`` block so they are closed even when
    # construction or the assertion fails (previously they were never closed).
    with smart_open(get_dataset_path(TEST_DATASET)) as data_handle, smart_open(
        get_dataset_path(TEST_GROUPS)
    ) as group_handle:
        dge_list = DGEList.create_DGEList_handle(
            data_handle=data_handle, group_handle=group_handle
        )
    assert dge_list.__repr__() == expected_repr
def test_rpkm():
    """Check the first rpkm() value against the formula computed by hand."""
    source = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))
    store = CanonicalDataStore(
        get_dataset_path(TEST_GENE_SET_DATA), get_dataset_path(TEST_GENE_SYMBOLS)
    )

    raw_value = source.counts[0][0]
    gene_symbol = source.genes[0]
    column_totals = np.sum(source.counts, axis=0)
    assert isinstance(raw_value, np.integer)

    normalised = source.rpkm(store)

    # Resolve the first gene to an id, then look up its canonical transcript length.
    gene_id = store.pick_gene_id(store.get_genes_from_symbol(gene_symbol))
    transcript_length = store.get_length_of_canonical_transcript(gene_id)

    # RPKM = numReads / (geneLength / 1,000 * totalNumReads / 1,000,000)
    length_in_kb = transcript_length / 1e3
    reads_in_millions = column_totals[0] / 1e6
    expected_value = raw_value / (length_in_kb * reads_in_millions)
    assert normalised.counts[0][0] == expected_value
def test_create_DGEList_handle_init():
    """Check the group bookkeeping of a DGEList built from open file handles."""
    # Open the handles in a ``with`` block so they are closed once the object
    # has been built (previously they were left open for the rest of the run).
    with smart_open(get_dataset_path(TEST_DATASET)) as data_handle, smart_open(
        get_dataset_path(TEST_GROUPS)
    ) as group_handle:
        dge_list = DGEList.create_DGEList_handle(
            data_handle=data_handle, group_handle=group_handle
        )

    assert 2 == len(dge_list.groups_dict)
    assert 5 == len(dge_list.groups_dict["Group 1"])
    assert 5 == len(dge_list.groups_dict["Group 2"])
    # One group label per sample.
    assert dge_list.samples.shape == dge_list.groups_list.shape
def testing_setting_samples_and_counts():
    """Exercise DGEList construction with valid and invalid samples/counts combinations."""
    # Neither samples nor counts: should fail.
    with pytest.raises(Exception):
        DGEList(
            to_remove_zeroes=False,
            groups_in_list=['A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'B', 'B'],
        )

    # Counts without samples should fail.
    with pytest.raises(ValueError):
        DGEList(counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]), groups_in_list=['A', 'A', 'B'])

    # Samples, counts and groups together should pass.
    DGEList(
        counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]),
        groups_in_list=['A', 'A', 'B'],
        samples=["S1", 'S2', 'S3'],
    )

    # Samples without counts should fail.
    with pytest.raises(Exception):
        DGEList(
            samples=np.array(["1", "2", "3"]),
            to_remove_zeroes=False,
            groups_in_list=['A', 'A', 'B'],
        )

    # Properly formed samples and counts should pass.
    DGEList(
        samples=np.array(["1", "2", "3"]),
        counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]),
        groups_in_list=['A', 'A', 'B'],
    )

    # Ill-matched samples and counts should fail.
    # The string form ``pytest.raises(ValueError, "code")`` was removed in
    # pytest 5.0, so the call is made inside the context-manager form instead.
    # NOTE(review): no groups are passed here (as in the original), so the
    # ValueError may come from the missing groups rather than the size
    # mismatch - confirm which check this is meant to cover.
    with pytest.raises(ValueError):
        DGEList(
            samples=np.array(['2', '3']),
            counts=np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]]),
        )
def test_cycle_dge_npz():
    """Write a DGEList to an .npz file, read it back, and compare every array."""
    import os
    import tempfile

    # TemporaryDirectory removes the directory and its contents on exit, even
    # when an assertion fails, and avoids the private
    # ``tempfile._get_candidate_names`` helper used previously.
    with tempfile.TemporaryDirectory(prefix="edgePy_tmp") as tempdir:
        file_name = os.path.join(tempdir, "dge_list")

        dge_list_first = dge_list()
        dge_list_first.write_npz_file(filename=file_name)

        # The file is read back with an explicit ".npz" suffix, as before.
        dge_list_second = DGEList(filename=file_name + ".npz")

        assert np.array_equal(dge_list_first.counts, dge_list_second.counts)
        assert np.array_equal(dge_list_first.genes, dge_list_second.genes)
        assert np.array_equal(dge_list_first.samples, dge_list_second.samples)
        assert np.array_equal(dge_list_first.norm_factors, dge_list_second.norm_factors)
        assert np.array_equal(dge_list_first.groups_list, dge_list_second.groups_list)
def test_library_size():
    """Check library_size of the bundled test dataset against known totals."""
    expected_sizes = np.array(
        [
            63_579_607,
            58_531_933,
            39_138_521,
            78_565_885,
            48_667_119,
            62_799_917,
            66_032_107,
            66_194_776,
            55_085_875,
            37_760_315,
        ]
    )

    loaded = DGEList(filename=str(get_dataset_path(TEST_DATASET_NPZ)))

    assert np.array_equal(loaded.library_size, expected_sizes)
def test_create_DGEList():
    """Tests the function that converts data into a DGE_List object"""
    samples = ["AAA", "BBB", "CCC"]
    genes = ["ENSG001", "ENSG002"]
    data_set = {
        "AAA": {"ENSG001": 10, "ENSG002": 20},
        "BBB": {"ENSG001": 15, "ENSG002": 40},
        "CCC": {"ENSG001": 20, "ENSG002": 80},
    }
    categories = {"One": ["AAA", "BBB"], "Two": ["CCC"]}

    dge_list = DGEList.create_DGEList(
        sample_list=samples, data_set=data_set, gene_list=genes, category_to_samples=categories
    )

    assert np.array_equal(dge_list.samples, np.array(samples))
    # 2 rows (genes), 3 columns (samples)
    assert np.array_equal(dge_list.counts, np.array([[10, 15, 20], [20, 40, 80]]))
    assert np.array_equal(dge_list.groups_list, np.array(["One", "One", "Two"]))

    # The previous ``assert groups_dict, {...}`` only checked truthiness (the
    # second operand is the assert message). Compare the content instead.
    # Members are converted with ``list`` so the check does not depend on
    # whether they are stored as lists or as arrays.
    assert set(dge_list.groups_dict) == set(categories)
    for group_name, members in categories.items():
        assert list(dge_list.groups_dict[group_name]) == members

    assert np.array_equal(dge_list.genes, np.array(genes))
def test_setting_DGElist_counts():
    """Check the counts shape after construction and that invalid counts are rejected."""
    valid = DGEList(
        counts=np.zeros(shape=(5, 10)),
        groups_in_list=['A', 'A', 'B', 'B', 'B'],
        samples=['S0', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9'],
    )
    assert valid.counts.shape[:2] == (5, 10)

    # Each of these arrays must be refused by the constructor with a ValueError.
    rejected_arrays = (
        np.array([[1, 1, 1], [-1, 1, 1]]),  # contains a negative value
        np.array([[1, 1, 1], [np.nan, 1, 1]]),  # contains NaN
        np.array([1, 1, 1]),  # one-dimensional
    )
    for bad_counts in rejected_arrays:
        with pytest.raises(ValueError):
            DGEList(counts=bad_counts, groups_in_list=["a", "b"])

    # Assigning a plain list to ``counts`` must raise a TypeError.
    plain_list = [1, 1, 1]
    with pytest.raises(TypeError):
        valid.counts = plain_list
def test_too_many_options2():
    """Passing both ``counts`` and ``filename`` to DGEList must raise."""
    in_memory_counts = np.ones(shape=(5, 10))
    npz_path = str(get_dataset_path(TEST_DATASET_NPZ))

    with pytest.raises(Exception):
        DGEList(counts=in_memory_counts, filename=npz_path)
def test_sample_group_list():
    """_sample_group_list should return one group label per sample, in sample order."""
    grouping = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]}
    sample_names = ["A", "B", "C", "D", "E"]

    labels = DGEList._sample_group_list(grouping, sample_names)

    wanted = np.array(['group1', 'group1', 'group2', 'group2', 'group2'])
    assert np.array_equal(labels, wanted)
def test_broken_dge_caGENE_SYMBOL_SQLll():
    """Malformed DGEList constructor calls must raise."""
    # A filename combined with in-memory counts.
    two_by_three = np.array([[1, 1, 1], [1, 1, 1]])
    with pytest.raises(Exception):
        DGEList(filename="fake_filename", counts=two_by_three)

    # Counts explicitly set to None.
    with pytest.raises(Exception):
        DGEList(counts=None)
def dge_list():
    """Return a DGEList built from the test dataset and groups files via open handles."""
    data_path = get_dataset_path(TEST_DATASET)
    groups_path = get_dataset_path(TEST_GROUPS)

    # Both handles are closed again as soon as the object has been created.
    with smart_open(data_path, 'r') as data_handle:
        with smart_open(groups_path, 'r') as group_handle:
            return DGEList.create_DGEList_handle(data_handle, group_handle)
def __init__(self, args):
    """Build ``self.dge_list`` from whichever data source ``args`` selects.

    The source is chosen in this order:

    1. ``args.dge_file`` - load a saved DGEList from that file.
    2. ``args.mongo_config`` - import the data from MongoDB, selecting
       samples either by the two sample-name groups or by a key/value pair.
    3. otherwise - read ``args.counts_file`` and ``args.groups_file``.

    Args:
        args: parsed command-line options; the attributes read are the ones
            referenced in the body below.

    Raises:
        ValueError: if MongoDB is selected but neither both sample-name
            groups nor a key name/value pair was supplied.
    """
    self.dge_list = None

    if args.dge_file:
        self.dge_list = DGEList(filename=args.dge_file)
        log.info(f"The DGE list is {self.dge_list}")

    elif args.mongo_config:
        # This section is only useful for MongoDB based analyses. Talk to @apfejes about this section if you have
        # any questions.
        config = configparser.ConfigParser()
        config.read(args.mongo_config)

        if args.group1_sample_names and args.group2_sample_names:
            key = 'sample_name'
            value = args.group1_sample_names + args.group2_sample_names
        # Test the same attribute that is read below; the condition used to
        # check ``args.key_name`` while the assignment used
        # ``args.mongo_key_name``.
        elif args.mongo_key_name and args.mongo_key_value:
            key = args.mongo_key_name
            value = args.mongo_key_value
        else:
            raise ValueError("Insufficient parameters for use of Mongodb")

        mongo_importer = ImportFromMongodb(
            host=config.get("Mongo", "host"),
            port=config.get("Mongo", "port"),
            mongo_key=key,
            mongo_value=value,
            gene_list_file=args.gene_list,
        )

        sample_list, data_set, gene_list, sample_category = mongo_importer.get_data_from_mongo(
            database=args.database_name
        )

        if key == 'sample_name':
            # Override sample categories if sample name is the source of the categories.
            sample_category_list = [
                "group1" if sample_name in args.group1_sample_names else "group2"
                for sample_name in sample_list
            ]
            sample_category_dict = None
        else:
            # TODO: read from file
            sample_category_dict = args.groups_json
            sample_category_list = None

        self.dge_list = DGEList.create_DGEList(
            sample_list,
            data_set,
            gene_list,
            sample_to_category=sample_category_list,
            category_to_samples=sample_category_dict,
        )

        # Note: this attribute is only set on the MongoDB path.
        self.ensg_to_symbol = mongo_importer.mongo_reader.find_as_dict(
            'ensembl_90_37', "symbol_by_ensg", query={}
        )

    else:
        self.dge_list = DGEList.create_DGEList_data_file(
            data_file=args.counts_file, group_file=args.groups_file
        )

    # Any falsy output value is stored as None.
    self.output = args.output if args.output else None
    self.p_value_cutoff = args.cutoff
    self.minimum_cpm = args.minimum_cpm
def test_sample_by_group():
    """_sample_group_dict should map each group name to its samples."""
    sample_names = ["A", "B", "C", "D", "E"]
    labels = ['group1', 'group1', 'group2', 'group2', 'group2']

    grouped = DGEList._sample_group_dict(labels, sample_names)

    wanted = {'group1': ["A", "B"], 'group2': ["C", "D", "E"]}
    assert grouped == wanted