def test_intersect(self):
    """intersect() returns every passed sample of the series.

    NOTE(review): presumably the fake isamp lists both GSM10 and GSM20
    under GSE0 -- confirm against gen_fake_isamp.
    """
    fake_isamp = self.gen_fake_isamp()
    gse = Series('GSE0')
    expected = [Sample(gsm_name, gse) for gsm_name in ('GSM10', 'GSM20')]
    for gsm in expected:
        gse.add_passed_sample(gsm)

    self.assertEqual(ppr.intersect(gse, fake_isamp), expected)
def test_gen_orig_params_per_with_a_single_sra(self):
    """gen_orig_params_per() yields one parameter set for a single SRA."""
    # stand-in series holding exactly one passed sample
    gse = Series('GSE123456', 'GSE123456_family.soft.subset')
    gsm = Sample('GSM1', gse)
    gsm.outdir = 'some_outdir/GSE123456/some_species/GSM1'
    gse.add_passed_sample(gsm)

    # each entry reads: input, outputs, other params
    expected = [
        [
            None,
            [
                'some_outdir/GSE123456/some_species/GSM1/SRX685892/SRR1557065/SRR1557065.sra',
                'some_outdir/GSE123456/some_species/GSM1/SRR1557065.sra.download.COMPLETE',
            ],
            gsm,
        ],
    ]

    # serve the canned YAML instead of reading a file from disk
    fake_open = mock.mock_open(read_data=SRA_INFO_YAML_SINGLE_SRA)
    with mock.patch('rsempipeline.utils.download.open', fake_open):
        actual = download.gen_orig_params_per(gsm)
        self.assertEqual(actual, expected)
class SampleTestCase(unittest.TestCase):
    """Tests for Sample: completeness check, outdir generation, str/repr."""

    # Shared fixture values, kept in one place so the tests cannot drift.
    ORGANISM = "Mus musculus"
    URL = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX000/SRX000000"
    EXPECTED_OUTDIR = "some_outdir/GSE123456/mus_musculus/GSM1"
    EXPECTED_TEXT = (
        "<GSM1 (0/0/0) of GSE123456 at "
        "some_outdir/GSE123456/mus_musculus/GSM1>")

    def setUp(self):
        """Create a fresh series and one sample attached to it."""
        self.series = Series("GSE123456", "GSE123456_family.soft.subset")
        self.sample = Sample("GSM1", self.series)

    def _fill_in_sample_info(self):
        """Set the organism and url that the tests rely on."""
        self.sample.organism = self.ORGANISM
        self.sample.url = self.URL

    def test_is_info_complete(self):
        """Info is complete only once both organism and url are set."""
        self.assertFalse(self.sample.is_info_complete())
        self.sample.organism = self.ORGANISM
        self.assertFalse(self.sample.is_info_complete())
        self.sample.url = self.URL
        self.assertTrue(self.sample.is_info_complete())

    @mock.patch("rsempipeline.utils.objs.Sample.is_info_complete")
    def test_gen_outdir(self, mock_is_info_complete):
        """gen_outdir() returns and stores the path, or raises ValueError."""
        # Be explicit rather than relying on the truthiness of the mock's
        # default return value.
        mock_is_info_complete.return_value = True
        self._fill_in_sample_info()
        self.assertEqual(self.sample.gen_outdir("some_outdir"),
                         self.EXPECTED_OUTDIR)
        self.assertEqual(self.sample.outdir, self.EXPECTED_OUTDIR)

        mock_is_info_complete.return_value = False
        # NOTE(review): assertRaisesRegexp is the Python 2 spelling; switch
        # to assertRaisesRegex if this suite is moved to Python 3.
        self.assertRaisesRegexp(ValueError, "not information complete",
                                self.sample.gen_outdir, "some_outdir")

    def test___str__(self):
        """str() shows the name, counters, series and outdir."""
        self._fill_in_sample_info()
        self.sample.gen_outdir("some_outdir")
        self.assertEqual(str(self.sample), self.EXPECTED_TEXT)

    def test___repr__(self):
        """repr() is checked through repr(), not str().

        The previous version called str() here, so __repr__ was never
        actually exercised.
        NOTE(review): the expected text assumes Sample.__repr__ mirrors
        __str__ -- confirm against the Sample implementation.
        """
        self._fill_in_sample_info()
        self.sample.gen_outdir("some_outdir")
        self.assertEqual(repr(self.sample), self.EXPECTED_TEXT)
def test_gen_all_samples_from_soft_and_isamp(
        self, mock_get_isamp, mock_analyze_one, mock_sanity_check):
    """gen_all_samples_from_soft_and_isamp() returns the analyzed samples.

    All three collaborators are mocked, so only the wiring is exercised.
    """
    mock_sanity_check.return_value = True
    mock_get_isamp.return_value = self.gen_fake_isamp()

    gse = Series('GSE0')
    expected = [Sample(gsm_name, gse) for gsm_name in ('GSM10', 'GSM20')]
    for gsm in expected:
        gsm.organism = 'H**o Sapiens'
        gse.add_passed_sample(gsm)
    mock_analyze_one.return_value = expected

    config = {'INTERESTED_ORGANISMS': ['H**o Sapiens']}
    actual = ppr.gen_all_samples_from_soft_and_isamp(
        ['soft1'], 'isamp_file_or_str', config)
    self.assertEqual(actual, expected)
def test_add(self, mock_is_info_complete):
    """add() returns index + 1 for a complete sample, else the same index."""
    gse = Series('GSE123456', 'GSE123456_family.soft.subset')
    gsm = Sample('GSM1', gse)

    # (mocked completeness, index expected back when starting from 1)
    cases = ((True, 2), (False, 1))
    for info_complete, expected_index in cases:
        mock_is_info_complete.return_value = info_complete
        self.assertEqual(soft_parser.add(gsm, gse, 1), expected_index)
def test_gen_orig_params_per_with_a_single_sra(self):
    """A sample with a single SRA produces exactly one parameter list."""
    # fake series with one passed sample and a preset output directory
    fake_series = Series('GSE123456', 'GSE123456_family.soft.subset')
    fake_sample = Sample('GSM1', fake_series)
    fake_sample.outdir = 'some_outdir/GSE123456/some_species/GSM1'
    fake_series.add_passed_sample(fake_sample)

    sra_path = 'some_outdir/GSE123456/some_species/GSM1/SRX685892/SRR1557065/SRR1557065.sra'
    flag_path = 'some_outdir/GSE123456/some_species/GSM1/SRR1557065.sra.download.COMPLETE'
    # layout per entry: input, outputs, other params
    expected = [[None, [sra_path, flag_path], fake_sample]]

    patched_open = mock.patch(
        'rsempipeline.utils.download.open',
        mock.mock_open(read_data=SRA_INFO_YAML_SINGLE_SRA))
    with patched_open:
        result = download.gen_orig_params_per(fake_sample)
        self.assertEqual(result, expected)
def test_gen_orig_params_per_with_multiple_sras(self):
    """gen_orig_params_per() yields one parameter set per SRA."""
    # stand-in series holding exactly one passed sample
    gse = Series('GSE123456', 'GSE123456_family.soft.subset')
    gsm = Sample('GSM1', gse)
    gsm.outdir = 'some_outdir/GSE123456/some_species/GSM1'
    gse.add_passed_sample(gsm)

    # each entry reads: input, outputs, other params
    expected = [
        [
            None,
            [
                'some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453140/SRR453140.sra',
                'some_outdir/GSE123456/some_species/GSM1/SRR453140.sra.download.COMPLETE',
            ],
            gsm,
        ],
        [
            None,
            [
                'some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453141/SRR453141.sra',
                'some_outdir/GSE123456/some_species/GSM1/SRR453141.sra.download.COMPLETE',
            ],
            gsm,
        ],
        [
            None,
            [
                'some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453142/SRR453142.sra',
                'some_outdir/GSE123456/some_species/GSM1/SRR453142.sra.download.COMPLETE',
            ],
            gsm,
        ],
        [
            None,
            [
                'some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453143/SRR453143.sra',
                'some_outdir/GSE123456/some_species/GSM1/SRR453143.sra.download.COMPLETE',
            ],
            gsm,
        ],
    ]

    # serve the canned YAML instead of reading a file from disk
    fake_open = mock.mock_open(read_data=SRA_INFO_YAML_MULTIPLE_SRAS)
    with mock.patch('rsempipeline.utils.download.open', fake_open):
        actual = download.gen_orig_params_per(gsm)
        self.assertEqual(actual, expected)
def test_intersect_with_discrenpacy(self, L):
    """intersect() logs an error when soft and isamp GSM counts differ.

    Only GSM10 is passed in the series, and the expected log message
    reports 2 GSMs on the isamp side.
    """
    fake_isamp = self.gen_fake_isamp()
    gse = Series('GSE0')
    expected = [Sample('GSM10', gse)]
    for gsm in expected:
        gse.add_passed_sample(gsm)

    self.assertEqual(ppr.intersect(gse, fake_isamp), expected)

    # L is the log-capture object handed in by the test's decorator
    expected_record = (
        'rsempipeline.utils.pre_pipeline_run',
        'ERROR',
        'Discrepancy for GSE0: 1 GSMs in soft, 2 GSMs in isamp, and only 1 left after intersection.',
    )
    L.check(expected_record,)
def test_gen_orig_params_per_with_multiple_sras(self):
    """A sample with four SRAs produces four parameter lists, in order."""
    # fake series with one passed sample and a preset output directory
    fake_series = Series('GSE123456', 'GSE123456_family.soft.subset')
    fake_sample = Sample('GSM1', fake_series)
    fake_sample.outdir = 'some_outdir/GSE123456/some_species/GSM1'
    fake_series.add_passed_sample(fake_sample)

    # (sra file, download-complete flag file) for every expected run
    output_pairs = [
        ['some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453140/SRR453140.sra',
         'some_outdir/GSE123456/some_species/GSM1/SRR453140.sra.download.COMPLETE'],
        ['some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453141/SRR453141.sra',
         'some_outdir/GSE123456/some_species/GSM1/SRR453141.sra.download.COMPLETE'],
        ['some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453142/SRR453142.sra',
         'some_outdir/GSE123456/some_species/GSM1/SRR453142.sra.download.COMPLETE'],
        ['some_outdir/GSE123456/some_species/GSM1/SRX135160/SRR453143/SRR453143.sra',
         'some_outdir/GSE123456/some_species/GSM1/SRR453143.sra.download.COMPLETE'],
    ]
    # in the format of input, outputs, other params
    expected = [[None, outputs, fake_sample] for outputs in output_pairs]

    patched_open = mock.patch(
        'rsempipeline.utils.download.open',
        mock.mock_open(read_data=SRA_INFO_YAML_MULTIPLE_SRAS))
    with patched_open:
        result = download.gen_orig_params_per(fake_sample)
        self.assertEqual(result, expected)
def test_analyze_one(self, mock_parse, L):
    """analyze_one() returns the intersected samples and logs the mismatch.

    The parser is mocked to return a series with a single passed sample.
    """
    isamp = self.gen_fake_isamp()
    gse = Series('GSE0')
    expected = [Sample('GSM10', gse)]
    for gsm in expected:
        gsm.organism = 'H**o Sapiens'
        gse.add_passed_sample(gsm)
    mock_parse.return_value = gse

    actual = ppr.analyze_one(
        'GSE0_family.soft.subset', isamp, ['H**o sapiens'])
    self.assertEqual(actual, expected)

    # L is the log-capture object handed in by the test's decorator
    expected_record = (
        'rsempipeline.utils.pre_pipeline_run',
        'ERROR',
        'Discrepancy for GSE0: 1 GSMs in soft, 2 GSMs in isamp, and only 1 left after intersection.',
    )
    L.check(expected_record,)
def parse(soft_file, interested_organisms):
    """Parse the soft file into a Series holding its samples.

    One GSE per soft file is assumed.

    :param soft_file: path of the soft file to parse
    :param interested_organisms: a list of interested organisms:
        ['Homo sapiens', 'Mus musculus']
    :returns: the parsed Series, or None when the file contains no
        ^SERIES line
    :raises ValueError: when the series name found in the file differs
        from the one derived from the filename
    """
    logger.info("Parsing file: {0} ...".format(soft_file))
    series_name_from_file = get_series_name_from(soft_file)

    # Assume one GSE per soft file

    # index: the index of all passed samples, unpassed samples are not indexed
    index, series, current_sample = 1, None, None
    with open(soft_file, 'rb') as inf:
        for line in inf:
            # Split on the FIRST '=' only, so a value that itself contains
            # '=' stays intact instead of breaking the two-name unpacking.
            label, separator, value = line.partition('=')
            if not separator:
                # not a "label = value" line (e.g. blank); nothing to parse
                continue
            label, value = label.strip(), value.strip()

            if label == '^SERIES':
                series = Series(value, os.path.abspath(soft_file))
                if series.name != series_name_from_file:
                    msg = ('series contained in the soft file doesn\'t match '
                           'that in the filename: {0} != {1}'.format(
                               series, series_name_from_file))
                    raise ValueError(msg)
            elif label == '^SAMPLE':
                # a new sample starts: file the previous one first
                index = add(current_sample, series, index)
                current_sample = Sample(name=value, series=series)

            if current_sample:
                current_sample = update(current_sample, label, value,
                                        interested_organisms)

    if series is not None:
        # add the last sample
        add(current_sample, series, index)
        logger.info("{0}: {1}/{2} samples passed".format(
            series.name, series.num_passed_samples(), series.num_samples()))
        logger.info('=' * 30)
    return series
def setUp(self):
    """Create a fresh series and one sample attached to it for each test."""
    series = Series("GSE123456", "GSE123456_family.soft.subset")
    self.series = series
    self.sample = Sample("GSM1", series)