Exemple #1
0
    def test_run_pipeline_epic_plus_export_data():
        """ check that we get back useful data with --export option

        Runs the pipeline on the EPIC+ example data with export=True, then:
        - verifies the processed CSV was written,
        - checks the exported beta_value column for missing values,
        - spot-checks one beta value and one container 'noob' value
          against known-good references.

        Raises AssertionError on any mismatch.
        """
        test_data_dir = 'docs/example_data/epic_plus'
        testfile_1 = Path(test_data_dir, '202651080072',
                          '202651080072_R01C01_processed.csv')
        # remove any stale export so a leftover file can't mask a failure
        if testfile_1.exists():
            testfile_1.unlink()
        test_data_containers = pipeline.run_pipeline(test_data_dir,
                                                     export=True)
        if not testfile_1.exists():
            raise AssertionError("no exported processed csv found")

        # spot checking the output.
        test1 = pd.read_csv(testfile_1)
        num_missing = test1['beta_value'].isna().sum()
        if num_missing == 1:
            # exactly one missing probe is tolerated when it is the known
            # problem probe; any other pattern of missing data is fatal
            if test1[test1.beta_value.isna(
            )]['IlmnID'].iloc[0] == 'cg00968771_I_F_C_rep1_GWG1':
                print(
                    "WARNING: cg00968771_I_F_C_rep1_GWG1 probe data is STILL missing from output"
                )
                #NOT A FATAL ERROR. but not fixing today.
        elif num_missing > 0:
            print(test1.head())
            # BUG FIX: message previously lacked the f-prefix, so the raised
            # error contained the literal text '{num_missing}' instead of
            # the actual count of missing values.
            raise AssertionError(
                f'{num_missing} missing values in processed csv')
        if not np.isclose(test1['beta_value'].iloc[5], 0.145):
            print(test1.iloc[5])
            raise AssertionError('beta_value doesnt match expected value')
        if not np.isclose(
                round(
                    test_data_containers[0].unmethylated.data_frame.iloc[0]
                    ['noob'], 1), 274.7):
            raise AssertionError(
                "data_container output differs from expected value")
Exemple #2
0
 def test_batch_size_betas():
     """Run the pipeline with betas=True and batch_size=1; verify one known
     beta value and that both per-batch pickle files were written."""
     data_dir = 'docs/example_data/GSE69852'
     betas = pipeline.run_pipeline(data_dir, betas=True, batch_size=1)
     observed = betas.iloc[0]['9247377093_R02C01']
     if not np.isclose(observed, 0.23623395577166542):
         raise AssertionError()
     batch_pickles = [Path(data_dir, 'beta_values_1.pkl'),
                      Path(data_dir, 'beta_values_2.pkl')]
     if not all(pkl.is_file() for pkl in batch_pickles):
         raise AssertionError()
Exemple #3
0
 def test_with_batch_size():
     """Export one named sample with batch_size=1; spot-check the first noob
     value and confirm the processed CSV exists."""
     data_dir = 'docs/example_data/GSE69852'
     containers = pipeline.run_pipeline(data_dir, export=True, batch_size=1,
                                        sample_name='AdultLiver1')
     noob_value = containers[0].unmethylated.data_frame.iloc[0]['noob']
     if not np.isclose(noob_value, 4119.633578946326):
         raise AssertionError()
     csv_path = Path('docs/example_data/GSE69852/9247377093/9247377093_R02C01_processed.csv')
     if not csv_path.is_file():
         raise AssertionError()
Exemple #4
0
 def test_batch_size_betas():
     """Batch-size run: check one beta value and the combined pickle, print
     the produced files, then delete every pickle output."""
     data_dir = 'docs/example_data/GSE69852'
     betas = pipeline.run_pipeline(data_dir, betas=True, batch_size=1)
     observed = betas.iloc[0]['9247377093_R02C01']
     # reference shifted slightly from the older 0.23623395577166542 value
     if not np.isclose(observed, 0.23624517):
         raise AssertionError(
             f"{betas.iloc[0]['9247377093_R02C01']} != 0.23623395577166542")
     if not Path(data_dir, 'beta_values.pkl').is_file():
         raise AssertionError()
     print(f"TEST OUTPUT FILES: {list(Path(data_dir).rglob('*'))}")
     for pkl in Path(data_dir).rglob('*.pkl'):
         pkl.unlink()
Exemple #5
0
    def test_run_pipeline_export_data():
        """ check that we get back useful data with --export option """
        data_dir = 'docs/example_data/GSE69852'
        expected_csvs = (
            Path(data_dir, '9247377093', '9247377093_R02C01_processed.csv'),
            Path(data_dir, '9247377085', '9247377085_R04C02_processed.csv'),
        )
        # delete stale exports so the export step is actually exercised
        for csv_path in expected_csvs:
            if csv_path.exists():
                csv_path.unlink()
        containers = pipeline.run_pipeline(data_dir,
                                           export=True,
                                           sesame=False)
        if not expected_csvs[0].exists():
            raise AssertionError("no exported processed csv found")

        # each exported CSV must have a fully-populated beta_value column
        for csv_path in expected_csvs:
            frame = pd.read_csv(csv_path)
            if frame['beta_value'].isna().sum() > 0:
                print(frame.head())
                raise AssertionError('missing values in processed csv')

        # spot checking the output.
        sample_df = containers[1]._SampleDataContainer__data_frame
        if not np.isclose(sample_df.iloc[0]['beta_value'],
                          0.30799999,
                          atol=0.01):
            print(containers[1]._SampleDataContainer__data_frame)
            raise AssertionError(
                f"{containers[1]._SampleDataContainer__data_frame.iloc[0]['beta_value']} vs {0.30799999}"
            )
        # spot checking the output.
        total_nas = containers[0]._SampleDataContainer__data_frame[
            'beta_value'].isna().sum()
        if total_nas > 0:
            print(
                f'found {total_nas} missing beta_values (N/A or inf) in sample'
            )
            raise AssertionError()
        if not np.isclose(sample_df.iloc[3]['noob_meth'], 3811.0, atol=1.0):
            raise AssertionError(
                f"{containers[1]._SampleDataContainer__data_frame.iloc[3]['noob_meth']} vs {3811.162109}"
            )
Exemple #6
0
    def test_run_pipeline_demo_containers():
        """ check that we get back useful data.
        check that output files exist, then remove them."""
        containers = pipeline.run_pipeline('docs/example_data/GSE69852')
        print('containers:', containers)

        # spot checking the output.
        unmeth_row = containers[1].unmethylated.data_frame.iloc[0]
        if unmeth_row['mean_value'] != 2712:
            raise AssertionError()
        if not np.isclose(unmeth_row['noob'], 4479.96501260212):
            raise AssertionError()
Exemple #7
0
 def test_run_pipeline_with_create_sample_sheet():
     """Generate the sample sheet on the fly (make_sample_sheet=True) and
     spot-check noob_meth and beta_value of the first probe."""
     containers = pipeline.run_pipeline('docs/example_data/epic_plus',
                                        export=False,
                                        sample_name=['Sample_1'],
                                        meta_data_frame=False,
                                        make_sample_sheet=True)
     # spot checking the output.
     first_row = containers[0]._SampleDataContainer__data_frame.iloc[0]
     if not np.isclose(first_row['noob_meth'], 1180.23):
         raise AssertionError()
     if not np.isclose(first_row['beta_value'], 0.75902253):
         raise AssertionError()
Exemple #8
0
 def test_pipeline_two_samples():
     """ pass in --sample_name with 2 samples -- from fake command line args """
     test_data_dir = 'docs/example_data/GSE69852'
     testargs = [
         "__program__", '-d', test_data_dir, '--no_export', '--sample_name',
         'AdultLiver1', 'FetalLiver1'
     ]
     with patch.object(sys, 'argv', testargs):
         containers = pipeline.run_pipeline(test_data_dir)
         # spot checking the output.
         unmeth_row = containers[1].unmethylated.data_frame.iloc[0]
         if unmeth_row['mean_value'] != 2712:
             raise AssertionError()
         if not np.isclose(unmeth_row['noob'], 4479.96501260212):
             raise AssertionError()
Exemple #9
0
    def test_run_pipeline_export_data():
        """ check that we get back useful data with --export option """
        data_dir = 'docs/example_data/GSE69852'
        expected_csvs = (
            Path(data_dir, '9247377093', '9247377093_R02C01_processed.csv'),
            Path(data_dir, '9247377085', '9247377085_R04C02_processed.csv'),
        )
        # delete stale exports so the export step is actually exercised
        for csv_path in expected_csvs:
            if csv_path.exists():
                csv_path.unlink()
        containers = pipeline.run_pipeline(data_dir, export=True)
        if not expected_csvs[0].exists():
            raise AssertionError("no exported processed csv found")

        # each exported CSV must have a fully-populated beta_value column
        for csv_path in expected_csvs:
            frame = pd.read_csv(csv_path)
            if frame['beta_value'].isna().sum() > 0:
                print(frame.head())
                raise AssertionError('missing values in processed csv')

        # spot checking the output.
        unmeth_row = containers[1].unmethylated.data_frame.iloc[0]
        if unmeth_row['mean_value'] != 2712:
            raise AssertionError()
        # spot checking the output.
        total_nas = containers[0]._SampleDataContainer__data_frame[
            'beta_value'].isna().sum()
        if total_nas > 0:
            print(
                f'found {total_nas} missing beta_values (N/A or inf) in sample'
            )
            raise AssertionError()
        if not np.isclose(unmeth_row['noob'], 4479.96501260212):
            raise AssertionError()
 def test_batch_size_betas():
     """Compare batch-mode betas against reference values for four probes,
     check the combined pickle exists, then clean up all pickle outputs."""
     data_dir = 'docs/example_data/GSE69852'
     betas = pipeline.run_pipeline(data_dir, betas=True, batch_size=1)
     ref_rows = [
         ['cg00063477', 0.959879, 0.961307],
         ['cg00121626', 0.512332, 0.351993],
         ['cg27619353', 0.184946, 0.358009],
         ['cg27620176', 0.984706, 0.983877],
     ]
     ref_data = pd.DataFrame(
         ref_rows,
         columns=['IlmnID', '9247377093_R02C01', '9247377085_R04C02'],
     ).set_index('IlmnID')
     test_betas = betas.loc[ref_data.index]
     if not np.isclose(ref_data, test_betas, atol=0.01).all():
         raise AssertionError("betas returned don't match")
     if not Path(data_dir, 'beta_values.pkl').is_file():
         raise AssertionError()
     print(f"TEST OUTPUT FILES: {list(Path(data_dir).rglob('*'))}")
     for pkl in Path(data_dir).rglob('*.pkl'):
         pkl.unlink()
Exemple #11
0
 def test_with_batch_size():
     """Export one named sample with batch_size=1; spot-check beta_value and
     m_value, confirm the CSV exists, then remove pickle outputs."""
     data_dir = 'docs/example_data/GSE69852'
     containers = pipeline.run_pipeline(data_dir,
                                        export=True,
                                        batch_size=1,
                                        sample_name='AdultLiver1')
     sample_df = containers[0]._SampleDataContainer__data_frame
     if not np.isclose(sample_df.iloc[0]['beta_value'], 0.236):
         raise AssertionError()
     if not np.isclose(sample_df.iloc[2]['m_value'], 4.146):
         raise AssertionError()
     csv_path = Path(
         'docs/example_data/GSE69852/9247377093/9247377093_R02C01_processed.csv'
     )
     if not csv_path.is_file():
         raise AssertionError()
     print(f"TEST OUTPUT FILES: {list(Path(data_dir).rglob('*'))}")
     for pkl in Path(data_dir).rglob('*.pkl'):
         pkl.unlink()
Exemple #12
0
    def test_run_pipeline_demo_containers():
        """ check that we get back useful data.
        check that output files exist, then remove them."""
        containers = pipeline.run_pipeline('docs/example_data/GSE69852',
                                           sesame=False)
        print('containers:', containers)

        # spot checking the output.
        sample_df = containers[1]._SampleDataContainer__data_frame
        if not np.isclose(sample_df.iloc[0]['m_value'],
                          -1.1347262,
                          atol=0.01):
            raise AssertionError()
        if not np.isclose(sample_df.iloc[0]['noob_unmeth'],
                          4480.919922,
                          atol=1.0):
            raise AssertionError()
Exemple #13
0
    def test_run_pipeline_all():
        """ check that we get back useful data.
        check that output files exist, then remove them."""
        data_dir = 'docs/example_data/GSE69852'
        pickle_names = [
            'control_probes.pkl',
            'beta_values.pkl',
            'm_values.pkl',
            'meth_values.pkl',
            'unmeth_values.pkl',
            'noob_meth_values.pkl',
            'noob_unmeth_values.pkl',
            'sample_sheet_meta_data.pkl',
        ]
        test_outputs = [Path(data_dir, name) for name in pickle_names]
        test_outputs.append(
            Path(data_dir, '9247377085', '9247377085_R04C02_processed.csv'))
        test_outputs.append(
            Path(data_dir, '9247377093', '9247377093_R02C01_processed.csv'))
        # remove leftovers from prior runs so existence checks are meaningful
        for outfile in test_outputs:
            if outfile.exists():
                outfile.unlink()

        beta_df = pipeline.run_pipeline(data_dir,
                                        export=True,
                                        save_uncorrected=True,
                                        save_control=True,
                                        betas=True,
                                        m_value=True,
                                        batch_size=None)
        # every expected output must now exist; remove each after checking
        for outfile in test_outputs:
            if not outfile.exists():
                raise FileNotFoundError(
                    f"Expected {outfile.name} to be generated by run_pipeline() but it was missing."
                )
            print('+', outfile)
            outfile.unlink()
 def test_with_batch_size():
     """Export one named sample with batch_size=1; verify four probes against
     reference values in both the data container and the exported CSV, then
     remove pickle outputs."""
     data_dir = 'docs/example_data/GSE69852'
     containers = pipeline.run_pipeline(data_dir,
                                        export=True,
                                        batch_size=1,
                                        sample_name='AdultLiver1')
     ref_columns = [
         'IlmnID', 'noob_meth', 'noob_unmeth', 'poobah_pval',
         'quality_mask', 'beta_value', 'm_value'
     ]
     ref_rows = [
         ['cg00063477', 4115.0, 172.0, 0.000, 1.0, 0.960, 4.580],
         ['cg00121626', 3552.0, 3381.0, 0.000, 1.0, 0.512, 0.071],
         ['cg27619353', 2204.0, 9713.0, 0.000, 1.0, 0.185, -2.140],
         ['cg27620176', 6052.0, 94.0, 0.001, 1.0, 0.985, 6.009],
     ]
     ref_data = pd.DataFrame(ref_rows,
                             columns=ref_columns).set_index('IlmnID')
     container_data = containers[0]._SampleDataContainer__data_frame.loc[
         ref_data.index]
     if not np.isclose(container_data, ref_data, atol=1.0).all():
         raise AssertionError()
     if not Path(
             'docs/example_data/GSE69852/9247377093/9247377093_R02C01_processed.csv'
     ).is_file():
         raise AssertionError()
     csv_data = pd.read_csv(
         Path(data_dir, '9247377093',
              '9247377093_R02C01_processed.csv')).set_index('IlmnID')
     test_csv = csv_data.loc[ref_data.index]
     if not np.isclose(test_csv, ref_data, atol=1.0).all():
         raise AssertionError()
     print(f"TEST OUTPUT FILES: {list(Path(data_dir).rglob('*'))}")
     for pkl in Path(data_dir).rglob('*.pkl'):
         pkl.unlink()
Exemple #15
0
 def test_run_pipeline_with_create_sample_sheet():
     test_data_dir = 'docs/example_data/epic_plus'
     test_data_containers = pipeline.run_pipeline(test_data_dir,
                                                  export=False,
                                                  sample_name=['Sample_1'],
                                                  meta_data_frame=False,
                                                  make_sample_sheet=True,
                                                  sesame=False)
     # spot checking the output.
     if not np.isclose(
             test_data_containers[0]._SampleDataContainer__data_frame.
             iloc[0]['noob_meth'],
             1180.22998046875,
             atol=1.0):
         print(test_data_containers[0]._SampleDataContainer__data_frame)
         raise AssertionError(
             f"{test_data_containers[0]._SampleDataContainer__data_frame.iloc[0]['noob_meth']} vs {1180.2299}"
         )
     if not np.isclose(
             test_data_containers[0]._SampleDataContainer__data_frame.
             iloc[0]['beta_value'],
             0.759056,
             atol=0.01):
         raise AssertionError()
Exemple #16
0
    def test_run_pipeline_sesame_defaults():
        """ check that we get back useful data.
        checks SDC, CSV outputs, and pickles after sesame=True processing
        check that output files exist, then remove them.

        Comparison order in this test:
        1. SampleDataContainer data frame vs reference rows (meth, unmeth,
           beta, m) using np.isclose with equal_nan=True.
        2. Exported processed CSV vs the same reference rows.
        3. noob_meth pickle vs a small 5-probe reference for the other sample.
        Finally removes all expected output files.
        """
        test_data_dir = 'docs/example_data/GSE69852'
        # every file run_pipeline is expected to create for this dataset
        test_outputs = [
            Path(test_data_dir, 'control_probes.pkl'),
            Path(test_data_dir, 'beta_values.pkl'),
            Path(test_data_dir, 'm_values.pkl'),
            Path(test_data_dir, 'meth_values.pkl'),
            Path(test_data_dir, 'unmeth_values.pkl'),
            Path(test_data_dir, 'noob_meth_values.pkl'),
            Path(test_data_dir, 'noob_unmeth_values.pkl'),
            Path(test_data_dir, 'sample_sheet_meta_data.pkl'),
            Path(test_data_dir, '9247377085',
                 '9247377085_R04C02_processed.csv'),
            Path(test_data_dir, '9247377093',
                 '9247377093_R02C01_processed.csv'),
        ]
        # remove stale outputs so this run actually produces them
        for outfile in test_outputs:
            if outfile.exists():
                outfile.unlink()

        test_data_containers = pipeline.run_pipeline(test_data_dir,
                                                     sesame=True,
                                                     export=True)
        # the probe subset used for the CSV comparison below
        test_probes = [
            'cg00063477', 'cg00121626', 'cg00223952', 'cg27614706',
            'cg27619353', 'cg27620176', 'cg27647370', 'cg27652464'
        ]
        # for version 1.4.0
        # NOTE(review): minfi_ref is built here but never asserted against
        # below — kept as historical reference data only.
        minfi_reference_data = [
            ['cg00035864', 2040.0, 4480.0, 0.308157, -1.134930],
            ['cg00061679', 5946.0, 5276.0, 0.525172, 0.172475],
            ['cg00063477', 5759.0, 315.0, 0.932783, 4.192395],
            ['cg00121626', 3811.0, 7636.0, 0.330042, -1.002648],
            ['cg00223952', 277.0, 12107.0, 0.022188, -5.449811],
            ['cg27614706', 5831.0, 265.0, 0.941091, 4.459679],
            ['cg27619353', 7466.0, 14894.0, 0.332413, -0.996324],
            ['cg27620176', 11753.0, 222.0, 0.973333, 5.726326],
            ['cg27647370', 15752.0, 2212.0, 0.872011, 2.832112],
            ['cg27652464', 656.0, 15224.0, 0.041051, -4.536508],
        ]
        minfi_ref = pd.DataFrame(minfi_reference_data,
                                 columns=[
                                     'IlmnID', 'noob_meth', 'noob_unmeth',
                                     'beta_value', 'm_value'
                                 ]).set_index('IlmnID')
        NaN = np.nan  # this matches '9247377093_R02C01'
        # NOTE(review): reference_data_old_noob is also unused below; kept
        # for comparison with the pre-update noob implementation.
        reference_data_old_noob = [
            ['cg00063477', 4107.0, 172.0, 1.0, 0.960, 4.578],
            ['cg00121626', 3542.0, 3397.0, 1.0, 0.510, 0.060],
            ['cg00223952', NaN, NaN, NaN, NaN, NaN],
            ['cg27614706', NaN, NaN, NaN, NaN, NaN],
            ['cg27619353', 2226.0, 9714.0, 1.0, 0.186, -2.126],
            ['cg27620176', 6057.0, 94.0, 1.0, 0.985, 6.010],
            ['cg27647370', 8897.0, 167.0, 1.0, 0.982, 5.735],
            ['cg27652464', 398.0, 8832.0, 1.0, 0.043, -4.472],
        ]
        reference_data = [  #CSV file
            ['cg00063477', 4115.0, 172.0, 1.0, 0.960, 4.580],
            ['cg00121626', 3552.0, 3381.0, 1.0, 0.512, 0.071],
            ['cg00223952', 420.0, 7058.0, 0.0, 0.056, -4.071],
            ['cg27614706', 3612.0, 90.0, 0.0, 0.976, 5.327],
            ['cg27619353', 2204.0, 9713.0, 1.0, 0.185, -2.140],
            ['cg27620176', 6052.0, 94.0, 1.0, 0.985, 6.010],
            ['cg27647370', 8895.0, 167.0, 1.0, 0.982, 5.735],
            ['cg27652464', 396.0, 8829.0, 1.0, 0.043, -4.479],
        ]
        # same rows, but meth/unmeth/quality_mask are NaN where the in-memory
        # container masks values that the CSV still reports
        reference_container_data = [
            ['cg00063477', 4115.0, 172.0, 1.0, 0.960, 4.580],
            ['cg00121626', 3552.0, 3381.0, 1.0, 0.512, 0.071],
            ['cg00223952', NaN, NaN, NaN, 0.056, -4.071],
            ['cg27614706', NaN, NaN, NaN, 0.976, 5.327],
            ['cg27619353', 2204.0, 9713.0, 1.0, 0.185, -2.140],
            ['cg27620176', 6052.0, 94.0, 1.0, 0.985, 6.010],
            ['cg27647370', 8895.0, 167.0, 1.0, 0.982, 5.735],
            ['cg27652464', 396.0, 8829.0, 1.0, 0.043, -4.479],
        ]

        ref = pd.DataFrame(reference_data,
                           columns=[
                               'IlmnID', 'noob_meth', 'noob_unmeth',
                               'quality_mask', 'beta_value', 'm_value'
                           ]).set_index('IlmnID')
        container_ref = pd.DataFrame(reference_container_data,
                                     columns=[
                                         'IlmnID', 'noob_meth', 'noob_unmeth',
                                         'quality_mask', 'beta_value',
                                         'm_value'
                                     ]).set_index('IlmnID')
        # checking outputs.
        # restrict the container data frame to the reference probes, then
        # compare column by column (equal_nan=True so masked NaNs match)
        idata = test_data_containers[0]._SampleDataContainer__data_frame.index
        iref = ref.index
        subdata = test_data_containers[0]._SampleDataContainer__data_frame[
            idata.isin(iref)]
        meth = all(
            np.isclose(subdata[['noob_meth']],
                       container_ref[['noob_meth']],
                       atol=1.0,
                       equal_nan=True))
        unmeth = all(
            np.isclose(subdata[['noob_unmeth']],
                       container_ref[['noob_unmeth']],
                       atol=1.0,
                       equal_nan=True))
        beta = all(
            np.isclose(subdata[['beta_value']],
                       ref[['beta_value']],
                       atol=0.01,
                       equal_nan=True))
        m = all(
            np.isclose(subdata[['m_value']],
                       ref[['m_value']],
                       atol=0.01,
                       equal_nan=True))
        if meth is False:
            raise AssertionError(
                f"container meth values don't match in data container:\n{subdata[['noob_meth']]}\n{container_ref[['noob_meth']]}"
            )
        if unmeth is False:
            raise AssertionError(
                f"container unmeth values don't match in data container:\n{subdata[['noob_unmeth']]}\n{container_ref[['noob_unmeth']]}"
            )
        if beta is False:
            raise AssertionError(
                f"container beta values don't match in data container")
        if m is False:
            raise AssertionError(
                f"container m values don't match in data container")

        # now compare the exported CSV against the same reference rows
        csv_ref = pd.DataFrame(reference_data,
                               columns=[
                                   'IlmnID', 'noob_meth', 'noob_unmeth',
                                   'quality_mask', 'beta_value', 'm_value'
                               ]).set_index('IlmnID')
        csv_ref = csv_ref[csv_ref.index.isin(test_probes)]
        csv_data = pd.read_csv(
            Path(test_data_dir, '9247377093',
                 '9247377093_R02C01_processed.csv')).set_index('IlmnID')
        csv_data = csv_data[csv_data.index.isin(test_probes)]
        csv_meth = all(
            np.isclose(csv_data[['noob_meth']],
                       csv_ref[['noob_meth']],
                       atol=1.0,
                       equal_nan=True))
        csv_unmeth = all(
            np.isclose(csv_data[['noob_unmeth']],
                       csv_ref[['noob_unmeth']],
                       atol=1.0,
                       equal_nan=True))
        csv_beta = all(
            np.isclose(csv_data[['beta_value']],
                       csv_ref[['beta_value']],
                       atol=0.01,
                       equal_nan=True))
        csv_m = all(
            np.isclose(csv_data[['m_value']],
                       csv_ref[['m_value']],
                       atol=0.01,
                       equal_nan=True))
        if csv_meth is False:
            raise AssertionError(
                f"csv meth values don't match in data container:\n{csv_data[['noob_meth']]}\n{csv_ref[['noob_meth']]}"
            )
        if csv_unmeth is False:
            raise AssertionError(
                f"csv unmeth values don't match in data container:\n{csv_data[['noob_unmeth']]}\n{csv_ref[['noob_unmeth']]}"
            )
        if csv_beta is False:
            raise AssertionError(
                f"csv beta values don't match in data container")
        if csv_m is False:
            raise AssertionError(f"csv m values don't match in data container")

        # finally, spot-check the noob_meth pickle for the second sample
        #beta = pd.read_pickle(Path(test_data_dir, 'beta_values.pkl'))
        noob_meth = pd.read_pickle(Path(test_data_dir, 'noob_meth_values.pkl'))
        # NOTE(review): noob_unmeth is loaded but not asserted against below.
        noob_unmeth = pd.read_pickle(
            Path(test_data_dir, 'noob_unmeth_values.pkl'))
        ref_meth = [
            ['cg00000029', 2231],
            ['cg00000108', 7880],
            ['cg00000109', 3516],
            ['cg00000165', 344],
            ['cg00000236', 3601],
        ]
        ref_meth = pd.DataFrame(ref_meth,
                                columns=['IlmnID', '9247377085_R04C02'
                                         ]).set_index('IlmnID')
        test_noob_meth = noob_meth['9247377085_R04C02'][noob_meth.index.isin(
            ref_meth.index)]
        meth = all(
            np.isclose(test_noob_meth,
                       ref_meth['9247377085_R04C02'],
                       atol=1.0,
                       equal_nan=True))
        if meth is False:
            raise AssertionError("meth values don't match in pickle")

        # cleanup: rebuild the expected-output list and delete whatever exists
        test_data_dir = 'docs/example_data/GSE69852'
        test_outputs = [
            Path(test_data_dir, 'control_probes.pkl'),
            Path(test_data_dir, 'beta_values.pkl'),
            Path(test_data_dir, 'm_values.pkl'),
            Path(test_data_dir, 'meth_values.pkl'),
            Path(test_data_dir, 'unmeth_values.pkl'),
            Path(test_data_dir, 'noob_meth_values.pkl'),
            Path(test_data_dir, 'noob_unmeth_values.pkl'),
            Path(test_data_dir, 'sample_sheet_meta_data.pkl'),
            Path(test_data_dir, '9247377085',
                 '9247377085_R04C02_processed.csv'),
            Path(test_data_dir, '9247377093',
                 '9247377093_R02C01_processed.csv'),
        ]
        for outfile in test_outputs:
            if outfile.exists():
                outfile.unlink()
    def test_make_pipeline_sesame_steps_vs_all(self):
        """
        - check that we get back useful data.
        - compare sesame=True with a list of equivalent steps
        check that output files exist, then remove them."""
        self.clean_dir()
        alt_data_dir = 'docs/example_data/GSE69852_alt'
        copy_files = [
            '9247377093_R02C01_Red.idat', '9247377093_R02C01_Grn.idat',
            '9247377085_R04C02_Red.idat', '9247377085_R04C02_Grn.idat',
            'samplesheet.csv'
        ]

        if not Path(alt_data_dir).exists():
            Path(alt_data_dir).mkdir()
        for copy_file in copy_files:
            if not Path(alt_data_dir, copy_file).exists():
                shutil.copy(Path(self.test_data_dir, copy_file),
                            Path(alt_data_dir, copy_file))

        df1 = pipeline.make_pipeline(self.test_data_dir,
                                     steps=['all'],
                                     exports=['all'],
                                     estimator='betas')

        df2 = pipeline.make_pipeline(alt_data_dir,
                                     steps=[
                                         'infer_channel_switch', 'poobah',
                                         'quality_mask', 'noob', 'dye_bias'
                                     ],
                                     exports=['all'],
                                     estimator='betas')

        test_outputs = [
            'control_probes.pkl',
            'beta_values.pkl',
            'meth_values.pkl',
            'unmeth_values.pkl',
            'noob_meth_values.pkl',
            'noob_unmeth_values.pkl',
            'sample_sheet_meta_data.pkl',
            'poobah_values.pkl',
            Path('9247377085', '9247377085_R04C02_processed.csv'),
            Path('9247377093', '9247377093_R02C01_processed.csv'),
        ]

        assert df1.equals(df2)

        # verify outputs all exist
        for outfile in test_outputs:
            filepath = Path(self.test_data_dir, outfile)
            if not filepath.exists():
                raise FileNotFoundError(
                    f"Expected {filepath.name} to be generated by run_pipeline() but it was missing."
                )
            else:
                print('+', outfile)
                #outfile.unlink()
        for outfile in test_outputs:
            filepath = Path(alt_data_dir, outfile)
            if not filepath.exists():
                raise FileNotFoundError(
                    f"Expected {filepath.name} to be generated by run_pipeline() but it was missing."
                )
            else:
                print('+', outfile)
                #outfile.unlink()

        # compare output files to ensure they match each other
        for outfile in test_outputs:
            filepath1 = Path(self.test_data_dir, outfile)
            filepath2 = Path(alt_data_dir, outfile)
            if filepath1.suffix in ('.pkl', '.csv'):
                if filepath1.suffix == '.pkl':
                    df1 = pd.read_pickle(filepath1)
                    df2 = pd.read_pickle(filepath2)
                elif filepath1.suffix == '.csv':
                    df1 = pd.read_csv(filepath1)
                    df2 = pd.read_csv(filepath2)
                if isinstance(df1, pd.DataFrame) and isinstance(
                        df2, pd.DataFrame):
                    assert df1.equals(df2)
                    print(f"{outfile}: df1 equals df2: {df1.equals(df2)}")
                elif isinstance(df1, dict) and isinstance(df2, dict):
                    # control, mouse probes are dict of dataframes; assume save length
                    for i in range(len(df1)):
                        dfa = list(df1.values())[i]
                        dfb = list(df2.values())[i]
                        assert dfa.equals(dfb)
                        print(
                            f"{outfile}, sample[{i}]: df1 equals df2: {dfa.equals(dfb)}"
                        )
                else:
                    raise ValueError("unknown/mismatched output")

        # match run_pipeline to make_pipeline for basic sesame
        shutil.rmtree(Path(alt_data_dir))
        if not Path(alt_data_dir).exists():
            Path(alt_data_dir).mkdir()
        for copy_file in copy_files:
            if not Path(alt_data_dir, copy_file).exists():
                shutil.copy(Path(self.test_data_dir, copy_file),
                            Path(alt_data_dir, copy_file))

        df2 = pipeline.run_pipeline(
            alt_data_dir,
            sesame=True,
            betas=True,
            poobah=True,  # sesame sets this
            export_poobah=True,
            save_uncorrected=True,
            save_control=True,
            export=True,  #CSV
        )

        # compare output files to ensure they match each other
        # passes: control, meth, unmeth
        failed = []
        for outfile in test_outputs:
            filepath1 = Path(self.test_data_dir, outfile)
            filepath2 = Path(alt_data_dir, outfile)
            if filepath1.suffix in ('.pkl', '.csv'):
                if filepath1.suffix == '.pkl':
                    df1 = pd.read_pickle(filepath1)
                    df2 = pd.read_pickle(filepath2)
                elif filepath1.suffix == '.csv':
                    df1 = pd.read_csv(filepath1)
                    df2 = pd.read_csv(filepath2)
                if isinstance(df1, pd.DataFrame) and isinstance(
                        df2, pd.DataFrame):
                    if not df1.equals(df2):
                        failed.append(
                            f"{outfile} FAILED to match {df1.equals(df2)}")
                    else:
                        print(f"{outfile}: df1 equals df2: {df1.equals(df2)}")
                elif isinstance(df1, dict) and isinstance(df2, dict):
                    # control, mouse probes are dict of dataframes; assume save length
                    for i in range(len(df1)):
                        dfa = list(df1.values())[i]
                        dfb = list(df2.values())[i]
                        assert dfa.equals(dfb)
                        print(
                            f"run vs make pipeline: {outfile}, sample[{i}]: df1 equals df2: {dfa.equals(dfb)}"
                        )
                else:
                    raise ValueError("unknown/mismatched output")
        # reset
        shutil.rmtree(Path(alt_data_dir))
        self.clean_dir()

        if failed:
            for test in failed:
                print(test)
            raise AssertionError("One or more tests failed")
# Exemple #18
    def test_pipeline_meth_unmeth_int16():
        """Run the pipeline on GSE69852 and sanity-check raw meth/unmeth intensities.

        Verifies that:
        - meth_values.pkl and unmeth_values.pkl are exported by run_pipeline()
        - no stored intensity is negative (the pickles hold int16 values, so a
          negative number means a raw intensity exceeded the allowed range)
        - the pickled columns agree with the per-sample processed CSVs
          (within atol=10, after aligning probe order)
        - probe sets/order agree between CSV output and the SampleDataContainer

        Raises:
            AssertionError: if either pickle file is missing after the run.
            ValueError: listing every mismatch found, if any.
        """
        test_data_dir = 'docs/example_data/GSE69852'
        testfile_1 = Path(test_data_dir, 'meth_values.pkl')
        testfile_2 = Path(test_data_dir, 'unmeth_values.pkl')
        # remove stale outputs so we know run_pipeline created fresh ones
        for stale in (testfile_1, testfile_2):
            if stale.exists():
                stale.unlink()
        test_data_containers = pipeline.run_pipeline(test_data_dir,
                                                     export=True,
                                                     save_uncorrected=True,
                                                     sesame=False)
        if not testfile_1.exists():
            raise AssertionError("no meth_values.pkl found")
        if not testfile_2.exists():
            raise AssertionError("no unmeth_values.pkl found")

        m = pd.read_pickle(testfile_1)  # standard output, as int16
        u = pd.read_pickle(testfile_2)
        errors = []

        def _flag_negative_samples(df):
            # a negative value means the intensity exceeded the int16 range
            for sample in df.columns:
                negatives = df[sample][df[sample] < 0]
                if not negatives.empty:
                    print(negatives)
                    print("")
                    errors.append(sample)

        _flag_negative_samples(m)
        _flag_negative_samples(u)
        for stale in (testfile_1, testfile_2):
            if stale.exists():
                stale.unlink()

        def _mismatch_count(series, csv_column, sort=True):
            # number of probes differing by more than 10 units;
            # sort=True aligns probe order first (pkl and CSV orders differ)
            if sort:
                series = series.sort_index()
                csv_column = csv_column.sort_index()
            return (~np.isclose(series, csv_column, atol=10.0)).sum()

        # also confirm the CSV columns match the pickled columns, and non-negative
        testfile_3 = Path(test_data_dir, '9247377093',
                          '9247377093_R02C01_processed.csv')
        testfile_4 = Path(test_data_dir, '9247377085',
                          '9247377085_R04C02_processed.csv')
        csv3 = pd.read_csv(testfile_3).set_index('IlmnID')

        diff = _mismatch_count(m['9247377093_R02C01'], csv3['meth'])
        if diff > 0:
            errors.append(f"9247377093_R02C01 meth pkl != csv {diff}")
        # BUGFIX: the reported count previously recomputed the mismatch using
        # m (meth) against the unmeth CSV column; it now correctly uses u.
        diff = _mismatch_count(u['9247377093_R02C01'], csv3['unmeth'])
        if diff > 0:
            errors.append(f"9247377093_R02C01 unmeth pkl != csv {diff}")

        sdc0 = test_data_containers[0]._SampleDataContainer__data_frame
        # row order differs between the pickle and the container, but the
        # probe sets themselves should be identical
        same_probes = m.sort_index().index.equals(
            sdc0['meth'].sort_index().index)
        if not same_probes:
            errors.append(
                "probes in meth_values.pkl don't match probes in SampleDataContainer"
            )
        # NOTE: pickle row order is NOT expected to match the container, so it
        # is deliberately not asserted; CSV order, however, must match.
        if not csv3['meth'].index.equals(sdc0['meth'].index):
            errors.append(
                "order of probes in output CSV don't match SampleDataContainer"
            )
        # CSV doesn't match the container exactly, but is always within 10 units
        # (same orientation, so no sort needed here)
        if _mismatch_count(sdc0['meth'], csv3['meth'], sort=False) > 0:
            errors.append(
                "SampleDataContainer['meth'] does not match csv3 output")

        csv4 = pd.read_csv(testfile_4).set_index('IlmnID')
        diff = _mismatch_count(m['9247377085_R04C02'], csv4['meth'])
        if diff > 0:
            errors.append(f"9247377085_R04C02 meth pkl != csv {diff}")
        # BUGFIX: same wrong-variable fix as above for the second sample
        diff = _mismatch_count(u['9247377085_R04C02'], csv4['unmeth'])
        if diff > 0:
            errors.append(f"9247377085_R04C02 unmeth pkl != csv {diff}")
        sdc1 = test_data_containers[1]._SampleDataContainer__data_frame
        if _mismatch_count(sdc1['meth'], csv4['meth'], sort=False) > 0:
            errors.append(
                "SampleDataContainer['meth'] does not match csv4 output")

        if errors:
            raise ValueError('\n'.join(errors))