Example #1
def process_series(id, path, seen_platforms, batch_size, **kwargs):
    """Processes the samples for each platform for the specified series, saving one pickled dataframe for each platform

    Arguments:
        id [required]
            the Accession for the desired series
        path [required]
            the directory containing the series data
        seen_platforms [required]
            the platforms the series has samples of
        batch_size
            the number of samples to process at a time"""
    for platform in seen_platforms:
        if not Path(path, platform, f"{id}_beta_values.pkl").exists():
            data_dir = f"{path}/{platform}"
            LOGGER.info(f"Processing {id} -- {platform} samples")
            LOGGER.info(kwargs)
            run_pipeline(
                data_dir,
                betas=True,
                batch_size=batch_size,
                make_sample_sheet=kwargs.get('make_sample_sheet', False),
                meta_data_frame=kwargs.get(
                    'meta_data_frame',
                    False))  #make_sample_sheet handled within miniml.py logic
            # v1.3.x auto-consolidates the per-batch beta files, so the manual
            # consolidation step (see Example #2) is no longer needed here.
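A minimal usage sketch of the call above. The accession, folder, and platform name are hypothetical and assume the series idats were already downloaded into a per-platform subfolder:

# Hypothetical: GSE000000 idats already downloaded under ./GSE000000/GPL13534/
process_series('GSE000000', './GSE000000', ['GPL13534'],
               batch_size=100, make_sample_sheet=True)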
Example #2
def process_series(id, path, seen_platforms, batch_size, **kwargs):
    """Processes the samples for each platform for the specified series, saving one pickled dataframe for each platform

    Arguments:
        id [required]
            the Accession for the desired series
        path [required]
            the directory containing the series data
        seen_platforms [required]
            the platforms the series has samples of
        batch_size
            the number of samples to process at a time"""
    for platform in seen_platforms:
        if not Path(path, platform, f"{id}_beta_values.pkl").exists():
            data_dir = f"{path}/{platform}"
            LOGGER.info(f"Processing {id} -- {platform} samples")
            LOGGER.info(kwargs)
            run_pipeline(
                data_dir,
                betas=True,
                batch_size=batch_size,
                make_sample_sheet=kwargs.get('make_sample_sheet', False),
                meta_data_frame=kwargs.get(
                    'meta_data_frame',
                    False))  #make_sample_sheet handled within miniml.py logic
            dfs = []
            betas_list = list(Path(data_dir).glob('beta_values_*.pkl'))
            for beta in betas_list:
                df = pd.read_pickle(beta)
                dfs.append(df)
            if len(dfs) > 1:
                LOGGER.info(
                    f"Concatenating {len(betas_list)} beta_value files.")
                joined_df = pd.concat(dfs, axis=1)
            else:
                joined_df = dfs[0]

            joined_df.to_pickle(Path(path, platform, f"{id}_beta_values.pkl"))
            for beta in betas_list:
                os.remove(beta)
            LOGGER.info(
                f"Consolidated {id} {platform} samples; saved to {id}_beta_values.pkl"
            )
Example #3
def test_open_sesame_betas_vs_methylprep():
    """Simplest test: does the openSesame beta output match the run_pipeline beta output?"""
    LOCAL = Path('docs/example_data/GSE69852/')
    m_betas = methylprep.run_pipeline(LOCAL, betas=True, sample_name=['FetalLiver1'])
    m_betas = m_betas.sort_index()
    # columns={'beta_value':'9247377085_R04C02'}
    s_betas = (pd.read_csv(Path(LOCAL, 'sesame_open_betas.csv'))
               .set_index('Unnamed: 0')
               .rename(columns=(lambda x: x[1:]))
               .sort_index()
              )
    s_betas.index.name = 'IlmnID'
    s_betas = s_betas[~s_betas.index.str.startswith('rs')]
    s_betas = s_betas[['247377085_R04C02']]
    s_betas = s_betas.rename(columns={'247377085_R04C02':'9247377085_R04C02'}) # opensesame had wrong sentrix ID, missing 9, for some reason
    print((m_betas - s_betas).mean())
    #(m_betas - s_betas).plot.hist(bins=200)
    assert abs(float((m_betas - s_betas).mean())) < 0.002  # actual is 0.001023
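The same tolerance check can be written as an explicit mean absolute difference, assuming m_betas keeps the sentrix ID as its column name (as the rename above implies):

# mean absolute difference between the two beta matrices, aligned on IlmnID
diff = (m_betas['9247377085_R04C02'] - s_betas['9247377085_R04C02']).abs()
assert diff.mean() < 0.002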
Example #4
def test_process_mouse():
    """Catches anything seriously broken about mouse array processing.
    In v1.4.4 / v0.7.4 I expect this to use the linear dye fallback within the sesame method, because of dupe probe names."""
    PATH = 'docs/example_data/mouse'
    ID = '204879580038_R06C02'
    print('* loading mouse manifest')
    import methylprep
    manifest = methylprep.files.Manifest(
        methylprep.models.ArrayType('mouse'))
    print('* loading one idat pair of files')
    green_filepath = Path(PATH, f'{ID}_Grn.idat')  # 204879580038_R06C02_Grn.idat
    red_filepath = Path(PATH, f'{ID}_Red.idat')  # 204879580038_R06C02_Red.idat
    print(f"* GREEN --> {green_filepath.name}")
    print(f"* RED --> {red_filepath.name}")
    if not (green_filepath.exists() and green_filepath.is_file()):
        raise FileNotFoundError("mouse test data missing")
    if not (red_filepath.exists() and red_filepath.is_file()):
        raise FileNotFoundError("mouse test data missing")
    files_to_remove = [
        'samplesheet.csv', 'control_probes.pkl', 'mouse_probes.pkl',
        'sample_sheet_meta_data.pkl', 'noob_meth_values.pkl',
        'noob_unmeth_values.pkl'
    ]
    for _file in files_to_remove:
        if Path(PATH, _file).is_file():
            Path(PATH, _file).unlink()
    data = methylprep.run_pipeline(PATH, make_sample_sheet=True)
    df = data[0]._SampleDataContainer__data_frame
    print(np.isclose(list(df['beta_value'][:3]),
                     [0.905712, 0.841185, 0.129731]))
    assert np.isclose(list(df['beta_value'][:3]),
                      [0.905712, 0.841185, 0.129731]).all()
    for _file in files_to_remove:
        if Path(PATH, _file).is_file():
            Path(PATH, _file).unlink()
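The duplicated cleanup loop at the start and end of this test could be factored into a small helper; an illustrative sketch (the helper name is not part of methylprep):

from pathlib import Path

def _remove_generated_files(path, filenames):
    # delete previously generated pipeline outputs so the test starts and ends clean
    for name in filenames:
        target = Path(path, name)
        if target.is_file():
            target.unlink()

# usage: _remove_generated_files(PATH, files_to_remove) before and after run_pipeline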
Example #5
def build_composite_dataset(geo_id_list,
                            data_dir,
                            merge=True,
                            download_it=True,
                            extract_controls=False,
                            require_keyword=None,
                            sync_idats=True,
                            betas=False,
                            m_value=False,
                            export=False):
    """A wrapper function for convert_miniml() to download a list of GEO datasets
    and process only those samples that meet criteria. Specifically - grab the "control" or "normal" samples
    from a bunch of experiments for one tissue type (e.g. "blood"), process them, and put all the resulting
    beta_values and/or m_values pkl files in one place, so that you can run `methylize.load_both()` to
    create a combined reference dataset for QC, analysis, or meta-analysis.

    Arguments:
        geo_id_list (required):
            the name of a text file, located in data_dir, listing GEO "GSEnnn" series ids, one per line
        data_dir:
            folder to save data
        merge (True):
            If merge==True and there is a file with 'samplesheet' in the folder, and that sheet has GSM_IDs,
            merge that data into this samplesheet. Useful when you have idats and want one combined samplesheet for the dataset.

        download_it (True):
            if the MINiML file is not in the data_dir path, it will be downloaded from the web.

        extract_controls (False):
            if you only want to retain samples from the whole set that have certain keywords,
            such as "control" or "blood", this experimental flag will rewrite the samplesheet with only the parts you want,
            then feed that into run_pipeline with named samples.
        require_keyword (None):
            another way to eliminate samples from samplesheets before passing them into the processor.
            if specified, the "keyword" string passed in must appear somewhere in the values of a samplesheet
            for a sample to be downloaded, processed, and retained.
        sync_idats:
            If flagged, this will search `data_dir` for idats and remove any that are not found in the filtered samplesheet.
            Requires the idats to have been downloaded first (this function downloads them before syncing).
        betas:
            process beta_values
        m_value:
            process m_values
        export:
            passed through to run_pipeline; counts as an output type when deciding whether to process

        - Attempts to also read idat filenames, if they exist, but won't fail if they don't.
        - Removes unneeded files as it goes, but leaves the MINiML xml file and its folder in place as a marker when a GEO dataset fails to download, so it won't be retried on resume.
    """
    def remove_unused_files(geo_id, geo_folder):
        if not list(Path(data_dir, geo_id).rglob('*.idat')):
            for file in Path(data_dir, geo_id).glob("*_samplesheet.csv"):
                file.unlink()
            for file in Path(data_dir, geo_id).glob("*_meta_data.pkl"):
                file.unlink()
            # the XML file and folder are kept to mark failed downloads, so they are skipped on resume
            #for file in Path(data_dir, geo_id).glob("*_family.xml"):
            #    file.unlink()
            #try:
            #    Path(data_dir, geo_id).rmdir()
            #except Exception as e:
            #    LOGGER.error(f"Path {data_dir/geo_id} is not empty. Could not remove.")
            return True
        return False

    start_time = time.process_time()
    # note: parser uses VERBOSE setting to show/suppress INFO and DEBUG level messages. WARNING/ERROR msgs always appear.
    # get the ids from file
    try:
        with open(Path(data_dir, geo_id_list), 'r') as fp:
            geo_ids = [series_id.strip() for series_id in fp]
    except FileNotFoundError:
        LOGGER.error(
            """File not found: Specify your list of GEO series IDs to download using a text file in the folder where data should be saved. Put one ID on each line. """
        )
        return
    except ValueError as e:
        LOGGER.error(f"Error with {fp.name}: {e}")
        return

    geo_folders = {}
    for geo_id in geo_ids:
        # exclude failed folders: if the folder exists and only contains the miniml family.xml file, skip it.
        if Path(data_dir, geo_id).exists() and Path(
                data_dir, geo_id, f'{geo_id}_family.xml').exists():
            if len(list(Path(data_dir, geo_id).rglob('*'))) == 1:
                LOGGER.info(
                    f"Skipping {geo_id}; appears to be a prior run that didn't match filters, or was missing data."
                )
                continue
        # exclude geo series whose HTML pages don't say TAR (of idat).
        if not methylprep.download.process_data.confirm_dataset_contains_idats(geo_id):
            LOGGER.error(
                f"[!] Geo data set {geo_id} probably does NOT contain usable raw data (in .idat format). Not downloading."
            )
            continue

        geo_folder = Path(data_dir, geo_id)
        geo_folders[geo_id] = geo_folder

        # download meta_data
        LOGGER.info(f"Running {geo_id}")
        try:
            convert_miniml(geo_id,
                           data_dir=geo_folder,
                           merge=merge,
                           download_it=download_it,
                           extract_controls=extract_controls,
                           require_keyword=require_keyword,
                           sync_idats=False)  #no idat files exist yet.
        except Exception as e:
            LOGGER.error(f'Processing meta_data failed: {e}')
            continue
        ## if the samplesheet is empty, stop.
        abort = False
        for platform in geo_platforms:
            if Path(data_dir, geo_id,
                    f"{geo_id}_{platform}_samplesheet.csv").is_file():
                samplesheet = pd.read_csv(
                    Path(data_dir, geo_id,
                         f"{geo_id}_{platform}_samplesheet.csv"))
                if len(samplesheet.index) == 0:
                    LOGGER.warning(
                        f"Aborting {geo_id}: No samples match filters (control:{extract_controls}, keyword: {require_keyword})"
                    )
                    # TODO: cleanup this first.
                    abort = True
                    remove_unused_files(geo_id, geo_folder)
                    geo_folders.pop(geo_id, None)
                    continue
        if abort:
            continue

        # download idats only if none exist yet
        if not list(Path(geo_folder).rglob('*.idat')):
            try:
                # download idats
                methylprep.download.process_data.run_series(
                    geo_id,
                    geo_folder,
                    dict_only=True,
                    batch_size=200,
                    clean=True,  # do later in this function
                )
            except Exception as e:
                LOGGER.error(f'Downloading IDATs failed: {e}')
                # THIS DOES NOT CLEAN THE FOLDERS IF ERROR
                continue

        if sync_idats:
            for platform in geo_platforms:
                paths = list(
                    Path(data_dir,
                         geo_id).rglob(f'{geo_id}_{platform}_samplesheet.csv'))
                if len(paths) > 0:
                    remove_idats_not_in_samplesheet(paths[0],
                                                    Path(data_dir, geo_id))
                #else:
                #    LOGGER.error(f"Could not locate file ({geo_id}_{platform}_samplesheet.csv}) in path of {data_dir}/{geo_id}")

        # there are probably two versions of samplesheets. if they're the same, remove one.
        # if they differ, keep the one created by miniml (where filtering happens)
        # this should only proceed this far if both samplesheets contain samples.
        for platform in geo_platforms:
            samp_A = Path(data_dir, geo_id,
                          f"{geo_id}_{platform}_samplesheet.csv")
            samp_B = Path(data_dir, geo_id, platform,
                          f"{geo_id}_{platform}_samplesheet.csv")
            meta_B = Path(data_dir, geo_id, platform,
                          f"{geo_id}_{platform}_meta_data.pkl")
            if samp_A.is_file() and samp_B.is_file():
                # in every case, keep the miniml-filtered version (samp_A) and drop the duplicate
                samp_B.unlink()
                if meta_B.is_file():
                    meta_B.unlink()
        # next, remove all files if there are no idats in the folder.
        if remove_unused_files(geo_id, geo_folder):
            geo_folders.pop(geo_id)

    if not (export or betas or m_value):
        LOGGER.info(
            "Not processing data, because no output types are specified.")
    else:
        for geo_id, geo_folder in geo_folders.items():
            try:
                run_pipeline(
                    geo_folder,  #maybe pass in sample_sheet_filepath if it gets confused here.
                    betas=betas,
                    m_value=m_value,
                    batch_size=200,
                    export=export,
                    meta_data_frame=False)
            except Exception as e:
                LOGGER.warning(f'Processing IDATs failed for {geo_id}: {e}')

    LOGGER.info('[!] Consolidating data files [!]')
    # consolidate data into one folder and remove the rest.
    file_patterns = [
        'beta_values_*.pkl' if betas else None,
        'm_values_*.pkl' if m_value else None
    ]
    for pattern in file_patterns:
        if pattern is None:
            continue
        datas = []
        samples = 0
        for file in Path(data_dir).rglob(pattern):
            if file.parts[-2] in geo_ids:
                data = pd.read_pickle(file)
                # data should have probes in rows, samples in columns.
                if data.shape[1] > data.shape[0]:
                    data = data.transpose()
                datas.append(data)
                samples += data.shape[1]
                print(f"-- {len(datas)} {file.parts[-2]} {data.shape}")
        if datas:
            big_data = pd.concat(datas,
                                 axis=1,
                                 ignore_index=False,
                                 sort=True,
                                 copy=False)
            del datas
            LOGGER.info(f"[!] saved {samples} samples to disk from {pattern}.")
            if 'beta' in pattern:
                big_data.to_pickle(Path(data_dir, 'beta_values.pkl'))
            if 'm_value' in pattern:
                big_data.to_pickle(Path(data_dir, 'm_values.pkl'))
    end_time = time.process_time()
    LOGGER.info(
        f"[*] Process time: {round((start_time - end_time)/60.0,1)} min")