Ejemplo n.º 1
0
def load_sample_set(wc_config,
                    fs_prefix,
                    df,
                    preproc,
                    samples_to_add=None,
                    do_not_add=None,
                    pattern='*'):
    '''
    Collect the samples of one dataset/preprocessing into a DataFrame.

    Sample names are discovered by globbing for R1 fastq.gz files using the
    ``fastq_gz_file_wc`` wildcard string from ``wc_config``; each discovered
    sample is then loaded with ``load_sample``.

    Args:
        wc_config: Mapping with the wildcard strings ``fastq_gz_file_wc``,
            ``sample_dir_wc`` and ``count_wc``. When None, it is obtained
            from ``load_wc_config()``.
        fs_prefix: Prefix of the dataset on filesystem
        df: Name of the dataset
        preproc: Preprocessing you want to use
        samples_to_add: Sample names to add. None or an empty collection
            means "no restriction".
        do_not_add: Sample names NOT to add. None means "exclude nothing".
        pattern: sample names must match this glob pattern to be included.

    Returns:
        pandas.DataFrame built from the values returned by ``load_sample``,
        one per selected sample, ordered by sample name. It is empty when no
        sample matches.
    '''
    if wc_config is None:
        wc_config = load_wc_config()
    if do_not_add is None:
        do_not_add = ()

    fastq_gz_file_wc = wc_config['fastq_gz_file_wc']
    sample_dir_wc = wc_config['sample_dir_wc']
    count_wc = wc_config['count_wc']

    # Only the R1 file is searched for; `pattern` takes the place of the
    # sample name in the wildcard string.
    r1_glob = fastq_gz_file_wc.format(fs_prefix=fs_prefix,
                                      df=df,
                                      preproc=preproc,
                                      strand='R1',
                                      df_sample=pattern)

    # Sample name = file name up to the first '.', with '_R1' removed.
    # NOTE(review): replace() removes every '_R1' occurrence, not only a
    # trailing one, and names containing '.' get truncated - confirm that
    # sample names can never contain either.
    selected = {
        path.split('/')[-1].split('.')[0].replace('_R1', '')
        for path in glob.glob(r1_glob)
    }
    selected -= set(do_not_add)

    if samples_to_add is not None and len(samples_to_add) > 0:
        selected &= set(samples_to_add)

    # Sorted so that row order does not depend on set iteration order.
    samples = [
        load_sample(fs_prefix,
                    df,
                    preproc,
                    df_sample,
                    sample_dir_wc=sample_dir_wc,
                    fastq_gz_file_wc=fastq_gz_file_wc,
                    count_wc=count_wc) for df_sample in sorted(selected)
    ]

    return pd.DataFrame(samples)
Ejemplo n.º 2
0
    def __init__(self, df, include_preprocs=True):
        """Load dataset metadata and, optionally, its per-preprocessing samples.

        Reads ``<assnake_db>/datasets/<df>/df_info.yaml`` (``assnake_db`` comes
        from the instance config) and inspects the dataset's ``reads``
        directory on the filesystem.

        Args:
            df: Name of the dataset.
            include_preprocs: When true, call ``load_sample_set`` for every
                entry found under ``<fs_prefix>/<df>/reads/`` and keep the
                non-empty results in ``self.sample_sets``.

        Raises:
            assnake.api.loaders.InputError: if the df_info.yaml file does not
                exist, cannot be parsed, or lacks the 'df' / 'fs_prefix' keys.

        Attributes set:
            df, fs_prefix, full_path, dataset_type ('paired-end' or
            'single-end'), sample_sets (dict: preprocessing name -> DataFrame).
            sample_containers and self_reads_info are set only when at least
            one non-empty sample set was loaded.
        """
        wc_config = load_wc_config()
        instance_config = read_assnake_instance_config()

        df_info_loc = instance_config[
            'assnake_db'] + '/datasets/{df}/df_info.yaml'.format(df=df)

        if not os.path.isfile(df_info_loc):
            raise assnake.api.loaders.InputError('NO DATASET ' + df)

        with open(df_info_loc, 'r') as stream:
            try:
                # NOTE(review): FullLoader is not the restricted loader;
                # consider yaml.safe_load if this file can be untrusted.
                info = yaml.load(stream, Loader=yaml.FullLoader)
            except yaml.YAMLError as exc:
                # Fail here with a clear error rather than later with a
                # KeyError on the missing metadata.
                raise assnake.api.loaders.InputError(
                    'CANNOT PARSE DATASET INFO ' + df_info_loc) from exc

        # An empty file parses to None; a usable file must provide both keys.
        if (not isinstance(info, dict) or 'df' not in info
                or 'fs_prefix' not in info):
            raise assnake.api.loaders.InputError(
                'INVALID DATASET INFO ' + df_info_loc)

        self.df = info['df']
        self.fs_prefix = info['fs_prefix']
        self.full_path = os.path.join(self.fs_prefix, self.df)

        # check in raw preprocess folder if dataset is paired-end
        r2_pattern = os.path.join(self.full_path, 'reads/raw/*_R2.*')
        self.dataset_type = ('paired-end'
                             if glob.glob(r2_pattern) else 'single-end')

        preprocessing = {}
        if include_preprocs:
            reads_dir = os.path.join(self.full_path, 'reads/*')
            # Every entry directly under reads/ is treated as a preprocessing.
            for entry in glob.glob(reads_dir):
                p = os.path.basename(entry)
                samples = load_sample_set(wc_config, self.fs_prefix, self.df,
                                          p)
                if len(samples) > 0:
                    preprocessing[p] = samples[[
                        'preproc', 'df', 'fs_prefix', 'df_sample', 'reads'
                    ]]

        self.sample_sets = preprocessing
        if self.sample_sets:
            self.sample_containers = pd.concat(self.sample_sets.values())
            # One row per sample, one column per preprocessing.
            self.self_reads_info = self.sample_containers.pivot(
                index='df_sample', columns='preproc', values='reads')
Ejemplo n.º 3
0
import glob, os, time
from assnake.core.config import load_wc_config
from pkg_resources import iter_entry_points 
wc_config = load_wc_config()

start = time.time()

# Load every entry point registered in the 'assnake.plugins' group,
# keyed by entry point name.
discovered_plugins = dict(
    (entry_point.name, entry_point.load())
    for entry_point in iter_entry_points('assnake.plugins'))

# Merge each plugin's settings into the shared configuration objects.
# NOTE(review): `config` is not defined in the visible part of this file -
# confirm it is provided by the surrounding execution context.
for module_name, module_class in discovered_plugins.items():
    # Per-plugin config always carries install_dir; plugin-supplied
    # values are layered on top when present.
    module_config = dict(install_dir=module_class.install_dir)
    if module_class.module_config is not None:
        module_config.update(module_class.module_config)
    config.update({module_name: module_config})

    # Wildcard configs declared on the plugin itself.
    for wc_conf in module_class.wc_configs:
        if wc_conf is None:
            continue
        wc_config.update(wc_conf)

    # Wildcard configs declared by each of the plugin's results.
    for res in module_class.results:
        if res.wc_config is None:
            continue
        wc_config.update(res.wc_config)