import glob

import pandas as pd

from assnake.core.config import load_wc_config
# load_sample is expected to be defined in (or importable from) the same module.


def load_sample_set(wc_config, fs_prefix, df, preproc,
                    samples_to_add=None, do_not_add=None, pattern='*'):
    '''
    Adds samples into a SampleSet.

    Args:
        wc_config: Wildcard config dict; loaded via load_wc_config() if None.
        fs_prefix: Prefix of the dataset on the filesystem.
        df: Name of the dataset.
        preproc: Preprocessing you want to use.
        samples_to_add: List of sample names to add.
        do_not_add: List of sample names NOT to add.
        pattern: Sample names must match this glob pattern to be included.
    '''
    # Avoid mutable default arguments.
    samples_to_add = samples_to_add or []
    do_not_add = do_not_add or []

    if wc_config is None:
        wc_config = load_wc_config()

    # Discover sample names from R1 fastq.gz files matching the pattern.
    fastq_gz_file_loc = wc_config['fastq_gz_file_wc'].format(
        fs_prefix=fs_prefix, df=df, preproc=preproc, strand='R1',
        df_sample=pattern)
    df_samples = [
        f.split('/')[-1].split('.')[0].replace('_R1', '')
        for f in glob.glob(fastq_gz_file_loc)
    ]

    sample_dir_wc = wc_config['sample_dir_wc']
    fastq_gz_file_wc = wc_config['fastq_gz_file_wc']
    count_wc = wc_config['count_wc']

    # Apply the exclusion filter, then the optional inclusion filter.
    df_samples = list(set(df_samples) - set(do_not_add))
    if len(samples_to_add) > 0:
        df_samples = list(set(df_samples).intersection(set(samples_to_add)))

    samples = [
        load_sample(fs_prefix, df, preproc, df_sample,
                    sample_dir_wc=sample_dir_wc,
                    fastq_gz_file_wc=fastq_gz_file_wc,
                    count_wc=count_wc)
        for df_sample in df_samples
    ]
    sample_set = pd.DataFrame(samples)
    return sample_set
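# Usage sketch (hypothetical values; a real call needs an assnake instance
# with a configured database and fastq.gz files on disk):
#
#   sample_set = load_sample_set(
#       None,                       # wc_config is loaded internally when None
#       '/data/metagenomics',       # fs_prefix (hypothetical)
#       'FHM',                      # dataset name (hypothetical)
#       'raw',                      # preprocessing
#       do_not_add=['bad_sample'],  # exclude known-bad samples
#       pattern='*',                # include every sample name
#   )
#   print(sample_set[['df_sample', 'reads']])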
import glob
import os

import pandas as pd
import yaml

import assnake.api.loaders
from assnake.core.config import load_wc_config, read_assnake_instance_config


def __init__(self, df, include_preprocs=True):
    wc_config = load_wc_config()
    instance_config = read_assnake_instance_config()

    # Locate and read the dataset description file from the assnake database.
    df_info_loc = instance_config['assnake_db'] + \
        '/datasets/{df}/df_info.yaml'.format(df=df)
    df_info = {}
    if not os.path.isfile(df_info_loc):
        raise assnake.api.loaders.InputError('NO DATASET ' + df)
    with open(df_info_loc, 'r') as stream:
        try:
            info = yaml.load(stream, Loader=yaml.FullLoader)
            if 'df' in info:
                df_info = info
        except yaml.YAMLError as exc:
            print(exc)

    reads_dir = os.path.join(df_info['fs_prefix'], df_info['df'], 'reads/*')
    # If the raw preprocessing folder contains R2 files, the dataset is paired-end.
    dataset_type_checker_pattern = os.path.join(
        df_info['fs_prefix'], df_info['df'], 'reads/raw/*_R2.*')
    dataset_type_checker = glob.glob(dataset_type_checker_pattern)
    preprocs = [p.split('/')[-1] for p in glob.glob(reads_dir)]

    self.df = df_info['df']
    self.fs_prefix = df_info['fs_prefix']
    self.dataset_type = ('paired-end' if len(dataset_type_checker) > 0
                         else 'single-end')
    self.full_path = os.path.join(self.fs_prefix, self.df)

    # Load the sample set for every discovered preprocessing.
    preprocessing = {}
    if include_preprocs:
        for p in preprocs:
            samples = load_sample_set(wc_config, self.fs_prefix, self.df, p)
            if len(samples) > 0:
                samples = samples[[
                    'preproc', 'df', 'fs_prefix', 'df_sample', 'reads'
                ]]
                preprocessing.update({p: samples})
    self.sample_sets = preprocessing

    if len(self.sample_sets.keys()) > 0:
        self.sample_containers = pd.concat(self.sample_sets.values())
        # Wide table: rows = samples, columns = preprocessings, values = reads.
        self.self_reads_info = self.sample_containers.pivot(
            index='df_sample', columns='preproc', values='reads')
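# Usage sketch (assumes this __init__ belongs to a Dataset class; the dataset
# name 'FHM' is hypothetical and must exist in the assnake database):
#
#   dataset = Dataset('FHM')
#   print(dataset.dataset_type)        # 'paired-end' or 'single-end'
#   print(list(dataset.sample_sets))   # discovered preprocessings, e.g. ['raw']
#   print(dataset.self_reads_info)     # per-sample read counts by preprocessing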
import glob
import os
import time

from pkg_resources import iter_entry_points

from assnake.core.config import load_wc_config

wc_config = load_wc_config()
start = time.time()  # start timestamp for timing plugin discovery

# Discover plugins registered under the 'assnake.plugins' entry point.
discovered_plugins = {
    entry_point.name: entry_point.load()
    for entry_point in iter_entry_points('assnake.plugins')
}

# We need to update wc_config first.
for module_name, module_class in discovered_plugins.items():
    module_config = {'install_dir': module_class.install_dir}
    if module_class.module_config is not None:
        module_config.update(module_class.module_config)
    # 'config' is assumed to be defined earlier in this file
    # (e.g. the Snakemake config object).
    config.update({module_name: module_config})
    for wc_conf in module_class.wc_configs:
        if wc_conf is not None:
            wc_config.update(wc_conf)
    for res in module_class.results:
        if res.wc_config is not None:
            wc_config.update(res.wc_config)
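# Sketch of the interface a plugin class must expose to be merged above.
# The attribute names come from the loop itself; the class and package names
# are hypothetical:
#
#   class SnakeModule:
#       install_dir = '/path/to/plugin'      # location of the plugin's rules
#       module_config = {'option': 'value'}  # extra config, or None
#       wc_configs = [{'my_result_wc': '{fs_prefix}/{df}/my_result/{df_sample}.tsv'}]
#       results = []                         # result objects, each may carry a wc_config
#
# Registration in the plugin's setup.py (standard setuptools entry point,
# hypothetical package path):
#
#   entry_points={'assnake.plugins': ['my-plugin = my_plugin.module:SnakeModule']}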