p = {'filtering': {'propN': 0.1,
                   'phred': 25,
                   'cutsite_edit_dist': 2,
                   'overhang_edit_dist': 0},
     'cleaning': {'max_edit_dist': 1}}

# Insert into filter_parameters table
c.filterparam_id = db.insert_binary(p, col='params', table='filtering_parameters')

Preprocess.filter_functions = [
    Preprocess.make_propN_filter(p['filtering']['propN']),
    Preprocess.make_phred_filter(p['filtering']['phred']),
    Preprocess.make_cutsite_filter(max_edit_dist=p['filtering']['cutsite_edit_dist']),
    Preprocess.make_overhang_filter('TCGAGG', 'GG', p['filtering']['overhang_edit_dist'])
]

#===============================================================================
# Run Filtering
#===============================================================================
Preprocess.filter_reads_pipeline()

#===============================================================================
# Process and Correct MID tag
#===============================================================================
Preprocess.process_MIDtag(max_edit_dist=p['cleaning']['max_edit_dist'])

# Remove filtered intermediate files
Preprocess.cleanup_files('filtered')

# Store or pass on config file to clustering section
# Pickle config
pkl.dump(c, open(joinp(prefix, c.root, 'config.pkl'), 'wb'))
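# The clustering stage can recover the same configuration object by unpickling
# it again. A minimal sketch (assumes the same prefix and c.root as the dump
# above; not part of the original script):
#
#     c = pkl.load(open(joinp(prefix, c.root, 'config.pkl'), 'rb'))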
class Workflow(object):
    ''' Container for all preprocessing, filtering, and exploratory analysis
    of a particular dataset '''

    def __init__(self):
        self.c = ConfigClass()

    def create_new(self, name=None, db_name=None, testing=False):
        ''' Setup directory structure and initialise config file and database
        for the given dataset name.'''

        if (name is None) or (type(name) is not str):
            raise Exception('Must specify a valid name for the dataset.')
        if db_name is None:
            db_name = name + '.db'

        # Setup Configuration
        prefix = get_data_prefix()

        # Default path locations
        self.c.testing = testing
        self.c.root_name = name
        self.c.db_name = db_name

        if testing:
            self.c.data_inpath = joinp(prefix, name, 'testset')
        else:
            self.c.data_inpath = joinp(prefix, name, 'raw-data')
        self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix, name)
        self.c.cdhit_path = os.path.expanduser("~/bin/cd-hit-v4.6.1")

        # Create directories if they don't exist
        for attr in dir(self.c):
            if 'path' in attr:
                path = getattr(self.c, attr)
                if not os.path.exists(path):
                    os.makedirs(path)

        # Var to choose between different output locations after splitting data
        self.c.current_tag_split_outpath = None

        # Set interim file suffixes
        self.c.filtered_files_postfix = '-pass'
        self.c.tag_processed_files_postfix = '-clean'

        # MIDtags
        self.c.cutsite = 'TGCAGG'
        self.c.max_edit_dist = 2

        # FILTERING
        # Whether to log reads that fail the filtering
        self.c.log_fails = False

        # Create new Database
        self.db = Popgen_db(joinp(self.c.db_path, db_name), recbyname=True, new=True)

        # Save config
        f = open(joinp(self.c.db_path, '.' + name + '-config.pkl'), 'wb')
        pkl.dump(self.c, f)
        f.close()

    def load(self, name=None, db_name=None, recbyname=True):
        ''' Load a pre-existing directory structure, config file and database
        with the given dataset name.'''

        if (name is None) or (type(name) is not str):
            raise Exception('Must specify a valid name for the dataset.')
        if db_name is None:
            db_name = name + '.db'

        # Load config
        prefix = get_data_prefix()
        path2config = joinp(prefix, name, '.' + name + '-config.pkl')
        path2db = joinp(prefix, name, db_name)

        self.db = Popgen_db(path2db, recbyname=recbyname)
        self.c = pkl.load(open(path2config, 'rb'))

        # Setup Configuration with new prefix
        prefix = get_data_prefix()

        if self.c.testing:
            self.c.data_inpath = joinp(prefix, name, 'testset')
        else:
            self.c.data_inpath = joinp(prefix, name, 'raw-data')
        self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix, name)

    def add_datafiles(self, data_files=None, barcode_files=None):
        ''' Add datafiles and barcodes in pairs to the database.

        Each pair defines the samples present in the datafiles listed.
        If 'data_files' or 'barcode_files' is a str, it is interpreted as a glob
        relative to data_inpath / barcode_inpath respectively. '''
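        # For example (hypothetical file patterns, not part of the original code):
        #   wf.add_datafiles(data_files='lane3*.fastq', barcode_files='lane3_barcodes*.txt')
        # would glob-expand both patterns against data_inpath and barcode_inpath
        # before registering the pairs in the database.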
        if type(data_files) is str:
            data_files = glob.glob(joinp(self.c.data_inpath, data_files))
        if type(barcode_files) is str:
            barcode_files = glob.glob(joinp(self.c.barcode_inpath, barcode_files))

        # Input samples-datafiles info in Database
        self.db.add_barcodes_datafiles(barcode_files, data_files, datafile_type='raw_mixed')

    def setup_preprocessing(self, infiles_pattern, params=None, param_id=None):
        ''' Setup the preprocessing function for the workflow '''

        # Get params if id given
        if params is None and param_id is None:
            raise Exception("No parameters and no parameter id to lookup.")

        if param_id:
            params = self.db.get_binary('params', 'filtering_parameterID', param_id,
                                        table='filtering_parameters')
            assert params, "No data returned from database for param_id: %s" % param_id
            self.c.filterparam_id = param_id
        else:
            # Insert parameters dictionary into filter_parameters table
            self.c.filterparam_id = self.db.insert_binary(params, col='params',
                                                          table='filtering_parameters')

        # Define Preprocessing Class and set inputs
        self.Preprocessor = Preprocessor(self.c)
        self.Preprocessor.db = self.db  # Pass database reference to Preprocessor
        self.Preprocessor.set_input_files(data_files=infiles_pattern,
                                          data_inpath=self.c.data_inpath)

        self.Preprocessor.filter_functions = [
            self.Preprocessor.make_propN_filter(params['filtering']['propN']),
            self.Preprocessor.make_phred_filter(params['filtering']['phred']),
            self.Preprocessor.make_cutsite_filter(max_edit_dist=params['filtering']['cutsite_edit_dist']),
            self.Preprocessor.make_overhang_filter('TCGAGG', 'GG', params['filtering']['overhang_edit_dist'])
        ]

        # Save addition to config file
        path = joinp(self.c.db_path, '.' + self.c.root_name + '-config.pkl')
        if os.path.exists(path):
            os.remove(path)
        pkl.dump(self.c, open(path, 'wb'))

    def run_preprocessing(self):
        ''' Call the Preprocessing functions for the workflow '''

        params = self.db.get_binary('params', 'filtering_parameterID', self.c.filterparam_id,
                                    table='filtering_parameters')

        # Run filtering
        self.Preprocessor.filter_reads_pipeline()

        # Process and correct MID tag
        self.Preprocessor.process_MIDtag(max_edit_dist=params['cleaning']['max_edit_dist'])

        # Remove filtered intermediate files
        self.Preprocessor.cleanup_files('filtered')

    def setup_clustering(self, mode, infiles_pattern, infiles_path=None, default_params=None,
                         subgroups=None):
        ''' Setup files for the Clustering function of the workflow.

        Does the necessary splitting and trimming of files if specified in mode. '''
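        # Recognised modes (summarised from the branches below):
        #   'split_by_tags'      - split reads per sample by MID tag, then trim
        #   'split_by_subgroups' - split reads by the given subgroups, then trim
        #   'no_split_grouped'   - trim all input files together, no splitting
        #   'no_split_separate'  - trim each input file separately, no splitting
        # e.g. (hypothetical pattern, not part of the original code):
        #   wf.setup_clustering(mode='split_by_tags', infiles_pattern='*-clean*')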
        # Input Checks
        if not hasattr(self, 'c'):
            raise Exception('Must first load a config file')
        if not hasattr(self, 'Preprocessor'):
            self.Preprocessor = Preprocessor(self.c)
            self.Preprocessor.db = self.db
        if infiles_path is None:
            infiles_path = self.c.tag_processed_outpath

        # Set files to process for clustering
        self.Preprocessor.set_input_files(data_files=infiles_pattern, data_inpath=infiles_path)

        if mode == 'split_by_tags':
            (outfiles, outpath) = self.Preprocessor.split_by_tags()
            self.c.current_tag_split_outpath = outpath

            # Create index for files clustered
            # makeSQLindex(outfiles, outpath)

            files2cluster, path = self.Preprocessor.trim_reads(mode='separate',
                                                               outpath=self.c.tag_splitby_sample_outpath, n=1)

        elif mode == 'split_by_subgroups':
            if subgroups is None:
                raise Exception("No subgroups specified")

            (outfiles, outpath) = self.Preprocessor.split_by_subgroups(subgroups)
            self.c.current_tag_split_outpath = outpath

            # Create index for files clustered
            # makeSQLindex(outfiles, outpath)

            files2cluster, path = self.Preprocessor.trim_reads(mode='separate',
                                                               outpath=self.c.tag_splitby_subgroup_outpath, n=1)

        elif mode == 'no_split_grouped':
            # Create index for files clustered
            # makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)

            files2cluster, path = self.Preprocessor.trim_reads(mode='grouped', n=1)
            self.c.current_tag_split_outpath = path

        elif mode == 'no_split_separate':
            # Create index for files clustered
            # makeSQLindex(filepattern=infiles_pattern, data_inpath=self.c.tag_processed_outpath)

            files2cluster, path = self.Preprocessor.trim_reads(mode='separate', n=1)
            self.c.current_tag_split_outpath = path

        else:
            raise Exception('No valid mode specified.')

        # Setup Clusterer
        self.Clusterer = ClusterClass(infiles=files2cluster, inpath=path, config=self.c,
                                      db=self.db, defaults=default_params)

    def run_clustering(self, run_parameters, **kwargs):
        ''' Run CDHIT using the specified parameters. Passes on any kwargs. '''

        outputs_list = self.Clusterer.run_batch_cdhit_clustering(run_parameters, **kwargs)
        outnames_list, out_path, counters_list = zip(*outputs_list)

        return outnames_list, out_path, counters_list

    def add_experiment_name(self, name, description):
        ''' Add experimental details and config object to the database '''
        # These should be unique for each experiment, else the results table is overwritten
        self.c.experiment_name = name
        self.c.experiment_description = description
        self.c.exp_id = self.db.add_experiment(config=self.c, exp_type='clustering')

    def cleanup_files(self, file_type):
        ''' Remove all intermediate files of the specified type '''
        self.Preprocessor.cleanup_files(file_type)
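#===============================================================================
# Example usage
#===============================================================================
# A minimal end-to-end sketch of the workflow. The dataset name, file patterns
# and parameter values below are illustrative assumptions, not values from the
# original project; the parameter dictionary follows the structure expected by
# setup_preprocessing above.
if __name__ == '__main__':

    wf = Workflow()
    wf.create_new(name='example-dataset', testing=True)  # hypothetical dataset name

    # Register raw data files and their barcode files (hypothetical glob patterns)
    wf.add_datafiles(data_files='*.fastq', barcode_files='*.txt')

    # Filtering / cleaning parameters, in the format used throughout this module
    params = {'filtering': {'propN': 0.1,
                            'phred': 25,
                            'cutsite_edit_dist': 2,
                            'overhang_edit_dist': 0},
              'cleaning': {'max_edit_dist': 1}}

    wf.setup_preprocessing(infiles_pattern='*.fastq', params=params)
    wf.run_preprocessing()

    # Split and trim the cleaned reads, one file per sample, then cluster
    wf.setup_clustering(mode='split_by_tags', infiles_pattern='*-clean*')
    wf.add_experiment_name('example-run', 'Illustrative clustering run')

    # run_parameters is passed straight to ClusterClass.run_batch_cdhit_clustering;
    # its exact structure depends on that class and is assumed here, so the call
    # is left commented out:
    # outnames, out_path, counters = wf.run_clustering(run_parameters)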