def load(self, name=None, db_name=None, recbyname=True):
    """Load a pre-existing directory structure, config file and database
    for the given dataset name.

    Parameters:
        name      -- dataset name (required, must be a string).
        db_name   -- database file name; defaults to '<name>.db'.
        recbyname -- passed through to Popgen_db (record lookup mode).

    Raises an Exception if name is missing or not a string.
    """
    if not isinstance(name, str):
        raise Exception('Must specify a valid name for the dataset.')
    if db_name is None:
        db_name = name + '.db'

    # Locate the dataset directory under this machine's data prefix
    prefix = get_data_prefix()
    path2config = joinp(prefix, name, '.' + name + '-config.pkl')
    path2db = joinp(prefix, name, db_name)

    self.db = Popgen_db(path2db, recbyname=recbyname)

    # Binary mode is required for pickled data on some platforms, and the
    # context manager guarantees the handle is closed (the original leaked it).
    with open(path2config, 'rb') as config_file:
        self.c = pkl.load(config_file)

    # Rewrite all stored paths against this machine's data prefix: the
    # pickled config may have been created on a different machine.
    if self.c.testing:
        self.c.data_inpath = joinp(prefix, name, 'testset')
    else:
        self.c.data_inpath = joinp(prefix, name, 'raw-data')
    self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
    self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
    self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
    self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
    self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
    self.c.clusters_outpath = joinp(prefix, name, 'clusters')
    self.c.db_path = joinp(prefix, name)
def create_new(self, name=None, db_name=None, testing=False):
    """Set up the directory structure and initialise the config file and
    database for a new dataset with the given name.

    Parameters:
        name    -- dataset name (required, must be a string).
        db_name -- database file name; defaults to '<name>.db'.
        testing -- if True, point data_inpath at the 'testset' directory
                   instead of 'raw-data'.

    Raises an Exception if name is missing or not a string.
    """
    if not isinstance(name, str):
        raise Exception('Must specify a valid name for the dataset.')
    if db_name is None:
        db_name = name + '.db'

    # Default path locations, rooted at this machine's data prefix
    prefix = get_data_prefix()
    self.c.testing = testing
    self.c.root_name = name
    self.c.db_name = db_name
    if testing:
        self.c.data_inpath = joinp(prefix, name, 'testset')
    else:
        self.c.data_inpath = joinp(prefix, name, 'raw-data')
    self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
    self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
    self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
    self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
    self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
    self.c.clusters_outpath = joinp(prefix, name, 'clusters')
    self.c.db_path = joinp(prefix, name)
    self.c.cdhit_path = os.path.expanduser("~/bin/cd-hit-v4.6.1")

    # Create directories if they don't exist.
    # NOTE(review): this matches any config attribute whose name contains
    # 'path', which includes cdhit_path (the cd-hit install location) --
    # confirm that creating that directory too is intended.
    for attr in dir(self.c):
        if 'path' in attr:
            path = getattr(self.c, attr)
            if not os.path.exists(path):
                os.makedirs(path)

    # Var to choose between different output locations after splitting data
    self.c.current_tag_split_outpath = None

    # Interim file suffixes appended at each processing stage
    self.c.filtered_files_postfix = '-pass'
    self.c.tag_processed_files_postfix = '-clean'

    # MIDtags
    self.c.cutsite = 'TGCAGG'
    self.c.max_edit_dist = 2

    # FILTERING: whether to log reads that fail the filtering
    self.c.log_fails = False

    # Create new database
    self.db = Popgen_db(joinp(self.c.db_path, db_name), recbyname=True, new=True)

    # Save config. Binary mode ('wb') is required for pickle data, and the
    # context manager closes/flushes the handle (the original opened in 'w'
    # and never closed it).
    with open(joinp(self.c.db_path, '.' + name + '-config.pkl'), 'wb') as config_file:
        pkl.dump(self.c, config_file)
def get_build_tests_script():
    """Return the path to the 'build_tests.py' helper script under the
    data prefix."""
    helpers_dir = os.path.join(utils.get_data_prefix(), 'helpers')
    return os.path.join(helpers_dir, 'build_tests.py')
import glob
import cPickle as pkl

from preprocess2 import Preprocessor, ConfigClass
from cluster import ClusterClass
from database2 import Popgen_db
# NOTE(review): get_data_prefix and joinp are used below but are not imported
# in this chunk -- presumably supplied by a project utility module; confirm
# the import is present elsewhere in the file.

#==============================================================================
''' Cluster script for all reads in the Gazelles-Zebras RAD data '''
#==============================================================================

testing = True

# Load config: work out where the data is stored on this machine
prefix = get_data_prefix()
path2config = joinp(prefix, 'gazelles-zebras', 'config.pkl')
# Context manager closes the config file (the original leaked the handle)
with open(path2config, 'rb') as config_file:
    c = pkl.load(config_file)

# Start from the output of the tag-processing stage
c.starting_data_inpath = c.tag_processed_outpath

# Set next files to process: a small test set, or lanes 6 and 8 of the
# cleaned data, depending on the 'testing' switch.
Preprocess = Preprocessor(c)
if testing:
    file_pattern = 'testset_500-pass-clean.fastq.bgzf'
else:
    file_pattern = 'lane[68]*-clean.bgzf'
Preprocess.set_input_files(data_inpath=c.starting_data_inpath, file_pattern=file_pattern)
c.starting_input_files = glob.glob(joinp(c.starting_data_inpath, file_pattern))
def get_build_sub_project_script():
    """Return the path to the 'build_sub_project.py' helper script under
    the data prefix."""
    helpers_dir = os.path.join(utils.get_data_prefix(), 'helpers')
    return os.path.join(helpers_dir, 'build_sub_project.py')