Example no. 1
    def load(self, name=None, db_name=None, recbyname=True):
        ''' Load a pre-existing directory structure, config file and database
        with the given dataset name. '''

        if (name is None) or not isinstance(name, str):
            raise Exception('Must specify a valid name for the dataset.')
        if db_name is None:
            db_name = name + '.db'

        # Work out where data is stored on this machine and load the config
        prefix = get_data_prefix()
        path2config = joinp(prefix, name, '.' + name + '-config.pkl')
        path2db = joinp(prefix, name, db_name)

        self.db = Popgen_db(path2db, recbyname=recbyname)
        with open(path2config, 'rb') as f:
            self.c = pkl.load(f)

        # Rebuild the stored paths using this machine's data prefix
        if self.c.testing:
            self.c.data_inpath = joinp(prefix, name, 'testset')
        else:
            self.c.data_inpath = joinp(prefix, name, 'raw-data')
        self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix, name)
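
For context, a minimal sketch of how this loader might be called; the Dataset class name and the 'gazelles-zebras' dataset name are assumptions for illustration, not part of the original code:

# Hypothetical call; assumes a dataset named 'gazelles-zebras' already exists under the data prefix.
d = Dataset()
d.load(name='gazelles-zebras')   # opens gazelles-zebras.db and unpickles .gazelles-zebras-config.pkl
print d.c.data_inpath            # paths are rebuilt against this machine's data prefix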
Example no. 2
    def create_new(self, name=None, db_name=None, testing=False):
        ''' Setup directory structure and initialise config file and database 
        for the given dataset name.'''
        
        if (name is None) or not isinstance(name, str):
            raise Exception('Must specify a valid name for the dataset.')
        if db_name is None:
            db_name = name + '.db'
        
        # Setup Configuration
        prefix = get_data_prefix()

        # Default path locations
        self.c.testing = testing
        self.c.root_name = name
        self.c.db_name = db_name
        if testing:
            self.c.data_inpath = joinp(prefix, name, 'testset')
        else:
            self.c.data_inpath = joinp(prefix, name, 'raw-data')
        self.c.barcode_inpath = joinp(prefix, name, 'barcodes')
        self.c.filtered_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_processed_outpath = joinp(prefix, name, 'processed-data')
        self.c.tag_splitby_sample_outpath = joinp(prefix, name, 'processed-data', 'per-sample')
        self.c.tag_splitby_subgroup_outpath = joinp(prefix, name, 'processed-data', 'per-subgroup')
        self.c.clusters_outpath = joinp(prefix, name, 'clusters')
        self.c.db_path = joinp(prefix, name)
        self.c.cdhit_path = os.path.expanduser("~/bin/cd-hit-v4.6.1")


        # Create directories if they don't exist
        for attr in dir(self.c):
            if 'path' in attr:
                path = getattr(self.c, attr)
                if not os.path.exists(path):
                    os.makedirs(path)

        # Var to choose between different output locations after splitting data 
        self.c.current_tag_split_outpath = None
        
        # Set interim file suffixes
        self.c.filtered_files_postfix = '-pass'
        self.c.tag_processed_files_postfix = '-clean'

        # MIDtags
        self.c.cutsite = 'TGCAGG'
        self.c.max_edit_dist = 2
        
        # FILTERING
        # Whether to log reads that fail the filtering         
        self.c.log_fails = False
        
        # Create new Database 
        self.db = Popgen_db(joinp(self.c.db_path, db_name), recbyname=True, new=True)
        
        # Save config (binary mode, and make sure the file is flushed and closed)
        with open(joinp(self.c.db_path, '.' + name + '-config.pkl'), 'wb') as f:
            pkl.dump(self.c, f)
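
A hedged sketch of the intended round trip, assuming the same hypothetical Dataset-style class wraps both methods: create_new() lays out the directory tree and pickles the config that load() reads back later.

# Hypothetical usage; class and dataset names are illustrative only.
d = Dataset()
d.create_new(name='gazelles-zebras', testing=True)   # makes <prefix>/gazelles-zebras/{testset,barcodes,processed-data,clusters,...}
d2 = Dataset()
d2.load(name='gazelles-zebras')                      # re-opens gazelles-zebras.db and restores the saved config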
Example no. 3
def get_build_tests_script():
    return os.path.join(utils.get_data_prefix(), 'helpers', 'build_tests.py')
Example no. 4
import glob
import cPickle as pkl

from preprocess2 import Preprocessor, ConfigClass
from cluster import ClusterClass
from database2 import Popgen_db
# Assumed source of the path helpers used below (get_data_prefix, joinp)
from utils import get_data_prefix, joinp

#==============================================================================
''' Cluster SCRIPT FOR ALL READS IN Gazelles-Zebras RAD data '''
#==============================================================================

testing = True

# Load Config and setup

# Work out where data is stored on this machine
prefix = get_data_prefix()
path2config = joinp(prefix, 'gazelles-zebras', 'config.pkl')

with open(path2config, 'rb') as f:
    c = pkl.load(f)
# Set path 
c.starting_data_inpath = c.tag_processed_outpath

# Set next files to process 
Preprocess = Preprocessor(c)
if testing:
    Preprocess.set_input_files(data_inpath=c.starting_data_inpath, file_pattern='testset_500-pass-clean.fastq.bgzf')
    c.starting_input_files = glob.glob(joinp(c.starting_data_inpath, 'testset_500-pass-clean.fastq.bgzf'))
else:
    Preprocess.set_input_files(data_inpath=c.starting_data_inpath, file_pattern='lane[68]*-clean.bgzf')
    c.starting_input_files = glob.glob(joinp(c.starting_data_inpath, 'lane[68]*-clean.bgzf'))
    
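For reference, a small standalone sketch of what the file_pattern glob above selects; the directory and file names here are made up purely for illustration:

import glob
import os

# Illustrative only: pick up lane 6 and lane 8 cleaned files from a made-up location
data_inpath = '/data/gazelles-zebras/processed-data'
for f in sorted(glob.glob(os.path.join(data_inpath, 'lane[68]*-clean.bgzf'))):
    print os.path.basename(f)   # matches lane6... and lane8... files, but not lane7...
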
Example no. 5
def get_build_sub_project_script():
    return os.path.join(utils.get_data_prefix(), 'helpers', 'build_sub_project.py')
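
A hedged usage sketch for these helper-script getters (Examples no. 3 and 5); running the returned paths with the current interpreter is an assumption about how they are meant to be used:

import subprocess
import sys

# Assumption: the returned .py paths are runnable helper scripts
subprocess.check_call([sys.executable, get_build_tests_script()])
subprocess.check_call([sys.executable, get_build_sub_project_script()])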