def merge_split_casava_results(split_dir, output_dir, split_categories=None):
    """Recombine split directories after they have been run through casava.

    The input sub-directories are inferred from the meta data stored under
    ``split_dir``, each sample sheet object is labeled with the directory its
    data originally landed in, and the fastq data is then moved into
    ``output_dir``.

    :param split_dir: directory containing the per-split casava output
        (one sub-directory per combination of ``split_categories`` values).
    :param output_dir: directory that receives the merged results.
    :param split_categories: meta-data categories the run was split by;
        defaults to ``["Index_length", "Lane"]``.  (A ``None`` default is
        used so the list is not shared between calls.)
    """
    if split_categories is None:
        split_categories = ["Index_length", "Lane"]
    sample_sheet_obj_list = SampleSheetObjList()
    # Get all sample ids.
    sample_sheet_obj_list.__load_sample_sheets_from_meta_directories__(split_dir, split_categories)
    # Partition by sample and then by project (mirroring what casava does).
    sample_sheet_obj_list = sample_sheet_obj_list.__partition_sample_sheet_objects__("SampleID")
    sample_sheet_obj_list = sample_sheet_obj_list.__partition_sample_sheet_objects__("SampleProject")
    # Label the input directory for each sample.
    for sample_sheet_obj in sample_sheet_obj_list.list:
        split_subdir = "_".join(
            str(sample_sheet_obj.__get_meta_datum__(category))
            for category in split_categories
        )
        project_subdir = "Project_" + sample_sheet_obj.__get_meta_datum__("SampleProject")
        sample_subdir = "Sample_" + sample_sheet_obj.__get_meta_datum__("SampleID")
        # Join the components individually rather than hand-building a
        # "/"-separated string.
        original_dir = os.path.join(split_dir, split_subdir, project_subdir, sample_subdir)
        sample_sheet_obj.__set_meta_datum__("original_dir", original_dir)
    # Do the merging.
    merge_casava_fastq_directories(sample_sheet_obj_list, output_dir, merge_type="move")
    move_undetermined_directories_of_min_length(output_dir, sample_sheet_obj_list, "Lane" in split_categories)
    return
def merge_flowcell_casava_results(flowcell_dirs, output_dir, *args, **kwargs):
    """Merge the samples found across multiple flowcell directories.

    Sample directories are discovered under each flowcell directory, their
    sample sheets are loaded, and the fastq directories are merged into
    ``output_dir`` keyed by flowcell id (``FCID``).

    :param flowcell_dirs: iterable of flowcell directory paths to scan.
    :param output_dir: destination directory for the merged results.
    """
    dirs_by_sample = list_sample_dirs(flowcell_dirs)
    sheet_list = SampleSheetObjList()
    sheet_list.__load_sample_sheets_from_sample_directories__(dirs_by_sample)
    merge_casava_fastq_directories(sheet_list, output_dir, meta_data_prefix=["FCID"])
    return
def __init__(self,config,key=-1,prev_step=None,pipeline=None,split_by_lane=True,split_by_index_length=True,process_name="casava",**kwargs):
    """
    In addition to initializing, other steps are completed.  These are
    commented below.

    Sets up the casava step of a pipeline: locates the input basecalls
    directory and the original sample sheet, optionally partitions the
    sample sheet by lane and/or index length, writes one sample sheet per
    partition under ``<output_dir>/split``, builds the base mask for each
    partition, and finally delegates to ``QsubProcess.__init__``.

    NOTE(review): all of the setup below runs only when ``prev_step`` is
    given — with ``prev_step=None`` this initializer does nothing; confirm
    that is intentional.
    """
    if not prev_step is None:
        # Input comes from the basecalls produced by the previous step;
        # output goes to a directory named after the pipeline output dir.
        input_dir = os.path.join(pipeline.output_dir,"Data/Intensities/BaseCalls")
        output_dir = os.path.join(pipeline.output_dir,os.path.basename(pipeline.output_dir))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # The sample sheet either accompanies the run in the pipeline input
        # dir or was supplied explicitly on the pipeline object.
        if pipeline.sample_sheet is None:
            original_sample_sheet_file = os.path.join(pipeline.input_dir,"SampleSheet.csv")
        else:
            original_sample_sheet_file = pipeline.sample_sheet
        if not os.path.isfile(original_sample_sheet_file):#Check to make sure original sample sheet exists
            # Notify by email before failing so the missing sheet gets fixed.
            send_missing_sample_sheet_email(original_sample_sheet_file)
            raise SampleSheetFormatException("No sample sheet found: "+str(original_sample_sheet_file))
        sample_sheet_obj_list = SampleSheetObjList(sample_sheet_file=original_sample_sheet_file)
        sample_sheet_obj_list.list[0].sample_sheet_table.__write_file__(os.path.join(output_dir,"SampleSheet.csv"))#Copy sample sheet to final output dir.
        # Track whether the run was split at all; any split below clears it.
        self.merged = True
        split_categories = []
        self.split_by_lane = split_by_lane
        if split_by_lane is True:
            #Split by lane (speed up especially for high throughput)
            sample_sheet_obj_list = sample_sheet_obj_list.__partition_sample_sheet_objects__("Lane")
            split_categories.append("Lane")
            self.merged = False
        self.split_by_index_length = split_by_index_length
        if split_by_index_length == True:
            #Split by index lane (prevents casava from breaking when pool
            #samples have different index lengths)
            for sample_sheet_obj in sample_sheet_obj_list.list:
                sample_sheet_obj.__attach_max_column_number__("Index")
            sample_sheet_obj_list = sample_sheet_obj_list.__partition_sample_sheet_objects__("Index",use_length=True)
            split_categories.append("Index_length")
            self.merged = False
        # One qsub task per partition.
        number_tasks = len(sample_sheet_obj_list.list)
        # Write one SampleSheet.csv per partition under <output_dir>/split and
        # record the resulting directories / sheets as ":"-joined strings.
        temporary_output_directories = sample_sheet_obj_list.__create_meta_directories_and_write_files__(os.path.join(output_dir,"split"),split_categories)
        self.temporary_output_dir = ":".join(temporary_output_directories)
        sample_sheets = [os.path.join(d,"SampleSheet.csv") for d in temporary_output_directories]
        self.sample_sheet = ":".join(sample_sheets)
        # Derive the base mask for each partition from runParameters.xml.
        sample_sheet_obj_list.__attach_masks__(run_parameters_path=os.path.join(pipeline.input_dir,"runParameters.xml"))
        masks = []
        for sample_sheet_obj in sample_sheet_obj_list.list:
            mask = sample_sheet_obj.__get_meta_datum__("mask")
            # Commas would collide with downstream delimiters, so use "-"
            # inside a mask (":" already separates the masks themselves).
            mask, number = re.subn(',','-',mask)
            masks.append(mask)
        self.mask = ":".join(masks)
        QsubProcess.__init__(self,config,key=key,output_dir=output_dir,input_dir=input_dir,number_tasks=number_tasks,process_name=process_name,**kwargs)
        self.flowcell_key = pipeline.flowcell_key
        self.seq_run_key = pipeline.seq_run_key
from processes.hiseq.sample_sheet import SampleSheetObjList from processes.hiseq.scripts import list_sample_dirs import argparse if __name__ == '__main__': #Handle arguments parser = argparse.ArgumentParser(description='Test various functions in this functions in this folder that require multiple modules') parser.add_argument('--load_samples_sample_sheets', dest="samples_dir", type=str, help='Test the loading of sample sheets by sample by providing the path for under which all sub-directories are evaluated for SampleSheet.csv.') parser.add_argument('--column_values', dest="values_dir", type=str, help='Test the column values function by returning a list of samples in all of the sample sheets by providing the path for under which all sub-directories are evaluated for SampleSheet.csv.') parser.add_argument('--merge_to_single', dest="merge_dir", type=str, help='Test the merge all sample sheet objects function by returning a single sample sheet by providing the path for under which all sub-directories are evaluated for SampleSheet.csv.') parser.add_argument('--filter_by_sample', dest="filter_dir", type=str, help='Test the filter sample sheet object by printing multiple sample sheet objects after providing the path for under which all sub-directories are evaluated for SampleSheet.csv.') args = parser.parse_args() sample_sheet_obj_list = SampleSheetObjList() if args.samples_dir: sample_dirs_dict = list_sample_dirs([args.samples_dir]) sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict) sample_sheet_obj_list.__print__() if args.values_dir: sample_dirs_dict = list_sample_dirs([args.values_dir]) sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict) print str(sample_sheet_obj_list.__get_column_values__("SampleID")) if args.merge_dir: sample_dirs_dict = list_sample_dirs([args.merge_dir]) sample_sheet_obj_list. 
__load_sample_sheets_from_sample_directories__(sample_dirs_dict) new_sample_sheet_obj_list = sample_sheet_obj_list.__merge_all_sample_sheet_objects__() new_sample_sheet_obj_list.__print__(print_meta_data=False) if args.filter_dir: sample_dirs_dict = list_sample_dirs([args.filter_dir]) sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict) sample_ids = sample_sheet_obj_list.__get_column_values__("SampleID")