def merge_split_casava_results(split_dir,output_dir,split_categories=None):
    """
    After split directories are run through casava, this function recombines them.

    The sample sheets are loaded from the meta directories under split_dir and
    partitioned by SampleID and then by SampleProject.  Each resulting sample
    sheet object is tagged with the directory its data is expected to be in
    ("original_dir"), and the data is then moved into output_dir.

    split_dir        -- directory containing one sub-directory per split.
    output_dir       -- directory handed to the merge and undetermined-move steps.
    split_categories -- meta data categories whose values, joined with "_", name
                        each split sub-directory.  Defaults to
                        ["Index_length","Lane"].
    """
    #A fresh list per call: a list literal as the default would be one shared object
    #that is handed to the helper methods below.
    if split_categories is None:
        split_categories = ["Index_length","Lane"]
    sample_sheet_obj_list = SampleSheetObjList()
    sample_sheet_obj_list.__load_sample_sheets_from_meta_directories__(split_dir,split_categories)
    #Get all sample ids.
    sample_sheet_obj_list = sample_sheet_obj_list.__partition_sample_sheet_objects__("SampleID") #Partition by sample (as casava does)
    sample_sheet_obj_list = sample_sheet_obj_list.__partition_sample_sheet_objects__("SampleProject") #Partition by project (as casava does)
    #Label the input directory for each sample
    for sample_sheet_obj in sample_sheet_obj_list.list:
        split_subdir = "_".join([str(sample_sheet_obj.__get_meta_datum__(category)) for category in split_categories])
        project_subdir = "Project_" + sample_sheet_obj.__get_meta_datum__("SampleProject")
        sample_subdir = "Sample_" + sample_sheet_obj.__get_meta_datum__("SampleID")
        #Join the components separately.  Gluing them with "/" first produced a
        #component starting with "/" when split_categories was empty, which made
        #os.path.join throw split_dir away.
        original_dir = os.path.join(split_dir,split_subdir,project_subdir,sample_subdir)
        sample_sheet_obj.__set_meta_datum__("original_dir",original_dir)
    #Do the merging
    merge_casava_fastq_directories(sample_sheet_obj_list,output_dir,merge_type="move")
    #The last argument tells the helper whether the data was split by lane.
    move_undetermined_directories_of_min_length(output_dir,sample_sheet_obj_list,"Lane" in split_categories)
def merge_flowcell_casava_results(flowcell_dirs,output_dir,*args,**kwargs):
    """
    Merges the samples found in several flowcell directories into output_dir.

    The sample directories are listed from flowcell_dirs, their sample sheets
    are loaded, and the fastq directories are merged using "FCID" as the
    meta data prefix.  Extra positional and keyword arguments are accepted
    but are not used here.
    """
    dirs_by_sample = list_sample_dirs(flowcell_dirs)
    loaded_sheets = SampleSheetObjList()
    loaded_sheets.__load_sample_sheets_from_sample_directories__(dirs_by_sample)
    merge_casava_fastq_directories(
        loaded_sheets,
        output_dir,
        meta_data_prefix=["FCID"],
    )
Example #3
0
 def __init__(self,config,key=-1,prev_step=None,pipeline=None,split_by_lane=True,split_by_index_length=True,process_name="casava",**kwargs):
     """
     Prepares a casava process from the given pipeline.

     Nothing at all is done when prev_step is None.  Otherwise the sample sheet
     is located and copied to the output directory, optionally partitioned by
     lane and/or by index length, one temporary directory (with its own
     SampleSheet.csv) is written per partition, the masks are attached, and
     finally QsubProcess.__init__ is called with one task per partition.

     Raises SampleSheetFormatException (after sending the missing sample sheet
     email) when the sample sheet file does not exist.
     """
     if prev_step is None:
         return
     basecalls_dir = os.path.join(pipeline.output_dir,"Data/Intensities/BaseCalls")
     results_dir = os.path.join(pipeline.output_dir,os.path.basename(pipeline.output_dir))
     if not os.path.exists(results_dir):
         os.makedirs(results_dir)
     #Use the pipeline's sample sheet when it has one, otherwise the one in its input dir.
     sheet_path = pipeline.sample_sheet if pipeline.sample_sheet is not None else os.path.join(pipeline.input_dir,"SampleSheet.csv")
     if not os.path.isfile(sheet_path):
         #Without the original sample sheet nothing can proceed: notify and abort.
         send_missing_sample_sheet_email(sheet_path)
         raise SampleSheetFormatException("No sample sheet found: "+str(sheet_path))
     sheets = SampleSheetObjList(sample_sheet_file=sheet_path)
     #Copy sample sheet to final output dir.
     sheets.list[0].sample_sheet_table.__write_file__(os.path.join(results_dir,"SampleSheet.csv"))
     self.merged = True
     categories = []
     self.split_by_lane = split_by_lane
     if split_by_lane is True:
         #Split by lane (speed up especially for high throughput)
         sheets = sheets.__partition_sample_sheet_objects__("Lane")
         categories.append("Lane")
         self.merged = False
     self.split_by_index_length = split_by_index_length
     if split_by_index_length == True:
         #Split by index length (prevents casava from breaking when pool samples have different index lengths)
         for sheet in sheets.list:
             sheet.__attach_max_column_number__("Index")
         sheets = sheets.__partition_sample_sheet_objects__("Index",use_length=True)
         categories.append("Index_length")
         self.merged = False
     task_count = len(sheets.list)
     split_dirs = sheets.__create_meta_directories_and_write_files__(os.path.join(results_dir,"split"),categories)
     #Per-partition values are stored as ":"-separated strings.
     self.temporary_output_dir = ":".join(split_dirs)
     self.sample_sheet = ":".join([os.path.join(split_dir,"SampleSheet.csv") for split_dir in split_dirs])
     sheets.__attach_masks__(run_parameters_path=os.path.join(pipeline.input_dir,"runParameters.xml"))
     #Commas inside each mask are swapped for dashes before the masks are joined.
     self.mask = ":".join([re.sub(',','-',sheet.__get_meta_datum__("mask")) for sheet in sheets.list])
     QsubProcess.__init__(self,config,key=key,output_dir=results_dir,input_dir=basecalls_dir,number_tasks=task_count,process_name=process_name,**kwargs)
     self.flowcell_key = pipeline.flowcell_key
     self.seq_run_key = pipeline.seq_run_key
Example #4
0
from processes.hiseq.sample_sheet import SampleSheetObjList
from processes.hiseq.scripts import list_sample_dirs
import argparse

if __name__ == '__main__':
    #Handle arguments
    parser = argparse.ArgumentParser(description='Test various functions in this functions in this folder that require multiple modules')
    parser.add_argument('--load_samples_sample_sheets', dest="samples_dir", type=str, help='Test the loading of sample sheets by sample by providing the path for under which all sub-directories are evaluated for SampleSheet.csv.')
    parser.add_argument('--column_values', dest="values_dir", type=str, help='Test the column values function by returning a list of samples in all of the sample sheets by providing the path for under which all sub-directories are evaluated for SampleSheet.csv.')
    parser.add_argument('--merge_to_single', dest="merge_dir", type=str, help='Test the merge all sample sheet objects function by returning a single sample sheet by providing the path for under which all sub-directories are evaluated for SampleSheet.csv.')
    parser.add_argument('--filter_by_sample', dest="filter_dir", type=str, help='Test the filter sample sheet object by printing multiple sample sheet objects after providing the path for under which all sub-directories are evaluated for SampleSheet.csv.')

    args = parser.parse_args()
    sample_sheet_obj_list = SampleSheetObjList()
    if args.samples_dir:
        sample_dirs_dict = list_sample_dirs([args.samples_dir])
        sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict)
        sample_sheet_obj_list.__print__()
    if args.values_dir:
        sample_dirs_dict = list_sample_dirs([args.values_dir])
        sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict)
        print str(sample_sheet_obj_list.__get_column_values__("SampleID"))
    if args.merge_dir:
        sample_dirs_dict = list_sample_dirs([args.merge_dir])
        sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict)
        new_sample_sheet_obj_list = sample_sheet_obj_list.__merge_all_sample_sheet_objects__()
        new_sample_sheet_obj_list.__print__(print_meta_data=False)
    if args.filter_dir:
        sample_dirs_dict = list_sample_dirs([args.filter_dir])
        sample_sheet_obj_list. __load_sample_sheets_from_sample_directories__(sample_dirs_dict)
        sample_ids = sample_sheet_obj_list.__get_column_values__("SampleID")