Example #1
0
    def run(self):
        """Drive the whole parallel run: launch pooled tasks, poll for
        completion, then emit the final summary and flush the output files."""
        self.start_time = datetime.datetime.now()
        self._initlog()
        self._initpool()
        logm('Parallel tasks: {} starts at {}'.format(
            self.name, strTime(self.start_time)))

        # Cycle until both the pending pool and the running set drain.
        while self.task_pool or self.process_running:
            has_free_slot = len(self.process_running) < self.max_process
            if self.task_pool and has_free_slot:
                # fill a free process slot with the next pooled task
                self._runNewTask()
            elif not self._checkAll():
                # nothing finished this round -> back off before re-polling
                self._avgsleep()

        self.end_time = datetime.datetime.now()
        loggood('Parallel tasks:', self.name, 'end at', strTime(self.end_time))
        err_count = len(self.err_list)
        loggood('Total {:d}'.format(self.runned_tasks),
                'tasks with {:d}'.format(err_count),
                'error' if err_count in [0, 1] else 'errors',
                'using', strDiffTime(self.start_time, self.end_time))

        if self.err_list:
            self._printErrTasks()

        logm('Putting results into log')
        logp('Output csv file to', self.output_filename)
        self.output(lastTime=True)
Example #2
0
 def _initpool(self):
     """Populate the task pool and log how many tasks were queued."""
     logm('Initial pooled tasks')
     self.setupTaskPool(self.task_pool)
     Task.printCount()
     pooled = len(self.task_pool)
     self.total_tasks = pooled
     logp('Total pooled tasks:', pooled)
    def output(self, lastTime=False):
        """Write results via the parent class, then expose the log directory
        under ``~/public_html/log/<name>/<start time>`` through a symlink.

        Parameters
        ----------
        lastTime : bool
            Forwarded to the parent ``output``; True on the final flush.
        """
        # original output function
        # BUG FIX: ``super`` must be called -- ``super.output`` raised
        # AttributeError on the ``super`` builtin itself.
        super().output(lastTime)

        # link all the output to ~/public_html/log/<name>/<date>/
        # BUG FIX: expand '~' so exists()/makedirs() act on the real home
        # dir instead of creating a literal './~' directory.
        home_log_dir = os.path.expanduser(
            os.path.join('~/public_html/log', self.name))
        if not os.path.exists(home_log_dir):
            os.makedirs(home_log_dir)
        sim_dir = os.path.join(home_log_dir, mp.strTime(self.start_time))
        logm('Create symbolic link for log files at', sim_dir)

        abs_logdir = os.path.dirname(os.path.realpath(self.output_filename))
        # BUG FIX: ``os.exists`` does not exist (AttributeError); use
        # os.path.lexists so a stale/broken symlink is also detected.
        if os.path.lexists(sim_dir):
            if os.path.exists(sim_dir) and os.path.samefile(sim_dir,
                                                            abs_logdir):
                # already points at the current log dir -- nothing to do
                return
            if os.path.islink(sim_dir):
                os.rename(sim_dir, sim_dir + '.backup')
                logwarn('link exists. Backup link as {:s}.backup -> {:s}'
                        .format(sim_dir,
                                os.path.realpath(os.path.expanduser(sim_dir))))
            else:
                os.rename(sim_dir, sim_dir + '.backup')
                # BUG FIX: missing space in the message ('as{:s}')
                logwarn('file or dir exists. Backup as {:s}.backup'
                        .format(sim_dir))
        # BUG FIX: ``dir_fd`` expects a file descriptor, not a path string;
        # build the link with its full path instead.  Also re-create the
        # link after a backup (the old code only created it when nothing
        # existed, leaving no link at all after a rename).
        os.symlink(abs_logdir, sim_dir, target_is_directory=True)
Example #4
0
    def __init__(self, output_filename="", max_process=None, dump=True,
                 start_now=True, update=True, name=None, html=True, sort=True):
        """Set up a parallel-task runner.

        Parameters
        ----------
        output_filename : str
            Path of the csv log file written by ``output``.
        max_process : int or None
            Number of worker processes.  ``None`` -> 1; zero or negative
            means "all CPUs minus that many", floored at 1.
        dump : bool
            Use the built-in dump output instead of a custom one.
        start_now : bool
            Call ``run()`` immediately at the end of construction.
        update, name, html, sort :
            Behaviour switches; ``name`` defaults to the class name.
        """
        # clear previous task setup log
        if Task.typeCount:
            Task.refresh()
        logm('Setup parallel task')

        # whole parallel task name
        if name:
            self.name = name
        else:
            logwarn('No name given, guessing by class name')
            self.name = self.__class__.__name__
        logp('Parallel task name:', self.name)

        # Setup number of max process
        self.max_process = self._resolve_max_process(max_process)
        logp('Use', self.max_process, 'processes')

        # for basic structure
        self.output_list = []
        self.err_list = []
        self.task_pool = []
        self.process_running = []
        self.updated_len = 0
        # BUG FIX: run() and output() read ``self.output_filename`` but only
        # ``out_filename`` was set (AttributeError at output time).  The old
        # spelling is kept as an alias for backward compatibility.
        self.output_filename = output_filename
        self.out_filename = output_filename

        # parameters of current running status
        self.runned_tasks = 0
        self.dump = dump
        if not self.dump:
            logp('Using custom output function')
        self.html = html
        self.sort = sort
        self.update = update
        if start_now:
            self.run()

    @staticmethod
    def _resolve_max_process(max_process):
        """Translate the ``max_process`` argument into a positive count."""
        if max_process is None:     # set default max process
            max_process = 1
        if max_process <= 0:
            # <= 0 means "all CPUs minus |max_process|", floored at 1
            logp('Number of process depends on the number of CPU')
            if cpu_count() + max_process <= 0:
                logwarn('Number of process reaches 0! Will be set to 1')
                return 1
            return cpu_count() + max_process
        # normal max_process assignment
        if max_process == 1:
            logwarn('Not using parallel function, use 1 process')
        elif max_process > cpu_count():
            logwarn('# of processes exceeds # of CPUs: {:d}'.format(
                cpu_count()), 'This may decrease speed!')
        return max_process
Example #5
0
 def _checkAll(self):
     """Drop finished tasks from the running list.

     Returns True (and logs progress) when at least one task ended
     during this poll, False otherwise.
     """
     still_running = [task for task in self.process_running
                      if not self._checkIfEnd(task)]
     finished_some = len(still_running) != len(self.process_running)
     self.process_running = still_running
     if not finished_some:
         return False
     # refresh the on-disk log unless everything is already done
     if self.update and (self.task_pool or self.process_running):
         self.output()
     err_count = len(self.err_list)
     logm('Runned {:d}/{:d} tasks with {:d}'.format(
         self.runned_tasks, self.total_tasks, err_count),
         'error' if err_count in [0, 1] else 'errors')
     return True
Example #6
0
    def output(self, lastTime=False):
        """Flush accumulated results to csv (and optionally html).

        ``lastTime=True`` marks the final flush and suppresses the
        incremental-update log line.
        """
        if not lastTime:
            fresh = len(self.output_list) - self.updated_len
            logm('Update log file with {:d} new'.format(fresh),
                 'result' if fresh == 1 else 'results')

        # choose between the generic dump and the custom parser
        if self.dump:
            self.dump2csv(lastTime)
        else:
            self.parse2csv(self.output_list)
        # remember how much has been written so far
        self.updated_len = len(self.output_list)
        if self.html:
            self.write2html(lastTime)
Example #7
0
def fastq_list_Sample(Sample_pth_list, args):
    '''Main program of Sample mode;
    Project mode inherently calls this function.'''
    if args.out_dir:
        args.out_dir = op.abspath(args.out_dir)
        if len(Sample_pth_list) > 1:
            # If multiple samples are input => Sample mode
            # use abspath for out_dir as root path of all results dir
            logp('multiple groups of sample get.')
            logp('root path to results is set manually:', args.out_dir)
    else:
        logp('no out_dir given,',
             'results will got to the dir of every group of sample')

    # Set up the execution environment and screen out invalid parameters
    # here, then hand each sample off to the run_<...>() helpers.
    if args.seq_type == 'paired':
        _run_paired_samples(Sample_pth_list, args)
    else:
        _run_single_samples(Sample_pth_list, args)


def _run_paired_samples(Sample_pth_list, args):
    """Validate and run every R1/R2 pair through run_sample()."""
    # paired mode, run by pairs
    # example: -S A_R0.gz A_R1.gz -S B_R0.gz B_R1.gz -S ...
    # => [
    #     ['A_R0.gz', 'A_R1.gz'],
    #     ['B_R0.gz', 'B_R1.gz'],
    #     [...], ...
    #    ]
    for i, paired_sample_list in enumerate(Sample_pth_list, start=1):
        # validation, samples should be paired
        if len(paired_sample_list) != 2:
            logerr('Input samples :', ' ,'.join(paired_sample_list),
                   'is not paired!', 'Skipping ...')
            continue

        sample_R1, sample_R2 = paired_sample_list

        # samples should exist (the old seq_type re-check was redundant:
        # this helper only runs in paired mode)
        if not op.exists(sample_R1):
            logerr('Input', sample_R1, 'does not exist! Skipping')
            continue
        if not op.exists(sample_R2):
            logerr('Input', sample_R2, 'does not exist! Skipping')
            continue

        # copy args so if we change args.out_dir or args.resume,
        # other samples will not be affected
        temp_args = copy.deepcopy(args)

        # if multiple samples, show the working progress
        if len(Sample_pth_list) > 1:
            logm('({:d}/{:d}) Processing paired:'
                 .format(i, len(Sample_pth_list)),
                 sample_R1, sample_R2)
        else:
            logm('Processing paired:', sample_R1, sample_R2)

        # use absolute path
        sample_R1, sample_R2 = op.abspath(sample_R1), op.abspath(sample_R2)

        if not args.out_dir:
            # default output path <path_of_sample_R1>/Tophat
            # if called by Project mode, args.out_dir will be set
            # automatically
            temp_args.out_dir = op.join(op.dirname(sample_R1), 'Tophat')
        elif len(Sample_pth_list) > 1:
            # make a sample sub_dir below the shared root path
            sample_name = ('Sample_'
                           + op.basename(sample_R1).split('.')[0])
            logp('subdir for sample name guessed from filename:',
                 sample_name)
            # args.out_dir is the root path shared by all samples
            temp_args.out_dir = op.join(args.out_dir, sample_name)

        # if no previous work exists, turn off the --resume option
        cond_log_exist = op.exists(
            op.join(temp_args.out_dir, 'logs/tophat.log'))
        if temp_args.resume and not cond_log_exist:
            logp('previous log file tophat.log not found,',
                 'resume function is temporarily off')
            temp_args.resume = False

        # create the out_dir and check if out_dir exists
        if op.isdir(temp_args.out_dir):
            if not temp_args.resume:
                logwarn('results dir exists', temp_args.out_dir)
        else:
            os.makedirs(temp_args.out_dir, mode=0o755)

        run_sample(sample_R1, sample_R2, temp_args)


def _run_single_samples(Sample_pth_list, args):
    """Validate and run every single-end sample through run_sample()."""
    # single mode, run one by one
    # flatten all sequences into a list
    # example: -S A.fq -S B.fq C.fq D.fa -S ...
    # fastq list now becomes ['A.fq', 'B.fq', 'C.fq', ...]
    flatten_sample_list = list(
        itertools.chain.from_iterable(Sample_pth_list))

    for sample in flatten_sample_list:
        # sample should exist
        if not op.exists(sample):
            # BUG FIX: this message referenced sample_R1, which is never
            # bound in single mode (NameError)
            logerr('Input', sample, 'does not exist! Skipping')
            continue
        logm('Processing', sample)
        sample = op.abspath(sample)
        # BUG FIX: copy args per sample (as paired mode does) so the
        # out_dir derived for the first sample does not leak into the
        # following samples
        temp_args = copy.deepcopy(args)
        # if run as sample mode, set out_dir as the dir of the sample
        if not temp_args.out_dir:
            temp_args.out_dir = op.join(op.dirname(sample), 'Tophat')
        if op.isdir(temp_args.out_dir):
            logwarn('results dir exists', temp_args.out_dir)
        else:
            os.makedirs(temp_args.out_dir, mode=0o755)
        # same function as paired-end mode, but leaving sample_R2 empty
        run_sample(sample, '', temp_args)
Example #8
0
def fastq_list_Project(Project_pth_list, args):
    ''' Main program of Project mode: walk each demultiplexed Illumina
    project directory, read every sample's SampleSheet.csv, and hand each
    lane's fastq pair to run_sample(). '''
    # flatten the nested list of project paths
    # Ex -P A B ... -P C -P D ...
    # => [[A, B], [C], [D], [...], ...]
    # flatten
    # => [A, B, C, D, ...]
    flatten_projects_list = list(
        itertools.chain.from_iterable(Project_pth_list))
    logp('retreiving', len(flatten_projects_list), 'projects')

    for prj_pth in flatten_projects_list:
        if not op.exists(prj_pth) or not op.isdir(prj_pth):
            # not exist or not a directory
            logerr('Cannot find the directory of project:', prj_pth, 
                   'Skipping...')
            continue

        # using absolute path to prevent that some programs can not handle
        # relative path and easy for debugging
        prj_dir = op.abspath(prj_pth)

        # a typical sequencing path output after Illumina demultiplexing
        # .../<date_index_FCID>/Unaligned/Project_Test/Sample_HAHA
        # desired output path
        # .../<date_index_FCID>/Aligned/Project_Test/TopHat/Sample_HAHA
        FCID_dir, prj_name = op.split(prj_dir)
        FCID_dir = op.split(FCID_dir)[0]
        # [8:] drops the leading 'Project_' prefix (see path example above)
        logm('Working project:', prj_name[8:])

        # determine result dir
        if not args.out_dir:
            prj_result_root = op.join(FCID_dir, 'Aligned', prj_name, 'TopHat') 
        else:
            prj_result_root = op.join(op.abspath(args.out_dir), prj_name)

        # create result dir first
        if not op.exists(prj_result_root):
            os.makedirs(prj_result_root, mode=0o755)
        else:
            logwarn('project result exists')

        # obtain all sample dirs in the project
        sample_list = sorted(glob.glob(op.join(prj_dir, 'Sample_*/')))
        total_sample = len(sample_list)
        logp('contains', str(total_sample), 'samples')

        for i, sample_dir in enumerate(sample_list, start=1):
            # [7:] drops the leading 'Sample_' prefix from the dir name
            logm('({:d}/{:d}) Sample: {:s}'
                 .format(
                     i, total_sample, 
                     op.split(op.dirname(sample_dir))[1][7:]),
                 'in project', prj_name[8:])

            # read SampleSheet.csv in the sample_dir
            ss_pth = op.join(sample_dir, 'SampleSheet.csv')
            if not op.exists(ss_pth):
                logerr('SampleSheet.csv not found! Skipping')
                continue

            # parsing SampleSheet.csv; one row per lane of this sample
            with open(ss_pth) as ss_csv_f:
                reader = csv.DictReader(ss_csv_f)
                for row in reader:
                    # copy args so per-row tweaks don't leak to other rows
                    temp_args = copy.deepcopy(args)
                    # typical sample name
                    # No35_ATGTCA_L003_R1_001
                    # => <sample_prefix>_R1/2_001
                    sample_prefix = '{:s}_{:s}_L{:03d}'.format(
                        row['SampleID'],
                        row['Index'],
                        int(row['Lane']))

                    # output dir 
                    temp_args.out_dir = op.join(prj_result_root,
                                                'Sample_' + row['SampleID'])
                    logp('result goes to', temp_args.out_dir)
                    if not op.exists(temp_args.out_dir):
                        os.makedirs(temp_args.out_dir, mode=0o755)
                    else:
                        logwarn('sample result exists')

                    # reference genome falls back to the sheet's SampleRef
                    if not temp_args.ref_name:
                        temp_args.ref_name = row['SampleRef']

                    sample_R1 = op.join(sample_dir,
                                        sample_prefix + '_R1_001.fastq.gz')
                    if temp_args.seq_type == 'paired':
                        sample_R2 = op.join(sample_dir,
                                            sample_prefix + '_R2_001.fastq.gz')
                    else:
                        sample_R2 = ''
                    run_sample(sample_R1, sample_R2, temp_args)
Example #9
0
 def printCount():
     """Log every registered task type together with its count."""
     logm('Print task type and count respectively')
     for task_type, count in Task.typeCount.items():
         logp('Type', task_type + ':', str(count))
Example #10
0
 def refresh():
     """Reset the no-type warnings and the per-type task counters."""
     logm('Cleaning no-type warning and task type count')
     Task.typeCount = collections.OrderedDict()
     Task.warnedNoType = []