Esempio n. 1
0
def strDiffTime(d_start, d_end, human=True):
    """Return the time elapsed from ``d_start`` to ``d_end``.

    Args:
        d_start: datetime at which the interval begins.
        d_end: datetime at which the interval ends.
        human: when true, build a compact string such as ``"1h 2m 3.50s"``;
            otherwise return the difference in seconds as a float.

    Returns:
        A string made of an hour part (only when non-zero), a minute part
        (when hours or minutes are non-zero) and seconds with two decimals,
        or ``timedelta.total_seconds()`` when ``human`` is false.  If
        ``d_end`` lies before ``d_start`` a warning is emitted and the string
        is prefixed with ``"-"``.
    """
    elapsed = d_end - d_start

    if not human:
        return elapsed.total_seconds()

    pieces = []
    if elapsed.days < 0:
        # a negative timedelta always normalises to days < 0
        logwarn('Negative value when computing time difference of')
        print('start:', strTime(d_start), 'end:', strTime(d_end))
        pieces.append("-")
        elapsed = d_start - d_end

    hours = elapsed.days * 24 + elapsed.seconds // 3600
    minutes, whole_secs = divmod(elapsed.seconds % 3600, 60)

    if hours:
        pieces.append("{:0.0f}h ".format(hours))
    if hours or minutes:
        # minutes are shown even when zero as soon as hours are shown
        pieces.append("{:0.0f}m ".format(minutes))

    pieces.append("{:0.2f}s".format(
        whole_secs + elapsed.microseconds / 1000000))
    return "".join(pieces)
    def output(self, lastTime=False):
        """Run the parent ``output`` and link the log dir under ~/public_html.

        After delegating to the parent class, a symbolic link
        ``~/public_html/log/<self.name>/<start time>`` is created that points
        at the directory containing ``self.output_filename``.  Anything
        already occupying that path and not referring to the same directory
        is renamed to ``<path>.backup`` first.

        Args:
            lastTime: forwarded unchanged to the parent ``output``.
        """
        # original output function
        super().output(lastTime)

        # link all the output to ~/public_html/log/<name>/<date>/
        # os functions do not expand '~' themselves, so do it explicitly
        home_log_dir = os.path.expanduser(
            os.path.join('~/public_html/log', self.name))
        os.makedirs(home_log_dir, exist_ok=True)
        sim_dir = os.path.join(home_log_dir, mp.strTime(self.start_time))
        logm('Create symbolic link at for log files at', sim_dir)

        abs_logdir = os.path.dirname(os.path.realpath(self.output_filename))

        # lexists() also reports a dangling link, which exists() would miss
        # and which would make os.symlink() fail below
        if os.path.lexists(sim_dir):
            if (os.path.exists(sim_dir)
                    and os.path.samefile(sim_dir, abs_logdir)):
                # the link already refers to the current log dir
                return

            backup = sim_dir + '.backup'
            if os.path.islink(sim_dir):
                # resolve the old target before the rename, afterwards the
                # original path no longer exists
                old_target = os.path.realpath(sim_dir)
                os.rename(sim_dir, backup)
                logwarn('link exists. Backup link as {:s}.backup -> {:s}'
                        .format(sim_dir, old_target))
            else:
                os.rename(sim_dir, backup)
                logwarn('file or dir exists. Backup as {:s}.backup'
                        .format(sim_dir))

        # the path is free now (never used or just backed up)
        os.symlink(abs_logdir, sim_dir, target_is_directory=True)
Esempio n. 3
0
    def __init__(self, output_filename="", max_process=None, dump=True,
                 start_now=True, update=True, name=None, html=True, sort=True):
        """Set up a parallel task and optionally start it right away.

        Args:
            output_filename: stored as ``self.out_filename``.
            max_process: number of processes.  ``None`` means 1; zero or a
                negative value is added to the CPU count (with a floor of 1).
            dump: stored as ``self.dump``; a false value only logs that a
                custom output function is used.
            start_now: call ``self.run()`` at the end of the setup.
            update: stored as ``self.update``.
            name: task name; defaults to the class name when empty.
            html: stored as ``self.html``.
            sort: stored as ``self.sort``.
        """
        # reset the per-type bookkeeping left by a previous task setup
        if Task.typeCount:
            Task.refresh()
        logm('Setup parallel task')

        # name of the whole parallel task
        if name:
            self.name = name
        else:
            logwarn('No name given, guessing by class name')
            self.name = self.__class__.__name__
        logp('Parallel task name:', self.name)

        # resolve how many processes may run
        requested = 1 if max_process is None else max_process
        if requested > 0:
            # explicit positive request is taken as is
            if requested == 1:
                logwarn('Not using parallel function, use 1 process')
            elif requested > cpu_count():
                logwarn('# of processes exceeds # of CPUs: {:d}'.format(
                    cpu_count()), 'This may decrease speed!')
            self.max_process = requested
        else:
            # zero / negative request is relative to the CPU count
            logp('Number of process depends on the number of CPU')
            remaining = cpu_count() + requested
            if remaining <= 0:
                logwarn('Number of process reaches 0! Will be set to 1')
                self.max_process = 1
            else:
                self.max_process = remaining
        logp('Use', self.max_process, 'processes')

        # containers of the basic structure
        self.output_list = []
        self.err_list = []
        self.task_pool = []
        self.process_running = []
        self.updated_len = 0
        self.out_filename = output_filename

        # state and switches of the current run
        self.runned_tasks = 0
        self.dump = dump
        if not self.dump:
            logp('Using custom output function')
        self.html = html
        self.sort = sort
        self.update = update

        if start_now:
            self.run()
Esempio n. 4
0
    def __init__(self, name, command, working_dir=None, task_type=None):
        """Create a task and count it under its task type.

        Args:
            name: stored as ``self.name``.
            command: indexable command; ``command[0]`` is used as the task
                type when ``task_type`` is not given.
            working_dir: stored as ``self.working_dir``.
            task_type: explicit task type.  When empty, the type is guessed
                from ``command[0]`` and a warning is logged the first time
                that value is seen.
        """
        self.name = name
        if not task_type:
            if command[0] not in Task.warnedNoType:
                # no task type, guess by command[0]; warn once per value
                Task.warnedNoType.append(command[0])
                logwarn('No task type specified! Guess by command[0]:',
                        command[0])
                # adjacent literals keep the message on one line; a backslash
                # continuation inside the literal would embed the source
                # indentation into the logged text
                logwarn('Further warning for same task type '
                        'will be surpressed!')
            self.task_type = command[0]
        else:
            self.task_type = task_type

        # number of tasks created so far for every task type
        Task.typeCount[self.task_type] = (
            Task.typeCount.get(self.task_type, 0) + 1)

        self.process_time = 0
        self.command = command
        self.working_dir = working_dir
Esempio n. 5
0
    def _initlog(self):
        """Resolve the csv log path of this run and create its folder.

        Directory and file name come from ``self.output_filename`` when it is
        set; otherwise the log goes to ``./log`` under a name built from
        ``self.name`` and ``self.start_time``.  A missing extension is
        replaced by ``.csv``; any other extension only triggers a warning.
        If the requested folder cannot be created, ``./log`` is used instead.
        The resolved path is logged, with a warning when a file already
        exists there.
        """
        # output all logs to ./log/<...>.csv
        # if html output, further html file also created
        out_name = self.output_filename

        # determine the log dir place
        log_dir = op.dirname(out_name) if out_name else ""
        if log_dir:
            if log_dir != './log':
                log_dir = op.abspath(op.expanduser(log_dir))
                logp('using non-default log dir:', log_dir)
        else:
            log_dir = './log'

        # create log folder
        if not op.exists(log_dir):
            try:
                os.makedirs(log_dir, mode=0o755)
            except OSError as e:
                logerr('Cannot create log folder!',
                       'using default dir ./log/ instead')
                print(e)
                # the final path is joined from log_dir, so the fallback has
                # to replace it; ./log may already be there
                log_dir = './log'
                os.makedirs(log_dir, mode=0o755, exist_ok=True)

        # determine the log filename
        log_name = op.basename(out_name) if out_name else ""
        stem, ext = op.splitext(log_name)
        if stem:
            # append file extension .csv if needed
            if not ext:
                logwarn('No extension in given filename',
                        '".csv" will be auto appended.')
                log_name += '.csv'
            elif ext != '.csv':
                # format the literal that actually holds the placeholder
                logwarn('Output file extension given "{:s}" is not ".csv",'
                        .format(ext),
                        'may create error when opening.')
        else:
            log_name = "{:s}_{:s}.csv".format(
                self.name.replace(' ', '_'),
                strTime(dt=self.start_time,
                        str_format=myparallel.time_strf))

        # join dir and name setting
        out_name = op.join(log_dir, log_name)

        # warn if log file already exists
        if op.exists(out_name):
            log_mtime = datetime.datetime.fromtimestamp(op.getmtime(out_name))
            logwarn('log file already exists! Created {:s} ago'.format(
                    strDiffTime(log_mtime, datetime.datetime.today())))
        logp('Raw output log csv goes to', out_name)
Esempio n. 6
0
def parse_args():
    """Build the command line parser, parse ``sys.argv`` and post-process.

    Returns:
        The argparse namespace.  ``extra_args`` holds the list of unknown
        arguments when ``--extra-args`` was given and ``None`` otherwise;
        ``max_process`` is always positive on return.  Unknown arguments
        without ``--extra-args`` end in ``parser.error``.
    """
    n_cpu = mp.cpu_count()

    description = '''
    This script parse the needed argument, first decompress the
    zipped fastq file, select proper reference genome index, and call
    Top Hat with proper arugments.
    
    More aruments input will be passed directly to Top Hat, be sure you know
    what you are doing.
    
    For more information please contact Liang Bo Wang or 
    Bioinformatics and Biostatistics Core Lab, NTU CGM'''

    # RawDescriptionHelpFormatter keeps the layout of the description only;
    # help texts of the single options are still re-wrapped.
    # add_help would have to be False if this parser were used as a parent.
    parser = ap.ArgumentParser(prog='tophat.py',
                               add_help=True,
                               description=textwrap.dedent(description),
                               formatter_class=ap.RawDescriptionHelpFormatter)
    add_main = parser.add_argument          # short alias

    # ---------------- input ----------------
    # exactly one of Project (-P) or Sample (-S) has to be chosen
    input_group = parser.add_mutually_exclusive_group(required=True)
    add_input = input_group.add_argument    # short alias

    # Both options combine nargs='+' with action='append', so every
    # occurrence contributes one inner list:
    #   -S A_R0.fq A_R1.fq -S B_R0.fq B_R1.fq
    #   => [['A_R0.fq', 'A_R1.fq'], ['B_R0.fq', 'B_R1.fq']]
    add_input('-P', '--Project',
              dest='Project_list', action='append', nargs='+',
              metavar='DIR',
              help='''Path for a project directory. It should follow the
              structure of original Illumina direct output by 
              demultiplexing.''')
    add_input('-S', '--Sample',
              dest='Sample_list', action='append', nargs='+',
              metavar=('R1.fastq[.gz]', '{R2.fastq[.gz]}'),
              help='''PATH to a pair of samples in paired mode or multiple 
              samples in single mode. Ex -S A1 A2 -S B1 B2 (-t paried) 
              or -S A B C -S D E -t single. Both FASTQ and .fastq.gz files are
              accepted.''')

    # ---------------- output ----------------
    add_main('-o', '--outdir',
             dest='out_dir',
             metavar='OUT_DIR',
             help='''In Sample mode, if only a (pair of) sample is given, then
             it will be its result dir directly, otherwise it will be the root
             path for all samples given. Ex -S A1 A2 -o DIR -t paired =>
             outputs to DIR/, -S A1 A2 -S B1 B2 -o DIR -t paired => DIR/A/ and
             DIR/B/ for results of A and B respectively. In Project mode, by
             default it assumes multiple results, so OUT_DIR will be the root
             path of all the project results. Ex OUT_DIR/Sample_<1>,
             OUT_DIR/Sample_<2>, ...''')

    # ---------------- parameters for Top Hat ----------------
    # shape of the Top Hat call these options feed into:
    #   tophat -p 15 -G <genes.gtf> -o <out_dir>
    #          --library-type=fr-unstranded --no-novel-juncs
    #          <.../Bowtie2Index/genome> <R1.fastq> <R2.fastq>
    add_main('-r', '--readlength',
             dest='read_length',
             metavar='LEN', type=int,
             help='''mate_inner_length given to Tophat.''')

    add_main('-t', '--seqtype',
             dest='seq_type',
             choices=['paired', 'single'],
             default='paired',
             help='''Sequencing type. Default is paired-end sequence.''')

    # quantification without a reference annotation
    add_main('--no-annotation',
             dest='annotation',
             action='store_false',
             help='''If specified, Top Hat will do alternative splicing 
             without knowledge of existed isoform of all genes.''')

    # ---------------- reference ----------------
    # Intended lookup priority: explicit paths (-B / -G) first, then the
    # reference name (-R), finally the entry found in SampleSheet.csv.
    add_main('-R', '--refname',
             dest='ref_name',
             metavar='NAME',
             help='''Name of the species or reference database. Ex. both human
             and hg19 goes to hg19; similarly, both mouse and mm10 goes to
             mm10.''')

    add_main('-B', '--bowtie-index',
             dest='bowtie_index',
             metavar='BOWTIE_INDEX_PATH',
             help='''The path to FW index of whole genome sequence for Bowtie2.
             It should be ended with .../Bowtie2Index/genome''')

    add_main('-G', '--gtf',
             dest='gtf_path',
             metavar='GTF_PATH',
             help='''The path to the gene annotation GTF file with known
             transcripts, e.g., genes.gtf in most cases. ''')

    # ---------------- miscellaneous ----------------
    # multiprocessing
    add_main('-p', '--multiprocess',
             dest='max_process',
             metavar='N', type=int,
             default=1,
             help='''The maximum of parallel running processes. This
             number should be equal to or less then the number of CPU
             cores. If a negative number or zero is set, number of
             maxprocess depends on the number of CPU cores. For example,
             -1 uses CPU_NUM({:d}) - 1 = {:d} processes on this machine.
             Program use 1 process if not specified.'''.format(
             n_cpu, n_cpu - 1))

    # resume Tophat
    add_main('--resume',
             dest='resume',
             action='store_true',
             help='''If specified, Tophat will try to resume the progress by
             looking for <out_dir>/logs/tophat.log''')

    # remove unzipped fastq files
    add_main('--remove-fastq',
             dest='rm_unzip_fq',
             action='store_true',
             help='''If specified, all unzipped fastq files will be removed.
             However, those fastq files existed before run will be intact.''')

    add_main('--extra-args',
             action='store_true',
             dest='extra_args',
             help='''Input additional arguments to Tophat directly 
             WITHOUT ANY CHECKS. If specified, all unkown args 
             will be collected''')

    # ---------------- validation / first processing ----------------
    args, leftovers = parser.parse_known_args()

    if args.extra_args:
        # the flag is replaced by the collected pass-through arguments
        logp('getting extra args passed to Tophat:', ' '.join(leftovers))
        args.extra_args = leftovers
    elif leftovers:
        logerr('Get unkown args.',
               'If they are passed to Tophat, please specify',
               '--extra-args')
        parser.error('unrecognized arguments:' + ' '.join(leftovers))
    else:
        args.extra_args = None

    # zero / negative process counts are relative to the CPU count
    if args.max_process > n_cpu:
        logwarn('Set # of processes({:d})'.format(args.max_process),
                '> # of CPUs({:d})'.format(n_cpu),
                'the efficiency will be low.')
    elif args.max_process <= 0:
        args.max_process += n_cpu
        if args.max_process <= 0:
            logwarn('Negative # of processes({:d}) has been set, reset to 1'
                    .format(args.max_process))
            args.max_process = 1

    return args
Esempio n. 7
0
def fastq_list_Sample(Sample_pth_list, args):
    """Main program of Sample mode: validate inputs and run every sample.

    Args:
        Sample_pth_list: nested list of sample paths, one inner list per
            ``-S`` group (a pair of files in paired mode).
        args: parsed options.  ``out_dir`` is replaced by its absolute path
            when given; ``seq_type`` selects paired or single handling.

    Every sample (pair) gets its own deep copy of ``args`` before its
    ``out_dir`` / ``resume`` are adjusted, so the settings of one sample
    never leak into the next one.  Invalid or missing inputs are logged and
    skipped.
    """
    if args.out_dir:
        args.out_dir = op.abspath(args.out_dir)
        if len(Sample_pth_list) > 1:
            # If multiple sample is input => Sample mode
            # use abspath for out_dir as root path of all results dir
            logp('multiple groups of sample get.')
            logp('root path to results is set manually:', args.out_dir)
    else:
        logp('no out_dir given,',
             'results will got to the dir of every group of sample')

    # This function mainly prepares the running environment and rejects
    # invalid parameters early, then hands over to the run_<...>() functions.
    if args.seq_type == 'paired':
        # paired mode, run by pairs
        # example: -S A_R0.gz A_R1.gz -S B_R0.gz B_R1.gz -S ...
        # => [
        #     ['A_R0.gz', 'A_R1.gz'],
        #     ['B_R0.gz', 'B_R1.gz'],
        #     [...], ...
        #    ]

        for i, paired_sample_list in enumerate(Sample_pth_list, start=1):
            # validation, samples should be paired
            if len(paired_sample_list) != 2:
                logerr('Input samples :', ' ,'.join(paired_sample_list),
                       'is not paired!', 'Skipping ...')
                continue

            sample_R1, sample_R2 = paired_sample_list

            # samples should exist
            if not op.exists(sample_R1):
                logerr('Input', sample_R1, 'does not exist! Skipping')
                continue
            if not op.exists(sample_R2):
                logerr('Input', sample_R2, 'does not exist! Skipping')
                continue

            # copy args so if we change the args.out_dir or args.resume,
            # other samples will not be affected
            temp_args = copy.deepcopy(args)

            # if multiple samples, show the working progress
            if len(Sample_pth_list) > 1:
                logm('({:d}/{:d}) Processing paired:'
                     .format(i, len(Sample_pth_list)),
                     sample_R1, sample_R2)
            else:
                logm('Processing paired:', sample_R1, sample_R2)

            # use absolute path
            sample_R1, sample_R2 = op.abspath(sample_R1), op.abspath(sample_R2)

            if not args.out_dir:
                # default output path <path_of_sample_R1>/Tophat
                temp_args.out_dir = op.join(op.dirname(sample_R1), 'Tophat')
            elif len(Sample_pth_list) > 1:
                # make sample sub_dir
                sample_name = ('Sample_'
                               + op.basename(sample_R1).split('.')[0])
                logp('subdir for sample name guessed from filename:',
                     sample_name)
                # args.out_dir is root path shared by all samples
                temp_args.out_dir = op.join(args.out_dir, sample_name)

            # if not previous work, turn off the --resume option
            cond_log_exist = op.exists(
                op.join(temp_args.out_dir, 'logs/tophat.log'))
            if temp_args.resume and not cond_log_exist:
                logp('previous log file tophat.log not found,',
                     'resume function is temporarily off')
                temp_args.resume = False

            # create the out_dir and check if out_dir exists
            if op.isdir(temp_args.out_dir):
                if not temp_args.resume:
                    logwarn('results dir exists', temp_args.out_dir)
            else:
                os.makedirs(temp_args.out_dir, mode=0o755)

            run_sample(sample_R1, sample_R2, temp_args)

    else:
        # single mode, run one by one
        # flatting all sequence into a list
        # example: -S A.fq -S B.fq C.fq D.fa -S ...
        # fastq list now becomes ['A.fq', 'B.fq', 'C.fq', ...]
        flatten_sample_list = list(
            itertools.chain.from_iterable(Sample_pth_list))

        for sample in flatten_sample_list:
            # sample should exist
            if not op.exists(sample):
                logerr('Input', sample, 'does not exist! Skipping')
                continue
            logm('Processing', sample)
            sample = op.abspath(sample)

            # work on a copy, as in paired mode: a default out_dir derived
            # from this sample must not be inherited by the following ones
            temp_args = copy.deepcopy(args)

            # if run as sample mode, set out_dir as the dir of the samples
            if not temp_args.out_dir:
                temp_args.out_dir = op.join(op.dirname(sample), 'Tophat')
            if op.isdir(temp_args.out_dir):
                logwarn('results dir exists', temp_args.out_dir)
            else:
                os.makedirs(temp_args.out_dir, mode=0o755)
            # same function as paired-end mode, but leaving sample_R2 empty
            run_sample(sample, '', temp_args)
Esempio n. 8
0
def fastq_list_Project(Project_pth_list, args):
    """Main program of Project mode: run every sample of every project.

    Args:
        Project_pth_list: nested list of project directories, one inner list
            per ``-P`` occurrence, e.g. ``[[A, B], [C]]``.
        args: parsed options; ``out_dir``, ``ref_name`` and ``seq_type`` are
            read.  Each SampleSheet row works on its own deep copy.

    For each ``Sample_*`` directory of a project the ``SampleSheet.csv`` is
    read and ``run_sample`` is called once per row.  Projects that are not
    directories and samples without a sheet are logged and skipped.
    """
    # [[A, B], [C], [D]] => [A, B, C, D]
    all_projects = list(itertools.chain.from_iterable(Project_pth_list))
    logp('retreiving', len(all_projects), 'projects')

    for project_path in all_projects:
        if not (op.exists(project_path) and op.isdir(project_path)):
            # missing, or present but not a directory
            logerr('Cannot find the directory of project:', project_path,
                   'Skipping...')
            continue

        # Use absolute paths: some programs cannot handle relative paths,
        # and absolute ones are easier to debug.
        prj_dir = op.abspath(project_path)

        # typical layout after Illumina demultiplexing
        #   .../<date_index_FCID>/Unaligned/Project_Test/Sample_HAHA
        # desired result layout
        #   .../<date_index_FCID>/Aligned/Project_Test/TopHat/Sample_HAHA
        unaligned_dir, prj_name = op.split(prj_dir)
        FCID_dir = op.dirname(unaligned_dir)
        # [8:] drops a leading 'Project_' (assumes that naming scheme)
        logm('Working project:', prj_name[8:])

        # where the results of this project go
        if args.out_dir:
            prj_result_root = op.join(op.abspath(args.out_dir), prj_name)
        else:
            prj_result_root = op.join(FCID_dir, 'Aligned', prj_name, 'TopHat')

        if op.exists(prj_result_root):
            logwarn('project result exists')
        else:
            os.makedirs(prj_result_root, mode=0o755)

        # all sample directories of the project
        sample_dirs = sorted(glob.glob(op.join(prj_dir, 'Sample_*/')))
        total_sample = len(sample_dirs)
        logp('contains', str(total_sample), 'samples')

        for idx, sample_dir in enumerate(sample_dirs, start=1):
            # the glob pattern ends with '/', so dirname() strips it;
            # [7:] drops the leading 'Sample_'
            sample_label = op.basename(op.dirname(sample_dir))[7:]
            logm('({:d}/{:d}) Sample: {:s}'
                 .format(idx, total_sample, sample_label),
                 'in project', prj_name[8:])

            ss_pth = op.join(sample_dir, 'SampleSheet.csv')
            if not op.exists(ss_pth):
                logerr('SampleSheet.csv not found! Skipping')
                continue

            with open(ss_pth) as ss_csv_f:
                for row in csv.DictReader(ss_csv_f):
                    temp_args = copy.deepcopy(args)

                    # file names look like No35_ATGTCA_L003_R1_001,
                    # i.e. <sample_prefix>_R1/2_001
                    sample_prefix = '{:s}_{:s}_L{:03d}'.format(
                        row['SampleID'], row['Index'], int(row['Lane']))

                    # result dir of this sample
                    temp_args.out_dir = op.join(
                        prj_result_root, 'Sample_' + row['SampleID'])
                    logp('result goes to', temp_args.out_dir)
                    if op.exists(temp_args.out_dir):
                        logwarn('sample result exists')
                    else:
                        os.makedirs(temp_args.out_dir, mode=0o755)

                    # the sheet supplies the reference unless one was given
                    if not temp_args.ref_name:
                        temp_args.ref_name = row['SampleRef']

                    sample_R1 = op.join(
                        sample_dir, sample_prefix + '_R1_001.fastq.gz')
                    sample_R2 = (
                        op.join(sample_dir,
                                sample_prefix + '_R2_001.fastq.gz')
                        if temp_args.seq_type == 'paired' else '')
                    run_sample(sample_R1, sample_R2, temp_args)
Esempio n. 9
0
# Switches controlling how the task results are logged.
# --sort is off unless given; the other three turn a default-on
# behaviour off (store_false).
parser.add_argument(
    '--sort',
    action='store_true',
    help="""whether sort the output in html log file. default
                    is False.""",
)
parser.add_argument(
    '--parse',
    dest='dump',
    action='store_false',
    help="""Use self-defined logging function, which means
                    output will be parse using overrided function parse2csv().
                    If the function is not overrrided, it still calls default
                    dump function, which logs raw stdout, stderr outputs.
                    NOTICE! When using default output, multiline output may
                    cause problems during Excel import or post parsing.""",
)
parser.add_argument(
    '--no_html',
    dest='html',
    action='store_false',
    help='No log in html format.',
)
parser.add_argument(
    '--no_update',
    dest='update',
    action='store_false',
    help="""Write output to log file only when all the tasks
                    have finished. WARNING! All information will be lost if the
                    program ended unexpectedly.""",
)

# ---------------- for testing only ----------------

if __name__ == "__main__":
    # this module is not meant to be executed on its own
    logwarn('One should not run this program directly!')
    print('Try to call test.py')