def _record(self, task):
        """Store the outcome of a task that has just finished.

        The elapsed time since ``task.start_time`` is logged, the output of
        ``task.process`` is collected and decoded as UTF-8, and a summary
        dict is appended to ``self.output_list``.  A task that wrote
        anything to stderr, or whose output cannot be decoded, is also
        noted in ``self.err_list``.
        """
        elapsed = strDiffTime(task.start_time, datetime.datetime.today())
        logp('task', 'of', task.task_type + ':',
             task.name, 'ended using', elapsed)

        streams = task.process.communicate()
        try:
            normsg, errmsg = (streams[0].decode('utf8'),
                              streams[1].decode('utf8'))
            if errmsg:
                # anything on stderr counts as a failed task
                failed_label = '{:s}[{:s}]'.format(task.name, task.task_type)
                self.err_list.append(failed_label)
                logerr('task', task.name, 'ended with error!!')
                print(errmsg)

        except UnicodeDecodeError as decode_err:
            # binary output: keep a placeholder instead of the raw bytes
            logerr('Decoding output of', task.name, 'failed!')
            print(decode_err)
            print('The output is not a string text, skipping!!')
            normsg = errmsg = 'DECODE ERROR'
            self.err_list.append(task.name)

        # one row per finished task
        summary = {
            'name': task.name,
            'type': task.task_type,
            'normal msg': normsg,
            'error msg': errmsg,
            'process time': elapsed,
        }
        self.output_list.append(summary)
Beispiel #2
0
def cmd_gunzip(gz_pth, fastq_pth):
    '''Decompress ``gz_pth`` into ``fastq_pth`` with the external ``gunzip``.

    The command is run without a shell, so paths containing spaces or shell
    metacharacters are handled safely.  Failures are logged, not raised.
    On failure any partially written ``fastq_pth`` is removed, so callers
    can rely on the existence of ``fastq_pth`` as the success indicator.

    Returns None in every case.
    '''
    import os   # local import: only needed for the cleanup below

    try:
        # redirect stdout ourselves instead of using a shell '>' redirect
        with open(fastq_pth, 'wb') as fastq_f:
            sp.check_call(['gunzip', '-c', gz_pth], stdout=fastq_f)
    except sp.CalledProcessError as e:
        # error when unzipping
        logerr('Unzip file:', op.basename(gz_pth),
               'failed! with exit({:d})'.format(e.returncode))
    except OSError as e:
        # gunzip is not installed or the output file cannot be created
        logerr('Unzip file:', op.basename(gz_pth), 'failed!', str(e))
    else:
        return

    # do not leave an empty / truncated fastq behind after a failure
    try:
        os.remove(fastq_pth)
    except OSError:
        # nothing was created (or it is already gone): nothing to clean up
        pass
    def _initlog(self):
        """Resolve the raw CSV log location and make sure its folder exists.

        The target is derived from ``self.output_filename``:

        * directory: the directory part of the given name, or ``./log`` when
          none is given; if it cannot be created, ``./log`` is used instead;
        * file name: the given base name (``.csv`` appended when it has no
          extension), or ``<name>_<start time>.csv`` when none is given.

        NOTE(review): the resolved path is only logged here; this method
        does not write it back to ``self.output_filename``.
        """
        # output all logs to ./log/<...>.csv
        # if html output, further html file also created
        out_name = self.output_filename

        # determine the log dir place
        log_dir = op.dirname(out_name) if out_name else ""
        if log_dir:
            if log_dir != './log':
                log_dir = op.abspath(op.expanduser(log_dir))
                logp('using non-default log dir:', log_dir)
        else:
            log_dir = './log'

        # create log folder
        if not op.exists(log_dir):
            try:
                os.makedirs(log_dir, mode=0o755)
            except OSError as e:
                logerr('Cannot create log folder!',
                       'using default dir ./log/ instead')
                print(e)
                # really switch to the default dir, creating it only if needed
                log_dir = './log'
                if not op.isdir(log_dir):
                    os.makedirs(log_dir, mode=0o755)

        # determine the log filename
        log_name = op.basename(out_name) if out_name else ""
        stem, ext = op.splitext(log_name)
        if stem:
            # append file extension .csv if needed
            if not ext:
                logwarn('No extension in given filename',
                        '".csv" will be auto appended.')
                log_name += '.csv'
            elif ext != '.csv':
                # format the piece that actually holds the placeholder
                logwarn('Output file extension given "{:s}" is not ".csv",'
                        .format(ext),
                        'may create error when opening.')
        else:
            log_name = "{:s}_{:s}.csv".format(
                self.name.replace(' ', '_'),
                strTime(dt=self.start_time,
                        str_format=myparallel.time_strf))

        # join dir and name setting
        out_name = op.join(log_dir, log_name)

        # warn if log file already exists
        if op.exists(out_name):
            log_mtime = datetime.datetime.fromtimestamp(op.getmtime(out_name))
            logwarn('log file already exists! Created {:s} ago'.format(
                    strDiffTime(log_mtime, datetime.datetime.today())))
        logp('Raw output log csv goes to', out_name)
    def dump2csv(self, lastTime=False):
        """Write every record in ``self.output_list`` to the CSV log.

        The file ``self.output_filename`` is rewritten from scratch on each
        call, with ``myparallel.dump_headcol`` as the header columns.

        If the file cannot be opened, the error is reported and:

        * when ``lastTime`` is true, ``myparallel.wopenfile`` is asked for a
          replacement file object (its behaviour is defined elsewhere);
        * otherwise this dump is skipped.
        """
        try:
            # newline='' lets the csv module control line endings itself
            csv_f = open(self.output_filename, 'w', newline='')
        except IOError as e:
            logerr('Cannot open file', self.output_filename)
            print(e)
            if lastTime:
                csv_f = myparallel.wopenfile(self.output_filename)
            else:
                print('Try to closed the log file. Skip logging this time')
                return

        try:
            csv_writer = csv.DictWriter(csv_f, myparallel.dump_headcol)
            csv_writer.writeheader()
            csv_writer.writerows(self.output_list)
        finally:
            # always release the handle, even when a row cannot be written
            csv_f.close()
Beispiel #5
0
 def unzip(gz_pth):
     """Return the path of the decompressed fastq that belongs to ``gz_pth``.

     The target is ``gz_pth`` with its last extension stripped.  An already
     existing target is reused as is; otherwise ``cmd_gunzip`` is called and
     timed.  Returns None when the target is still missing afterwards.
     """
     gz_name = op.basename(gz_pth)
     fastq_path = op.join(op.dirname(gz_pth), op.splitext(gz_name)[0])

     # fastq already there: nothing to do
     if op.exists(fastq_path):
         return fastq_path

     started = dt.datetime.today()
     logp('unzip', gz_name, 'starts at', strTime(started))
     cmd_gunzip(gz_pth, fastq_path)
     finished = dt.datetime.today()
     logp('ends at', strTime(finished),
          'using', strDiffTime(started, finished))

     if op.exists(fastq_path):
         return fastq_path

     logerr('Output fastq not found!', fastq_path)
     return
    def write2html(self, lastTime=False):
        """Render the HTML log next to the CSV log.

        The output name is ``self.output_filename`` with its extension
        replaced by ``.html``.  The template ``sample_output.html`` (looked
        up relative to the current working directory) is copied line by
        line.  Every line containing the ``{% outputblock %}`` marker is
        replaced by ``self.makeDescription(lastTime)`` followed by
        ``self.makeHTMLTable(lastTime)``.

        If the output file cannot be opened, the error is reported and:

        * when ``lastTime`` is true, ``myparallel.wopenfile`` is asked for a
          replacement file object (its behaviour is defined elsewhere);
        * otherwise this update is skipped.
        """
        logp('update html log file')
        html_name = op.splitext(self.output_filename)[0] + '.html'
        try:
            html_f = open(html_name, 'w')
        except IOError as e:
            logerr('Cannot open file', html_name)
            print(e)
            if lastTime:
                html_f = myparallel.wopenfile(html_name)
            else:
                print('Try to closed the log html file. Skip this time')
                return

        output_reg = re.compile(r'\{ ?% ?outputblock ?% ?\}')
        try:
            with open('sample_output.html') as template:
                for l in template:
                    if output_reg.search(l):
                        html_f.write(self.makeDescription(lastTime))
                        # place to insert result table
                        html_f.write(self.makeHTMLTable(lastTime))
                    else:
                        html_f.write(l)
        finally:
            # close (and thereby flush) the report even if rendering fails
            html_f.close()
Beispiel #7
0
def ref_path(sample_path, ref_name):
    """Resolve a reference name to its root directory in ``refpath_dict``.

    Args:
        sample_path: path of a sample file.  Only used when ``ref_name`` is
            None: ``SampleSheet.csv`` is then read from the same directory
            and the row whose ``SampleID`` equals the file name up to the
            first ``_`` supplies the reference name (``SampleRef``).
        ref_name: species or database name, matched case-insensitively
            (e.g. ``human`` or ``hg19``), or None to consult the sheet.

    Returns:
        The matching value of ``refpath_dict``, or None when the reference
        is unknown or cannot be determined (the reason is logged).
    """
    global refpath_dict  # black magic here will further move to new script

    if ref_name is not None:
        ref_name = ref_name.lower()
        # given a short name of the reference index,
        # return the actual directory of the reference
        # Ex. hg19 -> /data/iGenome/Homo_sapiens/UCSC/hg19
        if ref_name in ('unknown', 'unkown'):
            # explicitly unknown (the historical misspelling is kept)
            return None     # skipping this sample

        # species aliases -> key in refpath_dict
        aliases = {
            'human': 'hg19',
            'homo sapiens': 'hg19',
            'homo_sapiens': 'hg19',
            'mouse': 'mm10',
            'chicken': 'galGal4',
        }
        if ref_name in aliases:
            return refpath_dict[aliases[ref_name]]

        # ref_name was lower-cased above, so compare against lower-cased
        # keys; otherwise mixed-case keys such as galGal4 never match
        lowered_keys = {key.lower(): key for key in refpath_dict}
        if ref_name in lowered_keys:
            return refpath_dict[lowered_keys[ref_name]]

        # unrecognized reference name, reporting and skipping
        logerr('Cannot find the reference in current database:', ref_name)
        return None

    # no ref_name is given, searching the SampleSheet.csv
    logp('guess reference from SampleSheet.csv')
    ss_csv_path = op.join(op.dirname(sample_path), 'SampleSheet.csv')
    if not op.exists(ss_csv_path):
        logerr('SampleSheet.csv not found at', ss_csv_path)
        return None

    sample_name = op.basename(sample_path).split('_')[0]
    with open(ss_csv_path) as ss_csv_f:
        reader = csv.DictReader(ss_csv_f)
        for row in reader:
            if row['SampleID'] == sample_name:
                sheet_ref = row.get('SampleRef')
                if sheet_ref is None:
                    # recursing with None would read this same row forever
                    logerr('Sample:', sample_name,
                           'has no SampleRef in SampleSheet.csv')
                    return None
                return ref_path(sample_path, sheet_ref)
        logerr('Sample:', sample_name,
               'cannot be found in SampleSheet.csv')
        return None
Beispiel #8
0
def parse_args():
    '''Parse the command line and return the validated argument namespace.

    Builds the ``tophat.py`` argument parser, parses ``sys.argv`` with
    ``parse_known_args`` and post-processes the result:

    * ``extra_args``: when ``--extra-args`` is given, it is replaced by the
      list of arguments the parser did not recognise; otherwise it becomes
      None, and any unrecognised argument aborts through ``parser.error``.
    * ``max_process``: a value <= 0 is taken relative to the CPU count
      (``cpu_count + value``) and reset to 1 if the result is still <= 0;
      a value above the CPU count only triggers a warning.

    Returns:
        The ``argparse.Namespace`` with the attributes named by the
        ``dest=`` of each option below.
    '''

    desc = '''
    This script parse the needed argument, first decompress the
    zipped fastq file, select proper reference genome index, and call
    Top Hat with proper arugments.
    
    More aruments input will be passed directly to Top Hat, be sure you know
    what you are doing.
    
    For more information please contact Liang Bo Wang or 
    Bioinformatics and Biostatistics Core Lab, NTU CGM'''

    # if the parser will be inherited or used by other ArgumentParser,
    # then add_help should be set False.
    # RawTextHelpFormatter both description and help text use raw string
    # RawDescriptionHelpFormatter only description uses raw string
    parser = ap.ArgumentParser(prog='tophat.py',
                               formatter_class=ap.RawDescriptionHelpFormatter,
                               description=textwrap.dedent(desc),
                               add_help=True)
    p_addarg = parser.add_argument          # make the function name shorter

    # --- input ---
    # one must choose either Project(-P) or Sample(-S)
    in_type_grp = parser.add_mutually_exclusive_group(required=True)
    in_addarg = in_type_grp.add_argument    # make the function name shorter
    # Project mode
    in_addarg('-P', '--Project',
              metavar='DIR', action='append',
              dest='Project_list', nargs='+',
              help='''Path for a project directory. It should follow the
              structure of original Illumina direct output by 
              demultiplexing.''')
    # Sample mode
    in_addarg('-S', '--Sample',
              metavar=('R1.fastq[.gz]', '{R2.fastq[.gz]}'), action='append',
              dest='Sample_list', nargs='+',
              help='''PATH to a pair of samples in paired mode or multiple 
              samples in single mode. Ex -S A1 A2 -S B1 B2 (-t paried) 
              or -S A B C -S D E -t single. Both FASTQ and .fastq.gz files are
              accepted.''')
    # nargs='+'         implies that this option accepts multiple arguments
    # dest='<var_name>' then one can access the arguments using args.var_name
    # action='append'   use the following examples
    # Ex1.  -S A_R0.fq A_R1.fq
    # => [['A_R0.fq', 'A_R1.fq']]
    # Ex2.  -S A_R0.fq A_R1.fq -S B_R0.fq B_R1.fq -S ...
    # => [
    #     ['A_R0.fq', 'A_R1.fq'],
    #     ['B_R0.fq', 'B_R1.fq'],
    #     [...], ...
    #    ]
    # so if input type is Sample, it will be a nested list

    # --- output ---
    p_addarg('-o', '--outdir',
             metavar='OUT_DIR',
             dest='out_dir',
             help='''In Sample mode, if only a (pair of) sample is given, then
             it will be its result dir directly, otherwise it will be the root
             path for all samples given. Ex -S A1 A2 -o DIR -t paired =>
             outputs to DIR/, -S A1 A2 -S B1 B2 -o DIR -t paired => DIR/A/ and
             DIR/B/ for results of A and B respectively. In Project mode, by
             default it assumes multiple results, so OUT_DIR will be the root
             path of all the project results. Ex OUT_DIR/Sample_<1>,
             OUT_DIR/Sample_<2>, ...''')

    # --- parameters for Top Hat ---
    # example of the kind of command line this script assembles:
    # tophat -p 15
    # -G /data/iGenome/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf
    # -o TopHat_with_GTF/Sample_A-W
    # --library-type=fr-unstranded
    # --no-novel-juncs
    # /data/iGenome/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome
    # ../../Unaligned_m0/Project_Lin/Sample_A-W/A-W_GTGAAA_L002_R1_001.fastq
    # ../../Unaligned_m0/Project_Lin/Sample_A-W/A-W_GTGAAA_L002_R2_001.fastq

    p_addarg('-r', '--readlength',
             # required=True,
             metavar='LEN', type=int,
             dest='read_length',
             help='''mate_inner_length given to Tophat.''')

    p_addarg('-t', '--seqtype',
             default='paired',
             choices=['paired', 'single'],
             dest='seq_type',
             help='''Sequencing type. Default is paired-end sequence.''')

    # quantification without a reference annotation
    # (store_false: args.annotation is True unless --no-annotation is given)
    p_addarg('--no-annotation',
             action='store_false',
             dest='annotation',
             help='''If specified, Top Hat will do alternative splicing 
             without knowledge of existed isoform of all genes.''')

    # --- reference arguments ---
    # users can specify species like human, mouse, or chicken through -R
    # or they specify the path to required bowtie index and gene annotation
    # If bowtie_path(--bowtie-index-path) or gene_path(--gene-path) is given,
    # program use the path directly.
    # Then it looks for the path given by ref_name(-R).
    # Otherwise it looks for the information inside SampleSheet.csv
    # Thus the priority is <*path> -> <ref_name> -> SampleSheet.csv
    p_addarg('-R', '--refname',
             metavar='NAME',
             dest='ref_name',
             help='''Name of the species or reference database. Ex. both human
             and hg19 goes to hg19; similarly, both mouse and mm10 goes to
             mm10.''')

    p_addarg('-B', '--bowtie-index',
             #required=True,
             metavar='BOWTIE_INDEX_PATH',
             dest='bowtie_index',
             help='''The path to FW index of whole genome sequence for Bowtie2.
             It should be ended with .../Bowtie2Index/genome''')

    p_addarg('-G', '--gtf',
             metavar='GTF_PATH',
             dest='gtf_path',
             help='''The path to the gene annotation GTF file with known
             transcripts, e.g., genes.gtf in most cases. ''')

    # --- miscellaneous arguments ---
    # multiprocessing
    p_addarg('-p', '--multiprocess',
             metavar='N', type=int,
             default=1,
             dest='max_process',
             help='''The maximum of parallel running processes. This
             number should be equal to or less then the number of CPU
             cores. If a negative number or zero is set, number of
             maxprocess depends on the number of CPU cores. For example,
             -1 uses CPU_NUM({:d}) - 1 = {:d} processes on this machine.
             Program use 1 process if not specified.'''.format(
             mp.cpu_count(), mp.cpu_count() - 1))

    # resume Tophat
    p_addarg('--resume',
             action='store_true',
             dest='resume',
             help='''If specified, Tophat will try to resume the progress by
             looking for <out_dir>/logs/tophat.log''')

    # remove unzipped fastq files
    p_addarg('--remove-fastq', 
             action='store_true',
             dest='rm_unzip_fq',
             help='''If specified, all unzipped fastq files will be removed.
             However, those fastq files existed before run will be intact.''')

    p_addarg('--extra-args',
             dest='extra_args',
             action='store_true',
             help='''Input additional arguments to Tophat directly 
             WITHOUT ANY CHECKS. If specified, all unkown args 
             will be collected''')

    # --- validation and first processing commands ---
    # parse_known_args keeps unrecognised arguments instead of aborting, so
    # that they can be forwarded to Tophat when --extra-args is given
    #args = parser.parse_args()
    args, unknown_args = parser.parse_known_args()

    if args.extra_args:
        # the boolean flag is replaced by the list of forwarded arguments
        logp('getting extra args passed to Tophat:', ' '.join(unknown_args))
        args.extra_args = unknown_args
    else:
        if unknown_args:
            logerr('Get unkown args.',
                   'If they are passed to Tophat, please specify',
                   '--extra-args')
            # parser.error prints the usage and exits the program
            parser.error('unrecognized arguments:' + ' '.join(unknown_args)) 
        else:
            args.extra_args = None

    # computing max_process
    if args.max_process > mp.cpu_count():
        logwarn('Set # of processes({:d})'.format(args.max_process), 
                '> # of CPUs({:d})'.format(mp.cpu_count()),
                'the efficiency will be low.')
    elif args.max_process <= 0:
        # zero / negative values are relative to the number of CPU cores
        args.max_process = mp.cpu_count() + args.max_process
        if args.max_process <= 0:
            logwarn('Negative # of processes({:d}) has been set, reset to 1'
                    .format(args.max_process))
            args.max_process = 1

    return args
Beispiel #9
0
def cmd_tophat(gz_pth_1, gz_pth_2, bowtie_ref, gene_ref,
               out_dir, read_length, max_process,
               resume, rm_unzip_fq, extra_args):
    """Run TopHat on one single-end input or one pair of inputs.

    Args:
        gz_pth_1: first input; used as is when it ends with ``.fastq``,
            otherwise it is decompressed next to the original file first.
        gz_pth_2: second input of a pair, or a false value for single-end.
        bowtie_ref: Bowtie index prefix handed to TopHat.
        gene_ref: GTF annotation passed as ``-G``; skipped when false.
        out_dir: TopHat output directory (``-o``), also used as the working
            directory of the TopHat process.
        read_length: value passed as ``-r``; skipped when false.
        max_process: value passed as ``-p``.
        resume: when true, ``--resume <out_dir>`` is added.
        rm_unzip_fq: when true, fastq files decompressed by this call are
            deleted afterwards; files that existed before are never removed.
        extra_args: list of extra arguments appended verbatim, or None.

    Returns None.  Failures of the decompression or of TopHat are logged.
    """
    def prepare_fastq(pth):
        """Return ``(fastq_path, created_here)`` for one input path.

        ``fastq_path`` is None when decompression did not produce a file.
        """
        if op.splitext(pth)[1] == '.fastq':
            return pth, False

        gz_name = op.basename(pth)
        fastq_path = op.join(op.dirname(pth), op.splitext(gz_name)[0])
        # check if <gz_pth>.fastq exists
        if op.exists(fastq_path):
            # fastq exists, skipping; it predates this run so it must
            # survive --remove-fastq
            return fastq_path, False

        unzip_s = dt.datetime.today()
        logp('unzip', gz_name, 'starts at', strTime(unzip_s))
        cmd_gunzip(pth, fastq_path)
        unzip_e = dt.datetime.today()
        logp('ends at', strTime(unzip_e),
             'using', strDiffTime(unzip_s, unzip_e))
        if not op.exists(fastq_path):
            logerr('Output fastq not found!', fastq_path)
            return None, False
        return fastq_path, True

    def remove_unzipped(fastq_paths):
        """Delete the fastq files this call created, if requested."""
        if not rm_unzip_fq:
            return
        for fastq_path in fastq_paths:
            logp('removing unzipped fastq:', fastq_path)
            os.remove(fastq_path)

    # unzipping
    fq_pth_1, unzip_fastq_1 = prepare_fastq(gz_pth_1)
    if gz_pth_2:
        fq_pth_2, unzip_fastq_2 = prepare_fastq(gz_pth_2)
    else:
        fq_pth_2, unzip_fastq_2 = None, False

    created_here = [
        pth for pth, created in ((fq_pth_1, unzip_fastq_1),
                                 (fq_pth_2, unzip_fastq_2))
        if created
    ]

    if fq_pth_1 is None or (gz_pth_2 and fq_pth_2 is None):
        # without every input TopHat cannot run; do not build a broken command
        logerr('Cannot prepare the input fastq, skipping TopHat for',
               str(gz_pth_1))
        remove_unzipped(created_here)
        return

    # tophat command as an argument list (no shell involved)
    cmd = ['tophat',
           '-p', str(max_process),
           '-o', out_dir]

    if gene_ref:                            # genes.gtf
        cmd.extend(['-G', gene_ref])

    if resume:                              # resume Tophat
        cmd.extend(['--resume', out_dir])

    if read_length:                         # mate_inner_length
        cmd.extend(['-r', str(read_length)])

    if extra_args:
        logp('getting extra args for TopHat:', ' '.join(extra_args))
        cmd.extend(extra_args)

    cmd.extend([bowtie_ref, fq_pth_1])
    # paired end
    if fq_pth_2:
        cmd.append(fq_pth_2)

    logp('running command:', ' '.join(cmd))
    dt_start = dt.datetime.today()
    logp('starts at', strTime(dt_start))
    process = sp.Popen(
        cmd,
        stdout=sp.PIPE, stderr=sp.PIPE,     # pipeline
        bufsize=-1,                         # means use system buffer size
        universal_newlines=True,            # parse '\n' automatically
        cwd=out_dir
    )
    # communicate() drains both pipes while waiting; calling wait() first
    # can deadlock once TopHat fills a pipe buffer
    stdout, stderr = process.communicate()
    dt_end = dt.datetime.today()
    logp('ends at', strTime(dt_end), 'using', strDiffTime(dt_start, dt_end))

    if process.returncode != 0:
        # abnormal exit; a negative code means killed by a signal
        if stderr.startswith('Nothing to resume.'):
            logp('successfully complete, skipping')
        logerr('Original message:\n' + stderr)

    # remove unzipped fastq
    remove_unzipped(created_here)
Beispiel #10
0
def fastq_list_Sample(Sample_pth_list, args):
    '''Main program of Sample mode.

    Validates the inputs, decides the output directory of every sample
    (group) and hands each one over to ``run_sample``.

    Args:
        Sample_pth_list: nested list built by ``-S``, one inner list per
            occurrence of the option.
        args: parsed command line namespace.  ``args.out_dir`` is replaced
            by its absolute path when given.  Every sample otherwise works
            on its own deep copy, so per-sample changes do not leak into
            the next sample.

    Output directory of a sample:
        * no ``args.out_dir``: ``<dir of the first file>/Tophat``;
        * ``args.out_dir`` and more than one sample (group):
          ``<out_dir>/Sample_<file name up to the first dot>``;
        * ``args.out_dir`` and exactly one sample (group): ``out_dir``.
    '''
    def sample_out_dir(first_file, multiple):
        """Return the result dir for the sample starting with first_file."""
        if not args.out_dir:
            # default output path <path_of_sample>/Tophat
            return op.join(op.dirname(first_file), 'Tophat')
        if multiple:
            # make sample sub_dir, args.out_dir is the shared root path
            sample_name = 'Sample_' + op.basename(first_file).split('.')[0]
            logp('subdir for sample name guessed from filename:',
                 sample_name)
            return op.join(args.out_dir, sample_name)
        return args.out_dir

    def prepare_out_dir(sample_args):
        """Switch off --resume when nothing can be resumed; create out_dir."""
        cond_log_exist = op.exists(
            op.join(sample_args.out_dir, 'logs/tophat.log'))
        if sample_args.resume and not cond_log_exist:
            logp('previous log file tophat.log not found,',
                 'resume function is temporarily off')
            sample_args.resume = False

        if op.isdir(sample_args.out_dir):
            if not sample_args.resume:
                logwarn('results dir exists', sample_args.out_dir)
        else:
            os.makedirs(sample_args.out_dir, mode=0o755)

    if args.out_dir:
        args.out_dir = op.abspath(args.out_dir)
        if len(Sample_pth_list) > 1:
            # If multiple sample is input => Sample mode
            # use abspath for out_dir as root path of all results dir
            logp('multiple groups of sample get.')
            logp('root path to results is set manually:', args.out_dir)
    else:
        logp('no out_dir given,',
             'results will got to the dir of every group of sample')

    # This function mainly sets up the execution environment and rejects
    # invalid input first, then hands over to the run_<...>() functions.
    if args.seq_type == 'paired':
        # paired mode, run by pairs
        # example: -S A_R0.gz A_R1.gz -S B_R0.gz B_R1.gz -S ...
        # => [
        #     ['A_R0.gz', 'A_R1.gz'],
        #     ['B_R0.gz', 'B_R1.gz'],
        #     [...], ...
        #    ]
        total = len(Sample_pth_list)
        for i, paired_sample_list in enumerate(Sample_pth_list, start=1):
            # validation, samples should be paired
            if len(paired_sample_list) != 2:
                logerr('Input samples :', ' ,'.join(paired_sample_list),
                       'is not paired!', 'Skipping ...')
                continue

            sample_R1, sample_R2 = paired_sample_list

            # samples should exist
            if not op.exists(sample_R1):
                logerr('Input', sample_R1, 'does not exist! Skipping')
                continue
            if not op.exists(sample_R2):
                logerr('Input', sample_R2, 'does not exist! Skipping')
                continue

            # copy args so if we change the args.out_dir or args.resume,
            # other samples will not be affected
            temp_args = copy.deepcopy(args)

            # if multiple samples, show the working progress
            if total > 1:
                logm('({:d}/{:d}) Processing paired:'
                     .format(i, total),
                     sample_R1, sample_R2)
            else:
                logm('Processing paired:', sample_R1, sample_R2)

            # use absolute path
            sample_R1, sample_R2 = op.abspath(sample_R1), op.abspath(sample_R2)

            temp_args.out_dir = sample_out_dir(sample_R1, total > 1)
            prepare_out_dir(temp_args)
            run_sample(sample_R1, sample_R2, temp_args)

    else:
        # single mode, run one by one
        # flatting all sequence into a list
        # example: -S A.fq -S B.fq C.fq D.fa -S ...
        # fastq list now becomes ['A.fq', 'B.fq', 'C.fq', ...]
        flatten_sample_list = list(
            itertools.chain.from_iterable(Sample_pth_list))
        total = len(flatten_sample_list)

        for sample in flatten_sample_list:
            # sample should exist
            if not op.exists(sample):
                logerr('Input', sample, 'does not exist! Skipping')
                continue
            logm('Processing', sample)
            sample = op.abspath(sample)

            # work on a copy: assigning args.out_dir here would make every
            # following sample reuse the first sample's result dir
            temp_args = copy.deepcopy(args)
            temp_args.out_dir = sample_out_dir(sample, total > 1)
            prepare_out_dir(temp_args)
            # same function as paired-end mode, but leaving sample_R2 empty
            run_sample(sample, '', temp_args)
Beispiel #11
0
def fastq_list_Project(Project_pth_list, args):
    '''Main program of Project mode.

    For every project directory given by ``-P``, each ``Sample_*/``
    sub-directory is visited, and every row of its ``SampleSheet.csv`` is
    turned into one ``run_sample`` call on the expected
    ``<SampleID>_<Index>_L<Lane>_R1/R2_001.fastq.gz`` files.

    Results go to ``<args.out_dir>/<project dir name>`` when ``args.out_dir``
    is set, otherwise to ``Aligned/<project dir name>/TopHat`` two levels
    above the project directory; each sample gets a ``Sample_<SampleID>``
    sub-directory there.
    '''
    # -P A B ... -P C -P D ...  gives [[A, B], [C], [D], ...]
    # which is flattened to       [A, B, C, D, ...]
    flatten_projects_list = [
        project for group in Project_pth_list for project in group]
    logp('retreiving', len(flatten_projects_list), 'projects')

    for project in flatten_projects_list:
        if not (op.exists(project) and op.isdir(project)):
            # not exist or not a directory
            logerr('Cannot find the directory of project:', project,
                   'Skipping...')
            continue

        # use absolute paths: some programs cannot handle relative paths,
        # and absolute paths are easier to debug
        prj_dir = op.abspath(project)

        # a typical sequencing path output after Illumina demultiplexing
        # .../<date_index_FCID>/Unaligned/Project_Test/Sample_HAHA
        # desired output path
        # .../<date_index_FCID>/Aligned/Project_Test/TopHat/Sample_HAHA
        parent_dir, prj_name = op.split(prj_dir)
        FCID_dir = op.dirname(parent_dir)
        prj_label = prj_name[8:]    # drop the first 8 characters
        logm('Working project:', prj_label)

        # determine result dir
        if args.out_dir:
            prj_result_root = op.join(op.abspath(args.out_dir), prj_name)
        else:
            prj_result_root = op.join(FCID_dir, 'Aligned', prj_name, 'TopHat')

        # create result dir first
        if op.exists(prj_result_root):
            logwarn('project result exists')
        else:
            os.makedirs(prj_result_root, mode=0o755)

        # obtain all sample dir in the project
        sample_dirs = sorted(glob.glob(op.join(prj_dir, 'Sample_*/')))
        total_sample = len(sample_dirs)
        logp('contains', str(total_sample), 'samples')

        for i, sample_dir in enumerate(sample_dirs, start=1):
            # glob leaves a trailing separator, dirname() strips it
            sample_label = op.basename(op.dirname(sample_dir))[7:]
            progress = '({:d}/{:d}) Sample: {:s}'.format(
                i, total_sample, sample_label)
            logm(progress, 'in project', prj_label)

            # read SampleSheet.csv in the sample_dir
            ss_pth = op.join(sample_dir, 'SampleSheet.csv')
            if not op.exists(ss_pth):
                logerr('SampleSheet.csv not found! Skipping')
                continue

            # parsing SampleSheet.csv
            with open(ss_pth) as ss_csv_f:
                for row in csv.DictReader(ss_csv_f):
                    # per-row copy, so changes never leak between samples
                    temp_args = copy.deepcopy(args)
                    # typical sample name
                    # No35_ATGTCA_L003_R1_001
                    # => <sample_prefix>_R1/2_001
                    sample_prefix = '{:s}_{:s}_L{:03d}'.format(
                        row['SampleID'],
                        row['Index'],
                        int(row['Lane']))

                    # output dir
                    temp_args.out_dir = op.join(
                        prj_result_root, 'Sample_' + row['SampleID'])
                    logp('result goes to', temp_args.out_dir)
                    if op.exists(temp_args.out_dir):
                        logwarn('sample result exists')
                    else:
                        os.makedirs(temp_args.out_dir, mode=0o755)

                    # reference: the sheet is only the fallback
                    if not temp_args.ref_name:
                        temp_args.ref_name = row['SampleRef']

                    sample_R1 = op.join(
                        sample_dir, sample_prefix + '_R1_001.fastq.gz')
                    sample_R2 = (
                        op.join(sample_dir,
                                sample_prefix + '_R2_001.fastq.gz')
                        if temp_args.seq_type == 'paired' else ''
                    )
                    run_sample(sample_R1, sample_R2, temp_args)
Beispiel #12
0
def run_sample(sample_R1, sample_R2, args):
    """Pick the references for one sample and launch TopHat on it.

    Args:
        sample_R1: path of the first (or only) input file.
        sample_R2: path of the second input file; empty for single-end.
        args: parsed command line namespace (``seq_type``, ``ref_name``,
            ``bowtie_index``, ``annotation``, ``gtf_path``, ``out_dir``,
            ``read_length``, ``max_process``, ``resume``, ``rm_unzip_fq``
            and ``extra_args`` are read).

    Reference priority: an explicit, valid ``args.bowtie_index`` /
    ``args.gtf_path`` wins; otherwise the paths are derived from the root
    directory returned by ``ref_path``.  The sample is skipped (with an
    error logged) when a needed reference cannot be determined.
    """
    if args.seq_type == 'paired':
        # for paired, make sure the order is  R1, R2
        sample_R1, sample_R2 = sorted(
            [sample_R1, sample_R2], key=op.basename)

    # explicit Bowtie2 index: parent dir exists and it ends with /genome
    cond_bowtie = bool(
        args.bowtie_index
        and op.exists(op.dirname(args.bowtie_index))
        and args.bowtie_index.endswith('/genome')
    )
    # explicit annotation: an existing *.gtf file
    cond_gtf = bool(
        args.annotation
        and args.gtf_path
        and op.exists(args.gtf_path)
        and op.splitext(args.gtf_path)[1] == '.gtf'
    )

    # only consult the reference database / SampleSheet.csv when a path
    # really has to be derived; this avoids misleading lookup errors when
    # everything was given explicitly
    need_lookup = (not cond_bowtie) or (args.annotation and not cond_gtf)
    ref_root_path = ref_path(sample_R1, args.ref_name) if need_lookup else None

    # --- Bowtie2 reference ---
    if cond_bowtie:
        logp('directly specify bowtie index path:', args.bowtie_index)
        genome_bowtie_ref = args.bowtie_index
    else:
        if ref_root_path is None:
            logerr('Cannot determine the Bowite index reference! Skipping')
            return
        # the root path is appended with Sequence/Bowtie2Index/genome
        genome_bowtie_ref = op.join(ref_root_path,
                                    'Sequence/Bowtie2Index/genome')
        logp('reference: ' + args.ref_name if args.ref_name else '',
             'mapping to', genome_bowtie_ref)

    # --- gene annotation reference ---
    if args.annotation:
        if cond_gtf:
            logp('directly specify genes annotation file path:', args.gtf_path)
            gene_gtf_ref = args.gtf_path
        else:
            if ref_root_path is None:
                logerr('Cannot determine the gene annotation reference!',
                       'Skipping')
                return
            gene_gtf_ref = op.join(ref_root_path,
                                   'Annotation/Genes/genes.gtf')
            logp('reference: ' + args.ref_name if args.ref_name else '',
                 'mapping to', gene_gtf_ref)
    else:
        logp('run without a reference annotation')
        gene_gtf_ref = None

    cmd_tophat(
        gz_pth_1=sample_R1, gz_pth_2=sample_R2,
        bowtie_ref=genome_bowtie_ref,
        gene_ref=gene_gtf_ref,
        out_dir=args.out_dir,
        read_length=args.read_length,
        max_process=args.max_process,
        resume=args.resume,
        rm_unzip_fq=args.rm_unzip_fq,
        extra_args=args.extra_args
    )