Beispiel #1
0
    def __init__(self, orientation, bed, job_name, out_sh, submit, directory):
        """Write (and optionally submit) a PBS array job that runs
        count_tags.py on every *sorted.bam in `directory`.

        Each task counts tags against the annotation `bed` with read
        orientation `orientation`, writing `<bam>.count` next to the input.
        """
        template = ('count_tags.py --annotation_file {} -f {} -b {} '
                    '-o {}.count')
        commands = [template.format(bed, orientation, bam, bam)
                    for bam in glob('{}/*sorted.bam'.format(directory))]

        submitter = Submitter(queue_type='PBS',
                              sh_filename=out_sh,
                              commands=commands,
                              job_name=job_name,
                              nodes=1,
                              ppn=16,
                              queue='home',
                              walltime='1:00:00',
                              array=True,
                              max_running=20)
        submitter.write_sh(submit=submit)
Beispiel #2
0
    def __init__(self, job_name, out_sh=None, queue_type='PBS',
                 directory='./', submit=True):
        """Submit an array job converting every *sam file in `directory`
        to BAM, keeping only alignments with mapping quality >= 10.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str, optional
            Path of the submission script to write.
        queue_type : str, optional
            Scheduler type passed through to Submitter (default 'PBS').
        directory : str, optional
            Directory searched (non-recursively) for *sam files.
        submit : bool, optional
            If True, submit the job after writing the script.
        """
        cmd_list = []
        # 'sam_path' rather than 'file' so the Python 2 builtin is
        # not shadowed.
        for sam_path in glob('{}/*sam'.format(directory)):
            cmd_list.append(
                'samtools view -bS -q 10 {} > {}.bam'.format(sam_path,
                                                             sam_path))

        sub = Submitter(queue_type=queue_type,
                        sh_filename=out_sh,
                        commands=cmd_list,
                        job_name=job_name, nodes=1, ppn=1, queue='home',
                        walltime='1:00:00',
                        array=True, max_running=20)
        sub.job(submit=submit)
Beispiel #3
0
    def __init__(self, gtf, job_name, out_sh, submit, directory):
        """Queue a cufflinks assembly, guided by the annotation `gtf`,
        for each *.sorted.bam in `directory` as a PBS array job."""
        bam_pattern = '{}/*.sorted.bam'.format(directory.rstrip('/'))
        cufflinks_template = ('cufflinks --GTF {0} --GTF-guide  '
                              '--multi-read-correct --num-threads 8 {1}')
        commands = [cufflinks_template.format(gtf, bam)
                    for bam in iglob(bam_pattern)]

        submitter = Submitter(queue_type='PBS',
                              sh_filename=out_sh,
                              commands=commands,
                              job_name=job_name,
                              nodes=1,
                              ppn=8,
                              walltime='0:30:00',
                              array=True,
                              max_running=20)
        submitter.write_sh(submit=submit)
Beispiel #4
0
    def __init__(self, job_name, out_sh=None,
                 directory='./', submit=True):
        """Submit an array job that runs `samtools index` on every
        *sorted.bam file in `directory`.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str, optional
            Path of the submission script to write.
        directory : str, optional
            Directory searched (non-recursively) for *sorted.bam files.
        submit : bool, optional
            If True, submit the job after writing the script.
        """
        command_list = []
        # 'bam' rather than 'file' so the Python 2 builtin is not shadowed.
        for bam in glob('{}/*sorted.bam'.format(directory)):
            command_list.append('samtools index {0}'.format(bam))

        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=command_list,
                        job_name=job_name, nodes=1, ppn=1, queue='home',
                        array=True,
                        max_running=20, walltime='0:30:00')

        sub.job(submit=submit)
    def __init__(self, job_name, out_sh=None,
                 directory='./', submit=True):
        """Submit an array job extracting unspliced reads from each *bam.

        For every BAM in `directory`, keeps mapped reads (`-F 4`) whose
        CIGAR string (field 6) contains no 'N' (i.e. no splice junction),
        plus header lines, writing `<bam>.unspliced.bam`.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str, optional
            Path of the submission script to write.
        directory : str, optional
            Directory searched (non-recursively) for *bam files.
        submit : bool, optional
            If True, submit the job after writing the script.
        """
        cmd_list = []
        # 'bam' rather than 'file' so the Python 2 builtin is not shadowed.
        for bam in glob('{}/*bam'.format(directory.rstrip('/'))):
            cmd_list.append(
                "samtools view -h -F 4 {0} | awk '$6 !~ /N/ || $1 ~ /@/' "
                "| "
                "samtools view -bS - > {0}.unspliced.bam".format(bam))

        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=cmd_list,
                        job_name=job_name, nodes=1, ppn=16, queue='home',
                        array=True,
                        walltime='0:30:00',
                        max_running=10)
        sub.job(submit=submit)
Beispiel #6
0
    def __init__(self, job_name, out_sh, directory, submit):
        """Submit an array job that sorts every *bam file in `directory`.

        Each task runs `samtools sort` with 8 threads (`-@ 8`) and a
        50e9-byte memory setting (`-m`), writing `<bam>.sorted.bam`
        (old-style samtools output-prefix argument).

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str
            Path of the submission script to write.
        directory : str
            Directory searched (non-recursively) for *bam files.
        submit : bool
            If True, submit the job after writing the script.
        """
        command_list = []
        for filename in glob('{}/*bam'.format(directory)):
            command_list.append(
                'samtools sort -@ 8 -m 50000000000 {0} {0}.sorted'
                .format(filename))

        # ppn=8 matches the 8 sorting threads requested above.
        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=command_list,
                        job_name=job_name, nodes=1, ppn=8, queue='home',
                        array=True,
                        max_running=10, walltime='0:30:00')

        sub.write_sh(submit=submit)
Beispiel #7
0
    def __init__(self, job_name, out_sh, submit, directory):
        """Submit an array job running single_RPKM.py on every *count
        file in `directory`, writing `<count>.rpkm` next to each input.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str
            Path of the submission script to write.
        submit : bool
            If True, submit the job after writing the script.
        directory : str
            Directory searched (non-recursively) for *count files.
        """
        cmd_list = []
        # 'count_file' rather than 'file' so the Python 2 builtin is
        # not shadowed.
        for count_file in glob('{}/*count'.format(directory.rstrip('/'))):
            cmd_list.append('single_RPKM.py -i {} -o {}.rpkm'.format(
                count_file, count_file))

        sub = Submitter(queue_type='PBS',
                        sh_filename=out_sh,
                        commands=cmd_list,
                        job_name=job_name,
                        nodes=1,
                        ppn=1,
                        queue='home',
                        walltime='1:00:00',
                        array=True,
                        max_running=20)
        sub.write_sh(submit=submit)
Beispiel #8
0
    def __init__(self, orientation, bed, job_name, out_sh, submit, directory):
        """Queue count_tags.py over every *sorted.bam in `directory` as
        a PBS array job, counting tags against annotation `bed` with
        read orientation `orientation`."""
        sorted_bams = glob('{}/*sorted.bam'.format(directory))
        cmd_list = [
            'count_tags.py --annotation_file {} -f {} -b {} -o {}.count'.format(
                bed, orientation, bam, bam)
            for bam in sorted_bams
        ]

        # 16 processors per node, one hour walltime, at most 20 array
        # tasks running at once.
        Submitter(queue_type='PBS',
                  sh_filename=out_sh,
                  commands=cmd_list,
                  job_name=job_name,
                  nodes=1,
                  ppn=16,
                  queue='home',
                  walltime='1:00:00',
                  array=True,
                  max_running=20).write_sh(submit=submit)
Beispiel #9
0
    def __init__(self, job_name, out_sh=None, directory='./', submit=True):
        """Submit an array job extracting unspliced reads from each *bam.

        For every BAM in `directory`, keeps mapped reads (`-F 4`) whose
        CIGAR string (field 6) contains no 'N' (i.e. no splice junction),
        plus header lines, writing `<bam>.unspliced.bam`.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str, optional
            Path of the submission script to write.
        directory : str, optional
            Directory searched (non-recursively) for *bam files.
        submit : bool, optional
            If True, submit the job after writing the script.
        """
        cmd_list = []
        # 'bam' rather than 'file' so the Python 2 builtin is not shadowed.
        for bam in glob('{}/*bam'.format(directory.rstrip('/'))):
            cmd_list.append(
                "samtools view -h -F 4 {0} | awk '$6 !~ /N/ || $1 ~ /@/' "
                "| "
                "samtools view -bS - > {0}.unspliced.bam".format(bam))

        sub = Submitter(queue_type='PBS',
                        sh_filename=out_sh,
                        commands=cmd_list,
                        job_name=job_name,
                        nodes=1,
                        ppn=16,
                        queue='home',
                        array=True,
                        walltime='0:30:00',
                        max_running=10)
        sub.job(submit=submit)
Beispiel #10
0
    def __init__(self, job_name, out_sh=None, directory='./', submit=True):
        """Write (and optionally submit) a PBS array job that maps every
        FASTQ (plain or gzipped) in `directory` against the 'all_ref'
        bowtie index and counts aligned reads.

        For each input `<f>`, reads that fail to map are written to
        `<f>.norep` and the per-reference alignment counts (computed by
        count_aligned_from_sam.pl on the SAM stream, header lines
        stripped) go to `<f>.repeat_counts`.
        """
        cmd_list = []
        # Plain-text FASTQ inputs: bowtie reads the file directly.
        # (The backslash continuations keep each shell command on one
        # logical line; the string retains the leading spaces.)
        for filename in glob('{}/*fastq'.format(directory)):
            cmd_list.append('bowtie \
        -c \
        -S \
        -q \
        -p 16 \
        -e 100 \
        -l 20 \
        --un {0}.norep \
        all_ref \
        {0} \
        | grep -v \"@\" \
        |  perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \
        > {0}.repeat_counts'.format(filename))

        # Gzipped inputs: stream through gunzip into bowtie's stdin ('-').
        for filename in glob('{}/*gz'.format(directory)):
            cmd_list.append('gunzip -c {0} \
        |bowtie \
        -c \
        -S \
        -q \
        -p 16 \
        -e 100 \
        -l 20 \
        --un {0}.norep \
        all_ref \
        - \
        | grep -v \"@\" \
        |  perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \
        > {0}.repeat_counts'.format(filename))

        # ppn=16 matches bowtie's -p 16 thread count above.
        sub = Submitter(queue_type='PBS',
                        sh_filename=out_sh,
                        commands=cmd_list,
                        job_name=job_name,
                        nodes=1,
                        ppn=16,
                        walltime='2:30:00',
                        array=True,
                        max_running=20)
        sub.write_sh(submit=submit)
Beispiel #11
0
    def __init__(self, job_name, out_sh=None, directory='./', submit=True):
        """Submit an array job that runs `samtools index` on every
        *sorted.bam file in `directory`.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str, optional
            Path of the submission script to write.
        directory : str, optional
            Directory searched (non-recursively) for *sorted.bam files.
        submit : bool, optional
            If True, submit the job after writing the script.
        """
        command_list = []
        # 'bam' rather than 'file' so the Python 2 builtin is not shadowed.
        for bam in glob('{}/*sorted.bam'.format(directory)):
            command_list.append('samtools index {0}'.format(bam))

        sub = Submitter(queue_type='PBS',
                        sh_filename=out_sh,
                        commands=command_list,
                        job_name=job_name,
                        nodes=1,
                        ppn=1,
                        queue='home',
                        array=True,
                        max_running=20,
                        walltime='0:30:00')

        sub.job(submit=submit)
Beispiel #12
0
    def __init__(self, job_name, out_sh=None,
                 directory='./', submit=True):
        """Write (and optionally submit) a PBS array job that maps every
        FASTQ (plain or gzipped) in `directory` against the 'all_ref'
        bowtie index and counts aligned reads.

        For each input `<f>`, reads that fail to map are written to
        `<f>.norep` and the per-reference alignment counts (computed by
        count_aligned_from_sam.pl on the SAM stream, header lines
        stripped) go to `<f>.repeat_counts`.
        """
        cmd_list = []
        # Plain-text FASTQ inputs: bowtie reads the file directly.
        # (The backslash continuations keep each shell command on one
        # logical line; the string retains the leading spaces.)
        for filename in glob('{}/*fastq'.format(directory)):
            cmd_list.append('bowtie \
        -c \
        -S \
        -q \
        -p 16 \
        -e 100 \
        -l 20 \
        --un {0}.norep \
        all_ref \
        {0} \
        | grep -v \"@\" \
        |  perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \
        > {0}.repeat_counts'.format(filename))

        # Gzipped inputs: stream through gunzip into bowtie's stdin ('-').
        for filename in glob('{}/*gz'.format(directory)):
            cmd_list.append('gunzip -c {0} \
        |bowtie \
        -c \
        -S \
        -q \
        -p 16 \
        -e 100 \
        -l 20 \
        --un {0}.norep \
        all_ref \
        - \
        | grep -v \"@\" \
        |  perl /home/ppliu/tscc_scripts/count_aligned_from_sam.pl \
        > {0}.repeat_counts'.format(filename))

        # ppn=16 matches bowtie's -p 16 thread count above.
        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=cmd_list, job_name=job_name, nodes=1,
                        ppn=16, walltime='2:30:00',
                        array=True,
                        max_running=20
        )
        sub.write_sh(submit=submit)
Beispiel #13
0
    def __init__(self, job_name, out_sh, submit=False, directory='./'):
        """Queue quality filtering of every *.fastq.gz in `directory`.

        Each task strips fastx artifacts, quality-trims (t=30, minimum
        length 20), keeps reads with >= 90% of bases at Q >= 30, and
        writes the gzipped result (-z) under filtered/.
        """
        base_dir = directory.rstrip('/')
        try:
            os.mkdir('{}/filtered/'.format(base_dir))
        except OSError:
            # Directory already exists.
            pass

        # NOTE(review): the redirection target 'filtered/{0}' embeds the
        # full input path, so output only lands in the directory created
        # above when the commands run from inside `directory` -- confirm.
        #TODO: the -l argument "20" should be a % of read length
        pipeline = ('echo {0}; zcat {0} | fastx_artifacts_filter | '
                    'fastq_quality_trimmer -l 20 -t 30 | '
                    'fastq_quality_filter -q 30 -p 90 -z '
                    '> filtered/{0}')
        commands = [pipeline.format(fastq_gz)
                    for fastq_gz in iglob('{}/*.fastq.gz'.format(base_dir))]

        submitter = Submitter(queue_type='PBS', sh_filename=out_sh,
                              commands=commands,
                              job_name=job_name, nodes=1, ppn=2,
                              queue='home', array=True,
                              max_running=20, walltime='1:00:00')

        submitter.write_sh(submit=submit)
Beispiel #14
0
    def __init__(self,
                 job_name,
                 out_sh=None,
                 queue_type='PBS',
                 directory='./',
                 submit=True):
        """Submit an array job converting every *sam file in `directory`
        to BAM, keeping only alignments with mapping quality >= 10.

        Parameters
        ----------
        job_name : str
            Name for the queued job.
        out_sh : str, optional
            Path of the submission script to write.
        queue_type : str, optional
            Scheduler type passed through to Submitter (default 'PBS').
        directory : str, optional
            Directory searched (non-recursively) for *sam files.
        submit : bool, optional
            If True, submit the job after writing the script.
        """
        cmd_list = []
        # 'sam_path' rather than 'file' so the Python 2 builtin is
        # not shadowed.
        for sam_path in glob('{}/*sam'.format(directory)):
            cmd_list.append('samtools view -bS -q 10 {} > {}.bam'.format(
                sam_path, sam_path))

        sub = Submitter(queue_type=queue_type,
                        sh_filename=out_sh,
                        commands=cmd_list,
                        job_name=job_name,
                        nodes=1,
                        ppn=1,
                        queue='home',
                        walltime='1:00:00',
                        array=True,
                        max_running=20)
        sub.job(submit=submit)
    def __init__(self, genome, out_dir='./', directory='./', submit=True,
                 ppn=8, job_name='STAR', out_sh='STAR.sh', walltime='0:30:00',
                 outReadsUnmapped='Fastx', outFilterMismatchNmax=5,
                 outFilterMismatchNoverLmax=0.3, outFilterMultimapNmax=5,
                 outFilterScoreMin=10, outFilterType='BySJout',
                 outSAMattributes='All',
                 outSAMstrandField='intronMotif',
                 clip5pNbases=0, clip3pNbases=0, additional_STAR_args='', extension='.gz'):
        """Read the fastq files in a directory, assuming that the first 2
        underscore-separated parts of a filename are the unique sample ID,
        then running STAR. Most of these arguments are the defaults in STAR,
        except:

        outReadsUnmapped : str
            'Fastx' instead of 'None' so the unmapped reads can be remapped
            to the spikein genomes, for example
        outFilterMismatchNmax : int
            5 instead of 10
        outFilterMultimapNmax : int
            5 instead of 10
        outFilterType : str
            'BySJout' instead of 'None', so that all junction reads pass our
            stringent filter of at least 4bp overhang for annotated and at
            least 8bp overhang for unannotated
        outSAMattributes : str
            'All' instead of 'None' for more information just in case
        outSAMstrandField : str
            'intronMotif' instead of 'None' for compatibility with Cufflinks
        """

        commands = []


        # Make the directory
        try:
            os.mkdir(out_dir)
        except OSError:
            # It's already there, don't do anything
            pass

        # Set of unique sample ids for checking if we've read them all
        sample_ids = set([])

        # One STAR command per unique sample; only R1 files are iterated
        # and the R2 mate is derived from the R1 name below.
        for read1 in iglob('{}/*R1*{}'.format(directory.rstrip('/'), extension)):
            # if read1.endswith('gz'):
            #     compressed = True
            # else:
            #     compressed = False
            # readFilesCommand = 'zcat' if compressed else 'cat'

            # Remove trailing "A" and "B" so they get merged
            # (note rstrip removes ALL trailing chars in 'ABCDEFGH').
            sample_id = '_'.join(os.path.basename(read1).split('.')[0].split(
                '_')[:2]).rstrip(
                'ABCDEFGH')
            if sample_id in sample_ids:
                continue
            # Paired-end if a matching R2 file sits next to this R1 file.
            paired = os.path.isfile(read1.replace('R1', 'R2'))
            print sample_id, 'paired', paired

            # Re-bind read1 to a comma-separated list of ALL R1 files for
            # this sample, as STAR's --readFilesIn expects.
            # NOTE(review): sample_id has no directory component, so this
            # glob matches relative to the current working directory, not
            # `directory` -- confirm this is intended.
            read1 = ','.join(glob('{}*R1*{}'.format(sample_id, extension)))
            read2 = read1.replace('R1', 'R2') if paired else ""
            print 'R1', read1
            print 'R2', read2
            sample_ids.add(sample_id)

            # print sample_id
            # NOTE(review): --readFilesCommand is hard-coded to zcat, so
            # inputs must be gzipped regardless of `extension` -- confirm.
            commands.append('''STAR \
        --runMode alignReads \
        --runThreadN {0} \
        --genomeDir {1} \
        --genomeLoad LoadAndRemove \
        --readFilesCommand zcat \
        --readFilesIn {2} {3} \
        --outFileNamePrefix {4}/{5}. \
        --outReadsUnmapped {6} \
        --outFilterMismatchNmax {7} \
        --outFilterMismatchNoverLmax {8} \
        --outFilterMultimapNmax {9} \
        --outFilterScoreMin {10} \
        --outFilterType {11} \
        --outSAMattributes {12} \
        --outSAMstrandField {13} \
        --clip5pNbases {14} \
        --clip3pNbases {15} \
        {16}'''.format(ppn,
                       genome,
                   read1,
                   read2,
                   out_dir.rstrip('/'),
                   sample_id,
                   outReadsUnmapped,
                   outFilterMismatchNmax,
                   outFilterMismatchNoverLmax,
                   outFilterMultimapNmax,
                   outFilterScoreMin,
                   outFilterType,
                   outSAMattributes,
                   outSAMstrandField,
                   clip5pNbases,
                   clip3pNbases,
                   additional_STAR_args))

        # One array task per sample; ppn matches --runThreadN above.
        sub = Submitter(queue_type='PBS', sh_filename=out_sh,
                        commands=commands,
                        job_name=job_name, nodes=1, ppn=ppn,
                        array=True, max_running=20,
                        queue='home', walltime=walltime)

        sub.write_sh(submit=submit)
    def __init__(self,
                 genome,
                 out_dir='./',
                 directory='./',
                 submit=True,
                 ppn=8,
                 job_name='STAR',
                 out_sh='STAR.sh',
                 walltime='0:30:00',
                 outReadsUnmapped='Fastx',
                 outFilterMismatchNmax=5,
                 outFilterMismatchNoverLmax=0.3,
                 outFilterMultimapNmax=5,
                 outFilterScoreMin=10,
                 outFilterType='BySJout',
                 outSAMattributes='All',
                 outSAMstrandField='intronMotif',
                 clip5pNbases=0,
                 clip3pNbases=0,
                 additional_STAR_args='',
                 extension='.gz'):
        """Read the fastq files in a directory, assuming that the first 2
        underscore-separated parts of a filename are the unique sample ID,
        then running STAR. Most of these arguments are the defaults in STAR,
        except:

        outReadsUnmapped : str
            'Fastx' instead of 'None' so the unmapped reads can be remapped
            to the spikein genomes, for example
        outFilterMismatchNmax : int
            5 instead of 10
        outFilterMultimapNmax : int
            5 instead of 10
        outFilterType : str
            'BySJout' instead of 'None', so that all junction reads pass our
            stringent filter of at least 4bp overhang for annotated and at
            least 8bp overhang for unannotated
        outSAMattributes : str
            'All' instead of 'None' for more information just in case
        outSAMstrandField : str
            'intronMotif' instead of 'None' for compatibility with Cufflinks
        """

        commands = []

        # Make the directory
        try:
            os.mkdir(out_dir)
        except OSError:
            # It's already there, don't do anything
            pass

        # Set of unique sample ids for checking if we've read them all
        sample_ids = set([])

        # One STAR command per unique sample; only R1 files are iterated
        # and the R2 mate is derived from the R1 name below.
        for read1 in iglob('{}/*R1*{}'.format(directory.rstrip('/'),
                                              extension)):
            # if read1.endswith('gz'):
            #     compressed = True
            # else:
            #     compressed = False
            # readFilesCommand = 'zcat' if compressed else 'cat'

            # Remove trailing "A" and "B" so they get merged
            # (note rstrip removes ALL trailing chars in 'ABCDEFGH').
            sample_id = '_'.join(
                os.path.basename(read1).split('.')[0].split('_')[:2]).rstrip(
                    'ABCDEFGH')
            if sample_id in sample_ids:
                continue
            # Paired-end if a matching R2 file sits next to this R1 file.
            paired = os.path.isfile(read1.replace('R1', 'R2'))
            print sample_id, 'paired', paired

            # Re-bind read1 to a comma-separated list of ALL R1 files for
            # this sample, as STAR's --readFilesIn expects.
            # NOTE(review): sample_id has no directory component, so this
            # glob matches relative to the current working directory, not
            # `directory` -- confirm this is intended.
            read1 = ','.join(glob('{}*R1*{}'.format(sample_id, extension)))
            read2 = read1.replace('R1', 'R2') if paired else ""
            print 'R1', read1
            print 'R2', read2
            sample_ids.add(sample_id)

            # print sample_id
            # NOTE(review): --readFilesCommand is hard-coded to zcat, so
            # inputs must be gzipped regardless of `extension` -- confirm.
            commands.append('''STAR \
        --runMode alignReads \
        --runThreadN {0} \
        --genomeDir {1} \
        --genomeLoad LoadAndRemove \
        --readFilesCommand zcat \
        --readFilesIn {2} {3} \
        --outFileNamePrefix {4}/{5}. \
        --outReadsUnmapped {6} \
        --outFilterMismatchNmax {7} \
        --outFilterMismatchNoverLmax {8} \
        --outFilterMultimapNmax {9} \
        --outFilterScoreMin {10} \
        --outFilterType {11} \
        --outSAMattributes {12} \
        --outSAMstrandField {13} \
        --clip5pNbases {14} \
        --clip3pNbases {15} \
        {16}'''.format(ppn, genome, read1, read2, out_dir.rstrip('/'),
                       sample_id, outReadsUnmapped, outFilterMismatchNmax,
                       outFilterMismatchNoverLmax, outFilterMultimapNmax,
                       outFilterScoreMin, outFilterType, outSAMattributes,
                       outSAMstrandField, clip5pNbases, clip3pNbases,
                       additional_STAR_args))

        # One array task per sample; ppn matches --runThreadN above.
        sub = Submitter(queue_type='PBS',
                        sh_filename=out_sh,
                        commands=commands,
                        job_name=job_name,
                        nodes=1,
                        ppn=ppn,
                        array=True,
                        max_running=20,
                        queue='home',
                        walltime=walltime)

        sub.write_sh(submit=submit)