def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    The STAR command is assembled as an argument list (no shell is involved,
    so paths containing spaces are passed through intact), the index
    directory is created or emptied, and STAR is then executed. Any failure
    terminates the program through sys.exit with an error message.

    STAR reads uncompressed files in this step, therefore compressed FASTA
    or annotation files (.gz, .bz2, .lzma) are rejected.

    @args fasta_file: reference genome sequence file .fasta format (uncompressed)
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional, uncompressed)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length (default value=100)
    @type onematelength: int
    """

    # Probe for the STAR binary. The output goes to os.devnull because an
    # unread PIPE can block the child once the pipe buffer is full.
    try:
        with open(os.devnull, "w") as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    compressed_ext = (".bz2", ".gz", ".lzma")
    if os.path.splitext(fasta_file)[1].lower() in compressed_ext:
        sys.exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s."
            % fasta_file)

    ## options shared by every flavour of the command
    cli_cmd = [
        "STAR",
        "--runMode", "genomeGenerate",
        "--genomeDir", out_dir,
        "--genomeFastaFiles", fasta_file,
        "--runThreadN", "%d" % num_workers,
    ]

    if genome_anno:
        if os.path.splitext(genome_anno)[1].lower() in compressed_ext:
            sys.exit(
                "error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s."
                % genome_anno)

        ## check for the file type using the first feature line
        ftype = None
        gff_hand = helper.open_file(genome_anno)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                # explicit raise so the check survives `python -O`;
                # AssertionError is kept for backward compatibility
                if len(parts) < 8:
                    raise AssertionError(rec)

                ftype, _tags = GFFParser.attribute_tags(parts[-1])
                break
            else:
                # the loop ended without finding a single feature line
                sys.exit("error: no feature record found in the annotation file %s." % genome_anno)
        finally:
            gff_hand.close()

        ## according to the file type
        # NOTE(review): a truthy ftype presumably marks GFF3-style attributes,
        # where exons point to their transcript through the Parent tag -
        # confirm against GFFParser.attribute_tags
        cli_cmd += ["--sjdbGTFfile", genome_anno]
        if ftype:
            cli_cmd += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            cli_cmd += ["--sjdbGTFfeatureExon", "exon"]
        cli_cmd += ["--sjdbOverhang", "%d" % onematelength]

    ## create the index directory if it does not exist
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            # a string argument makes the exit status non-zero
            sys.exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present any other old index files clean up the folder,
        ## but never delete the input files should they live in there
        keep = set(os.path.abspath(path) for path in (fasta_file, genome_anno) if path)
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            if os.path.abspath(file_path) in keep:
                continue
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                # best effort clean up: report and carry on
                sys.stdout.write("%s\n" % e)

    ## start the indexing job
    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(cli_cmd))
    try:
        returncode = subprocess.call(cli_cmd)
    except OSError as e:
        sys.exit("Error running STAR.\n%s" % str(e))

    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR genome index files are stored at %s\n" % out_dir)
Exemple #2
0
def create_star_genome_index(fasta_file,
                             out_dir,
                             genome_anno=None,
                             num_workers=1,
                             onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    The STAR command is assembled as an argument list (no shell is involved,
    so paths containing spaces are passed through intact), the index
    directory is created or emptied, and STAR is then executed. Any failure
    terminates the program through sys.exit with an error message.

    Compressed FASTA or annotation files (.gz, .bz2, .lzma) are rejected.

    @args fasta_file: reference genome sequence file .fasta format (uncompressed)
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional, uncompressed)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length (default value=100)
    @type onematelength: int
    """
    # sys.exit replaces the `exit` helper, which is only installed by the
    # site module and is meant for interactive sessions
    import sys

    # Probe for the STAR binary. The output goes to os.devnull because an
    # unread PIPE can block the child once the pipe buffer is full.
    try:
        with open(os.devnull, "w") as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    ## checking for the compressed form of the file extension
    compressed_ext = (".bz2", ".gz", ".lzma")
    if os.path.splitext(fasta_file)[1].lower() in compressed_ext:
        sys.exit(
            "error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s."
            % fasta_file)

    ## options shared by every flavour of the command
    cli_cmd = [
        "STAR",
        "--runMode", "genomeGenerate",
        "--genomeDir", out_dir,
        "--genomeFastaFiles", fasta_file,
        "--runThreadN", "%d" % num_workers,
    ]

    if genome_anno:
        if os.path.splitext(genome_anno)[1].lower() in compressed_ext:
            sys.exit(
                "error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s."
                % genome_anno)

        ## check for the file type using the first feature line
        ftype = None
        gff_hand = helper.open_file(genome_anno)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')

                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence
                if '\t' not in rec:
                    continue

                parts = rec.split('\t')
                # explicit raise so the check survives `python -O`;
                # AssertionError is kept for backward compatibility
                if len(parts) < 8:
                    raise AssertionError(rec)

                ftype, _tags = GFFParser.attribute_tags(parts[-1])
                break
            else:
                # the loop ended without finding a single feature line
                sys.exit(
                    "error: no feature record found in the annotation file %s."
                    % genome_anno)
        finally:
            gff_hand.close()

        ## according to the file type
        # NOTE(review): a truthy ftype presumably marks GFF3-style attributes,
        # where exons point to their transcript through the Parent tag -
        # confirm against GFFParser.attribute_tags
        cli_cmd += ["--sjdbGTFfile", genome_anno]
        if ftype:
            cli_cmd += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            cli_cmd += ["--sjdbGTFfeatureExon", "exon"]
        cli_cmd += ["--sjdbOverhang", "%d" % onematelength]

    ## create the index directory if it does not exist
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            sys.exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present any other old index files clean up the folder,
        ## but never delete the input files should they live in there
        keep = set(
            os.path.abspath(path) for path in (fasta_file, genome_anno)
            if path)
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            if os.path.abspath(file_path) in keep:
                continue
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                # best effort clean up: report and carry on
                print(e)

    ## start the indexing job
    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(cli_cmd))
    try:
        returncode = subprocess.call(cli_cmd)
    except OSError as e:
        sys.exit("Error running STAR.\n%s" % str(e))

    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" %
                 returncode)

    sys.stdout.write("STAR genome index files are stored at %s\n" % out_dir)
Exemple #3
0
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program 

    The command is assembled as an argument list and executed without a
    shell, so file names containing spaces are passed to STAR intact. Any
    failure terminates the program through sys.exit with an error message.

    @args org_db: a python dictionary with all details about a single organism;
        the keys read here are genome_index_dir, gtf, fastq_path, fastq,
        read_map_dir, short_name and max_intron_len
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str 
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000);
        NOTE: accepted for interface compatibility, currently not forwarded to STAR
    @type max_mates_gap_length: int 
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int 
    """
    # Probe for the STAR binary. The output goes to os.devnull because an
    # unread PIPE can block the child once the pipe buffer is full.
    try:
        with open(os.devnull, "w") as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf 
        gff_hand = helper.open_file(gtf_db)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')
                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence 
                if '\t' not in rec:
                    continue
                parts = rec.split('\t')
                # explicit raise so the check survives `python -O`;
                # AssertionError is kept for backward compatibility
                if len(parts) < 8:
                    raise AssertionError(rec)
                ftype, _tags = GFFParser.attribute_tags(parts[-1])
                break
            else:
                # the loop ended without finding a single feature line
                sys.exit("error: no feature record found in the annotation file %s." % gtf_db)
        finally:
            gff_hand.close()

    ## library type: one read file for single-end, two for paired-end
    read_files = ["%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])]
    if read_type == 'PE':
        read_files.append("%s/%s" % (org_db['fastq_path'], org_db['fastq'][1]))

    ## getting the command to uncompress the read file; both commands must
    ## decompress to stdout (`gzip -c` alone would compress instead)
    zip_type = {".gz": ["gzip", "-d", "-c"], ".bz2": ["bzip2", "-d", "-c"]}
    ext = os.path.splitext(org_db['fastq'][0])[1]
    # None for plain-text reads: STAR then opens the files directly
    uncompress_cmd = zip_type.get(ext.lower())

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information 
    max_lenth_intron = org_db['max_intron_len']

    make_star_run = ["STAR", "--genomeDir", genome_dir, "--readFilesIn"] + read_files
    if uncompress_cmd:
        make_star_run += ["--readFilesCommand"] + uncompress_cmd
    make_star_run += [
        "--outFileNamePrefix", out_prefix,
        "--runThreadN", "%d" % num_cpus,
        "--outFilterMultimapScoreRange", "2",
        "--outFilterMultimapNmax", "30",
        "--outFilterMismatchNmax", "4",
    ]

    ## according to the file type 
    if gtf_db is not None:
        make_star_run += [
            "--alignIntronMax", "%d" % max_lenth_intron,
            "--sjdbGTFfile", gtf_db,
        ]
        # NOTE(review): a truthy ftype presumably marks GFF3-style attributes,
        # where exons point to their transcript through the Parent tag -
        # confirm against GFFParser.attribute_tags
        if ftype:
            make_star_run += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            make_star_run += ["--sjdbGTFfeatureExon", "exon"]

    make_star_run += [
        "--sjdbScore", "1",
        "--sjdbOverhang", "5",
        "--outSAMstrandField", "intronMotif",
        "--outFilterIntronMotifs", "RemoveNoncanonical",
        "--outSAMtype", "BAM", "Unsorted",
        "--genomeLoad", "LoadAndRemove",
    ]

    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(make_star_run))
    try:
        returncode = subprocess.call(make_star_run)
    except OSError as e:
        sys.exit("Error running STAR.\n%s" % str(e))

    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
Exemple #4
0
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program 

    The command is assembled as an argument list and executed without a
    shell, so file names containing spaces are passed to STAR intact. Any
    failure terminates the program through sys.exit with an error message.

    @args org_db: a python dictionary with all details about a single organism;
        the keys read here are genome_index_dir, gtf, fastq_path, fastq,
        read_map_dir, short_name, max_intron_len and mate_length
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str 
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000);
        NOTE: accepted for interface compatibility, currently not forwarded to STAR
    @type max_mates_gap_length: int 
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int 
    """
    # Probe for the STAR binary. The output goes to os.devnull because an
    # unread PIPE can block the child once the pipe buffer is full.
    try:
        with open(os.devnull, "w") as devnull:
            subprocess.call(["STAR"], stdout=devnull, stderr=devnull)
    except OSError:
        sys.exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']## genome indices and annotation file
    gtf_db = org_db['gtf']

    ftype = None
    if gtf_db is not None:
        ## check for the annotation file type gff or gtf 
        gff_hand = helper.open_file(gtf_db)
        try:
            for rec in gff_hand:
                rec = rec.strip('\n\r')
                # skip empty line fasta identifier and commented line
                if not rec or rec[0] in ('#', '>'):
                    continue
                # skip the genome sequence 
                if '\t' not in rec:
                    continue
                parts = rec.split('\t')
                # explicit raise so the check survives `python -O`;
                # AssertionError is kept for backward compatibility
                if len(parts) < 8:
                    raise AssertionError(rec)
                ftype, _tags = GFFParser.attribute_tags(parts[-1])
                break
            else:
                # the loop ended without finding a single feature line
                sys.exit("error: no feature record found in the annotation file %s." % gtf_db)
        finally:
            gff_hand.close()

    ## library type: one read file for single-end, two for paired-end
    read_files = ["%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])]
    if read_type == 'PE':
        read_files.append("%s/%s" % (org_db['fastq_path'], org_db['fastq'][1]))

    ## getting the command to uncompress the read file; both commands must
    ## decompress to stdout (`gzip -c` alone would compress instead)
    zip_type = {".gz": ["gzip", "-d", "-c"], ".bz2": ["bzip2", "-d", "-c"]}
    ext = os.path.splitext(org_db['fastq'][0])[1]
    # None for plain-text reads: STAR then opens the files directly
    uncompress_cmd = zip_type.get(ext.lower())

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information 
    max_lenth_intron = org_db['max_intron_len']

    ##sjdbOverhang 
    mate_len = org_db['mate_length']

    make_star_run = ["STAR", "--genomeDir", genome_dir, "--readFilesIn"] + read_files
    if uncompress_cmd:
        make_star_run += ["--readFilesCommand"] + uncompress_cmd
    make_star_run += [
        "--outFileNamePrefix", out_prefix,
        "--runThreadN", "%d" % num_cpus,
        "--outFilterMultimapScoreRange", "2",
        "--outFilterMultimapNmax", "30",
        "--outFilterMismatchNmax", "3",
    ]

    ## according to the file type 
    if gtf_db is not None:
        make_star_run += [
            "--alignIntronMax", "%d" % max_lenth_intron,
            "--sjdbGTFfile", gtf_db,
        ]
        # NOTE(review): a truthy ftype presumably marks GFF3-style attributes,
        # where exons point to their transcript through the Parent tag -
        # confirm against GFFParser.attribute_tags
        if ftype:
            make_star_run += ["--sjdbGTFtagExonParentTranscript", "Parent"]
        else:
            make_star_run += ["--sjdbGTFfeatureExon", "exon"]

    make_star_run += [
        "--sjdbScore", "1",
        "--sjdbOverhang", "%d" % mate_len,
        "--outSAMstrandField", "intronMotif",
        "--outFilterIntronMotifs", "RemoveNoncanonical",
        "--outSAMtype", "BAM", "Unsorted",
        "--genomeLoad", "LoadAndRemove",
    ]

    sys.stdout.write('\trunning STAR program as: %s \n' % ' '.join(make_star_run))
    try:
        returncode = subprocess.call(make_star_run)
    except OSError as e:
        sys.exit("Error running STAR.\n%s" % str(e))

    if returncode != 0:
        sys.exit("Error running STAR.\nExit status return code = %i" % returncode)

    sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)