def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    @args fasta_file: reference genome sequence file .fasta format
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length (default value=100)
    @type onematelength: int
    """
    ## bail out early if the STAR binary cannot be found on $PATH
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    ## STAR genomeGenerate works only with uncompressed files; fail early
    ## instead of letting STAR build a broken index.
    file_prefx, ext = os.path.splitext(fasta_file)
    if ext in [".bz2", ".gz", ".lzma"]:
        exit("error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s." % fasta_file)

    if not genome_anno:
        cli_cmd = 'STAR \
        --runMode genomeGenerate \
        --genomeDir %s \
        --genomeFastaFiles %s \
        --runThreadN %d' % (out_dir, fasta_file, num_workers)
    else:
        ## check for the annotation file type (gff or gtf) by sniffing the
        ## attribute column of the first feature row
        ftype = ''  # stays empty (GTF branch) when no feature row is found
        gff_hand = helper.open_file(genome_anno)
        for rec in gff_hand:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and commented lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

        ## GFF records carry Parent tags, GTF records use the exon feature
        if ftype:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFtagExonParentTranscript Parent \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)
        else:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFfeatureExon exon \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)

    ## create the index directory if it does not exist
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            ## non-zero exit: a mkdir failure is an error, not success
            exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present, clean up any old index files from the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)
    ## NOTE(review): cli_cmd is assembled but never executed in this block -
    ## confirm that trailing code outside this view (or the caller) runs it.
def create_star_genome_index(fasta_file, out_dir, genome_anno=None, num_workers=1, onematelength=100):
    """
    Creating STAR genome index with or without using genome annotation

    @args fasta_file: reference genome sequence file .fasta format
    @type fasta_file: str
    @args out_dir: genome index binary file storage place
    @type out_dir: str
    @args genome_anno: genome annotation file (optional)
    @type genome_anno: str
    @args num_workers: number of threads to run (default value = 1)
    @type num_workers: int
    @args onematelength: One Mate Length (default value=100)
    @type onematelength: int
    """
    ## bail out early if the STAR binary cannot be found on $PATH
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    ## STAR genomeGenerate works only with uncompressed files; fail early
    file_prefx, ext = os.path.splitext(fasta_file)
    if ext in [".bz2", ".gz", ".lzma"]:  ## checking for the compressed form of the file extension
        exit("error: STAR - Generating genome indexes - recommended to use the uncompressed FASTA file %s." % fasta_file)

    if not genome_anno:
        cli_cmd = 'STAR \
        --runMode genomeGenerate \
        --genomeDir %s \
        --genomeFastaFiles %s \
        --runThreadN %d' % (out_dir, fasta_file, num_workers)
    else:
        ## the annotation must also be uncompressed for STAR
        file_prefx, ext = os.path.splitext(genome_anno)
        if ext in [".bz2", ".gz", ".lzma"]:
            exit("error: STAR - Generating genome indexes - recommended to use the uncompressed GTF/GFF file %s." % genome_anno)

        ## check for the annotation file type (gff or gtf) by sniffing the
        ## attribute column of the first feature row
        ftype = ''  # stays empty (GTF branch) when no feature row is found
        gff_hand = helper.open_file(genome_anno)
        for rec in gff_hand:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and commented lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

        ## GFF records carry Parent tags, GTF records use the exon feature
        if ftype:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFtagExonParentTranscript Parent \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)
        else:
            cli_cmd = 'STAR \
            --runMode genomeGenerate \
            --genomeDir %s \
            --genomeFastaFiles %s \
            --runThreadN %d \
            --sjdbGTFfile %s \
            --sjdbGTFfeatureExon exon \
            --sjdbOverhang %d' % (out_dir, fasta_file, num_workers, genome_anno, onematelength)

    ## create the index directory if it does not exist
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            exit("error: cannot create the directory %s." % out_dir)
    else:
        ## if present, clean up any old index files from the folder
        for the_file in os.listdir(out_dir):
            file_path = os.path.join(out_dir, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)
    ## NOTE(review): cli_cmd is assembled but never executed in this block -
    ## confirm that trailing code outside this view (or the caller) runs it.
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program

    @args org_db: a python dictionary with all details about a single organism
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000)
        NOTE(review): currently unused - the intron bound comes from org_db['max_intron_len']
    @type max_mates_gap_length: int
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int
    """
    ## bail out early if the STAR binary cannot be found on $PATH
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']  ## genome indices and annotation file
    gtf_db = org_db['gtf']

    ## check for the annotation file type gff or gtf
    ftype = ''  # stays empty (GTF branch) when no annotation/feature row is present
    if gtf_db != None:
        gff_hand = helper.open_file(gtf_db)
        for rec in gff_hand:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and commented lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

    ## library type
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0], org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])

    ## getting the command to uncompress the read file
    zip_type = {".gz" : "gzip -c", ".bz2" : "bzip2 -d -c"}
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])
    ## fall back to plain `cat` for uncompressed fastq files instead of a KeyError
    read_cmd = zip_type.get(ext, "cat")

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information
    max_lenth_intron = org_db['max_intron_len']

    ## according to the annotation file type
    if gtf_db == None:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 4 \
        --sjdbScore 1 \
        --sjdbOverhang 5 \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, read_cmd, out_prefix, num_cpus)
    elif ftype:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 4 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFtagExonParentTranscript Parent \
        --sjdbScore 1 \
        --sjdbOverhang 5 \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, read_cmd, out_prefix, num_cpus, max_lenth_intron, gtf_db)
    else:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 4 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFfeatureExon exon \
        --sjdbScore 1 \
        --sjdbOverhang 5 \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, read_cmd, out_prefix, num_cpus, max_lenth_intron, gtf_db)

    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)
    try:
        process = subprocess.Popen(make_star_run, shell=True)
        returncode = process.wait()
        if returncode != 0:
            raise Exception("Exit status return code = %i" % returncode)
        sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
    except Exception as e:
        sys.exit("Error running STAR.\n%s" % str(e))
def run_star_alignment(org_db, read_type='PE', max_mates_gap_length=100000, num_cpus=1):
    """
    wrapper for running STAR program

    @args org_db: a python dictionary with all details about a single organism
    @type org_db: defaultdict
    @args read_type: library type - paired-end or single-end (default: PE)
    @type read_type: str
    @args max_mates_gap_length: maximum insert size from the sample (default: 100000)
        NOTE(review): currently unused - the intron bound comes from org_db['max_intron_len']
    @type max_mates_gap_length: int
    @args num_cpus: number of threads to use for the run (default: 1)
    @type num_cpus: int
    """
    ## bail out early if the STAR binary cannot be found on $PATH
    try:
        subprocess.call(["STAR"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except:
        exit("Please make sure that the `STAR` binary is in your $PATH")

    from gfftools import helper, GFFParser

    genome_dir = org_db['genome_index_dir']  ## genome indices and annotation file
    gtf_db = org_db['gtf']

    ## check for the annotation file type gff or gtf
    ftype = ''  # stays empty (GTF branch) when no annotation/feature row is present
    if gtf_db != None:
        gff_hand = helper.open_file(gtf_db)
        for rec in gff_hand:
            rec = rec.strip('\n\r')

            # skip empty lines, fasta identifiers and commented lines
            if not rec or rec[0] in ['#', '>']:
                continue
            # skip the genome sequence
            if not re.search('\t', rec):
                continue

            parts = rec.split('\t')
            assert len(parts) >= 8, rec

            ftype, tags = GFFParser.attribute_tags(parts[-1])
            break
        gff_hand.close()

    ## library type
    if read_type == 'PE':
        read_file = "%s/%s %s/%s" % (org_db['fastq_path'], org_db['fastq'][0], org_db['fastq_path'], org_db['fastq'][1])
    else:
        read_file = "%s/%s" % (org_db['fastq_path'], org_db['fastq'][0])

    ## getting the command to uncompress the read file
    zip_type = {".gz" : "gzip -c", ".bz2" : "bzip2 -d -c"}
    file_prefx, ext = os.path.splitext(org_db['fastq'][0])
    ## fall back to plain `cat` for uncompressed fastq files instead of a KeyError
    read_cmd = zip_type.get(ext, "cat")

    out_prefix = '%s/%s_' % (org_db['read_map_dir'], org_db['short_name'])

    ## genomic feature information
    max_lenth_intron = org_db['max_intron_len']
    ## sjdbOverhang - read mate length
    mate_len = org_db['mate_length']

    ## according to the annotation file type
    if gtf_db == None:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 3 \
        --sjdbScore 1 \
        --sjdbOverhang %d \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, read_cmd, out_prefix, num_cpus, mate_len)
    elif ftype:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 3 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFtagExonParentTranscript Parent \
        --sjdbScore 1 \
        --sjdbOverhang %d \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, read_cmd, out_prefix, num_cpus, max_lenth_intron, gtf_db, mate_len)
    else:
        make_star_run = "STAR \
        --genomeDir %s \
        --readFilesIn %s \
        --readFilesCommand %s \
        --outFileNamePrefix %s \
        --runThreadN %d \
        --outFilterMultimapScoreRange 2 \
        --outFilterMultimapNmax 30 \
        --outFilterMismatchNmax 3 \
        --alignIntronMax %d \
        --sjdbGTFfile %s \
        --sjdbGTFfeatureExon exon \
        --sjdbScore 1 \
        --sjdbOverhang %d \
        --outSAMstrandField intronMotif \
        --outFilterIntronMotifs RemoveNoncanonical \
        --outSAMtype BAM Unsorted \
        --genomeLoad LoadAndRemove" % (genome_dir, read_file, read_cmd, out_prefix, num_cpus, max_lenth_intron, gtf_db, mate_len)

    sys.stdout.write('\trunning STAR program as: %s \n' % make_star_run)
    try:
        process = subprocess.Popen(make_star_run, shell=True)
        returncode = process.wait()
        if returncode != 0:
            raise Exception("Exit status return code = %i" % returncode)
        sys.stdout.write("STAR run completed. result file stored at %sAligned.out.bam\n" % out_prefix)
    except Exception as e:
        sys.exit("Error running STAR.\n%s" % str(e))