Example 1
    def get_comparison_report(self, args, output_dir, labels, transcripts_metrics, db_genes_metrics, reads_coverage, logger,
                              WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, TRANSCRIPT_LENS):

        logger.print_timestamp()
        logger.info('Getting COMPARISON report...')

        if len(transcripts_metrics) != 0:
            self.output_dir = UtilsPipeline.create_empty_folder(os.path.join(output_dir, 'comparison_output'))
        else:
            self.output_dir = output_dir

        self.txt_comparison_report = \
            TXTMetricsReport.TXTMetricsReport(args.blast, self.output_dir, labels, transcripts_metrics, db_genes_metrics, reads_coverage, logger,
                                              WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, TRANSCRIPT_LENS)

        if reads_coverage is not None:
            self.path_well_expressed_list_by_reads = os.path.join(self.output_dir, 'reads.{}%-covered.list'.format(int(WELL_FULLY_COVERAGE_THRESHOLDS.well_isoform_threshold * 100)))
            reads_coverage.print_well_expressed_isoforms(self.path_well_expressed_list_by_reads, logger)

            self.path_fully_expressed_list_by_reads = os.path.join(self.output_dir, 'reads.{}%-covered.list'.format(int(WELL_FULLY_COVERAGE_THRESHOLDS.fully_isoform_threshold * 100)))
            reads_coverage.print_fully_expressed_isoforms(self.path_fully_expressed_list_by_reads, logger)
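            # With default thresholds (assumed here to be 0.50 and 0.95) this yields,
            # e.g., 'reads.50%-covered.list' and 'reads.95%-covered.list'; actual
            # names depend on --lower_threshold / --upper_threshold.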


        if not args.no_plots:
            self.distribution_report = \
                DistributionReport.DistributionReport(transcripts_metrics, db_genes_metrics, self.output_dir, logger,
                                                      PRECISION)

        logger.info('  saved to {}'.format(self.output_dir))
Example 2
    @classmethod
    def get_GeneMarkS_T_report(cls, type_organism, args_threads, args_ss,
                               transcripts_path, tmp_dir, label, logger,
                               log_dir):
        # run GeneMarkS-T:
        logger.print_timestamp()
        logger.info(
            '  Running GeneMarkS-T (Gene Prediction in Transcripts)...')

        out_dir_path = UtilsPipeline.create_empty_folder(
            os.path.join(tmp_dir, label + '_GeneMarkS-T'))

        initial_dir = os.getcwd()

        os.chdir(out_dir_path)

        transcripts_name = os.path.split(transcripts_path)[-1]
        tmp_GeneMarkS_T_report_path = os.path.join(out_dir_path,
                                                   transcripts_name + '.lst')
        GeneMarkS_T_report_path = None
        tmp_log_path = os.path.join(out_dir_path, 'gms.log')
        log_out_path = os.path.join(log_dir, label + '.GeneMarkS_T.out.log')
        log_err_path = os.path.join(log_dir, label + '.GeneMarkS_T.err.log')

        GeneMarkS_T_run = 'gmst.pl'
        command = '{} {} --output {} 2>> {}'.format(
            GeneMarkS_T_run, transcripts_path, tmp_GeneMarkS_T_report_path,
            log_err_path)
        if type_organism == 'prokaryotes':
            command += ' --prok'

        if args_ss:
            command += ' --strand direct'

        logger.debug(command)

        exit_code = subprocess.call(command, shell=True)

        os.chdir(initial_dir)

        if exit_code != 0 or not os.path.exists(tmp_GeneMarkS_T_report_path):
            logger.error(message='GeneMarkS-T failed for {}!'.format(label))
        else:
            GeneMarkS_T_report_path = tmp_GeneMarkS_T_report_path

            logger.info('    saved to {}'.format(GeneMarkS_T_report_path))

        if os.path.exists(tmp_log_path):
            shutil.move(tmp_log_path, log_out_path)

        logger.info('    logs can be found in {} and {}.'.format(
            log_out_path, log_err_path))

        return GeneMarkS_T_report_path
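
    # A minimal usage sketch (an assumption, with hypothetical paths; gmst.pl
    # must be in PATH, and the owning class is called UtilsTools here only for
    # illustration):
    #
    #   report = UtilsTools.get_GeneMarkS_T_report(
    #       'eukaryotes', args.threads, False, '/path/to/transcripts.fasta',
    #       tmp_dir, 'assembly_1', logger, log_dir)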
Example 3
def run_tophat(bowtie2_index_path, reference_path, single_reads, reads_1_path,
               reads_2_path, output_dir, threads, logger, log_dir):
    program_name = 'tophat'

    # We change the FASTA file extension from .fna to .fa,
    # because tophat insists on a file named *.fa; it's picky that way.
    if os.path.splitext(reference_path)[1] != '.fa':
        new_ref_path = os.path.join(
            output_dir,
            os.path.basename(reference_path)
            [:os.path.basename(reference_path).rfind('.f')] + '.fa')
        command = 'ln -s {} {}'.format(reference_path, new_ref_path)
        subprocess.call(command, shell=True)
        reference_path = new_ref_path
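        # e.g. an input 'genome.fna' is now reachable for tophat as a
        # 'genome.fa' symlink in output_dir (illustrative file name)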

    tophat_logger_err_path = os.path.join(log_dir, program_name + '.err.log')

    tophat_outdir = UtilsPipeline.create_folder(
        os.path.join(output_dir, program_name + '_out'))

    if bowtie2_index_path is None:
        bowtie2_index_path = get_genome_bowtie2_index(reference_path, logger,
                                                      log_dir)

    logger.print_timestamp()
    logger.info('Running {}...'.format(program_name))

    reads = ''
    if reads_1_path and reads_2_path:
        reads += reads_1_path + ' ' + reads_2_path
    if single_reads:
        if reads_1_path and reads_2_path:
            reads += ','
        reads += single_reads
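    # The assembled string mirrors tophat's CLI convention (illustrative file
    # names): 'left.fq right.fq' for paired data, 'single.fq' for unpaired
    # data alone, or 'left.fq right.fq,single.fq' when both are supplied.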

    command = \
        '{program_name} -o {output_dir} {index} {reads} -p {threads} 2>> {log_out_2}'.\
            format(program_name=program_name, output_dir=tophat_outdir, index=bowtie2_index_path, reads=reads,
                   threads=threads, log_out_2=tophat_logger_err_path)
    exit_code = subprocess.call(command, shell=True)
    if exit_code != 0:
        tophat_outdir = None

        logger.error(
            '{program_name} failed!'.format(program_name=program_name))
    else:
        logger.info('  saved to {}.'.format(tophat_outdir))

    logger.info('  log can be found in {}.'.format(tophat_logger_err_path))

    return tophat_outdir
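
# A minimal usage sketch (an assumption, with hypothetical paths; tophat and
# bowtie2 must be in PATH; passing None as bowtie2_index_path builds the index
# from the reference):
#
#   tophat_dir = run_tophat(None, '/path/to/genome.fa', None,
#                           '/path/to/left.fq', '/path/to/right.fq',
#                           output_dir, 8, logger, log_dir)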
Example 4
    def __init__(self, label, output_dir, transcripts_metrics,
                 WELL_FULLY_COVERAGE_THRESHOLDS):
        # get folders for separated reports:
        self.output_dir = UtilsPipeline.create_empty_folder(
            os.path.join(output_dir, '{}_output'.format(label)))

        self.distribution_report = None

        self.txt_metrics_report = None

        # OTHER REPORTS:
        if transcripts_metrics.simple_metrics is not None:
            # UNALIGNED:
            self.path_fa_unaligned = os.path.join(
                self.output_dir, '{}.unaligned.fasta'.format(label))

            # MULTIPLE ALIGNED:
            self.path_fa_paralogous = os.path.join(
                self.output_dir, '{}.paralogs.fasta'.format(label))

            # MISASSEMBLED:
            self.path_fa_misassembled_together = os.path.join(
                self.output_dir, '{}.misassembled.fasta'.format(label))
            self.path_fa_misassembled_by_blat = os.path.join(
                self.output_dir, '{}.misassembled.blat.fasta'.format(label))
            self.path_fa_misassembled_by_blast = os.path.join(
                self.output_dir, '{}.misassembled.blast.fasta'.format(label))

            # UNIQUE ALIGNED:
            self.path_fa_unique_aligned = os.path.join(
                self.output_dir, '{}.correct.fasta'.format(label))

        if transcripts_metrics.assembly_completeness_metrics is not None:
            self.path_fully_assembled_list = os.path.join(
                self.output_dir, '{}.{}%-assembled.list'.format(
                    label,
                    int(WELL_FULLY_COVERAGE_THRESHOLDS.fully_isoform_threshold
                        * 100)))
            self.path_well_assembled_list = os.path.join(
                self.output_dir, '{}.{}%-assembled.list'.format(
                    label,
                    int(WELL_FULLY_COVERAGE_THRESHOLDS.well_isoform_threshold *
                        100)))

        if transcripts_metrics.assembly_correctness_metrics is not None:
            self.path_fa_unannotated = os.path.join(
                self.output_dir, '{}.unannotated.fasta'.format(label))
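
        # A sketch of the resulting layout for label 'assembly_1', assuming
        # default thresholds of 0.50 and 0.95 and all metric groups enabled
        # (illustrative):
        #
        #   assembly_1_output/assembly_1.unaligned.fasta
        #   assembly_1_output/assembly_1.paralogs.fasta
        #   assembly_1_output/assembly_1.misassembled.fasta
        #   assembly_1_output/assembly_1.correct.fasta
        #   assembly_1_output/assembly_1.95%-assembled.list
        #   assembly_1_output/assembly_1.50%-assembled.list
        #   assembly_1_output/assembly_1.unannotated.fasta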
Example 5
def main_utils():
    program_name = sys.argv[0][:sys.argv[0].rfind('.')]

    # parse the command line of the main program and get all arguments:
    args = UtilsPipeline.get_arguments()

    WELL_FULLY_COVERAGE_THRESHOLDS = rqconfig.well_fully_coverage_thresholds(
        args.lower_threshold, args.upper_threshold)

    ALIGNMENT_THRESHOLDS = rqconfig.alignment_thresholds()

    # run rnaQUAST on test_data:
    if args.test:
        UtilsPipeline.run_rnaQUAST_on_test_data(args, rquast_dirpath,
                                                program_name)
        # UtilsPipeline.run_rnaQUAST_on_debug_data(args, rquast_dirpath, program_name)
        sys.exit()

    UtilsPipeline.get_abspath_input_data(args)

    # create output directory:
    args.output_dir = UtilsPipeline.create_output_folder(
        args.output_dir, program_name)
    # create temporary directory:
    tmp_dir = UtilsPipeline.create_empty_folder(
        os.path.join(args.output_dir, 'tmp'))
    # create directory for log files:
    log_dir = UtilsPipeline.create_empty_folder(
        os.path.join(args.output_dir, 'logs'))

    # SET LOGGER:
    if args.debug:
        rqconfig.debug = True
        logger.set_up_console_handler(debug=True)
    else:
        logger.set_up_console_handler()
    logger.set_up_file_handler(log_dir)
    logger.print_command_line([os.path.realpath(__file__)] + sys.argv[1:],
                              wrap_after=None)
    logger.start(args.blat, tmp_dir)

    UtilsPipeline.get_input_data_exist_error(args, logger)

    # THREADING:
    args.threads = UtilsPipeline.get_num_threads(args.threads, logger)

    if args.meta:
        logger.info(
            '\nRUNNING QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES')

    # GET SINGLE FILES (merge multiple inputs if needed):
    if args.reference and args.gtf and len(args.reference) != len(args.gtf):
        logger.error('Numbers of references and gene databases are different',
                     exit_with_code=1)

    args.reference = \
        UtilsPipeline.get_single_file(args.reference, tmp_dir, 'reference', rqconfig.list_ext_fa, args.meta, logger)

    args.gtf = \
        UtilsPipeline.get_single_file(args.gtf, tmp_dir, 'gene_database', rqconfig.list_ext_gtf, args.meta, logger)

    # READ REFERENCE FROM MULTIFASTA:
    reference_dict = None
    ids_chrs = None
    if args.reference is not None:
        logger.print_timestamp()
        logger.info('Getting reference...')
        reference_dict = UtilsGeneral.list_to_dict(
            fastaparser.read_fasta(args.reference))
        logger.info('Done.')

        genome_len = UtilsGeneral.get_genome_len(reference_dict)

        ids_chrs = reference_dict.keys()

        # correction for fasta containing Y, W, etc.:
        # for id_chr in ids_chrs:
        #     reference_dict[id_chr] = UtilsGeneral.correct_nucl_seq(reference_dict[id_chr])

    # for strand-specific data we store '+' and '-' keys in dictionaries; for non-strand-specific data only '+':
    strands = UtilsGeneral.get_strands(args, logger)
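    # e.g. strands == ['+', '-'] for strand-specific data, ['+'] otherwise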

    if args.prokaryote:
        type_organism = 'prokaryotes'
    else:
        type_organism = 'eukaryotes'

    # USE ANNOTATION:
    sqlite3_db_genes = None
    sorted_exons_attr = None
    db_genes_metrics = None
    type_genes, type_isoforms, type_exons = \
        UtilsAnnotations.default_type_genes, \
        UtilsAnnotations.default_type_isoforms, \
        UtilsAnnotations.default_type_exons

    if args.gtf is not None or args.gene_db is not None:
        if args.gene_db is not None:
            gene_db_name = os.path.split(args.gene_db)[1]
            label_db = gene_db_name[:gene_db_name.rfind('.db')]
        else:
            gtf_name = os.path.split(args.gtf)[1]
            label_db = gtf_name[:gtf_name.rfind('.g')]
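            # e.g. a gene database file 'genes.gtf' yields label_db == 'genes'
            # (illustrative name)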

            if ids_chrs is not None:
                args.gtf = UtilsAnnotations.clear_gtf_by_reference_chr(
                    args.gtf, ids_chrs, tmp_dir, label_db, logger)

        sqlite3_db_genes = \
            UtilsAnnotations.create_sqlite3_db(args.gene_db, args.gtf, label_db,
                                               args.disable_infer_genes, args.disable_infer_transcripts,
                                               args.output_dir, tmp_dir, logger)

        type_genes, type_isoforms, type_exons = \
            UtilsAnnotations.get_type_features(sqlite3_db_genes, UtilsAnnotations.default_type_genes,
                                               UtilsAnnotations.default_type_isoforms,
                                               UtilsAnnotations.default_type_exons, args.prokaryote, logger)

        # if UtilsAnnotations.default_type_exons == type_exons:
        #     type_organism = 'eukaryotes'
        # else:
        #     type_organism = 'prokaryotes'

        db_genes_metrics = GeneDatabaseMetrics.GeneDatabaseMetrics(
            sqlite3_db_genes, type_genes, type_isoforms, logger)

        ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT = db_genes_metrics.max_intron_len + 100
        logger.info(
            '\nSetting maximum intron size to {}. Default is 1500000 bp.\n'.
            format(ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT))

        # set exons starts / ends and ids for binning strategy:
        if ids_chrs is not None:
            sorted_exons_attr = \
                SortedExonsAttributes.SortedExonsAttributes(sqlite3_db_genes, type_exons, strands, ids_chrs, reference_dict, logger)

    reads_coverage = None
    if args.reads_alignment is not None or \
            ((args.single_reads is not None or (args.left_reads is not None and args.right_reads is not None))
             and args.reference is not None and sqlite3_db_genes is not None):
        reads_coverage = \
            ReadsCoverage.ReadsCoverage(args.reads_alignment, args.tophat, args.reference, args.single_reads,
                                        args.left_reads, args.right_reads, reference_dict, sqlite3_db_genes, type_isoforms,
                                        sorted_exons_attr, args.strand_specific, db_genes_metrics.tot_isoforms_len,
                                        genome_len, tmp_dir, args.threads, WELL_FULLY_COVERAGE_THRESHOLDS, logger, log_dir)

    if args.transcripts is not None:
        # GET TRANSCRIPTS:
        transcripts_dicts = []
        for i_transcripts in range(len(args.transcripts)):
            logger.print_timestamp('  ')
            logger.info('  Getting transcripts from {}...'.format(
                args.transcripts[i_transcripts]))
            transcripts_dicts.append(
                UtilsGeneral.list_to_dict(
                    fastaparser.read_fasta(args.transcripts[i_transcripts])))
            logger.info('  Done.')

        # get labels for folder names and for transcript names in reports:
        all_labels_from_dirs = False
        if args.labels is None:
            args.labels = UtilsPipeline.process_labels(args.transcripts,
                                                       args.labels,
                                                       all_labels_from_dirs)
    else:
        logger.warning('No transcripts provided. Use the --transcripts option.')

    # GET PSL ALIGNMENT FILE:
    if args.alignment is None and args.reference is not None and args.transcripts is not None:
        if args.blat:
            args.alignment = UtilsTools.run_blat(None, args.reference,
                                                 transcripts_dicts,
                                                 args.labels, args.threads,
                                                 tmp_dir, logger, log_dir)
        else:
            args.alignment = UtilsTools.run_gmap(args.reference, genome_len,
                                                 args.transcripts, args.labels,
                                                 args.threads, args.gmap_index,
                                                 tmp_dir, logger, log_dir)

        #if args.fusion_misassemble_analyze:
        #    if not (args.left_reads is not None and args.right_reads is not None):
        #        logger.error('Usage: --left_reads LEFT_READS --right RIGHT_READS for analyse fusions and misassemblies',
        #                     exit_with_code=2, to_stderr=True)
        #        sys.exit(2)

    # FOR MISASSEMBLIES SEARCH:
    # GET DATABASE FOR FA ISOFORMS:
    args.blast = False
    if args.reference is not None and sqlite3_db_genes is not None and args.alignment is not None:
        blastn_run = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blastn')
        if not os.path.isfile(blastn_run):
            blastn_run = "blastn"

        if UtilsGeneral.which(blastn_run) is None:
            logger.warning(
                'blastn not found! Please add blastn to PATH for better MISASSEMBLIES metrics.'
            )
        else:
            args.blast = True

            isoforms_fa_path = os.path.join(tmp_dir,
                                            '{}.isoforms.fa'.format(label_db))
            isoforms_list = UtilsGeneral.dict_to_list(
                UtilsAnnotations.get_fa_isoforms(sqlite3_db_genes,
                                                 type_isoforms, type_exons,
                                                 reference_dict, logger))
            fastaparser.write_fasta(isoforms_fa_path, isoforms_list)

            isoforms_blast_db = UtilsTools.get_blast_db(
                isoforms_fa_path, label_db, tmp_dir, logger, log_dir)

    # LOGGING INPUT DATA:
    logger.print_input_files(args)

    # INITIALIZE TRANSCRIPTS METRICS AND REPORTS:
    transcripts_metrics = []
    separated_reports = []
    if args.transcripts is not None:
        alignments_reports = []
        blast_alignments = []
        for i_transcripts in range(len(args.transcripts)):
            # INITIALIZE TRANSCRIPTS METRICS:
            #if args.sam_file is not None:
            #    sam_file_tmp = args.sam_file[i_transcripts]
            #else:
            transcripts_metrics.append(
                TranscriptsMetrics.TranscriptsMetrics(
                    args, args.labels[i_transcripts]))

            # INITIALIZE SEPARATED REPORTS:
            separated_reports.append(
                SeparatedReport.SeparatedReport(
                    args.labels[i_transcripts], args.output_dir,
                    transcripts_metrics[i_transcripts],
                    WELL_FULLY_COVERAGE_THRESHOLDS))
            '''from joblib import Parallel, delayed

            n = len(args.transcripts)
            run_n = n / args.threads
            for i_run in range(run_n):
                tmp = Parallel(n_jobs=args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict,
                                                                                              annotated_exons, annotated_isoforms, strands, transcripts_metrics,
                                                                                              basic_isoforms_metrics, separated_reports)
                                                         for i_transcripts in range(i_run * args.threads, args.threads * (i_run + 1), 1))
                for i in range(args.threads):
                    i_transcripts = i + i_run * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]

            if n - run_n * args.threads != 0:
                tmp = Parallel(n_jobs=n - run_n * args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict,
                                                                                                     annotated_exons, annotated_isoforms, strands, transcripts_metrics,
                                                                                                     basic_isoforms_metrics, separated_reports)
                                                                for i_transcripts in range(run_n * args.threads, n, 1))
                for i in range(n - run_n * args.threads):
                    i_transcripts = i + run_n * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]'''

            logger.info()
            logger.info('Processing transcripts from {}:'.format(
                args.transcripts[i_transcripts]))

            if args.blast:
                blast_alignments.append\
                    (UtilsTools.align_transcripts_to_isoforms_by_blastn
                     (args.transcripts[i_transcripts], isoforms_blast_db, tmp_dir, args.labels[i_transcripts], logger, log_dir))
            else:
                blast_alignments.append(None)

            # PROCESS TRANSCRIPTS ALIGNMENTS:
            if transcripts_metrics[i_transcripts].simple_metrics is not None:
                # GET FILES WITH ALIGNMENTS REPORTS:
                alignments_reports.append\
                    (UtilsAlignment.AlignmentsReport.get_alignments_report
                     (args.labels[i_transcripts], args.alignment[i_transcripts], blast_alignments[i_transcripts],
                      transcripts_dicts[i_transcripts], tmp_dir, args.min_alignment, logger, ALIGNMENT_THRESHOLDS))

                # UPDATE METRICS BY ASSEMBLED TRANSCRIPTS:
                transcripts_metrics[i_transcripts].processing_assembled_psl_file\
                    (alignments_reports[i_transcripts].blat_report.assembled_psl_file, sorted_exons_attr,
                     args.strand_specific, logger, sqlite3_db_genes, type_isoforms, WELL_FULLY_COVERAGE_THRESHOLDS)

                # UPDATE METRICS BY MISASSEMBLED TRANSCRIPTS:
                # by blat:
                transcripts_metrics[i_transcripts].processing_misassembled_psl_file\
                    (alignments_reports[i_transcripts].blat_report.misassembled_psl_union_file, logger, True)
                # by blast:
                if args.blast:
                    transcripts_metrics[i_transcripts].processing_misassembled_psl_file\
                        (alignments_reports[i_transcripts].blast6_report.misassembled_blast6_union_file, logger, False)

            # GET METRICS:
            transcripts_metrics[i_transcripts].get_transcripts_metrics\
                (args, type_organism, reference_dict, args.transcripts[i_transcripts], transcripts_dicts[i_transcripts],
                 args.labels[i_transcripts], args.threads, sqlite3_db_genes, db_genes_metrics, reads_coverage, logger,
                 tmp_dir, log_dir, WELL_FULLY_COVERAGE_THRESHOLDS, rqconfig.TRANSCRIPT_LENS)

            # GET SEPARATED REPORT:
            separated_reports[i_transcripts].get_separated_report\
                (args, args.labels[i_transcripts], transcripts_dicts[i_transcripts], transcripts_metrics[i_transcripts],
                 db_genes_metrics, reads_coverage, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, rqconfig.TRANSCRIPT_LENS)

    # GET COMPARISON REPORT:
    comparison_report = None
    if len(separated_reports) != 1:
        comparison_report = ComparisonReport.ComparisonReport()
        comparison_report.get_comparison_report(
            args, args.output_dir, args.labels, transcripts_metrics,
            db_genes_metrics, reads_coverage, logger,
            WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
            rqconfig.TRANSCRIPT_LENS)

    # GET SHORT REPORT:
    short_report = \
        ShortReport.ShortReport(args, db_genes_metrics, transcripts_metrics, args.output_dir, separated_reports,
                                comparison_report, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
                                rqconfig.TRANSCRIPT_LENS)

    # REMOVE TEMPORARY DIRECTORY FROM OUTPUT DIRECTORY:
    if os.path.exists(tmp_dir) and not args.debug:
        logger.debug('Remove temporary directory {}'.format(tmp_dir))
        shutil.rmtree(tmp_dir)
        logger.debug('Done.')

    # LOGGING RESULTS PATHS:
    logger.print_path_results(args, separated_reports, comparison_report,
                              short_report)

    if args.debug:
        UtilsGeneral.profile_memory(args, reference_dict, db_genes_metrics,
                                    transcripts_metrics, separated_reports,
                                    comparison_report, logger)

    # FINISH LOGGING:
    logger.finish_up()
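
# A minimal entry-point sketch (an assumption; the actual rnaQUAST launcher may
# wrap this call differently):
#
#   if __name__ == '__main__':
#       main_utils()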
Example 6
def run_STAR(threads, reference_path, gtf_path, single_reads, left_reads,
             right_reads, output_dir, sjdbGTFtagExonParentTranscript,
             sjdbGTFtagExonParentGene, genome_len, logger, log_dir):
    # Basic STAR workflow consists of 2 steps:
    program_name = 'STAR'

    star_logger_out_path = os.path.join(log_dir, program_name + '.out.log')
    star_logger_err_path = os.path.join(log_dir, program_name + '.err.log')

    logger.print_timestamp()
    logger.info('Running {}...'.format(program_name))

    # create STAR output directory:
    star_outdir = UtilsPipeline.create_folder(
        os.path.join(output_dir, 'star_out'))
    # out_sorted_bam_path = os.path.join(star_outdir, 'Aligned.sortedByCoord.out.bam')

    # 1. Generate genome index files (supplying the reference genome sequences
    # (FASTA files) and annotations (GTF file)).
    genome_dir = os.path.join(star_outdir, 'genome_dir')

    if not os.path.exists(genome_dir):
        mode = '--runMode'

        # create tmp output directory:
        tmp_dir = UtilsPipeline.create_empty_folder(
            os.path.join(star_outdir, 'tmp_dir'))

        # create tmp_genome_dir directory:
        tmp_genome_dir = UtilsPipeline.create_empty_folder(
            os.path.join(tmp_dir, 'genome_dir'))

        genomeSAindexNbases = int(min(14, math.log(genome_len, 2) / 2 - 1))
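        # e.g. for a 1 Mb genome: log2(1e6) / 2 - 1 ~= 8.97, so
        # min(14, 8.97) -> 8 after int() truncation (STAR expects an integer)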

        command = '{program_name} {mode} genomeGenerate --runThreadN {threads} --genomeDir {tmp_genome_dir} ' \
                  '--genomeFastaFiles {reference} --genomeSAindexNbases {genomeSAindexNbases}'.\
            format(program_name=program_name, mode=mode, threads=threads, tmp_genome_dir=tmp_genome_dir,
                   reference=reference_path, genomeSAindexNbases=genomeSAindexNbases)

        if gtf_path is not None:
            command += ' --sjdbGTFfile {gtf} --sjdbGTFtagExonParentTranscript {parent_transcript} --sjdbGTFtagExonParentGene {parent_gene}'.\
                format(gtf=gtf_path, parent_transcript=sjdbGTFtagExonParentTranscript, parent_gene=sjdbGTFtagExonParentGene)
        command += ' 1>> {log_out_1} 2>> {log_out_2}'.format(
            log_out_1=star_logger_out_path, log_out_2=star_logger_err_path)

        logger.print_timestamp()
        logger.info('  ' + command)

        exit_code = subprocess.call(command, shell=True)

        logger.info('    logs can be found in {} and {}.'.format(
            star_logger_out_path, star_logger_err_path))

        if exit_code != 0:
            logger.error('{program_name_mode} failed!'.format(
                program_name_mode=program_name + ' ' + mode))
        else:
            command = 'mv {} {}'.format(tmp_genome_dir, star_outdir)
            subprocess.call(command, shell=True)

    # 2. Map reads to the genome (supplying the genome files generated in the
    # 1st step, as well as the RNA-seq reads in FASTA or FASTQ format).
    readFilesIn = ''
    if single_reads:
        readFilesIn += single_reads + ' '
    if right_reads and left_reads:
        readFilesIn += left_reads + ' ' + right_reads
    command = '{program_name} --runThreadN {threads} --genomeDir {genome_dir} --readFilesIn {readFilesIn} ' \
              '--outFileNamePrefix {out_file_name_prefix} --outSAMtype SAM ' \
              '--limitBAMsortRAM 1000706316'.\
        format(program_name=program_name, threads=threads, genome_dir=genome_dir, readFilesIn=readFilesIn,
               out_file_name_prefix=star_outdir + '/')

    # for compressed read files:
    if (single_reads
            and '.gz' in single_reads) or (left_reads and '.gz' in left_reads
                                           and right_reads
                                           and '.gz' in right_reads):
        command += ' --readFilesCommand zcat'
    # if '.bz2' in single_reads and '.bz2' in left_reads and '.bz2' in right_reads:
    #     command += ' --readFilesCommand bzcat'

    command += ' 1>> {log_out_1} 2>> {log_out_2}'.format(
        log_out_1=star_logger_out_path, log_out_2=star_logger_err_path)

    logger.print_timestamp()
    logger.info('  ' + command)
    exit_code = subprocess.call(command, shell=True)
    if exit_code != 0:
        star_outdir = None

        logger.error(
            '{program_name} failed!'.format(program_name=program_name))
    else:
        logger.info('  saved to {}.'.format(star_outdir))

    logger.info('  logs can be found in {} and {}.'.format(
        star_logger_out_path, star_logger_err_path))

    return star_outdir
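
# A minimal usage sketch (an assumption, with hypothetical paths; STAR must be
# in PATH, and the GTF attribute tags shown are STAR's defaults):
#
#   star_dir = run_STAR(8, '/path/to/genome.fa', '/path/to/genes.gtf', None,
#                       '/path/to/left.fq', '/path/to/right.fq', output_dir,
#                       'transcript_id', 'gene_id', genome_len, logger, log_dir)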