Esempio n. 1
0
def run_blat(args_database, args_reference, transcripts_dicts, args_labels,
             args_threads, tmp_dir, logger, log_dir):
    """Align every transcripts set to the reference with BLAT.

    The BLAT executable is looked up inside ``rqconfig.rnaQUAST_LOCATION``
    first and on PATH otherwise.  When it cannot be found an error is logged
    and nothing is aligned (the function falls through and returns None).

    :param args_database: per-sequence reference files, or None
    :param args_reference: path to the reference FASTA file
    :param transcripts_dicts: one transcripts dictionary per assembly
    :param args_labels: one label per assembly, same order as transcripts_dicts
    :param args_threads: passed through to parallel_blat_run
    :param tmp_dir: directory for intermediate files
    :param logger: logger used for errors and timing messages
    :param log_dir: directory for tool logs
    :return: list with one parallel_blat_run result per transcripts set
    """
    # references of this size or more are handled as split files, which
    # saves us from a BLAT segfault on references larger than 4g:
    size_limit = 4294967296

    bundled_blat = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blat')
    blat_run = bundled_blat if os.path.isfile(bundled_blat) else "blat"

    if UtilsGeneral.which(blat_run) is None:
        logger.error(
            'BLAT not found! Please add BLAT to PATH for ALIGNMENT metrics.')
        return None

    # a big single reference file is split into files with
    # scaffolds/patches/chromosomes unless such a database was given:
    if args_database is None and \
            os.path.getsize(args_reference) >= size_limit:
        args_database = get_database_split_chr(tmp_dir, args_reference,
                                               logger)

    # in both cases the sequences are upper-cased (repeats are not masked):
    if os.path.getsize(args_reference) < size_limit:
        args_reference = UtilsGeneral.get_upper_case_fasta(
            args_reference, tmp_dir, logger)
        reference_pathes = [args_reference]
    else:
        args_database = get_upper_case_database_split_chr(
            args_database, tmp_dir, logger)
        reference_pathes = args_database

    # RUN BLAT, one transcripts set at a time, and report the time spent:
    args_alignment = []
    for i_transcripts, transcripts_dict in enumerate(transcripts_dicts):
        started_at = datetime.datetime.now()

        psl_alignment = parallel_blat_run.parallel_blat_run(
            transcripts_dict, reference_pathes, args_threads, tmp_dir,
            args_labels[i_transcripts], logger, log_dir)
        args_alignment.append(psl_alignment)

        spent_time = datetime.datetime.now() - started_at
        logger.info('\nBLAT TIME: {}\n\n'.format(spent_time))

    return args_alignment
Esempio n. 2
0
    def is_suspected_fusion(self, best_union_alignments,
                            fusion_err_threshhold):
        """Return 1 when the alignments pass both fusion checks, else 0.

        Check 1 (query side): for alignments that are neighbours in query
        start order, ``abs(max(starts) - min(ends)) + 1`` must not exceed
        ``fusion_err_threshhold``.

        Check 2 (target side): for every strand and target sequence, sorting
        the alignments by target start must give the same order as sorting
        them by target end.

        :param best_union_alignments: alignments exposing ``strand``,
            ``query_fragment`` and ``target_fragment`` (with ``start``,
            ``end`` and, for the target, ``name``)
        :param fusion_err_threshhold: largest tolerated query gap / overlap
        :return: 1 or 0
        """
        q_starts = []
        q_ends = []
        # target coordinates grouped as [strand][target name] -> list:
        t_starts = {'+': {}, '-': {}}
        t_ends = {'+': {}, '-': {}}

        for alignment in best_union_alignments:
            q_starts.append(alignment.query_fragment.start)
            q_ends.append(alignment.query_fragment.end)
            strand = alignment.strand
            id_chr = alignment.target_fragment.name
            # The list must be looked up per strand: testing membership in
            # the outer dict (keys '+' / '-') re-created the list for every
            # alignment, so only the last coordinate survived and the order
            # check below could never fail.
            t_starts[strand].setdefault(id_chr, []).append(
                alignment.target_fragment.start)
            t_ends[strand].setdefault(id_chr, []).append(
                alignment.target_fragment.end)

        # Check 1: neighbours in query start order must be close enough.
        q_sort_index, _ = UtilsGeneral.get_order_indexes_elements(q_starts)
        for i in range(len(q_sort_index) - 1):
            i_alignment0 = q_sort_index[i]
            i_alignment1 = q_sort_index[i + 1]
            start = min(q_ends[i_alignment0], q_ends[i_alignment1])
            end = max(q_starts[i_alignment0], q_starts[i_alignment1])
            if abs(end - start) + 1 > fusion_err_threshhold:
                return 0

        # Check 2: start order and end order must agree on every target.
        for strand in ('+', '-'):
            for id_chr in t_starts[strand]:
                starts_order, _ = UtilsGeneral.get_order_indexes_elements(
                    t_starts[strand][id_chr])
                ends_order, _ = UtilsGeneral.get_order_indexes_elements(
                    t_ends[strand][id_chr])
                if starts_order != ends_order:
                    return 0

        return 1
Esempio n. 3
0
def get_best_alignment_set(transcript_alignments, ALIGNMENT_THRESHOLDS):
    """Return the best-scoring alignment set for one transcript.

    Alignments are visited in the order given by
    UtilsGeneral.get_order_indexes_elements applied to their query ends.
    For each of them get_best_i proposes a candidate set together with its
    score; every candidate is remembered (and its score stored under
    ``str(candidate)``) so that later calls can build on it.

    :param transcript_alignments: alignments exposing ``query_fragment.end``
    :param ALIGNMENT_THRESHOLDS: thresholds forwarded to get_best_i
    :return: the candidate with the highest score (the latest one on ties),
        or None when there are no alignments
    """
    q_ends = [alignment.query_fragment.end
              for alignment in transcript_alignments]
    order_by_q_end, _ = UtilsGeneral.get_order_indexes_elements(q_ends)

    # candidates found so far, starting from the empty set with score 0:
    best_list = [[]]
    scores = {str([]): 0}

    best_b = None
    best_score = -float('Inf')
    for i_alignment in order_by_q_end:
        candidate, candidate_score = get_best_i(
            best_list, transcript_alignments[i_alignment], scores,
            ALIGNMENT_THRESHOLDS)

        best_list.append(candidate)
        scores[str(candidate)] = candidate_score

        # '>=' on purpose: on equal scores the later candidate wins
        if candidate_score >= best_score:
            best_b, best_score = candidate, candidate_score

    return best_b
Esempio n. 4
0
 def print_version(self, program_name, version="unknown", build="unknown", location=None, to_stderr=False):
     """Report ``"<program_name>: <version>[, <build>]"``.

     :param program_name: label printed before the version
     :param version: version to report when ``location`` is not given
     :param build: build to report; omitted while it equals "unknown"
     :param location: when truthy, version and build are taken from
         UtilsGeneral.get_version(location) instead of the arguments
     :param to_stderr: write to sys.stderr instead of logging via self.info
     """
     if location:
         version, build = UtilsGeneral.get_version(location)

     message = program_name + ": " + str(version)
     if build != "unknown":
         message += ", " + str(build)

     if to_stderr:
         # write() adds no line terminator; without it consecutive version
         # reports end up glued together on a single line
         sys.stderr.write(message + "\n")
     else:
         self.info(message)
Esempio n. 5
0
    def print_tools_versions(self, blat, tmp_dir, to_stderr=False):
        """Report the versions of the libraries and external tools in use.

        Reported in this order: matplotlib, joblib, gffutils, blastn,
        makeblastdb and finally the aligner (blat when ``blat`` is truthy,
        gmap otherwise).

        :param blat: selects which aligner version is reported
        :param tmp_dir: forwarded to UtilsGeneral.get_version_by_key
        :param to_stderr: forwarded to print_version
        """
        import matplotlib
        import joblib
        import gffutils

        self._logger.info('External tools:')

        # Python libraries carry their version themselves:
        libraries = (('  matplotlib', matplotlib),
                     ('  joblib', joblib),
                     ('  gffutils', gffutils))
        for label, module in libraries:
            self.print_version(label,
                               version=module.__version__,
                               to_stderr=to_stderr)

        # BLAST + tools share the same '-version' output layout:
        for tool in ('blastn', 'makeblastdb'):
            version, _ = UtilsGeneral.get_version_by_key(tool,
                                                         '-version',
                                                         tmp_dir,
                                                         v_ident=tool + ': ',
                                                         b_ident='build ')
            self.print_version('  ' + tool,
                               version=version,
                               to_stderr=to_stderr)

        # the aligner: blat or GMAP
        if blat:
            aligner = 'blat'
            version, _ = UtilsGeneral.get_version_by_key(
                'blat', '', tmp_dir, v_ident='blat - Standalone BLAT v. ')
        else:
            aligner = 'gmap'
            version, _ = UtilsGeneral.get_version_by_key(
                'gmap',
                '--version',
                tmp_dir,
                v_ident='Part of GMAP package, version ',
                b_ident='Build target: ')
        self.print_version('  ' + aligner,
                           version=version,
                           to_stderr=to_stderr)
Esempio n. 6
0
def get_union_fake_blat_alignments(union_lines, union_alignments,
                                   single_transcript_lines,
                                   single_transcript_alignments,
                                   fout_fake_blat, ALIGNMENT_THRESHOLDS):
    """Merge neighbouring alignments that BLAT split artificially.

    Alignments are walked in query start order.  Whenever
    best_alignment_set.is_union_fake_blat accepts the running alignment and
    the next one, both PSL lines are written to ``fout_fake_blat`` and the
    pair is replaced by get_union_fake_blat_alignment(...); otherwise the
    running alignment is kept as is and the next one becomes the running
    alignment.  If anything was merged, the single transcript lists are
    updated in place: entries that disappeared are removed and the new ones
    are appended.

    :param union_lines: PSL lines, parallel to union_alignments
    :param union_alignments: alignments exposing ``query_fragment.start``
    :param single_transcript_lines: all PSL lines of the transcript
    :param single_transcript_alignments: all alignments of the transcript
    :param fout_fake_blat: writable file receiving the merged line pairs
    :param ALIGNMENT_THRESHOLDS: thresholds forwarded to is_union_fake_blat
    :return: (new union lines, new union alignments,
        single_transcript_lines, single_transcript_alignments)
    """
    merged_alignments = []
    merged_lines = []

    q_starts = [alignment.query_fragment.start
                for alignment in union_alignments]
    order, _ = UtilsGeneral.get_order_indexes_elements(q_starts)

    # the running (possibly already merged) alignment and its PSL line:
    current = union_alignments[order[0]]
    current_line = union_lines[order[0]]

    for position in range(1, len(order)):
        following = union_alignments[order[position]]
        following_line = union_lines[order[position]]

        if best_alignment_set.is_union_fake_blat(current, following,
                                                 ALIGNMENT_THRESHOLDS):
            fout_fake_blat.write(current_line + '\n' + following_line +
                                 '\n\n\n')
            current = get_union_fake_blat_alignment(current, following)
            current_line = current.get_psl_line_from_alignment()
        else:
            merged_alignments.append(current)
            merged_lines.append(current_line)
            current = following
            current_line = following_line

    # the running alignment is still pending once all neighbours are seen:
    merged_alignments.append(current)
    merged_lines.append(current_line)

    # update single transcript lines / alignments:
    if len(union_lines) != len(merged_lines):
        for i_alignment in range(len(union_lines)):
            old_line = union_lines[i_alignment]
            if old_line not in merged_lines:
                single_transcript_lines.remove(old_line)
                single_transcript_alignments.remove(
                    union_alignments[i_alignment])
        for new_line, new_alignment in zip(merged_lines, merged_alignments):
            if new_line not in single_transcript_lines:
                single_transcript_lines.append(new_line)
                single_transcript_alignments.append(new_alignment)

    return merged_lines, merged_alignments, single_transcript_lines, single_transcript_alignments
Esempio n. 7
0
def get_fa_isoforms(sqlite3_db_genes, type_isoforms, type_exons,
                    reference_dict, logger):
    """Build the nucleotide sequence of every isoform in the gene database.

    Isoforms located on a sequence missing from ``reference_dict`` are
    skipped.  The sequence of an isoform is the concatenation of its exons
    ordered by start; an isoform without exons is used as its own single
    exon (prokaryotes).  Isoforms on the '-' strand are reverse complemented.
    Isoforms whose sequence turns out empty are dropped, and a warning about
    an inconsistent reference / database is logged at the end.

    :param sqlite3_db_genes: database offering features_of_type() and
        children()
    :param type_isoforms: feature type of isoforms
    :param type_exons: feature type of exons
    :param reference_dict: sequence id -> reference sequence
    :param logger: logger used for progress and diagnostics
    :return: dict isoform id -> sequence
    """
    logger.print_timestamp()
    logger.info("Extracting isoforms sequences...")

    isoforms_dict = {}
    found_empty_isoform = False

    for transcript in list(sqlite3_db_genes.features_of_type(type_isoforms)):
        if transcript.seqid not in reference_dict:
            continue

        exons = list(
            sqlite3_db_genes.children(transcript.id,
                                      featuretype=type_exons,
                                      order_by='start'))
        # for prokaryotes:
        if not exons:
            exons = [transcript]

        # start, end: 1-based coordinates; start must be <= end
        sequence = ''.join(
            reference_dict[exon.seqid][exon.start - 1:exon.end]
            for exon in exons)
        if transcript.strand == '-':
            sequence = UtilsGeneral.rev_comp(sequence)

        isoforms_dict[transcript.id] = sequence

        if len(sequence) == 0:
            found_empty_isoform = True

            logger.debug(
                'Inconsistent length chromosome / scaffold and transcript start / end: {} skipped'
                .format(transcript.id))

            isoforms_dict.pop(transcript.id)

    logger.info('Done.')

    if found_empty_isoform:
        logger.warning('Inconsistent reference sequences and genes database')

    return isoforms_dict
Esempio n. 8
0
    def get_confirmed_fusion_misassemblies(self, best_union_alignments,
                                           fusion_err_threshhold):
        """Decide whether an alignment set is a misassembly or a fusion.

        The suspicion flags come from get_suspected_fusion_misassemble:

        * a suspected misassembly is confirmed as a misassembly;
        * a suspected fusion of a transcript that is absent from
          ``self.misassemble_by_reads_dict`` is confirmed as a fusion;
        * a suspected fusion of a transcript present there is a misassembly
          when a junction between neighbouring aligned parts (in query start
          order) lies strictly inside one of its intervals - in that case
          ``self.fusion_num`` is decreased and ``self.misassemble_num`` is
          increased - and a fusion otherwise.

        :return: tuple (is_confirmed_mis, is_confirmed_fus), each 0 or 1
        """
        q_name = best_union_alignments[0].query_fragment.name

        suspected_fus, suspected_mis = self.get_suspected_fusion_misassemble(
            best_union_alignments, fusion_err_threshhold)

        if suspected_mis == 1:
            return 1, 0

        unknown_to_reads = q_name not in self.misassemble_by_reads_dict
        if suspected_fus != 1:
            return 0, 0
        if unknown_to_reads:
            return 0, 1

        # say that misassembled if ends of neighboring aligned parts of
        # transcript are into misassembled by reads intervals:
        q_starts = [alignment.query_fragment.start
                    for alignment in best_union_alignments]
        order, _ = UtilsGeneral.get_order_indexes_elements(q_starts)
        for position in range(len(order) - 1):
            left = best_union_alignments[order[position]]
            right = best_union_alignments[order[position + 1]]
            junction = (left.query_fragment.end, right.query_fragment.start)
            q_start = min(junction)
            q_end = max(junction)
            for interval in self.misassemble_by_reads_dict[q_name]:
                mis_start = interval[0]
                mis_end = interval[1]
                start_inside = mis_start < q_start < mis_end
                end_inside = mis_start < q_end < mis_end
                if start_inside or end_inside:
                    # re-classify: one fusion less, one misassembly more
                    self.fusion_num -= 1
                    self.misassemble_num += 1
                    return 1, 0

        return 0, 1
Esempio n. 9
0
def main_utils():
    """Run the whole quality assessment pipeline for the current command line.

    Steps, in order: parse the arguments, create the output / tmp / logs
    folders and set up the logger, read the reference and the gene database,
    optionally build the reads coverage, align the transcripts (BLAT or
    GMAP) when no alignment was supplied, build the isoforms BLAST database,
    compute metrics and a separated report for every transcripts file, then
    the comparison and short reports, and finally remove the temporary
    folder (kept in debug mode).

    Relies on module-level names that are not defined in this function:
    ``logger``, ``rquast_dirpath`` and ``PRECISION``.
    """
    # Strip the extension only: slicing at rfind('.') removed the last
    # character (or more) whenever the script path had no extension.
    program_name = os.path.splitext(sys.argv[0])[0]

    # parse running string of main program and get all arguments:
    args = UtilsPipeline.get_arguments()

    WELL_FULLY_COVERAGE_THRESHOLDS = rqconfig.well_fully_coverage_thresholds(
        args.lower_threshold, args.upper_threshold)

    ALIGNMENT_THRESHOLDS = rqconfig.alignment_thresholds()

    # run rnaQUAST on test_data:
    if args.test:
        UtilsPipeline.run_rnaQUAST_on_test_data(args, rquast_dirpath,
                                                program_name)
        # UtilsPipeline.run_rnaQUAST_on_debug_data(args, rquast_dirpath, program_name)
        sys.exit()

    UtilsPipeline.get_abspath_input_data(args)

    # create output directory:
    args.output_dir = UtilsPipeline.create_output_folder(
        args.output_dir, program_name)
    # create temporary directory:
    tmp_dir = UtilsPipeline.create_empty_folder(
        os.path.join(args.output_dir, 'tmp'))
    # create directory for log files:
    log_dir = UtilsPipeline.create_empty_folder(
        os.path.join(args.output_dir, 'logs'))

    # SET LOGGER:
    if args.debug:
        rqconfig.debug = True
        logger.set_up_console_handler(debug=True)
    else:
        logger.set_up_console_handler()
    logger.set_up_file_handler(log_dir)
    logger.print_command_line([os.path.realpath(__file__)] + sys.argv[1:],
                              wrap_after=None)
    logger.start(args.blat, tmp_dir)

    UtilsPipeline.get_input_data_exist_error(args, logger)

    # THREADING:
    args.threads = UtilsPipeline.get_num_threads(args.threads, logger)

    if args.meta:
        logger.info(
            '\nYOU RUN QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES')

    # GET segregate FILES:
    if args.reference and args.gtf and len(args.reference) != len(args.gtf):
        logger.error('Numbers of references and gene databases are different',
                     exit_with_code=1)

    args.reference = \
        UtilsPipeline.get_single_file(args.reference, tmp_dir, 'reference', rqconfig.list_ext_fa, args.meta, logger)

    args.gtf = \
        UtilsPipeline.get_single_file(args.gtf, tmp_dir, 'gene_database', rqconfig.list_ext_gtf, args.meta, logger)

    # READ REFERENCE FROM MULTIFASTA:
    reference_dict = None
    ids_chrs = None
    # Stays None without a reference, so the ReadsCoverage call below no
    # longer hits an unbound name when only a reads alignment is given.
    genome_len = None
    if args.reference is not None:
        logger.print_timestamp()
        logger.info('Getting reference...')
        reference_dict = UtilsGeneral.list_to_dict(
            fastaparser.read_fasta(args.reference))
        logger.info('Done.')

        genome_len = UtilsGeneral.get_genome_len(reference_dict)

        ids_chrs = reference_dict.keys()

        # correction for fasta contained Y, W and etc:
        # for id_chr in ids_chrs:
        #     reference_dict[id_chr] = UtilsGeneral.correct_nucl_seq(reference_dict[id_chr])

    # for strand specific data we store + and - keys in dictionaries and only + for non strand specific data:
    strands = UtilsGeneral.get_strands(args, logger)

    if args.prokaryote:
        type_organism = 'prokaryotes'
    else:
        type_organism = 'eukaryotes'

    # USE ANNOTATION:
    sqlite3_db_genes = None
    sorted_exons_attr = None
    db_genes_metrics = None
    type_genes, type_isoforms, type_exons = \
        UtilsAnnotations.default_type_genes, \
        UtilsAnnotations.default_type_isoforms, \
        UtilsAnnotations.default_type_exons

    if args.gtf is not None or args.gene_db is not None:
        # the database label is the file name without its extension:
        if args.gene_db is not None:
            gene_db_name = os.path.split(args.gene_db)[1]
            label_db = gene_db_name[:gene_db_name.rfind('.db')]
        else:
            gtf_name = os.path.split(args.gtf)[1]
            label_db = gtf_name[:gtf_name.rfind('.g')]

            if ids_chrs is not None:
                args.gtf = UtilsAnnotations.clear_gtf_by_reference_chr(
                    args.gtf, ids_chrs, tmp_dir, label_db, logger)

        sqlite3_db_genes = \
            UtilsAnnotations.create_sqlite3_db(args.gene_db, args.gtf, label_db,
                                               args.disable_infer_genes, args.disable_infer_transcripts,
                                               args.output_dir, tmp_dir, logger)

        type_genes, type_isoforms, type_exons = \
            UtilsAnnotations.get_type_features(sqlite3_db_genes, UtilsAnnotations.default_type_genes,
                                               UtilsAnnotations.default_type_isoforms,
                                               UtilsAnnotations.default_type_exons, args.prokaryote, logger)

        # if UtilsAnnotations.default_type_exons == type_exons:
        #     type_organism = 'eukaryotes'
        # else:
        #     type_organism = 'prokaryotes'

        db_genes_metrics = GeneDatabaseMetrics.GeneDatabaseMetrics(
            sqlite3_db_genes, type_genes, type_isoforms, logger)

        # allow fake BLAT unions to span the longest annotated intron:
        ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT = db_genes_metrics.max_intron_len + 100
        logger.info(
            '\nSets maximum intron size equal {}. Default is 1500000 bp.\n'.
            format(ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT))

        # set exons starts / ends and ids for binning strategy:
        if ids_chrs is not None:
            sorted_exons_attr = \
                SortedExonsAttributes.SortedExonsAttributes(sqlite3_db_genes, type_exons, strands, ids_chrs, reference_dict, logger)

    reads_coverage = None
    if args.reads_alignment is not None or \
            ((args.single_reads is not None or (args.left_reads is not None and args.right_reads is not None))
             and args.reference is not None and sqlite3_db_genes is not None):
        # db_genes_metrics is None when no gene database was loaded; do not
        # dereference it in that case.
        tot_isoforms_len = None
        if db_genes_metrics is not None:
            tot_isoforms_len = db_genes_metrics.tot_isoforms_len
        reads_coverage = \
            ReadsCoverage.ReadsCoverage(args.reads_alignment, args.tophat, args.reference, args.single_reads,
                                        args.left_reads, args.right_reads, reference_dict, sqlite3_db_genes, type_isoforms,
                                        sorted_exons_attr, args.strand_specific, tot_isoforms_len,
                                        genome_len, tmp_dir, args.threads, WELL_FULLY_COVERAGE_THRESHOLDS, logger, log_dir)

    if args.transcripts is not None:
        # GET TRANSCRIPTS:
        transcripts_dicts = []
        for i_transcripts in range(len(args.transcripts)):
            logger.print_timestamp('  ')
            logger.info('  Getting transcripts from {}...'.format(
                args.transcripts[i_transcripts]))
            transcripts_dicts.append(
                UtilsGeneral.list_to_dict(
                    fastaparser.read_fasta(args.transcripts[i_transcripts])))
            logger.info('  Done.')

        # get labels for folders names and names of transcripts in reports:
        all_labels_from_dirs = False
        if args.labels is None:
            args.labels = UtilsPipeline.process_labels(args.transcripts,
                                                       args.labels,
                                                       all_labels_from_dirs)
    else:
        logger.warning('No transcripts. Use --transcripts option.')

    # GET PSL ALIGNMENT FILE:
    if args.alignment is None and args.reference is not None and args.transcripts is not None:
        if args.blat:
            args.alignment = UtilsTools.run_blat(None, args.reference,
                                                 transcripts_dicts,
                                                 args.labels, args.threads,
                                                 tmp_dir, logger, log_dir)
        else:
            args.alignment = UtilsTools.run_gmap(args.reference, genome_len,
                                                 args.transcripts, args.labels,
                                                 args.threads, args.gmap_index,
                                                 tmp_dir, logger, log_dir)

        #if args.fusion_misassemble_analyze:
        #    if not (args.left_reads is not None and args.right_reads is not None):
        #        logger.error('Usage: --left_reads LEFT_READS --right RIGHT_READS for analyse fusions and misassemblies',
        #                     exit_with_code=2, to_stderr=True)
        #        sys.exit(2)

    # FOR MISASSEMBLIES SEARCH:
    # GET DATABASE FOR FA ISOFORMS:
    args.blast = False
    if args.reference is not None and sqlite3_db_genes is not None and args.alignment is not None:
        blastn_run = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blastn')
        if not os.path.isfile(blastn_run):
            blastn_run = "blastn"

        if UtilsGeneral.which(blastn_run) is None:
            logger.warning(
                'blastn not found! Please add blastn to PATH for better MISASSEMBLIES metrics.'
            )
        else:
            args.blast = True

            isoforms_fa_path = os.path.join(tmp_dir,
                                            '{}.isoforms.fa'.format(label_db))
            isoforms_list = UtilsGeneral.dict_to_list(
                UtilsAnnotations.get_fa_isoforms(sqlite3_db_genes,
                                                 type_isoforms, type_exons,
                                                 reference_dict, logger))
            fastaparser.write_fasta(isoforms_fa_path, isoforms_list)

            isoforms_blast_db = UtilsTools.get_blast_db(
                isoforms_fa_path, label_db, tmp_dir, logger, log_dir)

    # LOGGING INPUT DATA:
    logger.print_input_files(args)

    # INITIALIZATION TRANSCRIPTS METRICS AND REPORTS:
    transcripts_metrics = []
    separated_reports = []
    if args.transcripts is not None:
        alignments_reports = []
        blast_alignments = []
        for i_transcripts in range(len(args.transcripts)):
            # INITIALIZE TRANSCRIPTS METRICS:
            #if args.sam_file is not None:
            #    sam_file_tmp = args.sam_file[i_transcripts]
            #else:
            transcripts_metrics.append(
                TranscriptsMetrics.TranscriptsMetrics(
                    args, args.labels[i_transcripts]))

            # INITIALIZE SEPARATED REPORTS:
            separated_reports.append(
                SeparatedReport.SeparatedReport(
                    args.labels[i_transcripts], args.output_dir,
                    transcripts_metrics[i_transcripts],
                    WELL_FULLY_COVERAGE_THRESHOLDS))

            logger.info()
            logger.info('Processing transcripts from {}:'.format(
                args.transcripts[i_transcripts]))

            # blast_alignments always gets one entry per transcripts file,
            # so it can be indexed by i_transcripts below:
            if args.blast:
                blast_alignments.append\
                    (UtilsTools.align_transcripts_to_isoforms_by_blastn
                     (args.transcripts[i_transcripts], isoforms_blast_db, tmp_dir, args.labels[i_transcripts], logger, log_dir))
            else:
                blast_alignments.append(None)

            # PROCESS TRANSCRIPTS ALIGNMENTS:
            # NOTE(review): alignments_reports only grows inside this branch
            # but is indexed by i_transcripts - this assumes simple_metrics
            # is either set for every transcripts file or for none; confirm.
            if transcripts_metrics[i_transcripts].simple_metrics is not None:
                # GET FILES WITH ALIGNMENTS REPORTS:
                alignments_reports.append\
                    (UtilsAlignment.AlignmentsReport.get_alignments_report
                     (args.labels[i_transcripts], args.alignment[i_transcripts], blast_alignments[i_transcripts],
                      transcripts_dicts[i_transcripts], tmp_dir, args.min_alignment, logger, ALIGNMENT_THRESHOLDS))

                # UPDATE METRICS BY ASSEMBLED TRANSCRIPTS:
                transcripts_metrics[i_transcripts].processing_assembled_psl_file\
                    (alignments_reports[i_transcripts].blat_report.assembled_psl_file, sorted_exons_attr,
                     args.strand_specific, logger, sqlite3_db_genes, type_isoforms, WELL_FULLY_COVERAGE_THRESHOLDS)

                # UPDATE METRICS BY MISASSEMBLED TRANSCRIPTS:
                # by blat:
                transcripts_metrics[i_transcripts].processing_misassembled_psl_file\
                    (alignments_reports[i_transcripts].blat_report.misassembled_psl_union_file, logger, True)
                # by blast:
                if args.blast:
                    transcripts_metrics[i_transcripts].processing_misassembled_psl_file\
                        (alignments_reports[i_transcripts].blast6_report.misassembled_blast6_union_file, logger, False)

            # GET METRICS:
            transcripts_metrics[i_transcripts].get_transcripts_metrics\
                (args, type_organism, reference_dict, args.transcripts[i_transcripts], transcripts_dicts[i_transcripts],
                 args.labels[i_transcripts], args.threads, sqlite3_db_genes, db_genes_metrics, reads_coverage, logger,
                 tmp_dir, log_dir, WELL_FULLY_COVERAGE_THRESHOLDS, rqconfig.TRANSCRIPT_LENS)

            # GET SEPARATED REPORT:
            separated_reports[i_transcripts].get_separated_report\
                (args, args.labels[i_transcripts], transcripts_dicts[i_transcripts], transcripts_metrics[i_transcripts],
                 db_genes_metrics, reads_coverage, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, rqconfig.TRANSCRIPT_LENS)

    # GET COMPARISON REPORT:
    # (built for any number of separated reports other than exactly one)
    comparison_report = None
    if len(separated_reports) != 1:
        comparison_report = ComparisonReport.ComparisonReport()
        comparison_report.get_comparison_report(
            args, args.output_dir, args.labels, transcripts_metrics,
            db_genes_metrics, reads_coverage, logger,
            WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
            rqconfig.TRANSCRIPT_LENS)

    # GET SHORT REPORT:
    short_report = \
        ShortReport.ShortReport(args, db_genes_metrics, transcripts_metrics, args.output_dir, separated_reports,
                                comparison_report, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
                                rqconfig.TRANSCRIPT_LENS)

    # REMOVE TEMPORARY DIRECTORY FROM OUTPUT DIRECTORY:
    if os.path.exists(tmp_dir) and not args.debug:
        logger.debug('Remove temporary directory {}'.format(tmp_dir))
        shutil.rmtree(tmp_dir)
        logger.debug('Done.')

    # LOGGING RESULTS PATHES:
    logger.print_path_results(args, separated_reports, comparison_report,
                              short_report)

    if args.debug:
        UtilsGeneral.profile_memory(args, reference_dict, db_genes_metrics,
                                    transcripts_metrics, separated_reports,
                                    comparison_report, logger)

    # FINISH LOGGING:
    logger.finish_up()
Esempio n. 10
0
def _run_redirected(argv, stdout_path, stdout_mode, stderr_path):
    """Run ``argv`` without a shell, sending stdout and stderr to files.

    stdout is opened with ``stdout_mode`` ('a' to append, 'w' to overwrite);
    stderr is always appended to ``stderr_path``.

    Returns the exit code of the process.  When the program cannot be
    started, the reason is appended to the stderr log and 127 is returned;
    when the redirect targets cannot be opened, 1 is returned.  Both mirror
    the non-zero status a shell would report, so callers only have to check
    the exit code.
    """
    try:
        with open(stdout_path, stdout_mode) as out_handle, \
                open(stderr_path, 'a') as err_handle:
            try:
                return subprocess.call(argv, stdout=out_handle,
                                       stderr=err_handle)
            except OSError as error:
                err_handle.write('{}: {}\n'.format(argv[0], error))
                return 127
    except (IOError, OSError):
        return 1


def run_gmap(args_reference, genome_len, args_transcripts, args_labels,
             args_threads, args_gmap_index, tmp_dir, logger, log_dir):
    """Align every transcript file to the reference with GMAP.

    The reference is first passed through UtilsGeneral.get_upper_case_fasta.
    If ``args_gmap_index`` is None an index is built in ``tmp_dir`` with
    gmap_build, otherwise the given index is symlinked into ``tmp_dir``.
    Each entry of ``args_transcripts`` is then aligned and the output is
    written to ``<tmp_dir>/<label>.psl``.

    Commands are executed as argument lists (no shell), so paths containing
    spaces or shell metacharacters are passed through unchanged.

    :param args_reference: path to the reference FASTA file
    :param genome_len: genome length; selects 'gmap' below 2**32, else 'gmapl'
    :param args_transcripts: paths of the transcript files to align
    :param args_labels: labels, parallel to ``args_transcripts``
    :param args_threads: value passed to the aligner's ``-t`` option
    :param args_gmap_index: path of an existing GMAP index, or None
    :param tmp_dir: directory for the index and the alignment files
    :param logger: project logger; ``logger.error`` is called with
        ``exit_with_code`` on failure (presumably terminating the run)
    :param log_dir: directory receiving the tool logs
    :return: list of alignment file paths, in the order of ``args_transcripts``
    """
    args_alignment = []

    gmap_run = 'gmap' if genome_len < 2**32 else 'gmapl'
    gmap_build = 'gmap_build'

    gmap_build_logger_out_path = os.path.join(log_dir, gmap_build + '.out.log')
    gmap_build_logger_err_path = os.path.join(log_dir, gmap_build + '.err.log')

    args_reference = UtilsGeneral.get_upper_case_fasta(args_reference, tmp_dir,
                                                       logger)

    # Index name: the reference file name up to its last '.f' (e.g. '.fa',
    # '.fasta').  Without such a suffix the whole name is used; slicing with
    # rfind's -1 would silently drop the last character.
    reference_name = os.path.basename(args_reference)
    suffix_pos = reference_name.rfind('.f')
    if suffix_pos == -1:
        ref_label = reference_name
    else:
        ref_label = reference_name[:suffix_pos]

    index_path = os.path.join(tmp_dir, ref_label)

    if args_gmap_index is None:
        # create index (gmap_build):
        logger.print_timestamp()
        logger.info('Creating genome index by {}...'.format(gmap_build))

        start_time = datetime.datetime.now()

        exit_code = _run_redirected(
            [gmap_build, '-D', tmp_dir, '-d', ref_label, args_reference],
            gmap_build_logger_out_path, 'a', gmap_build_logger_err_path)

        logger.info('  logs can be found in {} and {}.'.format(
            gmap_build_logger_out_path, gmap_build_logger_err_path))

        if exit_code != 0:
            logger.error(message='{} failed!'.format(gmap_build),
                         exit_with_code=exit_code,
                         to_stderr=True)

        spent_time = datetime.datetime.now() - start_time

        logger.info('  saved to {}'.format(index_path))

        logger.info('\nGMAP_BUILD TIME: {}\n\n'.format(spent_time))
    else:
        # Reuse the supplied index; as before, the result of `ln` is ignored.
        subprocess.call(['ln', '-s', args_gmap_index, index_path])

    # align (gmap):
    for i_transcripts, transcripts_path in enumerate(args_transcripts):
        label = args_labels[i_transcripts]
        gmap_run_logger_err_path = os.path.join(
            log_dir, gmap_run + '.' + label + '.err.log')

        logger.print_timestamp()
        logger.info('Aligning {} to {}...'.format(label, ref_label))

        alignment_psl_path = os.path.join(tmp_dir, label + '.psl')

        start_time = datetime.datetime.now()

        # stdout (the alignment) overwrites the output file, stderr is
        # appended to the per-label log.
        exit_code = _run_redirected(
            [gmap_run, '-D', tmp_dir, '-d', ref_label, transcripts_path,
             '--format=1', '-t', str(args_threads), '-O'],
            alignment_psl_path, 'w', gmap_run_logger_err_path)

        logger.info(
            '  log can be found in {}.'.format(gmap_run_logger_err_path))

        if exit_code != 0:
            logger.error(message='{} failed for {}!'.format(gmap_run, label),
                         exit_with_code=exit_code,
                         to_stderr=True)

        spent_time = datetime.datetime.now() - start_time

        args_alignment.append(alignment_psl_path)

        logger.info('  saved to {}'.format(alignment_psl_path))

        logger.info('\nGMAP TIME: {}\n\n'.format(spent_time))

    return args_alignment
Esempio n. 11
0
def get_internal_exons_faster(sqlite3_db_genes, sorted_exons_attr,
                              alignment_t_starts, alignment_t_ends, strand,
                              id_chr):
    """Return the exons selected by the blocks of one alignment.

    For every block (pair of entries from ``alignment_t_starts`` and
    ``alignment_t_ends``) two id sets are built from the pre-sorted exon
    tables of ``sorted_exons_attr``:

    * ids taken from ``ids_by_end`` starting at the position found for the
      block start among the sorted exon ends;
    * ids taken from ``ids_by_start`` up to the position found for the block
      end among the sorted exon starts.

    Ids present in both sets are accumulated over all blocks (presumably the
    exons overlapping the block -- verify against get_bin_indexes and
    get_bin_search_position_of_element).  Each accumulated id is looked up in
    ``sqlite3_db_genes`` exactly once, after all blocks were processed.

    :return: list of the looked-up objects, de-duplicated through a set, so
        the order is unspecified
    """
    ids_internal_exons = set()

    for block_start, block_end in zip(alignment_t_starts, alignment_t_ends):
        # ids of exons selected by their END relative to the block start:
        sorted_ends = sorted_exons_attr.sort_target_ends[str(strand)][id_chr]
        bin_lo, bin_hi = get_bin_indexes(
            block_start, sorted_ends,
            sorted_exons_attr.index_sort_ends[str(strand)][id_chr],
            sorted_exons_attr.index_step[id_chr])
        if bin_lo is not None and bin_hi is not None:
            # search only inside the bin, then shift back to a global index
            begin = UtilsGeneral.get_bin_search_position_of_element(
                sorted_ends[bin_lo:bin_hi], block_start) + bin_lo
            ids_ends_set = set(
                sorted_exons_attr.ids_by_end[strand][id_chr][begin:])
        else:
            ids_ends_set = set()

        # ids of exons selected by their START relative to the block end:
        sorted_starts = \
            sorted_exons_attr.sort_target_starts[str(strand)][id_chr]
        bin_lo, bin_hi = get_bin_indexes(
            block_end, sorted_starts,
            sorted_exons_attr.index_sort_starts[str(strand)][id_chr],
            sorted_exons_attr.index_step[id_chr])
        if bin_lo is not None and bin_hi is not None:
            end = UtilsGeneral.get_bin_search_position_of_element(
                sorted_starts[bin_lo:bin_hi], block_end) + bin_lo
            scan_starts = sorted_exons_attr.sort_target_starts[strand][id_chr]
            # NOTE(review): this compares the index `end` with the coordinate
            # stored at that index; comparing against the block end may have
            # been intended.  Kept unchanged -- confirm before altering.
            while end != len(scan_starts) and end == scan_starts[end]:
                end += 1
            ids_starts_set = set(
                sorted_exons_attr.ids_by_start[strand][id_chr][:end])
        else:
            # no bin found: every exon start is accepted
            ids_starts_set = set(
                sorted_exons_attr.ids_by_start[strand][id_chr])

        ids_internal_exons |= ids_ends_set & ids_starts_set

    # One database lookup per id, instead of re-fetching the whole
    # accumulated id set after every block.
    internal_exons = {sqlite3_db_genes[id_exon]
                      for id_exon in ids_internal_exons}

    return list(internal_exons)
Esempio n. 12
0
def get_arguments():
    """Build the rnaQUAST command line parser and parse ``sys.argv``.

    When the script is started without any argument, the help text is
    printed and the process exits with status 0.  The help is produced by
    the parser itself instead of re-running the script through a hard-coded
    ``python2`` shell command, so it works regardless of the interpreter
    name and of spaces in the script path.

    :return: the ``argparse.Namespace`` produced by ``parse_args()``
    """
    version, _ = UtilsGeneral.get_version(rqconfig.rnaQUAST_LOCATION)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="QUALITY ASSESSMENT FOR TRANSCRIPTOME ASSEMBLIES %(prog)s v.{}"
                    "\n\nUsage:\npython2 %(prog)s --transcripts TRANSCRIPTS --reference REFERENCE --gtf GENE_COORDINATES".format(version),
        epilog='Don\'t forget to add GMAP (or BLAT) to PATH.',
        conflict_handler='resolve',
        prog=sys.argv[0])

    # INPUT DATA:
    group_input_data = parser.add_argument_group('Input data')
    group_input_data.add_argument(
        '-r', '--reference',
        help='Single file (or several files for meta RNA) with reference genome in FASTA format '
             'or *.txt file with one-per-line list of FASTA files with reference sequences',
        type=str, nargs='+')
    group_input_data.add_argument(
        '--gtf',
        help='File with gene coordinates (or several files or *.txt file with one-per-line '
             'list of GTF / GFF files for meta RNA). '
             'We recommend to use files downloaded from GENCODE or Ensembl [GTF/GFF]',
        type=str, nargs='+')
    group_input_data.add_argument(
        '--gene_db',
        help='Path to the gene database generated by gffutils to be used',
        type=str)
    group_input_data.add_argument(
        '-c', '--transcripts',
        help='File(s) with transcripts [FASTA]',
        type=str, nargs='+')
    group_input_data.add_argument(
        '-psl', '--alignment',
        help='File(s) with transcript alignments to the reference genome [PSL]',
        type=str, nargs='+')
    group_input_data.add_argument(
        '-sam', '--reads_alignment',
        help='File with read alignments to the reference genome [SAM]')
    group_input_data.add_argument(
        '-1', '--left_reads',
        help='File with forward paired-end reads [FASTQ or gzip-compressed]',
        type=str)
    group_input_data.add_argument(
        '-2', '--right_reads',
        help='File with reverse paired-end reads [FASTQ or gzip-compressed]',
        type=str)
    group_input_data.add_argument(
        '-s', '--single_reads',
        help='File with unpaired reads [FASTQ or gzip-compressed]',
        type=str)
    group_input_data.add_argument(
        '--gmap_index',
        help='Folder containing GMAP index for the reference genome')

    # BASIC OPTIONS:
    group_basic = parser.add_argument_group('Basic options')
    group_basic.add_argument(
        '-o', '--output_dir',
        help='Directory to store all results [default: rnaQUAST_results/results_<datetime>]',
        type=str)
    # NOTE(review): "rnaOUAST_test_output" below looks like a typo for
    # "rnaQUAST_test_output"; the text is kept as is until the real output
    # directory name is confirmed.
    group_basic.add_argument(
        '--test',
        help='Run rnaQUAST on the test data from the test_data folder, output directory is rnaOUAST_test_output',
        action='store_true')
    group_basic.add_argument(
        '-d', '--debug',
        help='Report detailed information, typically used only for detecting problems.',
        action='store_true')

    # ADVANCED OPTIONS:
    group_advanced = parser.add_argument_group('Advanced options')
    group_advanced.add_argument(
        '-t', '--threads',
        help='Maximum number of threads, default: min(number of CPUs / 2, 16)',
        type=int)
    group_advanced.add_argument(
        '-l', '--labels',
        help='Name(s) of assemblies that will be used in the reports',
        type=str, nargs='+')
    group_advanced.add_argument(
        '-ss', '--strand_specific',
        help='Set if transcripts were assembled using strand-specific RNA-Seq data',
        action='store_true')
    group_advanced.add_argument(
        '--min_alignment',
        help='Minimal alignment length, default: %(default)s',
        type=int, default=50, required=False)
    group_advanced.add_argument(
        '--no_plots',
        help='Do not draw plots (to speed up computation)',
        action='store_true')
    group_advanced.add_argument(
        '--blat',
        help='Run with BLAT alignment tool (http://hgwdev.cse.ucsc.edu/~kent/exe/) instead of GMAP',
        action='store_true')
    group_advanced.add_argument(
        '--tophat',
        help='Run with TopHat tool (https://ccb.jhu.edu/software/tophat/index.shtml) instead of STAR',
        action='store_true')
    group_advanced.add_argument(
        '--gene_mark',
        help='Run with GeneMarkS-T tool (http://topaz.gatech.edu/GeneMark/)',
        action='store_true')
    group_advanced.add_argument(
        '--meta',
        help='Run QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES',
        action='store_true')
    group_advanced.add_argument(
        '--lower_threshold',
        help='Lower threshold for x-assembled/covered/matched metrics, default: %(default)s',
        type=float, default=0.5, required=False)
    group_advanced.add_argument(
        '--upper_threshold',
        help='Upper threshold for x-assembled/covered/matched metrics, default: %(default)s',
        type=float, default=0.95, required=False)

    # GFFUTILS OPTIONS:
    group_gffutils = parser.add_argument_group('Gffutils related options')
    group_gffutils.add_argument(
        '--disable_infer_genes',
        help='Use this option if your GTF file already contains genes records',
        action='store_true')
    group_gffutils.add_argument(
        '--disable_infer_transcripts',
        help='Use this option if your GTF already contains transcripts records',
        action='store_true')

    # BUSCO OPTIONS:
    group_busco = parser.add_argument_group('BUSCO related options')
    group_busco.add_argument(
        '--busco_lineage',
        help='Run with BUSCO tool (http://busco.ezlab.org/). Path to the BUSCO lineage data to be used (Eukaryota, Metazoa, Arthropoda, Vertebrata or Fungi)',
        type=str)

    # Registered last on purpose: keeps the option order of the usage line.
    group_advanced.add_argument(
        '--prokaryote',
        help='Use this option if the genome is prokaryotic',
        action='store_true')

    # use --help for running without arguments:
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    return parser.parse_args()
Esempio n. 13
0
def remove_low_complexity_tail(union_lines, union_alignments, single_transcript_lines, single_transcript_alignments,
                               transcript_seq, out_low_complexity_file, threshold_block_len, end_tail):
    """Trim low-complexity blocks from one end of a chain of alignments.

    Blocks are visited from the end of the chain (``end_tail`` true) or from
    its beginning (``end_tail`` false).  Their query sequences are
    accumulated until at least ``threshold_block_len`` bases are collected;
    the accumulated sequence is then tested with ``is_low_complexity``:

    * low complexity: the sequence is appended to ``out_low_complexity_file``
      and the scan goes on (the buffer is reset only when it is strictly
      longer than the threshold);
    * otherwise: the chain is cut at the current block, the part already
      scanned is dropped, and the result is returned.

    ``single_transcript_lines`` / ``single_transcript_alignments`` are
    updated IN PLACE so that they reflect the trimmed chain.

    :param union_lines: PSL lines of the chain, parallel to union_alignments
    :param union_alignments: alignment objects of the chain
    :param transcript_seq: sequence of the aligned transcript
    :param end_tail: boolean flag, True to trim the end of the chain, False
        to trim its beginning
    :return: tuple (clear_union_lines, clear_union_alignments,
        single_transcript_lines, single_transcript_alignments); the first two
        are empty lists when no block passed the complexity test
    """
    n_alignments = len(union_alignments)
    # Reverse complement of the transcript, needed for '-' alignments only;
    # computed once on first use instead of once per block.
    rev_comp_seq = None

    block_seq = ''
    for i in range(n_alignments):
        i_alignment = n_alignments - 1 - i if end_tail else i
        psl_alignment = union_alignments[i_alignment]
        last_block = psl_alignment.blocks_num - 1

        for j in range(psl_alignment.blocks_num):
            i_block = last_block - j if end_tail else j

            start = psl_alignment.query_fragment.starts[i_block]
            end = psl_alignment.query_fragment.ends[i_block]

            if psl_alignment.strand == '+':
                block_seq += transcript_seq[start:end + 1]
            else:
                if rev_comp_seq is None:
                    rev_comp_seq = UtilsGeneral.rev_comp(transcript_seq)
                block_seq += rev_comp_seq[start:end + 1]

            # too short to judge: keep accumulating
            if len(block_seq) < threshold_block_len:
                continue

            if is_low_complexity(block_seq):
                fastaparser.write_fasta(out_low_complexity_file, [('{}_block'.format(psl_alignment.query_fragment.name), block_seq)], mode='a')
                if len(block_seq) > threshold_block_len:
                    block_seq = ''
                continue

            # First informative block found: cut the chain here.
            outermost_block = last_block if end_tail else 0
            if i_block == outermost_block:
                # nothing was skipped inside this alignment, keep it whole
                latest_alignment = psl_alignment
                latest_line = union_lines[i_alignment]
            else:
                if end_tail:
                    latest_alignment = psl_alignment.get_split_alignment(0, i_block)
                else:
                    latest_alignment = psl_alignment.get_split_alignment(i_block, last_block)
                latest_line = latest_alignment.get_psl_line_from_alignment()

            if end_tail:
                clear_union_alignments = union_alignments[:i_alignment] + [latest_alignment]
                clear_union_lines = union_lines[:i_alignment] + [latest_line]
            else:
                clear_union_alignments = [latest_alignment] + union_alignments[i_alignment + 1:]
                clear_union_lines = [latest_line] + union_lines[i_alignment + 1:]

            # update single transcript lines / alignments:
            for line, alignment in zip(union_lines, union_alignments):
                if line not in clear_union_lines:
                    single_transcript_lines.remove(line)
                    single_transcript_alignments.remove(alignment)
            for line, alignment in zip(clear_union_lines, clear_union_alignments):
                if line not in single_transcript_lines:
                    single_transcript_lines.append(line)
                    single_transcript_alignments.append(alignment)

            return clear_union_lines, clear_union_alignments, single_transcript_lines, single_transcript_alignments

    # Every block was skipped: drop the whole chain.
    for line, alignment in zip(union_lines, union_alignments):
        single_transcript_lines.remove(line)
        single_transcript_alignments.remove(alignment)

    return [], [], single_transcript_lines, single_transcript_alignments
Esempio n. 14
0
def get_union_fake_blat_alignment(alignment0, alignment1):
    """Combine two alignments into one new PSLFileAlignment.

    The strand of ``alignment0`` decides the roles: on '+' ``alignment0`` is
    the leading alignment and ``alignment1`` the trailing one, otherwise the
    roles are swapped.  The leading alignment is cut where the first query
    block of the trailing one falls into its block starts, the overlap
    between the two is subtracted from the match count and from the junction
    block, and the two block lists are concatenated (the junction blocks are
    merged into one when query and target overlaps are equal and
    non-negative).

    :return: the combined alignment; its query / target fragments are filled
        by set_union_query_fragment_attributes and
        set_union_target_fragment_attributes
    """
    merged = Alignment.PSLFileAlignment()
    strand = alignment0.strand
    is_forward = strand == '+'

    if is_forward:
        leading, trailing = alignment0, alignment1
    else:
        leading, trailing = alignment1, alignment0

    # for cross more than one block or equal one block:
    cut_block = UtilsGeneral.get_bin_search_position_of_element(
        leading.query_fragment.starts, trailing.query_fragment.starts[0])
    if cut_block != leading.blocks_num:
        leading = leading.get_split_alignment(0, cut_block - 1)
        # the trimmed alignment replaces the original one from here on
        if is_forward:
            alignment0 = leading
        else:
            alignment1 = leading

    # The query overlap is always measured from alignment0 to alignment1,
    # the target overlap from the leading to the trailing alignment.
    query_overlap = alignment0.query_fragment.end - alignment1.query_fragment.start + 1
    target_overlap = leading.target_fragment.end - trailing.target_fragment.start + 1
    cross_bases = max(query_overlap, target_overlap, 0)

    # junction blocks can be fused only for equal, non-negative overlaps
    f_together = query_overlap == target_overlap and query_overlap >= 0

    merged.matches = alignment0.matches + alignment1.matches - cross_bases
    merged.mismatches = alignment0.mismatches + alignment1.mismatches
    merged.repmatches = alignment0.repmatches + alignment1.repmatches
    merged.n_num = alignment0.n_num + alignment1.n_num
    merged.strand = strand

    total_blocks = alignment0.blocks_num + alignment1.blocks_num
    if f_together:
        total_blocks -= 1
    merged.blocks_num = total_blocks

    # last block of the leading alignment, shortened by the overlap ...
    junction_size = leading.blocks_sizes[-1] - cross_bases
    if f_together:
        # ... and fused with the first block of the trailing alignment
        junction_size += trailing.blocks_sizes[0]
        remaining_sizes = trailing.blocks_sizes[1:]
    else:
        remaining_sizes = trailing.blocks_sizes[:]
    merged.blocks_sizes = leading.blocks_sizes[:-1] + [junction_size] + remaining_sizes

    set_union_query_fragment_attributes(alignment0.query_fragment, alignment1.query_fragment,
                                        merged.query_fragment, strand, f_together, cross_bases)

    set_union_target_fragment_attributes(alignment0.target_fragment, alignment1.target_fragment,
                                         merged.target_fragment, strand, f_together, cross_bases)

    return merged