def run_blat(args_database, args_reference, transcripts_dicts, args_labels, args_threads, tmp_dir, logger, log_dir):
    blat_run = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blat')
    if not os.path.isfile(blat_run):
        blat_run = "blat"

    if UtilsGeneral.which(blat_run) is None:
        logger.error('BLAT not found! Please add BLAT to PATH for ALIGNMENT metrics.')
    else:
        # for single file with scaffolds/patches/chromosomes:
        # split big single file with reference to files with scaffolds/patches/chromosomes;
        # this saves us from a BLAT segfault for references larger than 4g:
        if args_database is None and os.path.getsize(args_reference) >= 4294967296:
            args_database = get_database_split_chr(tmp_dir, args_reference, logger)

        if os.path.getsize(args_reference) < 4294967296:
            args_reference = UtilsGeneral.get_upper_case_fasta(args_reference, tmp_dir, logger)
            reference_pathes = [args_reference]
        else:
            # get upper case (don't mask repeats):
            args_database = get_upper_case_database_split_chr(args_database, tmp_dir, logger)
            reference_pathes = args_database

        # RUN BLAT:
        args_alignment = []
        for i_transcripts in range(len(transcripts_dicts)):
            start_time = datetime.datetime.now()

            args_alignment.append(
                parallel_blat_run.parallel_blat_run(
                    transcripts_dicts[i_transcripts], reference_pathes, args_threads,
                    tmp_dir, args_labels[i_transcripts], logger, log_dir))

            end_time = datetime.datetime.now()
            spent_time = end_time - start_time
            logger.info('\nBLAT TIME: {}\n\n'.format(spent_time))

            # args.alignment.append(UtilsPipeline.align_fa_transcripts_to_psl_by_blat
            #                       (args.transcripts[i_transcripts], reference_pathes, args.output_dir,
            #                        args.labels[i_transcripts]))

        return args_alignment
def is_suspected_fusion(self, best_union_alignments, fusion_err_threshhold):
    strands = ['+', '-']

    q_starts = []
    q_ends = []
    t_starts = {'+': {}, '-': {}}
    t_ends = {'+': {}, '-': {}}
    t_starts_sort_index = {'+': {}, '-': {}}
    t_starts_sort_array = {'+': {}, '-': {}}
    t_ends_sort_index = {'+': {}, '-': {}}
    t_ends_sort_array = {'+': {}, '-': {}}

    for i in range(len(best_union_alignments)):
        q_starts.append(best_union_alignments[i].query_fragment.start)
        q_ends.append(best_union_alignments[i].query_fragment.end)

        strand = best_union_alignments[i].strand
        id_chr = best_union_alignments[i].target_fragment.name
        t_start = best_union_alignments[i].target_fragment.start
        t_end = best_union_alignments[i].target_fragment.end

        # group target coordinates per strand and per chromosome:
        if id_chr not in t_starts[strand]:
            t_starts[strand][id_chr] = []
            t_ends[strand][id_chr] = []
        t_starts[strand][id_chr].append(t_start)
        t_ends[strand][id_chr].append(t_end)

    q_sort_index, q_sort_array = UtilsGeneral.get_order_indexes_elements(q_starts)
    for i in range(len(q_sort_index) - 1):
        i_alignment0 = q_sort_index[i]
        i_alignment1 = q_sort_index[i + 1]
        start = min(q_ends[i_alignment0], q_ends[i_alignment1])
        end = max(q_starts[i_alignment0], q_starts[i_alignment1])
        if abs(end - start) + 1 > fusion_err_threshhold:
            return 0

    for strand in strands:
        for id_chr in t_starts[strand]:
            t_starts_sort_index[strand][id_chr], t_starts_sort_array[strand][id_chr] = \
                UtilsGeneral.get_order_indexes_elements(t_starts[strand][id_chr])
            t_ends_sort_index[strand][id_chr], t_ends_sort_array[strand][id_chr] = \
                UtilsGeneral.get_order_indexes_elements(t_ends[strand][id_chr])

            if t_starts_sort_index[strand][id_chr] != t_ends_sort_index[strand][id_chr]:
                return 0

    return 1
def get_best_alignment_set(transcript_alignments, ALIGNMENT_THRESHOLDS):
    # logger.debug('      Getting best union alignments...')
    best_score = -float('Inf')
    best_b = None

    q_ends = []
    for i in range(len(transcript_alignments)):
        q_ends.append(transcript_alignments[i].query_fragment.end)
    q_ends_sort_index, q_ends_sort_array = UtilsGeneral.get_order_indexes_elements(q_ends)

    best_list = [[]]
    scores = {str([]): 0}
    for i in q_ends_sort_index:
        a_i = transcript_alignments[i]

        best_i, curr_best_score = get_best_i(best_list, a_i, scores, ALIGNMENT_THRESHOLDS)

        best_list.append(best_i)
        scores[str(best_i)] = curr_best_score

        if curr_best_score >= best_score:
            best_b = best_i
            best_score = curr_best_score

    return best_b
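# Hedged sketch (not part of the pipeline): UtilsGeneral.get_order_indexes_elements, used above and
# throughout this module, is assumed -- from the way callers unpack its result -- to return an
# argsort-style index list together with the sorted values. The real helper may be implemented
# differently; this stand-alone version only illustrates the expected contract.
def _get_order_indexes_elements_sketch(elements):
    # indexes that would sort `elements`, plus the sorted values themselves:
    sort_index = sorted(range(len(elements)), key=lambda i: elements[i])
    sort_array = [elements[i] for i in sort_index]
    return sort_index, sort_array

# Example: _get_order_indexes_elements_sketch([30, 10, 20]) -> ([1, 2, 0], [10, 20, 30])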
def print_version(self, program_name, version="unknown", build="unknown", location=None, to_stderr=False):
    if location:
        version, build = UtilsGeneral.get_version(location)
    if to_stderr:
        sys.stderr.write(program_name + ": " + str(version) +
                         (", " + str(build) if build != "unknown" else ""))
    else:
        self.info(program_name + ": " + str(version) +
                  (", " + str(build) if build != "unknown" else ""))
def print_tools_versions(self, blat, tmp_dir, to_stderr=False):
    import matplotlib, joblib, gffutils

    self._logger.info('External tools:')

    self.print_version(' matplotlib', version=matplotlib.__version__, to_stderr=to_stderr)
    self.print_version(' joblib', version=joblib.__version__, to_stderr=to_stderr)
    self.print_version(' gffutils', version=gffutils.__version__, to_stderr=to_stderr)

    # BLAST+ (blastn, makeblastdb):
    version, build = UtilsGeneral.get_version_by_key('blastn', '-version', tmp_dir,
                                                     v_ident='blastn: ', b_ident='build ')
    self.print_version(' blastn', version=version, to_stderr=to_stderr)

    version, build = UtilsGeneral.get_version_by_key('makeblastdb', '-version', tmp_dir,
                                                     v_ident='makeblastdb: ', b_ident='build ')
    self.print_version(' makeblastdb', version=version, to_stderr=to_stderr)

    # blat
    if blat:
        version, build = UtilsGeneral.get_version_by_key('blat', '', tmp_dir,
                                                         v_ident='blat - Standalone BLAT v. ')
        self.print_version(' blat', version=version, to_stderr=to_stderr)
    else:
        # GMAP
        version, build = UtilsGeneral.get_version_by_key('gmap', '--version', tmp_dir,
                                                         v_ident='Part of GMAP package, version ',
                                                         b_ident='Build target: ')
        self.print_version(' gmap', version=version, to_stderr=to_stderr)
def get_union_fake_blat_alignments(union_lines, union_alignments, single_transcript_lines,
                                   single_transcript_alignments, fout_fake_blat, ALIGNMENT_THRESHOLDS):
    # logger.debug('      Getting fake blat alignments...')
    new_union_alignments = []
    new_union_lines = []

    q_starts = []
    for alignment in union_alignments:
        q_starts.append(alignment.query_fragment.start)
    sort_index, sort_array = UtilsGeneral.get_order_indexes_elements(q_starts)

    i_alignment0 = sort_index[0]
    alignment0 = union_alignments[i_alignment0]
    alignment_line0 = union_lines[i_alignment0]

    if len(sort_index) == 1:
        new_union_alignments.append(alignment0)
        new_union_lines.append(alignment_line0)

    for i in range(len(sort_index) - 1):
        i_alignment1 = sort_index[i + 1]
        alignment1 = union_alignments[i_alignment1]
        alignment_line1 = union_lines[i_alignment1]

        if best_alignment_set.is_union_fake_blat(alignment0, alignment1, ALIGNMENT_THRESHOLDS):
            fout_fake_blat.write(alignment_line0 + '\n' + alignment_line1 + '\n\n\n')

            alignment0 = get_union_fake_blat_alignment(alignment0, alignment1)
            alignment_line0 = alignment0.get_psl_line_from_alignment()
        else:
            new_union_alignments.append(alignment0)
            new_union_lines.append(alignment_line0)

            i_alignment0 = sort_index[i + 1]
            alignment0 = union_alignments[i_alignment0]
            alignment_line0 = union_lines[i_alignment0]

        if i + 1 == len(sort_index) - 1:
            new_union_alignments.append(alignment0)
            new_union_lines.append(alignment_line0)

    # update single transcript lines / alignments:
    if len(union_lines) != len(new_union_lines):
        for i_alignment in range(len(union_lines)):
            if union_lines[i_alignment] not in new_union_lines:
                single_transcript_lines.remove(union_lines[i_alignment])
                single_transcript_alignments.remove(union_alignments[i_alignment])
        for i_alignment in range(len(new_union_lines)):
            if new_union_lines[i_alignment] not in single_transcript_lines:
                single_transcript_lines.append(new_union_lines[i_alignment])
                single_transcript_alignments.append(new_union_alignments[i_alignment])

    # logger.debug('      Done.')

    return new_union_lines, new_union_alignments, single_transcript_lines, single_transcript_alignments
def get_fa_isoforms(sqlite3_db_genes, type_isoforms, type_exons, reference_dict, logger):
    logger.print_timestamp()
    logger.info("Extracting isoforms sequences...")

    inconsistent_ref_db = False

    isoforms_dict = {}
    isoforms = list(sqlite3_db_genes.features_of_type(type_isoforms))
    for transcript in isoforms:
        if transcript.seqid not in reference_dict:
            continue

        isoforms_dict[transcript.id] = ''

        exons = list(sqlite3_db_genes.children(transcript.id, featuretype=type_exons, order_by='start'))
        # for prokaryotes:
        if len(exons) == 0:
            exons = [transcript]

        for exon in exons:
            # start, end: 1-based coordinates; start must be <= end
            isoforms_dict[transcript.id] += reference_dict[exon.seqid][exon.start - 1:exon.end]

        if transcript.strand == '-':
            isoforms_dict[transcript.id] = UtilsGeneral.rev_comp(isoforms_dict[transcript.id])

        if len(isoforms_dict[transcript.id]) == 0:
            inconsistent_ref_db = True
            logger.debug('Inconsistent length chromosome / scaffold and transcript start / end: {} skipped'.format(transcript.id))
            isoforms_dict.pop(transcript.id)

    logger.info('Done.')

    if inconsistent_ref_db:
        logger.warning('Inconsistent reference sequences and genes database')

    return isoforms_dict
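# Hedged sketch (illustration only): get_fa_isoforms above relies on UtilsGeneral.rev_comp to
# reverse-complement minus-strand isoforms. A minimal stand-in with the behaviour the caller assumes
# is shown here; the real helper may handle IUPAC codes or letter case differently.
_COMPLEMENT_SKETCH = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N',
                      'a': 't', 't': 'a', 'g': 'c', 'c': 'g', 'n': 'n'}

def _rev_comp_sketch(seq):
    # complement every base, then reverse the whole sequence:
    return ''.join(_COMPLEMENT_SKETCH.get(base, 'N') for base in seq)[::-1]

# Example: _rev_comp_sketch('ATGC') -> 'GCAT'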
def get_confirmed_fusion_misassemblies(self, best_union_alignments, fusion_err_threshhold):
    # getting confirmed fusions and misassemblies
    q_name = best_union_alignments[0].query_fragment.name

    is_confirmed_fus = 0
    is_confirmed_mis = 0

    is_suspected_fusion, is_suspected_misassemble = \
        self.get_suspected_fusion_misassemble(best_union_alignments, fusion_err_threshhold)

    if is_suspected_misassemble == 1:
        is_confirmed_mis = 1
    elif q_name not in self.misassemble_by_reads_dict and is_suspected_fusion == 1:
        is_confirmed_fus = 1
    elif is_suspected_fusion == 1:
        # call the transcript misassembled if the ends of neighbouring aligned parts fall into
        # intervals flagged as misassembled by reads:
        q_starts_alignments = []
        for i_alignment in range(len(best_union_alignments)):
            q_starts_alignments.append(best_union_alignments[i_alignment].query_fragment.start)

        i_q_sort_starts_alignments, q_sort_starts_alignments = \
            UtilsGeneral.get_order_indexes_elements(q_starts_alignments)

        for i in range(len(i_q_sort_starts_alignments) - 1):
            alignment0 = best_union_alignments[i_q_sort_starts_alignments[i]]
            alignment1 = best_union_alignments[i_q_sort_starts_alignments[i + 1]]

            q_end_alignment = alignment0.query_fragment.end
            q_start_alignment = alignment1.query_fragment.start

            q_start = min(q_end_alignment, q_start_alignment)
            q_end = max(q_end_alignment, q_start_alignment)

            for interval in self.misassemble_by_reads_dict[q_name]:
                mis_start = interval[0]
                mis_end = interval[1]
                if (q_start > mis_start and q_start < mis_end) or (q_end > mis_start and q_end < mis_end):
                    is_confirmed_mis = 1
                    self.fusion_num -= 1
                    self.misassemble_num += 1
                if is_confirmed_mis == 1:
                    break
            if is_confirmed_mis == 1:
                break

        if is_confirmed_mis == 0:
            is_confirmed_fus = 1

    # logger.debug('      Done.')

    return is_confirmed_mis, is_confirmed_fus
def main_utils():
    program_name = sys.argv[0][:sys.argv[0].rfind('.')]

    # parse running string of main program and get all arguments:
    args = UtilsPipeline.get_arguments()

    WELL_FULLY_COVERAGE_THRESHOLDS = rqconfig.well_fully_coverage_thresholds(args.lower_threshold, args.upper_threshold)

    ALIGNMENT_THRESHOLDS = rqconfig.alignment_thresholds()

    # run rnaQUAST on test_data:
    if args.test:
        UtilsPipeline.run_rnaQUAST_on_test_data(args, rquast_dirpath, program_name)
        # UtilsPipeline.run_rnaQUAST_on_debug_data(args, rquast_dirpath, program_name)
        sys.exit()

    UtilsPipeline.get_abspath_input_data(args)

    # create output directory:
    args.output_dir = UtilsPipeline.create_output_folder(args.output_dir, program_name)
    # create temporary directory:
    tmp_dir = UtilsPipeline.create_empty_folder(os.path.join(args.output_dir, 'tmp'))
    # create directory for log files:
    log_dir = UtilsPipeline.create_empty_folder(os.path.join(args.output_dir, 'logs'))

    # SET LOGGER:
    if args.debug:
        rqconfig.debug = True
        logger.set_up_console_handler(debug=True)
    else:
        logger.set_up_console_handler()
    logger.set_up_file_handler(log_dir)
    logger.print_command_line([os.path.realpath(__file__)] + sys.argv[1:], wrap_after=None)
    logger.start(args.blat, tmp_dir)

    UtilsPipeline.get_input_data_exist_error(args, logger)

    # THREADING:
    args.threads = UtilsPipeline.get_num_threads(args.threads, logger)

    if args.meta:
        logger.info('\nYOU RUN QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES')

    # GET segregate FILES:
    if args.reference and args.gtf and len(args.reference) != len(args.gtf):
        logger.error('Numbers of references and gene databases are different', exit_with_code=1)

    args.reference = UtilsPipeline.get_single_file(args.reference, tmp_dir, 'reference',
                                                   rqconfig.list_ext_fa, args.meta, logger)

    args.gtf = UtilsPipeline.get_single_file(args.gtf, tmp_dir, 'gene_database',
                                             rqconfig.list_ext_gtf, args.meta, logger)

    # READ REFERENCE FROM MULTIFASTA:
    reference_dict = None
    ids_chrs = None
    if args.reference is not None:
        logger.print_timestamp()
        logger.info('Getting reference...')
        reference_dict = UtilsGeneral.list_to_dict(fastaparser.read_fasta(args.reference))
        logger.info('Done.')

        genome_len = UtilsGeneral.get_genome_len(reference_dict)

        ids_chrs = reference_dict.keys()

        # correction for fasta contained Y, W and etc:
        # for id_chr in ids_chrs:
        #     reference_dict[id_chr] = UtilsGeneral.correct_nucl_seq(reference_dict[id_chr])

    # for strand specific data we store + and - keys in dictionaries and only + for non strand specific data:
    strands = UtilsGeneral.get_strands(args, logger)

    if args.prokaryote:
        type_organism = 'prokaryotes'
    else:
        type_organism = 'eukaryotes'

    # USE ANNOTATION:
    sqlite3_db_genes = None
    sorted_exons_attr = None
    db_genes_metrics = None
    type_genes, type_isoforms, type_exons = \
        UtilsAnnotations.default_type_genes, \
        UtilsAnnotations.default_type_isoforms, \
        UtilsAnnotations.default_type_exons

    if args.gtf is not None or args.gene_db is not None:
        if args.gene_db is not None:
            gene_db_name = os.path.split(args.gene_db)[1]
            label_db = gene_db_name[:gene_db_name.rfind('.db')]
        else:
            gtf_name = os.path.split(args.gtf)[1]
            label_db = gtf_name[:gtf_name.rfind('.g')]

            if ids_chrs is not None:
                args.gtf = UtilsAnnotations.clear_gtf_by_reference_chr(args.gtf, ids_chrs, tmp_dir, label_db, logger)

        sqlite3_db_genes = UtilsAnnotations.create_sqlite3_db(args.gene_db, args.gtf, label_db,
                                                              args.disable_infer_genes, args.disable_infer_transcripts,
                                                              args.output_dir, tmp_dir, logger)

        type_genes, type_isoforms, type_exons = \
            UtilsAnnotations.get_type_features(sqlite3_db_genes, UtilsAnnotations.default_type_genes,
                                               UtilsAnnotations.default_type_isoforms,
                                               UtilsAnnotations.default_type_exons,
                                               args.prokaryote, logger)

        # if UtilsAnnotations.default_type_exons == type_exons:
        #     type_organism = 'eukaryotes'
        # else:
        #     type_organism = 'prokaryotes'

        db_genes_metrics = GeneDatabaseMetrics.GeneDatabaseMetrics(sqlite3_db_genes, type_genes, type_isoforms, logger)

        ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT = db_genes_metrics.max_intron_len + 100
        logger.info('\nSetting maximum intron size to {}. Default is 1500000 bp.\n'.format(ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT))

        # set exons starts / ends and ids for binning strategy:
        if ids_chrs is not None:
            sorted_exons_attr = SortedExonsAttributes.SortedExonsAttributes(sqlite3_db_genes, type_exons, strands,
                                                                            ids_chrs, reference_dict, logger)

    reads_coverage = None
    if args.reads_alignment is not None or \
            ((args.single_reads is not None or (args.left_reads is not None and args.right_reads is not None))
             and args.reference is not None and sqlite3_db_genes is not None):
        reads_coverage = ReadsCoverage.ReadsCoverage(args.reads_alignment, args.tophat, args.reference,
                                                     args.single_reads, args.left_reads, args.right_reads,
                                                     reference_dict, sqlite3_db_genes, type_isoforms,
                                                     sorted_exons_attr, args.strand_specific,
                                                     db_genes_metrics.tot_isoforms_len, genome_len,
                                                     tmp_dir, args.threads, WELL_FULLY_COVERAGE_THRESHOLDS,
                                                     logger, log_dir)

    if args.transcripts is not None:
        # GET TRANSCRIPTS:
        transcripts_dicts = []
        for i_transcripts in range(len(args.transcripts)):
            logger.print_timestamp(' ')
            logger.info(' Getting transcripts from {}...'.format(args.transcripts[i_transcripts]))
            transcripts_dicts.append(UtilsGeneral.list_to_dict(fastaparser.read_fasta(args.transcripts[i_transcripts])))
            logger.info(' Done.')

        # get labels for folders names and names of transcripts in reports:
        all_labels_from_dirs = False
        if args.labels is None:
            args.labels = UtilsPipeline.process_labels(args.transcripts, args.labels, all_labels_from_dirs)
    else:
        logger.warning('No transcripts. Use --transcripts option.')

    # GET PSL ALIGNMENT FILE:
    if args.alignment is None and args.reference is not None and args.transcripts is not None:
        if args.blat:
            args.alignment = UtilsTools.run_blat(None, args.reference, transcripts_dicts, args.labels,
                                                 args.threads, tmp_dir, logger, log_dir)
        else:
            args.alignment = UtilsTools.run_gmap(args.reference, genome_len, args.transcripts, args.labels,
                                                 args.threads, args.gmap_index, tmp_dir, logger, log_dir)

    # if args.fusion_misassemble_analyze:
    #     if not (args.left_reads is not None and args.right_reads is not None):
    #         logger.error('Usage: --left_reads LEFT_READS --right RIGHT_READS for analyse fusions and misassemblies',
    #                      exit_with_code=2, to_stderr=True)
    #         sys.exit(2)

    # FOR MISASSEMBLIES SEARCH:
    # GET DATABASE FOR FA ISOFORMS:
    args.blast = False
    if args.reference is not None and sqlite3_db_genes is not None and args.alignment is not None:
        blastn_run = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blastn')
        if not os.path.isfile(blastn_run):
            blastn_run = "blastn"

        if UtilsGeneral.which(blastn_run) is None:
            logger.warning('blastn not found! Please add blastn to PATH for better MISASSEMBLIES metrics.')
        else:
            args.blast = True

            isoforms_fa_path = os.path.join(tmp_dir, '{}.isoforms.fa'.format(label_db))
            isoforms_list = UtilsGeneral.dict_to_list(
                UtilsAnnotations.get_fa_isoforms(sqlite3_db_genes, type_isoforms, type_exons, reference_dict, logger))
            fastaparser.write_fasta(isoforms_fa_path, isoforms_list)

            isoforms_blast_db = UtilsTools.get_blast_db(isoforms_fa_path, label_db, tmp_dir, logger, log_dir)

    # LOGGING INPUT DATA:
    logger.print_input_files(args)

    # INITIALIZATION TRANSCRIPTS METRICS AND REPORTS:
    transcripts_metrics = []
    separated_reports = []
    if args.transcripts is not None:
        alignments_reports = []
        blast_alignments = []
        for i_transcripts in range(len(args.transcripts)):
            # INITIALIZE TRANSCRIPTS METRICS:
            # if args.sam_file is not None:
            #     sam_file_tmp = args.sam_file[i_transcripts]
            # else:
            transcripts_metrics.append(TranscriptsMetrics.TranscriptsMetrics(args, args.labels[i_transcripts]))

            # INITIALIZE SEPARATED REPORTS:
            separated_reports.append(SeparatedReport.SeparatedReport(args.labels[i_transcripts], args.output_dir,
                                                                     transcripts_metrics[i_transcripts],
                                                                     WELL_FULLY_COVERAGE_THRESHOLDS))

            '''from joblib import Parallel, delayed
            n = len(args.transcripts)
            run_n = n / args.threads
            for i_run in range(run_n):
                tmp = Parallel(n_jobs=args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict, annotated_exons, annotated_isoforms, strands, transcripts_metrics, basic_isoforms_metrics, separated_reports)
                                                    for i_transcripts in range(i_run * args.threads, args.threads * (i_run + 1), 1))
                for i in range(args.threads):
                    i_transcripts = i + i_run * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]

            if n - run_n * args.threads != 0:
                tmp = Parallel(n_jobs=n - run_n * args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict, annotated_exons, annotated_isoforms, strands, transcripts_metrics, basic_isoforms_metrics, separated_reports)
                                                                for i_transcripts in range(run_n * args.threads, n, 1))
                for i in range(n - run_n * args.threads):
                    i_transcripts = i + run_n * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]'''

            logger.info()
            logger.info('Processing transcripts from {}:'.format(args.transcripts[i_transcripts]))

            if args.blast:
                blast_alignments.append(
                    UtilsTools.align_transcripts_to_isoforms_by_blastn(args.transcripts[i_transcripts],
                                                                       isoforms_blast_db, tmp_dir,
                                                                       args.labels[i_transcripts], logger, log_dir))
            else:
                blast_alignments.append(None)

            # PROCESS TRANSCRIPTS ALIGNMENTS:
            if transcripts_metrics[i_transcripts].simple_metrics is not None:
                # GET FILES WITH ALIGNMENTS REPORTS:
                alignments_reports.append(
                    UtilsAlignment.AlignmentsReport.get_alignments_report(args.labels[i_transcripts],
                                                                          args.alignment[i_transcripts],
                                                                          blast_alignments[i_transcripts],
                                                                          transcripts_dicts[i_transcripts],
                                                                          tmp_dir, args.min_alignment,
                                                                          logger, ALIGNMENT_THRESHOLDS))

                # UPDATE METRICS BY ASSEMBLED TRANSCRIPTS:
                transcripts_metrics[i_transcripts].processing_assembled_psl_file(
                    alignments_reports[i_transcripts].blat_report.assembled_psl_file,
                    sorted_exons_attr, args.strand_specific, logger, sqlite3_db_genes, type_isoforms,
                    WELL_FULLY_COVERAGE_THRESHOLDS)

                # UPDATE METRICS BY MISASSEMBLED TRANSCRIPTS:
                # by blat:
                transcripts_metrics[i_transcripts].processing_misassembled_psl_file(
                    alignments_reports[i_transcripts].blat_report.misassembled_psl_union_file, logger, True)
                # by blast:
                if args.blast:
                    transcripts_metrics[i_transcripts].processing_misassembled_psl_file(
                        alignments_reports[i_transcripts].blast6_report.misassembled_blast6_union_file, logger, False)

            # GET METRICS:
            transcripts_metrics[i_transcripts].get_transcripts_metrics(
                args, type_organism, reference_dict, args.transcripts[i_transcripts],
                transcripts_dicts[i_transcripts], args.labels[i_transcripts], args.threads,
                sqlite3_db_genes, db_genes_metrics, reads_coverage, logger, tmp_dir, log_dir,
                WELL_FULLY_COVERAGE_THRESHOLDS, rqconfig.TRANSCRIPT_LENS)

            # GET SEPARATED REPORT:
            separated_reports[i_transcripts].get_separated_report(
                args, args.labels[i_transcripts], transcripts_dicts[i_transcripts],
                transcripts_metrics[i_transcripts], db_genes_metrics, reads_coverage, logger,
                WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, rqconfig.TRANSCRIPT_LENS)

    # GET COMPARISON REPORT:
    comparison_report = None
    if len(separated_reports) != 1:
        comparison_report = ComparisonReport.ComparisonReport()
        comparison_report.get_comparison_report(args, args.output_dir, args.labels, transcripts_metrics,
                                                db_genes_metrics, reads_coverage, logger,
                                                WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, rqconfig.TRANSCRIPT_LENS)

    # GET SHORT REPORT:
    short_report = ShortReport.ShortReport(args, db_genes_metrics, transcripts_metrics, args.output_dir,
                                           separated_reports, comparison_report, logger,
                                           WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, rqconfig.TRANSCRIPT_LENS)

    # REMOVE TEMPORARY DIRECTORY FROM OUTPUT DIRECTORY:
    if os.path.exists(tmp_dir) and not args.debug:
        logger.debug('Remove temporary directory {}'.format(tmp_dir))
        shutil.rmtree(tmp_dir)
        logger.debug('Done.')

    # LOGGING RESULTS PATHS:
    logger.print_path_results(args, separated_reports, comparison_report, short_report)

    if args.debug:
        UtilsGeneral.profile_memory(args, reference_dict, db_genes_metrics, transcripts_metrics,
                                    separated_reports, comparison_report, logger)

    # FINISH LOGGING:
    logger.finish_up()
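# Hedged sketch (illustration only): main_utils above converts the output of fastaparser.read_fasta
# into a dictionary with UtilsGeneral.list_to_dict and back with UtilsGeneral.dict_to_list. From the
# call sites, read_fasta is assumed to yield (name, sequence) pairs; the stand-ins below only
# document that assumed contract and are not the real helpers.
def _list_to_dict_sketch(fasta_records):
    # (name, sequence) pairs -> {name: sequence}
    return dict(fasta_records)

def _dict_to_list_sketch(fasta_dict):
    # {name: sequence} -> list of (name, sequence) pairs
    return list(fasta_dict.items())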
def run_gmap(args_reference, genome_len, args_transcripts, args_labels, args_threads, args_gmap_index,
             tmp_dir, logger, log_dir):
    args_alignment = []

    # gmapl is required for genomes larger than 2^32 bp:
    if genome_len < 2 ** 32:
        gmap_run = 'gmap'
    else:
        gmap_run = 'gmapl'
    gmap_build = 'gmap_build'

    gmap_build_logger_out_path = os.path.join(log_dir, gmap_build + '.out.log')
    gmap_build_logger_err_path = os.path.join(log_dir, gmap_build + '.err.log')

    args_reference = UtilsGeneral.get_upper_case_fasta(args_reference, tmp_dir, logger)

    ref_label = os.path.split(args_reference)[-1][:os.path.split(args_reference)[-1].rfind('.f')]

    # if UtilsGeneral.which(gmap_run) is None or UtilsGeneral.which(gmap_build) is None:
    #     logger.warning('gmap or gmap_build not found! Please add GMAP to PATH or run with BLAT for ALIGNMENT metrics.')
    # else:

    # RUN GMAP:
    # create index (gmap_build):
    if args_gmap_index is None:
        logger.print_timestamp()
        logger.info('Creating genome index by {}...'.format(gmap_build))

        start_time = datetime.datetime.now()

        command = '{gmap_build} -D {tmp_dir} -d {ref_index_name} {reference} 1>> {log_out_1} 2>> {log_out_2}'.\
            format(gmap_build=gmap_build, tmp_dir=tmp_dir, ref_index_name=ref_label, reference=args_reference,
                   log_out_1=gmap_build_logger_out_path, log_out_2=gmap_build_logger_err_path)
        exit_code = subprocess.call(command, shell=True)

        logger.info(' logs can be found in {} and {}.'.format(gmap_build_logger_out_path, gmap_build_logger_err_path))

        if exit_code != 0:
            logger.error(message='{} failed!'.format(gmap_build), exit_with_code=exit_code, to_stderr=True)

        end_time = datetime.datetime.now()
        spent_time = end_time - start_time

        logger.info(' saved to {}'.format(os.path.join(tmp_dir, ref_label)))
        logger.info('\nGMAP_BUILD TIME: {}\n\n'.format(spent_time))
    else:
        command = 'ln -s {} {}'.format(args_gmap_index, os.path.join(tmp_dir, ref_label))
        subprocess.call(command, shell=True)

    # align (gmap):
    for i_transcripts in range(len(args_transcripts)):
        gmap_run_logger_err_path = os.path.join(log_dir, gmap_run + '.' + args_labels[i_transcripts] + '.err.log')

        logger.print_timestamp()
        logger.info('Aligning {} to {}...'.format(args_labels[i_transcripts], ref_label))

        alignment_psl_path = os.path.join(tmp_dir, args_labels[i_transcripts] + '.psl')

        start_time = datetime.datetime.now()

        command = '{gmap} -D {tmp_dir} -d {ref_index_name} {transcripts} --format=1 -t {threads} -O > {alignment_out}' \
                  ' 2>> {log_out_2}'.\
            format(gmap=gmap_run, tmp_dir=tmp_dir, ref_index_name=ref_label,
                   transcripts=args_transcripts[i_transcripts], threads=args_threads,
                   alignment_out=alignment_psl_path, log_out_2=gmap_run_logger_err_path)
        exit_code = subprocess.call(command, shell=True)

        logger.info(' log can be found in {}.'.format(gmap_run_logger_err_path))

        if exit_code != 0:
            logger.error(message='{} failed for {}!'.format(gmap_run, args_labels[i_transcripts]),
                         exit_with_code=exit_code, to_stderr=True)

        end_time = datetime.datetime.now()
        spent_time = end_time - start_time

        args_alignment.append(alignment_psl_path)

        logger.info(' saved to {}'.format(alignment_psl_path))
        logger.info('\nGMAP TIME: {}\n\n'.format(spent_time))

    return args_alignment
def get_internal_exons_faster(sqlite3_db_genes, sorted_exons_attr, alignment_t_starts, alignment_t_ends, strand, id_chr):
    ids_internal_exons = set()
    internal_exons = set()
    # ids_internal_exons_tmp = set()

    for i_block in range(len(alignment_t_starts)):
        bin_start_i_in_ends, bin_end_i_in_ends = \
            get_bin_indexes(alignment_t_starts[i_block],
                            sorted_exons_attr.sort_target_ends[str(strand)][id_chr],
                            sorted_exons_attr.index_sort_ends[str(strand)][id_chr],
                            sorted_exons_attr.index_step[id_chr])

        if bin_start_i_in_ends is not None and bin_end_i_in_ends is not None:
            bin_ends = sorted_exons_attr.sort_target_ends[str(strand)][id_chr][bin_start_i_in_ends:bin_end_i_in_ends]
            begin = UtilsGeneral.get_bin_search_position_of_element(bin_ends, alignment_t_starts[i_block]) + bin_start_i_in_ends
            ids_ends_set = set(sorted_exons_attr.ids_by_end[strand][id_chr][begin:])
        else:
            ids_ends_set = set()

        bin_start_i_in_starts, bin_end_i_in_starts = \
            get_bin_indexes(alignment_t_ends[i_block],
                            sorted_exons_attr.sort_target_starts[str(strand)][id_chr],
                            sorted_exons_attr.index_sort_starts[str(strand)][id_chr],
                            sorted_exons_attr.index_step[id_chr])

        if bin_start_i_in_starts is not None and bin_end_i_in_starts is not None:
            bin_starts = sorted_exons_attr.sort_target_starts[str(strand)][id_chr][bin_start_i_in_starts:bin_end_i_in_starts]
            end = UtilsGeneral.get_bin_search_position_of_element(bin_starts, alignment_t_ends[i_block]) + bin_start_i_in_starts
            while end != len(sorted_exons_attr.sort_target_starts[strand][id_chr]) \
                    and end == sorted_exons_attr.sort_target_starts[strand][id_chr][end]:
                end += 1
            ids_starts_set = set(sorted_exons_attr.ids_by_start[strand][id_chr][:end])
        else:
            ids_starts_set = set(sorted_exons_attr.ids_by_start[strand][id_chr])

        ids_internal_exons = ids_internal_exons.union(ids_ends_set.intersection(ids_starts_set))

        for id_exon in ids_internal_exons:
            internal_exons.add(sqlite3_db_genes[id_exon])

        # TODO: delete
        # begin_tmp = UtilsGeneral.get_bin_search_position_of_element(sorted_exons_attr.sort_target_ends[str(strand)][id_chr], alignment_t_starts[i_block])
        # ids_ends_set_tmp = set(sorted_exons_attr.ids_by_end[strand][id_chr][begin_tmp:])
        #
        # end_tmp = UtilsGeneral.get_bin_search_position_of_element(sorted_exons_attr.sort_target_starts[str(strand)][id_chr], alignment_t_ends[i_block])
        # while end_tmp != len(sorted_exons_attr.sort_target_starts[strand][id_chr]) and end_tmp == sorted_exons_attr.sort_target_starts[strand][id_chr][end_tmp]:
        #     end_tmp += 1
        # ids_starts_set_tmp = set(sorted_exons_attr.ids_by_start[strand][id_chr][:end_tmp])
        #
        # ids_internal_exons_tmp = ids_internal_exons_tmp.union(ids_ends_set_tmp.intersection(ids_starts_set_tmp))
        #
        # if ids_internal_exons != ids_internal_exons_tmp:
        #     print '!!!!'
        #     import sys
        #     sys.exit()

    return list(internal_exons)
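# Hedged sketch (illustration only): get_internal_exons_faster above assumes that
# UtilsGeneral.get_bin_search_position_of_element performs a binary search on an already sorted list
# and returns the insertion position of the element (bisect_left-like). The real helper may differ
# in how it breaks ties; this version only shows the assumed behaviour.
import bisect

def _get_bin_search_position_of_element_sketch(sorted_elements, element):
    # position of the first entry that is >= element:
    return bisect.bisect_left(sorted_elements, element)

# Example: _get_bin_search_position_of_element_sketch([10, 20, 30], 25) -> 2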
def get_arguments():
    # use --help for running without arguments:
    if len(sys.argv) == 1:
        command = 'python2 {} -h'.format(sys.argv[0])
        subprocess.call(command, shell=True)
        sys.exit(0)

    version, build = UtilsGeneral.get_version(rqconfig.rnaQUAST_LOCATION)

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="QUALITY ASSESSMENT FOR TRANSCRIPTOME ASSEMBLIES %(prog)s v.{}"
                    "\n\nUsage:\npython2 %(prog)s --transcripts TRANSCRIPTS --reference REFERENCE --gtf GENE_COORDINATES".format(version),
        #" pipeline-2: python %(prog)s -p2 --transcripts TRANSCRIPTS --reference REFERENCE --annotation ANNOTATION"
        #" pipeline-1: python %(prog)s -p1 --transcripts TRANSCRIPTS --reference REFERENCE --annotation ANNOTATION --alignment ALIGNMENT\n"
        #" pipeline-2: python %(prog)s -p2 --transcripts TRANSCRIPTS --reference REFERENCE --annotation ANNOTATION\n"
        #" pipeline-3: python %(prog)s -p3 --reference REFERENCE --annotation ANNOTATION --assembler ASSEMBLER --left_reads LEFT_READS --right_reads RIGHT_READS\n"
        #" pipeline-4: python %(prog)s -p4 --reference REFERENCE --annotation ANNOTATION --simulator SIMULATOR --par PAR --assembler ASSEMBLER\n",
        #epilog='If you don\'t use prepared arguments, please add to PATH samtools, bowtie or bowtie-build for fusion and misassamble analyze.',
        #conflict_handler='resolve', prog=sys.argv[0])
        epilog='Don\'t forget to add GMAP (or BLAT) to PATH.',
        conflict_handler='resolve', prog=sys.argv[0])

    # PIPELINES:
    #groupPipelines = parser.add_argument_group('Pipeline options')
    #groupChoosePipelines = groupPipelines.add_mutually_exclusive_group(required=False)
    #groupChoosePipelines.add_argument("-p1", "--use_alignment_annotation", action="store_true", help='Use this pipeline-1 if you have FASTA-file with assembled transcripts, database with reference, GTF or GFF-file with annotation, PSL-file with alignment')
    #groupChoosePipelines.add_argument("-p2", "--use_reference_transcripts_annotation", action="store_true", help='Use this pipeline-2 if you have FASTA-file with assembled transcripts, database with reference and GTF or GFF-file with annotation')
    #groupChoosePipelines.add_argument("-p3", "--use_reference_reads_annotation", action="store_true", help='Use this pipeline-3 if you have database with reference, GTF or GFF-file with annotation and FASTQ-file with reads')
    #groupChoosePipelines.add_argument("-p4", "--use_reference_annotation", action="store_true", help='Use this pipeline-4 if you have database with reference and GTF or GFF-file with annotation')

    # INPUT DATA:
    group_input_data = parser.add_argument_group('Input data')

    group_input_data.add_argument('-r', '--reference',
                                  help='Single file (or several files for meta RNA) with reference genome in FASTA format '
                                       'or *.txt file with one-per-line list of FASTA files with reference sequences',
                                  type=str, nargs='+')

    group_input_data.add_argument('--gtf',
                                  help='File with gene coordinates (or several files or *.txt file with one-per-line '
                                       'list of GTF / GFF files for meta RNA). '
                                       'We recommend to use files downloaded from GENCODE or Ensembl [GTF/GFF]',
                                  type=str, nargs='+')

    group_input_data.add_argument('--gene_db',
                                  help='Path to the gene database generated by gffutils to be used', type=str)

    #group_input_data.add_argument('-g', '--genes', help='File with gene coordinates in the reference for prokaryotes [GFF]', type=str)
    #group_input_data.add_argument('-o', '--operons', help='File with operon coordinates in the reference for prokaryotes [GFF]', type=str)

    group_input_data.add_argument('-c', '--transcripts', help='File(s) with transcripts [FASTA]', type=str, nargs='+')

    group_input_data.add_argument('-psl', '--alignment',
                                  help='File(s) with transcript alignments to the reference genome [PSL]',
                                  type=str, nargs='+')

    group_input_data.add_argument('-sam', '--reads_alignment',
                                  help='File with read alignments to the reference genome [SAM]')

    group_input_data.add_argument('-1', '--left_reads',
                                  help='File with forward paired-end reads [FASTQ or gzip-compressed]', type=str)

    group_input_data.add_argument('-2', '--right_reads',
                                  help='File with reverse paired-end reads [FASTQ or gzip-compressed]', type=str)

    # group_input_data.add_argument('-12', '--paired_reads', help='File with interlaced forward and reverse paired-end reads [FASTQ or gzip-compressed]')

    group_input_data.add_argument('-s', '--single_reads',
                                  help='File with unpaired reads [FASTQ or gzip-compressed]', type=str)

    group_input_data.add_argument('--gmap_index',
                                  help='Folder containing GMAP index for the reference genome')

    #group_input_data.add_argument('--par', help='File with simulation parameters, for details go to http://sammeth.net/confluence/'
    #                                            'display/SIM/.PAR+Simulation+Parameters [PAR]', type=str)

    # BASIC OPTIONS:
    group_basic = parser.add_argument_group('Basic options')

    group_basic.add_argument('-o', '--output_dir',
                             help='Directory to store all results [default: rnaQUAST_results/results_<datetime>]',
                             type=str)

    group_basic.add_argument('--test',
                             help='Run rnaQUAST on the test data from the test_data folder, output directory is rnaQUAST_test_output',
                             action='store_true')

    group_basic.add_argument('-d', '--debug',
                             help='Report detailed information, typically used only for detecting problems.',
                             action='store_true')

    group_advanced = parser.add_argument_group('Advanced options')

    group_advanced.add_argument('-t', '--threads',
                                help='Maximum number of threads, default: min(number of CPUs / 2, 16)', type=int)

    group_advanced.add_argument('-l', '--labels',
                                help='Name(s) of assemblies that will be used in the reports', type=str, nargs='+')

    group_advanced.add_argument('-ss', '--strand_specific',
                                help='Set if transcripts were assembled using strand-specific RNA-Seq data',
                                action='store_true')

    group_advanced.add_argument('--min_alignment', help='Minimal alignment length, default: %(default)s',
                                type=int, default=50, required=False)

    group_advanced.add_argument('--no_plots', help='Do not draw plots (to speed up computation)', action='store_true')

    group_advanced.add_argument('--blat',
                                help='Run with BLAT alignment tool (http://hgwdev.cse.ucsc.edu/~kent/exe/) instead of GMAP',
                                action='store_true')

    # group_advanced.add_argument('--busco', help='Run with BUSCO tool (http://busco.ezlab.org/)', action='store_true')
    # group_advanced.add_argument('-C', '--cegma', help='Run with CEGMA (Core Eukaryotic Genes Mapping Approach)', action='store_true')

    group_advanced.add_argument('--tophat',
                                help='Run with TopHat tool (https://ccb.jhu.edu/software/tophat/index.shtml) instead of STAR',
                                action='store_true')

    group_advanced.add_argument('--gene_mark',
                                help='Run with GeneMarkS-T tool (http://topaz.gatech.edu/GeneMark/)',
                                action='store_true')

    # groupSpecies = group_basic.add_mutually_exclusive_group(required=False)
    # groupSpecies.add_argument('--eukaryote', help='Genome is eukaryotic', action='store_true')

    group_advanced.add_argument('--meta',
                                help='Run QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES',
                                action='store_true')

    group_advanced.add_argument('--lower_threshold',
                                help='Lower threshold for x-assembled/covered/matched metrics, default: %(default)s',
                                type=float, default=0.5, required=False)

    group_advanced.add_argument('--upper_threshold',
                                help='Upper threshold for x-assembled/covered/matched metrics, default: %(default)s',
                                type=float, default=0.95, required=False)

    # group_advanced.add_argument('-ir', '--isoforms_range', help='Range of isoforms lengths involved in metrics', type=int, nargs='+')
    #group_advanced.add_argument('-fma', '--fusion_misassemble_analyze', help='Analyze fusions and misassemblies', action='store_true')

    group_gffutils = parser.add_argument_group('Gffutils related options')

    group_gffutils.add_argument('--disable_infer_genes',
                                help='Use this option if your GTF file already contains genes records',
                                action='store_true')

    group_gffutils.add_argument('--disable_infer_transcripts',
                                help='Use this option if your GTF already contains transcripts records',
                                action='store_true')

    # group_gffutils.add_argument('--store_db', help='Save new complete gene database generated by gffutils (speeds up next runs with this database)', action='store_true')

    group_busco = parser.add_argument_group('BUSCO related options')

    group_busco.add_argument('--busco_lineage',
                             help='Run with BUSCO tool (http://busco.ezlab.org/). Path to the BUSCO lineage data to be used (Eukaryota, Metazoa, Arthropoda, Vertebrata or Fungi)',
                             type=str)

    # group_gene_mark = parser.add_argument_group('GeneMarkS-T related options')

    group_advanced.add_argument('--prokaryote',
                                help='Use this option if the genome is prokaryotic',
                                action='store_true')

    # TOOLS:
    #groupTools = parser.add_argument_group('Tools')
    #groupTools.add_argument('--assembler', help='Choose assembler to get FASTA-file with transcripts', type=str, choices=['Trinity', 'SPAdes'], nargs='+')
    #groupTools.add_argument('--simulator', help='Choose simulator to get FASTQ-file with reads', type=str, choices=['Flux'])

    args = parser.parse_args()

    return args
def remove_low_complexity_tail(union_lines, union_alignments, single_transcript_lines, single_transcript_alignments,
                               transcript_seq, out_low_complexity_file, threshold_block_len, end_tail):
    block_seq = ''
    for i in range(len(union_alignments)):
        if end_tail == True:
            i_alignment = len(union_alignments) - 1 - i
        else:
            i_alignment = i
        psl_alignment = union_alignments[i_alignment]

        for j in range(psl_alignment.blocks_num):
            if end_tail == True:
                i_block = psl_alignment.blocks_num - 1 - j
            else:
                i_block = j

            start = psl_alignment.query_fragment.starts[i_block]
            end = psl_alignment.query_fragment.ends[i_block]
            if psl_alignment.strand == '+':
                block_seq += transcript_seq[start:end + 1]
            else:
                block_seq += UtilsGeneral.rev_comp(transcript_seq)[start:end + 1]

            if len(block_seq) < threshold_block_len:
                continue

            if not is_low_complexity(block_seq):
                if (end_tail == True and i_block == psl_alignment.blocks_num - 1) or \
                        (end_tail == False and i_block == 0):
                    latest_alignment = psl_alignment
                    latest_line = union_lines[i_alignment]
                else:
                    if end_tail == True:
                        latest_alignment = psl_alignment.get_split_alignment(0, i_block)
                    else:
                        latest_alignment = psl_alignment.get_split_alignment(i_block, psl_alignment.blocks_num - 1)
                    latest_line = latest_alignment.get_psl_line_from_alignment()

                if end_tail == True:
                    clear_union_alignments = union_alignments[:i_alignment] + [latest_alignment]
                    clear_union_lines = union_lines[:i_alignment] + [latest_line]
                else:
                    clear_union_alignments = [latest_alignment] + union_alignments[i_alignment + 1:]
                    clear_union_lines = [latest_line] + union_lines[i_alignment + 1:]

                # update single transcript lines / alignments:
                for i_alignment in range(len(union_lines)):
                    if union_lines[i_alignment] not in clear_union_lines:
                        single_transcript_lines.remove(union_lines[i_alignment])
                        single_transcript_alignments.remove(union_alignments[i_alignment])
                for i_alignment in range(len(clear_union_lines)):
                    if clear_union_lines[i_alignment] not in single_transcript_lines:
                        single_transcript_lines.append(clear_union_lines[i_alignment])
                        single_transcript_alignments.append(clear_union_alignments[i_alignment])

                return clear_union_lines, clear_union_alignments, single_transcript_lines, single_transcript_alignments
            else:
                fastaparser.write_fasta(out_low_complexity_file,
                                        [('{}_block'.format(psl_alignment.query_fragment.name), block_seq)], mode='a')
                if len(block_seq) > threshold_block_len:
                    block_seq = ''

    # update single transcript lines / alignments:
    for i_alignment in range(len(union_lines)):
        single_transcript_lines.remove(union_lines[i_alignment])
        single_transcript_alignments.remove(union_alignments[i_alignment])

    return [], [], single_transcript_lines, single_transcript_alignments
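# Hedged sketch (illustration only): remove_low_complexity_tail above calls is_low_complexity on a
# growing block of transcript sequence. The criterion used by the real function is not shown in this
# module; one plausible stand-in is "a single nucleotide makes up most of the block", used here
# purely to illustrate the kind of check being made.
def _is_low_complexity_sketch(block_seq, dominant_fraction=0.8):
    # treat the block as low complexity if one nucleotide dominates it:
    if not block_seq:
        return True
    most_common = max(block_seq.upper().count(n) for n in 'ACGTN')
    return most_common / float(len(block_seq)) >= dominant_fraction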
def get_union_fake_blat_alignment(alignment0, alignment1):
    union_alignment = Alignment.PSLFileAlignment()

    strand = alignment0.strand

    # for cross more than one block or equal one block:
    if strand == '+':
        tmp_start = alignment1.query_fragment.starts[0]
        tmp_array = alignment0.query_fragment.starts
    else:
        tmp_start = alignment0.query_fragment.starts[0]
        tmp_array = alignment1.query_fragment.starts
    i_block1 = UtilsGeneral.get_bin_search_position_of_element(tmp_array, tmp_start)

    if strand == '+':
        if i_block1 != alignment0.blocks_num:
            alignment0 = alignment0.get_split_alignment(0, i_block1 - 1)
    else:
        if i_block1 != alignment1.blocks_num:
            alignment1 = alignment1.get_split_alignment(0, i_block1 - 1)

    tmp_qbase_cross = alignment0.query_fragment.end - alignment1.query_fragment.start + 1
    if strand == '+':
        tmp_tbase_cross = alignment0.target_fragment.end - alignment1.target_fragment.start + 1
    else:
        tmp_tbase_cross = alignment1.target_fragment.end - alignment0.target_fragment.start + 1
    cross_bases = max(tmp_qbase_cross, tmp_tbase_cross, 0)

    f_together = False
    if tmp_qbase_cross >= 0 and tmp_tbase_cross >= 0 and tmp_qbase_cross == tmp_tbase_cross:
        f_together = True

    union_alignment.matches = alignment0.matches + alignment1.matches - max(cross_bases, 0)
    union_alignment.mismatches = alignment0.mismatches + alignment1.mismatches
    union_alignment.repmatches = alignment0.repmatches + alignment1.repmatches
    union_alignment.n_num = alignment0.n_num + alignment1.n_num

    union_alignment.strand = strand

    if f_together:
        union_alignment.blocks_num = alignment0.blocks_num + alignment1.blocks_num - 1
    else:
        union_alignment.blocks_num = alignment0.blocks_num + alignment1.blocks_num

    if strand == '+':
        if f_together:
            union_alignment.blocks_sizes = alignment0.blocks_sizes[:-1] + \
                                           [alignment0.blocks_sizes[-1] + alignment1.blocks_sizes[0] - max(cross_bases, 0)] + \
                                           alignment1.blocks_sizes[1:]
        else:
            union_alignment.blocks_sizes = alignment0.blocks_sizes[:-1] + \
                                           [alignment0.blocks_sizes[-1] - max(cross_bases, 0)] + \
                                           alignment1.blocks_sizes[:]
    else:
        if f_together:
            union_alignment.blocks_sizes = alignment1.blocks_sizes[:-1] + \
                                           [alignment1.blocks_sizes[-1] + alignment0.blocks_sizes[0] - max(cross_bases, 0)] + \
                                           alignment0.blocks_sizes[1:]
        else:
            union_alignment.blocks_sizes = alignment1.blocks_sizes[:-1] + \
                                           [alignment1.blocks_sizes[-1] - max(cross_bases, 0)] + \
                                           alignment0.blocks_sizes[:]

    set_union_query_fragment_attributes(alignment0.query_fragment, alignment1.query_fragment,
                                        union_alignment.query_fragment, strand, f_together, cross_bases)

    set_union_target_fragment_attributes(alignment0.target_fragment, alignment1.target_fragment,
                                         union_alignment.target_fragment, strand, f_together, cross_bases)

    return union_alignment