def index(self, genome_dir, genome_fasta, annotation_gtf=None, junction_tab_file=None,
          sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
          genome_size=None):
    """
    Assemble the "--runMode genomeGenerate" option string and pass it to self.execute.

    :param genome_dir: directory for the generated index; created with
                       FileRoutines.safe_mkdir and passed as an absolute path
    :param genome_fasta: a single fasta path (str) or an iterable of fasta paths;
                         every path is converted to an absolute path
    :param annotation_gtf: optional annotation file for --sjdbGTFfile
    :param junction_tab_file: optional splice junction file for --sjdbFileChrStartEnd;
                              either a single path (str) or an iterable of paths,
                              which are passed space-separated
    :param sjdboverhang: number of bases taken from both sides of splice junction.
                         100 by default
    :param genomeSAindexNbases: size of k-mers used for preindexing of suffix array;
                                only used when genome_size is not set
    :param genomeChrBinNbits: padding size (log2) of reference sequences. 18 by default,
                              recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
    :param genome_size: if set, --genomeSAindexNbases is computed as
                        min(14, floor(log2(genome_size) / 2) - 1)
                        and genomeSAindexNbases is ignored
    """
    FileRoutines.safe_mkdir(genome_dir)

    options = "--runMode genomeGenerate"
    options += " --genomeDir %s" % os.path.abspath(genome_dir)
    options += " --runThreadN %i" % self.threads
    options += " --genomeFastaFiles %s" % (os.path.abspath(genome_fasta)
                                           if isinstance(genome_fasta, str)
                                           else " ".join(map(os.path.abspath, genome_fasta)))
    options += " --sjdbGTFfile %s" % annotation_gtf if annotation_gtf else ""

    if junction_tab_file:
        # Callers may hand over a list of junction files; formatting a list with a bare
        # %s would put its Python repr on the command line, so join it explicitly.
        junction_files = junction_tab_file if isinstance(junction_tab_file, str) \
            else " ".join(junction_tab_file)
        options += " --sjdbFileChrStartEnd %s" % junction_files

    # number of bases taken from both sides of splice junction. 100 by default
    options += " --sjdbOverhang %i" % sjdboverhang if sjdboverhang else ""

    if genome_size:
        # genome size takes priority over an explicitly given genomeSAindexNbases
        options += " --genomeSAindexNbases %i" % min([14, (floor(log(genome_size, 2) / 2)) - 1])
    else:
        # size of k-mers used for preindexing of suffix array
        options += " --genomeSAindexNbases %i" % genomeSAindexNbases if genomeSAindexNbases else ""

    # padding size (log2) of reference sequences. 18 by default
    # recommended value min(18, log2(GenomeLength/NumberOfScaffolds))
    options += " --genomeChrBinNbits %i" % genomeChrBinNbits if genomeChrBinNbits else ""

    self.execute(options)
def parallel_blast(self, blast_command, seqfile, database, outfile=None,
                   blast_options=None, split_dir="splited_fasta",
                   splited_output_dir="splited_output_dir",
                   evalue=None, output_format=None, threads=None,
                   num_of_seqs_per_scan=None, combine_output_to_single_file=True,
                   async_run=False, external_process_pool=None):
    """
    Split seqfile into chunks, build one option string per chunk and hand all of them
    to self.parallel_execute with blast_command as the command.

    :param blast_command: command passed to self.parallel_execute as cmd
    :param seqfile: fasta file given to self.split_fasta
    :param database: value of the -db option
    :param outfile: target of CGAS.cat when per-chunk outputs are combined
    :param blast_options: extra options appended verbatim to every command
    :param split_dir: directory for the chunks of seqfile
    :param splited_output_dir: directory for per-chunk "<prefix>.hits" files
    :param evalue: value of the -evalue option (skipped if falsy)
    :param output_format: integer value of the -outfmt option (skipped if falsy)
    :param threads: passed to self.parallel_execute; also used to derive chunk count
    :param num_of_seqs_per_scan: if set, used as the number of chunks; otherwise
                                 5 * threads, or 5 * self.threads if threads is unset
    :param combine_output_to_single_file: concatenate per-chunk outputs into outfile
    :param async_run: forwarded to self.parallel_execute
    :param external_process_pool: forwarded to self.parallel_execute
    """
    # NOTE(review): paths below are glued to file names without a separator, so
    # check_path presumably returns the directory with a trailing slash - confirm.
    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    self.safe_mkdir(splited_dir)
    self.safe_mkdir(splited_out_dir)

    if num_of_seqs_per_scan:
        number_of_files = num_of_seqs_per_scan
    elif threads:
        number_of_files = 5 * threads
    else:
        number_of_files = 5 * self.threads

    self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)

    # (chunk path, hits path) for every chunk, in sorted file name order
    chunk_pairs = [("%s%s" % (splited_dir, chunk_name),
                    "%s%s.hits" % (splited_out_dir, FileRoutines.split_filename(chunk_name)[1]))
                   for chunk_name in sorted(os.listdir(splited_dir))]

    options_list = []
    out_files = []
    for chunk_path, hits_path in chunk_pairs:
        fragments = [" -out %s" % hits_path,
                     " -db %s" % database,
                     " -query %s" % chunk_path]
        if blast_options:
            fragments.append(" %s" % blast_options)
        if evalue:
            fragments.append(" -evalue %s" % evalue)
        if output_format:
            fragments.append(" -outfmt %i" % output_format)

        options_list.append("".join(fragments))
        out_files.append(hits_path)

    self.parallel_execute(options_list, cmd=blast_command, threads=threads,
                          async_run=async_run, external_process_pool=external_process_pool)

    if combine_output_to_single_file:
        CGAS.cat(out_files, output=outfile)
def parallel_align(self, list_of_files, output_directory, output_suffix="alignment",
                   gap_open_penalty=None, offset=None, maxiterate=None, quiet=False,
                   mode="globalpair", number_of_processes=1, anysymbol=False):
    """
    Build one command line per input file and run them via self.parallel_execute.

    Each command redirects its standard output to
    "<output_directory>/<basename>_<output_suffix>.fasta"
    (or "<output_directory>/<basename>.fasta" when output_suffix is empty).

    :param list_of_files: iterable of input file paths
    :param output_directory: directory for the resulting files; created if necessary
    :param output_suffix: suffix appended to the input basename in the output name
    :param gap_open_penalty: value of --op (skipped if None)
    :param offset: value of --ep (skipped if None)
    :param maxiterate: value of --maxiterate (skipped if None)
    :param quiet: add --quiet
    :param mode: added verbatim as "--<mode>"
    :param number_of_processes: passed to self.parallel_execute as threads
    :param anysymbol: add --anysymbol
    """
    # TODO: add rest of options

    # Output is written by shell redirection, which fails if the target directory
    # is missing, so make sure it exists before any command is launched.
    FileRoutines.safe_mkdir(output_directory)

    options = " --thread %i" % self.threads
    options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else ""
    options += " --ep %f" % offset if offset is not None else ""
    options += " --maxiterate %i" % maxiterate if maxiterate is not None else ""
    options += " --quiet" if quiet else ""
    options += " --%s" % mode
    options += " --anysymbol" if anysymbol else ""

    options_list = []
    for filename in list_of_files:
        basename = FileRoutines.split_filename(filename)[1]
        output_name = ("%s_%s" % (basename, output_suffix)) if output_suffix else basename

        op = options
        op += " %s" % filename
        op += " > %s/%s.fasta" % (output_directory, output_name)
        options_list.append(op)

    self.parallel_execute(options_list, threads=number_of_processes)
def parallel_align(self, list_of_files, output_directory, output_suffix=None,
                   tree_file=None, output_format=None, show_xml=None,
                   show_tree=None, show_ancestral_sequences=None,
                   show_evolutionary_events=None, showall=None,
                   compute_posterior_support=None, njtree=None,
                   skip_insertions=False, codon_alignment=None,
                   translated_alignment=None):
    """
    Build one option string per input file and run them via self.parallel_execute.

    The options shared by all files come from self.parse_common_options, which
    receives every keyword argument except list_of_files, output_directory and
    output_suffix. For each file "-d=<file>" and
    "-o=<output_directory>/<basename>[_<output_suffix>].fasta" are appended.

    :param list_of_files: iterable of input file paths
    :param output_directory: directory for the results; created with FileRoutines.safe_mkdir
    :param output_suffix: optional suffix appended to the input basename
    """
    shared_options = self.parse_common_options(
        tree_file=tree_file,
        output_format=output_format,
        show_xml=show_xml,
        show_tree=show_tree,
        show_ancestral_sequences=show_ancestral_sequences,
        show_evolutionary_events=show_evolutionary_events,
        showall=showall,
        compute_posterior_support=compute_posterior_support,
        njtree=njtree,
        skip_insertions=skip_insertions,
        codon_alignment=codon_alignment,
        translated_alignment=translated_alignment)

    FileRoutines.safe_mkdir(output_directory)

    per_file_options = []
    for input_path in list_of_files:
        stem = FileRoutines.split_filename(input_path)[1]
        if output_suffix:
            output_stem = "%s_%s" % (stem, output_suffix)
        else:
            output_stem = stem

        per_file_options.append(shared_options
                                + " -d=%s" % input_path
                                + " -o=%s/%s.fasta" % (output_directory, output_stem))

    self.parallel_execute(per_file_options)
print("Drawing histograms...")

# Every stat file is plotted in place: the file name doubles as the output prefix.
stat_files_to_plot = (
    output_evidence_stats,
    output_supported_stats,
    output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
    output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
    output_swissprot_pfam_or_hints_supported_transcripts_evidence,
    output_swissprot_pfam_and_hints_supported_transcripts_evidence,
)

for stat_file in stat_files_to_plot:
    MatplotlibRoutines.percent_histogram_from_file(stat_file,
                                                   stat_file,
                                                   data_type=None,
                                                   column_list=(2, ),
                                                   comments="#",
                                                   n_bins=20,
                                                   title="Transcript support by hints",
                                                   extensions=("png", "svg"),
                                                   legend_location="upper center",
                                                   stats_as_legend=True)

print("Creating final directories...")

# Final directories are only populated when both databases were supplied.
if args.pfam_db and args.swissprot_db:
    db_or_hints_dir = "supported_by_db_or_hints/"
    db_and_hints_dir = "supported_by_db_and_hints/"

    for directory in db_and_hints_dir, db_or_hints_dir:
        FileRoutines.safe_mkdir(directory)

    # Move the result files into place with shell globbing on the output prefix.
    move_commands = (
        "mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s" % (args.output, db_or_hints_dir),
        "mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s" % (args.output, db_and_hints_dir),
    )
    os.system(move_commands[0])
    os.system(move_commands[1])
def mask(self, list_of_fasta_files, output_dir="./", soft_masking=True, engine="ncbi", search_speed="normal", no_low_complexity=None, only_low_complexity=None, no_interspersed=None, only_interspersed=None, no_rna=None, only_alu=None, custom_library=None, species=None, html_output=False, ace_output=False, gff_output=False): if species and custom_library: tmp_repeat_file = "%s/%s.repeats.tmp.fa" % (output_dir, species) tmp_repeats_all_file = "%s/all.repeats.tmp.fasta" % output_dir self.extract_repeats_from_database(tmp_repeat_file, species=species) cmd = "cat %s %s > %s" % (tmp_repeat_file, custom_library, tmp_repeats_all_file) self.execute(cmd=cmd) options = " -pa %i" % self.threads options += " -e %s" % engine if search_speed == "slow": options += " -s" elif search_speed == "quick": options += " -q" elif search_speed == "rush": options += " -qq" options += " -nolow" if no_low_complexity else "" options += " -low" if only_low_complexity else "" options += " -noint" if no_interspersed else "" options += " -int" if only_interspersed else "" options += " -norna" if no_rna else "" options += " -alu" if only_alu else "" if species and custom_library: options += " -lib %s" % tmp_repeats_all_file elif custom_library: options += " -lib %s" % custom_library if custom_library else "" elif species: options += " -species %s" % species if species else "" options += " -dir %s" % output_dir options += " -html" if html_output else "" options += " -ace" if ace_output else "" options += " -gff" if gff_output else "" options += " -xsmall" if soft_masking else "" options += " " + (list_of_fasta_files if isinstance(list_of_fasta_files, str) else " ".join(FileRoutines.make_list_of_path_to_files(list_of_fasta_files))) self.execute(options=options) """
def parallel_predict(self, species, genome_file, output, strand="both", gene_model=None,
                     output_gff3=True, other_options="", split_dir="splited_input",
                     splited_output_dir="splited_output_dir", config_dir=None,
                     combine_output_to_single_file=True, use_softmasking=None,
                     hints_file=None, extrinsicCfgFile=None, predict_UTR=None,
                     external_process_pool=None, async_run=False, min_intron_len=None,
                     parsing_mode="parse"):
    """
    Split genome_file, build one command per chunk and run them via
    self.parallel_execute; optionally concatenate the per-chunk outputs.

    The shared part of every command comes from self.parse_options, called with an
    empty genome_file so that the chunk path can be appended per command. Each
    command redirects its standard output to "<splited_output_dir><chunk name>.gff".

    :param species: forwarded to self.parse_options
    :param genome_file: fasta file given to self.split_fasta_by_seq_len
    :param output: target of CGAS.cat when per-chunk outputs are combined
    :param split_dir: directory for the chunks of genome_file
    :param splited_output_dir: directory for per-chunk outputs
    :param combine_output_to_single_file: concatenate per-chunk outputs into output
    :param external_process_pool: forwarded to self.parallel_execute
    :param async_run: forwarded to self.parallel_execute
    :param parsing_mode: forwarded to self.split_fasta_by_seq_len
    :param strand, gene_model, output_gff3, other_options, config_dir, use_softmasking,
           hints_file, extrinsicCfgFile, predict_UTR, min_intron_len:
           forwarded unchanged to self.parse_options
    """
    shared_options = self.parse_options(species,
                                        genome_file="",
                                        strand=strand,
                                        gene_model=gene_model,
                                        output_gff3=output_gff3,
                                        other_options=other_options,
                                        config_dir=config_dir,
                                        use_softmasking=use_softmasking,
                                        hints_file=hints_file,
                                        extrinsicCfgFile=extrinsicCfgFile,
                                        predict_UTR=predict_UTR,
                                        min_intron_len=min_intron_len)

    # NOTE(review): directory and file name are concatenated without a separator,
    # so check_path presumably returns a path ending with a slash - confirm.
    splited_dir = FileRoutines.check_path(split_dir)
    splited_out_dir = FileRoutines.check_path(splited_output_dir)
    FileRoutines.safe_mkdir(splited_dir)
    FileRoutines.safe_mkdir(splited_out_dir)

    self.split_fasta_by_seq_len(genome_file, splited_dir, parsing_mode=parsing_mode)

    list_of_output_files = []
    options_list = []
    for chunk_name in sorted(os.listdir(splited_dir)):
        chunk_path = "%s%s" % (splited_dir, chunk_name)
        chunk_output = "%s%s.gff" % (splited_out_dir, chunk_name)

        list_of_output_files.append(chunk_output)
        options_list.append(shared_options + " %s" % chunk_path + " > %s" % chunk_output)

    self.parallel_execute(options_list, external_process_pool=external_process_pool,
                          async_run=async_run)

    if combine_output_to_single_file:
        CGAS.cat(list_of_output_files, output=output)
def align_samples(self, samples_dir, output_dir, genome_dir, genome_fasta=None, samples=None,
                  annotation_gtf=None, sjdboverhang=None,
                  genomeSAindexNbases=None, genomeChrBinNbits=None, genome_size=None,
                  feature_from_gtf_to_use_as_exon=None, exon_tag_to_use_as_transcript_id=None,
                  exon_tag_to_use_as_gene_id=None, length_of_sequences_flanking_junction=None,
                  junction_tab_file_list=None,
                  three_prime_trim=None, five_prime_trim=None, adapter_seq_for_three_prime_clip=None,
                  max_mismatch_percent_for_adapter_trimming=None,
                  three_prime_trim_after_adapter_clip=None,
                  output_type="BAM", sort_bam=True, max_memory_for_bam_sorting=8000000000,
                  include_unmapped_reads_in_bam=True, output_unmapped_reads=True,
                  two_pass_mode=True, max_intron_length=None):
    """
    Optionally build the genome index, then align and index every sample.

    For each sample the reads are taken from "<samples_dir>/<sample>/", aligned with
    STAR.align into "<output_dir>/<sample>/" and the resulting bam file is indexed
    with SamtoolsV1.index.

    :param samples_dir: directory with one subdirectory per sample
    :param output_dir: directory for per-sample alignment directories (created if needed)
    :param genome_dir: directory with the genome index
    :param genome_fasta: if set, the index is (re)built in genome_dir with STAR.index
                         before alignment; annotation_gtf is then used for indexing
                         and is not passed to STAR.align
    :param samples: explicit list of sample names; if not set, the list is taken from
                    self.get_sample_list(samples_dir)
    :param annotation_gtf, sjdboverhang, genomeSAindexNbases, genomeChrBinNbits, genome_size:
           forwarded to STAR.index
    :param junction_tab_file_list: forwarded to both STAR.index and STAR.align
    :param remaining arguments: forwarded unchanged to STAR.align
    """
    # STAR.threads and STAR.path are not set here (the original assignments were
    # commented out), so they have to be configured by the caller.
    if genome_fasta:
        STAR.index(genome_dir, genome_fasta, annotation_gtf=annotation_gtf,
                   junction_tab_file=junction_tab_file_list,
                   sjdboverhang=sjdboverhang, genomeSAindexNbases=genomeSAindexNbases,
                   genomeChrBinNbits=genomeChrBinNbits, genome_size=genome_size)

    sample_list = samples if samples else self.get_sample_list(samples_dir)

    FileRoutines.safe_mkdir(output_dir)

    for sample in sample_list:
        print("Handling %s" % sample)
        sample_dir = "%s/%s/" % (samples_dir, sample)
        alignment_sample_dir = "%s/%s/" % (output_dir, sample)
        FileRoutines.safe_mkdir(alignment_sample_dir)

        # only forward and reverse files are used below
        filetypes, forward_files, reverse_files, se_files = \
            FileRoutines.make_lists_forward_and_reverse_files(sample_dir)

        # print() call form works on both Python 2 and 3 and matches the call above
        print("\tAligning reads...")

        STAR.align(genome_dir, forward_files, reverse_read_list=reverse_files,
                   # annotation was already built into the index if it was generated above
                   annotation_gtf=annotation_gtf if not genome_fasta else None,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type, sort_bam=sort_bam,
                   max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=alignment_sample_dir,
                   two_pass_mode=two_pass_mode, max_intron_length=max_intron_length)

        print("\tIndexing bam file...")
        # NOTE(review): the file name is hard-coded for coordinate-sorted BAM output;
        # presumably it does not exist when sort_bam is False or output_type is not
        # "BAM" - confirm.
        resulting_bam_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
        SamtoolsV1.index(resulting_bam_file)