Example #1
0
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):
        """Run a BLAST search in parallel by splitting the query FASTA.

        The query file is split into chunks, one BLAST process is launched
        per chunk via parallel_execute, and the per-chunk .hits files are
        optionally concatenated into `outfile`.

        Note: `num_of_seqs_per_scan` is passed to split_fasta as the number
        of files to produce — presumably intentional, but the naming is
        confusing; confirm against split_fasta's contract.
        """
        chunk_input_dir = check_path(split_dir)
        chunk_output_dir = check_path(splited_output_dir)
        save_mkdir(chunk_input_dir)
        save_mkdir(chunk_output_dir)

        # Chunk count: explicit request wins; otherwise five chunks per thread.
        if num_of_seqs_per_scan:
            chunk_count = num_of_seqs_per_scan
        elif threads:
            chunk_count = 5 * threads
        else:
            chunk_count = 5 * self.threads

        self.split_fasta(seqfile, chunk_input_dir, num_of_files=chunk_count)

        options_list = []
        out_files = []
        for chunk_name in sorted(os.listdir(chunk_input_dir)):
            chunk_prefix = split_filename(chunk_name)[1]
            chunk_path = "%s%s" % (chunk_input_dir, chunk_name)
            hits_path = "%s%s.hits" % (chunk_output_dir, chunk_prefix)

            # Assemble the per-chunk BLAST command-line options.
            opts = " -out %s -db %s -query %s" % (hits_path, database,
                                                  chunk_path)
            if blast_options:
                opts += " %s" % blast_options
            if evalue:
                opts += " -evalue %s" % evalue
            if output_format:
                opts += " -outfmt %i" % output_format

            options_list.append(opts)
            out_files.append(hits_path)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
Example #2
0
    def parallel_convert(self,
                         list_of_files,
                         output_directory,
                         output_format=".tiff"):
        """Convert a list of files in parallel.

        For each input file one option string "<input> <outdir><prefix><ext>"
        is built and handed to parallel_execute; the actual converter command
        is whatever parallel_execute is configured with.
        """
        save_mkdir(output_directory)

        options_list = [
            " %s %s%s%s" % (source_file,
                            check_path(output_directory),
                            split_filename(source_file)[1],
                            output_format)
            for source_file in list_of_files
        ]

        self.parallel_execute(options_list)
Example #3
0
#parser.add_argument("-t", "--threads", action="store", dest="threads", default=1, type=int,
#                    help="Number of threads to use in Trimmomatic. Default - 1.")
# NOTE(review): the "Works only if ..." clause below references this option's
# own flag (-q); presumably it should name the sliding-window option instead —
# confirm against the flag definitions earlier in this script.
parser.add_argument("-q", "--average_quality_threshold", action="store", dest="average_quality_threshold", default=15,
                    type=int,
                    help="Quality threshold for sliding window. Works only if -q/--average_quality_threshold is set. "
                         "Default - 15.")
parser.add_argument("-u", "--score_type", action="store", dest="score_type", default="phred64",
                    help="Phred quality score type. Allowed: phred33, phred64. Default: phred64")
parser.add_argument("-n", "--name_type", action="store", dest="name_type", default="short",
                    help="Type of read name. Required to gather per tile filtering statistics. Default: short")
"""
args = parser.parse_args()

# Sample ids come either from an explicit comma-separated list or, failing
# that, from the (sorted, hence deterministic) contents of the samples dir.
samples = args.samples.split(",") if args.samples else sorted(
    os.listdir(args.samples_dir))
save_mkdir(args.output_dir)

# Per-run summary TSV aggregating filtering statistics over all samples.
overall_stat_file = "%s/overall_samples.stat" % args.output_dir
# NOTE(review): descriptor is presumably closed after the per-sample loop —
# not visible in this chunk; confirm.
overall_stat_fd = open(overall_stat_file, "w")
overall_stat_fd.write(
    "#Sample_id\tTotal_pairs\tRetained_pairs\tRetained_pairs_percent\tMin_pairs_retained_in_tiles\n"
)
for sample in samples:
    print("Handling %s" % sample)

    sample_dir = "%s%s/" % (args.samples_dir, sample)

    sample_out_dir = "%s%s/" % (args.output_dir, sample)
    save_mkdir(sample_out_dir)
    files_from_sample_dir = sorted(os.listdir(sample_dir))
Example #4
0
    action="store",
    dest="suffix",
    default=".gaps_removed",
    help="Suffix to use in output files. Default: '.gaps_removed'")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="fasta",
                    help="Format of alignment")
parser.add_argument("-v",
                    "--verbose",
                    action="store_true",
                    dest="verbose",
                    help="Print not found ids. Default - no")

args = parser.parse_args()

save_mkdir(args.output)

# For every input alignment: read it, drop columns containing too many gap
# symbols, and write the filtered alignment next to the original name with
# the configured suffix inserted before the extension.
# NOTE(review): args.output is concatenated directly with the filename —
# presumably it is normalized to end with "/" elsewhere (check_path); confirm.
for alignment_file in args.input:
    splited_filename = split_filename(alignment_file)
    if args.verbose:
        print("Handling %s ..." % alignment_file)
    output_filename = "%s%s%s%s" % (args.output, splited_filename[1],
                                    args.suffix, splited_filename[2])
    alignment = AlignIO.read(alignment_file, args.format)
    filtered_alignment = MultipleAlignmentRoutines.remove_columns_with_gaps(
        alignment, args.max_gap_number, gap_symbol=args.gap_symbol)
    AlignIO.write(filtered_alignment, output_filename, args.format)
Example #5
0
args = parser.parse_args()

# Configure the Trimmomatic wrapper class from CLI options.
Trimmomatic.jar_path = args.path_to_trimmomatic_dir
Trimmomatic.threads = args.threads
# Sample ids: explicit comma-separated list, otherwise the samples directory
# listing. sorted() makes the processing order deterministic (os.listdir
# order is filesystem-dependent) and matches the sibling script that builds
# the same list.
samples = args.samples.split(",") if args.samples else sorted(
    os.listdir(args.samples_dir))

for sample in samples:
    print("Handling %s" % sample)

    sample_dir = "%s%s/" % (args.samples_dir, sample)

    sample_out_dir = "%s%s/" % (args.output_dir, sample)
    save_mkdir(sample_out_dir)
    trimmomatic_log = "%s/trimmomatic.log" % sample_out_dir
    trimmomatic_time_log = "%s/trimmomatic.time.log" % sample_out_dir
    output_prefix = "%s%s.TMF" % (sample_out_dir, sample)

    files_from_sample_dir = os.listdir(sample_dir)

    left_reads_file = None
    right_reads_file = None

    for filename in files_from_sample_dir:
        if ("_1.fq" in filename) or ("_1.fastq" in filename):
            left_reads_file = filename
        elif ("_2.fq" in filename) or ("_2.fastq" in filename):
            right_reads_file = filename
    if (left_reads_file is None) and (right_reads_file is None):
Example #6
0
    "-s",
    "--store_logs",
    action="store_true",
    dest="store_logs",
    default=False,
    help="Store download logs in directory set by -g/--logs_dir option")
parser.add_argument("-g",
                    "--logs_dir",
                    action="store",
                    dest="logs_dir",
                    default="logs",
                    type=check_path,
                    help="Directory with logs")
args = parser.parse_args()

save_mkdir(args.output_dir)
save_mkdir(args.logs_dir)

# If no specific artifact type was requested, fetch everything.
if (not args.alignment) and (not args.tree) and (not args.hmm):
    args.all = True

# Family ids are read either from stdin or from the given file.
in_fd = sys.stdin if args.input == "stdin" else open(args.input, "r")

family_ids = IdList()
family_ids.read(in_fd)

# Only close descriptors we opened ourselves (never stdin).
if args.input != "stdin":
    in_fd.close()

# Accumulators for families whose artifacts turn out to be missing.
absent_alignment_list = IdList()
absent_tree_list = IdList()
Example #7
0
    def parallel_hmmscan(self,
                         hmmfile,
                         seqfile,
                         outfile,
                         num_of_seqs_per_scan=None,
                         split_dir="splited_fasta",
                         splited_output_dir="splited_output_dir",
                         tblout_outfile=None,
                         domtblout_outfile=None,
                         pfamtblout_outfile=None,
                         splited_tblout_dir=None,
                         splited_domtblout_dir=None,
                         splited_pfamtblout_dir=None,
                         dont_output_alignments=False,
                         model_evalue_threshold=None,
                         model_score_threshold=None,
                         domain_evalue_threshold=None,
                         domain_score_threshold=None,
                         model_evalue_significant_threshold=None,
                         model_score_significant_threshold=None,
                         domain_evalue_significant_threshold=None,
                         domain_score_significant_threshold=None,
                         use_profile_GA_gathering_cutoffs_for_thresholds=False,
                         use_profile_NC_noise_cutoffs_for_thresholds=False,
                         use_profile_TC_trusted_cutoffs_for_thresholds=False,
                         turn_off_all_heruristics=False,
                         turn_off_bias_filter=False,
                         MSV_threshold=None,
                         Vit_threshold=None,
                         Fwd_threshold=None,
                         turn_off_biased_composition_score_corrections=None,
                         input_format=None,
                         threads=None,
                         combine_output_to_single_file=True,
                         biopython_165_compartibility=False,
                         remove_tmp_dirs=True,
                         async_run=False,
                         external_process_pool=None):
        """Run hmmscan in parallel by splitting the input FASTA into chunks.

        Each chunk is scanned against `hmmfile` by a separate hmmscan
        process; per-chunk main/tblout/domtblout/pfamtblout outputs are
        optionally concatenated into the corresponding *_outfile and the
        temporary directories removed afterwards.

        Note: `num_of_seqs_per_scan` is forwarded to split_fasta as the
        number of files to produce — presumably intentional, but confirm
        against split_fasta's contract.
        """
        splited_dir = check_path(split_dir)
        splited_out_dir = check_path(splited_output_dir)
        save_mkdir(splited_dir)
        save_mkdir(splited_out_dir)

        # Optional per-chunk table output directories.
        if splited_tblout_dir:
            save_mkdir(splited_tblout_dir)
        if splited_domtblout_dir:
            save_mkdir(splited_domtblout_dir)
        if splited_pfamtblout_dir:
            save_mkdir(splited_pfamtblout_dir)

        # Chunk count: explicit request wins; otherwise five chunks per thread.
        number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan else 5 * threads if threads else 5 * self.threads
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_files = []

        for filename in input_list_of_files:
            filename_prefix = split_filename(filename)[1]

            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
            tblout_file = "%s%s.hits" % (splited_tblout_dir, filename_prefix
                                         ) if splited_tblout_dir else None
            domtblout_file = "%s%s.hits" % (
                splited_domtblout_dir,
                filename_prefix) if splited_domtblout_dir else None
            pfamtblout_file = "%s%s.hits" % (
                splited_pfamtblout_dir,
                filename_prefix) if splited_pfamtblout_dir else None

            list_of_files.append((input_file, output_file, tblout_file,
                                  domtblout_file, pfamtblout_file))

        # Options shared by every chunk; per-chunk table files are added below.
        common_options = self.__parse_hmmsxxx_common_options(
            tblout=None,
            domtblout=None,
            pfamtblout=None,
            dont_output_alignments=dont_output_alignments,
            model_evalue_threshold=model_evalue_threshold,
            model_score_threshold=model_score_threshold,
            domain_evalue_threshold=domain_evalue_threshold,
            domain_score_threshold=domain_score_threshold,
            model_evalue_significant_threshold=
            model_evalue_significant_threshold,
            model_score_significant_threshold=model_score_significant_threshold,
            domain_evalue_significant_threshold=
            domain_evalue_significant_threshold,
            domain_score_significant_threshold=
            domain_score_significant_threshold,
            use_profile_GA_gathering_cutoffs_for_thresholds=
            use_profile_GA_gathering_cutoffs_for_thresholds,
            use_profile_NC_noise_cutoffs_for_thresholds=
            use_profile_NC_noise_cutoffs_for_thresholds,
            use_profile_TC_trusted_cutoffs_for_thresholds=
            use_profile_TC_trusted_cutoffs_for_thresholds,
            turn_off_all_heruristics=turn_off_all_heruristics,
            turn_off_bias_filter=turn_off_bias_filter,
            MSV_threshold=MSV_threshold,
            Vit_threshold=Vit_threshold,
            Fwd_threshold=Fwd_threshold,
            turn_off_biased_composition_score_corrections=
            turn_off_biased_composition_score_corrections)

        # BUG FIX: input_format was previously never interpolated, so a
        # literal "--qformat %s" ended up on the hmmscan command line.
        common_options += " --qformat %s" % input_format if input_format else ""
        options_list = []
        out_files = []
        tblout_files = []
        domtblout_files = []
        pfamtblout_files = []

        for in_file, out_filename, tblout_file, domtblout_file, pfamtblout_file in list_of_files:
            options = common_options

            options += " --tblout %s" % tblout_file if tblout_file else ""
            options += " --domtblout %s" % domtblout_file if domtblout_file else ""
            options += " --pfamtblout %s" % pfamtblout_file if pfamtblout_file else ""
            options += " -o %s" % out_filename

            options += " %s" % hmmfile
            options += " %s" % in_file

            options_list.append(options)
            out_files.append(out_filename)
            tblout_files.append(tblout_file)
            domtblout_files.append(domtblout_file)
            pfamtblout_files.append(pfamtblout_file)

        self.parallel_execute(options_list,
                              cmd="hmmscan",
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            if biopython_165_compartibility:
                # Old Biopython chokes on real description lines; blank them.
                CGAS.cgas(
                    out_files,
                    sed_string=
                    "s/^Description:.*/Description: <unknown description>/",
                    output=outfile)
            else:
                CGAS.cat(out_files, output=outfile)
        if tblout_outfile:
            CGAS.cat(tblout_files, output=tblout_outfile)
        if domtblout_outfile:
            CGAS.cat(domtblout_files, output=domtblout_outfile)
        if pfamtblout_outfile:
            CGAS.cat(pfamtblout_files, output=pfamtblout_outfile)

        if remove_tmp_dirs:
            if splited_tblout_dir:
                shutil.rmtree(splited_tblout_dir)
            if splited_domtblout_dir:
                shutil.rmtree(splited_domtblout_dir)
            if splited_pfamtblout_dir:
                shutil.rmtree(splited_pfamtblout_dir)
            for tmp_dir in splited_dir, splited_out_dir:
                shutil.rmtree(tmp_dir)
parser.add_argument("-d",
                    "--top_hits_dir",
                    action="store",
                    dest="top_hits_dir",
                    default="top_hits_dir/",
                    type=check_path,
                    help="Directory to write intermediate(splited) output")
parser.add_argument("-r",
                    "--retain_splited_output",
                    action="store_true",
                    dest="retain",
                    help="Retain splited output")

args = parser.parse_args()

# Directory for per-file top-hit tables produced by handle_input below.
save_mkdir(args.top_hits_dir)


def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()

    prefix = split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        out_fd.write("#query\thit\tevalue\tbitscore\n")
Example #9
0
    def parallel_alignment(self,
                           query_file,
                           target_file,
                           model,
                           num_of_recs_per_file=None,
                           show_alignment=None,
                           show_sugar=True,
                           show_cigar=None,
                           show_vulgar=None,
                           show_query_gff=None,
                           show_target_gff=None,
                           store_intermediate_files=False,
                           splited_fasta_dir="splited_fasta_dir",
                           splited_result_dir="splited_output",
                           number_of_results_to_report=None,
                           other_options=None,
                           num_of_files=None,
                           converted_output_dir="converted_output"):
        """Align query sequences against a target in parallel.

        The query FASTA is split into chunks; one aligner process runs per
        chunk, with its stdout shell-redirected into a per-chunk ".output"
        file under `splited_result_dir`.
        """
        query_prefix = split_filename(query_file)[1]
        self.split_fasta(query_file,
                         splited_fasta_dir,
                         num_of_recs_per_file=num_of_recs_per_file,
                         num_of_files=num_of_files,
                         output_prefix=query_prefix)

        common_options = self.parse_common_options(
            model,
            show_alignment=show_alignment,
            show_sugar=show_sugar,
            show_cigar=show_cigar,
            show_vulgar=show_vulgar,
            show_query_gff=show_query_gff,
            show_target_gff=show_target_gff,
            number_of_results_to_report=number_of_results_to_report,
            other_options=other_options)

        chunk_files = os.listdir(splited_fasta_dir)

        save_mkdir(splited_result_dir)
        #save_mkdir(converted_output_dir)

        # One command line per chunk; output goes through shell redirection.
        options_list = [
            "%s -q %s/%s -t %s > %s/%s.output" % (common_options,
                                                  splited_fasta_dir,
                                                  chunk_name,
                                                  target_file,
                                                  splited_result_dir,
                                                  split_filename(chunk_name)[1])
            for chunk_name in chunk_files
        ]

        self.parallel_execute(options_list)
        """
        for filename in splited_files:

            trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (splited_result_dir, filename,
                                                                  matching_weight, mismatching_penalty,
                                                                  indel_penalty, match_probability,
                                                                  indel_probability,
                                                                  min_alignment_score, max_period)

            self.convert_trf_report(trf_output_file, "%s/%s" % (converted_output_dir, filename))

        for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab"):
            file_str = ""
            merged_file = "%s%s" % (output_prefix, suffix)
            for filename in splited_files:
                file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
            CGAS.cat(file_str, merged_file)
        """
        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
Example #10
0
    def parallel_search_tandem_repeat(self,
                                      query_file,
                                      output_prefix,
                                      matching_weight=2,
                                      mismatching_penalty=7,
                                      indel_penalty=7,
                                      match_probability=80,
                                      indel_probability=10,
                                      min_alignment_score=50,
                                      max_period=500,
                                      report_flanking_sequences=False,
                                      splited_fasta_dir="splited_fasta_dir",
                                      splited_result_dir="splited_output",
                                      converted_output_dir="converted_output",
                                      max_len_per_file=100000,
                                      store_intermediate_files=False):
        """Search tandem repeats (TRF) in parallel over a split FASTA.

        The query is split by sequence length, TRF runs per chunk inside
        `splited_result_dir` (TRF writes to the current directory, hence the
        chdir), per-chunk .dat reports are converted, and the converted
        pieces are merged into `output_prefix`-based files.
        """
        work_dir = os.getcwd()
        splited_filename = split_filename(query_file)
        self.split_fasta_by_seq_len(query_file,
                                    splited_fasta_dir,
                                    max_len_per_file=max_len_per_file,
                                    output_prefix=splited_filename[1])

        common_options = self.parse_common_options(
            matching_weight=matching_weight,
            mismatching_penalty=mismatching_penalty,
            indel_penalty=indel_penalty,
            match_probability=match_probability,
            indel_probability=indel_probability,
            min_alignment_score=min_alignment_score,
            max_period=max_period,
            report_flanking_sequences=report_flanking_sequences,
            make_dat_file=True)
        common_options += " -h"  # suppress html output
        options_list = []
        splited_files = os.listdir(splited_fasta_dir)

        save_mkdir(splited_result_dir)
        save_mkdir(converted_output_dir)
        os.chdir(splited_result_dir)
        # BUG FIX: restore the working directory even if command construction
        # or parallel_execute raises; previously a failure here left the
        # process stranded inside splited_result_dir.
        try:
            # Input paths must stay valid after the chdir: absolute and
            # home-relative paths are kept, everything else becomes "../...".
            input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
                        else "../%s" % splited_fasta_dir

            for filename in splited_files:
                file_options = "%s/%s" % (input_dir, filename)
                file_options += common_options
                options_list.append(file_options)

            self.parallel_execute(options_list)
        finally:
            os.chdir(work_dir)

        for filename in splited_files:
            # TRF encodes all scoring parameters in its output file name.
            trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (
                splited_result_dir, filename, matching_weight,
                mismatching_penalty, indel_penalty, match_probability,
                indel_probability, min_alignment_score, max_period)

            self.convert_trf_report(trf_output_file,
                                    "%s/%s" % (converted_output_dir, filename))

        # Merge the converted per-chunk reports, one merged file per suffix.
        # NOTE(review): CGAS.cat is called here with a space-joined string and
        # a positional output argument, unlike the list + output= keyword used
        # elsewhere in this file — confirm CGAS.cat accepts both forms.
        for suffix in (".rep", ".gff", ".simple.gff", ".short.tab",
                       ".wide.tab"):
            file_str = ""
            merged_file = "%s%s" % (output_prefix, suffix)
            for filename in splited_files:
                file_str += " %s/%s%s" % (converted_output_dir, filename,
                                          suffix)
            CGAS.cat(file_str, merged_file)

        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
            shutil.rmtree(splited_result_dir)
            shutil.rmtree(converted_output_dir)
Example #11
0
                    action="store",
                    dest="hmmer_dir",
                    default="",
                    help="Directory with hmmer v3.1 binaries")
args = parser.parse_args()

# Reconstruct "<name><ext>" of the input file (no directory component).
input_filename_list = split_filename(args.input)
input_filename = input_filename_list[1] + input_filename_list[2]

# TransDecoder's working directory layout for the longest-ORF peptides.
workdir_dir = "%s.transdecoder_dir/" % input_filename
pep_from_longest_orfs = "%s/longest_orfs.pep" % workdir_dir

hmmscan_dir = "hmmscan_vs_pfam/"
blastp_dir = "blastp_vs_uniref/"

save_mkdir(hmmscan_dir)
save_mkdir(blastp_dir)

# Paths for the hmmscan-vs-Pfam stage (only produced when a Pfam database
# was supplied).
hmmscan_splited_fasta_dir = "%ssplited_fasta_dir/" % hmmscan_dir
splited_domtblout_dir = "%ssplited_domtblout_dir/" % hmmscan_dir
hmmscan_vs_pfam_output = "%s%s.pfam.hits" % (hmmscan_dir, input_filename)
domtblout_outfile = "%s%s.pfam.domtblout" % (
    hmmscan_dir, input_filename) if args.pfam_database else None

# Paths for the blastp-vs-UniRef stage (only when a BLAST database is set).
blastp_outfile = "%s%s.blastp.hits" % (
    blastp_dir, input_filename) if args.blast_database else None
blastp_split_dir = "%ssplited_fasta_dir/" % blastp_dir
blastp_splited_output_dir = "%ssplited_output_dir" % blastp_dir
# Configure the wrapper classes' shared binary path / thread counts.
HMMER3.path = args.hmmer_dir
HMMER3.threads = args.threads
BLASTp.threads = args.threads