Example #1
0
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):
        """
        Split a fasta file into chunks and run a BLAST command on every chunk.

        :param blast_command: command handed to self.parallel_execute as cmd
        :param seqfile: fasta file that is split with self.split_fasta
        :param database: value of the -db option
        :param outfile: target of the combined output (passed to CGAS.cat)
        :param blast_options: extra option string appended verbatim, if set
        :param split_dir: directory receiving the chunks of seqfile
        :param splited_output_dir: directory receiving per-chunk .hits files
        :param evalue: value of the -evalue option, added only if truthy
        :param output_format: value of the -outfmt option (formatted with %i),
                              added only if truthy
        :param threads: forwarded to self.parallel_execute; also used to
                        derive the number of chunks (5 * threads)
        :param num_of_seqs_per_scan: if truthy, passed to self.split_fasta as
                                     num_of_files instead of the derived value
        :param combine_output_to_single_file: if True, per-chunk outputs are
                                              concatenated with CGAS.cat
        :param async_run: forwarded to self.parallel_execute
        :param external_process_pool: forwarded to self.parallel_execute
        """
        chunk_dir = check_path(split_dir)
        chunk_out_dir = check_path(splited_output_dir)
        for directory in (chunk_dir, chunk_out_dir):
            save_mkdir(directory)

        # Number of chunks: explicit value first, then 5 per requested thread,
        # finally 5 per default thread of the tool object.
        if num_of_seqs_per_scan:
            chunk_count = num_of_seqs_per_scan
        elif threads:
            chunk_count = 5 * threads
        else:
            chunk_count = 5 * self.threads
        self.split_fasta(seqfile, chunk_dir, num_of_files=chunk_count)

        options_list = []
        out_files = []

        for chunk_name in sorted(os.listdir(chunk_dir)):
            query_path = "%s%s" % (chunk_dir, chunk_name)
            hits_path = "%s%s.hits" % (chunk_out_dir,
                                       split_filename(chunk_name)[1])

            pieces = [" -out %s" % hits_path,
                      " -db %s" % database,
                      " -query %s" % query_path]
            if blast_options:
                pieces.append(" %s" % blast_options)
            if evalue:
                pieces.append(" -evalue %s" % evalue)
            if output_format:
                pieces.append(" -outfmt %i" % output_format)

            options_list.append("".join(pieces))
            out_files.append(hits_path)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
Example #2
0
 def write_splited(self, out_dir="./", extension="t", value_separator=","):
     """
     Write every first-level entry of this two-level mapping to its own file.

     The file for a first-level key is named
     "<check_path(out_dir)><key>.<extension>". Each line holds a second-level
     key, a tab, and the values of that key joined by value_separator.
     """
     from Routines.File import check_path

     for fl_key in self:
         target_path = "%s%s.%s" % (check_path(out_dir), fl_key, extension)
         with open(target_path, "w") as out_fd:
             second_level = self[fl_key]
             for sl_key in second_level:
                 joined_values = value_separator.join(second_level[sl_key])
                 out_fd.write("%s\t%s\n" % (sl_key, joined_values))
Example #3
0
    def parallel_convert(self,
                         list_of_files,
                         output_directory,
                         output_format=".tiff"):
        """
        Build one "<input> <output>" option string per file and run them all
        through self.parallel_execute.

        :param list_of_files: input files to convert
        :param output_directory: directory for converted files (created with
                                 save_mkdir)
        :param output_format: suffix appended to the input file prefix to
                              form the output filename
        """
        save_mkdir(output_directory)

        # Output name = output directory + prefix of the input file + suffix.
        options_list = [
            " %s %s%s%s" % (source,
                            check_path(output_directory),
                            split_filename(source)[1],
                            output_format)
            for source in list_of_files
        ]

        self.parallel_execute(options_list)
Example #4
0
import os

import argparse

import numpy as np

from Routines.File import check_path, save_mkdir

parser = argparse.ArgumentParser()

# Sample directory is normalised at parse time: made absolute, then passed
# through check_path.
parser.add_argument("-d",
                    "--sample_directory",
                    action="store",
                    dest="samples_dir",
                    required=True,
                    type=lambda s: check_path(os.path.abspath(s)),
                    help="Directory with samples")
# The help text is assembled from adjacent literals, so every fragment must
# end with a space; otherwise sentences run together in the --help output.
parser.add_argument(
    "-s",
    "--samples",
    action="store",
    dest="samples",
    help="Comma-separated list of subdirectories (one per sample) to handle. "
    "If not set, all subdirectories will be considered as containing samples. "
    "Each sample directory should contain one (in case of SE reads) or two "
    "(in case of PE reads) files. "
    "Filenames should contain '_1.fq' or '_1.fastq' for forward (left) reads, "
    "'_2.fq' or '_2.fastq' for reverse (right) reads and '.fq' or '.fastq' for SE reads."
)
parser.add_argument(
    "-o",
    "--output_dir",
Example #5
0
                    help="Input file with left reads")
# Remaining command line options: right reads, output directory and the
# minimum read length to keep.
parser.add_argument("-r", "--input_right", action="store", dest="input_right",
                    help="Input file with right reads")
parser.add_argument("-o", "--out_dir", action="store", dest="out_dir", default="./",
                    help="Directory to write output")
parser.add_argument("-m", "--min_len", action="store", dest="min_len", type=int, default=1,
                    help="Minimum length of read to output")

args = parser.parse_args()
# Matches a run of one or more 'N' characters at the very end of a sequence.
n_regexp = re.compile("N+$")

# Single-end mode: filter the SE input into "<out_dir><prefix>.filtered<ext>".
if args.input_se:

    se_directory, se_prefix, se_extension = split_filename(args.input_se)
    se_in_fd = open(args.input_se, "r")
    se_out_file = "%s%s.filtered%s" % (check_path(args.out_dir), se_prefix, se_extension)
    se_out_fd = open(se_out_file, "w")

    while True:
        name, sequence, separator, quality = read_entry(se_in_fd)
        # read_entry signals the end of input by returning None as the name.
        if name is None:
            break
        match = n_regexp.search(sequence)
        if match is None:
            # No trailing N run: the entry is written unchanged.
            se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence, separator, quality))
        elif match.start() >= args.min_len:
            # Trailing N run starts at or after min_len: write a truncated entry.
            # NOTE(review): the slice end is match.start()+1, which keeps the first
            # 'N' of the trailing run, while the length test above uses
            # match.start() - possible off-by-one, confirm the intended behaviour.
            se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence[:match.start()+1], separator, quality[:match.start()+1]))
        else:
            # Read would be shorter than min_len after trimming: drop it.
            continue

    # NOTE(review): se_out_fd is not closed within the visible part of the
    # script - verify that it is closed further below.
    se_in_fd.close()
Example #6
0
    def parallel_hmmscan(self,
                         hmmfile,
                         seqfile,
                         outfile,
                         num_of_seqs_per_scan=None,
                         split_dir="splited_fasta",
                         splited_output_dir="splited_output_dir",
                         tblout_outfile=None,
                         domtblout_outfile=None,
                         pfamtblout_outfile=None,
                         splited_tblout_dir=None,
                         splited_domtblout_dir=None,
                         splited_pfamtblout_dir=None,
                         dont_output_alignments=False,
                         model_evalue_threshold=None,
                         model_score_threshold=None,
                         domain_evalue_threshold=None,
                         domain_score_threshold=None,
                         model_evalue_significant_threshold=None,
                         model_score_significant_threshold=None,
                         domain_evalue_significant_threshold=None,
                         domain_score_significant_threshold=None,
                         use_profile_GA_gathering_cutoffs_for_thresholds=False,
                         use_profile_NC_noise_cutoffs_for_thresholds=False,
                         use_profile_TC_trusted_cutoffs_for_thresholds=False,
                         turn_off_all_heruristics=False,
                         turn_off_bias_filter=False,
                         MSV_threshold=None,
                         Vit_threshold=None,
                         Fwd_threshold=None,
                         turn_off_biased_composition_score_corrections=None,
                         input_format=None,
                         threads=None,
                         combine_output_to_single_file=True,
                         biopython_165_compartibility=False,
                         remove_tmp_dirs=True,
                         async_run=False,
                         external_process_pool=None):
        """
        Split a fasta file into chunks and run hmmscan on every chunk.

        :param hmmfile: HMM database given to hmmscan
        :param seqfile: fasta file that is split with self.split_fasta
        :param outfile: target of the combined main output
        :param num_of_seqs_per_scan: if truthy, passed to self.split_fasta as
                                     num_of_files; otherwise 5 * threads
                                     (or 5 * self.threads) is used
        :param split_dir: directory receiving the chunks of seqfile
        :param splited_output_dir: directory receiving per-chunk .hits files
        :param tblout_outfile: if set, per-chunk --tblout files are
                               concatenated into this file
        :param domtblout_outfile: same for --domtblout files
        :param pfamtblout_outfile: same for --pfamtblout files
        :param splited_tblout_dir: directory for per-chunk --tblout files;
                                   the option is only used when this is set
        :param splited_domtblout_dir: same for --domtblout
        :param splited_pfamtblout_dir: same for --pfamtblout
        :param input_format: value of the --qformat option, if set
        :param threads: forwarded to self.parallel_execute
        :param combine_output_to_single_file: concatenate per-chunk main
                                              outputs into outfile
        :param biopython_165_compartibility: if True, the combined output is
                                             produced with CGAS.cgas, which
                                             rewrites "Description:" lines
                                             via a sed expression
        :param remove_tmp_dirs: remove all split directories afterwards
        :param async_run: forwarded to self.parallel_execute
        :param external_process_pool: forwarded to self.parallel_execute

        All threshold/heuristic parameters are forwarded unchanged to
        self.__parse_hmmsxxx_common_options.
        """

        def chunk_table_path(table_dir, prefix):
            # os.path.join keeps the file inside table_dir whether or not the
            # caller supplied a trailing path separator.
            if not table_dir:
                return None
            return os.path.join(table_dir, "%s.hits" % prefix)

        splited_dir = check_path(split_dir)
        splited_out_dir = check_path(splited_output_dir)
        save_mkdir(splited_dir)
        save_mkdir(splited_out_dir)

        for table_dir in (splited_tblout_dir,
                          splited_domtblout_dir,
                          splited_pfamtblout_dir):
            if table_dir:
                save_mkdir(table_dir)

        if num_of_seqs_per_scan:
            number_of_files = num_of_seqs_per_scan
        elif threads:
            number_of_files = 5 * threads
        else:
            number_of_files = 5 * self.threads
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)

        # Table outputs are set per chunk below, so None is passed here.
        common_options = self.__parse_hmmsxxx_common_options(
            tblout=None,
            domtblout=None,
            pfamtblout=None,
            dont_output_alignments=dont_output_alignments,
            model_evalue_threshold=model_evalue_threshold,
            model_score_threshold=model_score_threshold,
            domain_evalue_threshold=domain_evalue_threshold,
            domain_score_threshold=domain_score_threshold,
            model_evalue_significant_threshold=
            model_evalue_significant_threshold,
            model_score_significant_threshold=model_score_significant_threshold,
            domain_evalue_significant_threshold=
            domain_evalue_significant_threshold,
            domain_score_significant_threshold=
            domain_score_significant_threshold,
            use_profile_GA_gathering_cutoffs_for_thresholds=
            use_profile_GA_gathering_cutoffs_for_thresholds,
            use_profile_NC_noise_cutoffs_for_thresholds=
            use_profile_NC_noise_cutoffs_for_thresholds,
            use_profile_TC_trusted_cutoffs_for_thresholds=
            use_profile_TC_trusted_cutoffs_for_thresholds,
            turn_off_all_heruristics=turn_off_all_heruristics,
            turn_off_bias_filter=turn_off_bias_filter,
            MSV_threshold=MSV_threshold,
            Vit_threshold=Vit_threshold,
            Fwd_threshold=Fwd_threshold,
            turn_off_biased_composition_score_corrections=
            turn_off_biased_composition_score_corrections)

        # The format value has to be interpolated; previously the literal
        # "%s" ended up on the command line.
        if input_format:
            common_options += " --qformat %s" % input_format

        options_list = []
        out_files = []
        tblout_files = []
        domtblout_files = []
        pfamtblout_files = []

        for filename in sorted(os.listdir(splited_dir)):
            filename_prefix = split_filename(filename)[1]

            in_file = "%s%s" % (splited_dir, filename)
            out_filename = "%s%s.hits" % (splited_out_dir, filename_prefix)
            tblout_file = chunk_table_path(splited_tblout_dir,
                                           filename_prefix)
            domtblout_file = chunk_table_path(splited_domtblout_dir,
                                              filename_prefix)
            pfamtblout_file = chunk_table_path(splited_pfamtblout_dir,
                                               filename_prefix)

            options = common_options
            if tblout_file:
                options += " --tblout %s" % tblout_file
            if domtblout_file:
                options += " --domtblout %s" % domtblout_file
            if pfamtblout_file:
                options += " --pfamtblout %s" % pfamtblout_file
            options += " -o %s" % out_filename

            # Positional arguments: HMM database first, then the query chunk.
            options += " %s" % hmmfile
            options += " %s" % in_file

            options_list.append(options)
            out_files.append(out_filename)
            tblout_files.append(tblout_file)
            domtblout_files.append(domtblout_file)
            pfamtblout_files.append(pfamtblout_file)

        self.parallel_execute(options_list,
                              cmd="hmmscan",
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            if biopython_165_compartibility:
                CGAS.cgas(
                    out_files,
                    sed_string=
                    "s/^Description:.*/Description: <unknown description>/",
                    output=outfile)
            else:
                CGAS.cat(out_files, output=outfile)
        # NOTE(review): when a *_outfile is requested without the matching
        # splited_*_dir, the corresponding list holds only None entries -
        # confirm callers always pass both.
        if tblout_outfile:
            CGAS.cat(tblout_files, output=tblout_outfile)
        if domtblout_outfile:
            CGAS.cat(domtblout_files, output=domtblout_outfile)
        if pfamtblout_outfile:
            CGAS.cat(pfamtblout_files, output=pfamtblout_outfile)

        if remove_tmp_dirs:
            if splited_tblout_dir:
                shutil.rmtree(splited_tblout_dir)
            if splited_domtblout_dir:
                shutil.rmtree(splited_domtblout_dir)
            if splited_pfamtblout_dir:
                shutil.rmtree(splited_pfamtblout_dir)
            for tmp_dir in splited_dir, splited_out_dir:
                shutil.rmtree(tmp_dir)
Example #7
0
    "occurrences of both. So this option is not suitable for generating sets of forward "
    "and reverse-complement kmers. For this case use -r/--add_reverse_complement option. "
    "Not compatible with -r/--add_reverse_complement option.")
parser.add_argument(
    "-r",
    "--add_reverse_complement",
    action="store_true",
    dest="add_rev_com",
    help="Add reverse-complement sequences before counting kmers. "
    "Works only for fasta sequences. "
    "Not compatible with -b/--count_both_strands option")

# The parser must not be re-created at this point: a fresh ArgumentParser
# would discard every option registered above, and the attribute accesses
# below (path_to_mavr, threads, ...) would then fail.
args = parser.parse_args()
args.path_to_mavr = check_path(args.path_to_mavr)

# Propagate the thread count and tool location to the tool wrappers.
MaSuRCA.threads = args.threads
Jellyfish.threads = args.threads
Jellyfish.path = args.jellyfish_path if args.jellyfish_path else ""

iteration_reference_file = args.initial_sequences
working_dir = os.getcwd()
# Read files are resolved to absolute paths relative to the current
# working directory.
abs_path_left_source_reads = os.path.abspath(args.left_source_reads)
abs_path_right_source_reads = os.path.abspath(args.right_source_reads)
"""
for filename in args.source_reads:
    ab
    if os.path.isabs(filename):
        abs_path_source_reads.append(filename)
    else:
Example #8
0
                    help="File with families")
# Output location and degree of parallelism.
parser.add_argument("-o", "--output_dir", action="store", dest="output_dir",
                    help="Directory to write output")
parser.add_argument("-t", "--threads", action="store", dest="threads",
                    type=int, default=1,
                    help="Number of threads to use")
args = parser.parse_args()

# Normalise the output directory with check_path before it is used.
args.output_dir = check_path(args.output_dir)


def check_edge_strict(nodes_list, id_list):
    """Return True if every node of nodes_list is contained in id_list.

    An empty nodes_list yields True.
    """
    return all(node in id_list for node in nodes_list)


def check_edge_soft(nodes_list, id_list):
    """Return True if at least one node of nodes_list is contained in id_list.

    An empty nodes_list yields False.
    """
    return any(node in id_list for node in nodes_list)