def handle_input(filename):
    # Parse a HMMER report (format taken from the module-level `args`) and
    # write the top hit per query; collect queries that have no significant
    # hit or no hit at all.
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()

    prefix = split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        out_fd.write("#query\thit\tevalue\tbitscore\n")

    for query in hmm_dict:
        query_result = hmm_dict[query]
        if query_result.hits:
            top_hit = query_result[0]
            if top_hit.is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" %
                             (query, top_hit.id, top_hit.evalue, top_hit.bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)

    if args.output != "stdout":
        out_fd.close()

    os.remove(index_file)
    return not_significant_ids, not_found_ids
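
A minimal usage sketch, assuming the module-level `args` used inside handle_input (format, output, top_hits_dir) have already been parsed; the report and ids filenames are hypothetical:

not_significant, not_found = handle_input("sample_vs_pfam.hits")
# IdList instances expose write() (see Example #10), so the collected
# query ids can be persisted for later inspection.
not_significant.write("sample.not_significant.ids")
not_found.write("sample.not_found.ids")
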
Example #2
    def parallel_align(self,
                       list_of_files,
                       output_directory,
                       output_suffix="alignment",
                       gap_open_penalty=None,
                       offset=None,
                       maxiterate=None,
                       quiet=False,
                       mode="globalpair",
                       number_of_processes=1):
        # TODO: add rest of options

        options = " --thread %i" % self.threads
        options += " --op %f" % gap_open_penalty if gap_open_penalty is not None else ""
        options += " --ep %f" % offset if offset is not None else ""
        options += " --maxiterate %i" % maxiterate if maxiterate is not None else ""
        options += " --quiet" if quiet else ""
        options += " --%s" % mode

        options_list = []
        for filename in list_of_files:
            basename = split_filename(filename)[1]
            op = options
            op += " %s" % filename
            op += " > %s/%s_%s.fasta" % (output_directory, basename,
                                         output_suffix)
            options_list.append(op)

        self.parallel_execute(options_list, threads=number_of_processes)
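
A hedged usage sketch, assuming `mafft_tool` is an instance of the MAFFT wrapper class this method belongs to; input files and the output directory are hypothetical:

# Align two hypothetical FASTA files with 4 concurrent MAFFT processes.
mafft_tool.parallel_align(["cluster_0001.fasta", "cluster_0002.fasta"],
                          "alignments",
                          maxiterate=1000,
                          quiet=True,
                          mode="globalpair",
                          number_of_processes=4)
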
Example #3
    def parallel_blast(self,
                       blast_command,
                       seqfile,
                       database,
                       outfile=None,
                       blast_options=None,
                       split_dir="splited_fasta",
                       splited_output_dir="splited_output_dir",
                       evalue=None,
                       output_format=None,
                       threads=None,
                       num_of_seqs_per_scan=None,
                       combine_output_to_single_file=True,
                       async_run=False,
                       external_process_pool=None):

        splited_dir = check_path(split_dir)
        splited_out_dir = check_path(splited_output_dir)
        save_mkdir(splited_dir)
        save_mkdir(splited_out_dir)

        # Number of chunks to split the input FASTA into: either requested
        # explicitly or roughly five chunks per available thread.
        number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan \
            else (5 * threads if threads else 5 * self.threads)
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_files = []

        for filename in input_list_of_files:
            filename_prefix = split_filename(filename)[1]

            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)

            list_of_files.append((input_file, output_file))

        options_list = []
        out_files = []

        for in_file, out_filename in list_of_files:

            options = " -out %s" % out_filename

            options += " -db %s" % database
            options += " -query %s" % in_file
            options += " %s" % blast_options if blast_options else ""
            options += " -evalue %s" % evalue if evalue else ""
            options += " -outfmt %i" % output_format if output_format else ""
            options_list.append(options)
            out_files.append(out_filename)

        self.parallel_execute(options_list,
                              cmd=blast_command,
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            CGAS.cat(out_files, output=outfile)
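
A hedged usage sketch, assuming `blast_tool` is an instance of the BLAST wrapper class defining this method; the database and file names are hypothetical:

# Run blastp against a hypothetical UniRef90 database, 8 chunks at a time,
# and merge the per-chunk reports into one tabular (outfmt 6) file.
blast_tool.parallel_blast("blastp",
                          "proteins.fasta",
                          "uniref90",
                          outfile="proteins_vs_uniref90.hits",
                          evalue=0.00001,
                          output_format=6,
                          threads=8)
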
Example #4
    def parallel_convert(self,
                         list_of_files,
                         output_directory,
                         output_format=".tiff"):

        save_mkdir(output_directory)
        options_list = []

        for filename in list_of_files:
            option = " %s" % filename
            option += " %s%s%s" % (check_path(output_directory),
                                   split_filename(filename)[1], output_format)
            options_list.append(option)

        self.parallel_execute(options_list)
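
A hedged usage sketch, assuming `converter` is an instance of the image-converter wrapper defining this method (the underlying command comes from parallel_execute); file names are hypothetical:

# Convert two hypothetical PNG images to TIFF in parallel.
converter.parallel_convert(["coverage_plot.png", "gc_plot.png"],
                           "tiff_images",
                           output_format=".tiff")
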
Example #5
    action="store",
    dest="suffix",
    default=".gaps_removed",
    help="Suffix to use in output files. Default: '.gaps_removed'")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    default="fasta",
                    help="Format of alignment")
parser.add_argument("-v",
                    "--verbose",
                    action="store_true",
                    dest="verbose",
                    help="Print not found ids. Default - no")

args = parser.parse_args()

save_mkdir(args.output)

for alignment_file in args.input:
    splited_filename = split_filename(alignment_file)
    if args.verbose:
        print("Handling %s ..." % alignment_file)
    output_filename = "%s%s%s%s" % (args.output, splited_filename[1],
                                    args.suffix, splited_filename[2])
    alignment = AlignIO.read(alignment_file, args.format)
    filtered_alignment = MultipleAlignmentRoutines.remove_columns_with_gaps(
        alignment, args.max_gap_number, gap_symbol=args.gap_symbol)
    AlignIO.write(filtered_alignment, output_filename, args.format)
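
The body of the loop above as a standalone hedged sketch; file names are hypothetical, AlignIO is Biopython's, and passing 0 as the second argument assumes max_gap_number counts the gaps tolerated per column:

from Bio import AlignIO

alignment = AlignIO.read("family_0001.fasta", "fasta")
filtered = MultipleAlignmentRoutines.remove_columns_with_gaps(alignment, 0,
                                                              gap_symbol="-")
AlignIO.write(filtered, "family_0001.gaps_removed.fasta", "fasta")
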
Example #6
                    action="store_true",
                    dest="convert_to_single_letter",
                    help="Convert aminoacids to single letters")

args = parser.parse_args()

args.input = make_list_of_path_to_files(args.input)

gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

summary_dict = TwoLvlDict()
for filename in args.input:
    directory, prefix, extension = split_filename(filename)

    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
        if args.suffix_to_remove in name:
            name = name.replace(args.suffix_to_remove, "")
    summary_dict[name] = OrderedDict()
    with open(filename, "r") as file_fd:
        file_fd.readline()
        for line in file_fd:
                    help="Suffix of fam files. Default: .fam")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Suffix of fam files")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
species_list = []
suffix_list = []
if args.use_basename:
    for filename in sorted(os.listdir(args.input)):
        directory, basename, ext = split_filename(filename)
        species_list.append(basename)
        suffix_list.append(ext)
else:
    species_list = sorted(args.species_set)
    suffix_list = [args.suffix] * len(species_list)

out_fd.write("#species\tnumber_of_families\tnumber_of_proteins\n")
for species, suffix in zip(species_list, suffix_list):
    fam_dict = SynDict()
    fam_dict.read("%s%s%s" % (args.input, species, suffix),
                  separator="\t",
                  split_values=True,
                  values_separator=",",
                  key_index=0,
                  value_index=1)
Example #8
                    help="Input file with SE reads")
parser.add_argument("-l", "--input_left", action="store", dest="input_left",
                    help="Input file with left reads")
parser.add_argument("-r", "--input_right", action="store", dest="input_right",
                    help="Input file with right reads")
parser.add_argument("-o", "--out_dir", action="store", dest="out_dir", default="./",
                    help="Directory to write output")
parser.add_argument("-m", "--min_len", action="store", dest="min_len", type=int, default=1,
                    help="Minimum length of read to output")

args = parser.parse_args()
n_regexp = re.compile("N+$")

if args.input_se:

    se_directory, se_prefix, se_extension = split_filename(args.input_se)
    se_in_fd = open(args.input_se, "r")
    se_out_file = "%s%s.filtered%s" % (check_path(args.out_dir), se_prefix, se_extension)
    se_out_fd = open(se_out_file, "w")

    while True:
        name, sequence, separator, quality = read_entry(se_in_fd)
        if name is None:
            break
        match = n_regexp.search(sequence)
        if match is None:
            se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence, separator, quality))
        elif match.start() >= args.min_len:
            # Trim the trailing run of Ns from both the sequence and the
            # quality line; reads whose non-N prefix is too short are dropped.
            se_out_fd.write("%s\n%s\n%s\n%s\n" % (name, sequence[:match.start()],
                                                  separator, quality[:match.start()]))
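
The trailing-N rule in isolation, as a runnable sketch over a hypothetical read:

import re

n_regexp = re.compile("N+$")
sequence = "ACGTACGTNNNN"
match = n_regexp.search(sequence)
# Cut at the start of the terminal N run; reads without trailing Ns pass through.
trimmed = sequence[:match.start()] if match else sequence
# trimmed == "ACGTACGT"
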
Example #9
    def parallel_hmmscan(self,
                         hmmfile,
                         seqfile,
                         outfile,
                         num_of_seqs_per_scan=None,
                         split_dir="splited_fasta",
                         splited_output_dir="splited_output_dir",
                         tblout_outfile=None,
                         domtblout_outfile=None,
                         pfamtblout_outfile=None,
                         splited_tblout_dir=None,
                         splited_domtblout_dir=None,
                         splited_pfamtblout_dir=None,
                         dont_output_alignments=False,
                         model_evalue_threshold=None,
                         model_score_threshold=None,
                         domain_evalue_threshold=None,
                         domain_score_threshold=None,
                         model_evalue_significant_threshold=None,
                         model_score_significant_threshold=None,
                         domain_evalue_significant_threshold=None,
                         domain_score_significant_threshold=None,
                         use_profile_GA_gathering_cutoffs_for_thresholds=False,
                         use_profile_NC_noise_cutoffs_for_thresholds=False,
                         use_profile_TC_trusted_cutoffs_for_thresholds=False,
                         turn_off_all_heruristics=False,
                         turn_off_bias_filter=False,
                         MSV_threshold=None,
                         Vit_threshold=None,
                         Fwd_threshold=None,
                         turn_off_biased_composition_score_corrections=None,
                         input_format=None,
                         threads=None,
                         combine_output_to_single_file=True,
                         biopython_165_compartibility=False,
                         remove_tmp_dirs=True,
                         async_run=False,
                         external_process_pool=None):

        splited_dir = check_path(split_dir)
        splited_out_dir = check_path(splited_output_dir)
        save_mkdir(splited_dir)
        save_mkdir(splited_out_dir)

        if splited_tblout_dir:
            save_mkdir(splited_tblout_dir)
        if splited_domtblout_dir:
            save_mkdir(splited_domtblout_dir)
        if splited_pfamtblout_dir:
            save_mkdir(splited_pfamtblout_dir)

        # Number of chunks: explicit request, otherwise ~5 per available thread.
        number_of_files = num_of_seqs_per_scan if num_of_seqs_per_scan \
            else (5 * threads if threads else 5 * self.threads)
        self.split_fasta(seqfile, splited_dir, num_of_files=number_of_files)
        input_list_of_files = sorted(os.listdir(splited_dir))
        list_of_files = []

        for filename in input_list_of_files:
            filename_prefix = split_filename(filename)[1]

            input_file = "%s%s" % (splited_dir, filename)
            output_file = "%s%s.hits" % (splited_out_dir, filename_prefix)
            tblout_file = "%s%s.hits" % (splited_tblout_dir, filename_prefix
                                         ) if splited_tblout_dir else None
            domtblout_file = "%s%s.hits" % (
                splited_domtblout_dir,
                filename_prefix) if splited_domtblout_dir else None
            pfamtblout_file = "%s%s.hits" % (
                splited_pfamtblout_dir,
                filename_prefix) if splited_pfamtblout_dir else None

            list_of_files.append((input_file, output_file, tblout_file,
                                  domtblout_file, pfamtblout_file))

        common_options = self.__parse_hmmsxxx_common_options(
            tblout=None,
            domtblout=None,
            pfamtblout=None,
            dont_output_alignments=dont_output_alignments,
            model_evalue_threshold=model_evalue_threshold,
            model_score_threshold=model_score_threshold,
            domain_evalue_threshold=domain_evalue_threshold,
            domain_score_threshold=domain_score_threshold,
            model_evalue_significant_threshold=model_evalue_significant_threshold,
            model_score_significant_threshold=model_score_significant_threshold,
            domain_evalue_significant_threshold=domain_evalue_significant_threshold,
            domain_score_significant_threshold=domain_score_significant_threshold,
            use_profile_GA_gathering_cutoffs_for_thresholds=use_profile_GA_gathering_cutoffs_for_thresholds,
            use_profile_NC_noise_cutoffs_for_thresholds=use_profile_NC_noise_cutoffs_for_thresholds,
            use_profile_TC_trusted_cutoffs_for_thresholds=use_profile_TC_trusted_cutoffs_for_thresholds,
            turn_off_all_heruristics=turn_off_all_heruristics,
            turn_off_bias_filter=turn_off_bias_filter,
            MSV_threshold=MSV_threshold,
            Vit_threshold=Vit_threshold,
            Fwd_threshold=Fwd_threshold,
            turn_off_biased_composition_score_corrections=turn_off_biased_composition_score_corrections)

        common_options += " --qformat %s" if input_format else ""
        options_list = []
        out_files = []
        tblout_files = []
        domtblout_files = []
        pfamtblout_files = []

        for in_file, out_filename, tblout_file, domtblout_file, pfamtblout_file in list_of_files:
            options = common_options

            options += " --tblout %s" % tblout_file if tblout_file else ""
            options += " --domtblout %s" % domtblout_file if domtblout_file else ""
            options += " --pfamtblout %s" % pfamtblout_file if pfamtblout_file else ""
            options += " -o %s" % out_filename

            options += " %s" % hmmfile
            options += " %s" % in_file

            options_list.append(options)
            out_files.append(out_filename)
            tblout_files.append(tblout_file)
            domtblout_files.append(domtblout_file)
            pfamtblout_files.append(pfamtblout_file)

        self.parallel_execute(options_list,
                              cmd="hmmscan",
                              threads=threads,
                              async_run=async_run,
                              external_process_pool=external_process_pool)

        if combine_output_to_single_file:
            if biopython_165_compartibility:
                CGAS.cgas(out_files,
                          sed_string="s/^Description:.*/Description: <unknown description>/",
                          output=outfile)
            else:
                CGAS.cat(out_files, output=outfile)
        if tblout_outfile:
            CGAS.cat(tblout_files, output=tblout_outfile)
        if domtblout_outfile:
            CGAS.cat(domtblout_files, output=domtblout_outfile)
        if pfamtblout_outfile:
            CGAS.cat(pfamtblout_files, output=pfamtblout_outfile)

        if remove_tmp_dirs:
            if splited_tblout_dir:
                shutil.rmtree(splited_tblout_dir)
            if splited_domtblout_dir:
                shutil.rmtree(splited_domtblout_dir)
            if splited_pfamtblout_dir:
                shutil.rmtree(splited_pfamtblout_dir)
            for tmp_dir in splited_dir, splited_out_dir:
                shutil.rmtree(tmp_dir)
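
A hedged usage sketch, assuming `hmmer` is an instance of the HMMER3 wrapper class defining this method; the profile database and file names are hypothetical (they echo the way Example #14 prepares its hmmscan-vs-Pfam run):

hmmer.parallel_hmmscan("Pfam-A.hmm",
                       "proteins.fasta",
                       "proteins_vs_pfam.hits",
                       domtblout_outfile="proteins_vs_pfam.domtblout",
                       splited_domtblout_dir="splited_domtblout_dir/",
                       threads=8)
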
Example #10
    def split_hmm(self,
                  hmmfile,
                  output_dir,
                  num_of_recs_per_file,
                  num_of_files=None,
                  output_prefix=None,
                  threads=4):

        try:
            os.mkdir(output_dir)
        except OSError:
            pass

        id_fd = CGAS.cgas(hmmfile,
                          grep_pattern="NAME",
                          whole_word_match=True,
                          awk_code="{print $2}",
                          capture_output=True)

        split_index = 1
        ids_written = 0
        ids_list = IdList()
        #ids_list = read_ids(id_fd, close_after_if_file_object=False)
        ids_list.read(id_fd, close_after_if_file_object=True)
        number_of_ids = len(ids_list)
        out_prefix = split_filename(hmmfile)[1] if output_prefix is None else output_prefix

        # Records per chunk: derived from the requested number of files,
        # otherwise taken directly from num_of_recs_per_file.
        num_of_ids = int(number_of_ids / num_of_files) + 1 if num_of_files else num_of_recs_per_file

        common_options = " -f"
        common_options += " %s" % hmmfile
        options_list = []
        while (ids_written + num_of_ids) <= number_of_ids:
            tmp_id_list = IdList(ids_list[ids_written:ids_written +
                                          num_of_ids])
            tmp_id_list.write("%s/%s_%i.ids" %
                              (output_dir, out_prefix, split_index))

            options = common_options
            options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            options += " > %s" % ("%s/%s_%i.hmm" %
                                  (output_dir, out_prefix, split_index))
            options_list.append(options)

            split_index += 1
            ids_written += num_of_ids

        if ids_written != number_of_ids:
            tmp_id_list = IdList(ids_list[ids_written:])
            tmp_id_list.write("%s/%s_%i.ids" %
                              (output_dir, out_prefix, split_index))

            options = common_options
            options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            options += " > %s" % ("%s/%s_%i.hmm" %
                                  (output_dir, out_prefix, split_index))
            options_list.append(options)

            split_index += 1
        #print options_list
        self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
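
A hedged usage sketch, assuming `hmm_tool` is an instance of the wrapper class defining this method; the profile database name is hypothetical. This splits an HMM flatfile into chunks of 500 profiles, extracting them with 4 parallel hmmfetch processes:

hmm_tool.split_hmm("Pfam-A.hmm", "splited_hmm_dir", 500, threads=4)
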
Example #11
"""

for iteration_index in range(1, args.number_of_iterations):

    os.chdir(working_dir)

    iteration = "iteration_%i" % iteration_index
    iteration_dir = "%s/%s" % (working_dir, iteration)
    iteration_ref = "%s/%s_reference.fasta" % (iteration_dir, iteration)
    iteration_ref_index = "%s/%s_reference.idx" % (iteration_dir, iteration)
    base_prefix = "%s/%s_reference_with_rev_com" % (iteration_dir, iteration)
    iteration_ref_with_rev_com = "%s/%s_reference_with_rev_com.fasta" % (
        iteration_dir, iteration)
    kmer_file = "%s_%i_mer.kmer" % (base_prefix, args.kmer_length)
    masurca_config_file = "masurca_%s.config" % iteration
    left_reads_prefix = split_filename(abs_path_left_source_reads)[1]
    right_reads_prefix = split_filename(abs_path_right_source_reads)[1]

    left_reads_se = "%s.se.fastq" % left_reads_prefix
    right_reads_se = "%s.se.fastq" % right_reads_prefix
    left_reads_filtered = "%s.filtered.fastq" % left_reads_prefix
    right_reads_filtered = "%s.filtered.fastq" % right_reads_prefix

    try:
        os.mkdir(iteration_dir)
    except OSError:
        pass

    shutil.copyfile(iteration_reference_file, iteration_ref)
    os.chdir(iteration_dir)
    iteration_reference_dict = SeqIO.index_db(iteration_ref_index,
Example #12
    def parallel_alignment(self,
                           query_file,
                           target_file,
                           model,
                           num_of_recs_per_file=None,
                           show_alignment=None,
                           show_sugar=True,
                           show_cigar=None,
                           show_vulgar=None,
                           show_query_gff=None,
                           show_target_gff=None,
                           store_intermediate_files=False,
                           splited_fasta_dir="splited_fasta_dir",
                           splited_result_dir="splited_output",
                           number_of_results_to_report=None,
                           other_options=None,
                           num_of_files=None,
                           converted_output_dir="converted_output"):
        splited_filename = split_filename(query_file)
        self.split_fasta(query_file,
                         splited_fasta_dir,
                         num_of_recs_per_file=num_of_recs_per_file,
                         num_of_files=num_of_files,
                         output_prefix=splited_filename[1])

        common_options = self.parse_common_options(
            model,
            show_alignment=show_alignment,
            show_sugar=show_sugar,
            show_cigar=show_cigar,
            show_vulgar=show_vulgar,
            show_query_gff=show_query_gff,
            show_target_gff=show_target_gff,
            number_of_results_to_report=number_of_results_to_report,
            other_options=other_options)

        options_list = []
        splited_files = os.listdir(splited_fasta_dir)

        save_mkdir(splited_result_dir)
        #save_mkdir(converted_output_dir)

        for filename in splited_files:
            filename_list = split_filename(filename)
            options = common_options
            options += " -q %s/%s" % (splited_fasta_dir, filename)
            options += " -t %s" % target_file
            options += " > %s/%s.output" % (splited_result_dir,
                                            filename_list[1])
            options_list.append(options)

        self.parallel_execute(options_list)
        """
        for filename in splited_files:

            trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (splited_result_dir, filename,
                                                                  matching_weight, mismatching_penalty,
                                                                  indel_penalty, match_probability,
                                                                  indel_probability,
                                                                  min_alignment_score, max_period)

            self.convert_trf_report(trf_output_file, "%s/%s" % (converted_output_dir, filename))

        for suffix in (".rep", ".gff", ".simple.gff", ".short.tab", ".wide.tab"):
            file_str = ""
            merged_file = "%s%s" % (output_prefix, suffix)
            for filename in splited_files:
                file_str += " %s/%s%s" % (converted_output_dir, filename, suffix)
            CGAS.cat(file_str, merged_file)
        """
        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
Example #13
    def parallel_search_tandem_repeat(self,
                                      query_file,
                                      output_prefix,
                                      matching_weight=2,
                                      mismatching_penalty=7,
                                      indel_penalty=7,
                                      match_probability=80,
                                      indel_probability=10,
                                      min_alignment_score=50,
                                      max_period=500,
                                      report_flanking_sequences=False,
                                      splited_fasta_dir="splited_fasta_dir",
                                      splited_result_dir="splited_output",
                                      converted_output_dir="converted_output",
                                      max_len_per_file=100000,
                                      store_intermediate_files=False):
        work_dir = os.getcwd()
        splited_filename = split_filename(query_file)
        self.split_fasta_by_seq_len(query_file,
                                    splited_fasta_dir,
                                    max_len_per_file=max_len_per_file,
                                    output_prefix=splited_filename[1])

        common_options = self.parse_common_options(
            matching_weight=matching_weight,
            mismatching_penalty=mismatching_penalty,
            indel_penalty=indel_penalty,
            match_probability=match_probability,
            indel_probability=indel_probability,
            min_alignment_score=min_alignment_score,
            max_period=max_period,
            report_flanking_sequences=report_flanking_sequences,
            make_dat_file=True)
        common_options += " -h"  # suppress html output
        options_list = []
        splited_files = os.listdir(splited_fasta_dir)

        save_mkdir(splited_result_dir)
        save_mkdir(converted_output_dir)
        os.chdir(splited_result_dir)

        # TRF is run from inside splited_result_dir, so a relative input dir
        # needs a "../" prefix; absolute and home-rooted paths are kept as-is.
        input_dir = splited_fasta_dir if (splited_fasta_dir[0] == "/") or (splited_fasta_dir[0] == "~") \
                    else "../%s" % splited_fasta_dir

        for filename in splited_files:
            file_options = "%s/%s" % (input_dir, filename)
            file_options += common_options
            options_list.append(file_options)

        self.parallel_execute(options_list)

        os.chdir(work_dir)
        for filename in splited_files:

            trf_output_file = "%s/%s.%i.%i.%i.%i.%i.%i.%i.dat" % (
                splited_result_dir, filename, matching_weight,
                mismatching_penalty, indel_penalty, match_probability,
                indel_probability, min_alignment_score, max_period)

            self.convert_trf_report(trf_output_file,
                                    "%s/%s" % (converted_output_dir, filename))

        for suffix in (".rep", ".gff", ".simple.gff", ".short.tab",
                       ".wide.tab"):
            file_str = ""
            merged_file = "%s%s" % (output_prefix, suffix)
            for filename in splited_files:
                file_str += " %s/%s%s" % (converted_output_dir, filename,
                                          suffix)
            CGAS.cat(file_str, merged_file)

        if not store_intermediate_files:
            shutil.rmtree(splited_fasta_dir)
            shutil.rmtree(splited_result_dir)
            shutil.rmtree(converted_output_dir)
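
A hedged usage sketch, assuming `trf_tool` is an instance of the TRF wrapper class defining this method; the genome file name is hypothetical:

# Search a hypothetical genome for tandem repeats, splitting it into
# ~100 kbp chunks and removing the intermediate directories afterwards.
trf_tool.parallel_search_tandem_repeat("genome.fasta",
                                       "genome_trf",
                                       max_len_per_file=100000,
                                       store_intermediate_files=False)
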
Example #14
    "--number_of_top_orfs_for_training",
    action="store",
    type=int,
    dest="number_of_top_orfs_for_training",
    help="Number of top longest ORFs used to train the Markov Model "
    "(hexamer stats). Default: 500")
parser.add_argument("-c",
                    "--hmmer_dir",
                    action="store",
                    dest="hmmer_dir",
                    default="",
                    help="Directory with hmmer v3.1 binaries")
args = parser.parse_args()

input_filename_list = split_filename(args.input)
input_filename = input_filename_list[1] + input_filename_list[2]

workdir_dir = "%s.transdecoder_dir/" % input_filename
pep_from_longest_orfs = "%s/longest_orfs.pep" % workdir_dir

hmmscan_dir = "hmmscan_vs_pfam/"
blastp_dir = "blastp_vs_uniref/"

save_mkdir(hmmscan_dir)
save_mkdir(blastp_dir)

hmmscan_splited_fasta_dir = "%ssplited_fasta_dir/" % hmmscan_dir
splited_domtblout_dir = "%ssplited_domtblout_dir/" % hmmscan_dir
hmmscan_vs_pfam_output = "%s%s.pfam.hits" % (hmmscan_dir, input_filename)
domtblout_outfile = "%s%s.pfam.domtblout" % (
Example #15
                    type=int,
                    help="Format of input trees")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output_file",
                    default="stdout",
                    help="Output file with leaves of trees. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output_file == "stdout" else open(
    args.output_file, "w")

tree_files_list = os.listdir(args.tree_dir)

names_dict = SynDict()

for tree_file in tree_files_list:
    tree_name = split_filename(tree_file)[1]
    with open("%s%s" % (args.tree_dir, tree_file), "r") as tree_fd:
        tree = Tree(tree_fd.readline().strip(), format=args.tree_format)
    leaves_list = []
    for node in tree.traverse():
        if node.is_leaf():
            leaves_list.append(node.name)
    names_dict[tree_name] = leaves_list

names_dict.write(out_fd, splited_values=True)
if args.output_file != "stdout":
    out_fd.close()
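
The leaf-collection step in isolation, assuming the Tree class used above is ete3's (consistent with the format= keyword); the newick string is hypothetical:

from ete3 import Tree

tree = Tree("((A,B),(C,D));")
leaves = [node.name for node in tree.traverse() if node.is_leaf()]
# leaves contains the four tip names, in ete3's default traversal order
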
Example #16
if args.threads == 1:
    TRF.search_tandem_repeats(
        args.input_file,
        matching_weight=args.matching_weight,
        mismatching_penalty=args.mismatching_penalty,
        indel_penalty=args.indel_penalty,
        match_probability=args.matching_probability,
        indel_probability=args.indel_probability,
        min_alignment_score=args.min_score,
        max_period=args.max_period_size,
        report_flanking_sequences=args.report_flanking_sequences,
        make_dat_file=True,
        disable_html_output=args.enable_html_output)

    trf_report = "%s.%i.%i.%i.%i.%i.%i.%i.dat" % (
        split_filename(args.input_file)[1] +
        split_filename(args.input_file)[2], args.matching_weight,
        args.mismatching_penalty, args.indel_penalty,
        args.matching_probability, args.indel_probability, args.min_score,
        args.max_period_size)
    TRF.convert_trf_report(trf_report, args.output_prefix)

else:

    TRF.parallel_search_tandem_repeat(
        args.input_file,
        args.output_prefix,
        matching_weight=args.matching_weight,
        mismatching_penalty=args.mismatching_penalty,
        indel_penalty=args.indel_penalty,
        match_probability=args.matching_probability,