Example #1
def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()

    prefix = FileRoutines.split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        out_fd.write("#query\thit\tevalue\tbitscore\n")

    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write(
                    "%s\t%s\t%s\t%s\n" %
                    (query, hmm_dict[query][0].id, hmm_dict[query][0].evalue,
                     hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)

    if args.output != "stdout":
        out_fd.close()

    os.remove(index_file)
    return not_significant_ids, not_found_ids
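
The classification pattern above (top hit vs. not significant vs. not found) does not depend on Biopython itself. Here is a minimal, dependency-free sketch of the same loop; the Hit tuple and the data are hypothetical stand-ins for the SearchIO QueryResult objects used in the example:

from collections import namedtuple

Hit = namedtuple("Hit", ["id", "evalue", "bitscore", "is_included"])

# Hypothetical stand-in for the SearchIO mapping used above:
# each query maps to a list of hits ordered by significance.
hmm_dict = {
    "q1": [Hit("hitA", 1e-30, 250.0, True)],
    "q2": [Hit("hitB", 0.5, 8.0, False)],
    "q3": [],
}

not_significant_ids, not_found_ids = [], []
for query, hits in hmm_dict.items():
    if not hits:
        not_found_ids.append(query)
    elif hits[0].is_included:  # best hit passed the inclusion threshold
        print("%s\t%s\t%s\t%s" % (query, hits[0].id, hits[0].evalue, hits[0].bitscore))
    else:
        not_significant_ids.append(query)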
Example #2
    def seq_ids(self):
        id_list = IdList()

        for record in self.records:
            id_list.append(record.id)

        return id_list
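
Every example on this page leans on RouToolPa's IdList. Judging only from the calls made here (append, then write to a filename or to an already-open file object), a minimal stand-in could look like the sketch below; this is an assumption, not the library's actual implementation:

class IdList(list):
    # Minimal stand-in: a list of identifiers that can dump itself,
    # one id per line, to a filename or to an open file object.
    def write(self, filename, close_after_if_file_object=False):
        fd = open(filename, "w") if isinstance(filename, str) else filename
        for entry in self:
            fd.write("%s\n" % entry)
        if isinstance(filename, str) or close_after_if_file_object:
            fd.close()

Usage: IdList(["seq1", "seq2"]).write("ids.txt") produces one id per line.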
Example #3
    def add_length_to_accordance_file(accordance_file, length_file,
                                      output_prefix):

        accordance_dict = SynDict(filename=accordance_file,
                                  allow_repeats_of_key=True)
        length_dict = SynDict(filename=length_file, expression=int)
        longest_list = IdList()

        all_output_file = "%s.all.correspondence" % output_prefix
        longest_output_file = "%s.longest.correspondence" % output_prefix
        longest_id_file = "%s.longest.ids" % output_prefix

        with open(all_output_file, "w") as all_out_fd:
            with open(longest_output_file, "w") as longest_out_fd:
                for gene in accordance_dict:
                    current_transcript = None
                    current_length = 0
                    for transcript in accordance_dict[gene]:
                        if length_dict[transcript] > current_length:
                            current_transcript = transcript
                            current_length = length_dict[transcript]
                        all_out_fd.write(
                            "%s\t%s\t%i\n" %
                            (gene, transcript, length_dict[transcript]))

                    longest_out_fd.write(
                        "%s\t%s\t%i\n" %
                        (gene, current_transcript, current_length))
                    longest_list.append(current_transcript)
        longest_list.write(longest_id_file)
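
The core of Example 3 is a per-gene scan for the longest transcript. Stripped of the SynDict/IdList wrappers, the logic reduces to a max() over toy data (names hypothetical):

accordance = {"geneA": ["tA.1", "tA.2"], "geneB": ["tB.1"]}  # gene -> transcripts
lengths = {"tA.1": 1500, "tA.2": 2100, "tB.1": 900}          # transcript -> length

longest = {}
for gene, transcripts in accordance.items():
    # max() keyed on length replaces the manual current_length bookkeeping
    longest[gene] = max(transcripts, key=lambda t: lengths[t])

print(longest)  # {'geneA': 'tA.2', 'geneB': 'tB.1'}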
Example #4
    def prepare_annotation_file_from_transcript_and_cds(
            self,
            transcript_file,
            cds_file,
            correspondence_file,
            output_prefix,
            format="fasta",
            correspondence_key_column=0,
            correspondence_value_column=1,
            verbose=False):
        transcript_dict = self.parse_seq_file(transcript_file,
                                              "parse",
                                              format=format)

        cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      comments_prefix="#",
                                      key_index=correspondence_key_column,
                                      value_index=correspondence_value_column)

        no_corresponding_cds_transcript_list = IdList()
        cds_not_found_transcript_list = IdList()

        annotation_file = "%s.annotation" % output_prefix
        no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
        cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

        with open(annotation_file, "w") as annotation_fd:
            for transcript_id in transcript_dict:
                if transcript_id not in correspondence_dict:
                    no_corresponding_cds_transcript_list.append(transcript_id)
                    if verbose:
                        print(
                            "No cds in correspondence file for transcript %s" %
                            transcript_id)
                    continue
                cds_id = correspondence_dict[transcript_id]
                length = len(cds_dict[cds_id].seq)
                start = transcript_dict[transcript_id].seq.upper().find(
                    cds_dict[cds_id].seq.upper())
                if start == -1:
                    cds_not_found_transcript_list.append(transcript_id)
                    if verbose:
                        print("CDS was not found for transcript %s" %
                              transcript_id)
                    continue
                annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id,
                                                         start + 1, length)

                annotation_fd.write(annotation_string)

        no_corresponding_cds_transcript_list.write(
            no_corresponding_cds_transcript_file)
        cds_not_found_transcript_list.write(cds_not_found_transcript_file)
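
Example 4's annotation boils down to locating the CDS inside its transcript with str.find and converting the 0-based hit to a 1-based start. A dependency-free sketch with toy sequences:

transcript = "ggcacATGGCCTAAtttc"  # toy transcript, UTRs in lower case
cds = "ATGGCCTAA"

start = transcript.upper().find(cds.upper())  # -1 when the CDS is absent
if start != -1:
    # annotation line: id, strand, 1-based CDS start, CDS length
    print("tx1\t+\t%i\t%i" % (start + 1, len(cds)))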
Example #5
    def extract_eggnog_fam_by_protein_syn_dict(self,
                                               eggnog_fam_dict,
                                               protein_syn_dict,
                                               output_prefix=None,
                                               species_id=None):

        extracted_families = SynDict()
        common_protein_names_to_families_dict = SynDict()
        common_names_to_eggnog_proteins_syn_dict = SynDict()

        not_found_proteins_common_names = IdList()

        transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

        for common_protein_name in protein_syn_dict:
            not_found = True
            for protein_id in protein_syn_dict[common_protein_name]:
                extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
                if extended_protein_id in transposed_eggnog_fam_dict:
                    not_found = False
                    fam_id = transposed_eggnog_fam_dict[extended_protein_id][0]
                    if common_protein_name not in common_protein_names_to_families_dict:
                        common_protein_names_to_families_dict[common_protein_name] = [fam_id]
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                    else:
                        common_protein_names_to_families_dict[common_protein_name].append(fam_id)
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                    if fam_id not in extracted_families:
                        extracted_families[fam_id] = eggnog_fam_dict[fam_id]

            if not_found:
                not_found_proteins_common_names.append(common_protein_name)

        if output_prefix:
            extracted_families.write(
                filename="%s.extracted_families.fam" % output_prefix,
                splited_values=True)
            common_protein_names_to_families_dict.write(
                filename="%s.common_protein_names_to_families.correspondence" % output_prefix,
                splited_values=True)
            common_names_to_eggnog_proteins_syn_dict.write(
                filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix,
                splited_values=True)
            not_found_proteins_common_names.write(
                filename="%s.not_found.common_names" % output_prefix)

            #print common_names_to_eggnog_proteins_syn_dict
            #print common_protein_names_to_families_dict
        return extracted_families, common_protein_names_to_families_dict, \
               common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
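
Example 5 hinges on exchange_key_and_value, which transposes the family -> proteins mapping so families can be looked up by protein id. The inversion itself is plain Python (toy data, hypothetical ids):

fam_dict = {"FAM1": ["9606.P1", "9606.P2"], "FAM2": ["9606.P3"]}

transposed = {}
for family, members in fam_dict.items():
    for member in members:
        # a member may in principle occur in several families, hence the
        # list of values, which matches the [0] indexing used above
        transposed.setdefault(member, []).append(family)

print(transposed["9606.P2"][0])  # FAM1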
Example #6
    def cluster_sequence_names_by_id_fragment(self,
                                              seq_id_list,
                                              id_element_index,
                                              id_separator="_",
                                              output_prefix=None):
        cluster_dict = SynDict()
        skipped_id_list = IdList()

        for seq_id in seq_id_list:
            seq_id_splited = seq_id.split(id_separator)
            if id_element_index < len(seq_id_splited):
                if seq_id_splited[id_element_index] in cluster_dict:
                    cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
                else:
                    cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
            else:
                skipped_id_list.append(seq_id)

        if output_prefix:
            cluster_dict.write("%s.seqid.clusters" % output_prefix,
                               splited_values=True)
            skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

        return cluster_dict
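
Example 6 groups sequence ids on one underscore-delimited field (with the indexing bug fixed above: the split id, not the input list, must be indexed). The same grouping on toy ids:

seq_ids = ["sp1_geneA", "sp2_geneA", "sp1_geneB", "unparsable"]
index = 1  # cluster on the second underscore-delimited field

clusters, skipped = {}, []
for seq_id in seq_ids:
    parts = seq_id.split("_")
    if index < len(parts):
        clusters.setdefault(parts[index], []).append(seq_id)
    else:
        skipped.append(seq_id)  # id has too few fields

print(clusters)  # {'geneA': ['sp1_geneA', 'sp2_geneA'], 'geneB': ['sp1_geneB']}
print(skipped)   # ['unparsable']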
Example #7
    def extract_top_hits(
        self,
        hmmer_hits,
        output_prefix,
        parsing_mode="index_db"
    ):  #top_hits_file, top_hits_ids_file=None,not_significant_ids_file=None, not_found_ids_file=None):
        top_hits_ids = IdList()
        not_significant_ids = IdList()
        not_found_ids = IdList()

        top_hits_file = "%s.top_hits" % output_prefix
        top_hits_ids_file = "%s.top_hits.ids" % output_prefix
        not_significant_ids_file = "%s.not_significant.ids" % output_prefix
        not_found_ids_file = "%s.not_found.ids" % output_prefix

        index_file = "%s.hmmer_hits.tmp.idx" % output_prefix

        #hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")

        hmm_dict = self.parse_search_file(hmmer_hits,
                                          parsing_mode,
                                          format="hmmer3-text",
                                          index_file=index_file)
        with open(top_hits_file, "w") as out_fd:
            out_fd.write("#query\thit\tevalue\tbitscore\n")

            for query in hmm_dict:
                if hmm_dict[query].hits:
                    if hmm_dict[query][0].is_included:
                        out_fd.write("%s\t%s\t%s\t%s\n" %
                                     (query, hmm_dict[query][0].id,
                                      hmm_dict[query][0].evalue,
                                      hmm_dict[query][0].bitscore))
                        top_hits_ids.append(query)
                    else:
                        not_significant_ids.append(query)
                else:
                    not_found_ids.append(query)
        if parsing_mode == "index_db":
            os.remove(index_file)

        for id_list, id_file in zip(
            [not_significant_ids, not_found_ids, top_hits_ids],
            [not_significant_ids_file, not_found_ids_file, top_hits_ids_file]):
            id_list.write(id_file)
Example #8
    def prepare_template_for_popart(alignment_file,
                                    output_file,
                                    haplotype_fam_file=None,
                                    traits_file=None,
                                    whitelist_file=None):
        from RouToolPa.Parsers.Sequence import CollectionSequence
        sequence_collection = CollectionSequence(in_file=alignment_file,
                                                 parsing_mode="parse")
        sequence_collection.get_stats_and_features(count_gaps=False,
                                                   sort=False)
        whitelist = IdSet(filename=whitelist_file)
        alignment_len = sequence_collection.seq_lengths["length"].unique()
        if len(alignment_len) > 1:
            raise ValueError(
                "ERROR!!! Sequences in alignment have different lengths!")
        alignment_len = alignment_len[0]

        haplotype_selected_sequence_dict = SynDict()
        haplotypes_without_sequences_ids = IdList()

        traits_df = pd.read_csv(
            traits_file, sep="\t",
            index_col=0) if traits_file else pd.DataFrame()

        if haplotype_fam_file:
            haplotype_dict = SynDict(filename=haplotype_fam_file,
                                     split_values=True)
            for haplotype_id in haplotype_dict:
                for sequence_id in haplotype_dict[haplotype_id]:
                    if sequence_id in sequence_collection.records:
                        haplotype_selected_sequence_dict[
                            haplotype_id] = sequence_id
                        break
                else:
                    haplotypes_without_sequences_ids.append(haplotype_id)
        else:
            haplotype_dict = dict([(entry, [entry])
                                   for entry in sequence_collection.scaffolds])
            haplotype_selected_sequence_dict = dict([
                (entry, entry) for entry in sequence_collection.scaffolds
            ])

        final_haplotype_set = set(haplotype_selected_sequence_dict.keys())
        if whitelist:
            final_haplotype_set &= whitelist

        with open(output_file, "w") as out_fd:
            #out_fd.write("#NEXUS\nBEGIN TAXA;\nDIMENSIONS\nNTAX = %i;\nTAXLABELS\n%s\n;\nEND;\n\n" % (len(haplotype_selected_sequence_dict),
            #                                                                                          "\n".join(haplotype_selected_sequence_dict.keys())))
            out_fd.write("#NEXUS\n\n")
            out_fd.write(
                "BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n"
                % (len(final_haplotype_set), alignment_len))
            out_fd.write("\tMATRIX\n")

            for haplotype_id in final_haplotype_set:
                out_fd.write(
                    "\t\t%s %s\n" % (haplotype_id, sequence_collection.records[
                        haplotype_selected_sequence_dict[haplotype_id]]))
            out_fd.write("\t;\nEND;\n\n")

            if not traits_df.empty:
                traits_number = len(traits_df.columns)
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n"
                    .format(traits_number))
                out_fd.write("\tTraitLabels {0};\n".format(" ".join(
                    traits_df.columns)))
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %s\n" %
                        (haplotype_id,
                         ",".join(map(str, traits_df.loc[haplotype_id]))
                         if haplotype_id in traits_df.index else
                         ("0," * traits_number)[:-1]))
            else:
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n"
                )
                out_fd.write("\tTraitLabels Area;\n")
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %i\n" %
                        (haplotype_id, len(haplotype_dict[haplotype_id])))
            out_fd.write("\t;\nEND;\n\n")
Example #9
    def handle_sanger_data(self,
                           input_dir,
                           output_prefix,
                           outdir=None,
                           read_subfolders=False,
                           min_mean_qual=0,
                           min_median_qual=0,
                           min_len=50):
        if outdir:
            self.workdir = outdir

        self.init_dirs()

        sanger_filelist = self.make_list_of_path_to_files(
            input_dir,
            expression=self.is_sanger_file,
            recursive=read_subfolders,
            return_absolute_paths=True)
        stat_dict = TwoLvlDict()
        record_dict = OrderedDict()
        trimmed_record_dict = OrderedDict()
        excluded_list = IdList()
        excluded_counter = 0
        low_quality_counter = 0
        too_short_counter = 0

        merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
        merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
        merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir,
                                                        output_prefix)
        merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir,
                                                        output_prefix)

        for filename in sanger_filelist:
            filename_list = self.split_filename(filename)

            record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir,
                                                              filename_list[1])
            record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir,
                                                              filename_list[1])
            record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (
                self.workdir, filename_list[1])

            record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (
                self.workdir, filename_list[1])
            record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (
                self.workdir, filename_list[1])
            record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (
                self.workdir, filename_list[1])

            record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
            record_dict[record.id] = record
            SeqIO.write(record, record_raw_fastq, format="fastq")
            SeqIO.write(record, record_raw_fasta, format="fasta")

            trimmed_record = SeqIO.AbiIO._abi_trim(record)

            stat_dict[record.id] = OrderedDict({
                "raw_len": len(record),
                "raw_mean_qual": np.mean(record.letter_annotations["phred_quality"]),
                "raw_median_qual": np.median(record.letter_annotations["phred_quality"]),
                "trimmed_len": len(trimmed_record),
                "trimmed_mean_qual": np.mean(trimmed_record.letter_annotations["phred_quality"]),
                "trimmed_median_qual": np.median(trimmed_record.letter_annotations["phred_quality"]),
                "retained": "-",
            })
            MatplotlibRoutines.draw_bar_plot(
                record.letter_annotations["phred_quality"],
                record_raw_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            if stat_dict[record.id]["trimmed_len"] >= min_len:
                if min_median_qual:
                    if (stat_dict[record.id]["trimmed_median_qual"] >=
                            min_median_qual) and (
                                stat_dict[record.id]["trimmed_mean_qual"] >=
                                min_mean_qual):
                        stat_dict[record.id]["retained"] = "+"
                    else:
                        low_quality_counter += 1
                else:
                    stat_dict[record.id]["retained"] = "+"
            else:
                too_short_counter += 1

            if stat_dict[record.id]["retained"] == "-":
                excluded_list.append(record.id)
                continue

            SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
            SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

            MatplotlibRoutines.draw_bar_plot(
                trimmed_record.letter_annotations["phred_quality"],
                record_trimmed_qual_plot_prefix,
                extentions=["png"],
                xlabel="Position",
                ylabel="Phred quality",
                title="Per base quality",
                min_value=None,
                max_value=None,
                new_figure=True,
                figsize=(3 * (int(len(record) / 100) + 1), 3),
                close_figure=True)

            trimmed_record_dict[record.id] = trimmed_record

        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(record_dict),
                    merged_raw_fasta,
                    format="fasta")

        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fastq,
                    format="fastq")
        SeqIO.write(self.record_from_dict_generator(trimmed_record_dict),
                    merged_trimmed_fasta,
                    format="fasta")

        excluded_list.write("%s.excluded.ids" % output_prefix)
        stat_dict.write(out_filename="%s.stats" % output_prefix)

        print("Excluded: %i" % excluded_counter)
        print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
        print("\tLow quality( median < %i or mean < %i ): %i" %
              (min_median_qual, min_mean_qual, low_quality_counter))
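
The retention rule in Example 9 (length gate first, then optional median/mean quality gates) can be isolated into a small predicate. A sketch using the standard statistics module in place of numpy; the helper name is hypothetical:

from statistics import mean, median

def retained(quals, min_len=50, min_mean_qual=0, min_median_qual=0):
    # Mirrors the filtering above: too-short reads fail outright,
    # and the quality gates apply only when min_median_qual is non-zero.
    if len(quals) < min_len:
        return False
    if min_median_qual:
        return median(quals) >= min_median_qual and mean(quals) >= min_mean_qual
    return True

print(retained([30] * 60, min_median_qual=20))  # True
print(retained([10] * 60, min_median_qual=20))  # False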
Example #10
                         count_dict[args.name_b][scaffold][i], ratio))
                elif ratio < (1.0 / float(args.minimal_ratio)):
                    vcf_b_more_variants_file_fd.write(
                        "%s\t%i\t%i\t%i\t%i\t%i\t%.3f\n" %
                        (scaffold, start, stop, i,
                         count_dict[args.name_a][scaffold][i],
                         count_dict[args.name_b][scaffold][i], ratio))

            elif count_dict[args.name_a][scaffold] == 0:
                vcf_a_no_variants_file_fd.write("%s\t%i\t%i\t%i" %
                                                (scaffold, start, stop, i))
            elif count_dict[args.name_b][scaffold] == 0:
                vcf_b_no_variants_file_fd.write("%s\t%i\t%i\t%i" %
                                                (scaffold, start, stop, i))

    else:
        if scaffold not in count_dict[args.name_a]:
            vcf_a_absent_scaffolds_id_list.append(scaffold)
        if scaffold not in count_dict[args.name_b]:
            vcf_b_absent_scaffolds_id_list.append(scaffold)

vcf_a_more_variants_file_fd.close()
vcf_b_more_variants_file_fd.close()
vcf_a_no_variants_file_fd.close()
vcf_b_no_variants_file_fd.close()

vcf_density_ratio_fd.close()

vcf_a_absent_scaffolds_id_list.write(vcf_a_absent_scaffolds_id_file)
vcf_b_absent_scaffolds_id_list.write(vcf_b_absent_scaffolds_id_file)
Example #11
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
annotations_dict = SeqIO.to_dict(GFF.parse(open(args.input)))
single_gene_id_list = IdList()

for record in annotations_dict:
    for feature in annotations_dict[record].features:
        #print feature.id
        if feature.type != "gene":
            continue
        for subfeature in feature.sub_features:
            if subfeature.type != "mRNA":
                continue
            exon_number = 0
            for mRNA_subfeature in subfeature.sub_features:
                if mRNA_subfeature.type == "exon":
                    exon_number += 1
            if exon_number == 1:
                single_gene_id_list.append(feature.id)

single_gene_id_list.write(out_fd, close_after_if_file_object=True)
"""
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(record_by_id_generator(sequence_dict, sequence_groups_id[group]),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
"""
Example #12
    def parallel_run(
        self,
        input_dir,
        output_dir,
        output_prefix,
        input_type="codon",
        min_seq_number_for_conserved_position=None,
        min_seq_number_for_flank_position=None,
        max_pos_number_for_noncons_contig_pos=None,
        min_block_len=None,
        allow_gaps="half",
        save_postscript=True,
        output_type="htm",
        threads=None,
    ):

        if threads:
            self.threads = threads

        data_dir = "%s/data/" % output_dir
        postscript_dir = "%s/ps/" % output_dir
        results_dir = "%s/results/" % output_dir
        htm_dir = "%s/htm/" % output_dir

        for directory in output_dir, data_dir, postscript_dir, results_dir, htm_dir:
            self.safe_mkdir(directory)

        #input_files_list = map(os.path.abspath, self.make_list_of_path_to_files(input_directory))

        input_files_list = self.make_list_of_path_to_files(
            input_dir, return_absolute_paths=True)

        for entry in input_files_list:
            directory, prefix, extension = self.split_filename(entry)
            os.system("ln -s %s %s/%s%s" %
                      (entry, data_dir, prefix, extension))

        data_files_list = self.make_list_of_path_to_files(
            data_dir, return_absolute_paths=True)

        common_options = self.parse_options(
            input_type=input_type,
            min_seq_number_for_conserved_position=min_seq_number_for_conserved_position,
            min_seq_number_for_flank_position=min_seq_number_for_flank_position,
            max_pos_number_for_noncons_contig_pos=max_pos_number_for_noncons_contig_pos,
            min_block_len=min_block_len,
            allow_gaps=allow_gaps,
            save_postscript=save_postscript,
            output_type=output_type,
            concatenate_blocks_from_aignments=None)
        options_list = []

        for data_file in data_files_list:
            options = " %s" % data_file
            options += " %s" % common_options
            options_list.append(options)

        self.parallel_execute(options_list=options_list)

        block_coordinates = OrderedDict()

        skipped_ids_file = "%s/%s.skipped.ids" % (output_dir, output_prefix)
        skipped_ids = IdList()

        for filename in data_files_list:
            data_dir, prefix, extension = self.split_filename(filename)
            blocks_file = "%s-gb" % filename
            htm_file = "%s-gb.htm" % filename
            postscript_file = "%s-gbPS" % filename

            if (not os.path.exists(blocks_file)) or (
                    not os.path.exists(htm_file)):
                skipped_ids.append(prefix)
                print("Warning!!! %s skipped..." % prefix)
                continue

            block_coordinates[prefix] = self.extract_block_coordinates(
                htm_file)
            os.system("mv %s %s/%s.ps" %
                      (postscript_file, postscript_dir, prefix))
            os.system("mv %s %s/%s.htm" % (htm_file, htm_dir, prefix))
            self.convert_output_to_fasta(
                blocks_file, "%s/%s%s" % (results_dir, prefix, extension))
            os.remove(blocks_file)

        block_coordinates_file = "%s/%s.block.coordinates" % (output_dir,
                                                              output_prefix)
        skipped_ids.write(skipped_ids_file)
        with open(block_coordinates_file, "w") as block_fd:
            for entry in block_coordinates:
                coordinates_string = ";".join(
                    map(lambda s: "%i,%i" % (s[0], s[1]),
                        block_coordinates[entry]))
                block_fd.write("%s\t%s\n" % (entry, coordinates_string))
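
Example 12 serialises each file's block coordinates as "start,end" pairs joined by ";". The round trip is a one-liner in each direction:

blocks = [(1, 120), (150, 300)]

# serialise exactly like the block_fd.write() call above
line = ";".join("%i,%i" % (start, end) for start, end in blocks)
print(line)  # 1,120;150,300

# and parse it back
parsed = [tuple(map(int, pair.split(","))) for pair in line.split(";")]
assert parsed == blocks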
Example #13
    if args.all or args.alignment:
        os.system("wget %s" % alignment_options)
    if args.all or args.tree:
        os.system("wget %s" % tree_options)
    if args.all or args.hmm:
        os.system("wget %s" % hmm_options)


pool = Pool(args.threads)
pool.map(download_data, family_ids)
pool.close()
for fam_id in family_ids:
    if args.all or args.alignment:
        if os.path.getsize("%s%s.fasta" % (args.output_dir, fam_id)) == 0:
            absent_alignment_list.append(fam_id)
    if args.all or args.tree:
        if os.path.getsize("%s%s.nwk" % (args.output_dir, fam_id)) == 0:
            absent_tree_list.append(fam_id)
    if args.all or args.hmm:
        if os.path.getsize("%s%s.hmm" % (args.output_dir, fam_id)) == 0:
            absent_hmm_list.append(fam_id)

if absent_alignment_list:
    absent_alignment_list.write("absent_alignments.ids")
    print("%i alignments were not downloaded" % len(absent_alignment_list))
if absent_tree_list:
    absent_tree_list.write("absent_trees.ids")
    print("%i trees were not downloaded" % len(absent_tree_list))
if absent_hmm_list:
    absent_hmm_list.write("absent_hmms.ids")
    print("%i hmms were not downloaded" % len(absent_hmm_list))