Example #1
    def add_length_to_accordance_file(accordance_file, length_file, output_prefix):

        accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
        length_dict = SynDict(filename=length_file, expression=int)
        print(length_dict)
        longest_list = IdList()

        all_output_file = "%s.all.correspondence" % output_prefix
        longest_output_file = "%s.longest.correspondence" % output_prefix
        longest_id_file = "%s.longest.ids" % output_prefix

        with open(all_output_file, "w") as all_out_fd:
            with open(longest_output_file, "w") as longest_out_fd:
                for gene in accordance_dict:
                    current_transcript = None
                    current_length = 0
                    for transcript in accordance_dict[gene]:
                        if length_dict[transcript] > current_length:
                            current_transcript = transcript
                            current_length = length_dict[transcript]
                        all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))

                    longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
                    longest_list.append(current_transcript)
        longest_list.write(longest_id_file)
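For context, a minimal call might look like the following sketch (the function is called as a plain function for brevity; file names are hypothetical, and SynDict/IdList are the custom collection classes used throughout these examples):

    # gene2transcript.tab : gene<TAB>transcript, one pair per line (a gene may repeat)
    # transcript_len.tab  : transcript<TAB>length
    add_length_to_accordance_file("gene2transcript.tab", "transcript_len.tab", "mygenome")
    # Writes mygenome.all.correspondence, mygenome.longest.correspondence and mygenome.longest.ids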
Example #2
    def get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict, output_prefix=None):
        length_file = "%s.protein_length.tsv" % output_prefix
        if output_prefix:
            longest_protein_id_file = "%s.longest_pep.ids" % output_prefix

            len_fd = open(length_file, 'w')
            len_fd.write("#gene_id\tprotein_id\tprotein_length\n")

        data_dict = OrderedDict()
        for protein_id in protein_dict:
            length = len(protein_dict[protein_id].seq)
            description_list = protein_dict[protein_id].description.split()
            #print protein_dict[protein_id]
            #print ''
            #print description_list

            for entry in description_list:
                if "gene:" in entry:
                    gene_id = entry.split(":")[1]
            if output_prefix:
                len_fd.write("%s\t%s\t%i\n" % (gene_id, protein_id, length))
            if gene_id not in data_dict:
                data_dict[gene_id] = protein_id
            else:
                if length > len(protein_dict[data_dict[gene_id]].seq):
                    data_dict[gene_id] = protein_id

        longest_pep_ids = IdList(data_dict.values())
        if output_prefix:
            longest_pep_ids.write(longest_protein_id_file)
            len_fd.close()
        return longest_pep_ids
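A sketch of how this function might be driven with Biopython, assuming an Ensembl peptide FASTA whose record descriptions carry a "gene:<id>" token (the file name is hypothetical):

    from Bio import SeqIO

    # Lazily index the peptide FASTA instead of loading it all into memory
    protein_dict = SeqIO.index("species.pep.all.fa", "fasta")
    longest_ids = get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict, output_prefix="species")
    # longest_ids now holds one protein id per gene: the longest isoform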
Example #3
    def extract_counts_by_max_level(input_file, output_prefix,
                                   separator="\t", verbose=True):
        output_file = "%s.divided_by_maxlvl" % output_prefix
        zero_max_lvl_list = IdList()

        zero_max_lvl_list_file = "%s.zero_max_lvl.ids" % output_prefix

        with open(input_file, "r") as in_fd:
            header = in_fd.readline()
            header_list = header.strip().split(separator)
            with open(output_file, "w") as out_fd:
                out_fd.write(header)
                for line in in_fd:
                    tmp_line = line.strip().split(separator)
                    data = np.array(list(map(float, tmp_line[1:])))
                    max_level = max(data)
                    if max_level == 0:
                        zero_max_lvl_list.append(tmp_line[0])

                        if verbose:
                            print("Zero max level for %s...Skipping..." % tmp_line[0])
                        continue

                    data /= max_level
                    output_string = tmp_line[0] + "\t"
                    output_string += "\t".join(map(str, data))
                    output_string += "\n"
                    out_fd.write(output_string)

        zero_max_lvl_list.write(zero_max_lvl_list_file)
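The same per-row normalisation can be reproduced with plain pandas; this is only a sketch, assuming the first column of a hypothetical counts.tsv holds feature ids and the remaining columns hold the counts:

    import pandas as pd

    df = pd.read_csv("counts.tsv", sep="\t", index_col=0)
    row_max = df.max(axis=1)
    normalised = df[row_max > 0].div(row_max[row_max > 0], axis=0)   # divide each row by its maximum
    normalised.to_csv("counts.divided_by_maxlvl", sep="\t")
    # Rows whose maximum is zero correspond to the .zero_max_lvl.ids list written above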
Example #4
    def create_per_cluster_element_id_files(self, cluster_dict,
                                            output_directory):
        self.safe_mkdir(output_directory)

        for cluster_id in cluster_dict:
            cluster_element_id_list = IdList(cluster_dict[cluster_id])
            cluster_element_id_list.write("%s/%s.ids" %
                                          (output_directory, cluster_id))
Example #5
    def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t",
                              comments_prefix="#", column_number=None):
        id_list = IdList()
        id_list.read(input_file, column_separator=column_separator, comments_prefix=comments_prefix,
                     column_number=column_number, header=header)
        if output_file:
            id_list.write(output_file, header=header)
        return id_list
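Typical use is a one-liner; for instance (file names are hypothetical):

    # Collect ids from the second column of a tab-separated table, skipping '#' comment lines
    ids = extract_ids_from_file("annotation.tab", output_file="annotation.ids", column_number=1)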
Example #6
    def prepare_annotation_file_from_transcript_and_cds(
            self,
            transcript_file,
            cds_file,
            correspondence_file,
            output_prefix,
            format="fasta",
            correspondence_key_column=0,
            correspondence_value_column=1,
            verbose=False):
        transcript_dict = self.parse_seq_file(transcript_file,
                                              "parse",
                                              format=format)

        cds_dict = self.parse_seq_file(cds_file, "parse", format=format)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      comments_prefix="#",
                                      key_index=correspondence_key_column,
                                      value_index=correspondence_value_column)

        no_corresponding_cds_transcript_list = IdList()
        cds_not_found_transcript_list = IdList()

        annotation_file = "%s.annotation" % output_prefix
        no_corresponding_cds_transcript_file = "%s.no_cds.id" % output_prefix
        cds_not_found_transcript_file = "%s.not_found_cds.id" % output_prefix

        with open(annotation_file, "w") as annotation_fd:
            for transcript_id in transcript_dict:
                if transcript_id not in correspondence_dict:
                    no_corresponding_cds_transcript_list.append(transcript_id)
                    if verbose:
                        print(
                            "No cds in correspondence file for transcript %s" %
                            transcript_id)
                    continue
                cds_id = correspondence_dict[transcript_id]
                length = len(cds_dict[cds_id].seq)
                start = transcript_dict[transcript_id].seq.upper().find(
                    cds_dict[cds_id].seq.upper())
                if start == -1:
                    cds_not_found_transcript_list.append(transcript_id)
                    if verbose:
                        print("CDS was not found for transcript %s" %
                              transcript_id)
                    continue
                annotation_string = "%s\t+\t%i\t%i\n" % (transcript_id,
                                                         start + 1, length)

                annotation_fd.write(annotation_string)

        no_corresponding_cds_transcript_list.write(
            no_corresponding_cds_transcript_file)
        cds_not_found_transcript_list.write(cds_not_found_transcript_file)
Example #7
    def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None, output_prefix=None, threads=4):

        try:
            os.mkdir(output_dir)
        except OSError:
            pass

        id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True, awk_code="{print $2}",
                          capture_output=True)

        split_index = 1
        ids_written = 0
        ids_list = IdList()
        #ids_list = read_ids(id_fd, close_after_if_file_object=False)
        ids_list.read(id_fd, close_after_if_file_object=True)
        number_of_ids = len(ids_list)
        out_prefix = self.split_filename(hmmfile)[1] if output_prefix is None else output_prefix

        num_of_ids = int(number_of_ids/num_of_files) + 1 if num_of_files else num_of_recs_per_file

        common_options = " -f"
        common_options += " %s" % hmmfile
        options_list = []
        while (ids_written + num_of_ids) <= number_of_ids:
            tmp_id_list = IdList(ids_list[ids_written:ids_written+num_of_ids])
            tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))

            options = common_options
            options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
            options_list.append(options)

            split_index += 1
            ids_written += num_of_ids

        if ids_written != number_of_ids:
            tmp_id_list = IdList(ids_list[ids_written:])
            tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))

            options = common_options
            options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
            options_list.append(options)

            split_index += 1
        #print options_list
        self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
Example #8
    def divide_counts_by_base_level(input_file, output_prefix, base_level,
                                    separator="\t", verbose=True, secondary_base_lvl=None):
        output_file = "%s.divided_by_baselvl" % output_prefix
        zero_base_lvl_list = IdList()
        zero_both_base_lvls_list = IdList()
        zero_base_lvl_list_file = "%s.zero_base_lvl.ids" % output_prefix
        zero_both_base_lvls_list_file = "%s.zero_base_lvls.ids" % output_prefix
        with open(input_file, "r") as in_fd:
            header = in_fd.readline()
            header_list = header.strip().split(separator)
            data_base_level_index = header_list.index(base_level) - 1
            if secondary_base_lvl:
                data_secondary_base_level_index = header_list.index(secondary_base_lvl) - 1
            with open(output_file, "w") as out_fd:
                out_fd.write(header)
                for line in in_fd:
                    tmp_line = line.strip().split(separator)
                    data = np.array(list(map(float, tmp_line[1:])))
                    if data[data_base_level_index] == 0:
                        zero_base_lvl_list.append(tmp_line[0])
                        if not secondary_base_lvl:
                            if verbose:
                                print("Zero base level(%s) for %s...Skipping..." % (base_level, tmp_line[0]))
                            continue
                    if secondary_base_lvl:
                        if data[data_secondary_base_level_index] == 0:
                            zero_both_base_lvls_list.append(tmp_line[0])
                            if verbose:
                                print("Both base levels are zero (%s, %s) for %s...Skipping..." % (base_level,
                                                                                                   secondary_base_lvl,
                                                                                                   tmp_line[0]))
                            continue

                        data /= data[data_base_level_index] if data[data_base_level_index] != 0 else data[data_secondary_base_level_index]
                    else:
                        data /= data[data_base_level_index]
                    output_string = tmp_line[0] + "\t"
                    output_string += "\t".join(map(str, data))
                    output_string += "\n"
                    out_fd.write(output_string)

        zero_base_lvl_list.write(zero_base_lvl_list_file)
        zero_both_base_lvls_list.write(zero_both_base_lvls_list_file)
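A call might look like this, assuming a tab-separated count table whose header names the samples and whose first column holds feature ids (all names are hypothetical):

    # Normalise every row by its value in the "control" column; rows where "control" is zero
    # are written to counts_norm.zero_base_lvl.ids and skipped
    divide_counts_by_base_level("counts.tsv", "counts_norm", "control")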
Example #9
    def divide_counts_by_several_base_level(input_file, output_prefix, base_levels,
                                            separator="\t", verbose=True,
                                            max_ratio_to_base_lvl=0.5):
        output_file = "%s.divided_by_max_baselvl" % output_prefix
        max_ratio_to_base_lvl_file = "%s.divided_by_max_baselvl.max_%f_ratio" % (output_prefix, max_ratio_to_base_lvl)
        zero_max_base_lvl_list = IdList()
        zero_max_base_lvl_list_file = "%s.zero_base_lvls.ids" % output_prefix
        max_ratio_to_base_lvl_fd = open(max_ratio_to_base_lvl_file, "w")
        with open(input_file, "r") as in_fd:
            header = in_fd.readline()
            header_list = header.strip().split(separator)

            data_base_lvl_index_list = []
            base_level_list = [base_levels] if isinstance(base_levels, str) else base_levels
            for level in base_level_list:
                data_base_lvl_index_list.append(header_list.index(level) - 1)

            with open(output_file, "w") as out_fd:
                out_fd.write(header)
                max_ratio_to_base_lvl_fd.write(header)
                for line in in_fd:
                    tmp_line = line.strip().split(separator)
                    data = np.array(list(map(float, tmp_line[1:])))
                    max_base_lvl = max(np.take(data, data_base_lvl_index_list))
                    if max_base_lvl == 0:
                        zero_max_base_lvl_list.append(tmp_line[0])
                        if verbose:
                            print("Zero max base level(s) for %s...Skipping..." % tmp_line[0])
                        continue

                    data /= max_base_lvl
                    output_string = tmp_line[0] + "\t"
                    output_string += "\t".join(map(str, data))
                    output_string += "\n"
                    if max(np.delete(data, data_base_lvl_index_list)) <= max_ratio_to_base_lvl:
                        max_ratio_to_base_lvl_fd.write(output_string)
                    out_fd.write(output_string)

        zero_max_base_lvl_list.write(zero_max_base_lvl_list_file)
        max_ratio_to_base_lvl_fd.close()
Example #10
    def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None, species_id=None):

        extracted_families = SynDict()
        common_protein_names_to_families_dict = SynDict()
        common_names_to_eggnog_proteins_syn_dict = SynDict()

        not_found_proteins_common_names = IdList()

        transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

        for common_protein_name in protein_syn_dict:
            not_found = True
            for protein_id in protein_syn_dict[common_protein_name]:
                extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
                if extended_protein_id in transposed_eggnog_fam_dict:
                    not_found = False
                    if common_protein_name not in common_protein_names_to_families_dict:
                        common_protein_names_to_families_dict[common_protein_name] = [transposed_eggnog_fam_dict[extended_protein_id][0]]
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                    else:
                        common_protein_names_to_families_dict[common_protein_name].append(transposed_eggnog_fam_dict[extended_protein_id][0])
                        common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                    if transposed_eggnog_fam_dict[extended_protein_id][0] not in extracted_families:
                        extracted_families[transposed_eggnog_fam_dict[extended_protein_id][0]] = eggnog_fam_dict[transposed_eggnog_fam_dict[extended_protein_id][0]]

            if not_found:
                not_found_proteins_common_names.append(common_protein_name)

        if output_prefix:
            extracted_families.write(filename="%s.extracted_families.fam" % output_prefix, splited_values=True)
            common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix, splited_values=True)
            common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix, splited_values=True)
            not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

            #print common_names_to_eggnog_proteins_syn_dict
            #print common_protein_names_to_families_dict
        return extracted_families, common_protein_names_to_families_dict, \
               common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
Example #11
    def cluster_sequence_names_by_id_fragment(self,
                                              seq_id_list,
                                              id_element_index,
                                              id_separator="_",
                                              output_prefix=None):
        cluster_dict = SynDict()
        skipped_id_list = IdList()

        for seq_id in seq_id_list:
            seq_id_splited = seq_id.split(id_separator)
            if id_element_index < len(seq_id_splited):
                if seq_id_splited[id_element_index] in cluster_dict:
                    cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
                else:
                    cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
            else:
                skipped_id_list.append(seq_id)

        if output_prefix:
            cluster_dict.write("%s.seqid.clusters" % output_prefix,
                               splited_values=True)
            skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

        return cluster_dict
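For example, with ids of the form <species>_<gene>_<copy>, clustering on the gene fragment (index 1) might look like the following sketch; the ids are hypothetical and obj stands for an instance of the class defining this method:

    seq_ids = ["homSap_ENSG0001_1", "homSap_ENSG0001_2", "musMus_ENSG0002_1"]
    clusters = obj.cluster_sequence_names_by_id_fragment(seq_ids, 1, id_separator="_",
                                                         output_prefix="clusters")
    # clusters -> {"ENSG0001": ["homSap_ENSG0001_1", "homSap_ENSG0001_2"],
    #              "ENSG0002": ["musMus_ENSG0002_1"]}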
Example #12
    def extract_top_hits(hmmer_hits,
                         top_hits_file,
                         top_hits_ids_file=None,
                         not_significant_ids_file=None,
                         not_found_ids_file=None):
        top_hits_ids = IdList()
        not_significant_ids = IdList()
        not_found_ids = IdList()

        index_file = "hmmer_hits.tmp.idx"
        hmm_dict = SearchIO.index_db(index_file, hmmer_hits, "hmmer3-text")

        out_fd = open(top_hits_file, "w")
        out_fd.write("#query\thit\tevalue\tbitscore\n")

        for query in hmm_dict:
            if hmm_dict[query].hits:
                if hmm_dict[query][0].is_included:
                    out_fd.write("%s\t%s\t%s\t%s\n" %
                                 (query, hmm_dict[query][0].id,
                                  hmm_dict[query][0].evalue,
                                  hmm_dict[query][0].bitscore))
                    top_hits_ids.append(query)
                else:
                    not_significant_ids.append(query)
            else:
                not_found_ids.append(query)

        os.remove(index_file)

        if not_significant_ids_file:
            not_significant_ids.write(not_significant_ids_file)

        if not_found_ids_file:
            not_found_ids.write(not_found_ids_file)

        if top_hits_ids_file:
            top_hits_ids.write(top_hits_ids_file)
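A minimal call, assuming plain-text hmmsearch output that Biopython's hmmer3-text parser can read (file names are hypothetical):

    extract_top_hits("hmmsearch.out", "top_hits.tsv",
                     top_hits_ids_file="top_hits.ids",
                     not_significant_ids_file="not_significant.ids",
                     not_found_ids_file="not_found.ids")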
Example #13
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
annotations_dict = SeqIO.to_dict(GFF.parse(open(args.input)))
single_gene_id_list = IdList()

for record in annotations_dict:
    for feature in annotations_dict[record].features:
        #print feature.id
        if feature.type != "gene":
            continue
        for subfeature in feature.sub_features:
            if subfeature.type != "mRNA":
                continue
            exon_number = 0
            for mRNA_subfeature in subfeature.sub_features:
                if mRNA_subfeature.type == "exon":
                    exon_number += 1
            if exon_number == 1:
                single_gene_id_list.append(feature.id)

single_gene_id_list.write(out_fd, close_after_if_file_object=True)
"""
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(record_by_id_generator(sequence_dict, sequence_groups_id[group]),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
"""
Example #14
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
complicated_families_syn_dict.write("complicated_families_connections.t", splited_values=True)

for entry in complicated_families_dict.all_values():
    tmp = entry.split(";")
    for i in range(0, len(tmp)):
        if "_" in tmp[i]:
            tmp[i] = tmp[i][2:]
        tmp[i] = tmp[i].split(",")
        for syn_id in tmp[i]:
            complicated_families_syn_ids.add(syn_id)
complicated_families_syn_ids.write("complicated_families_check.ids")

nonassembled.write("splited_to_several_families.t", absent_symbol=".")

assemled_to_different_families = species_syn_dict.filter_by_line(filter_different_assembly)
species_syn_dict.write("correctly_assembled_families_in_all_species.t", absent_symbol=".")
assemled_to_different_families.write("assembled_to_different_families_in_all_species.t", absent_symbol=".")

correctly_assembled_families_synonym = IdList(set(species_syn_dict.all_values()))
assemled_to_different_families_synonym = IdList(set(assemled_to_different_families.all_values()))

correctly_assembled_families_synonym.write("correctly_assembled_families_syn_in_all_species.ids")
assemled_to_different_families_synonym.write("assembled_to_different_families_syn_in_all_species.ids")
if args.output != "output":
    out_fd.close()
Example #15
    def check_gvcf_integrity(self, gvcf_file, output_prefix, reference=None, length_dict=None, parsing_mode="parse"):
        len_dict = length_dict if length_dict else self.get_lengths(record_dict=self.parse_seq_file(reference,
                                                                                                    mode=parsing_mode),
                                                                    out_file=None,
                                                                    close_after_if_file_object=False)

        scaffold_dict = OrderedDict()

        with self.metaopen(gvcf_file, "r") as gvcf_fd:
            prev_scaffold = ""

            for line in gvcf_fd:
                #print line
                if line[0] == "#":
                    continue

                line_list = line.split("\t")
                scaffold = line_list[0]
                start = int(line_list[1])
                format = line_list[7].split(";")

                if (len(format) == 1) and (format[0][0:3] == "END"):
                    end = int(format[0].split("=")[1])
                else:
                    end = start + len(line_list[3]) - 1
                #print line_list
                #print scaffold, start, end, format

                if scaffold not in scaffold_dict:
                    scaffold_dict[scaffold] = []

                if scaffold != prev_scaffold:
                    scaffold_dict[scaffold].append([deepcopy(start), deepcopy(end)])
                else:
                    #print scaffold_dict[scaffold][-1][1]
                    if scaffold_dict[scaffold][-1][1] + 1 >= start:
                        scaffold_dict[scaffold][-1][1] = deepcopy(max(end, scaffold_dict[scaffold][-1][1]))
                    else:
                        print(scaffold_dict[scaffold])
                        print(line)
                        scaffold_dict[scaffold].append([deepcopy(start), deepcopy(end)])
                prev_scaffold = scaffold

        complete_scaffolds = IdList()
        fragmented_scaffolds = IdList()
        scaffolds_with_absent_fragments = IdList()

        with open("%s.scaffold_regions" % output_prefix, "w") as scaf_reg_fd:

            for scaffold in scaffold_dict:
                if len(scaffold_dict[scaffold]) > 1:
                    fragmented_scaffolds.append(scaffold)

                scaffold_length = sum(map(lambda s: s[1] - s[0] + 1, scaffold_dict[scaffold]))
                if scaffold_length != len_dict[scaffold]:
                    scaffolds_with_absent_fragments.append(scaffold)
                else:
                    complete_scaffolds.append(scaffold)
                scaf_reg_fd.write("%s\t%s\n" % (scaffold, ",".join(map(lambda s: "-".join(map(str,s)), scaffold_dict[scaffold]))))

        complete_scaffolds.write("%s.complete_scaffolds" % output_prefix)
        fragmented_scaffolds.write("%s.fragmented_scaffolds" % output_prefix)
        scaffolds_with_absent_fragments.write("%s.scaffolds_with_absent_fragments" % output_prefix)
Example #16
import sys
import argparse

from CustomCollections.GeneralCollections import IdList

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--fam_file",
                    action="store",
                    dest="fam_file",
                    required=True,
                    help="File with families")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="File to write ids")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
id_list.read(args.fam_file,
             close_after_if_file_object=True,
             column_number=1,
             id_in_column_separator=",")
id_list.write(out_fd, close_after_if_file_object=True)
Example #17
    def parallel_run(
        self,
        input_dir,
        output_dir,
        output_prefix,
        input_type="codon",
        min_seq_number_for_conserved_position=None,
        min_seq_number_for_flank_position=None,
        max_pos_number_for_noncons_contig_pos=None,
        min_block_len=None,
        allow_gaps="half",
        save_postscript=True,
        output_type="htm",
        threads=None,
    ):

        if threads:
            self.threads = threads

        data_dir = "%s/data/" % output_dir
        postscript_dir = "%s/ps/" % output_dir
        results_dir = "%s/results/" % output_dir
        htm_dir = "%s/htm/" % output_dir

        for directory in output_dir, data_dir, postscript_dir, results_dir, htm_dir:
            self.safe_mkdir(directory)

        #input_files_list = map(os.path.abspath, self.make_list_of_path_to_files(input_directory))

        input_files_list = self.make_list_of_path_to_files(
            input_dir, return_absolute_paths=True)

        for entry in input_files_list:
            directory, prefix, extension = self.split_filename(entry)
            os.system("ln -s %s %s/%s%s" %
                      (entry, data_dir, prefix, extension))

        data_files_list = self.make_list_of_path_to_files(
            data_dir, return_absolute_paths=True)

        common_options = self.parse_options(
            input_type=input_type,
            min_seq_number_for_conserved_position=min_seq_number_for_conserved_position,
            min_seq_number_for_flank_position=min_seq_number_for_flank_position,
            max_pos_number_for_noncons_contig_pos=max_pos_number_for_noncons_contig_pos,
            min_block_len=min_block_len,
            allow_gaps=allow_gaps,
            save_postscript=save_postscript,
            output_type=output_type,
            concatenate_blocks_from_aignments=None)
        options_list = []

        for data_file in data_files_list:
            options = " %s" % data_file
            options += " %s" % common_options
            options_list.append(options)

        self.parallel_execute(options_list=options_list)

        block_coordinates = OrderedDict()

        skipped_ids_file = "%s/%s.skipped.ids" % (output_dir, output_prefix)
        skipped_ids = IdList()

        for filename in data_files_list:
            data_dir, prefix, extension = self.split_filename(filename)
            blocks_file = "%s-gb" % filename
            htm_file = "%s-gb.htm" % filename
            postscript_file = "%s-gbPS" % filename

            if (not os.path.exists(blocks_file)) or (
                    not os.path.exists(htm_file)):
                skipped_ids.append(prefix)
                print("Warning!!! %s skipped..." % prefix)
                continue

            block_coordinates[prefix] = self.extract_block_coordinates(
                htm_file)
            os.system("mv %s %s/%s.ps" %
                      (postscript_file, postscript_dir, prefix))
            os.system("mv %s %s/%s.htm" % (htm_file, htm_dir, prefix))
            self.convert_output_to_fasta(
                blocks_file, "%s/%s%s" % (results_dir, prefix, extension))
            os.remove(blocks_file)

        block_coordinates_file = "%s/%s.block.coordinates" % (output_dir,
                                                              output_prefix)
        skipped_ids.write(skipped_ids_file)
        with open(block_coordinates_file, "w") as block_fd:
            for entry in block_coordinates:
                coordinates_string = ";".join(
                    map(lambda s: "%i,%i" % (s[0], s[1]),
                        block_coordinates[entry]))
                block_fd.write("%s\t%s\n" % (entry, coordinates_string))
Example #18
    if args.all or args.tree:
        os.system("wget %s" % tree_options)
    if args.all or args.hmm:
        os.system("wget %s" % hmm_options)


pool = Pool(args.threads)
pool.map(download_data, family_ids)
pool.close()
for fam_id in family_ids:
    if args.all or args.alignment:
        if os.path.getsize("%s%s.fasta" % (args.output_dir, fam_id)) == 0:
            absent_alignment_list.append(fam_id)
    if args.all or args.tree:
        if os.path.getsize("%s%s.nwk" % (args.output_dir, fam_id)) == 0:
            absent_tree_list.append(fam_id)
    if args.all or args.hmm:
        if os.path.getsize("%s%s.hmm" % (args.output_dir, fam_id)) == 0:
            absent_hmm_list.append(fam_id)

print(absent_alignment_list)
if absent_alignment_list:
    absent_alignment_list.write("absent_alignments.ids")
    print("%i alignments were not downloaded" % len(absent_alignment_list))
if absent_tree_list:
    absent_tree_list.write("absent_trees.ids")
    print("%i trees were not downloaded" % len(absent_tree_list))
if absent_hmm_list:
    absent_hmm_list.write("absent_hmms.ids")
    print("%i hmms were not downloaded" % len(absent_hmm_list))
Example #19
    def get_cds_for_proteins(self, protein_id_list, output_prefix, download_chunk_size=100, temp_dir_prefix="temp"):

        from Tools.Abstract import Tool

        transcript_temp_dir = "%s_transcripts" % temp_dir_prefix
        protein_temp_dir = "%s_proteins" % temp_dir_prefix
        number_of_ids = len(protein_id_list)
        print "Total %i ids" % number_of_ids

        for directory in transcript_temp_dir, protein_temp_dir:
            self.safe_mkdir(directory)
        pep_file = "%s.pep.genbank" % output_prefix
        transcript_file = "%s.trascript.genbank" % output_prefix

        ranges = np.append(np.arange(0, number_of_ids, download_chunk_size), [number_of_ids])

        print "Downloading proteins..."
        for i in range(0, len(ranges)-1):
            print "Downloading chunk %i" % i
            pep_tmp_file = "%s/%s_%i" % (protein_temp_dir, pep_file, i)
            self.efetch("protein", protein_id_list[ranges[i]:ranges[i+1]], pep_tmp_file, rettype="gb", retmode="text")

        os.system("cat %s/* > %s" % (protein_temp_dir, pep_file))

        peptide_dict = SeqIO.index_db("tmp.idx", pep_file, format="genbank")
        downloaded_protein_ids = IdList(peptide_dict.keys())

        print "%i proteins were downloaded" % len(downloaded_protein_ids)
        not_downloaded_proteins_ids = Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="only_a")
        print "%i proteins were not downloaded" % len(not_downloaded_proteins_ids)
        not_downloaded_proteins_ids.write("%s.not_downloaded.ids" % output_prefix)
        downloaded_protein_ids.write("%s.downloaded.ids" % output_prefix)
        print Tool.intersect_ids([protein_id_list], [downloaded_protein_ids], mode="count")

        pep_without_transcripts = IdList()
        pep_with_several_CDS_features = IdList()
        pep_to_transcript_accordance = SynDict()
        transcript_ids = IdList()

        print "Extracting transcript ids corresponding to proteins..."
        for pep_id in peptide_dict:
            for feature in peptide_dict[pep_id].features:
                if feature.type == "CDS":
                    try:
                        transcript_id = feature.qualifiers["coded_by"][0].split(":")[0]
                        if pep_id not in pep_to_transcript_accordance:
                            pep_to_transcript_accordance[pep_id] = [transcript_id]
                        else:
                            pep_to_transcript_accordance[pep_id].append(transcript_id)
                            print("Genbank record for %s contains several CDS features" % pep_id)
                            pep_with_several_CDS_features.append(pep_id)
                        if transcript_id in transcript_ids:
                            print "Repeated transcript id: %s" % transcript_id
                            continue
                        transcript_ids.append(transcript_id)
                    except (KeyError, IndexError):
                        print("Transcript id for %s was not found" % pep_id)
                        pep_without_transcripts.append(pep_id)

        pep_with_several_CDS_features.write("%s.pep_with_several_CDS.ids" % output_prefix)
        pep_without_transcripts.write("%s.pep_without_transcripts.ids" % output_prefix)
        transcript_ids.write("%s.transcripts.ids" % output_prefix)

        number_of_transcripts = len(transcript_ids)
        print "%i transcripts were found" % number_of_transcripts

        pep_to_transcript_accordance.write("%s.pep_to_transcript.accordance" % output_prefix, splited_values=True)

        transcript_ranges = np.append(np.arange(0, number_of_transcripts, download_chunk_size), [number_of_transcripts])

        print "Downloading transcripts..."
        for i in range(0, len(transcript_ranges)-1):
            print "Downloading chunk %i" % i
            transcript_tmp_file = "%s/%s_%i" % (transcript_temp_dir, transcript_file, i)
            self.efetch("nuccore", transcript_ids[transcript_ranges[i]:transcript_ranges[i+1]],
                        transcript_tmp_file, rettype="gb", retmode="text")

        os.system("cat %s/* > %s" % (transcript_temp_dir, transcript_file))


        transcript_dict = SeqIO.index_db("tmp_1.idx", transcript_file, format="genbank")

        cds_records_list = []
        for transcript_id in transcript_dict:
            for feature in transcript_dict[transcript_id].features:
                CDS_counter = 1
                if feature.type == "CDS":
                    #print feature

                    feature_seq = feature.extract(transcript_dict[transcript_id].seq)
                    feature_id = transcript_id  # the case of several CDS per transcript is not handled here
                    if "protein_id" in feature.qualifiers:
                        description = "protein=%s" % feature.qualifiers["protein_id"][0]
                    else:
                        description = ""
                        print("Corresponding protein id was not found for %s" % transcript_id)
                    cds_records_list.append(SeqRecord(seq=feature_seq, id=feature_id, description=description))
        SeqIO.write(cds_records_list, "%s.cds" % output_prefix, format="fasta")

        stat_string = "Input protein ids\t %i\n" % number_of_ids
        stat_string += "Downloaded proteins\t%i\n" % number_of_transcripts
        stat_string += "Downloaded transcripts\t%i\n" % len(transcript_dict)

        print stat_string

        with open("%s.stats" % output_prefix, "w") as stat_fd:
            stat_fd.write(stat_string)

        for filename in "tmp.idx", "tmp_1.idx":
            os.remove(filename)