Example #1
0
    def extract_annotation_by_refence_id(list_of_target_gff, id_file,
                                         extracted_gff, filtered_out_gff):
        """
        Split GFF dumps from several files into two outputs by target id.

        Every input file is scanned for sections delimited by the
        "# --- START OF GFF DUMP ---" and "# --- END OF GFF DUMP ---" lines.
        The target name is parsed from the first non-comment line of a
        section (9th tab-separated column, second ";"-separated field,
        second whitespace-separated token). Sections whose target name is
        present in id_file go to extracted_gff, all others go to
        filtered_out_gff. Comment lines preceding the first record and the
        END marker itself are not copied.

        :param list_of_target_gff: iterable with paths of input files
        :param id_file: file with ids, read via IdList.read
        :param extracted_gff: output path for sections with listed ids
        :param filtered_out_gff: output path for the remaining sections
        """
        start_marker = "# --- START OF GFF DUMP ---\n"
        end_marker = "# --- END OF GFF DUMP ---\n"

        ids = IdList()
        ids.read(id_file)

        # "with" guarantees both outputs are closed even if parsing fails
        with open(extracted_gff, "w") as extracted_gff_fd, \
                open(filtered_out_gff, "w") as filtered_out_gff_fd:
            for filename in list_of_target_gff:
                with open(filename, "r") as in_fd:
                    for line in in_fd:
                        if line != start_marker:
                            continue

                        # skip the comment header of the dump; the first
                        # non-comment line carries the target name
                        record = next(in_fd, "")
                        while record.startswith("#"):
                            record = next(in_fd, "")
                        if not record:
                            # file ended inside the header: nothing to copy
                            break

                        target_name = record.split("\t")[8].split(
                            ";")[1].split()[1]
                        if target_name in ids:
                            writing_fd = extracted_gff_fd
                        else:
                            writing_fd = filtered_out_gff_fd

                        writing_fd.write(record)
                        # copy the rest of the dump; the loop also stops at
                        # end of file, so a truncated dump cannot hang it
                        for record in in_fd:
                            if record == end_marker:
                                break
                            writing_fd.write(record)
Example #2
0
    def extract_top_hits_from_target_gff(list_of_target_gff,
                                         top_hits_gff,
                                         secondary_hits_gff,
                                         id_white_list_file=None,
                                         max_hits_per_query=None):
        """
        Separate the first hit of every target from its later hits.

        Input files are scanned for sections delimited by the
        "# --- START OF GFF DUMP ---" and "# --- END OF GFF DUMP ---" lines.
        The target name is parsed from the first non-comment line of a
        section (9th tab-separated column, second ";"-separated field,
        second whitespace-separated token). The first section seen for a
        target is written to top_hits_gff, every later section of the same
        target to secondary_hits_gff. In the first record of each section
        "gene_id 0" is replaced by "gene_id g<N>_h<M>", where N is the
        ordinal of the target and M the ordinal of the hit for that target.

        :param list_of_target_gff: iterable with paths of input files
        :param top_hits_gff: output path for first hits
        :param secondary_hits_gff: output path for later hits
        :param id_white_list_file: optional id file (read via IdList.read);
            sections of targets absent from it are ignored
        :param max_hits_per_query: optional maximal number of hits written
            per target; None or 0 means no limit
        """
        start_marker = "# --- START OF GFF DUMP ---\n"
        end_marker = "# --- END OF GFF DUMP ---\n"

        white_ids = None
        if id_white_list_file:
            white_ids = IdList()
            white_ids.read(id_white_list_file)

        # target name -> [gene number, hits seen so far]; counters are kept
        # per target, so hits of different targets may be interleaved
        target_stats = {}

        # "with" guarantees both outputs are closed even if parsing fails
        with open(top_hits_gff, "w") as top_hits_gff_fd, \
                open(secondary_hits_gff, "w") as secondary_hits_gff_fd:
            for filename in list_of_target_gff:
                with open(filename, "r") as in_fd:
                    for line in in_fd:
                        if line != start_marker:
                            continue

                        # skip the comment header of the dump; the first
                        # non-comment line carries the target name
                        first_record = next(in_fd, "")
                        while first_record.startswith("#"):
                            first_record = next(in_fd, "")
                        if not first_record:
                            # file ended inside the header
                            break

                        target_name = first_record.split("\t")[8].split(
                            ";")[1].split()[1]
                        if white_ids is not None and target_name not in white_ids:
                            # remaining lines of this dump never match the
                            # START marker and are skipped by the outer loop
                            continue

                        if target_name in target_stats:
                            writing_fd = secondary_hits_gff_fd
                        else:
                            writing_fd = top_hits_gff_fd
                            target_stats[target_name] = [len(target_stats) + 1, 0]
                        stats = target_stats[target_name]
                        stats[1] += 1
                        gene_number, hit_number = stats

                        # a falsy limit (None or 0) disables the cut-off;
                        # comparing an int with None must be avoided
                        keep_hit = (not max_hits_per_query) or \
                            hit_number <= max_hits_per_query

                        if keep_hit:
                            writing_fd.write(first_record.replace(
                                "gene_id 0",
                                "gene_id g%i_h%i" % (gene_number, hit_number)))

                        # consume the rest of the dump even when the hit is
                        # dropped; stops at end of file for truncated dumps
                        for record in in_fd:
                            if record == end_marker:
                                break
                            if keep_hit:
                                writing_fd.write(record)
Example #3
0
    def extract_sequences_from_selected_clusters(
            self,
            clusters_id_file,
            cluster_file,
            seq_file,
            output_dir="./",
            seq_format="fasta",
            out_prefix=None,
            create_dir_for_each_cluster=False,
            skip_cluster_if_no_sequence_for_element=True):
        """
        Write sequences of selected clusters into per-cluster files.

        :param clusters_id_file: file with ids of clusters to extract;
            if empty/None all clusters from cluster_file are used
        :param cluster_file: file with clusters (read via SynDict.read with
            "," as separator of values)
        :param seq_file: file(s) with sequences, indexed via SeqIO.index_db
        :param output_dir: directory for output files
        :param seq_format: format used for both reading and writing
        :param out_prefix: if set, it is used as output file name instead of
            the cluster id and a directory per cluster is always created
        :param create_dir_for_each_cluster: create a directory per cluster
        :param skip_cluster_if_no_sequence_for_element: skip clusters for
            which self.check_absence_of_cluster_elements reports elements
        :return: number of clusters skipped due to absent elements
        """
        from Routines import SequenceRoutines, FileRoutines
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")

        index_file = "tmp.idx"
        protein_dict = SeqIO.index_db(
            index_file,
            FileRoutines.make_list_of_path_to_files(seq_file),
            format=seq_format)

        number_of_skipped_clusters = 0
        try:
            for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
                # ids may come from a separate file, so check them before
                # any cluster_dict[fam_id] lookup
                if fam_id not in cluster_dict:
                    print("%s was not found" % fam_id)
                    continue

                if skip_cluster_if_no_sequence_for_element:
                    absent_elements = self.check_absence_of_cluster_elements(
                        cluster_dict[fam_id], protein_dict)
                    if absent_elements:
                        print("Skipping cluster %s due to absent element(%s)" % (
                            fam_id, ",".join(absent_elements)))
                        number_of_skipped_clusters += 1
                        continue

                file_stem = out_prefix if out_prefix else fam_id
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, file_stem)
                else:
                    out_file = "%s/%s.fasta" % (out_dir, file_stem)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, cluster_dict[fam_id], verbose=True),
                            out_file,
                            format=seq_format)
        finally:
            # do not leave the temporary index behind if extraction fails
            os.remove(index_file)

        print("%i of %i clusters were skipped due to absent elements" % (
            number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
Example #4
0
 def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t",
                           comments_prefix="#", column_number=None):
     """
     Read ids from input_file into an IdList and return it.

     All parsing options (header, column_separator, comments_prefix,
     column_number) are forwarded to IdList.read. If output_file is set,
     the ids are also written there via IdList.write with the same header
     flag.
     """
     extracted_ids = IdList()
     extracted_ids.read(input_file,
                        header=header,
                        column_number=column_number,
                        column_separator=column_separator,
                        comments_prefix=comments_prefix)
     if not output_file:
         return extracted_ids

     extracted_ids.write(output_file, header=header)
     return extracted_ids
Example #5
0
    def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None, output_prefix=None, threads=4):
        """
        Split a HMM file into several files using hmmfetch.

        Ids are collected from hmmfile (CGAS.cgas call selecting the second
        field of lines matching "NAME"), cut into consecutive chunks, and
        each chunk is written to "<output_dir>/<prefix>_<index>.ids". For
        every chunk a command line " -f <hmmfile> <ids file> > <hmm file>"
        is built and all of them are passed to self.parallel_execute with
        cmd="hmmfetch".

        :param hmmfile: input HMM file
        :param output_dir: directory for .ids and .hmm files; created if absent
        :param num_of_recs_per_file: ids per chunk, used if num_of_files is not set
        :param num_of_files: if set, chunk size is int(number of ids / num_of_files) + 1
        :param output_prefix: prefix of output files; by default the second
            element returned by self.split_filename(hmmfile)
        :param threads: forwarded to self.parallel_execute
        :raises ValueError: if the resulting chunk size is not positive
        """
        try:
            os.mkdir(output_dir)
        except OSError:
            # an already existing directory is fine, any other failure
            # (permissions, missing parent, ...) must not be hidden
            if not os.path.isdir(output_dir):
                raise

        id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True, awk_code="{print $2}",
                          capture_output=True)

        ids_list = IdList()
        ids_list.read(id_fd, close_after_if_file_object=True)
        number_of_ids = len(ids_list)
        out_prefix = self.split_filename(hmmfile)[1] if output_prefix is None else output_prefix

        ids_per_file = int(number_of_ids / num_of_files) + 1 if num_of_files else num_of_recs_per_file
        if not ids_per_file or ids_per_file < 1:
            # a zero/negative/missing chunk size can never consume the ids
            raise ValueError("Number of records per file must be a positive integer, got %r"
                             % (ids_per_file,))

        options_list = []
        # one pass handles both full chunks and the shorter trailing chunk
        chunk_starts = range(0, number_of_ids, ids_per_file)
        for split_index, start in enumerate(chunk_starts, 1):
            ids_file = "%s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            hmm_chunk_file = "%s/%s_%i.hmm" % (output_dir, out_prefix, split_index)

            IdList(ids_list[start:start + ids_per_file]).write(ids_file)
            options_list.append(" -f %s %s > %s" % (hmmfile, ids_file, hmm_chunk_file))

        self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
Example #6
0
    def extract_clusters_by_element_ids_from_file(self,
                                                  cluster_file,
                                                  element_file,
                                                  output_file,
                                                  mode="w"):
        """
        File-based wrapper around self.extract_clusters_by_element_ids.

        Clusters are read from cluster_file and element ids from
        element_file (lines starting with "#" are treated as comments in
        both); the extracted clusters are written to output_file.

        mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
              "a" - if elements from element_id_list are present in cluster extracts all elements
        """
        clusters = SynDict()
        clusters.read(cluster_file, split_values=True, comments_prefix="#")

        wanted_elements = IdList()
        wanted_elements.read(element_file, comments_prefix="#")

        selected = self.extract_clusters_by_element_ids(clusters,
                                                        wanted_elements,
                                                        mode=mode)
        selected.write(output_file, splited_values=True)
Example #7
0
    def extract_evidence_by_ids(evidence_file, id_file, output_evidence_file, mode="transcript"):
        """
        Copy entries with selected ids from evidence_file to output_evidence_file.

        Lines starting with "#" are always copied. For other lines the id is
        taken from the first tab-separated column if mode is "gene" and from
        the second column for any other mode (default "transcript"); the
        line is copied only if that id is listed in id_file.
        """
        wanted_ids = IdList()
        wanted_ids.read(id_file, comments_prefix="#")

        id_column = 1
        if mode == "gene":
            id_column = 0

        with open(evidence_file, "r") as source_fd, \
                open(output_evidence_file, "w") as sink_fd:
            for record in source_fd:
                # short-circuit keeps comment lines from being split
                if record[0] == "#" or record.split("\t")[id_column] in wanted_ids:
                    sink_fd.write(record)
Example #8
0
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        """
        Write proteins of selected families into per-family ".pep" files.

        Families are read from fam_file ("," separates family members) and
        proteins are indexed from pep_file through a temporary "tmp.idx"
        index, removed at the end. If families_id_file is set only listed
        families are processed, otherwise all of them; ids missing from
        fam_file are reported and skipped. When out_prefix is set it is used
        as the file name and a directory per family is always created.
        """
        from Routines import SequenceRoutines, FileRoutines
        selected_families = IdList()
        families = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        # an explicit prefix forces one directory per family
        use_family_dirs = out_prefix or create_dir_for_each_family
        if families_id_file:
            selected_families.read(families_id_file)
        families.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in (selected_families if families_id_file else families):
            if fam_id not in families:
                print("%s was not found" % fam_id)
                continue

            file_stem = out_prefix if out_prefix else fam_id
            if use_family_dirs:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, file_stem)
            else:
                out_file = "%s/%s.pep" % (out_dir, file_stem)

            records = SequenceRoutines.record_by_id_generator(
                protein_dict, families[fam_id], verbose=True)
            SeqIO.write(records, out_file, format=pep_format)

        os.remove("tmp.idx")
# Options controlling which features are kept and whether comments are copied
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True,
                    help="File with ids of genes to extract")
parser.add_argument("-w", "--write_comments", action="store_true", dest="write_comments",
                    help="Write comments to output")

args = parser.parse_args()

# Ids of the features that have to be extracted
feature_id_list = IdList()
feature_id_list.read(args.id_file)

with open(args.input, "r") as in_fd:
    with open(args.output, "w") as out_fd:
        for line in in_fd:
            if (line[0] == "#") and args.write_comments:
                out_fd.write(line)
                continue
            description_list = line.split("\t")[9].split(";")
            feature_id = description_list[0].split("=")[1]
            if feature_id not in feature_id_list:
                continue
            out_fd.write(line)
            while True:
                description_list = in_fd.next().split("\t")[9].split(";")
Example #10
0
                    dest="logs_dir",
                    default="logs",
                    type=check_path,
                    help="Directory with logs")
args = parser.parse_args()

# Both the output and the log directories are created up front
save_mkdir(args.output_dir)
save_mkdir(args.logs_dir)

# No specific data type was requested -> switch on args.all
if not (args.alignment or args.tree or args.hmm):
    args.all = True

# Family ids come either from standard input or from a file
read_from_stdin = args.input == "stdin"
in_fd = sys.stdin if read_from_stdin else open(args.input, "r")

family_ids = IdList()
family_ids.read(in_fd)

# stdin is left open, a regular file is closed as soon as it was read
if not read_from_stdin:
    in_fd.close()

# Lists for ids of families lacking alignment, tree or hmm
absent_alignment_list, absent_tree_list, absent_hmm_list = IdList(), IdList(), IdList()


def download_data(fam_id):
    print("Downloading %s family" % fam_id)
    ali_log_file = "/dev/null" if not args.store_logs else "%s%s_alignment.log" % (
        args.logs_dir, fam_id)
    tree_log_file = "/dev/null" if not args.store_logs else "%s%s_tree.log" % (
        args.logs_dir, fam_id)
Example #11
0
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'
import argparse

from CustomCollections.GeneralCollections import IdList
from Routines import SequenceRoutines, FileRoutines


# Command line interface: input genbank files, output file, optional id list
parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--input",
                    action="store",
                    dest="input",
                    required=True,
                    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of  genbank files/directories with transcript annotations")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    required=True,
                    help="Output file")
parser.add_argument("-d",
                    "--id_file",
                    action="store",
                    dest="id_file",
                    help="File with id of transcripts to deal with")

args = parser.parse_args()

# Without an id file no white list is passed (None)
id_list = None
if args.id_file:
    id_list = IdList()
    id_list.read(args.id_file)

SequenceRoutines.extract_introns_from_transcripts_from_genbank_files(
    args.input,
    args.output,
    transcript_id_white_list=id_list)
Example #12
0
    def get_taxonomy_from_id_file(self, taxa_file, output_file, email, input_type="latin"):
        """
        Read taxa from taxa_file (via IdList.read) and pass them to
        self.get_taxonomy together with output_file, email and input_type.
        """
        taxa = IdList()
        taxa.read(taxa_file)
        self.get_taxonomy(taxa, output_file, email, input_type=input_type)
import sys
import argparse

from CustomCollections.GeneralCollections import IdList

# Command line interface: file with families and destination for the ids
parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--fam_file",
                    action="store",
                    dest="fam_file",
                    required=True,
                    help="File with families")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="File to write ids")

args = parser.parse_args()

write_to_stdout = args.output == "stdout"
out_fd = sys.stdout if write_to_stdout else open(args.output, "w")

# Ids are taken from the second column, where they are separated by commas
id_list = IdList()
id_list.read(args.fam_file,
             close_after_if_file_object=True,
             column_number=1,
             id_in_column_separator=",")
# Write through the already opened handle: passing args.output here would
# create a file literally named "stdout" for the default value and would
# leave out_fd unused and unclosed. Standard output is left open.
# NOTE(review): assumes IdList.write accepts a file object, as the
# close_after_if_file_object argument suggests - confirm.
id_list.write(out_fd, close_after_if_file_object=not write_to_stdout)
                    default="stdout",
                    help="Prefix of output file")
args = parser.parse_args()

# Ids go either to standard output or to "<prefix>_reference_random_genes.ids"
if args.output_prefix == "stdout":
    out_fd = sys.stdout
else:
    out_fd = open("%s_reference_random_genes.ids" % args.output_prefix, "w")

# Families of the reference: tab between id and members, commas between members
reference_families = SynDict()
reference_families.read(args.reference_fam, separator="\t", split_values=True, values_separator=",")

# Family ids are taken from the first column of the input; a header is present
node_family_ids = IdList()
node_family_ids.read(args.input, header=True, column_number=0, column_separator="\t")

# One random member per family, "." for families absent from the reference
reference_random_genes = SynDict()
for family_id in node_family_ids:
    reference_random_genes[family_id] = (choice(reference_families[family_id])
                                         if family_id in reference_families else ".")

reference_random_genes.write("%s_reference_random_genes.t" % args.output_prefix)

for family_id in reference_random_genes:
Example #15
0
                    "--header",
                    action="store_true",
                    dest="header",
                    help="Header is present in id file. Default: False")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    required=True,
                    help="Format of the file with hits")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")

args = parser.parse_args()

write_to_stdout = args.output == "stdout"
out_fd = sys.stdout if write_to_stdout else open(args.output, "w")

# read() fills the list in place (it is used that way everywhere else), so
# its return value must not replace the list
id_list = IdList()
id_list.read(args.id_file, header=args.header)

try:
    # Pass the opened handle instead of the path: with the default value the
    # path would create a file literally named "stdout", and a real path
    # would be opened for writing a second time.
    # NOTE(review): assumes extract_hits_by_query_ids accepts a file object
    # as output, as its close_after_if_file_object argument suggests - confirm.
    HMMER3.extract_hits_by_query_ids(id_list,
                                     args.input,
                                     out_fd,
                                     fileformat=args.format,
                                     close_after_if_file_object=False)
finally:
    # the handle is owned by this script; standard output is left open
    if not write_to_stdout:
        out_fd.close()
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output_file",
                    help="Output file with extracted_annotations")
parser.add_argument("-d",
                    "--ids_file",
                    action="store",
                    dest="ids_file",
                    help="File with ids of annotations to extract")
parser.add_argument("-t",
                    "--annotation_types",
                    action="store",
                    dest="annotation_types",
                    default=["gene"],
                    type=lambda s: s.split(","),
                    help="Comma-separated list of annotation types to extract")

args = parser.parse_args()

# NOTE(review): annotation_ids is not passed to the generator below;
# presumably the generator reads it as a module-level name - confirm.
annotation_ids = IdList()
annotation_ids.read(args.ids_file, comments_prefix="#")

# "with" closes the output even if reading or writing of records fails
with open(args.output_file, "w") as out_fd:
    GFF.write(
        record_with_extracted_annotations_generator(args.input_gff,
                                                    args.annotation_types), out_fd)
Example #17
0
    def get_cds_for_proteins_from_id_file(self, protein_id_file, output_prefix):
        """
        Read protein ids from protein_id_file (via IdList.read) and pass
        them with output_prefix to self.get_cds_for_proteins.
        """
        protein_ids = IdList()
        protein_ids.read(protein_id_file)
        self.get_cds_for_proteins(protein_ids, output_prefix)