Example #1
    def extract_annotation_by_refence_id(list_of_target_gff, id_file,
                                         extracted_gff, filtered_out_gff):
        ids = IdList()
        ids.read(id_file)
        extracted_gff_fd = open(extracted_gff, "w")
        filtered_out_gff_fd = open(filtered_out_gff, "w")
        for filename in list_of_target_gff:
            with open(filename, "r") as in_fd:
                for line in in_fd:
                    tmp = line
                    if tmp == "# --- START OF GFF DUMP ---\n":
                        # skip comment lines until the first feature line
                        # (it carries the target name) appears
                        while tmp.startswith("#"):
                            tmp = next(in_fd, "")

                        # target name = second token of the second ';'-separated
                        # attribute in column 9 (e.g. "... ; sequence <name> ; ...")
                        target_name = tmp.split("\t")[8].split(
                            ";")[1].split()[1]
                        if target_name not in ids:
                            writing_fd = filtered_out_gff_fd
                        else:
                            writing_fd = extracted_gff_fd
                        writing_fd.write(tmp)
                        while True:
                            tmp = next(in_fd, "")
                            if tmp == "# --- END OF GFF DUMP ---\n":
                                break
                            writing_fd.write(tmp)
                    if tmp == "":
                        break
        extracted_gff_fd.close()
        filtered_out_gff_fd.close()
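The `# --- START OF GFF DUMP ---` / `# --- END OF GFF DUMP ---` markers match the block delimiters that exonerate writes when run with `--showtargetgff`, so this routine demultiplexes such dumps by reference id. A minimal, hypothetical call (file names are placeholders, and the routine is assumed to be callable as a plain function):

    # Hypothetical usage sketch: split exonerate GFF dumps into two files
    # depending on whether each hit's reference id is listed in reference.ids.
    extract_annotation_by_refence_id(
        ["sample1.exonerate.gff", "sample2.exonerate.gff"],
        "reference.ids",       # one id per line
        "extracted.gff",       # dump blocks whose id is in reference.ids
        "filtered_out.gff")    # all remaining dump blocks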
Example #2
    def extract_top_hits_from_target_gff(list_of_target_gff,
                                         top_hits_gff,
                                         secondary_hits_gff,
                                         id_white_list_file=None,
                                         max_hits_per_query=None):
        if id_white_list_file:
            white_ids = IdList()
            white_ids.read(id_white_list_file)
        top_hits_gff_fd = open(top_hits_gff, "w")
        secondary_hits_gff_fd = open(secondary_hits_gff, "w")
        targets_list = []
        hit_counter = 0
        gene_counter = 0
        for filename in list_of_target_gff:
            with open(filename, "r") as in_fd:
                for line in in_fd:
                    tmp = line
                    if tmp == "# --- START OF GFF DUMP ---\n":
                        # skip comment lines until the first feature line
                        # (it carries the target name) appears
                        while tmp.startswith("#"):
                            tmp = next(in_fd, "")

                        target_name = tmp.split("\t")[8].split(
                            ";")[1].split()[1]
                        if id_white_list_file:
                            if target_name not in white_ids:
                                continue
                        if target_name not in targets_list:
                            writing_fd = top_hits_gff_fd
                            targets_list.append(target_name)
                            gene_counter += 1
                            hit_counter = 0
                        else:
                            writing_fd = secondary_hits_gff_fd
                        hit_counter += 1
                        # replace the dummy gene_id with gene/hit counters
                        tmp = tmp.replace(
                            "gene_id 0",
                            "gene_id g%i_h%i" % (gene_counter, hit_counter))
                        if (max_hits_per_query is None) or (
                                hit_counter <= max_hits_per_query):
                            writing_fd.write(tmp)

                        while True:
                            tmp = next(in_fd, "")
                            # print("cccc")

                            if tmp == "# --- END OF GFF DUMP ---\n":
                                break
                            if max_hits_per_query:
                                if hit_counter > max_hits_per_query:
                                    #print "aaaaa"
                                    continue
                            writing_fd.write(tmp)
                    if tmp == "":
                        break
        top_hits_gff_fd.close()
        secondary_hits_gff_fd.close()
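A hypothetical call mirroring the signature above (paths are placeholders). With `max_hits_per_query=3`, the first hit of each query goes to the top-hits file and at most two further hits to the secondary file:

    # Hypothetical usage sketch.
    extract_top_hits_from_target_gff(
        ["run1.exonerate.gff", "run2.exonerate.gff"],
        "top_hits.gff",
        "secondary_hits.gff",
        id_white_list_file=None,   # or a file with query ids to keep
        max_hits_per_query=3)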
Example #3
 def extract_ids_from_file(input_file,
                           output_file=None,
                           header=False,
                           column_separator="\t",
                           comments_prefix="#",
                           column_number=None):
     id_list = IdList()
     id_list.read(input_file,
                  column_separator=column_separator,
                  comments_prefix=comments_prefix,
                  column_number=column_number,
                  header=header)
     if output_file:
         id_list.write(output_file, header=header)
     return id_list
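A minimal usage sketch; the file name and column layout are made up, but `column_number` is 0-based, as in the other examples on this page:

    # Read ids from the first column of a tab-separated table with a header,
    # skipping '#' comment lines, and also save them to extracted.ids.
    ids = extract_ids_from_file("annotation_table.tsv",
                                output_file="extracted.ids",
                                header=True,
                                column_number=0)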
Example #4
    def extract_evidence_by_ids(evidence_file,
                                id_file,
                                output_evidence_file,
                                mode="transcript"):
        # possible modes: transcript, gene
        ids = IdList()
        ids.read(id_file, comments_prefix="#")

        column_id = 0 if mode == "gene" else 1

        with open(evidence_file, "r") as ev_fd:
            with open(output_evidence_file, "w") as out_fd:
                for line in ev_fd:
                    if line[0] == "#":
                        out_fd.write(line)
                        continue

                    entry_id = line.split("\t")[column_id]
                    if entry_id in ids:
                        out_fd.write(line)
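The `column_id` switch implies a tab-separated evidence file with the gene id in column 0 and the transcript id in column 1. A sketch with made-up contents and a matching call:

    # evidence.tsv (hypothetical layout):
    #   gene_001<TAB>transcript_001.1<TAB>...
    #   gene_002<TAB>transcript_002.1<TAB>...
    extract_evidence_by_ids("evidence.tsv",
                            "transcripts.ids",
                            "evidence.filtered.tsv",
                            mode="transcript")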
Example #5
    def extract_proteins_from_selected_families(
            families_id_file,
            fam_file,
            pep_file,
            output_dir="./",
            pep_format="fasta",
            out_prefix=None,
            create_dir_for_each_family=False):
        from RouToolPa.Routines import SequenceRoutines

        fam_id_list = IdList()
        fam_dict = SynDict()
        FileRoutines.safe_mkdir(output_dir)
        out_dir = FileRoutines.check_path(output_dir)
        create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
        if families_id_file:
            fam_id_list.read(families_id_file)
        fam_dict.read(fam_file, split_values=True, values_separator=",")
        protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id in fam_dict:
                if create_directory_for_each_family:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    FileRoutines.safe_mkdir(fam_dir)
                    out_file = "%s%s.pep" % (fam_dir, out_prefix
                                             if out_prefix else fam_id)
                else:
                    out_file = "%s/%s.pep" % (out_dir, out_prefix
                                              if out_prefix else fam_id)

                SeqIO.write(SequenceRoutines.record_by_id_generator(
                    protein_dict, fam_dict[fam_id], verbose=True),
                            out_file,
                            format=pep_format)
            else:
                print("%s was not found" % fam_id)

        os.remove("tmp.idx")
Example #6
                    default="stdout",
                    help="Prefix of output file")
args = parser.parse_args()

out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam,
                        separator="\t",
                        split_values=True,
                        values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input,
                     header=True,
                     column_number=0,
                     column_separator="\t")

reference_random_genes = SynDict()

for family_id in node_family_ids:
    if family_id not in reference_families:
        reference_random_genes[family_id] = "."
    else:
        reference_random_genes[family_id] = choice(
            reference_families[family_id])

reference_random_genes.write("%s_reference_random_genes.t" %
                             args.output_prefix)

for family_id in reference_random_genes:
Example #7
                    "--header",
                    action="store_true",
                    dest="header",
                    help="Header is present in id file. Default: False")
parser.add_argument("-f",
                    "--format",
                    action="store",
                    dest="format",
                    required=True,
                    help="Format of the file with hits")
parser.add_argument("-o",
                    "--output_file",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="Output file")

args = parser.parse_args()

id_list = IdList()
id_list = id_list.read(args.id_file, header=args.header)

HMMER3.extract_hits_by_query_ids(id_list,
                                 args.input,
                                 args.output,
                                 fileformat=args.format,
                                 close_after_if_file_object=True)

Example #8
parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Input file with families")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", default=None,
                    help="File with ids of families. If absent genes from all families will be extracted(default).")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file")
parser.add_argument("-s", "--separate_families", action="store_true", dest="separate_families",
                    help="Separate families to different files. If set option -o/--output_file is ignored")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
families = SynDict()
families.read(args.input, separator="\t", split_values=True, values_separator=",")
if args.id_file:
    id_list = IdList()
    id_list = id_list.read(args.id_file)


if args.separate_families:
    for fam_id in id_list if args.id_file else families:
        with open("%s.ids" % fam_id, "w") as fam_fd:
            for gene_id in families[fam_id]:
                fam_fd.write(gene_id + "\n")
else:
    for fam_id in id_list if args.id_file else families:
        for gene_id in families[fam_id]:
            out_fd.write(gene_id + "\n")
if args.output != "stdout":
    out_fd.close()
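The `SynDict.read(..., separator="\t", split_values=True, values_separator=",")` call implies one family per line: a family id, a tab, then comma-separated gene ids. A self-contained sketch with made-up ids (SynDict is assumed to live in RouToolPa.Collections.General, next to the IdList imported in Example #14):

    from RouToolPa.Collections.General import SynDict

    # Build a tiny families file and read it back.
    with open("families.fam", "w") as example_fd:
        example_fd.write("FAM0001\tgeneA,geneB,geneC\n")
        example_fd.write("FAM0002\tgeneD\n")

    families = SynDict()
    families.read("families.fam", separator="\t",
                  split_values=True, values_separator=",")
    # families["FAM0001"] should now hold ["geneA", "geneB", "geneC"]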
Example #9
parser.add_argument("-s", "--store_logs", action="store_true", dest="store_logs", default=False,
                    help="Store download logs in directory set by -g/--logs_dir option")
parser.add_argument("-g", "--logs_dir", action="store", dest="logs_dir", default="logs", type=FileRoutines.check_path,
                    help="Directory with logs")
args = parser.parse_args()

FileRoutines.safe_mkdir(args.output_dir)
FileRoutines.safe_mkdir(args.logs_dir)

if (not args.alignment) and (not args.tree) and (not args.hmm):
    args.all = True

in_fd = sys.stdin if args.input == "stdin" else open(args.input, "r")

family_ids = IdList()
family_ids.read(in_fd)

if args.input != "stdin":
    in_fd.close()

absent_alignment_list = IdList()
absent_tree_list = IdList()
absent_hmm_list = IdList()


def download_data(fam_id):
    print("Downloading %s family" % fam_id)
    ali_log_file = "/dev/null" if not args.store_logs else "%s%s_alignment.log" % (args.logs_dir, fam_id)
    tree_log_file = "/dev/null" if not args.store_logs else "%s%s_tree.log" % (args.logs_dir, fam_id)
    hmm_log_file = "/dev/null" if not args.store_logs else "%s%s_hmm.log" % (args.logs_dir, fam_id)
Example #10
                    action="store",
                    dest="filtered_family_dir",
                    default="filtered_fam",
                    type=FileRoutines.check_path,
                    help="Directory to write filtered_families")
args = parser.parse_args()

FileRoutines.safe_mkdir(args.filtered_family_dir)
species_list = sorted(args.species_set)
if args.white_list_file and args.black_list_file:
    raise ValueError("Black list and white list cannot be set simultaneously")

black_list = IdList()
white_list = IdList()
if args.black_list_file:
    black_list.read(args.black_list_file)
if args.white_list_file:
    white_list.read(args.white_list_file)
out_fd = open(args.cafe_file, "w")
filtered_fd = open("%sfiltered_families.cafe" % args.filtered_family_dir, "w")
out_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list)))
filtered_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list)))
species_filtered_fd_list = OrderedDict()
fam_count_dict = TwoLvlDict()
species_family_dict = TwoLvlDict()
for species in args.species_set:
    species_family_dict[species] = SynDict()
    species_family_dict[species].read(
        "%s%s%s" % (FileRoutines.check_path(args.input), species, args.suffix),
        split_values=True,
        values_separator=",",
Example #11
    def split_hmm(self,
                  hmmfile,
                  output_dir,
                  num_of_recs_per_file,
                  num_of_files=None,
                  output_prefix=None,
                  threads=4):

        try:
            os.mkdir(output_dir)
        except OSError:
            pass

        # pull profile names from the NAME lines of the hmm file (cat|grep|awk pipeline)
        id_fd = CGAS.cgas(hmmfile,
                          grep_pattern="NAME",
                          whole_word_match=True,
                          awk_code="{print $2}",
                          capture_output=True)

        split_index = 1
        ids_written = 0
        ids_list = IdList()
        ids_list.read(id_fd, close_after_if_file_object=True)
        number_of_ids = len(ids_list)
        out_prefix = self.split_filename(
            hmmfile)[1] if output_prefix is None else output_prefix

        # records per chunk: derived from num_of_files if it is set,
        # otherwise num_of_recs_per_file is used directly
        num_of_ids = int(
            number_of_ids /
            num_of_files) + 1 if num_of_files else num_of_recs_per_file

        common_options = " -f"
        common_options += " %s" % hmmfile
        options_list = []
        while (ids_written + num_of_ids) <= number_of_ids:
            tmp_id_list = IdList(ids_list[ids_written:ids_written +
                                          num_of_ids])
            tmp_id_list.write("%s/%s_%i.ids" %
                              (output_dir, out_prefix, split_index))

            options = common_options
            options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            options += " > %s" % ("%s/%s_%i.hmm" %
                                  (output_dir, out_prefix, split_index))
            options_list.append(options)

            split_index += 1
            ids_written += num_of_ids

        if ids_written != number_of_ids:
            tmp_id_list = IdList(ids_list[ids_written:])
            tmp_id_list.write("%s/%s_%i.ids" %
                              (output_dir, out_prefix, split_index))

            options = common_options
            options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
            options += " > %s" % ("%s/%s_%i.hmm" %
                                  (output_dir, out_prefix, split_index))
            options_list.append(options)

            split_index += 1
        self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
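A hypothetical call. `split_hmm` is a method (note `self`), so `hmmer` below stands for an instance of the class that defines it, and `hmmfetch` must be on PATH, because each chunk is produced by running `hmmfetch -f <hmmfile> <ids file>` in parallel:

    # Split a profile database into chunks of 500 profiles,
    # running up to 8 hmmfetch processes at once. Paths are placeholders.
    hmmer.split_hmm("profiles.hmm",
                    "hmm_chunks",
                    500,               # records per chunk; ignored if num_of_files is set
                    num_of_files=None,
                    threads=8)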
Example #12
parser.add_argument("-f",
                    "--link_file",
                    action="store",
                    dest="link_file",
                    help="File with links")
parser.add_argument("-t",
                    "--threads",
                    action="store",
                    dest="threads",
                    type=int,
                    default=1,
                    help="Number of simultaneous downloads")

args = parser.parse_args()

loader = IdList()
link_list = loader.read(args.link_file)

Wget.threads = args.threads
Wget.parallel_download(link_list)
"""
options_list = []
for entry_id in id_list:
    ftp_path = NCBIRoutines.get_sra_ftp_path_from_id(entry_id)
    options_list.append("--no-host-directories -rc -t 500 %s" % ftp_path)


tool = Tool(cmd="wget", max_threads=args.threads)

tool.parallel_execute(options_list)
"""
Example #13
            #print ("%s\t%s" % (record.id, feature.id))

            if (feature.id in annotation_ids) and (feature.type in white_list_of_annotation_types):
                new_record.features.append(feature)
        if len(new_record.features) > 0:
            yield new_record

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Gff file with annotations to extract")
parser.add_argument("-o", "--output_file", action="store", dest="output_file",
                    help="Output file with extracted_annotations")
parser.add_argument("-d", "--ids_file", action="store", dest="ids_file",
                    help="File with ids of annotations to extract")
parser.add_argument("-t", "--annotation_types", action="store", dest="annotation_types", default=["gene"],
                    type=lambda s: s.split(","),
                    help="Comma-separated list of annotation types to extract")

args = parser.parse_args()

annotation_ids = IdList()
annotation_ids.read(args.ids_file, comments_prefix="#")
out_fd = open(args.output_file, "w")

GFF.write(record_with_extracted_annotations_generator(args.input_gff, args.annotation_types), out_fd)

out_fd.close()
Example #14
__author__ = 'Sergei F. Kliver'
import sys
import argparse
from RouToolPa.Collections.General import IdList

parser = argparse.ArgumentParser()

parser.add_argument("-i",
                    "--fam_file",
                    action="store",
                    dest="fam_file",
                    required=True,
                    help="File with families")
parser.add_argument("-o",
                    "--output",
                    action="store",
                    dest="output",
                    default="stdout",
                    help="File to write ids")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
id_list.read(args.fam_file,
             close_after_if_file_object=True,
             column_number=1,
             id_in_column_separator=",")
id_list.write(out_fd, close_after_if_file_object=True)
Example #15
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input .gff file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output .gff file")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True,
                    help="File with ids of genes to extract")
parser.add_argument("-w", "--write_comments", action="store_true", dest="write_comments",
                    help="Write comments to output")

args = parser.parse_args()

feature_id_list = IdList()
feature_id_list.read(args.id_file)

with open(args.input, "r") as in_fd:
    with open(args.output, "w") as out_fd:
        for line in in_fd:
            if line[0] == "#":
                if args.write_comments:
                    out_fd.write(line)
                continue
            # GFF attributes live in column 9 (index 8)
            description_list = line.split("\t")[8].split(";")
            feature_id = description_list[0].split("=")[1]
            if feature_id not in feature_id_list:
                continue
            out_fd.write(line)
            while True:
                description_list = next(in_fd).split("\t")[8].split(";")
Example #16
                    help="Number of simultaneous downloads")
parser.add_argument("-c",
                    "--connections",
                    action="store",
                    dest="connections",
                    type=int,
                    default=8,
                    help="Number of connections for each download")

args = parser.parse_args()

if (not args.ids) and (not args.id_file):
    raise ValueError("Neither ids nor an id file was set")

loader = IdList()
id_list = loader.read(args.id_file) if args.id_file else args.ids

Axel.threads = args.threads
Axel.parallel_download_from_sra(id_list, args.connections)
"""
options_list = []
for entry_id in id_list:
    ftp_path = NCBIRoutines.get_sra_ftp_path_from_id(entry_id)
    options_list.append("-n %i %s" % (args.connections, ftp_path))

tool = Tool(cmd="axel", max_threads=args.threads)

tool.parallel_execute(options_list)

for filename in os.listdir(os.getcwd()):
    if ".sra" not in filename:
Example #17
    def extract_sequences_from_selected_clusters(
        self,
        clusters_id_file,
        cluster_file,
        seq_file,
        output_dir="./",
        out_prefix=None,
        create_dir_for_each_cluster=False,
        skip_cluster_if_no_sequence_for_element=True,
    ):
        # TODO: test before usage; this was significantly changed without testing
        cluster_id_list = IdList()
        cluster_dict = SynDict()
        self.safe_mkdir(output_dir)
        out_dir = self.check_path(output_dir)
        create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
        if clusters_id_file:
            cluster_id_list.read(clusters_id_file)
        cluster_dict.read(cluster_file,
                          split_values=True,
                          values_separator=",")

        protein_collection = CollectionSequence(in_file=seq_file,
                                                parsing_mode="parse")

        number_of_skipped_clusters = 0
        for fam_id in cluster_id_list if clusters_id_file else cluster_dict:

            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(
                    cluster_dict[fam_id], protein_collection.records)
                if absent_elements:
                    print("Skipping cluster %s due to absent element(%s)" %
                          (fam_id, ",".join(absent_elements)))
                    number_of_skipped_clusters += 1
                    continue

            if fam_id in cluster_dict:
                if create_directory_for_each_cluster:
                    fam_dir = "%s%s/" % (out_dir, fam_id)
                    self.safe_mkdir(fam_dir)
                    out_file = "%s%s.fasta" % (fam_dir, out_prefix
                                               if out_prefix else fam_id)
                else:
                    out_file = "%s%s.fasta" % (out_dir, out_prefix
                                               if out_prefix else fam_id)
                protein_collection.write(out_file,
                                         whitelist=cluster_dict[fam_id])

        print("%i of %i clusters were skipped due to absent elements" %
              (number_of_skipped_clusters, len(cluster_dict)))

        return number_of_skipped_clusters
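Given the TODO above, any call should be treated as untested. A sketch of the intended usage with placeholder paths (`routines` stands for an instance of the class that defines the method):

    # clusters.tsv sketch:  CL0001<TAB>seq1,seq2,seq3
    skipped = routines.extract_sequences_from_selected_clusters(
        "selected_clusters.ids",   # pass None to take every cluster from the file
        "clusters.tsv",
        "sequences.fasta",
        output_dir="cluster_seqs",
        skip_cluster_if_no_sequence_for_element=True)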