def extract_annotation_by_refence_id(list_of_target_gff, id_file, extracted_gff, filtered_out_gff):
    """
    Split exonerate-style GFF dumps into two files by reference (target) id.

    Scans each file in list_of_target_gff for blocks delimited by
    '# --- START OF GFF DUMP ---' / '# --- END OF GFF DUMP ---'.  The target
    name is taken from the 9th column of the first non-comment line of the
    block (second ';'-separated attribute, second whitespace token).  Blocks
    whose target is listed in id_file go to extracted_gff, the rest go to
    filtered_out_gff.

    NOTE: the function name keeps the historical 'refence' spelling because
    external callers depend on it.

    :param list_of_target_gff: iterable of paths to input GFF files
    :param id_file: path to file with reference ids (one per line)
    :param extracted_gff: path for blocks whose target id is in id_file
    :param filtered_out_gff: path for all remaining blocks
    """
    ids = IdList()
    ids.read(id_file)
    # 'with' guarantees the output handles are closed even on error
    # (the original leaked them if an exception was raised mid-scan).
    with open(extracted_gff, "w") as extracted_gff_fd, \
            open(filtered_out_gff, "w") as filtered_out_gff_fd:
        for filename in list_of_target_gff:
            with open(filename, "r") as in_fd:
                for line in in_fd:
                    tmp = line
                    if tmp == "# --- START OF GFF DUMP ---\n":
                        # Skip remaining comment lines until the first feature
                        # line (which carries the target name) appears.
                        # startswith() is safe on "" — the original tmp[0]
                        # raised IndexError if EOF was reached here.
                        while tmp.startswith("#"):
                            tmp = next(in_fd, "")
                        if not tmp:
                            break  # truncated dump at end of file
                        target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                        if target_name not in ids:
                            writing_fd = filtered_out_gff_fd
                        else:
                            writing_fd = extracted_gff_fd
                        writing_fd.write(tmp)
                        # Copy the rest of the dump block verbatim.
                        while True:
                            tmp = next(in_fd, "")
                            if tmp == "# --- END OF GFF DUMP ---\n":
                                break
                            writing_fd.write(tmp)
                    if tmp == "":
                        break  # EOF reached via next() inside the block
def extract_top_hits_from_target_gff(list_of_target_gff, top_hits_gff, secondary_hits_gff, id_white_list_file=None, max_hits_per_query=None):
    """
    Separate exonerate-style GFF dump blocks into top hits and secondary hits.

    The first block seen for a given target goes to top_hits_gff, all later
    blocks for that target go to secondary_hits_gff.  'gene_id 0' in the first
    feature line is rewritten to 'gene_id g<gene>_h<hit>' so ids stay unique.

    :param list_of_target_gff: iterable of paths to input GFF files
    :param top_hits_gff: output path for first hit per target
    :param secondary_hits_gff: output path for subsequent hits
    :param id_white_list_file: optional file with target ids to keep;
                               blocks for other targets are dropped entirely
    :param max_hits_per_query: optional cap on hits written per target;
                               None (default) means unlimited
    """
    if id_white_list_file:
        white_ids = IdList()
        white_ids.read(id_white_list_file)
    top_hits_gff_fd = open(top_hits_gff, "w")
    secondary_hits_gff_fd = open(secondary_hits_gff, "w")
    targets_list = []
    hit_counter = 0
    gene_counter = 0
    try:
        for filename in list_of_target_gff:
            with open(filename, "r") as in_fd:
                for line in in_fd:
                    tmp = line
                    if tmp == "# --- START OF GFF DUMP ---\n":
                        # Skip comment lines until the first feature line.
                        # startswith() tolerates "" from next() at EOF, where
                        # the original tmp[0] raised IndexError.
                        while tmp.startswith("#"):
                            tmp = next(in_fd, "")
                        if not tmp:
                            break  # truncated dump at end of file
                        target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                        if id_white_list_file:
                            if target_name not in white_ids:
                                continue
                        if target_name not in targets_list:
                            # First time we see this target: it is the top hit.
                            writing_fd = top_hits_gff_fd
                            targets_list.append(target_name)
                            gene_counter += 1
                            hit_counter = 0
                        else:
                            writing_fd = secondary_hits_gff_fd
                        hit_counter += 1
                        tmp = tmp.replace(
                            "gene_id 0",
                            "gene_id g%i_h%i" % (gene_counter, hit_counter))
                        # BUGFIX: the original compared hit_counter <= None when
                        # max_hits_per_query was unset — a TypeError on Python 3
                        # (and it silently dropped this line on Python 2).
                        if max_hits_per_query is None or hit_counter <= max_hits_per_query:
                            writing_fd.write(tmp)
                        # Copy the rest of the dump block, honouring the cap.
                        while True:
                            tmp = next(in_fd, "")
                            if tmp == "# --- END OF GFF DUMP ---\n":
                                break
                            if max_hits_per_query:
                                if hit_counter > max_hits_per_query:
                                    continue
                            writing_fd.write(tmp)
                    if tmp == "":
                        break  # EOF reached via next() inside the block
    finally:
        # Close outputs even if parsing fails part-way through.
        top_hits_gff_fd.close()
        secondary_hits_gff_fd.close()
def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t", comments_prefix="#", column_number=None):
    """
    Load identifiers from one column of a delimited file.

    :param input_file: path (or handle) of the file to read ids from
    :param output_file: optional path to also write the ids to
    :param header: whether the input has a header line
    :param column_separator: field delimiter, tab by default
    :param comments_prefix: lines starting with this prefix are skipped
    :param column_number: zero-based column to take ids from (None = first)
    :return: IdList with the extracted identifiers
    """
    extracted_ids = IdList()
    extracted_ids.read(input_file,
                       column_separator=column_separator,
                       comments_prefix=comments_prefix,
                       column_number=column_number,
                       header=header)
    if output_file:
        extracted_ids.write(output_file, header=header)
    return extracted_ids
def extract_evidence_by_ids(evidence_file, id_file, output_evidence_file, mode="transcript"):
    """
    Keep only evidence rows whose id is listed in id_file.

    Comment lines (starting with '#') are always copied through.  The id is
    taken from column 0 in 'gene' mode and column 1 in 'transcript' mode
    (tab-separated).

    :param evidence_file: path to the input evidence table
    :param id_file: path to the file with ids to keep
    :param output_evidence_file: path for the filtered table
    :param mode: 'gene' or 'transcript' (default)
    """
    wanted_ids = IdList()
    wanted_ids.read(id_file, comments_prefix="#")
    id_column = 0 if mode == "gene" else 1
    with open(evidence_file, "r") as evidence_fd, \
            open(output_evidence_file, "w") as result_fd:
        for row in evidence_fd:
            if row[0] == "#":
                # Preserve comment/header lines verbatim.
                result_fd.write(row)
            elif row.split("\t")[id_column] in wanted_ids:
                result_fd.write(row)
def extract_proteins_from_selected_families(
        families_id_file, fam_file, pep_file, output_dir="./",
        pep_format="fasta", out_prefix=None, create_dir_for_each_family=False):
    """
    Write the protein sequences of selected families, one file per family.

    :param families_id_file: optional file with family ids to extract; if
                             falsy, every family in fam_file is processed
    :param fam_file: family file mapping family id -> comma-separated gene ids
    :param pep_file: sequence file with all proteins (indexed via SeqIO)
    :param output_dir: directory to write per-family files into
    :param pep_format: sequence format for reading and writing (Biopython name)
    :param out_prefix: optional basename for output files; setting it forces
                       one subdirectory per family (see below)
    :param create_dir_for_each_family: create <output_dir>/<fam_id>/ per family
    """
    from RouToolPa.Routines import SequenceRoutines
    fam_id_list = IdList()
    fam_dict = SynDict()
    #print(pep_file)
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # A shared out_prefix would make all families collide on one filename,
    # so a per-family directory is forced whenever out_prefix is set.
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    # NOTE(review): "tmp.idx" is created in the current working directory and
    # removed at the end — concurrent runs in the same cwd would clash.
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)
    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(
                protein_dict, fam_dict[fam_id], verbose=True),
                out_file, format=pep_format)
        else:
            # Requested family id absent from fam_file — report and continue.
            print("%s was not found" % fam_id)
    os.remove("tmp.idx")
default="stdout", help="Prefix of output file") args = parser.parse_args() out_fd = sys.stdout if args.output_prefix == "stdout" else open( "%s_reference_random_genes.ids" % args.output_prefix, "w") reference_families = SynDict() reference_families.read(args.reference_fam, separator="\t", split_values=True, values_separator=",") node_family_ids = IdList() node_family_ids.read(args.input, header=True, column_number=0, column_separator="\t") reference_random_genes = SynDict() for family_id in node_family_ids: if family_id not in reference_families: reference_random_genes[family_id] = "." else: reference_random_genes[family_id] = choice( reference_families[family_id]) reference_random_genes.write("%s_reference_random_genes.t" % args.output_prefix) for family_id in reference_random_genes:
"--header", action="store_true", dest="header", help="Header is present in id file. Default: False") parser.add_argument("-f", "--format", action="store", dest="format", required=True, help="Format of the file with hits") parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") id_list = IdList() id_list = id_list.read(args.id_file, header=args.header) HMMER3.extract_hits_by_query_ids(id_list, args.input, args.output, fileformat=args.format, close_after_if_file_object=True) out_fd.close()
parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Input file with families")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", default=None,
                    help="File with ids of families. If absent genes from all families will be extracted(default).")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file")
parser.add_argument("-s", "--separate_families", action="store_true", dest="separate_families",
                    help="Separate families to different files. If set option -o/--output_file is ignored")
args = parser.parse_args()

# Open the destination exactly once: stdout sentinel or a real file.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

families = SynDict()
families.read(args.input, separator="\t", split_values=True, values_separator=",")

if args.id_file:
    id_list = IdList()
    id_list = id_list.read(args.id_file)

if args.separate_families:
    # One <fam_id>.ids file per family; -o/--output_file is ignored.
    for fam_id in id_list if args.id_file else families:
        with open("%s.ids" % fam_id, "w") as fam_fd:
            for gene_id in families[fam_id]:
                fam_fd.write(gene_id + "\n")
else:
    # BUGFIX: the original re-opened args.output here with open(...), which
    # leaked the out_fd opened above and, when -o was left at "stdout",
    # created a literal file named "stdout" instead of writing to stdout.
    for fam_id in id_list if args.id_file else families:
        for gene_id in families[fam_id]:
            out_fd.write(gene_id + "\n")

if args.output != "stdout":
    out_fd.close()
parser.add_argument("-s", "--store_logs", action="store_true", dest="store_logs", default=False, help="Store download logs in directory set by -g/--logs_dir option") parser.add_argument("-g", "--logs_dir", action="store", dest="logs_dir", default="logs", type=FileRoutines.check_path, help="Directory with logs") args = parser.parse_args() FileRoutines.safe_mkdir(args.output_dir) FileRoutines.safe_mkdir(args.logs_dir) if (not args.alignment) and (not args.tree) and (not args.hmm): args.all = True in_fd = sys.stdin if args.input == "stdin" else open(args.input, "r") family_ids = IdList() family_ids.read(in_fd) if args.input != "stdin": in_fd.close() absent_alignment_list = IdList() absent_tree_list = IdList() absent_hmm_list = IdList() def download_data(fam_id): print("Downloading %s family" % fam_id) ali_log_file = "/dev/null" if not args.store_logs else "%s%s_alignment.log" % (args.logs_dir, fam_id) tree_log_file = "/dev/null" if not args.store_logs else "%s%s_tree.log" % (args.logs_dir, fam_id) hmm_log_file = "/dev/null" if not args.store_logs else "%s%s_hmm.log" % (args.logs_dir, fam_id)
action="store", dest="filtered_family_dir", default="filtered_fam", type=FileRoutines.check_path, help="Directory to write filtered_families") args = parser.parse_args() FileRoutines.safe_mkdir(args.filtered_family_dir) species_list = sorted(args.species_set) if args.white_list_file and args.black_list_file: raise ValueError("Black list and white list cant be set simultaneously") black_list = IdList() white_list = IdList() if args.black_list_file: black_list.read(args.black_list_file) if args.white_list_file: white_list.read(args.white_list_file) out_fd = open(args.cafe_file, "w") filtered_fd = open("%sfiltered_families.cafe" % args.filtered_family_dir, "w") out_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list))) filtered_fd.write("FAMILYDESC\tFAMILY\t%s\n" % ("\t".join(species_list))) species_filtered_fd_list = OrderedDict() fam_count_dict = TwoLvlDict() species_family_dict = TwoLvlDict() for species in args.species_set: species_family_dict[species] = SynDict() species_family_dict[species].read( "%s%s%s" % (FileRoutines.check_path(args.input), species, args.suffix), split_values=True, values_separator=",",
def split_hmm(self, hmmfile, output_dir, num_of_recs_per_file, num_of_files=None, output_prefix=None, threads=4):
    """
    Split an HMM database into chunks by running hmmfetch in parallel.

    Extracts all model names from hmmfile (grep NAME | awk '{print $2}'),
    partitions them into id files of roughly equal size, and launches one
    'hmmfetch -f <hmmfile> <ids> > <chunk>.hmm' command per partition.

    :param hmmfile: path to the source HMM database
    :param output_dir: directory for the .ids and .hmm chunk files (created
                       if missing)
    :param num_of_recs_per_file: models per chunk (used when num_of_files
                                 is None)
    :param num_of_files: if set, overrides num_of_recs_per_file and splits
                         into this many chunks instead
    :param output_prefix: basename for chunk files; defaults to the basename
                          of hmmfile
    :param threads: number of hmmfetch processes to run simultaneously
    """
    try:
        os.mkdir(output_dir)
    except OSError:
        # Directory already exists (or is not creatable) — proceed and let
        # downstream writes fail loudly if it is genuinely unusable.
        pass
    # Pull the NAME lines out of the HMM file; each model contributes one id.
    id_fd = CGAS.cgas(hmmfile, grep_pattern="NAME", whole_word_match=True,
                      awk_code="{print $2}", capture_output=True)
    split_index = 1
    ids_written = 0
    ids_list = IdList()
    #ids_list = read_ids(id_fd, close_after_if_file_object=False)
    ids_list.read(id_fd, close_after_if_file_object=True)
    number_of_ids = len(ids_list)
    out_prefix = self.split_filename(
        hmmfile)[1] if output_prefix is None else output_prefix
    # +1 rounds up so num_of_files chunks always cover every id.
    num_of_ids = int(
        number_of_ids / num_of_files) + 1 if num_of_files else num_of_recs_per_file
    common_options = " -f"
    common_options += " %s" % hmmfile
    options_list = []
    # Emit one hmmfetch invocation per full chunk of num_of_ids ids.
    while (ids_written + num_of_ids) <= number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:ids_written + num_of_ids])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))
        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
        options_list.append(options)
        split_index += 1
        ids_written += num_of_ids
    # Remainder chunk with the ids that did not fill a whole partition.
    if ids_written != number_of_ids:
        tmp_id_list = IdList(ids_list[ids_written:])
        tmp_id_list.write("%s/%s_%i.ids" % (output_dir, out_prefix, split_index))
        options = common_options
        options += " %s/%s_%i.ids" % (output_dir, out_prefix, split_index)
        options += " > %s" % ("%s/%s_%i.hmm" % (output_dir, out_prefix, split_index))
        options_list.append(options)
        split_index += 1
    #print options_list
    self.parallel_execute(options_list, cmd="hmmfetch", threads=threads)
parser.add_argument("-f", "--link_file", action="store", dest="link_file",
                    help="File with links")
parser.add_argument("-t", "--threads", action="store", dest="threads", type=int, default=1,
                    help="Number of simultaneous downloads")
args = parser.parse_args()

# Load the download links (one per line) from the link file.
# NOTE(review): relies on IdList.read returning the populated list — verify
# against the IdList implementation.
loader = IdList()
link_list = loader.read(args.link_file)

# Hand the whole link list to the parallel wget wrapper.
Wget.threads = args.threads
Wget.parallel_download(link_list)

# Dead code kept for reference: the previous per-id SRA download flow.
"""
options_list = []
for entry_id in id_list:
    ftp_path = NCBIRoutines.get_sra_ftp_path_from_id(entry_id)
    options_list.append("--no-host-directories -rc -t 500 %s" % ftp_path)
tool = Tool(cmd="wget", max_threads=args.threads)
tool.parallel_execute(options_list)
"""
#print ("%s\t%s" % (record.id, feature.id)) if (feature.id in annotation_ids) and (feature.type in white_list_of_annotation_types): new_record.features.append(feature) if len(new_record.features) > 0: yield new_record parser = argparse.ArgumentParser() parser.add_argument("-i", "--input_gff", action="store", dest="input_gff", help="Gff file with annotations to extract") parser.add_argument("-o", "--output_file", action="store", dest="output_file", help="Output file with extracted_annotations") parser.add_argument("-d", "--ids_file", action="store", dest="ids_file", help="File with ids of annotations to extract") parser.add_argument("-t", "--annotation_types", action="store", dest="annotation_types", default=["gene"], type=lambda s: s.split(","), help="Comma-separated list of annotation types to extract") args = parser.parse_args() annotation_ids = IdList() annotation_ids.read(args.ids_file, comments_prefix="#") #print args.annotation_types out_fd = open(args.output_file, "w") GFF.write(record_with_extracted_annotations_generator(args.input_gff, args.annotation_types), out_fd) out_fd.close()
#!/usr/bin/env python
"""Extract gene ids (column 1, comma-separated) from a family file."""
__author__ = 'Sergei F. Kliver'
import sys
import argparse
from RouToolPa.Collections.General import IdList

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--fam_file", action="store", dest="fam_file", required=True,
                    help="File with families")
parser.add_argument("-o", "--output", action="store", dest="output", default="stdout",
                    help="File to write ids")
args = parser.parse_args()

# Honour the "stdout" sentinel: write to sys.stdout, otherwise open the file.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

id_list = IdList()
id_list.read(args.fam_file, close_after_if_file_object=True, column_number=1,
             id_in_column_separator=",")
# BUGFIX: the original passed args.output (the raw string) to write(), so
# out_fd was never used and '-o stdout' created a file literally named
# "stdout". Pass the already-resolved handle instead;
# close_after_if_file_object=True lets IdList.write close it for us.
id_list.write(out_fd, close_after_if_file_object=True)
parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", action="store", dest="input", required=True, help="Input .gff file") parser.add_argument("-o", "--output", action="store", dest="output", required=True, help="Output .gff file") parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True, help="File with ids of genes to extract") parser.add_argument("-w", "--write_comments", action="store_true", dest="write_comments", help="Write comments to output") args = parser.parse_args() feature_id_list = IdList() feature_id_list.read(args.id_file) with open(args.input, "r") as in_fd: with open(args.output, "w") as out_fd: for line in in_fd: if (line[0] == "#") and args.write_comments: out_fd.write(line) continue description_list = line.split("\t")[9].split(";") feature_id = description_list[0].split("=")[1] if feature_id not in feature_id_list: continue out_fd.write(line) while True: description_list = in_fd.next().split("\t")[9].split(";")
help="Number of simultaneous downloads") parser.add_argument("-c", "--connections", action="store", dest="connections", type=int, default=8, help="Number of connections for each download") args = parser.parse_args() if (not args.ids) and (not args.id_file): raise ValueError("Both ids and id file were not set") loader = IdList() id_list = loader.read(args.id_file) if args.id_file else args.ids Axel.threads = args.threads Axel.parallel_download_from_sra(id_list, args.connections) """ options_list = [] for entry_id in id_list: ftp_path = NCBIRoutines.get_sra_ftp_path_from_id(entry_id) options_list.append("-n %i %s" % (args.connections, ftp_path)) tool = Tool(cmd="axel", max_threads=args.threads) tool.parallel_execute(options_list) for filename in os.listdir(os.getcwd()): if ".sra" not in filename:
def extract_sequences_from_selected_clusters(
        self, clusters_id_file, cluster_file, seq_file, output_dir="./",
        out_prefix=None, create_dir_for_each_cluster=False,
        skip_cluster_if_no_sequence_for_element=True,
        ):
    """
    Write the sequences of selected clusters, one FASTA file per cluster.

    :param clusters_id_file: optional file with cluster ids to extract; if
                             falsy, every cluster in cluster_file is used
    :param cluster_file: mapping of cluster id -> comma-separated element ids
    :param seq_file: sequence file(s) with all candidate sequences
    :param output_dir: directory for the per-cluster output files
    :param out_prefix: optional basename for outputs; setting it forces one
                       subdirectory per cluster so filenames do not collide
    :param create_dir_for_each_cluster: create <output_dir>/<cluster_id>/
    :param skip_cluster_if_no_sequence_for_element: skip (and count) clusters
           that reference elements absent from the sequence collection
    :return: number of skipped clusters
    """
    # TODO: TEST IT BEFORE USAGE.WAS SIGNIFICANTLY CHANGED WITHOUT TESTING
    cluster_id_list = IdList()
    cluster_dict = SynDict()
    #print(pep_file)
    self.safe_mkdir(output_dir)
    out_dir = self.check_path(output_dir)
    # Shared out_prefix would collide on one filename, so force per-cluster dirs.
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_files = self.make_list_of_path_to_files(seq_file)
    # Whole collection is parsed up-front; records are filtered per cluster below.
    protein_collection = CollectionSequence(in_file=seq_file, parsing_mode="parse")
    #protein_dict = self.parse_seq_file(protein_files, "index_db", format=seq_format, index_file="tmp.idx") if len(protein_files) > 1 else self.parse_seq_file(protein_files[0], parsing_mode, format=seq_format, index_file="tmp.idx")
    #SeqIO.index_db("tmp.idx", self.make_list_of_path_to_files(seq_file), format=seq_format)
    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(
                cluster_dict[fam_id], protein_collection.records)
            if absent_elements:
                print("Skipping cluster %s due to absent element(%s)" % (fam_id, ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue
        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                self.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)
            # Only the cluster's elements are written (whitelist filter).
            protein_collection.write(out_file, whitelist=cluster_dict[fam_id])
            #SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id], verbose=True),
            #            out_file, format=seq_format)
    # Leftover from the removed index_db code path:
    #if (len(protein_files) > 1) or (parsing_mode == "index_db"):
    #    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))
    return number_of_skipped_clusters