def get_longest_pep_per_gene_from_ensembl_pep_dict(protein_dict, output_prefix=None):
    if output_prefix:
        length_file = "%s.protein_length.tsv" % output_prefix
        longest_protein_id_file = "%s.longest_pep.ids" % output_prefix
        len_fd = open(length_file, "w")
        len_fd.write("#gene_id\tprotein_id\tprotein_length\n")

    data_dict = OrderedDict()
    for protein_id in protein_dict:
        length = len(protein_dict[protein_id].seq)
        description_list = protein_dict[protein_id].description.split()
        for entry in description_list:
            if "gene:" in entry:
                gene_id = entry.split(":")[1]
        if output_prefix:
            len_fd.write("%s\t%s\t%i\n" % (gene_id, protein_id, length))
        if gene_id not in data_dict:
            data_dict[gene_id] = protein_id
        elif length > len(protein_dict[data_dict[gene_id]].seq):
            data_dict[gene_id] = protein_id

    longest_pep_ids = IdList(data_dict.values())
    if output_prefix:
        longest_pep_ids.write(longest_protein_id_file)
        len_fd.close()
    return longest_pep_ids
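# Illustrative sketch (not part of the original module): how the "gene:" tag is parsed
# from an Ensembl peptide description and the longest isoform per gene is kept.
# The record ids, sequences and descriptions below are made-up examples.
descriptions = {
    "ENSP001": ("MKV" * 100, "pep scaffold:1 gene:ENSG001 transcript:ENST001"),
    "ENSP002": ("MKV" * 150, "pep scaffold:1 gene:ENSG001 transcript:ENST002"),
}
longest = {}
for pep_id, (seq, desc) in descriptions.items():
    gene_id = [field.split(":")[1] for field in desc.split() if field.startswith("gene:")][0]
    if (gene_id not in longest) or (len(seq) > len(descriptions[longest[gene_id]][0])):
        longest[gene_id] = pep_id
print(longest)  # {'ENSG001': 'ENSP002'}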
def calculate_fpkm_for_count_table(count_table_file, transcript_length_file, output_file, separator="\t"):
    length_dict = SynDict(filename=transcript_length_file, expression=int, comments_prefix="#")

    with open(count_table_file, "r") as in_fd:
        header_list = in_fd.readline().strip().split(separator)
        samples_list = header_list[1:]
        gene_list = IdList()
        count_list = []
        for line in in_fd:
            tmp = line.strip().split(separator)
            gene_list.append(tmp[0])
            count_list.append(list(map(float, tmp[1:])))

    per_sample_total_counts = []
    for sample_index in range(0, len(samples_list)):
        total_counts = 0
        for gene_index in range(0, len(count_list)):
            total_counts += count_list[gene_index][sample_index]
        per_sample_total_counts.append(total_counts)

    with open(output_file, "w") as out_fd:
        out_fd.write(separator.join(header_list) + "\n")
        for gene_index in range(0, len(count_list)):
            normalized_counts_list = []
            for sample_index in range(0, len(samples_list)):
                gene_count = count_list[gene_index][sample_index] * (10 ** 9) \
                             / length_dict[gene_list[gene_index]] \
                             / per_sample_total_counts[sample_index]
                normalized_counts_list.append(gene_count)
            out_fd.write("%s\t%s\n" % (gene_list[gene_index], "\t".join(map(str, normalized_counts_list))))
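# Worked example (illustrative only): FPKM for one gene in one sample is
#   count * 1e9 / (transcript_length * total_counts_in_sample),
# which is what the nested loops above compute term by term. Numbers are made up.
count = 200.0          # reads assigned to the gene in this sample
length = 2000          # transcript length in bp
total_counts = 1.0e7   # total counts in the sample
fpkm = count * (10 ** 9) / length / total_counts
print(fpkm)  # 10.0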
def extract_annotation_by_refence_id(list_of_target_gff, id_file, extracted_gff, filtered_out_gff):
    ids = IdList()
    ids.read(id_file)

    extracted_gff_fd = open(extracted_gff, "w")
    filtered_out_gff_fd = open(filtered_out_gff, "w")
    for filename in list_of_target_gff:
        with open(filename, "r") as in_fd:
            for line in in_fd:
                tmp = line
                if tmp == "# --- START OF GFF DUMP ---\n":
                    # skip comment lines until the first feature line with the target name appears
                    while tmp[0] == "#":
                        tmp = next(in_fd, "")
                    target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                    if target_name not in ids:
                        writing_fd = filtered_out_gff_fd
                    else:
                        writing_fd = extracted_gff_fd
                    writing_fd.write(tmp)
                    while True:
                        tmp = next(in_fd, "")
                        if tmp == "# --- END OF GFF DUMP ---\n":
                            break
                        writing_fd.write(tmp)
                if tmp == "":
                    break
    extracted_gff_fd.close()
    filtered_out_gff_fd.close()
def extract_top_hits_from_target_gff(list_of_target_gff, top_hits_gff, secondary_hits_gff,
                                     id_white_list_file=None, max_hits_per_query=None):
    if id_white_list_file:
        white_ids = IdList()
        white_ids.read(id_white_list_file)
    top_hits_gff_fd = open(top_hits_gff, "w")
    secondary_hits_gff_fd = open(secondary_hits_gff, "w")
    targets_list = []
    hit_counter = 0
    gene_counter = 0
    for filename in list_of_target_gff:
        with open(filename, "r") as in_fd:
            for line in in_fd:
                tmp = line
                if tmp == "# --- START OF GFF DUMP ---\n":
                    # skip comment lines until the first feature line with the target name appears
                    while tmp[0] == "#":
                        tmp = next(in_fd, "")
                    target_name = tmp.split("\t")[8].split(";")[1].split()[1]
                    if id_white_list_file:
                        if target_name not in white_ids:
                            continue
                    if target_name not in targets_list:
                        writing_fd = top_hits_gff_fd
                        targets_list.append(target_name)
                        gene_counter += 1
                        hit_counter = 0
                    else:
                        writing_fd = secondary_hits_gff_fd
                    hit_counter += 1
                    tmp = tmp.replace("gene_id 0", "gene_id g%i_h%i" % (gene_counter, hit_counter))
                    # if no limit is set, write every hit; otherwise only the first max_hits_per_query hits
                    if (max_hits_per_query is None) or (hit_counter <= max_hits_per_query):
                        writing_fd.write(tmp)
                    while True:
                        tmp = next(in_fd, "")
                        if tmp == "# --- END OF GFF DUMP ---\n":
                            break
                        if max_hits_per_query:
                            if hit_counter > max_hits_per_query:
                                continue
                        writing_fd.write(tmp)
                if tmp == "":
                    break
    top_hits_gff_fd.close()
    secondary_hits_gff_fd.close()
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file, output_dir="./",
                                             seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    from Routines import SequenceRoutines, FileRoutines

    cluster_id_list = IdList()
    cluster_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", FileRoutines.make_list_of_path_to_files(seq_file), format=seq_format)

    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id], protein_dict)
            if absent_elements:
                print("Skipping cluster %s due to absent element(s): %s" % (fam_id, ",".join(absent_elements)))
                number_of_skipped_clusters += 1
                continue

        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, cluster_dict[fam_id], verbose=True),
                        out_file, format=seq_format)

    os.remove("tmp.idx")
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     len(cluster_dict)))
    return number_of_skipped_clusters
def create_per_cluster_element_id_files(self, cluster_dict, output_directory):
    self.safe_mkdir(output_directory)
    for cluster_id in cluster_dict:
        cluster_element_id_list = IdList(cluster_dict[cluster_id])
        cluster_element_id_list.write("%s/%s.ids" % (output_directory, cluster_id))
def _get_tree_dist_values(tree, expression=None):
    feature_values_list = IdList()
    for node in tree.traverse():
        if expression is not None:
            if not expression(node):
                continue
        feature_values_list.append(node.dist)
    return feature_values_list
def filter_psl_by_ids_from_file(self, psl_file, output_file,
                                white_query_id_file=None, black_query_id_file=None,
                                white_target_id_file=None, black_target_id_file=None):
    self.filter_psl_by_ids(psl_file, output_file,
                           white_query_id_list=IdList(filename=white_query_id_file) if white_query_id_file else (),
                           black_query_id_list=IdList(filename=black_query_id_file) if black_query_id_file else (),
                           white_target_id_list=IdList(filename=white_target_id_file) if white_target_id_file else (),
                           black_target_id_list=IdList(filename=black_target_id_file) if black_target_id_file else ())
def handle_input(filename):
    sys.stdout.write("Handling %s\n" % filename)
    not_significant_ids = IdList()
    not_found_ids = IdList()
    prefix = split_filename(filename)[1]
    index_file = "%s.tmp.idx" % prefix
    hmm_dict = SearchIO.index_db(index_file, filename, args.format)
    if args.output == "stdout":
        out_fd = sys.stdout
    else:
        out_fd = open("%s%s.top_hits" % (args.top_hits_dir, prefix), "w")
        out_fd.write("#query\thit\tevalue\tbitscore\n")
    for query in hmm_dict:
        if hmm_dict[query].hits:
            if hmm_dict[query][0].is_included:
                out_fd.write("%s\t%s\t%s\t%s\n" % (query, hmm_dict[query][0].id,
                                                   hmm_dict[query][0].evalue, hmm_dict[query][0].bitscore))
            else:
                not_significant_ids.append(query)
        else:
            not_found_ids.append(query)
    if args.output != "stdout":
        out_fd.close()
    os.remove(index_file)
    return not_significant_ids, not_found_ids
def combine_ds_dn_w_from_bootstrap_data(self, input_dir, output_dir, use_node_names_if_possible=True):
    dn_dir = "%s/dN/" % output_dir
    ds_dir = "%s/dS/" % output_dir
    w_dir = "%s/W/" % output_dir
    for directory in output_dir, dn_dir, ds_dir, w_dir:
        self.safe_mkdir(directory)

    input_files = map(lambda s: "%s/%s" % (input_dir, s), os.listdir(input_dir))

    data_dict = OrderedDict()
    for filename in input_files:
        with open(filename, "r") as in_fd:
            in_fd.readline()  # skip header line
            for line in in_fd:
                node_id, node_name, dn, ds, w = line.strip().split("\t")
                if use_node_names_if_possible:
                    node = node_id if node_name == "." else node_name
                else:
                    node = node_id
                if node not in data_dict:
                    data_dict[node] = OrderedDict()
                    for parameter in "dN", "dS", "W":
                        data_dict[node][parameter] = IdList()
                data_dict[node]["dN"].append(dn)
                data_dict[node]["dS"].append(ds)
                data_dict[node]["W"].append(w)

    for node in data_dict:
        for parameter in "dN", "dS", "W":
            out_file = "%s/%s/%s.tsv" % (output_dir, parameter, node)
            data_dict[node][parameter].write(out_file)
def extract_entries_by_GO_from_eggnogmapper_output(eggnogmapper_output, GO_file, output_prefix,
                                                   comments_prefix="#", separator="\t"):
    GO_list = IdList(filename=GO_file, column_number=0)
    print(len(GO_list))

    extracted_entries_file = "%s.annotations" % output_prefix
    extracted_entries = 0
    with open(eggnogmapper_output, "r") as eggnog_fd:
        with open(extracted_entries_file, "w") as out_fd:
            for line in eggnog_fd:
                if line[0] == comments_prefix:
                    out_fd.write(line)
                    continue
                line_list = line.strip().split(separator)
                entry_GO_list = line_list[5].split(",")
                for GO in entry_GO_list:
                    if GO in GO_list:
                        out_fd.write(line)
                        extracted_entries += 1
                        break
    print("Extracted %i entries" % extracted_entries)
def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w",
                                              cluster_column=0, element_column=1, column_separator="\t",
                                              element_separator=",", id_column=None):
    """
    mode: "w" - if elements from element_id_list are present in a cluster, extract only those elements
          "a" - if elements from element_id_list are present in a cluster, extract all of its elements
    """
    cluster_dict = SynDict(filename=cluster_file, split_values=True, comments_prefix="#",
                           key_index=cluster_column, value_index=element_column,
                           separator=column_separator, values_separator=element_separator)
    element_id_list = IdList(filename=element_file, comments_prefix="#", column_number=id_column)
    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list, mode=mode)
    extracted_clusters.write(output_file, splited_values=True)
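# Illustrative sketch (not the actual extract_clusters_by_element_ids implementation) of the two
# documented modes, using plain dicts: with element ids {"p2"}, mode "w" keeps only the matching
# elements of a hit cluster, while mode "a" keeps the whole cluster. All names are made up.
clusters = {"fam1": ["p1", "p2", "p3"], "fam2": ["p4"]}
wanted = {"p2"}
mode_w = {fam: [e for e in elements if e in wanted]
          for fam, elements in clusters.items() if any(e in wanted for e in elements)}
mode_a = {fam: elements
          for fam, elements in clusters.items() if any(e in wanted for e in elements)}
print(mode_w)  # {'fam1': ['p2']}
print(mode_a)  # {'fam1': ['p1', 'p2', 'p3']}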
def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w"):
    """
    mode: "w" - if elements from element_id_list are present in a cluster, extract only those elements
          "a" - if elements from element_id_list are present in a cluster, extract all of its elements
    """
    cluster_dict = SynDict()
    cluster_dict.read(cluster_file, split_values=True, comments_prefix="#")
    element_id_list = IdList()
    element_id_list.read(element_file, comments_prefix="#")
    extracted_clusters = self.extract_clusters_by_element_ids(cluster_dict, element_id_list, mode=mode)
    extracted_clusters.write(output_file, splited_values=True)
def extract_evidence_by_ids(evidence_file, id_file, output_evidence_file, mode="transcript"):
    # possible modes: transcript, gene
    ids = IdList()
    ids.read(id_file, comments_prefix="#")

    column_id = 0 if mode == "gene" else 1

    with open(evidence_file, "r") as ev_fd:
        with open(output_evidence_file, "w") as out_fd:
            for line in ev_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                entry_id = line.split("\t")[column_id]
                if entry_id in ids:
                    out_fd.write(line)
def extract_emapper_annotations_by_protein_ids(emapper_annotation_file, protein_id_file, output_annotations):
    protein_ids = IdList(filename=protein_id_file)
    with open(emapper_annotation_file, "r") as ann_fd:
        with open(output_annotations, "w") as out_fd:
            for line in ann_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                if line.split("\t")[0] in protein_ids:
                    out_fd.write(line)
def add_length_to_accordance_file(accordance_file, length_file, output_prefix):
    accordance_dict = SynDict(filename=accordance_file, allow_repeats_of_key=True)
    length_dict = SynDict(filename=length_file, expression=int)
    longest_list = IdList()

    all_output_file = "%s.all.correspondence" % output_prefix
    longest_output_file = "%s.longest.correspondence" % output_prefix
    longest_id_file = "%s.longest.ids" % output_prefix

    with open(all_output_file, "w") as all_out_fd:
        with open(longest_output_file, "w") as longest_out_fd:
            for gene in accordance_dict:
                current_transcript = None
                current_length = 0
                for transcript in accordance_dict[gene]:
                    if length_dict[transcript] > current_length:
                        current_transcript = transcript
                        current_length = length_dict[transcript]
                    all_out_fd.write("%s\t%s\t%i\n" % (gene, transcript, length_dict[transcript]))
                longest_out_fd.write("%s\t%s\t%i\n" % (gene, current_transcript, current_length))
                longest_list.append(current_transcript)
    longest_list.write(longest_id_file)
def extract_counts_by_max_level(input_file, output_prefix, separator="\t", verbose=True):
    output_file = "%s.divided_by_maxlvl" % output_prefix
    zero_max_lvl_list = IdList()
    zero_max_lvl_list_file = "%s.zero_max_lvl.ids" % output_prefix

    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)

        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(list(map(float, tmp_line[1:])))
                max_level = max(data)
                if max_level == 0:
                    zero_max_lvl_list.append(tmp_line[0])
                    if verbose:
                        print("Zero max level for %s...Skipping..." % tmp_line[0])
                    continue
                data /= max_level
                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                out_fd.write(output_string)

    zero_max_lvl_list.write(zero_max_lvl_list_file)
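# Illustrative sketch (not from the original module): each row of counts is divided by its
# own maximum, so the highest sample becomes 1.0 and the rest become ratios. Values are made up.
import numpy as np

row = np.array([4.0, 8.0, 2.0])   # counts for one gene across three samples
normalized = row / row.max()
print(normalized)                 # [0.5  1.   0.25]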
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file, output_dir="./",
                                            pep_format="fasta", out_prefix=None, create_dir_for_each_family=False):
    from Routines import SequenceRoutines, FileRoutines

    fam_id_list = IdList()
    fam_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)

    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict, fam_dict[fam_id], verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)
    os.remove("tmp.idx")
def cluster_sequence_names_by_id_fragment_from_file(self, seq_id_file, id_element_index, id_separator="_",
                                                    output_prefix=None):
    seq_id_list = IdList(filename=seq_id_file)
    self.cluster_sequence_names_by_id_fragment(seq_id_list, id_element_index, id_separator=id_separator,
                                               output_prefix=output_prefix)
def extract_ids_from_file(input_file, output_file=None, header=False, column_separator="\t",
                          comments_prefix="#", column_number=None):
    id_list = IdList()
    id_list.read(input_file, column_separator=column_separator, comments_prefix=comments_prefix,
                 column_number=column_number, header=header)
    if output_file:
        id_list.write(output_file, header=header)
    return id_list
def remove_elements_by_ids_from_files(self, input_file, output_file, black_list_file, mode="full"):
    cluster_dict = SynDict(filename=input_file, split_values=True)
    black_list = IdList(filename=black_list_file)
    filtered_dict = self.remove_elements_by_ids(cluster_dict, black_list, mode=mode)
    filtered_dict.write(output_file, splited_values=True)
def convert_gff_to_simple_bed(input_gff, output_bed, feature_type_list=[], scaffold_id_file=None):
    if scaffold_id_file:
        scaffolds_id_list = IdList(filename=scaffold_id_file)

    with open(input_gff, "r") as gff_fd:
        with open(output_bed, "w") as bed_fd:
            for line in gff_fd:
                if line[0] == "#":
                    continue
                tmp_list = line.strip().split("\t")
                if scaffold_id_file:
                    if tmp_list[0] not in scaffolds_id_list:
                        continue
                if feature_type_list:
                    if tmp_list[2] not in feature_type_list:
                        continue
                # coordinates are copied from the GFF as-is (1-based, inclusive)
                bed_fd.write("%s\t%s\t%s\n" % (tmp_list[0], tmp_list[3], tmp_list[4]))
def extract_clusters_and_elements_by_labels_from_files(self, cluster_file, label_file, output_file,
                                                       separator="@", label_position="first"):
    cluster_dict = SynDict(filename=cluster_file, split_values=True)
    label_list = IdList(filename=label_file) if isinstance(label_file, str) else label_file
    output_dict = self.extract_clusters_and_elements_by_labels(cluster_dict, label_list, separator=separator,
                                                               label_position=label_position)
    output_dict.write(output_file, splited_values=True)
def divide_counts_by_several_base_level(input_file, output_prefix, base_levels, separator="\t",
                                        verbose=True, max_ratio_to_base_lvl=0.5):
    output_file = "%s.divided_by_max_baselvl" % output_prefix
    max_ratio_to_base_lvl_file = "%s.divided_by_max_baselvl.max_%f_ratio" % (output_prefix, max_ratio_to_base_lvl)
    zero_max_base_lvl_list = IdList()
    zero_max_base_lvl_list_file = "%s.zero_base_lvls.ids" % output_prefix
    max_ratio_to_base_lvl_fd = open(max_ratio_to_base_lvl_file, "w")

    with open(input_file, "r") as in_fd:
        header = in_fd.readline()
        header_list = header.strip().split(separator)

        data_base_lvl_index_list = []
        base_level_list = [base_levels] if isinstance(base_levels, str) else base_levels
        for level in base_level_list:
            data_base_lvl_index_list.append(header_list.index(level) - 1)

        with open(output_file, "w") as out_fd:
            out_fd.write(header)
            max_ratio_to_base_lvl_fd.write(header)
            for line in in_fd:
                tmp_line = line.strip().split(separator)
                data = np.array(list(map(float, tmp_line[1:])))
                max_base_lvl = max(np.take(data, data_base_lvl_index_list))
                if max_base_lvl == 0:
                    zero_max_base_lvl_list.append(tmp_line[0])
                    if verbose:
                        print("Zero max base level(s) for %s...Skipping..." % tmp_line[0])
                    continue
                data /= max_base_lvl
                output_string = tmp_line[0] + "\t"
                output_string += "\t".join(map(str, data))
                output_string += "\n"
                if max(np.delete(data, data_base_lvl_index_list)) <= max_ratio_to_base_lvl:
                    max_ratio_to_base_lvl_fd.write(output_string)
                out_fd.write(output_string)

    zero_max_base_lvl_list.write(zero_max_base_lvl_list_file)
    max_ratio_to_base_lvl_fd.close()
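# Illustrative sketch (not from the original module): rows are divided by the maximum of the
# selected "base level" columns, and a row passes the ratio filter when every non-base column
# stays at or below max_ratio_to_base_lvl. Column names and values are made up.
import numpy as np

data = np.array([2.0, 10.0, 4.0])      # columns: tissueA, base1, base2
base_indices = [1, 2]
max_ratio_to_base_lvl = 0.5

data /= np.take(data, base_indices).max()           # -> [0.2, 1.0, 0.4]
passes = np.delete(data, base_indices).max() <= max_ratio_to_base_lvl
print(data, passes)                                 # [0.2 1.  0.4] True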
def extract_eggnog_fam_by_protein_syn_dict(self, eggnog_fam_dict, protein_syn_dict, output_prefix=None,
                                           species_id=None):
    extracted_families = SynDict()
    common_protein_names_to_families_dict = SynDict()
    common_names_to_eggnog_proteins_syn_dict = SynDict()
    not_found_proteins_common_names = IdList()

    transposed_eggnog_fam_dict = eggnog_fam_dict.exchange_key_and_value()

    for common_protein_name in protein_syn_dict:
        not_found = True
        for protein_id in protein_syn_dict[common_protein_name]:
            extended_protein_id = protein_id if species_id is None else species_id + "." + protein_id
            if extended_protein_id in transposed_eggnog_fam_dict:
                not_found = False
                fam_id = transposed_eggnog_fam_dict[extended_protein_id][0]
                if common_protein_name not in common_protein_names_to_families_dict:
                    common_protein_names_to_families_dict[common_protein_name] = [fam_id]
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name] = [extended_protein_id]
                else:
                    common_protein_names_to_families_dict[common_protein_name].append(fam_id)
                    common_names_to_eggnog_proteins_syn_dict[common_protein_name].append(extended_protein_id)
                if fam_id not in extracted_families:
                    extracted_families[fam_id] = eggnog_fam_dict[fam_id]
        if not_found:
            not_found_proteins_common_names.append(common_protein_name)

    if output_prefix:
        extracted_families.write(filename="%s.extracted_families.fam" % output_prefix,
                                 splited_values=True)
        common_protein_names_to_families_dict.write(filename="%s.common_protein_names_to_families.correspondence" % output_prefix,
                                                    splited_values=True)
        common_names_to_eggnog_proteins_syn_dict.write(filename="%s.common_protein_names_to_eggnog_proteins.correspondence" % output_prefix,
                                                       splited_values=True)
        not_found_proteins_common_names.write(filename="%s.not_found.common_names" % output_prefix)

    return extracted_families, common_protein_names_to_families_dict, \
           common_names_to_eggnog_proteins_syn_dict, not_found_proteins_common_names
def cluster_sequence_names_by_id_fragment(self, seq_id_list, id_element_index, id_separator="_",
                                          output_prefix=None):
    cluster_dict = SynDict()
    skipped_id_list = IdList()
    for seq_id in seq_id_list:
        seq_id_splited = seq_id.split(id_separator)
        if id_element_index < len(seq_id_splited):
            # cluster by the id fragment at the requested position
            if seq_id_splited[id_element_index] in cluster_dict:
                cluster_dict[seq_id_splited[id_element_index]].append(seq_id)
            else:
                cluster_dict[seq_id_splited[id_element_index]] = [seq_id]
        else:
            skipped_id_list.append(seq_id)

    if output_prefix:
        cluster_dict.write("%s.seqid.clusters" % output_prefix, splited_values=True)
        skipped_id_list.write("%s.seqid.skipped.ids" % output_prefix)

    return cluster_dict
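# Illustrative sketch (not from the original module): grouping sequence ids by the fragment
# at index 1 after splitting on "_", skipping ids that are too short. The ids are made up.
seq_ids = ["sp1_geneA_t1", "sp2_geneA_t2", "sp1_geneB_t1", "unsplittable"]
id_element_index = 1
clusters, skipped = {}, []
for seq_id in seq_ids:
    parts = seq_id.split("_")
    if id_element_index < len(parts):
        clusters.setdefault(parts[id_element_index], []).append(seq_id)
    else:
        skipped.append(seq_id)
print(clusters)  # {'geneA': ['sp1_geneA_t1', 'sp2_geneA_t2'], 'geneB': ['sp1_geneB_t1']}
print(skipped)   # ['unsplittable']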
parser.add_argument("-o", "--output_file", action="store", dest="output_file", help="Output file with extracted_annotations") parser.add_argument("-d", "--ids_file", action="store", dest="ids_file", help="File with ids of annotations to extract") parser.add_argument("-t", "--annotation_types", action="store", dest="annotation_types", default=["gene"], type=lambda s: s.split(","), help="Comma-separated list of annotation types to extract") args = parser.parse_args() annotation_ids = IdList() annotation_ids.read(args.ids_file, comments_prefix="#") #print args.annotation_types out_fd = open(args.output_file, "w") GFF.write( record_with_extracted_annotations_generator(args.input_gff, args.annotation_types), out_fd) out_fd.close()
action="store", dest="input", required=True, help="Input .gff file") parser.add_argument("-o", "--output_prefix", action="store", dest="output", default="stdout", help="Output file with single exon genes. Default: stdout") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") annotations_dict = SeqIO.to_dict(GFF.parse(open(args.input))) single_gene_id_list = IdList() for record in annotations_dict: for feature in annotations_dict[record].features: #print feature.id if feature.type != "gene": continue for subfeature in feature.sub_features: if subfeature.type != "mRNA": continue exon_number = 0 for mRNA_subfeature in subfeature.sub_features: if mRNA_subfeature.type == "exon": exon_number += 1 if exon_number == 1: single_gene_id_list.append(feature.id)
def check_gvcf_integrity(self, gvcf_file, output_prefix, reference=None, length_dict=None, parsing_mode="parse"):
    len_dict = length_dict if length_dict else self.get_lengths(record_dict=self.parse_seq_file(reference,
                                                                                                mode=parsing_mode),
                                                                out_file=None,
                                                                close_after_if_file_object=False)
    scaffold_dict = OrderedDict()

    with self.metaopen(gvcf_file, "r") as gvcf_fd:
        prev_scaffold = ""
        for line in gvcf_fd:
            if line[0] == "#":
                continue
            line_list = line.split("\t")
            scaffold = line_list[0]
            start = int(line_list[1])
            format = line_list[7].split(";")

            if (len(format) == 1) and (format[0][0:3] == "END"):
                end = int(format[0].split("=")[1])
            else:
                end = start + len(line_list[3]) - 1

            if scaffold not in scaffold_dict:
                scaffold_dict[scaffold] = []
            if scaffold != prev_scaffold:
                scaffold_dict[scaffold].append([deepcopy(start), deepcopy(end)])
            else:
                if scaffold_dict[scaffold][-1][1] + 1 >= start:
                    scaffold_dict[scaffold][-1][1] = deepcopy(max(end, scaffold_dict[scaffold][-1][1]))
                else:
                    print(scaffold_dict[scaffold])
                    print(line)
                    scaffold_dict[scaffold].append([deepcopy(start), deepcopy(end)])

            prev_scaffold = scaffold

    complete_scaffolds = IdList()
    fragmented_scaffolds = IdList()
    scaffolds_with_absent_fragments = IdList()

    with open("%s.scaffold_regions" % output_prefix, "w") as scaf_reg_fd:
        for scaffold in scaffold_dict:
            if len(scaffold_dict[scaffold]) > 1:
                fragmented_scaffolds.append(scaffold)
            scaffold_length = sum(map(lambda s: s[1] - s[0] + 1, scaffold_dict[scaffold]))
            if scaffold_length != len_dict[scaffold]:
                scaffolds_with_absent_fragments.append(scaffold)
            else:
                complete_scaffolds.append(scaffold)
            scaf_reg_fd.write("%s\t%s\n" % (scaffold, ",".join(map(lambda s: "-".join(map(str, s)),
                                                                   scaffold_dict[scaffold]))))

    complete_scaffolds.write("%s.complete_scaffolds" % output_prefix)
    fragmented_scaffolds.write("%s.fragmented_scaffolds" % output_prefix)
    scaffolds_with_absent_fragments.write("%s.scaffolds_with_absent_fragments" % output_prefix)
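# Illustrative sketch (not from the original module): how adjacent or overlapping (start, end)
# blocks collapse into covered regions, mirroring the merging logic above. Coordinates are
# made-up 1-based inclusive gVCF blocks for a single scaffold.
blocks = [(1, 100), (101, 250), (300, 400)]
regions = []
for start, end in blocks:
    if regions and regions[-1][1] + 1 >= start:
        regions[-1][1] = max(end, regions[-1][1])   # extend the previous region
    else:
        regions.append([start, end])                # gap found: start a new region
print(regions)  # [[1, 250], [300, 400]] -> two regions, so coverage has a gap before 300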
help="Output .gff file") parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True, help="File with ids of genes to extract") parser.add_argument("-w", "--write_comments", action="store_true", dest="write_comments", help="Write comments to output") args = parser.parse_args() feature_id_list = IdList() feature_id_list.read(args.id_file) with open(args.input, "r") as in_fd: with open(args.output, "w") as out_fd: for line in in_fd: if (line[0] == "#") and args.write_comments: out_fd.write(line) continue description_list = line.split("\t")[9].split(";") feature_id = description_list[0].split("=")[1] if feature_id not in feature_id_list: continue out_fd.write(line) while True: description_list = in_fd.next().split("\t")[9].split(";")