def get_codon_alignment_from_files(self, protein_aln_file, nucleotide_seq_file, codon_alignment_file,
                                   cds2protein_accordance_file=None, alignment_format="fasta",
                                   nucleotide_sequence_format="fasta", cds_index_file=None,
                                   retain_cds_index=False):
    """
    Build a codon alignment from a protein alignment plus the matching CDS sequences.

    The CDS file is indexed on disk (SeqIO.index_db); when no index path is supplied
    a throwaway "nuc_tmp.idx" is created in the current directory and removed again
    afterwards unless retain_cds_index is set. An optional two-column accordance file
    (cds_id <tab> protein_id) is loaded to map protein ids back to CDS ids.
    """
    index_path = cds_index_file if cds_index_file else "nuc_tmp.idx"
    alignment = AlignIO.read(protein_aln_file, format=alignment_format)
    cds_records = SeqIO.index_db(index_path, nucleotide_seq_file,
                                 format=nucleotide_sequence_format)

    accordance = None
    if cds2protein_accordance_file:
        # file columns are cds, protein; keyed here by protein (column 1)
        accordance = SynDict()
        accordance.read(cds2protein_accordance_file, key_index=1, value_index=0)

    self.get_codon_alignment(alignment, cds_records, codon_alignment_file,
                             protein2cds_accordance_dict=accordance)

    # drop the temporary index unless the caller supplied one or asked to keep it
    if not (cds_index_file or retain_cds_index):
        os.remove("nuc_tmp.idx")
def extract_dom_names_hits_from_domtblout(domtblout_file, output_file):
    """
    Collect query ids per domain name from an HMMER domtblout table
    (domain name is column 3, query id column 0; '#' lines are skipped).
    Writes the mapping to output_file when one is given and returns it.
    """
    domain_hits = SynDict()
    domain_hits.read(domtblout_file, header=False, separator=None,
                     allow_repeats_of_key=True, key_index=3, value_index=0,
                     comments_prefix="#")
    if output_file:
        domain_hits.write(output_file, splited_values=True)
    return domain_hits
def get_families_from_top_hits(top_hits_file, fam_file):
    """
    Group query ids (column 0) by their top hit (column 1) into families,
    write the resulting .fam file and return the mapping.
    """
    families = SynDict()
    families.read(top_hits_file, allow_repeats_of_key=True,
                  key_index=1, value_index=0, comments_prefix="#")
    families.write(fam_file, splited_values=True)
    return families
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file,
                                             output_dir="./", seq_format="fasta", out_prefix=None,
                                             create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    """
    Write one FASTA per cluster, containing the sequences of that cluster's members.

    Clusters come from cluster_file; if clusters_id_file is given only those cluster
    ids are handled, otherwise all clusters. Clusters with members missing from the
    sequence file are skipped (and counted) when skip_cluster_if_no_sequence_for_element
    is set. Returns the number of skipped clusters.

    NOTE(review): uses a hard-coded "tmp.idx" index file in the CWD, so two concurrent
    runs in one directory would clash -- verify acceptable.
    """
    from Routines import SequenceRoutines, FileRoutines
    cluster_id_list = IdList()
    cluster_dict = SynDict()
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # a shared out_prefix would make all output files collide, so force one dir per cluster
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    # on-disk id -> record index over all sequence files
    protein_dict = SeqIO.index_db("tmp.idx",
                                  FileRoutines.make_list_of_path_to_files(seq_file),
                                  format=seq_format)
    number_of_skipped_clusters = 0
    for fam_id in cluster_id_list if clusters_id_file else cluster_dict:
        if skip_cluster_if_no_sequence_for_element:
            # NOTE(review): cluster_dict[fam_id] is accessed before the membership
            # check below -- an id present in clusters_id_file but absent from
            # cluster_file would raise KeyError here. Confirm ids always match.
            absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id],
                                                                     protein_dict)
            if absent_elements:
                print "Skipping cluster %s due to absent element(%s)" % (
                    fam_id, ",".join(absent_elements))
                number_of_skipped_clusters += 1
                continue
        if fam_id in cluster_dict:
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                cluster_dict[fam_id],
                                                                verbose=True),
                        out_file, format=seq_format)
    os.remove("tmp.idx")
    print "%i of %i clusters were skipped due to absent elements" % (
        number_of_skipped_clusters, len(cluster_dict))
    return number_of_skipped_clusters
def extract_hits_from_tbl_output(blast_hits, output_file):
    """
    Collapse tabular BLAST hits into query id -> comma-separated subject ids,
    write the table to output_file and return the mapping.
    """
    collected = SynDict()
    collected.read(blast_hits, allow_repeats_of_key=True,
                   key_index=0, value_index=1, separator="\t")
    collected.write(output_file, splited_values=True,
                    separator="\t", values_separator=",")
    return collected
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                elements_with_absent_synonyms_file=None,
                                syn_file_key_column_index=0, syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    """
    Rename cluster members using a synonym table.

    Members without a synonym keep their original name and are recorded per
    cluster; such clusters are dropped from the output entirely when
    remove_clusters_with_not_renamed_elements is set. The per-cluster record of
    unrenamed members is optionally written to a file and always returned.
    """
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#", key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index,
                  separator=syn_file_column_separator)
    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",",
                       comments_prefix="#")

    output_clusters_dict = SynDict()
    absent_elements_dict = SynDict()

    for cluster_id in clusters_dict:
        renamed_members = []
        fully_renamed = True
        for member in clusters_dict[cluster_id]:
            if member in syn_dict:
                renamed_members.append(syn_dict[member])
                continue
            # no synonym: keep the original name and remember it
            fully_renamed = False
            renamed_members.append(member)
            if cluster_id in absent_elements_dict:
                absent_elements_dict[cluster_id].append(member)
            else:
                absent_elements_dict[cluster_id] = [member]
        if fully_renamed or not remove_clusters_with_not_renamed_elements:
            output_clusters_dict[cluster_id] = renamed_members

    output_clusters_dict.write(output_clusters_file, splited_values=True)
    if elements_with_absent_synonyms_file:
        absent_elements_dict.write(elements_with_absent_synonyms_file, splited_values=True)
    return absent_elements_dict
def add_len_to_simple_output(top_hits_simple, len_file, out_file):
    """
    Annotate a simple top-hits table with the query length and the fraction of
    the query covered by the hit.

    Input columns (tab-separated): query id, start, end, hit id, ...
    Output columns: query id, query length, hit id, start, end, coverage fraction.
    """
    length_dict = SynDict()
    length_dict.read(len_file)
    with open(top_hits_simple, "r") as in_fd:
        with open(out_file, "w") as out_fd:
            for record_line in in_fd:
                fields = record_line.strip().split("\t")
                query_id = fields[0]
                query_len = length_dict[query_id]
                start, end = fields[1], fields[2]
                # inclusive coordinates, hence the +1
                coverage = (float(end) - float(start) + 1) / float(query_len)
                out_fd.write("%s\t%s\t%s\t%s\t%s\t%f\n"
                             % (query_id, query_len, fields[3], start, end, coverage))
def replace_region_names_in_gff(input_gff, synonyms_file, output_gff):
    """
    Rewrite the seqid (first) column of a GFF file using a synonym table.
    Comment lines and lines whose seqid has no synonym pass through unchanged.
    """
    synonyms = SynDict()
    synonyms.read(synonyms_file, comments_prefix="#")
    with open(input_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                if line[0] == "#":
                    out_fd.write(line)
                    continue
                columns = line.split("\t")
                if columns[0] not in synonyms:
                    out_fd.write(line)
                    continue
                columns[0] = synonyms[columns[0]]
                out_fd.write("\t".join(columns))
def label_cluster_elements_from_file(self, input_file, label, output_file, separator="@",
                                     label_position="first"):
    """
    Read clusters from input_file, attach `label` to every element (before or
    after the id, joined by `separator`), write the result and return it.
    """
    clusters = SynDict()
    clusters.read(input_file, split_values=True, comments_prefix="#")
    labeled = self.label_cluster_elements(clusters, label, separator=separator,
                                          label_position=label_position)
    labeled.write(output_file, splited_values=True)
    return labeled
def extract_clusters_by_element_ids_from_file(self, cluster_file, element_file, output_file, mode="w"):
    """
    mode: "w" - if elements from element_id_list are present in cluster extracts only that elements
          "a" - if elements from element_id_list are present in cluster extracts all elements
    """
    ids_of_interest = IdList()
    ids_of_interest.read(element_file, comments_prefix="#")
    clusters = SynDict()
    clusters.read(cluster_file, split_values=True, comments_prefix="#")
    selected = self.extract_clusters_by_element_ids(clusters, ids_of_interest, mode=mode)
    selected.write(output_file, splited_values=True)
def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file,
                                            output_dir="./", pep_format="fasta",
                                            out_prefix=None, create_dir_for_each_family=False):
    """
    Write one peptide file per family for the families listed in families_id_file
    (or every family in fam_file when no id file is given), pulling records from
    pep_file via an on-disk index.

    NOTE(review): uses a hard-coded "tmp.idx" index in the CWD -- concurrent runs
    in one directory would clash; verify acceptable.
    """
    from Routines import SequenceRoutines, FileRoutines
    fam_id_list = IdList()
    fam_dict = SynDict()
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # a shared out_prefix would make output files collide, so force one dir per family
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    # on-disk id -> record index over the peptide file
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)
    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                fam_dict[fam_id],
                                                                verbose=True),
                        out_file, format=pep_format)
        else:
            print("%s was not found" % fam_id)
    os.remove("tmp.idx")
def rename_elements_in_clusters(clusters_file, syn_file, output_clusters_file,
                                remove_clusters_with_not_renamed_elements=False,
                                syn_file_key_column_index=0, syn_file_value_column_index=1,
                                syn_file_column_separator='\t'):
    """
    Rename cluster members using a synonym table and write the renamed clusters.

    Members without a synonym keep their original name; when
    remove_clusters_with_not_renamed_elements is set, any cluster containing
    such a member is omitted from the output entirely.
    """
    syn_dict = SynDict()
    syn_dict.read(syn_file, comments_prefix="#", key_index=syn_file_key_column_index,
                  value_index=syn_file_value_column_index,
                  separator=syn_file_column_separator)
    clusters_dict = SynDict()
    clusters_dict.read(clusters_file, split_values=True, values_separator=",",
                       comments_prefix="#")

    output_clusters_dict = SynDict()
    for cluster_id in clusters_dict:
        new_names = []
        keep_cluster = True
        for member in clusters_dict[cluster_id]:
            if member in syn_dict:
                new_names.append(syn_dict[member])
            elif remove_clusters_with_not_renamed_elements:
                # one element without a synonym disqualifies the whole cluster
                keep_cluster = False
                break
            else:
                new_names.append(member)
        if keep_cluster:
            output_clusters_dict[cluster_id] = new_names

    output_clusters_dict.write(output_clusters_file, splited_values=True)
def add_length_to_fam_file(fam_file, len_file, out_file, close_after_if_file_object=False):
    """
    Append per-member sequence lengths to a family file.

    Output columns (tab-separated): family id, comma-joined member ids,
    comma-joined member lengths ("NA" for members absent from len_file).
    out_file may be an already-open file object or a path.
    """
    fam_dict = SynDict()
    fam_dict.read(fam_file, split_values=True, comments_prefix="#")
    len_dict = SynDict()
    len_dict.read(len_file, comments_prefix="#")
    # BUG FIX: the output path was previously opened with mode "r",
    # which made every write below fail.
    opened_here = not isinstance(out_file, file)
    out_fd = open(out_file, "w") if opened_here else out_file
    for family in fam_dict:
        len_list = []
        for member in fam_dict[family]:
            # str(): lengths may be ints; "NA" marks members with no known length
            # (the previous None entries made ",".join() raise TypeError)
            len_list.append("NA" if member not in len_dict else str(len_dict[member]))
        out_fd.write("%s\t%s\t%s\n" % (family, ",".join(fam_dict[family]),
                                       ",".join(len_list)))
    # close handles we opened ourselves (fixes a file-handle leak);
    # caller-supplied objects are closed only on explicit request
    if close_after_if_file_object or opened_here:
        out_fd.close()
# --- script fragment: CLI options and setup for a per-species family/CDS
#     accordance tool; `parser`, args.input and args.accordance_dir are defined
#     upstream of this view ---
parser.add_argument("-f", "--families_with_errors", action="store", dest="fam_error",
                    default="error.fam.ids",
                    help="File to write ids of families with errors")
parser.add_argument("-s", "--species_set", action="store", dest="species_set",
                    help="Comma-separated list of species.")
parser.add_argument("-l", "--name_last", action="store_false", dest="name_first", default=True,
                    help="Position of name of species in gene_id. Default: name first")
parser.add_argument("-e", "--name_separator", action="store", dest="name_separator", default=".",
                    help="Separator between species name and gene name. Default: '.'")

args = parser.parse_args()

# turn the comma-separated CLI string into a set of species names
args.species_set = set(args.species_set.split(","))

pep_fam_dict = SynDict()
pep_fam_dict.read(args.input, split_values=True)
cds_fam_dict = SynDict()
cds_dict = {}
accordance_dict = {}
for species in args.species_set:
    accordance_file = "%s/%s.accordance" % (args.accordance_dir, species)
    accordance_dict[species] = SynDict()
    # accordance files appear to map protein id (column 1) -> cds id (column 0)
    # -- TODO confirm column semantics against the files' producer
    accordance_dict[species].read(accordance_file, key_index=1, value_index=0)
# --- script fragment: copy per-taxon files, renaming them to species names via a
#     synonym table; starts inside an upstream parser.add_argument( call ---
                    type=FileRoutines.check_path,
                    help="Directory to write fam files named by species names")
parser.add_argument("-d", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File with taxa ids and species names")
parser.add_argument("-k", "--key_index", action="store", dest="key_index", type=int, default=0,
                    help="Key column in file with synonyms(0-based)")
parser.add_argument("-v", "--value_index", action="store", dest="value_index", type=int, default=1,
                    help="Value column in file with synonyms(0-based)")
parser.add_argument("-c", "--comments_prefix", action="store", dest="comments_prefix", default="#",
                    help="Prefix of comments in synonyms file")
parser.add_argument("-m", "--columns_separator", action="store", dest="separator", default="\t",
                    help="Column separator in file with synonyms")
parser.add_argument("-e", "--header", action="store_true", dest="header", default=False,
                    help="Header is present in synonyms file. Default - False")

args = parser.parse_args()

# taxon id -> species name
syn_dict = SynDict()
syn_dict.read(args.syn_file, header=args.header, separator=args.separator,
              key_index=args.key_index, value_index=args.value_index,
              comments_prefix=args.comments_prefix)
FileRoutines.safe_mkdir(args.output_files_dir)
input_files = os.listdir(args.input_files_dir)
for filename in input_files:
    # file basename is assumed to be the taxon id -- TODO confirm naming convention
    directory, taxon_id, extension = FileRoutines.split_filename(filename)
    if taxon_id not in syn_dict:
        print("Species name was not found for taxon %s" % taxon_id)
        continue
    shutil.copy("%s%s" % (args.input_files_dir, filename),
                "%s%s%s" % (args.output_files_dir, syn_dict[taxon_id], extension))
# --- script fragment: merge several .fam files into one family dict; starts
#     inside an upstream parser.add_argument( call ---
    "--input", action="store", dest="input", required=True,
    type=make_list_of_path_to_files_from_comma_sep_string,
    help="Comma-separated list of fam files or directories with them")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file")

args = parser.parse_args()

# NOTE(review): this compares args.input (a LIST after the type conversion above)
# with "stdout" -- it almost certainly should test args.output. As written, out_fd
# is always an opened file and is never written to (family_dict.write below is
# handed the file NAME args.output instead). Verify intended behavior before fixing.
out_fd = sys.stdout if args.input == "stdout" else open(args.output, "w")

family_dict = SynDict()
for filename in args.input:
    fam_dict = SynDict()
    fam_dict.read(filename, split_values=True)
    # merge: first occurrence wins the slot, later ones extend the member list
    for family in fam_dict:
        if family not in family_dict:
            family_dict[family] = fam_dict[family]
        else:
            family_dict[family] += fam_dict[family]

family_dict.write(args.output, splited_values=True)
if args.output != "stdout":
    out_fd.close()
# --- script fragment: per-species family statistics table; starts inside an
#     upstream parser.add_argument( call ---
                    help="Suffix of fam files")
args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_list = []
suffix_list = []
if args.use_basename:
    # derive species names and extensions from the files actually present
    for filename in sorted(os.listdir(args.input)):
        # NOTE(review): "dir" shadows the builtin of the same name
        dir, basename, ext = split_filename(filename)
        species_list.append(basename)
        suffix_list.append("%s" % ext)
else:
    species_list = sorted(args.species_set)
    suffix_list = [args.suffix for i in range(0, len(species_list))]

out_fd.write("#species\tnumber_of_families\tnumber_of_proteins\n")
for species, suffix in zip(species_list, suffix_list):
    fam_dict = SynDict()
    fam_dict.read("%s%s%s" % (args.input, species, suffix), separator="\t",
                  split_values=True, values_separator=",", key_index=0, value_index=1)
    # families = keys, proteins = total members across all families
    out_fd.write("%s\t%i\t%i\n" % (species, len(fam_dict), fam_dict.count_all_synonyms()))

if args.output != "stdout":
    out_fd.close()
# --- script fragment: pick a random reference gene per node family; starts inside
#     an upstream parser.add_argument( call and is truncated at the end ---
                    required=True, help="Reference family file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", default="stdout",
                    help="Prefix of output file")
args = parser.parse_args()

out_fd = sys.stdout if args.output_prefix == "stdout" else open(
    "%s_reference_random_genes.ids" % args.output_prefix, "w")

reference_families = SynDict()
reference_families.read(args.reference_fam, separator="\t", split_values=True,
                        values_separator=",")

node_family_ids = IdList()
node_family_ids.read(args.input, header=True, column_number=0, column_separator="\t")

reference_random_genes = SynDict()
for family_id in node_family_ids:
    if family_id not in reference_families:
        # "." marks families absent from the reference set
        reference_random_genes[family_id] = "."
    else:
        # truncated in this view: presumably choice(reference_families[family_id])
        reference_random_genes[family_id] = choice(
def replace_augustus_ids_by_syn(augustus_gff, output_gff, genes_syn_file, transcripts_syn_file,
                                cds_syn_file=None):
    """
    Rewrite gene/transcript/CDS ids in an AUGUSTUS GFF using synonym tables.

    The parser walks gene blocks delimited by "# start gene <id>" / "# end gene"
    comment lines: feature lines between them get their ID/Parent attributes
    replaced, comment lines inside a block are copied through, and parent/child
    id consistency is checked as it goes. Without a CDS synonym table, CDS ids
    are derived from the transcript synonym plus a ".cds" suffix.

    NOTE(review): uses Python 2's in_fd.next(); a file ending mid-block would
    raise StopIteration here rather than a clean error -- confirm inputs are
    always well-formed AUGUSTUS output.
    """
    genes_syn_dict = SynDict()
    genes_syn_dict.read(genes_syn_file, comments_prefix="#")
    transcripts_syn_dict = SynDict()
    transcripts_syn_dict.read(transcripts_syn_file, comments_prefix="#")
    cds_syn_dict = SynDict()
    if cds_syn_file:
        cds_syn_dict.read(cds_syn_file, comments_prefix="#")
    with open(augustus_gff, "r") as in_fd:
        with open(output_gff, "w") as out_fd:
            for line in in_fd:
                tmp = line.strip()
                # anything shorter than "# start gene" cannot open a gene block
                if len(tmp) < 13:
                    out_fd.write(line)
                    continue
                if tmp[:12] != "# start gene":
                    out_fd.write(line)
                    continue
                augustus_gene_id = tmp.split(" ")[-1]
                gene_syn_id = genes_syn_dict[augustus_gene_id]
                augustus_transcript_id = ""
                augustus_transcript_parent = ""
                out_fd.write("# start gene %s\n" % gene_syn_id)
                tmp = in_fd.next().strip()
                while True:
                    # feature lines (non-comment) of the current gene block
                    while tmp[0] != "#":
                        tmp_list = tmp.split("\t")
                        feature_type = tmp_list[2]
                        # keep all columns except the attributes column
                        edited_str = "\t".join(tmp_list[:-1])
                        info_field_list = tmp_list[-1].split(";")
                        if feature_type == "gene":
                            edited_str += "\tID=%s\n" % gene_syn_id
                        elif feature_type == "transcript":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_transcript_id = entry.split("=")[-1]
                                    transcript_syn_id = transcripts_syn_dict[augustus_transcript_id]
                                if "Parent" in entry:
                                    augustus_transcript_parent = entry.split("=")[-1]
                                    if augustus_transcript_parent != augustus_gene_id:
                                        raise ValueError("Transcript parent id and gene id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (transcript_syn_id, gene_syn_id)
                        elif feature_type == "CDS":
                            for entry in info_field_list:
                                if "ID" in entry:
                                    augustus_cds_id = entry.split("=")[-1]
                                    # [:-4] presumably strips a ".cds" suffix to recover the
                                    # transcript id -- TODO confirm AUGUSTUS id layout
                                    cds_syn_id = cds_syn_dict[augustus_cds_id] if cds_syn_dict else "%s.cds" % transcripts_syn_dict[augustus_cds_id[:-4]]
                                if "Parent" in entry:
                                    augustus_cds_parent = entry.split("=")[-1]
                                    if augustus_cds_parent != augustus_transcript_id:
                                        raise ValueError("CDS parent id and transcript id are not same!")
                            edited_str += "\tID=%s;Parent=%s\n" % (cds_syn_id,
                                                                   transcript_syn_id)
                        elif (feature_type == "stop_codon") or (feature_type == "start_codon"):
                            for entry in info_field_list:
                                if "Parent" in entry:
                                    augustus_feature_parent = entry.split("=")[-1]
                                    if augustus_feature_parent != augustus_transcript_id:
                                        raise ValueError("Feature parent id and transcript id are not same!")
                            edited_str += "\tParent=%s\n" % (transcript_syn_id)
                        else:
                            # unknown feature types are passed through untouched
                            edited_str = tmp
                        out_fd.write(edited_str)
                        tmp = in_fd.next().strip()
                    # comment lines inside the block are copied until the block ends
                    while tmp[0] == "#":
                        if "# end gene" in tmp:
                            break
                        out_fd.write(tmp + "\n")
                        tmp = in_fd.next().strip()
                    if "# end gene" in tmp:
                        break
                out_fd.write("# end gene %s\n" % gene_syn_id)
# --- script fragment: tail of an alignment/coverage pipeline; starts mid-expression
#     (the opening of the call/assignment is upstream) and is truncated at the end ---
            dont_add_read_groups else rmdup_sorted_filtered_alignment)
# compute per-position coverage over the regions in args.coverage_bed
GenomeCov.get_coverage(rmdup_sorted_filtered_alignment_with_groups
                       if not args.dont_add_read_groups else rmdup_sorted_filtered_alignment,
                       args.coverage_bed)
if not args.retain_temp:
    os.remove(raw_alignment)
    os.remove(filtered_alignment)
    os.remove(sorted_filtered_alignment)
if args.calculate_median_coverage or args.calculate_mean_coverage:
    # region id -> list of int coverage values (column 2 of the bed-like file)
    coverage_dict = SynDict()
    coverage_dict.read(args.coverage_bed, header=False, separator="\t",
                       allow_repeats_of_key=True, values_separator=",",
                       key_index=0, value_index=2, expression=int)
    if args.calculate_median_coverage:
        with open("%s_median_coverage.tab" % args.prefix, "w") as out_fd:
            for region in coverage_dict:
                # optionally drop flanking positions before taking the median
                mediana = median(array(coverage_dict[region] if args.flanks_size == 0
                                       else coverage_dict[region]
                                       [args.flanks_size:-args.flanks_size]))
                out_fd.write("%s\t%f\n" % (region, mediana))
    if args.calculate_mean_coverage:
        with open("%s_mean_coverage.tab" % args.prefix, "w") as out_fd:
            for region in coverage_dict:
                # truncated in this view: mean over the same (optionally trimmed) values
                meana = mean(
# --- script fragment: collapse a two-column table into key -> joined values;
#     starts inside an upstream parser.add_argument( call ---
                    help="Output file with collapsed strings")
parser.add_argument("-c", "--column_separator", action="store", dest="column_separator", default="\t",
                    help="Column separator. Default: '\\t'")
parser.add_argument("-v", "--value_separator", action="store", dest="value_separator", default=",",
                    help="Value separator. Default: ','")
parser.add_argument("-k", "--key_column", action="store", dest="key_column", default=0, type=int,
                    help="Column to be used as key(0-based). Default: 0")
parser.add_argument("-a", "--value_column", action="store", dest="value_column", default=1, type=int,
                    help="Column to be used as value(0-based). Default: 1")
parser.add_argument("-m", "--comments_prefix", action="store", dest="comments_prefix", default="#",
                    help="Prefix of strings(comments) to be ignored. Default: #")
parser.add_argument("-r", "--remove_value_repeats", action="store_true", dest="remove_value_repeats",
                    help="Remove repeats of values")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

# repeated keys accumulate their values into a list
syn_dict = SynDict()
syn_dict.read(args.input, header=False, separator=args.column_separator,
              allow_repeats_of_key=True, split_values=True,
              values_separator=args.value_separator, key_index=args.key_column,
              value_index=args.value_column, comments_prefix=args.comments_prefix)

if args.remove_value_repeats:
    collapsed_dict = syn_dict.remove_value_repeats()
    collapsed_dict.write(out_fd, splited_values=True, values_separator=args.value_separator,
                         close_after_if_file_object=True)
else:
    syn_dict.write(out_fd, splited_values=True, values_separator=args.value_separator,
                   close_after_if_file_object=True)
# --- script fragment: split sequences into per-group FASTA files; starts inside
#     an upstream parser.add_argument( call ---
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output",
                    type=FileRoutines.check_path,
                    help="Directory to output groups_of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract(.fam file).")
args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"

# NOTE(review): id_list is built but never used below -- verify whether it is
# referenced later in the script or is dead code.
id_list = IdSet(filename=args.id_file)
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
# on-disk id -> record index over all input sequence files
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict,
                                                        sequence_groups_id[group],
                                                        verbose=True),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
os.remove(tmp_index_file)
# Extract gene ids from families, either into one combined file or into one
# .ids file per family.
parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Input file with families")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", default=None,
                    help="File with ids of families. If absent genes from all families will be extracted(default).")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file")
parser.add_argument("-s", "--separate_families", action="store_true", dest="separate_families",
                    help="Separate families to different files. If set option -o/--output_file is ignored")

args = parser.parse_args()

families = SynDict()
families.read(args.input, separator="\t", split_values=True, values_separator=",")

if args.id_file:
    id_list = IdList()
    id_list = id_list.read(args.id_file)

if args.separate_families:
    # one "<family>.ids" file per family, one gene id per line
    for fam_id in id_list if args.id_file else families:
        with open("%s.ids" % fam_id, "w") as fam_fd:
            for gene_id in families[fam_id]:
                fam_fd.write(gene_id + "\n")
else:
    # BUG FIX: the output was previously opened twice -- once up front and again
    # unconditionally via open(args.output, "w") -- which leaked the first handle
    # and created a file literally named "stdout" when -o was left at its default.
    # Open the destination once, only when it is actually used.
    out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
    for fam_id in id_list if args.id_file else families:
        for gene_id in families[fam_id]:
            out_fd.write(gene_id + "\n")
    if out_fd is not sys.stdout:
        out_fd.close()
def star_and_htseq(self, genome_dir, samples_directory, output_directory, gff_for_htseq,
                   count_table_file, genome_fasta=None, samples_to_handle=None, genome_size=None,
                   annotation_gtf=None, feature_from_gtf_to_use_as_exon=None,
                   exon_tag_to_use_as_transcript_id=None, exon_tag_to_use_as_gene_id=None,
                   length_of_sequences_flanking_junction=None, junction_tab_file_list=None,
                   three_prime_trim=None, five_prime_trim=None,
                   adapter_seq_for_three_prime_clip=None,
                   max_mismatch_percent_for_adapter_trimming=None,
                   three_prime_trim_after_adapter_clip=None, output_type="BAM", sort_bam=True,
                   max_memory_for_bam_sorting=None, include_unmapped_reads_in_bam=True,
                   output_unmapped_reads=True, two_pass_mode=False, star_dir=None, threads=1,
                   max_intron_length=None, stranded_rnaseq="yes", min_alignment_quality=10,
                   feature_type_for_htseq="exon", feature_id_attribute_for_htseq="gene_id",
                   htseq_mode="union"):
    """
    RNA-seq quantification pipeline: align every sample with STAR, index the
    sorted BAM with samtools, count reads per feature with HTSeq, and collect
    all per-sample counts into a single two-level table written to
    count_table_file.
    """
    STAR.threads = threads
    STAR.path = star_dir
    if genome_fasta:
        # NOTE(review): annotation_gtf=None is passed here even when the
        # annotation_gtf argument was supplied -- verify the index is meant
        # to be built without the annotation.
        STAR.index(genome_dir, genome_fasta, annotation_gtf=None, junction_tab_file=None,
                   sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
                   genome_size=genome_size)
    sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(
        samples_directory)
    self.prepare_diff_expression_directories(output_directory, sample_list)
    alignment_dir = "%s/alignment/" % output_directory
    count_table = TwoLvlDict()
    for sample in sample_list:
        print("Handling %s" % sample)
        sample_dir = "%s/%s/" % (samples_directory, sample)
        alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
        filetypes, forward_files, reverse_files = self.make_lists_forward_and_reverse_files(
            sample_dir)
        print "\tAligning reads..."
        STAR.align(genome_dir, forward_files, reverse_read_list=reverse_files,
                   annotation_gtf=annotation_gtf,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type, sort_bam=sort_bam,
                   max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=alignment_sample_dir, two_pass_mode=two_pass_mode,
                   max_intron_length=max_intron_length)
        # STAR's fixed output name for the coordinate-sorted BAM
        alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir
        print "\tIndexing alignment file..."
        os.system("samtools index %s" % alignment_file)
        print "\tCounting reads aligned to features..."
        count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
        HTSeq.count(alignment_file, gff_for_htseq, count_file, samtype="bam", order="pos",
                    stranded_rnaseq=stranded_rnaseq,
                    min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq,
                    feature_id_attribute=feature_id_attribute_for_htseq, mode=htseq_mode,
                    suppress_progres_report=False)
        sample_counts = SynDict()
        # comments_prefix="__" drops HTSeq's trailing "__no_feature" etc. summary rows
        sample_counts.read(count_file, header=False, separator="\t",
                           allow_repeats_of_key=False, split_values=False,
                           values_separator=",", key_index=0, value_index=1,
                           close_after_if_file_object=False, expression=None,
                           comments_prefix="__")
        count_table[sample] = sample_counts
    count_table.write(count_table_file)
# --- script fragment: extract selected families from a fam file by id;
#     `parser` is defined upstream of this view ---
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input fam file")
parser.add_argument("-d", "--id_file", action="store", dest="id_file", required=True,
                    help="File with ids of families to extract")
parser.add_argument("-o", "--output", action="store", dest="output", default="stdout",
                    help="File to write extracted families. Default - stdout")
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose",
                    help="Print not found ids. Default - no")
args = parser.parse_args()

# despite the name, this is a writable file object (or stdout)
out_file = sys.stdout if args.output == "stdout" else open(args.output, "w")

fam_dict = SynDict()
fam_dict.read(args.input)
id_set = IdSet()
id_set.read(args.id_file)

extracted_dict = SynDict()
for id_entry in id_set:
    if id_entry in fam_dict:
        extracted_dict[id_entry] = fam_dict[id_entry]
    else:
        if args.verbose:
            print("%s was not found" % id_entry)
extracted_dict.write(out_file, close_after_if_file_object=True)
# --- script fragment: per-file substitution summary setup; starts inside an
#     upstream parser.add_argument( call and is truncated at the end ---
                    help=
                    "Remove nucleotide substitutions from output(preserve only AA substitutions)"
                    )
parser.add_argument("-c", "--convert_aa_to_single_letter", action="store_true",
                    dest="convert_to_single_letter",
                    help="Convert aminoacids to single letters")
args = parser.parse_args()

# expand files/directories into a flat list of file paths
args.input = make_list_of_path_to_files(args.input)

gene_alias_dict = SynDict()
if args.gene_alias_file:
    gene_alias_dict.read(args.gene_alias_file, split_values=False)

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

summary_dict = TwoLvlDict()
for filename in args.input:
    directory, prefix, extension = split_filename(filename)
    # build the row label from the requested combination of path parts
    if args.write_dir_path and args.write_ext:
        name = filename
    elif args.write_dir_path:
        name = (directory + prefix) if directory else prefix
    elif args.write_ext:
        name = prefix + extension
    else:
        name = prefix
    # truncated in this view: presumably strips args.suffix_to_remove from name
    if args.suffix_to_remove in name:
# --- script fragment: begins inside an upstream helper (its def line is not
#     visible here) and is truncated inside extract_fam_graph at the end ---
    # every node of the edge must belong to the id list
    for node in nodes_list:
        if node not in id_list:
            return False
    return True


def check_edge_soft(nodes_list, id_list):
    # True if at least one node of the edge belongs to the id list
    for node in nodes_list:
        if node in id_list:
            return True
    return False


families_dict = SynDict()
families_dict.read(args.fam_file, separator="\t", split_values=True, values_separator=",")

try:
    os.mkdir(args.output_dir)
except OSError:
    # output directory already exists
    pass

# each graph line is a tab-separated record from the hclust input
graph_list = []
with open(args.hclust_input, "r") as in_fd:
    for line in in_fd:
        graph_list.append(line.strip().split("\t"))


def extract_fam_graph(family_name):
    print("Started extraction for family %s" % family_name)
# --- script fragment: convert a gene/family hit table into a .fam file; starts
#     inside an upstream parser.add_argument( call ---
                    action="store", dest="output", default="stdout", help="Output file")
parser.add_argument("-k", "--family_column", action="store", dest="fam_col", default=1, type=int,
                    help="Family column position(0-based). Default: 1")
parser.add_argument("-a", "--genes_column", action="store", dest="gen_col", default=0, type=int,
                    help="Genes column position(0-based). Default: 0")
args = parser.parse_args()

# family id -> list of gene ids (repeated family keys accumulate)
hit_dict = SynDict()
hit_dict.read(args.input, header=args.header, allow_repeats_of_key=True,
              key_index=args.fam_col, value_index=args.gen_col)
hit_dict.write(args.output, splited_values=True)
# --- script fragment: collapse peptide isoforms by description via shell tools;
#     starts inside an upstream string-formatting call ---
                                          pep_uniq_description_file)
# strip "isoform ..." tails so all isoforms of a gene share one description
remove_isoform_versions_str = "sed s/isoform.*// %s > %s" % (
    pep_uniq_description_file, pep_uniq_description_no_isoform_versions)
for exe_string in get_pep_decription_str, get_uniq_descriptions_str, remove_isoform_versions_str:
    print(exe_string)
    os.system(exe_string)

os.system(awk_extract_ids_string % (pep_uniq_description_file, pep_uniq_ids))

# description -> list of peptide ids sharing it (key is column 1, value column 0)
syn_dict = SynDict()
syn_dict.read(pep_uniq_description_no_isoform_versions, header=False, separator="\t",
              allow_repeats_of_key=True, split_values=True, values_separator=",",
              key_index=1, value_index=0, comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms, splited_values=True, values_separator=",")

length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input, format="fasta",
                                                         out_file=len_file)

# NOTE(review): these handles are opened here and presumably closed further down
# the script, outside this view -- confirm they are not leaked.
descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")