def extract_proteins_from_selected_families(families_id_file, fam_file, pep_file,
                                            output_dir="./", pep_format="fasta",
                                            out_prefix=None, create_dir_for_each_family=False):
    """
    Extract protein sequences for selected gene families into per-family .pep files.

    :param families_id_file: file with ids of families to extract; if falsy, every
                             family present in fam_file is processed.
    :param fam_file: .fam file mapping family id -> comma-separated protein ids.
    :param pep_file: sequence file with the proteins.
    :param output_dir: directory to write output files into (created if absent).
    :param pep_format: Biopython format name of pep_file and of the output.
    :param out_prefix: if set, every output file gets this name, which forces a
                       separate directory per family to avoid clobbering.
    :param create_dir_for_each_family: create a subdirectory per family even
                                       without out_prefix.
    """
    from Routines import SequenceRoutines, FileRoutines
    fam_id_list = IdList()
    fam_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)

    # out_prefix makes all family files share one basename, so per-family
    # directories are mandatory in that case.
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family

    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")

    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)
    try:
        for fam_id in fam_id_list if families_id_file else fam_dict:
            if fam_id not in fam_dict:
                print("%s was not found" % fam_id)
                continue
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                fam_dict[fam_id],
                                                                verbose=True),
                        out_file, format=pep_format)
    finally:
        # BUG FIX: previously the temporary SeqIO index leaked whenever
        # extraction raised; remove it unconditionally.
        os.remove("tmp.idx")
"--columns_separator", action="store", dest="separator", default="\t", help="Column separator in file with synonyms") parser.add_argument("-e", "--header", action="store_true", dest="header", default=False, help="Header is present in synonyms file. Default - False") parser.add_argument("-l", "--clear_description", action="store_true", dest="clear_description", default=False, help="Clear description. Default - False") args = parser.parse_args() SequenceRoutines.rename_records_from_files( args.input, args.output, args.syn_file, format=args.format, header=args.header, separator=args.separator, key_index=args.key_index, value_index=args.value_index, clear_description=args.clear_description, comments_prefix=args.comments_prefix)
parser.add_argument("-c", "--cds_file", action="store", dest="cds_file", required=True, help="Input file CDS sequences") parser.add_argument("-p", "--pep_file", action="store", dest="pep_file", required=True, help="Input file protein sequences") parser.add_argument("-o", "--output_file", action="store", dest="out", required=True, help="Output file") parser.add_argument("-f", "--format", action="store", dest="format", default="fasta", help="Format of input files. Allowed: fasta, genbank. Default: fasta") parser.add_argument("-v", "--verbose", action="store_true", dest="verbose", help="Print warning if no protein was found for CDS") parser.add_argument("-m", "--parsing_mode", action="store", dest="parsing_mode", default="parse", help="Parsing mode of sequence files. Allowed: parse, index, index_db." "Default: parse") parser.add_argument("-t", "--genetic_code_table", action="store", dest="genetic_code_table", default=1, type=int, help="Genetic code to use for translation of CDS. " "Allowed: table number from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi" "Default: 1(The standard code)") parser.add_argument("-d", "--id_check", action="store_true", dest="id_check", help="Also use id check - if there is id present in both files consider them as accordance") args = parser.parse_args() SequenceRoutines.get_cds_to_pep_accordance_from_files(args.cds_file, args.pep_file, args.out, verbose=args.verbose, parsing_mode=args.parsing_mode, genetic_code_table=args.genetic_code_table, include_id_check=args.id_check) if args.parsing_mode == "index_db": os.remove("cds_tmp.idx") os.remove("pep_tmp.idx")
# Tail of a CLI that splits a sequence file into per-group files according to
# a .fam-style id file. The opening of the first add_argument call lies before
# this chunk; fragment kept verbatim.
type=FileRoutines.make_list_of_path_to_files_from_string,
    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_directory", action="store", dest="output", type=FileRoutines.check_path,
                    help="Directory to output groups_of sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-e", "--extension", action="store", dest="extension",
                    help="Extension of output files. Default: equal to -f")
parser.add_argument("-d", "--id_file", action="store", dest="id_file",
                    help="File with groups of sequences to extract(.fam file).")
args = parser.parse_args()

FileRoutines.safe_mkdir(args.output)
args.extension = args.extension if args.extension else args.format
tmp_index_file = "temp.idx"
#id_list = read_ids(args.id_file)
# NOTE(review): id_list appears unused in this chunk — sequence_groups_id below
# re-reads the same file; confirm before removing.
id_list = IdSet(filename=args.id_file)
sequence_groups_id = SynDict()
sequence_groups_id.read(args.id_file, split_values=True)
#print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format)
# One output file per group, named <output_dir><group>.<extension>.
for group in sequence_groups_id:
    SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, sequence_groups_id[group], verbose=True),
                "%s%s.%s" % (args.output, group, args.extension), format=args.format)
os.remove(tmp_index_file)
dest="min_len", help="Minimum length of sequence to count. Default: not set") parser.add_argument( "-x", "--max_len", action="store", dest="max_len", help="Maximum length of sequence to count. Default: not set") parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode", default="index_db", help="Parsing mode for input sequence file. " "Possible variants: 'index_db'(default), 'index', 'parse'") args = parser.parse_args() SequenceRoutines.make_region_bed_file_from_file( args.input, args.output, white_id_file=args.white_list_ids, black_id_file=args.black_list_ids, output_format=args.bed_format, input_format=args.format, min_len=args.min_len, max_len=args.max_len, parsing_mode=args.parsing_mode, index_file="tmp.idx", retain_index=False)
# CLI wrapper: pick random per-species genomes out of a genbank file.
from Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", dest="input", action="store", required=True,
                    help="Comma separated list of genbank files/directories")
parser.add_argument("-o", "--output_file_prefix", dest="output_prefix", action="store", required=True,
                    help="Prefix of output files")
parser.add_argument("-f", "--format", dest="format", action="store", default="genbank",
                    help="Format of input and output file. Allowed formats genbank(default), fasta")

args = parser.parse_args()

# All heavy lifting happens in the shared routine; the chosen format doubles
# as the output type.
SequenceRoutines.get_random_species_genomes_from_genbank_file(args.input,
                                                              args.output_prefix,
                                                              output_type=args.format)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import SequenceRoutines, FileRoutines

# CLI wrapper: tabulate exon lengths from genbank annotation files.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", dest="input", action="store",
                    # expand a comma-separated argument into a flat list of file paths
                    type=lambda x: FileRoutines.make_list_of_path_to_files(x.split(",")),
                    help="Comma-separated list of genbank files/directories")
parser.add_argument("-o", "--output", dest="output", action="store",
                    help="Output file")

args = parser.parse_args()

SequenceRoutines.extract_exon_lengths_from_genbank_file(args.input, args.output)
import argparse

from Routines import SequenceRoutines

# CLI wrapper: build a table mapping record ids to species names.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", dest="input", action="store", required=True,
                    help="Input genbank file")
parser.add_argument("-o", "--output", dest="output", action="store", required=True,
                    help="Output file with species_counts")
parser.add_argument("-f", "--format", dest="format", action="store", default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

SequenceRoutines.get_id_to_species_accordance_from_file(args.input,
                                                        format=args.format,
                                                        output=args.output)
#!/usr/bin/env python __author__ = 'Sergei F. Kliver' import argparse from CustomCollections.GeneralCollections import IdList from Routines import SequenceRoutines, FileRoutines parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", action="store", dest="input", required=True, type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")), help="Comma-separated list of genbank files/directories with transcript annotations") parser.add_argument("-o", "--output", action="store", dest="output", required=True, help="Output file") parser.add_argument("-d", "--id_file", action="store", dest="id_file", help="File with id of transcripts to deal with") args = parser.parse_args() if args.id_file: id_list = IdList() id_list.read(args.id_file) else: id_list = None SequenceRoutines.extract_introns_from_transcripts_from_genbank_files(args.input, args.output, transcript_id_white_list=id_list)
"-s", "--separator", action="store", dest="separator", default="@", help="Separator between species name and sequence id. Default - '@'") parser.add_argument("-r", "--label_last", action="store_false", dest="label_first", default=True, help="Species label is at the end of id") args = parser.parse_args() input_file_list = sorted(os.listdir(args.input_dir)) FileRoutines.safe_mkdir(args.output_dir) if args.label_first: id_expression = lambda record_id: record_id.split(args.separator)[0] else: id_expression = lambda record_id: record_id.split(args.separator)[1] for filename in input_file_list: input_file = "%s%s" % (args.input_dir, filename) output_file = "%s%s" % (args.output_dir, filename) SequenceRoutines.rename_records_from_files( input_file, output_file, record_id_expression=id_expression, clear_description=True)
"--output_file", action="store", dest="output", required=True, help="Output file with translated sequences") parser.add_argument( "-g", "--genetic_code", action="store", dest="code", type=int, default=1, help= "Genetic code to use. Set by number of ncbi code. Default - 1 (universal)") parser.add_argument( "-s", "--translate_to_stop", action="store_true", dest="translate_to_stop", help= "Translate to first in-frame stop codon. Default - False(translate whole sequence)" ) args = parser.parse_args() SequenceRoutines.translate_sequences_from_file(args.input, args.output, format="fasta", id_expression=None, genetic_code_table=args.code, translate_to_stop=True)
def extract_proteins_from_output(self, augustus_output, protein_output, evidence_stats_file=None,
                                 supported_by_hints_file=None, complete_proteins_id_file=None, id_prefix="p."):
    """
    Parse raw AUGUSTUS output: write predicted protein sequences and, optionally,
    per-transcript hint-support statistics and longest-isoform extracts.

    :param augustus_output: raw AUGUSTUS prediction output file.
    :param protein_output: file to write protein sequences to.
    :param evidence_stats_file: if set, write support stats for every transcript
                                here and extract longest isoforms from them.
    :param supported_by_hints_file: if set, write stats only for transcripts with
                                    supported fraction > 0 here.
    :param complete_proteins_id_file: if set, write ids of proteins having both
                                      start and stop codons here.
    :param id_prefix: prefix prepended to transcript ids in all outputs.

    NOTE(review): reconstructed from whitespace-collapsed source; the nesting of
    the stats-flush inside the inner while loop follows the only placement that
    keeps 'break' legal and flushes once per transcript — confirm against VCS
    history.
    """
    if evidence_stats_file:
        ev_fd = open(evidence_stats_file, "w")
        ev_fd.write("#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t")
        ev_fd.write("5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n")
    # BUG FIX: this guard previously tested evidence_stats_file, so
    # open(supported_by_hints_file) could be called with None and the sup file
    # header depended on an unrelated option.
    if supported_by_hints_file:
        sup_fd = open(supported_by_hints_file, "w")
        sup_fd.write("#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t")
        sup_fd.write("5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n")
    if complete_proteins_id_file:
        complete_fd = open(complete_proteins_id_file, "w")
    with open(protein_output, "w") as out_fd:
        with open(augustus_output, "r") as in_fd:
            for line in in_fd:
                if line[:12] == "# start gene":
                    gene = line.strip().split()[-1]
                elif "\ttranscript\t" in line:
                    # first attribute of the GFF attribute column holds the transcript id
                    transcript_id = line.split("\t")[8].split(";")[0].split("=")[1]
                    start_presence = False
                    stop_presence = False
                elif "\tstart_codon\t" in line:
                    start_presence = True
                elif "\tstop_codon\t" in line:
                    stop_presence = True
                elif "# protein sequence" in line:
                    # protein may span several comment lines, delimited by [ ... ]
                    protein = line.strip().split("[")[-1]
                    if "]" in protein:
                        protein = protein.split("]")[0]
                    else:
                        while True:
                            part = in_fd.next().split()[-1]  # Python 2 file-iterator protocol
                            if "]" in part:
                                protein += part.split("]")[0]
                                break
                            else:
                                protein += part
                    if complete_proteins_id_file:
                        if start_presence and stop_presence:
                            complete_fd.write("%s%s\n" % (id_prefix, transcript_id))
                    out_fd.write(">%s%s\t gene=%s start_presence=%s stop_presence=%s\n" % (id_prefix,
                                                                                           transcript_id,
                                                                                           gene,
                                                                                           str(start_presence),
                                                                                           str(stop_presence)))
                    out_fd.write(protein)
                    protein_len = len(protein)
                    out_fd.write("\n")
                elif evidence_stats_file or supported_by_hints_file:
                    if line[:17] == "# % of transcript":
                        supported_fraction = line.strip().split()[-1]
                        # scan the following comment block for the support counters
                        while True:
                            tmp_line = in_fd.next()
                            if tmp_line[:12] == "# CDS exons:":
                                cds_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:14] == "# CDS introns:":
                                introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:13] == "# 5'UTR exons":
                                five_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:13] == "# 3'UTR exons":
                                three_introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:27] == "# incompatible hint groups:":
                                # last counter of the block: flush stats and stop scanning
                                incompatible_hint_groups = tmp_line.strip().split()[-1]
                                if evidence_stats_file:
                                    ev_fd.write("%s\t%s\t%s\t" % (gene, transcript_id, supported_fraction))
                                    ev_fd.write("%s\t%s\t%s\t%s\t%s\t%i\n" % (cds_support, introns_support,
                                                                              five_utr_support,
                                                                              three_introns_support,
                                                                              incompatible_hint_groups,
                                                                              protein_len))
                                if supported_by_hints_file and (float(supported_fraction) > 0):
                                    sup_fd.write("%s\t%s\t%s\t" % (gene, transcript_id, supported_fraction))
                                    sup_fd.write("%s\t%s\t%s\t%s\t%s\t%i\n" % (cds_support, introns_support,
                                                                               five_utr_support,
                                                                               three_introns_support,
                                                                               incompatible_hint_groups,
                                                                               protein_len))
                                break
    # robustness: these handles were never closed before, risking loss of
    # buffered writes when downstream steps read the files
    if complete_proteins_id_file:
        complete_fd.close()
    if supported_by_hints_file:
        sup_fd.close()
    if evidence_stats_file:
        ev_fd.close()
        self.extract_longest_isoforms(evidence_stats_file, "%s.longest_pep" % evidence_stats_file,
                                      minimum_supported_fraction=0)
        SequenceRoutines.extract_sequence_by_ids(protein_output,
                                                 "%s.longest_pep.ids" % evidence_stats_file,
                                                 "%s.longest_pep.pep" % evidence_stats_file)
    if supported_by_hints_file:
        supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
        supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
        supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file
        self.extract_longest_isoforms(evidence_stats_file, supported_by_hints_longest_pep_evidence,
                                      minimum_supported_fraction=0.00001)
        SequenceRoutines.extract_sequence_by_ids(protein_output, supported_by_hints_longest_pep_ids,
                                                 supported_by_hints_longest_pep)
    evidence_files = (evidence_stats_file, "%s.longest_pep" % evidence_stats_file,
                      "%s.longest_pep" % supported_by_hints_file) if supported_by_hints_file else \
                     (evidence_stats_file,)
    for evidence_file in evidence_files:
        print("Drawing transcript support distribution for %s" % evidence_file)
        MatplotlibRoutines.percent_histogram_from_file(evidence_file, evidence_file, column_list=(2,),
                                                       separator=None, comments="#", n_bins=20,
                                                       title="Transcript support by hints", xlabel="%%",
                                                       ylabel="Number", extensions=["svg", "png"],
                                                       legend_location="upper center",
                                                       stats_as_legend=True)
# Script summarizing genbank records (species, lineage, length, references,
# gene content). Chunk is truncated after the gene-name extraction; the rest of
# the feature loop and output writing lie beyond this view.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", action="store", dest="input_file",
                    help="Input genbank file with sequences")
parser.add_argument("-o", "--output_file", action="store", dest="out_file",
                    help="Output file")
args = parser.parse_args()

tmp_index_file = "temp.idx"
print("Parsing %s..." % args.input_file)
sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format="genbank")
# NOTE(review): gaps_dict is not used in the visible part of this chunk.
gaps_dict = SequenceRoutines.find_gaps(sequence_dict)
with open(args.out_file, "w") as out_fd:
    out_fd.write("#sequence_id\tspecies\tlength\tlineage\treferences\tgenes\trRNA\n")
    for record_id in sequence_dict:
        species = sequence_dict[record_id].annotations["organism"]
        lineage = sequence_dict[record_id].annotations["taxonomy"]
        length = len(sequence_dict[record_id].seq)
        references = ",".join(map(lambda x: x.title, sequence_dict[record_id].annotations['references']))
        protein_genes_list = []
        rRNA_genes_list = []
        for feature in sequence_dict[record_id].features:
            if feature.type == "gene":
                if "gene" in feature.qualifiers:
                    gene_name = feature.qualifiers["gene"][0]
help="Input file with sequences") parser.add_argument("-o", "--output_file", action="store", dest="out_file", help="Output gff with gaps") parser.add_argument( "-f", "--format", action="store", dest="format", default="fasta", help="Format of input. Allowed formats genbank, fasta(default)") args = parser.parse_args() tmp_index_file = "temp.idx" print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format) gaps_dict = SequenceRoutines.find_gaps(sequence_dict) with open(args.out_file, "w") as out_fd: GFF.write( SequenceRoutines.record_by_id_generator(gaps_dict, gaps_dict.keys(), verbose=True), out_fd) os.remove(tmp_index_file)
# Tail of a CLI extracting sequences by id list, with exact or partial id
# matching. The flag name of the id-column option lies before this chunk.
default=0,
    help="Number of column with ids in id file (0-based). Default: 0")
parser.add_argument("-e", "--extraction_mode", action="store", dest="coincidence_mode", default="exact",
                    help="Coincidence mode for id: exact(full, default), partial")
parser.add_argument("-a", "--allow_multiple_coincidence_report", action="store_true",
                    dest="allow_multiple_coincidence_report", default=False,
                    help="Allow multiple coincidence report of sequences for partial coincidence mode."
                         "By default an error is raised")
args = parser.parse_args()

SequenceRoutines.extract_sequence_by_ids(args.input, args.id_file, args.output,
                                         format=args.format, verbose=True,
                                         id_column_number=args.id_column,
                                         coincidence_mode=args.coincidence_mode,
                                         allow_multiple_coincidence_report=args.allow_multiple_coincidence_report)
# Fragment of an assembly-comparison script: per-assembly N50/L50 and length
# statistics are collected into two-level dicts for later reporting/plotting.
# NOTE(review): the first statement belongs to an indexing loop (assembly_label,
# i, tmp_index defined before this chunk); chunk is truncated at both ends.
assemblies_dict[assembly_label] = SeqIO.index_db(tmp_index, args.input_file_list[i], format=args.format)
assembly_N50_dict = TwoLvlDict()
assembly_L50 = TwoLvlDict()
assembly_bins = []
assembly_contig_cumulative_length = OrderedDict()
assembly_contig_number_values = OrderedDict()
assembly_general_stats = TwoLvlDict()
assembly_length_array = OrderedDict()
assembly_lengths = TwoLvlDict()
for assembly in assemblies_dict:
    # one routine call yields every statistic; unpacked positionally
    lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, \
        contig_cumulative_length_values, contig_number_values = \
        SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
                                                  thresholds_list=args.thresholds,
                                                  seq_len_file="%s.%s.len" % (args.output_prefix, assembly))
    assembly_N50_dict[assembly] = N50_dict
    assembly_L50[assembly] = L50_dict
    assembly_contig_cumulative_length[assembly] = contig_cumulative_length_values
    assembly_contig_number_values[assembly] = contig_number_values
    assembly_general_stats[assembly] = OrderedDict()
    assembly_general_stats[assembly]["Ns"] = Ns_number
    assembly_general_stats[assembly]["Longest contig"] = longest_contig
    assembly_general_stats[assembly]["Total length"] = total_length
    assembly_length_array[assembly] = lengths_array
    assembly_lengths[assembly] = length_dict
    # keep the finest-grained bin list across assemblies
    if len(assembly_bins) < len(bins):
        assembly_bins = bins
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import SequenceRoutines

# CLI wrapper: scan CDS sequences for in-frame stop codons.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", dest="input", action="store", required=True,
                    help="Input file with cds sequences")
parser.add_argument("-o", "--output_file_prefix", dest="output_prefix", action="store", required=True,
                    help="Prefix of output files")
parser.add_argument("-f", "--format", dest="format", action="store", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-g", "--genetic_code", dest="genetic_code", action="store", type=int, default=1,
                    help="Number of NCBI genetic code table to use. Default - 1(universal) ")

args = parser.parse_args()

SequenceRoutines.check_cds_for_stop_codons_from_file(args.input,
                                                     args.output_prefix,
                                                     genetic_code_table=args.genetic_code,
                                                     format=args.format)
# CLI: trim raw CDS to a multiple of three and strip terminal stop codons.
# Python 2 script (print statement below).
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input fasta with raw cDS")
parser.add_argument("-s", "--stop_codons", action="store", dest="stop_codons_list",
                    default=["TGA", "TAA", "TAG"], type=lambda s: s.split(","),
                    help="Comma-separated list of stop codons. Can be set using any case and both RNA and DNA alphabet")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file to write trimmed CDS")
args = parser.parse_args()

print "Using %s as stop codons" % ",".join(args.stop_codons_list)

SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(args.input, args.output, args.stop_codons_list)
help="Input file A with sequences") parser.add_argument("-b", "--input_file_B", action="store", dest="input_file_b", required=True, help="Input file A with sequences") parser.add_argument("-o", "--output_file_prefix", action="store", dest="output_prefix", required=True, help="Prefix of output files") parser.add_argument( "-f", "--format", action="store", dest="format", default="fasta", help= "Format of input and output files. Allowed formats genbank, fasta(default)" ) args = parser.parse_args() SequenceRoutines.compare_sequences_from_files(args.input_file_a, args.input_file_b, args.output_prefix, format=args.format, verbose=True)
out_right_se = "%s_2.se.fastq" % args.out_prefix out_left_fd = open("%s_1.fastq" % args.out_prefix, "w") out_right_fd = open("%s_2.fastq" % args.out_prefix, "w") out_left_se_fd = open("%s_1.se.fastq" % args.out_prefix, "w") out_right_se_fd = open("%s_2.se.fastq" % args.out_prefix, "w") left_input_reads_dict = SeqIO.index_db("left_in_reads.idx", args.input_left, "fastq") right_input_reads_dict = SeqIO.index_db("right_in_reads.idx", args.input_right, "fastq") left_input_set = set(left_input_reads_dict.keys()) right_input_set = set(right_input_reads_dict.keys()) SeqIO.write(SequenceRoutines.record_by_id_generator(left_input_reads_dict, sorted(left_input_set & right_input_set), verbose=True), out_left, "fastq") SeqIO.write(SequenceRoutines.record_by_id_generator(right_input_reads_dict, sorted(left_input_set & right_input_set), verbose=True), out_right, "fastq") SeqIO.write(SequenceRoutines.record_by_id_generator(left_input_reads_dict, left_input_set - right_input_set, verbose=True), out_left_se, "fastq") SeqIO.write(SequenceRoutines.record_by_id_generator(right_input_reads_dict, right_input_set - left_input_set, verbose=True), out_right_se, "fastq") out_left_fd.close() out_right_fd.close() out_left_se_fd.close() out_right_se_fd.close() os.remove("left_in_reads.idx")
parser.add_argument("-o", "--output_file_prefix", action="store", dest="output_prefix", required=True, help="Prefix of output files") parser.add_argument( "-f", "--format", action="store", dest="format", default="fasta", help= "Format of input and output files. Allowed formats genbank, fasta(default)" ) parser.add_argument( "-s", "--stop_codons", action="store", dest="stop_codons", default=("*", "."), type=lambda s: set(s.split(",")), help="Comma-separated list of stop codon symbols. Default - '.', '*'") args = parser.parse_args() SequenceRoutines.check_proteins_for_stop_codons_from_file( args.input, args.output_prefix, stop_codon_symbol_set=args.stop_codons, format=args.format)
import argparse

from Routines import SequenceRoutines

# CLI wrapper: count species occurring in a sequence file.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input genbank file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with species counts")
parser.add_argument("-f", "--format", action="store", dest="format", default="genbank",
                    help="Format of input file. Default - genbank ")
args = parser.parse_args()

# BUG FIX: the output filename was hard-coded to "count_species.count",
# silently ignoring the required -o/--output option.
SequenceRoutines.count_species_from_file(args.input, format=args.format,
                                         output_filename=args.output)
# Fragment of a sequence-splitting script: write fixed-size batches of records
# into numbered fasta files, plus one final partial batch.
try:
    os.mkdir(args.output_directory)
except OSError:
    # directory already exists
    pass

split_index = 1
records_written = 0
record_ids_list = list(sequence_dict.keys())
number_of_records = len(record_ids_list)

# NOTE(review): files are written into args.splited_directory while the mkdir
# above targets args.output_directory — confirm both options exist and agree.
while (records_written + args.number_of_sequences) <= number_of_records:
    SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_dict,
                    record_ids_list[records_written:records_written + args.number_of_sequences],
                    verbose=True),
                "%s/%s_%i.fasta" % (args.splited_directory, args.output_prefix, split_index),
                format="fasta")
    split_index += 1
    records_written += args.number_of_sequences
# remainder that did not fill a whole batch
if records_written != number_of_records:
    SeqIO.write(SequenceRoutines.record_by_id_generator(
                    sequence_dict,
                    record_ids_list[records_written:],
                    verbose=True),
                "%s/%s_%i.fasta" % (args.splited_directory, args.output_prefix, split_index),
                format="fasta")
action="store", dest="max_length", type=int, help="Maximum length of sequence to store. Default: filter not set") parser.add_argument( "-d", "--id_file", action="store", dest="id_file", help="File to write ids of extracted sequences. Default - don't write") args = parser.parse_args() SequenceRoutines.extract_sequences_by_length_from_file( args.input_file, args.output_file, min_len=args.min_length, max_len=args.max_length, format=args.format, tmp_index_file="tmp.idx", id_file=args.id_file) """ if (args.min_length is None) and (args.max_length is None): raise ValueError("Both minimum and maximum lengths were not set") elif (args.min_length is not None) and (args.max_length is not None) and (args.min_length > args.max_length): raise ValueError("Minimum length is greater then maximum lengths") tmp_index_file = "temp.idx" print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format) if (args.min_length is not None) and (args.max_length is not None):
""" AUGUSTUS.parallel_predict(args.species, args.input, output_raw_gff, strand=args.strand, gene_model=args.gene_model, output_gff3=True, other_options=args.other_options, config_dir=args.config_dir, use_softmasking=args.softmasking, hints_file=args.hintsfile, extrinsicCfgFile=args.extrinsicCfgFile, predict_UTR=args.predict_UTR) """ AUGUSTUS.replace_augustus_ids(output_raw_gff, args.output, species_prefix=args.species_prefix, number_of_digits_in_id=8) Gffread.extract_transcript_sequences(output_gff, args.input, args.output) SequenceRoutines.trim_cds_and_remove_terminal_stop_codons( "%s.cds" % args.output, "%s.trimmed.cds" % args.output, stop_codons_list=("TGA", "TAA", "TAG") ) # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins SequenceRoutines.translate_sequences_from_file( "%s.trimmed.cds" % args.output, "%s.trimmed.pep" % args.output, format="fasta", id_expression=None, genetic_code_table=1, translate_to_stop=False, prefix_of_file_inframe_stop_codons_seqs= prefix_of_file_inframe_stop_codons_seqs) # Universal code !!! AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids) AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff) if args.masking:
help= "Format of input and output file. Allowed formats genbank, fasta(default)") parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode", default="index_db", help="Parsing mode for input sequence file. " "Possible variants: 'index_db'(default), 'index', 'parse'") args = parser.parse_args() """ example of usage ~/Soft/MAVR/scripts/sequence/filter_sequences_by_id_expression.py -i GSS_BOH_BAC_end.fa \ -a GSS_BOH_BAC_end.forward.fa \ -b GSS_BOH_BAC_end.reverse.fa \ -e "\.F$" -p parse """ SequenceRoutines.filter_seq_by_reg_expression_from_file( args.input, args.regular_expression, args.filtered_file, args.filtered_out_file, parsing_mode=args.parsing_mode, format=args.format, index_file="tmp.idx", retain_index=False, reg_exp_flags=0)
# Fragment of a script collapsing protein isoforms per gene and selecting the
# longest isoform. Truncated inside the per-gene loop; the longest-isoform
# bookkeeping continues beyond this view.
syn_dict = SynDict()
# gene id (column 1) -> comma-separated isoform ids (column 0)
syn_dict.read(pep_uniq_description_no_isoform_versions, header=False, separator="\t",
              allow_repeats_of_key=True, split_values=True, values_separator=",",
              key_index=1, value_index=0, comments_prefix="#")
syn_dict.write(pep_description_collapsed_isoforms, splited_values=True, values_separator=",")

length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input, format="fasta", out_file=len_file)

descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w")
descr_longest_isoform_fd = open(pep_description_longest_isoform, "w")
descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w")

for gene in syn_dict:
    len_list = []
    longest_isoform = None
    max_len = 0
    # track the maximal isoform length for this gene
    for isoform_id in syn_dict[gene]:
        length = length_dict[isoform_id]
        len_list.append(length)
        if length > max_len:
            max_len = length