def calculate_p_distance(self):
    """
    Compute the symmetric matrix of pairwise p-distances between all records.

    Requires an alignment-like collection (all sequences of equal length) and
    a non-generator parsing mode.

    Returns:
        pandas.DataFrame indexed and columned by scaffold ids; diagonal is 0.

    Raises:
        ValueError: in generator parsing mode, or if sequence lengths differ.
    """
    from RouToolPa.Routines import SequenceRoutines
    if self.parsing_mode == "generator":
        raise ValueError(
            "ERROR!!! P distance calculation was not implemented for generator mode!"
        )
    # Lazily collect per-sequence lengths if they were not computed yet
    if self.seq_lengths is None:
        self.get_stats_and_features(count_gaps=False, sort=False)
    seq_len = self.seq_lengths["length"].unique()
    # p-distance is only defined for an alignment: one common length required
    if len(seq_len) > 1:
        raise ValueError("ERROR!!! Some sequences have different length!")
    else:
        seq_len = seq_len[0]
    distance_df = pd.DataFrame(0, index=self.scaffolds, columns=self.scaffolds)
    # The matrix is symmetric, so compute each unordered pair once and fill
    # both cells (the original recomputed every pair twice).
    scaffold_list = list(self.scaffolds)
    for index_a, record_id_a in enumerate(scaffold_list):
        for record_id_b in scaffold_list[index_a + 1:]:
            distance_df.loc[record_id_a, record_id_b] = distance_df.loc[
                record_id_b, record_id_a] = SequenceRoutines.p_distance(
                    self.records[record_id_a],
                    self.records[record_id_b],
                    seq_len)
    return distance_df
def correct_coordinates(self, sequence_dict):
    # Recompute primer start coordinates by locating each primer's sequence
    # within the target sequence (sequence_dict[self.seq_id]).
    # NOTE(review): str.find returns -1 when the primer is absent; that case
    # is not handled here - confirm callers guarantee the primers are present.
    for primer_pair in self.primer_pair_list:
        primer_pair.left_primer.start = sequence_dict[self.seq_id].find(
            primer_pair.left_primer.seq)
        # The right primer is searched as its reverse complement (presumably it
        # is stored relative to the minus strand - TODO confirm); its start is
        # the last position of the match on the plus strand (index + length - 1).
        primer_pair.right_primer.start = sequence_dict[self.seq_id].find(
            SequenceRoutines.reverse_complement(
                primer_pair.right_primer.seq)
        ) + primer_pair.right_primer.length - 1
def extract_proteins_from_selected_families(
        families_id_file, fam_file, pep_file, output_dir="./",
        pep_format="fasta", out_prefix=None, create_dir_for_each_family=False):
    # Extract protein sequences of selected families into per-family files.
    #
    # families_id_file -- file with ids of families to extract; if falsy, all
    #                     families present in fam_file are processed
    # fam_file         -- family table: family id -> comma-separated protein ids
    # pep_file         -- sequence file containing all proteins
    # output_dir       -- directory to write output files into
    # pep_format       -- format of input and output sequence files
    # out_prefix       -- output file name prefix; when set, one directory per
    #                     family is forced (otherwise files would overwrite
    #                     each other, since they'd all share the same name)
    # create_dir_for_each_family -- create a subdirectory per family
    from RouToolPa.Routines import SequenceRoutines
    fam_id_list = IdList()
    fam_dict = SynDict()
    #print(pep_file)
    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    create_directory_for_each_family = True if out_prefix else create_dir_for_each_family
    if families_id_file:
        fam_id_list.read(families_id_file)
    fam_dict.read(fam_file, split_values=True, values_separator=",")
    # On-disk index avoids loading the whole protein file into memory
    protein_dict = SeqIO.index_db("tmp.idx", pep_file, format=pep_format)
    for fam_id in fam_id_list if families_id_file else fam_dict:
        if fam_id in fam_dict:
            if create_directory_for_each_family:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.pep" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                out_file = "%s/%s.pep" % (out_dir, out_prefix if out_prefix else fam_id)
            SeqIO.write(SequenceRoutines.record_by_id_generator(
                protein_dict, fam_dict[fam_id], verbose=True),
                out_file, format=pep_format)
        else:
            # Family id requested but absent from the family table
            print("%s was not found" % fam_id)
    os.remove("tmp.idx")
help= "Format of input and output file. Allowed formats genbank, fasta(default)") parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode", default="index_db", help="Parsing mode for input sequence file. " "Possible variants: 'index_db'(default), 'index', 'parse'") args = parser.parse_args() """ example of usage ~/Soft/MAVR/scripts/sequence/filter_sequences_by_id_expression.py -i GSS_BOH_BAC_end.fa \ -a GSS_BOH_BAC_end.forward.fa \ -b GSS_BOH_BAC_end.reverse.fa \ -e "\.F$" -p parse """ SequenceRoutines.filter_seq_by_reg_expression_from_file( args.input, args.regular_expression, args.filtered_file, args.filtered_out_file, parsing_mode=args.parsing_mode, format=args.format, index_file="tmp.idx", retain_index=False, reg_exp_flags=0)
def extract_proteins_from_output(self, augustus_output, protein_output,
                                 evidence_stats_file=None,
                                 supported_by_hints_file=None,
                                 complete_proteins_id_file=None,
                                 id_prefix="p."):
    """
    Extract protein sequences (and, optionally, hint-support statistics)
    from an AUGUSTUS output file.

    augustus_output          -- AUGUSTUS output file to parse
    protein_output           -- fasta-like file to write extracted proteins to
    evidence_stats_file      -- if set, write per-transcript hint-support stats
    supported_by_hints_file  -- if set, write stats only for transcripts with
                                supported fraction > 0
    complete_proteins_id_file -- if set, write ids of proteins having both
                                start and stop codons
    id_prefix                -- prefix prepended to transcript ids in output

    FIX: opening supported_by_hints_file was wrongly guarded by
    evidence_stats_file (crashing on open(None) when only the stats file was
    requested, and never opening when only the hints file was requested).
    Also closes sup_fd/complete_fd, which previously leaked.
    """
    if evidence_stats_file:
        ev_fd = open(evidence_stats_file, "w")
        ev_fd.write(
            "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t"
        )
        ev_fd.write(
            "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
        )
    if supported_by_hints_file:
        sup_fd = open(supported_by_hints_file, "w")
        sup_fd.write(
            "#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t"
        )
        sup_fd.write(
            "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n"
        )
    if complete_proteins_id_file:
        complete_fd = open(complete_proteins_id_file, "w")
    with open(protein_output, "w") as out_fd:
        with open(augustus_output, "r") as in_fd:
            for line in in_fd:
                if line[:12] == "# start gene":
                    gene = line.strip().split()[-1]
                elif "\ttranscript\t" in line:
                    # New transcript: remember its id, reset codon flags
                    transcript_id = line.split("\t")[8].split(
                        ";")[0].split("=")[1]
                    start_presence = False
                    stop_presence = False
                elif "\tstart_codon\t" in line:
                    start_presence = True
                elif "\tstop_codon\t" in line:
                    stop_presence = True
                elif "# protein sequence" in line:
                    # Protein may span several comment lines, terminated by "]"
                    protein = line.strip().split("[")[-1]
                    if "]" in protein:
                        protein = protein.split("]")[0]
                    else:
                        while True:
                            part = in_fd.readline().split()[-1]
                            if "]" in part:
                                protein += part.split("]")[0]
                                break
                            else:
                                protein += part
                    if complete_proteins_id_file:
                        if start_presence and stop_presence:
                            complete_fd.write("%s%s\n" %
                                              (id_prefix, transcript_id))
                    out_fd.write(
                        ">%s%s\t gene=%s start_presence=%s stop_presence=%s\n"
                        % (id_prefix, transcript_id, gene,
                           str(start_presence), str(stop_presence)))
                    out_fd.write(protein)
                    protein_len = len(protein)
                    out_fd.write("\n")
                elif evidence_stats_file or supported_by_hints_file:
                    # NOTE(review): protein_len is set when the protein record
                    # is parsed; this branch assumes the protein always
                    # precedes the stats block - confirm against AUGUSTUS output.
                    if line[:17] == "# % of transcript":
                        supported_fraction = line.strip().split()[-1]
                        while True:
                            tmp_line = in_fd.readline()
                            if tmp_line[:12] == "# CDS exons:":
                                cds_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:14] == "# CDS introns:":
                                introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:13] == "# 5'UTR exons":
                                five_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:13] == "# 3'UTR exons":
                                three_introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:27] == "# incompatible hint groups:":
                                # Last stats line of the block: flush and stop
                                incompatible_hint_groups = tmp_line.strip().split()[-1]
                                if evidence_stats_file:
                                    ev_fd.write("%s\t%s\t%s\t" %
                                                (gene, transcript_id,
                                                 supported_fraction))
                                    ev_fd.write(
                                        "%s\t%s\t%s\t%s\t%s\t%i\n" %
                                        (cds_support, introns_support,
                                         five_utr_support,
                                         three_introns_support,
                                         incompatible_hint_groups,
                                         protein_len))
                                if supported_by_hints_file and (
                                        float(supported_fraction) > 0):
                                    sup_fd.write("%s\t%s\t%s\t" %
                                                 (gene, transcript_id,
                                                  supported_fraction))
                                    sup_fd.write(
                                        "%s\t%s\t%s\t%s\t%s\t%i\n" %
                                        (cds_support, introns_support,
                                         five_utr_support,
                                         three_introns_support,
                                         incompatible_hint_groups,
                                         protein_len))
                                break
    if complete_proteins_id_file:
        complete_fd.close()
    if evidence_stats_file:
        ev_fd.close()
        self.extract_longest_isoforms(evidence_stats_file,
                                      "%s.longest_pep" % evidence_stats_file,
                                      minimum_supported_fraction=0)
        SequenceRoutines.extract_sequence_by_ids(
            protein_output, "%s.longest_pep.ids" % evidence_stats_file,
            "%s.longest_pep.pep" % evidence_stats_file)
    if supported_by_hints_file:
        sup_fd.close()
        supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
        supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
        supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file
        # NOTE(review): this reads evidence_stats_file, so the hints-only mode
        # presumably still requires evidence_stats_file to be set - confirm.
        self.extract_longest_isoforms(
            evidence_stats_file, supported_by_hints_longest_pep_evidence,
            minimum_supported_fraction=0.00001)
        SequenceRoutines.extract_sequence_by_ids(
            protein_output, supported_by_hints_longest_pep_ids,
            supported_by_hints_longest_pep)
    evidence_files = (evidence_stats_file,
                      "%s.longest_pep" % evidence_stats_file,
                      "%s.longest_pep" % supported_by_hints_file) if supported_by_hints_file else \
                     (evidence_stats_file,)
    for evidence_file in evidence_files:
        print("Drawing transcript support distribution for %s" % evidence_file)
        MatplotlibRoutines.percent_histogram_from_file(
            evidence_file, evidence_file, column_list=(2, ),
            separator=None, comments="#", n_bins=20,
            title="Transcript support by hints", xlabel="%%",
            ylabel="Number", extensions=["svg", "png"],
            legend_location="upper center", stats_as_legend=True)
# Command-line wrapper: trim raw CDS sequences and strip terminal stop codons.
parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input fasta with raw cDS")
# The comma-separated CLI string is split into a list of codons by type=
parser.add_argument("-s", "--stop_codons", action="store", dest="stop_codons_list",
                    default=["TGA", "TAA", "TAG"],
                    type=lambda s: s.split(","),
                    help="Comma-separated list of stop codons. Can be set using any case and both RNA and DNA alphabet."
                         "Default: TGA, TAA, TAG")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file to write trimmed CDS")

args = parser.parse_args()

print("Using %s as stop codons" % ",".join(args.stop_codons_list))

SequenceRoutines.trim_cds_and_remove_terminal_stop_codons(
    args.input, args.output, args.stop_codons_list)
# Command-line wrapper: prepare a reference genome for GATK (presumably builds
# the .fai index and sequence dictionary via samtools/PICARD - see the routine).
import argparse

from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-r", "--reference", action="store", dest="reference",
                    required=True, help="File with reference genome")
parser.add_argument("-s", "--samtools_directory", action="store", dest="samtools_dir",
                    default="", help="Directory with samtools binaries")
parser.add_argument("-p", "--picard_directory", action="store", dest="picard_dir",
                    default="", help="Directory with PICARD jar")

args = parser.parse_args()

SequenceRoutines.prepare_reference_for_GATK(args.reference,
                                            picard_dir=args.picard_dir,
                                            samtools_dir=args.samtools_dir)
syn_dict = SynDict() syn_dict.read(pep_uniq_description_no_isoform_versions, header=False, separator="\t", allow_repeats_of_key=True, split_values=True, values_separator=",", key_index=1, value_index=0, comments_prefix="#") syn_dict.write(pep_description_collapsed_isoforms, splited_values=True, values_separator=",") length_dict = SequenceRoutines.get_lengths_from_seq_file(args.input, format="fasta", out_file=len_file) descr_with_len_fd = open(pep_description_collapsed_isoforms_with_len, "w") descr_longest_isoform_fd = open(pep_description_longest_isoform, "w") descr_longest_isoform_ids_fd = open(pep_description_longest_isoform_ids, "w") for gene in syn_dict: len_list = [] longest_isoform = None max_len = 0 for isoform_id in syn_dict[gene]: length = length_dict[isoform_id] len_list.append(length) if length > max_len: max_len = length
# Scan a two-lines-per-record sequence file (header line, then sequence line)
# for homopolymer runs of args.nucleotide and write a per-record summary table.
# FIX: the input file handle was opened bare and never closed; both files are
# now managed by a single `with` so they are closed even on error.
max_length_list = []
number_poly_list = []
number_of_UTRs = 0
with open(args.in_file, "r") as in_fd, open(args.out_file, "w") as out_fd:
    out_fd.write(
        "#main_id\tUTR_length\tmax_homopolymer_length\tnumber_homopolymers\tCoordinates_list\tOther_ids\n"
    )
    for line in in_fd:
        name_line = line.strip()
        # Record sequence is assumed to occupy exactly the next line
        sequence = in_fd.readline().strip()
        number_of_UTRs += 1
        coords_list, length_list = SequenceRoutines.find_homopolymers(
            sequence, args.nucleotide, min_size=args.min_size,
            search_type=args.search_type,
            max_single_insert_size=args.max_single_insert_size,
            max_total_insert_length=args.max_total_insert_length,
            max_number_of_insertions=args.max_number_of_insertions)
        # Records without any homopolymer are not reported
        if not coords_list:
            continue
        # Header format is presumably "...|id1,id2,..." - TODO confirm
        id_list = name_line.split("|")[1].split(",")
        max_length = max(length_list)
        number_of_homopolymers = len(length_list)
        max_length_list.append(max_length)
        number_poly_list.append(number_of_homopolymers)
        coords_str_list = map(lambda x: "(%i,%i)" % (x[0], x[1]), coords_list)
        out_fd.write(
            "%s\t%i\t%i\t%i\t%s\t%s\n" %
            (id_list[0], len(sequence), max_length, number_of_homopolymers,
             ",".join(coords_str_list), ",".join(id_list)))
# Command-line wrapper: build an id-to-species accordance table from a
# sequence file (genbank by default).
import argparse

from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input genbank file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with species_counts")
parser.add_argument("-f", "--format", action="store", dest="format", default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

SequenceRoutines.get_id_to_species_accordance_from_file(args.input,
                                                        format=args.format,
                                                        output=args.output)
dest="prefix", help="Prefix of output files") parser.add_argument("-o", "--output_directory", action="store", dest="output_dir", help="Directory to write output files") parser.add_argument("-n", "--num_of_records_per_file", action="store", dest="num_of_records_per_file", type=int, help="Number of sequences per output file") parser.add_argument("-f", "--num_of_out_files", action="store", dest="num_of_out_files", type=int, help="Number of output files") args = parser.parse_args() if args.num_of_records_per_file and args.num_of_out_files: raise ValueError("Options -n and -f can't be set simultaneously") SequenceRoutines.split_fasta(args.input, args.output_dir, num_of_recs_per_file=args.num_of_records_per_file, num_of_files=args.num_of_out_files, output_prefix=args.prefix)
"--black_list", action="store", dest="black_list", help="File with record ids from black list") parser.add_argument("-w", "--white_list", action="store", dest="white_list", help="File with record ids from white list") parser.add_argument("-m", "--masking", action="store", dest="masking", help="0-based BED file with regions to mask") parser.add_argument("-t", "--trimming", action="store", dest="trimming", help="0-based BED file with regions to trim") args = parser.parse_args() SequenceRoutines.correct_sequences_from_file( args.input, args.output, black_list_file=args.black_list, white_list_file=args.white_list, regions_to_trim_file=args.trimming, regions_to_mask_file=args.masking, parsing_mode="parse", format=args.format)
"-i", "--input", action="store", dest="input", required=True, type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")), help= "Comma-separated list of genbank files/directories with transcript annotations" ) parser.add_argument("-o", "--output", action="store", dest="output", required=True, help="Output file") parser.add_argument("-d", "--id_file", action="store", dest="id_file", help="File with id of transcripts to deal with") args = parser.parse_args() if args.id_file: id_list = IdList() id_list.read(args.id_file) else: id_list = None SequenceRoutines.extract_introns_from_transcripts_from_genbank_files( args.input, args.output, transcript_id_white_list=id_list)
# Command-line wrapper: count species occurrences in a sequence file.
import argparse

from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    help="Input genbank file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with species counts")
parser.add_argument("-f", "--format", action="store", dest="format", default="genbank",
                    help="Format of input file. Default - genbank ")

args = parser.parse_args()

# FIX: the required -o/--output option was previously ignored - a hardcoded
# "count_species.count" filename was passed to the routine instead.
SequenceRoutines.count_species_from_file(args.input,
                                         format=args.format,
                                         output_filename=args.output)
# Command-line wrapper: assign sequential ids to all input records and write a
# synonym table of new-to-old id correspondences.
# NOTE(review): argparse and make_list_of_path_to_files are used but not
# imported in this chunk - presumably imported earlier in the file; confirm.
from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file_list", action="store", dest="input", required=True,
                    type=lambda s: make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of input files/directories with sequences")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file with renamed sequences")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-s", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File to write correspondences between new and old ids")
parser.add_argument("-n", "--numerical_part_of_id_length", action="store",
                    dest="numerical_part_of_id_length", default=8, type=int,
                    help="Length of numerical part of id. Default: 8")
parser.add_argument("-d", "--id_prefix", action="store", dest="id_prefix", required=True,
                    help="Prefix of new sequence ids")
parser.add_argument("-l", "--clear_description", action="store_true", dest="clear_description",
                    default=False,
                    help="Clear description. Default - False")

args = parser.parse_args()

SequenceRoutines.rename_records_by_sequential_ids_from_files(
    args.input, args.output, args.syn_file,
    format=args.format,
    clear_description=args.clear_description,
    record_id_prefix=args.id_prefix,
    length_of_numerical_part=args.numerical_part_of_id_length,
    parse_mode="parse", index_file="temp.idx")
# Command-line wrapper: compare sequences from two files by length.
import argparse

from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-a", "--seq_file_a", action="store", dest="seq_file_a", required=True,
                    help="Sequence file A")
parser.add_argument("-b", "--seq_file_b", action="store", dest="seq_file_b", required=True,
                    help="Sequence file B")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")

args = parser.parse_args()

SequenceRoutines.compare_sequences_by_length_from_file(args.seq_file_a,
                                                       args.seq_file_b,
                                                       args.output_prefix)
#!/usr/bin/env python __author__ = 'Sergei F. Kliver' import argparse from RouToolPa.Routines import SequenceRoutines parser = argparse.ArgumentParser() parser.add_argument("-i", "--input_file", action="store", dest="input", required=True, help="Input file with protein sequences") parser.add_argument("-o", "--output_file_prefix", action="store", dest="output_prefix", required=True, help="Prefix of output files") parser.add_argument("-f", "--format", action="store", dest="format", default="fasta", help="Format of input and output files. Allowed formats genbank, fasta(default)") parser.add_argument("-s", "--stop_codons", action="store", dest="stop_codons", default=("*", "."), type=lambda s: set(s.split(",")), help="Comma-separated list of stop codon symbols. Default - '.', '*'") args = parser.parse_args() SequenceRoutines.check_proteins_for_stop_codons_from_file(args.input, args.output_prefix, stop_codon_symbol_set=args.stop_codons, format=args.format)
# CLI options and call for building a BED file of whole-sequence regions.
parser.add_argument("-i", "--input", action="store", dest="input", required=True,
                    type=lambda s: FileRoutines.make_list_of_path_to_files(s.split(",")),
                    help="Comma-separated list of files/directories with sequences")
parser.add_argument("-o", "--output_bed_file", action="store", dest="output", required=True,
                    help="Output bed file")
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="Format of input files. Allowed formats genbank, fasta(default)")
parser.add_argument("-w", "--white_list_ids", action="store", dest="white_list_ids",
                    help="File with ids of regions from white list")
parser.add_argument("-b", "--black_list_ids", action="store", dest="black_list_ids",
                    help="File with ids of regions from black list")
parser.add_argument("-e", "--bed_format", action="store", dest="bed_format", default="0-based",
                    help="Format of output bed format. Allowed: 0-based(default), 1-based")
# NOTE(review): -m/-x are parsed as strings (no type=int); presumably converted
# inside make_region_bed_file_from_file - confirm.
parser.add_argument("-m", "--min_len", action="store", dest="min_len",
                    help="Minimum length of sequence to count. Default: not set")
parser.add_argument("-x", "--max_len", action="store", dest="max_len",
                    help="Maximum length of sequence to count. Default: not set")
parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode",
                    default="index_db",
                    help="Parsing mode for input sequence file. "
                         "Possible variants: 'index_db'(default), 'index', 'parse'")

args = parser.parse_args()

SequenceRoutines.make_region_bed_file_from_file(args.input, args.output,
                                                white_id_file=args.white_list_ids,
                                                black_id_file=args.black_list_ids,
                                                output_format=args.bed_format,
                                                input_format=args.format,
                                                min_len=args.min_len,
                                                max_len=args.max_len,
                                                parsing_mode=args.parsing_mode,
                                                index_file="tmp.idx",
                                                retain_index=False)
"-e", "--end_column_id", action="store", dest="end_column_id", type=int, default=2, help="0-based index of column with feature end. Default: 2") parser.add_argument( "-n", "--coordinates_type", action="store", dest="coordinates_type", default="1-based", help="Type of coordinates. Allowed: 0-based, 1-based(default)") args = parser.parse_args() SequenceRoutines.split_sequence_by_regions_from_file( args.input, args.regions, args.output_prefix, retain_description=False, min_length=args.min_length, parsing_mode="parse", scaffold_column_index=args.scaffold_column_id, start_column_index=args.start_column_id, end_column_index=args.end_column_id, coordinates_type=args.coordinates_type, input_separator="\t", sequence_format="fasta")
# Split an in-memory sequence dict into chunks of args.number_of_sequences
# records, writing each chunk to its own numbered fasta file.
# NOTE(review): the directory created is args.output_directory, but output
# files are written into args.splited_directory - confirm these refer to the
# same location.
try:
    os.mkdir(args.output_directory)
except OSError:
    # Directory already exists (or creation failed) - ignore and proceed
    pass

split_index = 1
records_written = 0
record_ids_list = list(sequence_dict.keys())
number_of_records = len(record_ids_list)

# Write all full-size chunks first...
while (records_written + args.number_of_sequences) <= number_of_records:
    SeqIO.write(SequenceRoutines.record_by_id_generator(
        sequence_dict,
        record_ids_list[records_written:records_written + args.number_of_sequences],
        verbose=True),
        "%s/%s_%i.fasta" % (args.splited_directory, args.output_prefix, split_index),
        format="fasta")
    split_index += 1
    records_written += args.number_of_sequences

# ...then the remainder, if any records are left over
if records_written != number_of_records:
    SeqIO.write(SequenceRoutines.record_by_id_generator(
        sequence_dict, record_ids_list[records_written:], verbose=True),
        "%s/%s_%i.fasta" % (args.splited_directory, args.output_prefix, split_index),
        format="fasta")
"--parsing_mode", action="store", dest="parsing_mode", default="parse", help="Parsing mode of sequence files. Allowed: parse, index, index_db." "Default: parse") parser.add_argument( "-g", "--genetic_code_table", action="store", dest="genetic_code_table", default=1, type=int, help="Genetic code to use for translation of transcript. " "Allowed: table number from http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi" "Default: 1(The standard code)") args = parser.parse_args() SequenceRoutines.find_cds_coordinates_in_transcript_by_pep_from_file( args.transcript_file, args.pep_file, args.correspondence_file, args.output_prefix, parsing_mode=args.parsing_mode, verbose=args.verbose, format=args.format, transcript_index_file=None, protein_index_file=None, genetic_code_table=args.genetic_code_table)
default="selenocystein_proteins", help="Prefix of output files") parser.add_argument( "-f", "--format", action="store", dest="format", default="fasta", help= "Format of input and output files. Allowed formats genbank, fasta(default)" ) args = parser.parse_args() SequenceRoutines.check_selenocystein_presence_from_file(args.input, args.output_prefix, format="fasta") """ tmp_index_file = "temp.idx" print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format) selenocystein_ids = [] with open(args.out_prefix + ".ids", "w") as out_fd: for record_id in sequence_dict: if "U" in sequence_dict[record_id].seq: selenocystein_ids.append(record_id) out_fd.write(record_id + "\n") SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_dict, selenocystein_ids), "%s_seq.fasta" % args.out_prefix, args.format) os.remove(tmp_index_file)
"-w", "-transcript_with_no_pep_idfile", action="store", dest="transcript_with_no_pep_idfile", help= "File to write ids of transcripts with no protein hit. Default: not set") parser.add_argument( "-s", "-transcript_with_several_pep_idfile", action="store", dest="transcript_with_several_pep_idfile", help= "File to write ids of transcripts with several protein. Default: not set") args = parser.parse_args() SequenceRoutines.get_transcript_to_pep_accordance_from_files( args.transcript_file, args.pep_file, args.out, verbose=args.verbose, parsing_mode=args.parsing_mode, genetic_code_table=args.genetic_code_table, include_id_check=args.id_check, transcript_with_no_pep_idfile=args.transcript_with_no_pep_idfile, transcript_with_several_proteins_idfile=args. transcript_with_several_pep_idfile) if args.parsing_mode == "index_db": os.remove("transcript_tmp.idx") os.remove("pep_tmp.idx")
# Command-line wrapper: extract random species genomes from genbank input.
import argparse

from RouToolPa.Routines import SequenceRoutines

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_file", action="store", dest="input", required=True,
                    help="Comma separated list of genbank files/directories")
parser.add_argument("-o", "--output_file_prefix", action="store", dest="output_prefix",
                    required=True, help="Prefix of output files")
parser.add_argument("-f", "--format", action="store", dest="format", default="genbank",
                    help="Format of input and output file. Allowed formats genbank(default), fasta")

args = parser.parse_args()

SequenceRoutines.get_random_species_genomes_from_genbank_file(
    args.input, args.output_prefix, output_type=args.format)
# CLI options and call for marking proteins by their exon structure from a
# genbank annotation file.
parser.add_argument("-i", "--input", action="store", dest="input",
                    help="Genbank file with annotations")
parser.add_argument("--fast_parsing", action="store_true", dest="fast_parsing",
                    help="Fast parsing mode - high memory consumption. Default: false")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix",
                    default="output", help="Prefix of output files")

args = parser.parse_args()

# Fast mode loads the whole genbank file into memory; otherwise an on-disk
# index (temp_index.idx) is built to keep memory usage low.
record_dict = SeqIO.to_dict(SeqIO.parse(
    args.input, format="genbank")) if args.fast_parsing else SeqIO.index_db(
        "temp_index.idx", [args.input], format="genbank")

SequenceRoutines.get_protein_marking_by_exons_from_genbank(
    record_dict, args.output_prefix,
    protein_id_field_in_cds_feature="protein_id")
# NOTE(review): the temporary index is left on disk (removal is commented out)
#os.remove("temp_index.idx")
# CLI options and call for counting softmasked (lowercase) nucleotides.
parser.add_argument("-o", "--output_file_prefix", action="store",
                    dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-f", "--format", action="store", dest="format",
                    default="fasta",
                    help="Format of input and output files. Allowed formats genbank, fasta(default)")
parser.add_argument("-p", "--print_stats", action="store_true", dest="print_stats",
                    help="Print stats. Default: False")

args = parser.parse_args()

# -p only toggles verbose echoing of the stats; files are written regardless
SequenceRoutines.count_softmasked_nucleotides_from_file(
    args.input, args.output_prefix, verbose=args.print_stats,
    parsing_mode="parse", format=args.format, index_file=None)
args = parser.parse_args() if args.labels_list is not None: if len(args.labels_list) != len(args.input_file_list): raise ValueError( "Length of labels list is not equal to number of files with assemblies" ) assemblies_dict = OrderedDict() for i in range(0, len(args.input_file_list)): assembly_label = args.labels_list[i] if args.labels_list else "A%i" % (i + 1) tmp_index = "%s.tmp.idx" % assembly_label assemblies_dict[assembly_label] = SequenceRoutines.parse_seq_file( args.input_file_list[i], args.parsing_mode, format=args.format, index_file=tmp_index) #SeqIO.index_db(tmp_index, args.input_file_list[i],format=args.format) assembly_N50_dict = TwoLvlDict() assembly_L50 = TwoLvlDict() assembly_bins = [] assembly_contig_cumulative_length = OrderedDict() assembly_contig_number_values = OrderedDict() assembly_general_stats = TwoLvlDict() assembly_length_array = OrderedDict() assembly_lengths = TwoLvlDict() for assembly in assemblies_dict: lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, contig_cumulative_length_values, \ contig_number_values = SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
default=1, help="Maximum number of sequences per region. Default: 1") parser.add_argument("-b", "--scaffold_black_list_file", action="store", dest="scaffold_black_list_file", type=lambda s: IdList(filename=s), help="File with scaffolds from black list") parser.add_argument( "-x", "--min_scaffold_len", action="store", dest="min_scaffold_len", type=int, default=None, help= "Minimum length of scaffold to be included in regions. Default: not set") args = parser.parse_args() SequenceRoutines.prepare_region_list_by_length( max_length=args.max_length, max_seq_number=args.max_seq_number, length_dict=None, reference=args.reference, parsing_mode="parse", output_dir=args.output_dir, split_scaffolds=args.split_scaffolds, min_scaffold_length=args.min_scaffold_len, black_list_scaffolds=None)
use_softmasking=args.softmasking, hints_file=args.hintsfile, extrinsicCfgFile=args.extrinsicCfgFile, predict_UTR=args.predict_UTR, parsing_mode="parse") AUGUSTUS.replace_augustus_ids(output_raw_gff, args.output, species_prefix=args.species_prefix, number_of_digits_in_id=8) Gffread.extract_transcript_sequences(output_gff, args.input, args.output) SequenceRoutines.trim_cds_and_remove_terminal_stop_codons( "%s.cds" % args.output, "%s.trimmed.cds" % args.output, stop_codons_list=("TGA", "TAA", "TAG") ) # using default stop_codons(from universal genetic_code)/ Note that this will affect mtDNA proteins SequenceRoutines.translate_sequences_from_file( "%s.trimmed.cds" % args.output, "%s.trimmed.pep" % args.output, format="fasta", id_expression=None, genetic_code_table=1, translate_to_stop=False, prefix_of_file_inframe_stop_codons_seqs= prefix_of_file_inframe_stop_codons_seqs) # Universal code !!! AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids) AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)
help="File to write ids of extracted sequences. Default - don't write") parser.add_argument( "-p", "--parsing_mode", action="store", dest="parsing_mode", default='parse', help="Parsing mode. Allowed: parse(default), index, index_db") args = parser.parse_args() SequenceRoutines.extract_sequences_by_length_from_file( args.input_file, args.output_file, min_len=args.min_length, max_len=args.max_length, format=args.format, tmp_index_file="tmp.idx", id_file=args.id_file, parsing_mode=args.parsing_mode) """ if (args.min_length is None) and (args.max_length is None): raise ValueError("Both minimum and maximum lengths were not set") elif (args.min_length is not None) and (args.max_length is not None) and (args.min_length > args.max_length): raise ValueError("Minimum length is greater then maximum lengths") tmp_index_file = "temp.idx" print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input_file, format=args.format)