def correct_regions_from_gff(self,
                             reference,
                             variants_vcf,
                             gff_file,
                             output_prefix=None,
                             feature_type_list=["CDS"],
                             unification_key="Parent",
                             #raw_seq_per_line=False,
                             vcf_with_masking=None,
                             override_vcf_by_mask=None,
                             use_ambiguous_nuccleotides=None):

    feature_dict = AnnotationsRoutines.get_feature_dict(gff_file,
                                                        output_prefix=output_prefix,
                                                        feature_type_list=feature_type_list,
                                                        unification_key=unification_key)
    region_file = "%s.coordinates_only.list" % output_prefix
    raw_regions = "%s.raw.seq" % output_prefix
    final_regions = "%s.fasta" % output_prefix
    regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

    self.correct_reference(reference,
                           raw_regions,
                           variants_vcf,
                           raw_seq_per_line=True,
                           vcf_with_masking=vcf_with_masking,
                           override_vcf_by_mask=override_vcf_by_mask,
                           use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
                           interval_list=region_file)

    region_with_frameshift = SynDict()

    def new_regions_generator():
        with open(raw_regions, "r") as in_fd:
            for region_id in feature_dict:
                seq = ""
                for i in range(0, len(feature_dict[region_id])):
                    seq_fragment = in_fd.readline().strip()
                    # If the length of the corrected fragment differs from the
                    # original feature length (end - start + 1) by a value that
                    # is not a multiple of 3, the applied variants introduced a
                    # frameshift in this fragment
                    if ((int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1) - len(seq_fragment)) % 3 != 0:
                        if region_id not in region_with_frameshift:
                            region_with_frameshift[region_id] = [i]
                        else:
                            region_with_frameshift[region_id].append(i)
                    seq += seq_fragment
                yield SeqRecord(seq=Seq(seq) if feature_dict[region_id][0][3] == "+" else Seq(seq).reverse_complement(),
                                id=region_id,
                                description="")

    SeqIO.write(new_regions_generator(), final_regions, format="fasta")
    region_with_frameshift.write(regions_with_frameshift_file, splited_values=True)
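# A minimal worked example of the frameshift test above (illustration only,
# with made-up coordinates): a CDS fragment spanning 101..130 has an original
# length of 130 - 101 + 1 = 30 bp. If the corrected fragment is 28 bp long,
# (30 - 28) % 3 == 2 != 0, so that fragment index is recorded in
# region_with_frameshift for its region.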
def filter_trf_gff(self, input_gff, output_gff, filtered_out_gff, min_period=None, max_period=None,
                   min_copy_number=None, max_copy_number=None, pattern=None,
                   min_percentage_of_matches=None, max_percentage_of_indels=None,
                   min_entropy=None, max_entropy=None):

    def filtering_expression(gff_description_dict):
        return self.gff_filtering_expression(gff_description_dict,
                                             min_period=min_period,
                                             max_period=max_period,
                                             min_copy_number=min_copy_number,
                                             max_copy_number=max_copy_number,
                                             pattern=pattern,
                                             min_percentage_of_matches=min_percentage_of_matches,
                                             max_percentage_of_indels=max_percentage_of_indels,
                                             min_entropy=min_entropy,
                                             max_entropy=max_entropy)

    AnnotationsRoutines.filter_gff_by_description(input_gff, output_gff, filtered_out_gff,
                                                  filtering_expression)
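# A minimal usage sketch (the paths are hypothetical; TRF is the tool wrapper
# instance used elsewhere in this module, and the keyword names come from the
# signature above): keep only short tandem repeats with a 3-5 bp monomer and
# at least 20 copies, writing rejected records to a separate GFF.
#
# TRF.filter_trf_gff("repeats.gff", "repeats.str.gff", "repeats.str.filtered_out.gff",
#                    min_period=3, max_period=5, min_copy_number=20)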
@staticmethod
def filter_trf_gff_by_exact_copy_number(input_gff, output_gff, filtered_out_gff, min_copy_number,
                                        perfect_tandem=False):
    if perfect_tandem:
        def filtering_expression(gff_description_dict):
            # Require min_copy_number exact copies of the monomer in a single
            # uninterrupted tandem run
            return (gff_description_dict["Pattern"] * min_copy_number) in gff_description_dict["seq"]
    else:
        def filtering_expression(gff_description_dict):
            # Require min_copy_number exact copies of the monomer anywhere in
            # the repeat sequence, not necessarily adjacent
            return gff_description_dict["seq"].count(gff_description_dict["Pattern"]) >= min_copy_number

    AnnotationsRoutines.filter_gff_by_description(input_gff, output_gff, filtered_out_gff,
                                                  filtering_expression)
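# Illustration of the two modes above (plain Python semantics, toy sequences):
# with Pattern = "ACG" and min_copy_number = 3, the perfect_tandem test asks
# whether "ACGACGACG" occurs as one contiguous substring of the repeat
# sequence, while the relaxed test merely counts non-overlapping occurrences:
#
# ("ACG" * 3) in "TTACGACGACGTT"        # True - one uninterrupted run
# "TTACGTACGTACGTT".count("ACG") >= 3   # True - copies may be interrupted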
@staticmethod
def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
    len_dict = SynDict()
    with open(trf_gff, "r") as trf_fd:
        for line in trf_fd:
            if line[0] == "#":
                continue
            description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(line)
            # The "Period" entry of a TRF record holds the monomer length
            len_dict[description_dict["ID"]] = description_dict["Period"]
    len_dict.write(len_file)
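# Hypothetical input/output sketch for the routine above: a TRF GFF record
# whose description carries ID=repeat_42 and Period=4 contributes the pair
# repeat_42 -> 4 to the len file; the exact on-disk layout is whatever
# SynDict.write produces.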
def __init__(self, record_list=None, primer3_file=None, from_file=True, id_based_location_dict=None,
             repeat_gff_file=None, id_description_entry="ID"):
    self.general_entry_list = ["SEQUENCE_ID",
                               "SEQUENCE_TEMPLATE",
                               "SEQUENCE_TARGET",
                               "PRIMER_PICK_LEFT_PRIMER",
                               "PRIMER_PICK_INTERNAL_OLIGO",
                               "PRIMER_PICK_RIGHT_PRIMER",
                               "PRIMER_PRODUCT_SIZE_RANGE",
                               "PRIMER_LEFT_EXPLAIN",
                               "PRIMER_RIGHT_EXPLAIN",
                               "PRIMER_PAIR_EXPLAIN",
                               "PRIMER_LEFT_NUM_RETURNED",
                               "PRIMER_RIGHT_NUM_RETURNED",
                               "PRIMER_INTERNAL_NUM_RETURNED",
                               "PRIMER_PAIR_NUM_RETURNED"]

    self.primer_entry_prefix_list = ["PRIMER_LEFT", "PRIMER_RIGHT"]
    self.primer_entry_suffix_list = ["PENALTY",
                                     "SEQUENCE",
                                     "TM",
                                     "GC_PERCENT",
                                     "SELF_ANY_TH",
                                     "SELF_END_TH",
                                     "HAIRPIN_TH",
                                     "END_STABILITY"]
    self.primer_pair_prefix_list = ["PRIMER_PAIR"]
    self.primer_pair_suffix_list = ["PENALTY",
                                    "COMPL_ANY_TH",
                                    "COMPL_END_TH",
                                    "PRODUCT_SIZE"]

    id_based_dict = id_based_location_dict
    if repeat_gff_file:
        id_based_dict = AnnotationsRoutines.get_id_based_dict_from_gff(repeat_gff_file,
                                                                       id_entry=id_description_entry)
    if from_file:
        self.records = []
        with open(primer3_file, "r") as in_fd:
            for line in in_fd:
                # Each record is a block of KEY=VALUE lines terminated by a
                # line containing only "="
                entry_dict = {}
                current_line = line
                while current_line[0] != "=":
                    # Split on the first "=" only, in case a value contains one
                    key, value = current_line.strip().split("=", 1)
                    entry_dict[key] = value
                    current_line = in_fd.readline()
                self._add_record(entry_dict, id_based_location_dict=id_based_dict)
    else:
        self.records = record_list
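# For orientation, the parser above consumes Primer3's Boulder-IO output, in
# which each record is a block of KEY=VALUE lines closed by a bare "=" line.
# A trimmed illustrative record (sequence and values are made up):
#
# SEQUENCE_ID=repeat_42
# SEQUENCE_TEMPLATE=ACGTACGTACGTACGT
# PRIMER_PAIR_NUM_RETURNED=1
# PRIMER_LEFT_0_SEQUENCE=ACGTACGTAC
# PRIMER_RIGHT_0_SEQUENCE=TTGCATGCAA
# PRIMER_PAIR_0_PRODUCT_SIZE=180
# =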
def alignments_string(self, segment_length=120, left_primer_symbol=">", target_symbol="*",
                      right_primer_symbol="<"):
    string = ""
    string += "#SequenceID\t%s\n" % self.id
    string += "#Location"
    if self.chrom:
        string += "\t%s" % self.chrom
        if self.chrom_pos_start and self.chrom_pos_end:
            string += ":%i-%i" % (self.chrom_pos_start, self.chrom_pos_end)
    string += "\n"
    #string += "#Sequence\t%s\n" % self.seq
    for primer_pair in self.primer_pair_list:
        string += "#Primer pair %i\n" % primer_pair.id
        string += "\n"
        # The right primer's start is the position of its rightmost base on
        # the plus strand, so convert it to a half-open interval here
        location_list = [(primer_pair.left_primer.start,
                          primer_pair.left_primer.start + primer_pair.left_primer.length),
                         (self.target_start, self.target_start + self.target_len),
                         (primer_pair.right_primer.start - primer_pair.right_primer.length + 1,
                          primer_pair.right_primer.start + 1)]
        string += AnnotationsRoutines.draw_string_regions(self.seq, location_list,
                                                          [left_primer_symbol,
                                                           target_symbol,
                                                           right_primer_symbol],
                                                          overlap_symbol="#",
                                                          line_per_record=False,
                                                          segment_length=segment_length,
                                                          num_of_spaces=3,
                                                          num_of_space_lines=1,
                                                          empty_symbol=" ")
    return string
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff", required=True,
                    help="Input GFF file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Prefix of output files")
parser.add_argument("-s", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File with scaffold synonyms")

args = parser.parse_args()

AnnotationsRoutines.rename_scaffolds_in_gff(args.input_gff, args.syn_file, args.output_prefix)
action="store", dest="output", required=True, help="Output gff file") parser.add_argument("-f", "--feature_type", action="store", dest="feature_type", required=True, help="Feature type to use in gff file") parser.add_argument("-s", "--source", action="store", dest="source", default="source", help="Source to use in gff file") parser.add_argument("-d", "--id_prefix", action="store", dest="id_prefix", default="ID", help="Id prefix for gff file") args = parser.parse_args() AnnotationsRoutines.convert_bedgraph_to_gff(args.input, args.output, args.feature_type, id_prefix=args.id_prefix, source=args.source)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Gff file with annotations to extract")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff",
                    help="Output gff file with extracted transcripts")
parser.add_argument("-d", "--ids_file", action="store", dest="ids_file",
                    help="File with ids of transcripts to extract")

args = parser.parse_args()

AnnotationsRoutines.extract_transcripts_by_ids(args.input_gff, args.ids_file, args.output_gff)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff", required=True,
                    help="Input gff file")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True,
                    help="Output fixed gff file")

args = parser.parse_args()

AnnotationsRoutines.fix_gff_coordinates_order(args.input_gff, args.output_gff)
required=True, help="File to write output BED file") parser.add_argument( "-t", "--feature_types", action="store", dest="feature_types", type=lambda s: s.split(","), default=[], help="Comma-separated list of feature types to write in output file " "Default: all") """ parser.add_argument("-d", "--id_entry", action="store", dest="id_entry", default="ID", help="Id entry. Default: ID") """ parser.add_argument("-s", "--scaffold_id_file", action="store", dest="scaffold_id_file", default=None, help="File with IDs of scaffolds to include. Default: All") args = parser.parse_args() AnnotationsRoutines.convert_gff_to_simple_bed( args.input_gff, args.output_bed, feature_type_list=args.feature_types, scaffold_id_file=args.scaffold_id_file)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff", required=True,
                    help="Input gff file")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True,
                    help="Output fixed gff file")
parser.add_argument("-f", "--feature_type", action="store", dest="feature_type", required=True,
                    help="Feature type to use")

args = parser.parse_args()

AnnotationsRoutines.fix_absent_feature_type_field(args.input_gff, args.output_gff,
                                                  args.feature_type)
def primer_prediction_pipeline(self, genome_fasta, output_prefix, trf_gff=None,
                               min_str_period=3, max_str_period=5,
                               min_copy_number=20, max_copy_number=None, pattern=None,
                               min_perfect_copy_number=20, require_tandem_perfect_copies=True,
                               left_flank_len=200, right_flank_len=200,
                               core_seq_coords_entry="core_seq_coords",
                               id_description_entry="ID",
                               kmer_dir=None, kmer_file_prefix=None, count_kmers=False,
                               min_percentage_of_matches=None, max_percentage_of_indels=None,
                               optimal_primer_len=None, min_primer_len=None, max_primer_len=None,
                               max_ns_accepted=None, softmasked_input=False,
                               optimal_GC=None, min_GC=None, max_GC=None,
                               optimal_melting_temperature=None,
                               min_melting_temperature=None, max_melting_temperature=None,
                               black_list_of_seqs_fasta=None,
                               trf_matching_weight=2, trf_mismatching_penalty=7,
                               trf_indel_penalty=7, trf_matching_probability=80,
                               trf_indel_probability=10, trf_min_score=50,
                               trf_max_period_size=500,
                               threads=None,
                               min_gap_len=5):

    TRF.path = self.trf_dir
    TRF.threads = threads if threads else self.threads
    Primer3.path = self.primer3_dir
    Primer3.threads = threads if threads else self.threads
    Glistmaker.path = self.glistmaker_dir
    Glistmaker.threads = threads if threads else self.threads

    trf_output_gff = "%s.with_rep_seqs.gff" % output_prefix if trf_gff is None else trf_gff

    filtered_suffix = ""
    filtered_suffix += ".min_period_%i" % min_str_period if min_str_period else ""
    filtered_suffix += ".max_period_%i" % max_str_period if max_str_period else ""
    filtered_suffix += ".min_copy_%i" % min_copy_number if min_copy_number else ""
    filtered_suffix += ".max_copy_%i" % max_copy_number if max_copy_number else ""
    filtered_suffix += ".pattern_%s" % pattern if pattern else ""

    filtered_trf_gff = "%s%s.gff" % (output_prefix, filtered_suffix)
    filtered_out_trf_gff = "%s%s.filtered_out.gff" % (output_prefix, filtered_suffix)
    final_filtered_gff = filtered_trf_gff

    if min_perfect_copy_number:
        filtering_prefix = "%s%s.%s" % (output_prefix, filtered_suffix,
                                        "min_tandem_perfect_copy_%i" % min_perfect_copy_number
                                        if require_tandem_perfect_copies else
                                        "min_perfect_copy_%i" % min_perfect_copy_number)
        final_filtered_gff = "%s.gff" % filtering_prefix
        filtered_out_exact_copy_trf_gff = "%s.filtered_out.gff" % filtering_prefix

    final_filtered_len_file = "%s.monomer_len.len" % final_filtered_gff[:-4]

    with_flanks_prefix = "%s.with_flanks" % final_filtered_gff[:-4]
    with_flanks_gff = "%s.gff" % with_flanks_prefix
    with_flanks_fasta = "%s.fasta" % with_flanks_prefix
    primer3_output_prefix = "%s.primer3" % with_flanks_prefix

    if trf_gff is None:
        print("Annotating repeats...")
        trf_report = TRF.parallel_search_tandem_repeat(genome_fasta, output_prefix,
                                                       matching_weight=trf_matching_weight,
                                                       mismatching_penalty=trf_mismatching_penalty,
                                                       indel_penalty=trf_indel_penalty,
                                                       match_probability=trf_matching_probability,
                                                       indel_probability=trf_indel_probability,
                                                       min_alignment_score=trf_min_score,
                                                       max_period=trf_max_period_size,
                                                       report_flanking_sequences=False,
                                                       max_len_per_file=1000000,
                                                       store_intermediate_files=False)
    print("Filtering repeats...")
    TRF.filter_trf_gff(trf_output_gff, filtered_trf_gff, filtered_out_trf_gff,
                       min_period=min_str_period, max_period=max_str_period,
                       min_copy_number=min_copy_number, max_copy_number=max_copy_number,
                       pattern=pattern,
                       min_percentage_of_matches=min_percentage_of_matches,
                       max_percentage_of_indels=max_percentage_of_indels,
                       min_entropy=None, max_entropy=None)

    id_based_location_dict = AnnotationsRoutines.get_id_based_dict_from_gff(trf_output_gff,
                                                                            id_entry=id_description_entry) if trf_gff else trf_report.get_id_based_dict()

    if min_perfect_copy_number:
        TRF.filter_trf_gff_by_exact_copy_number(filtered_trf_gff, final_filtered_gff,
                                                filtered_out_exact_copy_trf_gff,
                                                min_perfect_copy_number,
                                                perfect_tandem=require_tandem_perfect_copies)

    TRF.get_monomer_len_file_from_trf_gff(final_filtered_gff, final_filtered_len_file)

    monomer_length_id_file_prefix = "%s.monomer_len" % final_filtered_gff[:-4]
    monomer_length_id_dict = self.split_ids_from_len_file_by_len(final_filtered_len_file,
                                                                 monomer_length_id_file_prefix,
                                                                 len_column=1, id_column=0)

    AnnotationsRoutines.add_flanks_to_gff_record(final_filtered_gff, with_flanks_prefix,
                                                 left_flank_len, right_flank_len, genome_fasta,
                                                 coords_description_entry=core_seq_coords_entry,
                                                 id_description_entry=id_description_entry)

    AnnotationsRoutines.extract_sequences_by_gff(genome_fasta, with_flanks_gff, with_flanks_fasta,
                                                 type_list="repeat", parsing_mode="parse",
                                                 format="fasta")
    if count_kmers:
        print("Counting kmers...")
        if (not kmer_file_prefix) or (not kmer_dir):
            raise ValueError("No kmer file prefix or kmer directory was set")
        glistmaker_prefix = "%s/%s" % (kmer_dir, kmer_file_prefix)
        self.safe_mkdir(kmer_dir)
        Glistmaker.generate_kmer_lists_for_primer3(genome_fasta, glistmaker_prefix,
                                                   threads=None,
                                                   max_tmp_table_number=None,
                                                   max_tmp_table_size=None)
    print("Generating primers...")
    # Run primer prediction twice: once for machine-readable output, once for
    # human-readable output
    for human_readable_output in False, True:
        output_file_prefix = "%s.human_readable" % with_flanks_prefix if human_readable_output else with_flanks_prefix
        self.predict_primers(with_flanks_gff, with_flanks_fasta, output_file_prefix,
                             kmer_dir, kmer_file_prefix,
                             pcr_product_size_range=None,
                             optimal_primer_len=optimal_primer_len,
                             min_primer_len=min_primer_len,
                             max_primer_len=max_primer_len,
                             max_ns_accepted=max_ns_accepted,
                             softmasked_input=softmasked_input,
                             optimal_GC=optimal_GC, min_GC=min_GC, max_GC=max_GC,
                             optimal_melting_temperature=optimal_melting_temperature,
                             min_melting_temperature=min_melting_temperature,
                             max_melting_temperature=max_melting_temperature,
                             black_list_of_seqs_fasta=black_list_of_seqs_fasta,
                             thermodynamic_parameters_dir=self.primer3_thermo_config_dir,
                             format_output=human_readable_output,
                             relative_core_seq_coords_relative_entry="%s_relative" % core_seq_coords_entry)

    primer3_output_file = "%s.out" % primer3_output_prefix
    filtered_results_file = "%s.filtered.res" % primer3_output_prefix
    filtered_results_table_form_file = "%s.filtered.table_form.res" % primer3_output_prefix
    filtered_results_table_form_with_aln_file = "%s.filtered.table_form_with_aln.res" % primer3_output_prefix
    filtered_out_results_file = "%s.filtered_out.res" % primer3_output_prefix

    primer3_results = CollectionPrimer3(primer3_file=primer3_output_file, from_file=True,
                                        id_based_location_dict=id_based_location_dict)
    primer3_results.remove_primers_with_gaps_in_pcr_product(min_gap_len)

    primer3_filtered_results, primer3_filtered_out_results = primer3_results.filter_out_records_without_primers()
    primer3_filtered_results.write(filtered_results_file)
    primer3_filtered_results.write_table_form(filtered_results_table_form_file)
    primer3_filtered_results.write_table_form_with_alignments(filtered_results_table_form_with_aln_file)
    primer3_filtered_out_results.write(filtered_out_results_file)

    filtered_results_file_splited_by_len_prefix = "%s.filtered.monomer_len" % primer3_output_prefix

    stat_fd = open("%s.stats" % output_prefix, "w")
    sorted_monomer_length_list = map(str, sorted(map(int, monomer_length_id_dict.keys())))
    for monomer_length in sorted_monomer_length_list:
        primer3_monomer_len_results = primer3_filtered_results.extract_records_by_ids(monomer_length_id_dict[monomer_length])
        primer3_monomer_len_results.write("%s.%s.res" % (filtered_results_file_splited_by_len_prefix,
                                                         monomer_length))
        primer3_monomer_len_results.write_table_form("%s.%s.table_form.res" % (filtered_results_file_splited_by_len_prefix,
                                                                               monomer_length))
        primer3_monomer_len_results.write_table_form_with_alignments("%s.%s.table_form_with_aln.res" % (filtered_results_file_splited_by_len_prefix,
                                                                                                        monomer_length))
        primer3_monomer_len_results.write_table_form2("%s.%s.table_form2.res" % (filtered_results_file_splited_by_len_prefix,
                                                                                 monomer_length))
        primer3_monomer_len_results.write_table_form2_short("%s.%s.table_form2_short.res" % (filtered_results_file_splited_by_len_prefix,
                                                                                             monomer_length))
        stat_string = "STR monomer length %s bp: %i repeats with primers" % (str(monomer_length),
                                                                             len(primer3_monomer_len_results.records))
        print(stat_string)
        stat_fd.write(stat_string + "\n")
    stat_fd.close()
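# A minimal usage sketch of the pipeline above (paths and the wrapper object
# name are hypothetical; only the keyword names come from the signature):
#
# pipeline = PrimerPipeline()  # hypothetical object exposing this method
# pipeline.primer_prediction_pipeline("genome.fasta", "str_primers",
#                                     min_str_period=3, max_str_period=5,
#                                     min_copy_number=20,
#                                     min_perfect_copy_number=20,
#                                     left_flank_len=200, right_flank_len=200)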
default="Alias", help="Name of field in gff description to add aliases. " "If this field is absent it will be created." "Default: Alias") parser.add_argument("-k", "--key_column", action="store", dest="key_column", type=int, default=0, help="Key column in synonym file(0-based). Default: 0") parser.add_argument("-v", "--value_column", action="store", dest="value_column", type=int, default=1, help="Value column in synonym file(0-based). Default: 1") args = parser.parse_args() AnnotationsRoutines.add_alias_to_feature( args.input_gff, args.output_gff, args.syn_file, feature_type_list=args.feature_types, name_field_list=args.feature_name_fields, alias_field=args.alias_field, key_column=args.key_column, value_column=args.value_column)
"--feature_types", action="store", dest="feature_types", type=lambda s: s.split(","), default=[ "mRNA", ], help="Comma-separated list of feature types to count. " "Default: mRNA") parser.add_argument("-d", "--id_entry", action="store", dest="id_entry", default="ID", help="Id entry. Default: ID") parser.add_argument("-p", "--parental_id_entry", action="store", dest="parental_id_entry", default="Parent", help="Parental id entry. Default: Parent") args = parser.parse_args() AnnotationsRoutines.get_feature_to_parent_correspondence_from_gff( args.input_gff, args.output, feature_list=args.feature_types, id_entry=args.id_entry, parental_id_entry=args.parental_id_entry)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff", required=True,
                    help="Input .gff file")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True,
                    help="Output .gff file")
parser.add_argument("-s", "--syn_file", action="store", dest="syn_file", required=True,
                    help="File with synonyms of region names")

args = parser.parse_args()

AnnotationsRoutines.replace_region_names_in_gff(args.input_gff, args.syn_file, args.output_gff)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Input .gff file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix",
                    help="Output prefix")
parser.add_argument("-t", "--feature_types", action="store", dest="feature_types",
                    type=lambda s: s.split(","), default=None,
                    help="Comma-separated list of feature types to analyze. "
                         "Default: all feature types")

args = parser.parse_args()

AnnotationsRoutines.get_feature_length_distribution_from_gff(args.input_gff, args.output_prefix,
                                                             feature_list=args.feature_types)
parser.add_argument("-e", "--end_column_id", action="store", dest="end_column_id", type=int,
                    default=2,
                    help="0-based index of column with feature end. Default: 2")
parser.add_argument("-n", "--coordinates_type", action="store", dest="coordinates_type",
                    default="1-based",
                    help="Type of coordinates. Allowed: 0-based, 1-based (default)")

args = parser.parse_args()

AnnotationsRoutines.merge_overlapping_feature_in_simple_format(args.input,
                                                               args.scaffold_column_id,
                                                               args.start_column_id,
                                                               args.end_column_id,
                                                               output_file=args.output,
                                                               output_separator="\t",
                                                               comments_prefix="#",
                                                               input_separator="\t",
                                                               coordinates_type=args.coordinates_type,
                                                               return_seqfeature_dict=False,
                                                               feature_type=None)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-c", "--correspondence_file", action="store", dest="correspondence_file",
                    required=True,
                    help="File with correspondence of transcripts to genes")
parser.add_argument("-l", "--length_file", action="store", dest="length_file", required=True,
                    help="Length file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True,
                    help="Output prefix")

args = parser.parse_args()

AnnotationsRoutines.add_length_to_accordance_file(args.correspondence_file, args.length_file,
                                                  args.output_prefix)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import sys
import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-g", "--gff", action="store", dest="gff", required=True,
                    help="Gff file")
parser.add_argument("-o", "--output", action="store", dest="output",
                    help="Output file with ids. Default: stdout")

args = parser.parse_args()

if args.output is None:
    args.output = sys.stdout

AnnotationsRoutines.get_scaffold_ids_from_gff(args.gff, out_file=args.output)
parser.add_argument("-g", "--gff", action="store", dest="gff", required=True, help="Gff file") parser.add_argument( "-f", "--features", action="store", dest="features", default=[], type=lambda s: s.split(","), help="Comma-separated list of features to count per scaffold. " "If not set all features will be counted") parser.add_argument( "-o", "--output", action="store", dest="output", help="Output file with counts of features. Default: stdout") args = parser.parse_args() if args.output is None: args.output = sys.stdout AnnotationsRoutines.count_per_scaffold_feature_number( args.gff, out_file=args.output, feature_type_list=args.features)
parser.add_argument("-f", "--value_file", action="store", dest="value_file", required=True, help="Value with values to seek for") parser.add_argument("-o", "--output_gff", action="store", dest="output_gff", required=True, help="Output .gff file") parser.add_argument( "-d", "--description_fields", action="store", dest="field_id_list", type=lambda s: s.split(","), required=True, help="Comma-separated list of fields in gff description to check") args = parser.parse_args() value_list = IdList(filename=args.value_file) AnnotationsRoutines.extract_gff_records_by_description_value( args.input_gff, args.output_gff, args.field_id_list, value_list, retain_comments=False)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-g", "--gtf_file", action="store", dest="input", required=True,
                    help="Input gtf file")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output accordance file")

args = parser.parse_args()

AnnotationsRoutines.get_transcript_to_pep_accordance_from_gtf(args.input, args.output,
                                                              comment_symbol="#")
action="store", dest="separator", default="_", help="Separator in chunk filename." "Chunks must be named as <prefix><separator><chunk number><suffix> . " "Default: '_'") parser.add_argument("-n", "--total_number_of_chunks", action="store", dest="number_of_chunks", type=int, required=True, help="Total number of chunks") parser.add_argument("-m", "--min_chunk_size", action="store", dest="min_chunk_size", type=int, required=True, help="Minimum size of chunk file.") args = parser.parse_args() AnnotationsRoutines.check_chunks( args.chunk_dir, args.number_of_chunks, args.min_chunk_size, separator=args.separator, chunk_filename_suffix=args.chunk_filename_suffix, chunk_filename_prefix=args.chunk_filename_prefix)
#!/usr/bin/env python
__author__ = 'Sergei F. Kliver'

import argparse

from RouToolPa.Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Input .gff file")
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix",
                    help="Output prefix")
parser.add_argument("-t", "--feature_types", action="store", dest="feature_types",
                    type=lambda s: s.split(","), default=None,
                    help="Comma-separated list of feature types to count. "
                         "Default: all feature types")

args = parser.parse_args()

AnnotationsRoutines.count_total_feature_length_from_gff(args.input_gff, args.output_prefix,
                                                        features_to_count=args.feature_types)
dest="gff_file", help="Gff file with annotations to extract") parser.add_argument("-p", "--parsing_mode", action="store", dest="parsing_mode", default="parse", help="Parsing mode for input sequence file. " "Possible variants: 'index_db', 'index'(default), 'parse'") args = parser.parse_args() AnnotationsRoutines.extract_sequences_by_gff(args.input, args.gff_file, args.output, type_list=args.type, parsing_mode=args.parsing_mode, tmp_index_file="temp.idx", format=args.format) """ tmp_index_file = "temp.idx" args.type = args.type.split(",") annotations_dict = SeqIO.to_dict(GFF.parse(open(args.gff_file))) print annotations_dict print("Parsing %s..." % args.input) sequence_dict = SequenceRoutines.parse_seq_file(args.input, args.parsing_mode, args.format, index_file=tmp_index_file ) # SeqIO.index_db(tmp_index_file, args.input_file, format=args.format) SeqIO.write(SequenceRoutines.record_generator(annotations_dict, sequence_dict, args.type), args.output, format=args.format)
default="core_seq_coords", help= "Key for description entry with coordinates of core sequence in new feature" ) parser.add_argument("-l", "--left_flank_len", action="store", dest="left_flank_len", type=int, default=200, help="Length of left flank. Default: 200") parser.add_argument("-r", "--right_right_len", action="store", dest="right_flank_len", type=int, default=200, help="Length of right flank. Default: 200") args = parser.parse_args() AnnotationsRoutines.add_flanks_to_gff_record( args.input_gff, args.output_prefix, args.left_flank_len, args.right_flank_len, args.fasta, coords_description_entry=args.coords_description_entry, id_description_entry=args.id_description_entry)
SequenceRoutines.extract_sequence_by_ids(output_pep,
                                         "%s.ids" % output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence,
                                         output_swissprot_pfam_or_hints_supported_transcripts_longest_pep)
SequenceRoutines.extract_sequence_by_ids(output_pep,
                                         "%s.ids" % output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence,
                                         output_swissprot_pfam_and_hints_supported_transcripts_longest_pep)

for id_file in output_swissprot_pfam_or_hints_supported_transcripts_ids, \
               output_swissprot_pfam_and_hints_supported_transcripts_ids, \
               "%s.ids" % output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \
               "%s.ids" % output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence:
    out_pref = id_file[:-4]
    out_gff = "%s.gff" % out_pref
    AnnotationsRoutines.extract_transcripts_by_ids(output_gff, id_file, out_gff)
    for suffix in ".trimmed.cds", ".transcript":
        SequenceRoutines.extract_sequence_by_ids("%s%s" % (args.output, suffix),
                                                 id_file,
                                                 "%s%s" % (out_pref, suffix))

HMMER3.intersect_ids_from_files(output_swissprot_pfam_or_hints_supported_transcripts_ids,
                                cds_with_inframe_stop_codons_ids,
                                output_swissprot_pfam_or_hints_supported_transcripts_inframe_stop_ids,
                                mode="common")
HMMER3.intersect_ids_from_files(output_swissprot_pfam_and_hints_supported_transcripts_ids,
                                cds_with_inframe_stop_codons_ids,
                                output_swissprot_pfam_and_hints_supported_transcripts_inframe_stop_ids,
parser.add_argument("-o", "--output_prefix", action="store", dest="output_prefix", required=True, help="Output .gff file") parser.add_argument("-t", "--feature_types", action="store", dest="feature_types", type=lambda s: s.split(","), default=["CDS"], help="Comma-separated list of feature types to extract. " "Default: CDS only") parser.add_argument( "-u", "--unification_key", action="store", dest="unification_key", default="Parent", help="Annotation entry to use for unification. Default: Parent") args = parser.parse_args() AnnotationsRoutines.get_feature_dict(args.input_gff, output_prefix=args.output_prefix, feature_type_list=args.feature_types, unification_key=args.unification_key)