def filter_trf_gff(self, input_gff, output_gff, filtered_out_gff, min_period=None,
                   max_period=None, min_copy_number=None, max_copy_number=None,
                   pattern=None, min_percentage_of_matches=None,
                   max_percentage_of_indels=None, min_entropy=None, max_entropy=None):
    """Split a TRF gff into records passing / failing the given repeat thresholds.

    Records satisfying every supplied constraint go to output_gff; the rest go
    to filtered_out_gff. Any threshold left as None is not applied.
    """

    def filtering_expression(gff_description_dict):
        # Delegate the per-record decision to the shared predicate, with every
        # threshold from this call bound in.
        return self.gff_filtering_expression(gff_description_dict,
                                             min_period=min_period,
                                             max_period=max_period,
                                             min_copy_number=min_copy_number,
                                             max_copy_number=max_copy_number,
                                             pattern=pattern,
                                             min_percentage_of_matches=min_percentage_of_matches,
                                             max_percentage_of_indels=max_percentage_of_indels,
                                             min_entropy=min_entropy,
                                             max_entropy=max_entropy)

    AnnotationsRoutines.filter_gff_by_description(input_gff, output_gff,
                                                  filtered_out_gff,
                                                  filtering_expression)
def filter_trf_gff_by_exact_copy_number(input_gff, output_gff, filtered_out_gff,
                                        min_copy_number, perfect_tandem=False):
    """Keep TRF records whose sequence contains the repeat pattern at least
    min_copy_number times.

    With perfect_tandem=True the copies must be contiguous (the pattern
    repeated min_copy_number times must occur verbatim in the sequence);
    otherwise non-overlapping occurrences anywhere in the sequence are counted.
    Passing records go to output_gff, the rest to filtered_out_gff.
    """
    if perfect_tandem:
        def filtering_expression(gff_description_dict):
            # Contiguous-copy test: substring search for the repeated pattern.
            return (gff_description_dict["Pattern"] * min_copy_number) in gff_description_dict["seq"]
    else:
        def filtering_expression(gff_description_dict):
            # str.count counts non-overlapping occurrences.
            return gff_description_dict["seq"].count(gff_description_dict["Pattern"]) >= min_copy_number

    AnnotationsRoutines.filter_gff_by_description(input_gff, output_gff,
                                                  filtered_out_gff,
                                                  filtering_expression)
def correct_regions_from_gff(self, reference, variants_vcf, gff_file, output_prefix=None,
                             feature_type_list=None, unification_key="Parent",
                             vcf_with_masking=None, override_vcf_by_mask=None,
                             use_ambiguous_nuccleotides=None):
    """Correct the regions described in a gff using variants from a VCF and
    write the resulting sequences to <output_prefix>.fasta.

    A fragment whose corrected length differs from the annotated length by a
    non-multiple of three is recorded as a frameshift; affected region ids
    (with fragment indices) are written to <output_prefix>.frameshifts.region.ids.

    :param reference: reference sequence file passed to self.correct_reference
    :param variants_vcf: vcf file with the variants to apply
    :param gff_file: gff with the features to correct
    :param output_prefix: prefix for all output files
    :param feature_type_list: feature types to extract; defaults to ["CDS"].
        (None sentinel instead of a mutable default list — fixes the shared
        mutable-default-argument pitfall while keeping the old behavior.)
    :param unification_key: gff attribute used to group features into regions
    :param vcf_with_masking: optional vcf with masking variants
    :param override_vcf_by_mask: whether masking overrides variants
    :param use_ambiguous_nuccleotides: use ambiguity codes during correction
    """
    if feature_type_list is None:
        feature_type_list = ["CDS"]

    feature_dict = AnnotationsRoutines.get_feature_dict(gff_file,
                                                        output_prefix=output_prefix,
                                                        feature_type_list=feature_type_list,
                                                        unification_key=unification_key)

    region_file = "%s.coordinates_only.list" % output_prefix
    raw_regions = "%s.raw.seq" % output_prefix
    final_regions = "%s.fasta" % output_prefix
    regions_with_frameshift_file = "%s.frameshifts.region.ids" % output_prefix

    self.correct_reference(reference, raw_regions, variants_vcf,
                           raw_seq_per_line=True,
                           vcf_with_masking=vcf_with_masking,
                           override_vcf_by_mask=override_vcf_by_mask,
                           use_ambiguous_nuccleotides=use_ambiguous_nuccleotides,
                           interval_list=region_file)

    region_with_frameshift = SynDict()

    def new_regions_generator():
        # raw_regions holds one corrected fragment per line, in the same order
        # feature_dict is iterated (written by correct_reference with
        # raw_seq_per_line=True).
        with open(raw_regions, "r") as in_fd:
            for region_id in feature_dict:
                seq = ""
                for i in range(0, len(feature_dict[region_id])):
                    seq_fragment = in_fd.readline().strip()
                    # Annotated length minus corrected length not divisible by 3
                    # -> this fragment carries a frameshift.
                    if ((int(feature_dict[region_id][i][2]) - int(feature_dict[region_id][i][1]) + 1)
                            - len(seq_fragment)) % 3 != 0:
                        if region_id not in region_with_frameshift:
                            region_with_frameshift[region_id] = [i]
                        else:
                            region_with_frameshift[region_id].append(i)
                    seq += seq_fragment
                # Strand is taken from the region's first fragment (index 3).
                yield SeqRecord(seq=Seq(seq) if feature_dict[region_id][0][3] == "+" else Seq(seq).reverse_complement(),
                                id=region_id,
                                description="")

    SeqIO.write(new_regions_generator(), final_regions, format="fasta")
    region_with_frameshift.write(regions_with_frameshift_file, splited_values=True)
def get_monomer_len_file_from_trf_gff(trf_gff, len_file):
    """Write an ID -> monomer length ("Period" attribute) table for every
    record of a TRF gff to len_file.

    :param trf_gff: input TRF gff file
    :param len_file: output file written via SynDict.write
    """
    len_dict = SynDict()
    with open(trf_gff, "r") as trf_fd:
        for line in trf_fd:
            # Skip comment/header lines; startswith is safe even on an
            # empty final line, unlike indexing line[0].
            if line.startswith("#"):
                continue
            description_dict = AnnotationsRoutines.get_description_dict_from_gff_string(line)
            len_dict[description_dict["ID"]] = description_dict["Period"]
    len_dict.write(len_file)
#!/usr/bin/env python
"""Extract transcripts listed in an id file from a gff annotation."""
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    help="Gff file with annotations to extract")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff",
                    help="Output gff file with extracted transcripts")
parser.add_argument("-d", "--ids_file", action="store", dest="ids_file",
                    help="File with ids of transcripts to extract")

args = parser.parse_args()

# Delegate the actual extraction to the shared routines module.
AnnotationsRoutines.extract_transcripts_by_ids(args.input_gff, args.ids_file,
                                               args.output_gff)
SequenceRoutines.extract_sequence_by_ids( output_pep, "%s.ids" % output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, output_swissprot_pfam_or_hints_supported_transcripts_longest_pep) SequenceRoutines.extract_sequence_by_ids( output_pep, "%s.ids" % output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, output_swissprot_pfam_and_hints_supported_transcripts_longest_pep) for id_file in output_swissprot_pfam_or_hints_supported_transcripts_ids, \ output_swissprot_pfam_and_hints_supported_transcripts_ids, \ "%s.ids" % output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \ "%s.ids" % output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence: out_pref = id_file[:-4] out_gff = "%s.gff" % out_pref AnnotationsRoutines.extract_transcripts_by_ids(output_gff, id_file, out_gff) for suffix in ".trimmed.cds", ".transcript": SequenceRoutines.extract_sequence_by_ids( "%s%s" % (args.output, suffix), id_file, "%s%s" % (out_pref, suffix)) HMMER3.intersect_ids_from_files( output_swissprot_pfam_or_hints_supported_transcripts_ids, cds_with_inframe_stop_codons_ids, output_swissprot_pfam_or_hints_supported_transcripts_inframe_stop_ids, mode="common") HMMER3.intersect_ids_from_files( output_swissprot_pfam_and_hints_supported_transcripts_ids, cds_with_inframe_stop_codons_ids, output_swissprot_pfam_and_hints_supported_transcripts_inframe_stop_ids,
#!/usr/bin/env python
"""Fix the coordinate order (start/end) of records in a gff file."""
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    required=True,
                    help="input gff file")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff",
                    required=True,
                    help="Output fixed gff file")

args = parser.parse_args()

# All work happens in the shared routines module.
AnnotationsRoutines.fix_gff_coordinates_order(args.input_gff, args.output_gff)
#!/usr/bin/env python
"""Build a transcript-to-peptide accordance table from a gtf file."""
__author__ = 'Sergei F. Kliver'

import os
import argparse

from Bio import SeqIO

from Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-g", "--gtf_file", action="store", dest="input",
                    required=True,
                    help="Input gtf file")
parser.add_argument("-o", "--output", action="store", dest="output",
                    required=True,
                    help="Output accordance file")

args = parser.parse_args()

# Lines beginning with "#" in the gtf are treated as comments.
AnnotationsRoutines.get_transcript_to_pep_accordance_from_gtf(args.input,
                                                              args.output,
                                                              comment_symbol="#")
"""Replace region (chromosome/scaffold) names in a gff using a synonym table."""
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    required=True,
                    help="Input .gff file")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff",
                    required=True,
                    help="Output .gff file")
parser.add_argument("-s", "--syn_file_file", action="store", dest="syn_file",
                    required=True,
                    help="File with synonyms of region names")

args = parser.parse_args()

# Renaming itself is implemented in the shared routines module.
AnnotationsRoutines.replace_region_names_in_gff(args.input_gff, args.syn_file,
                                                args.output_gff)
"""Fill in a missing feature-type field in gff records with a fixed value."""
__author__ = 'Sergei F. Kliver'

import argparse

from Routines import AnnotationsRoutines

parser = argparse.ArgumentParser()

parser.add_argument("-i", "--input_gff", action="store", dest="input_gff",
                    required=True,
                    help="input gff file")
parser.add_argument("-o", "--output_gff", action="store", dest="output_gff",
                    required=True,
                    help="Output fixed gff file")
parser.add_argument("-f", "--feature_type", action="store", dest="feature_type",
                    required=True,
                    help="Feature type to use")

args = parser.parse_args()

# The fix itself lives in the shared routines module.
AnnotationsRoutines.fix_absent_feature_type_field(args.input_gff,
                                                  args.output_gff,
                                                  args.feature_type)