def __init__(self, cmd, path="", max_threads=4, jar_path=None, jar=None,
             max_memory="500m", timelog="tool_time.log"):
    """Store launch settings for an external command-line tool wrapper.

    cmd         -- name of the executable to run.
    path        -- directory containing the executable; normalized via
                   check_path (presumably adds a trailing slash - as used
                   elsewhere in this file; confirm in FileRoutines).
    max_threads -- number of threads the tool may use.
    jar_path    -- directory with the tool's jar (Java tools only); left as
                   None when not given.
    jar         -- jar file name (Java tools only).
    max_memory  -- JVM heap limit string, e.g. "500m".
    timelog     -- file to which run timings are appended.
    """
    SequenceRoutines.__init__(self)
    # Normalize directory-style arguments; jar_path may legitimately be None.
    self.path = self.check_path(path)
    self.jar_path = self.check_path(jar_path) if jar_path else None
    self.cmd = cmd
    self.jar = jar
    self.threads = max_threads
    self.max_memory = max_memory
    self.timelog = timelog
def __init__(self):
    """Define zero-based column indices for GFF3 and BED records."""
    SequenceRoutines.__init__(self)
    # GFF3 columns in file order (0-8).
    (self.GFF_SCAFFOLD_COLUMN,
     self.GFF_SOURCE_COLUMN,
     self.GFF_FEATURETYPE_COLUMN,
     self.GFF_START_COLUMN,
     self.GFF_END_COLUMN,
     self.GFF_SCORE_COLUMN,
     self.GFF_STRAND_COLUMN,
     self.GFF_PHASE_COLUMN,
     self.GFF_ATTRIBUTE_COLUMN) = range(9)
    # BED columns in file order (0-2).
    (self.BED_SCAFFOLD_COLUMN,
     self.BED_START_COLUMN,
     self.BED_END_COLUMN) = range(3)
def extract_sequences_from_selected_clusters(self, clusters_id_file, cluster_file, seq_file,
                                             output_dir="./", seq_format="fasta",
                                             out_prefix=None, create_dir_for_each_cluster=False,
                                             skip_cluster_if_no_sequence_for_element=True):
    """
    Extract the sequences of each selected cluster into a separate file.

    clusters_id_file -- optional file with ids of clusters to extract; if not
                        set, all clusters from cluster_file are processed.
    cluster_file     -- cluster composition file readable by SynDict
                        (values comma-separated).
    seq_file         -- file(s)/directory with the source sequences.
    output_dir       -- directory for output files; created if absent.
    seq_format       -- Biopython format of input and output sequence files.
    out_prefix       -- if set, every output file is named <out_prefix>.fasta
                        and a separate directory is forced for each cluster
                        (otherwise the files would overwrite each other).
    create_dir_for_each_cluster -- put each cluster's file in its own subdir.
    skip_cluster_if_no_sequence_for_element -- skip clusters for which at
                        least one member sequence is missing.

    Returns the number of skipped clusters.
    """
    from Routines import SequenceRoutines, FileRoutines
    cluster_id_list = IdList()
    cluster_dict = SynDict()

    FileRoutines.safe_mkdir(output_dir)
    out_dir = FileRoutines.check_path(output_dir)
    # An explicit out_prefix implies per-cluster directories (see docstring).
    create_directory_for_each_cluster = True if out_prefix else create_dir_for_each_cluster
    if clusters_id_file:
        cluster_id_list.read(clusters_id_file)
    cluster_dict.read(cluster_file, split_values=True, values_separator=",")
    protein_dict = SeqIO.index_db("tmp.idx",
                                  FileRoutines.make_list_of_path_to_files(seq_file),
                                  format=seq_format)

    number_of_skipped_clusters = 0
    cluster_ids = cluster_id_list if clusters_id_file else cluster_dict
    try:
        for fam_id in cluster_ids:
            if fam_id not in cluster_dict:
                # Ids without a cluster entry are ignored; previously this
                # raised KeyError inside the absence check below.
                continue
            if skip_cluster_if_no_sequence_for_element:
                absent_elements = self.check_absence_of_cluster_elements(cluster_dict[fam_id],
                                                                         protein_dict)
                if absent_elements:
                    print("Skipping cluster %s due to absent element(%s)" % (fam_id,
                                                                             ",".join(absent_elements)))
                    number_of_skipped_clusters += 1
                    continue
            if create_directory_for_each_cluster:
                fam_dir = "%s%s/" % (out_dir, fam_id)
                FileRoutines.safe_mkdir(fam_dir)
                out_file = "%s%s.fasta" % (fam_dir, out_prefix if out_prefix else fam_id)
            else:
                # out_dir already ends with a separator (check_path), so no
                # extra "/" is inserted here.
                out_file = "%s%s.fasta" % (out_dir, out_prefix if out_prefix else fam_id)

            SeqIO.write(SequenceRoutines.record_by_id_generator(protein_dict,
                                                                cluster_dict[fam_id],
                                                                verbose=True),
                        out_file, format=seq_format)
    finally:
        # Always drop the temporary sequence index, even on failure.
        os.remove("tmp.idx")

    total_clusters = len(cluster_id_list) if clusters_id_file else len(cluster_dict)
    print("%i of %i clusters were skipped due to absent elements" % (number_of_skipped_clusters,
                                                                     total_clusters))
    return number_of_skipped_clusters
def __init__(self, cmd, path="", max_threads=4, jar_path="", jar=None, max_memory="500m",
             max_per_thread_memory="500m", timelog=None, tmp_dir=None):
    """Store launch settings for an external command-line tool wrapper.

    cmd                   -- name of the executable to run.
    path                  -- directory containing the executable (normalized
                             via check_path).
    max_threads           -- number of threads the tool may use.
    jar_path              -- directory with the tool's jar (Java tools only);
                             kept as "" when empty.
    jar                   -- jar file name (Java tools only).
    max_memory            -- total JVM heap limit string, e.g. "500m".
    max_per_thread_memory -- per-thread memory limit string.
    timelog               -- optional file for run timings.
    tmp_dir               -- optional directory for temporary files.
    """
    SequenceRoutines.__init__(self)
    self.cmd = cmd
    self.path = self.check_path(path)
    self.threads = max_threads
    # An empty jar_path is preserved as "" rather than passed to check_path.
    self.jar_path = self.check_path(jar_path) if jar_path else ""
    self.jar = jar
    self.max_memory = max_memory
    self.max_per_thread_memory = max_per_thread_memory
    self.timelog = timelog
    self.tmp_dir = tmp_dir
from Routines.Matplotlib import MatplotlibRoutines
from Routines.Annotations import AnnotationsRoutines
from Routines.Phylogenetics import PhylogeneticsRoutines
from Routines.SequenceCluster import SequenceClusterRoutines
from Routines.MultipleAlignment import MultipleAlignmentRoutines

# Module-level singletons: each *Routines name is rebound from the class to a
# ready-to-use instance of that class, so callers can write e.g.
# FileRoutines.check_path(...) without instantiating anything themselves.
# NOTE(review): after these lines the classes themselves are no longer
# reachable under their original names from this module.
GORoutines = GORoutines()
VCFRoutines = VCFRoutines()
MathRoutines = MathRoutines()
FileRoutines = FileRoutines()
TreeRoutines = TreeRoutines()
NCBIRoutines = NCBIRoutines()
MtDNARoutines = MtDNARoutines()
FastQRoutines = FastQRoutines()
SmoothRoutines = SmoothRoutines()
PrimerRoutines = PrimerRoutines()
EggNOGRoutines = EggNOGRoutines()
ProjectRoutines = ProjectRoutines()
EnsemblRoutines = EnsemblRoutines()
TreeFamRoutines = TreeFamRoutines()
DrawingRoutines = DrawingRoutines()
SequenceRoutines = SequenceRoutines()
EvolutionRoutines = EvolutionRoutines()
AlignmentRoutines = AlignmentRoutines()
ExpressionRoutines = ExpressionRoutines()
MatplotlibRoutines = MatplotlibRoutines()
AnnotationsRoutines = AnnotationsRoutines()
PhylogeneticsRoutines = PhylogeneticsRoutines()
SequenceClusterRoutines = SequenceClusterRoutines()
MultipleAlignmentRoutines = MultipleAlignmentRoutines()
# Command-line options for the sequence-length report script.
parser.add_argument("-f", "--format", action="store", dest="format", default="fasta",
                    help="format of file with sequences - default: fasta.")
parser.add_argument("-i", "--input", action="store", dest="input",
                    help="file with sequences")
parser.add_argument("-o", "--output", action="store", dest="output", default="out.t",
                    help="output file - default: out.t.")
args = parser.parse_args()

# Per-sequence lengths go to the chosen file (or stdout); summary statistics
# are always printed to stdout.
out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")
record_dict = SeqIO.index_db("temp_index.idx", [args.input], format=args.format)
try:
    lengths_dict = SequenceRoutines.get_lengths(record_dict, out_file=out_fd)
    print("Longest sequence: %i" % max(lengths_dict.values()))
    print("Shortest sequence: %i" % min(lengths_dict.values()))
    print("Total length: %i" % sum(lengths_dict.values()))
finally:
    # Close the report file (never stdout) and remove the temporary index
    # even if length extraction fails.
    if out_fd is not sys.stdout:
        out_fd.close()
    os.remove("temp_index.idx")
def __init__(self):
    """Initialize shared sequence-handling state from the parent class."""
    SequenceRoutines.__init__(self)
def __init__(self):
    """Reference tables for mitochondrial gene naming.

    mithochondrion_synonym_table: each inner list starts with the canonical
    gene name followed by synonym/spelling variants observed in real GenBank
    annotations (misspellings are included deliberately so they can be
    matched and normalized).
    protein_gene_list / rRNA_gene_list: canonical names by feature class.
    """
    SequenceRoutines.__init__(self)
    self.mithochondrion_synonym_table = [
        ["12S_rRNA", "12S rRNA", "12S ribosomal RNA", "small subunit ribosomal RNA",
         "s-rRNA", "s-rRNA; 12S ribosomal RNA", "small ribosomal RNA subunit RNA",
         "12S ribosomal RNA", "12S ribosomal RNA subunit", "12S rivbosomal RNA",
         "12S ribosamal RNA", "l2S ribosomal RNA", "12 ribosomal RNA",
         # FIX: a missing comma previously fused the next two entries into the
         # single bogus synonym "12S ribosormal RNA,12 rRNA"
         "12S ribosormal RNA", "12 rRNA", "s-RNA"],
        ["16S_rRNA", "16S rRNA", "16S ribosomal RNA", "large subunit ribosomal RNA",
         "l-rRNA", "l-rRNA; 16S ribosomal RNA", "large ribosomal RNA subunit RNA",
         "16S ribosomal RNA", "16S ribosomal RNA subunit", "16S rivbosomal RNA",
         "16S ribosamal RNA", "l6S ribosomal RNA", "16 ribosomal RNA",
         "16S ribosormal RNA", "16 rRNA", "l-RNA"],
        ["ATP6", "atp6", "ATPase6", "ATPase 6", "ATPase subunit 6",
         "ATP synthase F0 subunit 6", "ATP synthetase F0 subunit 6",
         # FIX: a missing comma previously fused the next two entries into
         # "ATP synthase subunit 6ATPase subunits 6"
         "ATP synthase subunit 6", "ATPase subunits 6",
         "adenosine triphosphatase subunit 6", "ATPase subunit-6"],
        ["ATP8", "atp8", "ATPase8", "ATPase 8", "ATPase subunit 8",
         "ATP synthase F0 subunit 8", "ATP synthetase F0 subunit 8",
         "ATP synthase subunit 8", "ATPase subunits 8",
         "adenosine triphosphatase subunit 8", "adenosine triphosphate subunit 8",
         "ATPase subunit-8"],
        ["COX1", "COXI", "cytochrome c oxidase subunit 1",
         "cytochrome c oxidase subunit I", "Cytochrome c oxidase subunit 1",
         "cytochrome oxidase subunit I", "chytochrome c oxidase subunit I",
         "COI", "CO1", "CO 1", "CO I", "coi",
         "product: cytochrome c oxidase subunit I", "cytochrome oxidase subunit 1"],
        ["COX2", "COXII", "cytochrome c oxidase subunit 2",
         "cytochrome c oxidase subunit II", "Cytochrome c oxidase subunit 2",
         "cytochrome oxidase subunit II", "chytochrome c oxidase subunit II",
         "COII", "CO2", "CO 2", "CO II", "coii", "cytochrome oxidase subunit 2"],
        ["COX3", "COXIII", "cytochrome c oxidase subunit 3",
         "cytochrome c oxidase subunit III", "Cytochrome c oxidase subunit 3",
         "cytochrome oxidase subunit III", "chytochrome c oxidase subunit III",
         "COIII", "CO3", "CO 3", "CO III", "coiii", "cytochrome oxidase subunit 3"],
        ["CYTB", "cytochrome b", "Cytochrome b", "cytb", "Cytb", "Cyt b",
         "Cytochrome b apoenzyme", "cytochrome b apoenzyme",
         "cytochrome b; TAA stop codon appears afterpolyadenylation"],
        ["ND1", "nd1", "nd 1", "ND 1", "Nd 1", "NADH dehydrogenase subunit 1",
         "NADH hydrogenase subunit 1",
         "subunit 1 of the NADH ubiquinone oxidoreductase complex", "NADH-1", "NADH1"],
        ["ND2", "nd2", "nd 2", "ND 2", "Nd 2", "NADH dehydrogenase subunit 2",
         "NADH hydrogenase subunit 2",
         "subunit 2 of the NADH ubiquinone oxidoreductase complex",
         "#NADH dehydrogenase subunit 2", "NADH-2", "NADH2"],
        ["ND3", "nd3", "nd 3", "ND 3", "Nd 3", "NADH dehydrogenase subunit 3",
         "NADH hydrogenase subunit 3",
         "subunit 3 of the NADH ubiquinone oxidoreductase complex", "NADH-3", "NADH3"],
        ["ND4", "nd4", "nd 4", "ND 4", "Nd 4", "NADH dehydrogenase subunit 4",
         "NADH hydrogenase subunit 4",
         "subunit 4 of the NADH ubiquinone oxidoreductase complex", "NADH-4", "NADH4"],
        ["ND4L", "nd4l", "nd 4l", "ND 4l", "Nd 4l", "NADH dehydrogenase subunit 4L",
         "NADH hydrogenase subunit 4L", "NADH-4L", "NADH4L"],
        ["ND5", "nd5", "nd 5", "ND 5", "Nd 5", "NADH dehydrogenase subunit 5",
         "NADH hydrogenase subunit 5",
         "subunit 5 of the NADH ubiquinone oxidoreductase complex", "NADH-5", "NADH5"],
        ["ND6", "nd6", "nd 6", "ND 6", "Nd 6", "NADH dehydrogenase subunit 6",
         "NADH hydrogenase subunit 6",
         "subunit 6 of the NADH ubiquinone oxidoreductase complex", "NADH-6", "NADH6",
         "NADH dehydrogenase subunit-6"],
        ["tRNA-Val"], ["tRNA-Leu"], ["tRNA-Phe"], ["tRNA-Pro"], ["tRNA-Thr"],
        ["tRNA-Glu"], ["tRNA-Ser"], ["tRNA-His"], ["tRNA-Arg"], ["tRNA-Gly"],
        ["tRNA-Lys"], ["tRNA-Asp"], ["tRNA-Tyr"], ["tRNA-Cys"], ["tRNA-Asn"],
        ["tRNA-Ala"], ["tRNA-Trp"], ["tRNA-Met"], ["tRNA-Ile"], ["tRNA-Gln"]
    ]

    # Canonical names of the 13 mitochondrial protein-coding genes.
    self.protein_gene_list = [
        "ATP6", "ATP8",
        "COX1", "COX2", "COX3",
        "CYTB",
        "ND1", "ND2", "ND3", "ND4L", "ND4", "ND5", "ND6",
    ]
    # Canonical names of the mitochondrial rRNA genes.
    self.rRNA_gene_list = ["12S_rRNA", "16S_rRNA"]
) parser.add_argument( "-s", action="store_true", dest="use_strand", help="Define -l and -r based on strand. E.g. if used, -l 500 for a " "negative-stranded feature, it will start the flank 500 bp downstream. Default = false." ) args = parser.parse_args() record_dict = SeqIO.index_db("temp_index.idx", [args.input_fasta], format="fasta") SequenceRoutines.get_lengths(record_dict, out_file="fasta_lengths.t", write=True) if args.fraction_mode: left = float(args.left) right = float(args.right) else: left = int(args.left) right = int(args.right) Flank.get(args.bed, "fasta_lengths.t", left, right, fraction_mode=args.fraction_mode, strand_based=args.use_strand,
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files,
                                  output_dir, file_with_white_list_cluster_ids=None,
                                  mode="families", sequence_file_extension="fasta",
                                  sequence_file_format="fasta", label_species=False,
                                  separator_for_labeling="@", species_label_first=True):
    """
    Extract sequences grouped either per cluster ("families") or per species.

    basenames of cluster and sequence files must be same

    mode: clusters - extract sequences from clusters in separate files,
          species - extract sequences from species to separate files

    NOTE(review): the docstring above mentions mode "clusters" but the code
    below checks for "families" - confirm which value callers actually pass.
    """
    # Optional whitelist restricting which cluster ids are processed.
    white_list_ids = None
    if file_with_white_list_cluster_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_cluster_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

    sequence_super_dict = OrderedDict()
    out_dir = FileRoutines.check_path(output_dir)

    # Build one on-disk SeqIO index per species; the "<species>_tmp.idx"
    # files are removed at the end of the method.
    for species in clusters_dict:
        idx_file = "%s_tmp.idx" % species
        sequence_file = "%s%s.%s" % (FileRoutines.check_path(dir_with_sequence_files),
                                     species, sequence_file_extension)
        sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file,
                                                      format=sequence_file_format)

    if mode == "species":
        # One output file per species, containing that species' sequences.
        seqeuence_names = self.get_sequence_names(clusters_dict, write_ids=False,
                                                  out_prefix=None,
                                                  white_list_ids=white_list_ids)
        for species in seqeuence_names:
            out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
            SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
                                                                seqeuence_names[species]),
                        out_file, format=sequence_file_format)
    elif mode == "families":
        # One output file per cluster, pooling members from all species.
        def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
            # Yield the records of one cluster, optionally relabeling each
            # record id with its species name (order controlled by
            # species_label_first, joined by separator_for_labeling).
            if species_label_first:
                label_sequence = lambda label, name: "%s%s%s" % (label,
                                                                 separator_for_labeling,
                                                                 name)
            else:
                label_sequence = lambda label, name: "%s%s%s" % (name,
                                                                 separator_for_labeling,
                                                                 label)

            for species in seq_super_dict:
                #print species, cluster_id
                for record_id in clust_dict[species][cluster_id]:
                    if label_species:
                        # deepcopy so the cached/indexed record is not mutated.
                        record = deepcopy(seq_super_dict[species][record_id])
                        record.id = label_sequence(species, record_id)
                        yield record
                    else:
                        yield seq_super_dict[species][record_id]

        for cluster_name in cluster_names:
            out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
            SeqIO.write(per_family_record_generator(sequence_super_dict, clusters_dict,
                                                    cluster_name),
                        out_file, format=sequence_file_format)

    # Clean up the temporary per-species index files.
    for species in clusters_dict:
        os.remove("%s_tmp.idx" % species)