def intersect_ids(list_of_group_a, list_of_group_b, mode="common"):
    # possible modes: common, only_a, only_b, not_common, combine, count
    a = IdSet()
    b = IdSet()

    if mode == "common":
        expression = lambda a, b: a & b
    elif mode == "only_a":
        expression = lambda a, b: a - b
    elif mode == "only_b":
        expression = lambda a, b: b - a
    elif mode == "not_common":
        expression = lambda a, b: a ^ b
    elif mode == "combine":
        expression = lambda a, b: a | b
    elif mode != "count":
        # guard against a silently undefined `expression` for unknown modes
        raise ValueError("Unknown mode: %s" % mode)

    for id_list in list_of_group_a:
        a = a | IdSet(id_list)
    for id_list in list_of_group_b:
        b = b | IdSet(id_list)

    if mode != "count":
        return IdSet(expression(a, b))
    else:
        return len(a), len(b), len(a & b), len(a - b), len(b - a), len(a ^ b), len(a | b)
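def _demo_intersect_modes():
    # Minimal self-contained sketch of the mode semantics. IdSet is used with
    # plain set operators throughout this module, so builtin sets are enough
    # to illustrate what each mode returns.
    a = {"id1", "id2", "id3"}
    b = {"id2", "id3", "id4"}
    assert a & b == {"id2", "id3"}                # mode="common"
    assert a - b == {"id1"}                       # mode="only_a"
    assert b - a == {"id4"}                       # mode="only_b"
    assert a ^ b == {"id1", "id4"}                # mode="not_common"
    assert a | b == {"id1", "id2", "id3", "id4"}  # mode="combine"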
def get_cluster_names(clusters_dict, out_file=None, white_list_ids=None):
    cluster_names = IdSet()
    for species in clusters_dict:
        species_clusters = IdSet(clusters_dict[species].keys())
        cluster_names |= species_clusters
    if out_file:
        cluster_names.write(out_file)
    return (cluster_names & IdSet(white_list_ids)) if white_list_ids else cluster_names
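# Expected shape of clusters_dict, inferred from how these helpers index it:
# species -> {cluster (family) name -> list of member sequence ids}, e.g.
#
#   clusters_dict = {
#       "sp1": {"fam1": ["g1"], "fam2": ["g2", "g3"]},
#       "sp2": {"fam1": ["g4"], "fam2": ["g5"]},
#   }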
def intersect_ids_from_files(files_with_ids_from_group_a, files_with_ids_from_group_b,
                             result_file=None, mode="common", case_insensitive=False):
    a = IdSet()
    b = IdSet()

    if mode == "common":
        expression = lambda a, b: a & b
    elif mode == "only_a":
        expression = lambda a, b: a - b
    elif mode == "only_b":
        expression = lambda a, b: b - a
    elif mode == "not_common":
        expression = lambda a, b: a ^ b
    elif mode == "combine":
        expression = lambda a, b: a | b
    elif mode != "count":
        # guard against a silently undefined `expression` for unknown modes
        raise ValueError("Unknown mode: %s" % mode)

    #print(files_with_ids_from_group_a)
    for filename in [files_with_ids_from_group_a] if isinstance(files_with_ids_from_group_a, str) \
            else files_with_ids_from_group_a:
        id_set = IdSet()
        id_set.read(filename, comments_prefix="#",
                    expression=(lambda s: s.upper()) if case_insensitive else None)
        a = a | id_set

    for filename in [files_with_ids_from_group_b] if isinstance(files_with_ids_from_group_b, str) \
            else files_with_ids_from_group_b:
        id_set = IdSet()
        id_set.read(filename, comments_prefix="#",
                    expression=(lambda s: s.upper()) if case_insensitive else None)
        b = b | id_set

    result_fd = open(result_file, "w") if result_file else sys.stdout

    if mode != "count":
        final_set = IdSet(expression(a, b))
        final_set.write(result_fd)
    else:
        result_fd.write("Group_A\t%i\nGroup_B\t%i\nCommon\t%i\nOnly_group_A\t%i\n"
                        "Only_group_B\t%i\nNot_common\t%i\nAll\t%i\n"
                        % (len(a), len(b), len(a & b), len(a - b),
                           len(b - a), len(a ^ b), len(a | b)))

    if result_file:
        # do not leak the descriptor when writing to a real file
        result_fd.close()
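# Hypothetical invocation (file names are illustrative): print the seven
# summary counts for two groups of id files to stdout, matching ids
# case-insensitively:
#
#   intersect_ids_from_files(["a1.ids", "a2.ids"], "b.ids",
#                            mode="count", case_insensitive=True)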
def extract_transcripts_by_ids(self, input_gff, transcript_id_file, output_gff):
    transcript_ids = IdSet()
    transcript_ids.read(transcript_id_file, header=False)
    GFF.write(self.record_with_extracted_transcripts_generator(input_gff, transcript_ids),
              open(output_gff, "w"))
def rename_scaffolds_in_gff(self, input_gff, syn_file, output_prefix, verbose=True):
    syn_dict = SynDict(filename=syn_file)
    skipped_id_list = IdSet()

    output_gff = "%s.renamed.gff" % output_prefix
    skipped_gff = "%s.skipped.gff" % output_prefix
    skipped_id_file = "%s.skipped_scaffolds.ids" % output_prefix

    with self.metaopen(input_gff, "r") as in_fd, \
            self.metaopen(output_gff, "w") as out_fd, \
            self.metaopen(skipped_gff, "w") as skipped_fd:
        for line in in_fd:
            if line[0] == "#":
                out_fd.write(line)
                continue  # comment lines must not fall through to the renaming code
            gff_list = line.split("\t")
            if gff_list[0] in syn_dict:
                gff_list[0] = syn_dict[gff_list[0]]
                out_fd.write("\t".join(gff_list))
            else:
                skipped_fd.write(line)
                skipped_id_list.add(gff_list[0])

    if verbose:
        print("Not renamed scaffolds: %i" % len(skipped_id_list))
    skipped_id_list.write(skipped_id_file)
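# The synonym table is the plain two-column mapping SynDict reads by default
# (assumed layout: old scaffold id, tab, new id); scaffolds absent from it are
# routed to the .skipped.gff and .skipped_scaffolds.ids files:
#
#   scaffold_1<TAB>chr1
#   scaffold_2<TAB>chr2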
def extract_monocluster_ids(self, clusters_dict, white_list_ids=None, out_file=None):
    """
    Extracts clusters with only one sequence in all species.
    """
    monocluster_ids = IdSet()
    cluster_names = self.get_cluster_names(clusters_dict)

    for cluster_name in cluster_names:
        for species in clusters_dict:
            if white_list_ids:
                if cluster_name not in white_list_ids:
                    break
            if cluster_name not in clusters_dict[species]:
                break
            if len(clusters_dict[species][cluster_name]) > 1:
                break
        else:
            monocluster_ids.add(cluster_name)

    if out_file:
        monocluster_ids.write(out_file)
    return monocluster_ids
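# Toy walk-through of the for/else above: a cluster is a monocluster only if
# the inner loop never breaks, i.e. every species contains it with exactly one
# member (and it passes the white list, when one is given):
#
#   clusters_dict = {"sp1": {"fam1": ["g1"], "fam2": ["g2", "g3"]},
#                    "sp2": {"fam1": ["g4"]}}
#   # "fam1" qualifies; "fam2" fails twice: sp1 has two members, sp2 lacks it.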
def get_sequence_names(clusters_dict, write_ids=False, out_prefix=None, white_list_ids=None):
    sequence_names_dict = SynDict()
    for species in clusters_dict:
        sequence_names_dict[species] = IdSet()

    for species in clusters_dict:
        for cluster_id in clusters_dict[species]:
            if white_list_ids:
                if cluster_id not in white_list_ids:
                    continue
            sequence_names_dict[species] = sequence_names_dict[species] | IdSet(clusters_dict[species][cluster_id])

    if write_ids:
        for species in clusters_dict:
            out_file = "%s_%s.ids" % (out_prefix, species) if out_prefix else "%s.ids" % species
            sequence_names_dict[species].write(out_file)
    return sequence_names_dict
def extract_monocluster_ids_from_file(self, dir_with_cluster_files, out_file,
                                      file_with_white_list_ids=None):
    # filenames are treated as species names
    white_list_ids = None
    if file_with_white_list_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    monoclusters = self.extract_monocluster_ids(clusters_dict, out_file=out_file,
                                                white_list_ids=white_list_ids)
    return monoclusters
def get_scaffold_ids_from_gff(gff_file, out_file=None):
    scaffold_id_set = IdSet()
    with open(gff_file, "r") as gff_fd:
        for line in gff_fd:
            if line[0] == "#":
                continue
            scaffold_id = line.split("\t")[0]
            scaffold_id_set.add(scaffold_id)
    if out_file:
        scaffold_id_set.write(out_file)
    return scaffold_id_set
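# Hypothetical usage (file names illustrative):
#
#   scaffold_ids = get_scaffold_ids_from_gff("annotation.gff",
#                                            out_file="annotation.scaffold.ids")
#
# The returned IdSet can be reused directly, as filter_reference() does below
# for its scaffolds-with-annotations check.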
def get_column_value_set_from_file(self, input_file, column_number, output_file=None,
                                   separator="\t", comments_prefix="#", verbose=False):
    column_value_set = IdSet([line_list[column_number]
                              for line_list in self.file_line_as_list_generator(input_file,
                                                                                separator=separator,
                                                                                comments_prefix=comments_prefix)])
    if output_file:
        column_value_set.write(output_file)
    if verbose:
        print("#Column %i (0-based) contains %i different values" % (column_number,
                                                                     len(column_value_set)))
    return column_value_set
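# Hypothetical usage: collect the distinct values of the third column
# (0-based index 2) of a tab-separated table, skipping "#"-comment lines:
#
#   value_set = self.get_column_value_set_from_file("table.tsv", 2, verbose=True)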
def predict_genes(self, output_prefix, annotation_species_prefix, genome_fasta,
                  augustus_species, output_directory="./", augustus_strand=None,
                  augustus_gene_model=None, augustus_config_dir=None,
                  augustus_use_softmasking=None, augustus_other_options="",
                  augustus_hintsfile=None, augustus_extrinsicCfgFile=None,
                  augustus_predict_UTR=None, augustus_min_intron_len=None, threads=1,
                  augustus_dir="", hmmer_dir="", blast_dir="",
                  stop_codons_list=("TGA", "TAA", "TAG"), genetic_code_table=1):
    draft_file_prefix = "%s/raw/%s" % (output_directory, output_prefix)

    augustus_splited_input_dir = "%s/splited_input/" % output_directory
    augustus_splited_output_dir = "%s/splited_output_dir" % output_directory

    output_raw_gff = "%s.raw.gff" % draft_file_prefix
    output_gff = "%s.renamed.gff" % draft_file_prefix
    augustus_pep = "%s.pep" % draft_file_prefix

    AUGUSTUS.path = augustus_dir
    AUGUSTUS.threads = threads
    HMMER3.path = hmmer_dir
    HMMER3.threads = threads
    BLASTp.path = blast_dir
    BLASTp.threads = threads

    print("Annotating genes...")
    AUGUSTUS.parallel_predict(augustus_species, genome_fasta, output_raw_gff,
                              strand=augustus_strand, gene_model=augustus_gene_model,
                              output_gff3=True, other_options=augustus_other_options,
                              config_dir=augustus_config_dir,
                              use_softmasking=augustus_use_softmasking,
                              hints_file=augustus_hintsfile,
                              split_dir=augustus_splited_input_dir,
                              splited_output_dir=augustus_splited_output_dir,
                              extrinsicCfgFile=augustus_extrinsicCfgFile,
                              predict_UTR=augustus_predict_UTR,
                              combine_output_to_single_file=True,
                              min_intron_len=augustus_min_intron_len)

    # replace_augustus_ids(augustus_gff, output_prefix, species_prefix=None, number_of_digits_in_id=8)
    AUGUSTUS.replace_augustus_ids(output_raw_gff, draft_file_prefix,
                                  species_prefix=annotation_species_prefix,
                                  number_of_digits_in_id=8)

    # extract_transcript_sequences(self, input_gff_file, genomic_fasta_file, output_prefix, coding_only=False)
    gffread_file_prefix = "%s.gffread" % draft_file_prefix
    gffread_transcripts_file, gffread_cds_file, gffread_pep_file = \
        Gffread.extract_transcript_sequences(output_gff, genome_fasta, gffread_file_prefix)

    gffread_trimmed_cds = ".".join(gffread_cds_file.split(".")[:-1]) + ".trimmed.cds"
    gffread_trimmed_pep = ".".join(gffread_pep_file.split(".")[:-1]) + ".trimmed.pep"

    # Uses the default stop codons of the universal genetic code; note that
    # this will affect mtDNA proteins.
    self.trim_cds_and_remove_terminal_stop_codons(gffread_cds_file, gffread_trimmed_cds,
                                                  stop_codons_list=stop_codons_list)

    inframe_stop_codons_file_prefix = "%s.inframe_stop_codon" % draft_file_prefix
    # Universal genetic code unless genetic_code_table is overridden!
    self.translate_sequences_from_file(gffread_trimmed_cds, gffread_trimmed_pep,
                                       format="fasta", id_expression=None,
                                       genetic_code_table=genetic_code_table,
                                       translate_to_stop=False,
                                       prefix_of_file_inframe_stop_codons_seqsin=inframe_stop_codons_file_prefix)
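# Hypothetical invocation; every path, prefix, and species below is
# illustrative, not a tested configuration:
#
#   self.predict_genes("draft1", "MYSP", "genome.fasta", "human",
#                      output_directory="annotation", threads=16,
#                      augustus_use_softmasking=True)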
AUGUSTUS.extract_gene_ids_from_output(output_gff, all_annotated_genes_ids)
AUGUSTUS.extract_CDS_annotations_from_output(output_gff, CDS_gff)

print("Extracting peptides...")
AUGUSTUS.extract_proteins_from_output(output_gff, output_pep, id_prefix="",
                                      evidence_stats_file=output_evidence_stats,
                                      supported_by_hints_file=output_supported_stats)

self.compare_sequences_from_files(output_pep, "%s.trimmed.pep" % args.output,
                                  "comparison_of_peptides", format="fasta", verbose=True)

os.system("awk -F'\\t' 'NR==1 {}; NR > 1 {print $2}' %s > %s"
          % (output_supported_stats, output_supported_stats_ids))

print("Annotating domains (Pfam database)...")
HMMER3.parallel_hmmscan(args.pfam_db, output_pep, output_hmmscan,
                        num_of_seqs_per_scan=None,
                        split_dir="splited_hmmscan_fasta/",
                        splited_output_dir="splited_hmmscan_output_dir",
                        tblout_outfile=None, domtblout_outfile=output_domtblout,
                        pfamtblout_outfile=None, splited_tblout_dir=None,
                        splited_domtblout_dir="hmmscan_domtblout/")
HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout,
                                           output_pfam_annotated_dom_ids)
hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(output_domtblout,
                                                         output_pfam_annotated_dom_names)
supported_ids = IdSet(hits_dict.keys())
supported_ids.write(output_pfam_supported_transcripts_ids)
remove_transcript_ids_str = r"sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
    output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids)
os.system(remove_transcript_ids_str)

print("Annotating peptides (Swissprot database)...")
BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001,
                       output_format=6, outfile=output_swissprot_blastp_hits,
                       split_dir="splited_blastp_fasta",
                       splited_output_dir="splited_blastp_output_dir")
hits_dict = BLASTp.extract_hits_from_tbl_output(output_swissprot_blastp_hits,
                                                output_swissprot_blastp_hits_names)
supported_ids = IdSet(hits_dict.keys())
supported_ids.write(output_swissprot_supported_transcripts_ids)
remove_transcript_ids_str = r"sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
    output_swissprot_supported_transcripts_ids, output_swissprot_supported_genes_ids)
os.system(remove_transcript_ids_str)
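# The sed call collapses transcript ids to gene ids by stripping the AUGUSTUS
# ".t<number>" suffix, e.g. "MYSP_G00000001.t1" -> "MYSP_G00000001" (the id
# itself is illustrative); sort | uniq then deduplicates genes supported by
# several transcripts.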
accordance_file = "%s/%s.accordance" % (args.accordance_dir, species) accordance_dict[species] = SynDict() accordance_dict[species].read(accordance_file, key_index=1, value_index=0) if args.name_first: def split_name(pep_name): gene_list = pep_name.split(args.name_separator) return gene_list[0], args.name_separator.join(gene_list[1:]) else: def split_name(pep_name): gene_list = pep_name.split(args.name_separator) return gene_list[-1], args.name_separator.join(gene_list[:-1]) families_with_errors = IdSet() for family in pep_fam_dict: cds_fam_dict[family] = [] for pep in pep_fam_dict[family]: species, pep_name = split_name(pep) if pep_name in accordance_dict[species]: cds_name = "%s%s%s" % (species, args.name_separator, accordance_dict[species][pep_name]) if args.name_first else \ "%s%s%s" % (accordance_dict[species][pep_name], args.name_separator, species) cds_fam_dict[family].append(cds_name) else: print("%s %s %s doesn't have associated cds in accordance file" % (family, species, pep_name)) families_with_errors.add(family) for family in families_with_errors: cds_fam_dict.pop(family, None)
          (output_supported_stats, output_supported_stats_ids))

if args.pfam_db:
    print("Annotating domains (Pfam database)...")
    HMMER3.threads = args.threads
    HMMER3.parallel_hmmscan(args.pfam_db, output_pep, output_prefix_hmmscan,
                            "./",  # output_hmmscan
                            num_of_seqs_per_scan=None)  # TODO: check these arguments
    HMMER3.extract_dom_ids_hits_from_domtblout(output_domtblout,
                                               output_pfam_annotated_dom_ids)
    hits_dict = HMMER3.extract_dom_names_hits_from_domtblout(output_domtblout,
                                                             output_pfam_annotated_dom_names)
    supported_ids = IdSet(hits_dict.keys())
    supported_ids.write(output_pfam_supported_transcripts_ids)
    remove_transcript_ids_str = r"sed -re 's/\.t[0123456789]+//' %s | sort -k 1 | uniq > %s" % (
        output_pfam_supported_transcripts_ids, output_pfam_supported_genes_ids)
    os.system(remove_transcript_ids_str)

if args.swissprot_db:
    print("Annotating peptides (Swissprot database)...")
    BLASTp.threads = args.threads
    BLASTp.parallel_blastp(output_pep, args.swissprot_db, evalue=0.0000001,
                           output_format=6, outfile=output_swissprot_blastp_hits,
                           split_dir="splited_blastp_fasta",
                           splited_output_dir="splited_blastp_output_dir")
def filter_reference(self, reference, repeatmasking_gff_list, output_prefix,
                     reference_len_file=None, annotation_gff=None, max_masked_fraction=0.8,
                     white_scaffold_list=(), black_scaffold_list=(), max_length=None):
    scaffold_with_annotation_file = "%s.scaffolds_with_annotations.ids" % output_prefix
    sorted_combined_repeatmasking_gff = "%s.sorted_combined_repeatmasking.gff" % output_prefix
    reference_len_filename = "%s.reference.len" % output_prefix if reference_len_file is None else reference_len_file
    repeatmasking_coverage_file = "%s.repeatmasking.coverage" % output_prefix
    filtering_log_file = "%s.filtering.log" % output_prefix
    filtered_scaffolds_file = "%s.filtered.fasta" % output_prefix
    filtered_out_scaffolds_file = "%s.filtered_out.fasta" % output_prefix
    filtered_out_scaffolds_id_file = "%s.filtered_out.ids" % output_prefix

    scaffold_with_annotation_set = self.get_scaffold_ids_from_gff(annotation_gff,
                                                                  out_file=scaffold_with_annotation_file)

    print("Sorting GFFs with masking...")
    sorting_string = "cat %s | sort -k1,1 -k4,4n -k5,5n > %s" % (
        repeatmasking_gff_list if isinstance(repeatmasking_gff_list, str)
        else " ".join(repeatmasking_gff_list),
        sorted_combined_repeatmasking_gff)
    self.execute(options=sorting_string, cmd="")

    print("Parsing reference...")
    reference_dict = self.parse_seq_file(reference, mode="parse")

    if reference_len_file is None:
        length_dict = self.get_lengths(reference_dict, out_file=reference_len_filename)
    else:
        length_dict = SynDict(filename=reference_len_filename)

    print("Calculating fraction of masked regions...")
    GenomeCov.get_coverage_for_gff(sorted_combined_repeatmasking_gff, reference_len_filename,
                                   output=repeatmasking_coverage_file)

    print("Filtering...")
    low_zero_coverage_fraction_dict = SynDict(filename=repeatmasking_coverage_file,
                                              key_index=0, value_index=4,
                                              include_line_expression=lambda l: l.split("\t")[1] == "0",
                                              expression=float,
                                              include_value_expression=lambda v: v < (1.0 - max_masked_fraction))
    #print(low_zero_coverage_fraction_dict)
    scaffold_to_remove = IdSet()
    with open(filtering_log_file, "w") as log_fd:
        log_fd.write("#Scaffold\tStatus\tLength\tDescription\n")
        for scaffold in reference_dict:
            if scaffold in black_scaffold_list:
                scaffold_to_remove.add(scaffold)
                log_fd.write("%s\tRemoved\t%i\tBlackList\n" % (scaffold, length_dict[scaffold]))
                continue
            if scaffold in low_zero_coverage_fraction_dict:
                if scaffold in white_scaffold_list:
                    log_fd.write("%s\tRetained\t%i\tWhiteList,LowNonMaskedPercentage:%f\n"
                                 % (scaffold, length_dict[scaffold],
                                    low_zero_coverage_fraction_dict[scaffold]))
                    continue
                if scaffold in scaffold_with_annotation_set:
                    log_fd.write("%s\tRetained\t%i\tWithAnnotations,LowNonMaskedPercentage:%f\n"
                                 % (scaffold, length_dict[scaffold],
                                    low_zero_coverage_fraction_dict[scaffold]))
                    continue
                if max_length is not None:
                    if length_dict[scaffold] > max_length:
                        log_fd.write("%s\tRetained\t%i\tLong,LowNonMaskedPercentage:%f\n"
                                     % (scaffold, length_dict[scaffold],
                                        low_zero_coverage_fraction_dict[scaffold]))
                        continue
                scaffold_to_remove.add(scaffold)
                log_fd.write("%s\tRemoved\t%i\tLowNonMaskedPercentage:%f\n"
                             % (scaffold, length_dict[scaffold],
                                low_zero_coverage_fraction_dict[scaffold]))
                continue
            log_fd.write("%s\tRetained\t%i\tOK\n" % (scaffold, length_dict[scaffold]))

    scaffold_to_remove.write(filename=filtered_out_scaffolds_id_file)
    SeqIO.write(self.record_by_id_generator(reference_dict, scaffold_to_remove),
                filtered_out_scaffolds_file, format="fasta")
    SeqIO.write(self.record_by_id_generator(reference_dict,
                                            IdSet(reference_dict.keys()) - scaffold_to_remove),
                filtered_scaffolds_file, format="fasta")

    print("Total scaffolds\t%i\nRemoved\t%i\n" % (len(reference_dict), len(scaffold_to_remove)))
def extract_sequences_by_clusters(self, dir_with_cluster_files, dir_with_sequence_files,
                                  output_dir, file_with_white_list_cluster_ids=None,
                                  mode="families", sequence_file_extension="fasta",
                                  sequence_file_format="fasta", label_species=False,
                                  separator_for_labeling="@", species_label_first=True):
    """
    Basenames of cluster and sequence files must be the same.

    mode:
        families - write sequences of each cluster to a separate file
        species  - write sequences of each species to a separate file
    """
    # TODO: test before usage; was significantly changed without testing
    white_list_ids = None
    if file_with_white_list_cluster_ids:
        white_list_ids = IdSet()
        white_list_ids.read(file_with_white_list_cluster_ids)

    clusters_dict = self.read_cluster_files_from_dir(dir_with_cluster_files)
    cluster_names = self.get_cluster_names(clusters_dict, white_list_ids=white_list_ids)

    sequence_super_dict = OrderedDict()
    out_dir = self.check_path(output_dir)

    for species in clusters_dict:
        #idx_file = "%s_tmp.idx" % species
        sequence_file = "%s%s.%s" % (self.check_path(dir_with_sequence_files),
                                     species, sequence_file_extension)
        sequence_super_dict[species] = CollectionSequence(in_file=sequence_file,
                                                          format=sequence_file_format,
                                                          parsing_mode="parse")
        #sequence_super_dict[species] = SeqIO.index_db(idx_file, sequence_file, format=sequence_file_format)

    if mode == "species":
        sequence_names = self.get_sequence_names(clusters_dict, write_ids=False,
                                                 out_prefix=None, white_list_ids=white_list_ids)
        for species in sequence_names:
            out_file = "%s%s.%s" % (out_dir, species, sequence_file_extension)
            sequence_super_dict[species].write(out_file, whitelist=sequence_names[species])
            #SeqIO.write(SequenceRoutines.record_by_id_generator(sequence_super_dict[species],
            #                                                    sequence_names[species]),
            #            out_file, format=sequence_file_format)
    elif mode == "families":
        if species_label_first:
            label_sequence = lambda label, name: "%s%s%s" % (label, separator_for_labeling, name)
        else:
            label_sequence = lambda label, name: "%s%s%s" % (name, separator_for_labeling, label)

        """
        def per_family_record_generator(seq_super_dict, clust_dict, cluster_id):
            for species in seq_super_dict:
                #print species, cluster_id
                for record_id in clust_dict[species][cluster_id]:
                    if label_species:
                        record = deepcopy(seq_super_dict[species][record_id])
                        record.id = label_sequence(species, record_id)
                        yield record
                    else:
                        yield seq_super_dict[species][record_id]
        """
        for cluster_name in cluster_names:
            out_file = "%s%s.%s" % (out_dir, cluster_name, sequence_file_extension)
            # open explicitly for writing: metaopen's default mode would not allow it
            with self.metaopen(out_file, "w") as out_fd:
                for species in sequence_super_dict:
                    sequence_super_dict[species].write(out_fd, keep_file_open=True,
                                                       whitelist=clusters_dict[species][cluster_name],
                                                       out_id_expression=label_sequence if label_species else None)
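# Labeling example with separator_for_labeling "@": species_label_first=True
# writes ids like "sp1@seq1"; species_label_first=False writes "seq1@sp1".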
                                                     (args.species_dir, species))
species_syn_dict.write("families_all_species.t", absent_symbol=".")

nonassembled = species_syn_dict.filter_by_line(filter_nonassembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")
nonassembled.write("not_assembled_families_in_all_species.t", absent_symbol=".")

complicated_families_dict = nonassembled.filter_by_line(filter_splited_to_several_fam)
complicated_families_dict.write("complicated_families.t", absent_symbol=".")

complicated_families_syn_dict = SynDict()
complicated_families_syn_ids = IdSet()
sl_keys = list(complicated_families_dict.sl_keys())
for sl_key in sl_keys:
    sp_set = set()
    for species in complicated_families_dict:
        if sl_key not in complicated_families_dict[species]:
            continue
        tmp = complicated_families_dict[species][sl_key].split(";")
        for i in range(0, len(tmp)):
            if "_" in tmp[i]:
                tmp[i] = tmp[i][2:]
            tmp[i] = tmp[i].split(",")
            for syn_id in tmp[i]:
                complicated_families_syn_ids.add(syn_id)
                sp_set.add(syn_id)
    complicated_families_syn_dict[sl_key] = sp_set
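# The parsing above implies cell values like "2_123,345;567" (assumed format:
# ";"-separated family entries, an optional two-character prefix before "_",
# and ","-separated ids), which would collect {"123", "345", "567"}.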
def prepare_template_for_popart(alignment_file, output_file, haplotype_fam_file=None,
                                traits_file=None, whitelist_file=None):
    from RouToolPa.Parsers.Sequence import CollectionSequence

    sequence_collection = CollectionSequence(in_file=alignment_file, parsing_mode="parse")
    sequence_collection.get_stats_and_features(count_gaps=False, sort=False)
    whitelist = IdSet(filename=whitelist_file)

    alignment_len = sequence_collection.seq_lengths["length"].unique()
    if len(alignment_len) > 1:
        raise ValueError("ERROR!!! Sequences in alignment have different lengths!")
    alignment_len = alignment_len[0]

    haplotype_selected_sequence_dict = SynDict()
    haplotypes_without_sequences_ids = IdList()
    traits_df = pd.read_csv(traits_file, sep="\t", index_col=0) if traits_file else pd.DataFrame()

    if haplotype_fam_file:
        haplotype_dict = SynDict(filename=haplotype_fam_file, split_values=True)
        for haplotype_id in haplotype_dict:
            for sequence_id in haplotype_dict[haplotype_id]:
                if sequence_id in sequence_collection.records:
                    haplotype_selected_sequence_dict[haplotype_id] = sequence_id
                    break
            else:
                haplotypes_without_sequences_ids.append(haplotype_id)
    else:
        haplotype_dict = dict([(entry, [entry]) for entry in sequence_collection.scaffolds])
        haplotype_selected_sequence_dict = dict([(entry, entry) for entry in sequence_collection.scaffolds])

    final_haplotype_set = (set(haplotype_selected_sequence_dict.keys()) & whitelist) if whitelist \
        else set(haplotype_selected_sequence_dict.keys())

    with open(output_file, "w") as out_fd:
        #out_fd.write("#NEXUS\nBEGIN TAXA;\nDIMENSIONS\nNTAX = %i;\nTAXLABELS\n%s\n;\nEND;\n\n" % (len(haplotype_selected_sequence_dict),
        #             "\n".join(haplotype_selected_sequence_dict.keys())))
        out_fd.write("#NEXUS\n\n")
        out_fd.write("BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n"
                     "\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n"
                     % (len(final_haplotype_set), alignment_len))
        out_fd.write("\tMATRIX\n")
        for haplotype_id in final_haplotype_set:
            out_fd.write("\t\t%s %s\n"
                         % (haplotype_id,
                            sequence_collection.records[haplotype_selected_sequence_dict[haplotype_id]]))
        out_fd.write("\t;\nEND;\n\n")

        if not traits_df.empty:
            traits_number = len(traits_df.columns)
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n"
                         "\tFormat labels=yes missing=? separator=Comma;\n".format(traits_number))
            out_fd.write("\tTraitLabels {0};\n".format(" ".join(traits_df.columns)))
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_set:
                out_fd.write("\t\t%s %s\n"
                             % (haplotype_id,
                                ",".join(map(str, traits_df.loc[haplotype_id]))
                                if haplotype_id in traits_df.index else ("0," * traits_number)[:-1]))
        else:
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n"
                         "\tFormat labels=yes missing=? separator=Comma;\n")
            out_fd.write("\tTraitLabels Area;\n")
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_set:
                out_fd.write("\t\t%s %i\n" % (haplotype_id, len(haplotype_dict[haplotype_id])))
        out_fd.write("\t;\nEND;\n\n")
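# Sketch of the NEXUS skeleton this writes for two haplotypes of length 6
# (ids and bases are illustrative; PopART reads the DATA block plus the
# optional TRAITS block):
#
#   #NEXUS
#
#   BEGIN DATA;
#       DIMENSIONS NTAX=2 NCHAR=6;
#       FORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;
#       MATRIX
#           hap1 ACGTAC
#           hap2 ACGTTC
#       ;
#   END;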
action="store", dest="extension", help="Extension of output files. Default: equal to -f") parser.add_argument( "-d", "--id_file", action="store", dest="id_file", help="File with groups of sequences to extract(.fam file).") args = parser.parse_args() FileRoutines.safe_mkdir(args.output) args.extension = args.extension if args.extension else args.format tmp_index_file = "temp.idx" #id_list = read_ids(args.id_file) id_list = IdSet(filename=args.id_file) sequence_groups_id = SynDict() sequence_groups_id.read(args.id_file, split_values=True) #print("Parsing %s..." % args.input_file) sequence_dict = SeqIO.index_db(tmp_index_file, args.input, format=args.format) for group in sequence_groups_id: SeqIO.write(SequenceRoutines.record_by_id_generator( sequence_dict, sequence_groups_id[group], verbose=True), "%s%s.%s" % (args.output, group, args.extension), format=args.format) os.remove(tmp_index_file)
help="Directory with families of species") """ parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file. Default: stdout") """ args = parser.parse_args() # run after scripts/expansion/compare_cluster.py # out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") species_syn_dict = TwoLvlDict() for species in args.species_list: species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species)) species_syn_dict.write("families_all_species.t", absent_symbol=".") not_assembled = species_syn_dict.filter_by_line(is_assembled) species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".") assembled_ids = IdSet(species_syn_dict.sl_keys()) assembled_ids.write("assembled_families.ids") not_assembled_ids = IdSet(not_assembled.sl_keys()) not_assembled_ids.write("non_assembled_families.ids") """ if args.output != "stdout": out_fd.close() """
def compare_multiple_genome_results(self, busco_file_list, output_prefix, label_list=None,
                                    black_scaffold_list=(), white_scaffold_list=()):
    busco_table_dict = OrderedDict()
    gene_id_dict = OrderedDict()
    counts_dict = OrderedDict()

    output_path_list = self.split_filename(output_prefix)
    pairwise_overlaps_dir = "%s/pairwise_overlaps/" % (output_path_list[0] if output_path_list[0] else ".")
    pairwise_overlap_counts_dir = "%s/pairwise_overlap_counts/" % (output_path_list[0] if output_path_list[0] else ".")
    self.safe_mkdir(pairwise_overlaps_dir)
    self.safe_mkdir(pairwise_overlap_counts_dir)

    labels_list = label_list if label_list else ["A%i" % i for i in range(1, len(busco_file_list) + 1)]

    for busco_table, label in zip(busco_file_list, labels_list):
        busco_table_dict[label] = BUSCOtable(in_file=busco_table,
                                             black_list=black_scaffold_list,
                                             white_list=white_scaffold_list)
        gene_id_dict[label] = OrderedDict()
        counts_dict[label] = OrderedDict()
        gene_id_dict[label], counts_dict[label] = busco_table_dict[label].count_statuses()

    # TODO: draw piecharts
    # TODO: count overlaps
    pairwise_overlap_dict = OrderedDict()
    count_pairwise_overlap_dict = OrderedDict()

    for label1 in labels_list:
        for label2 in labels_list:
            if label1 == label2:
                continue
            overlap_id = "%s_vs_%s" % (label1, label2)
            pairwise_overlap_dict[overlap_id] = TwoLvlDict()
            count_pairwise_overlap_dict[overlap_id] = TwoLvlDict()
            for status1 in self.status_list:
                key1 = "%s@%s" % (label1, status1)
                pairwise_overlap_dict[overlap_id][key1] = OrderedDict()
                count_pairwise_overlap_dict[overlap_id][key1] = OrderedDict()
                for status2 in self.status_list:
                    key2 = "%s@%s" % (label2, status2)
                    overlap = IdSet(gene_id_dict[label1][status1] & gene_id_dict[label2][status2])
                    pairwise_overlap_dict[overlap_id][key1][key2] = overlap
                    count_pairwise_overlap_dict[overlap_id][key1][key2] = len(overlap)
                    overlap.write("%s/%s.%s_vs_%s.ids" % (pairwise_overlaps_dir, output_prefix,
                                                          key1, key2))
            count_pairwise_overlap_dict[overlap_id].write("%s/%s.overlap.%s.tsv"
                                                          % (pairwise_overlap_counts_dir,
                                                             output_prefix, overlap_id))

    if 2 <= len(busco_file_list) <= 3:
        fig, subplot_list = plt.subplots(2, 2, figsize=(6, 6))
        plt.suptitle("Overlaps for BUSCO categories between assemblies/genomes")
        #print(subplot_list)
        for status, index in zip(self.status_list, range(0, 4)):
            plt.sca(subplot_list[index // 2][index % 2])
            plt.title(status)
            MatplotlibRoutines.venn_diagram_from_sets(gene_id_dict[labels_list[0]][status],
                                                      gene_id_dict[labels_list[1]][status],
                                                      set3=gene_id_dict[labels_list[2]][status] if len(labels_list) > 2 else None,
                                                      set_labels=labels_list,
                                                      set_colors=["red", "yellow", "green"],
                                                      output_prefix=None,
                                                      extensions=("png",),
                                                      title=None)
        plt.savefig("%s.venn.png" % output_prefix)
        plt.close()