def prepare_data_for_target_alignment(self, query_fasta, target_fasta, correspondence_file, out_dir,
                                      correspondence_query_column=0, correspondence_target_column=1):
    """
    Split query and target sequences into per-query FASTA files.

    For every query id listed in the correspondence file two files are
    written into out_dir:
        <out_dir>/<query_id>.query.fasta   - the query record itself
        <out_dir>/<query_id>.target.fasta  - all target records listed for it

    :param query_fasta: file with query sequences (read via self.parse_seq_file)
    :param target_fasta: file with target sequences (read via self.parse_seq_file)
    :param correspondence_file: table linking query ids to target ids;
                                repeated query ids are allowed
    :param out_dir: output directory, created with self.safe_mkdir
    :param correspondence_query_column: 0-based column holding query ids
    :param correspondence_target_column: 0-based column holding target ids
    :return: tuple of two sets: (query ids present in the correspondence file,
             ids from query_fasta absent from the correspondence file)
    """
    query_records = self.parse_seq_file(query_fasta, "parse")
    target_records = self.parse_seq_file(target_fasta, "parse")

    self.safe_mkdir(out_dir)

    query_to_targets = SynDict(filename=correspondence_file,
                               allow_repeats_of_key=True,
                               key_index=correspondence_query_column,
                               value_index=correspondence_target_column)

    for query_id in query_to_targets:
        # (record source, ids to extract, destination file) for this query
        extraction_jobs = (
            (query_records, [query_id], "%s/%s.query.fasta" % (out_dir, query_id)),
            (target_records, query_to_targets[query_id], "%s/%s.target.fasta" % (out_dir, query_id)),
        )
        for record_source, id_list, fasta_path in extraction_jobs:
            SeqIO.write(self.record_by_id_generator(record_source, id_list),
                        fasta_path, format="fasta")

    with_targets = set(query_to_targets.keys())
    all_queries = set(query_records.keys())
    without_targets = all_queries.difference(with_targets)

    return with_targets, without_targets
def prepare_template_for_popart(alignment_file, output_file, haplotype_fam_file=None,
                                traits_file=None, whitelist_file=None):
    """
    Write a NEXUS file (DATA block plus TRAITS block) from an alignment.

    One sequence is written per haplotype. If haplotype_fam_file is given,
    the first member of each haplotype that is present in the alignment
    represents it; haplotypes with no member in the alignment are skipped.
    Without haplotype_fam_file every sequence is its own haplotype.

    Haplotypes are written in sorted id order, identically in both blocks,
    so the output is reproducible between runs.

    :param alignment_file: alignment to read; all sequences must have the same length
    :param output_file: path of the NEXUS file to write
    :param haplotype_fam_file: optional file mapping haplotype id -> member sequence ids
    :param traits_file: optional tab-separated table (first column is the
                        haplotype id, header holds trait labels). Haplotypes
                        absent from the table get 0 for every trait. If not
                        set (or the table is empty) a single trait "Area"
                        equal to the number of haplotype members is written.
    :param whitelist_file: optional file with haplotype ids to keep;
                           an empty whitelist means "keep everything"
    :raises ValueError: if sequences in the alignment differ in length
    """
    from RouToolPa.Parsers.Sequence import CollectionSequence

    sequence_collection = CollectionSequence(in_file=alignment_file, parsing_mode="parse")
    sequence_collection.get_stats_and_features(count_gaps=False, sort=False)
    whitelist = IdSet(filename=whitelist_file)

    alignment_len = sequence_collection.seq_lengths["length"].unique()
    if len(alignment_len) > 1:
        raise ValueError("ERROR!!! Sequences in alignment have different lengths!")
    alignment_len = alignment_len[0]

    traits_df = pd.read_csv(traits_file, sep="\t", index_col=0) if traits_file else pd.DataFrame()

    if haplotype_fam_file:
        haplotype_dict = SynDict(filename=haplotype_fam_file, split_values=True)
        haplotype_selected_sequence_dict = SynDict()
        for haplotype_id in haplotype_dict:
            # The first member found in the alignment represents the haplotype.
            for sequence_id in haplotype_dict[haplotype_id]:
                if sequence_id in sequence_collection.records:
                    haplotype_selected_sequence_dict[haplotype_id] = sequence_id
                    break
    else:
        haplotype_dict = {entry: [entry] for entry in sequence_collection.scaffolds}
        haplotype_selected_sequence_dict = {entry: entry for entry in sequence_collection.scaffolds}

    final_haplotype_set = set(haplotype_selected_sequence_dict.keys())
    if whitelist:
        final_haplotype_set &= whitelist

    # Set iteration order depends on string hashing and varies between runs;
    # sort once so both matrices are written in the same, reproducible order.
    final_haplotype_list = sorted(final_haplotype_set)

    with open(output_file, "w") as out_fd:
        out_fd.write("#NEXUS\n\n")
        out_fd.write("BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n"
                     % (len(final_haplotype_list), alignment_len))
        out_fd.write("\tMATRIX\n")
        for haplotype_id in final_haplotype_list:
            out_fd.write("\t\t%s %s\n" % (haplotype_id,
                                          sequence_collection.records[haplotype_selected_sequence_dict[haplotype_id]]))
        out_fd.write("\t;\nEND;\n\n")

        if not traits_df.empty:
            traits_number = len(traits_df.columns)
            # Row used for haplotypes that are missing from the traits table.
            zero_traits = ",".join(["0"] * traits_number)

            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n".format(traits_number))
            out_fd.write("\tTraitLabels {0};\n".format(" ".join(traits_df.columns)))
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_list:
                if haplotype_id in traits_df.index:
                    trait_values = ",".join(map(str, traits_df.loc[haplotype_id]))
                else:
                    trait_values = zero_traits
                out_fd.write("\t\t%s %s\n" % (haplotype_id, trait_values))
        else:
            out_fd.write("BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n")
            out_fd.write("\tTraitLabels Area;\n")
            out_fd.write("\tMATRIX\n")
            for haplotype_id in final_haplotype_list:
                out_fd.write("\t\t%s %i\n" % (haplotype_id, len(haplotype_dict[haplotype_id])))
        out_fd.write("\t;\nEND;\n\n")
def star_and_htseq(self,
                   genome_dir,
                   samples_directory,
                   output_directory,
                   gff_for_htseq,
                   count_table_file_prefix,
                   genome_fasta=None,
                   samples_to_handle=None,
                   genome_size=None,
                   annotation_gtf=None,
                   feature_from_gtf_to_use_as_exon=None,
                   exon_tag_to_use_as_transcript_id=None,
                   exon_tag_to_use_as_gene_id=None,
                   length_of_sequences_flanking_junction=None,
                   junction_tab_file_list=None,
                   three_prime_trim=None,
                   five_prime_trim=None,
                   adapter_seq_for_three_prime_clip=None,
                   max_mismatch_percent_for_adapter_trimming=None,
                   three_prime_trim_after_adapter_clip=None,
                   output_type="BAM",
                   sort_bam=True,
                   max_memory_per_thread_for_bam_sorting="4G",
                   include_unmapped_reads_in_bam=True,
                   output_unmapped_reads=True,
                   two_pass_mode=False,
                   star_dir=None,
                   threads=1,
                   max_intron_length=None,
                   stranded_rnaseq="yes",
                   min_alignment_quality=10,
                   feature_type_for_htseq="exon",
                   feature_id_attribute_for_htseq="gene_id",
                   htseq_mode="union"):
    """
    Align reads of every sample with STAR and count reads per feature with HTSeq.

    For each sample the paired reads are aligned into
    <output_directory>/alignment/<sample>/ and, if the sample has single
    reads, those are aligned into the "se/" subdirectory. Each alignment is
    indexed with "samtools index" and counted with HTSeq.count. Three tables
    are written at the end:
        <output_directory>/<prefix>.pe.tab   - counts from paired reads
        <output_directory>/<prefix>.se.tab   - counts from single reads
        <output_directory>/<prefix>.all.tab  - per-gene sum of both

    :param genome_dir: directory with the STAR genome index
    :param samples_directory: directory with one subdirectory per sample
    :param output_directory: root directory for alignments and count tables
    :param gff_for_htseq: annotation file passed to HTSeq.count
    :param count_table_file_prefix: prefix of the three count table files
    :param genome_fasta: if set, the STAR index is (re)built in genome_dir first
    :param samples_to_handle: samples to process; by default all samples
                              returned by self.get_sample_list(samples_directory)
    :param genome_size: passed to STAR.index
    :param star_dir: assigned to STAR.path
    :param threads: assigned to STAR.threads
    :param stranded_rnaseq, min_alignment_quality, feature_type_for_htseq,
           feature_id_attribute_for_htseq, htseq_mode: passed to HTSeq.count
    All remaining parameters are forwarded unchanged to STAR.align.
    :return: None
    """
    STAR.threads = threads
    STAR.path = star_dir

    if genome_fasta:
        # NOTE(review): annotation_gtf is deliberately left as in the original
        # call (None), i.e. the annotation is not used while building the index.
        STAR.index(genome_dir, genome_fasta, annotation_gtf=None, junction_tab_file=None,
                   sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
                   genome_size=genome_size)

    sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(samples_directory)

    self.prepare_diff_expression_directories(output_directory, sample_list)

    alignment_dir = "%s/alignment/" % output_directory

    count_pe_table = TwoLvlDict()
    count_se_table = TwoLvlDict()
    count_all_table = TwoLvlDict()

    count_pe_table_file = "%s/%s.pe.tab" % (output_directory, count_table_file_prefix)
    # Was "%%s/%s.se.tab": the escaped "%%" left a single placeholder for two
    # arguments and raised TypeError on every call.
    count_se_table_file = "%s/%s.se.tab" % (output_directory, count_table_file_prefix)
    count_all_table_file = "%s/%s.all.tab" % (output_directory, count_table_file_prefix)

    def align_index_and_count(read_files, mate_files, work_dir, counts_path, read_kind):
        # Run STAR -> samtools index -> HTSeq for one read set and return parsed counts.
        print("\tAligning %s reads..." % read_kind)
        STAR.align(genome_dir, read_files, reverse_read_list=mate_files,
                   annotation_gtf=annotation_gtf,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim,
                   five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type,
                   sort_bam=sort_bam,
                   max_memory_per_thread_for_bam_sorting=max_memory_per_thread_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=work_dir,
                   two_pass_mode=two_pass_mode,
                   max_intron_length=max_intron_length)

        bam_path = "%s/Aligned.sortedByCoord.out.bam" % work_dir

        print("\tIndexing alignment file for %s reads..." % read_kind)
        os.system("samtools index %s" % bam_path)

        print("\tCounting %s reads aligned to features..." % read_kind)
        HTSeq.count(bam_path, gff_for_htseq, counts_path, samtype="bam", order="pos",
                    stranded_rnaseq=stranded_rnaseq,
                    min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq,
                    feature_id_attribute=feature_id_attribute_for_htseq,
                    mode=htseq_mode,
                    suppress_progres_report=False)

        # Lines starting with "__" are skipped as comments (comments_prefix).
        return SynDict(filename=counts_path, header=False, separator="\t",
                       allow_repeats_of_key=False, split_values=False,
                       values_separator=",", key_index=0, value_index=1,
                       close_after_if_file_object=False, expression=int,
                       comments_prefix="__")

    for sample in sample_list:
        print("Handling %s" % sample)

        sample_dir = "%s/%s/" % (samples_directory, sample)
        alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
        alignment_sample_se_dir = "%s/se/" % alignment_sample_dir

        _, forward_files, reverse_files, se_files = self.make_lists_forward_and_reverse_files(sample_dir)

        if se_files:
            self.safe_mkdir(alignment_sample_se_dir)

        count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
        sample_counts = align_index_and_count(forward_files, reverse_files,
                                              alignment_sample_dir, count_file, "paired")
        count_pe_table[sample] = sample_counts

        if se_files:
            count_se_file = "%s/%s.htseq.count" % (alignment_sample_se_dir, sample)
            sample_se_counts = align_index_and_count(se_files, None,
                                                     alignment_sample_se_dir, count_se_file, "single")
            count_se_table[sample] = sample_se_counts

            # Total per gene = paired count + single count; a gene missing
            # from one of the tables contributes 0 from that table.
            count_all_table[sample] = SynDict()
            for gene_id in set(sample_counts.keys()) | set(sample_se_counts.keys()):
                pe_count = sample_counts[gene_id] if gene_id in sample_counts else 0
                se_count = sample_se_counts[gene_id] if gene_id in sample_se_counts else 0
                count_all_table[sample][gene_id] = pe_count + se_count
        else:
            count_se_table[sample] = SynDict()
            count_all_table[sample] = count_pe_table[sample]

    count_pe_table.write(count_pe_table_file)
    count_se_table.write(count_se_table_file)
    count_all_table.write(count_all_table_file)
print("Started extraction for family %s" % family_name) family_genes_ids = families_dict[family_name] try: os.mkdir("%s%s" % (args.output_dir, family_name)) except OSError: pass fam_soft_fd = open("%s%s/%s_with_outer_edges.graph" % (args.output_dir, family_name, family_name), "w") """ with open(args.hclust_input, "r") as in_fd: for line in in_fd: edge_nodes = line.split("\t")[:2] if check_edge_soft(edge_nodes, family_genes_ids): fam_soft_fd.write(line) """ for edge in graph_list: if check_edge_soft(edge[:-1], family_genes_ids): fam_soft_fd.write("\t".join(edge) + "\n") fam_soft_fd.close() fam_strict_fd = open("%s%s/%s.graph" % (args.output_dir, family_name, family_name), "w") with open("%s%s/%s_with_outer_edges.graph" % (args.output_dir, family_name, family_name), "r") as in_fd: for line in in_fd: edge_nodes = line.split("\t")[:2] if check_edge_strict(edge_nodes, family_genes_ids): fam_strict_fd.write(line) fam_strict_fd.close() pool = Pool(args.threads) pool.map(extract_fam_graph, families_dict.keys())