Example #1
    def prepare_data_for_target_alignment(self,
                                          query_fasta,
                                          target_fasta,
                                          correspondence_file,
                                          out_dir,
                                          correspondence_query_column=0,
                                          correspondence_target_column=1):
        # Load query and target sequences into memory.
        query_dict = self.parse_seq_file(query_fasta, "parse")
        target_dict = self.parse_seq_file(target_fasta, "parse")

        self.safe_mkdir(out_dir)

        correspondence_dict = SynDict(filename=correspondence_file,
                                      allow_repeats_of_key=True,
                                      key_index=correspondence_query_column,
                                      value_index=correspondence_target_column)

        # For each query listed in the correspondence table, write a pair of
        # FASTA files: one with the query, one with its corresponding target(s).
        for query_id in correspondence_dict:
            query_outfile = "%s/%s.query.fasta" % (out_dir, query_id)
            target_outfile = "%s/%s.target.fasta" % (out_dir, query_id)

            SeqIO.write(self.record_by_id_generator(query_dict, [query_id]),
                        query_outfile,
                        format="fasta")
            SeqIO.write(self.record_by_id_generator(target_dict,
                                                    correspondence_dict[query_id]),
                        target_outfile,
                        format="fasta")

        queries_with_targets_set = set(correspondence_dict.keys())
        queries_set = set(query_dict.keys())

        # Return query ids that have targets and query ids that do not.
        return queries_with_targets_set, queries_set - queries_with_targets_set
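
A minimal usage sketch; "routines" stands for an instance of the class defining the method above, and every file name is a hypothetical placeholder:

# Hypothetical usage; "routines" is assumed to be an instance of the
# class that defines prepare_data_for_target_alignment.
with_targets, no_targets = routines.prepare_data_for_target_alignment(
    "queries.fasta",
    "targets.fasta",
    "query_to_target.tsv",   # two-column table: query id <TAB> target id
    "alignment_input/")
print("%i queries lack targets" % len(no_targets))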
Example #2
    @staticmethod
    def prepare_template_for_popart(alignment_file,
                                    output_file,
                                    haplotype_fam_file=None,
                                    traits_file=None,
                                    whitelist_file=None):
        # Also requires, at module level: pandas as pd, plus SynDict, IdSet and
        # IdList from RouToolPa.Collections.General.
        from RouToolPa.Parsers.Sequence import CollectionSequence
        sequence_collection = CollectionSequence(in_file=alignment_file,
                                                 parsing_mode="parse")
        sequence_collection.get_stats_and_features(count_gaps=False,
                                                   sort=False)
        whitelist = IdSet(filename=whitelist_file)
        alignment_len = sequence_collection.seq_lengths["length"].unique()
        if len(alignment_len) > 1:
            raise ValueError("Sequences in alignment have different lengths!")
        alignment_len = alignment_len[0]

        haplotype_selected_sequence_dict = SynDict()
        haplotypes_without_sequences_ids = IdList()

        traits_df = (pd.read_csv(traits_file, sep="\t", index_col=0)
                     if traits_file else pd.DataFrame())

        # Pick one representative sequence per haplotype from the fam file;
        # haplotypes with no sequence in the alignment are recorded separately.
        if haplotype_fam_file:
            haplotype_dict = SynDict(filename=haplotype_fam_file,
                                     split_values=True)
            for haplotype_id in haplotype_dict:
                for sequence_id in haplotype_dict[haplotype_id]:
                    if sequence_id in sequence_collection.records:
                        haplotype_selected_sequence_dict[
                            haplotype_id] = sequence_id
                        break
                else:
                    # for-else: the loop finished without a break,
                    # i.e. no sequence was found for this haplotype
                    haplotypes_without_sequences_ids.append(haplotype_id)
        else:
            # Without a fam file, each sequence is its own haplotype.
            haplotype_dict = {entry: [entry] for entry in sequence_collection.scaffolds}
            haplotype_selected_sequence_dict = {entry: entry
                                                for entry in sequence_collection.scaffolds}

        final_haplotype_set = (set(haplotype_selected_sequence_dict.keys()) & whitelist
                               if whitelist
                               else set(haplotype_selected_sequence_dict.keys()))

        with open(output_file, "w") as out_fd:
            #out_fd.write("#NEXUS\nBEGIN TAXA;\nDIMENSIONS\nNTAX = %i;\nTAXLABELS\n%s\n;\nEND;\n\n" % (len(haplotype_selected_sequence_dict),
            #                                                                                          "\n".join(haplotype_selected_sequence_dict.keys())))
            out_fd.write("#NEXUS\n\n")
            out_fd.write(
                "BEGIN DATA;\n\tDIMENSIONS NTAX=%i NCHAR=%i;\n\tFORMAT DATATYPE=DNA MISSING=? GAP=- MATCHCHAR=. ;\n"
                % (len(final_haplotype_set), alignment_len))
            out_fd.write("\tMATRIX\n")

            for haplotype_id in final_haplotype_set:
                out_fd.write(
                    "\t\t%s %s\n" % (haplotype_id, sequence_collection.records[
                        haplotype_selected_sequence_dict[haplotype_id]]))
            out_fd.write("\t;\nEND;\n\n")

            # TRAITS block: use the provided traits table if any, otherwise
            # fall back to a single "Area" trait holding the haplotype size.
            if not traits_df.empty:
                traits_number = len(traits_df.columns)
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS={0};\n\tFormat labels=yes missing=? separator=Comma;\n"
                    .format(traits_number))
                out_fd.write("\tTraitLabels {0};\n".format(" ".join(
                    traits_df.columns)))
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    # Haplotypes absent from the traits table get zero counts.
                    trait_values = (",".join(map(str, traits_df.loc[haplotype_id]))
                                    if haplotype_id in traits_df.index
                                    else ",".join(["0"] * traits_number))
                    out_fd.write("\t\t%s %s\n" % (haplotype_id, trait_values))
            else:
                out_fd.write(
                    "BEGIN TRAITS;\n\tDimensions NTRAITS=1;\n\tFormat labels=yes missing=? separator=Comma;\n"
                )
                out_fd.write("\tTraitLabels Area;\n")
                out_fd.write("\tMATRIX\n")
                for haplotype_id in final_haplotype_set:
                    out_fd.write(
                        "\t\t%s %i\n" %
                        (haplotype_id, len(haplotype_dict[haplotype_id])))
            out_fd.write("\t;\nEND;\n\n")
Example #3
    def star_and_htseq(self,
                       genome_dir,
                       samples_directory,
                       output_directory,
                       gff_for_htseq,
                       count_table_file_prefix,
                       genome_fasta=None,
                       samples_to_handle=None,
                       genome_size=None,
                       annotation_gtf=None,
                       feature_from_gtf_to_use_as_exon=None,
                       exon_tag_to_use_as_transcript_id=None,
                       exon_tag_to_use_as_gene_id=None,
                       length_of_sequences_flanking_junction=None,
                       junction_tab_file_list=None,
                       three_prime_trim=None,
                       five_prime_trim=None,
                       adapter_seq_for_three_prime_clip=None,
                       max_mismatch_percent_for_adapter_trimming=None,
                       three_prime_trim_after_adapter_clip=None,
                       output_type="BAM",
                       sort_bam=True,
                       max_memory_per_thread_for_bam_sorting="4G",
                       include_unmapped_reads_in_bam=True,
                       output_unmapped_reads=True,
                       two_pass_mode=False,
                       star_dir=None,
                       threads=1,
                       max_intron_length=None,
                       stranded_rnaseq="yes",
                       min_alignment_quality=10,
                       feature_type_for_htseq="exon",
                       feature_id_attribute_for_htseq="gene_id",
                       htseq_mode="union"):

        STAR.threads = threads
        STAR.path = star_dir

        # Build the STAR genome index only if a genome FASTA is provided.
        if genome_fasta:
            STAR.index(genome_dir,
                       genome_fasta,
                       annotation_gtf=None,
                       junction_tab_file=None,
                       sjdboverhang=None,
                       genomeSAindexNbases=None,
                       genomeChrBinNbits=None,
                       genome_size=genome_size)

        sample_list = (samples_to_handle if samples_to_handle
                       else self.get_sample_list(samples_directory))
        self.prepare_diff_expression_directories(output_directory, sample_list)

        alignment_dir = "%s/alignment/" % output_directory

        count_pe_table = TwoLvlDict()
        count_se_table = TwoLvlDict()
        count_all_table = TwoLvlDict()
        count_pe_table_file = "%s/%s.pe.tab" % (output_directory,
                                                count_table_file_prefix)
        count_se_table_file = "%%s/%s.se.tab" % (output_directory,
                                                 count_table_file_prefix)
        count_all_table_file = "%s/%s.all.tab" % (output_directory,
                                                  count_table_file_prefix)

        for sample in sample_list:
            print("Handling %s" % sample)
            sample_dir = "%s/%s/" % (samples_directory, sample)
            alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
            alignment_sample_se_dir = "%s/se/" % alignment_sample_dir
            filetypes, forward_files, reverse_files, se_files = self.make_lists_forward_and_reverse_files(
                sample_dir)

            if se_files:
                self.safe_mkdir(alignment_sample_se_dir)

            print("\tAligning paired reads...")
            count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
            #"""
            STAR.align(
                genome_dir,
                forward_files,
                reverse_read_list=reverse_files,
                annotation_gtf=annotation_gtf,
                feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                exon_tag_to_use_as_transcript_id=
                exon_tag_to_use_as_transcript_id,
                exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                length_of_sequences_flanking_junction=
                length_of_sequences_flanking_junction,
                junction_tab_file_list=junction_tab_file_list,
                three_prime_trim=three_prime_trim,
                five_prime_trim=five_prime_trim,
                adapter_seq_for_three_prime_clip=
                adapter_seq_for_three_prime_clip,
                max_mismatch_percent_for_adapter_trimming=
                max_mismatch_percent_for_adapter_trimming,
                three_prime_trim_after_adapter_clip=
                three_prime_trim_after_adapter_clip,
                output_type=output_type,
                sort_bam=sort_bam,
                max_memory_per_thread_for_bam_sorting=
                max_memory_per_thread_for_bam_sorting,
                include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                output_unmapped_reads=output_unmapped_reads,
                output_dir=alignment_sample_dir,
                two_pass_mode=two_pass_mode,
                max_intron_length=max_intron_length)

            alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

            print("\tIndexing alignment file for paired reads...")
            os.system("samtools index %s" % alignment_file)

            print("\tCounting paired reads aligned to features...")

            HTSeq.count(alignment_file,
                        gff_for_htseq,
                        count_file,
                        samtype="bam",
                        order="pos",
                        stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode,
                        suppress_progres_report=False)
            #"""
            # Load per-gene counts, skipping htseq-count summary lines ("__*").
            sample_counts = SynDict(filename=count_file,
                                    header=False,
                                    separator="\t",
                                    allow_repeats_of_key=False,
                                    split_values=False,
                                    values_separator=",",
                                    key_index=0,
                                    value_index=1,
                                    close_after_if_file_object=False,
                                    expression=int,
                                    comments_prefix="__")
            count_pe_table[sample] = sample_counts

            if se_files:
                print("\tAligning single reads...")
                count_se_file = "%s/%s.htseq.count" % (alignment_sample_se_dir,
                                                       sample)
                #"""
                STAR.align(
                    genome_dir,
                    se_files,
                    reverse_read_list=None,
                    annotation_gtf=annotation_gtf,
                    feature_from_gtf_to_use_as_exon=
                    feature_from_gtf_to_use_as_exon,
                    exon_tag_to_use_as_transcript_id=
                    exon_tag_to_use_as_transcript_id,
                    exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                    length_of_sequences_flanking_junction=
                    length_of_sequences_flanking_junction,
                    junction_tab_file_list=junction_tab_file_list,
                    three_prime_trim=three_prime_trim,
                    five_prime_trim=five_prime_trim,
                    adapter_seq_for_three_prime_clip=
                    adapter_seq_for_three_prime_clip,
                    max_mismatch_percent_for_adapter_trimming=
                    max_mismatch_percent_for_adapter_trimming,
                    three_prime_trim_after_adapter_clip=
                    three_prime_trim_after_adapter_clip,
                    output_type=output_type,
                    sort_bam=sort_bam,
                    max_memory_per_thread_for_bam_sorting=
                    max_memory_per_thread_for_bam_sorting,
                    include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                    output_unmapped_reads=output_unmapped_reads,
                    output_dir=alignment_sample_se_dir,
                    two_pass_mode=two_pass_mode,
                    max_intron_length=max_intron_length)

                alignment_se_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_se_dir

                print("\tIndexing alignment file for single reads...")
                os.system("samtools index %s" % alignment_se_file)

                print("\tCounting single reads aligned to features...")

                HTSeq.count(
                    alignment_se_file,
                    gff_for_htseq,
                    count_se_file,
                    samtype="bam",
                    order="pos",
                    stranded_rnaseq=stranded_rnaseq,
                    min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq,
                    feature_id_attribute=feature_id_attribute_for_htseq,
                    mode=htseq_mode,
                    suppress_progres_report=False)
                #"""

                sample_se_counts = SynDict(filename=count_se_file,
                                           header=False,
                                           separator="\t",
                                           allow_repeats_of_key=False,
                                           split_values=False,
                                           values_separator=",",
                                           key_index=0,
                                           value_index=1,
                                           close_after_if_file_object=False,
                                           expression=int,
                                           comments_prefix="__")

                count_se_table[sample] = sample_se_counts
            else:
                count_se_table[sample] = SynDict()
            count_all_table[sample] = SynDict()
            if se_files:
                # Merge: the total count per gene is PE + SE where both exist.
                for gene_id in set(sample_counts.keys()) | set(sample_se_counts.keys()):
                    if gene_id in sample_counts and gene_id in sample_se_counts:
                        count_all_table[sample][gene_id] = sample_counts[gene_id] + sample_se_counts[gene_id]
                    elif gene_id in sample_counts:
                        count_all_table[sample][gene_id] = sample_counts[gene_id]
                    else:
                        count_all_table[sample][gene_id] = sample_se_counts[gene_id]
            else:
                count_all_table[sample] = count_pe_table[sample]

        count_pe_table.write(count_pe_table_file)
        count_se_table.write(count_se_table_file)
        count_all_table.write(count_all_table_file)
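
A hypothetical invocation; "pipeline" stands for an instance of the class defining star_and_htseq, and every path is a placeholder:

# Hypothetical invocation; "pipeline" is assumed to be an instance of the
# defining class, and all paths below are placeholders.
pipeline.star_and_htseq("star_index/",        # genome_dir
                        "samples/",           # one subdirectory per sample
                        "diff_expression/",   # output_directory
                        "annotation.gff",     # gff_for_htseq
                        "counts",             # count_table_file_prefix
                        genome_fasta="genome.fasta",   # triggers index building
                        annotation_gtf="annotation.gtf",
                        threads=8)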
Example #4
    print("Started extraction for family %s" % family_name)
    family_genes_ids = families_dict[family_name]
    # Create the family output directory, ignoring "already exists" errors.
    try:
        os.mkdir("%s%s" % (args.output_dir, family_name))
    except OSError:
        pass

    fam_soft_fd = open("%s%s/%s_with_outer_edges.graph" % (args.output_dir, family_name, family_name), "w")
    """
    with open(args.hclust_input, "r") as in_fd:
        for line in in_fd:
            edge_nodes = line.split("\t")[:2]
            if check_edge_soft(edge_nodes, family_genes_ids):
                fam_soft_fd.write(line)
    """
    # Keep edges accepted by the soft filter (predicate sketched after this example).
    for edge in graph_list:
        if check_edge_soft(edge[:-1], family_genes_ids):
            fam_soft_fd.write("\t".join(edge) + "\n")
    fam_soft_fd.close()
    # Re-filter the soft graph with the strict check to drop outer edges.
    fam_strict_fd = open("%s%s/%s.graph" % (args.output_dir, family_name, family_name), "w")
    with open("%s%s/%s_with_outer_edges.graph" % (args.output_dir, family_name, family_name), "r") as in_fd:
        for line in in_fd:
            edge_nodes = line.split("\t")[:2]
            if check_edge_strict(edge_nodes, family_genes_ids):
                fam_strict_fd.write(line)
    fam_strict_fd.close()

# Process one family per worker (assumes: from multiprocessing import Pool).
pool = Pool(args.threads)
pool.map(extract_fam_graph, families_dict.keys())
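
The fragment relies on two edge predicates that are not shown. A plausible sketch of their semantics, an assumption based on the names and on the "_with_outer_edges" output file:

# Plausible sketch (assumption): "soft" keeps edges that touch the family
# at all, "strict" keeps only edges fully inside the family.
def check_edge_soft(edge_nodes, family_genes_ids):
    # at least one endpoint belongs to the family
    return any(node in family_genes_ids for node in edge_nodes)

def check_edge_strict(edge_nodes, family_genes_ids):
    # both endpoints belong to the family
    return all(node in family_genes_ids for node in edge_nodes)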