Beispiel #1
0
def find_isoform(gff3, genome_bam, isoform_gff3, tss_tes_stat, genomefa,
                 transcript_fa, downsample_ratio, config_dict,
                 raw_splice_isoform):
    # find isoform
    print "#### Read genne annotations"
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(
        gff3)

    transcript_to_junctions = {
        tr: blocks_to_junctions(transcript_to_exon[tr])
        for tr in transcript_to_exon
    }
    remove_similar_tr(gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)

    # finding isoforms are required
    print "#### find isoforms"
    group_bam2isoform(genome_bam,
                      isoform_gff3,
                      tss_tes_stat,
                      "",
                      chr_to_blocks,
                      gene_dict,
                      transcript_to_junctions,
                      transcript_dict,
                      genomefa,
                      config=config_dict["isoform_parameters"],
                      downsample_ratio=downsample_ratio,
                      raw_gff3=None)
    #raw_gff3=raw_splice_isoform if config_dict["global_parameters"]["generate_raw_isoform"] else None)

    # get fasta
    #print "### generate transcript fasta file", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(
        isoform_gff3)
    ref_dict = {
        "chr_to_gene": chr_to_gene,
        "transcript_dict": transcript_dict,
        "gene_to_transcript": gene_to_transcript,
        "transcript_to_exon": transcript_to_exon
    }
    if not config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = None
    get_transcript_seq(genomefa,
                       transcript_fa,
                       chr_to_gene_i,
                       transcript_dict_i,
                       gene_to_transcript_i,
                       transcript_to_exon_i,
                       ref_dict=ref_dict)

    return {
        "transcript_dict": transcript_dict,
        "transcript_dict_i": transcript_dict_i
    }
Beispiel #2
0
def sc_long_pipeline(args):
    # parse configuration file

    if os.path.isfile(args.config_file):
        print("Use config file: {}".format(args.config_file))
        config_dict = parse_json_config(args.config_file)
    elif os.path.isfile(os.path.join(sys.path[0], args.config_file)):
        print("Use config file: {}".format(os.path.join(sys.path[0], args.config_file)))
        config_dict = parse_json_config(os.path.join(sys.path[0], args.config_file))
    else:
        print("Cannot find config file in current directory or script depository: {}".format(args.config_file))
        exit()
    print_config(config_dict)
    # check if files exist
    if args.downsample_ratio>1 or args.downsample_ratio<=0:
        print("downsample_ratio shoulw between 0 and 1: {}".format(args.downsample_ratio))
        exit()
    if not (os.path.isfile(args.infq) and os.path.isfile(args.gff3) and os.path.isfile(args.genomefa)):
        print("make sure all file exists:")
        print(args.infq)
        print(args.gff3)
        print(args.genomefa)
        exit()
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
        print("output directory not exist, create one:")
        print(args.outdir)
    if args.inbam != "" and (not os.path.isfile(args.inbam)):
        print("make sure input inbam file exists:")
        print(args.inbam)
        exit()
    # output files:
    isoform_gff3 = os.path.join(args.outdir, "isoform_annotated.gff3")
    isoform_gff3_f = os.path.join(args.outdir, "isoform_annotated.filtered.gff3")
    FSM_anno_out = os.path.join(args.outdir, "isoform_FSM_annotation.csv")
    raw_splice_isoform = os.path.join(args.outdir, "splice_raw.gff3")
    tss_tes_stat = os.path.join(args.outdir, "tss_tes.bedgraph")
    transcript_fa = os.path.join(args.outdir, "transcript_assembly.fa")
    transcript_fa_idx = os.path.join(args.outdir, "transcript_assembly.fa.fai")
    tmp_bam = os.path.join(args.outdir, "tmp.align.bam")
    tmp_bed = os.path.join(args.outdir, "tmp.splice_anno.bed12")
    genome_bam = os.path.join(args.outdir, "align2genome.bam")
    realign_bam = os.path.join(args.outdir, "realign2transcript.bam")
    tr_cnt_csv = os.path.join(args.outdir, "transcript_count.csv.gz")
    tr_badcov_cnt_csv = os.path.join(args.outdir, "transcript_count.bad_coverage.csv.gz")
    print "Input parameters:"
    print "\tgene annotation:", args.gff3
    print "\tgenome fasta:", args.genomefa
    if args.inbam != "":
        print "\tinput bam:", args.inbam
        genome_bam = args.inbam
    else:
        print "\tinput fastq:", args.infq
    print "\toutput directory:", args.outdir
    print "\tdirectory contains minimap2:", args.minimap2_dir

    # align reads to genome
    if args.inbam == "" and config_dict["pipeline_parameters"]["do_genome_alignment"]:
        print "### align reads to genome using minimap2", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if config_dict["alignment_parameters"]["use_junctions"]:
            gff3_to_bed12(args.minimap2_dir, args.gff3, tmp_bed)
        minimap2_align(args.minimap2_dir, args.genomefa, args.infq, tmp_bam, no_flank=config_dict["alignment_parameters"]["no_flank"], bed12_junc=tmp_bed if config_dict["alignment_parameters"]["use_junctions"] else None)
        samtools_sort_index(tmp_bam, genome_bam)
        os.remove(tmp_bam)
        if config_dict["alignment_parameters"]["use_junctions"]:
            os.remove(tmp_bed)
    else:
        print "### skip aligning reads to genome", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # find isoform
    print "### read gene annotation", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    chr_to_gene, transcript_dict, gene_to_transcript, transcript_to_exon = parse_gff_tree(args.gff3)
    transcript_to_junctions = {tr: blocks_to_junctions(transcript_to_exon[tr]) for tr in transcript_to_exon}
    remove_similar_tr(transcript_dict, gene_to_transcript, transcript_to_exon)
    gene_dict = get_gene_flat(gene_to_transcript, transcript_to_exon)
    chr_to_blocks = get_gene_blocks(gene_dict, chr_to_gene, gene_to_transcript)
    if config_dict["pipeline_parameters"]["do_isoform_identification"]:
        print "### find isoforms", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        group_bam2isoform(genome_bam, isoform_gff3, tss_tes_stat, "", chr_to_blocks, gene_dict, transcript_to_junctions, transcript_dict, args.genomefa,
        config=config_dict["isoform_parameters"], 
        downsample_ratio=args.downsample_ratio,
        raw_gff3=raw_splice_isoform if config_dict["global_parameters"]["generate_raw_isoform"] else None)
    else:
        print "### skip finding isoforms", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # get fasta
    #print "### generate transcript fasta file", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i = parse_gff_tree(isoform_gff3)
    ref_dict = {"chr_to_gene":chr_to_gene, "transcript_dict":transcript_dict, "gene_to_transcript":gene_to_transcript, "transcript_to_exon":transcript_to_exon}
    if not config_dict["realign_parameters"]["use_annotation"]:
        ref_dict = None
    get_transcript_seq(args.genomefa, transcript_fa, chr_to_gene_i, transcript_dict_i, gene_to_transcript_i, transcript_to_exon_i,ref_dict=ref_dict)

    # realign to transcript using minimap2
    if config_dict["pipeline_parameters"]["do_read_realignment"]:
        print "### realign to transcript using minimap2", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        minimap2_tr_align(args.minimap2_dir, transcript_fa, args.infq, tmp_bam)
        samtools_sort_index(tmp_bam, realign_bam)
        os.remove(tmp_bam)
    else:
        print "### skip read realignment", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # quantification
    if config_dict["pipeline_parameters"]["do_transcript_quantification"]:
        print "### generate transcript count matrix", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        bc_tr_count_dict, bc_tr_badcov_count_dict, tr_kept = parse_realigned_bam(realign_bam, transcript_fa_idx, config_dict["isoform_parameters"]["Min_sup_cnt"], config_dict["transcript_counting"]["min_tr_coverage"], config_dict["transcript_counting"]["min_read_coverage"])
        #realigned_bam_coverage(realign_bam, transcript_fa_idx, args.outdir)
        tr_cnt = wrt_tr_to_csv(bc_tr_count_dict, transcript_dict_i, tr_cnt_csv, transcript_dict, config_dict["global_parameters"]["has_UMI"])
        wrt_tr_to_csv(bc_tr_badcov_count_dict, transcript_dict_i, tr_badcov_cnt_csv, transcript_dict, config_dict["global_parameters"]["has_UMI"])
        annotate_filter_gff(isoform_gff3,args.gff3,isoform_gff3_f,FSM_anno_out,tr_cnt,config_dict["isoform_parameters"]["Min_sup_cnt"])
    else:
        print "### skip transcript quantification", datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")