Ejemplo n.º 1
0
    def __init__(self):
        """Initialise the toolkit and register its three subcommands.

        Calls the base-class initialiser with the toolkit description, then
        adds the 'classify', 'cluster' and 'subset' sub-parsers to
        self.subParsers and lets the matching add_*_arguments helper
        populate each of them.
        """
        super(PBTranscript, self).__init__("Toolkit for cDNA analysis.")
        registry = self.subParsers

        # Subcommand: classify
        classify_parser = registry.add_parser(
            'classify',
            description=("Classify reads based on whether they are "
                         "non-chimeric, full length and have their 5', "
                         "3' and poly A tail seen."))
        add_classify_arguments(classify_parser)

        # Subcommand: cluster
        cluster_parser = registry.add_parser(
            'cluster',
            description=('Discover consensus isoforms based on '
                         'quality controlled non-chimeric, '
                         'full length reads to reference genome.'))
        add_cluster_arguments(cluster_parser)

        # Subcommand: subset
        subset_parser = registry.add_parser(
            'subset',
            description='Subset annotated reads in Fasta format.')
        add_subset_arguments(subset_parser)
Ejemplo n.º 2
0
    def __init__(self):
        """Build the command-line interface: one sub-parser per subcommand.

        The base-class initialiser receives the toolkit description; the
        'classify', 'cluster' and 'subset' sub-parsers are then created on
        self.subParsers, each one filled in by its add_*_arguments helper.
        """
        toolkit_desc = "Toolkit for cDNA analysis."
        # Adjacent string literals are joined by the parser, so no '+' or
        # backslash continuation is needed inside the parentheses.
        classify_desc = ("Classify reads based on whether they are "
                         "non-chimeric, full length and have their 5', "
                         "3' and poly A tail seen.")
        cluster_desc = ('Discover consensus isoforms based on '
                        'quality controlled non-chimeric, '
                        'full length reads to reference genome.')
        subset_desc = 'Subset annotated reads in Fasta format.'

        super(PBTranscript, self).__init__(toolkit_desc)
        sub_cmds = self.subParsers

        sub_parser = sub_cmds.add_parser('classify', description=classify_desc)
        add_classify_arguments(sub_parser)

        sub_parser = sub_cmds.add_parser('cluster', description=cluster_desc)
        add_cluster_arguments(sub_parser)

        sub_parser = sub_cmds.add_parser('subset', description=subset_desc)
        add_subset_arguments(sub_parser)
Ejemplo n.º 3
0
def tofu_wrap_main():
    """Command-line entry point: bin the FLNC reads, cluster each bin, combine and collapse.

    Steps, in order:
      (1) split the input FLNC fasta into bins (by size, or by primer);
      (2) make sure a fasta_fofn exists (build it from bas_fofn when absent,
          otherwise check that it and every file it lists exist);
      (3) run Cluster on every bin that has no quivered HQ output yet;
      (4) combine the per-bin quivered HQ/LQ results in <root_dir>/combined
          and pickle the HQ/LQ prefix dictionaries there;
      (5) collapse the combined HQ results with run_collapse_sam;
      (6) compute abundance with get_abundance.

    Exits with status -1, after a message on stderr, when --quiver is off,
    --nfl_fa is missing, or the GMAP DB directory / DB name does not exist.

    Raises:
        Exception: if a user-supplied fasta_fofn, or a non-empty entry
            listed in it, does not exist on disk.
    """
    parser = argparse.ArgumentParser()
    add_cluster_arguments(parser)

    parser.add_argument("--bin_size_kb", default=1, type=int, help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None, help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true", help="Instead of binning by size, bin by primer (overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--max_base_limit_MB", default=600, type=int, help="Maximum number of bases per partitioned bin, in MB (default: 600)")
    parser.add_argument("--gmap_name", default="hg19", help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db", default="/home/UNIXHOME/etseng/share/gmap_db_new/", help="GMAP DB location (default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None, help="Output seqid prefix. If not given, a random ID is generated")
    parser.add_argument("--mem_debug", default=False, action="store_true", help=argparse.SUPPRESS)
    args = parser.parse_args()

    # DEBUG: memory_profiler is imported only when --mem_debug is requested.
    if args.mem_debug:
        from memory_profiler import memory_usage

    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    # The prefix must be read from args: a bare `output_seqid_prefix` is not
    # defined in this scope and raised NameError whenever the option was given.
    if args.output_seqid_prefix is None:
        tofu_prefix = binascii.b2a_hex(os.urandom(3))  # random 6-hex-digit ID
    else:
        tofu_prefix = args.output_seqid_prefix

    ice_opts = IceOptions(cDNA_size=args.cDNA_size,
            quiver=args.quiver)
    sge_opts = SgeOptions(unique_id=args.unique_id,
            use_sge=args.use_sge,
            max_sge_jobs=args.max_sge_jobs,
            blasr_nproc=args.blasr_nproc,
            quiver_nproc=args.quiver_nproc)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
            qv_trim_3=args.qv_trim_3,
            hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
            hq_isoforms_fa=args.hq_isoforms_fa,
            hq_isoforms_fq=args.hq_isoforms_fq,
            lq_isoforms_fa=args.lq_isoforms_fa,
            lq_isoforms_fq=args.lq_isoforms_fq)
    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(
            args.qv_trim_5, args.qv_trim_3, args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa, args.root_dir)
    else:
        # SECURITY NOTE(review): eval() executes whatever expression is passed
        # to --bin_manual; only acceptable while the command line is trusted.
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir, bin_size_kb=args.bin_size_kb, bin_manual=bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    print >> sys.stderr, "split input {0} into {1} bins".format(args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                            out_filename=args.fasta_fofn,
                            fasta_out_dir=nfl_dir,
                            cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception("fasta_fofn {0} does not exist!".format(args.fasta_fofn))
        # Every non-blank line of the fofn must name an existing file.
        # The with-block closes the handle even when the check raises.
        with open(args.fasta_fofn) as fofn:
            for line in fofn:
                fasta_path = line.strip()
                if len(fasta_path) > 0 and not os.path.exists(fasta_path):
                    raise Exception("File {0} does not exists in {1}".format(fasta_path, args.fasta_fofn))

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.dirname(cur_file)
        # Recorded before the skip check so finished bins are still combined.
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)

        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        start_t = time.time()
        obj = Cluster(root_dir=cur_dir,
                flnc_fa=cur_file,
                nfl_fa=args.nfl_fa,
                bas_fofn=args.bas_fofn,
                ccs_fofn=args.ccs_fofn,
                fasta_fofn=args.fasta_fofn,
                out_fa=cur_out_cons,
                sge_opts=sge_opts,
                ice_opts=ice_opts,
                ipq_opts=ipq_opts,
                report_fn=args.report_fn,
                summary_fn=args.summary_fn,
                nfl_reads_per_split=args.nfl_reads_per_split)

        # DEBUG: run under memory_usage and append timing/memory to mem_debug.log
        if args.mem_debug:
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(cur_dir, end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)
    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
            combine_quiver_results(split_dirs, combined_dir, quiver_hq_filename, quiver_lq_filename,
            tofu_prefix)
    with open(os.path.join(args.root_dir, 'combined', 'combined.hq_lq_pre_dict.pickle'), 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)
    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db, args.gmap_name, cpus=args.blasr_nproc)
    # (6) make abundance
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)
Ejemplo n.º 4
0
def tofu_wrap_main():
    """Command-line entry point: bin the FLNC reads, cluster each bin, combine, collapse and filter.

    Steps, in order:
      (1) split the input FLNC fasta into bins (by size, or by primer);
      (2) make sure a fasta_fofn exists (build it from bas_fofn when absent,
          otherwise check that it and every file it lists exist);
      (3) run Cluster on every bin that has no quivered HQ output yet;
      (4) combine the per-bin quivered HQ/LQ results in <root_dir>/combined
          and pickle the HQ/LQ prefix dictionaries there;
      (5) collapse the combined HQ results with run_collapse_sam;
      (6) compute abundance with get_abundance;
      (7) filter by count (5 when --targeted_isoseq is set, otherwise 2) and
          then filter away subsets.

    Exits with status -1, after a message on stderr, when --quiver is off,
    --nfl_fa is missing, or the GMAP DB directory / DB name does not exist.

    Raises:
        Exception: if a user-supplied fasta_fofn, or a non-empty entry
            listed in it, does not exist on disk.
    """
    parser = argparse.ArgumentParser(prog='tofu_wrap')
    add_cluster_arguments(parser, show_sge_env_name=True, show_sge_queue=True)

    parser.add_argument("--bin_size_kb", default=1, type=int, help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None, help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true", help="Instead of binning by size, bin by primer (overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--max_base_limit_MB", default=600, type=int, help="Maximum number of bases per partitioned bin, in MB (default: 600)")
    parser.add_argument("--gmap_name", default="hg19", help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db", default="/home/UNIXHOME/etseng/share/gmap_db_new/", help="GMAP DB location (default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None, help="Output seqid prefix. If not given, a random ID is generated")
    parser.add_argument("--mem_debug", default=False, action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--max_fuzzy_junction", default=5, type=int, help="Max fuzzy junction (default: 5 bp)")
    # argparse's 'version' action prints the version string and exits by
    # itself, so no explicit check of a version flag is needed after parsing.
    parser.add_argument("--version", action='version', version='%(prog)s ' + str(get_version()))
    args = parser.parse_args()

    # DEBUG: memory_profiler is imported only when --mem_debug is requested.
    if args.mem_debug:
        from memory_profiler import memory_usage

    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    # Use the caller's seqid prefix when given, else a random 6-hex-digit ID.
    tofu_prefix = binascii.b2a_hex(os.urandom(3)) if args.output_seqid_prefix is None else args.output_seqid_prefix

    ice_opts = IceOptions(quiver=args.quiver,
            use_finer_qv=args.use_finer_qv,
            targeted_isoseq=args.targeted_isoseq,
            ece_penalty=args.ece_penalty,
            ece_min_len=args.ece_min_len,
    )
    sge_opts = SgeOptions(unique_id=args.unique_id,
            use_sge=args.use_sge,
            max_sge_jobs=args.max_sge_jobs,
            blasr_nproc=args.blasr_nproc,
            quiver_nproc=args.quiver_nproc,
            gcon_nproc=args.gcon_nproc,
            sge_env_name=args.sge_env_name,
            sge_queue=args.sge_queue)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
            qv_trim_3=args.qv_trim_3,
            hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
            hq_isoforms_fa=args.hq_isoforms_fa,
            hq_isoforms_fq=args.hq_isoforms_fq,
            lq_isoforms_fa=args.lq_isoforms_fa,
            lq_isoforms_fq=args.lq_isoforms_fq)

    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(
            args.qv_trim_5, args.qv_trim_3, args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa, os.path.abspath(args.root_dir))
    else:
        # SECURITY NOTE(review): eval() executes whatever expression is passed
        # to --bin_manual; only acceptable while the command line is trusted.
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir, bin_size_kb=args.bin_size_kb, bin_manual=bin_manual, max_base_limit_MB=args.max_base_limit_MB)
    print >> sys.stderr, "split input {0} into {1} bins".format(args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                            out_filename=args.fasta_fofn,
                            fasta_out_dir=nfl_dir,
                            cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception("fasta_fofn {0} does not exist!".format(args.fasta_fofn))
        # Every non-blank line of the fofn must name an existing file.
        # The with-block closes the handle even when the check raises.
        with open(args.fasta_fofn) as fofn:
            for line in fofn:
                fasta_path = line.strip()
                if len(fasta_path) > 0 and not os.path.exists(fasta_path):
                    raise Exception("File {0} does not exists in {1}".format(fasta_path, args.fasta_fofn))

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.abspath(os.path.dirname(cur_file))
        # Recorded before the skip check so finished bins are still combined.
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)

        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        start_t = time.time()

        obj = Cluster(root_dir=cur_dir,
                flnc_fa=cur_file,
                nfl_fa=realpath(args.nfl_fa),
                bas_fofn=realpath(args.bas_fofn),
                ccs_fofn=realpath(args.ccs_fofn),
                fasta_fofn=realpath(args.fasta_fofn),
                out_fa=cur_out_cons,
                sge_opts=sge_opts,
                ice_opts=ice_opts,
                ipq_opts=ipq_opts,
                report_fn=args.report_fn,
                summary_fn=args.summary_fn,
                nfl_reads_per_split=args.nfl_reads_per_split)

        # DEBUG: run under memory_usage and append timing/memory to mem_debug.log
        if args.mem_debug:
            mem_usage = memory_usage(obj.run, interval=60)
            end_t = time.time()
            with open('mem_debug.log', 'a') as f:
                f.write("Running ICE/Quiver on {0} took {1} secs.\n".format(cur_dir, end_t-start_t))
                f.write("Maximum memory usage: {0}\n".format(max(mem_usage)))
                f.write("Memory usage: {0}\n".format(mem_usage))
        else:
            obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)
    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
            combine_quiver_results(split_dirs, combined_dir, quiver_hq_filename, quiver_lq_filename,
            tofu_prefix)
    with open(os.path.join(args.root_dir, 'combined', 'combined.hq_lq_pre_dict.pickle'), 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)
    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db, args.gmap_name, cpus=args.blasr_nproc, max_fuzzy_junction=args.max_fuzzy_junction, dun_merge_5_shorter=True)
    # (6) make abundance
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)
    # (7) run filtering & removing subsets in no5merge
    # Both modes run the same two steps; only the count threshold (and the
    # '.min_fl_<N>' suffix derived from it) differs.
    min_fl_count = 5 if args.targeted_isoseq else 2
    min_fl_prefix = collapse_prefix_hq + '.min_fl_' + str(min_fl_count)
    run_filtering_by_count(collapse_prefix_hq, min_fl_prefix, min_count=min_fl_count)
    run_filtering_away_subsets(min_fl_prefix, min_fl_prefix + '.filtered', args.max_fuzzy_junction)
Ejemplo n.º 5
0
def tofu_wrap_main():
    """Command-line entry point: bin the FLNC reads, cluster each bin, combine and collapse.

    Steps, in order:
      (1) split the input FLNC fasta into bins (by size, or by primer);
      (2) make sure a fasta_fofn exists (build it from bas_fofn when absent,
          otherwise check that it and every file it lists exist);
      (3) run Cluster on every bin that has no quivered HQ output yet;
      (4) combine the per-bin quivered HQ/LQ results in <root_dir>/combined
          and pickle the HQ/LQ prefix dictionaries;
      (5) collapse the combined HQ results with run_collapse_sam;
      (6) compute abundance with get_abundance.

    Exits with status -1, after a message on stderr, when --quiver is off,
    --nfl_fa is missing, or the GMAP DB directory / DB name does not exist.

    Raises:
        Exception: if a user-supplied fasta_fofn, or a non-empty entry
            listed in it, does not exist on disk.
    """
    parser = argparse.ArgumentParser()
    add_cluster_arguments(parser)

    parser.add_argument("--bin_size_kb", default=1, type=int, help="Bin size by kb (default: 1)")
    parser.add_argument("--bin_manual", default=None, help="Bin manual (ex: (1,2,3,5)), overwrites bin_size_kb")
    parser.add_argument("--bin_by_primer", default=False, action="store_true", help="Instead of binning by size, bin by primer (overwrites --bin_size_kb and --bin_manual)")
    parser.add_argument("--gmap_name", default="hg19", help="GMAP DB name (default: hg19)")
    parser.add_argument("--gmap_db", default="/home/UNIXHOME/etseng/share/gmap_db_new/", help="GMAP DB location (default: /home/UNIXHOME/etseng/share/gmap_db_new/)")
    parser.add_argument("--output_seqid_prefix", type=str, default=None, help="Output seqid prefix. If not given, a random ID is generated")
    args = parser.parse_args()

    # #################################################################
    # SANITY CHECKS
    if not args.quiver:
        print >> sys.stderr, "--quiver must be turned on for tofu_wrap. Quit."
        sys.exit(-1)
    if args.nfl_fa is None:
        print >> sys.stderr, "--nfl_fa must be provided for tofu_wrap. Quit."
        sys.exit(-1)
    if not os.path.exists(args.gmap_db):
        print >> sys.stderr, "GMAP DB location not valid: {0}. Quit.".format(args.gmap_db)
        sys.exit(-1)
    if not os.path.exists(os.path.join(args.gmap_db, args.gmap_name)):
        print >> sys.stderr, "GMAP name not valid: {0}. Quit.".format(args.gmap_name)
        sys.exit(-1)
    # #################################################################

    # The prefix must be read from args: a bare `output_seqid_prefix` is not
    # defined in this scope and raised NameError whenever the option was given.
    if args.output_seqid_prefix is None:
        tofu_prefix = binascii.b2a_hex(os.urandom(3))  # random 6-hex-digit ID
    else:
        tofu_prefix = args.output_seqid_prefix

    ice_opts = IceOptions(cDNA_size=args.cDNA_size,
            quiver=args.quiver)
    sge_opts = SgeOptions(unique_id=args.unique_id,
            use_sge=args.use_sge,
            max_sge_jobs=args.max_sge_jobs,
            blasr_nproc=args.blasr_nproc,
            quiver_nproc=args.quiver_nproc)
    ipq_opts = IceQuiverHQLQOptions(qv_trim_5=args.qv_trim_5,
            qv_trim_3=args.qv_trim_3,
            hq_quiver_min_accuracy=args.hq_quiver_min_accuracy,
            hq_isoforms_fa=args.hq_isoforms_fa,
            hq_isoforms_fq=args.hq_isoforms_fq,
            lq_isoforms_fa=args.lq_isoforms_fa,
            lq_isoforms_fq=args.lq_isoforms_fq)
    # ex: all_quivered_hq.100_30_0.99.fastq
    quiver_hq_filename = "all_quivered_hq.{0}_{1}_{2:.2f}.fastq".format(
            args.qv_trim_5, args.qv_trim_3, args.hq_quiver_min_accuracy)
    quiver_lq_filename = "all_quivered_lq.fastq"

    # (1) separate input flnc into size bins or primers
    if args.bin_by_primer:
        split_files = sep_flnc_by_primer(args.flnc_fa, args.root_dir)
    else:
        # SECURITY NOTE(review): eval() executes whatever expression is passed
        # to --bin_manual; only acceptable while the command line is trusted.
        bin_manual = eval(args.bin_manual) if args.bin_manual is not None else None
        split_files = sep_flnc_by_size(args.flnc_fa, args.root_dir, bin_size_kb=args.bin_size_kb, bin_manual=bin_manual)
    print >> sys.stderr, "split input {0} into {1} bins".format(args.flnc_fa, len(split_files))

    # (2) if fasta_fofn already is there, use it; otherwise make it first
    if args.quiver and args.fasta_fofn is None:
        print >> sys.stderr, "Making fasta_fofn now"
        nfl_dir = os.path.abspath(os.path.join(args.root_dir, "fasta_fofn_files"))
        if not os.path.exists(nfl_dir):
            os.makedirs(nfl_dir)
        args.fasta_fofn = os.path.join(nfl_dir, 'input.fasta.fofn')
        print >> sys.stderr, "fasta_fofn", args.fasta_fofn
        print >> sys.stderr, "nfl_dir", nfl_dir
        convert_fofn_to_fasta(fofn_filename=args.bas_fofn,
                            out_filename=args.fasta_fofn,
                            fasta_out_dir=nfl_dir,
                            cpus=args.blasr_nproc)
    else:
        if not os.path.exists(args.fasta_fofn):
            raise Exception("fasta_fofn {0} does not exist!".format(args.fasta_fofn))
        # Every non-blank line of the fofn must name an existing file.
        # The with-block closes the handle even when the check raises.
        with open(args.fasta_fofn) as fofn:
            for line in fofn:
                fasta_path = line.strip()
                if len(fasta_path) > 0 and not os.path.exists(fasta_path):
                    raise Exception("File {0} does not exists in {1}".format(fasta_path, args.fasta_fofn))

    # (3) run ICE/Quiver (the whole thing), providing the fasta_fofn
    split_dirs = []
    for cur_file in split_files:
        cur_dir = os.path.dirname(cur_file)
        # Recorded before the skip check so finished bins are still combined.
        split_dirs.append(cur_dir)
        cur_out_cons = os.path.join(cur_dir, args.consensusFa)

        hq_quiver = os.path.join(cur_dir, quiver_hq_filename)
        if os.path.exists(hq_quiver):
            print >> sys.stderr, "{0} already exists. SKIP!".format(hq_quiver)
            continue
        print >> sys.stderr, "running ICE/Quiver on", cur_dir
        obj = Cluster(root_dir=cur_dir,
                flnc_fa=cur_file,
                nfl_fa=args.nfl_fa,
                bas_fofn=args.bas_fofn,
                ccs_fofn=args.ccs_fofn,
                fasta_fofn=args.fasta_fofn,
                out_fa=cur_out_cons,
                sge_opts=sge_opts,
                ice_opts=ice_opts,
                ipq_opts=ipq_opts,
                report_fn=args.report_fn,
                summary_fn=args.summary_fn,
                nfl_reads_per_split=args.nfl_reads_per_split)
        obj.run()

    combined_dir = os.path.join(args.root_dir, 'combined')
    if not os.path.exists(combined_dir):
        os.makedirs(combined_dir)
    # (4) combine quivered HQ/LQ results
    hq_filename, lq_filename, hq_pre_dict, lq_pre_dict = \
            combine_quiver_results(split_dirs, combined_dir, quiver_hq_filename, quiver_lq_filename,
            prefix=tofu_prefix)
    # NOTE(review): this pickle is written to the current working directory,
    # not to combined_dir -- confirm that location is intended.
    with open('combined.hq_lq_pre_dict.pickle', 'w') as f:
        dump({'HQ': hq_pre_dict, 'LQ': lq_pre_dict}, f)
    # (5) collapse quivered HQ results
    collapse_prefix_hq = run_collapse_sam(hq_filename, args.gmap_db, args.gmap_name, cpus=args.blasr_nproc)
    # (6) make abundance
    get_abundance(collapse_prefix_hq, hq_pre_dict, collapse_prefix_hq)