Example #1
0
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):
    """Merge alignment statistics from several directories into one result.

    Steps, in order:

    1. Load ``datasets_basecaller.json`` from *BASECALLER_RESULTS*; on any
       failure the error is logged and the function returns.
    2. For every dataset, collect the ``<dir>/<ALIGNMENT_RESULTS>/<prefix>.``
       prefixes whose ``alignment.summary`` exists and hand them to
       ``mergeAlignStatsResults`` together with the composite prefix.
    3. Symlink each existing ``<prefix>.alignment.summary`` to
       ``alignment_<barcode_name>.summary`` (``nomatch`` when the first read
       group of the dataset has no ``barcode_name``).
    4. Sum the ``alignStats_err.json`` files found under *dirs* and write the
       result to ``<ALIGNMENT_RESULTS>/alignStats_err.json``.
    5. Call ``mergeAlignStatsResults`` on the composite prefixes, generate the
       base error and alignment rate plots, and run ``aggregate_alignment``
       if a ``barcodeList.txt`` is found in the current directory or up to
       four levels above it.

    Args:
        dirs: Re-iterable collection (e.g. a list) of directories; each is
            joined with *ALIGNMENT_RESULTS* to locate its inputs.
        BASECALLER_RESULTS: Directory containing ``datasets_basecaller.json``
            and ``ionstats_basecaller.json``.
        ALIGNMENT_RESULTS: Alignment results directory, used both below each
            entry of *dirs* (inputs) and on its own (outputs).
        flows: Value convertible to ``int`` used to scale the x-axis of the
            plots; 400 is used as the axis maximum when conversion fails.

    Returns:
        None.
    """
    datasets_path = os.path.join(BASECALLER_RESULTS,
                                 "datasets_basecaller.json")
    try:
        # 'with' guarantees the handle is closed even if json.load raises.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    for dataset in datasets_basecaller['datasets']:

        # What needs merging:
        #  - alignment.summary
        #  - alignTable.txt
        # Some time in the future:
        #  - alignStats_err.json

        # Merge alignStats metrics
        try:
            candidate_prefixes = [
                os.path.join(src_dir, ALIGNMENT_RESULTS,
                             dataset['file_prefix'] + '.')
                for src_dir in dirs
            ]
            input_prefix_list = [
                prefix for prefix in candidate_prefixes
                if os.path.exists(prefix + 'alignment.summary')
            ]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS,
                                            dataset['file_prefix'] + '.')
            if input_prefix_list:
                mergeAlignStatsResults(input_prefix_list, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except Exception:
            # Best effort: a failure for one dataset must not stop the others,
            # but the cause is recorded instead of being silently dropped.
            printtime("ERROR: merging %s stats unsuccessful" %
                      (dataset['file_prefix'] + '.bam'))
            traceback.print_exc()

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError, OverflowError):
        graph_max_x = 400

    input_prefix_list = []

    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS,
                           dataset['file_prefix'] + '.alignment.summary')
        if not os.path.exists(src):
            continue

        input_prefix_list.append(
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
        # terrible hack to make aggregate_alignment happy
        read_group = dataset['read_groups'][0]
        X_name = datasets_basecaller['read_groups'][read_group].get(
            'barcode_name', 'nomatch')
        dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
        try:
            os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
        except Exception:
            printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json right here!

    # Lists that are summed element-wise, one entry per read length.
    per_position_keys = (
        'nread',
        'unaligned',
        'filtered',
        'clipped',
        'aligned',
        'n_err_at_position',
        'cum_aligned',
        'cum_err_at_position',
    )
    # Scalar counters that are simply added up.
    total_keys = (
        'accuracy_total_bases',
        'accuracy_total_errors',
        'total_mapped_target_bases',
        'total_mapped_reads',
    )

    merged_align_stats = {}
    align_stats_num_bases = 0
    for src_dir in dirs:
        stats_path = os.path.join(src_dir, ALIGNMENT_RESULTS,
                                  'alignStats_err.json')
        try:
            with open(stats_path, 'r') as f:
                current_align_stats = json.load(f)
        except Exception:
            printtime("Merge alignStats_err.json: skipping %s" % stats_path)
            continue

        if not merged_align_stats:
            # The first readable file seeds the accumulator and fixes the
            # number of positions that later files are merged over.
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(
                merged_align_stats.get("read_length", []))
            continue

        for idx in range(align_stats_num_bases):
            for key in per_position_keys:
                merged_align_stats[key][idx] += current_align_stats[key][idx]

        for key in total_keys:
            merged_align_stats[key] += current_align_stats[key]

    try:
        with open(os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
                  "w") as f:
            json.dump(merged_align_stats, f, indent=4)
    except Exception:
        printtime("ERROR; Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))

        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))

        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    # Use the nearest barcodeList.txt: current directory first, then up to
    # four parent levels.
    for depth in range(5):
        barcodelist_path = '../' * depth + 'barcodeList.txt'
        if os.path.exists(barcodelist_path):
            printtime("Barcode processing, aggregate")
            aggregate_alignment("./", barcodelist_path)
            break
Example #2
0
def alignment_post_processing(BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows,
                              mark_duplicates, force_alignstats):
    """Post-process alignment results: symlinks, merged BAM, stats and plots.

    Steps, in order:

    1. Load ``datasets_basecaller.json`` from *BASECALLER_RESULTS*; on any
       failure the error is logged and the function returns.
    2. For every dataset whose ``basecaller_bam`` exists, symlink its
       ``<prefix>.alignment.summary`` to ``alignment_<barcode_name>.summary``
       (``nomatch`` when the first read group has no ``barcode_name``).
    3. If ``<ALIGNMENT_RESULTS>/rawlib.bam`` is missing, build it from the
       per-dataset BAM files via ``blockprocessing.merge_bam_files`` and force
       the ``alignStats`` run.
    4. When forced, run the external ``alignStats`` program on ``rawlib.bam``.
    5. Call ``mergeAlignStatsResults``, generate the base error and alignment
       rate plots, run ``aggregate_alignment`` if a ``barcodeList.txt`` is
       found in the current directory or up to four levels above it, and
       finally call ``makeAlignGraphs``.

    Args:
        BASECALLER_RESULTS: Directory containing ``datasets_basecaller.json``,
            the basecaller BAM files and ``ionstats_basecaller.json``.
        ALIGNMENT_RESULTS: Directory containing the alignment outputs; also
            passed to ``alignStats`` as ``--outputDir``.
        flows: Value convertible to ``int`` used to scale the x-axis of the
            plots; 400 is used as the axis maximum when conversion fails.
        mark_duplicates: Passed through to ``blockprocessing.merge_bam_files``.
        force_alignstats: When true, run ``alignStats`` even if ``rawlib.bam``
            already exists.

    Returns:
        None.
    """
    # Local import so the block does not depend on the file's import header.
    import subprocess

    datasets_path = os.path.join(BASECALLER_RESULTS,
                                 "datasets_basecaller.json")
    try:
        # 'with' guarantees the handle is closed even if json.load raises.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError, OverflowError):
        graph_max_x = 400

    input_prefix_list = []

    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(
                os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS,
                           dataset['file_prefix'] + '.alignment.summary')
        if not os.path.exists(src):
            continue

        input_prefix_list.append(
            os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
        # terrible hack to make aggregate_alignment happy
        read_group = dataset['read_groups'][0]
        X_name = datasets_basecaller['read_groups'][read_group].get(
            'barcode_name', 'nomatch')
        dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
        try:
            os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
        except Exception:
            printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing.
    # Generate merged rawlib.bam on barcoded runs

    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):

        candidate_bams = [
            os.path.join(ALIGNMENT_RESULTS,
                         os.path.basename(dataset['file_prefix']) + '.bam')
            for dataset in datasets_basecaller["datasets"]
        ]
        bam_file_list = [
            bam_name for bam_name in candidate_bams
            if os.path.exists(bam_name)
        ]

        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename,
                                        composite_bam_filename + '.bai',
                                        mark_duplicates)
        # A freshly merged BAM has no statistics yet.
        force_alignstats = True

    if force_alignstats:
        ## Generate data for error plot for barcoded run from composite bam
        printtime("Call alignStats to generate raw accuracy")
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim.
        # The maximum length is fixed at 400 rather than graph_max_x.
        cmd = [
            "alignStats",
            "-n", "12",
            "--alignSummaryFile", "alignStats_err.txt",
            "--alignSummaryJsonFile", "alignStats_err.json",
            "--alignSummaryMinLen", "1",
            "--alignSummaryMaxLen", "400",
            "--alignSummaryLenStep", "1",
            "--alignSummaryMaxErr", "10",
            "--infile", composite_bam_filename,
            "--outputDir", ALIGNMENT_RESULTS,
        ]
        printtime("DEBUG: Calling '%s'" % " ".join(cmd))
        try:
            status = subprocess.call(cmd)
        except OSError:
            # Raised when the executable cannot be started at all.
            printtime("alignStats failed")
            traceback.print_exc()
        else:
            if status != 0:
                printtime("ERROR: alignStats returned exit status %d" %
                          status)

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'),
            int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'),
            int(graph_max_x))

        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    # Use the nearest barcodeList.txt: current directory first, then up to
    # four parent levels.
    for depth in range(5):
        barcodelist_path = '../' * depth + 'barcodeList.txt'
        if os.path.exists(barcodelist_path):
            printtime("Barcode processing, aggregate")
            aggregate_alignment("./", barcodelist_path)
            break

    # These graphs are likely obsolete
    makeAlignGraphs()
Example #3
0
           if os.path.exists(ionstats_path):
               file_list.append(ionstats_path)
           elif os.path.exists(ionstats_path_CA):
               file_list.append(ionstats_path_CA)
           else:
               raise Exception('')
               
       ionstats.reduce_stats(file_list, ionstats_file)
       
       # Make alignment_rate_plot.png        
       stats = json.load(open(ionstats_file))
       l = stats['full']['max_read_length']        
       graph_max_x = int(round(l + 49, -2)) 
       
       ionstats_plots.alignment_rate_plot(
           'alignStats_err.json',
           'ionstats_basecaller.json',
           'alignment_rate_plot.png', int(graph_max_x))
       print("Ionstats plot created successfully")            
   except:            
       print("ERROR: Failed to generate alignment rate plot")
 
   try:
       # Make base_error_plot.png
       base_error_plot.generate_base_error_plot(
           'alignStats_err.json',
           'base_error_plot.png',int(graph_max_x))            
   except:
       print("ERROR: Failed to generate base error plot")
       traceback.print_exc()        
   
    
Example #4
0
                if os.path.exists(ionstats_path):
                    file_list.append(ionstats_path)
                elif os.path.exists(ionstats_path_CA):
                    file_list.append(ionstats_path_CA)
                else:
                    raise Exception('')

            ionstats.reduce_stats(file_list, ionstats_file)

            # Make alignment_rate_plot.png
            stats = json.load(open(ionstats_file))
            l = stats['full']['max_read_length']
            graph_max_x = int(round(l + 49, -2))

            ionstats_plots.alignment_rate_plot('alignStats_err.json',
                                               'ionstats_basecaller.json',
                                               'alignment_rate_plot.png',
                                               int(graph_max_x))
            print("Ionstats plot created successfully")
        except:
            print("ERROR: Failed to generate alignment rate plot")

        try:
            # Make base_error_plot.png
            base_error_plot.generate_base_error_plot('alignStats_err.json',
                                                     'base_error_plot.png',
                                                     int(graph_max_x))
        except:
            print("ERROR: Failed to generate base error plot")
            traceback.print_exc()

    if args.zip and len(args.files) > 1:
Example #5
0
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):
    """Combine the alignment statistics found under *dirs* into one result set.

    The function loads ``datasets_basecaller.json`` from *BASECALLER_RESULTS*
    (logging and returning on failure), then:

    * merges, per dataset, every ``<dir>/<ALIGNMENT_RESULTS>/<prefix>.`` whose
      ``alignment.summary`` exists, using ``mergeAlignStatsResults``;
    * symlinks each existing ``<prefix>.alignment.summary`` to
      ``alignment_<barcode_name>.summary`` (``nomatch`` when the dataset's
      first read group carries no ``barcode_name``);
    * sums the ``alignStats_err.json`` files found under *dirs* and writes the
      total to ``<ALIGNMENT_RESULTS>/alignStats_err.json``;
    * calls ``mergeAlignStatsResults`` on the composite prefixes, generates
      the base error and alignment rate plots, and runs
      ``aggregate_alignment`` when a ``barcodeList.txt`` exists in the current
      directory or up to four levels above it.

    Args:
        dirs: Re-iterable collection (e.g. a list) of directories; each is
            joined with *ALIGNMENT_RESULTS* to locate its inputs.
        BASECALLER_RESULTS: Directory containing ``datasets_basecaller.json``
            and ``ionstats_basecaller.json``.
        ALIGNMENT_RESULTS: Alignment results directory, used both below each
            entry of *dirs* (inputs) and on its own (outputs).
        flows: Value convertible to ``int`` used to scale the x-axis of the
            plots; 400 is used as the axis maximum when conversion fails.

    Returns:
        None.
    """
    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # The context manager closes the file even when parsing fails.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    for dataset in datasets_basecaller['datasets']:

        # What needs merging:
        #  - alignment.summary
        #  - alignTable.txt
        # Some time in the future:
        #  - alignStats_err.json

        # Merge alignStats metrics
        try:
            candidate_prefixes = [os.path.join(src_dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.') for src_dir in dirs]
            input_prefix_list = [prefix for prefix in candidate_prefixes if os.path.exists(prefix + 'alignment.summary')]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
            if input_prefix_list:
                mergeAlignStatsResults(input_prefix_list, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except Exception:
            # Best effort per dataset; keep going but record why it failed.
            printtime("ERROR: merging %s stats unsuccessful" % (dataset['file_prefix'] + '.bam'))
            traceback.print_exc()

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError, OverflowError):
        graph_max_x = 400

    input_prefix_list = []

    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if not os.path.exists(src):
            continue

        input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
        # terrible hack to make aggregate_alignment happy
        read_group = dataset['read_groups'][0]
        X_name = datasets_basecaller['read_groups'][read_group].get('barcode_name', 'nomatch')
        dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
        try:
            os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
        except Exception:
            printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json right here!

    # Lists that are summed element-wise, one entry per read length.
    per_position_keys = (
        'nread', 'unaligned', 'filtered', 'clipped', 'aligned',
        'n_err_at_position', 'cum_aligned', 'cum_err_at_position',
    )
    # Scalar counters that are simply added up.
    total_keys = (
        'accuracy_total_bases', 'accuracy_total_errors',
        'total_mapped_target_bases', 'total_mapped_reads',
    )

    merged_align_stats = {}
    align_stats_num_bases = 0
    for src_dir in dirs:
        stats_path = os.path.join(src_dir, ALIGNMENT_RESULTS, 'alignStats_err.json')
        try:
            with open(stats_path, 'r') as f:
                current_align_stats = json.load(f)
        except Exception:
            printtime("Merge alignStats_err.json: skipping %s" % stats_path)
            continue

        if not merged_align_stats:
            # The first readable file seeds the accumulator and fixes the
            # number of positions that later files are merged over.
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(merged_align_stats.get("read_length", []))
            continue

        for idx in range(align_stats_num_bases):
            for key in per_position_keys:
                merged_align_stats[key][idx] += current_align_stats[key][idx]

        for key in total_keys:
            merged_align_stats[key] += current_align_stats[key]

    try:
        with open(os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'), "w") as f:
            json.dump(merged_align_stats, f, indent=4)
    except Exception:
        printtime("ERROR; Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'), int(graph_max_x))

        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'), int(graph_max_x))

        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    # Use the nearest barcodeList.txt: current directory first, then up to
    # four parent levels.
    for depth in range(5):
        barcodelist_path = '../' * depth + 'barcodeList.txt'
        if os.path.exists(barcodelist_path):
            printtime("Barcode processing, aggregate")
            aggregate_alignment("./", barcodelist_path)
            break
Example #6
0
def alignment_post_processing(
        BASECALLER_RESULTS,
        ALIGNMENT_RESULTS,
        flows,
        mark_duplicates,
        force_alignstats):
    """Finish an alignment run: symlinks, merged BAM, alignStats and plots.

    The function loads ``datasets_basecaller.json`` from *BASECALLER_RESULTS*
    (logging and returning on failure), then:

    * for every dataset whose ``basecaller_bam`` exists, symlinks its
      ``<prefix>.alignment.summary`` to ``alignment_<barcode_name>.summary``
      (``nomatch`` when the first read group has no ``barcode_name``);
    * builds ``<ALIGNMENT_RESULTS>/rawlib.bam`` from the per-dataset BAM files
      via ``blockprocessing.merge_bam_files`` if it does not exist yet, which
      also forces the ``alignStats`` run;
    * when forced, runs the external ``alignStats`` program on ``rawlib.bam``;
    * calls ``mergeAlignStatsResults``, generates the base error and alignment
      rate plots, runs ``aggregate_alignment`` when a ``barcodeList.txt``
      exists in the current directory or up to four levels above it, and
      finally calls ``makeAlignGraphs``.

    Args:
        BASECALLER_RESULTS: Directory containing ``datasets_basecaller.json``,
            the basecaller BAM files and ``ionstats_basecaller.json``.
        ALIGNMENT_RESULTS: Directory containing the alignment outputs; also
            passed to ``alignStats`` as ``--outputDir``.
        flows: Value convertible to ``int`` used to scale the x-axis of the
            plots; 400 is used as the axis maximum when conversion fails.
        mark_duplicates: Passed through to ``blockprocessing.merge_bam_files``.
        force_alignstats: When true, run ``alignStats`` even if ``rawlib.bam``
            already exists.

    Returns:
        None.
    """
    # Local import so the block does not depend on the file's import header.
    import subprocess

    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # The context manager closes the file even when parsing fails.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError, OverflowError):
        graph_max_x = 400

    input_prefix_list = []

    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if not os.path.exists(src):
            continue

        input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
        # terrible hack to make aggregate_alignment happy
        read_group = dataset['read_groups'][0]
        X_name = datasets_basecaller['read_groups'][read_group].get('barcode_name', 'nomatch')
        dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
        try:
            os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
        except Exception:
            printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Special legacy post-processing.
    # Generate merged rawlib.bam on barcoded runs

    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):

        candidate_bams = [
            os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            for dataset in datasets_basecaller["datasets"]
        ]
        bam_file_list = [bam_name for bam_name in candidate_bams if os.path.exists(bam_name)]

        blockprocessing.merge_bam_files(bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)
        # A freshly merged BAM has no statistics yet.
        force_alignstats = True

    if force_alignstats:
        ## Generate data for error plot for barcoded run from composite bam
        printtime("Call alignStats to generate raw accuracy")
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim.
        # The maximum length is fixed at 400 rather than graph_max_x.
        cmd = [
            "alignStats",
            "-n", "12",
            "--alignSummaryFile", "alignStats_err.txt",
            "--alignSummaryJsonFile", "alignStats_err.json",
            "--alignSummaryMinLen", "1",
            "--alignSummaryMaxLen", "400",
            "--alignSummaryLenStep", "1",
            "--alignSummaryMaxErr", "10",
            "--infile", composite_bam_filename,
            "--outputDir", ALIGNMENT_RESULTS,
        ]
        printtime("DEBUG: Calling '%s'" % " ".join(cmd))
        try:
            status = subprocess.call(cmd)
        except OSError:
            # Raised when the executable cannot be started at all.
            printtime("alignStats failed")
            traceback.print_exc()
        else:
            if status != 0:
                printtime("ERROR: alignStats returned exit status %d" % status)

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'), int(graph_max_x))
        ionstats_plots.alignment_rate_plot(
            os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json'),
            os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'), int(graph_max_x))

        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv
    # Use the nearest barcodeList.txt: current directory first, then up to
    # four parent levels.
    for depth in range(5):
        barcodelist_path = '../' * depth + 'barcodeList.txt'
        if os.path.exists(barcodelist_path):
            printtime("Barcode processing, aggregate")
            aggregate_alignment("./", barcodelist_path)
            break

    # These graphs are likely obsolete
    makeAlignGraphs()