Example #1
def mergeBasecallerResults(dirs, QualityPath, merged_bead_mask_path, floworder, libsff, tfsff, BASECALLER_RESULTS):
    ############################################
    # Merge individual quality.summary files #
    ############################################
    printtime("Merging individual quality.summary files")

    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str # don't convert to lowercase
    config_out.add_section('global')

    numberkeys = ['Number of 50BP Reads',
                  'Number of 100BP Reads',
                  'Number of 150BP Reads',
                  'Number of Reads at Q0',
                  'Number of Bases at Q0',
                  'Number of 50BP Reads at Q0',
                  'Number of 100BP Reads at Q0',
                  'Number of 150BP Reads at Q0',
                  'Number of Reads at Q17',
                  'Number of Bases at Q17',
                  'Number of 50BP Reads at Q17',
                  'Number of 100BP Reads at Q17',
                  'Number of 150BP Reads at Q17',
                  'Number of Reads at Q20',
                  'Number of Bases at Q20',
                  'Number of 50BP Reads at Q20',
                  'Number of 100BP Reads at Q20',
                  'Number of 150BP Reads at Q20']

    maxkeys = ['Max Read Length at Q0',
               'Max Read Length at Q17',
               'Max Read Length at Q20']

    meankeys = ['System SNR',
                'Mean Read Length at Q0',
                'Mean Read Length at Q17',
                'Mean Read Length at Q20']

    config_in = MyConfigParser()
    config_in.optionxform = str # don't convert to lowercase
    doinit = True
    for i,subdir in enumerate(dirs):
        if isbadblock(subdir, "Merging quality.summary"):
            continue
        summaryfile=os.path.join(BASECALLER_RESULTS, subdir, 'quality.summary')
        if os.path.exists(summaryfile):
            printtime("INFO: process %s" % summaryfile)
            config_in.read(summaryfile)
            # counters: sum values across blocks
            for key in numberkeys:
                value_in = config_in.get('global',key)
                if doinit:
                    value_out = 0
                else:
                    value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            # maxima: keep the largest value reported by any block
            for key in maxkeys:
                value_in = config_in.get('global',key)
                if doinit:
                    value_out = 0
                else:
                    value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in),int(value_out)))
            # means: accumulate each block's contribution (value / number of blocks)
            for key in meankeys:
                value_in = config_in.get('global',key)
                if doinit:
                    value_out = 0
                else:
                    value_out = config_out.get('global', key)
                config_out.set('global', key, float(value_out)+float(value_in)/len(dirs))
            doinit = False
        else:
            printtime("ERROR: skipped %s" % summaryfile)

    with open(QualityPath, 'wb') as configfile:
        config_out.write(configfile)

    ##################################################
    #generate TF Metrics                             #
    #look for both keys and append same file         #
    ##################################################

    printtime("Merging TFMapper metrics and generating TF plots")

    try:
        TFPipeline.mergeBlocks(BASECALLER_RESULTS,dirs,floworder)

    except:
        printtime("ERROR: Merging TFMapper metrics failed")


    ###############################################
    # Merge BaseCaller.json files                 #
    ###############################################
    printtime("Merging BaseCaller.json files")

    try:
        basecallerfiles = []
        for subdir in dirs:
            subdir = os.path.join(BASECALLER_RESULTS,subdir)
            printtime("DEBUG: %s:" % subdir)
            if isbadblock(subdir, "Merging BaseCaller.json files"):
                continue
            basecallerjson = os.path.join(subdir,'BaseCaller.json')
            if os.path.exists(basecallerjson):
                basecallerfiles.append(subdir)
            else:
                printtime("ERROR: Merging BaseCaller.json files: skipped %s" % basecallerjson)

        mergeBaseCallerJson.merge(basecallerfiles,BASECALLER_RESULTS)
    except:
        printtime("Merging BaseCaller.json files failed")


    ########################################
    # Merge individual block SFF files     #
    ########################################
    printtime("Merging Library SFF files")
    try:
        cmd = 'SFFProtonMerge'
        cmd = cmd + ' -i rawlib.sff'
        cmd = cmd + ' -o %s ' % libsff
        for subdir in dirs:
            subdir = os.path.join(BASECALLER_RESULTS,subdir)
            if isbadblock(subdir, "Merging Library SFF files"):
                continue
            rawlibsff = os.path.join(subdir,'rawlib.sff')
            if os.path.exists(rawlibsff):
                cmd = cmd + ' %s' % subdir
            else:
                printtime("ERROR: skipped %s" % rawlibsff)
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd,shell=True)
    except:
        printtime("SFFProtonMerge failed (library)")

    printtime("Merging Test Fragment SFF files")
    try:
        cmd = 'SFFProtonMerge'
        cmd = cmd + ' -i rawtf.sff'
        cmd = cmd + ' -o %s ' % tfsff
        for subdir in dirs:
            subdir = os.path.join(BASECALLER_RESULTS,subdir)
            if isbadblock(subdir, "Merging Test Fragment SFF files"):
                continue
            rawtfsff = os.path.join(subdir,'rawtf.sff')
            if os.path.exists(rawtfsff):
                cmd = cmd + ' %s' % subdir
            else:
                printtime("ERROR: skipped %s" % rawtfsff)
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd,shell=True)
    except:
        printtime("SFFProtonMerge failed (test fragments)")
Example #2
def mergeAlignStatsResults(input_prefix_list,output_prefix):

    ############################################
    # Merge individual alignment.summary files #
    ############################################
    printtime("Merging individual alignment.summary files")

    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str # don't convert to lowercase
    config_out.add_section('global')

    quallist = ['Q7', 'Q10', 'Q17', 'Q20', 'Q30', 'Q47']
    bplist = [50, 100, 150, 200, 250, 300, 350, 400]

    fixedkeys = [ 'Genome', 'Genome Version', 'Index Version', 'Genomesize' ]

    numberkeys = ['Total number of Reads',
                  'Filtered Mapped Bases in Q7 Alignments',
                  'Filtered Mapped Bases in Q10 Alignments',
                  'Filtered Mapped Bases in Q17 Alignments',
                  'Filtered Mapped Bases in Q20 Alignments',
                  'Filtered Mapped Bases in Q30 Alignments',
                  'Filtered Mapped Bases in Q47 Alignments',
                  'Filtered Q7 Alignments',
                  'Filtered Q10 Alignments',
                  'Filtered Q17 Alignments',
                  'Filtered Q20 Alignments',
                  'Filtered Q30 Alignments',
                  'Filtered Q47 Alignments']

    for q in quallist:
        for bp in bplist:
            numberkeys.append('Filtered %s%s Reads' % (bp, q))

    maxkeys = ['Filtered Q7 Longest Alignment',
               'Filtered Q10 Longest Alignment',
               'Filtered Q17 Longest Alignment',
               'Filtered Q20 Longest Alignment',
               'Filtered Q30 Longest Alignment',
               'Filtered Q47 Longest Alignment']

    # init
    for key in fixedkeys:
        value_out = 'unknown'
        config_out.set('global', key, value_out)
    for key in numberkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in maxkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))

    config_in = MyConfigParser()
    config_in.optionxform = str # don't convert to lowercase
    for input_prefix in input_prefix_list:
        alignmentfile = input_prefix + 'alignment.summary'
        if os.path.exists(alignmentfile):
            config_in.read(os.path.join(alignmentfile))

            for key in numberkeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            for key in maxkeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in),int(value_out)))
            for key in fixedkeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global',key)
                #todo
                config_out.set('global', key, value_in)

        else:
            printtime("ERROR: skipped %s" % alignmentfile)

    # Regenerate trickier alignment.summary metrics

    for qual in quallist:
        try:
            q_bases = config_out.get('global','Filtered Mapped Bases in %s Alignments' % qual)
            q_reads = config_out.get('global','Filtered %s Alignments' % qual)

            q_readlen = 0
            if q_reads > 0:
                q_readlen = q_bases / q_reads
            config_out.set('global','Filtered %s Mean Alignment Length' % qual, q_readlen)

            genomesize = float(config_out.get('global','Genomesize'))
            q_coverage = 0.0
            if genomesize > 0:
                q_coverage = q_bases / genomesize
            config_out.set('global','Filtered %s Mean Coverage Depth' % qual, '%1.1f' % q_coverage)

            # Not mergeable at this point
            config_out.set('global','Filtered %s Coverage Percentage' % qual, 'N/A')
           
        except:
            pass
    

    with open(output_prefix + 'alignment.summary', 'wb') as configfile:
        config_out.write(configfile)


    #########################################
    # Merge individual alignTable.txt files #
    #########################################
    printtime("Merging individual alignTable.txt files")
    
    table = 0
    header = None
    for input_prefix in input_prefix_list:
        alignTableFile = input_prefix + 'alignTable.txt'
        if os.path.exists(alignTableFile):
            if header is None:
                header = numpy.loadtxt(alignTableFile, dtype='string', comments='#')
            table += numpy.loadtxt(alignTableFile, dtype='int', comments='#',skiprows=1)
        else:
            printtime("ERROR: skipped %s" % alignTableFile)
    #fix first column
    if header is not None:
        table[:,0] = (header[1:,0])
        f_handle = open(output_prefix+ 'alignTable.txt', 'w')
        numpy.savetxt(f_handle, header[0][None], fmt='%s', delimiter='\t')
        numpy.savetxt(f_handle, table, fmt='%i', delimiter='\t')
        f_handle.close()
Example #3
def mergeAlignStatsResults(input_prefix_list, output_prefix):

    ############################################
    # Merge individual alignment.summary files #
    ############################################
    printtime("Merging individual alignment.summary files")

    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str  # don't convert to lowercase
    config_out.add_section('global')

    quallist = ['Q7', 'Q10', 'Q17', 'Q20', 'Q30', 'Q47']
    bplist = [50, 100, 150, 200, 250, 300, 350, 400]

    fixedkeys = ['Genome', 'Genome Version', 'Index Version', 'Genomesize']

    numberkeys = [
        'Total number of Reads', 'Total Mapped Reads',
        'Total Mapped Target Bases', 'Filtered Mapped Bases in Q7 Alignments',
        'Filtered Mapped Bases in Q10 Alignments',
        'Filtered Mapped Bases in Q17 Alignments',
        'Filtered Mapped Bases in Q20 Alignments',
        'Filtered Mapped Bases in Q30 Alignments',
        'Filtered Mapped Bases in Q47 Alignments', 'Filtered Q7 Alignments',
        'Filtered Q10 Alignments', 'Filtered Q17 Alignments',
        'Filtered Q20 Alignments', 'Filtered Q30 Alignments',
        'Filtered Q47 Alignments'
    ]

    for q in quallist:
        for bp in bplist:
            numberkeys.append('Filtered %s%s Reads' % (bp, q))

    maxkeys = [
        'Filtered Q7 Longest Alignment', 'Filtered Q10 Longest Alignment',
        'Filtered Q17 Longest Alignment', 'Filtered Q20 Longest Alignment',
        'Filtered Q30 Longest Alignment', 'Filtered Q47 Longest Alignment'
    ]

    # init
    for key in fixedkeys:
        value_out = 'unknown'
        config_out.set('global', key, value_out)
    for key in numberkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in maxkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))

    config_in = MyConfigParser()
    config_in.optionxform = str  # don't convert to lowercase
    for input_prefix in input_prefix_list:
        alignmentfile = input_prefix + 'alignment.summary'
        if os.path.exists(alignmentfile):
            config_in.read(os.path.join(alignmentfile))

            for key in numberkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            for key in maxkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in),
                                                  int(value_out)))
            for key in fixedkeys:
                value_in = config_in.get('global', key)
                value_out = config_out.get('global', key)
                #todo
                config_out.set('global', key, value_in)

        else:
            printtime("ERROR: skipped %s" % alignmentfile)

    # Regenerate trickier alignment.summary metrics

    for qual in quallist:
        try:
            q_bases = config_out.get(
                'global', 'Filtered Mapped Bases in %s Alignments' % qual)
            q_reads = config_out.get('global', 'Filtered %s Alignments' % qual)

            q_readlen = 0
            if q_reads > 0:
                q_readlen = q_bases / q_reads
            config_out.set('global',
                           'Filtered %s Mean Alignment Length' % qual,
                           q_readlen)

            genomesize = float(config_out.get('global', 'Genomesize'))
            q_coverage = 0.0
            if genomesize > 0:
                q_coverage = q_bases / genomesize
            config_out.set('global', 'Filtered %s Mean Coverage Depth' % qual,
                           '%1.1f' % q_coverage)

            # Not mergeable at this point
            config_out.set('global', 'Filtered %s Coverage Percentage' % qual,
                           'N/A')

        except:
            pass

    with open(output_prefix + 'alignment.summary', 'wb') as configfile:
        config_out.write(configfile)

    #########################################
    # Merge individual alignTable.txt files #
    #########################################
    printtime("Merging individual alignTable.txt files")

    table = 0
    header = None
    for input_prefix in input_prefix_list:
        alignTableFile = input_prefix + 'alignTable.txt'
        if os.path.exists(alignTableFile):
            if header is None:
                header = numpy.loadtxt(alignTableFile,
                                       dtype='string',
                                       comments='#')
            table += numpy.loadtxt(alignTableFile,
                                   dtype='int',
                                   comments='#',
                                   skiprows=1)
        else:
            printtime("ERROR: skipped %s" % alignTableFile)
    #fix first column
    if header is not None:
        table[:, 0] = (header[1:, 0])
        f_handle = open(output_prefix + 'alignTable.txt', 'w')
        numpy.savetxt(f_handle, header[0][None], fmt='%s', delimiter='\t')
        numpy.savetxt(f_handle, table, fmt='%i', delimiter='\t')
        f_handle.close()
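
A minimal usage sketch for mergeAlignStatsResults (examples 2 and 3). The prefixes are hypothetical; they only need to resolve so that prefix + 'alignment.summary' and prefix + 'alignTable.txt' name the per-block inputs and the merged outputs.

# Hypothetical usage: merge per-block alignment stats into files prefixed 'merged.'
input_prefixes = ['block_X0_Y0/', 'block_X0_Y1/']   # placeholder per-block prefixes
mergeAlignStatsResults(input_prefixes, 'merged.')
# writes merged.alignment.summary and merged.alignTable.txt (relative to the current working directory)
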
Example #4
def mergeAlignmentResults(dirs, env, ALIGNMENT_RESULTS):

    ############################################
    # Merge individual alignment.summary files #
    ############################################
    printtime("Merging individual alignment.summary files")

    config_out = ConfigParser.RawConfigParser()
    config_out.optionxform = str # don't convert to lowercase
    config_out.add_section('global')

    quallist = ['Q7', 'Q10', 'Q17', 'Q20', 'Q47']
    bplist = [50, 100, 150, 200, 250, 300, 350, 400]

    fixedkeys = [ 'Genome', 'Genome Version', 'Index Version', 'Genomesize' ]

    numberkeys = ['Total number of Reads',
                  'Filtered Mapped Bases in Q7 Alignments',
                  'Filtered Mapped Bases in Q10 Alignments',
                  'Filtered Mapped Bases in Q17 Alignments',
                  'Filtered Mapped Bases in Q20 Alignments',
                  'Filtered Mapped Bases in Q47 Alignments',
                  'Filtered Q7 Alignments',
                  'Filtered Q10 Alignments',
                  'Filtered Q17 Alignments',
                  'Filtered Q20 Alignments',
                  'Filtered Q47 Alignments']

    for q in quallist:
        for bp in bplist:
            numberkeys.append('Filtered %s%s Reads' % (bp, q))

    maxkeys = ['Filtered Q7 Longest Alignment',
               'Filtered Q10 Longest Alignment',
               'Filtered Q17 Longest Alignment',
               'Filtered Q20 Longest Alignment',
               'Filtered Q47 Longest Alignment']

    meankeys = ['Filtered Q7 Mean Alignment Length',
                'Filtered Q10 Mean Alignment Length',
                'Filtered Q17 Mean Alignment Length',
                'Filtered Q20 Mean Alignment Length',
                'Filtered Q47 Mean Alignment Length',
                'Filtered Q7 Coverage Percentage',
                'Filtered Q10 Coverage Percentage',
                'Filtered Q17 Coverage Percentage',
                'Filtered Q20 Coverage Percentage',
                'Filtered Q47 Coverage Percentage',
                'Filtered Q7 Mean Coverage Depth',
                'Filtered Q10 Mean Coverage Depth',
                'Filtered Q17 Mean Coverage Depth',
                'Filtered Q20 Mean Coverage Depth',
                'Filtered Q47 Mean Coverage Depth']

    # init
    for key in fixedkeys:
        value_out = 'unknown'
        config_out.set('global', key, value_out)
    for key in numberkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in maxkeys:
        value_out = 0
        config_out.set('global', key, int(value_out))
    for key in meankeys:
        value_out = 0
        config_out.set('global', key, float(value_out))

    config_in = MyConfigParser()
    config_in.optionxform = str # don't convert to lowercase
    for i,subdir in enumerate(dirs):
        if isbadblock(subdir, "Merging alignment.summary"):
            continue
        alignmentfile=os.path.join(subdir, 'alignment.summary')
        if os.path.exists(alignmentfile):
            config_in.read(os.path.join(alignmentfile))

            for key in numberkeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, int(value_in) + int(value_out))
            for key in maxkeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, max(int(value_in),int(value_out)))
            for key in fixedkeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global',key)
                #todo
                config_out.set('global', key, value_in)
            # means: accumulate each block's contribution (value / number of blocks)
            for key in meankeys:
                value_in = config_in.get('global',key)
                value_out = config_out.get('global', key)
                config_out.set('global', key, float(value_out)+float(value_in)/len(dirs))

            # Note: 'Filtered Q17 Mean Coverage Depth' =
            #       'Filtered Mapped Bases in Q17 Alignments' / 'Genomesize'

        else:
            printtime("ERROR: skipped %s" % alignmentfile)


    with open('alignment.summary.merged', 'wb') as configfile:
        config_out.write(configfile)

    r = subprocess.call(["ln", "-s", os.path.join(ALIGNMENT_RESULTS,"alignment.summary.merged"), os.path.join(ALIGNMENT_RESULTS,"alignment.summary")])

    #########################################
    # Merge individual alignTable.txt files #
    #########################################
    printtime("Merging individual alignTable.txt files")

    table = 0
    header = None
    for subdir in dirs:
        if isbadblock(subdir, "Merging alignTable.txt"):
            continue
        alignTableFile = os.path.join(subdir,'alignTable.txt')
        if os.path.exists(alignTableFile):
            if header is None:
                header = numpy.loadtxt(alignTableFile, dtype='string', comments='#')
            table += numpy.loadtxt(alignTableFile, dtype='int', comments='#',skiprows=1)
        else:
            printtime("ERROR: skipped %s" % alignTableFile)
    # fix first column (restore the read-length labels from the string header)
    if header is not None:
        table[:,0] = (header[1:,0])
        f_handle = open('alignTable.txt.merged', 'w')
        numpy.savetxt(f_handle, header[0][None], fmt='%s', delimiter='\t')
        numpy.savetxt(f_handle, table, fmt='%i', delimiter='\t')
        f_handle.close()

    r = subprocess.call(["ln", "-s", os.path.join(ALIGNMENT_RESULTS,"alignTable.txt.merged"), os.path.join(ALIGNMENT_RESULTS,"alignTable.txt")])


    #############################################
    # Merge alignment.summary (json)            #
    #############################################
    printtime("Merging  alignment.summary (json)")
    try:
        cmd = 'merge_alignment.summary.py'
        for subdir in dirs:
            if isbadblock(subdir, "Merging alignment.summary (json)"):
                continue
            alignmentfile=os.path.join(subdir, 'alignment.summary')
            if os.path.exists(alignmentfile):
                cmd = cmd + ' %s' % alignmentfile
            else:
                printtime("ERROR: skipped %s" % alignmentfile)
        cmd = cmd + ' > alignment.summary.json'
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd,shell=True)
    except:
        printtime("Merging alignment.summary (json) failed")


    #############################################
    # Merge alignTable.txt (json)               #
    #############################################
    printtime("Merging alignTable.txt (json)")
    try:
        cmd = 'merge_alignTable.py'
        for subdir in dirs:
            if isbadblock(subdir, "Merging alignTable.txt (json)"):
                continue
            alignstatsfile=os.path.join(subdir, 'alignTable.txt')
            if os.path.exists(alignstatsfile):
                cmd = cmd + ' %s' % alignstatsfile
            else:
                printtime("ERROR: skipped %s" % alignstatsfile)
        cmd = cmd + ' > alignTable.txt.json'
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd,shell=True)
    except:
        printtime("Merging alignTable.txt (json) failed")


    #############################################
    # Merge individual block bam files   #
    #############################################
    printtime("Merging bam files")
    try:
#        cmd = 'picard-tools MergeSamFiles'
        cmd = 'java -Xmx8g -jar /opt/picard/picard-tools-current/MergeSamFiles.jar'
        for subdir in dirs:
            if isbadblock(subdir, "Merging bam files"):
                continue
            bamfile = os.path.join(ALIGNMENT_RESULTS, subdir, "rawlib.bam")
            if os.path.exists(bamfile):
                cmd = cmd + ' I=%s' % bamfile
            else:
                printtime("ERROR: skipped %s" % bamfile)
        cmd = cmd + ' O=%s/%s_%s.bam' % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        cmd = cmd + ' ASSUME_SORTED=true'
        cmd = cmd + ' CREATE_INDEX=true'
        cmd = cmd + ' USE_THREADING=true'
        cmd = cmd + ' VALIDATION_STRINGENCY=LENIENT'
        printtime("DEBUG: Calling '%s'" % cmd)
        subprocess.call(cmd,shell=True)
    except:
        printtime("bam file merge failed")

    try:
        srcbaifilepath = '%s/%s_%s.bai' % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        dstbaifilepath = '%s/%s_%s.bam.bai' % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        if os.path.exists(srcbaifilepath):
            os.rename(srcbaifilepath, dstbaifilepath)
        else:
            printtime("ERROR: %s doesn't exists" % srcbaifilepath)
    except:
        traceback.print_exc()

    #remove symbolic links
    os.remove("alignment.summary")
    os.remove("alignTable.txt")

    ##################################################
    #Call alignStats on merged bam file              #
    ##################################################
    printtime("Call alignStats on merged bam file")

    try:
        cmd = "alignStats -i %s/%s_%s.bam" % (ALIGNMENT_RESULTS, env['expName'], env['resultsName'])
        cmd = cmd + " -g /results/referenceLibrary/%s/%s/%s.info.txt" % (env["tmap_version"],env["libraryName"], env["libraryName"])
        cmd = cmd + " -n 12 -l 20 -m 400 -q 7,10,17,20,47 -s 0 -a alignTable.txt"
        cmd = cmd + " --outputDir %s" % ALIGNMENT_RESULTS
        cmd = cmd + " 2>> " + os.path.join(ALIGNMENT_RESULTS, "alignStats_out.txt")
        printtime("DEBUG: Calling '%s'" % cmd)
        os.system(cmd)
    except:
        printtime("alignStats failed")