Example #1
def do_call(args, fsa_list, params, dbh):

    cerr('I: Calling non-ladder peaks...')

    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)
        fsa.call(params)
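
# A hedged usage sketch (not from the source): the tuple unpacking above
# implies fsa_list is a list of (FSA instance, index-or-sample-code) pairs,
# and params a params.Params() instance.
# fsa_list = [(fsa1, 1), (fsa2, 2)]            # hypothetical
# do_call(args, fsa_list, params.Params(), dbh=None)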
Example #2
def do_call( args, fsa_list, dbh ):

    cerr('I: Calling non-ladder peaks...')

    for (fsa, sample_code) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)
        fsa.call(args.marker)
Example #3
def do_call( args, fsa_list, dbh ):

    cerr('I: Calling non-ladder peaks...')

    for (fsa, sample_code) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)
        fsa.call(params.Params(), args.marker)
Example #4
def main(args):

    if args.verbose != 0:
        set_verbosity(args.verbose)

    dbh = None

    if args.file or args.infile:
        cverr(4, 'D: opening FSA file(s)')
        fsa_list = open_fsa(args)
    elif dbh is None:
        cverr(4, 'D: connecting to database')
        dbh = get_dbhandler(args)
        fsa_list = get_fsa_list(args, dbh)

    cerr('I: obtained %d FSA' % len(fsa_list))

    if args.commit:
        with transaction.manager:
            do_facmds(args, fsa_list, dbh)
            cerr('** COMMIT to database **')
    elif dbh:
        cerr('WARNING ** running without database COMMIT! All changes will be discarded!')
        if not ( args.test or args.y ):
            keys = input('Do you want to continue [y/n]? ')
            if not keys.lower().strip().startswith('y'):
                sys.exit(1)
        do_facmds(args, fsa_list, dbh)
    else:
        do_facmds(args, fsa_list)
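
# A hypothetical argparse namespace for main() above, listing only the
# attributes the function reads; all values are assumptions.
from types import SimpleNamespace
example_args = SimpleNamespace(verbose=0, file='run01.fsa', infile=None,
                               commit=False, test=True, y=False)
# main(example_args)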
Example #5
def do_align( args, fsa_list, dbh ):

    cerr('I: Aligning size standards...')

    for (fsa, sample_code) in fsa_list:
        cverr(3, 'D: aligning FSA %s' % fsa.filename)
        fsa.align(params.Params())
Example #6
def do_merge(args, fsa_list, params):

    cerr('I: merging smeared peaks...')

    for (fsa, fsa_index) in fsa_list:
        print("fsa_index: ", fsa_index)
        cverr(3, 'D: calling merge for FSA %s' % fsa.filename)
        fsa.merge(params, args.plot_merged_peaks)
Example #7
    def scan(self, params, offset=0):

        if self.is_ladder():
            alleles = scan_peaks(self, params.ladder)
        else:
            alleles = scan_peaks(self, params.ladder, offset)

        cverr(1, "# scanning %s: %d peak(s)" % (self.marker, len(alleles)))

        return alleles
Example #8
def generate_cluster(T, k):

    grouping = fcluster(T.z, k, criterion='maxclust')

    groups = defaultdict(list)
    for i, e in enumerate(grouping):
        groups[e].append( T.p[i][0] )

    clusters = sorted( list(groups.values()), key = lambda x: x[0] )
    cverr(3, str(clusters))

    return clusters
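
# A minimal, hypothetical demo of generate_cluster(): T.z is assumed to be a
# scipy linkage matrix over T.p, a list of (rtime, 0) points, so fcluster()
# can regroup the rtimes into k flat clusters.
from types import SimpleNamespace
from scipy.cluster.hierarchy import linkage

points = [(100, 0), (110, 0), (500, 0), (520, 0), (900, 0)]
T = SimpleNamespace(z=linkage(points), p=points)
# generate_cluster(T, 3) should yield [[100, 110], [500, 520], [900]]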
Example #9
def generate_cluster(T, k):

    grouping = fcluster(T.z, k, criterion='maxclust')

    groups = defaultdict(list)
    for i, e in enumerate(grouping):
        groups[e].append(T.p[i][0])

    clusters = sorted(list(groups.values()), key=lambda x: x[0])
    cverr(3, str(clusters))

    return clusters
Example #10
def do_normalize(args, fsa_list, params):

    cerr('I: Normalizing all peaks...')

    # use panel method to set scale factors for all FSA
    from fatools.lib.fileio.models import Panel
    panel = Panel.get_panel(args.panel)

    ladder_means = panel.get_ladder_area_means(fsa_list)

    # normalize areas for each FSA
    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling normalize for %s' % fsa.filename)
        fsa.normalize(params, ladder_means)
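
# Assumed flow (hedged): args.panel names a panel resolved via
# Panel.get_panel(), and get_ladder_area_means() is taken to average ladder
# peak areas across the FSAs to derive per-file scale factors.
# panel = Panel.get_panel('GS120LIZ')          # hypothetical panel code
# ladder_means = panel.get_ladder_area_means(fsa_list)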
Example #11
def do_align(args, fsa_list, _params, f_bad_files, dbh):
    """
    This takes an input list of FSA instances, calls FSA.align for each
    FSA in the list, and returns a list of good FSAs.
    """

    cerr('I: Aligning size standards...')

    good_fsa = []
    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: aligning FSA %s' % fsa.filename)
        try:
            fsa.align(_params)
            good_fsa.append((fsa, fsa_index))
        except LadderMismatchException:
            f_bad_files.write(("LadderMismatch: %s\n") % fsa.filename)

    return good_fsa
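
# Hypothetical wiring for the f_bad_files sink consumed above; any object
# with a write() method will do, e.g. a plain log file.
# with open('ladder_mismatch.log', 'w') as f_bad_files:
#     good_fsa = do_align(args, fsa_list, params.Params(), f_bad_files, dbh=None)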
Example #12
def find_raw_peaks(data, params, offset, expected_peak_number=0):
    """
    params.min_dist
    params.norm_thres
    params.min_rfu
    params.max_peak_number
    """
    #print("expected:", expected_peak_number)
    # cut and pad data to overcome peaks at the end of array
    obs_data = np.append(data[offset:], [0, 0, 0])
    if False:  #expected_peak_number:
        min_dist = params.min_dist
        indices = []
        norm_threshold = params.norm_thres
        expected_peak_number = expected_peak_number * 1.8
        while len(indices) <= expected_peak_number and norm_threshold > 1e-7:
            indices = indexes(obs_data, norm_threshold, min_dist)
            print(len(indices), norm_threshold)
            norm_threshold *= 0.5
    elif False:
        indices = indexes(obs_data, params.norm_thres, params.min_dist)

    indices = indexes(obs_data, 1e-7, params.min_dist)
    cverr(5, '## indices: %s' % str(indices))
    cverr(3, '## raw indices: %d' % len(indices))

    if len(indices) == 0:
        return []

    # normalize indices
    if offset > 0:
        indices = indices + offset

    # filter peaks by minimum rfu, and by maximum peak number after sorting by rfu
    peaks = [
        Peak(int(i), int(data[i])) for i in indices if
        (data[i] >= params.min_rfu and params.min_rtime < i < params.max_rtime)
    ]
    #peaks = sorted( peaks, key = lambda x: x.rfu )[:params.max_peak_number * 2]

    #import pprint; pprint.pprint(peaks)
    #print('======')

    if expected_peak_number:
        peaks.sort(key=lambda x: x.rfu, reverse=True)
        peaks = peaks[:round(expected_peak_number * 2)]
        peaks.sort(key=lambda x: x.rtime)

    cverr(3, '## peak above min rfu: %d' % len(peaks))

    return peaks
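
# Sanity sketch for the indexes() call above (peakutils.indexes): with
# thres=1e-7 essentially every local maximum passes the normalized-threshold
# test, so the min_rfu / min_rtime / max_rtime filter does the real work.
import numpy as np
from peakutils import indexes

trace = np.array([0, 2, 0, 0, 9, 1, 0, 5, 0], dtype=float)
print(indexes(trace, 1e-7, 2))  # expected: [1 4 7]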
Example #13
def find_raw_peaks(data, params, offset, expected_peak_number=0):
    """
    params.min_dist
    params.norm_thres
    params.min_rfu
    params.max_peak_number
    """
    #print("expected:", expected_peak_number)
    # cut and pad data to overcome peaks at the end of array
    obs_data = np.append(data[offset:], [0,0,0])
    if False: #expected_peak_number:
        min_dist = params.min_dist
        indices = []
        norm_threshold = params.norm_thres
        expected_peak_number = expected_peak_number * 1.8
        while len(indices) <= expected_peak_number and norm_threshold > 1e-7:
            indices = indexes( obs_data, norm_threshold, min_dist)
            print(len(indices), norm_threshold)
            norm_threshold *= 0.5
    elif False:
        indices = indexes( obs_data, params.norm_thres, params.min_dist)

    indices = indexes( obs_data, 1e-7, params.min_dist)
    cverr(5, '## indices: %s' % str(indices))
    cverr(3, '## raw indices: %d' % len(indices))

    if len(indices) == 0:
        return []

    # normalize indices
    if offset > 0:
        indices = indices + offset

    # filter peaks by minimum rfu, and by maximum peak number after sorting by rfu
    peaks = [Peak(int(i), int(data[i])) for i in indices
             if data[i] >= params.min_rfu and params.min_rtime < i]
    #peaks = sorted( peaks, key = lambda x: x.rfu )[:params.max_peak_number * 2]

    #import pprint; pprint.pprint(peaks)
    #print('======')

    if expected_peak_number:
        peaks.sort( key = lambda x: x.rfu, reverse = True )
        peaks = peaks[: round(expected_peak_number * 2)]
        peaks.sort( key = lambda x: x.rtime )

    cverr(3, '## peak above min rfu: %d' % len(peaks))

    return peaks
Example #14
def find_peaks( raw_data,  params, raw_peaks = None ):
    """
    find all peaks based on the criteria defined in params, and mark them as peak-scanned;
    raw_data is a baseline-normalized & smoothed trace

    parameters used are:
    method: 'cwt', 'relmax', 'mlpy', or 'pd'
    widths: window size for peak scanning
    cwt_min_snr:
    min_height:
    min_relative_ratio:
    max_relative_ratio:
    min_height_ratio:
    max_peak_number:

    """

    if raw_peaks is None:
        raw_peaks = find_raw_peaks( raw_data, params )

    # check for any peaks
    if not raw_peaks:
        return raw_peaks

    # only retain 2 * max_peak_number and discard the rest
    raw_peaks = sorted( raw_peaks, key = lambda x: x[1],
            reverse = True )[:params.max_peak_number * 2]

    if params.min_relative_ratio > 0 or params.max_relative_ratio > 0:
        med = np.median( list(p[1] for p in raw_peaks) )
        if params.min_relative_ratio > 0:
            median_min = med * params.min_relative_ratio
            raw_peaks = [ p for p in raw_peaks if p[1] > median_min ]
        if params.max_relative_ratio > 0:
            median_max = med * params.max_relative_ratio
            raw_peaks = [ p for p in raw_peaks if p[1] < median_max ]

    if not raw_peaks:
        return raw_peaks

    # filter for minimum height ratio

    if params.min_height_ratio > 0:
        min_height = max( list( p[1] for p in raw_peaks) ) * params.min_height_ratio
        raw_peaks = [ p for p in raw_peaks if p[1] > min_height ]

    # calculate area

    (q50, q75) = np.percentile( raw_data, [ 50, 75 ] )
    peaks = []
    for (peak, height) in raw_peaks:
        area, brtime, ertime, srtime, ls, rs = calculate_area( raw_data, peak, 5e-2, q50 )
        wrtime = ertime - brtime
        if wrtime < 3:
            continue
        beta = area / height
        theta = height / wrtime
        if height >= 25 and beta * theta < 6: #10:
            continue
        if height < 25 and beta * theta < 3: #6:
            continue
        peaks.append( (peak, height, area, brtime, ertime, srtime, beta, theta) )

    peaks.sort()
    cverr(3, 'peaks stage 1 size: %d' % len(peaks))
    cverr(3, 'peaks stage 1: %s' % repr(peaks))

    non_artifact_peaks = []

    for idx in range(len(peaks)):
        peak = peaks[idx]

        if idx > 0:
            prev_p = peaks[idx-1]
            if peak[3] - prev_p[4] < 5 and peak[1] < params.artifact_ratio * prev_p[1]:
                # we are artifact, just skip
                continue
        if idx < len(peaks)-1:
            next_p = peaks[idx+1]
            if next_p[3] - peak[4] < 5 and peak[1] < params.artifact_ratio * next_p[1]:
                # we are another artifact, just skip
                continue

        non_artifact_peaks.append( peak )

    cverr(3, 'max_peak_number: %d' % params.max_peak_number)

    sorted_peaks = sorted( non_artifact_peaks, key = lambda x: (x[1], x[6] * x[7]),
                        reverse=True )[:params.max_peak_number]
    peaks = sorted( sorted_peaks )
    cverr(3, 'peaks stage 3 size: %d' % len(peaks))
    cverr(3, 'peaks stage 3: %s' % repr(peaks))

    return peaks
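
# Note on the beta/theta gates above (derived from the code itself):
# beta = area/height and theta = height/wrtime, so beta * theta reduces to
# area/wrtime, the mean signal across the peak's width.
area, height, wrtime = 120.0, 40.0, 10
beta, theta = area / height, height / wrtime
assert beta * theta == area / wrtime == 12.0   # clears the >= 6 gate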
Example #15
def find_raw_peaks( raw_data, params ):

    max_height = max(raw_data)
    width_ratio = max(1, round(math.log( max_height/params.width_ratio )))
    widths = params.widths

    if params.method == 'cwt':
        from scipy.signal import find_peaks_cwt
        indices = find_peaks_cwt( raw_data, widths,
                                    min_snr = params.min_snr )
        #cerr('find_peaks_cwt() found %d peaks' % len(indices))
        #pprint.pprint(indices)

    elif params.method == 'relmax':
        indice_set = []
        from scipy.signal import argrelmax
        for i in (params.widths * width_ratio):
            indice_set.append( argrelmax( raw_data, order=i+5 )[0] )
        # get consensus
        indices = filter_by_snr( get_consensus_indices( indice_set ),
                                raw_data, params.min_snr * 3.5 )
        #print('indices => %d' % len(indices))
        #pprint.pprint( indices )

    elif params.method == 'mlpy':
        indice_set = []
        from mlpy import findpeaks_win
        for i in params.widths:
            indice_set.append( findpeaks_win( raw_data, span=i ) )
        # get consensus
        indices = filter_by_snr( get_consensus_indices( indice_set ),
            raw_data, params.min_snr )

    elif params.method == 'pd':
        from peakutils import indexes
        indices = indexes( raw_data, 1e-5, 10 )
        #pprint.pprint(indices)
        cverr(3, 'indice size: %d' % len(indices))
        cverr(3, 'indices => %s' % repr(indices))

    else:
        raise RuntimeError('unknown peak finding method: %s' % params.method)

    if indices is None or len(indices) == 0:
        return []


    # filter for absolute heights within proximity

    # special cases for pd (peak detect) method:

    if params.method == 'pd':
        return [ ( int(i), int(raw_data[i]) )
                    for i in indices if raw_data[i] > params.min_height and
                        params.min_rtime < i < params.max_rtime ]

    raw_peaks = []
    max_len = len(raw_data)
    for idx in indices:

        if not (params.min_rtime < idx < params.max_rtime):
            continue

        height, index = max( [ (raw_data[i], i)
                                for i in range(max(0, idx-3), min(max_len,idx+3) ) ] )

        if height < params.min_height: continue
        if (index, height) in raw_peaks: continue
        raw_peaks.append( (index, height) )

    return raw_peaks
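
# A hypothetical params object for the 'pd' branch above; only the attributes
# this function reads are filled in, and every value is an assumption.
from types import SimpleNamespace
pd_params = SimpleNamespace(method='pd', widths=[5, 10, 15], width_ratio=100,
                            min_snr=2.0, min_height=25,
                            min_rtime=100, max_rtime=5000)
# find_raw_peaks(raw_data, pd_params) -> [(rtime, height), ...]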
Example #16
def scan_peaks( channel, params, peakdb ):
    """
    scan for peaks based on the criteria defined in params, set as peak-scanned,
    and prepare the channel data structure
    """

    if peakdb:
        raw_peaks = pickle.loads(peakdb.Get(channel.tag().encode()))
    else:
        raw_peaks = None

    initial_peaks = find_peaks( channel.data, params, raw_peaks)
    # peaks = (rtime, height, area, brtime, ertime, srtime, beta, theta)
    #cerr('DEBUG - initial peaks: %d' % len(initial_peaks))

    cverr(3, 'initial peaks: %d' % len(initial_peaks))

    # perform further cleaning for ladder channels
    if params.expected_peak_number:
        epn = params.expected_peak_number
        peak_qualities = sorted([ (p[6] * p[7], p) for p in initial_peaks ], reverse=True)
        low_scores = [ q[0] for q in peak_qualities[round(epn/3):round(epn * 1.5)] ]
        avg_low_score = sum(low_scores) / len(low_scores)
        ratio_low_score = (avg_low_score - low_scores[-1]) / low_scores[-1]
        if avg_low_score < 75:
            # questionable quality, please use more peaks
            score_threshold = 4 #avg_low_score * 0.1
            height_threshold = 6
        else:
            if avg_low_score - low_scores[-1] > low_scores[-1]:
                # peak heights are likely not evenly distributed
                score_threshold = max(low_scores[-1] * 0.90, 4)
            else:
                score_threshold = avg_low_score * 0.25
            height_threshold = 10
            cverr(3, 'using score threshold: %f' % score_threshold)
            cverr(3, 'using height_threshold: %d' % height_threshold)
        peaks = [ q for q in peak_qualities
                            if q[0] > score_threshold and q[1][1] > height_threshold ]
        cverr(3, 'after peak quality filtering: %d' % len(peaks))
        if len(peaks) > 1.5 * params.expected_peak_number:
            # try to remove peaks further
            saved_peaks = peaks
            while len(peaks) - len(saved_peaks) < 0.30 * len(peaks) and height_threshold < 20:
                height_threshold += 1
                saved_peaks = [ q for q in saved_peaks if q[1][1] > height_threshold ]  # filter on peak height, not score
            peaks = saved_peaks
            cverr(3, 'after reducing peaks number by height: %d' % len(peaks))
        peaks = sorted( [ q[1] for q in peaks ] )

    else:
        peaks = initial_peaks


    # create alleles based on these peaks
    alleles = []
    for peak in peaks:
        ( rtime, height, area, brtime, ertime, srtime, beta, theta ) = peak
        wrtime = ertime - brtime
        height = round(height)
        allele = channel.new_allele(    rtime = rtime,
                                        height = height,
                                        area = area,
                                        brtime = brtime,
                                        ertime = ertime,
                                        wrtime = wrtime,
                                        srtime = srtime,
                                        beta = beta,
                                        theta = theta )
        allele.type = peaktype.scanned
        allele.method = binningmethod.notavailable
        allele.marker = channel.marker
        alleles.append( allele )

    return alleles
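
# Assumed call pattern: peakdb looks like a py-leveldb-style store whose
# Get() returns pickled raw peaks keyed by the channel tag; pass a falsy
# value to recompute peaks from channel.data instead.
# alleles = scan_peaks(channel, ladder_params, peakdb=None)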
Example #17
def do_listrawdata(args, fsa_list, dbh):

    outfile = '-'
    if args.outfile != '-':
        print("outfile: ", args.outfile)
        outfile = args.outfile.rsplit('.', 1)[0]
        outfile += "_rawdata."
        outfile += args.outfile.rsplit('.', 1)[1]

        out_stream = open(outfile, 'w')
    else:
        out_stream = sys.stdout

    out_stream.write('SAMPLE NAME,WELL ID,TRACE DYE,RAW DATA\n')
    if out_stream is not sys.stdout:
        out_stream.close()

    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)

        if outfile != '-':
            out_stream = open(outfile, 'a')
        else:
            out_stream = sys.stdout

        # sample name
        sample_name = fsa.filename.rsplit('.', 1)[0]

        # iterate through channels
        markers = fsa.panel.data['markers']
        trace = fsa.get_trace()

        # get well ID
        well_id = fsa.get_well_id()

        for channel in fsa.channels:

            # get trace dye
            if channel.is_ladder():
                trace_dye = markers['x/ladder']['filter']
            else:
                trace_dye = markers['x/' + channel.dye]['filter']

            # get raw data
            data = channel.data
            datastring = "["

            basepairs = channel.get_basepairs()
            #channel.set_basepairs(fsa.allele_fit_func)
            for i in range(len(data)):
                rfu = data[i]
                bp = basepairs[i] if basepairs else -999
                if bp > -999:
                    datastring += "[%i,%.2f,%i]," % (i, bp, rfu)
                else:
                    datastring += "[%i,null,%i]," % (i, rfu)
            datastring = datastring[:-1] + "]"

            out_stream.write("\"%10s\",\"%s\",\"%s\",\"%s\"\n" %
                             (sample_name, well_id, trace_dye, datastring))

        if out_stream is not sys.stdout:
            out_stream.close()
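
# A row written above would look roughly like this (values illustrative only):
# "  sample01","A1","6-FAM","[[0,null,12],[1,null,15],[2,24.31,830]]"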
Example #18
def align_hc( peaks, ladder):
    """ peaks: list of rtime, in ascending order
        ladders: list of size from ladders, in ascending order

        returns: (score, msg, result, method)
    """

    #import pprint; pprint.pprint(peaks)

    # generate P for ladder
    if 'C' not in ladder:
        if 'T' not in ladder:
            ladder['T'] = generate_tree( [ (n,0) for n in ladder['sizes'] ] )
        ladder['C'] = generate_cluster(ladder['T'], ladder['k'])
    ladder_clusters = ladder['C']
    ladder_sizes = ladder['sizes']

    P = generate_tree( [ (n.rtime, 0) for n in peaks ] )
    peak_clusters = generate_cluster( P, ladder['k'] )

    # generate cluster should use so-called balance tree

    #print(peak_clusters)

    if len(peak_clusters[-1]) == 1:
        if len( reduce(operator.add, peak_clusters ) ) > len(ladder_sizes):
            del peak_clusters[-1]
            #del peaks[-1]
    if len(peak_clusters[0]) == 1:
        if len( reduce(operator.add, peak_clusters ) ) > len(ladder_sizes):
            del peak_clusters[0]
            #del peaks[0]

    if len(peak_clusters) < ladder['k']:
        P = generate_tree( [ (n, 0) for n in reduce(operator.add, peak_clusters) ] )
        peak_clusters = generate_cluster(P, ladder['k'])

    # short cut, in case we have good high quality peaks
    if sum( len(c) for c in peak_clusters ) == len(ladder_sizes):
        hq_peaks = sum(peak_clusters, [])
        #hq_pairs = zip(hq_peaks, ladder_sizes)
        zres = estimate_z(hq_peaks, ladder_sizes)
        dp_result = align_dp( hq_peaks, ladder_sizes, [1.0] * len(hq_peaks),
                                    zres.z, zres.rss )
        dp_result.sized_peaks = pair_sized_peaks(peaks, dp_result.sized_peaks)
        score, msg = ladder['qcfunc']( dp_result, method = 'relax')
        if score > 0.9:
            return AlignResult(score, msg, dp_result, const.alignmethod.hcm_strict)

    #print(">>> clusters:\n", peak_clusters)
    cluster_pairings, expected_missing = align_clusters( peak_clusters,
            ladder_clusters )

    #print(">>> cluster pairs:\n", cluster_pairings)
    # check each cluster pairing

    initial_pairs = []
    for pairs in cluster_pairings:
        if is_good_pairing(pairs):
            initial_pairs.extend( pairs )
        else:
            cverr(3, '>> this pairing is not included:\n%s' % pairs)

    cverr(3, '>> initial pairs:\n%s' % initial_pairs)

    if not initial_pairs:
        return AlignResult(-1, 'E: no initial pairs defined!', None, None)

    # try to dp align the initial pairs as a shortcut for good sample or peaks

    rtimes, sizes = zip( *initial_pairs )
    zres = estimate_z(rtimes, sizes)

    dp_result = align_dp( [p.rtime for p in peaks], ladder_sizes,
                            generate_similarity(peaks), zres.z, zres.rss )
    dp_result.sized_peaks = pair_sized_peaks(peaks, dp_result.sized_peaks)
    score, msg = ladder['qcfunc']( dp_result, method = 'strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result, const.alignmethod.hcm_strict)

    return AlignResult(-1, 'ERR: alignment needs minimization', None, None,
                initial_pairs=initial_pairs)
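
# A hypothetical ladder descriptor as consumed by align_hc(): 'sizes' in bp
# (ascending), 'k' the flat-cluster count, and 'qcfunc' a QC callable
# returning (score, msg); 'T' and 'C' are memoized into the dict by
# align_hc() itself.
# ladder = {'sizes': [75, 100, 139, 150, 160, 200, 300, 340, 350, 400],
#           'k': 4, 'qcfunc': my_qcfunc}      # my_qcfunc is a placeholder
# result = align_hc(peaks, ladder)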
Example #19
def filter_for_artifact(peaks, params, expected_peak_number = 0):
    """
    params.max_peak_number
    params.artifact_ratio
    params.artifact_dist ~ 5
    """

    # the following code in this function performs the necessary acrobatic act
    # to select the most likely peaks that can be considered as true signals,
    # which is especially necessary for ladder - size assignment

    if len(peaks) == expected_peak_number:
        return peaks

    # we need to adapt to the noise level of the current channel
    if expected_peak_number > 0:
        epn = expected_peak_number
        theta_peaks = sorted(peaks, key = lambda x: x.theta, reverse=True)[round(epn/2)+3:epn-1]
        #theta_peaks = theta_peaks[2:4] + theta_peaks[round(epn/2):epn-1]
        omega_peaks = sorted(peaks, key = lambda x: x.omega, reverse=True)
        omega_peaks = omega_peaks[2:4] + omega_peaks[round(epn/2):epn-1]
        rfu_peaks = sorted(peaks, key = lambda x: x.rfu, reverse=True)[:epn-1]

        if theta_peaks[-1].theta < 8:
            theta_peaks.sort()
            thetas = np.array([ p.theta for p in theta_peaks ])
            rtimes = [ p.rtime for p in theta_peaks ]

            #plt.scatter(rtimes, thetas)
            #plt.show()
            popt, pcov = curve_fit( math_func, rtimes, 0.5 * thetas, p0 = [ -1, 1 ])

            if is_verbosity(4):
                xx = np.linspace( rtimes[0], rtimes[-1]+2000, 100 )
                yy = math_func(xx, *popt)
                plt.plot(xx, yy)
                plt.scatter( [p.rtime for p in peaks], [p.theta for p in peaks])
                plt.show()

            q_theta = lambda x: x.theta >= math_func(x.rtime, *popt) or x.theta > 100

        else:
            q_theta = lambda x: x.theta >= min(theta_peaks[-1].theta, params.min_theta)


        if omega_peaks[-1].omega < 200:
            omega_peaks.sort()
            omegas = np.array([ p.omega for p in omega_peaks ])
            rtimes = np.array([ p.rtime for p in omega_peaks ])

            # generate a quadratic threshold for omega

            # generate a quadratic ratio series first
            popt, pcov = curve_fit( quadratic_math_func,
                    [rtimes[0], (rtimes[0] + rtimes[-1])/2, rtimes[-1]],
                    [0.05, 0.25, 0.05])
            ratios = quadratic_math_func(rtimes, *popt)
            if is_verbosity(4):
                plt.plot(rtimes, ratios)
                plt.show()

            # use the ratios to enforce quadratic threshold
            popt, pcov = curve_fit( quadratic_math_func, rtimes, ratios * omegas,
                                        p0 = [ -1, 1, 0 ])
            if popt[0] > 0:
                # enforce small flat ratio
                popt, pcov = curve_fit( math_func, rtimes, 0.25 * omegas, p0 = [ 1, 0 ])
                popt = np.insert(popt, 0, 0.0)  # convert to 3 params
            if is_verbosity(4):
                plt.scatter(rtimes, omegas)
                xx = np.linspace( rtimes[0], rtimes[-1]+2000, 100 )
                yy = quadratic_math_func(xx, *popt)
                plt.plot(xx, yy)
                plt.scatter( [p.rtime for p in peaks], [p.omega for p in peaks])
                plt.show()

            q_omega = lambda x: (   x.omega >= 100 or
                                    x.omega >= quadratic_math_func(x.rtime, *popt) )

        else:

            q_omega = lambda x: x.omega >= min(omega_peaks[-1].omega, 50)


        min_rfu = rfu_peaks[-1].rfu * 0.125

    else:
        min_theta = 0
        min_omega = 0
        min_theta_omega = 0
        min_rfu = 2
        # no expected peak number -> no fitted threshold; accept every peak
        # at this stage (q_omega is referenced unconditionally below)
        q_omega = lambda x: True


    # filter for too sharp/thin peaks
    filtered_peaks = []
    for p in peaks:
        #filtered_peaks.append(p); continue
        cverr(5, str(p))

        if len(filtered_peaks) < 2 and p.area > 50:
            # first two real peaks might be a bit lower
            filtered_peaks.append(p)
            continue

        if not q_omega(p):
            cverr(5, '! q_omega')
            continue
        #if not q_theta(p):
        #    print('! q_theta')
        #    continue

        #if min_theta and min_omega and p.omega < min_omega and p.theta < min_theta:
        #    print('! omega & theta')
        #    continue
        #if min_theta_omega and p.theta * p.omega < min_theta_omega:
        #    print('! theta_omega')
        #    continue
        if p.theta < 1.0 and p.area < 25 and p.omega < 5:
            cverr(5, '! extreme theta & area & omega')
            continue
        if p.rfu < min_rfu:
            cverr(5, '! extreme min_rfu')
            continue
        if p.beta > 25 and p.theta < 0.5:
            cverr(5, '! extreme beta')
            continue
        if p.wrtime < 3:
            continue
        if p.rfu >= 25 and p.beta * p.theta < 6:
            continue
        if p.rfu < 25 and p.beta * p.theta < 3:
            continue
        #if p.omega < 50:
        #    continue
        #if p.omega < 100 and p.theta < 5:
        #    continue
        #if ( params.max_beta and min_theta and
        #        (p.beta > params.max_beta and p.theta < min_theta) ):
        #    print('! max_beta')
        #    continue
        filtered_peaks.append(p)

    #import pprint; pprint.pprint(filtered_peaks)

    # filter for distance between peaks and their rfu ratio
    peaks = sorted(filtered_peaks, key = lambda x: x.rtime)
    non_artifact_peaks = []
    for idx in range(len(peaks)):
        p = peaks[idx]

        if idx > 0:
            prev_p = peaks[idx-1]
            if ( p.brtime - prev_p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * prev_p.rfu ):
                # we are artifact, just skip
                print('artifact1:', p)
                continue

        if idx < len(peaks)-1:
            next_p = peaks[idx+1]
            if ( next_p.brtime - p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * next_p.rfu ):
                # we are artifact, just skip
                print('artifact2:', p)
                continue

        non_artifact_peaks.append( p )

    #import pprint; pprint.pprint(non_artifact_peaks)
    #print(len(non_artifact_peaks))

    peaks = non_artifact_peaks

    cverr(3, '## non artifact peaks: %d' % len(peaks))

    return peaks
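
# math_func is evidently linear: after fitting it, the code above prepends a
# 0.0 leading coefficient ("convert to 3 params") so it can stand in for
# quadratic_math_func. A sketch consistent with that usage (an assumption;
# the real definitions live elsewhere in the module):
def math_func(x, a, b):                    # linear threshold model
    return a * np.asarray(x, dtype=float) + b

def quadratic_math_func(x, a, b, c):       # quadratic threshold model
    x = np.asarray(x, dtype=float)
    return a * x ** 2 + b * x + c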
Example #20
def align_pm(peaks, ladder, anchor_pairs=None):

    if not anchor_pairs:
        longest_rtime_peak = max([p.rtime for p in peaks])
        if longest_rtime_peak > PEAK_RTIME_UPPER_BOUND:
            bound_adjust_ratio = longest_rtime_peak / PEAK_RTIME_UPPER_BOUND
            anchor_start = ANCHOR_RTIME_LOWER_BOUND * bound_adjust_ratio
            anchor_end = ANCHOR_RTIME_UPPER_BOUND * bound_adjust_ratio
        else:
            anchor_start = ANCHOR_RTIME_LOWER_BOUND
            anchor_end = ANCHOR_RTIME_UPPER_BOUND
        anchor_peaks = [
            p for p in peaks if anchor_start < p.rtime < anchor_end
        ]
        anchor_pairs, initial_z = estimate_pm(anchor_peaks,
                                              ladder['signature'])

    else:
        rtimes, bpsizes = zip(*anchor_pairs)
        initial_z = estimate_z(rtimes, bpsizes, 1)

    anchor_pairs.sort()
    pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs, initial_z)
    #print(pairs)
    pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, initial_z)

    #print(rss)
    #plot(f.rtimes, f.sizes, z, pairs)
    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if is_verbosity(1):
        import pprint
        pprint.pprint(dp_result.sized_peaks)
    if is_verbosity(4):
        plot(f.rtimes, f.sizes, dp_result.z,
             [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result, const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)
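
    # NOTE: everything below this unconditional return is unreachable; it
    # reads as an older iterative-minimization alignment path kept for
    # reference.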

    f = ZFunc(peaks, ladder['sizes'], anchor_pairs)

    z = initial_z
    score = last_score = 0
    last_z = None

    for order in [1, 2, 3]:

        last_rss = -1
        rss = 0

        niter = 0
        while abs(rss - last_rss) > 1e-3:

            niter += 1
            cverr(5, 'Iter: %d' % niter)

            cverr(5, z)
            score = f(z)
            if last_score and last_score < score:
                # score does not converge; just exit
                cverr(5, 'does not converge!')
                break

            pairs, cur_rss = f.get_pairs(z)
            rtimes, bpsizes = zip(*pairs)
            zres = estimate_z(rtimes, bpsizes, order)

            last_z = z
            z = zres.z
            last_rss = rss
            rss = zres.rss
            cverr(5, rss)

    dp_result = align_dp(f.rtimes, f.sizes, last_z, last_rss)

    return align_gm2(peaks, ladder, anchor_pairs, dp_result.z)

    new_anchor_pairs = []
    zf = np.poly1d(dp_result.z)
    for p in dp_result.sized_peaks:
        if (p[0] - zf(p[1]))**2 < 2:
            new_anchor_pairs.append((p[1], p[0]))
    #import pprint; pprint.pprint(dp_result.sized_peaks)
    plot(f.rtimes, f.sizes, dp_result.z,
         [(x[1], x[0]) for x in dp_result.sized_peaks])

    return align_gm(peaks, ladder, anchor_pairs, dp_result.z)
Example #21
def align_hc(peaks, ladder):
    """ peaks: list of rtime, in ascending order
        ladders: list of size from ladders, in ascending order

        returns: (score, msg, result, method)
    """

    #import pprint; pprint.pprint(peaks)

    # generate P for ladder
    if 'C' not in ladder:
        if 'T' not in ladder:
            ladder['T'] = generate_tree([(n, 0) for n in ladder['sizes']])
        ladder['C'] = generate_cluster(ladder['T'], ladder['k'])
    ladder_clusters = ladder['C']
    ladder_sizes = ladder['sizes']

    P = generate_tree([(n.rtime, 0) for n in peaks])
    peak_clusters = generate_cluster(P, ladder['k'])

    # generate cluster should use so-called balance tree

    #print(peak_clusters)

    if len(peak_clusters[-1]) == 1:
        if len(reduce(operator.add, peak_clusters)) > len(ladder_sizes):
            del peak_clusters[-1]
            #del peaks[-1]
    if len(peak_clusters[0]) == 1:
        if len(reduce(operator.add, peak_clusters)) > len(ladder_sizes):
            del peak_clusters[0]
            #del peaks[0]

    if len(peak_clusters) < ladder['k']:
        P = generate_tree([(n, 0)
                           for n in reduce(operator.add, peak_clusters)])
        peak_clusters = generate_cluster(P, ladder['k'])

    # short cut, in case we have good high quality peaks
    if sum(len(c) for c in peak_clusters) == len(ladder_sizes):
        hq_peaks = sum(peak_clusters, [])
        #hq_pairs = zip(hq_peaks, ladder_sizes)
        zres = estimate_z(hq_peaks, ladder_sizes)
        dp_result = align_dp(hq_peaks, ladder_sizes, [1.0] * len(hq_peaks),
                             zres.z, zres.rss)
        dp_result.sized_peaks = pair_sized_peaks(peaks, dp_result.sized_peaks)
        score, msg = ladder['qcfunc'](dp_result, method='relax')
        if score > 0.9:
            return AlignResult(score, msg, dp_result,
                               const.alignmethod.hcm_strict)

    #print(">>> clusters:\n", peak_clusters)
    cluster_pairings, expected_missing = align_clusters(
        peak_clusters, ladder_clusters)

    #print(">>> cluster pairs:\n", cluster_pairings)
    # check each cluster pairing

    initial_pairs = []
    for pairs in cluster_pairings:
        if is_good_pairing(pairs):
            initial_pairs.extend(pairs)
        else:
            cverr(3, '>> this pairing is not included:\n%s' % pairs)

    cverr(3, '>> initial pairs:\n%s' % initial_pairs)

    if not initial_pairs:
        return AlignResult(-1, 'E: no initial pairs defined!', None, None)

    # try to dp align the initial pairs as a shortcut for good sample or peaks

    rtimes, sizes = zip(*initial_pairs)
    zres = estimate_z(rtimes, sizes)

    dp_result = align_dp([p.rtime for p in peaks], ladder_sizes,
                         generate_similarity(peaks), zres.z, zres.rss)
    dp_result.sized_peaks = pair_sized_peaks(peaks, dp_result.sized_peaks)
    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result, const.alignmethod.hcm_strict)

    return AlignResult(-1,
                       'ERR: alignment needs minimization',
                       None,
                       None,
                       initial_pairs=initial_pairs)
Example #22
def main(args):

    if args.verbose != 0:
        set_verbosity(args.verbose)

    dbh = None

    # set parameter for baseline correction and allelemethod
    from fatools.lib.const import allelemethod, baselinemethod
    _params = params.Params()

    _params.baselinewindow = args.baselinewindow

    if args.baselinemethod != "":
        if args.baselinemethod == 'none':
            _params.baselinemethod = baselinemethod.none
        elif args.baselinemethod == 'median':
            _params.baselinemethod = baselinemethod.median
        elif args.baselinemethod == 'minimum':
            _params.baselinemethod = baselinemethod.minimum
        else:
            raise NotImplementedError()

    if args.allelemethod != "":
        if args.allelemethod == 'leastsquare':
            _params.allelemethod = allelemethod.leastsquare
        elif args.allelemethod == 'cubicspline':
            _params.allelemethod = allelemethod.cubicspline
        elif args.allelemethod == 'localsouthern':
            _params.allelemethod = allelemethod.localsouthern
        else:
            raise NotImplementedError()

    if args.nonladder_smoothing_window > 0:
        _params.nonladder.smoothing_window = args.nonladder_smoothing_window
        _params.nonladder.smoothing_order = args.nonladder_smoothing_order

    cerr('I: Aligning size standards...')
    if args.file or args.infile or args.indir:
        cverr(4, 'D: opening FSA file(s)')
        fsa_list = open_fsa(args, _params)
    elif dbh is None:
        cverr(4, 'D: connecting to database')
        dbh = get_dbhandler(args)
        fsa_list = get_fsa_list(args, dbh)

    cerr('I: obtained %d FSA' % len(fsa_list))

    if args.commit:
        with transaction.manager:
            do_facmds(args, fsa_list, _params, dbh)
            cerr('** COMMIT to database **')
    elif dbh:
        cerr(
            'WARNING ** running without database COMMIT! All changes will be discarded!'
        )
        if not (args.test or args.y):
            keys = input('Do you want to continue [y/n]? ')
            if not keys.lower().strip().startswith('y'):
                sys.exit(1)
        do_facmds(args, fsa_list, _params, dbh)
    else:
        do_facmds(args, fsa_list, _params)
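
# The two string -> enum chains above are plain lookups; a hedged equivalent
# using dicts over the same fatools.lib.const members, applied only when the
# argument is non-empty as in the original:
# BASELINE = {'none': baselinemethod.none, 'median': baselinemethod.median,
#             'minimum': baselinemethod.minimum}
# if args.baselinemethod != "":
#     try:
#         _params.baselinemethod = BASELINE[args.baselinemethod]
#     except KeyError:
#         raise NotImplementedError(args.baselinemethod)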
Example #23
def filter_for_artifact(peaks, params, expected_peak_number=0):
    """
    params.max_peak_number
    params.artifact_ratio
    params.artifact_dist ~ 5
    """

    # the following code in this function performs the necessary acrobatic act
    # to select the most likely peaks that can be considered as true signals,
    # which is especially necessary for ladder - size assignment

    if len(peaks) == expected_peak_number:
        return peaks

    # we need to adapt to the noise level of the current channel
    if expected_peak_number > 0:
        epn = expected_peak_number
        theta_peaks = sorted(peaks, key=lambda x: x.theta,
                             reverse=True)[round(epn / 2) + 3:epn - 1]
        #theta_peaks = theta_peaks[2:4] + theta_peaks[round(epn/2):epn-1]
        omega_peaks = sorted(peaks, key=lambda x: x.omega, reverse=True)
        omega_peaks = omega_peaks[2:4] + omega_peaks[round(epn / 2):epn - 1]
        rfu_peaks = sorted(peaks, key=lambda x: x.rfu, reverse=True)[:epn - 1]

        if theta_peaks[-1].theta < 8:
            theta_peaks.sort()
            thetas = np.array([p.theta for p in theta_peaks])
            rtimes = [p.rtime for p in theta_peaks]

            #plt.scatter(rtimes, thetas)
            #plt.show()
            popt, pcov = curve_fit(math_func, rtimes, 0.5 * thetas, p0=[-1, 1])

            if is_verbosity(4):
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = math_func(xx, *popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks], [p.theta for p in peaks])
                plt.show()

            q_theta = lambda x: x.theta >= math_func(x.rtime, *popt
                                                     ) or x.theta > 100

        else:
            q_theta = lambda x: x.theta >= min(theta_peaks[-1].theta, params.
                                               min_theta)

        if omega_peaks[-1].omega < 200:
            omega_peaks.sort()
            omegas = np.array([p.omega for p in omega_peaks])
            rtimes = np.array([p.rtime for p in omega_peaks])

            # generate a quadratic threshold for omega

            # generate a quadratic ratio series first
            popt, pcov = curve_fit(
                quadratic_math_func,
                [rtimes[0],
                 (rtimes[0] + rtimes[-1]) / 2, rtimes[-1]], [0.05, 0.25, 0.05])
            ratios = quadratic_math_func(rtimes, *popt)
            if is_verbosity(4):
                plt.plot(rtimes, ratios)
                plt.show()

            # use the ratios to enforce quadratic threshold
            popt, pcov = curve_fit(quadratic_math_func,
                                   rtimes,
                                   ratios * omegas,
                                   p0=[-1, 1, 0])
            if popt[0] > 0:
                # enforce small flat ratio
                popt, pcov = curve_fit(math_func,
                                       rtimes,
                                       0.25 * omegas,
                                       p0=[1, 0])
                popt = np.insert(popt, 0, 0.0)  # convert to 3 params
            if is_verbosity(4):
                plt.scatter(rtimes, omegas)
                xx = np.linspace(rtimes[0], rtimes[-1] + 2000, 100)
                yy = quadratic_math_func(xx, *popt)
                plt.plot(xx, yy)
                plt.scatter([p.rtime for p in peaks], [p.omega for p in peaks])
                plt.show()

            q_omega = lambda x: (x.omega >= 100 or x.omega >=
                                 quadratic_math_func(x.rtime, *popt))

        else:

            q_omega = lambda x: x.omega >= min(omega_peaks[-1].omega, 50)

        min_rfu = rfu_peaks[-1].rfu * 0.125

    else:
        min_theta = 0
        min_omega = 0
        min_theta_omega = 0
        min_rfu = 2
        # no expected peak number -> no fitted threshold; accept every peak
        # at this stage (q_omega is referenced unconditionally below)
        q_omega = lambda x: True

    # filter for too sharp/thin peaks
    filtered_peaks = []
    for p in peaks:
        #filtered_peaks.append(p); continue
        cverr(5, p)

        if len(filtered_peaks) < 2 and p.area > 50:
            # first two real peaks might be a bit lower
            filtered_peaks.append(p)
            continue

        if not q_omega(p):
            cverr(5, '! q_omega')
            continue
        #if not q_theta(p):
        #    print('! q_theta')
        #    continue

        #if min_theta and min_omega and p.omega < min_omega and p.theta < min_theta:
        #    print('! omega & theta')
        #    continue
        #if min_theta_omega and p.theta * p.omega < min_theta_omega:
        #    print('! theta_omega')
        #    continue
        if p.theta < 1.0 and p.area < 25 and p.omega < 5:
            cverr(5, '! extreme theta & area & omega')
            continue
        if p.rfu < min_rfu:
            cverr(5, '! extreme min_rfu')
            continue
        if p.beta > 25 and p.theta < 0.5:
            cverr(5, '! extreme beta')
            continue
        if p.wrtime < 3:
            continue
        if p.rfu >= 25 and p.beta * p.theta < 6:
            continue
        if p.rfu < 25 and p.beta * p.theta < 3:
            continue
        #if p.omega < 50:
        #    continue
        #if p.omega < 100 and p.theta < 5:
        #    continue
        #if ( params.max_beta and min_theta and
        #        (p.beta > params.max_beta and p.theta < min_theta) ):
        #    print('! max_beta')
        #    continue
        filtered_peaks.append(p)

    #import pprint; pprint.pprint(filtered_peaks)

    # filter for distance between peaks and their rfu ratio
    peaks = sorted(filtered_peaks, key=lambda x: x.rtime)
    non_artifact_peaks = []
    for idx in range(len(peaks)):
        p = peaks[idx]

        if idx > 0:
            prev_p = peaks[idx - 1]
            if (p.brtime - prev_p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * prev_p.rfu):
                # we are artifact, just skip
                print('artifact1:', p)
                continue

        if idx < len(peaks) - 1:
            next_p = peaks[idx + 1]
            if (next_p.brtime - p.ertime < params.artifact_dist
                    and p.rfu < params.artifact_ratio * next_p.rfu):
                # we are artifact, just skip
                print('artifact2:', p)
                continue

        non_artifact_peaks.append(p)

    #import pprint; pprint.pprint(non_artifact_peaks)
    #print(len(non_artifact_peaks))

    peaks = non_artifact_peaks

    cverr(3, '## non artifact peaks: %d' % len(peaks))

    return peaks
Example #24
def align_pm(peaks, ladder, anchor_pairs=None):

    if not anchor_pairs:
        longest_rtime_peak = max([p.rtime for p in peaks])
        if longest_rtime_peak > PEAK_RTIME_UPPER_BOUND:
            bound_adjust_ratio = longest_rtime_peak / PEAK_RTIME_UPPER_BOUND
            anchor_start = ANCHOR_RTIME_LOWER_BOUND * bound_adjust_ratio
            anchor_end = ANCHOR_RTIME_UPPER_BOUND * bound_adjust_ratio
        else:
            anchor_start = ANCHOR_RTIME_LOWER_BOUND
            anchor_end = ANCHOR_RTIME_UPPER_BOUND
        anchor_peaks = [ p for p in peaks if anchor_start < p.rtime < anchor_end ]
        anchor_pairs, initial_z = estimate_pm( anchor_peaks, ladder['signature'] )

    else:
        rtimes, bpsizes = zip( *anchor_pairs )
        initial_z = estimate_z(rtimes, bpsizes, 1)

    anchor_pairs.sort()
    pairs, z, rss, f = align_upper_pm(peaks, ladder, anchor_pairs, initial_z)
    #print(pairs)
    pairs, z, rss, f = align_lower_pm(peaks, ladder, pairs, initial_z)

    #print(rss)
    #plot(f.rtimes, f.sizes, z, pairs)
    # last dp
    dp_result = align_dp(f.rtimes, f.sizes, f.similarity, z, rss)
    if is_verbosity(1):
        import pprint; pprint.pprint(dp_result.sized_peaks)
    if is_verbosity(4):
        plot(f.rtimes, f.sizes, dp_result.z, [(x[1], x[0]) for x in dp_result.sized_peaks])

    dp_result.sized_peaks = f.get_sized_peaks(dp_result.sized_peaks)

    score, msg = ladder['qcfunc'](dp_result, method='strict')
    if score > 0.9:
        return AlignResult(score, msg, dp_result, const.alignmethod.pm_strict)

    score, msg = ladder['qcfunc'](dp_result, method='relax')
    return AlignResult(score, msg, dp_result, const.alignmethod.pm_relax)
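
    # NOTE: as in the earlier copy of align_pm(), everything below this
    # unconditional return is unreachable legacy minimization code.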


    f = ZFunc(peaks, ladder['sizes'], anchor_pairs)

    z = initial_z
    score = last_score = 0
    last_z = None

    for order in [1, 2, 3]:

        last_rss = -1
        rss = 0

        niter = 0
        while abs(rss - last_rss) > 1e-3:

            niter += 1
            cverr(5, 'Iter: %d' % niter)

            cverr(5, z)
            score = f(z)
            if last_score and last_score < score:
                # score does not converge; just exit
                cverr(5, 'does not converge!')
                break

            pairs, cur_rss = f.get_pairs(z)
            rtimes, bpsizes = zip( *pairs )
            zres = estimate_z(rtimes, bpsizes, order)

            last_z = z
            z = zres.z
            last_rss = rss
            rss = zres.rss
            cverr(5, rss)

    dp_result = align_dp(f.rtimes, f.sizes, last_z, last_rss)

    return align_gm2(peaks, ladder, anchor_pairs, dp_result.z)



    new_anchor_pairs = []
    zf = np.poly1d(dp_result.z)
    for p in dp_result.sized_peaks:
        if (p[0] - zf(p[1]))**2 < 2:
            new_anchor_pairs.append( (p[1], p[0]) )
    #import pprint; pprint.pprint(dp_result.sized_peaks)
    plot(f.rtimes, f.sizes, dp_result.z, [(x[1], x[0]) for x in dp_result.sized_peaks])

    return align_gm(peaks, ladder, anchor_pairs, dp_result.z)
Example #25
def do_listpeaks(args, fsa_list, dbh):

    if args.outfile != '-':
        out_stream = open(args.outfile, 'w')
    else:
        out_stream = sys.stdout

    if args.peaks_format == 'standard':
        out_stream.write(
            'SAMPLE\tFILENAME   \tDYE\tRTIME\tSIZE\tHEIGHT\tAREA\tSCORE\n')
    elif args.peaks_format == 'peakscanner':
        out_stream.write(
            "Dye/Sample Peak,Sample File Name,Type,Size,Height,Area in Point,Area in BP,Corrected Area in BP,Data Point,Begin Point,"
        )
        if args.merge:
            out_stream.write(
                "Begin BP,End Point,End BP,Width in Point,Width in BP,Score,Peak Group,User Comments,User Edit\n"
            )
        else:
            out_stream.write(
                "Begin BP,End Point,End BP,Width in Point,Width in BP,Score,User Comments,User Edit\n"
            )

    else:
        raise RuntimeError("Unknown value for args.peaks_format")
    if out_stream is not sys.stdout:
        out_stream.close()

    for (fsa, fsa_index) in fsa_list:
        cverr(3, 'D: calling FSA %s' % fsa.filename)

        markers = fsa.panel.data['markers']

        if args.outfile != '-':
            out_stream = open(args.outfile, 'a')
        else:
            out_stream = sys.stdout

        for channel in fsa.channels:
            if channel.is_ladder():
                color = markers['x/ladder']['filter']
            else:
                color = markers['x/' + channel.dye]['filter']

            alleles = channel.get_alleles(broad_peaks_only=False)

            if is_verbosity(4):
                cout('Marker => %s | %s [%d]' %
                     (channel.marker.code, channel.dye, len(alleles)))
                cout("channel has alleles :", len(alleles))

            i = 1

            smeared_alleles = channel.smeared_alleles
            if (not args.merge) or channel.is_ladder():
                for p in alleles:
                    if args.peaks_format == 'standard':
                        out_stream.write(
                            '%6s\t%10s\t%3s\t%d\t%d\t%5i\t%3.2f\t%3.2f\n' %
                            (fsa_index, fsa.filename[:-4], color, p.rtime,
                             p.size, p.height, p.area, p.qscore))
                    else:
                        out_stream.write(
                            '"%s, %i",%s, %s, %f, %i, %i, %i, %i, %i, %i, %f, %i, %f, %i, %f, %f,,\n'
                            % (color, i, fsa.filename, p.type, p.size,
                               p.height, p.area, p.area_bp, p.area_bp_corr,
                               p.rtime, p.brtime, p.begin_bp, p.ertime,
                               p.end_bp, p.wrtime, p.width_bp, p.qscore))
                    i = i + 1

            else:
                if is_verbosity(4):
                    cout('Marker => %s | %s [%d]' %
                         (channel.marker.code, channel.dye,
                          len(smeared_alleles)))
                    cout("channel has smeared alleles :", len(smeared_alleles))
                i = 1
                for p in smeared_alleles:
                    out_stream.write(
                        '"%s, %i", %s, %s, %f, %i, %i, %i, %i, %i, %i, %f, %i, %f, %i, %f, %f, %i,,\n'
                        % (color, i, fsa.filename, p.type, p.size, p.height,
                           p.area, p.area_bp, p.area_bp_corr, p.rtime,
                           p.brtime, p.begin_bp, p.ertime, p.end_bp, p.wrtime,
                           p.width_bp, p.qscore, p.group))
                    i = i + 1

        if out_stream is not sys.stdout:
            out_stream.close()