Exemple #1
0
def LDA(line_list, temp):
    """
    Decide whether every line passes the temperature-specific LDA filter.

    A two-class LinearDiscriminantAnalysis model (classes -1 and 1) is
    populated from the hard-coded coefficients for ``temp`` and applied to
    one feature row per line: ``[len(line), line.xs_tag, line.gc_content]``.

    If ``temp`` is not one of the supported temperatures, a message is
    printed and the process exits via ``sys.exit()``.

    :param line_list: list of SAM object; each must support ``len()`` and
        expose ``xs_tag`` and ``gc_content`` attributes
    :param temp: temperature; one of 32, 37, 42, 47, 52, 57
    :return: True when the predicted probability of class 1 is below 0.5 for
        every line; False otherwise, including when any line has a falsy
        ``xs_tag``
    """
    temp_list = [32, 37, 42, 47, 52, 57]
    coef_list = [[[-0.14494789, 0.18791679, 0.02588474]],
                 [[-0.13364364, 0.22510179, 0.05494031]],
                 [[-0.09006122, 0.25660706, 0.1078303]],
                 [[-0.01593182, 0.24498485, 0.15753649]],
                 [[0.01860365, 0.1750174, 0.17003374]],
                 [[0.03236755, 0.11624593, 0.24306498]]]
    inter_list = [-1.17545204, -5.40436344, -12.45549846,
                  -19.32670233, -20.11992898, -23.98652919]
    class_list = [-1, 1]

    # Each temperature selects one row of coef_list / inter_list.
    try:
        classifier_index = temp_list.index(temp)
    except ValueError:
        print("The given temperature was not in temp_list:", temp_list)
        sys.exit()

    # Build the feature rows before touching the model so that a line
    # without an XS tag is rejected without any classifier setup.
    feature_rows = []
    for sub_line in line_list:
        if not sub_line.xs_tag:
            return False
        # Built-in float: the ``np.float`` alias was removed in NumPy 1.24.
        feature_rows.append(
            [float(len(sub_line)), sub_line.xs_tag, sub_line.gc_content])

    # Populate an unfitted classifier with the stored parameters instead of
    # training it.
    lda_classifier = LinearDiscriminantAnalysis()
    lda_classifier.coef_ = np.asarray(coef_list[classifier_index])
    lda_classifier.intercept_ = np.asarray(inter_list)[classifier_index]
    lda_classifier.classes_ = np.asarray(class_list)

    # Column 1 of predict_proba corresponds to classes_[1], i.e. class 1.
    lda_prob = lda_classifier.predict_proba(np.asarray(feature_rows))[:, 1]
    return all(prob < 0.5 for prob in lda_prob)
Exemple #2
0
# Convert the model parameter lists to ndarrays.
# NOTE(review): coefList, interList, classList and tempList are not defined
# in this snippet -- presumably they are defined earlier in the file; confirm.
coefArray = np.asarray(coefList)
interArray = np.asarray(interList)
classArray = np.asarray(classList)

# Determine which index to reference for model values.
# The temperature is hard-coded to 57 here; tempList.index raises ValueError
# if that value is not present in tempList.
tempVal = 57
np_index = tempList.index(tempVal)

# Build the model by assigning the stored parameters for the selected
# temperature directly onto an unfitted classifier (no training happens here).
clf = LinearDiscriminantAnalysis()
clf.coef_ = coefArray[np_index]
clf.intercept_ = interArray[np_index]
clf.classes_ = classArray

# Determine which classifier parameters to use.
# This is the same lookup as np_index above, so clfT == np_index.
clfT = tempList.index(tempVal)

# Make containers to hold data about candidates; all start empty.
testList = []
testSet = set()
candsInfo = []

# Path to the .sam file holding the realignment result.
# NOTE(review): hard-coded absolute path to one user's machine.
samf = "/Users/yeweijian/Downloads/data/t.sam"

# Make a list to hold the output.
outList = []
Exemple #3
0
def _parse_probe_record(line):
    """Split one SAM alignment line into the pieces cleanOutput needs.

    The first column (read name) is expected to look like
    ``chrom:start-stop``.

    Returns a tuple ``(fields, chromField, chrom, start, stop, seq)`` where
    ``fields`` is the full tab-split line, ``chromField`` is the third column
    and ``seq`` is the tenth column.  Raises IndexError on lines that do not
    have that layout.
    """
    fields = line.split('\t')
    name = fields[0]
    chrom = name.split(':')[0]
    start = name.split(':')[1].split('-')[0]
    stop = name.split('-')[1].strip(' ')
    return fields, fields[2], chrom, start, stop, fields[9]


def cleanOutput(inputFile, uniqueVal, zeroVal, probVal, tempVal, sal, form,
                reportVal, debugVal, metaVal, outNameVal, startTime):
    """Filter candidate probes in a .sam file and write the survivors to .bed.

    Three filtering modes are supported:

    * unique mode (``uniqueVal is True``): keep probes whose line is aligned
      (third column does not start with ``*``) and contains no ``XS``.
    * zero mode (``zeroVal is True``): keep probes that are unaligned.
    * LDA mode (neither flag set): keep uniquely aligned probes, and pass
      the remaining aligned probes through a temperature-specific
      LinearDiscriminantAnalysis model, keeping those whose predicted
      probability is below ``probVal``.

    Args:
        inputFile: path of the .sam file to read.
        uniqueVal: True to run unique mode.
        zeroVal: True to run zero mode.
        probVal: probability threshold used in LDA mode.
        tempVal: model temperature for LDA mode; must be one of
            32, 37, 42, 47, 52, 57 (otherwise ValueError is raised).
        sal: passed through to ``probeTm``.
        form: passed through to ``probeTm``.
        reportVal: True to write ``<outName>_outputClean_log.txt``.
        debugVal: True to print per-candidate messages.
        metaVal: True to write ``<outName>_outputClean_meta.txt``.
        outNameVal: output stem, or None to use ``<input stem>_probes``.
        startTime: ``timeit.default_timer()`` reading taken by the caller;
            used to compute the elapsed time in the meta file.

    Returns:
        None.  Results are written to ``<outName>.bed`` and a summary line
        is printed.
    """
    # Determine the stem of the input filename.
    # NOTE(review): this cuts at the FIRST '.', so a path such as
    # './x.sam' yields an empty stem; kept as-is so output names do not
    # change for existing callers.
    fileName = str(inputFile).split('.')[0]

    # Open input file for reading.
    with open(inputFile, 'r') as f:
        file_read = [line.strip() for line in f]

    # Alignment records only: header lines start with '@', and blank lines
    # (e.g. a trailing newline) carry no record.
    records = [line for line in file_read
               if line and not line.startswith('@')]

    # Determine how many unique candidates (by start coordinate) are in the
    # .sam file.
    candsSet = {line.split('\t')[0].split(':')[1].split('-')[0]
                for line in records}

    # Make a list to hold the output.
    outList = []

    # Containers for report information.  `rejected` holds start
    # coordinates already reported as rejected so each is reported once.
    reportList = []
    rejected = set()
    track_rejects = reportVal or debugVal is True

    def note(message):
        """Record a per-candidate message in the report and/or print it."""
        if reportVal is True:
            reportList.append(message)
        if debugVal is True:
            print(message)

    if uniqueVal or zeroVal is True:
        # Process .sam file, keeping probes with only 0 or 1 unique alignment.
        for line in records:
            fields, chromField, chrom, start, stop, seq = \
                _parse_probe_record(line)
            Tm = probeTm(seq, sal, form)
            where = (chrom, start, stop)
            bed_line = '%s\t%s\t%s\t%s\t%s' % (chrom, start, stop, seq, Tm)
            unaligned = chromField.startswith('*')
            # NOTE(review): substring test over the whole line, so an 'XS'
            # occurring outside the optional tags would also match.
            has_xs = 'XS' in line

            # For unique mode.
            if uniqueVal is True:
                if not unaligned and not has_xs:
                    outList.append(bed_line)
                    note('Candidate probe at %s:%s-%s aligned 1 time, '
                         'added to output' % where)
                elif track_rejects and start not in rejected:
                    rejected.add(start)
                    if unaligned:
                        note('Candidate probe at %s:%s-%s aligned 0 times, '
                             'was not added to output' % where)
                    else:
                        note('Candidate probe at %s:%s-%s aligned >1 time, '
                             'was not added to output' % where)

            # For zero mode.
            elif zeroVal is True:
                if unaligned:
                    outList.append(bed_line)
                    note('Candidate probe at %s:%s-%s aligned 0 times, '
                         'added to output (Zero mode active)' % where)
                elif track_rejects and start not in rejected:
                    rejected.add(start)
                    note('Candidate probe at %s:%s-%s aligned >0 times, '
                         'was not added to output (Zero mode active)' % where)

    # Else use LDA model.
    else:
        # Import scikit-learn LDA module.
        # Note the module name changed between sklearn versions 0.16 and 0.17
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

        # Import numpy module.
        import numpy as np

        # LDA model information.
        tempList = [32, 37, 42, 47, 52, 57]
        coefList = [[[-0.14494789, 0.18791679, 0.02588474]],
                    [[-0.13364364, 0.22510179, 0.05494031]],
                    [[-0.09006122, 0.25660706, 0.1078303]],
                    [[-0.01593182, 0.24498485, 0.15753649]],
                    [[0.01860365, 0.1750174, 0.17003374]],
                    [[0.03236755, 0.11624593, 0.24306498]]]
        interList = [
            -1.17545204, -5.40436344, -12.45549846, -19.32670233, -20.11992898,
            -23.98652919
        ]
        classList = [-1, 1]

        # Determine which model row to use.  Done before the file is
        # processed so an unsupported temperature fails immediately.
        clfT = tempList.index(tempVal)

        # Build the classifier once from the stored temperature-specific
        # parameters instead of training it.
        clf = LinearDiscriminantAnalysis()
        clf.coef_ = np.asarray(coefList)[clfT]
        clf.intercept_ = np.asarray(interList)[clfT]
        clf.classes_ = np.asarray(classList)

        # Data about candidates that need classification.  testList[i] is
        # the feature row for candsInfo[i].
        testList = []
        testSet = set()
        candsInfo = []

        # Process .sam file and extract information about each candidate probe.
        for line in records:
            fields, chromField, chrom, start, stop, seq = \
                _parse_probe_record(line)
            Tm = probeTm(seq, sal, form)
            where = (chrom, start, stop)
            bed_line = '%s\t%s\t%s\t%s\t%s' % (chrom, start, stop, seq, Tm)
            unaligned = chromField.startswith('*')

            # First look for candidate probes with only one unique alignment.
            if not unaligned and 'XS' not in line:
                outList.append(bed_line)
                note('Candidate probe at %s:%s-%s aligned 1 time, '
                     'added to output' % where)

            # Populate lists that will be used to make the classification
            # model input (one row per start coordinate).
            elif not unaligned and start not in testSet:
                # NOTE(review): reads the numeric value of the 13th column;
                # presumably the aligner's XS:i tag -- confirm.
                testList.append([
                    float(len(seq)),
                    float(fields[12].split(':')[2]),
                    GC(seq)
                ])
                testSet.add(start)
                candsInfo.append((chrom, start, stop, bed_line))

            # Report info on rejected (unaligned) candidates if desired.
            elif track_rejects and unaligned and start not in rejected:
                rejected.add(start)
                note('Candidate probe at %s:%s-%s aligned 0 times, '
                     'was not added to output' % where)

        # Make ndarray for input into classifier.
        testArray = np.asarray(testList)

        # Use model to predict the probability that candidate probes will
        # have thermodynamically relevant off-target binding sites, unless
        # there is nothing to classify.  The check is `> 0` (not `> 1`) so a
        # single pending candidate is classified rather than silently
        # dropped.
        if len(testArray) > 0:
            probs = clf.predict_proba(testArray)[:, 1]

            # Filter tested candidates using the user-specified probability
            # threshold.
            for (chrom, start, stop, bed_line), prob in zip(candsInfo, probs):
                details = (chrom, start, stop, prob, probVal)
                if float(prob) < probVal:
                    outList.append(bed_line)
                    note('Candidate probe at %s:%s-%s added to output with '
                         '%0.4f < %0.4f probability of having off-target '
                         'sites' % details)
                else:
                    note('Candidate probe at %s:%s-%s filtered with '
                         '%0.4f => %0.4f probability of having off-target '
                         'sites' % details)

        # Sort output list by start coordinate.
        outList.sort(key=lambda x: int(x.split('\t')[1]))

    # Determine the name of the output file.
    if outNameVal is None:
        outName = '%s_probes' % fileName
    else:
        outName = outNameVal

    # Create and write the output file.
    with open('%s.bed' % outName, 'w') as output:
        output.write('\n'.join(outList))

    # Summary numbers.  Guard the percentage so an input without any
    # alignment records does not raise ZeroDivisionError.
    candsNum = len(candsSet)
    cleanNum = len(outList)
    if candsNum:
        cleanPct = float(cleanNum) / float(candsNum) * 100
    else:
        cleanPct = 0.0

    # Print info about the results to terminal.
    if zeroVal is True:
        print('outputClean identified %d of %d / %0.4f%% candidate probes as '
              'having zero alignments' % (cleanNum, candsNum, cleanPct))
    elif uniqueVal is True:
        print('outputClean identified %d of %d / %0.4f%% candidate probes as '
              'unique' % (cleanNum, candsNum, cleanPct))
    else:
        print('outputClean passed %d of %d / %0.4f%% candidate probes through '
              'specificity filtering using the %dC LDA model'
              % (cleanNum, candsNum, cleanPct, tempVal))

    # Write meta information to a .txt file if desired.
    if metaVal is True:
        with open('%s_outputClean_meta.txt' % outName, 'w') as metaText:
            metaText.write('%s\t%f\t%s\t%d\t%d'
                           % (inputFile,
                              timeit.default_timer() - startTime,
                              Version, cleanNum, candsNum))

    # If desired, create report file.
    if reportVal is True:
        # Order the per-candidate messages by the start coordinate embedded
        # in their 'chrom:start-stop' text, then put the header on top.
        reportList.sort(key=lambda x: int(x.split(':')[1].split('-')[0]))
        if uniqueVal is True:
            summary = ('outputClean returned %d of %d / %0.4f%% '
                       'candidate probes as having exactly 1 '
                       'alignment' % (cleanNum, candsNum, cleanPct))
        elif zeroVal is True:
            summary = ('outputClean returned %d of %d / %0.4f%% '
                       'candidate probes as having 0 alignments (Zero '
                       'mode active)' % (cleanNum, candsNum, cleanPct))
        else:
            summary = ('outputClean passed %d of %d / %0.4f%% '
                       'candidate probes through specificity filtering '
                       'using the %dC LDA model'
                       % (cleanNum, candsNum, cleanPct, tempVal))
        header = [
            'Results produced by %s %s' % (scriptName, Version),
            '-' * 100,
            summary,
            '-' * 100,
        ]
        with open('%s_outputClean_log.txt' % outName, 'w') as reportOut:
            reportOut.write('\n'.join(header + reportList))