Example #1
0
def make_ident_persid(output,
                      logDir,
                      topMatch,
                      highResMs2,
                      spec_dict,
                      meanFile,
                      pepDBperSidList,
                      output_dir,
                      writeDelSequences=False,
                      writeAllDelSequences=False):
    """Driver function: write the identification file spectrum by spectrum.

    Reads the tab-delimited lookup file ``pepDBperSidList`` (columns ``sid``
    and ``numPeps``) and, for every row, passes the per-spectrum files
    ``<logDir>/vitVals-sid<sid>.txt`` and ``<output_dir>/sid<sid>-pepDB.txt``
    to ``parse_segments_persid`` together with the open output stream.

    Args:
        output: path of the tab-delimited identification file to create.
        logDir: directory holding the ``vitVals-sid<sid>.txt`` files.
        topMatch: forwarded unchanged to ``parse_segments_persid``.
        highResMs2: when true, means are reloaded for every spectrum from
            ``meanFile[:-4] + '-sid<sid>.txt'``; otherwise ``meanFile`` is
            loaded once and reused.
        spec_dict: mapping from integer spectrum id to its spectrum object.
        meanFile: path of the means file (base name in the high-res case).
        pepDBperSidList: path of the sid/numPeps lookup file.
        output_dir: directory holding the ``sid<sid>-pepDB.txt`` files.
        writeDelSequences: when true, a second file whose name is ``output``
            with ``_deletionSequences`` inserted before its last four
            characters is opened and forwarded; if it cannot be opened the
            option is switched off and processing continues.
        writeAllDelSequences: forwarded unchanged to
            ``parse_segments_persid``.

    Exits the interpreter with status -1 when ``output`` cannot be opened.
    """

    # means are constant for low-res
    if not highResMs2:
        dripMeans = load_drip_means(meanFile)

    try:
        identFile = open(output, "w")
    except IOError:
        print("Could not open file %s for writing, exitting" % output)
        # Nothing can be written without the output stream; stop here rather
        # than failing later with a NameError on identFile.
        exit(-1)

    sid_numPeps_lookup = None
    delSequencesFid = None
    try:
        sid_numPeps_lookup = open(pepDBperSidList, "r")
        reader = csv.DictReader(sid_numPeps_lookup, delimiter='\t')
        identFile.write(
            'Kind\tSid\tFrames\tScore\tPeptide\tObs_Inserts\tTheo_Deletes\tObs_peaks_scored\tTheo_peaks_used\tSum_obs_intensities\tSum_scored_mz_dist\tCharge\n'
        )

        if writeDelSequences:
            delSequencesFile = (output[0:-4] + '_deletionSequences'
                                + output[-4:])
            try:
                delSequencesFid = open(delSequencesFile, "w")
                delSequencesFid.write(
                    'Kind\tSid\tPeptide\tNum_deletes\tCharge\n')
            except IOError:
                # Deletion sequences are optional: report and carry on
                # without them.
                print('Could not open %s for writing' % delSequencesFile)
                writeDelSequences = False
                if delSequencesFid is not None:
                    delSequencesFid.close()
                    delSequencesFid = None

        for row in reader:
            sid = int(row['sid'])
            currSpec = spec_dict[sid]
            numPeps = int(row['numPeps'])
            testOutputFile = '%s/vitVals-sid%d.txt' % (logDir, sid)
            pepDBlist = '%s/sid%d-pepDB.txt' % (output_dir, sid)

            if highResMs2:
                # load this spectrum's collection of means
                mean_file = meanFile[:-4] + '-sid' + str(sid) + '.txt'
                dripMeans = load_drip_means(mean_file)

            parse_segments_persid(testOutputFile, dripMeans, topMatch, sid,
                                  currSpec, numPeps, pepDBlist, identFile,
                                  writeDelSequences, writeAllDelSequences,
                                  delSequencesFid)
    finally:
        # Release every stream that was opened, also when a row fails.
        for fid in (identFile, sid_numPeps_lookup, delSequencesFid):
            if fid is not None:
                fid.close()
def make_ident_persid(output, logDir, 
                      topMatch, highResMs2, 
                      spec_dict, meanFile,
                      pepDBperSidList, output_dir,
                      writeDelSequences = False,
                      writeAllDelSequences = False):
    """Driver function: write the identification file spectrum by spectrum.

    Reads the tab-delimited lookup file ``pepDBperSidList`` (columns ``sid``
    and ``numPeps``) and, for every row, passes the per-spectrum files
    ``<logDir>/vitVals-sid<sid>.txt`` and ``<output_dir>/sid<sid>-pepDB.txt``
    to ``parse_segments_persid`` together with the open output stream.

    Args:
        output: path of the tab-delimited identification file to create.
        logDir: directory holding the ``vitVals-sid<sid>.txt`` files.
        topMatch: forwarded unchanged to ``parse_segments_persid``.
        highResMs2: when true, means are reloaded for every spectrum from
            ``meanFile[:-4] + '-sid<sid>.txt'``; otherwise ``meanFile`` is
            loaded once and reused.
        spec_dict: mapping from integer spectrum id to its spectrum object.
        meanFile: path of the means file (base name in the high-res case).
        pepDBperSidList: path of the sid/numPeps lookup file.
        output_dir: directory holding the ``sid<sid>-pepDB.txt`` files.
        writeDelSequences: when true, a second file whose name is ``output``
            with ``_deletionSequences`` inserted before its last four
            characters is opened and forwarded; if it cannot be opened the
            option is switched off and processing continues.
        writeAllDelSequences: forwarded unchanged to
            ``parse_segments_persid``.

    Exits the interpreter with status -1 when ``output`` cannot be opened.
    """

    # means are constant for low-res
    if not highResMs2:
        dripMeans = load_drip_means(meanFile)

    try:
        identFile = open(output, "w")
    except IOError:
        print("Could not open file %s for writing, exitting" % output)
        # Without the output stream nothing can be written; stop instead of
        # hitting a NameError on identFile further down.
        exit(-1)

    sid_numPeps_lookup = None
    delSequencesFid = None
    try:
        sid_numPeps_lookup = open(pepDBperSidList, "r")
        reader = csv.DictReader(sid_numPeps_lookup, delimiter = '\t')
        identFile.write('Kind\tSid\tFrames\tScore\tPeptide\tObs_Inserts\tTheo_Deletes\tObs_peaks_scored\tTheo_peaks_used\tSum_obs_intensities\tSum_scored_mz_dist\tCharge\n')

        if writeDelSequences:
            delSequencesFile = output[0:-4]+'_deletionSequences'+output[-4:]
            try:
                delSequencesFid = open(delSequencesFile, "w")
                delSequencesFid.write('Kind\tSid\tPeptide\tNum_deletes\tCharge\n')
            except IOError:
                # Deletion sequences are optional: report and continue
                # without them.
                print('Could not open %s for writing' % delSequencesFile)
                writeDelSequences = False
                if delSequencesFid is not None:
                    delSequencesFid.close()
                    delSequencesFid = None

        for row in reader:
            sid = int(row['sid'])
            currSpec = spec_dict[sid]
            numPeps = int(row['numPeps'])
            testOutputFile = '%s/vitVals-sid%d.txt' % (logDir, sid)
            pepDBlist = '%s/sid%d-pepDB.txt' % (output_dir, sid)

            if highResMs2:
                # load this spectrum's collection of means
                mean_file = meanFile[:-4] + '-sid' + str(sid) + '.txt'
                dripMeans = load_drip_means(mean_file)

            parse_segments_persid(testOutputFile, dripMeans, 
                                  topMatch, sid, 
                                  currSpec, numPeps, 
                                  pepDBlist, identFile,
                                  writeDelSequences, writeAllDelSequences, 
                                  delSequencesFid)
    finally:
        # Release every stream that was opened, also when a row fails.
        for fid in (identFile, sid_numPeps_lookup, delSequencesFid):
            if fid is not None:
                fid.close()
Example #3
0
def write_output(targets, decoys, filename, meanFile, spec_dict):
    """ Write PSMs and features to file

    Args:
        targets: mapping keyed by (sid, charge) whose values are iterables
            of target PSMs.
        decoys: mapping keyed by (sid, charge) whose values are iterables of
            decoy PSMs; keys missing from it are simply skipped.
        filename: path of the tab-delimited file to create.
        meanFile: path handed to ``load_drip_means``.
        spec_dict: mapping from spectrum id to its spectrum object.

    For every (sid, charge) key of ``targets`` the target PSMs are written
    first, followed by the decoy PSMs stored under the same key.  Exits the
    interpreter with status -1 when ``filename`` cannot be opened.
    """

    dripMeans = load_drip_means(meanFile)

    try:
        identFid = open(filename, "w")
    except IOError:
        # The message used to format an undefined name; report the path that
        # actually failed and stop, since nothing can be written.
        print("Could not open file %s for writing, exitting" % filename)
        exit(-1)

    try:
        identFid.write('Kind\tSid\tFrames\tScore\tPeptide\tObs_Inserts\tTheo_Deletes\tObs_peaks_scored\tTheo_peaks_used\tSum_obs_intensities\tSum_scored_mz_dist\tCharge\n')

        for sid, charge in targets:
            s = spec_dict[sid]
            for match in targets[sid, charge]:
                write_psm_ins_dels(match, s, dripMeans, identFid)
            if (sid, charge) in decoys:
                for match in decoys[sid, charge]:
                    write_psm_ins_dels(match, s, dripMeans, identFid)
    finally:
        # Close the stream even when a PSM fails to serialize.
        identFid.close()
Example #4
0
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '', varModSequence = '',
        precursor_filter = False, 
        high_res_gauss_dist = 0.05):
    """ Decode a single peptide-spectrum match with gmtkViterbi and return it.

        Inputs:
               p = peptide string
               s0 = observed spectrum, instance of class MS2Spectrum; its mz
                    and intensity attributes are overwritten by this call
               c = psm charge
               highResMs2 = when true, means are built from the peptide's
                            filtered theoretical ions instead of being loaded
                            from dripLearnedMeans
               dripLearnedMeans = path handed to load_drip_means (low-res)
               dripLearnedCovars = path handed to write_covar_file
               mods = static mods
               ntermMods = static nterm-mods
               ctermMods = static cterm-mods
               varModSequence = string saying which amino acids carry
                                variable mods; must be non-empty when any
                                variable mod is specified
               precursor_filter = selects the 'top300TightSequest' pipeline
                                  instead of 'top300Sequest'
               high_res_gauss_dist = forwarded to the high-res helpers

        Side effects: creates the directories 'encode', 'encode/obs' and
        'log' under the current working directory, writes iterable.dts and
        pepDB.txt into 'encode', and runs gmtkDTindex and gmtkViterbi.
        Exits with status -1 when a structure/master/covariance file cannot
        be created.

        Returns the first target PSM parsed for (spectrum id, c).
    """

    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter: 
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'

    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced before the zip below, so the
    # "mz in mz_vals" test is always true and the pairing is positional --
    # confirm this is the intended way to keep the original intensities.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)    
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                                  if mz in mz_vals]
    num_psms = 1

    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    obs_dir = 'obs' # sub directory of output_dir
    pfile_dir = os.path.join(output_dir, obs_dir)
    log_dir = os.path.abspath('log')
    # parent first, so that the sub directory can be created
    for directory in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            # pass the parsed variable mods (the names used here before were
            # never defined in this function)
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c, 
                                                      mods, ntermMods, ctermMods, 
                                                      varMods, ntermVarMods, ctermVarMods, 
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of ions before filtering, recorded in pepDB.txt
        numBY = len(bNy)
        filter_theoretical_peaks_lowres(bNy, 
                                        dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c, 
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        numBY = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices
    ion_to_index_map = dict((dripMeans[ind], ind) for ind in dripMeans)

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)
    peptide_obs_file = os.path.join(pfile_dir,'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir,'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepDBFile = os.path.join(output_dir, 'pepDB.txt')

    pep_dt = open(dtFile, "w")
    try:
        pep_dt.write('%d\n\n' % (num_psms))

        # write peptide database to parse and identify GMTK segments later
        pepdb_list = open(pepDBFile, "w")
        try:
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

            pep_num = 0
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy, 
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, 
                                                      p, numBY, c))
        finally:
            pepdb_list.close()
    finally:
        # close streams for this spectrum, also when encoding fails
        pep_dt.close()

    # compile dt using gmtkDTIndex
    # NOTE(review): stdo/stde are not defined inside this function;
    # presumably module-level streams -- confirm they exist.
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile], 
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    # (Exception rather than a bare except, so Ctrl-C and SystemExit are not
    # swallowed and misreported)
    try:
        create_drip_structure(highResMs2, args.structure_file, 
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file, 
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file, 
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK
    cppCommand = '\'-DITERABLE_DT=' + dtFile \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file \
        + '\''

    # call gmtkViterbi
    vitStr0 = "gmtkViterbi -strFile " + args.structure_file \
        + " -triFile " + args.structure_file + ".trifile -ni1 0 -nf1 2 -ni2 1 -nf2 0" \
        + " -fdiffact2 rl" \
        + " -inputMasterFile " + args.master_file + " -inputTrainableParameters trained.params -failOnZeroClique F"
    # gmtkViterbi command line
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vitStr = vitStr0 + ' -vitValsFile ' +  vitValsFile \
        + ' -of1 ' + spectrum_obs_file \
        + ' -fmt1 flatascii ' \
        + ' -of2 ' + peptide_obs_file \
        + ' -fmt2 flatascii ' \
        + ' -cppCommand ' + cppCommand
    call(shlex.split(vitStr), stdout = stdo, stderr = stde)

    # parse output; only the target PSMs are used here
    targets, _ = ppsm.parse_dripExtract(vitValsFile, pepDBFile)

    t = targets[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2, 
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
Example #5
0
def plot_psms(psmFile, spectrumFile, plotList = 'currPsms.html',
              highResMs2 = False,
              dripLearnedMeans = 'dripLearned.means',
              dripLearnedCovars = 'dripLearned.covars',
              mods = '', ntermMods = '', ctermMods = '',
              precursor_filter = False, 
              high_res_gauss_dist = 0.05):
    """Decode the PSMs of a file with dripExtract and plot each of them.

    Args:
        psmFile: PSM file handed to ``dripExtractParams`` and, when variable
            mods are specified, to ``psm_var_mods``.
        spectrumFile: spectrum file handed to ``dripExtractParams`` and
            ``load_spectra_minMaxMz``.
        plotList: path of the HTML file that receives one link per plot.
        highResMs2: when true, the means are the sorted union of the filtered
            theoretical ions of all decoded PSMs; otherwise they are loaded
            from ``dripLearnedMeans``.
        dripLearnedMeans: learned means file.
        dripLearnedCovars: learned covariances file.
        mods, ntermMods, ctermMods: modification strings parsed with
            ``parse_var_mods``.
        precursor_filter: selects the 'top300TightSequest' normalization
            instead of 'top300Sequest'.
        high_res_gauss_dist: stored on the dripExtract arguments and passed
            to ``filter_theoretical_peaks``.

    Side effects: writes GMTK output to the file 'gmtk_err', one PNG per
    target/decoy PSM (via ``plot_drip_viterbi``) and the HTML index
    ``plotList``.  PSMs are listed by decreasing score.
    """
    # initialize arguments for dripExtract
    args = dripExtractParams(psmFile, spectrumFile, 'all', 
                             mods, ntermMods, ctermMods, 
                             highResMs2, 
                             dripLearnedMeans, dripLearnedCovars)

    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    # NOTE(review): precursor_filter on args is forced to False while the
    # parameter still picks the normalization below -- confirm intended.
    args.precursor_filter = False
    args.high_res_gauss_dist = high_res_gauss_dist
    if precursor_filter: 
        args.normalize = 'top300TightSequest'
    else:
        args.normalize = 'top300Sequest'

    # decode DRIP PSMs; stdout and stderr of the run share one log file,
    # which is closed as soon as decoding is over
    stde = open('gmtk_err', "w")
    stdo = stde
    try:
        t, d, spectra0 = runDripExtract(args, stdo, stde)
    finally:
        stde.close()
    
    # if variable mods, get variable mod string per PSM
    if hasVarMods:
        varModDict = psm_var_mods(psmFile)
        # the '%' was missing here, which made the message itself raise
        assert varModDict, "Variable mods specified in enzyme options, but strings denoting variables mods per peptide are not specified in %s, exitting" % (psmFile)
    spectra, minMz, maxMz, validCharges = load_spectra_minMaxMz(spectrumFile)

    # get original intensity values to plot
    # NOTE(review): mz is replaced before the zip below, so the
    # "mz in mz_vals" test is always true and the pairing is positional --
    # confirm this is the intended way to keep the original intensities.
    for sid in spectra0:
        spectra[sid].mz = list(spectra0[sid].mz)
        mz_vals = set(spectra0[sid].mz)
        z = max(spectra0[sid].intensity)
        spectra[sid].intensity = [i/z for mz, i in zip(spectra[sid].mz, spectra[sid].intensity)
                                  if mz in mz_vals]

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
    else:
        # collect the theoretical ions of every target, then every decoy
        dripMeansSet = set()
        for psms in (t, d):
            for sid, c in psms:
                for p in psms[sid,c]:
                    pep = p.peptide
                    if hasVarMods:
                        varModSequence = varModDict[sid, p.peptide]
                        bNy = interleave_b_y_ions_var_mods(Peptide(pep), c,
                                                           mods, ntermMods, ctermMods,
                                                           varMods, ntermVarMods, ctermVarMods,
                                                           varModSequence)
                    else:
                        bNy = interleave_b_y_ions(Peptide(pep), c, 
                                                  mods, ntermMods, ctermMods)
                    filter_theoretical_peaks(bNy, minMz, maxMz, high_res_gauss_dist)
                    dripMeansSet |= set(bNy)
        dripMeans = dict(enumerate(sorted(dripMeansSet)))

    # reverse mapping, from ions to indices
    ion_to_index_map = dict((dripMeans[ind], ind) for ind in dripMeans)

    all_psms = []
    varModSequence = ''
    for psms in (t, d):
        for sid, c in psms:
            s = spectra[sid]
            for p in psms[sid,c]:
                p.add_obs_spectrum(s)
                p.calculate_drip_features(dripMeans)
                if hasVarMods:
                    varModSequence = varModDict[sid, p.peptide]
                p.calc_by_sets(c,
                               mods, ntermMods, ctermMods,
                               highResMs2, 
                               ion_to_index_map,
                               varMods, ntermVarMods, ctermVarMods,
                               varModSequence)
                # collect every PSM; appending after the loop kept only the
                # last one per (sid, charge) and could reuse a stale PSM
                # when the list was empty
                all_psms.append(p)

    all_psms.sort(key = lambda r: r.score, reverse = True)

    fid = open(plotList, "w")
    try:
        for p in all_psms:
            if p.kind == 't':
                kind = 'target'
            elif p.kind == 'd':
                kind = 'decoy'
            else:
                # neither target nor decoy: nothing to plot
                continue

            plotName = kind + 'Scan' + str(p.scan) + \
                'Charge' + str(p.charge) + \
                p.peptide + '.png'

            p.plot_drip_viterbi(plotName)
            fid.write("<a href=\"%s\">%s Scan %d Charge %d %s</a><br>\n" %
                      (plotName, kind, p.scan, p.charge, p.peptide))
    finally:
        # keep the index file consistent even if a plot fails
        fid.close()
Example #6
0
def make_drip_data_lowres(args, spectra, stdo, stde):
    """Encode all target and decoy PSMs of a PSM file for low-res DRIP.

    Writes, under ``args.output_dir``, the peptide database 'pepDB.txt',
    the decision-tree file 'iterable.dts' (compiled afterwards with
    gmtkDTindex) and the 'pep-lengths.pfile' / 'spectrum.pfile' observation
    files in the ``args.obs_dir`` sub directory.  (sid, charge) pairs are
    processed in shuffled order; for each pair the targets are encoded
    before the decoys.

    Args:
        args: options object; the attributes read here are mods_spec,
            nterm_peptide_mods_spec, cterm_peptide_mods_spec, learned_means,
            append_to_pin, psm_file, output_dir, obs_dir, normalize,
            filt_theo_peaks, per_spectrum_mz_bound, mz_lb, mz_ub and
            max_obs_mass.
        spectra: mapping from spectrum id to spectrum object; spectra are
            preprocessed in place the first time they are used.
        stdo, stde: streams receiving stdout/stderr of gmtkDTindex.

    Returns:
        (spec_dict, pep_num): the spectra actually used, keyed by spectrum
        id, and the number of PSMs encoded.

    Exits with status -1 when variable mods are specified but the first
    target PSM has no 'Var_mod_seq' field.
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    varModKey = "Var_mod_seq"

    # load means
    dripMeans = load_drip_means(args.learned_means)
    # make master file
    make_master_parameters_lowres(args, dripMeans)

    if not args.append_to_pin:
        target,decoy,num_psms = load_psms(args.psm_file)
    else:
        target,decoy,num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and 
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files
    if hasVarMods:
        # Only the first PSM of the first target key is inspected.  The
        # probe is skipped when there is no target PSM to look at, instead
        # of crashing on an empty collection.
        firstTarget = None
        for key in target:
            for candidate in target[key]:
                firstTarget = candidate
                break
            break
        if firstTarget is not None and varModKey not in firstTarget.other:
            print("Variable modifications enzyme options specified,")
            print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
            print("Exitting")
            exit(-1)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))

    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    # 'filter0' builds no pipeline, so spectra are then used as they are
    # (previously the missing pipeline was still called -> NameError)
    preprocess = None
    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)

    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)
            
    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2,0)

    spec_dict = {}
    pep_num = 0

    # write peptide database to parse and identify GMTK segments later
    pepdb_list = open(os.path.join(args.output_dir, 'pepDB.txt'), "w")
    try:
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

        pep_dt = open(os.path.join(args.output_dir, 'iterable.dts'), "w")
        try:
            pep_dt.write('%d\n\n' % (num_psms))

            for sid, charge in sid_charges:
                if sid not in spec_dict:
                    s = spectra[sid]
                    if preprocess is not None:
                        preprocess(s)
                    spec_dict[sid] = s
                else:
                    s = spec_dict[sid]

                if args.filt_theo_peaks:
                    if args.per_spectrum_mz_bound:
                        minMz = s.mz[0]
                        maxMz = s.mz[-1]
                    else:
                        minMz = args.mz_lb
                        maxMz = args.mz_ub

                # targets first, then decoys, exactly as two separate passes
                for kind, psms, isTarget in (('t', target, True),
                                             ('d', decoy, False)):
                    if (sid,charge) not in psms:
                        continue
                    for match in psms[sid,charge]:
                        pep = match.peptide
                        if hasVarMods:
                            varModSequence = match.other[varModKey]
                            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(pep), charge, 
                                                                      mods, ntermMods, ctermMods,
                                                                      varMods, ntermVarMods, ctermVarMods,
                                                                      varModSequence)
                        else:
                            bNy = interleave_b_y_ions_lowres(Peptide(pep), charge, 
                                                             mods, ntermMods, ctermMods)
                        pepdb_list.write("%s\t%d\t%s\t%d\t%d\n" % (kind, sid, pep, len(bNy), charge))
                        # numBY for DRIP features assumes all b-/y-ions, not just those
                        # unfiltered per spectrum
                        if args.filt_theo_peaks:
                            filter_theoretical_peaks_lowres(bNy, dripMeans,
                                                            minMz, maxMz)
                        drip_peptide_sentence(pep_dt, pep, bNy, 
                                              pep_num, s.spectrum_id, args.max_obs_mass,
                                              peptide_pfile, isTarget, len(bNy)-1)
                        drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                        pep_num += 1
        finally:
            pep_dt.close()
    finally:
        # close streams, also when encoding a PSM fails
        pepdb_list.close()

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', 
          os.path.join(args.output_dir,'iterable.dts')], 
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num
Example #7
0
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '', varModSequence = '',
        precursor_filter = False, 
        high_res_gauss_dist = 0.05):
    """ Decode a single peptide-spectrum match with gmtkViterbi and return it.

        Inputs:
               p = peptide string
               s0 = observed spectrum, instance of class MS2Spectrum; its mz
                    and intensity attributes are overwritten by this call
               c = psm charge
               highResMs2 = when true, means are built from the peptide's
                            filtered theoretical ions instead of being loaded
                            from dripLearnedMeans
               dripLearnedMeans = path handed to load_drip_means (low-res)
               dripLearnedCovars = path handed to write_covar_file
               mods = static mods
               ntermMods = static nterm-mods
               ctermMods = static cterm-mods
               varModSequence = string saying which amino acids carry
                                variable mods; must be non-empty when any
                                variable mod is specified
               precursor_filter = selects the 'top300TightSequest' pipeline
                                  instead of 'top300Sequest'
               high_res_gauss_dist = forwarded to the high-res helpers

        Side effects: creates the directories 'encode', 'encode/obs' and
        'log' under the current working directory, writes iterable.dts and
        pepDB.txt into 'encode', and runs gmtkDTindex and gmtkViterbi.
        Exits with status -1 when a structure/master/covariance file cannot
        be created.

        Returns the first target PSM parsed for (spectrum id, c).
    """

    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter: 
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'

    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced before the zip below, so the
    # "mz in mz_vals" test is always true and the pairing is positional --
    # confirm this is the intended way to keep the original intensities.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)    
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                                  if mz in mz_vals]
    num_psms = 1

    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    obs_dir = 'obs' # sub directory of output_dir
    pfile_dir = os.path.join(output_dir, obs_dir)
    log_dir = os.path.abspath('log')
    # parent first, so that the sub directory can be created
    for directory in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            # pass the parsed variable mods (the names used here before were
            # never defined in this function)
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c, 
                                                      mods, ntermMods, ctermMods, 
                                                      varMods, ntermVarMods, ctermVarMods, 
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of ions before filtering, recorded in pepDB.txt
        numBY = len(bNy)
        filter_theoretical_peaks_lowres(bNy, 
                                        dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c, 
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        numBY = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices
    ion_to_index_map = dict((dripMeans[ind], ind) for ind in dripMeans)

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)
    peptide_obs_file = os.path.join(pfile_dir,'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir,'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepDBFile = os.path.join(output_dir, 'pepDB.txt')

    pep_dt = open(dtFile, "w")
    try:
        pep_dt.write('%d\n\n' % (num_psms))

        # write peptide database to parse and identify GMTK segments later
        pepdb_list = open(pepDBFile, "w")
        try:
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

            pep_num = 0
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy, 
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, 
                                                      p, numBY, c))
        finally:
            pepdb_list.close()
    finally:
        # close streams for this spectrum, also when encoding fails
        pep_dt.close()

    # compile dt using gmtkDTIndex
    # NOTE(review): stdo/stde are not defined inside this function;
    # presumably module-level streams -- confirm they exist.
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile], 
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    # (Exception rather than a bare except, so Ctrl-C and SystemExit are not
    # swallowed and misreported)
    try:
        create_drip_structure(highResMs2, args.structure_file, 
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file, 
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file, 
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK
    cppCommand = '\'-DITERABLE_DT=' + dtFile \
        + ' -DMAX_FRAGMENT_MASS=' + str(max_obs_mass) \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file \
        + '\''

    # call gmtkViterbi
    vitStr0 = "gmtkViterbi -strFile " + args.structure_file \
        + " -triFile " + args.structure_file + ".trifile -ni1 0 -nf1 2 -ni2 1 -nf2 0" \
        + " -fdiffact2 rl" \
        + " -inputMasterFile " + args.master_file + " -inputTrainableParameters trained.params -failOnZeroClique F"
    # gmtkViterbi command line
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vitStr = vitStr0 + ' -vitValsFile ' +  vitValsFile \
        + ' -of1 ' + spectrum_obs_file \
        + ' -fmt1 flatascii ' \
        + ' -of2 ' + peptide_obs_file \
        + ' -fmt2 flatascii ' \
        + ' -cppCommand ' + cppCommand
    call(shlex.split(vitStr), stdout = stdo, stderr = stde)

    # parse output; only the target PSMs are used here
    targets, _ = ppsm.parse_dripExtract(vitValsFile, pepDBFile)

    t = targets[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2, 
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
Example #8
0
def plot_psms(psmFile, spectrumFile, plotList = 'currPsms.html',
              highResMs2 = False,
              dripLearnedMeans = 'dripLearned.means',
              dripLearnedCovars = 'dripLearned.covars',
              mods = '', ntermMods = '', ctermMods = '',
              precursor_filter = False, 
              high_res_gauss_dist = 0.05):
    """Decode the PSMs in psmFile with DRIP, plot each one, write an HTML index.

    Runs runDripExtract on the given PSM/spectrum files (its stdout and stderr
    are both redirected to the file 'gmtk_err' in the current directory),
    attaches the observed spectrum and DRIP features to every returned PSM,
    renders one PNG per PSM via plot_drip_viterbi, and writes one hyperlink
    per plot to plotList, ordered by decreasing PSM score.

    Arguments:
      psmFile -- PSM file handed to dripExtractParams; also read by
          psm_var_mods when any variable modification is specified.
      spectrumFile -- spectrum file handed to dripExtractParams and to
          load_spectra_minMaxMz.
      plotList -- path of the HTML file of links that is written.
      highResMs2 -- if false, the DRIP means are loaded from
          dripLearnedMeans; otherwise they are built from the sorted set of
          filtered theoretical b/y ions of every decoded peptide.
      dripLearnedMeans, dripLearnedCovars -- learned parameter files handed
          to dripExtractParams.
      mods, ntermMods, ctermMods -- modification strings, parsed with
          parse_var_mods into static and variable parts.
      precursor_filter -- selects args.normalize: 'top300TightSequest' when
          true, 'top300Sequest' otherwise. Note that args.precursor_filter
          itself is always set to False.
      high_res_gauss_dist -- stored on args and passed to
          filter_theoretical_peaks in the high-resolution branch.

    Returns None.

    Raises AssertionError when variable mods are specified but psm_var_mods
    returns nothing for psmFile.
    """
    # initialize arguments for dripExtract
    args = dripExtractParams(psmFile, spectrumFile, 'all',
                             mods, ntermMods, ctermMods,
                             highResMs2,
                             dripLearnedMeans, dripLearnedCovars)

    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    # evaluated once; every later branch on variable mods uses this flag
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    args.precursor_filter = False
    args.high_res_gauss_dist = high_res_gauss_dist
    if precursor_filter:
        args.normalize = 'top300TightSequest'
    else:
        args.normalize = 'top300Sequest'

    # decode DRIP PSMs; the same handle receives both stdout and stderr, and
    # the context manager guarantees it is closed even if decoding fails
    with open('gmtk_err', "w") as stde:
        t, d, spectra0 = runDripExtract(args, stde, stde)

    # if variable mods, get variable mod string per PSM
    varModDict = {}
    if hasVarMods:
        varModDict = psm_var_mods(psmFile)
        # the '%' operator was missing here, so a failing assertion raised
        # TypeError ('str' object is not callable) instead of this message
        assert varModDict, "Variable mods specified in enzyme options, but strings denoting variables mods per peptide are not specified in %s, exitting" % (psmFile)
    spectra, minMz, maxMz, validCharges = load_spectra_minMaxMz(spectrumFile)

    # get original intensity values to plot
    for sid in spectra0:
        spectra[sid].mz = list(spectra0[sid].mz)
        mz_vals = set(spectra0[sid].mz)
        z = max(spectra0[sid].intensity)
        # NOTE(review): mz was replaced just above, so the 'mz in mz_vals'
        # filter is always true and zip() simply truncates the reloaded
        # intensities to len(spectra0[sid].mz). Kept as-is because the
        # intended pairing depends on preprocessing not visible here -- confirm.
        spectra[sid].intensity = [i/z for mz, i in zip(spectra[sid].mz, spectra[sid].intensity)
                                  if mz in mz_vals]

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
    else:
        # collect the theoretical peaks of every decoded peptide, first from
        # t and then from d, and index them in increasing order
        dripMeansSet = set()
        for psms in (t, d):
            for sid, c in psms:
                for p in psms[sid, c]:
                    if hasVarMods:
                        bNy = interleave_b_y_ions_var_mods(Peptide(p.peptide), c,
                                                           mods, ntermMods, ctermMods,
                                                           varMods, ntermVarMods, ctermVarMods,
                                                           varModDict[sid, p.peptide])
                    else:
                        bNy = interleave_b_y_ions(Peptide(p.peptide), c,
                                                  mods, ntermMods, ctermMods)
                    # return value is not used; bNy is read back afterwards
                    filter_theoretical_peaks(bNy, minMz, maxMz, high_res_gauss_dist)
                    dripMeansSet.update(bNy)
        dripMeans = dict(enumerate(sorted(dripMeansSet)))

    ion_to_index_map = {} # reverse mapping, from ions to indices
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    all_psms = []
    varModSequence = ''
    for psms in (t, d):
        for sid, c in psms:
            s = spectra[sid]
            for p in psms[sid, c]:
                p.add_obs_spectrum(s)
                p.calculate_drip_features(dripMeans)
                if hasVarMods:
                    varModSequence = varModDict[sid, p.peptide]
                p.calc_by_sets(c,
                               mods, ntermMods, ctermMods,
                               highResMs2,
                               ion_to_index_map,
                               varMods, ntermVarMods, ctermVarMods,
                               varModSequence)
                # collect every PSM; appending after the inner loop kept only
                # the last PSM of each (sid, c) group and reused a stale (or
                # undefined) p when a group was empty
                all_psms.append(p)

    all_psms.sort(key = lambda r: r.score, reverse = True)

    kindNames = {'t': 'target', 'd': 'decoy'}
    with open(plotList, "w") as fid:
        for p in all_psms:
            kind = kindNames.get(p.kind)
            if kind is None:
                # PSMs of any other kind are neither plotted nor listed
                continue

            plotName = kind + 'Scan' + str(p.scan) + \
                'Charge' + str(p.charge) + \
                p.peptide + '.png'

            p.plot_drip_viterbi(plotName)
            fid.write("<a href=\"%s\">%s Scan %d Charge %d %s</a><br>\n" %
                      (plotName, kind, p.scan, p.charge, p.peptide))