Beispiel #1
0
def runDripExtract(args, stdo, stde):
    """ Run drip once per spectrum, collapsing all charge-varying candidates into a single GMTK call

        Steps: load the spectra, write the GMTK observation files, generate the
        structure/master/trifile/covariance files, invoke gmtkViterbi once and
        parse its output.

        Inputs:
               args = options object.  Read here: spectra, charges, high_res_ms2,
                      output_dir, obs_dir, logDir, structure_file, master_file,
                      max_obs_mass, high_res_gauss_dist, covar_file,
                      learned_covars, mean_file, gauss_file, mixture_file,
                      collection_file.  args.charges, args.mz_lb and args.mz_ub
                      are overwritten with the values returned by
                      load_spectra_ret_dict.
               stdo = stream used as stdout of the gmtkViterbi subprocess
                      (also forwarded to the observation-file writers)
               stde = stream used as stderr of the gmtkViterbi subprocess
                      (also forwarded to the observation-file writers)

        Returns:
               (t, d, spec_dict) where t, d are the two values returned by
               psm.parse_dripExtract and spec_dict is the first value returned
               by make_drip_data_highres / make_drip_data_lowres.

        Exits the interpreter with status -1 if any of the generated model
        files cannot be created.
    """
    # Constant part of the gmtkViterbi command line.  It is kept as an argument
    # list (rather than one string that is re-tokenised later) so that paths
    # containing whitespace or quote characters reach GMTK unmodified.
    # NOTE(review): an earlier comment here said frame/segment difference
    # actions are not needed because each PSM corresponds to a specific
    # spectrum (little redundancy to exploit), yet "-fdiffact2 rl" is passed;
    # confirm which is intended.
    vit_base = ["gmtkViterbi",
                "-strFile", args.structure_file,
                "-triFile", args.structure_file + ".trifile",
                "-ni1", "0", "-nf1", "2", "-ni2", "1", "-nf2", "0",
                "-fdiffact2", "rl",
                "-inputMasterFile", args.master_file,
                "-inputTrainableParameters", "trained.params",
                "-failOnZeroClique", "F"]

    # for now, don't worry about checking whether peptide is in valid (i.e., present in the digested
    # set of peptide candidates given the protein database)

    # currently ignore ident file input for spectra filtering
    spectra, minMz, maxMz, validcharges, _ = load_spectra_ret_dict(args.spectra, args.charges)
    # update encountered charges and observed m/z bounds
    args.charges = validcharges
    args.mz_lb = minMz
    args.mz_ub = maxMz

    # create GMTK observation files
    # add in support for cluster usage later; assume standalone with multithreading
    make_drip_data = make_drip_data_highres if args.high_res_ms2 else make_drip_data_lowres
    # the second returned value (number of PSMs) is not needed here
    spec_dict, _ = make_drip_data(args, spectra, stdo, stde)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)

    # create structure and master files then triangulate
    # (Exception rather than a bare except, so Ctrl-C / SystemExit are not
    # reported as a file-creation failure, and the real cause is printed.)
    try:
        create_drip_structure(args.high_res_ms2, args.structure_file,
                              args.max_obs_mass, False, False,
                              args.high_res_gauss_dist)
    except Exception as err:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        create_drip_master(args.high_res_ms2, args.master_file,
                           args.max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception as err:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception as err:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        write_covar_file(args.high_res_ms2, args.covar_file,
                         args.learned_covars, True,
                         args.high_res_gauss_dist)
    except Exception as err:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        print("Reason: %s" % (err,))
        exit(-1)

    # run GMTK
    # preprocessor definitions, handed to gmtkViterbi as ONE argument
    dtFile = os.path.join(args.output_dir, 'iterable.dts')
    cpp_defines = ' '.join(['-DITERABLE_DT=' + dtFile,
                            '-DDRIP_MZ=' + args.mean_file,
                            '-DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file,
                            '-DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file,
                            '-DDRIP_MZ_GAUSSIANS=' + args.collection_file])

    # call gmtkViterbi
    vitValsFile = os.path.join(args.logDir, 'vitVals.txt')
    vit_cmd = vit_base + ['-vitValsFile', vitValsFile,
                          '-of1', os.path.join(pfile_dir, 'spectrum.pfile'),
                          '-of2', os.path.join(pfile_dir, 'pep-lengths.pfile'),
                          '-cppCommand', cpp_defines]
    # NOTE(review): the exit status of gmtkViterbi is not checked; a failed run
    # only surfaces when the Viterbi values file is parsed below.
    call(vit_cmd, stdout = stdo, stderr = stde)

    t,d = psm.parse_dripExtract(vitValsFile, os.path.join(args.output_dir, 'pepDB.txt'))

    return t,d, spec_dict
Beispiel #2
0
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '', varModSequence = '',
        precursor_filter = False, 
        high_res_gauss_dist = 0.05):
    """ Run DRIP (gmtkViterbi) on a single peptide-spectrum match and return the decoded PSM.

        Inputs:
               p = peptide string
               s0 = observed spectrum, instance of class MS2Spectrum.  It is
                    modified in place: s0.mz and s0.intensity are overwritten
                    with the values kept for plotting.
               c = psm charge
               highResMs2 = True to use the high-resolution ion/means code path
               dripLearnedMeans = file passed to load_drip_means (low-res only)
               dripLearnedCovars = file passed to write_covar_file
               mods = static mods
               ntermMods = static nterm-mods
               ctermMods = static cterm-mods
               varModSequence = string denoting which amino acids are var mods;
                                must be non-empty when variable mods are given
               precursor_filter = selects the 'top300TightSequest' preprocessing
                                  pipeline instead of 'top300Sequest'
               high_res_gauss_dist = forwarded to filter_theoretical_peaks,
                                     create_drip_structure and write_covar_file

        Returns:
               the entry [sid, c][0] of the first value returned by
               ppsm.parse_dripExtract, after the observed spectrum, DRIP
               features and b/y sets have been attached to it.

        Side effects: creates ./encode, ./encode/obs and ./log (relative to the
        current working directory) and writes the GMTK input/output files there.
        Exits the interpreter with status -1 if a model file cannot be created.

        NOTE(review): stdo and stde are used for the subprocess streams but are
        not parameters of this function; they are presumably module-level
        names - verify.
    """

    # work on a copy so that preprocessing does not touch the caller's spectrum
    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)

    normalize = 'top300TightSequest' if precursor_filter else 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced by the preprocessed m/z list BEFORE the
    # zip below, so the membership test is always true and the original
    # intensities are paired positionally with the preprocessed m/z values.
    # If preprocessing can drop peaks this pairing is probably not what was
    # intended - confirm before relying on s0.intensity.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)    
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                                  if mz in mz_vals]
    num_psms = 1

    max_obs_mass = 2001

    # working directories (obs is a sub directory of the output directory, so
    # the output directory has to be created first)
    output_dir = os.path.abspath('encode')
    obs_dir = 'obs'
    pfile_dir = os.path.join(output_dir, obs_dir)
    log_dir = os.path.abspath('log')
    for needed_dir in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(needed_dir):
            os.mkdir(needed_dir)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if varMods or ntermVarMods or ctermVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            # pass the parsed variable mods (previously undefined names
            # varNtermMods/varCtermMods were used here -> NameError)
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c, 
                                                      mods, ntermMods, ctermMods, 
                                                      varMods, ntermVarMods, ctermVarMods, 
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of b/y ions BEFORE filtering; this is what goes into pepDB.txt
        l = len(bNy)
        filter_theoretical_peaks_lowres(bNy, 
                                        dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if varMods or ntermVarMods or ctermVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            # pass the parsed variable mods (previously undefined names
            # varNtermMods/varCtermMods were used here -> NameError)
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c, 
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        # number of b/y ions BEFORE filtering; this is what goes into pepDB.txt
        l = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices
    ion_to_index_map = {}
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)
    peptide_obs_file = os.path.join(pfile_dir,'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir,'spectrum')

    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepdb_file = os.path.join(output_dir, 'pepDB.txt')

    pep_num = 0
    # Both files are closed (even if a writer raises) before gmtkDTindex and
    # the parser read them.
    with open(dtFile, "w") as pep_dt:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        with open(pepdb_file, "w") as pepdb_list:
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy, 
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, 
                                                      p, l, c))

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile], 
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    # (Exception rather than a bare except, so Ctrl-C / SystemExit are not
    # reported as a file-creation failure, and the real cause is printed.)
    try:
        create_drip_structure(highResMs2, args.structure_file, 
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception as err:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file, 
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception as err:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception as err:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file, 
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception as err:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        print("Reason: %s" % (err,))
        exit(-1)

    # run GMTK
    # preprocessor definitions, handed to gmtkViterbi as ONE argument
    cpp_defines = ' '.join(['-DITERABLE_DT=' + dtFile,
                            '-DDRIP_MZ=' + args.mean_file,
                            '-DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file,
                            '-DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file,
                            '-DDRIP_MZ_GAUSSIANS=' + args.collection_file])

    # call gmtkViterbi; the command is built as an argument list so that paths
    # containing whitespace (e.g. from the working directory) are not split
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vit_cmd = ["gmtkViterbi",
               "-strFile", args.structure_file,
               "-triFile", args.structure_file + ".trifile",
               "-ni1", "0", "-nf1", "2", "-ni2", "1", "-nf2", "0",
               "-fdiffact2", "rl",
               "-inputMasterFile", args.master_file,
               "-inputTrainableParameters", "trained.params",
               "-failOnZeroClique", "F",
               "-vitValsFile", vitValsFile,
               "-of1", spectrum_obs_file, "-fmt1", "flatascii",
               "-of2", peptide_obs_file, "-fmt2", "flatascii",
               "-cppCommand", cpp_defines]
    call(vit_cmd, stdout = stdo, stderr = stde)

    # parse output
    t,d = ppsm.parse_dripExtract(vitValsFile, pepdb_file)

    t = t[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2, 
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
Beispiel #3
0
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '', varModSequence = '',
        precursor_filter = False, 
        high_res_gauss_dist = 0.05):
    """ Run DRIP (gmtkViterbi) on a single peptide-spectrum match and return the decoded PSM.

        Inputs:
               p = peptide string
               s0 = observed spectrum, instance of class MS2Spectrum.  It is
                    modified in place: s0.mz and s0.intensity are overwritten
                    with the values kept for plotting.
               c = psm charge
               highResMs2 = True to use the high-resolution ion/means code path
               dripLearnedMeans = file passed to load_drip_means (low-res only)
               dripLearnedCovars = file passed to write_covar_file
               mods = static mods
               ntermMods = static nterm-mods
               ctermMods = static cterm-mods
               varModSequence = string denoting which amino acids are var mods;
                                must be non-empty when variable mods are given
               precursor_filter = selects the 'top300TightSequest' preprocessing
                                  pipeline instead of 'top300Sequest'
               high_res_gauss_dist = forwarded to filter_theoretical_peaks,
                                     create_drip_structure and write_covar_file

        Returns:
               the entry [sid, c][0] of the first value returned by
               ppsm.parse_dripExtract, after the observed spectrum, DRIP
               features and b/y sets have been attached to it.

        Side effects: creates ./encode, ./encode/obs and ./log (relative to the
        current working directory) and writes the GMTK input/output files there.
        Exits the interpreter with status -1 if a model file cannot be created.

        NOTE(review): stdo and stde are used for the subprocess streams but are
        not parameters of this function; they are presumably module-level
        names - verify.
    """

    # work on a copy so that preprocessing does not touch the caller's spectrum
    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)

    normalize = 'top300TightSequest' if precursor_filter else 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced by the preprocessed m/z list BEFORE the
    # zip below, so the membership test is always true and the original
    # intensities are paired positionally with the preprocessed m/z values.
    # If preprocessing can drop peaks this pairing is probably not what was
    # intended - confirm before relying on s0.intensity.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)    
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                                  if mz in mz_vals]
    num_psms = 1

    max_obs_mass = 2001

    # working directories (obs is a sub directory of the output directory, so
    # the output directory has to be created first)
    output_dir = os.path.abspath('encode')
    obs_dir = 'obs'
    pfile_dir = os.path.join(output_dir, obs_dir)
    log_dir = os.path.abspath('log')
    for needed_dir in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(needed_dir):
            os.mkdir(needed_dir)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if varMods or ntermVarMods or ctermVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            # pass the parsed variable mods (previously undefined names
            # varNtermMods/varCtermMods were used here -> NameError)
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c, 
                                                      mods, ntermMods, ctermMods, 
                                                      varMods, ntermVarMods, ctermVarMods, 
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of b/y ions BEFORE filtering; this is what goes into pepDB.txt
        l = len(bNy)
        filter_theoretical_peaks_lowres(bNy, 
                                        dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if varMods or ntermVarMods or ctermVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            # pass the parsed variable mods (previously undefined names
            # varNtermMods/varCtermMods were used here -> NameError)
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c, 
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        # number of b/y ions BEFORE filtering; this is what goes into pepDB.txt
        l = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices
    ion_to_index_map = {}
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)
    peptide_obs_file = os.path.join(pfile_dir,'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir,'spectrum')

    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepdb_file = os.path.join(output_dir, 'pepDB.txt')

    pep_num = 0
    # Both files are closed (even if a writer raises) before gmtkDTindex and
    # the parser read them.
    with open(dtFile, "w") as pep_dt:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        with open(pepdb_file, "w") as pepdb_list:
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy, 
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, 
                                                      p, l, c))

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile], 
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    # (Exception rather than a bare except, so Ctrl-C / SystemExit are not
    # reported as a file-creation failure, and the real cause is printed.)
    try:
        create_drip_structure(highResMs2, args.structure_file, 
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception as err:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file, 
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception as err:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception as err:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        print("Reason: %s" % (err,))
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file, 
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception as err:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        print("Reason: %s" % (err,))
        exit(-1)

    # run GMTK
    # preprocessor definitions, handed to gmtkViterbi as ONE argument
    cpp_defines = ' '.join(['-DITERABLE_DT=' + dtFile,
                            '-DMAX_FRAGMENT_MASS=' + str(max_obs_mass),
                            '-DDRIP_MZ=' + args.mean_file,
                            '-DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file,
                            '-DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file,
                            '-DDRIP_MZ_GAUSSIANS=' + args.collection_file])

    # call gmtkViterbi; the command is built as an argument list so that paths
    # containing whitespace (e.g. from the working directory) are not split
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vit_cmd = ["gmtkViterbi",
               "-strFile", args.structure_file,
               "-triFile", args.structure_file + ".trifile",
               "-ni1", "0", "-nf1", "2", "-ni2", "1", "-nf2", "0",
               "-fdiffact2", "rl",
               "-inputMasterFile", args.master_file,
               "-inputTrainableParameters", "trained.params",
               "-failOnZeroClique", "F",
               "-vitValsFile", vitValsFile,
               "-of1", spectrum_obs_file, "-fmt1", "flatascii",
               "-of2", peptide_obs_file, "-fmt2", "flatascii",
               "-cppCommand", cpp_defines]
    call(vit_cmd, stdout = stdo, stderr = stde)

    # parse output
    t,d = ppsm.parse_dripExtract(vitValsFile, pepdb_file)

    t = t[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2, 
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t