Ejemplo n.º 1
0
def make_drip_data_lowres(args, spectra, stdo, stde):
    """Write the low-resolution DRIP test data for every PSM and index the DTs.

       PSMs are loaded from args.psm_file (load_pin_file when
       args.append_to_pin is set, load_psms otherwise).  For each target and
       decoy PSM the interleaved b-/y-ion sequence is computed and the
       following files are written:
       - <args.output_dir>/pepDB.txt: tab-separated, one row per PSM with
         columns Kind (t/d), Sid, Peptide, NumBY, Charge
       - <args.output_dir>/iterable.dts: PSM count header, then one entry
         per PSM written by drip_peptide_sentence
       - pep-lengths.pfile and spectrum.pfile, created by create_pfile in
         <args.output_dir>/<args.obs_dir>
       gmtkDTindex is then run once on iterable.dts.

       inputs:
       args - parsed input arguments; attributes read here are mods_spec,
              nterm_peptide_mods_spec, cterm_peptide_mods_spec,
              learned_means, append_to_pin, psm_file, output_dir, obs_dir,
              normalize, filt_theo_peaks, per_spectrum_mz_bound, mz_lb,
              mz_ub and max_obs_mass
       spectra - mapping from scan ID to spectrum; mz, intensity and
                 spectrum_id are read from each spectrum
       stdo, stde - passed as stdout/stderr to the gmtkDTindex call

       outputs:
       spec_dict - scan ID -> spectrum, for every scan with at least one PSM
       pep_num - number of PSMs written

       Exits with status -1 if variable modifications were specified but the
       first target PSM carries no Var_mod_seq field.
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    useVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    varModKey = "Var_mod_seq"

    # load means
    dripMeans = load_drip_means(args.learned_means)
    # make master file
    make_master_parameters_lowres(args, dripMeans)

    if not args.append_to_pin:
        target, decoy, num_psms = load_psms(args.psm_file)
    else:
        target, decoy, num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files
    if useVarMods:
        # Probe the first available target PSM; an empty target set (or an
        # empty first PSM list) simply leaves nothing to validate.
        firstTarget = next((psm for key in target for psm in target[key]),
                           None)
        if firstTarget is not None and varModKey not in firstTarget.other:
            print("Variable modifications enzyme options specified,")
            print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
            print("Exitting")
            raise SystemExit(-1)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))

    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    # 'filter0' builds no preprocessing pipeline, so spectra are used as-is
    preprocess = None
    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)

    # NOTE(review): the pfile objects are never explicitly closed in this
    # function; confirm what create_pfile returns and who flushes it.
    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)
    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2, 0)

    # (pepDB kind tag, PSMs keyed by (sid, charge), target flag); targets are
    # written before decoys for every (sid, charge)
    psm_sources = (("t", target, True), ("d", decoy, False))

    spec_dict = {}
    pep_num = 0
    pepdb_path = os.path.join(args.output_dir, 'pepDB.txt')
    dt_path = os.path.join(args.output_dir, 'iterable.dts')
    # both streams are closed even if a PSM fails part-way through
    with open(pepdb_path, "w") as pepdb_list, open(dt_path, "w") as pep_dt:
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
        pep_dt.write('%d\n\n' % (num_psms))

        for sid, charge in sid_charges:
            # preprocess each spectrum once, however many charges reference it
            if sid not in spec_dict:
                s = spectra[sid]
                if preprocess is not None:
                    preprocess(s)
                spec_dict[sid] = s
            else:
                s = spec_dict[sid]

            if args.filt_theo_peaks:
                if args.per_spectrum_mz_bound:
                    minMz = s.mz[0]
                    maxMz = s.mz[-1]
                else:
                    minMz = args.mz_lb
                    maxMz = args.mz_ub

            for kind, psms_by_key, is_target in psm_sources:
                if (sid, charge) not in psms_by_key:
                    continue
                for psm in psms_by_key[sid, charge]:
                    pep = psm.peptide
                    if useVarMods:
                        bNy = interleave_b_y_ions_var_mods_lowres(
                            Peptide(pep), charge,
                            mods, ntermMods, ctermMods,
                            varMods, ntermVarMods, ctermVarMods,
                            psm.other[varModKey])
                    else:
                        bNy = interleave_b_y_ions_lowres(
                            Peptide(pep), charge,
                            mods, ntermMods, ctermMods)
                    # numBY for DRIP features assumes all b-/y-ions, not just
                    # those unfiltered per spectrum, so it is recorded before
                    # the filtering below
                    pepdb_list.write("%s\t%d\t%s\t%d\t%d\n"
                                     % (kind, sid, pep, len(bNy), charge))
                    if args.filt_theo_peaks:
                        filter_theoretical_peaks_lowres(bNy, dripMeans,
                                                        minMz, maxMz)
                    drip_peptide_sentence(pep_dt, pep, bNy,
                                          pep_num, s.spectrum_id,
                                          args.max_obs_mass,
                                          peptide_pfile, is_target,
                                          len(bNy)-1)
                    drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                    pep_num += 1

    # compile dt using gmtkDTIndex (iterable.dts is closed at this point)
    call(['gmtkDTindex', '-decisionTreeFiles',
          os.path.join(args.output_dir, 'iterable.dts')],
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num
Ejemplo n.º 2
0
def make_drip_data_highres(args, spectra, stdo, stde):
    """Write the high-resolution DRIP test data for every PSM and index the DTs.

       Works in two passes.  Pass 1 walks every spectrum and every charge in
       args.charges, computes the interleaved b-/y-ion sequence of each
       target and decoy PSM and collects the set of distinct fragment ions.
       The sorted ions are then numbered and handed to
       make_master_parameters.  Pass 2 walks the shuffled (sid, charge)
       pairs and writes, per PSM:
       - <args.output_dir>/iterable.dts: PSM count header, then one entry
         per PSM written by drip_peptide_sentence (ions given as indices
         into the sorted ion list)
       - <args.output_dir>/pepDB.txt: tab-separated row with columns
         Kind (t/d), Sid, Peptide, NumBY, Charge
       - pep-lengths.pfile and spectrum.pfile, created by create_pfile in
         <args.output_dir>/<args.obs_dir>
       gmtkDTindex is then run once on iterable.dts.

       inputs:
       args - parsed input arguments; attributes read here are mods_spec,
              nterm_peptide_mods_spec, cterm_peptide_mods_spec,
              append_to_pin, psm_file, output_dir, obs_dir, normalize,
              charges, filt_theo_peaks, per_spectrum_mz_bound, mz_lb, mz_ub
              and max_obs_mass
       spectra - mapping from scan ID to spectrum; mz, intensity and
                 spectrum_id are read from each spectrum
       stdo, stde - passed as stdout/stderr to the gmtkDTindex call

       outputs:
       spec_dict - scan ID -> spectrum, for every scan with at least one PSM
       pep_num - number of PSMs written

       Exits with status -1 if variable modifications were specified but the
       first target PSM carries no Var_mod_seq field.  Raises KeyError if a
       PSM's (scan, charge) was not covered by pass 1.

       NOTE(review): args is also passed to make_master_parameters, which
       presumably updates the parameter-file attributes on it - confirm there.
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    useVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    varModKey = "Var_mod_seq"

    if not args.append_to_pin:
        target, decoy, num_psms = load_psms(args.psm_file)
    else:
        target, decoy, num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files
    if useVarMods:
        # Probe the first available target PSM; an empty target set (or an
        # empty first PSM list) simply leaves nothing to validate.
        firstTarget = next((psm for key in target for psm in target[key]),
                           None)
        if firstTarget is not None and varModKey not in firstTarget.other:
            print("Variable modifications enzyme options specified,")
            print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
            print("Exitting")
            raise SystemExit(-1)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))
    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    # 'filter0' builds no preprocessing pipeline, so spectra are used as-is
    preprocess = None
    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)

    validcharges = args.charges

    # (target flag, PSMs keyed by (sid, charge)); targets are handled before
    # decoys for every (sid, charge)
    psm_sources = ((True, target), (False, decoy))

    ion_set = set() # every fragment ion used by any PSM
    # (target flag, spectrum id, charge) -> [(peptide, ions, numBY), ...] in
    # PSM order.  Stored per PSM rather than per (sid, peptide) so that the
    # same peptide at another charge, or with another variable-mod string,
    # cannot overwrite an earlier entry.
    theo_psms = {}
    # pass 1: theoretical spectra and the global ion set
    for sid in spectra:
        s = spectra[sid]
        if preprocess is not None:
            preprocess(s)
        for charge in validcharges:
            key = (s.spectrum_id, charge)
            if key not in target and key not in decoy:
                continue
            # check if we're filtering theoretical peaks outside observed m/z values
            if args.filt_theo_peaks:
                if args.per_spectrum_mz_bound:
                    minMz = s.mz[0]
                    maxMz = s.mz[-1]
                else:
                    minMz = args.mz_lb
                    maxMz = args.mz_ub

            for is_target, psms_by_key in psm_sources:
                # a (scan, charge) may have targets only or decoys only
                if key not in psms_by_key:
                    continue
                records = []
                for psm in psms_by_key[key]:
                    pep = psm.peptide
                    if useVarMods:
                        bNy = interleave_b_y_ions_var_mods(
                            Peptide(pep), charge,
                            mods, ntermMods, ctermMods,
                            varMods, ntermVarMods, ctermVarMods,
                            psm.other[varModKey])
                    else:
                        bNy = interleave_b_y_ions(
                            Peptide(pep), charge,
                            mods, ntermMods, ctermMods)
                    # NumBY counts all b-/y-ions, so take it before filtering
                    numBY = len(bNy)
                    if args.filt_theo_peaks:
                        filter_theoretical_peaks(bNy, minMz, maxMz)
                    records.append((pep, bNy, numBY))
                    ion_set.update(bNy)
                theo_psms[is_target, s.spectrum_id, charge] = records

    # number the distinct ions by their sorted position
    ions = sorted(ion_set)
    ion_dict = dict((ion, i) for i, ion in enumerate(ions))

    # make collection per spectrum
    make_master_parameters(args, ion_dict, ions)
    # NOTE(review): the pfile objects are never explicitly closed in this
    # function; confirm what create_pfile returns and who flushes it.
    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)
    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2, 0)

    spec_dict = {}
    pep_num = 0
    dt_path = os.path.join(args.output_dir, 'iterable.dts')
    pepdb_path = os.path.join(args.output_dir, 'pepDB.txt')
    # both streams are closed even if a PSM fails part-way through
    with open(dt_path, "w") as pep_dt, open(pepdb_path, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

        # pass 2: emit one DT entry / pfile sentence / pepDB row per PSM
        for sid, charge in sid_charges:
            if sid not in spec_dict:
                s = spectra[sid]
                # NOTE(review): every spectrum was already run through
                # preprocess in pass 1, so this is a second application to
                # the same object (the low-res path applies it once).
                # Kept as-is; confirm the pipeline is idempotent.
                if preprocess is not None:
                    preprocess(s)
                spec_dict[sid] = s
            else:
                s = spec_dict[sid]

            for is_target, psms_by_key in psm_sources:
                if (sid, charge) not in psms_by_key:
                    continue
                try:
                    records = theo_psms[is_target, s.spectrum_id, charge]
                except KeyError:
                    # pass 1 only covers charges listed in args.charges
                    raise KeyError("no theoretical spectra were generated "
                                   "for scan %s, charge %s (is the charge "
                                   "listed in args.charges?)"
                                   % (sid, charge))
                kind = "t" if is_target else "d"
                for pep, ions_for_psm, numBY in records:
                    # translate each ion to its index in the sorted ion list
                    bNy = [ion_dict[bOrY] for bOrY in ions_for_psm]
                    drip_peptide_sentence(pep_dt, pep, bNy,
                                          pep_num, s.spectrum_id,
                                          args.max_obs_mass,
                                          peptide_pfile, is_target,
                                          len(bNy)-1)
                    drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                    pepdb_list.write("%s\t%d\t%s\t%d\t%d\n"
                                     % (kind, sid, pep, numBY, charge))
                    pep_num += 1

    # compile dt using gmtkDTIndex (iterable.dts is closed at this point)
    call(['gmtkDTindex', '-decisionTreeFiles',
          os.path.join(args.output_dir, 'iterable.dts')],
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num