import gc

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.stats import norm

# time_index, mz_index, smoothingtime_mat, rescale_mat and flatten are helper
# functions assumed to be defined elsewhere in the project.


def expmat_construction(exp_file, exp_paramlist, charge_list):
    # unpack the MS level 1 parameter set
    mslev = 1
    mm, mrange_min, mrange_max, mz_range, MZ_SCALE, \
        tt, gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE, \
        window, shift = exp_paramlist[mslev]
    print(' from:', exp_file, ' using mass spectrogram from',
          gradient_starttime, 'to', gradient_endtime, 'minutes')
    exp_df = pd.read_pickle(exp_file)
    # convert these columns from str to numeric
    exp_df_head = ['ind', 'mslev', 'bpmz', 'bpint', 'starttime']
    for each in exp_df_head:
        exp_df[each] = pd.to_numeric(exp_df[each])
    # keep only scans whose start time falls inside the gradient window
    exp_df = exp_df[exp_df['starttime'] >= gradient_starttime]
    exp_df = exp_df[exp_df['starttime'] < gradient_endtime]

    # prepend each base peak value (bpmz/bpint) to its array, giving one combined list per scan
    for bp, ar, combine in zip(['bpmz', 'bpint'], ['mzarray', 'intarray'],
                               ['allmz', 'allint']):
        exp_df[combine] = exp_df[bp].apply(lambda x: [x]) + exp_df[ar]

    ## Create index
    exp_df['starttime'] = time_index(exp_df['starttime'], gradient_starttime,
                                     tt)
    exp_df['allmz'] = mz_index(exp_df['allmz'].values, mrange_min, mrange_max,
                               mm)
    exp_df = exp_df[['ind', 'starttime', 'allmz', 'allint']]
    time_col = []
    time_col_temp = []
    for index, row in exp_df.iterrows():
        # drop m/z indices (and their intensities) that fall outside [0, MZ_SCALE)
        row['allint'] = [
            i for m, i in zip(row['allmz'], row['allint'])
            if 0 <= m < MZ_SCALE
        ]
        row['allmz'] = [m for m in row['allmz'] if 0 <= m < MZ_SCALE]
        # bincount sums intensities that share an m/z index, giving one column of length MZ_SCALE per scan
        timecol_array = np.bincount(row['allmz'],
                                    row['allint'],
                                    minlength=(MZ_SCALE))
        timecol_array[timecol_array < 1] = 0  # treat summed intensities below 1 as noise
        time_col_temp.append(timecol_array)  # one summed-intensity column per scan
        if index % 500 == 0:
            # periodically flush the buffer into the main list
            time_col.extend(time_col_temp)
            time_col_temp = []
    # flush last
    time_col.extend(time_col_temp)
    exp_df['allint_overlap'] = time_col

    # assemble COO triplets: row = m/z index, col = time index, value = summed intensity
    expdf_row = np.tile(np.arange(MZ_SCALE), exp_df.shape[0])
    expdf_col = np.repeat(exp_df['starttime'].values, MZ_SCALE)
    expdf_value = np.concatenate(exp_df['allint_overlap'].values)

    exp_mat = sparse.coo_matrix((expdf_value, (expdf_row, expdf_col)),
                                shape=(MZ_SCALE, TIME_SCALE))

    exp_mat = smoothingtime_mat(exp_mat, window, shift)
    exp_mat, mat_mean = rescale_mat(exp_mat)
    return exp_mat, exp_paramlist, mat_mean
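
# A minimal, self-contained sketch (not part of the original source) of the
# binning trick used in expmat_construction: np.bincount with a weights argument
# sums all intensities that map to the same m/z index, producing one fixed-length
# column of the experimental matrix per scan. The toy values below are
# illustrative assumptions.
def _bincount_column_sketch():
    import numpy as np
    mz_idx = np.array([0, 2, 2, 5])              # binned m/z indices of one scan
    intensity = np.array([1.0, 3.0, 4.0, 2.0])   # matching intensities
    column = np.bincount(mz_idx, weights=intensity, minlength=8)
    # column -> [1., 0., 7., 0., 0., 2., 0., 0.]; the two hits in bin 2 are summed
    return column
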
Example No. 2
def h_prediction(initRT_tuple, initRT_width, Hpeak_mean, noise_number,
                 globalparam_list, gaussian_width):
    gc.disable()
    # unpack the MS level 1 parameter set (index 1 of globalparam_list)
    mm, mrange_min, mrange_max, mz_range, MZ_SCALE, \
        tt, gradient_starttime, gradient_endtime, gradient_time, TIME_SCALE, \
        window, shift = globalparam_list[1]

    peakside_index = int(gaussian_width / 2 * tt)  # half peak width in time-index units, before smoothing
    peakwidth_index = int((peakside_index * 2) + 1)
    peakSD_index = peakwidth_index / 4  # SD chosen so ~95% of the Gaussian lies inside the peak window
    rt_index = time_index([x[2] for x in initRT_tuple], gradient_starttime, tt)
    keeprow, keepcol, keepdata = [], [], []
    for idx, rt in enumerate(rt_index):
        if rt < 0:
            rt = 0
        elif rt > TIME_SCALE - 1:
            rt = TIME_SCALE - 1
        left, right = rt - peakside_index, rt + peakside_index
        peak_at = np.linspace(left, right, peakwidth_index)
        peak_int = norm.pdf(peak_at, rt, peakSD_index) * Hpeak_mean
        # drop peak points that fall outside the time range
        mask = (peak_at >= 0) & (peak_at < TIME_SCALE)
        peak_at = peak_at[mask]
        peak_int = peak_int[mask]

        # COO triplets: one row per initRT entry, column = time index, value = Gaussian intensity
        keeprow.append([idx] * len(peak_at))
        keepcol.append(peak_at.astype(int))
        keepdata.append(peak_int)

    keeprow, keepcol, keepdata = (flatten(keeprow), flatten(keepcol),
                                  flatten(keepdata))
    H_mat = sparse.coo_matrix((keepdata, (keeprow, keepcol)),
                              shape=(len(initRT_tuple), TIME_SCALE))
    H_mat = smoothingtime_mat(H_mat, window, shift)

    # clamp the initial RT values to the gradient time window
    initRT_correct = [
        min(max(x[2], gradient_starttime), gradient_endtime)
        for x in initRT_tuple
    ]

    # append the clamped RT to each original tuple
    initRT_correct_keep = [(a, b, c, rt)
                           for (a, b, c), rt in zip(initRT_tuple, initRT_correct)]
    del initRT_correct

    return H_mat, initRT_correct_keep
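
# A minimal sketch (assumptions only, not part of the original module) of how a
# single row of H_mat is shaped in h_prediction: a unit-area Gaussian from
# scipy.stats.norm.pdf is evaluated at integer time indices around the expected
# retention time and scaled by Hpeak_mean. TIME_SCALE, rt, Hpeak_mean and
# peakside below are toy values.
def _gaussian_row_sketch():
    import numpy as np
    from scipy.stats import norm
    TIME_SCALE, rt, Hpeak_mean, peakside = 20, 8, 100.0, 3
    at = np.arange(rt - peakside, rt + peakside + 1)  # time indices under the peak
    sd = (2 * peakside + 1) / 4                       # same width/4 rule as above
    row = np.zeros(TIME_SCALE)
    row[at] = norm.pdf(at, loc=rt, scale=sd) * Hpeak_mean
    return row  # a Gaussian elution profile centred at time index 8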