if args.loc_out != '':
    if not os.path.isdir(args.loc_out):
        os.makedirs(args.loc_out)
        log.critical("created output folder %r", args.loc_out)
config = configparser.RawConfigParser()
config.read(os.path.join(os.path.dirname(
    os.path.realpath(sys.argv[0])), 'moff_setting.properties'))
# For Galaxy input it is possible to use one big input file plus a list of raw files.
# The big file must contain the results of every raw file, and the column 'Spectrum File'
# must be available. This option works only with PS reports, using --tsv_list and --raw_list together.
if (args.tsv_list is not None) and (args.raw_list is not None) and (len(args.tsv_list) == 1):
    data_temp = pd.read_csv(args.tsv_list[0], sep="\t")
    if moff.check_ps_input_data(data_temp.columns.tolist(),
                                ast.literal_eval(config.get('moFF', 'ps_default_export_v1'))) == 1:
        # split the input file only if more than ONE raw file is given and the
        # input file contains identifications for more than ONE run
        if len(data_temp['Spectrum File'].unique()) > 1 and len(args.raw_list) > 1:
            output_list_loc = []
            for file in data_temp['Spectrum File'].unique():
                data_temp[data_temp['Spectrum File'] == file].to_csv(
                    os.path.join(os.path.split(args.tsv_list[0])[0], file.split('.')[0] + '.txt'),
                    sep='\t', index=False)
                output_list_loc.append(
                    os.path.join(os.path.split(args.tsv_list[0])[0], file.split('.')[0] + '.txt'))
            if len(args.raw_list) != len(output_list_loc):
                exit('-- Number of raw files differs from the number of input sources detected in your single input file --')
            # sort them to be sure about the association between input and raw file
            args.raw_list = sorted(args.raw_list)
            args.tsv_list = sorted(output_list_loc)
            # clean up the dataset that is no longer used
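# Hedged illustration (not part of the moFF pipeline): a minimal sketch of the split
# performed above, where a single PeptideShaker-style report covering several runs is
# partitioned on its 'Spectrum File' column into one TSV per run, so that each raw file
# can be paired with its own identification file. The toy dataframe, the default out_dir
# and the helper name are hypothetical.
def _demo_split_by_run(out_dir='.'):
    combined = pd.DataFrame({
        'Spectrum File': ['run1.mzML', 'run1.mzML', 'run2.mzML'],
        'Sequence': ['PEPTIDEK', 'ANOTHERK', 'PEPTIDEK'],
    })
    per_run_files = []
    for run in combined['Spectrum File'].unique():
        dest = os.path.join(out_dir, run.split('.')[0] + '.txt')
        # one TSV per run, named after the raw file stem
        combined[combined['Spectrum File'] == run].to_csv(dest, sep='\t', index=False)
        per_run_files.append(dest)
    # sorted(per_run_files) is then paired positionally with sorted(raw_list)
    return sorted(per_run_files)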
def run_mbr(args):
    """
    Matching Between Runs module.

    :param args:
    :return:
    """
    ch = logging.StreamHandler()
    ch.setLevel(logging.ERROR)
    log.addHandler(ch)
    if args.loc_in is None:
        # the user uses the --inputtsv option
        if args.loc_out is not None:
            # with --output_folder, the mbr folder is created there
            output_dir = os.path.join(args.loc_out, 'mbr_output')
        else:
            # without --output_folder, the mbr folder is created in the moFF location
            output_dir = os.path.join('mbr_output')
            print(os.path.abspath(output_dir))
    else:
        # the user uses the --inputF option
        if os.path.exists(os.path.join(args.loc_in)):
            output_dir = os.path.join(args.loc_in, 'mbr_output')
        else:
            exit(os.path.join(args.loc_in) +
                 ' EXIT input folder path is not well specified --> / missing or wrong path')
    if not os.path.isdir(output_dir):
        log.critical("Created MBR output folder in: %s", os.path.abspath(output_dir))
        os.makedirs(output_dir)
    else:
        log.critical("MBR output folder in: %s", os.path.abspath(output_dir))
    # set the log to file
    w_mbr = logging.FileHandler(
        os.path.join(output_dir, args.log_label + '_mbr_.log'), mode='w')
    w_mbr.setLevel(logging.INFO)
    log.addHandler(w_mbr)
    moff_path = os.path.dirname(os.path.realpath(sys.argv[0]))
    config = configparser.RawConfigParser()
    # the settings file is always placed in the same folder as moff_mbr.py
    config.read(os.path.join(moff_path, 'moff_setting.properties'))
    # read the input
    # names of the input files
    exp_set = []
    # list of the input dataframes
    exp_t = []
    # list of the output dataframes
    exp_out = []
    # list of the input dataframes used as helpers
    exp_subset = []
    # list of the names of the mbr output files
    exp_out_name = []
    if args.loc_in is None:
        for id_name in args.tsv_list:
            exp_set.append(id_name)
    else:
        for item in os.listdir(args.loc_in):
            if os.path.isfile(os.path.join(args.loc_in, item)):
                if os.path.join(args.loc_in, item).endswith('.' + args.ext):
                    log.critical(item)
                    exp_set.append(os.path.join(args.loc_in, item))
    # the sample option is valid only if the input-folder option is valid
    if (args.sample is not None) and (args.loc_in is not None):
        exp_set_app = copy.deepcopy(exp_set)
        for a in exp_set:
            if re.search(args.sample, a) is None:
                exp_set_app.remove(a)
        exp_set = exp_set_app
    if (exp_set == []) or (len(exp_set) == 1):
        exit('ERROR input files not found or only one input file selected. '
             'Check the folder or the extension given in input')
    for a in exp_set:
        log.critical('Reading file: %s', a)
        exp_subset.append(a)
        data_moff = pd.read_csv(a, sep="\t", header=0)
        list_name = data_moff.columns.values.tolist()
        # get the list of PS default columns from the properties file
        list_ps_def = ast.literal_eval(config.get('moFF', 'ps_default_export_v1'))
        # check whether the input file is a PS export; if so, map it to the moFF column names
        if moff.check_ps_input_data(list_name, list_ps_def) == 1:
            log.critical('Detected input file from PeptideShaker export: %s', a)
            # map the column names according to the moFF input requirements
            data_moff, list_name = moff.map_ps2moff(data_moff, 'col_must_have_mbr')
            log.critical('Mapping column names to the moFF requested column names: %s', a)
        if moff.check_columns_name(
                list_name, ast.literal_eval(config.get('moFF', 'col_must_have_mbr')), log) == 1:
            exit('ERROR minimal requested fields are missing or wrong')
        data_moff['matched'] = 0
        data_moff['mass'] = data_moff['mass'].map('{:.4f}'.format)
        data_moff['code_unique'] = data_moff['mod_peptide'].astype(str)
        data_moff = data_moff.sort_values(by='rt')
        exp_t.append(data_moff)
        exp_out.append(data_moff)
    log.critical('Read input --> done')
    # number of replicates; mandatory fields are
    # ['matched', 'peptide', 'mass', 'mz', 'charge', 'prot', 'rt']
    n_replicates = len(exp_t)
    exp_set = exp_subset
    aa = range(0, n_replicates)
    # all the ordered pairs of replicates, used to fit one model per pair
    out = list(itertools.product(aa, repeat=2))
    # add the matched column
    list_name.append('matched')
    # final status: -1 if one of the outputs is empty
    out_flag = 1
    # input of the methods
    diff_field = np.setdiff1d(exp_t[0].columns, [
        'matched', 'mod_peptide', 'peptide', 'mass', 'mz', 'charge', 'prot', 'rt'])
    log.info('Outlier filtering is %s', 'active' if args.out_flag else 'not active')
    log.info('Number of replicates %i', n_replicates)
    log.info('Pairwise model computation ----')
    if args.rt_feat_file is not None:
        log.critical('Custom list of peptides provided by the user in %s', args.rt_feat_file)
        shared_pep_list = pd.read_csv(args.rt_feat_file, sep='\t')
        shared_pep_list['mass'] = shared_pep_list['mass'].map('{:.4f}'.format)
        shared_pep_list['code'] = shared_pep_list['peptide'].astype(str) + '_' + \
            shared_pep_list['mass'].astype(str)
        list_shared_pep = shared_pep_list['code']
        log.info('Custom list of peptides contains %i entries', list_shared_pep.shape[0])
    for jj in aa:
        # list of the saved models
        model_save = []
        # list of the errors in minutes or seconds
        model_err = []
        # list of the model statuses: -1 means the model is not available
        # because the training set has too few points
        model_status = []
        c_rt = 0
        pre_pep_save = []
        log.info('matching in %s', exp_set[jj])
        # keep only the pairs (jj, k) with k != jj
        result = itertools.filterfalse(lambda x: x[0] != jj or x[1] == jj, out)
        for i in result:
            if args.rt_feat_file is not None:
                # use the custom peptide list
                comA = exp_t[i[0]][exp_t[i[0]]['code_unique'].isin(list_shared_pep)][
                    ['code_unique', 'peptide', 'prot', 'rt']]
                comB = exp_t[i[1]][exp_t[i[1]]['code_unique'].isin(list_shared_pep)][
                    ['code_unique', 'peptide', 'prot', 'rt']]
                comA = comA.groupby('code_unique', as_index=False).mean()
                comB = comB.groupby('code_unique', as_index=False).mean()
                common = pd.merge(comA, comB, on=['code_unique'], how='inner')
            else:
                # use the shared peptides
                log.info('Matching %s peptides searching in %s', exp_set[i[0]], exp_set[i[1]])
                list_pep_repA = exp_t[i[0]]['code_unique'].unique()
                list_pep_repB = exp_t[i[1]]['code_unique'].unique()
                log.info('Unique peptides (mass + sequence) %i , %i',
                         list_pep_repA.shape[0], list_pep_repB.shape[0])
                set_dif_s_in_1 = np.setdiff1d(list_pep_repB, list_pep_repA)
                add_pep_frame = exp_t[i[1]][exp_t[i[1]]['code_unique'].isin(set_dif_s_in_1)].copy()
                # -- prepare the testing set
                add_pep_frame = add_pep_frame[
                    ['peptide', 'mod_peptide', 'mass', 'mz', 'charge', 'prot', 'rt']]
                add_pep_frame['code_unique'] = add_pep_frame['mod_peptide'] + '_' + \
                    add_pep_frame['prot'] + '_' + '_' + \
                    add_pep_frame['charge'].astype(str)
                add_pep_frame = add_pep_frame.groupby('code_unique', as_index=False)[
                    ['peptide', 'mod_peptide', 'mass', 'charge', 'mz', 'prot', 'rt']].aggregate(max)
                add_pep_frame = add_pep_frame[
                    ['code_unique', 'peptide', 'mod_peptide', 'mass', 'mz', 'charge', 'prot', 'rt']]
                list_name = add_pep_frame.columns.tolist()
                list_name = [w.replace('rt', 'rt_' + str(c_rt)) for w in list_name]
                add_pep_frame.columns = list_name
                pre_pep_save.append(add_pep_frame)
                c_rt += 1
                # --------
                pep_shared = np.intersect1d(list_pep_repA, list_pep_repB)
                log.info('Peptides (mass + sequence) added: %i', add_pep_frame.shape[0])
                log.info('Peptides (mass + sequence) shared: %i', pep_shared.shape[0])
                comA = exp_t[i[0]][exp_t[i[0]]['code_unique'].isin(pep_shared)][
                    ['code_unique', 'peptide', 'prot', 'rt']]
                comB = exp_t[i[1]][exp_t[i[1]]['code_unique'].isin(pep_shared)][
                    ['code_unique', 'peptide', 'prot', 'rt']]
                # filtering using the variance (added 17_08)
                flag_var_filt = False
                if flag_var_filt:
                    dd = comA.groupby('code_unique', as_index=False)
                    top_res = dd.agg(['std', 'mean', 'count'])
                    th = np.nanpercentile(top_res['rt']['std'].values, 60)
                    comA = comA[~comA['code_unique'].isin(top_res[top_res['rt']['std'] > th].index)]
                    # data B
                    dd = comB.groupby('code_unique', as_index=False)
                    top_res = dd.agg(['std', 'mean', 'count'])
                    th = np.nanpercentile(top_res['rt']['std'].values, 60)
                    comB = comB[~comB['code_unique'].isin(top_res[top_res['rt']['std'] > th].index)]
                comA = comA.groupby('code_unique', as_index=False).mean()
                comB = comB.groupby('code_unique', as_index=False).mean()
                common = pd.merge(comA, comB, on=['code_unique'], how='inner')
            if common.shape[0] <= 10 and args.rt_feat_file is not None:
                model_status.append(-1)
                continue
            else:
                # outlier filtering option
                if args.out_flag:
                    filt_x, filt_y, pos_out = MD_removeOutliers(
                        common['rt_y'].values, common['rt_x'].values, args.w_filt)
                    data_B = np.reshape(filt_x, [filt_x.shape[0], 1])
                    data_A = np.reshape(filt_y, [filt_y.shape[0], 1])
                    log.info('Outliers found: %i w.r.t. %i', pos_out.shape[0], common['rt_y'].shape[0])
                else:
                    data_B = np.reshape(common['rt_y'].values, [common.shape[0], 1])
                    data_A = np.reshape(common['rt_x'].values, [common.shape[0], 1])
                log.info('Training size of shared peptides: %i %i', data_A.shape[0], data_B.shape[0])
                # select the regularization strength by cross-validation,
                # then refit a plain Ridge model with the chosen alpha
                clf = linear_model.RidgeCV(alphas=np.power(2, np.linspace(-30, 30)),
                                           scoring='neg_mean_absolute_error')
                clf.fit(data_B, data_A)
                clf_final = linear_model.Ridge(alpha=clf.alpha_)
                clf_final.fit(data_B, data_A)
                # save the model
                model_save.append(clf_final)
                model_err.append(mean_absolute_error(data_A, clf_final.predict(data_B)))
                log.info('Mean absolute error training: %4.4f sec',
                         mean_absolute_error(data_A, clf_final.predict(data_B)))
                model_status.append(1)
                '''
                # GP version
                model_gp, predicted_train, error = train_gp(
                    data_A, data_B, c=str(i[0]) + '_' + str(i[1]))
                model_err.append(error)
                model_save.append(model_gp)
                model_status.append(1)
                '''
        if np.where(np.array(model_status) == -1)[0].shape[0] >= (len(aa) / 2):
            log.error('MBR aborted: MBR cannot be run, not enough shared peptides among the replicates')
            exit('ERROR: MBR cannot be run, not enough shared peptides among the replicates')
        log.info('Combination of the models --------')
        log.info('Model combination: %s', 'weighted' if args.w_comb else 'unweighted')
        if n_replicates == 2:
            test = pre_pep_save[0]
        else:
            test = reduce(lambda left, right: pd.merge(
                left, right,
                on=['code_unique', 'peptide', 'mod_peptide', 'mass', 'mz', 'charge', 'prot'],
                how='outer'), pre_pep_save)
        test = test.groupby('code_unique', as_index=False).aggregate(max)
        test.drop('code_unique', axis=1, inplace=True)
        test['time_pred'] = test.iloc[:, 6:(6 + (n_replicates - 1))].apply(
            lambda x: combine_model(x, model_save, model_err, args.w_comb), axis=1)
        test['matched'] = 1
        # peptides with a non-positive predicted RT are removed
        if test[test['time_pred'] <= 0].shape[0] >= 1:
            log.info(' -- Predicted negative RT: those peptides will be deleted')
            test = test[test['time_pred'] > 0]
        list_name = test.columns.tolist()
        list_name = [w.replace('time_pred', 'rt') for w in list_name]
        test.columns = list_name
        test = test[['peptide', 'mod_peptide', 'mass', 'mz', 'charge', 'prot', 'rt', 'matched']]
        # fill the missing fields with NaN
        for field in diff_field.tolist():
            test[field] = np.nan
        log.info('Before adding, %s contains %i peptides', exp_set[jj], exp_t[jj].shape[0])
        exp_out[jj] = pd.concat([exp_t[jj], test], join='outer', axis=0, sort=False)
        log.info('After MBR, %s contains %i peptides', exp_set[jj], exp_out[jj].shape[0])
        log.critical('matched features %i MS2 features %i',
                     exp_out[jj][exp_out[jj]['matched'] == 1].shape[0],
                     exp_out[jj][exp_out[jj]['matched'] == 0].shape[0])
        exp_out[jj].to_csv(
            path_or_buf=os.path.join(
                output_dir, os.path.split(exp_set[jj])[1].split('.')[0] + '_match.txt'),
            sep='\t', index=False)
        exp_out_name.append(os.path.join(
            output_dir, os.path.split(exp_set[jj])[1].split('.')[0] + '_match.txt'))
        if exp_out[jj].shape[0] > 0:
            out_flag = 1 * out_flag
        else:
            out_flag = -1 * out_flag
    w_mbr.close()
    log.removeHandler(w_mbr)
    return out_flag, exp_out_name
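# Hedged illustration (not moFF's code path): a self-contained sketch of the pairwise
# RT-alignment technique used in run_mbr() above. Shared peptides give (rt_B, rt_A)
# pairs; RidgeCV picks a regularization strength over the same alpha grid
# (2 ** linspace(-30, 30)) with negative-MAE scoring, and a final Ridge model maps RTs
# from run B onto the RT scale of run A. The toy data and the helper name are hypothetical.
def _demo_rt_alignment():
    rng = np.random.default_rng(0)
    rt_B = np.sort(rng.uniform(5, 120, size=200))        # RTs observed in run B
    rt_A = 1.02 * rt_B + 0.8 + rng.normal(0, 0.3, 200)   # same peptides in run A, noisy
    data_B = rt_B.reshape(-1, 1)                         # training input
    data_A = rt_A.reshape(-1, 1)                         # training target
    clf = linear_model.RidgeCV(alphas=np.power(2, np.linspace(-30, 30)),
                               scoring='neg_mean_absolute_error')
    clf.fit(data_B, data_A)
    clf_final = linear_model.Ridge(alpha=clf.alpha_)
    clf_final.fit(data_B, data_A)
    # training MAE, the same quantity run_mbr() stores in model_err
    return clf_final, mean_absolute_error(data_A, clf_final.predict(data_B))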
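# combine_model() is defined elsewhere in moFF and is not shown in this section.
# A plausible reading of the weighted option (args.w_comb) is an inverse-error
# weighted average of the per-model RT predictions, falling back to a plain mean
# when unweighted. The sketch below is an assumption for illustration only, not
# moFF's actual implementation.
def _combine_model_sketch(rt_values, models, model_errs, weighted):
    # rt_values : RTs of one peptide observed in the other runs (NaN if missing)
    # models    : fitted pairwise regression models, one per other run
    # model_errs: training MAE of each model, used as inverse weights
    preds, weights = [], []
    for rt, model, err in zip(rt_values, models, model_errs):
        if np.isnan(rt):
            continue  # the peptide was not identified in that run
        preds.append(model.predict(np.array([[rt]])).ravel()[0])
        weights.append(1.0 / err if weighted else 1.0)
    if not preds:
        return np.nan
    return float(np.average(preds, weights=weights))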
else:
    raw_list = None
# bug fix: the original 'args.raw_repo if not None else raw_list' always evaluated
# to args.raw_repo, because 'not None' is True
loc_raw = args.raw_repo if args.raw_repo is not None else raw_list
loc_output = args.loc_out
config = configparser.RawConfigParser()
config.read(os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'moff_setting.properties'))
df = pd.read_csv(file_name, sep="\t")
# TODO: add some safety checks (len > 1)
# flag for the PRIDE pipeline, or to switch the input RT time scale from seconds to minutes
moff_pride_flag = False
if moff.check_ps_input_data(df.columns.tolist(),
                            ast.literal_eval(config.get('moFF', 'moffpride_format'))) == 1:
    # if it is moffPride data, no other requirement is checked
    log.critical('moffPride input detected')
    moff_pride_flag = True
else:
    if 'matched' not in df.columns:
        # check whether it is a PS file
        list_name = df.columns.values.tolist()
        # get the list of PS default columns from the properties file
        list_ps_def = ast.literal_eval(config.get('moFF', 'ps_default_export_v1'))
        # check whether the input file is a PS export; if so, map it to the moFF column names
        if moff.check_ps_input_data(list_name, list_ps_def) == 1:
            # map the column names according to the moFF input requirements
            if not args.peptide_summary: