def generate_peakml_files(self, xseto, polarity_dir, peakml_params):
    """Write one .peakml file per sample via PeakML.xcms.write.SingleInstance.

    :param xseto: R list of per-sample xcmsSet objects (indexable by position,
        with R names() giving the original mzXML file names).
    :param polarity_dir: directory where the generated .peakml files go.
    :param peakml_params: R list of PeakML writer parameters.
    """
    ionisation = self.get_value(peakml_params, 'ionisation')
    add_scans = self.get_value(peakml_params, 'addscans')
    write_rejected = self.get_value(peakml_params, 'writeRejected')
    apodisation_filter = self.get_value(peakml_params, 'ApodisationFilter')
    ppm = self.get_value(peakml_params, 'ppm')
    # this doesn't seem to be set inside peakml_params ..?
    if ppm == 0:
        ppm = 5

    # derive output paths by swapping the 'mzxml' part of each sample name
    # (case-insensitively) for 'peakml'
    regex = re.compile(re.escape('mzxml'), re.IGNORECASE)
    xset_names = run_r('names', xseto)
    peakml_files = [
        os.path.join(polarity_dir, os.path.basename(regex.sub('peakml', name)))
        for name in xset_names
    ]

    # enumerate instead of range(len(...)): we need both the position into the
    # R list xseto and the matching output path
    for i, outfile in enumerate(peakml_files):
        logger.debug('Now creating %s', outfile)
        run_r('PeakML.xcms.write.SingleInstance', xseto[i],
              outputfile=outfile,
              ionisation=ionisation,
              addscans=add_scans,
              writeRejected=write_rejected,
              ApodisationFilter=apodisation_filter,
              ppm=ppm)
def combine_peaksets(self, in_files, out_file, label, ppm, rtwindow, combine_type):
    """Merge the given peakml files into one peakset via Pimp.combineSingle.

    NOTE(review): the R side receives a single comma-separated string wrapped
    in a length-1 StrVector, not one vector element per file — presumably
    Pimp.combineSingle splits it itself; confirm on the R side.
    """
    joined = ','.join(in_files)
    r_files = robjects.StrVector([joined])
    run_r('Pimp.combineSingle', r_files, out_file, label, ppm, rtwindow,
          combine_type)
def rsd_filter(self, in_peaksets, rsd):
    """Apply Pimp.rsdSingle to each peakset file.

    For every input 'x.ext' this writes 'x_rsd.ext' (kept peaks) and
    'x_rsdrej.ext' (rejected peaks) and returns the absolute paths of the
    kept-peak files, in input order.
    """
    kept_files = []
    for peakset in in_peaksets:
        stem, ext = os.path.splitext(peakset)
        kept = stem + '_rsd' + ext
        rejected = stem + '_rsdrej' + ext
        run_r('Pimp.rsdSingle', peakset, kept, rejected, rsd)
        kept_files.append(os.path.abspath(kept))
    return kept_files
def gap_filling(self, in_file, peakml_params, mzmatch_outputs):
    """Run Pimp.gapFilling on *in_file* and return the gap-filled file's path."""
    gapfilled_file = os.path.abspath(
        self.get_value(mzmatch_outputs, 'final.combined.gapfilled.file')[0])
    ionisation = self.get_value(peakml_params, 'ionisation')[0]
    ppm = self.get_value(peakml_params, 'ppm')[0]
    rtwindow = self.get_value(peakml_params, 'rtwin')[0]
    run_r('Pimp.gapFilling', in_file, gapfilled_file, ionisation, ppm, rtwindow)
    return gapfilled_file
def connect_to_rpy2(self):
    """Initialise the rpy2 bridge: R lib paths, JVM heap options, PiMP package
    and the pandas<->R dataframe conversion."""
    banner = '******************************************'
    logger.info(banner)
    logger.info('Setup rpy2 connection')
    logger.info(banner)
    run_r('.libPaths', self.get_env_packrat_lib_path())
    base = importr('base')
    # give the JVM (rJava) an 8 GB max heap
    base.options(**{'java.parameters': ''.join(['-Xmx', str(1024 * 8), 'm'])})
    importr('PiMP')
    # activate the conversion from pandas dataframe to r
    pandas2ri.activate()
def rt_correct(self, xset, method):
    """Retention-time correct *xset* with the chosen method, then split the
    aligned set back into per-file xcmsSet objects.

    Unknown methods skip alignment and use *xset* unchanged.
    """
    # per-method extra keyword arguments for R's retcor()
    retcor_extras = {
        'obiwarp': {'profStep': 0.01},
        'loess': {'family': 'symmetric'},
    }
    if method in retcor_extras:
        aligned = run_r('retcor', xset, method=method, **retcor_extras[method])
    else:
        aligned = xset
    paths = run_r('filepaths', aligned)
    return run_r('split', aligned, paths)
def identify(self, polarity, in_file, databases, non_empty, mzmatch_params, mzmatch_outputs):
    """Run Pimp.identify.metabolites over *in_file* and return the raw results."""
    # build an R named list mapping each group label to its member files
    groups = robjects.ListVector({
        group_label: robjects.StrVector(files)
        for group_label, _index, _description, files, _abspath in non_empty
    })

    # turns out that 'stds.xml.db' always has a value, e.g.
    # /home/pimp/media/projects/10/analysis_38/stds_db.xml
    # regardless of whether the file exists or not.
    # this behaviour is different from the old pipeline? So here we set it to
    # R NULL if the file doesn't exist
    stds_xml_file = os.path.abspath(
        self.get_value(mzmatch_outputs, 'stds.xml.db')[0])
    if not os.path.isfile(stds_xml_file):
        logger.info('%s is not found, setting stds.xml.db to NULL',
                    stds_xml_file)
        self.set_value(mzmatch_outputs, 'stds.xml.db', robjects.r("NULL"))

    # 'mzmatch.outputs' / 'mzmatch.params' contain dots, so they must be
    # passed through a dict rather than as literal keyword arguments
    return run_r('Pimp.identify.metabolites',
                 **{'in_file': in_file,
                    'databases': databases,
                    'groups': groups,
                    'mzmatch.outputs': mzmatch_outputs,
                    'mzmatch.params': mzmatch_params,
                    'polarity': polarity})
def related_peaks(self, in_file, mzmatch_params, mzmatch_outputs):
    """Run Pimp.relatedPeaks and return (related_file, basepeak_file) abs paths."""
    related_file = os.path.abspath(
        self.get_value(mzmatch_outputs, 'final.combined.related.file')[0])
    basepeak_file = os.path.abspath(
        self.get_value(mzmatch_outputs, 'final.combined.basepeaks.file')[0])
    ppm = self.get_value(mzmatch_params, 'ppm')[0]
    rtwindow = self.get_value(mzmatch_params, 'rtwindow')[0]
    run_r('Pimp.relatedPeaks', in_file, related_file, basepeak_file, ppm,
          rtwindow)
    return related_file, basepeak_file
def process_raw_data(self, polarity, xcms_params, mzmatch_params, peakml_params,
                     mzmatch_outputs, mzmatch_filters, n_slaves):
    """Run the full per-polarity pipeline: peak detection & RT correction,
    peakset grouping and filtering, final combine/filter, gap filling,
    related-peak annotation and finally identification.

    Returns the raw identification results from Pimp.identify.metabolites.
    """
    formatted_mzmatch_outputs = run_r('Pimp.getFormattedMzmatchOutputs',
                                      self.analysis.id, polarity,
                                      mzmatch_outputs)
    polarity_dir, combined_dir = self.create_input_directories(
        polarity, formatted_mzmatch_outputs)

    divider = '------------------------------------------'
    logger.info(divider)
    logger.info('%s %s %s', polarity, polarity_dir, combined_dir)
    logger.info(divider)

    # peak detection and rt correction
    logger.debug('Input for peak detection and RT correction %s',
                 self.metadata.files[polarity])
    self.create_peakml(polarity, polarity_dir, xcms_params, mzmatch_params,
                       peakml_params, mzmatch_outputs, n_slaves)

    # separate samples into peaksets for grouping
    non_empty = self.generate_combinations(polarity, combined_dir)

    # perform the grouping of peaks across samples into peaksets
    combined_files = self.generate_peaksets(polarity_dir, combined_dir,
                                            non_empty, mzmatch_params)
    logger.debug('combined groups = %s', combined_files)

    # filter each peakset
    filtered_files = self.filter_peaksets(combined_files, mzmatch_params)
    logger.debug('filtered groups = %s', filtered_files)

    # combine all the peaksets into a single peakml file and filter it
    final_file = self.combine_final(filtered_files, mzmatch_params,
                                    formatted_mzmatch_outputs)
    logger.debug('combined final = %s', final_file)
    final_file = self.filter_final(final_file, mzmatch_filters, mzmatch_params,
                                   formatted_mzmatch_outputs)
    logger.debug('filter final = %s', final_file)

    # do gap filling
    final_file = self.gap_filling(final_file, peakml_params,
                                  formatted_mzmatch_outputs)
    logger.debug('gap-filled = %s', final_file)

    # do related peaks
    final_file, basepeak_file = self.related_peaks(
        final_file, mzmatch_params, formatted_mzmatch_outputs)
    logger.debug('related peaks = %s %s', final_file, basepeak_file)

    # do identification
    databases = self.r_dbs
    logger.debug('identification databases %s', databases)
    logger.debug('identification groups %s', non_empty)
    return self.identify(polarity, final_file, databases, non_empty,
                         mzmatch_params, formatted_mzmatch_outputs)
def run_stats(self, raw_data_dict, mzmatch_outputs, mzmatch_params):
    """Run the R-side statistics (Pimp.runStats.save) over both polarities.

    Expects *raw_data_dict* to hold the identification results under the
    'positive' and 'negative' keys.
    """
    analysis_id = self.analysis.id
    groups, _ = self.metadata.get_groups()
    # We need to make sure we close the database connection when we are idle
    # for long periods
    connection.close()
    df, metadata = convert_to_dataframe(groups)
    r_factors = robjects.StrVector([group.label for group in groups])
    r_contrasts = pandas2ri.py2ri(self.metadata.contrasts)
    save_fixtures = True
    run_r('Pimp.runStats.save', raw_data_dict['positive'],
          raw_data_dict['negative'], analysis_id, r_factors, metadata,
          r_contrasts, self.r_databases, self.r_dbs, mzmatch_outputs,
          mzmatch_params, save_fixtures, self.working_dir)
def filter_final(self, in_file, mzmatch_filters, mzmatch_params, mzmatch_outputs):
    """Optionally noise-filter the combined peakml file, then always apply the
    simple (ppm / intensity / detections) filter.

    Returns the absolute path of the simple-filtered output file.
    """
    if self.get_value(mzmatch_filters, 'noise')[0]:
        noise = self.get_value(mzmatch_params, 'noise')[0]
        noise_filtered = os.path.abspath(
            self.get_value(mzmatch_outputs,
                           'final.combined.noise.filtered.file')[0])
        run_r('Pimp.noiseFilter', in_file, noise_filtered, noise)
        # the noise-filtered file feeds the simple filter below
        in_file = noise_filtered

    ppm = self.get_value(mzmatch_params, 'ppm')[0]
    min_intensity = self.get_value(mzmatch_params, 'minintensity')[0]
    min_detections = self.get_value(mzmatch_params, 'mindetections')[0]
    simple_filtered = os.path.abspath(
        self.get_value(mzmatch_outputs,
                       'final.combined.simple.filtered.file')[0])
    run_r('Pimp.simpleFilter', in_file, simple_filtered, ppm, min_intensity,
          min_detections)
    return simple_filtered
def setup(self):
    """Mark the analysis as processing, prepare the analysis directory and
    generate the standards XML on the R side."""
    self.analysis.status = 'Processing'
    self.analysis.save(update_fields=['status'])
    self.working_dir = get_pimp_wd(self.project.id)
    self.validate_input()
    params = self.get_analysis_params(self.analysis.id)
    self.pimp_params = self.create_analysis_dir(self.analysis.id, params)
    # generate std xml
    stds = robjects.StrVector(self.metadata.stds)
    databases = robjects.StrVector(self.metadata.databases)
    output = run_r('Pimp.generateStdXml', stds, databases, self.pimp_params,
                   self.working_dir)
    # pull the named elements out of the returned R list by position
    self.r_dbs = output[output.names.index('DBS')]
    self.r_databases = output[output.names.index('databases')]
def peak_detection(self, files, xcms_params, n_slaves):
    """Run XCMS peak detection (xcmsSet) with parameters from *xcms_params*."""
    # collect the parameters straight from the params list
    param_keys = ('method', 'ppm', 'peakwidth', 'snthresh', 'prefilter',
                  'integrate', 'mzdiff', 'verbose.columns', 'fitgauss')
    args = {key: self.get_value(xcms_params, key) for key in param_keys}
    args['files'] = files
    args['nSlaves'] = n_slaves
    # call centwave
    return run_r('xcmsSet', **args)
def create_analysis_dir(self, analysis_id, pimp_params):
    """Create the per-analysis output directory structure on the R side.

    pimp_params$mzmatch.outputs will be updated inside to point to the right
    analysis folder; the updated params object is returned.
    """
    return run_r('Pimp.createAnalysisDir', analysis_id, pimp_params,
                 self.working_dir)
def get_analysis_params(self, analysis_id):
    """Fetch the analysis parameter list from R via Pimp.getAnalysisParams."""
    return run_r('Pimp.getAnalysisParams', analysis_id)