def copy_input_data(self, input_data_path, ds_config_path):
    """ Copy imzML/ibd/config files from the input path to the dataset work directory

    Args
    ----
    input_data_path : str
        Path to input files
    ds_config_path : str
        Path to the dataset config file
    """
    if not self.local_dir.exists(self.local_dir.imzml_path):
        logger.info('Copying data from %s to %s', input_data_path, self.local_dir.ds_path)

        if input_data_path.startswith('s3a://'):
            cmd_check('mkdir -p {}', self.local_dir.ds_path)
            bucket_name, inp_path = split_s3_path(input_data_path)

            bucket = self.s3.Bucket(bucket_name)
            for obj in bucket.objects.filter(Prefix=inp_path):
                if not obj.key.endswith('/'):
                    path = join(self.local_dir.ds_path, obj.key.split('/')[-1])
                    self.s3transfer.download_file(bucket_name, obj.key, path)
        else:
            self.local_dir.copy(input_data_path, self.local_dir.ds_path)

    if ds_config_path:
        self.local_dir.copy(ds_config_path, self.local_dir.ds_config_path, is_file=True)
def estimate_fdr(self, msm_df):
    logger.info('Estimating FDR...')
    target_fdr_df_list = []
    for ta in self.target_adducts:
        target_msm = msm_df.loc(axis=0)[:, ta]

        msm_fdr_list = []
        for i in range(self.decoy_sample_size):
            sf_da_list = map(tuple, self.td_df[self.td_df.ta == ta][['sf_id', 'da']][i::self.decoy_sample_size].values)
            decoy_msm = msm_df.loc[sf_da_list]
            msm_fdr = self._msm_fdr_map(target_msm, decoy_msm)
            msm_fdr_list.append(msm_fdr)

        msm_fdr_avg = pd.Series(pd.concat(msm_fdr_list, axis=1).median(axis=1), name='fdr')
        target_fdr = self._digitize_fdr(target_msm.join(msm_fdr_avg, on='msm'))
        target_fdr_df_list.append(target_fdr.drop('msm', axis=1))

    return pd.concat(target_fdr_df_list, axis=0)
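# A minimal sketch of the target-decoy estimator that _msm_fdr_map is assumed
# to implement (the real method is not shown in this section): for each target
# MSM score, FDR ~= (# decoy scores >= score) / (# target scores >= score).
# All names below are hypothetical.
import numpy as np
import pandas as pd

def msm_fdr_map_sketch(target_msm, decoy_msm):
    target = np.asarray(target_msm, dtype=float)
    decoy = np.asarray(decoy_msm, dtype=float)
    fdr_by_msm = {}
    for score in target:
        n_target = (target >= score).sum()  # targets at or above this score
        n_decoy = (decoy >= score).sum()    # decoys at or above this score
        fdr_by_msm[score] = min(1.0, float(n_decoy) / n_target)
    return pd.Series(fdr_by_msm, name='fdr')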
def __init__(self, job_id, db_id, ds_config, db):
    self.job_id = job_id
    self.db_id = db_id
    self.ppm = ds_config['image_generation']['ppm']

    iso_gen_conf = ds_config['isotope_generation']
    charge = '{}{}'.format(iso_gen_conf['charge']['polarity'], iso_gen_conf['charge']['n_charges'])

    target_sf_peaks_rs = db.select(THEOR_PEAKS_TARGET_ADD_SEL,
                                   self.db_id, iso_gen_conf['adducts'],
                                   iso_gen_conf['isocalc_sigma'], iso_gen_conf['isocalc_pts_per_mz'],
                                   charge)
    assert target_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (target)'

    decoy_sf_peaks_rs = db.select(THEOR_PEAKS_DECOY_ADD_SEL,
                                  self.job_id, self.db_id,
                                  iso_gen_conf['isocalc_sigma'], iso_gen_conf['isocalc_pts_per_mz'],
                                  charge)
    assert decoy_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (decoy)'

    sf_peak_rs = target_sf_peaks_rs + decoy_sf_peaks_rs
    self.sf_ids, self.adducts, self.sf_theor_peaks, self.sf_theor_peak_ints = zip(*sf_peak_rs)
    self.check_formula_uniqueness(self.sf_ids, self.adducts)

    logger.info('Loaded %s sum formulas from the DB', len(self.sf_ids))
def search(self):
    logger.info('Running molecule search')
    sf_images = compute_sf_images(self.sc, self.ds, self.formulas.get_sf_peak_df(),
                                  self.ds_config['image_generation']['ppm'])
    all_sf_metrics_df = self.calc_metrics(sf_images)
    sf_metrics_fdr_df = self.estimate_fdr(all_sf_metrics_df)
    sf_metrics_fdr_df = self.filter_sf_metrics(sf_metrics_fdr_df)
    return sf_metrics_fdr_df, self.filter_sf_images(sf_images, sf_metrics_fdr_df)
def index_ds(self, db, ds_name, db_name):
    annotations = db.select(RESULTS_TABLE_SQL, ds_name, db_name)

    logger.info('Deleting documents from the index: {}-{}'.format(ds_name, db_name))
    self._delete(annotations)

    logger.info('Indexing documents: {}-{}'.format(ds_name, db_name))
    self._index(annotations)
def clean(self):
    try:
        bucket_obj = self.s3.Bucket(self.bucket)
        for obj in bucket_obj.objects.filter(Prefix=self.ds_path):
            self.s3.Object(self.bucket, obj.key).delete()
        logger.info('Successfully deleted interim data')
    except CalledProcessError as e:
        logger.warning('Error deleting interim data files: %s', e.message)
def _valid_sf_adduct(cls, sf, adduct):
    if sf is None or adduct is None or sf == 'None' or adduct == 'None':
        logger.warning('Invalid sum formula or adduct: sf=%s, adduct=%s', sf, adduct)
        return False
    if '-' in adduct and adduct.strip('-') not in cls._sf_elements(sf):
        logger.info('No negative adduct element in the sum formula: sf=%s, adduct=%s', sf, adduct)
        return False
    return True
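# Illustrative calls (hypothetical formulas), assuming _sf_elements returns the
# element symbols present in a sum formula:
#   _valid_sf_adduct('C8H10N4O2', '+H')  -> True   (no negative adduct check needed)
#   _valid_sf_adduct('C8H10N4O2', '-H')  -> True   ('H' is present in the formula)
#   _valid_sf_adduct('C8H10N4O2', '-Cl') -> False  ('Cl' is not in the formula)
#   _valid_sf_adduct(None, '+H')         -> False  (missing sum formula)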
def exists(self, path):
    try:
        self.s3.Object(*split_s3_path(path)).load()
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            return False
        else:
            raise e
    else:
        logger.info('Path s3://%s/%s already exists', self.bucket, path)
        return True
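# split_s3_path is a project helper used here and in copy_input_data above but
# not shown in this section; a minimal sketch of what it is assumed to do:
def split_s3_path_sketch(path):
    """Split 's3a://bucket/some/key' into ('bucket', 'some/key')."""
    stripped = path.split('://', 1)[-1]       # drop the scheme prefix if present
    bucket, _, key = stripped.partition('/')  # first path component is the bucket
    return bucket, key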
def _import_theor_peaks_to_db(self, peak_lines):
    logger.info('Saving new peaks to the DB')
    if not exists(self.theor_peaks_tmp_dir):
        makedirs(self.theor_peaks_tmp_dir)

    peak_lines_path = join(self.theor_peaks_tmp_dir, 'peak_lines.csv')
    with open(peak_lines_path, 'w') as f:
        f.write('\n'.join(peak_lines))

    with open(peak_lines_path) as peaks_file:
        self.db.copy(peaks_file, 'theor_peaks')
def store_job_meta(self):
    """ Store search job metadata in the database """
    logger.info('Storing job metadata')
    self.ds_id = int(self.db.select_one(DS_ID_SEL, self.ds_name)[0])
    self.job_id = self.ds_id
    self.db.alter(DEL_JOB_SQL, self.job_id)

    rows = [(self.job_id, self.sf_db_id, self.ds_id,
             datetime.now().strftime('%Y-%m-%d %H:%M:%S'))]
    self.db.insert(JOB_INS, rows)

    rows = [(self.job_id, adduct) for adduct in self.ds_config['isotope_generation']['adducts']]
    self.db.insert(ADDUCT_INS, rows)
def get_spectra(self):
    """
    Returns
    -------
    : pyspark.rdd.RDD
        Spark RDD with spectra. One spectrum per RDD entry.
    """
    txt_to_spectrum = self.txt_to_spectrum_non_cum
    logger.info('Converting txt to spectrum rdd from %s', self.wd_manager.txt_path)
    return self.sc.textFile(self.wd_manager.txt_path, minPartitions=8).map(txt_to_spectrum)
def apply_database_filters(self, formula_list):
    """ Filter the formula list according to the database filters in the dataset config

    Args
    ----
    formula_list : list
        List of (id, sum formula) pairs to search through

    Returns
    -------
    : list
        Filtered list of (id, sum formula) pairs
    """
    if 'organic' in map(lambda s: s.lower(), self.ds_config['database'].get('filters', [])):
        logger.info('Organic sum formula filter has been applied')
        return filter(lambda (_, sf): 'C' in self._sf_elements(sf), formula_list)
    return formula_list
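# Illustrative behaviour with the 'organic' filter enabled (hypothetical
# ids and formulas):
#   input : [(1, 'C6H12O6'), (2, 'H2O'), (3, 'C2H5OH')]
#   output: [(1, 'C6H12O6'), (3, 'C2H5OH')]  # 'H2O' contains no carbon and is dropped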
def run(self):
    """ Starts peaks generation. Checks all formula peaks saved in the database
    and generates peaks only for new ones
    """
    logger.info('Running theoretical peaks generation')

    db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
    formula_list = self.apply_database_filters(self.db.select(AGG_FORMULA_SEL, db_id))
    stored_sf_adduct = self.db.select(SF_ADDUCT_SEL, db_id,
                                      self.isocalc_wrapper.sigma,
                                      self.isocalc_wrapper.charge,
                                      self.isocalc_wrapper.pts_per_mz)

    sf_adduct_cand = self.find_sf_adduct_cand(formula_list, set(stored_sf_adduct))
    logger.info('%d saved (sf, adduct)s, %d not saved (sf, adduct)s',
                len(stored_sf_adduct), len(sf_adduct_cand))

    if sf_adduct_cand:
        self.generate_theor_peaks(sf_adduct_cand)
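# find_sf_adduct_cand is not shown in this section; a plausible sketch of the
# candidate selection it is assumed to perform, given a list of (id, sf) pairs,
# a set of already stored (sf, adduct) tuples, and the configured adducts
# (all names hypothetical):
def find_sf_adduct_cand_sketch(formula_list, stored_sf_adduct, adducts):
    return [(sf_id, sf, adduct)
            for sf_id, sf in formula_list
            for adduct in adducts
            if (sf, adduct) not in stored_sf_adduct]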
def create_index(self, name='sm'):
    body = {
        'settings': {
            'index': {
                'max_result_window': 2147483647,
                'analysis': {
                    'analyzer': {
                        'analyzer_keyword': {
                            'tokenizer': 'keyword',
                            'filter': 'lowercase'
                        }
                    }
                }
            }
        },
        'mappings': {
            'annotation': {
                'properties': {
                    'db_name': {'type': 'string', 'index': 'not_analyzed'},
                    'ds_name': {'type': 'string', 'index': 'not_analyzed'},
                    'sf': {'type': 'string', 'index': 'not_analyzed'},
                    'comp_names': {'type': 'string', 'analyzer': 'analyzer_keyword'},
                    'comp_ids': {'type': 'string', 'index': 'not_analyzed'},
                    'chaos': {'type': 'float', 'index': 'not_analyzed'},
                    'image_corr': {'type': 'float', 'index': 'not_analyzed'},
                    'pattern_match': {'type': 'float', 'index': 'not_analyzed'},
                    'msm': {'type': 'float', 'index': 'not_analyzed'},
                    'adduct': {'type': 'string', 'index': 'not_analyzed'},
                    'fdr': {'type': 'float', 'index': 'not_analyzed'},
                    'mz': {'type': 'string', 'index': 'not_analyzed'}
                }
            }
        }
    }
    if not self.ind_client.exists(name):
        out = self.ind_client.create(index=name, body=body)
        logger.info('Index {} created\n{}'.format(name, out))
    else:
        logger.info('Index {} already exists'.format(name))
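# Illustrative shape of an 'annotation' document under this mapping (field
# names come from the mapping above; values are made up):
# {"db_name": "HMDB", "ds_name": "example_ds", "sf": "C8H10N4O2",
#  "comp_names": "caffeine", "comp_ids": "HMDB01847", "chaos": 0.99,
#  "image_corr": 0.95, "pattern_match": 0.97, "msm": 0.91,
#  "adduct": "+H", "fdr": 0.05, "mz": "195.0877"}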
def __init__(self, job_id, db_id, ds_config, db):
    self.job_id = job_id
    self.db_id = db_id
    self.ppm = ds_config['image_generation']['ppm']

    iso_gen_conf = ds_config['isotope_generation']
    charge = '{}{}'.format(iso_gen_conf['charge']['polarity'], iso_gen_conf['charge']['n_charges'])

    target_sf_peaks_rs = db.select(THEOR_PEAKS_TARGET_ADD_SEL,
                                   self.db_id, iso_gen_conf['adducts'],
                                   iso_gen_conf['isocalc_sigma'], iso_gen_conf['isocalc_pts_per_mz'],
                                   charge)
    assert target_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (target)'

    decoy_sf_peaks_rs = db.select(THEOR_PEAKS_DECOY_ADD_SEL,
                                  self.job_id, self.db_id,
                                  iso_gen_conf['isocalc_sigma'], iso_gen_conf['isocalc_pts_per_mz'],
                                  charge)
    assert decoy_sf_peaks_rs, 'No formulas matching the criteria were found in theor_peaks! (decoy)'

    sf_peak_rs = target_sf_peaks_rs + decoy_sf_peaks_rs
    self.sf_df = (pd.DataFrame(sf_peak_rs, columns=['sf_id', 'adduct', 'centr_mzs', 'centr_ints'])
                  .sort_values(['sf_id', 'adduct']))
    self.check_formula_uniqueness(self.sf_df)

    logger.info('Loaded %s (sum formula, adduct) combinations from the DB', self.sf_df.shape[0])
def _configure_spark(self):
    logger.info('Configuring Spark')
    sconf = SparkConf()
    for prop, value in self.sm_config['spark'].iteritems():
        if prop.startswith('spark.'):
            sconf.set(prop, value)

    if 'aws' in self.sm_config:
        sconf.set("spark.hadoop.fs.s3a.access.key", self.sm_config['aws']['aws_access_key_id'])
        sconf.set("spark.hadoop.fs.s3a.secret.key", self.sm_config['aws']['aws_secret_access_key'])
        sconf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    # sconf.set("spark.python.profile", "true")
    self.sc = SparkContext(master=self.sm_config['spark']['master'], conf=sconf, appName='SM engine')
    if not self.sm_config['spark']['master'].startswith('local'):
        self.sc.addPyFile(join(local_path(proj_root()), 'sm.zip'))
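# A hypothetical fragment of the SM config consumed above; the key names are
# inferred from this method ('master', keys starting with 'spark.', and the
# optional 'aws' section), the values are illustrative only:
# {
#     "spark": {
#         "master": "local[*]",
#         "spark.executor.memory": "4g",
#         "spark.driver.maxResultSize": "2g"
#     },
#     "aws": {
#         "aws_access_key_id": "...",
#         "aws_secret_access_key": "..."
#     }
# }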
def generate_theor_peaks(self, sf_adduct_cand):
    """ Generate theoretical isotope patterns for the candidates and save them
    to the database in chunks

    Args
    ----
    sf_adduct_cand : list
        List of (formula id, formula, adduct) triples which don't have
        theoretical patterns saved in the database yet
    """
    logger.info('Generating missing peaks')
    formatted_iso_peaks = self.isocalc_wrapper.formatted_iso_peaks
    db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
    n = 10000
    for i in xrange(0, len(sf_adduct_cand), n):
        sf_adduct_cand_rdd = self.sc.parallelize(sf_adduct_cand[i:i + n], numSlices=128)
        peak_lines = (sf_adduct_cand_rdd
                      .flatMap(lambda (sf_id, sf, adduct): formatted_iso_peaks(db_id, sf_id, sf, adduct))
                      .collect())
        self._import_theor_peaks_to_db(peak_lines)
def save_ds_meta(self):
    """ Save dataset metadata (name, path, image bounds, coordinates) to the database """
    self.db.alter(DS_DEL, self.name)

    img_bounds = json.dumps({'x': {'min': self.min_x, 'max': self.max_x},
                             'y': {'min': self.min_y, 'max': self.max_y}})
    ds_config_json = json.dumps(self.ds_config)

    owner_rs = self.db.select_one(CLIENT_ID_SEL, self.owner_email)
    if self.owner_email and not owner_rs:
        raise Exception("Couldn't find a user with email {}".format(self.owner_email))
    owner_id = owner_rs[0] if owner_rs else None

    ds_row = [(self.name, owner_id, self.input_path, img_bounds, ds_config_json)]
    self.db.insert(DS_INSERT, ds_row)
    ds_id = self.db.select(DS_ID_SELECT, self.name)[0]
    logger.info('Inserted into the dataset table: %s, %s', ds_id, self.name)

    xs, ys = map(list, zip(*self.coords))
    self.db.insert(COORD_INSERT, [(ds_id, xs, ys)])
    logger.info('Inserted into the coordinates table')
def convert(self, preprocess=False, print_progress=True):
    """ Convert MS imaging data provided by the given parser to a text-based format.
    Optionally writes the coordinates into a coordinate file.

    Args
    ----
    preprocess : bool
        Apply filter and centroid detection to all spectra before writing (rarely useful)
    print_progress : bool
        Whether or not to print progress information to stdout
    """
    logger.info("ImzML -> Txt conversion...")
    self.preprocess = preprocess

    if not exists(self.txt_path):
        self.txt_file = open(self.txt_path, 'w')
        self.coord_file = open(self.coord_path, 'w') if self.coord_path else None

        self.parser = ImzMLParser(self.imzml_path)
        n_pixels = len(self.parser.coordinates)
        track_progress = get_track_progress(n_pixels, max(n_pixels / 100, 100), print_progress)

        for i, coord in enumerate(self.parser.coordinates):
            x, y = coord[:2]
            self._uniq_coord(x, y)
            self.parse_save_spectrum(i, x, y)
            track_progress(i)

        self.txt_file.close()
        if self.coord_file:
            self.coord_file.close()

        logger.info("Conversion finished successfully")
    else:
        logger.info('File %s already exists', self.txt_path)
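# get_track_progress is a helper not shown here; a minimal sketch of the
# assumed behaviour, with the signature taken from the call above (names
# hypothetical):
def get_track_progress_sketch(n_total, step, enabled):
    """Return a callback that reports progress every `step` spectra."""
    def track(i):
        if enabled and i % step == 0:
            logger.info('Processed %d of %d spectra', i, n_total)
    return track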
def exists(self, path):
    if exists(split_local_path(path)):
        logger.info('Path %s already exists', path)
        return True
    else:
        return False
def run(self, input_path, ds_config_path, clean=False):
    """ Entry point of the engine. Molecule search is completed in several steps:
     * Copying input data to the engine work dir
     * Converting input data (imzML+ibd) to plain text format. One line - one spectrum data
     * Generating and saving to the database theoretical peaks for all formulas from the molecule database
     * Molecule search. The most compute intensive part. Spark is used to run it in a distributed manner.
     * Saving results (isotope images and their quality metrics for each putative molecule) to the database

    Args
    ----
    input_path : string
        Path to the dataset folder with .imzML and .ibd files
    ds_config_path : string
        Path to the dataset config file
    clean : bool
        Clean all interim data files before starting molecule search
    """
    try:
        self.wd_manager = WorkDirManager(self.ds_name)
        if clean:
            self.wd_manager.clean()

        self.wd_manager.copy_input_data(input_path, ds_config_path)
        self._read_ds_config()
        logger.info('Dataset config:\n%s', pformat(self.ds_config))

        self._configure_spark()
        self._init_db()

        if not self.wd_manager.exists(self.wd_manager.txt_path):
            imzml_converter = ImzmlTxtConverter(self.ds_name,
                                                self.wd_manager.local_dir.imzml_path,
                                                self.wd_manager.local_dir.txt_path,
                                                self.wd_manager.local_dir.coord_path)
            imzml_converter.convert()

            if not self.wd_manager.local_fs_only:
                self.wd_manager.upload_to_remote()

        self.ds = Dataset(self.sc, self.ds_name, self.client_email, input_path,
                          self.ds_config, self.wd_manager, self.db)
        self.ds.save_ds_meta()

        self.store_job_meta()

        theor_peaks_gen = TheorPeaksGenerator(self.sc, self.sm_config, self.ds_config)
        theor_peaks_gen.run()

        target_adducts = self.ds_config['isotope_generation']['adducts']
        self.fdr = FDR(self.job_id, self.sf_db_id,
                       decoy_sample_size=20, target_adducts=target_adducts, db=self.db)
        self.fdr.decoy_adduct_selection()

        self.formulas = FormulasSegm(self.job_id, self.sf_db_id, self.ds_config, self.db)
        # search_alg = MSMBasicSearch(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
        search_alg = MSMExtraFeats(self.sc, self.ds, self.formulas, self.fdr, self.ds_config)
        sf_metrics_df, sf_iso_images = search_alg.search()

        search_results = SearchResults(self.sf_db_id, self.ds_id, self.job_id, self.ds_name,
                                       self.formulas.get_sf_adduct_peaksn(),
                                       self.db, self.sm_config, self.ds_config)
        search_results.sf_metrics_df = sf_metrics_df
        search_results.sf_iso_images = sf_iso_images
        search_results.metrics = search_alg.metrics
        search_results.nrows, search_results.ncols = self.ds.get_dims()
        search_results.store()

        es = ESExporter(self.sm_config)
        es.index_ds(self.db, self.ds_name, self.ds_config['database']['name'])
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        logger.error('\n'.join(traceback.format_exception(exc_type, exc_value, exc_traceback)))
    finally:
        if self.sc:
            # self.sc.show_profiles()
            self.sc.stop()
        if self.db:
            self.db.close()
def copy(self, local, remote):
    logger.info('Copying DS text files to S3...')
    self.s3transfer.upload_file(local, *split_s3_path(remote))
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Exporting search results into a csv file')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('csv_path', type=str, help='Path for the csv file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
    args = parser.parse_args()

    SMConfig.set_path(args.sm_config_path)

    db = DB(SMConfig.get_conf()['db'])
    ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0]
    isotope_gen_config = ds_config['isotope_generation']
    charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                           isotope_gen_config['charge']['n_charges'])
    export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name,
                          isotope_gen_config['isocalc_sigma'], charge,
                          isotope_gen_config['isocalc_pts_per_mz'])

    header = ('\t'.join(['formula_db', 'db_ids', 'sf_name', 'sf', 'adduct']) + '\t' +
              '\t'.join(metrics) + '\t' +
              '\t'.join(['fdr', 'isocalc_sigma', 'isocalc_charge',
                         'isocalc_pts_per_mz', 'first_peak_mz']) + '\n')

    with open(args.csv_path, 'w') as f:
        f.write(header)
        f.writelines(['\t'.join(map(str, row)) + '\n' for row in export_rs])

    logger.info('Exported all search results for "%s" dataset into "%s" file',
                args.ds_name, args.csv_path)
parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json'))
args = parser.parse_args()

SMConfig.set_path(args.sm_config_path)

db = DB(SMConfig.get_conf()['db'])
ds_config, img_bounds = db.select_one(DS_CONFIG_SEL, args.ds_name)
nrows, ncols = get_img_dims(img_bounds)
isotope_gen_config = ds_config['isotope_generation']
charge = '{}{}'.format(isotope_gen_config['charge']['polarity'],
                       isotope_gen_config['charge']['n_charges'])
export_rs = db.select(EXPORT_SEL, args.ds_name, args.sf)

export_df = pd.DataFrame(export_rs, columns=['sf', 'adduct', 'peak', 'pxl_inds', 'ints'])
export_df['img_dims'] = [(img_bounds['y']['min'], img_bounds['y']['max'],
                          img_bounds['x']['min'], img_bounds['x']['max'])] * len(export_df)
# export_df['img'] = export_df.apply(lambda r: build_matrix(np.array(r['pxl_inds']),
#                                                           np.array(r['ints']), nrows, ncols), axis=1)
# export_df.drop(['pxl_inds', 'ints'], axis=1, inplace=True)
# export_df.to_csv(args.csv_path, index=False)
# cPickle.dump(export_df, open(args.pkl_path, 'wb'))
export_df.to_csv(args.pkl_path, index=False)

logger.info('Exported all images for "%s" sum formula in "%s" dataset into "%s" file',
            args.sf, args.ds_name, args.pkl_path)
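# build_matrix is referenced in the commented-out lines above but not shown; a
# plausible sketch of reconstructing a dense ion image from the stored sparse
# representation (pixel indices plus intensities):
import numpy as np

def build_matrix_sketch(pxl_inds, ints, nrows, ncols):
    img = np.zeros(nrows * ncols)
    img[pxl_inds] = ints  # scatter the non-zero intensities into a flat image
    return img.reshape(nrows, ncols)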
def _init_db(self):
    logger.info('Connecting to the DB')
    self.db = DB(self.sm_config['db'])
    self.sf_db_id = self.db.select_one(DB_ID_SEL, self.ds_config['database']['name'])[0]
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='SM process dataset at a remote spark location.')
    parser.add_argument('ds_name', type=str, help='Dataset name')
    parser.add_argument('input_path', type=str, help='Path to a dataset location')
    parser.add_argument('ds_config_path', type=str, help='Path to a dataset config file')
    parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path')
    parser.add_argument('--no-clean', dest='no_clean', action='store_true',
                        help='do not clean interim files')

    start = time.time()

    args = parser.parse_args()
    SMConfig.set_path(args.sm_config_path)

    fileHandler = FileHandler(filename='logs/{}.log'.format(args.ds_name.replace('/', '_')))
    fileHandler.setLevel(DEBUG)
    fileHandler.setFormatter(Formatter(sm_log_formatters['SM']['format']))
    logger.addHandler(fileHandler)

    logger.debug('Using SM config:\n%s', pformat(SMConfig.get_conf()))

    logger.info("Processing...")
    job = SearchJob(None, args.ds_name)
    job.run(args.input_path, args.ds_config_path, clean=not args.no_clean)
    logger.info("All done!")

    time_spent = time.time() - start
    logger.info('Time spent: %d mins %d secs', *divmod(int(round(time_spent)), 60))
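# Example invocation (script name and file paths hypothetical; the argument
# order is taken from the parser definitions above):
#   python run_search.py my_ds /data/my_ds /data/my_ds/config.json --config conf/config.json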
def store(self):
    logger.info('Storing search results to the DB')
    self.clear_old_results()
    self.store_sf_img_metrics()
    self.store_sf_iso_images()
class SearchResults(object):
    """ Container for molecule search results

    Args
    ----
    sf_db_id : int
        Formula database id
    ds_id : int
        Dataset id
    job_id : int
        Search job id
    sf_metrics_df : pandas.Dataframe
    sf_iso_images : pyspark.RDD
        Result images of format ((formula_id, adduct), list of images)
    sf_adduct_peaksn : list
        List of triples (formula id, adduct, number of theoretical peaks)
    db : engine.db.DB
    sm_config : dict
    """
    def __init__(self, sf_db_id, ds_id, job_id, ds_name, sf_adduct_peaksn, db, sm_config, ds_config):
        self.sf_db_id = sf_db_id
        self.ds_id = ds_id
        self.job_id = job_id
        self.ds_name = ds_name
        self.db = db
        self.sm_config = sm_config
        self.ds_config = ds_config
        self.sf_adduct_peaksn = sf_adduct_peaksn
        self.sf_iso_images = None
        self.sf_metrics_df = None
        self.metrics = None
        self.ncols = None
        self.nrows = None

    def clear_old_results(self):
        """ Clear all previous search results for the dataset from the database """
        logger.info('Clearing old job results')
        self.db.alter(clear_iso_image_sql, self.job_id)
        self.db.alter(clear_iso_image_metrics_sql, self.job_id)

    @staticmethod
    def _metrics_table_row_gen(job_id, db_id, metr_df, sf_adduct_peaksn, metrics):
        for ind, r in metr_df.reset_index().iterrows():
            metr_json = json.dumps(OrderedDict([(m, float(r[m])) for m in metrics]))
            peaks_n = sf_adduct_peaksn[ind][2]
            yield (job_id, db_id, r.sf_id, r.adduct,
                   float(r.msm), float(r.fdr), metr_json, peaks_n)

    def store_sf_img_metrics(self):
        """ Store formula image metrics in the database """
        logger.info('Storing iso image metrics')
        rows = list(self._metrics_table_row_gen(self.job_id, self.sf_db_id, self.sf_metrics_df,
                                                self.sf_adduct_peaksn, self.metrics))
        self.db.insert(METRICS_INS, rows)

    def store_sf_iso_images(self):
        """ Store formula images in the database. The dataset image dimensions
        are taken from the nrows and ncols attributes.
        """
        job_id = self.job_id
        sf_db_id = self.sf_db_id
        db_config = self.sm_config['db']
        nrows = self.nrows
        ncols = self.ncols

        def iso_img_row_gen(((sf_id, adduct), img_list)):
            for peak_i, img_sparse in enumerate(img_list):
                img_ints = (np.zeros(int(nrows) * int(ncols)) if img_sparse is None
                            else img_sparse.toarray().flatten())
                pixel_inds = np.arange(img_ints.shape[0])
                img_ints_mask = img_ints > 0.001
                if img_ints_mask.sum() > 0:
                    yield (job_id, sf_db_id, sf_id, adduct, peak_i,
                           pixel_inds[img_ints_mask].tolist(),
                           img_ints[img_ints_mask].tolist(),
                           img_ints.min(), img_ints.max())

        def store_iso_img_rows(row_it):
            db = DB(db_config)
            try:
                rows = list(row_it)
                if rows:
                    db.insert(SF_ISO_IMGS_INS, rows)
            finally:
                db.close()

        logger.info('Storing iso images')
        # self.sf_iso_images.flatMap(iso_img_row_gen).coalesce(32).foreachPartition(store_iso_img_rows)
        self.sf_iso_images.flatMap(iso_img_row_gen).foreachPartition(store_iso_img_rows)
def delete_index(self, name='sm'):
    out = self.ind_client.delete(name)
    logger.info('Index {} deleted\n{}'.format(name, out))
"JOIN job j ON j.id = m.job_id " "JOIN dataset ds ON ds.id = j.ds_id " "JOIN theor_peaks tp ON tp.db_id = sf_db.id AND tp.sf_id = m.sf_id AND tp.adduct = m.adduct " "WHERE sf_db.name = %s AND ds.name = %s " "AND ROUND(sigma::numeric, 6) = %s AND charge = %s AND pts_per_mz = %s") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Exporting search results into a csv file') parser.add_argument('ds_name', type=str, help='Dataset name') parser.add_argument('csv_path', type=str, help='Path for the csv file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults(sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) ds_config = db.select_one(DS_CONFIG_SEL, args.ds_name)[0] isotope_gen_config = ds_config['isotope_generation'] charge = '{}{}'.format(isotope_gen_config['charge']['polarity'], isotope_gen_config['charge']['n_charges']) export_rs = db.select(EXPORT_SEL, ds_config['database']['name'], args.ds_name, isotope_gen_config['isocalc_sigma'], charge, isotope_gen_config['isocalc_pts_per_mz']) header = ','.join(['formula_db', 'ds_name', 'sf', 'adduct', 'chaos', 'img_corr', 'pat_match', 'isocalc_sigma', 'isocalc_charge', 'isocalc_pts_per_mz', 'first_peak_mz']) + '\n' with open(args.csv_path, 'w') as f: f.write(header) f.writelines([','.join(map(str, row)) + '\n' for row in export_rs]) logger.info('Exported all search results for "%s" dataset into "%s" file', args.ds_name, args.csv_path)
"SELECT f.sf, t.adduct, t.centr_mzs, t.centr_ints " "FROM public.agg_formula f, public.theor_peaks t " "WHERE t.sf_id = f.id AND f.db_id = 1 AND f.sf = %s AND t.adduct = %s " # hardcoded to always fetch from HMDB, lazy i know "ORDER BY t.adduct;") if __name__ == "__main__": parser = argparse.ArgumentParser(description='Exporting isotopic images') parser.add_argument('sf', type=str, help='sum formula') parser.add_argument('add', type=str, help='adduct') parser.add_argument('pkl_path', type=str, help='Path for the cPickle file') parser.add_argument('--config', dest='sm_config_path', type=str, help='SM config path') parser.set_defaults( sm_config_path=path.join(proj_root(), 'conf/config.json')) args = parser.parse_args() SMConfig.set_path(args.sm_config_path) db = DB(SMConfig.get_conf()['db']) export_rs = db.select(EXPORT_SEL, args.sf, args.add) export_df = pd.DataFrame( export_rs, columns=['sf', 'adduct', 'centr_mzs', 'centr_ints']) export_df.to_csv(args.pkl_path, index=False) logger.info( 'Exported the spectra for the "%s" sum formula, "%s" adduct into "%s" file', args.sf, args.add, args.pkl_path)