def runworker(self, molecular_search_settings):
    """Populate the database with the heteroatom classes that are still missing.

    Steps, in order:

    1. ``check_database_get_class_list`` reports the requested classes, the
       ones that must be created and the class objects already stored.
    2. ``add_carbonsHydrogens`` is called with the lookup settings and the
       already stored class objects.
    3. If there are classes to create, the odd/even carbon-hydrogen rows are
       cached on ``self`` (``odd_ch_*`` / ``even_ch_*``) and
       ``populate_combinations`` is run for every new class. Its rows are
       inserted into ``MolecularFormulaLink`` either directly through the
       session (``db_jobs == 1``) or through a process pool
       (``db_jobs > 1``).

    Parameters
    ----------
    molecular_search_settings
        Settings object providing ``usedAtoms``, ``url_database`` and
        ``db_jobs``.

    Returns
    -------
    The first element returned by ``check_database_get_class_list``.
    """
    classes_list, class_to_create, existing_classes_objs = (
        self.check_database_get_class_list(molecular_search_settings)
    )

    # Built once: the values come only from molecular_search_settings, so a
    # second identical copy after add_carbonsHydrogens is not needed.
    settings = MolecularLookupDictSettings()
    settings.usedAtoms = deepcopy(molecular_search_settings.usedAtoms)
    settings.url_database = molecular_search_settings.url_database
    settings.db_jobs = molecular_search_settings.db_jobs

    self.add_carbonsHydrogens(settings, existing_classes_objs)

    if not class_to_create:
        return classes_list

    self.sql_db.session.commit()

    odd_ch_obj = self.get_carbonsHydrogens(settings, 'odd')
    self.odd_ch_id = [obj.id for obj in odd_ch_obj]
    self.odd_ch_dict = [{'C': obj.C, 'H': obj.H} for obj in odd_ch_obj]
    self.odd_ch_mass = [obj.mass for obj in odd_ch_obj]
    self.odd_ch_dbe = [obj.dbe for obj in odd_ch_obj]

    even_ch_obj = self.get_carbonsHydrogens(settings, 'even')
    self.even_ch_id = [obj.id for obj in even_ch_obj]
    self.even_ch_dict = [{'C': obj.C, 'H': obj.H} for obj in even_ch_obj]
    self.even_ch_mass = [obj.mass for obj in even_ch_obj]
    self.even_ch_dbe = [obj.dbe for obj in even_ch_obj]

    # Rows are only accumulated when they are not inserted right away;
    # holding every row in memory in the single-job case served no purpose.
    pending_rows = []
    for class_tuple in tqdm(class_to_create):
        results = self.populate_combinations(class_tuple, settings)

        if settings.db_jobs == 1:
            for chunk in chunks(results, self.sql_db.chunks_count):
                insert_query = MolecularFormulaLink.__table__.insert().values(chunk)
                self.sql_db.session.execute(insert_query)
        else:
            pending_rows.extend(results)

    self.sql_db.session.commit()

    # each chunk takes ~600Mb of memory, so if using 8 processes the total
    # free memory needs to be 5GB
    if settings.db_jobs > 1:
        list_insert_chunks = list(chunks(pending_rows, self.sql_db.chunks_count))
        print("Started database insert using {} iterations for a total of {} rows".format(
            len(list_insert_chunks), len(pending_rows)))

        worker_args = [(chunk, settings.url_database) for chunk in list_insert_chunks]
        pool = multiprocessing.Pool(settings.db_jobs)
        try:
            # Results are discarded; the loop only drives the workers and
            # the progress bar.
            for _ in tqdm(pool.imap_unordered(insert_database_worker, worker_args)):
                pass
            pool.close()
        except BaseException:
            # Do not leave worker processes behind when an insert fails.
            pool.terminate()
            raise
        finally:
            pool.join()

    return classes_list
def run_noise_threshold_calc(self, auto, bayes=False):
    """Estimate the noise level of the spectrum.

    Centroid data (``self.is_centroid`` is true): ``self.abundance`` is
    passed to ``chunks(..., 50)``, the minimum of every chunk is taken, and
    the average and standard deviation of those minima are returned as a
    tuple.

    Otherwise: the spectrum is first cut with ``cut_mz_domain_noise(auto)``
    and the result of ``get_noise_average`` is returned. With ``auto`` set,
    the abundance minima from ``get_abundance_minima_centroid`` are used as
    input; without it, the cut abundance is used directly.

    Parameters
    ----------
    auto
        Forwarded to ``cut_mz_domain_noise`` and ``get_noise_average``; also
        selects which abundance values feed the noise average.
    bayes
        Forwarded to ``get_noise_average``.
    """
    if not self.is_centroid:
        mz_cut, abundance_cut = self.cut_mz_domain_noise(auto)
        noise_input = (
            self.get_abundance_minima_centroid(mz_cut, abundance_cut)
            if auto
            else abundance_cut
        )
        return self.get_noise_average(noise_input, auto=auto, bayes=bayes)

    # Calculates noise_baseline and noise_std, needed to run the auto noise
    # threshold mode; it is not used for the signal-to-noise nor the
    # relative abundance methods.
    chunk_minima = [min(window) for window in chunks(self.abundance, 50)]
    return average(chunk_minima), std(chunk_minima)
def run():
    """Run the molecular formula search for every class, 300 classes at a time.

    For each chunk of ``classes`` the candidate formulas are loaded with
    ``self.database_to_dict`` and every class of the chunk is then searched
    for each ion type enabled in the molecular search settings
    (``isProtonated``, ``isRadical``, ``isAdduct``). ``self.run_search`` is
    only called when the lookup yields a non-empty candidate collection.

    Relies on ``self``, ``classes``, ``nominal_mzs``, ``ion_charge``,
    ``ms_peaks`` and ``min_abundance`` from the enclosing scope.
    """

    def search_if_any(formulas, ion_type, **extra):
        # Nothing stored for this class/ion type: skip the search.
        if formulas:
            self.run_search(ms_peaks, formulas, min_abundance,
                            ion_type, ion_charge, **extra)

    for classe_chunk in chunks(classes, 300):
        chunk_class_names = [entry[0] for entry in classe_chunk]

        # Candidate formulas binned by ion type and heteroatom class:
        # dict_res[ion_type][classe_str] -> formulas. For the adduct ion
        # type there is one more level, keyed by adduct atom:
        # dict_res[adduct_ion][adduct_atom][classe_str] -> formulas.
        dict_res = self.database_to_dict(
            chunk_class_names,
            nominal_mzs,
            self.mass_spectrum_obj.molecular_search_settings,
            ion_charge)

        pbar = tqdm.tqdm(classe_chunk)
        for classe_tuple in pbar:
            # class string is a json serialized dict;
            # classe_dict is read but not used below.
            classe_str, classe_dict = classe_tuple[0], classe_tuple[1]

            if self.mass_spectrum_obj.molecular_search_settings.isProtonated:
                ion_type = Labels.protonated_de_ion
                pbar.set_description_str(desc="Started molecular formula search for class %s, (de)protonated " % classe_str, refresh=True)
                search_if_any(dict_res.get(ion_type).get(classe_str), ion_type)

            if self.mass_spectrum_obj.molecular_search_settings.isRadical:
                pbar.set_description_str(desc="Started molecular formula search for class %s, radical " % classe_str, refresh=True)
                ion_type = Labels.radical_ion
                search_if_any(dict_res.get(ion_type).get(classe_str), ion_type)

            # looks for adduct, used_atom_valences should be 0
            # this code does not support H exchange by halogen atoms
            if self.mass_spectrum_obj.molecular_search_settings.isAdduct:
                pbar.set_description_str(desc="Started molecular formula search for class %s, adduct " % classe_str, refresh=True)
                ion_type = Labels.adduct_ion
                formulas_by_adduct = dict_res.get(ion_type)
                for adduct_atom, dict_by_class in formulas_by_adduct.items():
                    search_if_any(dict_by_class.get(classe_str), ion_type,
                                  adduct_atom=adduct_atom)
def check_database_get_class_list(self, molecular_search_settings):
    """Register the requested heteroatom classes that are not stored yet.

    The classes returned by ``get_classes_in_order`` are compared, by name,
    with the ``HeteroAtoms`` rows already in the database. Every missing
    class gets a new row with an id continuing after the number of existing
    rows. The rows are executed on the session in chunks of
    ``self.sql_db.chunks_count``; no commit is issued here.

    Side effect: ``self.len_existing_classes`` is set to the number of
    distinct class names found in the database.

    Returns
    -------
    tuple
        ``(classes, classes_to_create, existing_classes_objs)`` where

        * ``classes`` is a list of ``(class_str, class_dict)`` for every
          requested class,
        * ``classes_to_create`` is a list of
          ``(class_str, class_dict, new_id)`` for the classes added here,
        * ``existing_classes_objs`` are the ``HeteroAtoms`` objects that
          were queried before anything was added.
    """
    classes_dict = self.get_classes_in_order(molecular_search_settings)
    requested_names = set(classes_dict.keys())

    existing_classes_objs = self.sql_db.session.query(HeteroAtoms).distinct().all()
    existing_names = {classe.name for classe in existing_classes_objs}
    self.len_existing_classes = len(existing_names)

    missing_names = requested_names - existing_names

    # New ids continue after the count of rows already present.
    first_new_id = len(existing_classes_objs) + 1

    new_rows = []
    all_class_to_create = []
    for new_id, class_str in enumerate(missing_names, start=first_new_id):
        class_dict = classes_dict.get(class_str)
        halogen_count = self.get_total_halogen_atoms(class_dict)
        new_rows.append({
            "name": class_str,
            "id": new_id,
            "halogensCount": halogen_count,
        })
        all_class_to_create.append((class_str, class_dict, new_id))

    if new_rows:
        for insert_chunk in chunks(new_rows, self.sql_db.chunks_count):
            self.sql_db.session.execute(
                HeteroAtoms.__table__.insert().values(insert_chunk))

    return list(classes_dict.items()), all_class_to_create, existing_classes_objs
def add_carbonsHydrogens(self, settings, existing_classes_objs):
    """Add the C/H combinations missing from the database and link them to the stored classes.

    The user C and H ranges come from ``settings.usedAtoms``. When the
    min/max of C and H stored in ``CarbonHydrogen`` equal the user limits
    nothing is done. Otherwise every (C, H) pair of the user ranges that is
    not stored yet is inserted with an id continuing after the current row
    count. For each object in ``existing_classes_objs`` the new pairs of
    matching hydrogen parity (``get_h_odd_or_even``) are then combined with
    the class mass and DBE, and the combinations inside the
    ``settings`` m/z and DBE windows are inserted into
    ``MolecularFormulaLink``.

    Parameters
    ----------
    settings
        Provides ``usedAtoms`` (with ``'C'`` and ``'H'`` (min, max) pairs),
        ``min_mz``, ``max_mz``, ``min_dbe`` and ``max_dbe``.
    existing_classes_objs
        Class objects exposing ``id`` and ``to_dict()``.
    """
    used_atoms = settings.usedAtoms
    user_min_c, user_max_c = used_atoms.get('C')
    user_min_h, user_max_h = used_atoms.get('H')

    database = self.sql_db.session.query(
        func.max(CarbonHydrogen.C).label("max_c"),
        func.min(CarbonHydrogen.C).label("min_c"),
        func.max(CarbonHydrogen.H).label("max_h"),
        func.min(CarbonHydrogen.H).label("min_h"),
    ).first()

    stored_limits = (database.max_c, database.min_c, database.max_h, database.min_h)
    if stored_limits == (user_max_c, user_min_c, user_max_h, user_min_h):
        # all data is already available at the database
        return

    current_count = self.sql_db.session.query(CarbonHydrogen.C).count()
    stored_rows = self.sql_db.session.query(CarbonHydrogen).all()

    userCarbon = set(range(user_min_c, user_max_c + 1))
    userHydrogen = set(range(user_min_h, user_max_h + 1))

    # Pairs already stored, keyed by the same "C:..,H:.." text used below.
    stored_keys = {"C:{},H:{}".format(row.C, row.H) for row in stored_rows}

    # Full records (with mass and dbe) of the new pairs, split by the parity
    # of H, plus the reduced records that go into the CarbonHydrogen table.
    new_by_parity = {'even': {}, 'odd': {}}
    list_ch_obj_to_add = []
    next_id = current_count + 1

    for C, H in itertools.product(userCarbon, userHydrogen):
        key = "C:{},H:{}".format(C, H)
        if key in stored_keys:
            continue

        parity = 'odd' if H % 2 else 'even'
        new_by_parity[parity][key] = {
            "C": C,
            "H": H,
            "mass": (C * Atoms.atomic_masses.get('C')) + (H * Atoms.atomic_masses.get('H')),
            "dbe": C - (H/2) + 1,
            "id": next_id,
        }
        list_ch_obj_to_add.append({"C": C, "H": H, "id": next_id})
        next_id += 1

    if not list_ch_obj_to_add:
        return

    # insert carbon hydrogen objs
    for insert_chunk in chunks(list_ch_obj_to_add, self.sql_db.chunks_count):
        self.sql_db.session.execute(
            CarbonHydrogen.__table__.insert().values(insert_chunk))
    self.sql_db.session.commit()

    list_molecular_form = []
    for classe_obj in existing_classes_objs:
        classe_dict = classe_obj.to_dict()
        classe_mass = self.calc_mz(classe_dict)
        classe_dbe = self.calc_dbe_class(classe_dict)
        parity = self.get_h_odd_or_even(classe_dict)

        for ch_dict in new_by_parity.get(parity).values():
            mass = ch_dict.get('mass') + classe_mass
            dbe = ch_dict.get('dbe') + classe_dbe

            inside_mz = settings.min_mz <= mass <= settings.max_mz
            if inside_mz and settings.min_dbe <= dbe <= settings.max_dbe:
                list_molecular_form.append({
                    "heteroAtoms_id": classe_obj.id,
                    "carbonHydrogen_id": ch_dict.get('id'),
                    "mass": mass,
                    "DBE": dbe,
                })

    for insert_chunk in chunks(list_molecular_form, self.sql_db.chunks_count):
        self.sql_db.session.execute(
            MolecularFormulaLink.__table__.insert().values(insert_chunk))
    self.sql_db.session.commit()