Beispiel #1
0
    def runworker(self, molecular_search_settings):
        """Bring the database up to date for the requested search settings.

        Steps, in order:

        1. ``check_database_get_class_list`` registers the heteroatom classes
           that are missing and reports which ones have to be populated.
        2. ``add_carbonsHydrogens`` adds any missing carbon/hydrogen rows.
        3. If there are classes to populate, the odd and even carbon/hydrogen
           rows are cached on ``self`` (``odd_ch_*`` / ``even_ch_*``) and
           ``populate_combinations`` is run for every class to create.
        4. The generated rows are inserted into the ``MolecularFormulaLink``
           table: through the session when ``db_jobs == 1``, or through a
           process pool running ``insert_database_worker`` when
           ``db_jobs > 1``.

        Parameters
        ----------
        molecular_search_settings
            Object providing ``usedAtoms``, ``url_database`` and ``db_jobs``
            (plus whatever ``check_database_get_class_list`` reads from it).

        Returns
        -------
        The first element returned by ``check_database_get_class_list``
        (the list of all requested classes).
        """
        classes_list, class_to_create, existing_classes_objs = self.check_database_get_class_list(molecular_search_settings)

        # Work on a private settings object; usedAtoms is deep-copied so the
        # caller's settings are never modified from here. The same object is
        # used for both steps below: add_carbonsHydrogens only reads from it.
        settings = MolecularLookupDictSettings()
        settings.usedAtoms = deepcopy(molecular_search_settings.usedAtoms)
        settings.url_database = molecular_search_settings.url_database
        settings.db_jobs = molecular_search_settings.db_jobs

        self.add_carbonsHydrogens(settings, existing_classes_objs)

        if not class_to_create:
            return classes_list

        self.sql_db.session.commit()

        # Cache the carbon/hydrogen rows, split by hydrogen parity, as
        # parallel lists on the instance.
        odd_ch_obj = self.get_carbonsHydrogens(settings, 'odd')
        self.odd_ch_id = [obj.id for obj in odd_ch_obj]
        self.odd_ch_dict = [{'C': obj.C, 'H': obj.H} for obj in odd_ch_obj]
        self.odd_ch_mass = [obj.mass for obj in odd_ch_obj]
        self.odd_ch_dbe = [obj.dbe for obj in odd_ch_obj]

        even_ch_obj = self.get_carbonsHydrogens(settings, 'even')
        self.even_ch_id = [obj.id for obj in even_ch_obj]
        self.even_ch_dict = [{'C': obj.C, 'H': obj.H} for obj in even_ch_obj]
        self.even_ch_mass = [obj.mass for obj in even_ch_obj]
        self.even_ch_dbe = [obj.dbe for obj in even_ch_obj]

        single_job = settings.db_jobs == 1
        multi_job = settings.db_jobs > 1

        # Rows are only kept in memory when they will be handed to the
        # worker pool afterwards; in single-job mode they are written
        # class by class and then dropped.
        pending_rows = []
        for class_tuple in tqdm(class_to_create):

            results = self.populate_combinations(class_tuple, settings)

            if single_job:
                for chunk in chunks(results, self.sql_db.chunks_count):
                    insert_query = MolecularFormulaLink.__table__.insert().values(chunk)
                    self.sql_db.session.execute(insert_query)
            elif multi_job:
                pending_rows.extend(results)

        self.sql_db.session.commit()

        # each chunk takes ~600Mb of memory, so if using 8 processes the total free memory needs to be 5GB
        if multi_job:
            list_insert_chunks = list(chunks(pending_rows, self.sql_db.chunks_count))
            print( "Started database insert using {} iterations for a total of {} rows".format(len(list_insert_chunks), len(pending_rows)))
            worker_args = [(chunk, settings.url_database) for chunk in list_insert_chunks]
            pool = multiprocessing.Pool(settings.db_jobs)
            try:
                # the results are not used; the loop only drives the
                # progress bar and surfaces worker exceptions
                for _ in tqdm(pool.imap_unordered(insert_database_worker, worker_args)):
                    pass
            finally:
                # always release the worker processes, even if a worker failed
                pool.close()
                pool.join()

        return classes_list
Beispiel #2
0
    def run_noise_threshold_calc(self, auto, bayes=False):
        """Compute the noise estimate for this spectrum.

        Centroid data (``self.is_centroid`` is true): ``self.abundance`` is
        split into groups of 50 with ``chunks``, the minimum of each group is
        taken, and ``(average, std)`` of those minima is returned. These are
        the noise baseline and noise std needed by the auto noise threshold
        mode; they are not used by the signal-to-noise or relative abundance
        methods.

        Otherwise: the data is first restricted with
        ``cut_mz_domain_noise(auto)``. With ``auto`` set, the abundance minima
        of the restricted data are passed to ``get_noise_average``; without
        it, the restricted abundances are passed directly. The result of
        ``get_noise_average`` is returned unchanged.

        Parameters
        ----------
        auto
            Forwarded to ``cut_mz_domain_noise`` and ``get_noise_average``;
            also selects whether the minima are extracted first.
        bayes
            Forwarded to ``get_noise_average``.
        """
        if not self.is_centroid:
            mz_cut, abundance_cut = self.cut_mz_domain_noise(auto)

            noise_input = (
                self.get_abundance_minima_centroid(mz_cut, abundance_cut)
                if auto
                else abundance_cut
            )
            return self.get_noise_average(noise_input, auto=auto, bayes=bayes)

        # centroid data: one minimum per group of 50 abundances
        group_minima = []
        for abundance_group in chunks(self.abundance, 50):
            group_minima.append(min(abundance_group))

        return average(group_minima), std(group_minima)
Beispiel #3
0
        def run():
            """Run the molecular formula search over all classes, 300 at a time.

            For every batch of classes the candidate formulas are loaded once
            with ``database_to_dict``. The result is indexed as
            ``dict_res[ion_type][class_str]`` for the (de)protonated and
            radical ion types, and as
            ``dict_res[adduct_ion][adduct_atom][class_str]`` for adducts.
            Each class is then searched once per ion type enabled in the
            molecular search settings; classes with no candidates are skipped.
            """

            def search_if_any(candidate_formulas, ion_type, **extra):
                # only call run_search when there is something to search
                if candidate_formulas:
                    self.run_search(ms_peaks, candidate_formulas,
                                    min_abundance, ion_type, ion_charge, **extra)

            for classe_chunk in chunks(classes, 300):

                search_settings = self.mass_spectrum_obj.molecular_search_settings

                # the first element of each class tuple is the class string
                classes_str_list = []
                for class_tuple in classe_chunk:
                    classes_str_list.append(class_tuple[0])

                dict_res = self.database_to_dict(classes_str_list, nominal_mzs, search_settings, ion_charge)

                pbar = tqdm.tqdm(classe_chunk)

                for classe_tuple in pbar:

                    # class string is a json serialized dict; the second
                    # element (the class dict) is not needed here
                    classe_str, _classe_dict = classe_tuple[0], classe_tuple[1]

                    if search_settings.isProtonated:

                        pbar.set_description_str(desc="Started molecular formula search for class %s, (de)protonated " % classe_str, refresh=True)

                        protonated = Labels.protonated_de_ion
                        search_if_any(dict_res.get(protonated).get(classe_str), protonated)

                    if search_settings.isRadical:

                        pbar.set_description_str(desc="Started molecular formula search for class %s, radical " % classe_str, refresh=True)

                        radical = Labels.radical_ion
                        search_if_any(dict_res.get(radical).get(classe_str), radical)

                    # looks for adduct, used_atom_valences should be 0
                    # this code does not support H exchange by halogen atoms
                    if search_settings.isAdduct:

                        pbar.set_description_str(desc="Started molecular formula search for class %s, adduct " % classe_str, refresh=True)

                        adduct = Labels.adduct_ion

                        for adduct_atom, dict_by_class in dict_res.get(adduct).items():
                            search_if_any(dict_by_class.get(classe_str), adduct,
                                          adduct_atom=adduct_atom)
Beispiel #4
0
    def check_database_get_class_list(self, molecular_search_settings):
        """Register missing heteroatom classes and report what has to be built.

        The requested classes come from ``get_classes_in_order``; the classes
        already stored are read from the ``HeteroAtoms`` table. Every
        requested class whose name is not stored yet gets a new id and is
        inserted into ``HeteroAtoms`` (the inserts are executed on the
        session but not committed here).

        New ids continue after the highest id currently stored, so they
        cannot collide with an existing row even if the stored ids are not
        contiguous. Classes are processed in the order given by
        ``get_classes_in_order``, which makes the id assignment and the
        order of the returned list reproducible between runs.

        Side effect: ``self.len_existing_classes`` is set to the number of
        distinct class names found in the database.

        Returns
        -------
        tuple
            ``(classes_list, all_class_to_create, existing_classes_objs)``

            * ``classes_list`` - ``(class_str, class_dict)`` for every
              requested class.
            * ``all_class_to_create`` - ``(class_str, class_dict, new_id)``
              for every class that was just inserted.
            * ``existing_classes_objs`` - the ``HeteroAtoms`` objects that
              were already stored before this call.
        """
        classes_dict = self.get_classes_in_order(molecular_search_settings)

        existing_classes_objs = self.sql_db.session.query(
            HeteroAtoms).distinct().all()

        existing_classes_str = {classe.name for classe in existing_classes_objs}

        self.len_existing_classes = len(existing_classes_str)

        # keep the order of classes_dict instead of an arbitrary set order
        class_to_create = [class_str for class_str in classes_dict
                           if class_str not in existing_classes_str]

        last_id = max((classe.id for classe in existing_classes_objs),
                      default=0)

        # one pass builds both the rows to insert and the tuples to return,
        # so the ids of the two can never drift apart
        data_classes = []
        all_class_to_create = []
        for new_id, class_str in enumerate(class_to_create, start=last_id + 1):

            class_dict = classes_dict.get(class_str)
            data_classes.append({
                "name": class_str,
                "id": new_id,
                "halogensCount": self.get_total_halogen_atoms(class_dict)
            })
            all_class_to_create.append((class_str, class_dict, new_id))

        if data_classes:

            for insert_chunk in chunks(data_classes, self.sql_db.chunks_count):
                insert_query = HeteroAtoms.__table__.insert().values(
                    insert_chunk)
                self.sql_db.session.execute(insert_query)

        return list(classes_dict.items()), all_class_to_create, existing_classes_objs
Beispiel #5
0
    def add_carbonsHydrogens(self, settings, existing_classes_objs):
        """Add the missing carbon/hydrogen rows and link them to stored classes.

        The C and H ranges requested in ``settings.usedAtoms`` are compared
        with the min/max C and H stored in the ``CarbonHydrogen`` table. If
        they are equal nothing is done. Otherwise every (C, H) pair of the
        requested ranges that is not stored yet is inserted, and each new
        pair is combined with every already existing heteroatom class of
        matching hydrogen parity; combinations whose mass and DBE fall inside
        the ``settings`` limits are inserted into ``MolecularFormulaLink``.

        New ids continue after the highest id currently stored, so they
        cannot collide with an existing row even if the stored ids are not
        contiguous.

        Parameters
        ----------
        settings
            Read-only here. Provides ``usedAtoms`` (with ``'C'`` and ``'H'``
            entries holding a ``(min, max)`` pair), ``min_mz``, ``max_mz``,
            ``min_dbe`` and ``max_dbe``.
        existing_classes_objs
            Heteroatom class objects already stored; each must offer
            ``to_dict()`` and ``id``.
        """
        usedAtoms = settings.usedAtoms

        user_min_c, user_max_c = usedAtoms.get('C')
        user_min_h, user_max_h = usedAtoms.get('H')

        database = self.sql_db.session.query(
            func.max(CarbonHydrogen.C).label("max_c"),
            func.min(CarbonHydrogen.C).label("min_c"),
            func.max(CarbonHydrogen.H).label("max_h"),
            func.min(CarbonHydrogen.H).label("min_h"),
        ).first()

        stored_limits = (database.max_c, database.min_c,
                         database.max_h, database.min_h)
        if stored_limits == (user_max_c, user_min_c, user_max_h, user_min_h):
            # all data is already available at the database
            return

        stored_rows = self.sql_db.session.query(CarbonHydrogen).all()
        stored_pairs = {(row.C, row.H) for row in stored_rows}
        next_id = max((row.id for row in stored_rows), default=0) + 1

        carbon_mass = Atoms.atomic_masses.get('C')
        hydrogen_mass = Atoms.atomic_masses.get('H')

        # new rows grouped by hydrogen parity; these carry mass and dbe,
        # which are needed below to build the molecular formula links
        new_ch_by_parity = {'even': [], 'odd': []}
        # what is actually sent to the CarbonHydrogen table: C, H and id only
        list_ch_obj_to_add = []

        carbon_range = range(user_min_c, user_max_c + 1)
        hydrogen_range = range(user_min_h, user_max_h + 1)
        for C, H in itertools.product(carbon_range, hydrogen_range):

            if (C, H) in stored_pairs:
                continue

            label = 'even' if H % 2 == 0 else 'odd'
            new_ch_by_parity[label].append({
                "C": C,
                "H": H,
                "mass": (C * carbon_mass) + (H * hydrogen_mass),
                "dbe": C - (H/2) + 1,
                "id": next_id,
            })
            list_ch_obj_to_add.append({"C": C, "H": H, "id": next_id})
            next_id += 1

        if not list_ch_obj_to_add:
            return

        # insert carbon hydrogen objs
        for insert_chunk in chunks(list_ch_obj_to_add, self.sql_db.chunks_count):
            insert_query = CarbonHydrogen.__table__.insert().values(insert_chunk)
            self.sql_db.session.execute(insert_query)
        self.sql_db.session.commit()

        # combine every new carbon/hydrogen row with the classes that were
        # already stored, keeping only what fits the mass and DBE limits
        list_molecular_form = []
        for classe_obj in existing_classes_objs:

            classe_dict = classe_obj.to_dict()
            classe_mass = self.calc_mz(classe_dict)
            classe_dbe = self.calc_dbe_class(classe_dict)

            odd_even_label = self.get_h_odd_or_even(classe_dict)

            for ch_dict in new_ch_by_parity[odd_even_label]:
                mass = ch_dict.get('mass') + classe_mass
                dbe = ch_dict.get('dbe') + classe_dbe

                if not settings.min_mz <= mass <= settings.max_mz:
                    continue
                if not settings.min_dbe <= dbe <= settings.max_dbe:
                    continue

                list_molecular_form.append({"heteroAtoms_id": classe_obj.id,
                                            "carbonHydrogen_id": ch_dict.get('id'),
                                            "mass": mass, "DBE": dbe})

        for insert_chunk in chunks(list_molecular_form, self.sql_db.chunks_count):
            insert_query = MolecularFormulaLink.__table__.insert().values(insert_chunk)
            self.sql_db.session.execute(insert_query)
        self.sql_db.session.commit()