コード例 #1
0
 def correct_precursor_mz(spectral_manager):
     """Overwrite the m/z of adduct-labelled peaks with the adduct's computed m/z.

     For every spectrum in ``spectral_manager.ms_spectra`` the exact mass
     (``parameters['exactmass']``) and ion mode (``parameters['mode']``) are
     read; every peak whose ``parameters`` contain ``'ion_type'`` is then
     looked up with ``get_adduct_by_name`` and, when an adduct is returned,
     its ``mz`` is replaced by ``adduct.get_mz(mass)``.

     Args:
         spectral_manager: object exposing ``ms_spectra``; each spectrum
             must provide ``parameters`` and ``peaks``.

     Peaks without an ``'ion_type'`` entry, or whose ion type resolves to
     ``None``, are left untouched. The function mutates peaks in place and
     returns nothing.
     """
     # Read from the argument: the previous body ignored it and used a
     # differently named outer variable instead.
     for spectrum in spectral_manager.ms_spectra:
         mass = float(spectrum.parameters['exactmass'])
         mode = spectrum.parameters['mode']
         for peak in spectrum.peaks:
             if 'ion_type' not in peak.parameters:
                 continue
             adduct = get_adduct_by_name(peak.parameters['ion_type'], mode)
             if adduct is not None:
                 peak.mz = adduct.get_mz(mass)
コード例 #2
0
    def remove_low_precision_spectra(precision):
        """Prune peaks, then spectra, whose fragment data disagree with the adduct m/z.

        A peak survives only when all of the following hold:
          * it has ``parameters`` containing ``'ion_type'``;
          * ``get_adduct_by_name`` returns an adduct for that ion type;
          * every sub-spectrum's ``'precursor_mz'`` lies within ``precision``
            of the m/z the adduct gives for the spectrum's exact mass;
          * the closest sub-peak distance to that m/z (starting from a
            ceiling of 1.0) is not greater than ``precision``.

        Spectra left without any surviving peak are removed from
        ``specmanager.ms_spectra``. Both lists are edited in place.

        Args:
            precision: maximum accepted absolute m/z deviation.
        """
        logit('Removing low precision spectra')

        def is_precise(peak, mass, mode):
            # True when the peak satisfies every condition listed above.
            if not hasattr(peak, 'parameters'):
                return False
            if 'ion_type' not in peak.parameters:
                return False
            adduct = get_adduct_by_name(peak.parameters['ion_type'], mode)
            if adduct is None:
                return False

            expected_mz = adduct.get_mz(mass)
            closest = 1.0
            for fragment_spectrum in peak.ms_spectra:
                precursor = float(fragment_spectrum.parameters['precursor_mz'])
                if abs(expected_mz - precursor) > precision:
                    return False
                for fragment_peak in fragment_spectrum.peaks:
                    distance = abs(fragment_peak.mz - expected_mz)
                    if distance < closest:
                        closest = distance
            return not (closest > precision)

        # Walk backwards so in-place deletion never shifts unvisited items.
        for spectrum_pos in range(len(specmanager.ms_spectra) - 1, -1, -1):
            spectrum = specmanager.ms_spectra[spectrum_pos]
            exact_mass = float(spectrum.parameters['exactmass'])
            ion_mode = spectrum.parameters['mode']

            kept_any = False
            for peak_pos in range(len(spectrum.peaks) - 1, -1, -1):
                if is_precise(spectrum.peaks[peak_pos], exact_mass, ion_mode):
                    kept_any = True
                else:
                    del spectrum.peaks[peak_pos]

            if not kept_any:
                del specmanager.ms_spectra[spectrum_pos]

        remaining = len(specmanager.ms_spectra)
        logit('Finshed removing spectra. Total number left: %s' % remaining)
コード例 #3
0
def remove_low_precision_spectra(precision):
    """Prune imprecise [M+H]+ / [M-H]- peaks and the spectra left empty by it.

    Only peaks whose ``parameters['ion_type']`` is ``'[M+H]+'`` or
    ``'[M-H]-'`` can survive. Such a peak is kept when
    ``get_adduct_by_name`` resolves the ion type, every sub-spectrum's
    ``'precursor_mz'`` is within ``precision`` of the adduct m/z computed
    from the spectrum's exact mass, and the nearest sub-peak distance to that
    m/z (starting from a ceiling of 1.0) does not exceed ``precision``.
    All other peaks are deleted; spectra without surviving peaks are deleted
    from ``specmanager.ms_spectra``. Progress is reported with ``print``.

    Args:
        precision: maximum accepted absolute m/z deviation.
    """
    print('Removing low precision spectra')

    accepted_ion_types = ('[M+H]+', '[M-H]-')

    # Count down manually so deletions never disturb positions still to visit.
    spectrum_pos = len(specmanager.ms_spectra)
    while spectrum_pos > 0:
        spectrum_pos -= 1
        spectrum = specmanager.ms_spectra[spectrum_pos]

        exact_mass = float(spectrum.parameters['exactmass'])
        ion_mode = spectrum.parameters['mode']
        survivors = 0

        peak_pos = len(spectrum.peaks)
        while peak_pos > 0:
            peak_pos -= 1
            candidate = spectrum.peaks[peak_pos]
            accept = False

            if hasattr(candidate, 'parameters') and 'ion_type' in candidate.parameters:
                ion_type = candidate.parameters['ion_type']
                if ion_type in accepted_ion_types:
                    adduct = get_adduct_by_name(ion_type, ion_mode)
                    if adduct is not None:
                        target_mz = adduct.get_mz(exact_mass)
                        nearest = 1.0
                        for fragment_spectrum in candidate.ms_spectra:
                            precursor = float(fragment_spectrum.parameters['precursor_mz'])
                            if abs(target_mz - precursor) > precision:
                                # A single off-target precursor rejects the peak.
                                break
                            for fragment_peak in fragment_spectrum.peaks:
                                offset = abs(fragment_peak.mz - target_mz)
                                if offset < nearest:
                                    nearest = offset
                        else:
                            # Reached only when no precursor was off target.
                            accept = not (nearest > precision)

            if accept:
                survivors += 1
            else:
                del spectrum.peaks[peak_pos]

        if survivors == 0:
            del specmanager.ms_spectra[spectrum_pos]

    print('Finshed removing spectra. Total number left: %s' % len(specmanager.ms_spectra))
コード例 #4
0
    def testrun(scorers_list=None,
                scorers_settings=None,
                batchindex=-1,
                header='',
                use_metalikeness=False,
                use_formula=False,
                use_elements=False):
        """Annotate the test peaks and log/store best- and worst-case hit rates.

        Peaks from ``specmanager.ms_spectra`` whose ion type is listed in
        ``FragPrintScorer.supported_adducts`` each receive one annotation built
        from their recorded adduct. They are then passed to
        ``peak_annotator.annotate_peaks``. For every annotated peak the first
        and last rank of the correct short InChI among the (de-duplicated)
        candidates is tallied per ion mode, logged, converted to percentages
        and handed to ``store_stats``.

        Args:
            scorers_list: scorer names forwarded to ``annotate_peaks``;
                ``None`` (the default) means an empty list.
            scorers_settings: scorer settings forwarded to ``annotate_peaks``;
                ``None`` (the default) means an empty list.
            batchindex: when not -1, only spectra whose
                ``'crossvalidation_batch_index'`` equals this value are used.
            header: label passed through to ``store_stats``.
            use_metalikeness: add ``LikenessFilter(likeness=['MetaLike'])``.
            use_formula: add a ``FormulasFilter`` built from the parent
                spectrum's ``'formula'``.
            use_elements: add an ``ElementCompositionFilter`` built from the
                parent spectrum's ``'formula'``.
        """
        # Fresh lists per call instead of defaults shared between calls.
        if scorers_list is None:
            scorers_list = []
        if scorers_settings is None:
            scorers_settings = []

        logit('Assessing Fragprint based search. Batch %s...' % batchindex)

        best_results_neg = [0] * max_results_per_query
        worst_results_neg = [0] * max_results_per_query
        return_number_neg = [0] * max_results_per_query
        neg_mode_count = 0

        best_results_pos = [0] * max_results_per_query
        worst_results_pos = [0] * max_results_per_query
        return_number_pos = [0] * max_results_per_query
        pos_mode_count = 0

        logtime()

        logit('Assuming correct adduct info and 0 isotope.')
        logit('No formula assumption. Assuming Mass precision: %s ppm' % ppm)
        logit('Only considering [M+H]+ and [M-H]- adducts for now')

        # ------------------------------------------------------------------
        # Build one annotation per supported peak.
        # ------------------------------------------------------------------
        peaks = []
        for spectrum_index, spectrum in enumerate(specmanager.ms_spectra):
            print('%s of %s spectra preprocessed' %
                  (spectrum_index, len(specmanager.ms_spectra)))
            if batchindex != -1 and int(
                    spectrum.parameters['crossvalidation_batch_index']
            ) != batchindex:
                continue

            for peak in spectrum.peaks:
                if 'ion_type' not in peak.parameters:
                    continue
                ion_type = peak.parameters['ion_type']
                if ion_type not in FragPrintScorer.supported_adducts:
                    continue

                # Assuming 0 isotope states only.
                peak.ppm = ppm

                filters = []
                if use_metalikeness:
                    metabolite_likeness = LikenessFilter(likeness=['MetaLike'])
                    filters.append(metabolite_likeness)

                if use_formula:
                    formula = peak.parent_spectrum.parameters['formula']
                    correct_formula = FormulasFilter(formula)
                    filters.append(correct_formula)

                if use_elements:
                    elements = peak.parent_spectrum.parameters['formula']
                    correct_elements = ElementCompositionFilter(
                        elements, elements)
                    filters.append(correct_elements)

                annotation = MSPeakAnnotation(
                    peak,
                    adduct=get_adduct_by_name(ion_type,
                                              spectrum.parameters['mode']),
                    isotope=0,
                    formula_scorer=None,
                    filters=filters,
                    scores={'AdductIsotopeScore': 1.0})

                if annotation.adduct is not None:
                    peak.annotations = [annotation]
                    peaks.append(peak)
                else:
                    logit('Unsupported Adduct: %s' % ion_type)

        logit('Peaks to annotate: %s' % len(peaks))

        peak_annotator.annotate_peaks(
            peaks, test_chemical_databases,
            scorers_list=scorers_list, scorers_settings=scorers_settings,
            total_score=total_multiplicative_score,
            required_fields=set(['ShortInChI']),
            results_limit=max_results_per_query, save_memory=False,
            ppm=ppm, overwrite=True)

        # Discard peaks for which no candidate list was produced.
        for i in reversed(range(len(peaks))):
            if peaks[i].annotations[0].mol_candidates is None:
                del peaks[i]

        logit('Finished annotating... Annotated peaks :%s' % len(peaks))

        logtime()

        # ------------------------------------------------------------------
        # Tally retrieval statistics per ion mode.
        # ------------------------------------------------------------------
        total_peaks = len(peaks)
        total_candidates = 0

        for peak in peaks:
            peakmode = 0
            if peak.parent_spectrum.parameters['mode'] == 1:
                pos_mode_count += 1
                peakmode = 1
            elif peak.parent_spectrum.parameters['mode'] == -1:
                neg_mode_count += 1
                peakmode = -1

            shortinchi = get_short_inchi_from_full_inchi(
                peak.parent_spectrum.parameters['inchi'])

            for annotation in peak.annotations:
                annotation.min_correct = -1
                annotation.max_correct = -1
                total_candidates += annotation.mol_candidates.total_candidate_count

                mol_list = annotation.mol_candidates.mol_list

                # Mean and maximum total score over the returned candidates.
                annotation.mean_score = 0.0
                annotation.max_score = 0.0
                for candidate in mol_list:
                    candidate_score = candidate['TotalScore']
                    annotation.mean_score += candidate_score
                    if candidate_score > annotation.max_score:
                        annotation.max_score = candidate_score
                if len(mol_list) > 0:
                    annotation.mean_score = annotation.mean_score / len(mol_list)
                    if len(mol_list) <= max_results_per_query:
                        if peakmode == 1:
                            return_number_pos[len(mol_list) - 1] += 1
                        elif peakmode == -1:
                            return_number_neg[len(mol_list) - 1] += 1

                # For the purpose of statistics condensing sequential
                # identical Short InChI-s.
                for index in reversed(range(1, len(mol_list))):
                    if mol_list[index - 1]['ShortInChI'] == mol_list[index]['ShortInChI']:
                        del mol_list[index]

                # First and last rank at which the correct structure shows up.
                for index, candidate in enumerate(mol_list):
                    if candidate['ShortInChI'] == shortinchi:
                        if annotation.min_correct == -1:
                            annotation.min_correct = index
                        annotation.max_correct = index

                if annotation.min_correct > -1:
                    if peakmode == 1:
                        for i in range(len(best_results_pos)):
                            if annotation.min_correct <= i:
                                best_results_pos[i] += 1
                            if annotation.max_correct <= i:
                                worst_results_pos[i] += 1
                    elif peakmode == -1:
                        for i in range(len(best_results_neg)):
                            if annotation.min_correct <= i:
                                best_results_neg[i] += 1
                            if annotation.max_correct <= i:
                                worst_results_neg[i] += 1

        logit('Finished Calculating Retrieval Stats.')
        logit('Positive Mode Count: %s' % pos_mode_count)
        logit('Negative Mode Count: %s' % neg_mode_count)
        logit('Total Peaks: %s' % total_peaks)
        logit('Total Candidate Count: %s' % total_candidates)
        logtime()

        logit('Positive Mode (Total: %s ):' % pos_mode_count)
        if pos_mode_count > 0:
            for i in range(max_results_per_query):
                logit(
                    'Correct within first\t%s:\tBest:\t%s%%\tWorst:\t%s%%\tCandidateCount:\t%s'
                    % (i + 1, best_results_pos[i] * 100 / pos_mode_count,
                       worst_results_pos[i] * 100 / pos_mode_count,
                       return_number_pos[i]))

        logit('Negative Mode (Total: %s ):' % neg_mode_count)
        if neg_mode_count > 0:
            for i in range(max_results_per_query):
                logit(
                    'Correct within first\t%s:\tBest:\t%s%%\tWorst:\t%s%%\tCandidateCount:\t%s'
                    % (i + 1, best_results_neg[i] * 100 / neg_mode_count,
                       worst_results_neg[i] * 100 / neg_mode_count,
                       return_number_neg[i]))

        logit('Both Modes (Total: %s ):' % (pos_mode_count + neg_mode_count))
        if neg_mode_count > 0 or pos_mode_count > 0:
            for i in range(max_results_per_query):
                logit(
                    'Correct within first\t%s:\tBest:\t%s%%\tWorst:\t%s%%\tCandidateCount:\t%s'
                    % (i + 1,
                       (best_results_neg[i] + best_results_pos[i]) * 100 /
                       (neg_mode_count + pos_mode_count),
                       (worst_results_neg[i] + worst_results_pos[i]) * 100 /
                       (neg_mode_count + pos_mode_count),
                       return_number_pos[i] + return_number_neg[i]))

        # Convert the tallies to percentages. A mode with no peaks keeps its
        # all-zero tallies instead of dividing by zero.
        for i in range(len(best_results_neg)):
            if neg_mode_count > 0:
                best_results_neg[i] = best_results_neg[i] * 100 / neg_mode_count
                worst_results_neg[i] = worst_results_neg[i] * 100 / neg_mode_count
            if pos_mode_count > 0:
                best_results_pos[i] = best_results_pos[i] * 100 / pos_mode_count
                worst_results_pos[i] = worst_results_pos[i] * 100 / pos_mode_count

        store_stats(header, best_results_neg, worst_results_neg,
                    best_results_pos, worst_results_pos)

        logtime()
        '''
コード例 #5
0
    def _pipe_from_textfile(self, finp):
        """Populate this object from the line-oriented text read from ``finp``.

        Lines are consumed with ``finp.readline()`` until end of file or a
        line starting with ``end`` (case-insensitive). Each line has its
        trailing newline and leading whitespace removed and anything from
        ``##`` onward discarded. Recognised content:

          * ``key=value`` lines: keys starting with ``adduct``,
            ``isotope_extra_mass`` or ``isotope`` set the matching attribute;
            any other key is stored lower-cased in ``self.parameters``.
          * ``formula_scorer`` / ``element_scorer`` / ``mol_candidates``:
            a fresh object is created and reads its own section from ``finp``.
          * ``scores``: following ``name=value`` lines fill ``self.scores``
            with floats until an ``end`` line.
          * ``filters``: for each following ``...=name`` line a filter is
            obtained from ``get_filter_by_name``, reads its own section and is
            appended to ``self.filters``, until an ``end`` line.

        Args:
            finp: open text stream positioned at the start of this section.
        """

        def next_clean_line():
            # Return the next line without newline, indent or '##' comment,
            # or None once the stream is exhausted.
            raw = finp.readline()
            if raw == '':
                return None
            text = raw.rstrip('\n').lstrip()
            comment_at = text.find('##')
            if comment_at != -1:
                text = text[:comment_at]
            return text

        while True:
            line = next_clean_line()
            if line is None:
                return

            if '=' in line:
                key, value = line.split('=', 1)
                key = key.lower()
                # 'isotope_extra_mass' must be tested before its prefix 'isotope'.
                if key.startswith('adduct'):
                    self.adduct = get_adduct_by_name(value)
                elif key.startswith('isotope_extra_mass'):
                    self.isotope_extra_mass = float(value)
                elif key.startswith('isotope'):
                    self.isotope = int(value)
                else:
                    self.parameters[key] = value
                continue

            tag = line.lower()

            if tag.startswith('formula_scorer'):
                self.formula_scorer = FormulaScorer()
                self.formula_scorer._pipe_from_textfile(finp)

            elif tag.startswith('element_scorer'):
                self.element_scorer = ElementScorer()
                self.element_scorer._pipe_from_textfile(finp)

            elif tag.startswith('mol_candidates'):
                self.mol_candidates = DBQueryResult()
                self.mol_candidates._pipe_from_textfile(finp)

            elif tag.startswith('scores'):
                while True:
                    entry = next_clean_line()
                    if entry is None:
                        return
                    if '=' in entry:
                        score_name, score_value = entry.split('=', 1)
                        self.scores[score_name] = float(score_value)
                    elif entry.lower().startswith('end'):
                        break

            elif tag.startswith('filters'):
                while True:
                    entry = next_clean_line()
                    if entry is None:
                        return
                    if '=' in entry:
                        filter_name = entry.split('=', 1)[1]
                        chemfilter = get_filter_by_name(filter_name)
                        chemfilter._pipe_from_textfile(finp)
                        self.filters.append(chemfilter)
                    elif entry.lower().startswith('end'):
                        break

            elif tag.startswith('end'):
                return
コード例 #6
0
def run_annotation(spectral_input_path, db_file, chemical_databases, output_folder, SVMs_path, ncpu, ppm, max_results_per_query, test_mode=False, ignore_peaks_without_spectra=True):

    global starttime;
    global logfile;

    if not os.path.isfile(db_file):
        print('No database list file found!');
        quit();

    db_files=[db_file];
    
    if not os.path.exists(output_folder):
        try:
            os.makedirs(output_folder);
        except:
            print('Cannot create output path %s !'%output_folder);
            print('Run: python annotate.py -h for help');
            sys.exit(2);
    
    logfile=open(os.path.join(output_folder,'log.txt'),'w');
    starttime=time.time();

    logit('Starting. %s'%time.strftime('%d/%m/%y %H:%M:%S'));
    logit('Input Spectral path: %s'%spectral_input_path);
    logit('Output path: %s'%output_folder)
    if chemical_databases:
        logit('Chemical Databases Included: %s'%chemical_databases);
    else:
        logit('Chemical Databases Included: All');
        
    logit('from chemical database paths file %s'%db_file);
    logit('SVMs from %s'%SVMs_path);
    logit('nCPUs to use: %s'%ncpu);
    logit('Default ppm tolerance for MS1: %s'%ppm);    

    
    specmanager=SpectralManager();
    logit('Reading test spectra...');
        
    specmanager.import_textfile_spectra_from_folder(spectral_input_path);
        
    logit('Finshed reading spectra. Total number: %s'%len(specmanager.ms_spectra));
        
    logtime();
    
    multiprocessing.freeze_support();
    
    scorers_list=['FingerPrintScorer','FragPrintScorer'];
    scorers_settings=[[],[]];
    total_score=total_multiplicative_score;
        
    with PeakAnnotator(db_files, SVMs_path, ncpu) as peak_annotator:
        logit('Beginning annotation...');
        logtime();
        
        total_peaks=0;
    
        best_results_neg=[0]*max_results_per_query;
        worst_results_neg=[0]*max_results_per_query;
        return_number_neg=[0]*max_results_per_query;
        neg_mode_count=0;
            
        best_results_pos=[0]*max_results_per_query;
        worst_results_pos=[0]*max_results_per_query;
        return_number_pos=[0]*max_results_per_query;
        pos_mode_count=0;
        
        peaks=[];
        
        supported_adducts=set(global_supported_adducts);
                                
        #Collect all supported adducts
        #if 'FingerPrintScorer' in scorers_list:
        #    supported_adducts=supported_adducts|FingerPrintScorer.supported_adducts;

        #if 'FragPrintScorer' in scorers_list:
        #    supported_adducts=supported_adducts|FragPrintScorer.supported_adducts;

        #Leave only adducts supported by all selected filters                                
        if 'FingerPrintScorer' in scorers_list:
            supported_adducts=supported_adducts&FingerPrintScorer.supported_adducts;

        if 'FragPrintScorer' in scorers_list:
            supported_adducts=supported_adducts&FragPrintScorer.supported_adducts;

                
        logit('Preparing annotations');                        
        if test_mode:
            logit('Running in test mode.....');            
            
            logit('Assuming correct adduct info and 0 isotope.');
            logit('No formula assumption. Mass precision: %s ppm'%ppm);
            logit('Only considering [M+H]+ and [M-H]- adducts for now');
            
            print('\n');
                
            for spectrum_index in range(len(specmanager.ms_spectra)):
                    #print('\r%s of %s spectra preprocessed'%(spectrum_index,len(specmanager.ms_spectra)));
                    spectrum=specmanager.ms_spectra[spectrum_index];
                    
                    for peak in spectrum.peaks:
                        if (not ignore_peaks_without_spectra) or (hasattr(peak,'ms_spectra') and peak.ms_spectra):
                            if hasattr(peak,'parameters') and ('ion_type' in peak.parameters):
                                if (peak.parameters['ion_type'] in FragPrintScorer.supported_adducts):
                                    peak.ppm=ppm; 
                                    
                                    shortinchi=get_short_inchi_from_full_inchi(peak.parent_spectrum.parameters['inchi']);
                                    
                                    annotation=MSPeakAnnotation(peak, adduct=\
                                        get_adduct_by_name(peak.parameters['ion_type'], \
                                        spectrum.parameters['mode']), isotope=0, \
                                        formula_scorer=None, element_scorer=None, filters=[], scores={});
                                        
                                    #testformulascorer=FormulaScorer();
                                    #testformulascorer.setup_scorer(peak.parent_spectrum.parameters['formula'],1.0);
                                    #testelementscorer=ElementScorer();
                                    #testelementscorer.setup_scorer({'C':0.8,'O':0.9,'Si':0.1});
                                    
                                    #formulafilter=FormulasFilter(formulas=[peak.parent_spectrum.parameters['formula']]);
                                    
                                    #elementfilter=ElementCompositionFilter(peak.parent_spectrum.parameters['formula'], peak.parent_spectrum.parameters['formula']);
                                    
                                    #inchifilter=InChIFilter(ref_inchi=peak.parent_spectrum.parameters['inchi'],use_short_inchi=False, match_type=0)
                                                                        
                                    
                                    #annotation=MSPeakAnnotation(peak, adduct=\
                                    #    get_adduct_by_name(peak.parameters['ion_type'], \
                                    #    spectrum.parameters['mode']), isotope=0, \
                                    #    formula_scorer=testformulascorer, element_scorer=testelementscorer, \
                                    #    filters=[formulafilter, inchifilter, elementfilter], scores={'ExtraScore':25.0, 'AllExtra':1.9});
                                        
                                    
                                    if not (annotation.adduct is None):
                                        peak.annotations=[annotation];
                                        peaks.append(peak);
                                    else:
                                        logit('Unsupported Adduct: %s'%peak.parameters['ion_type']);
        else:
            

            logit('Running in normal mode.....');            
            logit('Mass precision: %s ppm'%ppm);
            
            print('\n');
            
            for spectrum_index in range(len(specmanager.ms_spectra)):
                    #print('\r%s of %s spectra preprocessed'%(spectrum_index,len(specmanager.ms_spectra)));
                    spectrum=specmanager.ms_spectra[spectrum_index];
                    for peak in spectrum.peaks:
                        if (not ignore_peaks_without_spectra) or (hasattr(peak,'ms_spectra') and peak.ms_spectra):
                            peak.ppm=ppm; 
                            generate_annotation=True;
                            if hasattr(peak, 'annotations'):
                                if peak.annotations:
                                    generate_annotation=False;
                                    peaks.append(peak);
                            
                            if generate_annotation:
                                #Assuming isotope 0 by default
                                if spectrum.parameters['mode']==1:
                                    selected_adducts=get_positive_mode_adducts(supported_adducts);
                                else:
                                    selected_adducts=get_negative_mode_adducts(supported_adducts);
                                
                                peak.annotations=[];
                                for adduct in selected_adducts:
                                    annotation=MSPeakAnnotation(peak, adduct=adduct, isotope=0, formula_scorer=None, element_scorer=None, filters=[], scores={});
                                    peak.annotations.append(annotation);
                                    peaks.append(peak);
                                    
        
        logit('Peaks to annotate: %s'%len(peaks));
        
        if test_mode:
            required_fields=set(['ShortInChI','InChI','SMILES','Formula','IDs']);
            #required_fields=set(['ShortInChI','InChI','SMILES','Formula','IDs', 'InChIKeyValues', 'InChiKey', 'FormulaVector', 'ElementVector', 'Frag', 'FPT']);
            
        else:
            required_fields=set(['InChI','SMILES','Formula','IDs']);
        
        peak_annotator.annotate_peaks(peaks, chemical_databases, \
                scorers_list=scorers_list, scorers_settings=scorers_settings, total_score=total_score, required_fields=required_fields,\
                results_limit=max_results_per_query, save_memory=False, ppm=ppm, overwrite=True);
        
        
        
        annotation_count=0;
        total_candidates=0;                    

        total_peaks=len(peaks);
        for peakindex in range(total_peaks):
            peak=peaks[peakindex];
            if peak.annotations:
                for annotation in peak.annotations:
                    if not (annotation.mol_candidates is None):
                        annotation_count+=1;
                        total_candidates+=annotation.mol_candidates.total_candidate_count;
                        
        
        
        logit('Finished annotating... ');
        logit('Annotated peaks :%s'%total_peaks);    
        if total_peaks>0:
            logit('Averaged annotations per peak:%s'%(float(annotation_count)/total_peaks));    
            logit('Averaged candidate molecules per peak:%s'%(float(total_candidates)/total_peaks));    
        
        
        #Calculating retrieval statistics using test data
        #=====================================================================
        
        if test_mode:
            #logit('Removing peaks with no annotations found from consideration...');
            #for i in reversed(range(len(peaks))):
            #    if not peaks[i].annotations:
            #        del peaks[i];
                
            #logit('Finished removing... Annotated peaks :%s'%len(peaks));    
                
            total_peaks=len(peaks);
            print('Calculating retrieval stats...\n');
                
            for peakindex in range(total_peaks):
                peak=peaks[peakindex];
                peakmode=0;
                if peak.parent_spectrum.parameters['mode']==1:
                    pos_mode_count+=1;
                    peakmode=1;
                elif peak.parent_spectrum.parameters['mode']==-1:
                    neg_mode_count+=1;
                    peakmode=-1;
                        
                #print('\r%s of %s '%(peakindex,total_peaks));
                shortinchi=get_short_inchi_from_full_inchi(peak.parent_spectrum.parameters['inchi']);
        
                for annotation in peak.annotations:
                    annotation.min_correct=-1;
                    annotation.max_correct=-1;
                    annotation.mean_score=0.0;
                    annotation.max_score=0.0;
                    for index in range(len(annotation.mol_candidates.mol_list)):
                        total_score=annotation.mol_candidates.mol_list[index]['TotalScore'];
                        annotation.mean_score+=total_score;
                        if total_score>annotation.max_score:
                            annotation.max_score=total_score;
                    if len(annotation.mol_candidates.mol_list)>0:
                        annotation.mean_score=annotation.mean_score/len(annotation.mol_candidates.mol_list);
                        if len(annotation.mol_candidates.mol_list)<=max_results_per_query:
                            if peakmode==1:
                                return_number_pos[len(annotation.mol_candidates.mol_list)-1]+=1;
                            elif peakmode==-1:
                                return_number_neg[len(annotation.mol_candidates.mol_list)-1]+=1;                            
                            
                    for index in reversed(range(1,len(annotation.mol_candidates.mol_list))):
                        if annotation.mol_candidates.mol_list[index-1]['ShortInChI']==annotation.mol_candidates.mol_list[index]['ShortInChI']:
                            del annotation.mol_candidates.mol_list[index]; # For the purpose of statistics condensing sequential identical Short InChI-s
                        
                    for index in range(len(annotation.mol_candidates.mol_list)):
                        if annotation.mol_candidates.mol_list[index]['ShortInChI']==shortinchi:
                            annotation.mol_candidates.mol_list[index]['Annotation']='Correct';
                            if annotation.min_correct==-1:
                                annotation.min_correct=index;
                            annotation.max_correct=index;
                        else:
                            annotation.mol_candidates.mol_list[index]['Annotation']='Wrong';
                            
                    if annotation.min_correct>-1:
                        if peakmode==1:
                            for i in range(len(best_results_pos)):
                                if annotation.min_correct<=i:
                                    best_results_pos[i]+=1;
                                if annotation.max_correct<=i:
                                    worst_results_pos[i]+=1;
                        elif peakmode==-1:
                            for i in range(len(best_results_neg)):
                                if annotation.min_correct<=i:
                                    best_results_neg[i]+=1;
                                if annotation.max_correct<=i:
                                    worst_results_neg[i]+=1;
    
            logit('Finished Calculating Retrieval Stats.');
            logit('Positive Mode Count: %s'%pos_mode_count);
            logit('Negative Mode Count: %s'%neg_mode_count);
            logit('Total Peaks: %s'%total_peaks);
        
            logit('Positive Mode (Total: %s ):'%pos_mode_count);
            if pos_mode_count>0:
                for i in range(max_results_per_query):
                    logit('Correct within first\t%s:\tBest:\t%.2f%%\tWorst:\t%.2f%%\tCCount:\t%s'%(i+1,float(best_results_pos[i])*100/pos_mode_count,worst_results_pos[i]*100/pos_mode_count,return_number_pos[i]));
        
            logit('Negative Mode (Total: %s ):'%neg_mode_count);
            if neg_mode_count>0:
                for i in range(max_results_per_query):
                    logit('Correct within first\t%s:\tBest:\t%.2f%%\tWorst:\t%.2f%%\tCCount:\t%s'%(i+1,float(best_results_neg[i])*100/neg_mode_count,worst_results_neg[i]*100/neg_mode_count,return_number_neg[i]));
        
            logit('Both Modes (Total: %s ):'%(pos_mode_count+neg_mode_count));
            if neg_mode_count>0 or pos_mode_count>0:
                for i in range(max_results_per_query):
                    logit('Correct within first\t%s:\tBest:\t%.2f%%\tWorst:\t%.2f%%\tCCount:\t%s'%(i+1,float(best_results_neg[i]+best_results_pos[i])*100/(neg_mode_count+pos_mode_count),float(worst_results_neg[i]+worst_results_pos[i])*100/(neg_mode_count+pos_mode_count),return_number_pos[i]+return_number_neg[i]));
            for peak in peaks:
                merge_annotations(peak, remove_old_annotations=False);

        #Finished calculating retrieval statistics using test data
        #=====================================================================
        
    logtime();
    logit('Finished Annotation. Exporting results...');
    
    logtime();
    logit('Exporting to JSON...');
        
    spectra_to_json(os.path.join(output_folder,'annotated_spectra.json'), specmanager.ms_spectra);
    
    logtime();
    logit('Exporting to internal text format...');
    
    
    specmanager.export_textfile_spectra_to_folder(os.path.join(output_folder,'annotated_spectra'));
    
    logit('Preparing HTML report...');
    
    generate_HTML_report(os.path.join(output_folder,'Report'), specmanager);
                
    specmanager.close();
    
    logtime();
    
    logit('Finished');
    
    logfile.close();
コード例 #7
0
 def testrun(scorers_list=[], scorers_settings=[], batchindex=-1):
 
         #for batchindex in range(5):
         
         logit('Assessing Fragprint based search. Batch %s...'%batchindex);
         
         total_peaks=0;
 
         best_results_neg=[0]*max_results_per_query;
         worst_results_neg=[0]*max_results_per_query;
         return_number_neg=[0]*max_results_per_query;
         neg_mode_count=0;
         
         best_results_pos=[0]*max_results_per_query;
         worst_results_pos=[0]*max_results_per_query;
         return_number_pos=[0]*max_results_per_query;
         pos_mode_count=0;
         
             
         logtime();
     
         logit('Assuming correct adduct info and 0 isotope.');
         logit('No formula assumption. Assuming Mass precision: %s ppm'%ppm);
         logit('Only considering [M+H]+ and [M-H]- adducts for now');
         
         peaks=[];
         for spectrum_index in range(len(specmanager.ms_spectra)):
             print('%s of %s spectra preprocessed'%(spectrum_index,len(specmanager.ms_spectra)));
             spectrum=specmanager.ms_spectra[spectrum_index];
             if batchindex==-1 or int(spectrum.parameters['crossvalidation_batch_index'])==batchindex:
                 for peak in spectrum.peaks:
                     if 'ion_type' in peak.parameters:
                         if peak.parameters['ion_type'] in FragPrintScorer.supported_adducts:
                             peak.ppm=ppm; #Assuming 0 isotope states only.
                             shortinchi=peak.parent_spectrum.parameters['shortinchi'];                            
                             #formula=peak.parent_spectrum.parameters['formula'];                            
                             #correct_inchi=InChiFilter(shortinchi,True,4);
                             #correct_formula=FormulasFilter(formula);
                             
                             annotation=MSPeakAnnotation(adduct=get_adduct_by_name(peak.parameters['ion_type'],\
                                 spectrum.parameters['mode']), isotope=0, formula_scorer=None,\
                                 filters=[], scores={'AdductIsotopeScore':1.0});
                             if not (annotation.adduct is None):
                                 peak.annotations=[annotation];
                                 annotation.parent_peak=peak;
                                 peaks.append(peak);
                             else:
                                 logit('Unsupported Adduct: %s'%peak.parameters['ion_type']);
                     
         logit('Peaks to annotate: %s'%len(peaks));
 
         #
         peak_annotator.annotate_peaks(peaks, test_chemical_databases, \
             scorers_list=scorers_list, scorers_settings=scorers_settings, total_score=total_multiplicative_score,\
             results_limit=max_results_per_query, save_memory=False, batch_index=batchindex, ppm=ppm, overwrite=True);
         
         for i in reversed(range(len(peaks))):
             if not peaks[i].annotations[0].mol_candidates:
                 del peaks[i];
         
         logit('Finished annotating... Annotated peaks :%s'%len(peaks));    
 
 
         logtime();
         
         total_peaks=len(peaks);
         
         for peakindex in range(total_peaks):
             peak=peaks[peakindex];
             peakmode=0;
             if peak.parent_spectrum.parameters['mode']==1:
                 pos_mode_count+=1;
                 peakmode=1;
             elif peak.parent_spectrum.parameters['mode']==-1:
                 neg_mode_count+=1;
                 peakmode=-1;
                 
             print('%s of %s '%(peakindex,total_peaks));
             shortinchi=peak.parent_spectrum.parameters['shortinchi'];
 
             for annotation in peak.annotations:
                 annotation.min_correct=-1;
                 annotation.max_correct=-1;
                 
                 #Get annotation score
                 
                 annotation.mean_score=0.0;
                 annotation.max_score=0.0;
                 for index in range(len(annotation.mol_candidates.mol_list)):
                     total_score=annotation.mol_candidates.mol_list[index]['TotalScore'];
                     #print(annotation.mol_candidates.mol_list[index]);
                     annotation.mean_score+=total_score;
                     if total_score>annotation.max_score:
                         annotation.max_score=total_score;
                 if len(annotation.mol_candidates.mol_list)>0:
                     annotation.mean_score=annotation.mean_score/len(annotation.mol_candidates.mol_list);
                     if len(annotation.mol_candidates.mol_list)<=max_results_per_query:
                         if peakmode==1:
                             return_number_pos[len(annotation.mol_candidates.mol_list)-1]+=1;
                         elif peakmode==-1:
                             return_number_neg[len(annotation.mol_candidates.mol_list)-1]+=1;                            
                     
                 
                 
                 
 
                 for index in reversed(range(1,len(annotation.mol_candidates.mol_list))):
                     if annotation.mol_candidates.mol_list[index-1]['ShortInChi']==annotation.mol_candidates.mol_list[index]['ShortInChi']:
                         del annotation.mol_candidates.mol_list[index]; # For the purpose of statistics condensing sequential identical Short InChi-s
                         
                 #print(len(annotation.mol_candidates.mol_list));
                 #print('Correct: %s'%shortinchi);
                 
                 for index in range(len(annotation.mol_candidates.mol_list)):
                     #print(annotation.mol_candidates.mol_list[index]['ShortInChi']);
                     if annotation.mol_candidates.mol_list[index]['ShortInChi']==shortinchi:
                         #print('Got it!');
                         if annotation.min_correct==-1:
                             annotation.min_correct=index;
                         annotation.max_correct=index;
 
                 if annotation.min_correct>-1:
                     if peakmode==1:
                         for i in range(len(best_results_pos)):
                             if annotation.min_correct<=i:
                                 best_results_pos[i]+=1;
                             if annotation.max_correct<=i:
                                 worst_results_pos[i]+=1;
                     elif peakmode==-1:
                         for i in range(len(best_results_neg)):
                             if annotation.min_correct<=i:
                                 best_results_neg[i]+=1;
                             if annotation.max_correct<=i:
                                 worst_results_neg[i]+=1;
                 #else:
                 #    logit('No candidate for %s (Mass: %s, peak mode: %s, peak_mz %s, deltaM %s)'%(shortinchi,peak.parent_spectrum.parameters['exactmass'],peakmode, peak.mz,abs(peak.mz-float(peak.parent_spectrum.parameters['exactmass']))));
         logit('Finished Calculating Retrieval Stats.');
         logit('Positive Mode Count: %s'%pos_mode_count);
         logit('Negative Mode Count: %s'%neg_mode_count);
         logit('Total Peaks: %s'%total_peaks);
         logtime();
 
         logit('Positive Mode (Total: %s ):'%pos_mode_count);
         if pos_mode_count>0:
             for i in range(max_results_per_query):
                 logit('Correct within first\t%s:\tBest:\t%s%%\tWorst:\t%s%%\tCandidateCount:\t%s'%(i+1,best_results_pos[i]*100/pos_mode_count,worst_results_pos[i]*100/pos_mode_count,return_number_pos[i]));
 
         logit('Negative Mode (Total: %s ):'%neg_mode_count);
         if neg_mode_count>0:
             for i in range(max_results_per_query):
                 logit('Correct within first\t%s:\tBest:\t%s%%\tWorst:\t%s%%\tCandidateCount:\t%s'%(i+1,best_results_neg[i]*100/neg_mode_count,worst_results_neg[i]*100/neg_mode_count,return_number_neg[i]));
 
         logit('Both Modes (Total: %s ):'%(pos_mode_count+neg_mode_count));
         if neg_mode_count>0 or pos_mode_count>0:
             for i in range(max_results_per_query):
                 logit('Correct within first\t%s:\tBest:\t%s%%\tWorst:\t%s%%\tCandidateCount:\t%s'%(i+1,(best_results_neg[i]+best_results_pos[i])*100/(neg_mode_count+pos_mode_count),(worst_results_neg[i]+worst_results_pos[i])*100/(neg_mode_count+pos_mode_count),return_number_pos[i]+return_number_neg[i]));
         
         #logit('Missed Peaks due to unknown mode: %s'%(total_peaks-neg_mode_count-pos_mode_count));
 
         logtime();        
         '''